Home | History | Annotate | Download | only in fts2
      1 /*
      2 ** 2007 June 22
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 **
     11 *************************************************************************
     12 ** This file implements a tokenizer for fts2 based on the ICU library.
     13 **
     14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
     15 */
     16 
     17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
     18 #ifdef SQLITE_ENABLE_ICU
     19 
     20 #include <assert.h>
     21 #include <string.h>
     22 #include "fts2_tokenizer.h"
     23 
     24 #include <unicode/ubrk.h>
     25 #include <unicode/ucol.h>
     26 #include <unicode/ustring.h>
     27 #include <unicode/utf16.h>
     28 
     29 typedef struct IcuTokenizer IcuTokenizer;
     30 typedef struct IcuCursor IcuCursor;
     31 
     32 struct IcuTokenizer {
     33   sqlite3_tokenizer base;
     34   char *zLocale;
     35 };
     36 
     37 struct IcuCursor {
     38   sqlite3_tokenizer_cursor base;
     39 
     40   UBreakIterator *pIter;      /* ICU break-iterator object */
     41   int nChar;                  /* Number of UChar elements in pInput */
     42   UChar *aChar;               /* Copy of input using utf-16 encoding */
     43   int *aOffset;               /* Offsets of each character in utf-8 input */
     44 
     45   int nBuffer;
     46   char *zBuffer;
     47 
     48   int iToken;
     49 };
     50 
     51 /*
     52 ** Create a new tokenizer instance.
     53 */
     54 static int icuCreate(
     55   int argc,                            /* Number of entries in argv[] */
     56   const char * const *argv,            /* Tokenizer creation arguments */
     57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
     58 ){
     59   IcuTokenizer *p;
     60   int n = 0;
     61 
     62   if( argc>0 ){
     63     n = strlen(argv[0])+1;
     64   }
     65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
     66   if( !p ){
     67     return SQLITE_NOMEM;
     68   }
     69   memset(p, 0, sizeof(IcuTokenizer));
     70 
     71   if( n ){
     72     p->zLocale = (char *)&p[1];
     73     memcpy(p->zLocale, argv[0], n);
     74   }
     75 
     76   *ppTokenizer = (sqlite3_tokenizer *)p;
     77 
     78   return SQLITE_OK;
     79 }
     80 
     81 /*
     82 ** Destroy a tokenizer
     83 */
     84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
     85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
     86   sqlite3_free(p);
     87   return SQLITE_OK;
     88 }
     89 
     90 /*
     91 ** Prepare to begin tokenizing a particular string.  The input
     92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
     93 ** used to incrementally tokenize this string is returned in
     94 ** *ppCursor.
     95 */
     96 static int icuOpen(
     97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
     98   const char *zInput,                    /* Input string */
     99   int nInput,                            /* Length of zInput in bytes */
    100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
    101 ){
    102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
    103   IcuCursor *pCsr;
    104 
    105   const int32_t opt = U_FOLD_CASE_DEFAULT;
    106   UErrorCode status = U_ZERO_ERROR;
    107   int nChar;
    108 
    109   UChar32 c;
    110   int iInput = 0;
    111   int iOut = 0;
    112 
    113   *ppCursor = 0;
    114 
    115   if( nInput<0 ){
    116     nInput = strlen(zInput);
    117   }
    118   nChar = nInput+1;
    119   pCsr = (IcuCursor *)sqlite3_malloc(
    120       sizeof(IcuCursor) +                /* IcuCursor */
    121       (nChar+1) * sizeof(int) +          /* IcuCursor.aOffset[] */
    122       nChar * sizeof(UChar)              /* IcuCursor.aChar[] */
    123   );
    124   if( !pCsr ){
    125     return SQLITE_NOMEM;
    126   }
    127   memset(pCsr, 0, sizeof(IcuCursor));
    128   pCsr->aOffset = (int *)&pCsr[1];
    129   pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
    130 
    131   pCsr->aOffset[iOut] = iInput;
    132   U8_NEXT(zInput, iInput, nInput, c);
    133   while( c>0 ){
    134     int isError = 0;
    135     c = u_foldCase(c, opt);
    136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    137     if( isError ){
    138       sqlite3_free(pCsr);
    139       return SQLITE_ERROR;
    140     }
    141     pCsr->aOffset[iOut] = iInput;
    142 
    143     if( iInput<nInput ){
    144       U8_NEXT(zInput, iInput, nInput, c);
    145     }else{
    146       c = 0;
    147     }
    148   }
    149 
    150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
    151   if( !U_SUCCESS(status) ){
    152     sqlite3_free(pCsr);
    153     return SQLITE_ERROR;
    154   }
    155   pCsr->nChar = iOut;
    156 
    157   ubrk_first(pCsr->pIter);
    158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
    159   return SQLITE_OK;
    160 }
    161 
    162 /*
    163 ** Close a tokenization cursor previously opened by a call to icuOpen().
    164 */
    165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
    166   IcuCursor *pCsr = (IcuCursor *)pCursor;
    167   ubrk_close(pCsr->pIter);
    168   sqlite3_free(pCsr->zBuffer);
    169   sqlite3_free(pCsr);
    170   return SQLITE_OK;
    171 }
    172 
    173 /*
    174 ** Extract the next token from a tokenization cursor.
    175 */
    176 static int icuNext(
    177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
    178   const char **ppToken,               /* OUT: *ppToken is the token text */
    179   int *pnBytes,                       /* OUT: Number of bytes in token */
    180   int *piStartOffset,                 /* OUT: Starting offset of token */
    181   int *piEndOffset,                   /* OUT: Ending offset of token */
    182   int *piPosition                     /* OUT: Position integer of token */
    183 ){
    184   IcuCursor *pCsr = (IcuCursor *)pCursor;
    185 
    186   int iStart = 0;
    187   int iEnd = 0;
    188   int nByte = 0;
    189 
    190   while( iStart==iEnd ){
    191     UChar32 c;
    192 
    193     iStart = ubrk_current(pCsr->pIter);
    194     iEnd = ubrk_next(pCsr->pIter);
    195     if( iEnd==UBRK_DONE ){
    196       return SQLITE_DONE;
    197     }
    198 
    199     while( iStart<iEnd ){
    200       int iWhite = iStart;
    201       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
    202       if( u_isspace(c) ){
    203         iStart = iWhite;
    204       }else{
    205         break;
    206       }
    207     }
    208     assert(iStart<=iEnd);
    209   }
    210 
    211   do {
    212     UErrorCode status = U_ZERO_ERROR;
    213     if( nByte ){
    214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
    215       if( !zNew ){
    216         return SQLITE_NOMEM;
    217       }
    218       pCsr->zBuffer = zNew;
    219       pCsr->nBuffer = nByte;
    220     }
    221 
    222     u_strToUTF8(
    223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
    224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
    225         &status                                  /* Output success/failure */
    226     );
    227   } while( nByte>pCsr->nBuffer );
    228 
    229   *ppToken = pCsr->zBuffer;
    230   *pnBytes = nByte;
    231   *piStartOffset = pCsr->aOffset[iStart];
    232   *piEndOffset = pCsr->aOffset[iEnd];
    233   *piPosition = pCsr->iToken++;
    234 
    235   return SQLITE_OK;
    236 }
    237 
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};
    249 
    250 /*
    251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
    252 */
    253 void sqlite3Fts2IcuTokenizerModule(
    254   sqlite3_tokenizer_module const**ppModule
    255 ){
    256   *ppModule = &icuTokenizerModule;
    257 }
    258 
    259 #endif /* defined(SQLITE_ENABLE_ICU) */
    260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
    261