/*
** 2007 June 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
**
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#ifdef SQLITE_ENABLE_ICU

#include <assert.h>
#include <string.h>
#include "fts2_tokenizer.h"

#include <unicode/ubrk.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>

typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;

/*
** An ICU tokenizer instance. When a locale was supplied to icuCreate(),
** zLocale points at a copy of it stored in the same allocation,
** immediately after this structure. Otherwise zLocale is NULL.
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;
  char *zLocale;              /* Locale passed to ubrk_open(), or NULL */
};

/*
** A cursor used to iterate through the tokens of a single input string.
** The aOffset[] and aChar[] arrays live in the same allocation as the
** cursor itself (see icuOpen()). zBuffer is a separate allocation.
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements in aChar */
  UChar *aChar;               /* Case-folded copy of input, utf-16 encoding */
  int *aOffset;               /* Offsets of each character in utf-8 input */

  int nBuffer;                /* Size of zBuffer in bytes */
  char *zBuffer;              /* Holds the utf-8 text of the current token */

  int iToken;                 /* Position value for the next token returned */
};

/*
** Create a new tokenizer instance.
**
** If argc is greater than zero, argv[0] is copied and later used as the
** locale handed to ubrk_open(). Any further arguments are ignored.
**
** Returns SQLITE_OK and sets *ppTokenizer on success, or SQLITE_NOMEM
** if the allocation fails.
*/
static int icuCreate(
  int argc,                            /* Number of entries in argv[] */
  const char * const *argv,            /* Tokenizer creation arguments */
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
){
  IcuTokenizer *p;
  int n = 0;

  if( argc>0 ){
    n = strlen(argv[0])+1;
  }
  /* One allocation holds both the structure and the locale string. */
  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, sizeof(IcuTokenizer));

  if( n ){
    p->zLocale = (char *)&p[1];
    memcpy(p->zLocale, argv[0], n);
  }

  *ppTokenizer = (sqlite3_tokenizer *)p;

  return SQLITE_OK;
}

/*
** Destroy a tokenizer created by icuCreate(). The locale string is part
** of the same allocation, so a single free releases everything.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is zInput[0..nInput-1].  If nInput is negative,
** zInput is treated as nul-terminated. A NULL zInput is treated as an
** empty string. A cursor used to incrementally tokenize this string is
** returned in *ppCursor.
**
** The input is decoded from utf-8, case-folded and stored as utf-16 in
** IcuCursor.aChar[]. Decoding stops at the first code point that is zero
** or malformed. For each code point, aOffset[] records the byte offset
** in zInput that corresponds to the utf-16 index at which it starts.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if memory cannot be
** allocated (or the input is too large to allocate for), and
** SQLITE_ERROR if the utf-16 conversion or ubrk_open() fails. On any
** failure *ppCursor is left set to 0.
*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *zInput,                    /* Input string */
  int nInput,                            /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;
  sqlite3_int64 nAlloc;

  UChar32 c;
  int iInput = 0;
  int iOut = 0;

  *ppCursor = 0;

  if( zInput==0 ){
    /* Never dereference a NULL input; treat it as the empty string. */
    zInput = "";
    nInput = 0;
  }else if( nInput<0 ){
    size_t nLen = strlen(zInput);
    if( nLen>=0x7fffffff ){
      return SQLITE_NOMEM;
    }
    nInput = (int)nLen;
  }

  /* Compute the allocation size using 64-bit arithmetic. sqlite3_malloc()
  ** takes an int, so a size that does not fit must be rejected here rather
  ** than silently truncated to something smaller than the arrays need.
  */
  nAlloc = (sqlite3_int64)sizeof(IcuCursor)                          /* IcuCursor */
         + ((sqlite3_int64)nInput+2) * (sqlite3_int64)sizeof(int)    /* aOffset[] */
         + ((sqlite3_int64)nInput+1) * (sqlite3_int64)sizeof(UChar); /* aChar[] */
  if( nAlloc>0x7fffffff ){
    return SQLITE_NOMEM;
  }
  nChar = nInput+1;
  pCsr = (IcuCursor *)sqlite3_malloc((int)nAlloc);
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  pCsr->aOffset = (int *)&pCsr[1];
  pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];

  pCsr->aOffset[iOut] = iInput;
  /* Only invoke U8_NEXT while bytes remain: the macro reads zInput[iInput]
  ** before looking at the length, so it must not run on an empty input.
  */
  while( iInput<nInput ){
    int isError = 0;
    U8_NEXT(zInput, iInput, nInput, c);
    if( c<=0 ){
      break;                    /* Embedded nul or malformed utf-8 */
    }
    c = u_foldCase(c, opt);
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    pCsr->aOffset[iOut] = iInput;
  }

  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to icuOpen().
** Releases the break iterator, the token buffer and the cursor itself.
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  ubrk_close(pCsr->pIter);
  sqlite3_free(pCsr->zBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.
**
** Break-iterator segments are consumed one at a time. Leading characters
** for which u_isspace() is true are skipped, and a segment that consists
** entirely of such characters is discarded. The remaining text is
** converted to utf-8 in IcuCursor.zBuffer. *ppToken points into that
** buffer, which is reused by the next call and freed by icuClose().
**
** Returns SQLITE_OK if a token was produced, SQLITE_DONE once the break
** iterator is exhausted, SQLITE_NOMEM if the token buffer cannot be
** grown, and SQLITE_ERROR if the utf-8 conversion fails.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Advance iStart past any leading whitespace in this segment. */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert the token to utf-8. The first pass may find the buffer too
  ** small, in which case nByte receives the required size, the buffer is
  ** grown and the conversion is repeated.
  */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
    /* A buffer overflow is handled by the retry above. Any other failure
    ** means the buffer contents cannot be trusted.
    */
    if( U_FAILURE(status) && status!=U_BUFFER_OVERFLOW_ERROR ){
      return SQLITE_ERROR;
    }
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}

/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};

/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
void sqlite3Fts2IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &icuTokenizerModule;
}

#endif /* defined(SQLITE_ENABLE_ICU) */
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */