1 /* 2 ** 2007 June 22 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ****************************************************************************** 12 ** 13 ** This is part of an SQLite module implementing full-text search. 14 ** This particular file implements the generic tokenizer interface. 15 */ 16 17 /* 18 ** The code in this file is only compiled if: 19 ** 20 ** * The FTS2 module is being built as an extension 21 ** (in which case SQLITE_CORE is not defined), or 22 ** 23 ** * The FTS2 module is being built into the core of 24 ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). 25 */ 26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) 27 28 29 #include "sqlite3.h" 30 #include "sqlite3ext.h" 31 #ifndef SQLITE_CORE 32 SQLITE_EXTENSION_INIT1 33 #endif 34 35 #include "fts2_hash.h" 36 #include "fts2_tokenizer.h" 37 #include <assert.h> 38 #include <stddef.h> 39 40 /* 41 ** Implementation of the SQL scalar function for accessing the underlying 42 ** hash table. This function may be called as follows: 43 ** 44 ** SELECT <function-name>(<key-name>); 45 ** SELECT <function-name>(<key-name>, <pointer>); 46 ** 47 ** where <function-name> is the name passed as the second argument 48 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer'). 49 ** 50 ** If the <pointer> argument is specified, it must be a blob value 51 ** containing a pointer to be stored as the hash data corresponding 52 ** to the string <key-name>. If <pointer> is not specified, then 53 ** the string <key-name> must already exist in the has table. Otherwise, 54 ** an error is returned. 55 ** 56 ** Whether or not the <pointer> argument is specified, the value returned 57 ** is a blob containing the pointer stored as the hash data corresponding 58 ** to string <key-name> (after the hash-table is updated, if applicable). 59 */ 60 static void scalarFunc( 61 sqlite3_context *context, 62 int argc, 63 sqlite3_value **argv 64 ){ 65 fts2Hash *pHash; 66 void *pPtr = 0; 67 const unsigned char *zName; 68 int nName; 69 70 assert( argc==1 || argc==2 ); 71 72 pHash = (fts2Hash *)sqlite3_user_data(context); 73 74 zName = sqlite3_value_text(argv[0]); 75 nName = sqlite3_value_bytes(argv[0])+1; 76 77 if( argc==2 ){ 78 void *pOld; 79 int n = sqlite3_value_bytes(argv[1]); 80 if( n!=sizeof(pPtr) ){ 81 sqlite3_result_error(context, "argument type mismatch", -1); 82 return; 83 } 84 pPtr = *(void **)sqlite3_value_blob(argv[1]); 85 pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr); 86 if( pOld==pPtr ){ 87 sqlite3_result_error(context, "out of memory", -1); 88 return; 89 } 90 }else{ 91 pPtr = sqlite3Fts2HashFind(pHash, zName, nName); 92 if( !pPtr ){ 93 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); 94 sqlite3_result_error(context, zErr, -1); 95 sqlite3_free(zErr); 96 return; 97 } 98 } 99 100 sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT); 101 } 102 103 #ifdef SQLITE_TEST 104 105 #include <tcl.h> 106 #include <string.h> 107 108 /* 109 ** Implementation of a special SQL scalar function for testing tokenizers 110 ** designed to be used in concert with the Tcl testing framework. This 111 ** function must be called with two arguments: 112 ** 113 ** SELECT <function-name>(<key-name>, <input-string>); 114 ** SELECT <function-name>(<key-name>, <pointer>); 115 ** 116 ** where <function-name> is the name passed as the second argument 117 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer') 118 ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test'). 119 ** 120 ** The return value is a string that may be interpreted as a Tcl 121 ** list. For each token in the <input-string>, three elements are 122 ** added to the returned list. The first is the token position, the 123 ** second is the token text (folded, stemmed, etc.) and the third is the 124 ** substring of <input-string> associated with the token. For example, 125 ** using the built-in "simple" tokenizer: 126 ** 127 ** SELECT fts_tokenizer_test('simple', 'I don't see how'); 128 ** 129 ** will return the string: 130 ** 131 ** "{0 i I 1 dont don't 2 see see 3 how how}" 132 ** 133 */ 134 static void testFunc( 135 sqlite3_context *context, 136 int argc, 137 sqlite3_value **argv 138 ){ 139 fts2Hash *pHash; 140 sqlite3_tokenizer_module *p; 141 sqlite3_tokenizer *pTokenizer = 0; 142 sqlite3_tokenizer_cursor *pCsr = 0; 143 144 const char *zErr = 0; 145 146 const char *zName; 147 int nName; 148 const char *zInput; 149 int nInput; 150 151 const char *zArg = 0; 152 153 const char *zToken; 154 int nToken; 155 int iStart; 156 int iEnd; 157 int iPos; 158 159 Tcl_Obj *pRet; 160 161 assert( argc==2 || argc==3 ); 162 163 nName = sqlite3_value_bytes(argv[0]); 164 zName = (const char *)sqlite3_value_text(argv[0]); 165 nInput = sqlite3_value_bytes(argv[argc-1]); 166 zInput = (const char *)sqlite3_value_text(argv[argc-1]); 167 168 if( argc==3 ){ 169 zArg = (const char *)sqlite3_value_text(argv[1]); 170 } 171 172 pHash = (fts2Hash *)sqlite3_user_data(context); 173 p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1); 174 175 if( !p ){ 176 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); 177 sqlite3_result_error(context, zErr, -1); 178 sqlite3_free(zErr); 179 return; 180 } 181 182 pRet = Tcl_NewObj(); 183 Tcl_IncrRefCount(pRet); 184 185 if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ 186 zErr = "error in xCreate()"; 187 goto finish; 188 } 189 pTokenizer->pModule = p; 190 if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ 191 zErr = "error in xOpen()"; 192 goto finish; 193 } 194 pCsr->pTokenizer = pTokenizer; 195 196 while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ 197 Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); 198 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); 199 zToken = &zInput[iStart]; 200 nToken = iEnd-iStart; 201 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); 202 } 203 204 if( SQLITE_OK!=p->xClose(pCsr) ){ 205 zErr = "error in xClose()"; 206 goto finish; 207 } 208 if( SQLITE_OK!=p->xDestroy(pTokenizer) ){ 209 zErr = "error in xDestroy()"; 210 goto finish; 211 } 212 213 finish: 214 if( zErr ){ 215 sqlite3_result_error(context, zErr, -1); 216 }else{ 217 sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT); 218 } 219 Tcl_DecrRefCount(pRet); 220 } 221 222 static 223 int registerTokenizer( 224 sqlite3 *db, 225 char *zName, 226 const sqlite3_tokenizer_module *p 227 ){ 228 int rc; 229 sqlite3_stmt *pStmt; 230 const char zSql[] = "SELECT fts2_tokenizer(?, ?)"; 231 232 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); 233 if( rc!=SQLITE_OK ){ 234 return rc; 235 } 236 237 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); 238 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); 239 sqlite3_step(pStmt); 240 241 return sqlite3_finalize(pStmt); 242 } 243 244 static 245 int queryFts2Tokenizer( 246 sqlite3 *db, 247 char *zName, 248 const sqlite3_tokenizer_module **pp 249 ){ 250 int rc; 251 sqlite3_stmt *pStmt; 252 const char zSql[] = "SELECT fts2_tokenizer(?)"; 253 254 *pp = 0; 255 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); 256 if( rc!=SQLITE_OK ){ 257 return rc; 258 } 259 260 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); 261 if( SQLITE_ROW==sqlite3_step(pStmt) ){ 262 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ 263 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); 264 } 265 } 266 267 return sqlite3_finalize(pStmt); 268 } 269 270 void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); 271 272 /* 273 ** Implementation of the scalar function fts2_tokenizer_internal_test(). 274 ** This function is used for testing only, it is not included in the 275 ** build unless SQLITE_TEST is defined. 276 ** 277 ** The purpose of this is to test that the fts2_tokenizer() function 278 ** can be used as designed by the C-code in the queryFts2Tokenizer and 279 ** registerTokenizer() functions above. These two functions are repeated 280 ** in the README.tokenizer file as an example, so it is important to 281 ** test them. 282 ** 283 ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar 284 ** function with no arguments. An assert() will fail if a problem is 285 ** detected. i.e.: 286 ** 287 ** SELECT fts2_tokenizer_internal_test(); 288 ** 289 */ 290 static void intTestFunc( 291 sqlite3_context *context, 292 int argc, 293 sqlite3_value **argv 294 ){ 295 int rc; 296 const sqlite3_tokenizer_module *p1; 297 const sqlite3_tokenizer_module *p2; 298 sqlite3 *db = (sqlite3 *)sqlite3_user_data(context); 299 300 /* Test the query function */ 301 sqlite3Fts2SimpleTokenizerModule(&p1); 302 rc = queryFts2Tokenizer(db, "simple", &p2); 303 assert( rc==SQLITE_OK ); 304 assert( p1==p2 ); 305 rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2); 306 assert( rc==SQLITE_ERROR ); 307 assert( p2==0 ); 308 assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") ); 309 310 /* Test the storage function */ 311 rc = registerTokenizer(db, "nosuchtokenizer", p1); 312 assert( rc==SQLITE_OK ); 313 rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2); 314 assert( rc==SQLITE_OK ); 315 assert( p2==p1 ); 316 317 sqlite3_result_text(context, "ok", -1, SQLITE_STATIC); 318 } 319 320 #endif 321 322 /* 323 ** Set up SQL objects in database db used to access the contents of 324 ** the hash table pointed to by argument pHash. The hash table must 325 ** been initialised to use string keys, and to take a private copy 326 ** of the key when a value is inserted. i.e. by a call similar to: 327 ** 328 ** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1); 329 ** 330 ** This function adds a scalar function (see header comment above 331 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is 332 ** defined at compilation time, a temporary virtual table (see header 333 ** comment above struct HashTableVtab) to the database schema. Both 334 ** provide read/write access to the contents of *pHash. 335 ** 336 ** The third argument to this function, zName, is used as the name 337 ** of both the scalar and, if created, the virtual table. 338 */ 339 int sqlite3Fts2InitHashTable( 340 sqlite3 *db, 341 fts2Hash *pHash, 342 const char *zName 343 ){ 344 int rc = SQLITE_OK; 345 void *p = (void *)pHash; 346 const int any = SQLITE_ANY; 347 char *zTest = 0; 348 char *zTest2 = 0; 349 350 #ifdef SQLITE_TEST 351 void *pdb = (void *)db; 352 zTest = sqlite3_mprintf("%s_test", zName); 353 zTest2 = sqlite3_mprintf("%s_internal_test", zName); 354 if( !zTest || !zTest2 ){ 355 rc = SQLITE_NOMEM; 356 } 357 #endif 358 359 if( rc!=SQLITE_OK 360 || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0)) 361 || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0)) 362 #ifdef SQLITE_TEST 363 || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0)) 364 || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0)) 365 || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0)) 366 #endif 367 ); 368 369 sqlite3_free(zTest); 370 sqlite3_free(zTest2); 371 return rc; 372 } 373 374 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ 375