Home | History | Annotate | Download | only in fts2
      1 /*
      2 ** 2007 June 22
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 **
     11 ******************************************************************************
     12 **
     13 ** This is part of an SQLite module implementing full-text search.
     14 ** This particular file implements the generic tokenizer interface.
     15 */
     16 
     17 /*
     18 ** The code in this file is only compiled if:
     19 **
     20 **     * The FTS2 module is being built as an extension
     21 **       (in which case SQLITE_CORE is not defined), or
     22 **
     23 **     * The FTS2 module is being built into the core of
     24 **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
     25 */
     26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
     27 
     28 
#include "sqlite3.h"
#include "sqlite3ext.h"
#ifndef SQLITE_CORE
  SQLITE_EXTENSION_INIT1
#endif

#include "fts2_hash.h"
#include "fts2_tokenizer.h"
#include <assert.h>
#include <stddef.h>
#include <string.h>
     39 
     40 /*
     41 ** Implementation of the SQL scalar function for accessing the underlying
     42 ** hash table. This function may be called as follows:
     43 **
     44 **   SELECT <function-name>(<key-name>);
     45 **   SELECT <function-name>(<key-name>, <pointer>);
     46 **
     47 ** where <function-name> is the name passed as the second argument
     48 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
     49 **
     50 ** If the <pointer> argument is specified, it must be a blob value
     51 ** containing a pointer to be stored as the hash data corresponding
     52 ** to the string <key-name>. If <pointer> is not specified, then
     53 ** the string <key-name> must already exist in the has table. Otherwise,
     54 ** an error is returned.
     55 **
     56 ** Whether or not the <pointer> argument is specified, the value returned
     57 ** is a blob containing the pointer stored as the hash data corresponding
     58 ** to string <key-name> (after the hash-table is updated, if applicable).
     59 */
     60 static void scalarFunc(
     61   sqlite3_context *context,
     62   int argc,
     63   sqlite3_value **argv
     64 ){
     65   fts2Hash *pHash;
     66   void *pPtr = 0;
     67   const unsigned char *zName;
     68   int nName;
     69 
     70   assert( argc==1 || argc==2 );
     71 
     72   pHash = (fts2Hash *)sqlite3_user_data(context);
     73 
     74   zName = sqlite3_value_text(argv[0]);
     75   nName = sqlite3_value_bytes(argv[0])+1;
     76 
     77   if( argc==2 ){
     78     void *pOld;
     79     int n = sqlite3_value_bytes(argv[1]);
     80     if( n!=sizeof(pPtr) ){
     81       sqlite3_result_error(context, "argument type mismatch", -1);
     82       return;
     83     }
     84     pPtr = *(void **)sqlite3_value_blob(argv[1]);
     85     pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
     86     if( pOld==pPtr ){
     87       sqlite3_result_error(context, "out of memory", -1);
     88       return;
     89     }
     90   }else{
     91     pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
     92     if( !pPtr ){
     93       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
     94       sqlite3_result_error(context, zErr, -1);
     95       sqlite3_free(zErr);
     96       return;
     97     }
     98   }
     99 
    100   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
    101 }
    102 
    103 #ifdef SQLITE_TEST
    104 
    105 #include <tcl.h>
    106 #include <string.h>
    107 
    108 /*
    109 ** Implementation of a special SQL scalar function for testing tokenizers
    110 ** designed to be used in concert with the Tcl testing framework. This
    111 ** function must be called with two arguments:
    112 **
    113 **   SELECT <function-name>(<key-name>, <input-string>);
    114 **   SELECT <function-name>(<key-name>, <pointer>);
    115 **
    116 ** where <function-name> is the name passed as the second argument
    117 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
    118 ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
    119 **
    120 ** The return value is a string that may be interpreted as a Tcl
    121 ** list. For each token in the <input-string>, three elements are
    122 ** added to the returned list. The first is the token position, the
    123 ** second is the token text (folded, stemmed, etc.) and the third is the
    124 ** substring of <input-string> associated with the token. For example,
    125 ** using the built-in "simple" tokenizer:
    126 **
    127 **   SELECT fts_tokenizer_test('simple', 'I don't see how');
    128 **
    129 ** will return the string:
    130 **
    131 **   "{0 i I 1 dont don't 2 see see 3 how how}"
    132 **
    133 */
    134 static void testFunc(
    135   sqlite3_context *context,
    136   int argc,
    137   sqlite3_value **argv
    138 ){
    139   fts2Hash *pHash;
    140   sqlite3_tokenizer_module *p;
    141   sqlite3_tokenizer *pTokenizer = 0;
    142   sqlite3_tokenizer_cursor *pCsr = 0;
    143 
    144   const char *zErr = 0;
    145 
    146   const char *zName;
    147   int nName;
    148   const char *zInput;
    149   int nInput;
    150 
    151   const char *zArg = 0;
    152 
    153   const char *zToken;
    154   int nToken;
    155   int iStart;
    156   int iEnd;
    157   int iPos;
    158 
    159   Tcl_Obj *pRet;
    160 
    161   assert( argc==2 || argc==3 );
    162 
    163   nName = sqlite3_value_bytes(argv[0]);
    164   zName = (const char *)sqlite3_value_text(argv[0]);
    165   nInput = sqlite3_value_bytes(argv[argc-1]);
    166   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
    167 
    168   if( argc==3 ){
    169     zArg = (const char *)sqlite3_value_text(argv[1]);
    170   }
    171 
    172   pHash = (fts2Hash *)sqlite3_user_data(context);
    173   p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
    174 
    175   if( !p ){
    176     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    177     sqlite3_result_error(context, zErr, -1);
    178     sqlite3_free(zErr);
    179     return;
    180   }
    181 
    182   pRet = Tcl_NewObj();
    183   Tcl_IncrRefCount(pRet);
    184 
    185   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    186     zErr = "error in xCreate()";
    187     goto finish;
    188   }
    189   pTokenizer->pModule = p;
    190   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    191     zErr = "error in xOpen()";
    192     goto finish;
    193   }
    194   pCsr->pTokenizer = pTokenizer;
    195 
    196   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    197     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    198     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    199     zToken = &zInput[iStart];
    200     nToken = iEnd-iStart;
    201     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    202   }
    203 
    204   if( SQLITE_OK!=p->xClose(pCsr) ){
    205     zErr = "error in xClose()";
    206     goto finish;
    207   }
    208   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    209     zErr = "error in xDestroy()";
    210     goto finish;
    211   }
    212 
    213 finish:
    214   if( zErr ){
    215     sqlite3_result_error(context, zErr, -1);
    216   }else{
    217     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
    218   }
    219   Tcl_DecrRefCount(pRet);
    220 }
    221 
    222 static
    223 int registerTokenizer(
    224   sqlite3 *db,
    225   char *zName,
    226   const sqlite3_tokenizer_module *p
    227 ){
    228   int rc;
    229   sqlite3_stmt *pStmt;
    230   const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
    231 
    232   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    233   if( rc!=SQLITE_OK ){
    234     return rc;
    235   }
    236 
    237   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
    238   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    239   sqlite3_step(pStmt);
    240 
    241   return sqlite3_finalize(pStmt);
    242 }
    243 
    244 static
    245 int queryFts2Tokenizer(
    246   sqlite3 *db,
    247   char *zName,
    248   const sqlite3_tokenizer_module **pp
    249 ){
    250   int rc;
    251   sqlite3_stmt *pStmt;
    252   const char zSql[] = "SELECT fts2_tokenizer(?)";
    253 
    254   *pp = 0;
    255   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    256   if( rc!=SQLITE_OK ){
    257     return rc;
    258   }
    259 
    260   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
    261   if( SQLITE_ROW==sqlite3_step(pStmt) ){
    262     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
    263       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
    264     }
    265   }
    266 
    267   return sqlite3_finalize(pStmt);
    268 }
    269 
    270 void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
    271 
    272 /*
    273 ** Implementation of the scalar function fts2_tokenizer_internal_test().
    274 ** This function is used for testing only, it is not included in the
    275 ** build unless SQLITE_TEST is defined.
    276 **
    277 ** The purpose of this is to test that the fts2_tokenizer() function
    278 ** can be used as designed by the C-code in the queryFts2Tokenizer and
    279 ** registerTokenizer() functions above. These two functions are repeated
    280 ** in the README.tokenizer file as an example, so it is important to
    281 ** test them.
    282 **
    283 ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
    284 ** function with no arguments. An assert() will fail if a problem is
    285 ** detected. i.e.:
    286 **
    287 **     SELECT fts2_tokenizer_internal_test();
    288 **
    289 */
    290 static void intTestFunc(
    291   sqlite3_context *context,
    292   int argc,
    293   sqlite3_value **argv
    294 ){
    295   int rc;
    296   const sqlite3_tokenizer_module *p1;
    297   const sqlite3_tokenizer_module *p2;
    298   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
    299 
    300   /* Test the query function */
    301   sqlite3Fts2SimpleTokenizerModule(&p1);
    302   rc = queryFts2Tokenizer(db, "simple", &p2);
    303   assert( rc==SQLITE_OK );
    304   assert( p1==p2 );
    305   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
    306   assert( rc==SQLITE_ERROR );
    307   assert( p2==0 );
    308   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
    309 
    310   /* Test the storage function */
    311   rc = registerTokenizer(db, "nosuchtokenizer", p1);
    312   assert( rc==SQLITE_OK );
    313   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
    314   assert( rc==SQLITE_OK );
    315   assert( p2==p1 );
    316 
    317   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
    318 }
    319 
    320 #endif
    321 
    322 /*
    323 ** Set up SQL objects in database db used to access the contents of
    324 ** the hash table pointed to by argument pHash. The hash table must
    325 ** been initialised to use string keys, and to take a private copy
    326 ** of the key when a value is inserted. i.e. by a call similar to:
    327 **
    328 **    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
    329 **
    330 ** This function adds a scalar function (see header comment above
    331 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
    332 ** defined at compilation time, a temporary virtual table (see header
    333 ** comment above struct HashTableVtab) to the database schema. Both
    334 ** provide read/write access to the contents of *pHash.
    335 **
    336 ** The third argument to this function, zName, is used as the name
    337 ** of both the scalar and, if created, the virtual table.
    338 */
    339 int sqlite3Fts2InitHashTable(
    340   sqlite3 *db,
    341   fts2Hash *pHash,
    342   const char *zName
    343 ){
    344   int rc = SQLITE_OK;
    345   void *p = (void *)pHash;
    346   const int any = SQLITE_ANY;
    347   char *zTest = 0;
    348   char *zTest2 = 0;
    349 
    350 #ifdef SQLITE_TEST
    351   void *pdb = (void *)db;
    352   zTest = sqlite3_mprintf("%s_test", zName);
    353   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
    354   if( !zTest || !zTest2 ){
    355     rc = SQLITE_NOMEM;
    356   }
    357 #endif
    358 
    359   if( rc!=SQLITE_OK
    360    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
    361    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
    362 #ifdef SQLITE_TEST
    363    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
    364    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
    365    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
    366 #endif
    367   );
    368 
    369   sqlite3_free(zTest);
    370   sqlite3_free(zTest2);
    371   return rc;
    372 }
    373 
    374 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
    375