Home | History | Annotate | Download | only in fts3
      1 /*
      2 ** 2006 Oct 10
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 **
     11 ******************************************************************************
     12 **
     13 ** Implementation of the "simple" full-text-search tokenizer.
     14 */
     15 
     16 /*
     17 ** The code in this file is only compiled if:
     18 **
     19 **     * The FTS3 module is being built as an extension
     20 **       (in which case SQLITE_CORE is not defined), or
     21 **
     22 **     * The FTS3 module is being built into the core of
     23 **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
     24 */
     25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
     26 
     27 #include "fts3Int.h"
     28 
     29 #include <assert.h>
     30 #include <stdlib.h>
     31 #include <stdio.h>
     32 #include <string.h>
     33 
     34 #include "fts3_tokenizer.h"
     35 
     36 typedef struct simple_tokenizer {
     37   sqlite3_tokenizer base;
     38   char delim[128];             /* flag ASCII delimiters */
     39 } simple_tokenizer;
     40 
     41 typedef struct simple_tokenizer_cursor {
     42   sqlite3_tokenizer_cursor base;
     43   const char *pInput;          /* input we are tokenizing */
     44   int nBytes;                  /* size of the input */
     45   int iOffset;                 /* current position in pInput */
     46   int iToken;                  /* index of next token to be returned */
     47   char *pToken;                /* storage for current token */
     48   int nTokenAllocated;         /* space allocated to zToken buffer */
     49 } simple_tokenizer_cursor;
     50 
     51 
     52 static int simpleDelim(simple_tokenizer *t, unsigned char c){
     53   return c<0x80 && t->delim[c];
     54 }
     /*
     ** Return 1 if x is an ASCII digit or an ASCII letter of either case,
     ** and 0 for every other value.  No <ctype.h> routine is involved, so
     ** the result does not depend on the current locale.
     */
     static int fts3_isalnum(int x){
       if( x>='0' && x<='9' ) return 1;
       if( x>='A' && x<='Z' ) return 1;
       return x>='a' && x<='z';
     }
     58 
     59 /*
     60 ** Create a new tokenizer instance.
     61 */
     62 static int simpleCreate(
     63   int argc, const char * const *argv,
     64   sqlite3_tokenizer **ppTokenizer
     65 ){
     66   simple_tokenizer *t;
     67 
     68   t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
     69   if( t==NULL ) return SQLITE_NOMEM;
     70   memset(t, 0, sizeof(*t));
     71 
     72   /* TODO(shess) Delimiters need to remain the same from run to run,
     73   ** else we need to reindex.  One solution would be a meta-table to
     74   ** track such information in the database, then we'd only want this
     75   ** information on the initial create.
     76   */
     77   if( argc>1 ){
     78     int i, n = (int)strlen(argv[1]);
     79     for(i=0; i<n; i++){
     80       unsigned char ch = argv[1][i];
     81       /* We explicitly don't support UTF-8 delimiters for now. */
     82       if( ch>=0x80 ){
     83         sqlite3_free(t);
     84         return SQLITE_ERROR;
     85       }
     86       t->delim[ch] = 1;
     87     }
     88   } else {
     89     /* Mark non-alphanumeric ASCII characters as delimiters */
     90     int i;
     91     for(i=1; i<0x80; i++){
     92       t->delim[i] = !fts3_isalnum(i) ? -1 : 0;
     93     }
     94   }
     95 
     96   *ppTokenizer = &t->base;
     97   return SQLITE_OK;
     98 }
     99 
    100 /*
    101 ** Destroy a tokenizer
    102 */
    103 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
    104   sqlite3_free(pTokenizer);
    105   return SQLITE_OK;
    106 }
    107 
    108 /*
    109 ** Prepare to begin tokenizing a particular string.  The input
    110 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
    111 ** used to incrementally tokenize this string is returned in
    112 ** *ppCursor.
    113 */
    114 static int simpleOpen(
    115   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
    116   const char *pInput, int nBytes,        /* String to be tokenized */
    117   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
    118 ){
    119   simple_tokenizer_cursor *c;
    120 
    121   UNUSED_PARAMETER(pTokenizer);
    122 
    123   c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    124   if( c==NULL ) return SQLITE_NOMEM;
    125 
    126   c->pInput = pInput;
    127   if( pInput==0 ){
    128     c->nBytes = 0;
    129   }else if( nBytes<0 ){
    130     c->nBytes = (int)strlen(pInput);
    131   }else{
    132     c->nBytes = nBytes;
    133   }
    134   c->iOffset = 0;                 /* start tokenizing at the beginning */
    135   c->iToken = 0;
    136   c->pToken = NULL;               /* no space allocated, yet. */
    137   c->nTokenAllocated = 0;
    138 
    139   *ppCursor = &c->base;
    140   return SQLITE_OK;
    141 }
    142 
    143 /*
    144 ** Close a tokenization cursor previously opened by a call to
    145 ** simpleOpen() above.
    146 */
    147 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
    148   simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
    149   sqlite3_free(c->pToken);
    150   sqlite3_free(c);
    151   return SQLITE_OK;
    152 }
    153 
    154 /*
    155 ** Extract the next token from a tokenization cursor.  The cursor must
    156 ** have been opened by a prior call to simpleOpen().
    157 */
    158 static int simpleNext(
    159   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
    160   const char **ppToken,               /* OUT: *ppToken is the token text */
    161   int *pnBytes,                       /* OUT: Number of bytes in token */
    162   int *piStartOffset,                 /* OUT: Starting offset of token */
    163   int *piEndOffset,                   /* OUT: Ending offset of token */
    164   int *piPosition                     /* OUT: Position integer of token */
    165 ){
    166   simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
    167   simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
    168   unsigned char *p = (unsigned char *)c->pInput;
    169 
    170   while( c->iOffset<c->nBytes ){
    171     int iStartOffset;
    172 
    173     /* Scan past delimiter characters */
    174     while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
    175       c->iOffset++;
    176     }
    177 
    178     /* Count non-delimiter characters. */
    179     iStartOffset = c->iOffset;
    180     while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
    181       c->iOffset++;
    182     }
    183 
    184     if( c->iOffset>iStartOffset ){
    185       int i, n = c->iOffset-iStartOffset;
    186       if( n>c->nTokenAllocated ){
    187         char *pNew;
    188         c->nTokenAllocated = n+20;
    189         pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
    190         if( !pNew ) return SQLITE_NOMEM;
    191         c->pToken = pNew;
    192       }
    193       for(i=0; i<n; i++){
    194         /* TODO(shess) This needs expansion to handle UTF-8
    195         ** case-insensitivity.
    196         */
    197         unsigned char ch = p[iStartOffset+i];
    198         c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch);
    199       }
    200       *ppToken = c->pToken;
    201       *pnBytes = n;
    202       *piStartOffset = iStartOffset;
    203       *piEndOffset = c->iOffset;
    204       *piPosition = c->iToken++;
    205 
    206       return SQLITE_OK;
    207     }
    208   }
    209   return SQLITE_DONE;
    210 }
    211 
    212 /*
    213 ** The set of routines that implement the simple tokenizer
    214 */
    215 static const sqlite3_tokenizer_module simpleTokenizerModule = {
    216   0,
    217   simpleCreate,
    218   simpleDestroy,
    219   simpleOpen,
    220   simpleClose,
    221   simpleNext,
    222 };
    223 
    224 /*
    225 ** Allocate a new simple tokenizer.  Return a pointer to the new
    226 ** tokenizer in *ppModule
    227 */
    228 void sqlite3Fts3SimpleTokenizerModule(
    229   sqlite3_tokenizer_module const**ppModule
    230 ){
    231   *ppModule = &simpleTokenizerModule;
    232 }
    233 
    234 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
    235