Home | History | Annotate | Download | only in fts1
      1 /* fts1 has a design flaw which can lead to database corruption (see
      2 ** below).  It is recommended not to use it any longer, instead use
      3 ** fts3 (or higher).  If you believe that your use of fts1 is safe,
      4 ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
      5 */
      6 #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
      7         && !defined(SQLITE_ENABLE_BROKEN_FTS1)
      8 #error fts1 has a design flaw and has been deprecated.
      9 #endif
     10 /* The flaw is that fts1 uses the content table's unaliased rowid as
     11 ** the unique docid.  fts1 embeds the rowid in the index it builds,
     12 ** and expects the rowid to not change.  The SQLite VACUUM operation
     13 ** will renumber such rowids, thereby breaking fts1.  If you are using
     14 ** fts1 in a system which has disabled VACUUM, then you can continue
     15 ** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
     16 ** VACUUM, though systems using auto_vacuum are unlikely to invoke
     17 ** VACUUM.
     18 **
     19 ** fts1 should be safe even across VACUUM if you only insert documents
     20 ** and never delete.
     21 */
     22 
     23 /* The author disclaims copyright to this source code.
     24  *
     25  * This is an SQLite module implementing full-text search.
     26  */
     27 
     28 /*
     29 ** The code in this file is only compiled if:
     30 **
     31 **     * The FTS1 module is being built as an extension
     32 **       (in which case SQLITE_CORE is not defined), or
     33 **
     34 **     * The FTS1 module is being built into the core of
     35 **       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
     36 */
     37 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
     38 
     39 #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
     40 # define SQLITE_CORE 1
     41 #endif
     42 
     43 #include <assert.h>
     44 #include <stdlib.h>
     45 #include <stdio.h>
     46 #include <string.h>
     47 #include <ctype.h>
     48 
     49 #include "fts1.h"
     50 #include "fts1_hash.h"
     51 #include "fts1_tokenizer.h"
     52 #include "sqlite3.h"
     53 #include "sqlite3ext.h"
     54 SQLITE_EXTENSION_INIT1
     55 
     56 
     57 #if 0
     58 # define TRACE(A)  printf A; fflush(stdout)
     59 #else
     60 # define TRACE(A)
     61 #endif
     62 
     63 /* utility functions */
     64 
     65 typedef struct StringBuffer {
     66   int len;      /* length, not including null terminator */
     67   int alloced;  /* Space allocated for s[] */
     68   char *s;      /* Content of the string */
     69 } StringBuffer;
     70 
     71 static void initStringBuffer(StringBuffer *sb){
     72   sb->len = 0;
     73   sb->alloced = 100;
     74   sb->s = malloc(100);
     75   sb->s[0] = '\0';
     76 }
     77 
     78 static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
     79   if( sb->len + nFrom >= sb->alloced ){
     80     sb->alloced = sb->len + nFrom + 100;
     81     sb->s = realloc(sb->s, sb->alloced+1);
     82     if( sb->s==0 ){
     83       initStringBuffer(sb);
     84       return;
     85     }
     86   }
     87   memcpy(sb->s + sb->len, zFrom, nFrom);
     88   sb->len += nFrom;
     89   sb->s[sb->len] = 0;
     90 }
     91 static void append(StringBuffer *sb, const char *zFrom){
     92   nappend(sb, zFrom, strlen(zFrom));
     93 }
     94 
     95 /* We encode variable-length integers in little-endian order using seven bits
     96  * per byte as follows:
     97 **
     98 ** KEY:
     99 **         A = 0xxxxxxx    7 bits of data and one flag bit
    100 **         B = 1xxxxxxx    7 bits of data and one flag bit
    101 **
    102 **  7 bits - A
    103 ** 14 bits - BA
    104 ** 21 bits - BBA
    105 ** and so on.
    106 */
    107 
    108 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
    109 #define VARINT_MAX 10
    110 
    111 /* Write a 64-bit variable-length integer to memory starting at p[0].
    112  * The length of data written will be between 1 and VARINT_MAX bytes.
    113  * The number of bytes written is returned. */
    114 static int putVarint(char *p, sqlite_int64 v){
    115   unsigned char *q = (unsigned char *) p;
    116   sqlite_uint64 vu = v;
    117   do{
    118     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    119     vu >>= 7;
    120   }while( vu!=0 );
    121   q[-1] &= 0x7f;  /* turn off high bit in final byte */
    122   assert( q - (unsigned char *)p <= VARINT_MAX );
    123   return (int) (q - (unsigned char *)p);
    124 }
    125 
    126 /* Read a 64-bit variable-length integer from memory starting at p[0].
    127  * Return the number of bytes read, or 0 on error.
    128  * The value is stored in *v. */
    129 static int getVarint(const char *p, sqlite_int64 *v){
    130   const unsigned char *q = (const unsigned char *) p;
    131   sqlite_uint64 x = 0, y = 1;
    132   while( (*q & 0x80) == 0x80 ){
    133     x += y * (*q++ & 0x7f);
    134     y <<= 7;
    135     if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
    136       assert( 0 );
    137       return 0;
    138     }
    139   }
    140   x += y * (*q++);
    141   *v = (sqlite_int64) x;
    142   return (int) (q - (unsigned char *)p);
    143 }
    144 
    145 static int getVarint32(const char *p, int *pi){
    146  sqlite_int64 i;
    147  int ret = getVarint(p, &i);
    148  *pi = (int) i;
    149  assert( *pi==i );
    150  return ret;
    151 }
    152 
    153 /*** Document lists ***
    154  *
    155  * A document list holds a sorted list of varint-encoded document IDs.
    156  *
    157  * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
    158  *
    159  * array {
    160  *   varint docid;
    161  *   array {
    162  *     varint position;     (delta from previous position plus POS_BASE)
    163  *     varint startOffset;  (delta from previous startOffset)
    164  *     varint endOffset;    (delta from startOffset)
    165  *   }
    166  * }
    167  *
    168  * Here, array { X } means zero or more occurrences of X, adjacent in memory.
    169  *
    170  * A position list may hold positions for text in multiple columns.  A position
    171  * POS_COLUMN is followed by a varint containing the index of the column for
    172  * following positions in the list.  Any positions appearing before any
    173  * occurrences of POS_COLUMN are for column 0.
    174  *
    175  * A doclist with type DL_POSITIONS is like the above, but holds only docids
    176  * and positions without offset information.
    177  *
    178  * A doclist with type DL_DOCIDS is like the above, but holds only docids
    179  * without positions or offset information.
    180  *
    181  * On disk, every document list has positions and offsets, so we don't bother
    182  * to serialize a doclist's type.
    183  *
    184  * We don't yet delta-encode document IDs; doing so will probably be a
    185  * modest win.
    186  *
    187  * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
    188  * After the first offset, estimate the next offset by using the
    189  * current token position and the previous token position and offset,
    190  * offset to handle some variance.  So the estimate would be
    191  * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
    192  * as normal.  Offsets more than 64 chars from the estimate are
    193  * encoded as the delta to the previous start offset + 128.  An
    194  * additional tiny increment can be gained by using the end offset of
    195  * the previous token to make the estimate a tiny bit more precise.
    196 */
    197 
    198 /* It is not safe to call isspace(), tolower(), or isalnum() on
    199 ** hi-bit-set characters.  This is the same solution used in the
    200 ** tokenizer.
    201 */
    202 /* TODO(shess) The snippet-generation code should be using the
    203 ** tokenizer-generated tokens rather than doing its own local
    204 ** tokenization.
    205 */
    206 /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
    207 static int safe_isspace(char c){
    208   return (c&0x80)==0 ? isspace(c) : 0;
    209 }
    210 static int safe_tolower(char c){
    211   return (c&0x80)==0 ? tolower(c) : c;
    212 }
    213 static int safe_isalnum(char c){
    214   return (c&0x80)==0 ? isalnum(c) : 0;
    215 }
    216 
    217 typedef enum DocListType {
    218   DL_DOCIDS,              /* docids only */
    219   DL_POSITIONS,           /* docids + positions */
    220   DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
    221 } DocListType;
    222 
    223 /*
    224 ** By default, only positions and not offsets are stored in the doclists.
    225 ** To change this so that offsets are stored too, compile with
    226 **
    227 **          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
    228 **
    229 */
    230 #ifndef DL_DEFAULT
    231 # define DL_DEFAULT DL_POSITIONS
    232 #endif
    233 
    234 typedef struct DocList {
    235   char *pData;
    236   int nData;
    237   DocListType iType;
    238   int iLastColumn;    /* the last column written */
    239   int iLastPos;       /* the last position written */
    240   int iLastOffset;    /* the last start offset written */
    241 } DocList;
    242 
    243 enum {
    244   POS_END = 0,        /* end of this position list */
    245   POS_COLUMN,         /* followed by new column number */
    246   POS_BASE
    247 };
    248 
    249 /* Initialize a new DocList to hold the given data. */
    250 static void docListInit(DocList *d, DocListType iType,
    251                         const char *pData, int nData){
    252   d->nData = nData;
    253   if( nData>0 ){
    254     d->pData = malloc(nData);
    255     memcpy(d->pData, pData, nData);
    256   } else {
    257     d->pData = NULL;
    258   }
    259   d->iType = iType;
    260   d->iLastColumn = 0;
    261   d->iLastPos = d->iLastOffset = 0;
    262 }
    263 
    264 /* Create a new dynamically-allocated DocList. */
    265 static DocList *docListNew(DocListType iType){
    266   DocList *d = (DocList *) malloc(sizeof(DocList));
    267   docListInit(d, iType, 0, 0);
    268   return d;
    269 }
    270 
    271 static void docListDestroy(DocList *d){
    272   free(d->pData);
    273 #ifndef NDEBUG
    274   memset(d, 0x55, sizeof(*d));
    275 #endif
    276 }
    277 
    278 static void docListDelete(DocList *d){
    279   docListDestroy(d);
    280   free(d);
    281 }
    282 
    283 static char *docListEnd(DocList *d){
    284   return d->pData + d->nData;
    285 }
    286 
    287 /* Append a varint to a DocList's data. */
    288 static void appendVarint(DocList *d, sqlite_int64 i){
    289   char c[VARINT_MAX];
    290   int n = putVarint(c, i);
    291   d->pData = realloc(d->pData, d->nData + n);
    292   memcpy(d->pData + d->nData, c, n);
    293   d->nData += n;
    294 }
    295 
    296 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
    297   appendVarint(d, iDocid);
    298   if( d->iType>=DL_POSITIONS ){
    299     appendVarint(d, POS_END);  /* initially empty position list */
    300     d->iLastColumn = 0;
    301     d->iLastPos = d->iLastOffset = 0;
    302   }
    303 }
    304 
    305 /* helper function for docListAddPos and docListAddPosOffset */
    306 static void addPos(DocList *d, int iColumn, int iPos){
    307   assert( d->nData>0 );
    308   --d->nData;  /* remove previous terminator */
    309   if( iColumn!=d->iLastColumn ){
    310     assert( iColumn>d->iLastColumn );
    311     appendVarint(d, POS_COLUMN);
    312     appendVarint(d, iColumn);
    313     d->iLastColumn = iColumn;
    314     d->iLastPos = d->iLastOffset = 0;
    315   }
    316   assert( iPos>=d->iLastPos );
    317   appendVarint(d, iPos-d->iLastPos+POS_BASE);
    318   d->iLastPos = iPos;
    319 }
    320 
    321 /* Add a position to the last position list in a doclist. */
    322 static void docListAddPos(DocList *d, int iColumn, int iPos){
    323   assert( d->iType==DL_POSITIONS );
    324   addPos(d, iColumn, iPos);
    325   appendVarint(d, POS_END);  /* add new terminator */
    326 }
    327 
    328 /*
    329 ** Add a position and starting and ending offsets to a doclist.
    330 **
    331 ** If the doclist is setup to handle only positions, then insert
    332 ** the position only and ignore the offsets.
    333 */
    334 static void docListAddPosOffset(
    335   DocList *d,             /* Doclist under construction */
    336   int iColumn,            /* Column the inserted term is part of */
    337   int iPos,               /* Position of the inserted term */
    338   int iStartOffset,       /* Starting offset of inserted term */
    339   int iEndOffset          /* Ending offset of inserted term */
    340 ){
    341   assert( d->iType>=DL_POSITIONS );
    342   addPos(d, iColumn, iPos);
    343   if( d->iType==DL_POSITIONS_OFFSETS ){
    344     assert( iStartOffset>=d->iLastOffset );
    345     appendVarint(d, iStartOffset-d->iLastOffset);
    346     d->iLastOffset = iStartOffset;
    347     assert( iEndOffset>=iStartOffset );
    348     appendVarint(d, iEndOffset-iStartOffset);
    349   }
    350   appendVarint(d, POS_END);  /* add new terminator */
    351 }
    352 
    353 /*
    354 ** A DocListReader object is a cursor into a doclist.  Initialize
    355 ** the cursor to the beginning of the doclist by calling readerInit().
    356 ** Then use routines
    357 **
    358 **      peekDocid()
    359 **      readDocid()
    360 **      readPosition()
    361 **      skipPositionList()
    362 **      and so forth...
    363 **
    364 ** to read information out of the doclist.  When we reach the end
    365 ** of the doclist, atEnd() returns TRUE.
    366 */
    367 typedef struct DocListReader {
    368   DocList *pDoclist;  /* The document list we are stepping through */
    369   char *p;            /* Pointer to next unread byte in the doclist */
    370   int iLastColumn;
    371   int iLastPos;  /* the last position read, or -1 when not in a position list */
    372 } DocListReader;
    373 
    374 /*
    375 ** Initialize the DocListReader r to point to the beginning of pDoclist.
    376 */
    377 static void readerInit(DocListReader *r, DocList *pDoclist){
    378   r->pDoclist = pDoclist;
    379   if( pDoclist!=NULL ){
    380     r->p = pDoclist->pData;
    381   }
    382   r->iLastColumn = -1;
    383   r->iLastPos = -1;
    384 }
    385 
    386 /*
    387 ** Return TRUE if we have reached then end of pReader and there is
    388 ** nothing else left to read.
    389 */
    390 static int atEnd(DocListReader *pReader){
    391   return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
    392 }
    393 
    394 /* Peek at the next docid without advancing the read pointer.
    395 */
    396 static sqlite_int64 peekDocid(DocListReader *pReader){
    397   sqlite_int64 ret;
    398   assert( !atEnd(pReader) );
    399   assert( pReader->iLastPos==-1 );
    400   getVarint(pReader->p, &ret);
    401   return ret;
    402 }
    403 
    404 /* Read the next docid.   See also nextDocid().
    405 */
    406 static sqlite_int64 readDocid(DocListReader *pReader){
    407   sqlite_int64 ret;
    408   assert( !atEnd(pReader) );
    409   assert( pReader->iLastPos==-1 );
    410   pReader->p += getVarint(pReader->p, &ret);
    411   if( pReader->pDoclist->iType>=DL_POSITIONS ){
    412     pReader->iLastColumn = 0;
    413     pReader->iLastPos = 0;
    414   }
    415   return ret;
    416 }
    417 
    418 /* Read the next position and column index from a position list.
    419  * Returns the position, or -1 at the end of the list. */
    420 static int readPosition(DocListReader *pReader, int *iColumn){
    421   int i;
    422   int iType = pReader->pDoclist->iType;
    423 
    424   if( pReader->iLastPos==-1 ){
    425     return -1;
    426   }
    427   assert( !atEnd(pReader) );
    428 
    429   if( iType<DL_POSITIONS ){
    430     return -1;
    431   }
    432   pReader->p += getVarint32(pReader->p, &i);
    433   if( i==POS_END ){
    434     pReader->iLastColumn = pReader->iLastPos = -1;
    435     *iColumn = -1;
    436     return -1;
    437   }
    438   if( i==POS_COLUMN ){
    439     pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
    440     pReader->iLastPos = 0;
    441     pReader->p += getVarint32(pReader->p, &i);
    442     assert( i>=POS_BASE );
    443   }
    444   pReader->iLastPos += ((int) i)-POS_BASE;
    445   if( iType>=DL_POSITIONS_OFFSETS ){
    446     /* Skip over offsets, ignoring them for now. */
    447     int iStart, iEnd;
    448     pReader->p += getVarint32(pReader->p, &iStart);
    449     pReader->p += getVarint32(pReader->p, &iEnd);
    450   }
    451   *iColumn = pReader->iLastColumn;
    452   return pReader->iLastPos;
    453 }
    454 
    455 /* Skip past the end of a position list. */
    456 static void skipPositionList(DocListReader *pReader){
    457   DocList *p = pReader->pDoclist;
    458   if( p && p->iType>=DL_POSITIONS ){
    459     int iColumn;
    460     while( readPosition(pReader, &iColumn)!=-1 ){}
    461   }
    462 }
    463 
    464 /* Skip over a docid, including its position list if the doclist has
    465  * positions. */
    466 static void skipDocument(DocListReader *pReader){
    467   readDocid(pReader);
    468   skipPositionList(pReader);
    469 }
    470 
    471 /* Skip past all docids which are less than [iDocid].  Returns 1 if a docid
    472  * matching [iDocid] was found.  */
    473 static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
    474   sqlite_int64 d = 0;
    475   while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
    476     skipDocument(pReader);
    477   }
    478   return !atEnd(pReader) && d==iDocid;
    479 }
    480 
    481 /* Return the first document in a document list.
    482 */
    483 static sqlite_int64 firstDocid(DocList *d){
    484   DocListReader r;
    485   readerInit(&r, d);
    486   return readDocid(&r);
    487 }
    488 
    489 #ifdef SQLITE_DEBUG
    490 /*
    491 ** This routine is used for debugging purpose only.
    492 **
    493 ** Write the content of a doclist to standard output.
    494 */
    495 static void printDoclist(DocList *p){
    496   DocListReader r;
    497   const char *zSep = "";
    498 
    499   readerInit(&r, p);
    500   while( !atEnd(&r) ){
    501     sqlite_int64 docid = readDocid(&r);
    502     if( docid==0 ){
    503       skipPositionList(&r);
    504       continue;
    505     }
    506     printf("%s%lld", zSep, docid);
    507     zSep =  ",";
    508     if( p->iType>=DL_POSITIONS ){
    509       int iPos, iCol;
    510       const char *zDiv = "";
    511       printf("(");
    512       while( (iPos = readPosition(&r, &iCol))>=0 ){
    513         printf("%s%d:%d", zDiv, iCol, iPos);
    514         zDiv = ":";
    515       }
    516       printf(")");
    517     }
    518   }
    519   printf("\n");
    520   fflush(stdout);
    521 }
    522 #endif /* SQLITE_DEBUG */
    523 
    524 /* Trim the given doclist to contain only positions in column
    525  * [iRestrictColumn]. */
    526 static void docListRestrictColumn(DocList *in, int iRestrictColumn){
    527   DocListReader r;
    528   DocList out;
    529 
    530   assert( in->iType>=DL_POSITIONS );
    531   readerInit(&r, in);
    532   docListInit(&out, DL_POSITIONS, NULL, 0);
    533 
    534   while( !atEnd(&r) ){
    535     sqlite_int64 iDocid = readDocid(&r);
    536     int iPos, iColumn;
    537 
    538     docListAddDocid(&out, iDocid);
    539     while( (iPos = readPosition(&r, &iColumn)) != -1 ){
    540       if( iColumn==iRestrictColumn ){
    541         docListAddPos(&out, iColumn, iPos);
    542       }
    543     }
    544   }
    545 
    546   docListDestroy(in);
    547   *in = out;
    548 }
    549 
    550 /* Trim the given doclist by discarding any docids without any remaining
    551  * positions. */
    552 static void docListDiscardEmpty(DocList *in) {
    553   DocListReader r;
    554   DocList out;
    555 
    556   /* TODO: It would be nice to implement this operation in place; that
    557    * could save a significant amount of memory in queries with long doclists. */
    558   assert( in->iType>=DL_POSITIONS );
    559   readerInit(&r, in);
    560   docListInit(&out, DL_POSITIONS, NULL, 0);
    561 
    562   while( !atEnd(&r) ){
    563     sqlite_int64 iDocid = readDocid(&r);
    564     int match = 0;
    565     int iPos, iColumn;
    566     while( (iPos = readPosition(&r, &iColumn)) != -1 ){
    567       if( !match ){
    568         docListAddDocid(&out, iDocid);
    569         match = 1;
    570       }
    571       docListAddPos(&out, iColumn, iPos);
    572     }
    573   }
    574 
    575   docListDestroy(in);
    576   *in = out;
    577 }
    578 
    579 /* Helper function for docListUpdate() and docListAccumulate().
    580 ** Splices a doclist element into the doclist represented by r,
    581 ** leaving r pointing after the newly spliced element.
    582 */
    583 static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
    584                                  const char *pSource, int nSource){
    585   DocList *d = r->pDoclist;
    586   char *pTarget;
    587   int nTarget, found;
    588 
    589   found = skipToDocid(r, iDocid);
    590 
    591   /* Describe slice in d to place pSource/nSource. */
    592   pTarget = r->p;
    593   if( found ){
    594     skipDocument(r);
    595     nTarget = r->p-pTarget;
    596   }else{
    597     nTarget = 0;
    598   }
    599 
    600   /* The sense of the following is that there are three possibilities.
    601   ** If nTarget==nSource, we should not move any memory nor realloc.
    602   ** If nTarget>nSource, trim target and realloc.
    603   ** If nTarget<nSource, realloc then expand target.
    604   */
    605   if( nTarget>nSource ){
    606     memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
    607   }
    608   if( nTarget!=nSource ){
    609     int iDoclist = pTarget-d->pData;
    610     d->pData = realloc(d->pData, d->nData+nSource-nTarget);
    611     pTarget = d->pData+iDoclist;
    612   }
    613   if( nTarget<nSource ){
    614     memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
    615   }
    616 
    617   memcpy(pTarget, pSource, nSource);
    618   d->nData += nSource-nTarget;
    619   r->p = pTarget+nSource;
    620 }
    621 
    622 /* Insert/update pUpdate into the doclist. */
    623 static void docListUpdate(DocList *d, DocList *pUpdate){
    624   DocListReader reader;
    625 
    626   assert( d!=NULL && pUpdate!=NULL );
    627   assert( d->iType==pUpdate->iType);
    628 
    629   readerInit(&reader, d);
    630   docListSpliceElement(&reader, firstDocid(pUpdate),
    631                        pUpdate->pData, pUpdate->nData);
    632 }
    633 
    634 /* Propagate elements from pUpdate to pAcc, overwriting elements with
    635 ** matching docids.
    636 */
    637 static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
    638   DocListReader accReader, updateReader;
    639 
    640   /* Handle edge cases where one doclist is empty. */
    641   assert( pAcc!=NULL );
    642   if( pUpdate==NULL || pUpdate->nData==0 ) return;
    643   if( pAcc->nData==0 ){
    644     pAcc->pData = malloc(pUpdate->nData);
    645     memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
    646     pAcc->nData = pUpdate->nData;
    647     return;
    648   }
    649 
    650   readerInit(&accReader, pAcc);
    651   readerInit(&updateReader, pUpdate);
    652 
    653   while( !atEnd(&updateReader) ){
    654     char *pSource = updateReader.p;
    655     sqlite_int64 iDocid = readDocid(&updateReader);
    656     skipPositionList(&updateReader);
    657     docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
    658   }
    659 }
    660 
    661 /*
    662 ** Read the next docid off of pIn.  Return 0 if we reach the end.
    663 *
    664 * TODO: This assumes that docids are never 0, but they may actually be 0 since
    665 * users can choose docids when inserting into a full-text table.  Fix this.
    666 */
    667 static sqlite_int64 nextDocid(DocListReader *pIn){
    668   skipPositionList(pIn);
    669   return atEnd(pIn) ? 0 : readDocid(pIn);
    670 }
    671 
    672 /*
    673 ** pLeft and pRight are two DocListReaders that are pointing to
    674 ** positions lists of the same document: iDocid.
    675 **
    676 ** If there are no instances in pLeft or pRight where the position
    677 ** of pLeft is one less than the position of pRight, then this
    678 ** routine adds nothing to pOut.
    679 **
    680 ** If there are one or more instances where positions from pLeft
    681 ** are exactly one less than positions from pRight, then add a new
    682 ** document record to pOut.  If pOut wants to hold positions, then
    683 ** include the positions from pRight that are one more than a
    684 ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
    685 **
    686 ** pLeft and pRight are left pointing at the next document record.
    687 */
    688 static void mergePosList(
    689   DocListReader *pLeft,    /* Left position list */
    690   DocListReader *pRight,   /* Right position list */
    691   sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
    692   DocList *pOut            /* Write the merged document record here */
    693 ){
    694   int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
    695   int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
    696   int match = 0;
    697 
    698   /* Loop until we've reached the end of both position lists. */
    699   while( iLeftPos!=-1 && iRightPos!=-1 ){
    700     if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
    701       if( !match ){
    702         docListAddDocid(pOut, iDocid);
    703         match = 1;
    704       }
    705       if( pOut->iType>=DL_POSITIONS ){
    706         docListAddPos(pOut, iRightCol, iRightPos);
    707       }
    708       iLeftPos = readPosition(pLeft, &iLeftCol);
    709       iRightPos = readPosition(pRight, &iRightCol);
    710     }else if( iRightCol<iLeftCol ||
    711               (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
    712       iRightPos = readPosition(pRight, &iRightCol);
    713     }else{
    714       iLeftPos = readPosition(pLeft, &iLeftCol);
    715     }
    716   }
    717   if( iLeftPos>=0 ) skipPositionList(pLeft);
    718   if( iRightPos>=0 ) skipPositionList(pRight);
    719 }
    720 
    721 /* We have two doclists:  pLeft and pRight.
    722 ** Write the phrase intersection of these two doclists into pOut.
    723 **
    724 ** A phrase intersection means that two documents only match
    725 ** if pLeft.iPos+1==pRight.iPos.
    726 **
    727 ** The output pOut may or may not contain positions.  If pOut
    728 ** does contain positions, they are the positions of pRight.
    729 */
    730 static void docListPhraseMerge(
    731   DocList *pLeft,    /* Doclist resulting from the words on the left */
    732   DocList *pRight,   /* Doclist for the next word to the right */
    733   DocList *pOut      /* Write the combined doclist here */
    734 ){
    735   DocListReader left, right;
    736   sqlite_int64 docidLeft, docidRight;
    737 
    738   readerInit(&left, pLeft);
    739   readerInit(&right, pRight);
    740   docidLeft = nextDocid(&left);
    741   docidRight = nextDocid(&right);
    742 
    743   while( docidLeft>0 && docidRight>0 ){
    744     if( docidLeft<docidRight ){
    745       docidLeft = nextDocid(&left);
    746     }else if( docidRight<docidLeft ){
    747       docidRight = nextDocid(&right);
    748     }else{
    749       mergePosList(&left, &right, docidLeft, pOut);
    750       docidLeft = nextDocid(&left);
    751       docidRight = nextDocid(&right);
    752     }
    753   }
    754 }
    755 
    756 /* We have two doclists:  pLeft and pRight.
    757 ** Write the intersection of these two doclists into pOut.
    758 ** Only docids are matched.  Position information is ignored.
    759 **
    760 ** The output pOut never holds positions.
    761 */
    762 static void docListAndMerge(
    763   DocList *pLeft,    /* Doclist resulting from the words on the left */
    764   DocList *pRight,   /* Doclist for the next word to the right */
    765   DocList *pOut      /* Write the combined doclist here */
    766 ){
    767   DocListReader left, right;
    768   sqlite_int64 docidLeft, docidRight;
    769 
    770   assert( pOut->iType<DL_POSITIONS );
    771 
    772   readerInit(&left, pLeft);
    773   readerInit(&right, pRight);
    774   docidLeft = nextDocid(&left);
    775   docidRight = nextDocid(&right);
    776 
    777   while( docidLeft>0 && docidRight>0 ){
    778     if( docidLeft<docidRight ){
    779       docidLeft = nextDocid(&left);
    780     }else if( docidRight<docidLeft ){
    781       docidRight = nextDocid(&right);
    782     }else{
    783       docListAddDocid(pOut, docidLeft);
    784       docidLeft = nextDocid(&left);
    785       docidRight = nextDocid(&right);
    786     }
    787   }
    788 }
    789 
    790 /* We have two doclists:  pLeft and pRight.
    791 ** Write the union of these two doclists into pOut.
    792 ** Only docids are matched.  Position information is ignored.
    793 **
    794 ** The output pOut never holds positions.
    795 */
    796 static void docListOrMerge(
    797   DocList *pLeft,    /* Doclist resulting from the words on the left */
    798   DocList *pRight,   /* Doclist for the next word to the right */
    799   DocList *pOut      /* Write the combined doclist here */
    800 ){
    801   DocListReader left, right;
    802   sqlite_int64 docidLeft, docidRight, priorLeft;
    803 
    804   readerInit(&left, pLeft);
    805   readerInit(&right, pRight);
    806   docidLeft = nextDocid(&left);
    807   docidRight = nextDocid(&right);
    808 
    809   while( docidLeft>0 && docidRight>0 ){
    810     if( docidLeft<=docidRight ){
    811       docListAddDocid(pOut, docidLeft);
    812     }else{
    813       docListAddDocid(pOut, docidRight);
    814     }
    815     priorLeft = docidLeft;
    816     if( docidLeft<=docidRight ){
    817       docidLeft = nextDocid(&left);
    818     }
    819     if( docidRight>0 && docidRight<=priorLeft ){
    820       docidRight = nextDocid(&right);
    821     }
    822   }
    823   while( docidLeft>0 ){
    824     docListAddDocid(pOut, docidLeft);
    825     docidLeft = nextDocid(&left);
    826   }
    827   while( docidRight>0 ){
    828     docListAddDocid(pOut, docidRight);
    829     docidRight = nextDocid(&right);
    830   }
    831 }
    832 
    833 /* We have two doclists:  pLeft and pRight.
    834 ** Write into pOut all documents that occur in pLeft but not
    835 ** in pRight.
    836 **
    837 ** Only docids are matched.  Position information is ignored.
    838 **
    839 ** The output pOut never holds positions.
    840 */
    841 static void docListExceptMerge(
    842   DocList *pLeft,    /* Doclist resulting from the words on the left */
    843   DocList *pRight,   /* Doclist for the next word to the right */
    844   DocList *pOut      /* Write the combined doclist here */
    845 ){
    846   DocListReader left, right;
    847   sqlite_int64 docidLeft, docidRight, priorLeft;
    848 
    849   readerInit(&left, pLeft);
    850   readerInit(&right, pRight);
    851   docidLeft = nextDocid(&left);
    852   docidRight = nextDocid(&right);
    853 
    854   while( docidLeft>0 && docidRight>0 ){
    855     priorLeft = docidLeft;
    856     if( docidLeft<docidRight ){
    857       docListAddDocid(pOut, docidLeft);
    858     }
    859     if( docidLeft<=docidRight ){
    860       docidLeft = nextDocid(&left);
    861     }
    862     if( docidRight>0 && docidRight<=priorLeft ){
    863       docidRight = nextDocid(&right);
    864     }
    865   }
    866   while( docidLeft>0 ){
    867     docListAddDocid(pOut, docidLeft);
    868     docidLeft = nextDocid(&left);
    869   }
    870 }
    871 
    872 static char *string_dup_n(const char *s, int n){
    873   char *str = malloc(n + 1);
    874   memcpy(str, s, n);
    875   str[n] = '\0';
    876   return str;
    877 }
    878 
    879 /* Duplicate a string; the caller must free() the returned string.
    880  * (We don't use strdup() since it is not part of the standard C library and
    881  * may not be available everywhere.) */
    882 static char *string_dup(const char *s){
    883   return string_dup_n(s, strlen(s));
    884 }
    885 
    886 /* Format a string, replacing each occurrence of the % character with
    887  * zDb.zName.  This may be more convenient than sqlite_mprintf()
    888  * when one string is used repeatedly in a format string.
    889  * The caller must free() the returned string. */
    890 static char *string_format(const char *zFormat,
    891                            const char *zDb, const char *zName){
    892   const char *p;
    893   size_t len = 0;
    894   size_t nDb = strlen(zDb);
    895   size_t nName = strlen(zName);
    896   size_t nFullTableName = nDb+1+nName;
    897   char *result;
    898   char *r;
    899 
    900   /* first compute length needed */
    901   for(p = zFormat ; *p ; ++p){
    902     len += (*p=='%' ? nFullTableName : 1);
    903   }
    904   len += 1;  /* for null terminator */
    905 
    906   r = result = malloc(len);
    907   for(p = zFormat; *p; ++p){
    908     if( *p=='%' ){
    909       memcpy(r, zDb, nDb);
    910       r += nDb;
    911       *r++ = '.';
    912       memcpy(r, zName, nName);
    913       r += nName;
    914     } else {
    915       *r++ = *p;
    916     }
    917   }
    918   *r++ = '\0';
    919   assert( r == result + len );
    920   return result;
    921 }
    922 
    923 static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
    924                     const char *zFormat){
    925   char *zCommand = string_format(zFormat, zDb, zName);
    926   int rc;
    927   TRACE(("FTS1 sql: %s\n", zCommand));
    928   rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
    929   free(zCommand);
    930   return rc;
    931 }
    932 
    933 static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
    934                        sqlite3_stmt **ppStmt, const char *zFormat){
    935   char *zCommand = string_format(zFormat, zDb, zName);
    936   int rc;
    937   TRACE(("FTS1 prepare: %s\n", zCommand));
    938   rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
    939   free(zCommand);
    940   return rc;
    941 }
    942 
    943 /* end utility functions */
    944 
    945 /* Forward reference */
    946 typedef struct fulltext_vtab fulltext_vtab;
    947 
    948 /* A single term in a query is represented by an instances of
    949 ** the following structure.
    950 */
    951 typedef struct QueryTerm {
    952   short int nPhrase; /* How many following terms are part of the same phrase */
    953   short int iPhrase; /* This is the i-th term of a phrase. */
    954   short int iColumn; /* Column of the index that must match this term */
    955   signed char isOr;  /* this term is preceded by "OR" */
    956   signed char isNot; /* this term is preceded by "-" */
    957   char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
    958   int nTerm;         /* Number of bytes in pTerm[] */
    959 } QueryTerm;
    960 
    961 
    962 /* A query string is parsed into a Query structure.
    963  *
    964  * We could, in theory, allow query strings to be complicated
    965  * nested expressions with precedence determined by parentheses.
    966  * But none of the major search engines do this.  (Perhaps the
    967  * feeling is that an parenthesized expression is two complex of
    968  * an idea for the average user to grasp.)  Taking our lead from
    969  * the major search engines, we will allow queries to be a list
    970  * of terms (with an implied AND operator) or phrases in double-quotes,
    971  * with a single optional "-" before each non-phrase term to designate
    972  * negation and an optional OR connector.
    973  *
    974  * OR binds more tightly than the implied AND, which is what the
    975  * major search engines seem to do.  So, for example:
    976  *
    977  *    [one two OR three]     ==>    one AND (two OR three)
    978  *    [one OR two three]     ==>    (one OR two) AND three
    979  *
    980  * A "-" before a term matches all entries that lack that term.
    981  * The "-" must occur immediately before the term with in intervening
    982  * space.  This is how the search engines do it.
    983  *
    984  * A NOT term cannot be the right-hand operand of an OR.  If this
    985  * occurs in the query string, the NOT is ignored:
    986  *
    987  *    [one OR -two]          ==>    one OR two
    988  *
    989  */
    990 typedef struct Query {
    991   fulltext_vtab *pFts;  /* The full text index */
    992   int nTerms;           /* Number of terms in the query */
    993   QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
    994   int nextIsOr;         /* Set the isOr flag on the next inserted term */
    995   int nextColumn;       /* Next word parsed must be in this column */
    996   int dfltColumn;       /* The default column */
    997 } Query;
    998 
    999 
   1000 /*
   1001 ** An instance of the following structure keeps track of generated
   1002 ** matching-word offset information and snippets.
   1003 */
   1004 typedef struct Snippet {
   1005   int nMatch;     /* Total number of matches */
   1006   int nAlloc;     /* Space allocated for aMatch[] */
   1007   struct snippetMatch { /* One entry for each matching term */
   1008     char snStatus;       /* Status flag for use while constructing snippets */
   1009     short int iCol;      /* The column that contains the match */
   1010     short int iTerm;     /* The index in Query.pTerms[] of the matching term */
   1011     short int nByte;     /* Number of bytes in the term */
   1012     int iStart;          /* The offset to the first character of the term */
   1013   } *aMatch;      /* Points to space obtained from malloc */
   1014   char *zOffset;  /* Text rendering of aMatch[] */
   1015   int nOffset;    /* strlen(zOffset) */
   1016   char *zSnippet; /* Snippet text */
   1017   int nSnippet;   /* strlen(zSnippet) */
   1018 } Snippet;
   1019 
   1020 
   1021 typedef enum QueryType {
   1022   QUERY_GENERIC,   /* table scan */
   1023   QUERY_ROWID,     /* lookup by rowid */
   1024   QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
   1025 } QueryType;
   1026 
   1027 /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
   1028 ** before we start aggregating into larger segments.  Lower CHUNK_MAX
   1029 ** means that for a given input we have more individual segments per
   1030 ** term, which means more rows in the table and a bigger index (due to
   1031 ** both more rows and bigger rowids).  But it also reduces the average
   1032 ** cost of adding new elements to the segment 0 doclist, and it seems
   1033 ** to reduce the number of pages read and written during inserts.  256
   1034 ** was chosen by measuring insertion times for a certain input (first
   1035 ** 10k documents of Enron corpus), though including query performance
   1036 ** in the decision may argue for a larger value.
   1037 */
   1038 #define CHUNK_MAX 256
   1039 
   1040 typedef enum fulltext_statement {
   1041   CONTENT_INSERT_STMT,
   1042   CONTENT_SELECT_STMT,
   1043   CONTENT_UPDATE_STMT,
   1044   CONTENT_DELETE_STMT,
   1045 
   1046   TERM_SELECT_STMT,
   1047   TERM_SELECT_ALL_STMT,
   1048   TERM_INSERT_STMT,
   1049   TERM_UPDATE_STMT,
   1050   TERM_DELETE_STMT,
   1051 
   1052   MAX_STMT                     /* Always at end! */
   1053 } fulltext_statement;
   1054 
   1055 /* These must exactly match the enum above. */
   1056 /* TODO(adam): Is there some risk that a statement (in particular,
   1057 ** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
   1058 ** query joins a virtual table to itself?  If so perhaps we should
   1059 ** move some of these to the cursor object.
   1060 */
   1061 static const char *const fulltext_zStatement[MAX_STMT] = {
   1062   /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
   1063   /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
   1064   /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
   1065   /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
   1066 
   1067   /* TERM_SELECT */
   1068   "select rowid, doclist from %_term where term = ? and segment = ?",
   1069   /* TERM_SELECT_ALL */
   1070   "select doclist from %_term where term = ? order by segment",
   1071   /* TERM_INSERT */
   1072   "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
   1073   /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
   1074   /* TERM_DELETE */ "delete from %_term where rowid = ?",
   1075 };
   1076 
   1077 /*
   1078 ** A connection to a fulltext index is an instance of the following
   1079 ** structure.  The xCreate and xConnect methods create an instance
   1080 ** of this structure and xDestroy and xDisconnect free that instance.
   1081 ** All other methods receive a pointer to the structure as one of their
   1082 ** arguments.
   1083 */
   1084 struct fulltext_vtab {
   1085   sqlite3_vtab base;               /* Base class used by SQLite core */
   1086   sqlite3 *db;                     /* The database connection */
   1087   const char *zDb;                 /* logical database name */
   1088   const char *zName;               /* virtual table name */
   1089   int nColumn;                     /* number of columns in virtual table */
   1090   char **azColumn;                 /* column names.  malloced */
   1091   char **azContentColumn;          /* column names in content table; malloced */
   1092   sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
   1093 
   1094   /* Precompiled statements which we keep as long as the table is
   1095   ** open.
   1096   */
   1097   sqlite3_stmt *pFulltextStatements[MAX_STMT];
   1098 };
   1099 
   1100 /*
   1101 ** When the core wants to do a query, it create a cursor using a
   1102 ** call to xOpen.  This structure is an instance of a cursor.  It
   1103 ** is destroyed by xClose.
   1104 */
   1105 typedef struct fulltext_cursor {
   1106   sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
   1107   QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
   1108   sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
   1109   int eof;                         /* True if at End Of Results */
   1110   Query q;                         /* Parsed query string */
   1111   Snippet snippet;                 /* Cached snippet for the current row */
   1112   int iColumn;                     /* Column being searched */
   1113   DocListReader result;  /* used when iCursorType == QUERY_FULLTEXT */
   1114 } fulltext_cursor;
   1115 
   1116 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
   1117   return (fulltext_vtab *) c->base.pVtab;
   1118 }
   1119 
   1120 static const sqlite3_module fulltextModule;   /* forward declaration */
   1121 
   1122 /* Append a list of strings separated by commas to a StringBuffer. */
   1123 static void appendList(StringBuffer *sb, int nString, char **azString){
   1124   int i;
   1125   for(i=0; i<nString; ++i){
   1126     if( i>0 ) append(sb, ", ");
   1127     append(sb, azString[i]);
   1128   }
   1129 }
   1130 
   1131 /* Return a dynamically generated statement of the form
   1132  *   insert into %_content (rowid, ...) values (?, ...)
   1133  */
   1134 static const char *contentInsertStatement(fulltext_vtab *v){
   1135   StringBuffer sb;
   1136   int i;
   1137 
   1138   initStringBuffer(&sb);
   1139   append(&sb, "insert into %_content (rowid, ");
   1140   appendList(&sb, v->nColumn, v->azContentColumn);
   1141   append(&sb, ") values (?");
   1142   for(i=0; i<v->nColumn; ++i)
   1143     append(&sb, ", ?");
   1144   append(&sb, ")");
   1145   return sb.s;
   1146 }
   1147 
   1148 /* Return a dynamically generated statement of the form
   1149  *   update %_content set [col_0] = ?, [col_1] = ?, ...
   1150  *                    where rowid = ?
   1151  */
   1152 static const char *contentUpdateStatement(fulltext_vtab *v){
   1153   StringBuffer sb;
   1154   int i;
   1155 
   1156   initStringBuffer(&sb);
   1157   append(&sb, "update %_content set ");
   1158   for(i=0; i<v->nColumn; ++i) {
   1159     if( i>0 ){
   1160       append(&sb, ", ");
   1161     }
   1162     append(&sb, v->azContentColumn[i]);
   1163     append(&sb, " = ?");
   1164   }
   1165   append(&sb, " where rowid = ?");
   1166   return sb.s;
   1167 }
   1168 
   1169 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
   1170 ** If the indicated statement has never been prepared, it is prepared
   1171 ** and cached, otherwise the cached version is reset.
   1172 */
   1173 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
   1174                              sqlite3_stmt **ppStmt){
   1175   assert( iStmt<MAX_STMT );
   1176   if( v->pFulltextStatements[iStmt]==NULL ){
   1177     const char *zStmt;
   1178     int rc;
   1179     switch( iStmt ){
   1180       case CONTENT_INSERT_STMT:
   1181         zStmt = contentInsertStatement(v); break;
   1182       case CONTENT_UPDATE_STMT:
   1183         zStmt = contentUpdateStatement(v); break;
   1184       default:
   1185         zStmt = fulltext_zStatement[iStmt];
   1186     }
   1187     rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
   1188                          zStmt);
   1189     if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
   1190     if( rc!=SQLITE_OK ) return rc;
   1191   } else {
   1192     int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
   1193     if( rc!=SQLITE_OK ) return rc;
   1194   }
   1195 
   1196   *ppStmt = v->pFulltextStatements[iStmt];
   1197   return SQLITE_OK;
   1198 }
   1199 
   1200 /* Step the indicated statement, handling errors SQLITE_BUSY (by
   1201 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
   1202 ** bindings to the new statement).
   1203 ** TODO(adam): We should extend this function so that it can work with
   1204 ** statements declared locally, not only globally cached statements.
   1205 */
   1206 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
   1207                               sqlite3_stmt **ppStmt){
   1208   int rc;
   1209   sqlite3_stmt *s = *ppStmt;
   1210   assert( iStmt<MAX_STMT );
   1211   assert( s==v->pFulltextStatements[iStmt] );
   1212 
   1213   while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
   1214     if( rc==SQLITE_BUSY ) continue;
   1215     if( rc!=SQLITE_ERROR ) return rc;
   1216 
   1217     /* If an SQLITE_SCHEMA error has occurred, then finalizing this
   1218      * statement is going to delete the fulltext_vtab structure. If
   1219      * the statement just executed is in the pFulltextStatements[]
   1220      * array, it will be finalized twice. So remove it before
   1221      * calling sqlite3_finalize().
   1222      */
   1223     v->pFulltextStatements[iStmt] = NULL;
   1224     rc = sqlite3_finalize(s);
   1225     break;
   1226   }
   1227   return rc;
   1228 
   1229  err:
   1230   sqlite3_finalize(s);
   1231   return rc;
   1232 }
   1233 
   1234 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
   1235 ** Useful for statements like UPDATE, where we expect no results.
   1236 */
   1237 static int sql_single_step_statement(fulltext_vtab *v,
   1238                                      fulltext_statement iStmt,
   1239                                      sqlite3_stmt **ppStmt){
   1240   int rc = sql_step_statement(v, iStmt, ppStmt);
   1241   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
   1242 }
   1243 
   1244 /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
   1245 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
   1246                           sqlite3_value **pValues){
   1247   sqlite3_stmt *s;
   1248   int i;
   1249   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
   1250   if( rc!=SQLITE_OK ) return rc;
   1251 
   1252   rc = sqlite3_bind_value(s, 1, rowid);
   1253   if( rc!=SQLITE_OK ) return rc;
   1254 
   1255   for(i=0; i<v->nColumn; ++i){
   1256     rc = sqlite3_bind_value(s, 2+i, pValues[i]);
   1257     if( rc!=SQLITE_OK ) return rc;
   1258   }
   1259 
   1260   return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
   1261 }
   1262 
   1263 /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
   1264  *                  where rowid = [iRowid] */
   1265 static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
   1266                           sqlite_int64 iRowid){
   1267   sqlite3_stmt *s;
   1268   int i;
   1269   int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
   1270   if( rc!=SQLITE_OK ) return rc;
   1271 
   1272   for(i=0; i<v->nColumn; ++i){
   1273     rc = sqlite3_bind_value(s, 1+i, pValues[i]);
   1274     if( rc!=SQLITE_OK ) return rc;
   1275   }
   1276 
   1277   rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
   1278   if( rc!=SQLITE_OK ) return rc;
   1279 
   1280   return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
   1281 }
   1282 
   1283 static void freeStringArray(int nString, const char **pString){
   1284   int i;
   1285 
   1286   for (i=0 ; i < nString ; ++i) {
   1287     if( pString[i]!=NULL ) free((void *) pString[i]);
   1288   }
   1289   free((void *) pString);
   1290 }
   1291 
   1292 /* select * from %_content where rowid = [iRow]
   1293  * The caller must delete the returned array and all strings in it.
   1294  * null fields will be NULL in the returned array.
   1295  *
   1296  * TODO: Perhaps we should return pointer/length strings here for consistency
   1297  * with other code which uses pointer/length. */
   1298 static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
   1299                           const char ***pValues){
   1300   sqlite3_stmt *s;
   1301   const char **values;
   1302   int i;
   1303   int rc;
   1304 
   1305   *pValues = NULL;
   1306 
   1307   rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
   1308   if( rc!=SQLITE_OK ) return rc;
   1309 
   1310   rc = sqlite3_bind_int64(s, 1, iRow);
   1311   if( rc!=SQLITE_OK ) return rc;
   1312 
   1313   rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
   1314   if( rc!=SQLITE_ROW ) return rc;
   1315 
   1316   values = (const char **) malloc(v->nColumn * sizeof(const char *));
   1317   for(i=0; i<v->nColumn; ++i){
   1318     if( sqlite3_column_type(s, i)==SQLITE_NULL ){
   1319       values[i] = NULL;
   1320     }else{
   1321       values[i] = string_dup((char*)sqlite3_column_text(s, i));
   1322     }
   1323   }
   1324 
   1325   /* We expect only one row.  We must execute another sqlite3_step()
   1326    * to complete the iteration; otherwise the table will remain locked. */
   1327   rc = sqlite3_step(s);
   1328   if( rc==SQLITE_DONE ){
   1329     *pValues = values;
   1330     return SQLITE_OK;
   1331   }
   1332 
   1333   freeStringArray(v->nColumn, values);
   1334   return rc;
   1335 }
   1336 
   1337 /* delete from %_content where rowid = [iRow ] */
   1338 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
   1339   sqlite3_stmt *s;
   1340   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
   1341   if( rc!=SQLITE_OK ) return rc;
   1342 
   1343   rc = sqlite3_bind_int64(s, 1, iRow);
   1344   if( rc!=SQLITE_OK ) return rc;
   1345 
   1346   return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
   1347 }
   1348 
   1349 /* select rowid, doclist from %_term
   1350  *  where term = [pTerm] and segment = [iSegment]
   1351  * If found, returns SQLITE_ROW; the caller must free the
   1352  * returned doclist.  If no rows found, returns SQLITE_DONE. */
   1353 static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
   1354                        int iSegment,
   1355                        sqlite_int64 *rowid, DocList *out){
   1356   sqlite3_stmt *s;
   1357   int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
   1358   if( rc!=SQLITE_OK ) return rc;
   1359 
   1360   rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
   1361   if( rc!=SQLITE_OK ) return rc;
   1362 
   1363   rc = sqlite3_bind_int(s, 2, iSegment);
   1364   if( rc!=SQLITE_OK ) return rc;
   1365 
   1366   rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
   1367   if( rc!=SQLITE_ROW ) return rc;
   1368 
   1369   *rowid = sqlite3_column_int64(s, 0);
   1370   docListInit(out, DL_DEFAULT,
   1371               sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
   1372 
   1373   /* We expect only one row.  We must execute another sqlite3_step()
   1374    * to complete the iteration; otherwise the table will remain locked. */
   1375   rc = sqlite3_step(s);
   1376   return rc==SQLITE_DONE ? SQLITE_ROW : rc;
   1377 }
   1378 
   1379 /* Load the segment doclists for term pTerm and merge them in
   1380 ** appropriate order into out.  Returns SQLITE_OK if successful.  If
   1381 ** there are no segments for pTerm, successfully returns an empty
   1382 ** doclist in out.
   1383 **
   1384 ** Each document consists of 1 or more "columns".  The number of
   1385 ** columns is v->nColumn.  If iColumn==v->nColumn, then return
   1386 ** position information about all columns.  If iColumn<v->nColumn,
   1387 ** then only return position information about the iColumn-th column
   1388 ** (where the first column is 0).
   1389 */
   1390 static int term_select_all(
   1391   fulltext_vtab *v,     /* The fulltext index we are querying against */
   1392   int iColumn,          /* If <nColumn, only look at the iColumn-th column */
   1393   const char *pTerm,    /* The term whose posting lists we want */
   1394   int nTerm,            /* Number of bytes in pTerm */
   1395   DocList *out          /* Write the resulting doclist here */
   1396 ){
   1397   DocList doclist;
   1398   sqlite3_stmt *s;
   1399   int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
   1400   if( rc!=SQLITE_OK ) return rc;
   1401 
   1402   rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
   1403   if( rc!=SQLITE_OK ) return rc;
   1404 
   1405   docListInit(&doclist, DL_DEFAULT, 0, 0);
   1406 
   1407   /* TODO(shess) Handle schema and busy errors. */
   1408   while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
   1409     DocList old;
   1410 
   1411     /* TODO(shess) If we processed doclists from oldest to newest, we
   1412     ** could skip the malloc() involved with the following call.  For
   1413     ** now, I'd rather keep this logic similar to index_insert_term().
   1414     ** We could additionally drop elements when we see deletes, but
   1415     ** that would require a distinct version of docListAccumulate().
   1416     */
   1417     docListInit(&old, DL_DEFAULT,
   1418                 sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
   1419 
   1420     if( iColumn<v->nColumn ){   /* querying a single column */
   1421       docListRestrictColumn(&old, iColumn);
   1422     }
   1423 
   1424     /* doclist contains the newer data, so write it over old.  Then
   1425     ** steal accumulated result for doclist.
   1426     */
   1427     docListAccumulate(&old, &doclist);
   1428     docListDestroy(&doclist);
   1429     doclist = old;
   1430   }
   1431   if( rc!=SQLITE_DONE ){
   1432     docListDestroy(&doclist);
   1433     return rc;
   1434   }
   1435 
   1436   docListDiscardEmpty(&doclist);
   1437   *out = doclist;
   1438   return SQLITE_OK;
   1439 }
   1440 
   1441 /* insert into %_term (rowid, term, segment, doclist)
   1442                values ([piRowid], [pTerm], [iSegment], [doclist])
   1443 ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
   1444 **
   1445 ** NOTE(shess) piRowid is IN, with values of "space of int64" plus
   1446 ** null, it is not used to pass data back to the caller.
   1447 */
   1448 static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
   1449                        const char *pTerm, int nTerm,
   1450                        int iSegment, DocList *doclist){
   1451   sqlite3_stmt *s;
   1452   int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
   1453   if( rc!=SQLITE_OK ) return rc;
   1454 
   1455   if( piRowid==NULL ){
   1456     rc = sqlite3_bind_null(s, 1);
   1457   }else{
   1458     rc = sqlite3_bind_int64(s, 1, *piRowid);
   1459   }
   1460   if( rc!=SQLITE_OK ) return rc;
   1461 
   1462   rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
   1463   if( rc!=SQLITE_OK ) return rc;
   1464 
   1465   rc = sqlite3_bind_int(s, 3, iSegment);
   1466   if( rc!=SQLITE_OK ) return rc;
   1467 
   1468   rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
   1469   if( rc!=SQLITE_OK ) return rc;
   1470 
   1471   return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
   1472 }
   1473 
   1474 /* update %_term set doclist = [doclist] where rowid = [rowid] */
   1475 static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
   1476                        DocList *doclist){
   1477   sqlite3_stmt *s;
   1478   int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
   1479   if( rc!=SQLITE_OK ) return rc;
   1480 
   1481   rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
   1482   if( rc!=SQLITE_OK ) return rc;
   1483 
   1484   rc = sqlite3_bind_int64(s, 2, rowid);
   1485   if( rc!=SQLITE_OK ) return rc;
   1486 
   1487   return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
   1488 }
   1489 
   1490 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
   1491   sqlite3_stmt *s;
   1492   int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
   1493   if( rc!=SQLITE_OK ) return rc;
   1494 
   1495   rc = sqlite3_bind_int64(s, 1, rowid);
   1496   if( rc!=SQLITE_OK ) return rc;
   1497 
   1498   return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
   1499 }
   1500 
   1501 /*
   1502 ** Free the memory used to contain a fulltext_vtab structure.
   1503 */
   1504 static void fulltext_vtab_destroy(fulltext_vtab *v){
   1505   int iStmt, i;
   1506 
   1507   TRACE(("FTS1 Destroy %p\n", v));
   1508   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
   1509     if( v->pFulltextStatements[iStmt]!=NULL ){
   1510       sqlite3_finalize(v->pFulltextStatements[iStmt]);
   1511       v->pFulltextStatements[iStmt] = NULL;
   1512     }
   1513   }
   1514 
   1515   if( v->pTokenizer!=NULL ){
   1516     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
   1517     v->pTokenizer = NULL;
   1518   }
   1519 
   1520   free(v->azColumn);
   1521   for(i = 0; i < v->nColumn; ++i) {
   1522     sqlite3_free(v->azContentColumn[i]);
   1523   }
   1524   free(v->azContentColumn);
   1525   free(v);
   1526 }
   1527 
   1528 /*
   1529 ** Token types for parsing the arguments to xConnect or xCreate.
   1530 */
   1531 #define TOKEN_EOF         0    /* End of file */
   1532 #define TOKEN_SPACE       1    /* Any kind of whitespace */
   1533 #define TOKEN_ID          2    /* An identifier */
   1534 #define TOKEN_STRING      3    /* A string literal */
   1535 #define TOKEN_PUNCT       4    /* A single punctuation character */
   1536 
   1537 /*
   1538 ** If X is a character that can be used in an identifier then
   1539 ** IdChar(X) will be true.  Otherwise it is false.
   1540 **
   1541 ** For ASCII, any character with the high-order bit set is
   1542 ** allowed in an identifier.  For 7-bit characters,
   1543 ** sqlite3IsIdChar[X] must be 1.
   1544 **
   1545 ** Ticket #1066.  the SQL standard does not allow '$' in the
   1546 ** middle of identfiers.  But many SQL implementations do.
   1547 ** SQLite will allow '$' in identifiers for compatibility.
   1548 ** But the feature is undocumented.
   1549 */
   1550 static const char isIdChar[] = {
   1551 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
   1552     0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
   1553     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
   1554     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
   1555     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
   1556     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
   1557     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
   1558 };
   1559 #define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
   1560 
   1561 
   1562 /*
   1563 ** Return the length of the token that begins at z[0].
   1564 ** Store the token type in *tokenType before returning.
   1565 */
   1566 static int getToken(const char *z, int *tokenType){
   1567   int i, c;
   1568   switch( *z ){
   1569     case 0: {
   1570       *tokenType = TOKEN_EOF;
   1571       return 0;
   1572     }
   1573     case ' ': case '\t': case '\n': case '\f': case '\r': {
   1574       for(i=1; safe_isspace(z[i]); i++){}
   1575       *tokenType = TOKEN_SPACE;
   1576       return i;
   1577     }
   1578     case '`':
   1579     case '\'':
   1580     case '"': {
   1581       int delim = z[0];
   1582       for(i=1; (c=z[i])!=0; i++){
   1583         if( c==delim ){
   1584           if( z[i+1]==delim ){
   1585             i++;
   1586           }else{
   1587             break;
   1588           }
   1589         }
   1590       }
   1591       *tokenType = TOKEN_STRING;
   1592       return i + (c!=0);
   1593     }
   1594     case '[': {
   1595       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
   1596       *tokenType = TOKEN_ID;
   1597       return i;
   1598     }
   1599     default: {
   1600       if( !IdChar(*z) ){
   1601         break;
   1602       }
   1603       for(i=1; IdChar(z[i]); i++){}
   1604       *tokenType = TOKEN_ID;
   1605       return i;
   1606     }
   1607   }
   1608   *tokenType = TOKEN_PUNCT;
   1609   return 1;
   1610 }
   1611 
   1612 /*
   1613 ** A token extracted from a string is an instance of the following
   1614 ** structure.
   1615 */
   1616 typedef struct Token {
   1617   const char *z;       /* Pointer to token text.  Not '\000' terminated */
   1618   short int n;         /* Length of the token text in bytes. */
   1619 } Token;
   1620 
   1621 /*
   1622 ** Given a input string (which is really one of the argv[] parameters
   1623 ** passed into xConnect or xCreate) split the string up into tokens.
   1624 ** Return an array of pointers to '\000' terminated strings, one string
   1625 ** for each non-whitespace token.
   1626 **
   1627 ** The returned array is terminated by a single NULL pointer.
   1628 **
   1629 ** Space to hold the returned array is obtained from a single
   1630 ** malloc and should be freed by passing the return value to free().
   1631 ** The individual strings within the token list are all a part of
   1632 ** the single memory allocation and will all be freed at once.
   1633 */
   1634 static char **tokenizeString(const char *z, int *pnToken){
   1635   int nToken = 0;
   1636   Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
   1637   int n = 1;
   1638   int e, i;
   1639   int totalSize = 0;
   1640   char **azToken;
   1641   char *zCopy;
   1642   while( n>0 ){
   1643     n = getToken(z, &e);
   1644     if( e!=TOKEN_SPACE ){
   1645       aToken[nToken].z = z;
   1646       aToken[nToken].n = n;
   1647       nToken++;
   1648       totalSize += n+1;
   1649     }
   1650     z += n;
   1651   }
   1652   azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
   1653   zCopy = (char*)&azToken[nToken];
   1654   nToken--;
   1655   for(i=0; i<nToken; i++){
   1656     azToken[i] = zCopy;
   1657     n = aToken[i].n;
   1658     memcpy(zCopy, aToken[i].z, n);
   1659     zCopy[n] = 0;
   1660     zCopy += n+1;
   1661   }
   1662   azToken[nToken] = 0;
   1663   free(aToken);
   1664   *pnToken = nToken;
   1665   return azToken;
   1666 }
   1667 
   1668 /*
   1669 ** Convert an SQL-style quoted string into a normal string by removing
   1670 ** the quote characters.  The conversion is done in-place.  If the
   1671 ** input does not begin with a quote character, then this routine
   1672 ** is a no-op.
   1673 **
   1674 ** Examples:
   1675 **
   1676 **     "abc"   becomes   abc
   1677 **     'xyz'   becomes   xyz
   1678 **     [pqr]   becomes   pqr
   1679 **     `mno`   becomes   mno
   1680 */
   1681 static void dequoteString(char *z){
   1682   int quote;
   1683   int i, j;
   1684   if( z==0 ) return;
   1685   quote = z[0];
   1686   switch( quote ){
   1687     case '\'':  break;
   1688     case '"':   break;
   1689     case '`':   break;                /* For MySQL compatibility */
   1690     case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
   1691     default:    return;
   1692   }
   1693   for(i=1, j=0; z[i]; i++){
   1694     if( z[i]==quote ){
   1695       if( z[i+1]==quote ){
   1696         z[j++] = quote;
   1697         i++;
   1698       }else{
   1699         z[j++] = 0;
   1700         break;
   1701       }
   1702     }else{
   1703       z[j++] = z[i];
   1704     }
   1705   }
   1706 }
   1707 
   1708 /*
   1709 ** The input azIn is a NULL-terminated list of tokens.  Remove the first
   1710 ** token and all punctuation tokens.  Remove the quotes from
   1711 ** around string literal tokens.
   1712 **
   1713 ** Example:
   1714 **
   1715 **     input:      tokenize chinese ( 'simplifed' , 'mixed' )
   1716 **     output:     chinese simplifed mixed
   1717 **
   1718 ** Another example:
   1719 **
   1720 **     input:      delimiters ( '[' , ']' , '...' )
   1721 **     output:     [ ] ...
   1722 */
   1723 static void tokenListToIdList(char **azIn){
   1724   int i, j;
   1725   if( azIn ){
   1726     for(i=0, j=-1; azIn[i]; i++){
   1727       if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
   1728         dequoteString(azIn[i]);
   1729         if( j>=0 ){
   1730           azIn[j] = azIn[i];
   1731         }
   1732         j++;
   1733       }
   1734     }
   1735     azIn[j] = 0;
   1736   }
   1737 }
   1738 
   1739 
   1740 /*
   1741 ** Find the first alphanumeric token in the string zIn.  Null-terminate
   1742 ** this token.  Remove any quotation marks.  And return a pointer to
   1743 ** the result.
   1744 */
   1745 static char *firstToken(char *zIn, char **pzTail){
   1746   int n, ttype;
   1747   while(1){
   1748     n = getToken(zIn, &ttype);
   1749     if( ttype==TOKEN_SPACE ){
   1750       zIn += n;
   1751     }else if( ttype==TOKEN_EOF ){
   1752       *pzTail = zIn;
   1753       return 0;
   1754     }else{
   1755       zIn[n] = 0;
   1756       *pzTail = &zIn[1];
   1757       dequoteString(zIn);
   1758       return zIn;
   1759     }
   1760   }
   1761   /*NOTREACHED*/
   1762 }
   1763 
   1764 /* Return true if...
   1765 **
   1766 **   *  s begins with the string t, ignoring case
   1767 **   *  s is longer than t
   1768 **   *  The first character of s beyond t is not a alphanumeric
   1769 **
   1770 ** Ignore leading space in *s.
   1771 **
   1772 ** To put it another way, return true if the first token of
   1773 ** s[] is t[].
   1774 */
   1775 static int startsWith(const char *s, const char *t){
   1776   while( safe_isspace(*s) ){ s++; }
   1777   while( *t ){
   1778     if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
   1779   }
   1780   return *s!='_' && !safe_isalnum(*s);
   1781 }
   1782 
   1783 /*
   1784 ** An instance of this structure defines the "spec" of a
   1785 ** full text index.  This structure is populated by parseSpec
   1786 ** and use by fulltextConnect and fulltextCreate.
   1787 */
   1788 typedef struct TableSpec {
   1789   const char *zDb;         /* Logical database name */
   1790   const char *zName;       /* Name of the full-text index */
   1791   int nColumn;             /* Number of columns to be indexed */
   1792   char **azColumn;         /* Original names of columns to be indexed */
   1793   char **azContentColumn;  /* Column names for %_content */
   1794   char **azTokenizer;      /* Name of tokenizer and its arguments */
   1795 } TableSpec;
   1796 
   1797 /*
   1798 ** Reclaim all of the memory used by a TableSpec
   1799 */
   1800 static void clearTableSpec(TableSpec *p) {
   1801   free(p->azColumn);
   1802   free(p->azContentColumn);
   1803   free(p->azTokenizer);
   1804 }
   1805 
   1806 /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
   1807  *
   1808  * CREATE VIRTUAL TABLE email
   1809  *        USING fts1(subject, body, tokenize mytokenizer(myarg))
   1810  *
   1811  * We return parsed information in a TableSpec structure.
   1812  *
   1813  */
   1814 static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
   1815                      char**pzErr){
   1816   int i, n;
   1817   char *z, *zDummy;
   1818   char **azArg;
   1819   const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */
   1820 
   1821   assert( argc>=3 );
   1822   /* Current interface:
   1823   ** argv[0] - module name
   1824   ** argv[1] - database name
   1825   ** argv[2] - table name
   1826   ** argv[3..] - columns, optionally followed by tokenizer specification
   1827   **             and snippet delimiters specification.
   1828   */
   1829 
   1830   /* Make a copy of the complete argv[][] array in a single allocation.
   1831   ** The argv[][] array is read-only and transient.  We can write to the
   1832   ** copy in order to modify things and the copy is persistent.
   1833   */
   1834   memset(pSpec, 0, sizeof(*pSpec));
   1835   for(i=n=0; i<argc; i++){
   1836     n += strlen(argv[i]) + 1;
   1837   }
   1838   azArg = malloc( sizeof(char*)*argc + n );
   1839   if( azArg==0 ){
   1840     return SQLITE_NOMEM;
   1841   }
   1842   z = (char*)&azArg[argc];
   1843   for(i=0; i<argc; i++){
   1844     azArg[i] = z;
   1845     strcpy(z, argv[i]);
   1846     z += strlen(z)+1;
   1847   }
   1848 
   1849   /* Identify the column names and the tokenizer and delimiter arguments
   1850   ** in the argv[][] array.
   1851   */
   1852   pSpec->zDb = azArg[1];
   1853   pSpec->zName = azArg[2];
   1854   pSpec->nColumn = 0;
   1855   pSpec->azColumn = azArg;
   1856   zTokenizer = "tokenize simple";
   1857   for(i=3; i<argc; ++i){
   1858     if( startsWith(azArg[i],"tokenize") ){
   1859       zTokenizer = azArg[i];
   1860     }else{
   1861       z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
   1862       pSpec->nColumn++;
   1863     }
   1864   }
   1865   if( pSpec->nColumn==0 ){
   1866     azArg[0] = "content";
   1867     pSpec->nColumn = 1;
   1868   }
   1869 
   1870   /*
   1871   ** Construct the list of content column names.
   1872   **
   1873   ** Each content column name will be of the form cNNAAAA
   1874   ** where NN is the column number and AAAA is the sanitized
   1875   ** column name.  "sanitized" means that special characters are
   1876   ** converted to "_".  The cNN prefix guarantees that all column
   1877   ** names are unique.
   1878   **
   1879   ** The AAAA suffix is not strictly necessary.  It is included
   1880   ** for the convenience of people who might examine the generated
   1881   ** %_content table and wonder what the columns are used for.
   1882   */
   1883   pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
   1884   if( pSpec->azContentColumn==0 ){
   1885     clearTableSpec(pSpec);
   1886     return SQLITE_NOMEM;
   1887   }
   1888   for(i=0; i<pSpec->nColumn; i++){
   1889     char *p;
   1890     pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
   1891     for (p = pSpec->azContentColumn[i]; *p ; ++p) {
   1892       if( !safe_isalnum(*p) ) *p = '_';
   1893     }
   1894   }
   1895 
   1896   /*
   1897   ** Parse the tokenizer specification string.
   1898   */
   1899   pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
   1900   tokenListToIdList(pSpec->azTokenizer);
   1901 
   1902   return SQLITE_OK;
   1903 }
   1904 
   1905 /*
   1906 ** Generate a CREATE TABLE statement that describes the schema of
   1907 ** the virtual table.  Return a pointer to this schema string.
   1908 **
   1909 ** Space is obtained from sqlite3_mprintf() and should be freed
   1910 ** using sqlite3_free().
   1911 */
   1912 static char *fulltextSchema(
   1913   int nColumn,                  /* Number of columns */
   1914   const char *const* azColumn,  /* List of columns */
   1915   const char *zTableName        /* Name of the table */
   1916 ){
   1917   int i;
   1918   char *zSchema, *zNext;
   1919   const char *zSep = "(";
   1920   zSchema = sqlite3_mprintf("CREATE TABLE x");
   1921   for(i=0; i<nColumn; i++){
   1922     zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
   1923     sqlite3_free(zSchema);
   1924     zSchema = zNext;
   1925     zSep = ",";
   1926   }
   1927   zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
   1928   sqlite3_free(zSchema);
   1929   return zNext;
   1930 }
   1931 
   1932 /*
   1933 ** Build a new sqlite3_vtab structure that will describe the
   1934 ** fulltext index defined by spec.
   1935 */
   1936 static int constructVtab(
   1937   sqlite3 *db,              /* The SQLite database connection */
   1938   TableSpec *spec,          /* Parsed spec information from parseSpec() */
   1939   sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
   1940   char **pzErr              /* Write any error message here */
   1941 ){
   1942   int rc;
   1943   int n;
   1944   fulltext_vtab *v = 0;
   1945   const sqlite3_tokenizer_module *m = NULL;
   1946   char *schema;
   1947 
   1948   v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
   1949   if( v==0 ) return SQLITE_NOMEM;
   1950   memset(v, 0, sizeof(*v));
   1951   /* sqlite will initialize v->base */
   1952   v->db = db;
   1953   v->zDb = spec->zDb;       /* Freed when azColumn is freed */
   1954   v->zName = spec->zName;   /* Freed when azColumn is freed */
   1955   v->nColumn = spec->nColumn;
   1956   v->azContentColumn = spec->azContentColumn;
   1957   spec->azContentColumn = 0;
   1958   v->azColumn = spec->azColumn;
   1959   spec->azColumn = 0;
   1960 
   1961   if( spec->azTokenizer==0 ){
   1962     return SQLITE_NOMEM;
   1963   }
   1964   /* TODO(shess) For now, add new tokenizers as else if clauses. */
   1965   if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
   1966     sqlite3Fts1SimpleTokenizerModule(&m);
   1967   }else if( startsWith(spec->azTokenizer[0], "porter") ){
   1968     sqlite3Fts1PorterTokenizerModule(&m);
   1969   }else{
   1970     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
   1971     rc = SQLITE_ERROR;
   1972     goto err;
   1973   }
   1974   for(n=0; spec->azTokenizer[n]; n++){}
   1975   if( n ){
   1976     rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
   1977                     &v->pTokenizer);
   1978   }else{
   1979     rc = m->xCreate(0, 0, &v->pTokenizer);
   1980   }
   1981   if( rc!=SQLITE_OK ) goto err;
   1982   v->pTokenizer->pModule = m;
   1983 
   1984   /* TODO: verify the existence of backing tables foo_content, foo_term */
   1985 
   1986   schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
   1987                           spec->zName);
   1988   rc = sqlite3_declare_vtab(db, schema);
   1989   sqlite3_free(schema);
   1990   if( rc!=SQLITE_OK ) goto err;
   1991 
   1992   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
   1993 
   1994   *ppVTab = &v->base;
   1995   TRACE(("FTS1 Connect %p\n", v));
   1996 
   1997   return rc;
   1998 
   1999 err:
   2000   fulltext_vtab_destroy(v);
   2001   return rc;
   2002 }
   2003 
   2004 static int fulltextConnect(
   2005   sqlite3 *db,
   2006   void *pAux,
   2007   int argc, const char *const*argv,
   2008   sqlite3_vtab **ppVTab,
   2009   char **pzErr
   2010 ){
   2011   TableSpec spec;
   2012   int rc = parseSpec(&spec, argc, argv, pzErr);
   2013   if( rc!=SQLITE_OK ) return rc;
   2014 
   2015   rc = constructVtab(db, &spec, ppVTab, pzErr);
   2016   clearTableSpec(&spec);
   2017   return rc;
   2018 }
   2019 
   2020   /* The %_content table holds the text of each document, with
   2021   ** the rowid used as the docid.
   2022   **
   2023   ** The %_term table maps each term to a document list blob
   2024   ** containing elements sorted by ascending docid, each element
   2025   ** encoded as:
   2026   **
   2027   **   docid varint-encoded
   2028   **   token elements:
   2029   **     position+1 varint-encoded as delta from previous position
   2030   **     start offset varint-encoded as delta from previous start offset
   2031   **     end offset varint-encoded as delta from start offset
   2032   **
   2033   ** The sentinel position of 0 indicates the end of the token list.
   2034   **
   2035   ** Additionally, doclist blobs are chunked into multiple segments,
   2036   ** using segment to order the segments.  New elements are added to
   2037   ** the segment at segment 0, until it exceeds CHUNK_MAX.  Then
   2038   ** segment 0 is deleted, and the doclist is inserted at segment 1.
   2039   ** If there is already a doclist at segment 1, the segment 0 doclist
   2040   ** is merged with it, the segment 1 doclist is deleted, and the
   2041   ** merged doclist is inserted at segment 2, repeating those
   2042   ** operations until an insert succeeds.
   2043   **
   2044   ** Since this structure doesn't allow us to update elements in place
   2045   ** in case of deletion or update, these are simply written to
   2046   ** segment 0 (with an empty token list in case of deletion), with
   2047   ** docListAccumulate() taking care to retain lower-segment
   2048   ** information in preference to higher-segment information.
   2049   */
   2050   /* TODO(shess) Provide a VACUUM type operation which both removes
   2051   ** deleted elements which are no longer necessary, and duplicated
   2052   ** elements.  I suspect this will probably not be necessary in
   2053   ** practice, though.
   2054   */
   2055 static int fulltextCreate(sqlite3 *db, void *pAux,
   2056                           int argc, const char * const *argv,
   2057                           sqlite3_vtab **ppVTab, char **pzErr){
   2058   int rc;
   2059   TableSpec spec;
   2060   StringBuffer schema;
   2061   TRACE(("FTS1 Create\n"));
   2062 
   2063   rc = parseSpec(&spec, argc, argv, pzErr);
   2064   if( rc!=SQLITE_OK ) return rc;
   2065 
   2066   initStringBuffer(&schema);
   2067   append(&schema, "CREATE TABLE %_content(");
   2068   appendList(&schema, spec.nColumn, spec.azContentColumn);
   2069   append(&schema, ")");
   2070   rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
   2071   free(schema.s);
   2072   if( rc!=SQLITE_OK ) goto out;
   2073 
   2074   rc = sql_exec(db, spec.zDb, spec.zName,
   2075     "create table %_term(term text, segment integer, doclist blob, "
   2076                         "primary key(term, segment));");
   2077   if( rc!=SQLITE_OK ) goto out;
   2078 
   2079   rc = constructVtab(db, &spec, ppVTab, pzErr);
   2080 
   2081 out:
   2082   clearTableSpec(&spec);
   2083   return rc;
   2084 }
   2085 
   2086 /* Decide how to handle an SQL query. */
   2087 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
   2088   int i;
   2089   TRACE(("FTS1 BestIndex\n"));
   2090 
   2091   for(i=0; i<pInfo->nConstraint; ++i){
   2092     const struct sqlite3_index_constraint *pConstraint;
   2093     pConstraint = &pInfo->aConstraint[i];
   2094     if( pConstraint->usable ) {
   2095       if( pConstraint->iColumn==-1 &&
   2096           pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
   2097         pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
   2098         TRACE(("FTS1 QUERY_ROWID\n"));
   2099       } else if( pConstraint->iColumn>=0 &&
   2100                  pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
   2101         /* full-text search */
   2102         pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
   2103         TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
   2104       } else continue;
   2105 
   2106       pInfo->aConstraintUsage[i].argvIndex = 1;
   2107       pInfo->aConstraintUsage[i].omit = 1;
   2108 
   2109       /* An arbitrary value for now.
   2110        * TODO: Perhaps rowid matches should be considered cheaper than
   2111        * full-text searches. */
   2112       pInfo->estimatedCost = 1.0;
   2113 
   2114       return SQLITE_OK;
   2115     }
   2116   }
   2117   pInfo->idxNum = QUERY_GENERIC;
   2118   return SQLITE_OK;
   2119 }
   2120 
   2121 static int fulltextDisconnect(sqlite3_vtab *pVTab){
   2122   TRACE(("FTS1 Disconnect %p\n", pVTab));
   2123   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
   2124   return SQLITE_OK;
   2125 }
   2126 
   2127 static int fulltextDestroy(sqlite3_vtab *pVTab){
   2128   fulltext_vtab *v = (fulltext_vtab *)pVTab;
   2129   int rc;
   2130 
   2131   TRACE(("FTS1 Destroy %p\n", pVTab));
   2132   rc = sql_exec(v->db, v->zDb, v->zName,
   2133                 "drop table if exists %_content;"
   2134                 "drop table if exists %_term;"
   2135                 );
   2136   if( rc!=SQLITE_OK ) return rc;
   2137 
   2138   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
   2139   return SQLITE_OK;
   2140 }
   2141 
   2142 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
   2143   fulltext_cursor *c;
   2144 
   2145   c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
   2146   /* sqlite will initialize c->base */
   2147   *ppCursor = &c->base;
   2148   TRACE(("FTS1 Open %p: %p\n", pVTab, c));
   2149 
   2150   return SQLITE_OK;
   2151 }
   2152 
   2153 
   2154 /* Free all of the dynamically allocated memory held by *q
   2155 */
   2156 static void queryClear(Query *q){
   2157   int i;
   2158   for(i = 0; i < q->nTerms; ++i){
   2159     free(q->pTerms[i].pTerm);
   2160   }
   2161   free(q->pTerms);
   2162   memset(q, 0, sizeof(*q));
   2163 }
   2164 
   2165 /* Free all of the dynamically allocated memory held by the
   2166 ** Snippet
   2167 */
   2168 static void snippetClear(Snippet *p){
   2169   free(p->aMatch);
   2170   free(p->zOffset);
   2171   free(p->zSnippet);
   2172   memset(p, 0, sizeof(*p));
   2173 }
   2174 /*
   2175 ** Append a single entry to the p->aMatch[] log.
   2176 */
   2177 static void snippetAppendMatch(
   2178   Snippet *p,               /* Append the entry to this snippet */
   2179   int iCol, int iTerm,      /* The column and query term */
   2180   int iStart, int nByte     /* Offset and size of the match */
   2181 ){
   2182   int i;
   2183   struct snippetMatch *pMatch;
   2184   if( p->nMatch+1>=p->nAlloc ){
   2185     p->nAlloc = p->nAlloc*2 + 10;
   2186     p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
   2187     if( p->aMatch==0 ){
   2188       p->nMatch = 0;
   2189       p->nAlloc = 0;
   2190       return;
   2191     }
   2192   }
   2193   i = p->nMatch++;
   2194   pMatch = &p->aMatch[i];
   2195   pMatch->iCol = iCol;
   2196   pMatch->iTerm = iTerm;
   2197   pMatch->iStart = iStart;
   2198   pMatch->nByte = nByte;
   2199 }
   2200 
   2201 /*
   2202 ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
   2203 */
   2204 #define FTS1_ROTOR_SZ   (32)
   2205 #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
   2206 
   2207 /*
   2208 ** Add entries to pSnippet->aMatch[] for every match that occurs against
   2209 ** document zDoc[0..nDoc-1] which is stored in column iColumn.
   2210 */
   2211 static void snippetOffsetsOfColumn(
   2212   Query *pQuery,
   2213   Snippet *pSnippet,
   2214   int iColumn,
   2215   const char *zDoc,
   2216   int nDoc
   2217 ){
   2218   const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
   2219   sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
   2220   sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
   2221   fulltext_vtab *pVtab;                /* The full text index */
   2222   int nColumn;                         /* Number of columns in the index */
   2223   const QueryTerm *aTerm;              /* Query string terms */
   2224   int nTerm;                           /* Number of query string terms */
   2225   int i, j;                            /* Loop counters */
   2226   int rc;                              /* Return code */
   2227   unsigned int match, prevMatch;       /* Phrase search bitmasks */
   2228   const char *zToken;                  /* Next token from the tokenizer */
   2229   int nToken;                          /* Size of zToken */
   2230   int iBegin, iEnd, iPos;              /* Offsets of beginning and end */
   2231 
   2232   /* The following variables keep a circular buffer of the last
   2233   ** few tokens */
   2234   unsigned int iRotor = 0;             /* Index of current token */
   2235   int iRotorBegin[FTS1_ROTOR_SZ];      /* Beginning offset of token */
   2236   int iRotorLen[FTS1_ROTOR_SZ];        /* Length of token */
   2237 
   2238   pVtab = pQuery->pFts;
   2239   nColumn = pVtab->nColumn;
   2240   pTokenizer = pVtab->pTokenizer;
   2241   pTModule = pTokenizer->pModule;
   2242   rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
   2243   if( rc ) return;
   2244   pTCursor->pTokenizer = pTokenizer;
   2245   aTerm = pQuery->pTerms;
   2246   nTerm = pQuery->nTerms;
   2247   if( nTerm>=FTS1_ROTOR_SZ ){
   2248     nTerm = FTS1_ROTOR_SZ - 1;
   2249   }
   2250   prevMatch = 0;
   2251   while(1){
   2252     rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
   2253     if( rc ) break;
   2254     iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
   2255     iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
   2256     match = 0;
   2257     for(i=0; i<nTerm; i++){
   2258       int iCol;
   2259       iCol = aTerm[i].iColumn;
   2260       if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
   2261       if( aTerm[i].nTerm!=nToken ) continue;
   2262       if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
   2263       if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
   2264       match |= 1<<i;
   2265       if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
   2266         for(j=aTerm[i].iPhrase-1; j>=0; j--){
   2267           int k = (iRotor-j) & FTS1_ROTOR_MASK;
   2268           snippetAppendMatch(pSnippet, iColumn, i-j,
   2269                 iRotorBegin[k], iRotorLen[k]);
   2270         }
   2271       }
   2272     }
   2273     prevMatch = match<<1;
   2274     iRotor++;
   2275   }
   2276   pTModule->xClose(pTCursor);
   2277 }
   2278 
   2279 
   2280 /*
   2281 ** Compute all offsets for the current row of the query.
   2282 ** If the offsets have already been computed, this routine is a no-op.
   2283 */
   2284 static void snippetAllOffsets(fulltext_cursor *p){
   2285   int nColumn;
   2286   int iColumn, i;
   2287   int iFirst, iLast;
   2288   fulltext_vtab *pFts;
   2289 
   2290   if( p->snippet.nMatch ) return;
   2291   if( p->q.nTerms==0 ) return;
   2292   pFts = p->q.pFts;
   2293   nColumn = pFts->nColumn;
   2294   iColumn = p->iCursorType - QUERY_FULLTEXT;
   2295   if( iColumn<0 || iColumn>=nColumn ){
   2296     iFirst = 0;
   2297     iLast = nColumn-1;
   2298   }else{
   2299     iFirst = iColumn;
   2300     iLast = iColumn;
   2301   }
   2302   for(i=iFirst; i<=iLast; i++){
   2303     const char *zDoc;
   2304     int nDoc;
   2305     zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
   2306     nDoc = sqlite3_column_bytes(p->pStmt, i+1);
   2307     snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
   2308   }
   2309 }
   2310 
   2311 /*
   2312 ** Convert the information in the aMatch[] array of the snippet
   2313 ** into the string zOffset[0..nOffset-1].
   2314 */
   2315 static void snippetOffsetText(Snippet *p){
   2316   int i;
   2317   int cnt = 0;
   2318   StringBuffer sb;
   2319   char zBuf[200];
   2320   if( p->zOffset ) return;
   2321   initStringBuffer(&sb);
   2322   for(i=0; i<p->nMatch; i++){
   2323     struct snippetMatch *pMatch = &p->aMatch[i];
   2324     zBuf[0] = ' ';
   2325     sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
   2326         pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
   2327     append(&sb, zBuf);
   2328     cnt++;
   2329   }
   2330   p->zOffset = sb.s;
   2331   p->nOffset = sb.len;
   2332 }
   2333 
   2334 /*
   2335 ** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
   2336 ** of matching words some of which might be in zDoc.  zDoc is column
   2337 ** number iCol.
   2338 **
   2339 ** iBreak is suggested spot in zDoc where we could begin or end an
   2340 ** excerpt.  Return a value similar to iBreak but possibly adjusted
   2341 ** to be a little left or right so that the break point is better.
   2342 */
   2343 static int wordBoundary(
   2344   int iBreak,                   /* The suggested break point */
   2345   const char *zDoc,             /* Document text */
   2346   int nDoc,                     /* Number of bytes in zDoc[] */
   2347   struct snippetMatch *aMatch,  /* Matching words */
   2348   int nMatch,                   /* Number of entries in aMatch[] */
   2349   int iCol                      /* The column number for zDoc[] */
   2350 ){
   2351   int i;
   2352   if( iBreak<=10 ){
   2353     return 0;
   2354   }
   2355   if( iBreak>=nDoc-10 ){
   2356     return nDoc;
   2357   }
   2358   for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
   2359   while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
   2360   if( i<nMatch ){
   2361     if( aMatch[i].iStart<iBreak+10 ){
   2362       return aMatch[i].iStart;
   2363     }
   2364     if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
   2365       return aMatch[i-1].iStart;
   2366     }
   2367   }
   2368   for(i=1; i<=10; i++){
   2369     if( safe_isspace(zDoc[iBreak-i]) ){
   2370       return iBreak - i + 1;
   2371     }
   2372     if( safe_isspace(zDoc[iBreak+i]) ){
   2373       return iBreak + i + 1;
   2374     }
   2375   }
   2376   return iBreak;
   2377 }
   2378 
   2379 /*
   2380 ** If the StringBuffer does not end in white space, add a single
   2381 ** space character to the end.
   2382 */
   2383 static void appendWhiteSpace(StringBuffer *p){
   2384   if( p->len==0 ) return;
   2385   if( safe_isspace(p->s[p->len-1]) ) return;
   2386   append(p, " ");
   2387 }
   2388 
   2389 /*
   2390 ** Remove white space from teh end of the StringBuffer
   2391 */
   2392 static void trimWhiteSpace(StringBuffer *p){
   2393   while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
   2394     p->len--;
   2395   }
   2396 }
   2397 
   2398 
   2399 
   2400 /*
   2401 ** Allowed values for Snippet.aMatch[].snStatus
   2402 */
   2403 #define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
   2404 #define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
   2405 
   2406 /*
   2407 ** Generate the text of a snippet.
   2408 */
   2409 static void snippetText(
   2410   fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
   2411   const char *zStartMark,     /* Markup to appear before each match */
   2412   const char *zEndMark,       /* Markup to appear after each match */
   2413   const char *zEllipsis       /* Ellipsis mark */
   2414 ){
   2415   int i, j;
   2416   struct snippetMatch *aMatch;
   2417   int nMatch;
   2418   int nDesired;
   2419   StringBuffer sb;
   2420   int tailCol;
   2421   int tailOffset;
   2422   int iCol;
   2423   int nDoc;
   2424   const char *zDoc;
   2425   int iStart, iEnd;
   2426   int tailEllipsis = 0;
   2427   int iMatch;
   2428 
   2429 
   2430   free(pCursor->snippet.zSnippet);
   2431   pCursor->snippet.zSnippet = 0;
   2432   aMatch = pCursor->snippet.aMatch;
   2433   nMatch = pCursor->snippet.nMatch;
   2434   initStringBuffer(&sb);
   2435 
   2436   for(i=0; i<nMatch; i++){
   2437     aMatch[i].snStatus = SNIPPET_IGNORE;
   2438   }
   2439   nDesired = 0;
   2440   for(i=0; i<pCursor->q.nTerms; i++){
   2441     for(j=0; j<nMatch; j++){
   2442       if( aMatch[j].iTerm==i ){
   2443         aMatch[j].snStatus = SNIPPET_DESIRED;
   2444         nDesired++;
   2445         break;
   2446       }
   2447     }
   2448   }
   2449 
   2450   iMatch = 0;
   2451   tailCol = -1;
   2452   tailOffset = 0;
   2453   for(i=0; i<nMatch && nDesired>0; i++){
   2454     if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
   2455     nDesired--;
   2456     iCol = aMatch[i].iCol;
   2457     zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
   2458     nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
   2459     iStart = aMatch[i].iStart - 40;
   2460     iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
   2461     if( iStart<=10 ){
   2462       iStart = 0;
   2463     }
   2464     if( iCol==tailCol && iStart<=tailOffset+20 ){
   2465       iStart = tailOffset;
   2466     }
   2467     if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
   2468       trimWhiteSpace(&sb);
   2469       appendWhiteSpace(&sb);
   2470       append(&sb, zEllipsis);
   2471       appendWhiteSpace(&sb);
   2472     }
   2473     iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
   2474     iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
   2475     if( iEnd>=nDoc-10 ){
   2476       iEnd = nDoc;
   2477       tailEllipsis = 0;
   2478     }else{
   2479       tailEllipsis = 1;
   2480     }
   2481     while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
   2482     while( iStart<iEnd ){
   2483       while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
   2484              && aMatch[iMatch].iCol<=iCol ){
   2485         iMatch++;
   2486       }
   2487       if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
   2488              && aMatch[iMatch].iCol==iCol ){
   2489         nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
   2490         iStart = aMatch[iMatch].iStart;
   2491         append(&sb, zStartMark);
   2492         nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
   2493         append(&sb, zEndMark);
   2494         iStart += aMatch[iMatch].nByte;
   2495         for(j=iMatch+1; j<nMatch; j++){
   2496           if( aMatch[j].iTerm==aMatch[iMatch].iTerm
   2497               && aMatch[j].snStatus==SNIPPET_DESIRED ){
   2498             nDesired--;
   2499             aMatch[j].snStatus = SNIPPET_IGNORE;
   2500           }
   2501         }
   2502       }else{
   2503         nappend(&sb, &zDoc[iStart], iEnd - iStart);
   2504         iStart = iEnd;
   2505       }
   2506     }
   2507     tailCol = iCol;
   2508     tailOffset = iEnd;
   2509   }
   2510   trimWhiteSpace(&sb);
   2511   if( tailEllipsis ){
   2512     appendWhiteSpace(&sb);
   2513     append(&sb, zEllipsis);
   2514   }
   2515   pCursor->snippet.zSnippet = sb.s;
   2516   pCursor->snippet.nSnippet = sb.len;
   2517 }
   2518 
   2519 
   2520 /*
   2521 ** Close the cursor.  For additional information see the documentation
   2522 ** on the xClose method of the virtual table interface.
   2523 */
   2524 static int fulltextClose(sqlite3_vtab_cursor *pCursor){
   2525   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2526   TRACE(("FTS1 Close %p\n", c));
   2527   sqlite3_finalize(c->pStmt);
   2528   queryClear(&c->q);
   2529   snippetClear(&c->snippet);
   2530   if( c->result.pDoclist!=NULL ){
   2531     docListDelete(c->result.pDoclist);
   2532   }
   2533   free(c);
   2534   return SQLITE_OK;
   2535 }
   2536 
   2537 static int fulltextNext(sqlite3_vtab_cursor *pCursor){
   2538   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2539   sqlite_int64 iDocid;
   2540   int rc;
   2541 
   2542   TRACE(("FTS1 Next %p\n", pCursor));
   2543   snippetClear(&c->snippet);
   2544   if( c->iCursorType < QUERY_FULLTEXT ){
   2545     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
   2546     rc = sqlite3_step(c->pStmt);
   2547     switch( rc ){
   2548       case SQLITE_ROW:
   2549         c->eof = 0;
   2550         return SQLITE_OK;
   2551       case SQLITE_DONE:
   2552         c->eof = 1;
   2553         return SQLITE_OK;
   2554       default:
   2555         c->eof = 1;
   2556         return rc;
   2557     }
   2558   } else {  /* full-text query */
   2559     rc = sqlite3_reset(c->pStmt);
   2560     if( rc!=SQLITE_OK ) return rc;
   2561 
   2562     iDocid = nextDocid(&c->result);
   2563     if( iDocid==0 ){
   2564       c->eof = 1;
   2565       return SQLITE_OK;
   2566     }
   2567     rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
   2568     if( rc!=SQLITE_OK ) return rc;
   2569     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
   2570     rc = sqlite3_step(c->pStmt);
   2571     if( rc==SQLITE_ROW ){   /* the case we expect */
   2572       c->eof = 0;
   2573       return SQLITE_OK;
   2574     }
   2575     /* an error occurred; abort */
   2576     return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
   2577   }
   2578 }
   2579 
   2580 
   2581 /* Return a DocList corresponding to the query term *pTerm.  If *pTerm
   2582 ** is the first term of a phrase query, go ahead and evaluate the phrase
   2583 ** query and return the doclist for the entire phrase query.
   2584 **
   2585 ** The result is stored in pTerm->doclist.
   2586 */
   2587 static int docListOfTerm(
   2588   fulltext_vtab *v,     /* The full text index */
   2589   int iColumn,          /* column to restrict to.  No restrition if >=nColumn */
   2590   QueryTerm *pQTerm,    /* Term we are looking for, or 1st term of a phrase */
   2591   DocList **ppResult    /* Write the result here */
   2592 ){
   2593   DocList *pLeft, *pRight, *pNew;
   2594   int i, rc;
   2595 
   2596   pLeft = docListNew(DL_POSITIONS);
   2597   rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
   2598   if( rc ){
   2599     docListDelete(pLeft);
   2600     return rc;
   2601   }
   2602   for(i=1; i<=pQTerm->nPhrase; i++){
   2603     pRight = docListNew(DL_POSITIONS);
   2604     rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
   2605     if( rc ){
   2606       docListDelete(pLeft);
   2607       return rc;
   2608     }
   2609     pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
   2610     docListPhraseMerge(pLeft, pRight, pNew);
   2611     docListDelete(pLeft);
   2612     docListDelete(pRight);
   2613     pLeft = pNew;
   2614   }
   2615   *ppResult = pLeft;
   2616   return SQLITE_OK;
   2617 }
   2618 
   2619 /* Add a new term pTerm[0..nTerm-1] to the query *q.
   2620 */
   2621 static void queryAdd(Query *q, const char *pTerm, int nTerm){
   2622   QueryTerm *t;
   2623   ++q->nTerms;
   2624   q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
   2625   if( q->pTerms==0 ){
   2626     q->nTerms = 0;
   2627     return;
   2628   }
   2629   t = &q->pTerms[q->nTerms - 1];
   2630   memset(t, 0, sizeof(*t));
   2631   t->pTerm = malloc(nTerm+1);
   2632   memcpy(t->pTerm, pTerm, nTerm);
   2633   t->pTerm[nTerm] = 0;
   2634   t->nTerm = nTerm;
   2635   t->isOr = q->nextIsOr;
   2636   q->nextIsOr = 0;
   2637   t->iColumn = q->nextColumn;
   2638   q->nextColumn = q->dfltColumn;
   2639 }
   2640 
   2641 /*
   2642 ** Check to see if the string zToken[0...nToken-1] matches any
   2643 ** column name in the virtual table.   If it does,
   2644 ** return the zero-indexed column number.  If not, return -1.
   2645 */
   2646 static int checkColumnSpecifier(
   2647   fulltext_vtab *pVtab,    /* The virtual table */
   2648   const char *zToken,      /* Text of the token */
   2649   int nToken               /* Number of characters in the token */
   2650 ){
   2651   int i;
   2652   for(i=0; i<pVtab->nColumn; i++){
   2653     if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
   2654         && pVtab->azColumn[i][nToken]==0 ){
   2655       return i;
   2656     }
   2657   }
   2658   return -1;
   2659 }
   2660 
   2661 /*
   2662 ** Parse the text at pSegment[0..nSegment-1].  Add additional terms
   2663 ** to the query being assemblied in pQuery.
   2664 **
   2665 ** inPhrase is true if pSegment[0..nSegement-1] is contained within
   2666 ** double-quotes.  If inPhrase is true, then the first term
   2667 ** is marked with the number of terms in the phrase less one and
   2668 ** OR and "-" syntax is ignored.  If inPhrase is false, then every
   2669 ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
   2670 */
   2671 static int tokenizeSegment(
   2672   sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
   2673   const char *pSegment, int nSegment,     /* Query expression being parsed */
   2674   int inPhrase,                           /* True if within "..." */
   2675   Query *pQuery                           /* Append results here */
   2676 ){
   2677   const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
   2678   sqlite3_tokenizer_cursor *pCursor;
   2679   int firstIndex = pQuery->nTerms;
   2680   int iCol;
   2681   int nTerm = 1;
   2682 
   2683   int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
   2684   if( rc!=SQLITE_OK ) return rc;
   2685   pCursor->pTokenizer = pTokenizer;
   2686 
   2687   while( 1 ){
   2688     const char *pToken;
   2689     int nToken, iBegin, iEnd, iPos;
   2690 
   2691     rc = pModule->xNext(pCursor,
   2692                         &pToken, &nToken,
   2693                         &iBegin, &iEnd, &iPos);
   2694     if( rc!=SQLITE_OK ) break;
   2695     if( !inPhrase &&
   2696         pSegment[iEnd]==':' &&
   2697          (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
   2698       pQuery->nextColumn = iCol;
   2699       continue;
   2700     }
   2701     if( !inPhrase && pQuery->nTerms>0 && nToken==2
   2702          && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
   2703       pQuery->nextIsOr = 1;
   2704       continue;
   2705     }
   2706     queryAdd(pQuery, pToken, nToken);
   2707     if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
   2708       pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
   2709     }
   2710     pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
   2711     if( inPhrase ){
   2712       nTerm++;
   2713     }
   2714   }
   2715 
   2716   if( inPhrase && pQuery->nTerms>firstIndex ){
   2717     pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
   2718   }
   2719 
   2720   return pModule->xClose(pCursor);
   2721 }
   2722 
   2723 /* Parse a query string, yielding a Query object pQuery.
   2724 **
   2725 ** The calling function will need to queryClear() to clean up
   2726 ** the dynamically allocated memory held by pQuery.
   2727 */
   2728 static int parseQuery(
   2729   fulltext_vtab *v,        /* The fulltext index */
   2730   const char *zInput,      /* Input text of the query string */
   2731   int nInput,              /* Size of the input text */
   2732   int dfltColumn,          /* Default column of the index to match against */
   2733   Query *pQuery            /* Write the parse results here. */
   2734 ){
   2735   int iInput, inPhrase = 0;
   2736 
   2737   if( zInput==0 ) nInput = 0;
   2738   if( nInput<0 ) nInput = strlen(zInput);
   2739   pQuery->nTerms = 0;
   2740   pQuery->pTerms = NULL;
   2741   pQuery->nextIsOr = 0;
   2742   pQuery->nextColumn = dfltColumn;
   2743   pQuery->dfltColumn = dfltColumn;
   2744   pQuery->pFts = v;
   2745 
   2746   for(iInput=0; iInput<nInput; ++iInput){
   2747     int i;
   2748     for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
   2749     if( i>iInput ){
   2750       tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
   2751                        pQuery);
   2752     }
   2753     iInput = i;
   2754     if( i<nInput ){
   2755       assert( zInput[i]=='"' );
   2756       inPhrase = !inPhrase;
   2757     }
   2758   }
   2759 
   2760   if( inPhrase ){
   2761     /* unmatched quote */
   2762     queryClear(pQuery);
   2763     return SQLITE_ERROR;
   2764   }
   2765   return SQLITE_OK;
   2766 }
   2767 
   2768 /* Perform a full-text query using the search expression in
   2769 ** zInput[0..nInput-1].  Return a list of matching documents
   2770 ** in pResult.
   2771 **
   2772 ** Queries must match column iColumn.  Or if iColumn>=nColumn
   2773 ** they are allowed to match against any column.
   2774 */
   2775 static int fulltextQuery(
   2776   fulltext_vtab *v,      /* The full text index */
   2777   int iColumn,           /* Match against this column by default */
   2778   const char *zInput,    /* The query string */
   2779   int nInput,            /* Number of bytes in zInput[] */
   2780   DocList **pResult,     /* Write the result doclist here */
   2781   Query *pQuery          /* Put parsed query string here */
   2782 ){
   2783   int i, iNext, rc;
   2784   DocList *pLeft = NULL;
   2785   DocList *pRight, *pNew, *pOr;
   2786   int nNot = 0;
   2787   QueryTerm *aTerm;
   2788 
   2789   rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
   2790   if( rc!=SQLITE_OK ) return rc;
   2791 
   2792   /* Merge AND terms. */
   2793   aTerm = pQuery->pTerms;
   2794   for(i = 0; i<pQuery->nTerms; i=iNext){
   2795     if( aTerm[i].isNot ){
   2796       /* Handle all NOT terms in a separate pass */
   2797       nNot++;
   2798       iNext = i + aTerm[i].nPhrase+1;
   2799       continue;
   2800     }
   2801     iNext = i + aTerm[i].nPhrase + 1;
   2802     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
   2803     if( rc ){
   2804       queryClear(pQuery);
   2805       return rc;
   2806     }
   2807     while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
   2808       rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
   2809       iNext += aTerm[iNext].nPhrase + 1;
   2810       if( rc ){
   2811         queryClear(pQuery);
   2812         return rc;
   2813       }
   2814       pNew = docListNew(DL_DOCIDS);
   2815       docListOrMerge(pRight, pOr, pNew);
   2816       docListDelete(pRight);
   2817       docListDelete(pOr);
   2818       pRight = pNew;
   2819     }
   2820     if( pLeft==0 ){
   2821       pLeft = pRight;
   2822     }else{
   2823       pNew = docListNew(DL_DOCIDS);
   2824       docListAndMerge(pLeft, pRight, pNew);
   2825       docListDelete(pRight);
   2826       docListDelete(pLeft);
   2827       pLeft = pNew;
   2828     }
   2829   }
   2830 
   2831   if( nNot && pLeft==0 ){
   2832     /* We do not yet know how to handle a query of only NOT terms */
   2833     return SQLITE_ERROR;
   2834   }
   2835 
   2836   /* Do the EXCEPT terms */
   2837   for(i=0; i<pQuery->nTerms;  i += aTerm[i].nPhrase + 1){
   2838     if( !aTerm[i].isNot ) continue;
   2839     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
   2840     if( rc ){
   2841       queryClear(pQuery);
   2842       docListDelete(pLeft);
   2843       return rc;
   2844     }
   2845     pNew = docListNew(DL_DOCIDS);
   2846     docListExceptMerge(pLeft, pRight, pNew);
   2847     docListDelete(pRight);
   2848     docListDelete(pLeft);
   2849     pLeft = pNew;
   2850   }
   2851 
   2852   *pResult = pLeft;
   2853   return rc;
   2854 }
   2855 
   2856 /*
   2857 ** This is the xFilter interface for the virtual table.  See
   2858 ** the virtual table xFilter method documentation for additional
   2859 ** information.
   2860 **
   2861 ** If idxNum==QUERY_GENERIC then do a full table scan against
   2862 ** the %_content table.
   2863 **
   2864 ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
   2865 ** in the %_content table.
   2866 **
   2867 ** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
   2868 ** column on the left-hand side of the MATCH operator is column
   2869 ** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
   2870 ** side of the MATCH operator.
   2871 */
   2872 /* TODO(shess) Upgrade the cursor initialization and destruction to
   2873 ** account for fulltextFilter() being called multiple times on the
   2874 ** same cursor.  The current solution is very fragile.  Apply fix to
   2875 ** fts2 as appropriate.
   2876 */
   2877 static int fulltextFilter(
   2878   sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
   2879   int idxNum, const char *idxStr,   /* Which indexing scheme to use */
   2880   int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
   2881 ){
   2882   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2883   fulltext_vtab *v = cursor_vtab(c);
   2884   int rc;
   2885   char *zSql;
   2886 
   2887   TRACE(("FTS1 Filter %p\n",pCursor));
   2888 
   2889   zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
   2890                           idxNum==QUERY_GENERIC ? "" : "where rowid=?");
   2891   sqlite3_finalize(c->pStmt);
   2892   rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
   2893   sqlite3_free(zSql);
   2894   if( rc!=SQLITE_OK ) return rc;
   2895 
   2896   c->iCursorType = idxNum;
   2897   switch( idxNum ){
   2898     case QUERY_GENERIC:
   2899       break;
   2900 
   2901     case QUERY_ROWID:
   2902       rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
   2903       if( rc!=SQLITE_OK ) return rc;
   2904       break;
   2905 
   2906     default:   /* full-text search */
   2907     {
   2908       const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
   2909       DocList *pResult;
   2910       assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
   2911       assert( argc==1 );
   2912       queryClear(&c->q);
   2913       rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
   2914       if( rc!=SQLITE_OK ) return rc;
   2915       if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
   2916       readerInit(&c->result, pResult);
   2917       break;
   2918     }
   2919   }
   2920 
   2921   return fulltextNext(pCursor);
   2922 }
   2923 
   2924 /* This is the xEof method of the virtual table.  The SQLite core
   2925 ** calls this routine to find out if it has reached the end of
   2926 ** a query's results set.
   2927 */
   2928 static int fulltextEof(sqlite3_vtab_cursor *pCursor){
   2929   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2930   return c->eof;
   2931 }
   2932 
   2933 /* This is the xColumn method of the virtual table.  The SQLite
   2934 ** core calls this method during a query when it needs the value
   2935 ** of a column from the virtual table.  This method needs to use
   2936 ** one of the sqlite3_result_*() routines to store the requested
   2937 ** value back in the pContext.
   2938 */
   2939 static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
   2940                           sqlite3_context *pContext, int idxCol){
   2941   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2942   fulltext_vtab *v = cursor_vtab(c);
   2943 
   2944   if( idxCol<v->nColumn ){
   2945     sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
   2946     sqlite3_result_value(pContext, pVal);
   2947   }else if( idxCol==v->nColumn ){
   2948     /* The extra column whose name is the same as the table.
   2949     ** Return a blob which is a pointer to the cursor
   2950     */
   2951     sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
   2952   }
   2953   return SQLITE_OK;
   2954 }
   2955 
   2956 /* This is the xRowid method.  The SQLite core calls this routine to
   2957 ** retrive the rowid for the current row of the result set.  The
   2958 ** rowid should be written to *pRowid.
   2959 */
   2960 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
   2961   fulltext_cursor *c = (fulltext_cursor *) pCursor;
   2962 
   2963   *pRowid = sqlite3_column_int64(c->pStmt, 0);
   2964   return SQLITE_OK;
   2965 }
   2966 
   2967 /* Add all terms in [zText] to the given hash table.  If [iColumn] > 0,
   2968  * we also store positions and offsets in the hash table using the given
   2969  * column number. */
   2970 static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
   2971                       const char *zText, int iColumn){
   2972   sqlite3_tokenizer *pTokenizer = v->pTokenizer;
   2973   sqlite3_tokenizer_cursor *pCursor;
   2974   const char *pToken;
   2975   int nTokenBytes;
   2976   int iStartOffset, iEndOffset, iPosition;
   2977   int rc;
   2978 
   2979   rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
   2980   if( rc!=SQLITE_OK ) return rc;
   2981 
   2982   pCursor->pTokenizer = pTokenizer;
   2983   while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
   2984                                                &pToken, &nTokenBytes,
   2985                                                &iStartOffset, &iEndOffset,
   2986                                                &iPosition) ){
   2987     DocList *p;
   2988 
   2989     /* Positions can't be negative; we use -1 as a terminator internally. */
   2990     if( iPosition<0 ){
   2991       pTokenizer->pModule->xClose(pCursor);
   2992       return SQLITE_ERROR;
   2993     }
   2994 
   2995     p = fts1HashFind(terms, pToken, nTokenBytes);
   2996     if( p==NULL ){
   2997       p = docListNew(DL_DEFAULT);
   2998       docListAddDocid(p, iDocid);
   2999       fts1HashInsert(terms, pToken, nTokenBytes, p);
   3000     }
   3001     if( iColumn>=0 ){
   3002       docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
   3003     }
   3004   }
   3005 
   3006   /* TODO(shess) Check return?  Should this be able to cause errors at
   3007   ** this point?  Actually, same question about sqlite3_finalize(),
   3008   ** though one could argue that failure there means that the data is
   3009   ** not durable.  *ponder*
   3010   */
   3011   pTokenizer->pModule->xClose(pCursor);
   3012   return rc;
   3013 }
   3014 
   3015 /* Update the %_terms table to map the term [pTerm] to the given rowid. */
   3016 static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
   3017                              DocList *d){
   3018   sqlite_int64 iIndexRow;
   3019   DocList doclist;
   3020   int iSegment = 0, rc;
   3021 
   3022   rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
   3023   if( rc==SQLITE_DONE ){
   3024     docListInit(&doclist, DL_DEFAULT, 0, 0);
   3025     docListUpdate(&doclist, d);
   3026     /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
   3027     rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
   3028     goto err;
   3029   }
   3030   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
   3031 
   3032   docListUpdate(&doclist, d);
   3033   if( doclist.nData<=CHUNK_MAX ){
   3034     rc = term_update(v, iIndexRow, &doclist);
   3035     goto err;
   3036   }
   3037 
   3038   /* Doclist doesn't fit, delete what's there, and accumulate
   3039   ** forward.
   3040   */
   3041   rc = term_delete(v, iIndexRow);
   3042   if( rc!=SQLITE_OK ) goto err;
   3043 
   3044   /* Try to insert the doclist into a higher segment bucket.  On
   3045   ** failure, accumulate existing doclist with the doclist from that
   3046   ** bucket, and put results in the next bucket.
   3047   */
   3048   iSegment++;
   3049   while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
   3050                          &doclist))!=SQLITE_OK ){
   3051     sqlite_int64 iSegmentRow;
   3052     DocList old;
   3053     int rc2;
   3054 
   3055     /* Retain old error in case the term_insert() error was really an
   3056     ** error rather than a bounced insert.
   3057     */
   3058     rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
   3059     if( rc2!=SQLITE_ROW ) goto err;
   3060 
   3061     rc = term_delete(v, iSegmentRow);
   3062     if( rc!=SQLITE_OK ) goto err;
   3063 
   3064     /* Reusing lowest-number deleted row keeps the index smaller. */
   3065     if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
   3066 
   3067     /* doclist contains the newer data, so accumulate it over old.
   3068     ** Then steal accumulated data for doclist.
   3069     */
   3070     docListAccumulate(&old, &doclist);
   3071     docListDestroy(&doclist);
   3072     doclist = old;
   3073 
   3074     iSegment++;
   3075   }
   3076 
   3077  err:
   3078   docListDestroy(&doclist);
   3079   return rc;
   3080 }
   3081 
   3082 /* Add doclists for all terms in [pValues] to the hash table [terms]. */
   3083 static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
   3084                 sqlite3_value **pValues){
   3085   int i;
   3086   for(i = 0; i < v->nColumn ; ++i){
   3087     char *zText = (char*)sqlite3_value_text(pValues[i]);
   3088     int rc = buildTerms(v, terms, iRowid, zText, i);
   3089     if( rc!=SQLITE_OK ) return rc;
   3090   }
   3091   return SQLITE_OK;
   3092 }
   3093 
   3094 /* Add empty doclists for all terms in the given row's content to the hash
   3095  * table [pTerms]. */
   3096 static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
   3097   const char **pValues;
   3098   int i;
   3099 
   3100   int rc = content_select(v, iRowid, &pValues);
   3101   if( rc!=SQLITE_OK ) return rc;
   3102 
   3103   for(i = 0 ; i < v->nColumn; ++i) {
   3104     rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
   3105     if( rc!=SQLITE_OK ) break;
   3106   }
   3107 
   3108   freeStringArray(v->nColumn, pValues);
   3109   return SQLITE_OK;
   3110 }
   3111 
   3112 /* Insert a row into the %_content table; set *piRowid to be the ID of the
   3113  * new row.  Fill [pTerms] with new doclists for the %_term table. */
   3114 static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
   3115                         sqlite3_value **pValues,
   3116                         sqlite_int64 *piRowid, fts1Hash *pTerms){
   3117   int rc;
   3118 
   3119   rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
   3120   if( rc!=SQLITE_OK ) return rc;
   3121   *piRowid = sqlite3_last_insert_rowid(v->db);
   3122   return insertTerms(v, pTerms, *piRowid, pValues);
   3123 }
   3124 
   3125 /* Delete a row from the %_content table; fill [pTerms] with empty doclists
   3126  * to be written to the %_term table. */
   3127 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
   3128   int rc = deleteTerms(v, pTerms, iRow);
   3129   if( rc!=SQLITE_OK ) return rc;
   3130   return content_delete(v, iRow);  /* execute an SQL DELETE */
   3131 }
   3132 
   3133 /* Update a row in the %_content table; fill [pTerms] with new doclists for the
   3134  * %_term table. */
   3135 static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
   3136                         sqlite3_value **pValues, fts1Hash *pTerms){
   3137   /* Generate an empty doclist for each term that previously appeared in this
   3138    * row. */
   3139   int rc = deleteTerms(v, pTerms, iRow);
   3140   if( rc!=SQLITE_OK ) return rc;
   3141 
   3142   rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
   3143   if( rc!=SQLITE_OK ) return rc;
   3144 
   3145   /* Now add positions for terms which appear in the updated row. */
   3146   return insertTerms(v, pTerms, iRow, pValues);
   3147 }
   3148 
   3149 /* This function implements the xUpdate callback; it is the top-level entry
   3150  * point for inserting, deleting or updating a row in a full-text table. */
   3151 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
   3152                    sqlite_int64 *pRowid){
   3153   fulltext_vtab *v = (fulltext_vtab *) pVtab;
   3154   fts1Hash terms;   /* maps term string -> PosList */
   3155   int rc;
   3156   fts1HashElem *e;
   3157 
   3158   TRACE(("FTS1 Update %p\n", pVtab));
   3159 
   3160   fts1HashInit(&terms, FTS1_HASH_STRING, 1);
   3161 
   3162   if( nArg<2 ){
   3163     rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
   3164   } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
   3165     /* An update:
   3166      * ppArg[0] = old rowid
   3167      * ppArg[1] = new rowid
   3168      * ppArg[2..2+v->nColumn-1] = values
   3169      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
   3170      */
   3171     sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
   3172     if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
   3173       sqlite3_value_int64(ppArg[1]) != rowid ){
   3174       rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
   3175     } else {
   3176       assert( nArg==2+v->nColumn+1);
   3177       rc = index_update(v, rowid, &ppArg[2], &terms);
   3178     }
   3179   } else {
   3180     /* An insert:
   3181      * ppArg[1] = requested rowid
   3182      * ppArg[2..2+v->nColumn-1] = values
   3183      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
   3184      */
   3185     assert( nArg==2+v->nColumn+1);
   3186     rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
   3187   }
   3188 
   3189   if( rc==SQLITE_OK ){
   3190     /* Write updated doclists to disk. */
   3191     for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
   3192       DocList *p = fts1HashData(e);
   3193       rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
   3194       if( rc!=SQLITE_OK ) break;
   3195     }
   3196   }
   3197 
   3198   /* clean up */
   3199   for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
   3200     DocList *p = fts1HashData(e);
   3201     docListDelete(p);
   3202   }
   3203   fts1HashClear(&terms);
   3204 
   3205   return rc;
   3206 }
   3207 
   3208 /*
   3209 ** Implementation of the snippet() function for FTS1
   3210 */
   3211 static void snippetFunc(
   3212   sqlite3_context *pContext,
   3213   int argc,
   3214   sqlite3_value **argv
   3215 ){
   3216   fulltext_cursor *pCursor;
   3217   if( argc<1 ) return;
   3218   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
   3219       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
   3220     sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
   3221   }else{
   3222     const char *zStart = "<b>";
   3223     const char *zEnd = "</b>";
   3224     const char *zEllipsis = "<b>...</b>";
   3225     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
   3226     if( argc>=2 ){
   3227       zStart = (const char*)sqlite3_value_text(argv[1]);
   3228       if( argc>=3 ){
   3229         zEnd = (const char*)sqlite3_value_text(argv[2]);
   3230         if( argc>=4 ){
   3231           zEllipsis = (const char*)sqlite3_value_text(argv[3]);
   3232         }
   3233       }
   3234     }
   3235     snippetAllOffsets(pCursor);
   3236     snippetText(pCursor, zStart, zEnd, zEllipsis);
   3237     sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
   3238                         pCursor->snippet.nSnippet, SQLITE_STATIC);
   3239   }
   3240 }
   3241 
   3242 /*
   3243 ** Implementation of the offsets() function for FTS1
   3244 */
   3245 static void snippetOffsetsFunc(
   3246   sqlite3_context *pContext,
   3247   int argc,
   3248   sqlite3_value **argv
   3249 ){
   3250   fulltext_cursor *pCursor;
   3251   if( argc<1 ) return;
   3252   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
   3253       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
   3254     sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
   3255   }else{
   3256     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
   3257     snippetAllOffsets(pCursor);
   3258     snippetOffsetText(&pCursor->snippet);
   3259     sqlite3_result_text(pContext,
   3260                         pCursor->snippet.zOffset, pCursor->snippet.nOffset,
   3261                         SQLITE_STATIC);
   3262   }
   3263 }
   3264 
   3265 /*
   3266 ** This routine implements the xFindFunction method for the FTS1
   3267 ** virtual table.
   3268 */
   3269 static int fulltextFindFunction(
   3270   sqlite3_vtab *pVtab,
   3271   int nArg,
   3272   const char *zName,
   3273   void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
   3274   void **ppArg
   3275 ){
   3276   if( strcmp(zName,"snippet")==0 ){
   3277     *pxFunc = snippetFunc;
   3278     return 1;
   3279   }else if( strcmp(zName,"offsets")==0 ){
   3280     *pxFunc = snippetOffsetsFunc;
   3281     return 1;
   3282   }
   3283   return 0;
   3284 }
   3285 
   3286 /*
   3287 ** Rename an fts1 table.
   3288 */
   3289 static int fulltextRename(
   3290   sqlite3_vtab *pVtab,
   3291   const char *zName
   3292 ){
   3293   fulltext_vtab *p = (fulltext_vtab *)pVtab;
   3294   int rc = SQLITE_NOMEM;
   3295   char *zSql = sqlite3_mprintf(
   3296     "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
   3297     "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
   3298     , p->zDb, p->zName, zName
   3299     , p->zDb, p->zName, zName
   3300   );
   3301   if( zSql ){
   3302     rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
   3303     sqlite3_free(zSql);
   3304   }
   3305   return rc;
   3306 }
   3307 
   3308 static const sqlite3_module fulltextModule = {
   3309   /* iVersion      */ 0,
   3310   /* xCreate       */ fulltextCreate,
   3311   /* xConnect      */ fulltextConnect,
   3312   /* xBestIndex    */ fulltextBestIndex,
   3313   /* xDisconnect   */ fulltextDisconnect,
   3314   /* xDestroy      */ fulltextDestroy,
   3315   /* xOpen         */ fulltextOpen,
   3316   /* xClose        */ fulltextClose,
   3317   /* xFilter       */ fulltextFilter,
   3318   /* xNext         */ fulltextNext,
   3319   /* xEof          */ fulltextEof,
   3320   /* xColumn       */ fulltextColumn,
   3321   /* xRowid        */ fulltextRowid,
   3322   /* xUpdate       */ fulltextUpdate,
   3323   /* xBegin        */ 0,
   3324   /* xSync         */ 0,
   3325   /* xCommit       */ 0,
   3326   /* xRollback     */ 0,
   3327   /* xFindFunction */ fulltextFindFunction,
   3328   /* xRename       */ fulltextRename,
   3329 };
   3330 
   3331 int sqlite3Fts1Init(sqlite3 *db){
   3332   sqlite3_overload_function(db, "snippet", -1);
   3333   sqlite3_overload_function(db, "offsets", -1);
   3334   return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
   3335 }
   3336 
   3337 #if !SQLITE_CORE
   3338 int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
   3339                            const sqlite3_api_routines *pApi){
   3340   SQLITE_EXTENSION_INIT2(pApi)
   3341   return sqlite3Fts1Init(db);
   3342 }
   3343 #endif
   3344 
   3345 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
   3346