Home | History | Annotate | Download | only in fts3
      1 /*
      2 ** 2009 Oct 23
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 **
     11 ******************************************************************************
     12 */
     13 
     14 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
     15 
     16 #include "fts3Int.h"
     17 #include <string.h>
     18 #include <assert.h>
     19 
/*
** Characters that may appear in the second argument to matchinfo(). The
** trailing comment on each line records how many values are associated
** with that character, expressed in terms of the number of table columns
** (nCol) and the number of query phrases (nPhrase).
*/
#define FTS3_MATCHINFO_NPHRASE   'p'        /* 1 value */
#define FTS3_MATCHINFO_NCOL      'c'        /* 1 value */
#define FTS3_MATCHINFO_NDOC      'n'        /* 1 value */
#define FTS3_MATCHINFO_AVGLENGTH 'a'        /* nCol values */
#define FTS3_MATCHINFO_LENGTH    'l'        /* nCol values */
#define FTS3_MATCHINFO_LCS       's'        /* nCol values */
#define FTS3_MATCHINFO_HITS      'x'        /* 3*nCol*nPhrase values */

/*
** The default value for the second argument to matchinfo(). It is the
** concatenation of FTS3_MATCHINFO_NPHRASE, FTS3_MATCHINFO_NCOL and
** FTS3_MATCHINFO_HITS.
*/
#define FTS3_MATCHINFO_DEFAULT   "pcx"
     35 
     36 
/*
** Used as an fts3ExprIterate() context when loading phrase doclists to
** Fts3Expr.aDoclist[]/nDoclist.
**
** fts3ExprLoadDoclistsCb() increments nPhrase once per phrase visited and
** adds the token count of each visited phrase to nToken. The totals are
** handed back to the caller by fts3ExprLoadDoclists().
*/
typedef struct LoadDoclistCtx LoadDoclistCtx;
struct LoadDoclistCtx {
  Fts3Cursor *pCsr;               /* FTS3 Cursor */
  int nPhrase;                    /* Number of phrases seen so far */
  int nToken;                     /* Number of tokens seen so far */
};
     47 
/*
** The following types are used as part of the implementation of the
** fts3BestSnippet() routine.
*/
typedef struct SnippetIter SnippetIter;
typedef struct SnippetPhrase SnippetPhrase;
typedef struct SnippetFragment SnippetFragment;

/*
** Iterator over the candidate snippets of a single column.
** fts3BestSnippet() creates it with iCurrent set to -1, and
** fts3SnippetNextCandidate() steps it from one candidate to the next.
*/
struct SnippetIter {
  Fts3Cursor *pCsr;               /* Cursor snippet is being generated from */
  int iCol;                       /* Extract snippet from this column */
  int nSnippet;                   /* Requested snippet length (in tokens) */
  int nPhrase;                    /* Number of phrases in query */
  SnippetPhrase *aPhrase;         /* Array of size nPhrase */
  int iCurrent;                   /* First token of current snippet. Negative
                                  ** until the first candidate is selected */
};
     64 
/*
** Per-phrase state of a SnippetIter. It holds two cursors into the phrase's
** position list for the column being examined:
**
**   * the "head" cursor (iHead/pHead), which fts3SnippetNextCandidate()
**     keeps at the first position beyond the end of the current candidate;
**
**   * the "tail" cursor (iTail/pTail), which fts3SnippetNextCandidate()
**     advances to the first position at or after the start of the
**     current candidate.
**
** fts3SnippetAdvance() sets a cursor's pointer to NULL (and its position to
** -1) once the position list is exhausted. All pointers remain NULL if the
** phrase has no position list for the column (see
** fts3SnippetFindPositions()).
*/
struct SnippetPhrase {
  int nToken;                     /* Number of tokens in phrase */
  char *pList;                    /* Pointer to start of phrase position list */
  int iHead;                      /* Next value in position list */
  char *pHead;                    /* Position list data following iHead */
  int iTail;                      /* Next value in trailing position list */
  char *pTail;                    /* Position list data following iTail */
};
     73 
/*
** Description of one selected snippet, as filled in by fts3BestSnippet().
**
** Bits of "covered" are indexed by phrase number. Bit k of "hlmask" refers
** to the token at offset k from iPos (token iPos+k of the column).
*/
struct SnippetFragment {
  int iCol;                       /* Column snippet is extracted from */
  int iPos;                       /* Index of first token in snippet */
  u64 covered;                    /* Mask of query phrases covered */
  u64 hlmask;                     /* Mask of snippet terms to highlight */
};
     80 
/*
** This type is used as an fts3ExprIterate() context object while
** accumulating the data returned by the matchinfo() function.
*/
typedef struct MatchInfo MatchInfo;
struct MatchInfo {
  Fts3Cursor *pCursor;            /* FTS3 Cursor */
  int nCol;                       /* Number of columns in table */
  int nPhrase;                    /* Number of matchable phrases in query */
  sqlite3_int64 nDoc;             /* Number of docs in database */
  u32 *aMatchinfo;                /* Pre-allocated buffer of 32-bit values */
};
     93 
     94 
     95 
/*
** The snippet() and offsets() functions both return text values. An instance
** of the following structure is used to accumulate those values while the
** functions are running. See fts3StringAppend() for details.
**
** fts3StringAppend() grows the buffer with sqlite3_realloc() as required
** and writes a nul-terminator after the accumulated text on every
** successful append.
*/
typedef struct StrBuffer StrBuffer;
struct StrBuffer {
  char *z;                        /* Pointer to buffer containing string */
  int n;                          /* Length of z in bytes (excl. nul-term) */
  int nAlloc;                     /* Allocated size of buffer z in bytes */
};
    107 
    108 
    109 /*
    110 ** This function is used to help iterate through a position-list. A position
    111 ** list is a list of unique integers, sorted from smallest to largest. Each
    112 ** element of the list is represented by an FTS3 varint that takes the value
    113 ** of the difference between the current element and the previous one plus
    114 ** two. For example, to store the position-list:
    115 **
    116 **     4 9 113
    117 **
    118 ** the three varints:
    119 **
    120 **     6 7 106
    121 **
    122 ** are encoded.
    123 **
    124 ** When this function is called, *pp points to the start of an element of
    125 ** the list. *piPos contains the value of the previous entry in the list.
    126 ** After it returns, *piPos contains the value of the next element of the
    127 ** list and *pp is advanced to the following varint.
    128 */
    129 static void fts3GetDeltaPosition(char **pp, int *piPos){
    130   int iVal;
    131   *pp += sqlite3Fts3GetVarint32(*pp, &iVal);
    132   *piPos += (iVal-2);
    133 }
    134 
    135 /*
    136 ** Helper function for fts3ExprIterate() (see below).
    137 */
    138 static int fts3ExprIterate2(
    139   Fts3Expr *pExpr,                /* Expression to iterate phrases of */
    140   int *piPhrase,                  /* Pointer to phrase counter */
    141   int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
    142   void *pCtx                      /* Second argument to pass to callback */
    143 ){
    144   int rc;                         /* Return code */
    145   int eType = pExpr->eType;       /* Type of expression node pExpr */
    146 
    147   if( eType!=FTSQUERY_PHRASE ){
    148     assert( pExpr->pLeft && pExpr->pRight );
    149     rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
    150     if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
    151       rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
    152     }
    153   }else{
    154     rc = x(pExpr, *piPhrase, pCtx);
    155     (*piPhrase)++;
    156   }
    157   return rc;
    158 }
    159 
    160 /*
    161 ** Iterate through all phrase nodes in an FTS3 query, except those that
    162 ** are part of a sub-tree that is the right-hand-side of a NOT operator.
    163 ** For each phrase node found, the supplied callback function is invoked.
    164 **
    165 ** If the callback function returns anything other than SQLITE_OK,
    166 ** the iteration is abandoned and the error code returned immediately.
    167 ** Otherwise, SQLITE_OK is returned after a callback has been made for
    168 ** all eligible phrase nodes.
    169 */
    170 static int fts3ExprIterate(
    171   Fts3Expr *pExpr,                /* Expression to iterate phrases of */
    172   int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
    173   void *pCtx                      /* Second argument to pass to callback */
    174 ){
    175   int iPhrase = 0;                /* Variable used as the phrase counter */
    176   return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
    177 }
    178 
    179 /*
    180 ** The argument to this function is always a phrase node. Its doclist
    181 ** (Fts3Expr.aDoclist[]) and the doclists associated with all phrase nodes
    182 ** to the left of this one in the query tree have already been loaded.
    183 **
    184 ** If this phrase node is part of a series of phrase nodes joined by
    185 ** NEAR operators (and is not the left-most of said series), then elements are
    186 ** removed from the phrases doclist consistent with the NEAR restriction. If
    187 ** required, elements may be removed from the doclists of phrases to the
    188 ** left of this one that are part of the same series of NEAR operator
    189 ** connected phrases.
    190 **
    191 ** If an OOM error occurs, SQLITE_NOMEM is returned. Otherwise, SQLITE_OK.
    192 */
    193 static int fts3ExprNearTrim(Fts3Expr *pExpr){
    194   int rc = SQLITE_OK;
    195   Fts3Expr *pParent = pExpr->pParent;
    196 
    197   assert( pExpr->eType==FTSQUERY_PHRASE );
    198   while( rc==SQLITE_OK
    199    && pParent
    200    && pParent->eType==FTSQUERY_NEAR
    201    && pParent->pRight==pExpr
    202   ){
    203     /* This expression (pExpr) is the right-hand-side of a NEAR operator.
    204     ** Find the expression to the left of the same operator.
    205     */
    206     int nNear = pParent->nNear;
    207     Fts3Expr *pLeft = pParent->pLeft;
    208 
    209     if( pLeft->eType!=FTSQUERY_PHRASE ){
    210       assert( pLeft->eType==FTSQUERY_NEAR );
    211       assert( pLeft->pRight->eType==FTSQUERY_PHRASE );
    212       pLeft = pLeft->pRight;
    213     }
    214 
    215     rc = sqlite3Fts3ExprNearTrim(pLeft, pExpr, nNear);
    216 
    217     pExpr = pLeft;
    218     pParent = pExpr->pParent;
    219   }
    220 
    221   return rc;
    222 }
    223 
    224 /*
    225 ** This is an fts3ExprIterate() callback used while loading the doclists
    226 ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
    227 ** fts3ExprLoadDoclists().
    228 */
    229 static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
    230   int rc = SQLITE_OK;
    231   LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
    232 
    233   UNUSED_PARAMETER(iPhrase);
    234 
    235   p->nPhrase++;
    236   p->nToken += pExpr->pPhrase->nToken;
    237 
    238   if( pExpr->isLoaded==0 ){
    239     rc = sqlite3Fts3ExprLoadDoclist(p->pCsr, pExpr);
    240     pExpr->isLoaded = 1;
    241     if( rc==SQLITE_OK ){
    242       rc = fts3ExprNearTrim(pExpr);
    243     }
    244   }
    245 
    246   return rc;
    247 }
    248 
    249 /*
    250 ** Load the doclists for each phrase in the query associated with FTS3 cursor
    251 ** pCsr.
    252 **
    253 ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
    254 ** phrases in the expression (all phrases except those directly or
    255 ** indirectly descended from the right-hand-side of a NOT operator). If
    256 ** pnToken is not NULL, then it is set to the number of tokens in all
    257 ** matchable phrases of the expression.
    258 */
    259 static int fts3ExprLoadDoclists(
    260   Fts3Cursor *pCsr,               /* Fts3 cursor for current query */
    261   int *pnPhrase,                  /* OUT: Number of phrases in query */
    262   int *pnToken                    /* OUT: Number of tokens in query */
    263 ){
    264   int rc;                         /* Return Code */
    265   LoadDoclistCtx sCtx = {0,0,0};  /* Context for fts3ExprIterate() */
    266   sCtx.pCsr = pCsr;
    267   rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
    268   if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
    269   if( pnToken ) *pnToken = sCtx.nToken;
    270   return rc;
    271 }
    272 
    273 static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
    274   (*(int *)ctx)++;
    275   UNUSED_PARAMETER(pExpr);
    276   UNUSED_PARAMETER(iPhrase);
    277   return SQLITE_OK;
    278 }
    279 static int fts3ExprPhraseCount(Fts3Expr *pExpr){
    280   int nPhrase = 0;
    281   (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase);
    282   return nPhrase;
    283 }
    284 
    285 /*
    286 ** Advance the position list iterator specified by the first two
    287 ** arguments so that it points to the first element with a value greater
    288 ** than or equal to parameter iNext.
    289 */
    290 static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
    291   char *pIter = *ppIter;
    292   if( pIter ){
    293     int iIter = *piIter;
    294 
    295     while( iIter<iNext ){
    296       if( 0==(*pIter & 0xFE) ){
    297         iIter = -1;
    298         pIter = 0;
    299         break;
    300       }
    301       fts3GetDeltaPosition(&pIter, &iIter);
    302     }
    303 
    304     *piIter = iIter;
    305     *ppIter = pIter;
    306   }
    307 }
    308 
    309 /*
    310 ** Advance the snippet iterator to the next candidate snippet.
    311 */
    312 static int fts3SnippetNextCandidate(SnippetIter *pIter){
    313   int i;                          /* Loop counter */
    314 
    315   if( pIter->iCurrent<0 ){
    316     /* The SnippetIter object has just been initialized. The first snippet
    317     ** candidate always starts at offset 0 (even if this candidate has a
    318     ** score of 0.0).
    319     */
    320     pIter->iCurrent = 0;
    321 
    322     /* Advance the 'head' iterator of each phrase to the first offset that
    323     ** is greater than or equal to (iNext+nSnippet).
    324     */
    325     for(i=0; i<pIter->nPhrase; i++){
    326       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
    327       fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet);
    328     }
    329   }else{
    330     int iStart;
    331     int iEnd = 0x7FFFFFFF;
    332 
    333     for(i=0; i<pIter->nPhrase; i++){
    334       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
    335       if( pPhrase->pHead && pPhrase->iHead<iEnd ){
    336         iEnd = pPhrase->iHead;
    337       }
    338     }
    339     if( iEnd==0x7FFFFFFF ){
    340       return 1;
    341     }
    342 
    343     pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1;
    344     for(i=0; i<pIter->nPhrase; i++){
    345       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
    346       fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1);
    347       fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart);
    348     }
    349   }
    350 
    351   return 0;
    352 }
    353 
    354 /*
    355 ** Retrieve information about the current candidate snippet of snippet
    356 ** iterator pIter.
    357 */
    358 static void fts3SnippetDetails(
    359   SnippetIter *pIter,             /* Snippet iterator */
    360   u64 mCovered,                   /* Bitmask of phrases already covered */
    361   int *piToken,                   /* OUT: First token of proposed snippet */
    362   int *piScore,                   /* OUT: "Score" for this snippet */
    363   u64 *pmCover,                   /* OUT: Bitmask of phrases covered */
    364   u64 *pmHighlight                /* OUT: Bitmask of terms to highlight */
    365 ){
    366   int iStart = pIter->iCurrent;   /* First token of snippet */
    367   int iScore = 0;                 /* Score of this snippet */
    368   int i;                          /* Loop counter */
    369   u64 mCover = 0;                 /* Mask of phrases covered by this snippet */
    370   u64 mHighlight = 0;             /* Mask of tokens to highlight in snippet */
    371 
    372   for(i=0; i<pIter->nPhrase; i++){
    373     SnippetPhrase *pPhrase = &pIter->aPhrase[i];
    374     if( pPhrase->pTail ){
    375       char *pCsr = pPhrase->pTail;
    376       int iCsr = pPhrase->iTail;
    377 
    378       while( iCsr<(iStart+pIter->nSnippet) ){
    379         int j;
    380         u64 mPhrase = (u64)1 << i;
    381         u64 mPos = (u64)1 << (iCsr - iStart);
    382         assert( iCsr>=iStart );
    383         if( (mCover|mCovered)&mPhrase ){
    384           iScore++;
    385         }else{
    386           iScore += 1000;
    387         }
    388         mCover |= mPhrase;
    389 
    390         for(j=0; j<pPhrase->nToken; j++){
    391           mHighlight |= (mPos>>j);
    392         }
    393 
    394         if( 0==(*pCsr & 0x0FE) ) break;
    395         fts3GetDeltaPosition(&pCsr, &iCsr);
    396       }
    397     }
    398   }
    399 
    400   /* Set the output variables before returning. */
    401   *piToken = iStart;
    402   *piScore = iScore;
    403   *pmCover = mCover;
    404   *pmHighlight = mHighlight;
    405 }
    406 
    407 /*
    408 ** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
    409 ** Each invocation populates an element of the SnippetIter.aPhrase[] array.
    410 */
    411 static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
    412   SnippetIter *p = (SnippetIter *)ctx;
    413   SnippetPhrase *pPhrase = &p->aPhrase[iPhrase];
    414   char *pCsr;
    415 
    416   pPhrase->nToken = pExpr->pPhrase->nToken;
    417 
    418   pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol);
    419   if( pCsr ){
    420     int iFirst = 0;
    421     pPhrase->pList = pCsr;
    422     fts3GetDeltaPosition(&pCsr, &iFirst);
    423     pPhrase->pHead = pCsr;
    424     pPhrase->pTail = pCsr;
    425     pPhrase->iHead = iFirst;
    426     pPhrase->iTail = iFirst;
    427   }else{
    428     assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 );
    429   }
    430 
    431   return SQLITE_OK;
    432 }
    433 
    434 /*
    435 ** Select the fragment of text consisting of nFragment contiguous tokens
    436 ** from column iCol that represent the "best" snippet. The best snippet
    437 ** is the snippet with the highest score, where scores are calculated
    438 ** by adding:
    439 **
    440 **   (a) +1 point for each occurence of a matchable phrase in the snippet.
    441 **
    442 **   (b) +1000 points for the first occurence of each matchable phrase in
    443 **       the snippet for which the corresponding mCovered bit is not set.
    444 **
    445 ** The selected snippet parameters are stored in structure *pFragment before
    446 ** returning. The score of the selected snippet is stored in *piScore
    447 ** before returning.
    448 */
    449 static int fts3BestSnippet(
    450   int nSnippet,                   /* Desired snippet length */
    451   Fts3Cursor *pCsr,               /* Cursor to create snippet for */
    452   int iCol,                       /* Index of column to create snippet from */
    453   u64 mCovered,                   /* Mask of phrases already covered */
    454   u64 *pmSeen,                    /* IN/OUT: Mask of phrases seen */
    455   SnippetFragment *pFragment,     /* OUT: Best snippet found */
    456   int *piScore                    /* OUT: Score of snippet pFragment */
    457 ){
    458   int rc;                         /* Return Code */
    459   int nList;                      /* Number of phrases in expression */
    460   SnippetIter sIter;              /* Iterates through snippet candidates */
    461   int nByte;                      /* Number of bytes of space to allocate */
    462   int iBestScore = -1;            /* Best snippet score found so far */
    463   int i;                          /* Loop counter */
    464 
    465   memset(&sIter, 0, sizeof(sIter));
    466 
    467   /* Iterate through the phrases in the expression to count them. The same
    468   ** callback makes sure the doclists are loaded for each phrase.
    469   */
    470   rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
    471   if( rc!=SQLITE_OK ){
    472     return rc;
    473   }
    474 
    475   /* Now that it is known how many phrases there are, allocate and zero
    476   ** the required space using malloc().
    477   */
    478   nByte = sizeof(SnippetPhrase) * nList;
    479   sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
    480   if( !sIter.aPhrase ){
    481     return SQLITE_NOMEM;
    482   }
    483   memset(sIter.aPhrase, 0, nByte);
    484 
    485   /* Initialize the contents of the SnippetIter object. Then iterate through
    486   ** the set of phrases in the expression to populate the aPhrase[] array.
    487   */
    488   sIter.pCsr = pCsr;
    489   sIter.iCol = iCol;
    490   sIter.nSnippet = nSnippet;
    491   sIter.nPhrase = nList;
    492   sIter.iCurrent = -1;
    493   (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
    494 
    495   /* Set the *pmSeen output variable. */
    496   for(i=0; i<nList; i++){
    497     if( sIter.aPhrase[i].pHead ){
    498       *pmSeen |= (u64)1 << i;
    499     }
    500   }
    501 
    502   /* Loop through all candidate snippets. Store the best snippet in
    503   ** *pFragment. Store its associated 'score' in iBestScore.
    504   */
    505   pFragment->iCol = iCol;
    506   while( !fts3SnippetNextCandidate(&sIter) ){
    507     int iPos;
    508     int iScore;
    509     u64 mCover;
    510     u64 mHighlight;
    511     fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
    512     assert( iScore>=0 );
    513     if( iScore>iBestScore ){
    514       pFragment->iPos = iPos;
    515       pFragment->hlmask = mHighlight;
    516       pFragment->covered = mCover;
    517       iBestScore = iScore;
    518     }
    519   }
    520 
    521   sqlite3_free(sIter.aPhrase);
    522   *piScore = iBestScore;
    523   return SQLITE_OK;
    524 }
    525 
    526 
    527 /*
    528 ** Append a string to the string-buffer passed as the first argument.
    529 **
    530 ** If nAppend is negative, then the length of the string zAppend is
    531 ** determined using strlen().
    532 */
    533 static int fts3StringAppend(
    534   StrBuffer *pStr,                /* Buffer to append to */
    535   const char *zAppend,            /* Pointer to data to append to buffer */
    536   int nAppend                     /* Size of zAppend in bytes (or -1) */
    537 ){
    538   if( nAppend<0 ){
    539     nAppend = (int)strlen(zAppend);
    540   }
    541 
    542   /* If there is insufficient space allocated at StrBuffer.z, use realloc()
    543   ** to grow the buffer until so that it is big enough to accomadate the
    544   ** appended data.
    545   */
    546   if( pStr->n+nAppend+1>=pStr->nAlloc ){
    547     int nAlloc = pStr->nAlloc+nAppend+100;
    548     char *zNew = sqlite3_realloc(pStr->z, nAlloc);
    549     if( !zNew ){
    550       return SQLITE_NOMEM;
    551     }
    552     pStr->z = zNew;
    553     pStr->nAlloc = nAlloc;
    554   }
    555 
    556   /* Append the data to the string buffer. */
    557   memcpy(&pStr->z[pStr->n], zAppend, nAppend);
    558   pStr->n += nAppend;
    559   pStr->z[pStr->n] = '\0';
    560 
    561   return SQLITE_OK;
    562 }
    563 
    564 /*
    565 ** The fts3BestSnippet() function often selects snippets that end with a
    566 ** query term. That is, the final term of the snippet is always a term
    567 ** that requires highlighting. For example, if 'X' is a highlighted term
    568 ** and '.' is a non-highlighted term, BestSnippet() may select:
    569 **
    570 **     ........X.....X
    571 **
    572 ** This function "shifts" the beginning of the snippet forward in the
    573 ** document so that there are approximately the same number of
    574 ** non-highlighted terms to the right of the final highlighted term as there
    575 ** are to the left of the first highlighted term. For example, to this:
    576 **
    577 **     ....X.....X....
    578 **
    579 ** This is done as part of extracting the snippet text, not when selecting
    580 ** the snippet. Snippet selection is done based on doclists only, so there
    581 ** is no way for fts3BestSnippet() to know whether or not the document
    582 ** actually contains terms that follow the final highlighted term.
    583 */
    584 static int fts3SnippetShift(
    585   Fts3Table *pTab,                /* FTS3 table snippet comes from */
    586   int nSnippet,                   /* Number of tokens desired for snippet */
    587   const char *zDoc,               /* Document text to extract snippet from */
    588   int nDoc,                       /* Size of buffer zDoc in bytes */
    589   int *piPos,                     /* IN/OUT: First token of snippet */
    590   u64 *pHlmask                    /* IN/OUT: Mask of tokens to highlight */
    591 ){
    592   u64 hlmask = *pHlmask;          /* Local copy of initial highlight-mask */
    593 
    594   if( hlmask ){
    595     int nLeft;                    /* Tokens to the left of first highlight */
    596     int nRight;                   /* Tokens to the right of last highlight */
    597     int nDesired;                 /* Ideal number of tokens to shift forward */
    598 
    599     for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
    600     for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
    601     nDesired = (nLeft-nRight)/2;
    602 
    603     /* Ideally, the start of the snippet should be pushed forward in the
    604     ** document nDesired tokens. This block checks if there are actually
    605     ** nDesired tokens to the right of the snippet. If so, *piPos and
    606     ** *pHlMask are updated to shift the snippet nDesired tokens to the
    607     ** right. Otherwise, the snippet is shifted by the number of tokens
    608     ** available.
    609     */
    610     if( nDesired>0 ){
    611       int nShift;                 /* Number of tokens to shift snippet by */
    612       int iCurrent = 0;           /* Token counter */
    613       int rc;                     /* Return Code */
    614       sqlite3_tokenizer_module *pMod;
    615       sqlite3_tokenizer_cursor *pC;
    616       pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
    617 
    618       /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
    619       ** or more tokens in zDoc/nDoc.
    620       */
    621       rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
    622       if( rc!=SQLITE_OK ){
    623         return rc;
    624       }
    625       pC->pTokenizer = pTab->pTokenizer;
    626       while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
    627         const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
    628         rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
    629       }
    630       pMod->xClose(pC);
    631       if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
    632 
    633       nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
    634       assert( nShift<=nDesired );
    635       if( nShift>0 ){
    636         *piPos += nShift;
    637         *pHlmask = hlmask >> nShift;
    638       }
    639     }
    640   }
    641   return SQLITE_OK;
    642 }
    643 
/*
** Extract the snippet text for fragment pFragment from cursor pCsr and
** append it to string buffer pOut.
**
** The text is read from column (pFragment->iCol+1) of statement
** pCsr->pStmt and tokenized with the table's tokenizer. Tokens before the
** start of the snippet are skipped. Once the first token of the snippet is
** reached, fts3SnippetShift() is given the chance to move the snippet
** forward. Each token of the snippet is then appended to pOut, preceded by
** the document text that separates it from the previous token and, if its
** bit is set in the highlight mask, wrapped in zOpen/zClose.
**
** zEllipsis is appended before the fragment if the fragment does not start
** at token 0 or is not the first fragment, and after the fragment if
** isLast is true and the document continues beyond the snippet.
**
** Returns SQLITE_OK on success (including when the column value is NULL,
** in which case nothing is appended), SQLITE_NOMEM if the column text
** cannot be obtained, or an error code from the tokenizer or from
** fts3StringAppend().
*/
static int fts3SnippetText(
  Fts3Cursor *pCsr,               /* FTS3 Cursor */
  SnippetFragment *pFragment,     /* Snippet to extract */
  int iFragment,                  /* Fragment number */
  int isLast,                     /* True for final fragment in snippet */
  int nSnippet,                   /* Number of tokens in extracted snippet */
  const char *zOpen,              /* String inserted before highlighted term */
  const char *zClose,             /* String inserted after highlighted term */
  const char *zEllipsis,          /* String inserted between snippets */
  StrBuffer *pOut                 /* Write output here */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc;                         /* Return code */
  const char *zDoc;               /* Document text to extract snippet from */
  int nDoc;                       /* Size of zDoc in bytes */
  int iCurrent = 0;               /* Current token number of document */
  int iEnd = 0;                   /* Byte offset of end of current token */
  int isShiftDone = 0;            /* True after snippet is shifted */
  int iPos = pFragment->iPos;     /* First token of snippet */
  u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
  int iCol = pFragment->iCol+1;   /* Query column to extract text from */
  sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  sqlite3_tokenizer_cursor *pC;   /* Tokenizer cursor open on zDoc/nDoc */
  const char *ZDUMMY;             /* Dummy argument used with tokenizer */
  int DUMMY1;                     /* Dummy argument used with tokenizer */

  /* A NULL return for a column that is not SQL NULL is treated as an
  ** out-of-memory condition. A genuine NULL column yields no text. */
  zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
  if( zDoc==0 ){
    if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
      return SQLITE_NOMEM;
    }
    return SQLITE_OK;
  }
  nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);

  /* Open a token cursor on the document. */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  pC->pTokenizer = pTab->pTokenizer;

  while( rc==SQLITE_OK ){
    int iBegin;                   /* Offset in zDoc of start of token */
    int iFin;                     /* Offset in zDoc of end of token */
    int isHighlight;              /* True for highlighted terms */

    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_DONE ){
        /* Special case - the last token of the snippet is also the last token
        ** of the column. Append any punctuation that occurred between the end
        ** of the previous token and the end of the document to the output.
        ** Then break out of the loop. */
        rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
      }
      break;
    }

    /* Skip tokens that precede the snippet. */
    if( iCurrent<iPos ){ continue; }

    if( !isShiftDone ){
      /* First token at or after iPos: give fts3SnippetShift() the text
      ** from this token onwards so that it can move iPos forward. */
      int n = nDoc - iBegin;
      rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask);
      isShiftDone = 1;

      /* Now that the shift has been done, check if the initial "..." are
      ** required. They are required if (a) this is not the first fragment,
      ** or (b) this fragment does not begin at position 0 of its column.
      */
      if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
        rc = fts3StringAppend(pOut, zEllipsis, -1);
      }

      /* On error the loop condition ends the loop. Otherwise, if iPos was
      ** moved forward, keep skipping tokens until it is reached. */
      if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
    }

    /* This token lies beyond the end of the snippet. The document continues
    ** past the snippet, so the final fragment gets a trailing ellipsis. */
    if( iCurrent>=(iPos+nSnippet) ){
      if( isLast ){
        rc = fts3StringAppend(pOut, zEllipsis, -1);
      }
      break;
    }

    /* Set isHighlight to true if this term should be highlighted. */
    isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;

    /* For every token but the first, copy the text between the previous
    ** token and this one. Then emit the token itself, wrapped in the
    ** highlight markers if required. */
    if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
    if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
    if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
    if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);

    iEnd = iFin;
  }

  pMod->xClose(pC);
  return rc;
}
    745 
    746 
    747 /*
    748 ** This function is used to count the entries in a column-list (a
    749 ** delta-encoded list of term offsets within a single column of a single
    750 ** row). When this function is called, *ppCollist should point to the
    751 ** beginning of the first varint in the column-list (the varint that
    752 ** contains the position of the first matching term in the column data).
    753 ** Before returning, *ppCollist is set to point to the first byte after
    754 ** the last varint in the column-list (either the 0x00 signifying the end
    755 ** of the position-list, or the 0x01 that precedes the column number of
    756 ** the next column in the position-list).
    757 **
    758 ** The number of elements in the column-list is returned.
    759 */
    760 static int fts3ColumnlistCount(char **ppCollist){
    761   char *pEnd = *ppCollist;
    762   char c = 0;
    763   int nEntry = 0;
    764 
    765   /* A column-list is terminated by either a 0x01 or 0x00. */
    766   while( 0xFE & (*pEnd | c) ){
    767     c = *pEnd++ & 0x80;
    768     if( !c ) nEntry++;
    769   }
    770 
    771   *ppCollist = pEnd;
    772   return nEntry;
    773 }
    774 
    775 static void fts3LoadColumnlistCounts(char **pp, u32 *aOut, int isGlobal){
    776   char *pCsr = *pp;
    777   while( *pCsr ){
    778     int nHit;
    779     sqlite3_int64 iCol = 0;
    780     if( *pCsr==0x01 ){
    781       pCsr++;
    782       pCsr += sqlite3Fts3GetVarint(pCsr, &iCol);
    783     }
    784     nHit = fts3ColumnlistCount(&pCsr);
    785     assert( nHit>0 );
    786     if( isGlobal ){
    787       aOut[iCol*3+1]++;
    788     }
    789     aOut[iCol*3] += nHit;
    790   }
    791   pCsr++;
    792   *pp = pCsr;
    793 }
    794 
    795 /*
    796 ** fts3ExprIterate() callback used to collect the "global" matchinfo stats
    797 ** for a single query.
    798 **
    799 ** fts3ExprIterate() callback to load the 'global' elements of a
    800 ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements
    801 ** of the matchinfo array that are constant for all rows returned by the
    802 ** current query.
    803 **
    804 ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
    805 ** function populates Matchinfo.aMatchinfo[] as follows:
    806 **
    807 **   for(iCol=0; iCol<nCol; iCol++){
    808 **     aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
    809 **     aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
    810 **   }
    811 **
    812 ** where X is the number of matches for phrase iPhrase is column iCol of all
    813 ** rows of the table. Y is the number of rows for which column iCol contains
    814 ** at least one instance of phrase iPhrase.
    815 **
    816 ** If the phrase pExpr consists entirely of deferred tokens, then all X and
    817 ** Y values are set to nDoc, where nDoc is the number of documents in the
    818 ** file system. This is done because the full-text index doclist is required
    819 ** to calculate these values properly, and the full-text index doclist is
    820 ** not available for deferred tokens.
    821 */
    822 static int fts3ExprGlobalHitsCb(
    823   Fts3Expr *pExpr,                /* Phrase expression node */
    824   int iPhrase,                    /* Phrase number (numbered from zero) */
    825   void *pCtx                      /* Pointer to MatchInfo structure */
    826 ){
    827   MatchInfo *p = (MatchInfo *)pCtx;
    828   Fts3Cursor *pCsr = p->pCursor;
    829   char *pIter;
    830   char *pEnd;
    831   char *pFree = 0;
    832   u32 *aOut = &p->aMatchinfo[3*iPhrase*p->nCol];
    833 
    834   assert( pExpr->isLoaded );
    835   assert( pExpr->eType==FTSQUERY_PHRASE );
    836 
    837   if( pCsr->pDeferred ){
    838     Fts3Phrase *pPhrase = pExpr->pPhrase;
    839     int ii;
    840     for(ii=0; ii<pPhrase->nToken; ii++){
    841       if( pPhrase->aToken[ii].bFulltext ) break;
    842     }
    843     if( ii<pPhrase->nToken ){
    844       int nFree = 0;
    845       int rc = sqlite3Fts3ExprLoadFtDoclist(pCsr, pExpr, &pFree, &nFree);
    846       if( rc!=SQLITE_OK ) return rc;
    847       pIter = pFree;
    848       pEnd = &pFree[nFree];
    849     }else{
    850       int iCol;                   /* Column index */
    851       for(iCol=0; iCol<p->nCol; iCol++){
    852         aOut[iCol*3 + 1] = (u32)p->nDoc;
    853         aOut[iCol*3 + 2] = (u32)p->nDoc;
    854       }
    855       return SQLITE_OK;
    856     }
    857   }else{
    858     pIter = pExpr->aDoclist;
    859     pEnd = &pExpr->aDoclist[pExpr->nDoclist];
    860   }
    861 
    862   /* Fill in the global hit count matrix row for this phrase. */
    863   while( pIter<pEnd ){
    864     while( *pIter++ & 0x80 );      /* Skip past docid. */
    865     fts3LoadColumnlistCounts(&pIter, &aOut[1], 1);
    866   }
    867 
    868   sqlite3_free(pFree);
    869   return SQLITE_OK;
    870 }
    871 
    872 /*
    873 ** fts3ExprIterate() callback used to collect the "local" part of the
    874 ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the
    875 ** array that are different for each row returned by the query.
    876 */
    877 static int fts3ExprLocalHitsCb(
    878   Fts3Expr *pExpr,                /* Phrase expression node */
    879   int iPhrase,                    /* Phrase number */
    880   void *pCtx                      /* Pointer to MatchInfo structure */
    881 ){
    882   MatchInfo *p = (MatchInfo *)pCtx;
    883   int iStart = iPhrase * p->nCol * 3;
    884   int i;
    885 
    886   for(i=0; i<p->nCol; i++) p->aMatchinfo[iStart+i*3] = 0;
    887 
    888   if( pExpr->aDoclist ){
    889     char *pCsr;
    890 
    891     pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1);
    892     if( pCsr ){
    893       fts3LoadColumnlistCounts(&pCsr, &p->aMatchinfo[iStart], 0);
    894     }
    895   }
    896 
    897   return SQLITE_OK;
    898 }
    899 
    900 static int fts3MatchinfoCheck(
    901   Fts3Table *pTab,
    902   char cArg,
    903   char **pzErr
    904 ){
    905   if( (cArg==FTS3_MATCHINFO_NPHRASE)
    906    || (cArg==FTS3_MATCHINFO_NCOL)
    907    || (cArg==FTS3_MATCHINFO_NDOC && pTab->bHasStat)
    908    || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bHasStat)
    909    || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
    910    || (cArg==FTS3_MATCHINFO_LCS)
    911    || (cArg==FTS3_MATCHINFO_HITS)
    912   ){
    913     return SQLITE_OK;
    914   }
    915   *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
    916   return SQLITE_ERROR;
    917 }
    918 
    919 static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
    920   int nVal;                       /* Number of integers output by cArg */
    921 
    922   switch( cArg ){
    923     case FTS3_MATCHINFO_NDOC:
    924     case FTS3_MATCHINFO_NPHRASE:
    925     case FTS3_MATCHINFO_NCOL:
    926       nVal = 1;
    927       break;
    928 
    929     case FTS3_MATCHINFO_AVGLENGTH:
    930     case FTS3_MATCHINFO_LENGTH:
    931     case FTS3_MATCHINFO_LCS:
    932       nVal = pInfo->nCol;
    933       break;
    934 
    935     default:
    936       assert( cArg==FTS3_MATCHINFO_HITS );
    937       nVal = pInfo->nCol * pInfo->nPhrase * 3;
    938       break;
    939   }
    940 
    941   return nVal;
    942 }
    943 
    944 static int fts3MatchinfoSelectDoctotal(
    945   Fts3Table *pTab,
    946   sqlite3_stmt **ppStmt,
    947   sqlite3_int64 *pnDoc,
    948   const char **paLen
    949 ){
    950   sqlite3_stmt *pStmt;
    951   const char *a;
    952   sqlite3_int64 nDoc;
    953 
    954   if( !*ppStmt ){
    955     int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
    956     if( rc!=SQLITE_OK ) return rc;
    957   }
    958   pStmt = *ppStmt;
    959   assert( sqlite3_data_count(pStmt)==1 );
    960 
    961   a = sqlite3_column_blob(pStmt, 0);
    962   a += sqlite3Fts3GetVarint(a, &nDoc);
    963   if( nDoc==0 ) return SQLITE_CORRUPT;
    964   *pnDoc = (u32)nDoc;
    965 
    966   if( paLen ) *paLen = a;
    967   return SQLITE_OK;
    968 }
    969 
    970 /*
    971 ** An instance of the following structure is used to store state while
    972 ** iterating through a multi-column position-list corresponding to the
    973 ** hits for a single phrase on a single row in order to calculate the
    974 ** values for a matchinfo() FTS3_MATCHINFO_LCS request.
    975 */
    976 typedef struct LcsIterator LcsIterator;
    977 struct LcsIterator {
    978   Fts3Expr *pExpr;                /* Pointer to phrase expression */
    979   char *pRead;                    /* Cursor used to iterate through aDoclist */
    980   int iPosOffset;                 /* Tokens count up to end of this phrase */
    981   int iCol;                       /* Current column number */
    982   int iPos;                       /* Current position */
    983 };
    984 
    985 /*
    986 ** If LcsIterator.iCol is set to the following value, the iterator has
    987 ** finished iterating through all offsets for all columns.
    988 */
    989 #define LCS_ITERATOR_FINISHED 0x7FFFFFFF;
    990 
    991 static int fts3MatchinfoLcsCb(
    992   Fts3Expr *pExpr,                /* Phrase expression node */
    993   int iPhrase,                    /* Phrase number (numbered from zero) */
    994   void *pCtx                      /* Pointer to MatchInfo structure */
    995 ){
    996   LcsIterator *aIter = (LcsIterator *)pCtx;
    997   aIter[iPhrase].pExpr = pExpr;
    998   return SQLITE_OK;
    999 }
   1000 
   1001 /*
   1002 ** Advance the iterator passed as an argument to the next position. Return
   1003 ** 1 if the iterator is at EOF or if it now points to the start of the
   1004 ** position list for the next column.
   1005 */
   1006 static int fts3LcsIteratorAdvance(LcsIterator *pIter){
   1007   char *pRead = pIter->pRead;
   1008   sqlite3_int64 iRead;
   1009   int rc = 0;
   1010 
   1011   pRead += sqlite3Fts3GetVarint(pRead, &iRead);
   1012   if( iRead==0 ){
   1013     pIter->iCol = LCS_ITERATOR_FINISHED;
   1014     rc = 1;
   1015   }else{
   1016     if( iRead==1 ){
   1017       pRead += sqlite3Fts3GetVarint(pRead, &iRead);
   1018       pIter->iCol = (int)iRead;
   1019       pIter->iPos = pIter->iPosOffset;
   1020       pRead += sqlite3Fts3GetVarint(pRead, &iRead);
   1021       rc = 1;
   1022     }
   1023     pIter->iPos += (int)(iRead-2);
   1024   }
   1025 
   1026   pIter->pRead = pRead;
   1027   return rc;
   1028 }
   1029 
   1030 /*
   1031 ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag.
   1032 **
   1033 ** If the call is successful, the longest-common-substring lengths for each
   1034 ** column are written into the first nCol elements of the pInfo->aMatchinfo[]
   1035 ** array before returning. SQLITE_OK is returned in this case.
   1036 **
   1037 ** Otherwise, if an error occurs, an SQLite error code is returned and the
   1038 ** data written to the first nCol elements of pInfo->aMatchinfo[] is
   1039 ** undefined.
   1040 */
   1041 static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
   1042   LcsIterator *aIter;
   1043   int i;
   1044   int iCol;
   1045   int nToken = 0;
   1046 
   1047   /* Allocate and populate the array of LcsIterator objects. The array
   1048   ** contains one element for each matchable phrase in the query.
   1049   **/
   1050   aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
   1051   if( !aIter ) return SQLITE_NOMEM;
   1052   memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
   1053   (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
   1054   for(i=0; i<pInfo->nPhrase; i++){
   1055     LcsIterator *pIter = &aIter[i];
   1056     nToken -= pIter->pExpr->pPhrase->nToken;
   1057     pIter->iPosOffset = nToken;
   1058     pIter->pRead = sqlite3Fts3FindPositions(pIter->pExpr, pCsr->iPrevId, -1);
   1059     if( pIter->pRead ){
   1060       pIter->iPos = pIter->iPosOffset;
   1061       fts3LcsIteratorAdvance(&aIter[i]);
   1062     }else{
   1063       pIter->iCol = LCS_ITERATOR_FINISHED;
   1064     }
   1065   }
   1066 
   1067   for(iCol=0; iCol<pInfo->nCol; iCol++){
   1068     int nLcs = 0;                 /* LCS value for this column */
   1069     int nLive = 0;                /* Number of iterators in aIter not at EOF */
   1070 
   1071     /* Loop through the iterators in aIter[]. Set nLive to the number of
   1072     ** iterators that point to a position-list corresponding to column iCol.
   1073     */
   1074     for(i=0; i<pInfo->nPhrase; i++){
   1075       assert( aIter[i].iCol>=iCol );
   1076       if( aIter[i].iCol==iCol ) nLive++;
   1077     }
   1078 
   1079     /* The following loop runs until all iterators in aIter[] have finished
   1080     ** iterating through positions in column iCol. Exactly one of the
   1081     ** iterators is advanced each time the body of the loop is run.
   1082     */
   1083     while( nLive>0 ){
   1084       LcsIterator *pAdv = 0;      /* The iterator to advance by one position */
   1085       int nThisLcs = 0;           /* LCS for the current iterator positions */
   1086 
   1087       for(i=0; i<pInfo->nPhrase; i++){
   1088         LcsIterator *pIter = &aIter[i];
   1089         if( iCol!=pIter->iCol ){
   1090           /* This iterator is already at EOF for this column. */
   1091           nThisLcs = 0;
   1092         }else{
   1093           if( pAdv==0 || pIter->iPos<pAdv->iPos ){
   1094             pAdv = pIter;
   1095           }
   1096           if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
   1097             nThisLcs++;
   1098           }else{
   1099             nThisLcs = 1;
   1100           }
   1101           if( nThisLcs>nLcs ) nLcs = nThisLcs;
   1102         }
   1103       }
   1104       if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
   1105     }
   1106 
   1107     pInfo->aMatchinfo[iCol] = nLcs;
   1108   }
   1109 
   1110   sqlite3_free(aIter);
   1111   return SQLITE_OK;
   1112 }
   1113 
   1114 /*
   1115 ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
   1116 ** be returned by the matchinfo() function. Argument zArg contains the
   1117 ** format string passed as the second argument to matchinfo (or the
   1118 ** default value "pcx" if no second argument was specified). The format
   1119 ** string has already been validated and the pInfo->aMatchinfo[] array
   1120 ** is guaranteed to be large enough for the output.
   1121 **
   1122 ** If bGlobal is true, then populate all fields of the matchinfo() output.
   1123 ** If it is false, then assume that those fields that do not change between
   1124 ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
   1125 ** have already been populated.
   1126 **
   1127 ** Return SQLITE_OK if successful, or an SQLite error code if an error
   1128 ** occurs. If a value other than SQLITE_OK is returned, the state the
   1129 ** pInfo->aMatchinfo[] buffer is left in is undefined.
   1130 */
   1131 static int fts3MatchinfoValues(
   1132   Fts3Cursor *pCsr,               /* FTS3 cursor object */
   1133   int bGlobal,                    /* True to grab the global stats */
   1134   MatchInfo *pInfo,               /* Matchinfo context object */
   1135   const char *zArg                /* Matchinfo format string */
   1136 ){
   1137   int rc = SQLITE_OK;
   1138   int i;
   1139   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
   1140   sqlite3_stmt *pSelect = 0;
   1141 
   1142   for(i=0; rc==SQLITE_OK && zArg[i]; i++){
   1143 
   1144     switch( zArg[i] ){
   1145       case FTS3_MATCHINFO_NPHRASE:
   1146         if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
   1147         break;
   1148 
   1149       case FTS3_MATCHINFO_NCOL:
   1150         if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
   1151         break;
   1152 
   1153       case FTS3_MATCHINFO_NDOC:
   1154         if( bGlobal ){
   1155           sqlite3_int64 nDoc;
   1156           rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
   1157           pInfo->aMatchinfo[0] = (u32)nDoc;
   1158         }
   1159         break;
   1160 
   1161       case FTS3_MATCHINFO_AVGLENGTH:
   1162         if( bGlobal ){
   1163           sqlite3_int64 nDoc;     /* Number of rows in table */
   1164           const char *a;          /* Aggregate column length array */
   1165 
   1166           rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
   1167           if( rc==SQLITE_OK ){
   1168             int iCol;
   1169             for(iCol=0; iCol<pInfo->nCol; iCol++){
   1170               u32 iVal;
   1171               sqlite3_int64 nToken;
   1172               a += sqlite3Fts3GetVarint(a, &nToken);
   1173               iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
   1174               pInfo->aMatchinfo[iCol] = iVal;
   1175             }
   1176           }
   1177         }
   1178         break;
   1179 
   1180       case FTS3_MATCHINFO_LENGTH: {
   1181         sqlite3_stmt *pSelectDocsize = 0;
   1182         rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
   1183         if( rc==SQLITE_OK ){
   1184           int iCol;
   1185           const char *a = sqlite3_column_blob(pSelectDocsize, 0);
   1186           for(iCol=0; iCol<pInfo->nCol; iCol++){
   1187             sqlite3_int64 nToken;
   1188             a += sqlite3Fts3GetVarint(a, &nToken);
   1189             pInfo->aMatchinfo[iCol] = (u32)nToken;
   1190           }
   1191         }
   1192         sqlite3_reset(pSelectDocsize);
   1193         break;
   1194       }
   1195 
   1196       case FTS3_MATCHINFO_LCS:
   1197         rc = fts3ExprLoadDoclists(pCsr, 0, 0);
   1198         if( rc==SQLITE_OK ){
   1199           rc = fts3MatchinfoLcs(pCsr, pInfo);
   1200         }
   1201         break;
   1202 
   1203       default: {
   1204         Fts3Expr *pExpr;
   1205         assert( zArg[i]==FTS3_MATCHINFO_HITS );
   1206         pExpr = pCsr->pExpr;
   1207         rc = fts3ExprLoadDoclists(pCsr, 0, 0);
   1208         if( rc!=SQLITE_OK ) break;
   1209         if( bGlobal ){
   1210           if( pCsr->pDeferred ){
   1211             rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
   1212             if( rc!=SQLITE_OK ) break;
   1213           }
   1214           rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
   1215           if( rc!=SQLITE_OK ) break;
   1216         }
   1217         (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
   1218         break;
   1219       }
   1220     }
   1221 
   1222     pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
   1223   }
   1224 
   1225   sqlite3_reset(pSelect);
   1226   return rc;
   1227 }
   1228 
   1229 
   1230 /*
   1231 ** Populate pCsr->aMatchinfo[] with data for the current row. The
   1232 ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
   1233 */
   1234 static int fts3GetMatchinfo(
   1235   Fts3Cursor *pCsr,               /* FTS3 Cursor object */
   1236   const char *zArg                /* Second argument to matchinfo() function */
   1237 ){
   1238   MatchInfo sInfo;
   1239   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
   1240   int rc = SQLITE_OK;
   1241   int bGlobal = 0;                /* Collect 'global' stats as well as local */
   1242 
   1243   memset(&sInfo, 0, sizeof(MatchInfo));
   1244   sInfo.pCursor = pCsr;
   1245   sInfo.nCol = pTab->nColumn;
   1246 
   1247   /* If there is cached matchinfo() data, but the format string for the
   1248   ** cache does not match the format string for this request, discard
   1249   ** the cached data. */
   1250   if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
   1251     assert( pCsr->aMatchinfo );
   1252     sqlite3_free(pCsr->aMatchinfo);
   1253     pCsr->zMatchinfo = 0;
   1254     pCsr->aMatchinfo = 0;
   1255   }
   1256 
   1257   /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
   1258   ** matchinfo function has been called for this query. In this case
   1259   ** allocate the array used to accumulate the matchinfo data and
   1260   ** initialize those elements that are constant for every row.
   1261   */
   1262   if( pCsr->aMatchinfo==0 ){
   1263     int nMatchinfo = 0;           /* Number of u32 elements in match-info */
   1264     int nArg;                     /* Bytes in zArg */
   1265     int i;                        /* Used to iterate through zArg */
   1266 
   1267     /* Determine the number of phrases in the query */
   1268     pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
   1269     sInfo.nPhrase = pCsr->nPhrase;
   1270 
   1271     /* Determine the number of integers in the buffer returned by this call. */
   1272     for(i=0; zArg[i]; i++){
   1273       nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
   1274     }
   1275 
   1276     /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
   1277     nArg = (int)strlen(zArg);
   1278     pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
   1279     if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
   1280 
   1281     pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
   1282     pCsr->nMatchinfo = nMatchinfo;
   1283     memcpy(pCsr->zMatchinfo, zArg, nArg+1);
   1284     memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
   1285     pCsr->isMatchinfoNeeded = 1;
   1286     bGlobal = 1;
   1287   }
   1288 
   1289   sInfo.aMatchinfo = pCsr->aMatchinfo;
   1290   sInfo.nPhrase = pCsr->nPhrase;
   1291   if( pCsr->isMatchinfoNeeded ){
   1292     rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
   1293     pCsr->isMatchinfoNeeded = 0;
   1294   }
   1295 
   1296   return rc;
   1297 }
   1298 
   1299 /*
   1300 ** Implementation of snippet() function.
   1301 */
   1302 void sqlite3Fts3Snippet(
   1303   sqlite3_context *pCtx,          /* SQLite function call context */
   1304   Fts3Cursor *pCsr,               /* Cursor object */
   1305   const char *zStart,             /* Snippet start text - "<b>" */
   1306   const char *zEnd,               /* Snippet end text - "</b>" */
   1307   const char *zEllipsis,          /* Snippet ellipsis text - "<b>...</b>" */
   1308   int iCol,                       /* Extract snippet from this column */
   1309   int nToken                      /* Approximate number of tokens in snippet */
   1310 ){
   1311   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
   1312   int rc = SQLITE_OK;
   1313   int i;
   1314   StrBuffer res = {0, 0, 0};
   1315 
   1316   /* The returned text includes up to four fragments of text extracted from
   1317   ** the data in the current row. The first iteration of the for(...) loop
   1318   ** below attempts to locate a single fragment of text nToken tokens in
   1319   ** size that contains at least one instance of all phrases in the query
   1320   ** expression that appear in the current row. If such a fragment of text
   1321   ** cannot be found, the second iteration of the loop attempts to locate
   1322   ** a pair of fragments, and so on.
   1323   */
   1324   int nSnippet = 0;               /* Number of fragments in this snippet */
   1325   SnippetFragment aSnippet[4];    /* Maximum of 4 fragments per snippet */
   1326   int nFToken = -1;               /* Number of tokens in each fragment */
   1327 
   1328   if( !pCsr->pExpr ){
   1329     sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
   1330     return;
   1331   }
   1332 
   1333   for(nSnippet=1; 1; nSnippet++){
   1334 
   1335     int iSnip;                    /* Loop counter 0..nSnippet-1 */
   1336     u64 mCovered = 0;             /* Bitmask of phrases covered by snippet */
   1337     u64 mSeen = 0;                /* Bitmask of phrases seen by BestSnippet() */
   1338 
   1339     if( nToken>=0 ){
   1340       nFToken = (nToken+nSnippet-1) / nSnippet;
   1341     }else{
   1342       nFToken = -1 * nToken;
   1343     }
   1344 
   1345     for(iSnip=0; iSnip<nSnippet; iSnip++){
   1346       int iBestScore = -1;        /* Best score of columns checked so far */
   1347       int iRead;                  /* Used to iterate through columns */
   1348       SnippetFragment *pFragment = &aSnippet[iSnip];
   1349 
   1350       memset(pFragment, 0, sizeof(*pFragment));
   1351 
   1352       /* Loop through all columns of the table being considered for snippets.
   1353       ** If the iCol argument to this function was negative, this means all
   1354       ** columns of the FTS3 table. Otherwise, only column iCol is considered.
   1355       */
   1356       for(iRead=0; iRead<pTab->nColumn; iRead++){
   1357         SnippetFragment sF = {0, 0, 0, 0};
   1358         int iS;
   1359         if( iCol>=0 && iRead!=iCol ) continue;
   1360 
   1361         /* Find the best snippet of nFToken tokens in column iRead. */
   1362         rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
   1363         if( rc!=SQLITE_OK ){
   1364           goto snippet_out;
   1365         }
   1366         if( iS>iBestScore ){
   1367           *pFragment = sF;
   1368           iBestScore = iS;
   1369         }
   1370       }
   1371 
   1372       mCovered |= pFragment->covered;
   1373     }
   1374 
   1375     /* If all query phrases seen by fts3BestSnippet() are present in at least
   1376     ** one of the nSnippet snippet fragments, break out of the loop.
   1377     */
   1378     assert( (mCovered&mSeen)==mCovered );
   1379     if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
   1380   }
   1381 
   1382   assert( nFToken>0 );
   1383 
   1384   for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
   1385     rc = fts3SnippetText(pCsr, &aSnippet[i],
   1386         i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
   1387     );
   1388   }
   1389 
   1390  snippet_out:
   1391   sqlite3Fts3SegmentsClose(pTab);
   1392   if( rc!=SQLITE_OK ){
   1393     sqlite3_result_error_code(pCtx, rc);
   1394     sqlite3_free(res.z);
   1395   }else{
   1396     sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
   1397   }
   1398 }
   1399 
   1400 
   1401 typedef struct TermOffset TermOffset;
   1402 typedef struct TermOffsetCtx TermOffsetCtx;
   1403 
   1404 struct TermOffset {
   1405   char *pList;                    /* Position-list */
   1406   int iPos;                       /* Position just read from pList */
   1407   int iOff;                       /* Offset of this term from read positions */
   1408 };
   1409 
   1410 struct TermOffsetCtx {
   1411   int iCol;                       /* Column of table to populate aTerm for */
   1412   int iTerm;
   1413   sqlite3_int64 iDocid;
   1414   TermOffset *aTerm;
   1415 };
   1416 
   1417 /*
   1418 ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
   1419 */
   1420 static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
   1421   TermOffsetCtx *p = (TermOffsetCtx *)ctx;
   1422   int nTerm;                      /* Number of tokens in phrase */
   1423   int iTerm;                      /* For looping through nTerm phrase terms */
   1424   char *pList;                    /* Pointer to position list for phrase */
   1425   int iPos = 0;                   /* First position in position-list */
   1426 
   1427   UNUSED_PARAMETER(iPhrase);
   1428   pList = sqlite3Fts3FindPositions(pExpr, p->iDocid, p->iCol);
   1429   nTerm = pExpr->pPhrase->nToken;
   1430   if( pList ){
   1431     fts3GetDeltaPosition(&pList, &iPos);
   1432     assert( iPos>=0 );
   1433   }
   1434 
   1435   for(iTerm=0; iTerm<nTerm; iTerm++){
   1436     TermOffset *pT = &p->aTerm[p->iTerm++];
   1437     pT->iOff = nTerm-iTerm-1;
   1438     pT->pList = pList;
   1439     pT->iPos = iPos;
   1440   }
   1441 
   1442   return SQLITE_OK;
   1443 }
   1444 
   1445 /*
   1446 ** Implementation of offsets() function.
   1447 */
   1448 void sqlite3Fts3Offsets(
   1449   sqlite3_context *pCtx,          /* SQLite function call context */
   1450   Fts3Cursor *pCsr                /* Cursor object */
   1451 ){
   1452   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
   1453   sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
   1454   const char *ZDUMMY;             /* Dummy argument used with xNext() */
   1455   int NDUMMY;                     /* Dummy argument used with xNext() */
   1456   int rc;                         /* Return Code */
   1457   int nToken;                     /* Number of tokens in query */
   1458   int iCol;                       /* Column currently being processed */
   1459   StrBuffer res = {0, 0, 0};      /* Result string */
   1460   TermOffsetCtx sCtx;             /* Context for fts3ExprTermOffsetInit() */
   1461 
   1462   if( !pCsr->pExpr ){
   1463     sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
   1464     return;
   1465   }
   1466 
   1467   memset(&sCtx, 0, sizeof(sCtx));
   1468   assert( pCsr->isRequireSeek==0 );
   1469 
   1470   /* Count the number of terms in the query */
   1471   rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
   1472   if( rc!=SQLITE_OK ) goto offsets_out;
   1473 
   1474   /* Allocate the array of TermOffset iterators. */
   1475   sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
   1476   if( 0==sCtx.aTerm ){
   1477     rc = SQLITE_NOMEM;
   1478     goto offsets_out;
   1479   }
   1480   sCtx.iDocid = pCsr->iPrevId;
   1481 
   1482   /* Loop through the table columns, appending offset information to
   1483   ** string-buffer res for each column.
   1484   */
   1485   for(iCol=0; iCol<pTab->nColumn; iCol++){
   1486     sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
   1487     int iStart;
   1488     int iEnd;
   1489     int iCurrent;
   1490     const char *zDoc;
   1491     int nDoc;
   1492 
   1493     /* Initialize the contents of sCtx.aTerm[] for column iCol. There is
   1494     ** no way that this operation can fail, so the return code from
   1495     ** fts3ExprIterate() can be discarded.
   1496     */
   1497     sCtx.iCol = iCol;
   1498     sCtx.iTerm = 0;
   1499     (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
   1500 
   1501     /* Retreive the text stored in column iCol. If an SQL NULL is stored
   1502     ** in column iCol, jump immediately to the next iteration of the loop.
   1503     ** If an OOM occurs while retrieving the data (this can happen if SQLite
   1504     ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
   1505     ** to the caller.
   1506     */
   1507     zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
   1508     nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
   1509     if( zDoc==0 ){
   1510       if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
   1511         continue;
   1512       }
   1513       rc = SQLITE_NOMEM;
   1514       goto offsets_out;
   1515     }
   1516 
   1517     /* Initialize a tokenizer iterator to iterate through column iCol. */
   1518     rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
   1519     if( rc!=SQLITE_OK ) goto offsets_out;
   1520     pC->pTokenizer = pTab->pTokenizer;
   1521 
   1522     rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
   1523     while( rc==SQLITE_OK ){
   1524       int i;                      /* Used to loop through terms */
   1525       int iMinPos = 0x7FFFFFFF;   /* Position of next token */
   1526       TermOffset *pTerm = 0;      /* TermOffset associated with next token */
   1527 
   1528       for(i=0; i<nToken; i++){
   1529         TermOffset *pT = &sCtx.aTerm[i];
   1530         if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
   1531           iMinPos = pT->iPos-pT->iOff;
   1532           pTerm = pT;
   1533         }
   1534       }
   1535 
   1536       if( !pTerm ){
   1537         /* All offsets for this column have been gathered. */
   1538         break;
   1539       }else{
   1540         assert( iCurrent<=iMinPos );
   1541         if( 0==(0xFE&*pTerm->pList) ){
   1542           pTerm->pList = 0;
   1543         }else{
   1544           fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
   1545         }
   1546         while( rc==SQLITE_OK && iCurrent<iMinPos ){
   1547           rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
   1548         }
   1549         if( rc==SQLITE_OK ){
   1550           char aBuffer[64];
   1551           sqlite3_snprintf(sizeof(aBuffer), aBuffer,
   1552               "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
   1553           );
   1554           rc = fts3StringAppend(&res, aBuffer, -1);
   1555         }else if( rc==SQLITE_DONE ){
   1556           rc = SQLITE_CORRUPT;
   1557         }
   1558       }
   1559     }
   1560     if( rc==SQLITE_DONE ){
   1561       rc = SQLITE_OK;
   1562     }
   1563 
   1564     pMod->xClose(pC);
   1565     if( rc!=SQLITE_OK ) goto offsets_out;
   1566   }
   1567 
   1568  offsets_out:
   1569   sqlite3_free(sCtx.aTerm);
   1570   assert( rc!=SQLITE_DONE );
   1571   sqlite3Fts3SegmentsClose(pTab);
   1572   if( rc!=SQLITE_OK ){
   1573     sqlite3_result_error_code(pCtx,  rc);
   1574     sqlite3_free(res.z);
   1575   }else{
   1576     sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
   1577   }
   1578   return;
   1579 }
   1580 
   1581 /*
   1582 ** Implementation of matchinfo() function.
   1583 */
   1584 void sqlite3Fts3Matchinfo(
   1585   sqlite3_context *pContext,      /* Function call context */
   1586   Fts3Cursor *pCsr,               /* FTS3 table cursor */
   1587   const char *zArg                /* Second arg to matchinfo() function */
   1588 ){
   1589   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
   1590   int rc;
   1591   int i;
   1592   const char *zFormat;
   1593 
   1594   if( zArg ){
   1595     for(i=0; zArg[i]; i++){
   1596       char *zErr = 0;
   1597       if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
   1598         sqlite3_result_error(pContext, zErr, -1);
   1599         sqlite3_free(zErr);
   1600         return;
   1601       }
   1602     }
   1603     zFormat = zArg;
   1604   }else{
   1605     zFormat = FTS3_MATCHINFO_DEFAULT;
   1606   }
   1607 
   1608   if( !pCsr->pExpr ){
   1609     sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
   1610     return;
   1611   }
   1612 
   1613   /* Retrieve matchinfo() data. */
   1614   rc = fts3GetMatchinfo(pCsr, zFormat);
   1615   sqlite3Fts3SegmentsClose(pTab);
   1616 
   1617   if( rc!=SQLITE_OK ){
   1618     sqlite3_result_error_code(pContext, rc);
   1619   }else{
   1620     int n = pCsr->nMatchinfo * sizeof(u32);
   1621     sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
   1622   }
   1623 }
   1624 
   1625 #endif
   1626