Home | History | Annotate | Download | only in fts3
      1 /*
      2 ** 2009 Nov 12
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 **
     11 ******************************************************************************
     12 **
     13 */
     14 
     15 #ifndef _FTSINT_H
     16 #define _FTSINT_H
     17 
     18 #if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
     19 # define NDEBUG 1
     20 #endif
     21 
     22 #include "sqlite3.h"
     23 #include "fts3_tokenizer.h"
     24 #include "fts3_hash.h"
     25 
     26 /*
     27 ** This constant controls how often segments are merged. Once there are
     28 ** FTS3_MERGE_COUNT segments of level N, they are merged into a single
     29 ** segment of level N+1.
     30 */
     31 #define FTS3_MERGE_COUNT 16
     32 
     33 /*
     34 ** This is the maximum amount of data (in bytes) to store in the
     35 ** Fts3Table.pendingTerms hash table. Normally, the hash table is
     36 ** populated as documents are inserted/updated/deleted in a transaction
     37 ** and used to create a new segment when the transaction is committed.
     38 ** However if this limit is reached midway through a transaction, a new
     39 ** segment is created and the hash table cleared immediately.
     40 */
     41 #define FTS3_MAX_PENDING_DATA (1*1024*1024)
     42 
     43 /*
     44 ** Macro to return the number of elements in an array. SQLite has a
     45 ** similar macro called ArraySize(). Use a different name to avoid
     46 ** a collision when building an amalgamation with built-in FTS3.
     47 */
     48 #define SizeofArray(X) ((int)(sizeof(X)/sizeof(X[0])))
     49 
     50 /*
     51 ** Maximum length of a varint encoded integer. The varint format is different
     52 ** from that used by SQLite, so the maximum length is 10, not 9.
     53 */
     54 #define FTS3_VARINT_MAX 10
     55 
     56 /*
     57 ** The testcase() macro is only used by the amalgamation.  If undefined,
     58 ** make it a no-op.
     59 */
     60 #ifndef testcase
     61 # define testcase(X)
     62 #endif
     63 
     64 /*
     65 ** Terminator values for position-lists and column-lists.
     66 */
     67 #define POS_COLUMN  (1)     /* Column-list terminator */
     68 #define POS_END     (0)     /* Position-list terminator */
     69 
     70 /*
     71 ** This section provides definitions to allow the
     72 ** FTS3 extension to be compiled outside of the
     73 ** amalgamation.
     74 */
     75 #ifndef SQLITE_AMALGAMATION
     76 /*
     77 ** Macros indicating that conditional expressions are always true or
     78 ** false.
     79 */
     80 #ifdef SQLITE_COVERAGE_TEST
     81 # define ALWAYS(x) (1)
     82 # define NEVER(X)  (0)
     83 #else
     84 # define ALWAYS(x) (x)
     85 # define NEVER(X)  (x)
     86 #endif
     87 
     88 /*
     89 ** Internal types used by SQLite.
     90 */
     91 typedef unsigned char u8;         /* 1-byte (or larger) unsigned integer */
     92 typedef short int i16;            /* 2-byte (or larger) signed integer */
     93 typedef unsigned int u32;         /* 4-byte unsigned integer */
     94 typedef sqlite3_uint64 u64;       /* 8-byte unsigned integer */
     95 /*
     96 ** Macro used to suppress compiler warnings for unused parameters.
     97 */
     98 #define UNUSED_PARAMETER(x) (void)(x)
     99 #endif
    100 
    101 typedef struct Fts3Table Fts3Table;
    102 typedef struct Fts3Cursor Fts3Cursor;
    103 typedef struct Fts3Expr Fts3Expr;
    104 typedef struct Fts3Phrase Fts3Phrase;
    105 typedef struct Fts3PhraseToken Fts3PhraseToken;
    106 
    107 typedef struct Fts3SegFilter Fts3SegFilter;
    108 typedef struct Fts3DeferredToken Fts3DeferredToken;
    109 typedef struct Fts3SegReader Fts3SegReader;
    110 typedef struct Fts3SegReaderCursor Fts3SegReaderCursor;
    111 
    112 /*
    113 ** A connection to a fulltext index is an instance of the following
    114 ** structure. The xCreate and xConnect methods create an instance
    115 ** of this structure and xDestroy and xDisconnect free that instance.
    116 ** All other methods receive a pointer to the structure as one of their
    117 ** arguments.
    118 */
    119 struct Fts3Table {
    120   sqlite3_vtab base;              /* Base class used by SQLite core */
    121   sqlite3 *db;                    /* The database connection */
    122   const char *zDb;                /* logical database name */
    123   const char *zName;              /* virtual table name */
    124   int nColumn;                    /* number of named columns in virtual table */
    125   char **azColumn;                /* column names.  malloced */
    126   sqlite3_tokenizer *pTokenizer;  /* tokenizer for inserts and queries */
    127 
    128   /* Precompiled statements used by the implementation. Each of these
    129   ** statements is run and reset within a single virtual table API call.
    130   */
    131   sqlite3_stmt *aStmt[24];
    132 
    133   char *zReadExprlist;
    134   char *zWriteExprlist;
    135 
    136   int nNodeSize;                  /* Soft limit for node size */
    137   u8 bHasStat;                    /* True if %_stat table exists */
    138   u8 bHasDocsize;                 /* True if %_docsize table exists */
    139   int nPgsz;                      /* Page size for host database */
    140   char *zSegmentsTbl;             /* Name of %_segments table */
    141   sqlite3_blob *pSegments;        /* Blob handle open on %_segments table */
    142 
    143   /* The following hash table is used to buffer pending index updates during
    144   ** transactions. Variable nPendingData estimates the memory size of the
    145   ** pending data, including hash table overhead, but not malloc overhead.
    146   ** When nPendingData exceeds nMaxPendingData, the buffer is flushed
    147   ** automatically. Variable iPrevDocid is the docid of the most recently
    148   ** inserted record.
    149   */
    150   int nMaxPendingData;
    151   int nPendingData;
    152   sqlite_int64 iPrevDocid;
    153   Fts3Hash pendingTerms;
    154 };
    155 
    156 /*
    157 ** When the core wants to read from the virtual table, it creates a
    158 ** virtual table cursor (an instance of the following structure) using
    159 ** the xOpen method. Cursors are destroyed using the xClose method.
    160 */
    161 struct Fts3Cursor {
    162   sqlite3_vtab_cursor base;       /* Base class used by SQLite core */
    163   i16 eSearch;                    /* Search strategy (see below) */
    164   u8 isEof;                       /* True if at End Of Results */
    165   u8 isRequireSeek;               /* True if must seek pStmt to %_content row */
    166   sqlite3_stmt *pStmt;            /* Prepared statement in use by the cursor */
    167   Fts3Expr *pExpr;                /* Parsed MATCH query string */
    168   int nPhrase;                    /* Number of matchable phrases in query */
    169   Fts3DeferredToken *pDeferred;   /* Deferred search tokens, if any */
    170   sqlite3_int64 iPrevId;          /* Previous id read from aDoclist */
    171   char *pNextId;                  /* Pointer into the body of aDoclist */
    172   char *aDoclist;                 /* List of docids for full-text queries */
    173   int nDoclist;                   /* Size of buffer at aDoclist */
    174   int eEvalmode;                  /* An FTS3_EVAL_XX constant */
    175   int nRowAvg;                    /* Average size of database rows, in pages */
    176 
    177   int isMatchinfoNeeded;          /* True when aMatchinfo[] needs filling in */
    178   u32 *aMatchinfo;                /* Information about most recent match */
    179   int nMatchinfo;                 /* Number of elements in aMatchinfo[] */
    180   char *zMatchinfo;               /* Matchinfo specification */
    181 };
    182 
    183 #define FTS3_EVAL_FILTER    0
    184 #define FTS3_EVAL_NEXT      1
    185 #define FTS3_EVAL_MATCHINFO 2
    186 
    187 /*
    188 ** The Fts3Cursor.eSearch member is always set to one of the following.
    189 ** Actualy, Fts3Cursor.eSearch can be greater than or equal to
    190 ** FTS3_FULLTEXT_SEARCH.  If so, then Fts3Cursor.eSearch - 2 is the index
    191 ** of the column to be searched.  For example, in
    192 **
    193 **     CREATE VIRTUAL TABLE ex1 USING fts3(a,b,c,d);
    194 **     SELECT docid FROM ex1 WHERE b MATCH 'one two three';
    195 **
    196 ** Because the LHS of the MATCH operator is 2nd column "b",
    197 ** Fts3Cursor.eSearch will be set to FTS3_FULLTEXT_SEARCH+1.  (+0 for a,
    198 ** +1 for b, +2 for c, +3 for d.)  If the LHS of MATCH were "ex1"
    199 ** indicating that all columns should be searched,
    200 ** then eSearch would be set to FTS3_FULLTEXT_SEARCH+4.
    201 */
    202 #define FTS3_FULLSCAN_SEARCH 0    /* Linear scan of %_content table */
    203 #define FTS3_DOCID_SEARCH    1    /* Lookup by rowid on %_content table */
    204 #define FTS3_FULLTEXT_SEARCH 2    /* Full-text index search */
    205 
    206 /*
    207 ** A "phrase" is a sequence of one or more tokens that must match in
    208 ** sequence.  A single token is the base case and the most common case.
    209 ** For a sequence of tokens contained in double-quotes (i.e. "one two three")
    210 ** nToken will be the number of tokens in the string.
    211 **
    212 ** The nDocMatch and nMatch variables contain data that may be used by the
    213 ** matchinfo() function. They are populated when the full-text index is
    214 ** queried for hits on the phrase. If one or more tokens in the phrase
    215 ** are deferred, the nDocMatch and nMatch variables are populated based
    216 ** on the assumption that the
    217 */
    218 struct Fts3PhraseToken {
    219   char *z;                        /* Text of the token */
    220   int n;                          /* Number of bytes in buffer z */
    221   int isPrefix;                   /* True if token ends with a "*" character */
    222   int bFulltext;                  /* True if full-text index was used */
    223   Fts3SegReaderCursor *pSegcsr;   /* Segment-reader for this token */
    224   Fts3DeferredToken *pDeferred;   /* Deferred token object for this token */
    225 };
    226 
    227 struct Fts3Phrase {
    228   /* Variables populated by fts3_expr.c when parsing a MATCH expression */
    229   int nToken;                /* Number of tokens in the phrase */
    230   int iColumn;               /* Index of column this phrase must match */
    231   int isNot;                 /* Phrase prefixed by unary not (-) operator */
    232   Fts3PhraseToken aToken[1]; /* One entry for each token in the phrase */
    233 };
    234 
    235 /*
    236 ** A tree of these objects forms the RHS of a MATCH operator.
    237 **
    238 ** If Fts3Expr.eType is either FTSQUERY_NEAR or FTSQUERY_PHRASE and isLoaded
    239 ** is true, then aDoclist points to a malloced buffer, size nDoclist bytes,
    240 ** containing the results of the NEAR or phrase query in FTS3 doclist
    241 ** format. As usual, the initial "Length" field found in doclists stored
    242 ** on disk is omitted from this buffer.
    243 **
    244 ** Variable pCurrent always points to the start of a docid field within
    245 ** aDoclist. Since the doclist is usually scanned in docid order, this can
    246 ** be used to accelerate seeking to the required docid within the doclist.
    247 */
    248 struct Fts3Expr {
    249   int eType;                 /* One of the FTSQUERY_XXX values defined below */
    250   int nNear;                 /* Valid if eType==FTSQUERY_NEAR */
    251   Fts3Expr *pParent;         /* pParent->pLeft==this or pParent->pRight==this */
    252   Fts3Expr *pLeft;           /* Left operand */
    253   Fts3Expr *pRight;          /* Right operand */
    254   Fts3Phrase *pPhrase;       /* Valid if eType==FTSQUERY_PHRASE */
    255 
    256   int isLoaded;              /* True if aDoclist/nDoclist are initialized. */
    257   char *aDoclist;            /* Buffer containing doclist */
    258   int nDoclist;              /* Size of aDoclist in bytes */
    259 
    260   sqlite3_int64 iCurrent;
    261   char *pCurrent;
    262 };
    263 
    264 /*
    265 ** Candidate values for Fts3Query.eType. Note that the order of the first
    266 ** four values is in order of precedence when parsing expressions. For
    267 ** example, the following:
    268 **
    269 **   "a OR b AND c NOT d NEAR e"
    270 **
    271 ** is equivalent to:
    272 **
    273 **   "a OR (b AND (c NOT (d NEAR e)))"
    274 */
    275 #define FTSQUERY_NEAR   1
    276 #define FTSQUERY_NOT    2
    277 #define FTSQUERY_AND    3
    278 #define FTSQUERY_OR     4
    279 #define FTSQUERY_PHRASE 5
    280 
    281 
    282 /* fts3_write.c */
    283 int sqlite3Fts3UpdateMethod(sqlite3_vtab*,int,sqlite3_value**,sqlite3_int64*);
    284 int sqlite3Fts3PendingTermsFlush(Fts3Table *);
    285 void sqlite3Fts3PendingTermsClear(Fts3Table *);
    286 int sqlite3Fts3Optimize(Fts3Table *);
    287 int sqlite3Fts3SegReaderNew(int, sqlite3_int64,
    288   sqlite3_int64, sqlite3_int64, const char *, int, Fts3SegReader**);
    289 int sqlite3Fts3SegReaderPending(Fts3Table*,const char*,int,int,Fts3SegReader**);
    290 void sqlite3Fts3SegReaderFree(Fts3SegReader *);
    291 int sqlite3Fts3SegReaderCost(Fts3Cursor *, Fts3SegReader *, int *);
    292 int sqlite3Fts3AllSegdirs(Fts3Table*, int, sqlite3_stmt **);
    293 int sqlite3Fts3ReadLock(Fts3Table *);
    294 int sqlite3Fts3ReadBlock(Fts3Table*, sqlite3_int64, char **, int*);
    295 
    296 int sqlite3Fts3SelectDoctotal(Fts3Table *, sqlite3_stmt **);
    297 int sqlite3Fts3SelectDocsize(Fts3Table *, sqlite3_int64, sqlite3_stmt **);
    298 
    299 void sqlite3Fts3FreeDeferredTokens(Fts3Cursor *);
    300 int sqlite3Fts3DeferToken(Fts3Cursor *, Fts3PhraseToken *, int);
    301 int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *);
    302 void sqlite3Fts3FreeDeferredDoclists(Fts3Cursor *);
    303 char *sqlite3Fts3DeferredDoclist(Fts3DeferredToken *, int *);
    304 void sqlite3Fts3SegmentsClose(Fts3Table *);
    305 
    306 #define FTS3_SEGCURSOR_PENDING -1
    307 #define FTS3_SEGCURSOR_ALL     -2
    308 
    309 int sqlite3Fts3SegReaderStart(Fts3Table*, Fts3SegReaderCursor*, Fts3SegFilter*);
    310 int sqlite3Fts3SegReaderStep(Fts3Table *, Fts3SegReaderCursor *);
    311 void sqlite3Fts3SegReaderFinish(Fts3SegReaderCursor *);
    312 int sqlite3Fts3SegReaderCursor(
    313     Fts3Table *, int, const char *, int, int, int, Fts3SegReaderCursor *);
    314 
    315 /* Flags allowed as part of the 4th argument to SegmentReaderIterate() */
    316 #define FTS3_SEGMENT_REQUIRE_POS   0x00000001
    317 #define FTS3_SEGMENT_IGNORE_EMPTY  0x00000002
    318 #define FTS3_SEGMENT_COLUMN_FILTER 0x00000004
    319 #define FTS3_SEGMENT_PREFIX        0x00000008
    320 #define FTS3_SEGMENT_SCAN          0x00000010
    321 
    322 /* Type passed as 4th argument to SegmentReaderIterate() */
    323 struct Fts3SegFilter {
    324   const char *zTerm;
    325   int nTerm;
    326   int iCol;
    327   int flags;
    328 };
    329 
    330 struct Fts3SegReaderCursor {
    331   /* Used internally by sqlite3Fts3SegReaderXXX() calls */
    332   Fts3SegReader **apSegment;      /* Array of Fts3SegReader objects */
    333   int nSegment;                   /* Size of apSegment array */
    334   int nAdvance;                   /* How many seg-readers to advance */
    335   Fts3SegFilter *pFilter;         /* Pointer to filter object */
    336   char *aBuffer;                  /* Buffer to merge doclists in */
    337   int nBuffer;                    /* Allocated size of aBuffer[] in bytes */
    338 
    339   /* Cost of running this iterator. Used by fts3.c only. */
    340   int nCost;
    341 
    342   /* Output values. Valid only after Fts3SegReaderStep() returns SQLITE_ROW. */
    343   char *zTerm;                    /* Pointer to term buffer */
    344   int nTerm;                      /* Size of zTerm in bytes */
    345   char *aDoclist;                 /* Pointer to doclist buffer */
    346   int nDoclist;                   /* Size of aDoclist[] in bytes */
    347 };
    348 
    349 /* fts3.c */
    350 int sqlite3Fts3PutVarint(char *, sqlite3_int64);
    351 int sqlite3Fts3GetVarint(const char *, sqlite_int64 *);
    352 int sqlite3Fts3GetVarint32(const char *, int *);
    353 int sqlite3Fts3VarintLen(sqlite3_uint64);
    354 void sqlite3Fts3Dequote(char *);
    355 
    356 char *sqlite3Fts3FindPositions(Fts3Expr *, sqlite3_int64, int);
    357 int sqlite3Fts3ExprLoadDoclist(Fts3Cursor *, Fts3Expr *);
    358 int sqlite3Fts3ExprLoadFtDoclist(Fts3Cursor *, Fts3Expr *, char **, int *);
    359 int sqlite3Fts3ExprNearTrim(Fts3Expr *, Fts3Expr *, int);
    360 
    361 /* fts3_tokenizer.c */
    362 const char *sqlite3Fts3NextToken(const char *, int *);
    363 int sqlite3Fts3InitHashTable(sqlite3 *, Fts3Hash *, const char *);
    364 int sqlite3Fts3InitTokenizer(Fts3Hash *pHash, const char *,
    365     sqlite3_tokenizer **, char **
    366 );
    367 int sqlite3Fts3IsIdChar(char);
    368 
    369 /* fts3_snippet.c */
    370 void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*);
    371 void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *,
    372   const char *, const char *, int, int
    373 );
    374 void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *);
    375 
    376 /* fts3_expr.c */
    377 int sqlite3Fts3ExprParse(sqlite3_tokenizer *,
    378   char **, int, int, const char *, int, Fts3Expr **
    379 );
    380 void sqlite3Fts3ExprFree(Fts3Expr *);
    381 #ifdef SQLITE_TEST
    382 int sqlite3Fts3ExprInitTestInterface(sqlite3 *db);
    383 #endif
    384 
    385 /* fts3_aux.c */
    386 int sqlite3Fts3InitAux(sqlite3 *db);
    387 
    388 #endif /* _FTSINT_H */
    389