1 /* The author disclaims copyright to this source code. 2 * 3 * This is an SQLite module implementing full-text search. 4 */ 5 6 #include <assert.h> 7 #if !defined(__APPLE__) 8 #include <malloc.h> 9 #else 10 #include <stdlib.h> 11 #endif 12 #include <stdio.h> 13 #include <string.h> 14 #include <ctype.h> 15 16 #include "fulltext.h" 17 #include "ft_hash.h" 18 #include "tokenizer.h" 19 #include "sqlite3.h" 20 #include "sqlite3ext.h" 21 SQLITE_EXTENSION_INIT1 22 23 /* utility functions */ 24 25 /* We encode variable-length integers in little-endian order using seven bits 26 * per byte as follows: 27 ** 28 ** KEY: 29 ** A = 0xxxxxxx 7 bits of data and one flag bit 30 ** B = 1xxxxxxx 7 bits of data and one flag bit 31 ** 32 ** 7 bits - A 33 ** 14 bits - BA 34 ** 21 bits - BBA 35 ** and so on. 36 */ 37 38 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ 39 #define VARINT_MAX 10 40 41 /* Write a 64-bit variable-length integer to memory starting at p[0]. 42 * The length of data written will be between 1 and VARINT_MAX bytes. 43 * The number of bytes written is returned. */ 44 static int putVarint(char *p, sqlite_int64 v){ 45 unsigned char *q = (unsigned char *) p; 46 sqlite_uint64 vu = v; 47 do{ 48 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); 49 vu >>= 7; 50 }while( vu!=0 ); 51 q[-1] &= 0x7f; /* turn off high bit in final byte */ 52 assert( q - (unsigned char *)p <= VARINT_MAX ); 53 return (int) (q - (unsigned char *)p); 54 } 55 56 /* Read a 64-bit variable-length integer from memory starting at p[0]. 57 * Return the number of bytes read, or 0 on error. 58 * The value is stored in *v. */ 59 static int getVarint(const char *p, sqlite_int64 *v){ 60 const unsigned char *q = (const unsigned char *) p; 61 sqlite_uint64 x = 0, y = 1; 62 while( (*q & 0x80) == 0x80 ){ 63 x += y * (*q++ & 0x7f); 64 y <<= 7; 65 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ 66 assert( 0 ); 67 return 0; 68 } 69 } 70 x += y * (*q++); 71 *v = (sqlite_int64) x; 72 return (int) (q - (unsigned char *)p); 73 } 74 75 static int getVarint32(const char *p, int *pi){ 76 sqlite_int64 i; 77 int ret = getVarint(p, &i); 78 *pi = (int) i; 79 assert( *pi==i ); 80 return ret; 81 } 82 83 /*** Document lists *** 84 * 85 * A document list holds a sorted list of varint-encoded document IDs. 86 * 87 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: 88 * 89 * array { 90 * varint docid; 91 * array { 92 * varint position; (delta from previous position plus 1, or 0 for end) 93 * varint startOffset; (delta from previous startOffset) 94 * varint endOffset; (delta from startOffset) 95 * } 96 * } 97 * 98 * Here, array { X } means zero or more occurrences of X, adjacent in memory. 99 * 100 * A doclist with type DL_POSITIONS is like the above, but holds only docids 101 * and positions without offset information. 102 * 103 * A doclist with type DL_DOCIDS is like the above, but holds only docids 104 * without positions or offset information. 105 * 106 * On disk, every document list has positions and offsets, so we don't bother 107 * to serialize a doclist's type. 108 * 109 * We don't yet delta-encode document IDs; doing so will probably be a 110 * modest win. 111 * 112 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. 113 * After the first offset, estimate the next offset by using the 114 * current token position and the previous token position and offset, 115 * offset to handle some variance. So the estimate would be 116 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded 117 * as normal. Offsets more than 64 chars from the estimate are 118 * encoded as the delta to the previous start offset + 128. An 119 * additional tiny increment can be gained by using the end offset of 120 * the previous token to make the estimate a tiny bit more precise. 121 */ 122 123 typedef enum DocListType { 124 DL_DOCIDS, /* docids only */ 125 DL_POSITIONS, /* docids + positions */ 126 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ 127 } DocListType; 128 129 typedef struct DocList { 130 char *pData; 131 int nData; 132 DocListType iType; 133 int iLastPos; /* the last position written */ 134 int iLastOffset; /* the last start offset written */ 135 } DocList; 136 137 /* Initialize a new DocList to hold the given data. */ 138 static void docListInit(DocList *d, DocListType iType, 139 const char *pData, int nData){ 140 d->nData = nData; 141 if( nData>0 ){ 142 d->pData = malloc(nData); 143 memcpy(d->pData, pData, nData); 144 } else { 145 d->pData = NULL; 146 } 147 d->iType = iType; 148 d->iLastPos = 0; 149 d->iLastOffset = 0; 150 } 151 152 /* Create a new dynamically-allocated DocList. */ 153 static DocList *docListNew(DocListType iType){ 154 DocList *d = (DocList *) malloc(sizeof(DocList)); 155 docListInit(d, iType, 0, 0); 156 return d; 157 } 158 159 static void docListDestroy(DocList *d){ 160 free(d->pData); 161 #ifndef NDEBUG 162 memset(d, 0x55, sizeof(*d)); 163 #endif 164 } 165 166 static void docListDelete(DocList *d){ 167 docListDestroy(d); 168 free(d); 169 } 170 171 static char *docListEnd(DocList *d){ 172 return d->pData + d->nData; 173 } 174 175 /* Append a varint to a DocList's data. */ 176 static void appendVarint(DocList *d, sqlite_int64 i){ 177 char c[VARINT_MAX]; 178 int n = putVarint(c, i); 179 d->pData = realloc(d->pData, d->nData + n); 180 memcpy(d->pData + d->nData, c, n); 181 d->nData += n; 182 } 183 184 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ 185 appendVarint(d, iDocid); 186 d->iLastPos = 0; 187 } 188 189 /* Add a position to the last position list in a doclist. */ 190 static void docListAddPos(DocList *d, int iPos){ 191 assert( d->iType>=DL_POSITIONS ); 192 appendVarint(d, iPos-d->iLastPos+1); 193 d->iLastPos = iPos; 194 } 195 196 static void docListAddPosOffset(DocList *d, int iPos, 197 int iStartOffset, int iEndOffset){ 198 assert( d->iType==DL_POSITIONS_OFFSETS ); 199 docListAddPos(d, iPos); 200 appendVarint(d, iStartOffset-d->iLastOffset); 201 d->iLastOffset = iStartOffset; 202 appendVarint(d, iEndOffset-iStartOffset); 203 } 204 205 /* Terminate the last position list in the given doclist. */ 206 static void docListAddEndPos(DocList *d){ 207 appendVarint(d, 0); 208 } 209 210 typedef struct DocListReader { 211 DocList *pDoclist; 212 char *p; 213 int iLastPos; /* the last position read */ 214 } DocListReader; 215 216 static void readerInit(DocListReader *r, DocList *pDoclist){ 217 r->pDoclist = pDoclist; 218 if( pDoclist!=NULL ){ 219 r->p = pDoclist->pData; 220 } 221 r->iLastPos = 0; 222 } 223 224 static int readerAtEnd(DocListReader *pReader){ 225 return pReader->p >= docListEnd(pReader->pDoclist); 226 } 227 228 /* Peek at the next docid without advancing the read pointer. */ 229 static sqlite_int64 peekDocid(DocListReader *pReader){ 230 sqlite_int64 ret; 231 assert( !readerAtEnd(pReader) ); 232 getVarint(pReader->p, &ret); 233 return ret; 234 } 235 236 /* Read the next docid. */ 237 static sqlite_int64 readDocid(DocListReader *pReader){ 238 sqlite_int64 ret; 239 assert( !readerAtEnd(pReader) ); 240 pReader->p += getVarint(pReader->p, &ret); 241 pReader->iLastPos = 0; 242 return ret; 243 } 244 245 /* Read the next position from a position list. 246 * Returns the position, or -1 at the end of the list. */ 247 static int readPosition(DocListReader *pReader){ 248 int i; 249 int iType = pReader->pDoclist->iType; 250 assert( iType>=DL_POSITIONS ); 251 assert( !readerAtEnd(pReader) ); 252 253 pReader->p += getVarint32(pReader->p, &i); 254 if( i==0 ){ 255 pReader->iLastPos = -1; 256 return -1; 257 } 258 pReader->iLastPos += ((int) i)-1; 259 if( iType>=DL_POSITIONS_OFFSETS ){ 260 /* Skip over offsets, ignoring them for now. */ 261 int iStart, iEnd; 262 pReader->p += getVarint32(pReader->p, &iStart); 263 pReader->p += getVarint32(pReader->p, &iEnd); 264 } 265 return pReader->iLastPos; 266 } 267 268 /* Skip past the end of a position list. */ 269 static void skipPositionList(DocListReader *pReader){ 270 while( readPosition(pReader)!=-1 ) 271 ; 272 } 273 274 /* Skip over a docid, including its position list if the doclist has 275 * positions. */ 276 static void skipDocument(DocListReader *pReader){ 277 readDocid(pReader); 278 if( pReader->pDoclist->iType >= DL_POSITIONS ){ 279 skipPositionList(pReader); 280 } 281 } 282 283 static sqlite_int64 firstDocid(DocList *d){ 284 DocListReader r; 285 readerInit(&r, d); 286 return readDocid(&r); 287 } 288 289 /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; 290 * otherwise pUpdate, which must contain only the single docid [iDocid], is 291 * inserted (if not present) or updated (if already present). */ 292 static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ 293 int modified = 0; 294 DocListReader reader; 295 char *p; 296 297 if( pUpdate!=NULL ){ 298 assert( d->iType==pUpdate->iType); 299 assert( iDocid==firstDocid(pUpdate) ); 300 } 301 302 readerInit(&reader, d); 303 while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ 304 skipDocument(&reader); 305 } 306 307 p = reader.p; 308 /* Delete if there is a matching element. */ 309 if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ 310 skipDocument(&reader); 311 memmove(p, reader.p, docListEnd(d) - reader.p); 312 d->nData -= (reader.p - p); 313 modified = 1; 314 } 315 316 /* Insert if indicated. */ 317 if( pUpdate!=NULL ){ 318 int iDoclist = p-d->pData; 319 docListAddEndPos(pUpdate); 320 321 d->pData = realloc(d->pData, d->nData+pUpdate->nData); 322 p = d->pData + iDoclist; 323 324 memmove(p+pUpdate->nData, p, docListEnd(d) - p); 325 memcpy(p, pUpdate->pData, pUpdate->nData); 326 d->nData += pUpdate->nData; 327 modified = 1; 328 } 329 330 return modified; 331 } 332 333 /* Split the second half of doclist d into a separate doclist d2. Returns 1 334 * if successful, or 0 if d contains a single document and hence can't be 335 * split. */ 336 static int docListSplit(DocList *d, DocList *d2){ 337 const char *pSplitPoint = d->pData + d->nData / 2; 338 DocListReader reader; 339 340 readerInit(&reader, d); 341 while( reader.p<pSplitPoint ){ 342 skipDocument(&reader); 343 } 344 if( readerAtEnd(&reader) ) return 0; 345 docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); 346 d->nData = reader.p - d->pData; 347 d->pData = realloc(d->pData, d->nData); 348 return 1; 349 } 350 351 /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked 352 * on-disk doclist, resulting in another in-memory DocList [out]. [in] 353 * and [out] may or may not store position information according to the 354 * caller's wishes. The on-disk doclist always comes with positions. 355 * 356 * The caller must read each chunk of the on-disk doclist in succession and 357 * pass it to mergeBlock(). 358 * 359 * If [in] has positions, then the merge output contains only documents with 360 * matching positions in the two input doclists. If [in] does not have 361 * positions, then the merge output contains all documents common to the two 362 * input doclists. 363 * 364 * If [in] is NULL, then the on-disk doclist is copied to [out] directly. 365 * 366 * A merge is performed using an integer [iOffset] provided by the caller. 367 * [iOffset] is subtracted from each position in the on-disk doclist for the 368 * purpose of position comparison; this is helpful in implementing phrase 369 * searches. 370 * 371 * A DocListMerge is not yet able to propagate offsets through query 372 * processing; we should add that capability soon. 373 */ 374 typedef struct DocListMerge { 375 DocListReader in; 376 DocList *pOut; 377 int iOffset; 378 } DocListMerge; 379 380 static void mergeInit(DocListMerge *m, 381 DocList *pIn, int iOffset, DocList *pOut){ 382 readerInit(&m->in, pIn); 383 m->pOut = pOut; 384 m->iOffset = iOffset; 385 386 /* can't handle offsets yet */ 387 assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); 388 assert( pOut->iType <= DL_POSITIONS ); 389 } 390 391 /* A helper function for mergeBlock(), below. Merge the position lists 392 * pointed to by m->in and pBlockReader. 393 * If the merge matches, write [iDocid] to m->pOut; if m->pOut 394 * has positions then write all matching positions as well. */ 395 static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, 396 DocListReader *pBlockReader){ 397 int block_pos = readPosition(pBlockReader); 398 int in_pos = readPosition(&m->in); 399 int match = 0; 400 while( block_pos!=-1 || in_pos!=-1 ){ 401 if( block_pos-m->iOffset==in_pos ){ 402 if( !match ){ 403 docListAddDocid(m->pOut, iDocid); 404 match = 1; 405 } 406 if( m->pOut->iType >= DL_POSITIONS ){ 407 docListAddPos(m->pOut, in_pos); 408 } 409 block_pos = readPosition(pBlockReader); 410 in_pos = readPosition(&m->in); 411 } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ 412 block_pos = readPosition(pBlockReader); 413 } else { 414 in_pos = readPosition(&m->in); 415 } 416 } 417 if( m->pOut->iType >= DL_POSITIONS && match ){ 418 docListAddEndPos(m->pOut); 419 } 420 } 421 422 /* Merge one block of an on-disk doclist into a DocListMerge. */ 423 static void mergeBlock(DocListMerge *m, DocList *pBlock){ 424 DocListReader blockReader; 425 assert( pBlock->iType >= DL_POSITIONS ); 426 readerInit(&blockReader, pBlock); 427 while( !readerAtEnd(&blockReader) ){ 428 sqlite_int64 iDocid = readDocid(&blockReader); 429 if( m->in.pDoclist!=NULL ){ 430 while( 1 ){ 431 if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ 432 if( peekDocid(&m->in)>=iDocid ) break; 433 skipDocument(&m->in); 434 } 435 if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ 436 skipPositionList(&blockReader); /* skip this docid in the block */ 437 continue; 438 } 439 readDocid(&m->in); 440 } 441 /* We have a document match. */ 442 if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ 443 /* We don't need to do a poslist merge. */ 444 docListAddDocid(m->pOut, iDocid); 445 if( m->pOut->iType >= DL_POSITIONS ){ 446 /* Copy all positions to the output doclist. */ 447 while( 1 ){ 448 int pos = readPosition(&blockReader); 449 if( pos==-1 ) break; 450 docListAddPos(m->pOut, pos); 451 } 452 docListAddEndPos(m->pOut); 453 } else skipPositionList(&blockReader); 454 continue; 455 } 456 mergePosList(m, iDocid, &blockReader); 457 } 458 } 459 460 static char *string_dup_n(const char *s, int n){ 461 char *str = malloc(n + 1); 462 memcpy(str, s, n); 463 str[n] = '\0'; 464 return str; 465 } 466 467 /* Duplicate a string; the caller must free() the returned string. 468 * (We don't use strdup() since it's not part of the standard C library and 469 * may not be available everywhere.) */ 470 static char *string_dup(const char *s){ 471 return string_dup_n(s, strlen(s)); 472 } 473 474 /* Format a string, replacing each occurrence of the % character with 475 * zName. This may be more convenient than sqlite_mprintf() 476 * when one string is used repeatedly in a format string. 477 * The caller must free() the returned string. */ 478 static char *string_format(const char *zFormat, const char *zName){ 479 const char *p; 480 size_t len = 0; 481 size_t nName = strlen(zName); 482 char *result; 483 char *r; 484 485 /* first compute length needed */ 486 for(p = zFormat ; *p ; ++p){ 487 len += (*p=='%' ? nName : 1); 488 } 489 len += 1; /* for null terminator */ 490 491 r = result = malloc(len); 492 for(p = zFormat; *p; ++p){ 493 if( *p=='%' ){ 494 memcpy(r, zName, nName); 495 r += nName; 496 } else { 497 *r++ = *p; 498 } 499 } 500 *r++ = '\0'; 501 assert( r == result + len ); 502 return result; 503 } 504 505 static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ 506 char *zCommand = string_format(zFormat, zName); 507 int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); 508 free(zCommand); 509 return rc; 510 } 511 512 static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, 513 const char *zFormat){ 514 char *zCommand = string_format(zFormat, zName); 515 int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); 516 free(zCommand); 517 return rc; 518 } 519 520 /* end utility functions */ 521 522 #define QUERY_GENERIC 0 523 #define QUERY_FULLTEXT 1 524 525 #define CHUNK_MAX 1024 526 527 typedef enum fulltext_statement { 528 CONTENT_INSERT_STMT, 529 CONTENT_SELECT_STMT, 530 CONTENT_DELETE_STMT, 531 532 TERM_SELECT_STMT, 533 TERM_CHUNK_SELECT_STMT, 534 TERM_INSERT_STMT, 535 TERM_UPDATE_STMT, 536 TERM_DELETE_STMT, 537 538 MAX_STMT /* Always at end! */ 539 } fulltext_statement; 540 541 /* These must exactly match the enum above. */ 542 /* TODO(adam): Is there some risk that a statement (in particular, 543 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a 544 ** query joins a virtual table to itself? If so perhaps we should 545 ** move some of these to the cursor object. 546 */ 547 static const char *fulltext_zStatement[MAX_STMT] = { 548 /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", 549 /* CONTENT_SELECT */ "select content from %_content where rowid = ?", 550 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", 551 552 /* TERM_SELECT */ 553 "select rowid, doclist from %_term where term = ? and first = ?", 554 /* TERM_CHUNK_SELECT */ 555 "select max(first) from %_term where term = ? and first <= ?", 556 /* TERM_INSERT */ 557 "insert into %_term (term, first, doclist) values (?, ?, ?)", 558 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", 559 /* TERM_DELETE */ "delete from %_term where rowid = ?", 560 }; 561 562 typedef struct fulltext_vtab { 563 sqlite3_vtab base; 564 sqlite3 *db; 565 const char *zName; /* virtual table name */ 566 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ 567 568 /* Precompiled statements which we keep as long as the table is 569 ** open. 570 */ 571 sqlite3_stmt *pFulltextStatements[MAX_STMT]; 572 } fulltext_vtab; 573 574 typedef struct fulltext_cursor { 575 sqlite3_vtab_cursor base; 576 int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ 577 578 sqlite3_stmt *pStmt; 579 580 int eof; 581 582 /* The following is used only when iCursorType == QUERY_FULLTEXT. */ 583 DocListReader result; 584 } fulltext_cursor; 585 586 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ 587 return (fulltext_vtab *) c->base.pVtab; 588 } 589 590 static sqlite3_module fulltextModule; /* forward declaration */ 591 592 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. 593 ** If the indicated statement has never been prepared, it is prepared 594 ** and cached, otherwise the cached version is reset. 595 */ 596 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, 597 sqlite3_stmt **ppStmt){ 598 assert( iStmt<MAX_STMT ); 599 if( v->pFulltextStatements[iStmt]==NULL ){ 600 int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], 601 fulltext_zStatement[iStmt]); 602 if( rc!=SQLITE_OK ) return rc; 603 } else { 604 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); 605 if( rc!=SQLITE_OK ) return rc; 606 } 607 608 *ppStmt = v->pFulltextStatements[iStmt]; 609 return SQLITE_OK; 610 } 611 612 /* Step the indicated statement, handling errors SQLITE_BUSY (by 613 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring 614 ** bindings to the new statement). 615 ** TODO(adam): We should extend this function so that it can work with 616 ** statements declared locally, not only globally cached statements. 617 */ 618 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, 619 sqlite3_stmt **ppStmt){ 620 int rc; 621 sqlite3_stmt *s = *ppStmt; 622 assert( iStmt<MAX_STMT ); 623 assert( s==v->pFulltextStatements[iStmt] ); 624 625 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ 626 sqlite3_stmt *pNewStmt; 627 628 if( rc==SQLITE_BUSY ) continue; 629 if( rc!=SQLITE_ERROR ) return rc; 630 631 rc = sqlite3_reset(s); 632 if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; 633 634 v->pFulltextStatements[iStmt] = NULL; /* Still in s */ 635 rc = sql_get_statement(v, iStmt, &pNewStmt); 636 if( rc!=SQLITE_OK ) goto err; 637 *ppStmt = pNewStmt; 638 639 rc = sqlite3_transfer_bindings(s, pNewStmt); 640 if( rc!=SQLITE_OK ) goto err; 641 642 rc = sqlite3_finalize(s); 643 if( rc!=SQLITE_OK ) return rc; 644 s = pNewStmt; 645 } 646 return rc; 647 648 err: 649 sqlite3_finalize(s); 650 return rc; 651 } 652 653 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. 654 ** Useful for statements like UPDATE, where we expect no results. 655 */ 656 static int sql_single_step_statement(fulltext_vtab *v, 657 fulltext_statement iStmt, 658 sqlite3_stmt **ppStmt){ 659 int rc = sql_step_statement(v, iStmt, ppStmt); 660 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; 661 } 662 663 /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ 664 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, 665 const char *zContent, int nContent){ 666 sqlite3_stmt *s; 667 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); 668 if( rc!=SQLITE_OK ) return rc; 669 670 rc = sqlite3_bind_value(s, 1, rowid); 671 if( rc!=SQLITE_OK ) return rc; 672 673 rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); 674 if( rc!=SQLITE_OK ) return rc; 675 676 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); 677 } 678 679 /* select content from %_content where rowid = [iRow] 680 * The caller must delete the returned string. */ 681 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, 682 char **pzContent){ 683 sqlite3_stmt *s; 684 int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); 685 if( rc!=SQLITE_OK ) return rc; 686 687 rc = sqlite3_bind_int64(s, 1, iRow); 688 if( rc!=SQLITE_OK ) return rc; 689 690 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); 691 if( rc!=SQLITE_ROW ) return rc; 692 693 *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); 694 695 /* We expect only one row. We must execute another sqlite3_step() 696 * to complete the iteration; otherwise the table will remain locked. */ 697 rc = sqlite3_step(s); 698 if( rc==SQLITE_DONE ) return SQLITE_OK; 699 700 free(*pzContent); 701 return rc; 702 } 703 704 /* delete from %_content where rowid = [iRow ] */ 705 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ 706 sqlite3_stmt *s; 707 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); 708 if( rc!=SQLITE_OK ) return rc; 709 710 rc = sqlite3_bind_int64(s, 1, iRow); 711 if( rc!=SQLITE_OK ) return rc; 712 713 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); 714 } 715 716 /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] 717 * If found, returns SQLITE_OK; the caller must free the returned doclist. 718 * If no rows found, returns SQLITE_ERROR. */ 719 static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, 720 sqlite_int64 iFirst, 721 sqlite_int64 *rowid, 722 DocList *out){ 723 sqlite3_stmt *s; 724 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); 725 if( rc!=SQLITE_OK ) return rc; 726 727 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); 728 if( rc!=SQLITE_OK ) return rc; 729 730 rc = sqlite3_bind_int64(s, 2, iFirst); 731 if( rc!=SQLITE_OK ) return rc; 732 733 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); 734 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 735 736 *rowid = sqlite3_column_int64(s, 0); 737 docListInit(out, DL_POSITIONS_OFFSETS, 738 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); 739 740 /* We expect only one row. We must execute another sqlite3_step() 741 * to complete the iteration; otherwise the table will remain locked. */ 742 rc = sqlite3_step(s); 743 return rc==SQLITE_DONE ? SQLITE_OK : rc; 744 } 745 746 /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] 747 * If found, returns SQLITE_ROW and result in *piResult; if the query returns 748 * NULL (meaning no row found) returns SQLITE_DONE. 749 */ 750 static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, 751 sqlite_int64 iFirst, sqlite_int64 *piResult){ 752 sqlite3_stmt *s; 753 int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); 754 if( rc!=SQLITE_OK ) return rc; 755 756 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 757 if( rc!=SQLITE_OK ) return rc; 758 759 rc = sqlite3_bind_int64(s, 2, iFirst); 760 if( rc!=SQLITE_OK ) return rc; 761 762 rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); 763 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 764 765 switch( sqlite3_column_type(s, 0) ){ 766 case SQLITE_NULL: 767 rc = SQLITE_DONE; 768 break; 769 case SQLITE_INTEGER: 770 *piResult = sqlite3_column_int64(s, 0); 771 break; 772 default: 773 return SQLITE_ERROR; 774 } 775 /* We expect only one row. We must execute another sqlite3_step() 776 * to complete the iteration; otherwise the table will remain locked. */ 777 if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; 778 return rc; 779 } 780 781 /* insert into %_term (term, first, doclist) 782 values ([zTerm], [iFirst], [doclist]) */ 783 static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, 784 sqlite_int64 iFirst, DocList *doclist){ 785 sqlite3_stmt *s; 786 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); 787 if( rc!=SQLITE_OK ) return rc; 788 789 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 790 if( rc!=SQLITE_OK ) return rc; 791 792 rc = sqlite3_bind_int64(s, 2, iFirst); 793 if( rc!=SQLITE_OK ) return rc; 794 795 rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); 796 if( rc!=SQLITE_OK ) return rc; 797 798 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); 799 } 800 801 /* update %_term set doclist = [doclist] where rowid = [rowid] */ 802 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, 803 DocList *doclist){ 804 sqlite3_stmt *s; 805 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); 806 if( rc!=SQLITE_OK ) return rc; 807 808 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, 809 SQLITE_STATIC); 810 if( rc!=SQLITE_OK ) return rc; 811 812 rc = sqlite3_bind_int64(s, 2, rowid); 813 if( rc!=SQLITE_OK ) return rc; 814 815 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); 816 } 817 818 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ 819 sqlite3_stmt *s; 820 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); 821 if( rc!=SQLITE_OK ) return rc; 822 823 rc = sqlite3_bind_int64(s, 1, rowid); 824 if( rc!=SQLITE_OK ) return rc; 825 826 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); 827 } 828 829 static void fulltext_vtab_destroy(fulltext_vtab *v){ 830 int iStmt; 831 832 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ 833 if( v->pFulltextStatements[iStmt]!=NULL ){ 834 sqlite3_finalize(v->pFulltextStatements[iStmt]); 835 v->pFulltextStatements[iStmt] = NULL; 836 } 837 } 838 839 if( v->pTokenizer!=NULL ){ 840 v->pTokenizer->pModule->xDestroy(v->pTokenizer); 841 v->pTokenizer = NULL; 842 } 843 844 free((void *) v->zName); 845 free(v); 846 } 847 848 /* Current interface: 849 ** argv[0] - module name 850 ** argv[1] - database name 851 ** argv[2] - table name 852 ** argv[3] - tokenizer name (optional, a sensible default is provided) 853 ** argv[4..] - passed to tokenizer (optional based on tokenizer) 854 **/ 855 static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, 856 sqlite3_vtab **ppVTab){ 857 int rc; 858 fulltext_vtab *v; 859 sqlite3_tokenizer_module *m = NULL; 860 861 assert( argc>=3 ); 862 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); 863 /* sqlite will initialize v->base */ 864 v->db = db; 865 v->zName = string_dup(argv[2]); 866 v->pTokenizer = NULL; 867 868 if( argc==3 ){ 869 get_simple_tokenizer_module(&m); 870 } else { 871 /* TODO(shess) For now, add new tokenizers as else if clauses. */ 872 if( !strcmp(argv[3], "simple") ){ 873 get_simple_tokenizer_module(&m); 874 } else { 875 assert( "unrecognized tokenizer"==NULL ); 876 } 877 } 878 879 /* TODO(shess) Since tokenization impacts the index, the parameters 880 ** to the tokenizer need to be identical when a persistent virtual 881 ** table is re-created. One solution would be a meta-table to track 882 ** such information in the database. Then we could verify that the 883 ** information is identical on subsequent creates. 884 */ 885 /* TODO(shess) Why isn't argv already (const char **)? */ 886 rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); 887 if( rc!=SQLITE_OK ) return rc; 888 v->pTokenizer->pModule = m; 889 890 /* TODO: verify the existence of backing tables foo_content, foo_term */ 891 892 rc = sqlite3_declare_vtab(db, "create table x(content text)"); 893 if( rc!=SQLITE_OK ) return rc; 894 895 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); 896 897 *ppVTab = &v->base; 898 return SQLITE_OK; 899 } 900 901 static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, 902 sqlite3_vtab **ppVTab){ 903 int rc; 904 assert( argc>=3 ); 905 906 /* The %_content table holds the text of each full-text item, with 907 ** the rowid used as the docid. 908 ** 909 ** The %_term table maps each term to a document list blob 910 ** containing elements sorted by ascending docid, each element 911 ** encoded as: 912 ** 913 ** docid varint-encoded 914 ** token count varint-encoded 915 ** "count" token elements (poslist): 916 ** position varint-encoded as delta from previous position 917 ** start offset varint-encoded as delta from previous start offset 918 ** end offset varint-encoded as delta from start offset 919 ** 920 ** Additionally, doclist blobs can be chunked into multiple rows, 921 ** using "first" to order the blobs. "first" is simply the first 922 ** docid in the blob. 923 */ 924 /* 925 ** NOTE(shess) That last sentence is incorrect in the face of 926 ** deletion, which can leave a doclist that doesn't contain the 927 ** first from that row. I _believe_ this does not matter to the 928 ** operation of the system, but it might be reasonable to update 929 ** appropriately in case this assumption becomes more important. 930 */ 931 rc = sql_exec(db, argv[2], 932 "create table %_content(content text);" 933 "create table %_term(term text, first integer, doclist blob);" 934 "create index %_index on %_term(term, first)"); 935 if( rc!=SQLITE_OK ) return rc; 936 937 return fulltextConnect(db, pAux, argc, argv, ppVTab); 938 } 939 940 /* Decide how to handle an SQL query. 941 * At the moment, MATCH queries can include implicit boolean ANDs; we 942 * haven't implemented phrase searches or OR yet. */ 943 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ 944 int i; 945 946 for(i=0; i<pInfo->nConstraint; ++i){ 947 const struct sqlite3_index_constraint *pConstraint; 948 pConstraint = &pInfo->aConstraint[i]; 949 if( pConstraint->iColumn==0 && 950 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && 951 pConstraint->usable ){ /* a full-text search */ 952 pInfo->aConstraintUsage[i].argvIndex = 1; 953 pInfo->aConstraintUsage[i].omit = 1; 954 pInfo->idxNum = QUERY_FULLTEXT; 955 pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ 956 return SQLITE_OK; 957 } 958 } 959 pInfo->idxNum = QUERY_GENERIC; 960 return SQLITE_OK; 961 } 962 963 static int fulltextDisconnect(sqlite3_vtab *pVTab){ 964 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 965 return SQLITE_OK; 966 } 967 968 static int fulltextDestroy(sqlite3_vtab *pVTab){ 969 fulltext_vtab *v = (fulltext_vtab *)pVTab; 970 971 int rc = sql_exec(v->db, v->zName, 972 "drop table %_content; drop table %_term"); 973 if( rc!=SQLITE_OK ) return rc; 974 975 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 976 return SQLITE_OK; 977 } 978 979 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ 980 fulltext_cursor *c; 981 982 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); 983 /* sqlite will initialize c->base */ 984 *ppCursor = &c->base; 985 986 return SQLITE_OK; 987 } 988 989 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ 990 fulltext_cursor *c = (fulltext_cursor *) pCursor; 991 sqlite3_finalize(c->pStmt); 992 if( c->result.pDoclist!=NULL ){ 993 docListDelete(c->result.pDoclist); 994 } 995 free(c); 996 return SQLITE_OK; 997 } 998 999 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ 1000 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1001 sqlite_int64 iDocid; 1002 int rc; 1003 1004 switch( c->iCursorType ){ 1005 case QUERY_GENERIC: 1006 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1007 rc = sqlite3_step(c->pStmt); 1008 switch( rc ){ 1009 case SQLITE_ROW: 1010 c->eof = 0; 1011 return SQLITE_OK; 1012 case SQLITE_DONE: 1013 c->eof = 1; 1014 return SQLITE_OK; 1015 default: 1016 c->eof = 1; 1017 return rc; 1018 } 1019 case QUERY_FULLTEXT: 1020 rc = sqlite3_reset(c->pStmt); 1021 if( rc!=SQLITE_OK ) return rc; 1022 1023 if( readerAtEnd(&c->result)){ 1024 c->eof = 1; 1025 return SQLITE_OK; 1026 } 1027 iDocid = readDocid(&c->result); 1028 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); 1029 if( rc!=SQLITE_OK ) return rc; 1030 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1031 rc = sqlite3_step(c->pStmt); 1032 if( rc==SQLITE_ROW ){ /* the case we expect */ 1033 c->eof = 0; 1034 return SQLITE_OK; 1035 } 1036 /* an error occurred; abort */ 1037 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 1038 default: 1039 assert( 0 ); 1040 return SQLITE_ERROR; /* not reached */ 1041 } 1042 } 1043 1044 static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, 1045 sqlite3_stmt **ppStmt){ 1046 int rc; 1047 if( *ppStmt ){ 1048 rc = sqlite3_reset(*ppStmt); 1049 } else { 1050 rc = sql_prepare(v->db, v->zName, ppStmt, 1051 "select doclist from %_term where term = ? order by first"); 1052 } 1053 if( rc!=SQLITE_OK ) return rc; 1054 1055 rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); 1056 if( rc!=SQLITE_OK ) return rc; 1057 1058 return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ 1059 } 1060 1061 /* Read the posting list for [zTerm]; AND it with the doclist [in] to 1062 * produce the doclist [out], using the given offset [iOffset] for phrase 1063 * matching. 1064 * (*pSelect) is used to hold an SQLite statement used inside this function; 1065 * the caller should initialize *pSelect to NULL before the first call. 1066 */ 1067 static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, 1068 const char *zTerm, 1069 DocList *pIn, int iOffset, DocList *out){ 1070 int rc; 1071 DocListMerge merge; 1072 1073 if( pIn!=NULL && !pIn->nData ){ 1074 /* If [pIn] is already empty, there's no point in reading the 1075 * posting list to AND it in; return immediately. */ 1076 return SQLITE_OK; 1077 } 1078 1079 rc = term_select_doclist(v, zTerm, -1, pSelect); 1080 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; 1081 1082 mergeInit(&merge, pIn, iOffset, out); 1083 while( rc==SQLITE_ROW ){ 1084 DocList block; 1085 docListInit(&block, DL_POSITIONS_OFFSETS, 1086 sqlite3_column_blob(*pSelect, 0), 1087 sqlite3_column_bytes(*pSelect, 0)); 1088 mergeBlock(&merge, &block); 1089 docListDestroy(&block); 1090 1091 rc = sqlite3_step(*pSelect); 1092 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ 1093 return rc; 1094 } 1095 } 1096 1097 return SQLITE_OK; 1098 } 1099 1100 typedef struct QueryTerm { 1101 int is_phrase; /* true if this term begins a new phrase */ 1102 const char *zTerm; 1103 } QueryTerm; 1104 1105 /* A parsed query. 1106 * 1107 * As an example, parsing the query ["four score" years "new nation"] will 1108 * yield a Query with 5 terms: 1109 * "four", is_phrase = 1 1110 * "score", is_phrase = 0 1111 * "years", is_phrase = 1 1112 * "new", is_phrase = 1 1113 * "nation", is_phrase = 0 1114 */ 1115 typedef struct Query { 1116 int nTerms; 1117 QueryTerm *pTerm; 1118 } Query; 1119 1120 static void query_add(Query *q, int is_phrase, const char *zTerm){ 1121 QueryTerm *t; 1122 ++q->nTerms; 1123 q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); 1124 t = &q->pTerm[q->nTerms - 1]; 1125 t->is_phrase = is_phrase; 1126 t->zTerm = zTerm; 1127 } 1128 1129 static void query_free(Query *q){ 1130 int i; 1131 for(i = 0; i < q->nTerms; ++i){ 1132 free((void *) q->pTerm[i].zTerm); 1133 } 1134 free(q->pTerm); 1135 } 1136 1137 static int tokenize_segment(sqlite3_tokenizer *pTokenizer, 1138 const char *zQuery, int in_phrase, 1139 Query *pQuery){ 1140 sqlite3_tokenizer_module *pModule = pTokenizer->pModule; 1141 sqlite3_tokenizer_cursor *pCursor; 1142 int is_first = 1; 1143 1144 int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); 1145 if( rc!=SQLITE_OK ) return rc; 1146 pCursor->pTokenizer = pTokenizer; 1147 1148 while( 1 ){ 1149 const char *zToken; 1150 int nToken, iStartOffset, iEndOffset, dummy_pos; 1151 1152 rc = pModule->xNext(pCursor, 1153 &zToken, &nToken, 1154 &iStartOffset, &iEndOffset, 1155 &dummy_pos); 1156 if( rc!=SQLITE_OK ) break; 1157 query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); 1158 is_first = 0; 1159 } 1160 1161 return pModule->xClose(pCursor); 1162 } 1163 1164 /* Parse a query string, yielding a Query object. */ 1165 static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ 1166 char *zQuery1 = string_dup(zQuery); 1167 int in_phrase = 0; 1168 char *s = zQuery1; 1169 pQuery->nTerms = 0; 1170 pQuery->pTerm = NULL; 1171 1172 while( *s ){ 1173 char *t = s; 1174 while( *t ){ 1175 if( *t=='"' ){ 1176 *t++ = '\0'; 1177 break; 1178 } 1179 ++t; 1180 } 1181 if( *s ){ 1182 tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); 1183 } 1184 s = t; 1185 in_phrase = !in_phrase; 1186 } 1187 1188 free(zQuery1); 1189 return SQLITE_OK; 1190 } 1191 1192 /* Perform a full-text query; return a list of documents in [pResult]. */ 1193 static int fulltext_query(fulltext_vtab *v, const char *zQuery, 1194 DocList **pResult){ 1195 Query q; 1196 int phrase_start = -1; 1197 int i; 1198 sqlite3_stmt *pSelect = NULL; 1199 DocList *d = NULL; 1200 1201 int rc = parse_query(v, zQuery, &q); 1202 if( rc!=SQLITE_OK ) return rc; 1203 1204 /* Merge terms. */ 1205 for(i = 0 ; i < q.nTerms ; ++i){ 1206 /* In each merge step, we need to generate positions whenever we're 1207 * processing a phrase which hasn't ended yet. */ 1208 int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; 1209 DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); 1210 if( q.pTerm[i].is_phrase ){ 1211 phrase_start = i; 1212 } 1213 rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); 1214 if( rc!=SQLITE_OK ) break; 1215 if( d!=NULL ){ 1216 docListDelete(d); 1217 } 1218 d = next; 1219 } 1220 1221 sqlite3_finalize(pSelect); 1222 query_free(&q); 1223 *pResult = d; 1224 return rc; 1225 } 1226 1227 static int fulltextFilter(sqlite3_vtab_cursor *pCursor, 1228 int idxNum, const char *idxStr, 1229 int argc, sqlite3_value **argv){ 1230 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1231 fulltext_vtab *v = cursor_vtab(c); 1232 int rc; 1233 const char *zStatement; 1234 1235 c->iCursorType = idxNum; 1236 switch( idxNum ){ 1237 case QUERY_GENERIC: 1238 zStatement = "select rowid, content from %_content"; 1239 break; 1240 1241 case QUERY_FULLTEXT: /* full-text search */ 1242 { 1243 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); 1244 DocList *pResult; 1245 assert( argc==1 ); 1246 rc = fulltext_query(v, zQuery, &pResult); 1247 if( rc!=SQLITE_OK ) return rc; 1248 readerInit(&c->result, pResult); 1249 zStatement = "select rowid, content from %_content where rowid = ?"; 1250 break; 1251 } 1252 1253 default: 1254 assert( 0 ); 1255 } 1256 1257 rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); 1258 if( rc!=SQLITE_OK ) return rc; 1259 1260 return fulltextNext(pCursor); 1261 } 1262 1263 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ 1264 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1265 return c->eof; 1266 } 1267 1268 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, 1269 sqlite3_context *pContext, int idxCol){ 1270 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1271 const char *s; 1272 1273 assert( idxCol==0 ); 1274 s = (const char *) sqlite3_column_text(c->pStmt, 1); 1275 sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); 1276 1277 return SQLITE_OK; 1278 } 1279 1280 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ 1281 fulltext_cursor *c = (fulltext_cursor *) pCursor; 1282 1283 *pRowid = sqlite3_column_int64(c->pStmt, 0); 1284 return SQLITE_OK; 1285 } 1286 1287 /* Build a hash table containing all terms in zText. */ 1288 static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, 1289 const char *zText, sqlite_int64 iDocid){ 1290 sqlite3_tokenizer_cursor *pCursor; 1291 const char *pToken; 1292 int nTokenBytes; 1293 int iStartOffset, iEndOffset, iPosition; 1294 1295 int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); 1296 if( rc!=SQLITE_OK ) return rc; 1297 1298 pCursor->pTokenizer = pTokenizer; 1299 HashInit(terms, HASH_STRING, 1); 1300 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, 1301 &pToken, &nTokenBytes, 1302 &iStartOffset, &iEndOffset, 1303 &iPosition) ){ 1304 DocList *p; 1305 1306 /* Positions can't be negative; we use -1 as a terminator internally. */ 1307 if( iPosition<0 ) { 1308 rc = SQLITE_ERROR; 1309 goto err; 1310 } 1311 1312 p = HashFind(terms, pToken, nTokenBytes); 1313 if( p==NULL ){ 1314 p = docListNew(DL_POSITIONS_OFFSETS); 1315 docListAddDocid(p, iDocid); 1316 HashInsert(terms, pToken, nTokenBytes, p); 1317 } 1318 docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); 1319 } 1320 1321 err: 1322 /* TODO(shess) Check return? Should this be able to cause errors at 1323 ** this point? Actually, same question about sqlite3_finalize(), 1324 ** though one could argue that failure there means that the data is 1325 ** not durable. *ponder* 1326 */ 1327 pTokenizer->pModule->xClose(pCursor); 1328 return rc; 1329 } 1330 /* Update the %_terms table to map the term [zTerm] to the given rowid. */ 1331 static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, 1332 sqlite_int64 iDocid, DocList *p){ 1333 sqlite_int64 iFirst; 1334 sqlite_int64 iIndexRow; 1335 DocList doclist; 1336 1337 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); 1338 if( rc==SQLITE_DONE ){ 1339 docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); 1340 if( docListUpdate(&doclist, iDocid, p) ){ 1341 rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); 1342 docListDestroy(&doclist); 1343 return rc; 1344 } 1345 return SQLITE_OK; 1346 } 1347 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 1348 1349 /* This word is in the index; add this document ID to its blob. */ 1350 1351 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); 1352 if( rc!=SQLITE_OK ) return rc; 1353 1354 if( docListUpdate(&doclist, iDocid, p) ){ 1355 /* If the blob is too big, split it in half. */ 1356 if( doclist.nData>CHUNK_MAX ){ 1357 DocList half; 1358 if( docListSplit(&doclist, &half) ){ 1359 rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); 1360 docListDestroy(&half); 1361 if( rc!=SQLITE_OK ) goto err; 1362 } 1363 } 1364 rc = term_update(v, iIndexRow, &doclist); 1365 } 1366 1367 err: 1368 docListDestroy(&doclist); 1369 return rc; 1370 } 1371 1372 /* Insert a row into the full-text index; set *piRowid to be the ID of the 1373 * new row. */ 1374 static int index_insert(fulltext_vtab *v, 1375 sqlite3_value *pRequestRowid, const char *zText, 1376 sqlite_int64 *piRowid){ 1377 Hash terms; /* maps term string -> PosList */ 1378 HashElem *e; 1379 1380 int rc = content_insert(v, pRequestRowid, zText, -1); 1381 if( rc!=SQLITE_OK ) return rc; 1382 *piRowid = sqlite3_last_insert_rowid(v->db); 1383 1384 if( !zText ) return SQLITE_OK; /* nothing to index */ 1385 1386 rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); 1387 if( rc!=SQLITE_OK ) return rc; 1388 1389 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1390 DocList *p = HashData(e); 1391 rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); 1392 if( rc!=SQLITE_OK ) break; 1393 } 1394 1395 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1396 DocList *p = HashData(e); 1397 docListDelete(p); 1398 } 1399 HashClear(&terms); 1400 return rc; 1401 } 1402 1403 static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, 1404 sqlite_int64 iDocid){ 1405 sqlite_int64 iFirst; 1406 sqlite_int64 iIndexRow; 1407 DocList doclist; 1408 1409 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); 1410 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 1411 1412 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); 1413 if( rc!=SQLITE_OK ) return rc; 1414 1415 if( docListUpdate(&doclist, iDocid, NULL) ){ 1416 if( doclist.nData>0 ){ 1417 rc = term_update(v, iIndexRow, &doclist); 1418 } else { /* empty posting list */ 1419 rc = term_delete(v, iIndexRow); 1420 } 1421 } 1422 docListDestroy(&doclist); 1423 return rc; 1424 } 1425 1426 /* Delete a row from the full-text index. */ 1427 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ 1428 char *zText; 1429 Hash terms; 1430 HashElem *e; 1431 1432 int rc = content_select(v, iRow, &zText); 1433 if( rc!=SQLITE_OK ) return rc; 1434 1435 rc = build_terms(&terms, v->pTokenizer, zText, iRow); 1436 free(zText); 1437 if( rc!=SQLITE_OK ) return rc; 1438 1439 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1440 rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); 1441 if( rc!=SQLITE_OK ) break; 1442 } 1443 for(e=HashFirst(&terms); e; e=HashNext(e)){ 1444 DocList *p = HashData(e); 1445 docListDelete(p); 1446 } 1447 HashClear(&terms); 1448 1449 return content_delete(v, iRow); 1450 } 1451 1452 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, 1453 sqlite_int64 *pRowid){ 1454 fulltext_vtab *v = (fulltext_vtab *) pVtab; 1455 1456 if( nArg<2 ){ 1457 return index_delete(v, sqlite3_value_int64(ppArg[0])); 1458 } 1459 1460 if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ 1461 return SQLITE_ERROR; /* an update; not yet supported */ 1462 } 1463 1464 assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ 1465 return index_insert(v, ppArg[1], 1466 (const char *)sqlite3_value_text(ppArg[2]), pRowid); 1467 } 1468 1469 static sqlite3_module fulltextModule = { 1470 0, 1471 fulltextCreate, 1472 fulltextConnect, 1473 fulltextBestIndex, 1474 fulltextDisconnect, 1475 fulltextDestroy, 1476 fulltextOpen, 1477 fulltextClose, 1478 fulltextFilter, 1479 fulltextNext, 1480 fulltextEof, 1481 fulltextColumn, 1482 fulltextRowid, 1483 fulltextUpdate 1484 }; 1485 1486 int fulltext_init(sqlite3 *db){ 1487 return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); 1488 } 1489 1490 #if !SQLITE_CORE 1491 int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, 1492 const sqlite3_api_routines *pApi){ 1493 SQLITE_EXTENSION_INIT2(pApi) 1494 return fulltext_init(db); 1495 } 1496 #endif 1497