1 /* 2 ** The author disclaims copyright to this source code. 3 ** 4 ************************************************************************* 5 ** Implementation of the "simple" full-text-search tokenizer. 6 */ 7 8 #include <assert.h> 9 #if !defined(__APPLE__) 10 #include <malloc.h> 11 #else 12 #include <stdlib.h> 13 #endif 14 #include <stdio.h> 15 #include <string.h> 16 #include <ctype.h> 17 18 #include "tokenizer.h" 19 20 /* Duplicate a string; the caller must free() the returned string. 21 * (We don't use strdup() since it's not part of the standard C library and 22 * may not be available everywhere.) */ 23 /* TODO(shess) Copied from fulltext.c, consider util.c for such 24 ** things. */ 25 static char *string_dup(const char *s){ 26 char *str = malloc(strlen(s) + 1); 27 strcpy(str, s); 28 return str; 29 } 30 31 typedef struct simple_tokenizer { 32 sqlite3_tokenizer base; 33 const char *zDelim; /* token delimiters */ 34 } simple_tokenizer; 35 36 typedef struct simple_tokenizer_cursor { 37 sqlite3_tokenizer_cursor base; 38 const char *pInput; /* input we are tokenizing */ 39 int nBytes; /* size of the input */ 40 const char *pCurrent; /* current position in pInput */ 41 int iToken; /* index of next token to be returned */ 42 char *zToken; /* storage for current token */ 43 int nTokenBytes; /* actual size of current token */ 44 int nTokenAllocated; /* space allocated to zToken buffer */ 45 } simple_tokenizer_cursor; 46 47 static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ 48 49 static int simpleCreate( 50 int argc, const char **argv, 51 sqlite3_tokenizer **ppTokenizer 52 ){ 53 simple_tokenizer *t; 54 55 t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); 56 /* TODO(shess) Delimiters need to remain the same from run to run, 57 ** else we need to reindex. One solution would be a meta-table to 58 ** track such information in the database, then we'd only want this 59 ** information on the initial create. 60 */ 61 if( argc>1 ){ 62 t->zDelim = string_dup(argv[1]); 63 } else { 64 /* Build a string excluding alphanumeric ASCII characters */ 65 char zDelim[0x80]; /* nul-terminated, so nul not a member */ 66 int i, j; 67 for(i=1, j=0; i<0x80; i++){ 68 if( !isalnum(i) ){ 69 zDelim[j++] = i; 70 } 71 } 72 zDelim[j++] = '\0'; 73 assert( j<=sizeof(zDelim) ); 74 t->zDelim = string_dup(zDelim); 75 } 76 77 *ppTokenizer = &t->base; 78 return SQLITE_OK; 79 } 80 81 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ 82 simple_tokenizer *t = (simple_tokenizer *) pTokenizer; 83 84 free((void *) t->zDelim); 85 free(t); 86 87 return SQLITE_OK; 88 } 89 90 static int simpleOpen( 91 sqlite3_tokenizer *pTokenizer, 92 const char *pInput, int nBytes, 93 sqlite3_tokenizer_cursor **ppCursor 94 ){ 95 simple_tokenizer_cursor *c; 96 97 c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); 98 c->pInput = pInput; 99 c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; 100 c->pCurrent = c->pInput; /* start tokenizing at the beginning */ 101 c->iToken = 0; 102 c->zToken = NULL; /* no space allocated, yet. */ 103 c->nTokenBytes = 0; 104 c->nTokenAllocated = 0; 105 106 *ppCursor = &c->base; 107 return SQLITE_OK; 108 } 109 110 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ 111 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 112 113 if( NULL!=c->zToken ){ 114 free(c->zToken); 115 } 116 free(c); 117 118 return SQLITE_OK; 119 } 120 121 static int simpleNext( 122 sqlite3_tokenizer_cursor *pCursor, 123 const char **ppToken, int *pnBytes, 124 int *piStartOffset, int *piEndOffset, int *piPosition 125 ){ 126 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 127 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; 128 int ii; 129 130 while( c->pCurrent-c->pInput<c->nBytes ){ 131 int n = (int) strcspn(c->pCurrent, t->zDelim); 132 if( n>0 ){ 133 if( n+1>c->nTokenAllocated ){ 134 c->zToken = realloc(c->zToken, n+1); 135 } 136 for(ii=0; ii<n; ii++){ 137 /* TODO(shess) This needs expansion to handle UTF-8 138 ** case-insensitivity. 139 */ 140 char ch = c->pCurrent[ii]; 141 c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch; 142 } 143 c->zToken[n] = '\0'; 144 *ppToken = c->zToken; 145 *pnBytes = n; 146 *piStartOffset = (int) (c->pCurrent-c->pInput); 147 *piEndOffset = *piStartOffset+n; 148 *piPosition = c->iToken++; 149 c->pCurrent += n + 1; 150 151 return SQLITE_OK; 152 } 153 c->pCurrent += n + 1; 154 /* TODO(shess) could strspn() to skip delimiters en masse. Needs 155 ** to happen in two places, though, which is annoying. 156 */ 157 } 158 return SQLITE_DONE; 159 } 160 161 static sqlite3_tokenizer_module simpleTokenizerModule = { 162 0, 163 simpleCreate, 164 simpleDestroy, 165 simpleOpen, 166 simpleClose, 167 simpleNext, 168 }; 169 170 void get_simple_tokenizer_module( 171 sqlite3_tokenizer_module **ppModule 172 ){ 173 *ppModule = &simpleTokenizerModule; 174 } 175