1 # 2009 November 04 2 # 3 # The author disclaims copyright to this source code. In place of 4 # a legal notice, here is a blessing: 5 # 6 # May you do good and not evil. 7 # May you find forgiveness for yourself and forgive others. 8 # May you share freely, never taking more than you give. 9 # 10 #*********************************************************************** 11 # 12 # This file contains common code used the fts3 tests. At one point 13 # equivalent functionality was implemented in C code. But it is easier 14 # to use Tcl. 15 # 16 17 #------------------------------------------------------------------------- 18 # USAGE: fts3_integrity_check TBL 19 # 20 # This proc is used to verify that the full-text index is consistent with 21 # the contents of the fts3 table. In other words, it checks that the 22 # data in the %_contents table matches that in the %_segdir and %_segments 23 # tables. 24 # 25 # This is not an efficient procedure. It uses a lot of memory and a lot 26 # of CPU. But it is better than not checking at all. 27 # 28 # The procedure is: 29 # 30 # 1) Read the entire full-text index from the %_segdir and %_segments 31 # tables into memory. For each entry in the index, the following is 32 # done: 33 # 34 # set C($iDocid,$iCol,$iPosition) $zTerm 35 # 36 # 2) Iterate through each column of each row of the %_content table. 37 # Tokenize all documents, and check that for each token there is 38 # a corresponding entry in the $C array. After checking a token, 39 # [unset] the $C array entry. 40 # 41 # 3) Check that array $C is now empty. 42 # 43 # 44 proc fts3_integrity_check {tbl} { 45 46 fts3_read2 $tbl 1 A 47 48 foreach zTerm [array names A] { 49 foreach doclist $A($zTerm) { 50 set docid 0 51 while {[string length $doclist]>0} { 52 set iCol 0 53 set iPos 0 54 set lPos [list] 55 set lCol [list] 56 57 # First varint of a doclist-entry is the docid. Delta-compressed 58 # with respect to the docid of the previous entry. 59 # 60 incr docid [gobble_varint doclist] 61 if {[info exists D($zTerm,$docid)]} { 62 while {[set iDelta [gobble_varint doclist]] != 0} {} 63 continue 64 } 65 set D($zTerm,$docid) 1 66 67 # Gobble varints until the 0x00 that terminates the doclist-entry 68 # is found. 69 while {[set iDelta [gobble_varint doclist]] > 0} { 70 if {$iDelta == 1} { 71 set iCol [gobble_varint doclist] 72 set iPos 0 73 } else { 74 incr iPos $iDelta 75 incr iPos -2 76 set C($docid,$iCol,$iPos) $zTerm 77 } 78 } 79 } 80 } 81 } 82 83 foreach key [array names C] { 84 #puts "$key -> $C($key)" 85 } 86 87 88 db eval "SELECT * FROM ${tbl}_content" E { 89 set iCol 0 90 set iDoc $E(docid) 91 foreach col [lrange $E(*) 1 end] { 92 set c $E($col) 93 set sql {SELECT fts3_tokenizer_test('simple', $c)} 94 95 foreach {pos term dummy} [db one $sql] { 96 if {![info exists C($iDoc,$iCol,$pos)]} { 97 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" 98 lappend errors $es 99 } else { 100 if {$C($iDoc,$iCol,$pos) != "$term"} { 101 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " 102 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" 103 lappend errors $es 104 } 105 unset C($iDoc,$iCol,$pos) 106 } 107 } 108 incr iCol 109 } 110 } 111 112 foreach c [array names C] { 113 lappend errors "Bad index entry: $c -> $C($c)" 114 } 115 116 if {[info exists errors]} { return [join $errors "\n"] } 117 return "ok" 118 } 119 120 # USAGE: fts3_terms TBL WHERE 121 # 122 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an 123 # SQL expression that will be used as the WHERE clause when scanning 124 # the %_segdir table. As in the following query: 125 # 126 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" 127 # 128 # This function returns a list of all terms present in the segments 129 # selected by the statement above. 130 # 131 proc fts3_terms {tbl where} { 132 fts3_read $tbl $where a 133 return [lsort [array names a]] 134 } 135 136 137 # USAGE: fts3_doclist TBL TERM WHERE 138 # 139 # Argument TBL must be the name of an FTS3 table. TERM is a term that may 140 # or may not be present in the table. Argument WHERE is used to select a 141 # subset of the b-tree segments in the associated full-text index as 142 # described above for [fts3_terms]. 143 # 144 # This function returns the results of merging the doclists associated 145 # with TERM in the selected segments. Each doclist is an element of the 146 # returned list. Each doclist is formatted as follows: 147 # 148 # [$docid ?$col[$off1 $off2...]?...] 149 # 150 # The formatting is odd for a Tcl command in order to be compatible with 151 # the original C-language implementation. If argument WHERE is "1", then 152 # any empty doclists are omitted from the returned list. 153 # 154 proc fts3_doclist {tbl term where} { 155 fts3_read $tbl $where a 156 157 158 foreach doclist $a($term) { 159 set docid 0 160 161 while {[string length $doclist]>0} { 162 set iCol 0 163 set iPos 0 164 set lPos [list] 165 set lCol [list] 166 incr docid [gobble_varint doclist] 167 168 while {[set iDelta [gobble_varint doclist]] > 0} { 169 if {$iDelta == 1} { 170 lappend lCol [list $iCol $lPos] 171 set iPos 0 172 set lPos [list] 173 set iCol [gobble_varint doclist] 174 } else { 175 incr iPos $iDelta 176 incr iPos -2 177 lappend lPos $iPos 178 } 179 } 180 181 if {[llength $lPos]>0} { 182 lappend lCol [list $iCol $lPos] 183 } 184 185 if {$where != "1" || [llength $lCol]>0} { 186 set ret($docid) $lCol 187 } else { 188 unset -nocomplain ret($docid) 189 } 190 } 191 } 192 193 set lDoc [list] 194 foreach docid [lsort -integer [array names ret]] { 195 set lCol [list] 196 set cols "" 197 foreach col $ret($docid) { 198 foreach {iCol lPos} $col {} 199 append cols " $iCol\[[join $lPos { }]\]" 200 } 201 lappend lDoc "\[${docid}${cols}\]" 202 } 203 204 join $lDoc " " 205 } 206 207 ########################################################################### 208 209 proc gobble_varint {varname} { 210 upvar $varname blob 211 set n [read_fts3varint $blob ret] 212 set blob [string range $blob $n end] 213 return $ret 214 } 215 proc gobble_string {varname nLength} { 216 upvar $varname blob 217 set ret [string range $blob 0 [expr $nLength-1]] 218 set blob [string range $blob $nLength end] 219 return $ret 220 } 221 222 # The argument is a blob of data representing an FTS3 segment leaf. 223 # Return a list consisting of alternating terms (strings) and doclists 224 # (blobs of data). 225 # 226 proc fts3_readleaf {blob} { 227 set zPrev "" 228 set terms [list] 229 230 while {[string length $blob] > 0} { 231 set nPrefix [gobble_varint blob] 232 set nSuffix [gobble_varint blob] 233 234 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] 235 append zTerm [gobble_string blob $nSuffix] 236 set doclist [gobble_string blob [gobble_varint blob]] 237 238 lappend terms $zTerm $doclist 239 set zPrev $zTerm 240 } 241 242 return $terms 243 } 244 245 proc fts3_read2 {tbl where varname} { 246 upvar $varname a 247 array unset a 248 db eval " SELECT start_block, leaves_end_block, root 249 FROM ${tbl}_segdir WHERE $where 250 ORDER BY level ASC, idx DESC 251 " { 252 if {$start_block == 0} { 253 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 254 } else { 255 db eval " SELECT block 256 FROM ${tbl}_segments 257 WHERE blockid>=$start_block AND blockid<=$leaves_end_block 258 ORDER BY blockid 259 " { 260 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 261 262 } 263 } 264 } 265 } 266 267 proc fts3_read {tbl where varname} { 268 upvar $varname a 269 array unset a 270 db eval " SELECT start_block, leaves_end_block, root 271 FROM ${tbl}_segdir WHERE $where 272 ORDER BY level DESC, idx ASC 273 " { 274 if {$start_block == 0} { 275 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 276 } else { 277 db eval " SELECT block 278 FROM ${tbl}_segments 279 WHERE blockid>=$start_block AND blockid<$leaves_end_block 280 ORDER BY blockid 281 " { 282 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 283 284 } 285 } 286 } 287 } 288 289 ########################################################################## 290 291