Home | History | Annotate | Download | only in test
      1 # 2009 November 04
      2 #
      3 # The author disclaims copyright to this source code.  In place of
      4 # a legal notice, here is a blessing:
      5 #
      6 #    May you do good and not evil.
      7 #    May you find forgiveness for yourself and forgive others.
      8 #    May you share freely, never taking more than you give.
      9 #
     10 #***********************************************************************
     11 #
     12 # This file contains common code used the fts3 tests. At one point
     13 # equivalent functionality was implemented in C code. But it is easier
     14 # to use Tcl.
     15 #
     16 
     17 #-------------------------------------------------------------------------
     18 # USAGE: fts3_integrity_check TBL
     19 #
     20 # This proc is used to verify that the full-text index is consistent with
     21 # the contents of the fts3 table. In other words, it checks that the
     22 # data in the %_contents table matches that in the %_segdir and %_segments
     23 # tables.
     24 #
     25 # This is not an efficient procedure. It uses a lot of memory and a lot
     26 # of CPU. But it is better than not checking at all.
     27 #
     28 # The procedure is:
     29 #
     30 #   1) Read the entire full-text index from the %_segdir and %_segments
     31 #      tables into memory. For each entry in the index, the following is
     32 #      done:
     33 #
     34 #          set C($iDocid,$iCol,$iPosition) $zTerm
     35 #
     36 #   2) Iterate through each column of each row of the %_content table.
     37 #      Tokenize all documents, and check that for each token there is
     38 #      a corresponding entry in the $C array. After checking a token,
     39 #      [unset] the $C array entry.
     40 #
     41 #   3) Check that array $C is now empty.
     42 #
     43 #
     44 proc fts3_integrity_check {tbl} {
     45 
     46   fts3_read2 $tbl 1 A
     47 
     48   foreach zTerm [array names A] {
     49     foreach doclist $A($zTerm) {
     50       set docid 0
     51       while {[string length $doclist]>0} {
     52         set iCol 0
     53         set iPos 0
     54         set lPos [list]
     55         set lCol [list]
     56 
     57         # First varint of a doclist-entry is the docid. Delta-compressed
     58         # with respect to the docid of the previous entry.
     59         #
     60         incr docid [gobble_varint doclist]
     61         if {[info exists D($zTerm,$docid)]} {
     62           while {[set iDelta [gobble_varint doclist]] != 0} {}
     63           continue
     64         }
     65         set D($zTerm,$docid) 1
     66 
     67         # Gobble varints until the 0x00 that terminates the doclist-entry
     68         # is found.
     69         while {[set iDelta [gobble_varint doclist]] > 0} {
     70           if {$iDelta == 1} {
     71             set iCol [gobble_varint doclist]
     72             set iPos 0
     73           } else {
     74             incr iPos $iDelta
     75             incr iPos -2
     76             set C($docid,$iCol,$iPos) $zTerm
     77           }
     78         }
     79       }
     80     }
     81   }
     82 
     83   foreach key [array names C] {
     84     #puts "$key -> $C($key)"
     85   }
     86 
     87 
     88   db eval "SELECT * FROM ${tbl}_content" E {
     89     set iCol 0
     90     set iDoc $E(docid)
     91     foreach col [lrange $E(*) 1 end] {
     92       set c $E($col)
     93       set sql {SELECT fts3_tokenizer_test('simple', $c)}
     94 
     95       foreach {pos term dummy} [db one $sql] {
     96         if {![info exists C($iDoc,$iCol,$pos)]} {
     97           set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
     98           lappend errors $es
     99         } else {
    100           if {$C($iDoc,$iCol,$pos) != "$term"} {
    101             set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
    102             append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
    103             lappend errors $es
    104           }
    105           unset C($iDoc,$iCol,$pos)
    106         }
    107       }
    108       incr iCol
    109     }
    110   }
    111 
    112   foreach c [array names C] {
    113     lappend errors "Bad index entry: $c -> $C($c)"
    114   }
    115 
    116   if {[info exists errors]} { return [join $errors "\n"] }
    117   return "ok"
    118 }
    119 
    120 # USAGE: fts3_terms TBL WHERE
    121 #
    122 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
    123 # SQL expression that will be used as the WHERE clause when scanning
    124 # the %_segdir table. As in the following query:
    125 #
    126 #   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
    127 #
    128 # This function returns a list of all terms present in the segments
    129 # selected by the statement above.
    130 #
    131 proc fts3_terms {tbl where} {
    132   fts3_read $tbl $where a
    133   return [lsort [array names a]]
    134 }
    135 
    136 
    137 # USAGE: fts3_doclist TBL TERM WHERE
    138 #
    139 # Argument TBL must be the name of an FTS3 table. TERM is a term that may
    140 # or may not be present in the table. Argument WHERE is used to select a
    141 # subset of the b-tree segments in the associated full-text index as
    142 # described above for [fts3_terms].
    143 #
    144 # This function returns the results of merging the doclists associated
    145 # with TERM in the selected segments. Each doclist is an element of the
    146 # returned list. Each doclist is formatted as follows:
    147 #
    148 #   [$docid ?$col[$off1 $off2...]?...]
    149 #
    150 # The formatting is odd for a Tcl command in order to be compatible with
    151 # the original C-language implementation. If argument WHERE is "1", then
    152 # any empty doclists are omitted from the returned list.
    153 #
    154 proc fts3_doclist {tbl term where} {
    155   fts3_read $tbl $where a
    156 
    157 
    158   foreach doclist $a($term) {
    159     set docid 0
    160 
    161     while {[string length $doclist]>0} {
    162       set iCol 0
    163       set iPos 0
    164       set lPos [list]
    165       set lCol [list]
    166       incr docid [gobble_varint doclist]
    167 
    168       while {[set iDelta [gobble_varint doclist]] > 0} {
    169         if {$iDelta == 1} {
    170           lappend lCol [list $iCol $lPos]
    171           set iPos 0
    172           set lPos [list]
    173           set iCol [gobble_varint doclist]
    174         } else {
    175           incr iPos $iDelta
    176           incr iPos -2
    177           lappend lPos $iPos
    178         }
    179       }
    180 
    181       if {[llength $lPos]>0} {
    182         lappend lCol [list $iCol $lPos]
    183       }
    184 
    185       if {$where != "1" || [llength $lCol]>0} {
    186         set ret($docid) $lCol
    187       } else {
    188         unset -nocomplain ret($docid)
    189       }
    190     }
    191   }
    192 
    193   set lDoc [list]
    194   foreach docid [lsort -integer [array names ret]] {
    195     set lCol [list]
    196     set cols ""
    197     foreach col $ret($docid) {
    198       foreach {iCol lPos} $col {}
    199       append cols " $iCol\[[join $lPos { }]\]"
    200     }
    201     lappend lDoc "\[${docid}${cols}\]"
    202   }
    203 
    204   join $lDoc " "
    205 }
    206 
    207 ###########################################################################
    208 
    209 proc gobble_varint {varname} {
    210   upvar $varname blob
    211   set n [read_fts3varint $blob ret]
    212   set blob [string range $blob $n end]
    213   return $ret
    214 }
    215 proc gobble_string {varname nLength} {
    216   upvar $varname blob
    217   set ret [string range $blob 0 [expr $nLength-1]]
    218   set blob [string range $blob $nLength end]
    219   return $ret
    220 }
    221 
    222 # The argument is a blob of data representing an FTS3 segment leaf.
    223 # Return a list consisting of alternating terms (strings) and doclists
    224 # (blobs of data).
    225 #
    226 proc fts3_readleaf {blob} {
    227   set zPrev ""
    228   set terms [list]
    229 
    230   while {[string length $blob] > 0} {
    231     set nPrefix [gobble_varint blob]
    232     set nSuffix [gobble_varint blob]
    233 
    234     set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
    235     append zTerm [gobble_string blob $nSuffix]
    236     set doclist [gobble_string blob [gobble_varint blob]]
    237 
    238     lappend terms $zTerm $doclist
    239     set zPrev $zTerm
    240   }
    241 
    242   return $terms
    243 }
    244 
    245 proc fts3_read2 {tbl where varname} {
    246   upvar $varname a
    247   array unset a
    248   db eval " SELECT start_block, leaves_end_block, root
    249             FROM ${tbl}_segdir WHERE $where
    250             ORDER BY level ASC, idx DESC
    251   " {
    252     if {$start_block == 0} {
    253       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
    254     } else {
    255       db eval " SELECT block
    256                 FROM ${tbl}_segments
    257                 WHERE blockid>=$start_block AND blockid<=$leaves_end_block
    258                 ORDER BY blockid
    259       " {
    260         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
    261 
    262       }
    263     }
    264   }
    265 }
    266 
    267 proc fts3_read {tbl where varname} {
    268   upvar $varname a
    269   array unset a
    270   db eval " SELECT start_block, leaves_end_block, root
    271             FROM ${tbl}_segdir WHERE $where
    272             ORDER BY level DESC, idx ASC
    273   " {
    274     if {$start_block == 0} {
    275       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
    276     } else {
    277       db eval " SELECT block
    278                 FROM ${tbl}_segments
    279                 WHERE blockid>=$start_block AND blockid<$leaves_end_block
    280                 ORDER BY blockid
    281       " {
    282         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
    283 
    284       }
    285     }
    286   }
    287 }
    288 
    289 ##########################################################################
    290 
    291