Home | History | Annotate | Download | only in test
      1 # 2009 December 03
      2 #
      3 #    May you do good and not evil.
      4 #    May you find forgiveness for yourself and forgive others.
      5 #    May you share freely, never taking more than you give.
      6 #
      7 #***********************************************************************
      8 #
      9 # Brute force (random data) tests for FTS3.
     10 #
     11 
     12 #-------------------------------------------------------------------------
     13 #
     14 # The FTS3 tests implemented in this file focus on testing that FTS3
     15 # returns the correct set of documents for various types of full-text
     16 # query. This is done using pseudo-randomly generated data and queries.
     17 # The expected result of each query is calculated using Tcl code.
     18 #
     19 #   1. The database is initialized to contain a single table with three
     20 #      columns. 100 rows are inserted into the table. Each of the three
     21 #      values in each row is a document consisting of between 0 and 100
     22 #      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
     23 #
     24 #   2. The following is performed 100 times:
     25 #
     26 #      a. A row is inserted into the database. The row contents are 
     27 #         generated as in step 1. The docid is a pseudo-randomly selected
     28 #         value between 0 and 1000000.
     29 # 
     30 #      b. A psuedo-randomly selected row is updated. One of its columns is
     31 #         set to contain a new document generated in the same way as the
     32 #         documents in step 1.
     33 # 
     34 #      c. A psuedo-randomly selected row is deleted.
     35 # 
     36 #      d. For each of several types of fts3 queries, 10 SELECT queries
     37 #         of the form:
     38 # 
     39 #           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
     40 # 
     41 #         are evaluated. The results are compared to those calculated by
     42 #         Tcl code in this file. The patterns used for the different query
     43 #         types are:
     44 # 
     45 #           1.  query = <term>
     46 #           2.  query = <prefix>
     47 #           3.  query = "<term> <term>"
     48 #           4.  query = "<term> <term> <term>"
     49 #           5.  query = "<prefix> <prefix> <prefix>"
     50 #           6.  query = <term> NEAR <term>
     51 #           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
     52 #           8.  query = <term> OR <term>
     53 #           9.  query = <term> NOT <term>
     54 #           10. query = <term> AND <term>
     55 #           11. query = <term> NEAR <term> OR <term> NEAR <term>
     56 #           12. query = <term> NEAR <term> NOT <term> NEAR <term>
     57 #           13. query = <term> NEAR <term> AND <term> NEAR <term>
     58 # 
     59 #         where <term> is a term psuedo-randomly selected from the vocabulary
     60 #         and prefix is the first 2 characters of such a term followed by
     61 #         a "*" character.
     62 #     
     63 #      Every second iteration, steps (a) through (d) above are performed
     64 #      within a single transaction. This forces the queries in (d) to
     65 #      read data from both the database and the in-memory hash table
     66 #      that caches the full-text index entries created by steps (a), (b)
     67 #      and (c) until the transaction is committed.
     68 #
     69 # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
     70 # 500, 1000 and 2000 bytes.
     71 #
     72 # After the test using an advisory node-size of 50, an OOM test is run using
     73 # the database. This test is similar to step (d) above, except that it tests
     74 # the effects of transient and persistent OOM conditions encountered while
     75 # executing each query.
     76 #
     77 
     78 set testdir [file dirname $argv0]
     79 source $testdir/tester.tcl
     80 
     81 # If this build does not include FTS3, skip the tests in this file.
     82 #
     83 ifcapable !fts3 { finish_test ; return }
     84 source $testdir/fts3_common.tcl
     85 source $testdir/malloc_common.tcl
     86 
     87 set G(nVocab) 100
     88 
     89 set nVocab 100
     90 set lVocab [list]
     91 
     92 expr srand(0)
     93 
     94 # Generate a vocabulary of nVocab words. Each word is 3 characters long.
     95 #
     96 set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
     97 for {set i 0} {$i < $nVocab} {incr i} {
     98   set len [expr int(rand()*3)+2]
     99   set    word [lindex $lChar [expr int(rand()*26)]]
    100   append word [lindex $lChar [expr int(rand()*26)]]
    101   if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
    102   if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
    103   lappend lVocab $word
    104 }
    105 
    106 proc random_term {} {
    107   lindex $::lVocab [expr {int(rand()*$::nVocab)}]
    108 }
    109 
    110 # Return a document consisting of $nWord arbitrarily selected terms
    111 # from the $::lVocab list.
    112 #
    113 proc generate_doc {nWord} {
    114   set doc [list]
    115   for {set i 0} {$i < $nWord} {incr i} {
    116     lappend doc [random_term]
    117   }
    118   return $doc
    119 }
    120 
    121 
    122 
    123 # Primitives to update the table.
    124 #
    125 unset -nocomplain t1
    126 proc insert_row {rowid} {
    127   set a [generate_doc [expr int((rand()*100))]]
    128   set b [generate_doc [expr int((rand()*100))]]
    129   set c [generate_doc [expr int((rand()*100))]]
    130   execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
    131   set ::t1($rowid) [list $a $b $c]
    132 }
    133 proc delete_row {rowid} {
    134   execsql { DELETE FROM t1 WHERE rowid = $rowid }
    135   catch {unset ::t1($rowid)}
    136 }
    137 proc update_row {rowid} {
    138   set cols {a b c}
    139   set iCol [expr int(rand()*3)]
    140   set doc  [generate_doc [expr int((rand()*100))]]
    141   lset ::t1($rowid) $iCol $doc
    142   execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
    143 }
    144 
    145 proc simple_phrase {zPrefix} {
    146   set ret [list]
    147 
    148   set reg [string map {* {[^ ]*}} $zPrefix]
    149   set reg " $reg "
    150 
    151   foreach key [lsort -integer [array names ::t1]] {
    152     set value $::t1($key)
    153     set cnt [list]
    154     foreach col $value {
    155       if {[regexp $reg " $col "]} { lappend ret $key ; break }
    156     }
    157   }
    158 
    159   #lsort -uniq -integer $ret
    160   set ret
    161 }
    162 
    163 # This [proc] is used to test the FTS3 matchinfo() function.
    164 # 
    165 proc simple_token_matchinfo {zToken} {
    166 
    167   set nDoc(0) 0
    168   set nDoc(1) 0
    169   set nDoc(2) 0
    170   set nHit(0) 0
    171   set nHit(1) 0
    172   set nHit(2) 0
    173 
    174 
    175   foreach key [array names ::t1] {
    176     set value $::t1($key)
    177     set a($key) [list]
    178     foreach i {0 1 2} col $value {
    179       set hit [llength [lsearch -all $col $zToken]]
    180       lappend a($key) $hit
    181       incr nHit($i) $hit
    182       if {$hit>0} { incr nDoc($i) }
    183     }
    184   }
    185 
    186   set ret [list]
    187   foreach docid [lsort -integer [array names a]] {
    188     if { [lindex [lsort -integer $a($docid)] end] } {
    189       set matchinfo [list 1 3]
    190       foreach i {0 1 2} hit $a($docid) {
    191         lappend matchinfo $hit $nHit($i) $nDoc($i)
    192       }
    193       lappend ret $docid $matchinfo
    194     }
    195   }
    196 
    197   set ret
    198 } 
    199 
    200 proc simple_near {termlist nNear} {
    201   set ret [list]
    202 
    203   foreach {key value} [array get ::t1] {
    204     foreach v $value {
    205 
    206       set l [lsearch -exact -all $v [lindex $termlist 0]]
    207       foreach T [lrange $termlist 1 end] {
    208         set l2 [list]
    209         foreach i $l {
    210           set iStart [expr $i - $nNear - 1]
    211           set iEnd [expr $i + $nNear + 1]
    212           if {$iStart < 0} {set iStart 0}
    213           foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
    214             incr i2 $iStart
    215             if {$i2 != $i} { lappend l2 $i2 } 
    216           }
    217         }
    218         set l [lsort -uniq -integer $l2]
    219       }
    220 
    221       if {[llength $l]} {
    222 #puts "MATCH($key): $v"
    223         lappend ret $key
    224       } 
    225     }
    226   }
    227 
    228   lsort -unique -integer $ret
    229 }
    230 
    231 # The following three procs:
    232 # 
    233 #   setup_not A B
    234 #   setup_or  A B
    235 #   setup_and A B
    236 #
    237 # each take two arguments. Both arguments must be lists of integer values
    238 # sorted by value. The return value is the list produced by evaluating
    239 # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
    240 # AND.
    241 #
    242 proc setop_not {A B} {
    243   foreach b $B { set n($b) {} }
    244   set ret [list]
    245   foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
    246   return $ret
    247 }
    248 proc setop_or {A B} {
    249   lsort -integer -uniq [concat $A $B]
    250 }
    251 proc setop_and {A B} {
    252   foreach b $B { set n($b) {} }
    253   set ret [list]
    254   foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
    255   return $ret
    256 }
    257 
    258 proc mit {blob} {
    259   set scan(littleEndian) i*
    260   set scan(bigEndian) I*
    261   binary scan $blob $scan($::tcl_platform(byteOrder)) r
    262   return $r
    263 }
    264 db func mit mit
    265 
    266 set sqlite_fts3_enable_parentheses 1
    267 
    268 foreach nodesize {50 500 1000 2000} {
    269   catch { array unset ::t1 }
    270 
    271   # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
    272   #
    273   db transaction {
    274     catchsql { DROP TABLE t1 }
    275     execsql "CREATE VIRTUAL TABLE t1 USING fts3(a, b, c)"
    276     execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
    277     for {set i 0} {$i < 100} {incr i} { insert_row $i }
    278   }
    279   
    280   for {set iTest 1} {$iTest <= 100} {incr iTest} {
    281     catchsql COMMIT
    282 
    283     set DO_MALLOC_TEST 0
    284     set nRep 10
    285     if {$iTest==100 && $nodesize==50} { 
    286       set DO_MALLOC_TEST 1 
    287       set nRep 2
    288     }
    289   
    290     # Delete one row, update one row and insert one row.
    291     #
    292     set rows [array names ::t1]
    293     set nRow [llength $rows]
    294     set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
    295     set iDelete $iUpdate
    296     while {$iDelete == $iUpdate} {
    297       set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
    298     }
    299     set iInsert $iUpdate
    300     while {[info exists ::t1($iInsert)]} {
    301       set iInsert [expr {int(rand()*1000000)}]
    302     }
    303     execsql BEGIN
    304       insert_row $iInsert
    305       update_row $iUpdate
    306       delete_row $iDelete
    307     if {0==($iTest%2)} { execsql COMMIT }
    308 
    309     if {0==($iTest%2)} { 
    310       do_test fts3rnd-1.$nodesize.$iTest.0 { fts3_integrity_check t1 } ok 
    311     }
    312 
    313     # Pick 10 terms from the vocabulary. Check that the results of querying
    314     # the database for the set of documents containing each of these terms
    315     # is the same as the result obtained by scanning the contents of the Tcl 
    316     # array for each term.
    317     #
    318     for {set i 0} {$i < 10} {incr i} {
    319       set term [random_term]
    320       do_select_test fts3rnd-1.$nodesize.$iTest.1.$i {
    321         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
    322       } [simple_token_matchinfo $term]
    323     }
    324 
    325     # This time, use the first two characters of each term as a term prefix
    326     # to query for. Test that querying the Tcl array produces the same results
    327     # as querying the FTS3 table for the prefix.
    328     #
    329     for {set i 0} {$i < $nRep} {incr i} {
    330       set prefix [string range [random_term] 0 end-1]
    331       set match "${prefix}*"
    332       do_select_test fts3rnd-1.$nodesize.$iTest.2.$i {
    333         SELECT docid FROM t1 WHERE t1 MATCH $match
    334       } [simple_phrase $match]
    335     }
    336 
    337     # Similar to the above, except for phrase queries.
    338     #
    339     for {set i 0} {$i < $nRep} {incr i} {
    340       set term [list [random_term] [random_term]]
    341       set match "\"$term\""
    342       do_select_test fts3rnd-1.$nodesize.$iTest.3.$i {
    343         SELECT docid FROM t1 WHERE t1 MATCH $match
    344       } [simple_phrase $term]
    345     }
    346 
    347     # Three word phrases.
    348     #
    349     for {set i 0} {$i < $nRep} {incr i} {
    350       set term [list [random_term] [random_term] [random_term]]
    351       set match "\"$term\""
    352       do_select_test fts3rnd-1.$nodesize.$iTest.4.$i {
    353         SELECT docid FROM t1 WHERE t1 MATCH $match
    354       } [simple_phrase $term]
    355     }
    356 
    357     # Three word phrases made up of term-prefixes.
    358     #
    359     for {set i 0} {$i < $nRep} {incr i} {
    360       set    query "[string range [random_term] 0 end-1]* "
    361       append query "[string range [random_term] 0 end-1]* "
    362       append query "[string range [random_term] 0 end-1]*"
    363 
    364       set match "\"$query\""
    365       do_select_test fts3rnd-1.$nodesize.$iTest.5.$i {
    366         SELECT docid FROM t1 WHERE t1 MATCH $match
    367       } [simple_phrase $query]
    368     }
    369 
    370     # A NEAR query with terms as the arguments.
    371     #
    372     for {set i 0} {$i < $nRep} {incr i} {
    373       set terms [list [random_term] [random_term]]
    374       set match [join $terms " NEAR "]
    375       do_select_test fts3rnd-1.$nodesize.$iTest.6.$i {
    376         SELECT docid FROM t1 WHERE t1 MATCH $match 
    377       } [simple_near $terms 10]
    378     }
    379 
    380     # A 3-way NEAR query with terms as the arguments.
    381     #
    382     for {set i 0} {$i < $nRep} {incr i} {
    383       set terms [list [random_term] [random_term] [random_term]]
    384       set nNear 11
    385       set match [join $terms " NEAR/$nNear "]
    386       do_select_test fts3rnd-1.$nodesize.$iTest.7.$i {
    387         SELECT docid FROM t1 WHERE t1 MATCH $match
    388       } [simple_near $terms $nNear]
    389     }
    390     
    391     # Set operations on simple term queries.
    392     #
    393     foreach {tn op proc} {
    394       8  OR  setop_or
    395       9  NOT setop_not
    396       10 AND setop_and
    397     } {
    398       for {set i 0} {$i < $nRep} {incr i} {
    399         set term1 [random_term]
    400         set term2 [random_term]
    401         set match "$term1 $op $term2"
    402         do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i {
    403           SELECT docid FROM t1 WHERE t1 MATCH $match
    404         } [$proc [simple_phrase $term1] [simple_phrase $term2]]
    405       }
    406     }
    407  
    408     # Set operations on NEAR queries.
    409     #
    410     foreach {tn op proc} {
    411       8  OR  setop_or
    412       9  NOT setop_not
    413       10 AND setop_and
    414     } {
    415       for {set i 0} {$i < $nRep} {incr i} {
    416         set term1 [random_term]
    417         set term2 [random_term]
    418         set term3 [random_term]
    419         set term4 [random_term]
    420         set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
    421         do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i {
    422           SELECT docid FROM t1 WHERE t1 MATCH $match
    423         } [$proc                                  \
    424             [simple_near [list $term1 $term2] 10] \
    425             [simple_near [list $term3 $term4] 10]
    426           ]
    427       }
    428     }
    429 
    430     catchsql COMMIT
    431   }
    432 }
    433 
    434 finish_test
    435