Home | History | Annotate | Download | only in src
      1 /*
      2 ** 2012 Jan 11
      3 **
      4 ** The author disclaims copyright to this source code.  In place of
      5 ** a legal notice, here is a blessing:
      6 **
      7 **    May you do good and not evil.
      8 **    May you find forgiveness for yourself and forgive others.
      9 **    May you share freely, never taking more than you give.
     10 */
     11 /* TODO(shess): THIS MODULE IS STILL EXPERIMENTAL.  DO NOT USE IT. */
     12 /* Implements a virtual table "recover" which can be used to recover
     13  * data from a corrupt table.  The table is walked manually, with
     14  * corrupt items skipped.  Additionally, any errors while reading will
     15  * be skipped.
     16  *
     17  * Given a table with this definition:
     18  *
     19  * CREATE TABLE Stuff (
     20  *   name TEXT PRIMARY KEY,
     21  *   value TEXT NOT NULL
     22  * );
     23  *
     24  * to recover the data from teh table, you could do something like:
     25  *
     26  * -- Attach another database, the original is not trustworthy.
     27  * ATTACH DATABASE '/tmp/db.db' AS rdb;
     28  * -- Create a new version of the table.
     29  * CREATE TABLE rdb.Stuff (
     30  *   name TEXT PRIMARY KEY,
     31  *   value TEXT NOT NULL
     32  * );
     33  * -- This will read the original table's data.
     34  * CREATE VIRTUAL TABLE temp.recover_Stuff using recover(
     35  *   main.Stuff,
     36  *   name TEXT STRICT NOT NULL,  -- only real TEXT data allowed
     37  *   value TEXT STRICT NOT NULL
     38  * );
     39  * -- Corruption means the UNIQUE constraint may no longer hold for
     40  * -- Stuff, so either OR REPLACE or OR IGNORE must be used.
     41  * INSERT OR REPLACE INTO rdb.Stuff (rowid, name, value )
     42  *   SELECT rowid, name, value FROM temp.recover_Stuff;
     43  * DROP TABLE temp.recover_Stuff;
     44  * DETACH DATABASE rdb;
     45  * -- Move db.db to replace original db in filesystem.
     46  *
     47  *
     48  * Usage
     49  *
     50  * Given the goal of dealing with corruption, it would not be safe to
     51  * create a recovery table in the database being recovered.  So
     52  * recovery tables must be created in the temp database.  They are not
     53  * appropriate to persist, in any case.  [As a bonus, sqlite_master
     54  * tables can be recovered.  Perhaps more cute than useful, though.]
     55  *
     56  * The parameters are a specifier for the table to read, and a column
     57  * definition for each bit of data stored in that table.  The named
     58  * table must be convertable to a root page number by reading the
     59  * sqlite_master table.  Bare table names are assumed to be in
     60  * database 0 ("main"), other databases can be specified in db.table
     61  * fashion.
     62  *
     63  * Column definitions are similar to BUT NOT THE SAME AS those
     64  * provided to CREATE statements:
     65  *  column-def: column-name [type-name [STRICT] [NOT NULL]]
     66  *  type-name: (ANY|ROWID|INTEGER|FLOAT|NUMERIC|TEXT|BLOB)
     67  *
     68  * Only those exact type names are accepted, there is no type
     69  * intuition.  The only constraints accepted are STRICT (see below)
     70  * and NOT NULL.  Anything unexpected will cause the create to fail.
     71  *
     72  * ANY is a convenience to indicate that manifest typing is desired.
     73  * It is equivalent to not specifying a type at all.  The results for
     74  * such columns will have the type of the data's storage.  The exposed
     75  * schema will contain no type for that column.
     76  *
     77  * ROWID is used for columns representing aliases to the rowid
     78  * (INTEGER PRIMARY KEY, with or without AUTOINCREMENT), to make the
     79  * concept explicit.  Such columns are actually stored as NULL, so
     80  * they cannot be simply ignored.  The exposed schema will be INTEGER
     81  * for that column.
     82  *
     83  * NOT NULL causes rows with a NULL in that column to be skipped.  It
     84  * also adds NOT NULL to the column in the exposed schema.  If the
     85  * table has ever had columns added using ALTER TABLE, then those
     86  * columns implicitly contain NULL for rows which have not been
     87  * updated.  [Workaround using COALESCE() in your SELECT statement.]
     88  *
     89  * The created table is read-only, with no indices.  Any SELECT will
     90  * be a full-table scan, returning each valid row read from the
     91  * storage of the backing table.  The rowid will be the rowid of the
     92  * row from the backing table.  "Valid" means:
     93  * - The cell metadata for the row is well-formed.  Mainly this means that
     94  *   the cell header info describes a payload of the size indicated by
     95  *   the cell's payload size.
     96  * - The cell does not run off the page.
     97  * - The cell does not overlap any other cell on the page.
     98  * - The cell contains doesn't contain too many columns.
     99  * - The types of the serialized data match the indicated types (see below).
    100  *
    101  *
    102  * Type affinity versus type storage.
    103  *
    104  * http://www.sqlite.org/datatype3.html describes SQLite's type
    105  * affinity system.  The system provides for automated coercion of
    106  * types in certain cases, transparently enough that many developers
    107  * do not realize that it is happening.  Importantly, it implies that
    108  * the raw data stored in the database may not have the obvious type.
    109  *
    110  * Differences between the stored data types and the expected data
    111  * types may be a signal of corruption.  This module makes some
    112  * allowances for automatic coercion.  It is important to be concious
    113  * of the difference between the schema exposed by the module, and the
    114  * data types read from storage.  The following table describes how
    115  * the module interprets things:
    116  *
    117  * type     schema   data                     STRICT
    118  * ----     ------   ----                     ------
    119  * ANY      <none>   any                      any
    120  * ROWID    INTEGER  n/a                      n/a
    121  * INTEGER  INTEGER  integer                  integer
    122  * FLOAT    FLOAT    integer or float         float
    123  * NUMERIC  NUMERIC  integer, float, or text  integer or float
    124  * TEXT     TEXT     text or blob             text
    125  * BLOB     BLOB     blob                     blob
    126  *
    127  * type is the type provided to the recover module, schema is the
    128  * schema exposed by the module, data is the acceptable types of data
    129  * decoded from storage, and STRICT is a modification of that.
    130  *
    131  * A very loose recovery system might use ANY for all columns, then
    132  * use the appropriate sqlite3_column_*() calls to coerce to expected
    133  * types.  This doesn't provide much protection if a page from a
    134  * different table with the same column count is linked into an
    135  * inappropriate btree.
    136  *
    137  * A very tight recovery system might use STRICT to enforce typing on
    138  * all columns, preferring to skip rows which are valid at the storage
    139  * level but don't contain the right types.  Note that FLOAT STRICT is
    140  * almost certainly not appropriate, since integral values are
    141  * transparently stored as integers, when that is more efficient.
    142  *
    143  * Another option is to use ANY for all columns and inspect each
    144  * result manually (using sqlite3_column_*).  This should only be
    145  * necessary in cases where developers have used manifest typing (test
    146  * to make sure before you decide that you aren't using manifest
    147  * typing!).
    148  *
    149  *
    150  * Caveats
    151  *
    152  * Leaf pages not referenced by interior nodes will not be found.
    153  *
    154  * Leaf pages referenced from interior nodes of other tables will not
    155  * be resolved.
    156  *
    157  * Rows referencing invalid overflow pages will be skipped.
    158  *
    159  * SQlite rows have a header which describes how to interpret the rest
    160  * of the payload.  The header can be valid in cases where the rest of
    161  * the record is actually corrupt (in the sense that the data is not
    162  * the intended data).  This can especially happen WRT overflow pages,
    163  * as lack of atomic updates between pages is the primary form of
    164  * corruption I have seen in the wild.
    165  */
    166 /* The implementation is via a series of cursors.  The cursor
    167  * implementations follow the pattern:
    168  *
    169  * // Creates the cursor using various initialization info.
    170  * int cursorCreate(...);
    171  *
    172  * // Returns 1 if there is no more data, 0 otherwise.
    173  * int cursorEOF(Cursor *pCursor);
    174  *
    175  * // Various accessors can be used if not at EOF.
    176  *
    177  * // Move to the next item.
    178  * int cursorNext(Cursor *pCursor);
    179  *
    180  * // Destroy the memory associated with the cursor.
    181  * void cursorDestroy(Cursor *pCursor);
    182  *
    183  * References in the following are to sections at
    184  * http://www.sqlite.org/fileformat2.html .
    185  *
    186  * RecoverLeafCursor iterates the records in a leaf table node
    187  * described in section 1.5 "B-tree Pages".  When the node is
    188  * exhausted, an interior cursor is used to get the next leaf node,
    189  * and iteration continues there.
    190  *
    191  * RecoverInteriorCursor iterates the child pages in an interior table
    192  * node described in section 1.5 "B-tree Pages".  When the node is
    193  * exhausted, a parent interior cursor is used to get the next
    194  * interior node at the same level, and iteration continues there.
    195  *
    196  * Together these record the path from the leaf level to the root of
    197  * the tree.  Iteration happens from the leaves rather than the root
    198  * both for efficiency and putting the special case at the front of
    199  * the list is easier to implement.
    200  *
    201  * RecoverCursor uses a RecoverLeafCursor to iterate the rows of a
    202  * table, returning results via the SQLite virtual table interface.
    203  */
    204 /* TODO(shess): It might be useful to allow DEFAULT in types to
    205  * specify what to do for NULL when an ALTER TABLE case comes up.
    206  * Unfortunately, simply adding it to the exposed schema and using
    207  * sqlite3_result_null() does not cause the default to be generate.
    208  * Handling it ourselves seems hard, unfortunately.
    209  */
    210 
    211 #include <assert.h>
    212 #include <ctype.h>
    213 #include <stdio.h>
    214 #include <string.h>
    215 
    216 /* Internal SQLite things that are used:
    217  * u32, u64, i64 types.
    218  * Btree, Pager, and DbPage structs.
    219  * DbPage.pData, .pPager, and .pgno
    220  * sqlite3 struct.
    221  * sqlite3BtreePager() and sqlite3BtreeGetPageSize()
    222  * sqlite3PagerAcquire() and sqlite3PagerUnref()
    223  * getVarint().
    224  */
    225 #include "sqliteInt.h"
    226 
    227 /* For debugging. */
    228 #if 0
    229 #define FNENTRY() fprintf(stderr, "In %s\n", __FUNCTION__)
    230 #else
    231 #define FNENTRY()
    232 #endif
    233 
    234 /* Generic constants and helper functions. */
    235 
    236 static const unsigned char kTableLeafPage = 0x0D;
    237 static const unsigned char kTableInteriorPage = 0x05;
    238 
    239 /* From section 1.5. */
    240 static const unsigned kiPageTypeOffset = 0;
    241 static const unsigned kiPageFreeBlockOffset = 1;
    242 static const unsigned kiPageCellCountOffset = 3;
    243 static const unsigned kiPageCellContentOffset = 5;
    244 static const unsigned kiPageFragmentedBytesOffset = 7;
    245 static const unsigned knPageLeafHeaderBytes = 8;
    246 /* Interior pages contain an additional field. */
    247 static const unsigned kiPageRightChildOffset = 8;
    248 static const unsigned kiPageInteriorHeaderBytes = 12;
    249 
    250 /* Accepted types are specified by a mask. */
    251 #define MASK_ROWID (1<<0)
    252 #define MASK_INTEGER (1<<1)
    253 #define MASK_FLOAT (1<<2)
    254 #define MASK_TEXT (1<<3)
    255 #define MASK_BLOB (1<<4)
    256 #define MASK_NULL (1<<5)
    257 
    258 /* Helpers to decode fixed-size fields. */
    259 static u32 decodeUnsigned16(const unsigned char *pData){
    260   return (pData[0]<<8) + pData[1];
    261 }
    262 static u32 decodeUnsigned32(const unsigned char *pData){
    263   return (decodeUnsigned16(pData)<<16) + decodeUnsigned16(pData+2);
    264 }
    265 static i64 decodeSigned(const unsigned char *pData, unsigned nBytes){
    266   i64 r = (char)(*pData);
    267   while( --nBytes ){
    268     r <<= 8;
    269     r += *(++pData);
    270   }
    271   return r;
    272 }
    273 /* Derived from vdbeaux.c, sqlite3VdbeSerialGet(), case 7. */
    274 /* TODO(shess): Determine if swapMixedEndianFloat() applies. */
    275 static double decodeFloat64(const unsigned char *pData){
    276 #if !defined(NDEBUG)
    277   static const u64 t1 = ((u64)0x3ff00000)<<32;
    278   static const double r1 = 1.0;
    279   u64 t2 = t1;
    280   assert( sizeof(r1)==sizeof(t2) && memcmp(&r1, &t2, sizeof(r1))==0 );
    281 #endif
    282   i64 x = decodeSigned(pData, 8);
    283   double d;
    284   memcpy(&d, &x, sizeof(x));
    285   return d;
    286 }
    287 
    288 /* Return true if a varint can safely be read from pData/nData. */
    289 /* TODO(shess): DbPage points into the middle of a buffer which
    290  * contains the page data before DbPage.  So code should always be
    291  * able to read a small number of varints safely.  Consider whether to
    292  * trust that or not.
    293  */
    294 static int checkVarint(const unsigned char *pData, unsigned nData){
    295   unsigned i;
    296 
    297   /* In the worst case the decoder takes all 8 bits of the 9th byte. */
    298   if( nData>=9 ){
    299     return 1;
    300   }
    301 
    302   /* Look for a high-bit-clear byte in what's left. */
    303   for( i=0; i<nData; ++i ){
    304     if( !(pData[i]&0x80) ){
    305       return 1;
    306     }
    307   }
    308 
    309   /* Cannot decode in the space given. */
    310   return 0;
    311 }
    312 
    313 /* Return 1 if n varints can be read from pData/nData. */
    314 static int checkVarints(const unsigned char *pData, unsigned nData,
    315                         unsigned n){
    316   unsigned nCur = 0;   /* Byte offset within current varint. */
    317   unsigned nFound = 0; /* Number of varints found. */
    318   unsigned i;
    319 
    320   /* In the worst case the decoder takes all 8 bits of the 9th byte. */
    321   if( nData>=9*n ){
    322     return 1;
    323   }
    324 
    325   for( i=0; nFound<n && i<nData; ++i ){
    326     nCur++;
    327     if( nCur==9 || !(pData[i]&0x80) ){
    328       nFound++;
    329       nCur = 0;
    330     }
    331   }
    332 
    333   return nFound==n;
    334 }
    335 
    336 /* ctype and str[n]casecmp() can be affected by locale (eg, tr_TR).
    337  * These versions consider only the ASCII space.
    338  */
    339 /* TODO(shess): It may be reasonable to just remove the need for these
    340  * entirely.  The module could require "TEXT STRICT NOT NULL", not
    341  * "Text Strict Not Null" or whatever the developer felt like typing
    342  * that day.  Handling corrupt data is a PERFECT place to be pedantic.
    343  */
    344 static int ascii_isspace(char c){
    345   /* From fts3_expr.c */
    346   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
    347 }
    348 static int ascii_isalnum(int x){
    349   /* From fts3_tokenizer1.c */
    350   return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
    351 }
    352 static int ascii_tolower(int x){
    353   /* From fts3_tokenizer1.c */
    354   return (x>='A' && x<='Z') ? x-'A'+'a' : x;
    355 }
    356 /* TODO(shess): Consider sqlite3_strnicmp() */
    357 static int ascii_strncasecmp(const char *s1, const char *s2, size_t n){
    358   const unsigned char *us1 = (const unsigned char *)s1;
    359   const unsigned char *us2 = (const unsigned char *)s2;
    360   while( *us1 && *us2 && n && ascii_tolower(*us1)==ascii_tolower(*us2) ){
    361     us1++, us2++, n--;
    362   }
    363   return n ? ascii_tolower(*us1)-ascii_tolower(*us2) : 0;
    364 }
    365 static int ascii_strcasecmp(const char *s1, const char *s2){
    366   /* If s2 is equal through strlen(s1), will exit while() due to s1's
    367    * trailing NUL, and return NUL-s2[strlen(s1)].
    368    */
    369   return ascii_strncasecmp(s1, s2, strlen(s1)+1);
    370 }
    371 
    372 /* For some reason I kept making mistakes with offset calculations. */
    373 static const unsigned char *PageData(DbPage *pPage, unsigned iOffset){
    374   assert( iOffset<=pPage->nPageSize );
    375   return (unsigned char *)pPage->pData + iOffset;
    376 }
    377 
    378 /* The first page in the file contains a file header in the first 100
    379  * bytes.  The page's header information comes after that.  Note that
    380  * the offsets in the page's header information are relative to the
    381  * beginning of the page, NOT the end of the page header.
    382  */
    383 static const unsigned char *PageHeader(DbPage *pPage){
    384   if( pPage->pgno==1 ){
    385     const unsigned nDatabaseHeader = 100;
    386     return PageData(pPage, nDatabaseHeader);
    387   }else{
    388     return PageData(pPage, 0);
    389   }
    390 }
    391 
    392 /* Helper to fetch the pager and page size for the named database. */
    393 static int GetPager(sqlite3 *db, const char *zName,
    394                     Pager **pPager, unsigned *pnPageSize){
    395   Btree *pBt = NULL;
    396   int i;
    397   for( i=0; i<db->nDb; ++i ){
    398     if( ascii_strcasecmp(db->aDb[i].zName, zName)==0 ){
    399       pBt = db->aDb[i].pBt;
    400       break;
    401     }
    402   }
    403   if( !pBt ){
    404     return SQLITE_ERROR;
    405   }
    406 
    407   *pPager = sqlite3BtreePager(pBt);
    408   *pnPageSize = sqlite3BtreeGetPageSize(pBt) - sqlite3BtreeGetReserve(pBt);
    409   return SQLITE_OK;
    410 }
    411 
    412 /* iSerialType is a type read from a record header.  See "2.1 Record Format".
    413  */
    414 
    415 /* Storage size of iSerialType in bytes.  My interpretation of SQLite
    416  * documentation is that text and blob fields can have 32-bit length.
    417  * Values past 2^31-12 will need more than 32 bits to encode, which is
    418  * why iSerialType is u64.
    419  */
    420 static u32 SerialTypeLength(u64 iSerialType){
    421   switch( iSerialType ){
    422     case 0 : return 0;  /* NULL */
    423     case 1 : return 1;  /* Various integers. */
    424     case 2 : return 2;
    425     case 3 : return 3;
    426     case 4 : return 4;
    427     case 5 : return 6;
    428     case 6 : return 8;
    429     case 7 : return 8;  /* 64-bit float. */
    430     case 8 : return 0;  /* Constant 0. */
    431     case 9 : return 0;  /* Constant 1. */
    432     case 10 : case 11 : assert( !"RESERVED TYPE"); return 0;
    433   }
    434   return (u32)((iSerialType>>1) - 6);
    435 }
    436 
    437 /* True if iSerialType refers to a blob. */
    438 static int SerialTypeIsBlob(u64 iSerialType){
    439   assert( iSerialType>=12 );
    440   return (iSerialType%2)==0;
    441 }
    442 
    443 /* Returns true if the serialized type represented by iSerialType is
    444  * compatible with the given type mask.
    445  */
    446 static int SerialTypeIsCompatible(u64 iSerialType, unsigned char mask){
    447   switch( iSerialType ){
    448     case 0  : return (mask&MASK_NULL)!=0;
    449     case 1  : return (mask&MASK_INTEGER)!=0;
    450     case 2  : return (mask&MASK_INTEGER)!=0;
    451     case 3  : return (mask&MASK_INTEGER)!=0;
    452     case 4  : return (mask&MASK_INTEGER)!=0;
    453     case 5  : return (mask&MASK_INTEGER)!=0;
    454     case 6  : return (mask&MASK_INTEGER)!=0;
    455     case 7  : return (mask&MASK_FLOAT)!=0;
    456     case 8  : return (mask&MASK_INTEGER)!=0;
    457     case 9  : return (mask&MASK_INTEGER)!=0;
    458     case 10 : assert( !"RESERVED TYPE"); return 0;
    459     case 11 : assert( !"RESERVED TYPE"); return 0;
    460   }
    461   return (mask&(SerialTypeIsBlob(iSerialType) ? MASK_BLOB : MASK_TEXT));
    462 }
    463 
    464 /* Versions of strdup() with return values appropriate for
    465  * sqlite3_free().  malloc.c has sqlite3DbStrDup()/NDup(), but those
    466  * need sqlite3DbFree(), which seems intrusive.
    467  */
    468 static char *sqlite3_strndup(const char *z, unsigned n){
    469   char *zNew;
    470 
    471   if( z==NULL ){
    472     return NULL;
    473   }
    474 
    475   zNew = sqlite3_malloc(n+1);
    476   if( zNew!=NULL ){
    477     memcpy(zNew, z, n);
    478     zNew[n] = '\0';
    479   }
    480   return zNew;
    481 }
    482 static char *sqlite3_strdup(const char *z){
    483   if( z==NULL ){
    484     return NULL;
    485   }
    486   return sqlite3_strndup(z, strlen(z));
    487 }
    488 
    489 /* Fetch the page number of zTable in zDb from sqlite_master in zDb,
    490  * and put it in *piRootPage.
    491  */
    492 static int getRootPage(sqlite3 *db, const char *zDb, const char *zTable,
    493                        u32 *piRootPage){
    494   char *zSql;  /* SQL selecting root page of named element. */
    495   sqlite3_stmt *pStmt;
    496   int rc;
    497 
    498   if( strcmp(zTable, "sqlite_master")==0 ){
    499     *piRootPage = 1;
    500     return SQLITE_OK;
    501   }
    502 
    503   zSql = sqlite3_mprintf("SELECT rootpage FROM %s.sqlite_master "
    504                          "WHERE type = 'table' AND tbl_name = %Q",
    505                          zDb, zTable);
    506   if( !zSql ){
    507     return SQLITE_NOMEM;
    508   }
    509 
    510   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    511   sqlite3_free(zSql);
    512   if( rc!=SQLITE_OK ){
    513     return rc;
    514   }
    515 
    516   /* Require a result. */
    517   rc = sqlite3_step(pStmt);
    518   if( rc==SQLITE_DONE ){
    519     rc = SQLITE_CORRUPT;
    520   }else if( rc==SQLITE_ROW ){
    521     *piRootPage = sqlite3_column_int(pStmt, 0);
    522 
    523     /* Require only one result. */
    524     rc = sqlite3_step(pStmt);
    525     if( rc==SQLITE_DONE ){
    526       rc = SQLITE_OK;
    527     }else if( rc==SQLITE_ROW ){
    528       rc = SQLITE_CORRUPT;
    529     }
    530   }
    531   sqlite3_finalize(pStmt);
    532   return rc;
    533 }
    534 
    535 static int getEncoding(sqlite3 *db, const char *zDb, int* piEncoding){
    536   sqlite3_stmt *pStmt;
    537   int rc;
    538   char *zSql = sqlite3_mprintf("PRAGMA %s.encoding", zDb);
    539   if( !zSql ){
    540     return SQLITE_NOMEM;
    541   }
    542 
    543   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    544   sqlite3_free(zSql);
    545   if( rc!=SQLITE_OK ){
    546     return rc;
    547   }
    548 
    549   /* Require a result. */
    550   rc = sqlite3_step(pStmt);
    551   if( rc==SQLITE_DONE ){
    552     /* This case should not be possible. */
    553     rc = SQLITE_CORRUPT;
    554   }else if( rc==SQLITE_ROW ){
    555     if( sqlite3_column_type(pStmt, 0)==SQLITE_TEXT ){
    556       const char* z = (const char *)sqlite3_column_text(pStmt, 0);
    557       /* These strings match the literals in pragma.c. */
    558       if( !strcmp(z, "UTF-16le") ){
    559         *piEncoding = SQLITE_UTF16LE;
    560       }else if( !strcmp(z, "UTF-16be") ){
    561         *piEncoding = SQLITE_UTF16BE;
    562       }else if( !strcmp(z, "UTF-8") ){
    563         *piEncoding = SQLITE_UTF8;
    564       }else{
    565         /* This case should not be possible. */
    566         *piEncoding = SQLITE_UTF8;
    567       }
    568     }else{
    569       /* This case should not be possible. */
    570       *piEncoding = SQLITE_UTF8;
    571     }
    572 
    573     /* Require only one result. */
    574     rc = sqlite3_step(pStmt);
    575     if( rc==SQLITE_DONE ){
    576       rc = SQLITE_OK;
    577     }else if( rc==SQLITE_ROW ){
    578       /* This case should not be possible. */
    579       rc = SQLITE_CORRUPT;
    580     }
    581   }
    582   sqlite3_finalize(pStmt);
    583   return rc;
    584 }
    585 
    586 /* Cursor for iterating interior nodes.  Interior page cells contain a
    587  * child page number and a rowid.  The child page contains items left
    588  * of the rowid (less than).  The rightmost page of the subtree is
    589  * stored in the page header.
    590  *
    591  * interiorCursorDestroy - release all resources associated with the
    592  *                         cursor and any parent cursors.
    593  * interiorCursorCreate - create a cursor with the given parent and page.
    594  * interiorCursorEOF - returns true if neither the cursor nor the
    595  *                     parent cursors can return any more data.
    596  * interiorCursorNextPage - fetch the next child page from the cursor.
    597  *
    598  * Logically, interiorCursorNextPage() returns the next child page
    599  * number from the page the cursor is currently reading, calling the
    600  * parent cursor as necessary to get new pages to read, until done.
    601  * SQLITE_ROW if a page is returned, SQLITE_DONE if out of pages,
    602  * error otherwise.  Unfortunately, if the table is corrupted
    603  * unexpected pages can be returned.  If any unexpected page is found,
    604  * leaf or otherwise, it is returned to the caller for processing,
    605  * with the interior cursor left empty.  The next call to
    606  * interiorCursorNextPage() will recurse to the parent cursor until an
    607  * interior page to iterate is returned.
    608  *
    609  * Note that while interiorCursorNextPage() will refuse to follow
    610  * loops, it does not keep track of pages returned for purposes of
    611  * preventing duplication.
    612  *
    613  * Note that interiorCursorEOF() could return false (not at EOF), and
    614  * interiorCursorNextPage() could still return SQLITE_DONE.  This
    615  * could happen if there are more cells to iterate in an interior
    616  * page, but those cells refer to invalid pages.
    617  */
    618 typedef struct RecoverInteriorCursor RecoverInteriorCursor;
    619 struct RecoverInteriorCursor {
    620   RecoverInteriorCursor *pParent; /* Parent node to this node. */
    621   DbPage *pPage;                  /* Reference to leaf page. */
    622   unsigned nPageSize;             /* Size of page. */
    623   unsigned nChildren;             /* Number of children on the page. */
    624   unsigned iChild;                /* Index of next child to return. */
    625 };
    626 
    627 static void interiorCursorDestroy(RecoverInteriorCursor *pCursor){
    628   /* Destroy all the cursors to the root. */
    629   while( pCursor ){
    630     RecoverInteriorCursor *p = pCursor;
    631     pCursor = pCursor->pParent;
    632 
    633     if( p->pPage ){
    634       sqlite3PagerUnref(p->pPage);
    635       p->pPage = NULL;
    636     }
    637 
    638     memset(p, 0xA5, sizeof(*p));
    639     sqlite3_free(p);
    640   }
    641 }
    642 
    643 /* Internal helper.  Reset storage in preparation for iterating pPage. */
    644 static void interiorCursorSetPage(RecoverInteriorCursor *pCursor,
    645                                   DbPage *pPage){
    646   assert( PageHeader(pPage)[kiPageTypeOffset]==kTableInteriorPage );
    647 
    648   if( pCursor->pPage ){
    649     sqlite3PagerUnref(pCursor->pPage);
    650     pCursor->pPage = NULL;
    651   }
    652   pCursor->pPage = pPage;
    653   pCursor->iChild = 0;
    654 
    655   /* A child for each cell, plus one in the header. */
    656   /* TODO(shess): Sanity-check the count?  Page header plus per-cell
    657    * cost of 16-bit offset, 32-bit page number, and one varint
    658    * (minimum 1 byte).
    659    */
    660   pCursor->nChildren = decodeUnsigned16(PageHeader(pPage) +
    661                                         kiPageCellCountOffset) + 1;
    662 }
    663 
    664 static int interiorCursorCreate(RecoverInteriorCursor *pParent,
    665                                 DbPage *pPage, int nPageSize,
    666                                 RecoverInteriorCursor **ppCursor){
    667   RecoverInteriorCursor *pCursor =
    668     sqlite3_malloc(sizeof(RecoverInteriorCursor));
    669   if( !pCursor ){
    670     return SQLITE_NOMEM;
    671   }
    672 
    673   memset(pCursor, 0, sizeof(*pCursor));
    674   pCursor->pParent = pParent;
    675   pCursor->nPageSize = nPageSize;
    676   interiorCursorSetPage(pCursor, pPage);
    677   *ppCursor = pCursor;
    678   return SQLITE_OK;
    679 }
    680 
    681 /* Internal helper.  Return the child page number at iChild. */
    682 static unsigned interiorCursorChildPage(RecoverInteriorCursor *pCursor){
    683   const unsigned char *pPageHeader;  /* Header of the current page. */
    684   const unsigned char *pCellOffsets; /* Offset to page's cell offsets. */
    685   unsigned iCellOffset;              /* Offset of target cell. */
    686 
    687   assert( pCursor->iChild<pCursor->nChildren );
    688 
    689   /* Rightmost child is in the header. */
    690   pPageHeader = PageHeader(pCursor->pPage);
    691   if( pCursor->iChild==pCursor->nChildren-1 ){
    692     return decodeUnsigned32(pPageHeader + kiPageRightChildOffset);
    693   }
    694 
    695   /* Each cell is a 4-byte integer page number and a varint rowid
    696    * which is greater than the rowid of items in that sub-tree (this
    697    * module ignores ordering). The offset is from the beginning of the
    698    * page, not from the page header.
    699    */
    700   pCellOffsets = pPageHeader + kiPageInteriorHeaderBytes;
    701   iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iChild*2);
    702   if( iCellOffset<=pCursor->nPageSize-4 ){
    703     return decodeUnsigned32(PageData(pCursor->pPage, iCellOffset));
    704   }
    705 
    706   /* TODO(shess): Check for cell overlaps?  Cells require 4 bytes plus
    707    * a varint.  Check could be identical to leaf check (or even a
    708    * shared helper testing for "Cells starting in this range"?).
    709    */
    710 
    711   /* If the offset is broken, return an invalid page number. */
    712   return 0;
    713 }
    714 
    715 static int interiorCursorEOF(RecoverInteriorCursor *pCursor){
    716   /* Find a parent with remaining children.  EOF if none found. */
    717   while( pCursor && pCursor->iChild>=pCursor->nChildren ){
    718     pCursor = pCursor->pParent;
    719   }
    720   return pCursor==NULL;
    721 }
    722 
    723 /* Internal helper.  Used to detect if iPage would cause a loop. */
    724 static int interiorCursorPageInUse(RecoverInteriorCursor *pCursor,
    725                                    unsigned iPage){
    726   /* Find any parent using the indicated page. */
    727   while( pCursor && pCursor->pPage->pgno!=iPage ){
    728     pCursor = pCursor->pParent;
    729   }
    730   return pCursor!=NULL;
    731 }
    732 
    733 /* Get the next page from the interior cursor at *ppCursor.  Returns
    734  * SQLITE_ROW with the page in *ppPage, or SQLITE_DONE if out of
    735  * pages, or the error SQLite returned.
    736  *
    737  * If the tree is uneven, then when the cursor attempts to get a new
    738  * interior page from the parent cursor, it may get a non-interior
    739  * page.  In that case, the new page is returned, and *ppCursor is
    740  * updated to point to the parent cursor (this cursor is freed).
    741  */
    742 /* TODO(shess): I've tried to avoid recursion in most of this code,
    743  * but this case is more challenging because the recursive call is in
    744  * the middle of operation.  One option for converting it without
    745  * adding memory management would be to retain the head pointer and
    746  * use a helper to "back up" as needed.  Another option would be to
    747  * reverse the list during traversal.
    748  */
    749 static int interiorCursorNextPage(RecoverInteriorCursor **ppCursor,
    750                                   DbPage **ppPage){
    751   RecoverInteriorCursor *pCursor = *ppCursor;
    752   while( 1 ){
    753     int rc;
    754     const unsigned char *pPageHeader;  /* Header of found page. */
    755 
    756     /* Find a valid child page which isn't on the stack. */
    757     while( pCursor->iChild<pCursor->nChildren ){
    758       const unsigned iPage = interiorCursorChildPage(pCursor);
    759       pCursor->iChild++;
    760       if( interiorCursorPageInUse(pCursor, iPage) ){
    761         fprintf(stderr, "Loop detected at %d\n", iPage);
    762       }else{
    763         int rc = sqlite3PagerAcquire(pCursor->pPage->pPager, iPage, ppPage, 0);
    764         if( rc==SQLITE_OK ){
    765           return SQLITE_ROW;
    766         }
    767       }
    768     }
    769 
    770     /* This page has no more children.  Get next page from parent. */
    771     if( !pCursor->pParent ){
    772       return SQLITE_DONE;
    773     }
    774     rc = interiorCursorNextPage(&pCursor->pParent, ppPage);
    775     if( rc!=SQLITE_ROW ){
    776       return rc;
    777     }
    778 
    779     /* If a non-interior page is received, that either means that the
    780      * tree is uneven, or that a child was re-used (say as an overflow
    781      * page).  Remove this cursor and let the caller handle the page.
    782      */
    783     pPageHeader = PageHeader(*ppPage);
    784     if( pPageHeader[kiPageTypeOffset]!=kTableInteriorPage ){
    785       *ppCursor = pCursor->pParent;
    786       pCursor->pParent = NULL;
    787       interiorCursorDestroy(pCursor);
    788       return SQLITE_ROW;
    789     }
    790 
    791     /* Iterate the new page. */
    792     interiorCursorSetPage(pCursor, *ppPage);
    793     *ppPage = NULL;
    794   }
    795 
    796   assert(NULL);  /* NOTREACHED() */
    797   return SQLITE_CORRUPT;
    798 }
    799 
    800 /* Large rows are spilled to overflow pages.  The row's main page
    801  * stores the overflow page number after the local payload, with a
    802  * linked list forward from there as necessary.  overflowMaybeCreate()
    803  * and overflowGetSegment() provide an abstraction for accessing such
    804  * data while centralizing the code.
    805  *
    806  * overflowDestroy - releases all resources associated with the structure.
    807  * overflowMaybeCreate - create the overflow structure if it is needed
    808  *                       to represent the given record.  See function comment.
    809  * overflowGetSegment - fetch a segment from the record, accounting
    810  *                      for overflow pages.  Segments which are not
    811  *                      entirely contained with a page are constructed
    812  *                      into a buffer which is returned.  See function comment.
    813  */
    814 typedef struct RecoverOverflow RecoverOverflow;
    815 struct RecoverOverflow {
    816   RecoverOverflow *pNextOverflow;
    817   DbPage *pPage;
    818   unsigned nPageSize;
    819 };
    820 
    821 static void overflowDestroy(RecoverOverflow *pOverflow){
    822   while( pOverflow ){
    823     RecoverOverflow *p = pOverflow;
    824     pOverflow = p->pNextOverflow;
    825 
    826     if( p->pPage ){
    827       sqlite3PagerUnref(p->pPage);
    828       p->pPage = NULL;
    829     }
    830 
    831     memset(p, 0xA5, sizeof(*p));
    832     sqlite3_free(p);
    833   }
    834 }
    835 
    836 /* Internal helper.  Used to detect if iPage would cause a loop. */
    837 static int overflowPageInUse(RecoverOverflow *pOverflow, unsigned iPage){
    838   while( pOverflow && pOverflow->pPage->pgno!=iPage ){
    839     pOverflow = pOverflow->pNextOverflow;
    840   }
    841   return pOverflow!=NULL;
    842 }
    843 
    844 /* Setup to access an nRecordBytes record beginning at iRecordOffset
    845  * in pPage.  If nRecordBytes can be satisfied entirely from pPage,
    846  * then no overflow pages are needed an *pnLocalRecordBytes is set to
    847  * nRecordBytes.  Otherwise, *ppOverflow is set to the head of a list
    848  * of overflow pages, and *pnLocalRecordBytes is set to the number of
    849  * bytes local to pPage.
    850  *
    851  * overflowGetSegment() will do the right thing regardless of whether
    852  * those values are set to be in-page or not.
    853  */
    854 static int overflowMaybeCreate(DbPage *pPage, unsigned nPageSize,
    855                                unsigned iRecordOffset, unsigned nRecordBytes,
    856                                unsigned *pnLocalRecordBytes,
    857                                RecoverOverflow **ppOverflow){
    858   unsigned nLocalRecordBytes;  /* Record bytes in the leaf page. */
    859   unsigned iNextPage;          /* Next page number for record data. */
    860   unsigned nBytes;             /* Maximum record bytes as of current page. */
    861   int rc;
    862   RecoverOverflow *pFirstOverflow;  /* First in linked list of pages. */
    863   RecoverOverflow *pLastOverflow;   /* End of linked list. */
    864 
    865   /* Calculations from the "Table B-Tree Leaf Cell" part of section
    866    * 1.5 of http://www.sqlite.org/fileformat2.html .  maxLocal and
    867    * minLocal to match naming in btree.c.
    868    */
    869   const unsigned maxLocal = nPageSize - 35;
    870   const unsigned minLocal = ((nPageSize-12)*32/255)-23;  /* m */
    871 
    872   /* Always fit anything smaller than maxLocal. */
    873   if( nRecordBytes<=maxLocal ){
    874     *pnLocalRecordBytes = nRecordBytes;
    875     *ppOverflow = NULL;
    876     return SQLITE_OK;
    877   }
    878 
    879   /* Calculate the remainder after accounting for minLocal on the leaf
    880    * page and what packs evenly into overflow pages.  If the remainder
    881    * does not fit into maxLocal, then a partially-full overflow page
    882    * will be required in any case, so store as little as possible locally.
    883    */
    884   nLocalRecordBytes = minLocal+((nRecordBytes-minLocal)%(nPageSize-4));
    885   if( maxLocal<nLocalRecordBytes ){
    886     nLocalRecordBytes = minLocal;
    887   }
    888 
    889   /* Don't read off the end of the page. */
    890   if( iRecordOffset+nLocalRecordBytes+4>nPageSize ){
    891     return SQLITE_CORRUPT;
    892   }
    893 
    894   /* First overflow page number is after the local bytes. */
    895   iNextPage =
    896       decodeUnsigned32(PageData(pPage, iRecordOffset + nLocalRecordBytes));
    897   nBytes = nLocalRecordBytes;
    898 
    899   /* While there are more pages to read, and more bytes are needed,
    900    * get another page.
    901    */
    902   pFirstOverflow = pLastOverflow = NULL;
    903   rc = SQLITE_OK;
    904   while( iNextPage && nBytes<nRecordBytes ){
    905     RecoverOverflow *pOverflow;  /* New overflow page for the list. */
    906 
    907     rc = sqlite3PagerAcquire(pPage->pPager, iNextPage, &pPage, 0);
    908     if( rc!=SQLITE_OK ){
    909       break;
    910     }
    911 
    912     pOverflow = sqlite3_malloc(sizeof(RecoverOverflow));
    913     if( !pOverflow ){
    914       sqlite3PagerUnref(pPage);
    915       rc = SQLITE_NOMEM;
    916       break;
    917     }
    918     memset(pOverflow, 0, sizeof(*pOverflow));
    919     pOverflow->pPage = pPage;
    920     pOverflow->nPageSize = nPageSize;
    921 
    922     if( !pFirstOverflow ){
    923       pFirstOverflow = pOverflow;
    924     }else{
    925       pLastOverflow->pNextOverflow = pOverflow;
    926     }
    927     pLastOverflow = pOverflow;
    928 
    929     iNextPage = decodeUnsigned32(pPage->pData);
    930     nBytes += nPageSize-4;
    931 
    932     /* Avoid loops. */
    933     if( overflowPageInUse(pFirstOverflow, iNextPage) ){
    934       fprintf(stderr, "Overflow loop detected at %d\n", iNextPage);
    935       rc = SQLITE_CORRUPT;
    936       break;
    937     }
    938   }
    939 
    940   /* If there were not enough pages, or too many, things are corrupt.
    941    * Not having enough pages is an obvious problem, all the data
    942    * cannot be read.  Too many pages means that the contents of the
    943    * row between the main page and the overflow page(s) is
    944    * inconsistent (most likely one or more of the overflow pages does
    945    * not really belong to this row).
    946    */
    947   if( rc==SQLITE_OK && (nBytes<nRecordBytes || iNextPage) ){
    948     rc = SQLITE_CORRUPT;
    949   }
    950 
    951   if( rc==SQLITE_OK ){
    952     *ppOverflow = pFirstOverflow;
    953     *pnLocalRecordBytes = nLocalRecordBytes;
    954   }else if( pFirstOverflow ){
    955     overflowDestroy(pFirstOverflow);
    956   }
    957   return rc;
    958 }
    959 
    960 /* Use in concert with overflowMaybeCreate() to efficiently read parts
    961  * of a potentially-overflowing record.  pPage and iRecordOffset are
    962  * the values passed into overflowMaybeCreate(), nLocalRecordBytes and
    963  * pOverflow are the values returned by that call.
    964  *
    965  * On SQLITE_OK, *ppBase points to nRequestBytes of data at
    966  * iRequestOffset within the record.  If the data exists contiguously
    967  * in a page, a direct pointer is returned, otherwise a buffer from
    968  * sqlite3_malloc() is returned with the data.  *pbFree is set true if
    969  * sqlite3_free() should be called on *ppBase.
    970  */
    971 /* Operation of this function is subtle.  At any time, pPage is the
    972  * current page, with iRecordOffset and nLocalRecordBytes being record
    973  * data within pPage, and pOverflow being the overflow page after
    974  * pPage.  This allows the code to handle both the initial leaf page
    975  * and overflow pages consistently by adjusting the values
    976  * appropriately.
    977  */
    978 static int overflowGetSegment(DbPage *pPage, unsigned iRecordOffset,
    979                               unsigned nLocalRecordBytes,
    980                               RecoverOverflow *pOverflow,
    981                               unsigned iRequestOffset, unsigned nRequestBytes,
    982                               unsigned char **ppBase, int *pbFree){
    983   unsigned nBase;         /* Amount of data currently collected. */
    984   unsigned char *pBase;   /* Buffer to collect record data into. */
    985 
    986   /* Skip to the page containing the start of the data. */
    987   while( iRequestOffset>=nLocalRecordBytes && pOverflow ){
    988     /* Factor out current page's contribution. */
    989     iRequestOffset -= nLocalRecordBytes;
    990 
    991     /* Move forward to the next page in the list. */
    992     pPage = pOverflow->pPage;
    993     iRecordOffset = 4;
    994     nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
    995     pOverflow = pOverflow->pNextOverflow;
    996   }
    997 
    998   /* If the requested data is entirely within this page, return a
    999    * pointer into the page.
   1000    */
   1001   if( iRequestOffset+nRequestBytes<=nLocalRecordBytes ){
   1002     /* TODO(shess): "assignment discards qualifiers from pointer target type"
   1003      * Having ppBase be const makes sense, but sqlite3_free() takes non-const.
   1004      */
   1005     *ppBase = (unsigned char *)PageData(pPage, iRecordOffset + iRequestOffset);
   1006     *pbFree = 0;
   1007     return SQLITE_OK;
   1008   }
   1009 
   1010   /* The data range would require additional pages. */
   1011   if( !pOverflow ){
   1012     /* Should never happen, the range is outside the nRecordBytes
   1013      * passed to overflowMaybeCreate().
   1014      */
   1015     assert(NULL);  /* NOTREACHED */
   1016     return SQLITE_ERROR;
   1017   }
   1018 
   1019   /* Get a buffer to construct into. */
   1020   nBase = 0;
   1021   pBase = sqlite3_malloc(nRequestBytes);
   1022   if( !pBase ){
   1023     return SQLITE_NOMEM;
   1024   }
   1025   while( nBase<nRequestBytes ){
   1026     /* Copy over data present on this page. */
   1027     unsigned nCopyBytes = nRequestBytes - nBase;
   1028     if( nLocalRecordBytes-iRequestOffset<nCopyBytes ){
   1029       nCopyBytes = nLocalRecordBytes - iRequestOffset;
   1030     }
   1031     memcpy(pBase + nBase, PageData(pPage, iRecordOffset + iRequestOffset),
   1032            nCopyBytes);
   1033     nBase += nCopyBytes;
   1034 
   1035     if( pOverflow ){
   1036       /* Copy from start of record data in future pages. */
   1037       iRequestOffset = 0;
   1038 
   1039       /* Move forward to the next page in the list.  Should match
   1040        * first while() loop.
   1041        */
   1042       pPage = pOverflow->pPage;
   1043       iRecordOffset = 4;
   1044       nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
   1045       pOverflow = pOverflow->pNextOverflow;
   1046     }else if( nBase<nRequestBytes ){
   1047       /* Ran out of overflow pages with data left to deliver.  Not
   1048        * possible if the requested range fits within nRecordBytes
   1049        * passed to overflowMaybeCreate() when creating pOverflow.
   1050        */
   1051       assert(NULL);  /* NOTREACHED */
   1052       sqlite3_free(pBase);
   1053       return SQLITE_ERROR;
   1054     }
   1055   }
   1056   assert( nBase==nRequestBytes );
   1057   *ppBase = pBase;
   1058   *pbFree = 1;
   1059   return SQLITE_OK;
   1060 }
   1061 
   1062 /* Primary structure for iterating the contents of a table.
   1063  *
   1064  * leafCursorDestroy - release all resources associated with the cursor.
   1065  * leafCursorCreate - create a cursor to iterate items from tree at
   1066  *                    the provided root page.
   1067  * leafCursorNextValidCell - get the cursor ready to access data from
   1068  *                           the next valid cell in the table.
   1069  * leafCursorCellRowid - get the current cell's rowid.
   1070  * leafCursorCellColumns - get current cell's column count.
   1071  * leafCursorCellColInfo - get type and data for a column in current cell.
   1072  *
   1073  * leafCursorNextValidCell skips cells which fail simple integrity
   1074  * checks, such as overlapping other cells, or being located at
   1075  * impossible offsets, or where header data doesn't correctly describe
   1076  * payload data.  Returns SQLITE_ROW if a valid cell is found,
   1077  * SQLITE_DONE if all pages in the tree were exhausted.
   1078  *
   1079  * leafCursorCellColInfo() accounts for overflow pages in the style of
   1080  * overflowGetSegment().
   1081  */
   1082 typedef struct RecoverLeafCursor RecoverLeafCursor;
   1083 struct RecoverLeafCursor {
   1084   RecoverInteriorCursor *pParent;  /* Parent node to this node. */
   1085   DbPage *pPage;                   /* Reference to leaf page. */
   1086   unsigned nPageSize;              /* Size of pPage. */
   1087   unsigned nCells;                 /* Number of cells in pPage. */
   1088   unsigned iCell;                  /* Current cell. */
   1089 
   1090   /* Info parsed from data in iCell. */
   1091   i64 iRowid;                      /* rowid parsed. */
   1092   unsigned nRecordCols;            /* how many items in the record. */
   1093   u64 iRecordOffset;               /* offset to record data. */
   1094   /* TODO(shess): nRecordBytes and nRecordHeaderBytes are used in
   1095    * leafCursorCellColInfo() to prevent buffer overruns.
   1096    * leafCursorCellDecode() already verified that the cell is valid, so
   1097    * those checks should be redundant.
   1098    */
   1099   u64 nRecordBytes;                /* Size of record data. */
   1100   unsigned nLocalRecordBytes;      /* Amount of record data in-page. */
   1101   unsigned nRecordHeaderBytes;     /* Size of record header data. */
   1102   unsigned char *pRecordHeader;    /* Pointer to record header data. */
   1103   int bFreeRecordHeader;           /* True if record header requires free. */
   1104   RecoverOverflow *pOverflow;      /* Cell overflow info, if needed. */
   1105 };
   1106 
   1107 /* Internal helper shared between next-page and create-cursor.  If
   1108  * pPage is a leaf page, it will be stored in the cursor and state
   1109  * initialized for reading cells.
   1110  *
   1111  * If pPage is an interior page, a new parent cursor is created and
   1112  * injected on the stack.  This is necessary to handle trees with
   1113  * uneven depth, but also is used during initial setup.
   1114  *
   1115  * If pPage is not a table page at all, it is discarded.
   1116  *
   1117  * If SQLITE_OK is returned, the caller no longer owns pPage,
   1118  * otherwise the caller is responsible for discarding it.
   1119  */
   1120 static int leafCursorLoadPage(RecoverLeafCursor *pCursor, DbPage *pPage){
   1121   const unsigned char *pPageHeader;  /* Header of *pPage */
   1122 
   1123   /* Release the current page. */
   1124   if( pCursor->pPage ){
   1125     sqlite3PagerUnref(pCursor->pPage);
   1126     pCursor->pPage = NULL;
   1127     pCursor->iCell = pCursor->nCells = 0;
   1128   }
   1129 
   1130   /* If the page is an unexpected interior node, inject a new stack
   1131    * layer and try again from there.
   1132    */
   1133   pPageHeader = PageHeader(pPage);
   1134   if( pPageHeader[kiPageTypeOffset]==kTableInteriorPage ){
   1135     RecoverInteriorCursor *pParent;
   1136     int rc = interiorCursorCreate(pCursor->pParent, pPage, pCursor->nPageSize,
   1137                                   &pParent);
   1138     if( rc!=SQLITE_OK ){
   1139       return rc;
   1140     }
   1141     pCursor->pParent = pParent;
   1142     return SQLITE_OK;
   1143   }
   1144 
   1145   /* Not a leaf page, skip it. */
   1146   if( pPageHeader[kiPageTypeOffset]!=kTableLeafPage ){
   1147     sqlite3PagerUnref(pPage);
   1148     return SQLITE_OK;
   1149   }
   1150 
   1151   /* Take ownership of the page and start decoding. */
   1152   pCursor->pPage = pPage;
   1153   pCursor->iCell = 0;
   1154   pCursor->nCells = decodeUnsigned16(pPageHeader + kiPageCellCountOffset);
   1155   return SQLITE_OK;
   1156 }
   1157 
   1158 /* Get the next leaf-level page in the tree.  Returns SQLITE_ROW when
   1159  * a leaf page is found, SQLITE_DONE when no more leaves exist, or any
   1160  * error which occurred.
   1161  */
   1162 static int leafCursorNextPage(RecoverLeafCursor *pCursor){
   1163   if( !pCursor->pParent ){
   1164     return SQLITE_DONE;
   1165   }
   1166 
   1167   /* Repeatedly load the parent's next child page until a leaf is found. */
   1168   do {
   1169     DbPage *pNextPage;
   1170     int rc = interiorCursorNextPage(&pCursor->pParent, &pNextPage);
   1171     if( rc!=SQLITE_ROW ){
   1172       assert( rc==SQLITE_DONE );
   1173       return rc;
   1174     }
   1175 
   1176     rc = leafCursorLoadPage(pCursor, pNextPage);
   1177     if( rc!=SQLITE_OK ){
   1178       sqlite3PagerUnref(pNextPage);
   1179       return rc;
   1180     }
   1181   } while( !pCursor->pPage );
   1182 
   1183   return SQLITE_ROW;
   1184 }
   1185 
   1186 static void leafCursorDestroyCellData(RecoverLeafCursor *pCursor){
   1187   if( pCursor->bFreeRecordHeader ){
   1188     sqlite3_free(pCursor->pRecordHeader);
   1189   }
   1190   pCursor->bFreeRecordHeader = 0;
   1191   pCursor->pRecordHeader = NULL;
   1192 
   1193   if( pCursor->pOverflow ){
   1194     overflowDestroy(pCursor->pOverflow);
   1195     pCursor->pOverflow = NULL;
   1196   }
   1197 }
   1198 
   1199 static void leafCursorDestroy(RecoverLeafCursor *pCursor){
   1200   leafCursorDestroyCellData(pCursor);
   1201 
   1202   if( pCursor->pParent ){
   1203     interiorCursorDestroy(pCursor->pParent);
   1204     pCursor->pParent = NULL;
   1205   }
   1206 
   1207   if( pCursor->pPage ){
   1208     sqlite3PagerUnref(pCursor->pPage);
   1209     pCursor->pPage = NULL;
   1210   }
   1211 
   1212   memset(pCursor, 0xA5, sizeof(*pCursor));
   1213   sqlite3_free(pCursor);
   1214 }
   1215 
   1216 /* Create a cursor to iterate the rows from the leaf pages of a table
   1217  * rooted at iRootPage.
   1218  */
   1219 /* TODO(shess): recoverOpen() calls this to setup the cursor, and I
   1220  * think that recoverFilter() may make a hard assumption that the
   1221  * cursor returned will turn up at least one valid cell.
   1222  *
   1223  * The cases I can think of which break this assumption are:
   1224  * - pPage is a valid leaf page with no valid cells.
   1225  * - pPage is a valid interior page with no valid leaves.
   1226  * - pPage is a valid interior page who's leaves contain no valid cells.
   1227  * - pPage is not a valid leaf or interior page.
   1228  */
   1229 static int leafCursorCreate(Pager *pPager, unsigned nPageSize,
   1230                             u32 iRootPage, RecoverLeafCursor **ppCursor){
   1231   DbPage *pPage;               /* Reference to page at iRootPage. */
   1232   RecoverLeafCursor *pCursor;  /* Leaf cursor being constructed. */
   1233   int rc;
   1234 
   1235   /* Start out with the root page. */
   1236   rc = sqlite3PagerAcquire(pPager, iRootPage, &pPage, 0);
   1237   if( rc!=SQLITE_OK ){
   1238     return rc;
   1239   }
   1240 
   1241   pCursor = sqlite3_malloc(sizeof(RecoverLeafCursor));
   1242   if( !pCursor ){
   1243     sqlite3PagerUnref(pPage);
   1244     return SQLITE_NOMEM;
   1245   }
   1246   memset(pCursor, 0, sizeof(*pCursor));
   1247 
   1248   pCursor->nPageSize = nPageSize;
   1249 
   1250   rc = leafCursorLoadPage(pCursor, pPage);
   1251   if( rc!=SQLITE_OK ){
   1252     sqlite3PagerUnref(pPage);
   1253     leafCursorDestroy(pCursor);
   1254     return rc;
   1255   }
   1256 
   1257   /* pPage wasn't a leaf page, find the next leaf page. */
   1258   if( !pCursor->pPage ){
   1259     rc = leafCursorNextPage(pCursor);
   1260     if( rc!=SQLITE_DONE && rc!=SQLITE_ROW ){
   1261       leafCursorDestroy(pCursor);
   1262       return rc;
   1263     }
   1264   }
   1265 
   1266   *ppCursor = pCursor;
   1267   return SQLITE_OK;
   1268 }
   1269 
   1270 /* Useful for setting breakpoints. */
   1271 static int ValidateError(){
   1272   return SQLITE_ERROR;
   1273 }
   1274 
   1275 /* Setup the cursor for reading the information from cell iCell. */
   1276 static int leafCursorCellDecode(RecoverLeafCursor *pCursor){
   1277   const unsigned char *pPageHeader;  /* Header of current page. */
   1278   const unsigned char *pPageEnd;     /* Byte after end of current page. */
   1279   const unsigned char *pCellOffsets; /* Pointer to page's cell offsets. */
   1280   unsigned iCellOffset;              /* Offset of current cell (iCell). */
   1281   const unsigned char *pCell;        /* Pointer to data at iCellOffset. */
   1282   unsigned nCellMaxBytes;            /* Maximum local size of iCell. */
   1283   unsigned iEndOffset;               /* End of iCell's in-page data. */
   1284   u64 nRecordBytes;                  /* Expected size of cell, w/overflow. */
   1285   u64 iRowid;                        /* iCell's rowid (in table). */
   1286   unsigned nRead;                    /* Amount of cell read. */
   1287   unsigned nRecordHeaderRead;        /* Header data read. */
   1288   u64 nRecordHeaderBytes;            /* Header size expected. */
   1289   unsigned nRecordCols;              /* Columns read from header. */
   1290   u64 nRecordColBytes;               /* Bytes in payload for those columns. */
   1291   unsigned i;
   1292   int rc;
   1293 
   1294   assert( pCursor->iCell<pCursor->nCells );
   1295 
   1296   leafCursorDestroyCellData(pCursor);
   1297 
   1298   /* Find the offset to the row. */
   1299   pPageHeader = PageHeader(pCursor->pPage);
   1300   pCellOffsets = pPageHeader + knPageLeafHeaderBytes;
   1301   pPageEnd = PageData(pCursor->pPage, pCursor->nPageSize);
   1302   if( pCellOffsets + pCursor->iCell*2 + 2 > pPageEnd ){
   1303     return ValidateError();
   1304   }
   1305   iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iCell*2);
   1306   if( iCellOffset>=pCursor->nPageSize ){
   1307     return ValidateError();
   1308   }
   1309 
   1310   pCell = PageData(pCursor->pPage, iCellOffset);
   1311   nCellMaxBytes = pCursor->nPageSize - iCellOffset;
   1312 
   1313   /* B-tree leaf cells lead with varint record size, varint rowid and
   1314    * varint header size.
   1315    */
   1316   /* TODO(shess): The smallest page size is 512 bytes, which has an m
   1317    * of 39.  Three varints need at most 27 bytes to encode.  I think.
   1318    */
   1319   if( !checkVarints(pCell, nCellMaxBytes, 3) ){
   1320     return ValidateError();
   1321   }
   1322 
   1323   nRead = getVarint(pCell, &nRecordBytes);
   1324   assert( iCellOffset+nRead<=pCursor->nPageSize );
   1325   pCursor->nRecordBytes = nRecordBytes;
   1326 
   1327   nRead += getVarint(pCell + nRead, &iRowid);
   1328   assert( iCellOffset+nRead<=pCursor->nPageSize );
   1329   pCursor->iRowid = (i64)iRowid;
   1330 
   1331   pCursor->iRecordOffset = iCellOffset + nRead;
   1332 
   1333   /* Start overflow setup here because nLocalRecordBytes is needed to
   1334    * check cell overlap.
   1335    */
   1336   rc = overflowMaybeCreate(pCursor->pPage, pCursor->nPageSize,
   1337                            pCursor->iRecordOffset, pCursor->nRecordBytes,
   1338                            &pCursor->nLocalRecordBytes,
   1339                            &pCursor->pOverflow);
   1340   if( rc!=SQLITE_OK ){
   1341     return ValidateError();
   1342   }
   1343 
   1344   /* Check that no other cell starts within this cell. */
   1345   iEndOffset = pCursor->iRecordOffset + pCursor->nLocalRecordBytes;
   1346   for( i=0; i<pCursor->nCells && pCellOffsets + i*2 + 2 <= pPageEnd; ++i ){
   1347     const unsigned iOtherOffset = decodeUnsigned16(pCellOffsets + i*2);
   1348     if( iOtherOffset>iCellOffset && iOtherOffset<iEndOffset ){
   1349       return ValidateError();
   1350     }
   1351   }
   1352 
   1353   nRecordHeaderRead = getVarint(pCell + nRead, &nRecordHeaderBytes);
   1354   assert( nRecordHeaderBytes<=nRecordBytes );
   1355   pCursor->nRecordHeaderBytes = nRecordHeaderBytes;
   1356 
   1357   /* Large headers could overflow if pages are small. */
   1358   rc = overflowGetSegment(pCursor->pPage,
   1359                           pCursor->iRecordOffset, pCursor->nLocalRecordBytes,
   1360                           pCursor->pOverflow, 0, nRecordHeaderBytes,
   1361                           &pCursor->pRecordHeader, &pCursor->bFreeRecordHeader);
   1362   if( rc!=SQLITE_OK ){
   1363     return ValidateError();
   1364   }
   1365 
   1366   /* Tally up the column count and size of data. */
   1367   nRecordCols = 0;
   1368   nRecordColBytes = 0;
   1369   while( nRecordHeaderRead<nRecordHeaderBytes ){
   1370     u64 iSerialType;  /* Type descriptor for current column. */
   1371     if( !checkVarint(pCursor->pRecordHeader + nRecordHeaderRead,
   1372                      nRecordHeaderBytes - nRecordHeaderRead) ){
   1373       return ValidateError();
   1374     }
   1375     nRecordHeaderRead += getVarint(pCursor->pRecordHeader + nRecordHeaderRead,
   1376                                    &iSerialType);
   1377     if( iSerialType==10 || iSerialType==11 ){
   1378       return ValidateError();
   1379     }
   1380     nRecordColBytes += SerialTypeLength(iSerialType);
   1381     nRecordCols++;
   1382   }
   1383   pCursor->nRecordCols = nRecordCols;
   1384 
   1385   /* Parsing the header used as many bytes as expected. */
   1386   if( nRecordHeaderRead!=nRecordHeaderBytes ){
   1387     return ValidateError();
   1388   }
   1389 
   1390   /* Calculated record is size of expected record. */
   1391   if( nRecordHeaderBytes+nRecordColBytes!=nRecordBytes ){
   1392     return ValidateError();
   1393   }
   1394 
   1395   return SQLITE_OK;
   1396 }
   1397 
   1398 static i64 leafCursorCellRowid(RecoverLeafCursor *pCursor){
   1399   return pCursor->iRowid;
   1400 }
   1401 
   1402 static unsigned leafCursorCellColumns(RecoverLeafCursor *pCursor){
   1403   return pCursor->nRecordCols;
   1404 }
   1405 
   1406 /* Get the column info for the cell.  Pass NULL for ppBase to prevent
   1407  * retrieving the data segment.  If *pbFree is true, *ppBase must be
   1408  * freed by the caller using sqlite3_free().
   1409  */
   1410 static int leafCursorCellColInfo(RecoverLeafCursor *pCursor,
   1411                                  unsigned iCol, u64 *piColType,
   1412                                  unsigned char **ppBase, int *pbFree){
   1413   const unsigned char *pRecordHeader;  /* Current cell's header. */
   1414   u64 nRecordHeaderBytes;              /* Bytes in pRecordHeader. */
   1415   unsigned nRead;                      /* Bytes read from header. */
   1416   u64 iColEndOffset;                   /* Offset to end of column in cell. */
   1417   unsigned nColsSkipped;               /* Count columns as procesed. */
   1418   u64 iSerialType;                     /* Type descriptor for current column. */
   1419 
   1420   /* Implicit NULL for columns past the end.  This case happens when
   1421    * rows have not been updated since an ALTER TABLE added columns.
   1422    * It is more convenient to address here than in callers.
   1423    */
   1424   if( iCol>=pCursor->nRecordCols ){
   1425     *piColType = 0;
   1426     if( ppBase ){
   1427       *ppBase = 0;
   1428       *pbFree = 0;
   1429     }
   1430     return SQLITE_OK;
   1431   }
   1432 
   1433   /* Must be able to decode header size. */
   1434   pRecordHeader = pCursor->pRecordHeader;
   1435   if( !checkVarint(pRecordHeader, pCursor->nRecordHeaderBytes) ){
   1436     return SQLITE_CORRUPT;
   1437   }
   1438 
   1439   /* Rather than caching the header size and how many bytes it took,
   1440    * decode it every time.
   1441    */
   1442   nRead = getVarint(pRecordHeader, &nRecordHeaderBytes);
   1443   assert( nRecordHeaderBytes==pCursor->nRecordHeaderBytes );
   1444 
   1445   /* Scan forward to the indicated column.  Scans to _after_ column
   1446    * for later range checking.
   1447    */
   1448   /* TODO(shess): This could get expensive for very wide tables.  An
   1449    * array of iSerialType could be built in leafCursorCellDecode(), but
   1450    * the number of columns is dynamic per row, so it would add memory
   1451    * management complexity.  Enough info to efficiently forward
   1452    * iterate could be kept, if all clients forward iterate
   1453    * (recoverColumn() may not).
   1454    */
   1455   iColEndOffset = 0;
   1456   nColsSkipped = 0;
   1457   while( nColsSkipped<=iCol && nRead<nRecordHeaderBytes ){
   1458     if( !checkVarint(pRecordHeader + nRead, nRecordHeaderBytes - nRead) ){
   1459       return SQLITE_CORRUPT;
   1460     }
   1461     nRead += getVarint(pRecordHeader + nRead, &iSerialType);
   1462     iColEndOffset += SerialTypeLength(iSerialType);
   1463     nColsSkipped++;
   1464   }
   1465 
   1466   /* Column's data extends past record's end. */
   1467   if( nRecordHeaderBytes+iColEndOffset>pCursor->nRecordBytes ){
   1468     return SQLITE_CORRUPT;
   1469   }
   1470 
   1471   *piColType = iSerialType;
   1472   if( ppBase ){
   1473     const u32 nColBytes = SerialTypeLength(iSerialType);
   1474 
   1475     /* Offset from start of record to beginning of column. */
   1476     const unsigned iColOffset = nRecordHeaderBytes+iColEndOffset-nColBytes;
   1477 
   1478     return overflowGetSegment(pCursor->pPage, pCursor->iRecordOffset,
   1479                               pCursor->nLocalRecordBytes, pCursor->pOverflow,
   1480                               iColOffset, nColBytes, ppBase, pbFree);
   1481   }
   1482   return SQLITE_OK;
   1483 }
   1484 
   1485 static int leafCursorNextValidCell(RecoverLeafCursor *pCursor){
   1486   while( 1 ){
   1487     int rc;
   1488 
   1489     /* Move to the next cell. */
   1490     pCursor->iCell++;
   1491 
   1492     /* No more cells, get the next leaf. */
   1493     if( pCursor->iCell>=pCursor->nCells ){
   1494       rc = leafCursorNextPage(pCursor);
   1495       if( rc!=SQLITE_ROW ){
   1496         return rc;
   1497       }
   1498       assert( pCursor->iCell==0 );
   1499     }
   1500 
   1501     /* If the cell is valid, indicate that a row is available. */
   1502     rc = leafCursorCellDecode(pCursor);
   1503     if( rc==SQLITE_OK ){
   1504       return SQLITE_ROW;
   1505     }
   1506 
   1507     /* Iterate until done or a valid row is found. */
   1508     /* TODO(shess): Remove debugging output. */
   1509     fprintf(stderr, "Skipping invalid cell\n");
   1510   }
   1511   return SQLITE_ERROR;
   1512 }
   1513 
   1514 typedef struct Recover Recover;
   1515 struct Recover {
   1516   sqlite3_vtab base;
   1517   sqlite3 *db;                /* Host database connection */
   1518   char *zDb;                  /* Database containing target table */
   1519   char *zTable;               /* Target table */
   1520   unsigned nCols;             /* Number of columns in target table */
   1521   unsigned char *pTypes;      /* Types of columns in target table */
   1522 };
   1523 
   1524 /* Internal helper for deleting the module. */
   1525 static void recoverRelease(Recover *pRecover){
   1526   sqlite3_free(pRecover->zDb);
   1527   sqlite3_free(pRecover->zTable);
   1528   sqlite3_free(pRecover->pTypes);
   1529   memset(pRecover, 0xA5, sizeof(*pRecover));
   1530   sqlite3_free(pRecover);
   1531 }
   1532 
   1533 /* Helper function for initializing the module.  Forward-declared so
   1534  * recoverCreate() and recoverConnect() can see it.
   1535  */
   1536 static int recoverInit(
   1537   sqlite3 *, void *, int, const char *const*, sqlite3_vtab **, char **
   1538 );
   1539 
   1540 static int recoverCreate(
   1541   sqlite3 *db,
   1542   void *pAux,
   1543   int argc, const char *const*argv,
   1544   sqlite3_vtab **ppVtab,
   1545   char **pzErr
   1546 ){
   1547   FNENTRY();
   1548   return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
   1549 }
   1550 
   1551 /* This should never be called. */
   1552 static int recoverConnect(
   1553   sqlite3 *db,
   1554   void *pAux,
   1555   int argc, const char *const*argv,
   1556   sqlite3_vtab **ppVtab,
   1557   char **pzErr
   1558 ){
   1559   FNENTRY();
   1560   return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
   1561 }
   1562 
   1563 /* No indices supported. */
   1564 static int recoverBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
   1565   FNENTRY();
   1566   return SQLITE_OK;
   1567 }
   1568 
   1569 /* Logically, this should never be called. */
   1570 static int recoverDisconnect(sqlite3_vtab *pVtab){
   1571   FNENTRY();
   1572   recoverRelease((Recover*)pVtab);
   1573   return SQLITE_OK;
   1574 }
   1575 
   1576 static int recoverDestroy(sqlite3_vtab *pVtab){
   1577   FNENTRY();
   1578   recoverRelease((Recover*)pVtab);
   1579   return SQLITE_OK;
   1580 }
   1581 
   1582 typedef struct RecoverCursor RecoverCursor;
   1583 struct RecoverCursor {
   1584   sqlite3_vtab_cursor base;
   1585   RecoverLeafCursor *pLeafCursor;
   1586   int iEncoding;
   1587   int bEOF;
   1588 };
   1589 
   1590 static int recoverOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
   1591   Recover *pRecover = (Recover*)pVTab;
   1592   u32 iRootPage;                   /* Root page of the backing table. */
   1593   int iEncoding;                   /* UTF encoding for backing database. */
   1594   unsigned nPageSize;              /* Size of pages in backing database. */
   1595   Pager *pPager;                   /* Backing database pager. */
   1596   RecoverLeafCursor *pLeafCursor;  /* Cursor to read table's leaf pages. */
   1597   RecoverCursor *pCursor;          /* Cursor to read rows from leaves. */
   1598   int rc;
   1599 
   1600   FNENTRY();
   1601 
   1602   iRootPage = 0;
   1603   rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable,
   1604                    &iRootPage);
   1605   if( rc!=SQLITE_OK ){
   1606     return rc;
   1607   }
   1608 
   1609   iEncoding = 0;
   1610   rc = getEncoding(pRecover->db, pRecover->zDb, &iEncoding);
   1611   if( rc!=SQLITE_OK ){
   1612     return rc;
   1613   }
   1614 
   1615   rc = GetPager(pRecover->db, pRecover->zDb, &pPager, &nPageSize);
   1616   if( rc!=SQLITE_OK ){
   1617     return rc;
   1618   }
   1619 
   1620   rc = leafCursorCreate(pPager, nPageSize, iRootPage, &pLeafCursor);
   1621   if( rc!=SQLITE_OK ){
   1622     return rc;
   1623   }
   1624 
   1625   pCursor = sqlite3_malloc(sizeof(RecoverCursor));
   1626   if( !pCursor ){
   1627     leafCursorDestroy(pLeafCursor);
   1628     return SQLITE_NOMEM;
   1629   }
   1630   memset(pCursor, 0, sizeof(*pCursor));
   1631   pCursor->base.pVtab = pVTab;
   1632   pCursor->pLeafCursor = pLeafCursor;
   1633   pCursor->iEncoding = iEncoding;
   1634 
   1635   /* If no leaf pages were found, empty result set. */
   1636   /* TODO(shess): leafCursorNextValidCell() would return SQLITE_ROW or
   1637    * SQLITE_DONE to indicate whether there is further data to consider.
   1638    */
   1639   pCursor->bEOF = (pLeafCursor->pPage==NULL);
   1640 
   1641   *ppCursor = (sqlite3_vtab_cursor*)pCursor;
   1642   return SQLITE_OK;
   1643 }
   1644 
   1645 static int recoverClose(sqlite3_vtab_cursor *cur){
   1646   RecoverCursor *pCursor = (RecoverCursor*)cur;
   1647   FNENTRY();
   1648   if( pCursor->pLeafCursor ){
   1649     leafCursorDestroy(pCursor->pLeafCursor);
   1650     pCursor->pLeafCursor = NULL;
   1651   }
   1652   memset(pCursor, 0xA5, sizeof(*pCursor));
   1653   sqlite3_free(cur);
   1654   return SQLITE_OK;
   1655 }
   1656 
   1657 /* Helpful place to set a breakpoint. */
   1658 static int RecoverInvalidCell(){
   1659   return SQLITE_ERROR;
   1660 }
   1661 
   1662 /* Returns SQLITE_OK if the cell has an appropriate number of columns
   1663  * with the appropriate types of data.
   1664  */
   1665 static int recoverValidateLeafCell(Recover *pRecover, RecoverCursor *pCursor){
   1666   unsigned i;
   1667 
   1668   /* If the row's storage has too many columns, skip it. */
   1669   if( leafCursorCellColumns(pCursor->pLeafCursor)>pRecover->nCols ){
   1670     return RecoverInvalidCell();
   1671   }
   1672 
   1673   /* Skip rows with unexpected types. */
   1674   for( i=0; i<pRecover->nCols; ++i ){
   1675     u64 iType;  /* Storage type of column i. */
   1676     int rc;
   1677 
   1678     /* ROWID alias. */
   1679     if( (pRecover->pTypes[i]&MASK_ROWID) ){
   1680       continue;
   1681     }
   1682 
   1683     rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iType, NULL, NULL);
   1684     assert( rc==SQLITE_OK );
   1685     if( rc!=SQLITE_OK || !SerialTypeIsCompatible(iType, pRecover->pTypes[i]) ){
   1686       return RecoverInvalidCell();
   1687     }
   1688   }
   1689 
   1690   return SQLITE_OK;
   1691 }
   1692 
   1693 static int recoverNext(sqlite3_vtab_cursor *pVtabCursor){
   1694   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
   1695   Recover *pRecover = (Recover*)pCursor->base.pVtab;
   1696   int rc;
   1697 
   1698   FNENTRY();
   1699 
   1700   /* Scan forward to the next cell with valid storage, then check that
   1701    * the stored data matches the schema.
   1702    */
   1703   while( (rc = leafCursorNextValidCell(pCursor->pLeafCursor))==SQLITE_ROW ){
   1704     if( recoverValidateLeafCell(pRecover, pCursor)==SQLITE_OK ){
   1705       return SQLITE_OK;
   1706     }
   1707   }
   1708 
   1709   if( rc==SQLITE_DONE ){
   1710     pCursor->bEOF = 1;
   1711     return SQLITE_OK;
   1712   }
   1713 
   1714   assert( rc!=SQLITE_OK );
   1715   return rc;
   1716 }
   1717 
   1718 static int recoverFilter(
   1719   sqlite3_vtab_cursor *pVtabCursor,
   1720   int idxNum, const char *idxStr,
   1721   int argc, sqlite3_value **argv
   1722 ){
   1723   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
   1724   Recover *pRecover = (Recover*)pCursor->base.pVtab;
   1725   int rc;
   1726 
   1727   FNENTRY();
   1728 
   1729   /* There were no valid leaf pages in the table. */
   1730   if( pCursor->bEOF ){
   1731     return SQLITE_OK;
   1732   }
   1733 
   1734   /* Load the first cell, and iterate forward if it's not valid.  If no cells at
   1735    * all are valid, recoverNext() sets bEOF and returns appropriately.
   1736    */
   1737   rc = leafCursorCellDecode(pCursor->pLeafCursor);
   1738   if( rc!=SQLITE_OK || recoverValidateLeafCell(pRecover, pCursor)!=SQLITE_OK ){
   1739     return recoverNext(pVtabCursor);
   1740   }
   1741 
   1742   return SQLITE_OK;
   1743 }
   1744 
   1745 static int recoverEof(sqlite3_vtab_cursor *pVtabCursor){
   1746   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
   1747   FNENTRY();
   1748   return pCursor->bEOF;
   1749 }
   1750 
   1751 static int recoverColumn(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
   1752   RecoverCursor *pCursor = (RecoverCursor*)cur;
   1753   Recover *pRecover = (Recover*)pCursor->base.pVtab;
   1754   u64 iColType;             /* Storage type of column i. */
   1755   unsigned char *pColData;  /* Column i's data. */
   1756   int shouldFree;           /* Non-zero if pColData should be freed. */
   1757   int rc;
   1758 
   1759   FNENTRY();
   1760 
   1761   if( i>=pRecover->nCols ){
   1762     return SQLITE_ERROR;
   1763   }
   1764 
   1765   /* ROWID alias. */
   1766   if( (pRecover->pTypes[i]&MASK_ROWID) ){
   1767     sqlite3_result_int64(ctx, leafCursorCellRowid(pCursor->pLeafCursor));
   1768     return SQLITE_OK;
   1769   }
   1770 
   1771   pColData = NULL;
   1772   shouldFree = 0;
   1773   rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iColType,
   1774                              &pColData, &shouldFree);
   1775   if( rc!=SQLITE_OK ){
   1776     return rc;
   1777   }
   1778   /* recoverValidateLeafCell() should guarantee that this will never
   1779    * occur.
   1780    */
   1781   if( !SerialTypeIsCompatible(iColType, pRecover->pTypes[i]) ){
   1782     if( shouldFree ){
   1783       sqlite3_free(pColData);
   1784     }
   1785     return SQLITE_ERROR;
   1786   }
   1787 
   1788   switch( iColType ){
   1789     case 0 : sqlite3_result_null(ctx); break;
   1790     case 1 : sqlite3_result_int64(ctx, decodeSigned(pColData, 1)); break;
   1791     case 2 : sqlite3_result_int64(ctx, decodeSigned(pColData, 2)); break;
   1792     case 3 : sqlite3_result_int64(ctx, decodeSigned(pColData, 3)); break;
   1793     case 4 : sqlite3_result_int64(ctx, decodeSigned(pColData, 4)); break;
   1794     case 5 : sqlite3_result_int64(ctx, decodeSigned(pColData, 6)); break;
   1795     case 6 : sqlite3_result_int64(ctx, decodeSigned(pColData, 8)); break;
   1796     case 7 : sqlite3_result_double(ctx, decodeFloat64(pColData)); break;
   1797     case 8 : sqlite3_result_int(ctx, 0); break;
   1798     case 9 : sqlite3_result_int(ctx, 1); break;
   1799     case 10 : assert( iColType!=10 ); break;
   1800     case 11 : assert( iColType!=11 ); break;
   1801 
   1802     default : {
   1803       u32 l = SerialTypeLength(iColType);
   1804 
   1805       /* If pColData was already allocated, arrange to pass ownership. */
   1806       sqlite3_destructor_type pFn = SQLITE_TRANSIENT;
   1807       if( shouldFree ){
   1808         pFn = sqlite3_free;
   1809         shouldFree = 0;
   1810       }
   1811 
   1812       if( SerialTypeIsBlob(iColType) ){
   1813         sqlite3_result_blob(ctx, pColData, l, pFn);
   1814       }else{
   1815         if( pCursor->iEncoding==SQLITE_UTF16LE ){
   1816           sqlite3_result_text16le(ctx, (const void*)pColData, l, pFn);
   1817         }else if( pCursor->iEncoding==SQLITE_UTF16BE ){
   1818           sqlite3_result_text16be(ctx, (const void*)pColData, l, pFn);
   1819         }else{
   1820           sqlite3_result_text(ctx, (const char*)pColData, l, pFn);
   1821         }
   1822       }
   1823     } break;
   1824   }
   1825   if( shouldFree ){
   1826     sqlite3_free(pColData);
   1827   }
   1828   return SQLITE_OK;
   1829 }
   1830 
   1831 static int recoverRowid(sqlite3_vtab_cursor *pVtabCursor, sqlite_int64 *pRowid){
   1832   RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
   1833   FNENTRY();
   1834   *pRowid = leafCursorCellRowid(pCursor->pLeafCursor);
   1835   return SQLITE_OK;
   1836 }
   1837 
   1838 static sqlite3_module recoverModule = {
   1839   0,                         /* iVersion */
   1840   recoverCreate,             /* xCreate - create a table */
   1841   recoverConnect,            /* xConnect - connect to an existing table */
   1842   recoverBestIndex,          /* xBestIndex - Determine search strategy */
   1843   recoverDisconnect,         /* xDisconnect - Disconnect from a table */
   1844   recoverDestroy,            /* xDestroy - Drop a table */
   1845   recoverOpen,               /* xOpen - open a cursor */
   1846   recoverClose,              /* xClose - close a cursor */
   1847   recoverFilter,             /* xFilter - configure scan constraints */
   1848   recoverNext,               /* xNext - advance a cursor */
   1849   recoverEof,                /* xEof */
   1850   recoverColumn,             /* xColumn - read data */
   1851   recoverRowid,              /* xRowid - read data */
   1852   0,                         /* xUpdate - write data */
   1853   0,                         /* xBegin - begin transaction */
   1854   0,                         /* xSync - sync transaction */
   1855   0,                         /* xCommit - commit transaction */
   1856   0,                         /* xRollback - rollback transaction */
   1857   0,                         /* xFindFunction - function overloading */
   1858   0,                         /* xRename - rename the table */
   1859 };
   1860 
   1861 int recoverVtableInit(sqlite3 *db){
   1862   return sqlite3_create_module_v2(db, "recover", &recoverModule, NULL, 0);
   1863 }
   1864 
   1865 /* This section of code is for parsing the create input and
   1866  * initializing the module.
   1867  */
   1868 
   1869 /* Find the next word in zText and place the endpoints in pzWord*.
   1870  * Returns true if the word is non-empty.  "Word" is defined as
   1871  * ASCII alphanumeric plus '_' at this time.
   1872  */
   1873 static int findWord(const char *zText,
   1874                     const char **pzWordStart, const char **pzWordEnd){
   1875   int r;
   1876   while( ascii_isspace(*zText) ){
   1877     zText++;
   1878   }
   1879   *pzWordStart = zText;
   1880   while( ascii_isalnum(*zText) || *zText=='_' ){
   1881     zText++;
   1882   }
   1883   r = zText>*pzWordStart;  /* In case pzWordStart==pzWordEnd */
   1884   *pzWordEnd = zText;
   1885   return r;
   1886 }
   1887 
   1888 /* Return true if the next word in zText is zWord, also setting
   1889  * *pzContinue to the character after the word.
   1890  */
   1891 static int expectWord(const char *zText, const char *zWord,
   1892                       const char **pzContinue){
   1893   const char *zWordStart, *zWordEnd;
   1894   if( findWord(zText, &zWordStart, &zWordEnd) &&
   1895       ascii_strncasecmp(zWord, zWordStart, zWordEnd - zWordStart)==0 ){
   1896     *pzContinue = zWordEnd;
   1897     return 1;
   1898   }
   1899   return 0;
   1900 }
   1901 
   1902 /* Parse the name and type information out of parameter.  In case of
   1903  * success, *pzNameStart/End contain the name of the column,
   1904  * *pzTypeStart/End contain the top-level type, and *pTypeMask has the
   1905  * type mask to use for the column.
   1906  */
   1907 static int findNameAndType(const char *parameter,
   1908                            const char **pzNameStart, const char **pzNameEnd,
   1909                            const char **pzTypeStart, const char **pzTypeEnd,
   1910                            unsigned char *pTypeMask){
   1911   unsigned nNameLen;   /* Length of found name. */
   1912   const char *zEnd;    /* Current end of parsed column information. */
   1913   int bNotNull;        /* Non-zero if NULL is not allowed for name. */
   1914   int bStrict;         /* Non-zero if column requires exact type match. */
   1915   const char *zDummy;  /* Dummy parameter, result unused. */
   1916   unsigned i;
   1917 
   1918   /* strictMask is used for STRICT, strictMask|otherMask if STRICT is
   1919    * not supplied.  zReplace provides an alternate type to expose to
   1920    * the caller.
   1921    */
   1922   static struct {
   1923     const char *zName;
   1924     unsigned char strictMask;
   1925     unsigned char otherMask;
   1926     const char *zReplace;
   1927   } kTypeInfo[] = {
   1928     { "ANY",
   1929       MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL,
   1930       0, "",
   1931     },
   1932     { "ROWID",   MASK_INTEGER | MASK_ROWID,             0, "INTEGER", },
   1933     { "INTEGER", MASK_INTEGER | MASK_NULL,              0, NULL, },
   1934     { "FLOAT",   MASK_FLOAT | MASK_NULL,                MASK_INTEGER, NULL, },
   1935     { "NUMERIC", MASK_INTEGER | MASK_FLOAT | MASK_NULL, MASK_TEXT, NULL, },
   1936     { "TEXT",    MASK_TEXT | MASK_NULL,                 MASK_BLOB, NULL, },
   1937     { "BLOB",    MASK_BLOB | MASK_NULL,                 0, NULL, },
   1938   };
   1939 
   1940   if( !findWord(parameter, pzNameStart, pzNameEnd) ){
   1941     return SQLITE_MISUSE;
   1942   }
   1943 
   1944   /* Manifest typing, accept any storage type. */
   1945   if( !findWord(*pzNameEnd, pzTypeStart, pzTypeEnd) ){
   1946     *pzTypeEnd = *pzTypeStart = "";
   1947     *pTypeMask = MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL;
   1948     return SQLITE_OK;
   1949   }
   1950 
   1951   nNameLen = *pzTypeEnd - *pzTypeStart;
   1952   for( i=0; i<ArraySize(kTypeInfo); ++i ){
   1953     if( ascii_strncasecmp(kTypeInfo[i].zName, *pzTypeStart, nNameLen)==0 ){
   1954       break;
   1955     }
   1956   }
   1957   if( i==ArraySize(kTypeInfo) ){
   1958     return SQLITE_MISUSE;
   1959   }
   1960 
   1961   zEnd = *pzTypeEnd;
   1962   bStrict = 0;
   1963   if( expectWord(zEnd, "STRICT", &zEnd) ){
   1964     /* TODO(shess): Ick.  But I don't want another single-purpose
   1965      * flag, either.
   1966      */
   1967     if( kTypeInfo[i].zReplace && !kTypeInfo[i].zReplace[0] ){
   1968       return SQLITE_MISUSE;
   1969     }
   1970     bStrict = 1;
   1971   }
   1972 
   1973   bNotNull = 0;
   1974   if( expectWord(zEnd, "NOT", &zEnd) ){
   1975     if( expectWord(zEnd, "NULL", &zEnd) ){
   1976       bNotNull = 1;
   1977     }else{
   1978       /* Anything other than NULL after NOT is an error. */
   1979       return SQLITE_MISUSE;
   1980     }
   1981   }
   1982 
   1983   /* Anything else is an error. */
   1984   if( findWord(zEnd, &zDummy, &zDummy) ){
   1985     return SQLITE_MISUSE;
   1986   }
   1987 
   1988   *pTypeMask = kTypeInfo[i].strictMask;
   1989   if( !bStrict ){
   1990     *pTypeMask |= kTypeInfo[i].otherMask;
   1991   }
   1992   if( bNotNull ){
   1993     *pTypeMask &= ~MASK_NULL;
   1994   }
   1995   if( kTypeInfo[i].zReplace ){
   1996     *pzTypeStart = kTypeInfo[i].zReplace;
   1997     *pzTypeEnd = *pzTypeStart + strlen(*pzTypeStart);
   1998   }
   1999   return SQLITE_OK;
   2000 }
   2001 
   2002 /* Parse the arguments, placing type masks in *pTypes and the exposed
   2003  * schema in *pzCreateSql (for sqlite3_declare_vtab).
   2004  */
   2005 static int ParseColumnsAndGenerateCreate(unsigned nCols,
   2006                                          const char *const *pCols,
   2007                                          char **pzCreateSql,
   2008                                          unsigned char *pTypes,
   2009                                          char **pzErr){
   2010   unsigned i;
   2011   char *zCreateSql = sqlite3_mprintf("CREATE TABLE x(");
   2012   if( !zCreateSql ){
   2013     return SQLITE_NOMEM;
   2014   }
   2015 
   2016   for( i=0; i<nCols; i++ ){
   2017     const char *zSep = (i < nCols - 1 ? ", " : ")");
   2018     const char *zNotNull = "";
   2019     const char *zNameStart, *zNameEnd;
   2020     const char *zTypeStart, *zTypeEnd;
   2021     int rc = findNameAndType(pCols[i],
   2022                              &zNameStart, &zNameEnd,
   2023                              &zTypeStart, &zTypeEnd,
   2024                              &pTypes[i]);
   2025     if( rc!=SQLITE_OK ){
   2026       *pzErr = sqlite3_mprintf("unable to parse column %d", i);
   2027       sqlite3_free(zCreateSql);
   2028       return rc;
   2029     }
   2030 
   2031     if( !(pTypes[i]&MASK_NULL) ){
   2032       zNotNull = " NOT NULL";
   2033     }
   2034 
   2035     /* Add name and type to the create statement. */
   2036     zCreateSql = sqlite3_mprintf("%z%.*s %.*s%s%s",
   2037                                  zCreateSql,
   2038                                  zNameEnd - zNameStart, zNameStart,
   2039                                  zTypeEnd - zTypeStart, zTypeStart,
   2040                                  zNotNull, zSep);
   2041     if( !zCreateSql ){
   2042       return SQLITE_NOMEM;
   2043     }
   2044   }
   2045 
   2046   *pzCreateSql = zCreateSql;
   2047   return SQLITE_OK;
   2048 }
   2049 
   2050 /* Helper function for initializing the module. */
   2051 /* argv[0] module name
   2052  * argv[1] db name for virtual table
   2053  * argv[2] virtual table name
   2054  * argv[3] backing table name
   2055  * argv[4] columns
   2056  */
   2057 /* TODO(shess): Since connect isn't supported, could inline into
   2058  * recoverCreate().
   2059  */
   2060 /* TODO(shess): Explore cases where it would make sense to set *pzErr. */
   2061 static int recoverInit(
   2062   sqlite3 *db,                        /* Database connection */
   2063   void *pAux,                         /* unused */
   2064   int argc, const char *const*argv,   /* Parameters to CREATE TABLE statement */
   2065   sqlite3_vtab **ppVtab,              /* OUT: New virtual table */
   2066   char **pzErr                        /* OUT: Error message, if any */
   2067 ){
   2068   const unsigned kTypeCol = 4;  /* First argument with column type info. */
   2069   Recover *pRecover;            /* Virtual table structure being created. */
   2070   char *zDot;                   /* Any dot found in "db.table" backing. */
   2071   u32 iRootPage;                /* Root page of backing table. */
   2072   char *zCreateSql;             /* Schema of created virtual table. */
   2073   int rc;
   2074 
   2075   /* Require to be in the temp database. */
   2076   if( ascii_strcasecmp(argv[1], "temp")!=0 ){
   2077     *pzErr = sqlite3_mprintf("recover table must be in temp database");
   2078     return SQLITE_MISUSE;
   2079   }
   2080 
   2081   /* Need the backing table and at least one column. */
   2082   if( argc<=kTypeCol ){
   2083     *pzErr = sqlite3_mprintf("no columns specified");
   2084     return SQLITE_MISUSE;
   2085   }
   2086 
   2087   pRecover = sqlite3_malloc(sizeof(Recover));
   2088   if( !pRecover ){
   2089     return SQLITE_NOMEM;
   2090   }
   2091   memset(pRecover, 0, sizeof(*pRecover));
   2092   pRecover->base.pModule = &recoverModule;
   2093   pRecover->db = db;
   2094 
   2095   /* Parse out db.table, assuming main if no dot. */
   2096   zDot = strchr(argv[3], '.');
   2097   if( !zDot ){
   2098     pRecover->zDb = sqlite3_strdup(db->aDb[0].zName);
   2099     pRecover->zTable = sqlite3_strdup(argv[3]);
   2100   }else if( zDot>argv[3] && zDot[1]!='\0' ){
   2101     pRecover->zDb = sqlite3_strndup(argv[3], zDot - argv[3]);
   2102     pRecover->zTable = sqlite3_strdup(zDot + 1);
   2103   }else{
   2104     /* ".table" or "db." not allowed. */
   2105     *pzErr = sqlite3_mprintf("ill-formed table specifier");
   2106     recoverRelease(pRecover);
   2107     return SQLITE_ERROR;
   2108   }
   2109 
   2110   pRecover->nCols = argc - kTypeCol;
   2111   pRecover->pTypes = sqlite3_malloc(pRecover->nCols);
   2112   if( !pRecover->zDb || !pRecover->zTable || !pRecover->pTypes ){
   2113     recoverRelease(pRecover);
   2114     return SQLITE_NOMEM;
   2115   }
   2116 
   2117   /* Require the backing table to exist. */
   2118   /* TODO(shess): Be more pedantic about the form of the descriptor
   2119    * string.  This already fails for poorly-formed strings, simply
   2120    * because there won't be a root page, but it would make more sense
   2121    * to be explicit.
   2122    */
   2123   rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable, &iRootPage);
   2124   if( rc!=SQLITE_OK ){
   2125     *pzErr = sqlite3_mprintf("unable to find backing table");
   2126     recoverRelease(pRecover);
   2127     return rc;
   2128   }
   2129 
   2130   /* Parse the column definitions. */
   2131   rc = ParseColumnsAndGenerateCreate(pRecover->nCols, argv + kTypeCol,
   2132                                      &zCreateSql, pRecover->pTypes, pzErr);
   2133   if( rc!=SQLITE_OK ){
   2134     recoverRelease(pRecover);
   2135     return rc;
   2136   }
   2137 
   2138   rc = sqlite3_declare_vtab(db, zCreateSql);
   2139   sqlite3_free(zCreateSql);
   2140   if( rc!=SQLITE_OK ){
   2141     recoverRelease(pRecover);
   2142     return rc;
   2143   }
   2144 
   2145   *ppVtab = (sqlite3_vtab *)pRecover;
   2146   return SQLITE_OK;
   2147 }
   2148