Home | History | Annotate | Download | only in pppd
      1  /*
      2    Unix SMB/CIFS implementation.
      3 
      4    trivial database library
      5 
      6    Copyright (C) Andrew Tridgell              1999-2004
      7    Copyright (C) Paul `Rusty' Russell		   2000
      8    Copyright (C) Jeremy Allison			   2000-2003
      9 
     10      ** NOTE! The following LGPL license applies to the tdb
     11      ** library. This does NOT imply that all of Samba is released
     12      ** under the LGPL
     13 
     14    This library is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU Lesser General Public
     16    License as published by the Free Software Foundation; either
     17    version 2 of the License, or (at your option) any later version.
     18 
     19    This library is distributed in the hope that it will be useful,
     20    but WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    Lesser General Public License for more details.
     23 
     24    You should have received a copy of the GNU Lesser General Public
     25    License along with this library; if not, write to the Free Software
     26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     27 */
     28 
     29 
     30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
     31  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
     32  * think this is because valgrind doesn't understand that the mmap'd
     33  * area may be written to by other processes.  Memory can, from the
     34  * point of view of the grinded process, spontaneously become
     35  * initialized.
     36  *
     37  * I can think of a few solutions.  [mbp 20030311]
     38  *
     39  * 1 - Write suppressions for Valgrind so that it doesn't complain
     40  * about this.  Probably the most reasonable but people need to
     41  * remember to use them.
     42  *
     43  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
     44  *
     45  * 3 - Use the special valgrind macros to mark memory as valid at the
     46  * right time.  Probably too hard -- the process just doesn't know.
     47  */
     48 
     49 #include <stdlib.h>
     50 #include <stdio.h>
     51 #include <fcntl.h>
     52 #include <unistd.h>
     53 #include <string.h>
     54 #include <fcntl.h>
     55 #include <errno.h>
     56 #include <sys/mman.h>
     57 #include <sys/stat.h>
     58 #include <signal.h>
     59 #include "tdb.h"
     60 #include "spinlock.h"
     61 
     62 #define TDB_MAGIC_FOOD "TDB file\n"
     63 #define TDB_VERSION (0x26011967 + 6)
     64 #define TDB_MAGIC (0x26011999U)
     65 #define TDB_FREE_MAGIC (~TDB_MAGIC)
     66 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
     67 #define TDB_ALIGNMENT 4
     68 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
     69 #define DEFAULT_HASH_SIZE 131
     70 #define TDB_PAGE_SIZE 0x2000
     71 #define FREELIST_TOP (sizeof(struct tdb_header))
     72 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
     73 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
     74 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
     75 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
     76 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
     77 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
     78 
     79 
     80 /* NB assumes there is a local variable called "tdb" that is the
     81  * current context, also takes doubly-parenthesized print-style
     82  * argument. */
     83 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
     84 
     85 /* lock offsets */
     86 #define GLOBAL_LOCK 0
     87 #define ACTIVE_LOCK 4
     88 
     89 #ifndef MAP_FILE
     90 #define MAP_FILE 0
     91 #endif
     92 
     93 #ifndef MAP_FAILED
     94 #define MAP_FAILED ((void *)-1)
     95 #endif
     96 
     97 /* free memory if the pointer is valid and zero the pointer */
     98 #ifndef SAFE_FREE
     99 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
    100 #endif
    101 
    102 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
    103 TDB_DATA tdb_null;
    104 
    105 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
    106 static TDB_CONTEXT *tdbs = NULL;
    107 
    108 static int tdb_munmap(TDB_CONTEXT *tdb)
    109 {
    110 	if (tdb->flags & TDB_INTERNAL)
    111 		return 0;
    112 
    113 #ifdef HAVE_MMAP
    114 	if (tdb->map_ptr) {
    115 		int ret = munmap(tdb->map_ptr, tdb->map_size);
    116 		if (ret != 0)
    117 			return ret;
    118 	}
    119 #endif
    120 	tdb->map_ptr = NULL;
    121 	return 0;
    122 }
    123 
    124 static void tdb_mmap(TDB_CONTEXT *tdb)
    125 {
    126 	if (tdb->flags & TDB_INTERNAL)
    127 		return;
    128 
    129 #ifdef HAVE_MMAP
    130 	if (!(tdb->flags & TDB_NOMMAP)) {
    131 		tdb->map_ptr = mmap(NULL, tdb->map_size,
    132 				    PROT_READ|(tdb->read_only? 0:PROT_WRITE),
    133 				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
    134 
    135 		/*
    136 		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
    137 		 */
    138 
    139 		if (tdb->map_ptr == MAP_FAILED) {
    140 			tdb->map_ptr = NULL;
    141 			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
    142 				 tdb->map_size, strerror(errno)));
    143 		}
    144 	} else {
    145 		tdb->map_ptr = NULL;
    146 	}
    147 #else
    148 	tdb->map_ptr = NULL;
    149 #endif
    150 }
    151 
    152 /* Endian conversion: we only ever deal with 4 byte quantities */
    153 static void *convert(void *buf, u32 size)
    154 {
    155 	u32 i, *p = buf;
    156 	for (i = 0; i < size / 4; i++)
    157 		p[i] = TDB_BYTEREV(p[i]);
    158 	return buf;
    159 }
    160 #define DOCONV() (tdb->flags & TDB_CONVERT)
    161 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
    162 
    163 /* the body of the database is made of one list_struct for the free space
    164    plus a separate data list for each hash value */
    165 struct list_struct {
    166 	tdb_off next; /* offset of the next record in the list */
    167 	tdb_len rec_len; /* total byte length of record */
    168 	tdb_len key_len; /* byte length of key */
    169 	tdb_len data_len; /* byte length of data */
    170 	u32 full_hash; /* the full 32 bit hash of the key */
    171 	u32 magic;   /* try to catch errors */
    172 	/* the following union is implied:
    173 		union {
    174 			char record[rec_len];
    175 			struct {
    176 				char key[key_len];
    177 				char data[data_len];
    178 			}
    179 			u32 totalsize; (tailer)
    180 		}
    181 	*/
    182 };
    183 
    184 /***************************************************************
    185  Allow a caller to set a "alarm" flag that tdb can check to abort
    186  a blocking lock on SIGALRM.
    187 ***************************************************************/
    188 
    189 static sig_atomic_t *palarm_fired;
    190 
    191 void tdb_set_lock_alarm(sig_atomic_t *palarm)
    192 {
    193 	palarm_fired = palarm;
    194 }
    195 
    196 /* a byte range locking function - return 0 on success
    197    this functions locks/unlocks 1 byte at the specified offset.
    198 
    199    On error, errno is also set so that errors are passed back properly
    200    through tdb_open(). */
    201 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
    202 		      int rw_type, int lck_type, int probe)
    203 {
    204 	struct flock fl;
    205 	int ret;
    206 
    207 	if (tdb->flags & TDB_NOLOCK)
    208 		return 0;
    209 	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
    210 		errno = EACCES;
    211 		return -1;
    212 	}
    213 
    214 	fl.l_type = rw_type;
    215 	fl.l_whence = SEEK_SET;
    216 	fl.l_start = offset;
    217 	fl.l_len = 1;
    218 	fl.l_pid = 0;
    219 
    220 	do {
    221 		ret = fcntl(tdb->fd,lck_type,&fl);
    222 		if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
    223 			break;
    224 	} while (ret == -1 && errno == EINTR);
    225 
    226 	if (ret == -1) {
    227 		if (!probe && lck_type != F_SETLK) {
    228 			/* Ensure error code is set for log fun to examine. */
    229 			if (errno == EINTR && palarm_fired && *palarm_fired)
    230 				tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
    231 			else
    232 				tdb->ecode = TDB_ERR_LOCK;
    233 			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
    234 				 tdb->fd, offset, rw_type, lck_type));
    235 		}
    236 		/* Was it an alarm timeout ? */
    237 		if (errno == EINTR && palarm_fired && *palarm_fired) {
    238 			TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
    239 				 tdb->fd, offset, rw_type, lck_type));
    240 			return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
    241 		}
    242 		/* Otherwise - generic lock error. errno set by fcntl.
    243 		 * EAGAIN is an expected return from non-blocking
    244 		 * locks. */
    245 		if (errno != EAGAIN) {
    246 			TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
    247 				 tdb->fd, offset, rw_type, lck_type,
    248 				 strerror(errno)));
    249 		}
    250 		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
    251 	}
    252 	return 0;
    253 }
    254 
    255 /* lock a list in the database. list -1 is the alloc list */
    256 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
    257 {
    258 	if (list < -1 || list >= (int)tdb->header.hash_size) {
    259 		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
    260 			   list, ltype));
    261 		return -1;
    262 	}
    263 	if (tdb->flags & TDB_NOLOCK)
    264 		return 0;
    265 
    266 	/* Since fcntl locks don't nest, we do a lock for the first one,
    267 	   and simply bump the count for future ones */
    268 	if (tdb->locked[list+1].count == 0) {
    269 		if (!tdb->read_only && tdb->header.rwlocks) {
    270 			if (tdb_spinlock(tdb, list, ltype)) {
    271 				TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
    272 					   list, ltype));
    273 				return -1;
    274 			}
    275 		} else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
    276 			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
    277 					   list, ltype, strerror(errno)));
    278 			return -1;
    279 		}
    280 		tdb->locked[list+1].ltype = ltype;
    281 	}
    282 	tdb->locked[list+1].count++;
    283 	return 0;
    284 }
    285 
    286 /* unlock the database: returns void because it's too late for errors. */
    287 	/* changed to return int it may be interesting to know there
    288 	   has been an error  --simo */
    289 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
    290 {
    291 	int ret = -1;
    292 
    293 	if (tdb->flags & TDB_NOLOCK)
    294 		return 0;
    295 
    296 	/* Sanity checks */
    297 	if (list < -1 || list >= (int)tdb->header.hash_size) {
    298 		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
    299 		return ret;
    300 	}
    301 
    302 	if (tdb->locked[list+1].count==0) {
    303 		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
    304 		return ret;
    305 	}
    306 
    307 	if (tdb->locked[list+1].count == 1) {
    308 		/* Down to last nested lock: unlock underneath */
    309 		if (!tdb->read_only && tdb->header.rwlocks) {
    310 			ret = tdb_spinunlock(tdb, list, ltype);
    311 		} else {
    312 			ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
    313 		}
    314 	} else {
    315 		ret = 0;
    316 	}
    317 	tdb->locked[list+1].count--;
    318 
    319 	if (ret)
    320 		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
    321 	return ret;
    322 }
    323 
    324 /* check for an out of bounds access - if it is out of bounds then
    325    see if the database has been expanded by someone else and expand
    326    if necessary
    327    note that "len" is the minimum length needed for the db
    328 */
    329 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
    330 {
    331 	struct stat st;
    332 	if (len <= tdb->map_size)
    333 		return 0;
    334 	if (tdb->flags & TDB_INTERNAL) {
    335 		if (!probe) {
    336 			/* Ensure ecode is set for log fn. */
    337 			tdb->ecode = TDB_ERR_IO;
    338 			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
    339 				 (int)len, (int)tdb->map_size));
    340 		}
    341 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    342 	}
    343 
    344 	if (fstat(tdb->fd, &st) == -1)
    345 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    346 
    347 	if (st.st_size < (size_t)len) {
    348 		if (!probe) {
    349 			/* Ensure ecode is set for log fn. */
    350 			tdb->ecode = TDB_ERR_IO;
    351 			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
    352 				 (int)len, (int)st.st_size));
    353 		}
    354 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    355 	}
    356 
    357 	/* Unmap, update size, remap */
    358 	if (tdb_munmap(tdb) == -1)
    359 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    360 	tdb->map_size = st.st_size;
    361 	tdb_mmap(tdb);
    362 	return 0;
    363 }
    364 
    365 /* write a lump of data at a specified offset */
    366 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
    367 {
    368 	if (tdb_oob(tdb, off + len, 0) != 0)
    369 		return -1;
    370 
    371 	if (tdb->map_ptr)
    372 		memcpy(off + (char *)tdb->map_ptr, buf, len);
    373 #ifdef HAVE_PWRITE
    374 	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
    375 #else
    376 	else if (lseek(tdb->fd, off, SEEK_SET) != off
    377 		 || write(tdb->fd, buf, len) != (ssize_t)len) {
    378 #endif
    379 		/* Ensure ecode is set for log fn. */
    380 		tdb->ecode = TDB_ERR_IO;
    381 		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
    382 			   off, len, strerror(errno)));
    383 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    384 	}
    385 	return 0;
    386 }
    387 
    388 /* read a lump of data at a specified offset, maybe convert */
    389 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
    390 {
    391 	if (tdb_oob(tdb, off + len, 0) != 0)
    392 		return -1;
    393 
    394 	if (tdb->map_ptr)
    395 		memcpy(buf, off + (char *)tdb->map_ptr, len);
    396 #ifdef HAVE_PREAD
    397 	else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
    398 #else
    399 	else if (lseek(tdb->fd, off, SEEK_SET) != off
    400 		 || read(tdb->fd, buf, len) != (ssize_t)len) {
    401 #endif
    402 		/* Ensure ecode is set for log fn. */
    403 		tdb->ecode = TDB_ERR_IO;
    404 		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
    405 			   off, len, strerror(errno)));
    406 		return TDB_ERRCODE(TDB_ERR_IO, -1);
    407 	}
    408 	if (cv)
    409 		convert(buf, len);
    410 	return 0;
    411 }
    412 
    413 /* read a lump of data, allocating the space for it */
    414 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
    415 {
    416 	char *buf;
    417 
    418 	if (!(buf = malloc(len))) {
    419 		/* Ensure ecode is set for log fn. */
    420 		tdb->ecode = TDB_ERR_OOM;
    421 		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
    422 			   len, strerror(errno)));
    423 		return TDB_ERRCODE(TDB_ERR_OOM, buf);
    424 	}
    425 	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
    426 		SAFE_FREE(buf);
    427 		return NULL;
    428 	}
    429 	return buf;
    430 }
    431 
    432 /* read/write a tdb_off */
    433 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
    434 {
    435 	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
    436 }
    437 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
    438 {
    439 	tdb_off off = *d;
    440 	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
    441 }
    442 
    443 /* read/write a record */
    444 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
    445 {
    446 	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
    447 		return -1;
    448 	if (TDB_BAD_MAGIC(rec)) {
    449 		/* Ensure ecode is set for log fn. */
    450 		tdb->ecode = TDB_ERR_CORRUPT;
    451 		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
    452 		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
    453 	}
    454 	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
    455 }
    456 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
    457 {
    458 	struct list_struct r = *rec;
    459 	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
    460 }
    461 
    462 /* read a freelist record and check for simple errors */
    463 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
    464 {
    465 	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
    466 		return -1;
    467 
    468 	if (rec->magic == TDB_MAGIC) {
    469 		/* this happens when a app is showdown while deleting a record - we should
    470 		   not completely fail when this happens */
    471 		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
    472 			 rec->magic, off));
    473 		rec->magic = TDB_FREE_MAGIC;
    474 		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
    475 			return -1;
    476 	}
    477 
    478 	if (rec->magic != TDB_FREE_MAGIC) {
    479 		/* Ensure ecode is set for log fn. */
    480 		tdb->ecode = TDB_ERR_CORRUPT;
    481 		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
    482 			   rec->magic, off));
    483 		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
    484 	}
    485 	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
    486 		return -1;
    487 	return 0;
    488 }
    489 
    490 /* update a record tailer (must hold allocation lock) */
    491 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
    492 			 const struct list_struct *rec)
    493 {
    494 	tdb_off totalsize;
    495 
    496 	/* Offset of tailer from record header */
    497 	totalsize = sizeof(*rec) + rec->rec_len;
    498 	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
    499 			 &totalsize);
    500 }
    501 
    502 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
    503 {
    504 	struct list_struct rec;
    505 	tdb_off tailer_ofs, tailer;
    506 
    507 	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
    508 		printf("ERROR: failed to read record at %u\n", offset);
    509 		return 0;
    510 	}
    511 
    512 	printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
    513 	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
    514 
    515 	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
    516 	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
    517 		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
    518 		return rec.next;
    519 	}
    520 
    521 	if (tailer != rec.rec_len + sizeof(rec)) {
    522 		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
    523 				(unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
    524 	}
    525 	return rec.next;
    526 }
    527 
    528 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
    529 {
    530 	tdb_off rec_ptr, top;
    531 
    532 	top = TDB_HASH_TOP(i);
    533 
    534 	if (tdb_lock(tdb, i, F_WRLCK) != 0)
    535 		return -1;
    536 
    537 	if (ofs_read(tdb, top, &rec_ptr) == -1)
    538 		return tdb_unlock(tdb, i, F_WRLCK);
    539 
    540 	if (rec_ptr)
    541 		printf("hash=%d\n", i);
    542 
    543 	while (rec_ptr) {
    544 		rec_ptr = tdb_dump_record(tdb, rec_ptr);
    545 	}
    546 
    547 	return tdb_unlock(tdb, i, F_WRLCK);
    548 }
    549 
    550 void tdb_dump_all(TDB_CONTEXT *tdb)
    551 {
    552 	int i;
    553 	for (i=0;i<tdb->header.hash_size;i++) {
    554 		tdb_dump_chain(tdb, i);
    555 	}
    556 	printf("freelist:\n");
    557 	tdb_dump_chain(tdb, -1);
    558 }
    559 
    560 int tdb_printfreelist(TDB_CONTEXT *tdb)
    561 {
    562 	int ret;
    563 	long total_free = 0;
    564 	tdb_off offset, rec_ptr;
    565 	struct list_struct rec;
    566 
    567 	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
    568 		return ret;
    569 
    570 	offset = FREELIST_TOP;
    571 
    572 	/* read in the freelist top */
    573 	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
    574 		tdb_unlock(tdb, -1, F_WRLCK);
    575 		return 0;
    576 	}
    577 
    578 	printf("freelist top=[0x%08x]\n", rec_ptr );
    579 	while (rec_ptr) {
    580 		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
    581 			tdb_unlock(tdb, -1, F_WRLCK);
    582 			return -1;
    583 		}
    584 
    585 		if (rec.magic != TDB_FREE_MAGIC) {
    586 			printf("bad magic 0x%08x in free list\n", rec.magic);
    587 			tdb_unlock(tdb, -1, F_WRLCK);
    588 			return -1;
    589 		}
    590 
    591 		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
    592 		total_free += rec.rec_len;
    593 
    594 		/* move to the next record */
    595 		rec_ptr = rec.next;
    596 	}
    597 	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
    598                (int)total_free);
    599 
    600 	return tdb_unlock(tdb, -1, F_WRLCK);
    601 }
    602 
    603 /* Remove an element from the freelist.  Must have alloc lock. */
    604 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
    605 {
    606 	tdb_off last_ptr, i;
    607 
    608 	/* read in the freelist top */
    609 	last_ptr = FREELIST_TOP;
    610 	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
    611 		if (i == off) {
    612 			/* We've found it! */
    613 			return ofs_write(tdb, last_ptr, &next);
    614 		}
    615 		/* Follow chain (next offset is at start of record) */
    616 		last_ptr = i;
    617 	}
    618 	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
    619 	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
    620 }
    621 
    622 /* Add an element into the freelist. Merge adjacent records if
    623    neccessary. */
    624 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
    625 {
    626 	tdb_off right, left;
    627 
    628 	/* Allocation and tailer lock */
    629 	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
    630 		return -1;
    631 
    632 	/* set an initial tailer, so if we fail we don't leave a bogus record */
    633 	if (update_tailer(tdb, offset, rec) != 0) {
    634 		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
    635 		goto fail;
    636 	}
    637 
    638 	/* Look right first (I'm an Australian, dammit) */
    639 	right = offset + sizeof(*rec) + rec->rec_len;
    640 	if (right + sizeof(*rec) <= tdb->map_size) {
    641 		struct list_struct r;
    642 
    643 		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
    644 			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
    645 			goto left;
    646 		}
    647 
    648 		/* If it's free, expand to include it. */
    649 		if (r.magic == TDB_FREE_MAGIC) {
    650 			if (remove_from_freelist(tdb, right, r.next) == -1) {
    651 				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
    652 				goto left;
    653 			}
    654 			rec->rec_len += sizeof(r) + r.rec_len;
    655 		}
    656 	}
    657 
    658 left:
    659 	/* Look left */
    660 	left = offset - sizeof(tdb_off);
    661 	if (left > TDB_DATA_START(tdb->header.hash_size)) {
    662 		struct list_struct l;
    663 		tdb_off leftsize;
    664 
    665 		/* Read in tailer and jump back to header */
    666 		if (ofs_read(tdb, left, &leftsize) == -1) {
    667 			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
    668 			goto update;
    669 		}
    670 		left = offset - leftsize;
    671 
    672 		/* Now read in record */
    673 		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
    674 			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
    675 			goto update;
    676 		}
    677 
    678 		/* If it's free, expand to include it. */
    679 		if (l.magic == TDB_FREE_MAGIC) {
    680 			if (remove_from_freelist(tdb, left, l.next) == -1) {
    681 				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
    682 				goto update;
    683 			} else {
    684 				offset = left;
    685 				rec->rec_len += leftsize;
    686 			}
    687 		}
    688 	}
    689 
    690 update:
    691 	if (update_tailer(tdb, offset, rec) == -1) {
    692 		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
    693 		goto fail;
    694 	}
    695 
    696 	/* Now, prepend to free list */
    697 	rec->magic = TDB_FREE_MAGIC;
    698 
    699 	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
    700 	    rec_write(tdb, offset, rec) == -1 ||
    701 	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
    702 		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
    703 		goto fail;
    704 	}
    705 
    706 	/* And we're done. */
    707 	tdb_unlock(tdb, -1, F_WRLCK);
    708 	return 0;
    709 
    710  fail:
    711 	tdb_unlock(tdb, -1, F_WRLCK);
    712 	return -1;
    713 }
    714 
    715 
    716 /* expand a file.  we prefer to use ftruncate, as that is what posix
    717   says to use for mmap expansion */
    718 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
    719 {
    720 	char buf[1024];
    721 #if HAVE_FTRUNCATE_EXTEND
    722 	if (ftruncate(tdb->fd, size+addition) != 0) {
    723 		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
    724 			   size+addition, strerror(errno)));
    725 		return -1;
    726 	}
    727 #else
    728 	char b = 0;
    729 
    730 #ifdef HAVE_PWRITE
    731 	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
    732 #else
    733 	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
    734 	    write(tdb->fd, &b, 1) != 1) {
    735 #endif
    736 		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
    737 			   size+addition, strerror(errno)));
    738 		return -1;
    739 	}
    740 #endif
    741 
    742 	/* now fill the file with something. This ensures that the file isn't sparse, which would be
    743 	   very bad if we ran out of disk. This must be done with write, not via mmap */
    744 	memset(buf, 0x42, sizeof(buf));
    745 	while (addition) {
    746 		int n = addition>sizeof(buf)?sizeof(buf):addition;
    747 #ifdef HAVE_PWRITE
    748 		int ret = pwrite(tdb->fd, buf, n, size);
    749 #else
    750 		int ret;
    751 		if (lseek(tdb->fd, size, SEEK_SET) != size)
    752 			return -1;
    753 		ret = write(tdb->fd, buf, n);
    754 #endif
    755 		if (ret != n) {
    756 			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
    757 				   n, strerror(errno)));
    758 			return -1;
    759 		}
    760 		addition -= n;
    761 		size += n;
    762 	}
    763 	return 0;
    764 }
    765 
    766 
    767 /* expand the database at least size bytes by expanding the underlying
    768    file and doing the mmap again if necessary */
    769 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
    770 {
    771 	struct list_struct rec;
    772 	tdb_off offset;
    773 
    774 	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
    775 		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
    776 		return -1;
    777 	}
    778 
    779 	/* must know about any previous expansions by another process */
    780 	tdb_oob(tdb, tdb->map_size + 1, 1);
    781 
    782 	/* always make room for at least 10 more records, and round
    783            the database up to a multiple of TDB_PAGE_SIZE */
    784 	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
    785 
    786 	if (!(tdb->flags & TDB_INTERNAL))
    787 		tdb_munmap(tdb);
    788 
    789 	/*
    790 	 * We must ensure the file is unmapped before doing this
    791 	 * to ensure consistency with systems like OpenBSD where
    792 	 * writes and mmaps are not consistent.
    793 	 */
    794 
    795 	/* expand the file itself */
    796 	if (!(tdb->flags & TDB_INTERNAL)) {
    797 		if (expand_file(tdb, tdb->map_size, size) != 0)
    798 			goto fail;
    799 	}
    800 
    801 	tdb->map_size += size;
    802 
    803 	if (tdb->flags & TDB_INTERNAL)
    804 		tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
    805 	else {
    806 		/*
    807 		 * We must ensure the file is remapped before adding the space
    808 		 * to ensure consistency with systems like OpenBSD where
    809 		 * writes and mmaps are not consistent.
    810 		 */
    811 
    812 		/* We're ok if the mmap fails as we'll fallback to read/write */
    813 		tdb_mmap(tdb);
    814 	}
    815 
    816 	/* form a new freelist record */
    817 	memset(&rec,'\0',sizeof(rec));
    818 	rec.rec_len = size - sizeof(rec);
    819 
    820 	/* link it into the free list */
    821 	offset = tdb->map_size - size;
    822 	if (tdb_free(tdb, offset, &rec) == -1)
    823 		goto fail;
    824 
    825 	tdb_unlock(tdb, -1, F_WRLCK);
    826 	return 0;
    827  fail:
    828 	tdb_unlock(tdb, -1, F_WRLCK);
    829 	return -1;
    830 }
    831 
    832 /* allocate some space from the free list. The offset returned points
    833    to a unconnected list_struct within the database with room for at
    834    least length bytes of total data
    835 
    836    0 is returned if the space could not be allocated
    837  */
    838 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
    839 			    struct list_struct *rec)
    840 {
    841 	tdb_off rec_ptr, last_ptr, newrec_ptr;
    842 	struct list_struct newrec;
    843 
    844 	memset(&newrec, '\0', sizeof(newrec));
    845 
    846 	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
    847 		return 0;
    848 
    849 	/* Extra bytes required for tailer */
    850 	length += sizeof(tdb_off);
    851 
    852  again:
    853 	last_ptr = FREELIST_TOP;
    854 
    855 	/* read in the freelist top */
    856 	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
    857 		goto fail;
    858 
    859 	/* keep looking until we find a freelist record big enough */
    860 	while (rec_ptr) {
    861 		if (rec_free_read(tdb, rec_ptr, rec) == -1)
    862 			goto fail;
    863 
    864 		if (rec->rec_len >= length) {
    865 			/* found it - now possibly split it up  */
    866 			if (rec->rec_len > length + MIN_REC_SIZE) {
    867 				/* Length of left piece */
    868 				length = TDB_ALIGN(length, TDB_ALIGNMENT);
    869 
    870 				/* Right piece to go on free list */
    871 				newrec.rec_len = rec->rec_len
    872 					- (sizeof(*rec) + length);
    873 				newrec_ptr = rec_ptr + sizeof(*rec) + length;
    874 
    875 				/* And left record is shortened */
    876 				rec->rec_len = length;
    877 			} else
    878 				newrec_ptr = 0;
    879 
    880 			/* Remove allocated record from the free list */
    881 			if (ofs_write(tdb, last_ptr, &rec->next) == -1)
    882 				goto fail;
    883 
    884 			/* Update header: do this before we drop alloc
    885                            lock, otherwise tdb_free() might try to
    886                            merge with us, thinking we're free.
    887                            (Thanks Jeremy Allison). */
    888 			rec->magic = TDB_MAGIC;
    889 			if (rec_write(tdb, rec_ptr, rec) == -1)
    890 				goto fail;
    891 
    892 			/* Did we create new block? */
    893 			if (newrec_ptr) {
    894 				/* Update allocated record tailer (we
    895                                    shortened it). */
    896 				if (update_tailer(tdb, rec_ptr, rec) == -1)
    897 					goto fail;
    898 
    899 				/* Free new record */
    900 				if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
    901 					goto fail;
    902 			}
    903 
    904 			/* all done - return the new record offset */
    905 			tdb_unlock(tdb, -1, F_WRLCK);
    906 			return rec_ptr;
    907 		}
    908 		/* move to the next record */
    909 		last_ptr = rec_ptr;
    910 		rec_ptr = rec->next;
    911 	}
    912 	/* we didn't find enough space. See if we can expand the
    913 	   database and if we can then try again */
    914 	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
    915 		goto again;
    916  fail:
    917 	tdb_unlock(tdb, -1, F_WRLCK);
    918 	return 0;
    919 }
    920 
    921 /* initialise a new database with a specified hash size */
    922 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
    923 {
    924 	struct tdb_header *newdb;
    925 	int size, ret = -1;
    926 
    927 	/* We make it up in memory, then write it out if not internal */
    928 	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
    929 	if (!(newdb = calloc(size, 1)))
    930 		return TDB_ERRCODE(TDB_ERR_OOM, -1);
    931 
    932 	/* Fill in the header */
    933 	newdb->version = TDB_VERSION;
    934 	newdb->hash_size = hash_size;
    935 	if (tdb->flags & TDB_INTERNAL) {
    936 		tdb->map_size = size;
    937 		tdb->map_ptr = (char *)newdb;
    938 		memcpy(&tdb->header, newdb, sizeof(tdb->header));
    939 		/* Convert the `ondisk' version if asked. */
    940 		CONVERT(*newdb);
    941 		return 0;
    942 	}
    943 	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
    944 		goto fail;
    945 
    946 	if (ftruncate(tdb->fd, 0) == -1)
    947 		goto fail;
    948 
    949 	/* This creates an endian-converted header, as if read from disk */
    950 	CONVERT(*newdb);
    951 	memcpy(&tdb->header, newdb, sizeof(tdb->header));
    952 	/* Don't endian-convert the magic food! */
    953 	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
    954 	if (write(tdb->fd, newdb, size) != size)
    955 		ret = -1;
    956 	else
    957 		ret = tdb_create_rwlocks(tdb->fd, hash_size);
    958 
    959   fail:
    960 	SAFE_FREE(newdb);
    961 	return ret;
    962 }
    963 
    964 /* Returns 0 on fail.  On success, return offset of record, and fills
    965    in rec */
    966 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
    967 			struct list_struct *r)
    968 {
    969 	tdb_off rec_ptr;
    970 
    971 	/* read in the hash top */
    972 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
    973 		return 0;
    974 
    975 	/* keep looking until we find the right record */
    976 	while (rec_ptr) {
    977 		if (rec_read(tdb, rec_ptr, r) == -1)
    978 			return 0;
    979 
    980 		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
    981 			char *k;
    982 			/* a very likely hit - read the key */
    983 			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
    984 					   r->key_len);
    985 			if (!k)
    986 				return 0;
    987 
    988 			if (memcmp(key.dptr, k, key.dsize) == 0) {
    989 				SAFE_FREE(k);
    990 				return rec_ptr;
    991 			}
    992 			SAFE_FREE(k);
    993 		}
    994 		rec_ptr = r->next;
    995 	}
    996 	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
    997 }
    998 
    999 /* As tdb_find, but if you succeed, keep the lock */
   1000 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
   1001 			     struct list_struct *rec)
   1002 {
   1003 	u32 rec_ptr;
   1004 
   1005 	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
   1006 		return 0;
   1007 	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
   1008 		tdb_unlock(tdb, BUCKET(hash), locktype);
   1009 	return rec_ptr;
   1010 }
   1011 
   1012 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
   1013 {
   1014 	return tdb->ecode;
   1015 }
   1016 
   1017 static struct tdb_errname {
   1018 	enum TDB_ERROR ecode; const char *estring;
   1019 } emap[] = { {TDB_SUCCESS, "Success"},
   1020 	     {TDB_ERR_CORRUPT, "Corrupt database"},
   1021 	     {TDB_ERR_IO, "IO Error"},
   1022 	     {TDB_ERR_LOCK, "Locking error"},
   1023 	     {TDB_ERR_OOM, "Out of memory"},
   1024 	     {TDB_ERR_EXISTS, "Record exists"},
   1025 	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
   1026 	     {TDB_ERR_NOEXIST, "Record does not exist"} };
   1027 
   1028 /* Error string for the last tdb error */
   1029 const char *tdb_errorstr(TDB_CONTEXT *tdb)
   1030 {
   1031 	u32 i;
   1032 	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
   1033 		if (tdb->ecode == emap[i].ecode)
   1034 			return emap[i].estring;
   1035 	return "Invalid error code";
   1036 }
   1037 
   1038 /* update an entry in place - this only works if the new data size
   1039    is <= the old data size and the key exists.
   1040    on failure return -1.
   1041 */
   1042 
   1043 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
   1044 {
   1045 	struct list_struct rec;
   1046 	tdb_off rec_ptr;
   1047 
   1048 	/* find entry */
   1049 	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
   1050 		return -1;
   1051 
   1052 	/* must be long enough key, data and tailer */
   1053 	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
   1054 		tdb->ecode = TDB_SUCCESS; /* Not really an error */
   1055 		return -1;
   1056 	}
   1057 
   1058 	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
   1059 		      dbuf.dptr, dbuf.dsize) == -1)
   1060 		return -1;
   1061 
   1062 	if (dbuf.dsize != rec.data_len) {
   1063 		/* update size */
   1064 		rec.data_len = dbuf.dsize;
   1065 		return rec_write(tdb, rec_ptr, &rec);
   1066 	}
   1067 
   1068 	return 0;
   1069 }
   1070 
   1071 /* find an entry in the database given a key */
   1072 /* If an entry doesn't exist tdb_err will be set to
   1073  * TDB_ERR_NOEXIST. If a key has no data attached
   1074  * tdb_err will not be set. Both will return a
   1075  * zero pptr and zero dsize.
   1076  */
   1077 
   1078 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
   1079 {
   1080 	tdb_off rec_ptr;
   1081 	struct list_struct rec;
   1082 	TDB_DATA ret;
   1083 	u32 hash;
   1084 
   1085 	/* find which hash bucket it is in */
   1086 	hash = tdb->hash_fn(&key);
   1087 	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
   1088 		return tdb_null;
   1089 
   1090 	if (rec.data_len)
   1091 		ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
   1092 					  rec.data_len);
   1093 	else
   1094 		ret.dptr = NULL;
   1095 	ret.dsize = rec.data_len;
   1096 	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
   1097 	return ret;
   1098 }
   1099 
   1100 /* check if an entry in the database exists
   1101 
   1102    note that 1 is returned if the key is found and 0 is returned if not found
   1103    this doesn't match the conventions in the rest of this module, but is
   1104    compatible with gdbm
   1105 */
   1106 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
   1107 {
   1108 	struct list_struct rec;
   1109 
   1110 	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
   1111 		return 0;
   1112 	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
   1113 	return 1;
   1114 }
   1115 
   1116 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
   1117 {
   1118 	u32 hash = tdb->hash_fn(&key);
   1119 	return tdb_exists_hash(tdb, key, hash);
   1120 }
   1121 
   1122 /* record lock stops delete underneath */
   1123 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
   1124 {
   1125 	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
   1126 }
   1127 /*
   1128   Write locks override our own fcntl readlocks, so check it here.
   1129   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
   1130   an error to fail to get the lock here.
   1131 */
   1132 
   1133 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
   1134 {
   1135 	struct tdb_traverse_lock *i;
   1136 	for (i = &tdb->travlocks; i; i = i->next)
   1137 		if (i->off == off)
   1138 			return -1;
   1139 	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
   1140 }
   1141 
   1142 /*
   1143   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
   1144   an error to fail to get the lock here.
   1145 */
   1146 
   1147 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
   1148 {
   1149 	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
   1150 }
   1151 /* fcntl locks don't stack: avoid unlocking someone else's */
   1152 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
   1153 {
   1154 	struct tdb_traverse_lock *i;
   1155 	u32 count = 0;
   1156 
   1157 	if (off == 0)
   1158 		return 0;
   1159 	for (i = &tdb->travlocks; i; i = i->next)
   1160 		if (i->off == off)
   1161 			count++;
   1162 	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
   1163 }
   1164 
   1165 /* actually delete an entry in the database given the offset */
   1166 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
   1167 {
   1168 	tdb_off last_ptr, i;
   1169 	struct list_struct lastrec;
   1170 
   1171 	if (tdb->read_only) return -1;
   1172 
   1173 	if (write_lock_record(tdb, rec_ptr) == -1) {
   1174 		/* Someone traversing here: mark it as dead */
   1175 		rec->magic = TDB_DEAD_MAGIC;
   1176 		return rec_write(tdb, rec_ptr, rec);
   1177 	}
   1178 	if (write_unlock_record(tdb, rec_ptr) != 0)
   1179 		return -1;
   1180 
   1181 	/* find previous record in hash chain */
   1182 	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
   1183 		return -1;
   1184 	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
   1185 		if (rec_read(tdb, i, &lastrec) == -1)
   1186 			return -1;
   1187 
   1188 	/* unlink it: next ptr is at start of record. */
   1189 	if (last_ptr == 0)
   1190 		last_ptr = TDB_HASH_TOP(rec->full_hash);
   1191 	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
   1192 		return -1;
   1193 
   1194 	/* recover the space */
   1195 	if (tdb_free(tdb, rec_ptr, rec) == -1)
   1196 		return -1;
   1197 	return 0;
   1198 }
   1199 
   1200 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
   1201 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
   1202 			 struct list_struct *rec)
   1203 {
   1204 	int want_next = (tlock->off != 0);
   1205 
   1206 	/* Lock each chain from the start one. */
   1207 	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
   1208 		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
   1209 			return -1;
   1210 
   1211 		/* No previous record?  Start at top of chain. */
   1212 		if (!tlock->off) {
   1213 			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
   1214 				     &tlock->off) == -1)
   1215 				goto fail;
   1216 		} else {
   1217 			/* Otherwise unlock the previous record. */
   1218 			if (unlock_record(tdb, tlock->off) != 0)
   1219 				goto fail;
   1220 		}
   1221 
   1222 		if (want_next) {
   1223 			/* We have offset of old record: grab next */
   1224 			if (rec_read(tdb, tlock->off, rec) == -1)
   1225 				goto fail;
   1226 			tlock->off = rec->next;
   1227 		}
   1228 
   1229 		/* Iterate through chain */
   1230 		while( tlock->off) {
   1231 			tdb_off current;
   1232 			if (rec_read(tdb, tlock->off, rec) == -1)
   1233 				goto fail;
   1234 			if (!TDB_DEAD(rec)) {
   1235 				/* Woohoo: we found one! */
   1236 				if (lock_record(tdb, tlock->off) != 0)
   1237 					goto fail;
   1238 				return tlock->off;
   1239 			}
   1240 			/* Try to clean dead ones from old traverses */
   1241 			current = tlock->off;
   1242 			tlock->off = rec->next;
   1243 			if (!tdb->read_only &&
   1244 			    do_delete(tdb, current, rec) != 0)
   1245 				goto fail;
   1246 		}
   1247 		tdb_unlock(tdb, tlock->hash, F_WRLCK);
   1248 		want_next = 0;
   1249 	}
   1250 	/* We finished iteration without finding anything */
   1251 	return TDB_ERRCODE(TDB_SUCCESS, 0);
   1252 
   1253  fail:
   1254 	tlock->off = 0;
   1255 	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
   1256 		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
   1257 	return -1;
   1258 }
   1259 
   1260 /* traverse the entire database - calling fn(tdb, key, data) on each element.
   1261    return -1 on error or the record count traversed
   1262    if fn is NULL then it is not called
   1263    a non-zero return value from fn() indicates that the traversal should stop
   1264   */
   1265 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
   1266 {
   1267 	TDB_DATA key, dbuf;
   1268 	struct list_struct rec;
   1269 	struct tdb_traverse_lock tl = { NULL, 0, 0 };
   1270 	int ret, count = 0;
   1271 
   1272 	/* This was in the initializaton, above, but the IRIX compiler
   1273 	 * did not like it.  crh
   1274 	 */
   1275 	tl.next = tdb->travlocks.next;
   1276 
   1277 	/* fcntl locks don't stack: beware traverse inside traverse */
   1278 	tdb->travlocks.next = &tl;
   1279 
   1280 	/* tdb_next_lock places locks on the record returned, and its chain */
   1281 	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
   1282 		count++;
   1283 		/* now read the full record */
   1284 		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
   1285 					  rec.key_len + rec.data_len);
   1286 		if (!key.dptr) {
   1287 			ret = -1;
   1288 			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
   1289 				goto out;
   1290 			if (unlock_record(tdb, tl.off) != 0)
   1291 				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
   1292 			goto out;
   1293 		}
   1294 		key.dsize = rec.key_len;
   1295 		dbuf.dptr = key.dptr + rec.key_len;
   1296 		dbuf.dsize = rec.data_len;
   1297 
   1298 		/* Drop chain lock, call out */
   1299 		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
   1300 			ret = -1;
   1301 			goto out;
   1302 		}
   1303 		if (fn && fn(tdb, key, dbuf, private)) {
   1304 			/* They want us to terminate traversal */
   1305 			ret = count;
   1306 			if (unlock_record(tdb, tl.off) != 0) {
   1307 				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
   1308 				ret = -1;
   1309 			}
   1310 			tdb->travlocks.next = tl.next;
   1311 			SAFE_FREE(key.dptr);
   1312 			return count;
   1313 		}
   1314 		SAFE_FREE(key.dptr);
   1315 	}
   1316 out:
   1317 	tdb->travlocks.next = tl.next;
   1318 	if (ret < 0)
   1319 		return -1;
   1320 	else
   1321 		return count;
   1322 }
   1323 
   1324 /* find the first entry in the database and return its key */
   1325 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
   1326 {
   1327 	TDB_DATA key;
   1328 	struct list_struct rec;
   1329 
   1330 	/* release any old lock */
   1331 	if (unlock_record(tdb, tdb->travlocks.off) != 0)
   1332 		return tdb_null;
   1333 	tdb->travlocks.off = tdb->travlocks.hash = 0;
   1334 
   1335 	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
   1336 		return tdb_null;
   1337 	/* now read the key */
   1338 	key.dsize = rec.key_len;
   1339 	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
   1340 	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
   1341 		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
   1342 	return key;
   1343 }
   1344 
   1345 /* find the next entry in the database, returning its key */
   1346 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
   1347 {
   1348 	u32 oldhash;
   1349 	TDB_DATA key = tdb_null;
   1350 	struct list_struct rec;
   1351 	char *k = NULL;
   1352 
   1353 	/* Is locked key the old key?  If so, traverse will be reliable. */
   1354 	if (tdb->travlocks.off) {
   1355 		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
   1356 			return tdb_null;
   1357 		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
   1358 		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
   1359 					    rec.key_len))
   1360 		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
   1361 			/* No, it wasn't: unlock it and start from scratch */
   1362 			if (unlock_record(tdb, tdb->travlocks.off) != 0)
   1363 				return tdb_null;
   1364 			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
   1365 				return tdb_null;
   1366 			tdb->travlocks.off = 0;
   1367 		}
   1368 
   1369 		SAFE_FREE(k);
   1370 	}
   1371 
   1372 	if (!tdb->travlocks.off) {
   1373 		/* No previous element: do normal find, and lock record */
   1374 		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
   1375 		if (!tdb->travlocks.off)
   1376 			return tdb_null;
   1377 		tdb->travlocks.hash = BUCKET(rec.full_hash);
   1378 		if (lock_record(tdb, tdb->travlocks.off) != 0) {
   1379 			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
   1380 			return tdb_null;
   1381 		}
   1382 	}
   1383 	oldhash = tdb->travlocks.hash;
   1384 
   1385 	/* Grab next record: locks chain and returned record,
   1386 	   unlocks old record */
   1387 	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
   1388 		key.dsize = rec.key_len;
   1389 		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
   1390 					  key.dsize);
   1391 		/* Unlock the chain of this new record */
   1392 		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
   1393 			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
   1394 	}
   1395 	/* Unlock the chain of old record */
   1396 	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
   1397 		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
   1398 	return key;
   1399 }
   1400 
   1401 /* delete an entry in the database given a key */
   1402 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
   1403 {
   1404 	tdb_off rec_ptr;
   1405 	struct list_struct rec;
   1406 	int ret;
   1407 
   1408 	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
   1409 		return -1;
   1410 	ret = do_delete(tdb, rec_ptr, &rec);
   1411 	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
   1412 		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
   1413 	return ret;
   1414 }
   1415 
   1416 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
   1417 {
   1418 	u32 hash = tdb->hash_fn(&key);
   1419 	return tdb_delete_hash(tdb, key, hash);
   1420 }
   1421 
   1422 /* store an element in the database, replacing any existing element
   1423    with the same key
   1424 
   1425    return 0 on success, -1 on failure
   1426 */
   1427 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
   1428 {
   1429 	struct list_struct rec;
   1430 	u32 hash;
   1431 	tdb_off rec_ptr;
   1432 	char *p = NULL;
   1433 	int ret = 0;
   1434 
   1435 	/* find which hash bucket it is in */
   1436 	hash = tdb->hash_fn(&key);
   1437 	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
   1438 		return -1;
   1439 
   1440 	/* check for it existing, on insert. */
   1441 	if (flag == TDB_INSERT) {
   1442 		if (tdb_exists_hash(tdb, key, hash)) {
   1443 			tdb->ecode = TDB_ERR_EXISTS;
   1444 			goto fail;
   1445 		}
   1446 	} else {
   1447 		/* first try in-place update, on modify or replace. */
   1448 		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
   1449 			goto out;
   1450 		if (tdb->ecode == TDB_ERR_NOEXIST &&
   1451 		    flag == TDB_MODIFY) {
   1452 			/* if the record doesn't exist and we are in TDB_MODIFY mode then
   1453 			 we should fail the store */
   1454 			goto fail;
   1455 	}
   1456 	}
   1457 	/* reset the error code potentially set by the tdb_update() */
   1458 	tdb->ecode = TDB_SUCCESS;
   1459 
   1460 	/* delete any existing record - if it doesn't exist we don't
   1461            care.  Doing this first reduces fragmentation, and avoids
   1462            coalescing with `allocated' block before it's updated. */
   1463 	if (flag != TDB_INSERT)
   1464 		tdb_delete_hash(tdb, key, hash);
   1465 
   1466 	/* Copy key+value *before* allocating free space in case malloc
   1467 	   fails and we are left with a dead spot in the tdb. */
   1468 
   1469 	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
   1470 		tdb->ecode = TDB_ERR_OOM;
   1471 		goto fail;
   1472 	}
   1473 
   1474 	memcpy(p, key.dptr, key.dsize);
   1475 	if (dbuf.dsize)
   1476 		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
   1477 
   1478 	/* we have to allocate some space */
   1479 	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
   1480 		goto fail;
   1481 
   1482 	/* Read hash top into next ptr */
   1483 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
   1484 		goto fail;
   1485 
   1486 	rec.key_len = key.dsize;
   1487 	rec.data_len = dbuf.dsize;
   1488 	rec.full_hash = hash;
   1489 	rec.magic = TDB_MAGIC;
   1490 
   1491 	/* write out and point the top of the hash chain at it */
   1492 	if (rec_write(tdb, rec_ptr, &rec) == -1
   1493 	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
   1494 	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
   1495 		/* Need to tdb_unallocate() here */
   1496 		goto fail;
   1497 	}
   1498  out:
   1499 	SAFE_FREE(p);
   1500 	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
   1501 	return ret;
   1502 fail:
   1503 	ret = -1;
   1504 	goto out;
   1505 }
   1506 
   1507 /* Attempt to append data to an entry in place - this only works if the new data size
   1508    is <= the old data size and the key exists.
   1509    on failure return -1. Record must be locked before calling.
   1510 */
   1511 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
   1512 {
   1513 	struct list_struct rec;
   1514 	tdb_off rec_ptr;
   1515 
   1516 	/* find entry */
   1517 	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
   1518 		return -1;
   1519 
   1520 	/* Append of 0 is always ok. */
   1521 	if (new_dbuf.dsize == 0)
   1522 		return 0;
   1523 
   1524 	/* must be long enough for key, old data + new data and tailer */
   1525 	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
   1526 		/* No room. */
   1527 		tdb->ecode = TDB_SUCCESS; /* Not really an error */
   1528 		return -1;
   1529 	}
   1530 
   1531 	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
   1532 		      new_dbuf.dptr, new_dbuf.dsize) == -1)
   1533 		return -1;
   1534 
   1535 	/* update size */
   1536 	rec.data_len += new_dbuf.dsize;
   1537 	return rec_write(tdb, rec_ptr, &rec);
   1538 }
   1539 
   1540 /* Append to an entry. Create if not exist. */
   1541 
   1542 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
   1543 {
   1544 	struct list_struct rec;
   1545 	u32 hash;
   1546 	tdb_off rec_ptr;
   1547 	char *p = NULL;
   1548 	int ret = 0;
   1549 	size_t new_data_size = 0;
   1550 
   1551 	/* find which hash bucket it is in */
   1552 	hash = tdb->hash_fn(&key);
   1553 	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
   1554 		return -1;
   1555 
   1556 	/* first try in-place. */
   1557 	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
   1558 		goto out;
   1559 
   1560 	/* reset the error code potentially set by the tdb_append_inplace() */
   1561 	tdb->ecode = TDB_SUCCESS;
   1562 
   1563 	/* find entry */
   1564 	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
   1565 		if (tdb->ecode != TDB_ERR_NOEXIST)
   1566 			goto fail;
   1567 
   1568 		/* Not found - create. */
   1569 
   1570 		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
   1571 		goto out;
   1572 	}
   1573 
   1574 	new_data_size = rec.data_len + new_dbuf.dsize;
   1575 
   1576 	/* Copy key+old_value+value *before* allocating free space in case malloc
   1577 	   fails and we are left with a dead spot in the tdb. */
   1578 
   1579 	if (!(p = (char *)malloc(key.dsize + new_data_size))) {
   1580 		tdb->ecode = TDB_ERR_OOM;
   1581 		goto fail;
   1582 	}
   1583 
   1584 	/* Copy the key in place. */
   1585 	memcpy(p, key.dptr, key.dsize);
   1586 
   1587 	/* Now read the old data into place. */
   1588 	if (rec.data_len &&
   1589 		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
   1590 			goto fail;
   1591 
   1592 	/* Finally append the new data. */
   1593 	if (new_dbuf.dsize)
   1594 		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
   1595 
   1596 	/* delete any existing record - if it doesn't exist we don't
   1597            care.  Doing this first reduces fragmentation, and avoids
   1598            coalescing with `allocated' block before it's updated. */
   1599 
   1600 	tdb_delete_hash(tdb, key, hash);
   1601 
   1602 	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
   1603 		goto fail;
   1604 
   1605 	/* Read hash top into next ptr */
   1606 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
   1607 		goto fail;
   1608 
   1609 	rec.key_len = key.dsize;
   1610 	rec.data_len = new_data_size;
   1611 	rec.full_hash = hash;
   1612 	rec.magic = TDB_MAGIC;
   1613 
   1614 	/* write out and point the top of the hash chain at it */
   1615 	if (rec_write(tdb, rec_ptr, &rec) == -1
   1616 	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
   1617 	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
   1618 		/* Need to tdb_unallocate() here */
   1619 		goto fail;
   1620 	}
   1621 
   1622  out:
   1623 	SAFE_FREE(p);
   1624 	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
   1625 	return ret;
   1626 
   1627 fail:
   1628 	ret = -1;
   1629 	goto out;
   1630 }
   1631 
   1632 static int tdb_already_open(dev_t device,
   1633 			    ino_t ino)
   1634 {
   1635 	TDB_CONTEXT *i;
   1636 
   1637 	for (i = tdbs; i; i = i->next) {
   1638 		if (i->device == device && i->inode == ino) {
   1639 			return 1;
   1640 		}
   1641 	}
   1642 
   1643 	return 0;
   1644 }
   1645 
   1646 /* This is based on the hash algorithm from gdbm */
   1647 static u32 default_tdb_hash(TDB_DATA *key)
   1648 {
   1649 	u32 value;	/* Used to compute the hash value.  */
   1650 	u32   i;	/* Used to cycle through random values. */
   1651 
   1652 	/* Set the initial value from the key size. */
   1653 	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
   1654 		value = (value + (key->dptr[i] << (i*5 % 24)));
   1655 
   1656 	return (1103515243 * value + 12345);
   1657 }
   1658 
   1659 /* open the database, creating it if necessary
   1660 
   1661    The open_flags and mode are passed straight to the open call on the
   1662    database file. A flags value of O_WRONLY is invalid. The hash size
   1663    is advisory, use zero for a default value.
   1664 
   1665    Return is NULL on error, in which case errno is also set.  Don't
   1666    try to call tdb_error or tdb_errname, just do strerror(errno).
   1667 
   1668    @param name may be NULL for internal databases. */
   1669 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
   1670 		      int open_flags, mode_t mode)
   1671 {
   1672 	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
   1673 }
   1674 
   1675 
   1676 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
   1677 			 int open_flags, mode_t mode,
   1678 			 tdb_log_func log_fn,
   1679 			 tdb_hash_func hash_fn)
   1680 {
   1681 	TDB_CONTEXT *tdb;
   1682 	struct stat st;
   1683 	int rev = 0, locked = 0;
   1684 	unsigned char *vp;
   1685 	u32 vertest;
   1686 
   1687 	if (!(tdb = calloc(1, sizeof *tdb))) {
   1688 		/* Can't log this */
   1689 		errno = ENOMEM;
   1690 		goto fail;
   1691 	}
   1692 	tdb->fd = -1;
   1693 	tdb->name = NULL;
   1694 	tdb->map_ptr = NULL;
   1695 	tdb->flags = tdb_flags;
   1696 	tdb->open_flags = open_flags;
   1697 	tdb->log_fn = log_fn;
   1698 	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
   1699 
   1700 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
   1701 		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
   1702 			 name));
   1703 		errno = EINVAL;
   1704 		goto fail;
   1705 	}
   1706 
   1707 	if (hash_size == 0)
   1708 		hash_size = DEFAULT_HASH_SIZE;
   1709 	if ((open_flags & O_ACCMODE) == O_RDONLY) {
   1710 		tdb->read_only = 1;
   1711 		/* read only databases don't do locking or clear if first */
   1712 		tdb->flags |= TDB_NOLOCK;
   1713 		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
   1714 	}
   1715 
   1716 	/* internal databases don't mmap or lock, and start off cleared */
   1717 	if (tdb->flags & TDB_INTERNAL) {
   1718 		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
   1719 		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
   1720 		if (tdb_new_database(tdb, hash_size) != 0) {
   1721 			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
   1722 			goto fail;
   1723 		}
   1724 		goto internal;
   1725 	}
   1726 
   1727 	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
   1728 		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
   1729 			 name, strerror(errno)));
   1730 		goto fail;	/* errno set by open(2) */
   1731 	}
   1732 
   1733 	/* ensure there is only one process initialising at once */
   1734 	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
   1735 		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
   1736 			 name, strerror(errno)));
   1737 		goto fail;	/* errno set by tdb_brlock */
   1738 	}
   1739 
   1740 	/* we need to zero database if we are the only one with it open */
   1741 	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
   1742 		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
   1743 		open_flags |= O_CREAT;
   1744 		if (ftruncate(tdb->fd, 0) == -1) {
   1745 			TDB_LOG((tdb, 0, "tdb_open_ex: "
   1746 				 "failed to truncate %s: %s\n",
   1747 				 name, strerror(errno)));
   1748 			goto fail; /* errno set by ftruncate */
   1749 		}
   1750 	}
   1751 
   1752 	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
   1753 	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
   1754 	    || (tdb->header.version != TDB_VERSION
   1755 		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
   1756 		/* its not a valid database - possibly initialise it */
   1757 		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
   1758 			errno = EIO; /* ie bad format or something */
   1759 			goto fail;
   1760 		}
   1761 		rev = (tdb->flags & TDB_CONVERT);
   1762 	}
   1763 	vp = (unsigned char *)&tdb->header.version;
   1764 	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
   1765 		  (((u32)vp[2]) << 8) | (u32)vp[3];
   1766 	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
   1767 	if (!rev)
   1768 		tdb->flags &= ~TDB_CONVERT;
   1769 	else {
   1770 		tdb->flags |= TDB_CONVERT;
   1771 		convert(&tdb->header, sizeof(tdb->header));
   1772 	}
   1773 	if (fstat(tdb->fd, &st) == -1)
   1774 		goto fail;
   1775 
   1776 	/* Is it already in the open list?  If so, fail. */
   1777 	if (tdb_already_open(st.st_dev, st.st_ino)) {
   1778 		TDB_LOG((tdb, 2, "tdb_open_ex: "
   1779 			 "%s (%d,%d) is already open in this process\n",
   1780 			 name, (int)st.st_dev, (int)st.st_ino));
   1781 		errno = EBUSY;
   1782 		goto fail;
   1783 	}
   1784 
   1785 	if (!(tdb->name = (char *)strdup(name))) {
   1786 		errno = ENOMEM;
   1787 		goto fail;
   1788 	}
   1789 
   1790 	tdb->map_size = st.st_size;
   1791 	tdb->device = st.st_dev;
   1792 	tdb->inode = st.st_ino;
   1793 	tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
   1794 	if (!tdb->locked) {
   1795 		TDB_LOG((tdb, 2, "tdb_open_ex: "
   1796 			 "failed to allocate lock structure for %s\n",
   1797 			 name));
   1798 		errno = ENOMEM;
   1799 		goto fail;
   1800 	}
   1801 	tdb_mmap(tdb);
   1802 	if (locked) {
   1803 		if (!tdb->read_only)
   1804 			if (tdb_clear_spinlocks(tdb) != 0) {
   1805 				TDB_LOG((tdb, 0, "tdb_open_ex: "
   1806 				"failed to clear spinlock\n"));
   1807 				goto fail;
   1808 			}
   1809 		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
   1810 			TDB_LOG((tdb, 0, "tdb_open_ex: "
   1811 				 "failed to take ACTIVE_LOCK on %s: %s\n",
   1812 				 name, strerror(errno)));
   1813 			goto fail;
   1814 		}
   1815 
   1816 	}
   1817 
   1818 	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
   1819 	   we didn't get the initial exclusive lock as we need to let all other
   1820 	   users know we're using it. */
   1821 
   1822 	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
   1823 		/* leave this lock in place to indicate it's in use */
   1824 		if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
   1825 			goto fail;
   1826 	}
   1827 
   1828 
   1829  internal:
   1830 	/* Internal (memory-only) databases skip all the code above to
   1831 	 * do with disk files, and resume here by releasing their
   1832 	 * global lock and hooking into the active list. */
   1833 	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
   1834 		goto fail;
   1835 	tdb->next = tdbs;
   1836 	tdbs = tdb;
   1837 	return tdb;
   1838 
   1839  fail:
   1840 	{ int save_errno = errno;
   1841 
   1842 	if (!tdb)
   1843 		return NULL;
   1844 
   1845 	if (tdb->map_ptr) {
   1846 		if (tdb->flags & TDB_INTERNAL)
   1847 			SAFE_FREE(tdb->map_ptr);
   1848 		else
   1849 			tdb_munmap(tdb);
   1850 	}
   1851 	SAFE_FREE(tdb->name);
   1852 	if (tdb->fd != -1)
   1853 		if (close(tdb->fd) != 0)
   1854 			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
   1855 	SAFE_FREE(tdb->locked);
   1856 	SAFE_FREE(tdb);
   1857 	errno = save_errno;
   1858 	return NULL;
   1859 	}
   1860 }
   1861 
   1862 /**
   1863  * Close a database.
   1864  *
   1865  * @returns -1 for error; 0 for success.
   1866  **/
   1867 int tdb_close(TDB_CONTEXT *tdb)
   1868 {
   1869 	TDB_CONTEXT **i;
   1870 	int ret = 0;
   1871 
   1872 	if (tdb->map_ptr) {
   1873 		if (tdb->flags & TDB_INTERNAL)
   1874 			SAFE_FREE(tdb->map_ptr);
   1875 		else
   1876 			tdb_munmap(tdb);
   1877 	}
   1878 	SAFE_FREE(tdb->name);
   1879 	if (tdb->fd != -1)
   1880 		ret = close(tdb->fd);
   1881 	SAFE_FREE(tdb->locked);
   1882 
   1883 	/* Remove from contexts list */
   1884 	for (i = &tdbs; *i; i = &(*i)->next) {
   1885 		if (*i == tdb) {
   1886 			*i = tdb->next;
   1887 			break;
   1888 		}
   1889 	}
   1890 
   1891 	memset(tdb, 0, sizeof(*tdb));
   1892 	SAFE_FREE(tdb);
   1893 
   1894 	return ret;
   1895 }
   1896 
   1897 /* lock/unlock entire database */
   1898 int tdb_lockall(TDB_CONTEXT *tdb)
   1899 {
   1900 	u32 i;
   1901 
   1902 	/* There are no locks on read-only dbs */
   1903 	if (tdb->read_only)
   1904 		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
   1905 	for (i = 0; i < tdb->header.hash_size; i++)
   1906 		if (tdb_lock(tdb, i, F_WRLCK))
   1907 			break;
   1908 
   1909 	/* If error, release locks we have... */
   1910 	if (i < tdb->header.hash_size) {
   1911 		u32 j;
   1912 
   1913 		for ( j = 0; j < i; j++)
   1914 			tdb_unlock(tdb, j, F_WRLCK);
   1915 		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
   1916 	}
   1917 
   1918 	return 0;
   1919 }
   1920 void tdb_unlockall(TDB_CONTEXT *tdb)
   1921 {
   1922 	u32 i;
   1923 	for (i=0; i < tdb->header.hash_size; i++)
   1924 		tdb_unlock(tdb, i, F_WRLCK);
   1925 }
   1926 
   1927 /* lock/unlock one hash chain. This is meant to be used to reduce
   1928    contention - it cannot guarantee how many records will be locked */
   1929 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
   1930 {
   1931 	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
   1932 }
   1933 
   1934 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
   1935 {
   1936 	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
   1937 }
   1938 
   1939 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
   1940 {
   1941 	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
   1942 }
   1943 
   1944 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
   1945 {
   1946 	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
   1947 }
   1948 
   1949 
   1950 /* register a loging function */
   1951 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
   1952 {
   1953 	tdb->log_fn = fn;
   1954 }
   1955 
   1956 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
   1957    seek pointer from our parent and to re-establish locks */
   1958 int tdb_reopen(TDB_CONTEXT *tdb)
   1959 {
   1960 	struct stat st;
   1961 
   1962 	if (tdb->flags & TDB_INTERNAL)
   1963 		return 0; /* Nothing to do. */
   1964 	if (tdb_munmap(tdb) != 0) {
   1965 		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
   1966 		goto fail;
   1967 	}
   1968 	if (close(tdb->fd) != 0)
   1969 		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
   1970 	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
   1971 	if (tdb->fd == -1) {
   1972 		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
   1973 		goto fail;
   1974 	}
   1975 	if (fstat(tdb->fd, &st) != 0) {
   1976 		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
   1977 		goto fail;
   1978 	}
   1979 	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
   1980 		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
   1981 		goto fail;
   1982 	}
   1983 	tdb_mmap(tdb);
   1984 	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
   1985 		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
   1986 		goto fail;
   1987 	}
   1988 
   1989 	return 0;
   1990 
   1991 fail:
   1992 	tdb_close(tdb);
   1993 	return -1;
   1994 }
   1995 
   1996 /* reopen all tdb's */
   1997 int tdb_reopen_all(void)
   1998 {
   1999 	TDB_CONTEXT *tdb;
   2000 
   2001 	for (tdb=tdbs; tdb; tdb = tdb->next) {
   2002 		/* Ensure no clear-if-first. */
   2003 		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
   2004 		if (tdb_reopen(tdb) != 0)
   2005 			return -1;
   2006 	}
   2007 
   2008 	return 0;
   2009 }
   2010