Home | History | Annotate | Download | only in dbm
      1 """A dumb and slow but simple dbm clone.
      2 
      3 For database spam, spam.dir contains the index (a text file),
      4 spam.bak *may* contain a backup of the index (also a text file),
      5 while spam.dat contains the data (a binary file).
      6 
      7 XXX TO DO:
      8 
      9 - seems to contain a bug when updating...
     10 
     11 - reclaim free space (currently, space once occupied by deleted or expanded
     12 items is never reused)
     13 
     14 - support concurrent access (currently, if two processes take turns making
     15 updates, they can mess up the index)
     16 
     17 - support efficient access to large databases (currently, the whole index
     18 is read when the database is opened, and some updates rewrite the whole index)
     19 
     20 - support opening for read-only (flag = 'm')
     21 
     22 """
     23 
import ast as _ast
import collections
import collections.abc
import io as _io
import os as _os
     28 
# Names exported by "from <module> import *".
__all__ = ["error", "open"]

# Values in the data file start at byte offsets that are multiples of
# _BLOCKSIZE; the gap before a new value is padded with NUL bytes.
_BLOCKSIZE = 512

# Exception raised when a closed database is used (an alias of OSError).
error = OSError
     34 
     35 class _Database(collections.MutableMapping):
     36 
     37     # The on-disk directory and data files can remain in mutually
     38     # inconsistent states for an arbitrarily long time (see comments
     39     # at the end of __setitem__).  This is only repaired when _commit()
     40     # gets called.  One place _commit() gets called is from __del__(),
     41     # and if that occurs at program shutdown time, module globals may
     42     # already have gotten rebound to None.  Since it's crucial that
     43     # _commit() finish successfully, we can't ignore shutdown races
     44     # here, and _commit() must not reference any globals.
     45     _os = _os       # for _commit()
     46     _io = _io       # for _commit()
     47 
     48     def __init__(self, filebasename, mode, flag='c'):
     49         self._mode = mode
     50         self._readonly = (flag == 'r')
     51 
     52         # The directory file is a text file.  Each line looks like
     53         #    "%r, (%d, %d)\n" % (key, pos, siz)
     54         # where key is the string key, pos is the offset into the dat
     55         # file of the associated value's first byte, and siz is the number
     56         # of bytes in the associated value.
     57         self._dirfile = filebasename + '.dir'
     58 
     59         # The data file is a binary file pointed into by the directory
     60         # file, and holds the values associated with keys.  Each value
     61         # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
     62         # binary 8-bit string value.
     63         self._datfile = filebasename + '.dat'
     64         self._bakfile = filebasename + '.bak'
     65 
     66         # The index is an in-memory dict, mirroring the directory file.
     67         self._index = None  # maps keys to (pos, siz) pairs
     68 
     69         # Handle the creation
     70         self._create(flag)
     71         self._update()
     72 
     73     def _create(self, flag):
     74         if flag == 'n':
     75             for filename in (self._datfile, self._bakfile, self._dirfile):
     76                 try:
     77                     _os.remove(filename)
     78                 except OSError:
     79                     pass
     80         # Mod by Jack: create data file if needed
     81         try:
     82             f = _io.open(self._datfile, 'r', encoding="Latin-1")
     83         except OSError:
     84             if flag not in ('c', 'n'):
     85                 import warnings
     86                 warnings.warn("The database file is missing, the "
     87                               "semantics of the 'c' flag will be used.",
     88                               DeprecationWarning, stacklevel=4)
     89             with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
     90                 self._chmod(self._datfile)
     91         else:
     92             f.close()
     93 
     94     # Read directory file into the in-memory index dict.
     95     def _update(self):
     96         self._index = {}
     97         try:
     98             f = _io.open(self._dirfile, 'r', encoding="Latin-1")
     99         except OSError:
    100             self._modified = not self._readonly
    101         else:
    102             self._modified = False
    103             with f:
    104                 for line in f:
    105                     line = line.rstrip()
    106                     key, pos_and_siz_pair = _ast.literal_eval(line)
    107                     key = key.encode('Latin-1')
    108                     self._index[key] = pos_and_siz_pair
    109 
    110     # Write the index dict to the directory file.  The original directory
    111     # file (if any) is renamed with a .bak extension first.  If a .bak
    112     # file currently exists, it's deleted.
    113     def _commit(self):
    114         # CAUTION:  It's vital that _commit() succeed, and _commit() can
    115         # be called from __del__().  Therefore we must never reference a
    116         # global in this routine.
    117         if self._index is None or not self._modified:
    118             return  # nothing to do
    119 
    120         try:
    121             self._os.unlink(self._bakfile)
    122         except OSError:
    123             pass
    124 
    125         try:
    126             self._os.rename(self._dirfile, self._bakfile)
    127         except OSError:
    128             pass
    129 
    130         with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
    131             self._chmod(self._dirfile)
    132             for key, pos_and_siz_pair in self._index.items():
    133                 # Use Latin-1 since it has no qualms with any value in any
    134                 # position; UTF-8, though, does care sometimes.
    135                 entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
    136                 f.write(entry)
    137 
    138     sync = _commit
    139 
    140     def _verify_open(self):
    141         if self._index is None:
    142             raise error('DBM object has already been closed')
    143 
    144     def __getitem__(self, key):
    145         if isinstance(key, str):
    146             key = key.encode('utf-8')
    147         self._verify_open()
    148         pos, siz = self._index[key]     # may raise KeyError
    149         with _io.open(self._datfile, 'rb') as f:
    150             f.seek(pos)
    151             dat = f.read(siz)
    152         return dat
    153 
    154     # Append val to the data file, starting at a _BLOCKSIZE-aligned
    155     # offset.  The data file is first padded with NUL bytes (if needed)
    156     # to get to an aligned offset.  Return pair
    157     #     (starting offset of val, len(val))
    158     def _addval(self, val):
    159         with _io.open(self._datfile, 'rb+') as f:
    160             f.seek(0, 2)
    161             pos = int(f.tell())
    162             npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
    163             f.write(b'\0'*(npos-pos))
    164             pos = npos
    165             f.write(val)
    166         return (pos, len(val))
    167 
    168     # Write val to the data file, starting at offset pos.  The caller
    169     # is responsible for ensuring that there's enough room starting at
    170     # pos to hold val, without overwriting some other value.  Return
    171     # pair (pos, len(val)).
    172     def _setval(self, pos, val):
    173         with _io.open(self._datfile, 'rb+') as f:
    174             f.seek(pos)
    175             f.write(val)
    176         return (pos, len(val))
    177 
    178     # key is a new key whose associated value starts in the data file
    179     # at offset pos and with length siz.  Add an index record to
    180     # the in-memory index dict, and append one to the directory file.
    181     def _addkey(self, key, pos_and_siz_pair):
    182         self._index[key] = pos_and_siz_pair
    183         with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
    184             self._chmod(self._dirfile)
    185             f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))
    186 
    187     def __setitem__(self, key, val):
    188         if self._readonly:
    189             import warnings
    190             warnings.warn('The database is opened for reading only',
    191                           DeprecationWarning, stacklevel=2)
    192         if isinstance(key, str):
    193             key = key.encode('utf-8')
    194         elif not isinstance(key, (bytes, bytearray)):
    195             raise TypeError("keys must be bytes or strings")
    196         if isinstance(val, str):
    197             val = val.encode('utf-8')
    198         elif not isinstance(val, (bytes, bytearray)):
    199             raise TypeError("values must be bytes or strings")
    200         self._verify_open()
    201         self._modified = True
    202         if key not in self._index:
    203             self._addkey(key, self._addval(val))
    204         else:
    205             # See whether the new value is small enough to fit in the
    206             # (padded) space currently occupied by the old value.
    207             pos, siz = self._index[key]
    208             oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
    209             newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
    210             if newblocks <= oldblocks:
    211                 self._index[key] = self._setval(pos, val)
    212             else:
    213                 # The new value doesn't fit in the (padded) space used
    214                 # by the old value.  The blocks used by the old value are
    215                 # forever lost.
    216                 self._index[key] = self._addval(val)
    217 
    218             # Note that _index may be out of synch with the directory
    219             # file now:  _setval() and _addval() don't update the directory
    220             # file.  This also means that the on-disk directory and data
    221             # files are in a mutually inconsistent state, and they'll
    222             # remain that way until _commit() is called.  Note that this
    223             # is a disaster (for the database) if the program crashes
    224             # (so that _commit() never gets called).
    225 
    226     def __delitem__(self, key):
    227         if self._readonly:
    228             import warnings
    229             warnings.warn('The database is opened for reading only',
    230                           DeprecationWarning, stacklevel=2)
    231         if isinstance(key, str):
    232             key = key.encode('utf-8')
    233         self._verify_open()
    234         self._modified = True
    235         # The blocks used by the associated value are lost.
    236         del self._index[key]
    237         # XXX It's unclear why we do a _commit() here (the code always
    238         # XXX has, so I'm not changing it).  __setitem__ doesn't try to
    239         # XXX keep the directory file in synch.  Why should we?  Or
    240         # XXX why shouldn't __setitem__?
    241         self._commit()
    242 
    243     def keys(self):
    244         try:
    245             return list(self._index)
    246         except TypeError:
    247             raise error('DBM object has already been closed') from None
    248 
    249     def items(self):
    250         self._verify_open()
    251         return [(key, self[key]) for key in self._index.keys()]
    252 
    253     def __contains__(self, key):
    254         if isinstance(key, str):
    255             key = key.encode('utf-8')
    256         try:
    257             return key in self._index
    258         except TypeError:
    259             if self._index is None:
    260                 raise error('DBM object has already been closed') from None
    261             else:
    262                 raise
    263 
    264     def iterkeys(self):
    265         try:
    266             return iter(self._index)
    267         except TypeError:
    268             raise error('DBM object has already been closed') from None
    269     __iter__ = iterkeys
    270 
    271     def __len__(self):
    272         try:
    273             return len(self._index)
    274         except TypeError:
    275             raise error('DBM object has already been closed') from None
    276 
    277     def close(self):
    278         try:
    279             self._commit()
    280         finally:
    281             self._index = self._datfile = self._dirfile = self._bakfile = None
    282 
    283     __del__ = close
    284 
    285     def _chmod(self, file):
    286         if hasattr(self._os, 'chmod'):
    287             self._os.chmod(file, self._mode)
    288 
    289     def __enter__(self):
    290         return self
    291 
    292     def __exit__(self, *args):
    293         self.close()
    294 
    295 
    296 def open(file, flag='c', mode=0o666):
    297     """Open the database file, filename, and return corresponding object.
    298 
    299     The flag argument, used to control how the database is opened in the
    300     other DBM implementations, supports only the semantics of 'c' and 'n'
    301     values.  Other values will default to the semantics of 'c' value:
    302     the database will always opened for update and will be created if it
    303     does not exist.
    304 
    305     The optional mode argument is the UNIX mode of the file, used only when
    306     the database has to be created.  It defaults to octal code 0o666 (and
    307     will be modified by the prevailing umask).
    308 
    309     """
    310 
    311     # Modify mode depending on the umask
    312     try:
    313         um = _os.umask(0)
    314         _os.umask(um)
    315     except AttributeError:
    316         pass
    317     else:
    318         # Turn off any bits that are set in the umask
    319         mode = mode & (~um)
    320     if flag not in ('r', 'w', 'c', 'n'):
    321         import warnings
    322         warnings.warn("Flag must be one of 'r', 'w', 'c', or 'n'",
    323                       DeprecationWarning, stacklevel=2)
    324     return _Database(file, mode, flag=flag)
    325