Home | History | Annotate | Download | only in research
      1 #!python3
      2 """Program to dump contents of Brotli compressed files showing the compression format.
      3 Jurjen N.E. Bos, 2016.
      4 I found the following issues with the Brotli format:
      5 - The distance alphabet has size 16+(48<<POSTFIX),
      6   but the last symbols are useless.
      7   It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
      8 - The block type code is useless if NBLTYPES==2, you would only need 1 symbol
      9   anyway, so why don't you just switch to "the other" type?
     10 """
     11 import struct
     12 from operator import itemgetter, methodcaller
     13 from itertools import accumulate, repeat
     14 from collections import defaultdict, deque
     15 from functools import partial
     16 
     17 class InvalidStream(Exception): pass
     18 #lookup table
     19 L, I, D = "literal", "insert&copy", "distance"
     20 pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
     21 
     22 def outputCharFormatter(c):
     23     """Show character in readable format
     24     """
     25     #TODO 2: allow hex only output
     26     if 32<c<127: return chr(c)
     27     elif c==10: return '\\n'
     28     elif c==13: return '\\r'
     29     elif c==32: return '" "'
     30     else: return '\\x{:02x}'.format(c)
     31 
     32 def outputFormatter(s):
     33     """Show string or char.
     34     """
     35     result = ''
     36     def formatSubString(s):
     37         for c in s:
     38             if c==32: yield ' '
     39             else: yield outputCharFormatter(c)
     40     if len(result)<200: return ''.join(formatSubString(s))
     41     else:
     42         return ''.join(formatSubString(s[:100]))+'...'+ \
     43                ''.join(formatSubString(s[-100:]))
     44 
     45 
     46 class BitStream:
     47     """Represent a bytes object. Can read bits and prefix codes the way
     48     Brotli does.
     49     """
     50     def __init__(self, byteString):
     51         self.data = byteString
     52         #position in bits: byte pos is pos>>3, bit pos is pos&7
     53         self.pos = 0
     54 
     55     def __repr__(self):
     56         """Representation
     57         >>> olleke
     58         BitStream(pos=0:0)
     59         """
     60         return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
     61 
     62     def read(self, n):
     63         """Read n bits from the stream and return as an integer.
     64         Produces zero bits beyond the stream.
     65         >>> olleke.data[0]==27
     66         True
     67         >>> olleke.read(5)
     68         27
     69 
     70         >>> olleke
     71         BitStream(pos=0:5)
     72         """
     73         value = self.peek(n)
     74         self.pos += n
     75         if self.pos>len(self.data)*8:
     76             raise ValueError('Read past end of stream')
     77         return value
     78 
     79     def peek(self, n):
     80         """Peek an n bit integer from the stream without updating the pointer.
     81         It is not an error to read beyond the end of the stream.
     82         >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
     83         True
     84         >>> olleke.peek(15)
     85         11803
     86         >>> hex(olleke.peek(32))
     87         '0x2e1b'
     88         """
     89         #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
     90         #convert to int: int.from_bytes(..., 'little')
     91         #shift out the bits from the first byte: >>(self.pos&7)
     92         #mask unwanted bits: & (1<<n)-1
     93         return int.from_bytes(
     94             self.data[self.pos>>3:self.pos+n+7>>3],
     95             'little')>>(self.pos&7) & (1<<n)-1
     96 
     97     def readBytes(self, n):
     98         """Read n bytes from the stream on a byte boundary.
     99         """
    100         if self.pos&7: raise ValueError('readBytes: need byte boundary')
    101         result = self.data[self.pos>>3:(self.pos>>3)+n]
    102         self.pos += 8*n
    103         return result
    104 
    105 #-----------------------Symbol-------------------------------------------
    106 class Symbol:
    107     """A symbol in a code.
    108     Refers back to the code that contains it.
    109     Index is the place in the alphabet of the symbol.
    110     """
    111     __slots__ = 'code', 'index'
    112     def __init__(self, code, value):
    113         self.code = code
    114         self.index = value
    115 
    116     def __repr__(self):
    117         return 'Symbol({}, {})'.format(self.code.name, self.index)
    118 
    119     def __len__(self):
    120         """Number of bits in the prefix notation of this symbol
    121         """
    122         return self.code.length(self.index)
    123 
    124     def __int__(self):
    125         return self.index
    126 
    127     #these routines call equivalent routine in Code class
    128     def bitPattern(self):
    129         """Value of the symbol in the stream
    130         """
    131         return self.code.bitPattern(self.index)
    132 
    133     def extraBits(self):
    134         """Number of extra bits to read for this symbol
    135         """
    136         return self.code.extraBits(self.index)
    137 
    138     def __str__(self):
    139         """Short descriptor of the symbol without extra bits.
    140         """
    141         return self.code.mnemonic(self.index)
    142 
    143     #requiring optional extra bits, if self.code supports them
    144     def value(self, extra=None):
    145         """The value used for processing. Can be a tuple.
    146         with optional extra bits
    147         """
    148         if isinstance(self.code, WithExtra):
    149             if not 0<=extra<1<<self.extraBits():
    150                 raise ValueError("value: extra value doesn't fit in extraBits")
    151             return self.code.value(self.index, extra)
    152         if extra is not None:
    153             raise ValueError('value: no extra bits for this code')
    154         return self.code.value(self.index)
    155 
    156     def explanation(self, extra=None):
    157         """Long explanation of the value from the numeric value
    158         with optional extra bits
    159         Used by Layout.verboseRead when printing the value
    160         """
    161         if isinstance(self.code, WithExtra):
    162             return self.code.callback(self, extra)
    163         return self.code.callback(self)
    164 
    165 #========================Code definitions==================================
    166 class RangeDecoder:
    167     """A decoder for the Code class that assumes the symbols
    168     are encoded consecutively in binary.
    169     It all depends on the "alphabetSize" property.
    170     The range runs from 0 to alphabetSize-1.
    171     This is the default decoder.
    172     """
    173     def __init__(self, *, alphabetSize=None, bitLength=None, **args):
    174         if bitLength is not None: alphabetSize = 1<<bitLength
    175         if alphabetSize is not None:
    176             self.alphabetSize = alphabetSize
    177             self.maxLength = (alphabetSize-1).bit_length()
    178 
    179     def __len__(self):
    180         return self.alphabetSize
    181 
    182     def __iter__(self):
    183         """Produce all symbols.
    184         """
    185         return map(partial(Symbol, self), range(len(self)))
    186 
    187     def __getitem__(self, index):
    188         if index>=self.alphabetSize: raise ValueError('index out of range')
    189         return Symbol(self, index)
    190 
    191     def bitPattern(self, index):
    192         return '{:0{}b}'.format(index, self.maxLength)
    193 
    194     def length(self, index):
    195         """Encoding length of given symbol.
    196         Does not depend on index in this case.
    197         """
    198         return self.maxLength
    199 
    200     def decodePeek(self, data):
    201         """Find which symbol index matches the given data (from peek, as a number)
    202         and return the number of bits decoded.
    203         Can also be used to figure out length of a symbol.
    204         """
    205         return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
    206 
    207 class PrefixDecoder:
    208     """A decoder for the Code class that uses a prefix code.
    209     The code is determined by encoding:
    210     encode[p] gives the index corresponding to bit pattern p.
    211     Used setDecode(decodeTable) to switch the decoder from the default
    212     to a prefix decoder, or pass decodeTable at init.
    213     You can also use setLength(lengthTable)
    214     to define the encoding from the lengths.
    215     The set of symbol values does not need to be consecutive.
    216     """
    217     def __init__(self, *, decodeTable=None, **args):
    218         if decodeTable is not None: self.setDecode(decodeTable)
    219 
    220     def __len__(self):
    221         return len(self.decodeTable)
    222 
    223     def __iter__(self):
    224         def revBits(index):
    225             return self.bitPattern(index)[::-1]
    226         return (
    227             Symbol(self, index)
    228             for index in sorted(self.decodeTable.values(), key=revBits)
    229             )
    230 
    231     def __getitem__(self, index):
    232         if index not in self.lengthTable:
    233             raise ValueError('No symbol {}[{}]'.format(
    234                 self.__class__.__name__, index))
    235         return Symbol(self, index)
    236 
    237     def bitPattern(self, index):
    238         bits = next(b for (b,s) in self.decodeTable.items() if s==index)
    239         return '{:0{}b}'.format(bits, self.length(index))
    240 
    241     def length(self, index):
    242         """Encoding length of given symbol.
    243         """
    244         return self.lengthTable[index]
    245 
    246     def decodePeek(self, data):
    247         """Find which symbol index matches the given data (from peek, as a number)
    248         and return the number of bits decoded.
    249         Can also be used to figure out length of a symbol.
    250         """
    251         #do binary search for word length
    252         #invariant: lo<=length<=hi
    253         lo, hi = self.minLength, self.maxLength
    254         while lo<=hi:
    255             mid = lo+hi>>1
    256             #note lo<=mid<hi at this point
    257             mask = (1<<mid)-1
    258             #lets see what happens if we guess length is mid
    259             try: index = self.decodeTable[data&mask]
    260             except KeyError:
    261                 #too many bits specified, reduce estimated length
    262                 hi = mid-1
    263                 continue
    264             #we found a symbol, but there could be a longer match
    265             symbolLength = self.lengthTable[index]
    266             if symbolLength<=mid:
    267                 #all bits match, symbol must be right
    268                 return symbolLength, Symbol(self, index)
    269             #there must be more bits to match
    270             lo = mid+1
    271         return lo, Symbol(self, index)
    272 
    273     #routine to set up the tables
    274     def setDecode(self, decodeTable):
    275         """Store decodeTable,
    276         and compute lengthTable, minLength, maxLength from encodings.
    277         """
    278         self.decodeTable = decodeTable
    279         #set of symbols with unknown length
    280         todo = set(decodeTable)
    281         #bit size under investigation
    282         maskLength = 0
    283         lengthTable = {}
    284         while todo:
    285             mask = (1<<maskLength)-1
    286             #split the encodings that we didn't find yet using b bits
    287             splitSymbols = defaultdict(list)
    288             for s in todo: splitSymbols[s&mask].append(s)
    289             #unique encodings have a length of maskLength bits
    290             #set length, and remove from todo list
    291             for s,subset in splitSymbols.items():
    292                 if len(subset)==1:
    293                     lengthTable[self.decodeTable[s]] = maskLength
    294                     todo.remove(s)
    295             #now investigate with longer mask
    296             maskLength +=1
    297         #save result
    298         self.lengthTable = lengthTable
    299         self.minLength = min(lengthTable.values())
    300         self.maxLength = max(lengthTable.values())
    301         self.switchToPrefix()
    302 
    303     def setLength(self, lengthTable):
    304         """Given the bit pattern lengths for symbols given in lengthTable,
    305         set decodeTable, minLength, maxLength
    306         """
    307         self.lengthTable = lengthTable
    308         self.minLength = min(lengthTable.values())
    309         self.maxLength = max(lengthTable.values())
    310         #compute the backwards codes first; then reverse them
    311         #compute (backwards) first code for every separate lengths
    312         nextCodes = []
    313         #build codes for each length, from right to left
    314         code = 0
    315         for bits in range(self.maxLength+1):
    316             code <<= 1
    317             nextCodes.append(code)
    318             code += sum(x==bits for x in lengthTable.values())
    319         self.decodeTable = {}
    320         #count codes for each length, and store reversed in the table
    321         for symbol in sorted(lengthTable):
    322             bits = lengthTable[symbol]
    323             bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
    324             self.decodeTable[int(bitpattern[::-1], 2)] = symbol
    325             nextCodes[bits] += 1
    326         self.switchToPrefix()
    327 
    328     def switchToPrefix(self):
    329         """This routine makes sure the prefix decoder is activated.
    330         """
    331         self.mode = PrefixDecoder
    332 
    333 class Code(RangeDecoder, PrefixDecoder):
    334     """An alphabet of symbols, that can be read from a stream.
    335     If you use setDecode or setLength, you have a prefix code,
    336     otherwise you have a range code.
    337     Features:
    338     code[index] produces symbol with given index
    339     value(index): value of symbol
    340     mnemonic(index): short description of symbol
    341     explanation(index): show meaning of symbol, shown in Layout.verboseRead
    342     iter(code): produce all symbols in some order
    343     name: show as context in Layout.verboseRead
    344     """
    345     name = '?'
    346     #callback is a function that gets the symbol and the extra bits
    347     #default callback calls explanation
    348     def __init__(self, name=None, *, callback=None, description='', **args):
    349         """Don't forget to set either alphabetSize or decodeTable
    350         """
    351         #set name when provided, otherwise take class variable
    352         if name is not None: self.name = name
    353         if callback is not None: self.callback = callback
    354         self.description = description
    355         #mode switch
    356         if 'bitLength' in args or 'alphabetSize' in args:
    357             self.mode = RangeDecoder
    358             RangeDecoder.__init__(self, **args)
    359         elif 'decodeTable' in args:
    360             self.mode = PrefixDecoder
    361             PrefixDecoder.__init__(self, **args)
    362         else:
    363             super().__init__(**args)
    364 
    365     def __repr__(self):
    366         return self.__class__.__name__+' '+self.name
    367 
    368     #the routines that get switched between RangeDecoder and PrefixDecoder
    369     def __len__(self): return self.mode.__len__(self)
    370     def __iter__(self): return self.mode.__iter__(self)
    371     def __getitem__(self, index): return self.mode.__getitem__(self, index)
    372     def bitPattern(self, index): return self.mode.bitPattern(self, index)
    373     def length(self, index): return self.mode.length(self, index)
    374     def decodePeek(self, data): return self.mode.decodePeek(self, data)
    375     #general routines
    376     def value(self, index, extra=None):
    377         """Get value of symbol for computations.
    378         Override where needed.
    379         """
    380         if extra is not None:
    381             raise ValueError('value: no extra for this symbol')
    382         return index
    383 
    384     def mnemonic(self, index):
    385         """Give mnemonic of symbol.
    386         Override where needed.
    387         """
    388         return str(self.value(index))
    389 
    390     def callback(self, symbol):
    391         return self.explanation(symbol.index)
    392 
    393     def explanation(self, index):
    394         """Long explanation of the value from the numeric value
    395         This is a default routine.
    396         You can customize in three ways:
    397         - set description to add some text
    398         - override to get more control
    399         - set callback to make it dependent on you local variables
    400         """
    401         value = self.value(index)
    402         return '{0}{1}: {2}'.format(
    403             self.description and self.description+': ',
    404             self.bitPattern(index),
    405             value,
    406             )
    407 
    408     def extraBits(self, index):
    409         return 0
    410 
    411     #Routines that use the decode interface
    412     def showCode(self, width=80):
    413         """Show all words of the code in a nice format.
    414         """
    415         #make table of all symbols with binary strings
    416         symbolStrings = [
    417             (self.bitPattern(s.index), self.mnemonic(s.index))
    418             for s in self
    419             ]
    420         #determine column widths the way Lisp programmers do it
    421         leftColWidth, rightColWidth = map(max, map(
    422             map,
    423             repeat(len),
    424             zip(*symbolStrings)
    425             ))
    426         colwidth = leftColWidth+rightColWidth
    427         columns = 81//(colwidth+2)
    428         rows = -(-len(symbolStrings)//columns)
    429         def justify(bs):
    430             b,s = bs
    431             return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
    432         for i in range(rows):
    433             print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
    434 
    435     def readTuple(self, stream):
    436         """Read symbol from stream. Returns symbol, length.
    437         """
    438         length, symbol = self.decodePeek(stream.peek(self.maxLength))
    439         stream.pos += length
    440         return length, symbol
    441 
    442     def readTupleAndExtra(self, stream):
    443         return self.readTuple(stream)+(0, None)
    444 
    445 class WithExtra(Code):
    446     """Extension for Code so that symbol may have extra bits associated.
    447     If you supply an extraTable, you can use extraBits
    448     You can define an extraTable,
    449     which allows to call extraBits to get the number of extraBits.
    450     Otherwise, you can supply extraBits yourself.
    451     Routine readTupleAndExtra now reads the extra bits too.
    452     Value probably needs to be overridden; see Enumerator.
    453     Note: this does not give you an decodeTable.
    454     """
    455     #redefine these if you don't want to use an extraTable
    456     def extraBits(self, index):
    457         """Get the number of extra bits for this symbol.
    458         """
    459         return self.extraTable[index]
    460 
    461     def mnemonic(self, index):
    462         """This value must be independent of extra.
    463         """
    464         return str(index)
    465 
    466     def readTupleAndExtra(self, stream):
    467         """Read symbol and extrabits from stream.
    468         Returns symbol length, symbol, extraBits, extra
    469         >>> olleke.pos = 6
    470         >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
    471         (2, Symbol(MLEN, 4), 16, 46)
    472         """
    473         length, symbol = self.decodePeek(stream.peek(self.maxLength))
    474         stream.pos += length
    475         extraBits = self.extraBits(symbol.index)
    476         return length, symbol, extraBits, stream.read(extraBits)
    477 
    478     def explanation(self, index, extra=None):
    479         """Expanded version of Code.explanation supporting extra bits.
    480         If you don't supply extra, it is not mentioned.
    481         """
    482         extraBits = 0 if extra is None else self.extraBits(index)
    483         if not hasattr(self, 'extraTable'):
    484             formatString = '{0}{3}'
    485             lo = hi = value = self.value(index, extra)
    486         elif extraBits==0:
    487             formatString = '{0}{2}: {3}'
    488             lo, hi = self.span(index)
    489             value = lo
    490         else:
    491             formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
    492             lo, hi = self.span(index)
    493             value = lo+extra
    494         return formatString.format(
    495             self.description and self.description+': ',
    496             'x'*extraBits,
    497             self.bitPattern(index),
    498             lo, hi,
    499             extra,
    500             value,
    501             )
    502 
    503     def callback(self, symbol, extra):
    504         return self.explanation(symbol.index, extra)
    505 
    506 class BoolCode(Code):
    507     """Same as Code(bitLength=1), but shows a boolean.
    508     """
    509     def __init__(self, name=None, **args):
    510         super().__init__(name, bitLength=1, **args)
    511 
    512     def value(self, index, extra=None):
    513         return bool(super().value(index, extra))
    514 
    515 class Enumerator(WithExtra):
    516     """Code that is defined by the ExtraTable.
    517     extraTable is a class variable that contains
    518     the extraBits of the symbols from 0
    519     value0 contains the value of symbol 0
    520     encodings is not neccessary, but allowed.
    521     Note: place for FixedCode to make sure extraBits works
    522     """
    523     def __init__(self, name=None, **args):
    524         #if there is no decodeTable to determine length, compute it ourselves
    525         if 'decodeTable' not in args:
    526             args['alphabetSize'] = len(self.extraTable)
    527         super().__init__(name, **args)
    528 
    529     def __len__(self):
    530         return len(self.extraTable)
    531 
    532     def __getitem__(self, index):
    533         """Faster than PrefixDecoder
    534         """
    535         if index>=len(self.extraTable):
    536             raise ValueError("No symbol {}[{}]".format(
    537                 self.__class__.__name__, index))
    538         return Symbol(self, index)
    539 
    540     def value(self, index, extra):
    541         """Override if you don't define value0 and extraTable
    542         """
    543         lower, upper = self.span(index)
    544         value = lower+(extra or 0)
    545         if value>upper:
    546             raise ValueError('value: extra out of range')
    547         return value
    548 
    549     def span(self, index):
    550         """Give the range of possible values in a tuple
    551         Useful for mnemonic and explanation
    552         """
    553         lower = self.value0+sum(1<<x for x in self.extraTable[:index])
    554         upper = lower+(1<<self.extraTable[index])
    555         return lower, upper-1
    556 
    557 #======================Code subclasses======================================
    558 #Alphabets used in the metablock header----------------------------------
    559 #For prefix codes
    560 class PrefixCodeHeader(WithExtra):
    561     """Header of prefix codes.
    562     """
    563     def __init__(self, codename):
    564         super().__init__('PFX', bitLength=2)
    565         #this is the name of the code that it describes
    566         self.codename = codename
    567 
    568     def extraBits(self, index):
    569         return 2 if index==1 else 0
    570 
    571     def value(self, index, extra):
    572         """Returns ('Simple', #codewords) or ('Complex', HSKIP)
    573         """
    574         if index==1:
    575             if extra>3:
    576                 raise ValueError('value: extra out of range')
    577             return 'Simple', extra+1
    578         if extra:
    579             raise ValueError('value: extra out of range')
    580         return 'Complex', index
    581 
    582     def explanation(self, index, extra):
    583         if index==1:
    584             return '{} is simple with {} code word{}'.format(
    585                 self.codename, extra+1, 's' if extra else '')
    586         lengths = [1, 2, 3, 4, 0, 5, 17, 6]
    587         return '{} is complex with lengths {}...'.format(
    588             self.codename,
    589             ','.join(
    590                 map(str, lengths[index:index+5]))
    591             )
    592 
    593 class TreeShapeAlhabet(BoolCode):
    594     """The bit used to indicate if four word code is "deep" or "wide"
    595     """
    596     name = 'SHAPE'
    597     def value(self, index):
    598         return [(2,2,2,2), (1,2,3,3)][index]
    599 
    600     def explanation(self, index):
    601         return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
    602 
    603 class LengthOfLengthAlphabet(Code):
    604     """For use in decoding complex code descriptors.
    605     >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
    606     >>> print(lengthOfLengthAlphabet[2])
    607     coded with 2 bits
    608     >>> len(lengthOfLengthAlphabet[0])
    609     2
    610     >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
    611     [2, 4, 3, 2, 2, 4]
    612     >>> lengthOfLengthAlphabet.showCode()
    613       00:skipped             01:coded with 4 bits 0111:coded with 1 bits
    614       10:coded with 3 bits  011:coded with 2 bits 1111:coded with 5 bits
    615     """
    616     decodeTable = {
    617          0b00:0,     0b10:3,
    618        0b0111:1,     0b01:4,
    619         0b011:2,   0b1111:5,
    620        }
    621 
    622     def __init__(self, name=None, **args):
    623         super().__init__(name, decodeTable=self.decodeTable, **args)
    624 
    625     def mnemonic(self, index):
    626         if index==0: return 'skipped'
    627         return 'coded with {} bits'.format(index)
    628 
    629     def explanation(self, index, extra=None):
    630         return self.description+': '+self.mnemonic(index)
    631 
    632 class LengthAlphabet(WithExtra):
    633     """Length of symbols
    634     Used during construction of a code.
    635     """
    636     def __init__(self, name):
    637         super().__init__(name, alphabetSize=18)
    638 
    639     def extraBits(self, index):
    640         return {16:2, 17:3}.get(index, 0)
    641 
    642     def mnemonic(self, index):
    643         if index==0: return 'unused'
    644         elif index==16: return 'rep xx'
    645         elif index==17: return 'zero xxx'
    646         else: return 'len {}'.format(index)
    647 
    648     def explanation(self, index, extra):
    649         return self.description.format(self[index], extra)
    650 
    651     def value(self, index, extra):
    652         #the caller got the length already, so extra is enough
    653         return extra
    654 
    655 #Stream header
    656 class WindowSizeAlphabet(Code):
    657     """The alphabet used for window size in the stream header.
    658     >>> WindowSizeAlphabet()[10].explanation()
    659     'windowsize=(1<<10)-16=1008'
    660     """
    661     decodeTable = {
    662         0b0100001: 10,   0b1100001: 14,   0b0011: 18,   0b1011: 22,
    663         0b0110001: 11,   0b1110001: 15,   0b0101: 19,   0b1101: 23,
    664         0b1000001: 12,         0b0: 16,   0b0111: 20,   0b1111: 24,
    665         0b1010001: 13,   0b0000001: 17,   0b1001: 21,
    666         0b0010001: None,
    667         }
    668 
    669     name = 'WSIZE'
    670 
    671     def __init__(self, name=None):
    672         super().__init__(name, decodeTable=self.decodeTable)
    673 
    674     def value(self, index):
    675         #missing value gives index None
    676         if index is None: return None
    677         return (1<<index)-16
    678 
    679     def explanation(self, index):
    680         return 'windowsize=(1<<{})-16={}'.format(
    681             index, (1<<index)-16)
    682 
    683 #Metablock
    684 class MetablockLengthAlphabet(WithExtra):
    685     """Used for the meta block length;
    686     also indicates a block with no data
    687     >>> metablockLengthAlphabet = MetablockLengthAlphabet()
    688     >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
    689     Symbol(MLEN, 0)
    690     'empty'
    691     >>> metablockLengthAlphabet[3]
    692     Traceback (most recent call last):
    693         ...
    694     ValueError: No symbol MetablockLengthAlphabet[3]
    695     >>> print(metablockLengthAlphabet[4])
    696     hhhh00
    697     >>> metablockLengthAlphabet[4].value(0x1000)
    698     4097
    699     >>> metablockLengthAlphabet[5].value(0x1000)
    700     Traceback (most recent call last):
    701         ...
    702     InvalidStream: Zeros in high nibble of MLEN
    703     >>> metablockLengthAlphabet[5].explanation(0x12345)
    704     'data length: 12345h+1=74566'
    705     >>> metablockLengthAlphabet.showCode()
    706     00:hhhh00   10:hhhhhh10 01:hhhhh01  11:empty
    707     """
    708     decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
    709 
    710     name = 'MLEN'
    711     def __init__(self, name=None):
    712         super().__init__(name, decodeTable=self.decodeTable)
    713 
    714     def extraBits(self, index):
    715         return index*4
    716 
    717     def mnemonic(self, index):
    718         if index==0: return 'empty'
    719         return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
    720 
    721     def value(self, index, extra):
    722         extraBits = self.extraBits(index)
    723         if not 0<=extra<1<<extraBits:
    724             raise ValueError('value: extra out of range')
    725         if index==0: return 0
    726         if index>4 and extra>>extraBits-4==0: raise InvalidStream(
    727             'Zeros in high nibble of MLEN')
    728         return extra+1
    729 
    730     def explanation(self, index, extra):
    731         if index==0: return '11: empty block'
    732         extraBits = self.extraBits(index)
    733         return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
    734 
    735 
    736 class ReservedAlphabet(BoolCode):
    737     """The reserved bit that must be zero.
    738     """
    739     name = 'RSVD'
    740     def value(self, index):
    741         if index: raise ValueError('Reserved bit is not zero')
    742 
    743     def explanation(self, index):
    744         return 'Reserved (must be zero)'
    745 
    746 class FillerAlphabet(Code):
    747     def __init__(self, *, streamPos):
    748         super().__init__('SKIP', bitLength=(-streamPos)&7)
    749 
    750     def explanation(self, index):
    751         return '{} bit{} ignored'.format(
    752             self.length(index),
    753             '' if self.length(index)==1 else 's',
    754             )
    755 
    756 class SkipLengthAlphabet(WithExtra):
    757     """Used for the skip length in an empty metablock
    758     >>> skipLengthAlphabet = SkipLengthAlphabet()
    759     >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
    760     Symbol(SKIP, 0)
    761     'empty'
    762     >>> skipLengthAlphabet[4]
    763     Traceback (most recent call last):
    764         ...
    765     ValueError: index out of range
    766     >>> print(skipLengthAlphabet[3])
    767     hhhhhh11
    768     >>> skipLengthAlphabet[2].value(0x1000)
    769     4097
    770     >>> skipLengthAlphabet[3].value(0x1000)
    771     Traceback (most recent call last):
    772         ...
    773     InvalidStream: Zeros in high byte of SKIPBYTES
    774     >>> skipLengthAlphabet[3].explanation(0x12345)
    775     'skip length: 12345h+1=74566'
    776     >>> skipLengthAlphabet.showCode()
    777     00:empty    01:hh01     10:hhhh10   11:hhhhhh11
    778     """
    779     def __init__(self):
    780         super().__init__('SKIP', bitLength=2)
    781 
    782     def extraBits(self, index):
    783         return index*8
    784 
    785     def mnemonic(self, index):
    786         if index==0: return 'empty'
    787         return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
    788 
    789     def value(self, index, extra):
    790         extraBits = self.extraBits(index)
    791         if not 0<=extra<1<<extraBits:
    792             raise ValueError('value: extra out of range')
    793         if index==0: return 0
    794         if index>1 and extra>>extraBits-8==0:
    795             raise InvalidStream('Zeros in high byte of SKIPBYTES')
    796         return extra+1
    797 
    798     def explanation(self, index, extra):
    799         if index==0: return '00: no skip'
    800         extraBits = self.extraBits(index)
    801         return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
    802 
    803 
    804 class TypeCountAlphabet(Enumerator):
    805     """Used for giving block type counts and tree counts.
    806     >>> TypeCountAlphabet(description='').showCode()
    807        0:0            0101:xx,0101      1011:xxxxx,1011
    808     0001:0001         1101:xxxxxx,1101  0111:xxx,0111
    809     1001:xxxx,1001    0011:x,0011       1111:xxxxxxx,1111
    810     """
    811     decodeTable = {
    812              0b0: 0,   0b1001: 5,
    813           0b0001: 1,   0b1011: 6,
    814           0b0011: 2,   0b1101: 7,
    815           0b0101: 3,   0b1111: 8,
    816           0b0111: 4,
    817           }
    818 
    819     value0 = 1
    820     extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
    821     name = 'BT#'
    822 
    823     def __init__(self, name=None, *, description):
    824         super().__init__(
    825             name,
    826             decodeTable=self.decodeTable,
    827             description=description)
    828 
    829     def mnemonic(self, index):
    830         if index==0: return '0'
    831         if index==1: return '0001'
    832         return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
    833 
    834     def explanation(self, index, extra):
    835         value = self.value(index, extra)
    836         description = self.description
    837         if value==1: description = description[:-1]
    838         return '{}: {} {}'.format(
    839             self.mnemonic(index),
    840             value,
    841             description)
    842 
    843 class BlockTypeAlphabet(Code):
    844     """The block types; this code works for all three kinds.
    845     >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
    846     >>> print(*(x for x in b))
    847     prev +1 #0 #1 #2 #3 #4
    848     """
    849     def __init__(self, name, NBLTYPES, **args):
    850         super().__init__(name, alphabetSize=NBLTYPES+2, **args)
    851         self.NBLTYPES = NBLTYPES
    852 
    853     def mnemonic(self, index):
    854         if index==0: return 'prev'
    855         elif index==1: return '+1'
    856         else: return '#'+str(index-2)
    857 
    858     def value(self, index):
    859         return index-2
    860 
    861     def explanation(self, index):
    862         if index==0: return '0: previous'
    863         elif index==1: return '1: increment'
    864         else: return 'Set block type to: '+str(index-2)
    865 
    866 class BlockCountAlphabet(Enumerator):
    867     """Block counts
    868     >>> b = BlockCountAlphabet('L')
    869     >>> print(b[25])
    870     [24*x]: BC16625-16793840
    871     """
    872 
    873     value0 = 1
    874     extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
    875     def __init__(self, name, **args):
    876         super().__init__(name, alphabetSize=26, **args)
    877 
    878     def mnemonic(self, index):
    879         extraBits = self.extraBits(index)
    880         return '{}: BC{}-{}'.format(
    881             'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
    882             *self.span(index))
    883 
    884     def explanation(self, index, extra):
    885         return 'Block count: '+super().explanation(index, extra)
    886 
    887 class DistanceParamAlphabet(WithExtra):
    888     """The distance parameters NPOSTFIX and NDIRECT.
    889     Although these are treated as two in the description, this is easier.
    890     """
    891     def __init__(self):
    892         super().__init__('DIST', bitLength=2)
    893 
    894     def extraBits(self, index):
    895         return 4
    896 
    897     def value(self, index, extra):
    898         """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
    899         """
    900         if extra>15:
    901             raise ValueError('value: extra out of range')
    902         return index, extra<<index
    903 
    904     def explanation(self, index, extra):
    905         return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
    906             index, extra, index, extra<<index)
    907 
    908     def mnemonic(self, index):
    909         return 'PF'+str(index)
    910 
    911 class LiteralContextMode(Code):
    912     """For the literal context modes.
    913     >>> LiteralContextMode().showCode()
    914     00:LSB6   01:MSB6   10:UTF8   11:Signed
    915     >>> LiteralContextMode().explanation(2)
    916     'Context mode for type 9: 2(UTF8)'
    917     """
    918 
    919     def __init__(self, *, number=9):
    920         super().__init__('LC'+str(number), bitLength=2)
    921         self.number = number
    922 
    923     def mnemonic(self, index):
    924         return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
    925 
    926     def explanation(self, index):
    927         return 'Context mode for type {}: {}({})'.format(
    928             self.number,
    929             index,
    930             self.mnemonic(index))
    931 
    932 class RLEmaxAlphabet(Enumerator):
    933     """Used for describing the run length encoding used for describing context maps.
    934     >>> RLEmaxAlphabet().showCode()
    935     0:1    1:more
    936     """
    937     value0 = 0
    938     extraTable = [0, 4]
    939     name = 'RLE#'
    940 
    941     def mnemonic(self, index):
    942         return ['1', 'more'][index]
    943 
    944     def explanation(self, index, extra):
    945         description = self.description and self.description+': '
    946         if index==0: return description+'No RLE coding'
    947         return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
    948 
    949 class TreeAlphabet(WithExtra):
    950     """The alphabet to enumerate entries (called trees) in the context map.
    951     parameters are RLEMAX and NTREES
    952     >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
    953     >>> len(t)
    954     8
    955     >>> print(t[2])
    956     xx+4 zeroes
    957     >>> t[3].explanation(2)
    958     '8+010=10 zeroes'
    959     >>> t[0].value(0)
    960     (1, 0)
    961     """
    962     name = 'CMI'
    963     def __init__(self, name=None, *, RLEMAX, NTREES, **args):
    964         super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
    965         self.RLEMAX = RLEMAX
    966         self.NTREES = NTREES
    967 
    968     def extraBits(self, index):
    969         if 0<index<=self.RLEMAX: return index
    970         return 0
    971 
    972     def mnemonic(self, index):
    973         if index==0: return 'map #0'
    974         if index<=self.RLEMAX:
    975             return '{}+{} zeroes'.format('x'*index, 1<<index)
    976         return 'map #{}'.format(index-self.RLEMAX)
    977 
    978     def value(self, index, extra):
    979         """Give count and value."""
    980         index = index
    981         if index==0: return 1, 0
    982         if index<=self.RLEMAX: return (1<<index)+extra, 0
    983         return 1, index-self.RLEMAX
    984 
    985     def explanation(self, index, extra):
    986         description = self.description and self.description+': '
    987         if index==0: return description+'map #0'
    988         if index<=self.RLEMAX:
    989             return '{}+{:0{}b}={} zeroes'.format(
    990                 (1<<index),
    991                 extra, self.extraBits(index),
    992                 (1<<index)+extra)
    993         return '{}map #{}-{}={}'.format(
    994             description,
    995             index, self.RLEMAX, index-self.RLEMAX)
    996 
    997 #Prefix alphabets for the data stream----------------------------------
    998 class LiteralAlphabet(Code):
    999     """Alphabet of symbols.
   1000     """
   1001     minLength = maxLength = 8
   1002     def __init__(self, number):
   1003         super().__init__('L'+str(number), alphabetSize=1<<8)
   1004 
   1005     def mnemonic(self, index):
   1006         return outputCharFormatter(index)
   1007 
   1008     def value(self, index, extra=None):
   1009         return index
   1010 
   1011     def explanation(self, index, extra=None):
   1012         return self.mnemonic(index)
   1013 
   1014 class InsertLengthAlphabet(Enumerator):
   1015     """Intern code for insert counts
   1016     """
   1017     value0 = 0
   1018     extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
   1019 
   1020 class CopyLengthAlphabet(Enumerator):
   1021     value0 = 2
   1022     extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
   1023 
   1024 class InsertAndCopyAlphabet(WithExtra):
   1025     """The insert and copy code
   1026     >>> for x in range(0,704,704//13):
   1027     ...    print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
   1028              0 I0C2&D=0
   1029         110110 I6+xC8&D=0
   1030        1101100 I5C22+xxx&D=0
   1031       10100010 I4C4
   1032       11011000 I3C10+x
   1033      100001110 I14+xxC8
   1034      101000100 I10+xxC22+xxx
   1035      101111010 I98+xxxxxC14+xx
   1036      110110000 I6+xC70+xxxxx
   1037      111100110 I1090+[10*x]C8
   1038     1000011100 I26+xxxC326+[8*x]
   1039     1001010010 I322+[8*x]C14+xx
   1040     1010001000 I194+[7*x]C70+xxxxx
   1041     1010111110 I22594+[24*x]C1094+[10*x]
   1042     """
   1043     insertLengthAlphabet = InsertLengthAlphabet(None)
   1044     copyLengthAlphabet = CopyLengthAlphabet(None)
   1045 
   1046     def __init__(self, number=''):
   1047         super().__init__('IC'+str(number), bitLength=10)
   1048 
   1049     def __len__(self):
   1050         return 704
   1051 
   1052     def extraBits(self, index):
   1053         insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
   1054         return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
   1055             CopyLengthAlphabet.extraTable[copySymbol.index]
   1056 
   1057     def splitSymbol(self, index):
   1058         """Give relevant values for computations:
   1059         (insertSymbol, copySymbol, dist0flag)
   1060         """
   1061         #determine insert and copy upper bits from table
   1062         row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
   1063         col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
   1064         #determine inserts and copy sub codes
   1065         insertLengthCode = row<<3 | index>>3&7
   1066         if row: insertLengthCode -= 8
   1067         copyLengthCode = col<<3 | index&7
   1068         return (
   1069             Symbol(self.insertLengthAlphabet, insertLengthCode),
   1070             Symbol(self.copyLengthAlphabet, copyLengthCode),
   1071             row==0
   1072             )
   1073 
   1074     def mnemonic(self, index):
   1075         """Make a nice mnemonic
   1076         """
   1077         i,c,d0 = self.splitSymbol(index)
   1078         iLower, _ = i.code.span(i.index)
   1079         iExtra = i.extraBits()
   1080         cLower, _ = c.code.span(c.index)
   1081         cExtra = c.extraBits()
   1082         return 'I{}{}{}C{}{}{}{}'.format(
   1083             iLower,
   1084             '+' if iExtra else '',
   1085             'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
   1086             cLower,
   1087             '+' if cExtra else '',
   1088             'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
   1089             '&D=0' if d0 else '')
   1090 
   1091     def value(self, index, extra):
   1092         i,c,d0 = self.splitSymbol(index)
   1093         iExtra = i.extraBits()
   1094         ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
   1095         insert = i.value(ie)
   1096         copy = c.value(ce)
   1097         return insert, copy, d0
   1098 
   1099     def explanation(self, index, extra):
   1100         insert, copy, d0 = self.value(index, extra)
   1101         if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
   1102         else: return 'Literal: {}, copy: {}'.format(insert, copy)
   1103 
   1104 class DistanceAlphabet(WithExtra):
   1105     """Represent the distance encoding.
   1106     Dynamically generated alphabet.
   1107     This is what the documentation should have said:
   1108     Ignoring offsets for the moment, the "long" encoding works as follows:
   1109     Write the distance in binary as follows:
   1110     1xy..yz..z, then the distance symbol consists of n..nxz..z
   1111     Where:
   1112     n is one less than number of bits in y
   1113     x is a single bit
   1114     y..y are n+1 extra bits (encoded in the bit stream)
   1115     z..z is NPOSTFIX bits that are part of the symbol
   1116     The offsets are so as to start at the lowest useable value:
   1117     if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
   1118     then n..nxz..z is symbol -NDIRECT-16
   1119     >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
   1120     >>> print(d[4], d[17], d[34])
   1121     last-1 1 10xx00-5
   1122     >>> [str(d[x]) for x in range(26, 32)]
   1123     ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
   1124     """
   1125     def __init__(self, number, *, NPOSTFIX, NDIRECT):
   1126         self.NPOSTFIX = NPOSTFIX
   1127         self.NDIRECT = NDIRECT
   1128         #set length
   1129         #Actually, not all symbols are used,
   1130         #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
   1131         super().__init__('D'+str(number),
   1132             alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
   1133 
   1134     def extraBits(self, index):
   1135         """Indicate how many extra bits are needed to interpret symbol
   1136         >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
   1137         >>> [d[i].extraBits() for i in range(26)]
   1138         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   1139         >>> [d[i].extraBits() for i in range(26,36)]
   1140         [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
   1141         """
   1142         if index<16+self.NDIRECT: return 0
   1143         return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
   1144 
   1145     def value(self, dcode, dextra):
   1146         """Decode value of symbol together with the extra bits.
   1147         >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
   1148         >>> d[34].value(2)
   1149         (0, 35)
   1150         """
   1151         if dcode<16:
   1152             return [(1,0),(2,0),(3,0),(4,0),
   1153                     (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
   1154                     (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
   1155                 ][dcode]
   1156         if dcode<16+self.NDIRECT:
   1157             return (0,dcode-16)
   1158         #we use the original formulas, instead of my clear explanation
   1159         POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
   1160         ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
   1161         hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
   1162         lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
   1163         offset = ((2 + (hcode & 1)) << ndistbits) - 4
   1164         distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
   1165         return (0,distance)
   1166 
   1167     def mnemonic(self, index, verbose=False):
   1168         """Give mnemonic representation of meaning.
   1169         verbose compresses strings of x's
   1170         """
   1171         if index<16:
   1172             return ['last', '2last', '3last', '4last',
   1173                 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
   1174                 '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
   1175                 ][index]
   1176         if index<16+self.NDIRECT:
   1177             return str(index-16)
   1178         #construct strings like "1xx01-15"
   1179         index -= self.NDIRECT+16
   1180         hcode = index >> self.NPOSTFIX
   1181         lcode = index & (1<<self.NPOSTFIX)-1
   1182         if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
   1183         else: formatString = '1{0}{1}{4:+d}'
   1184         return formatString.format(
   1185             hcode&1,
   1186             'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
   1187             lcode, self.NPOSTFIX,
   1188             self.NDIRECT+1-(4<<self.NPOSTFIX))
   1189 
   1190     def explanation(self, index, extra):
   1191         """
   1192         >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
   1193         >>> d[55].explanation(13)
   1194         '11[1101]01-5: [0]+240'
   1195         """
   1196         extraBits = self.extraBits(index)
   1197         extraString = '[{:0{}b}]'.format(extra, extraBits)
   1198         return '{0}: [{1[0]}]{1[1]:+d}'.format(
   1199             self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
   1200             self.value(index, extra))
   1201 
   1202 #Classes for doing actual work------------------------------------------
   1203 class ContextModeKeeper:
   1204     """For computing the literal context mode.
   1205     You feed it characters, and it computes indices in the context map.
   1206     """
   1207     def __init__(self, mode):
   1208         self.chars = deque([0,0], maxlen=2)
   1209         self.mode = mode
   1210 
   1211     def setContextMode(self, mode):
   1212         """Switch to given context mode (0..3)"""
   1213         self.mode = mode
   1214     def getIndex(self):
   1215         if self.mode==0:  #LSB6
   1216             return self.chars[1]&0x3f
   1217         elif self.mode==1: #MSB6
   1218             return self.chars[1]>>2
   1219         elif self.mode==2: #UTF8: character class of previous and a bit of the second
   1220             p2,p1 = self.chars
   1221             return self.lut0[p1]|self.lut1[p2]
   1222         elif self.mode==3: #Signed: initial bits of last two bytes
   1223             p2,p1 = self.chars
   1224             return self.lut2[p1]<<3|self.lut2[p2]
   1225 
   1226     def add(self, index):
   1227         """Adjust the context for output char (as int)."""
   1228         self.chars.append(index)
   1229 
   1230     #0: control     #16: quote  #32: ,:;  #48: AEIOU
   1231     #4: tab/lf/cr   #20: %      #36: .    #52: BC..Z
   1232     #8: space       #24: (<[{   #40: =    #56: aeiou
   1233     #12:!#$&*+-/?@| #28: )>]}   #44: 0-9  #60: bc..z
   1234     lut0 = [0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
   1235             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   1236             8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
   1237            44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
   1238            12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
   1239            52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
   1240            12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
   1241            60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0
   1242            ]+[0,1]*32+[2,3]*32
   1243     #0: space  1:punctuation  2:digit/upper 3:lower
   1244     lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1245              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1246              0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1247              2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
   1248              1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   1249              2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
   1250              1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   1251              3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
   1252            ]+[0]*96+[2]*32
   1253     #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
   1254     lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
   1255     assert len(lut0)==len(lut1)==len(lut2)==256
   1256 
   1257 class WordList:
   1258     """Word list.
   1259     >>> WordList().word(7, 35555)
   1260     b'Program to '
   1261     """
   1262     NDBITS = [0,  0,  0,  0, 10, 10, 11, 11, 10, 10,
   1263              10, 10, 10,  9,  9,  8,  7,  7,  8,  7,
   1264               7,  6,  6,  5,  5]
   1265     def __init__(self):
   1266         self.file = open('dict', 'rb')
   1267         self.compileActions()
   1268 
   1269     def word(self, size, dist):
   1270         """Get word
   1271         """
   1272         #split dist in index and action
   1273         ndbits = self.NDBITS[size]
   1274         index = dist&(1<<ndbits)-1
   1275         action = dist>>ndbits
   1276         #compute position in file
   1277         position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
   1278         self.file.seek(position)
   1279         return self.doAction(self.file.read(size), action)
   1280 
   1281     def upperCase1(self, word):
   1282         word = word.decode('utf8')
   1283         word = word[0].upper()+word[1:]
   1284         return word.encode('utf8')
   1285 
   1286 
   1287     #Super compact form of action table.
   1288     #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
   1289     actionTable = r"""
   1290         0:w        25:w+_for_     50:w+\n\t       75:w+. This_100:w+ize_
   1291         1:w+_      26:w[3:]       51:w+:          76:w+,      101:w.U+.
   1292         2:_+w+_    27:w[:-2]      52:_+w+._       77:.+w+_    102:\xc2\xa0+w
   1293         3:w[1:]    28:w+_a_       53:w+ed_        78:U(w)+(   103:_+w+,
   1294         4:U(w)+_   29:w+_that_    54:w[9:]        79:U(w)+.   104:U(w)+="
   1295         5:w+_the_  30:_+U(w)      55:w[7:]        80:w+_not_  105:w.U+="
   1296         6:_+w      31:w+._        56:w[:-6]       81:_+w+="   106:w+ous_
   1297         7:s_+w+_   32:.+w         57:w+(          82:w+er_    107:w.U+,_
   1298         8:w+_of_   33:_+w+,_      58:U(w)+,_      83:_+w.U+_  108:U(w)+=\'
   1299         9:U(w)     34:w[4:]       59:w[:-8]       84:w+al_    109:_+U(w)+,
   1300        10:w+_and_  35:w+_with_    60:w+_at_       85:_+w.U    110:_+w.U+="
   1301        11:w[2:]    36:w+\'        61:w+ly_        86:w+=\'    111:_+w.U+,_
   1302        12:w[:-1]   37:w+_from_    62:_the_+w+_of_ 87:w.U+"    112:_+w.U+,
   1303        13:,_+w+_   38:w+_by_      63:w[:-5]       88:U(w)+._  113:w.U+(
   1304        14:w+,_     39:w[5:]       64:w[:-9]       89:_+w+(    114:w.U+._
   1305        15:_+U(w)+_ 40:w[6:]       65:_+U(w)+,_    90:w+ful_   115:_+w.U+.
   1306        16:w+_in_   41:_the_+w     66:U(w)+"       91:_+U(w)+._116:w.U+=\'
   1307        17:w+_to_   42:w[:-4]      67:.+w+(        92:w+ive_   117:_+w.U+._
   1308        18:e_+w+_   43:w+. The_    68:w.U+_        93:w+less_  118:_+U(w)+="
   1309        19:w+"      44:w.U         69:U(w)+">      94:w.U+\'   119:_+w.U+=\'
   1310        20:w+.      45:w+_on_      70:w+="         95:w+est_   120:_+U(w)+=\'
   1311        21:w+">     46:w+_as_      71:_+w+.        96:_+U(w)+.
   1312        22:w+\n     47:w+_is_      72:.com/+w      97:w.U+">
   1313        23:w[:-3]   48:w[:-7]                      98:_+w+=\'
   1314        24:w+]      49:w[:-1]+ing_ 74:U(w)+\'      99:U(w)+,
   1315         """
   1316 
   1317     def compileActions(self):
   1318         """Build the action table from the text above
   1319         """
   1320         import re
   1321         self.actionList = actions = [None]*121
   1322         #Action 73, which is too long, looks like this when expanded:
   1323         actions[73] = "b' the '+w+b' of the '"
   1324         #find out what the columns are
   1325         actionLines = self.actionTable.splitlines()
   1326         colonPositions = [m.start()
   1327             for m in re.finditer(':',actionLines[1])
   1328             ]+[100]
   1329         columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
   1330             for i in range(len(colonPositions)-1)]
   1331         for line in self.actionTable.splitlines(keepends=False):
   1332             for start,end in columns:
   1333                 action = line[start:end]
   1334                 #skip empty actions
   1335                 if not action or action.isspace(): continue
   1336                 #chop it up, and check if the colon is properly placed
   1337                 index, colon, action = action[:3], action[3], action[4:]
   1338                 assert colon==':'
   1339                 #remove filler spaces at right
   1340                 action = action.rstrip()
   1341                 #replace space symbols
   1342                 action = action.replace('_', ' ')
   1343                 wPos = action.index('w')
   1344                 #add quotes around left string when present
   1345                 #translation: any pattern from beginning, up to
   1346                 #(but not including) a + following by a w later on
   1347                 action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
   1348                 #add quotes around right string when present
   1349                 #translation: anything with a w in it, followed by a +
   1350                 #and a pattern up to the end
   1351                 #(there is no variable lookbehind assertion,
   1352                 #so we have to copy the pattern)
   1353                 action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
   1354                 #expand shortcut for uppercaseAll
   1355                 action = action.replace(".U", ".upper()")
   1356                 #store action
   1357                 actions[int(index)] = action
   1358 
   1359     def doAction(self, w, action):
   1360         """Perform the proper action
   1361         """
   1362         #set environment for the UpperCaseFirst
   1363         U = self.upperCase1
   1364         return eval(self.actionList[action], locals())
   1365 
   1366 class Layout:
   1367     """Class to layout the output.
   1368     """
   1369     #display width of hexdata+bitdata
   1370     width = 25
   1371     #general
   1372     def __init__(self, stream):
   1373         self.stream = stream
   1374         self.bitPtr = self.width
   1375 
   1376     def makeHexData(self, pos):
   1377         """Produce hex dump of all data containing the bits
   1378         from pos to stream.pos
   1379         """
   1380         firstAddress = pos+7>>3
   1381         lastAddress = self.stream.pos+7>>3
   1382         return ''.join(map('{:02x} '.format,
   1383             self.stream.data[firstAddress:lastAddress]))
   1384 
   1385     def formatBitData(self, pos, width1, width2=0):
   1386         """Show formatted bit data:
   1387         Bytes are separated by commas
   1388         whole bytes are displayed in hex
   1389         >>> Layout(olleke).formatBitData(6, 2, 16)
   1390         '|00h|2Eh,|00'
   1391         >>> Layout(olleke).formatBitData(4, 1, 0)
   1392         '1'
   1393         """
   1394         result = []
   1395         #make empty prefix code explicit
   1396         if width1==0: result = ['()', ',']
   1397         for width in width1, width2:
   1398             #skip empty width2
   1399             if width==0: continue
   1400             #build result backwards in a list
   1401             while width>0:
   1402                 availableBits = 8-(pos&7)
   1403                 if width<availableBits:
   1404                     #read partial byte, beginning nor ending at boundary
   1405                     data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
   1406                     result.append('{:0{}b}'.format(data, width))
   1407                 elif availableBits<8:
   1408                     #read rest of byte, ending at boundary
   1409                     data = self.stream.data[pos>>3] >> (pos&7)
   1410                     result.append('|{:0{}b}'.format(data, availableBits))
   1411                 else:
   1412                     #read whole byte (in hex), beginning and ending at boundary
   1413                     data = self.stream.data[pos>>3]
   1414                     result.append('|{:02X}h'.format(data))
   1415                 width -= availableBits
   1416                 pos += availableBits
   1417             #if width overshot from the availableBits subtraction, fix it
   1418             pos += width
   1419             #add comma to separate fields
   1420             result.append(',')
   1421         #concatenate pieces, reversed, skipping the last space
   1422         return ''.join(result[-2::-1])
   1423 
   1424     def readPrefixCode(self, alphabet):
   1425         """give alphabet the prefix code that is read from the stream
   1426         Called for the following alphabets, in this order:
   1427         The alphabet in question must have a "logical" order,
   1428         otherwise the assignment of symbols doesn't work.
   1429         """
   1430         mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
   1431         if mode=='Complex':
   1432             #for a complex code, numberOfSymbols means hskip
   1433             self.readComplexCode(numberOfSymbols, alphabet)
   1434             return alphabet
   1435         else:
   1436             table = []
   1437             #Set table of lengths for mnemonic function
   1438             lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
   1439             #adjust mnemonic function of alphabet class
   1440             def myMnemonic(index):
   1441                 return '{} bit{}: {}'.format(
   1442                     lengths[i],
   1443                     '' if lengths[i]==1 else 's',
   1444                     alphabet.__class__.mnemonic(alphabet, index)
   1445                     )
   1446             alphabet.mnemonic = myMnemonic
   1447             for i in range(numberOfSymbols):
   1448                 table.append(self.verboseRead(alphabet, skipExtra=True).index)
   1449             #restore mnemonic
   1450             del alphabet.mnemonic
   1451             if numberOfSymbols==4:
   1452                 #read tree shape to redefine lengths
   1453                 lengths = self.verboseRead(TreeShapeAlhabet())
   1454             #construct the alphabet prefix code
   1455             alphabet.setLength(dict(zip(table, lengths)))
   1456         return alphabet
   1457 
   1458     def readComplexCode(self, hskip, alphabet):
   1459         """Read complex code"""
   1460         stream = self.stream
   1461         #read the lengths for the length code
   1462         lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
   1463         codeLengths = {}
   1464         total = 0
   1465         lol = LengthOfLengthAlphabet('##'+alphabet.name)
   1466         #lengthCode will be used for coding the lengths of the new code
   1467         #we use it for display until now; definition comes below
   1468         lengthCode = LengthAlphabet('#'+alphabet.name)
   1469         lengthIter = iter(lengths)
   1470         while total<32:
   1471             newSymbol = next(lengthIter)
   1472             lol.description = str(lengthCode[newSymbol])
   1473             length = self.verboseRead(lol)
   1474             if length:
   1475                 codeLengths[newSymbol] = length
   1476                 total += 32>>length
   1477             if total>=32:
   1478                 break
   1479         if total>32: raise ValueError("Stream format")
   1480         #Now set the encoding of the lengthCode
   1481         lengthCode.setLength(codeLengths)
   1482         print("***** Lengths for {} will be coded as:".format(alphabet.name))
   1483         lengthCode.showCode()
   1484         #Now determine the symbol lengths with the lengthCode
   1485         symbolLengths = {}
   1486         total = 0
   1487         lastLength = 8
   1488         alphabetIter = iter(alphabet)
   1489         while total<32768:
   1490             #look ahead to see what is going to happen
   1491             length = lengthCode.decodePeek(
   1492                 self.stream.peek(lengthCode.maxLength))[1].index
   1493             #in every branch, set lengthCode.description to explanatory text
   1494             #lengthCode calls format(symbol, extra) with this string
   1495             if length==0:
   1496                 symbol = next(alphabetIter)
   1497                 lengthCode.description = 'symbol {} unused'.format(symbol)
   1498                 self.verboseRead(lengthCode)
   1499                 #unused symbol
   1500                 continue
   1501             if length==16:
   1502                 lengthCode.description = \
   1503                     '{1}+3 symbols of length '+str(lastLength)
   1504                 extra = self.verboseRead(lengthCode)
   1505                 #scan series of 16s (repeat counts)
   1506                 #start with repeat count 2
   1507                 repeat = 2
   1508                 startSymbol = next(alphabetIter)
   1509                 endSymbol = next(alphabetIter)
   1510                 symbolLengths[startSymbol.index] = \
   1511                     symbolLengths[endSymbol.index] = lastLength
   1512                 #count the two just defined symbols
   1513                 total += 2*32768>>lastLength
   1514                 #note: loop may end because we're there
   1515                 #even if a 16 _appears_ to follow
   1516                 while True:
   1517                     #determine last symbol
   1518                     oldRepeat = repeat
   1519                     repeat = (repeat-2<<2)+extra+3
   1520                     #read as many symbols as repeat increased
   1521                     for i in range(oldRepeat, repeat):
   1522                         endSymbol = next(alphabetIter)
   1523                         symbolLengths[endSymbol.index] = lastLength
   1524                     #compute new total; it may be end of loop
   1525                     total += (repeat-oldRepeat)*32768>>lastLength
   1526                     if total>=32768: break
   1527                     #see if there is more to do
   1528                     length = lengthCode.decodePeek(
   1529                         self.stream.peek(lengthCode.maxLength))[1].index
   1530                     if length!=16: break
   1531                     lengthCode.description = 'total {}+{{1}} symbols'.format(
   1532                         (repeat-2<<2)+3)
   1533                     extra = self.verboseRead(lengthCode)
   1534             elif length==17:
   1535                 #read, and show explanation
   1536                 lengthCode.description = '{1}+3 unused'
   1537                 extra = self.verboseRead(lengthCode)
   1538                 #scan series of 17s (groups of zero counts)
   1539                 #start with repeat count 2
   1540                 repeat = 2
   1541                 startSymbol = next(alphabetIter)
   1542                 endSymbol = next(alphabetIter)
   1543                 #note: loop will not end with total==32768,
   1544                 #since total doesn't change here
   1545                 while True:
   1546                     #determine last symbol
   1547                     oldRepeat = repeat
   1548                     repeat = (repeat-2<<3)+extra+3
   1549                     #read as many symbols as repeat increases
   1550                     for i in range(repeat-oldRepeat):
   1551                         endSymbol = next(alphabetIter)
   1552                     #see if there is more to do
   1553                     length = lengthCode.decodePeek(
   1554                         self.stream.peek(lengthCode.maxLength))[1].index
   1555                     if length!=17: break
   1556                     lengthCode.description = 'total {}+{{1}} unused'.format(
   1557                         (repeat-2<<3)+3)
   1558                     extra = self.verboseRead(lengthCode)
   1559             else:
   1560                 symbol = next(alphabetIter)
   1561                 #double braces for format
   1562                 char = str(symbol)
   1563                 if char in '{}': char *= 2
   1564                 lengthCode.description = \
   1565                     'Length for {} is {{0.index}} bits'.format(char)
   1566                 #output is not needed (will be 0)
   1567                 self.verboseRead(lengthCode)
   1568                 symbolLengths[symbol.index] = length
   1569                 total += 32768>>length
   1570                 lastLength = length
   1571         assert total==32768
   1572         alphabet.setLength(symbolLengths)
   1573         print('End of table. Prefix code '+alphabet.name+':')
   1574         alphabet.showCode()
   1575 
   1576     #stream
   1577     def processStream(self):
   1578         """Process a brotli stream.
   1579         """
   1580         print('addr  hex{:{}s}binary context explanation'.format(
   1581             '', self.width-10))
   1582         print('Stream header'.center(60, '-'))
   1583         self.windowSize = self.verboseRead(WindowSizeAlphabet())
   1584         print('Metablock header'.center(60, '='))
   1585         self.ISLAST = False
   1586         self.output = bytearray()
   1587         while not self.ISLAST:
   1588             self.ISLAST = self.verboseRead(
   1589                 BoolCode('LAST', description="Last block"))
   1590             if self.ISLAST:
   1591                 if self.verboseRead(
   1592                     BoolCode('EMPTY', description="Empty block")): break
   1593             if self.metablockLength(): continue
   1594             if not self.ISLAST and self.uncompressed(): continue
   1595             print('Block type descriptors'.center(60, '-'))
   1596             self.numberOfBlockTypes = {}
   1597             self.currentBlockCounts = {}
   1598             self.blockTypeCodes = {}
   1599             self.blockCountCodes = {}
   1600             for blockType in (L,I,D): self.blockType(blockType)
   1601             print('Distance code parameters'.center(60, '-'))
   1602             self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
   1603             self.readLiteralContextModes()
   1604             print('Context maps'.center(60, '-'))
   1605             self.cmaps = {}
   1606             #keep the number of each kind of prefix tree for the last loop
   1607             numberOfTrees = {I: self.numberOfBlockTypes[I]}
   1608             for blockType in (L,D):
   1609                 numberOfTrees[blockType] = self.contextMap(blockType)
   1610             print('Prefix code lists'.center(60, '-'))
   1611             self.prefixCodes = {}
   1612             for blockType in (L,I,D):
   1613                 self.readPrefixArray(blockType, numberOfTrees[blockType])
   1614             self.metablock()
   1615 
   1616     #metablock header
   1617     def verboseRead(self, alphabet, context='', skipExtra=False):
   1618         """Read symbol and extra from stream and explain what happens.
   1619         Returns the value of the symbol
   1620         >>> olleke.pos = 0
   1621         >>> l = Layout(olleke)
   1622         >>> l.verboseRead(WindowSizeAlphabet())
   1623         0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
   1624         4194288
   1625         """
   1626         #TODO 2: verbosity level, e.g. show only codes and maps in header
   1627         stream = self.stream
   1628         pos = stream.pos
   1629         if skipExtra:
   1630             length, symbol = alphabet.readTuple(stream)
   1631             extraBits, extra = 0, None
   1632         else:
   1633             length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
   1634                 stream)
   1635         #fields: address, hex data, binary data, name of alphabet, explanation
   1636         hexdata = self.makeHexData(pos)
   1637         addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
   1638         bitdata = self.formatBitData(pos, length, extraBits)
   1639         #bitPtr moves bitdata so that the bytes are easier to read
   1640         #jump back to right if a new byte starts
   1641         if '|' in bitdata[1:]:
   1642             #start over on the right side
   1643             self.bitPtr = self.width
   1644         fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
   1645         if fillWidth<0: fillWidth = 0
   1646         print('{:<5s} {:<{}s} {:7s} {}'.format(
   1647             addressField,
   1648             hexdata+' '*fillWidth+bitdata, self.width,
   1649             context+alphabet.name,
   1650             symbol if skipExtra else symbol.explanation(extra),
   1651             ))
   1652         #jump to the right if we started with a '|'
   1653         #because we didn't jump before printing
   1654         if bitdata.startswith('|'): self.bitPtr = self.width
   1655         else: self.bitPtr -= len(bitdata)
   1656         return symbol if skipExtra else symbol.value(extra)
   1657 
   1658     def metablockLength(self):
   1659         """Read MNIBBLES and meta block length;
   1660         if empty block, skip block and return true.
   1661         """
   1662         self.MLEN = self.verboseRead(MetablockLengthAlphabet())
   1663         if self.MLEN:
   1664             return False
   1665         #empty block; skip and return False
   1666         self.verboseRead(ReservedAlphabet())
   1667         MSKIP = self.verboseRead(SkipLengthAlphabet())
   1668         self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
   1669         self.stream.pos += 8*MSKIP
   1670         print("Skipping to {:x}".format(self.stream.pos>>3))
   1671         return True
   1672 
   1673     def uncompressed(self):
   1674         """If true, handle uncompressed data
   1675         """
   1676         ISUNCOMPRESSED = self.verboseRead(
   1677             BoolCode('UNCMPR', description='Is uncompressed?'))
   1678         if ISUNCOMPRESSED:
   1679             self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
   1680             print('Uncompressed data:')
   1681             self.output += self.stream.readBytes(self.MLEN)
   1682             print(outputFormatter(self.output[-self.MLEN:]))
   1683         return ISUNCOMPRESSED
   1684 
   1685     def blockType(self, kind):
   1686         """Read block type switch descriptor for given kind of blockType."""
   1687         NBLTYPES = self.verboseRead(TypeCountAlphabet(
   1688             'BT#'+kind[0].upper(),
   1689             description='{} block types'.format(kind),
   1690             ))
   1691         self.numberOfBlockTypes[kind] = NBLTYPES
   1692         if NBLTYPES>=2:
   1693             self.blockTypeCodes[kind] = self.readPrefixCode(
   1694                 BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
   1695             self.blockCountCodes[kind] = self.readPrefixCode(
   1696                 BlockCountAlphabet('BC'+kind[0].upper()))
   1697             blockCount = self.verboseRead(self.blockCountCodes[kind])
   1698         else:
   1699             blockCount = 1<<24
   1700         self.currentBlockCounts[kind] = blockCount
   1701 
   1702     def readLiteralContextModes(self):
   1703         """Read literal context modes.
   1704         LSB6: lower 6 bits of last char
   1705         MSB6: upper 6 bits of last char
   1706         UTF8: rougly dependent on categories:
   1707             upper 4 bits depend on category of last char:
   1708                 control/whitespace/space/ punctuation/quote/%/open/close/
   1709                 comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
   1710             lower 2 bits depend on category of 2nd last char:
   1711                 space/punctuation/digit or upper/lowercase
   1712         signed: hamming weight of last 2 chars
   1713         """
   1714         print('Context modes'.center(60, '-'))
   1715         self.literalContextModes = []
   1716         for i in range(self.numberOfBlockTypes[L]):
   1717             self.literalContextModes.append(
   1718                 self.verboseRead(LiteralContextMode(number=i)))
   1719 
   1720     def contextMap(self, kind):
   1721         """Read context maps
   1722         Returns the number of differnt values on the context map
   1723         (In other words, the number of prefix trees)
   1724         """
   1725         NTREES = self.verboseRead(TypeCountAlphabet(
   1726             kind[0].upper()+'T#',
   1727             description='{} prefix trees'.format(kind)))
   1728         mapSize = {L:64, D:4}[kind]
   1729         if NTREES<2:
   1730             self.cmaps[kind] = [0]*mapSize
   1731         else:
   1732             #read CMAPkind
   1733             RLEMAX = self.verboseRead(RLEmaxAlphabet(
   1734                 'RLE#'+kind[0].upper(),
   1735                 description=kind+' context map'))
   1736             alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
   1737             cmapCode = self.readPrefixCode(alphabet)
   1738             tableSize = mapSize*self.numberOfBlockTypes[kind]
   1739             cmap = []
   1740             while len(cmap)<tableSize:
   1741                 cmapCode.description = 'map {}, entry {}'.format(
   1742                     *divmod(len(cmap), mapSize))
   1743                 count, value = self.verboseRead(cmapCode)
   1744                 cmap.extend([value]*count)
   1745             assert len(cmap)==tableSize
   1746             IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
   1747             if IMTF:
   1748                 self.IMTF(cmap)
   1749             if kind==L:
   1750                 print('Context maps for literal data:')
   1751                 for i in range(0, len(cmap), 64):
   1752                     print(*(
   1753                         ''.join(map(str, cmap[j:j+8]))
   1754                         for j in range(i, i+64, 8)
   1755                         ))
   1756             else:
   1757                 print('Context map for distances:')
   1758                 print(*(
   1759                     ''.join(map('{:x}'.format, cmap[i:i+4]))
   1760                     for i in range(0, len(cmap), 4)
   1761                     ))
   1762             self.cmaps[kind] = cmap
   1763         return NTREES
   1764 
   1765     @staticmethod
   1766     def IMTF(v):
   1767         """In place inverse move to front transform.
   1768         """
   1769         #mtf is initialized virtually with range(infinity)
   1770         mtf = []
   1771         for i, vi in enumerate(v):
   1772             #get old value from mtf. If never seen, take virtual value
   1773             try: value = mtf.pop(vi)
   1774             except IndexError: value = vi
   1775             #put value at front
   1776             mtf.insert(0, value)
   1777             #replace transformed value
   1778             v[i] = value
   1779 
   1780     def readPrefixArray(self, kind, numberOfTrees):
   1781         """Read prefix code array"""
   1782         prefixes = []
   1783         for i in range(numberOfTrees):
   1784             if kind==L: alphabet = LiteralAlphabet(i)
   1785             elif kind==I: alphabet = InsertAndCopyAlphabet(i)
   1786             elif kind==D: alphabet = DistanceAlphabet(
   1787                 i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
   1788             self.readPrefixCode(alphabet)
   1789             prefixes.append(alphabet)
   1790         self.prefixCodes[kind] = prefixes
   1791 
   1792     #metablock data
   1793     def metablock(self):
   1794         """Process the data.
   1795         Relevant variables of self:
   1796         numberOfBlockTypes[kind]: number of block types
   1797         currentBlockTypes[kind]: current block types (=0)
   1798         literalContextModes: the context modes for the literal block types
   1799         currentBlockCounts[kind]: counters for block types
   1800         blockTypeCodes[kind]: code for block type
   1801         blockCountCodes[kind]: code for block count
   1802         cmaps[kind]: the context maps (not for I)
   1803         prefixCodes[kind][#]: the prefix codes
   1804         lastDistances: the last four distances
   1805         lastChars: the last two chars
   1806         output: the result
   1807         """
   1808         print('Meta block contents'.center(60, '='))
   1809         self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
   1810         self.lastDistances = deque([17,16,11,4], maxlen=4)
   1811         #the current context mode is for block type 0
   1812         self.contextMode = ContextModeKeeper(self.literalContextModes[0])
   1813         wordList = WordList()
   1814 
   1815         #setup distance callback function
   1816         def distanceCallback(symbol, extra):
   1817             "callback function for displaying decoded distance"
   1818             index, offset = symbol.value(extra)
   1819             if index:
   1820                 #recent distance
   1821                 distance = self.lastDistances[-index]+offset
   1822                 return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
   1823             #absolute value
   1824             if offset<=maxDistance:
   1825                 return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
   1826             #word list value
   1827             action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
   1828             return '{}-{} gives word {},{} action {}'.format(
   1829                 offset, maxDistance, copyLen, word, action)
   1830         for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
   1831 
   1832         blockLen = 0
   1833         #there we go
   1834         while blockLen<self.MLEN:
   1835             #get insert&copy command
   1836             litLen, copyLen, dist0Flag = self.verboseRead(
   1837                 self.prefixCodes[I][
   1838                     self.figureBlockType(I)])
   1839             #literal data
   1840             for i in range(litLen):
   1841                 bt = self.figureBlockType(L)
   1842                 cm = self.contextMode.getIndex()
   1843                 ct = self.cmaps[L][bt<<6|cm]
   1844                 char = self.verboseRead(
   1845                     self.prefixCodes[L][ct],
   1846                     context='{},{}='.format(bt,cm))
   1847                 self.contextMode.add(char)
   1848                 self.output.append(char)
   1849             blockLen += litLen
   1850             #check if we're done
   1851             if blockLen>=self.MLEN: return
   1852             #distance
   1853             #distances are computed relative to output length, at most window size
   1854             maxDistance = min(len(self.output), self.windowSize)
   1855             if dist0Flag:
   1856                 distance = self.lastDistances[-1]
   1857             else:
   1858                 bt = self.figureBlockType(D)
   1859                 cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
   1860                 ct = self.cmaps[D][bt<<2|cm]
   1861                 index, offset = self.verboseRead(
   1862                     self.prefixCodes[D][ct],
   1863                     context='{},{}='.format(bt,cm))
   1864                 distance = self.lastDistances[-index]+offset if index else offset
   1865                 if index==1 and offset==0:
   1866                     #to make sure distance is not put in last distance list
   1867                     dist0Flag = True
   1868             if distance<=maxDistance:
   1869                 #copy from output
   1870                 for i in range(
   1871                         maxDistance-distance,
   1872                         maxDistance-distance+copyLen):
   1873                     self.output.append(self.output[i])
   1874                 if not dist0Flag: self.lastDistances.append(distance)
   1875                 comment = 'Seen before'
   1876             else:
   1877                 #fetch from wordlist
   1878                 newWord = wordList.word(copyLen, distance-maxDistance-1)
   1879                 self.output.extend(newWord)
   1880                 #adjust copyLen to reflect actual new data
   1881                 copyLen = len(newWord)
   1882                 comment = 'From wordlist'
   1883             blockLen += copyLen
   1884             print(' '*40,
   1885                 comment,
   1886                 ': "',
   1887                 outputFormatter(self.output[-copyLen:]),
   1888                 '"',
   1889                 sep='')
   1890             self.contextMode.add(self.output[-2])
   1891             self.contextMode.add(self.output[-1])
   1892 
   1893     def figureBlockType(self, kind):
   1894         counts, types = self.currentBlockCounts, self.currentBlockTypes
   1895         if counts[kind]==0:
   1896             newType = self.verboseRead(self.blockTypeCodes[kind])
   1897             if newType==-2: newType = types['P'+kind]
   1898             elif newType==-1:
   1899                 newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
   1900             types['P'+kind] = types[kind]
   1901             types[kind] = newType
   1902             counts[kind] = self.verboseRead(self.blockCountCodes[kind])
   1903         counts[kind] -=1
   1904         return types[kind]
   1905 
   1906 __test__ = {
   1907 'BitStream': """
   1908     >>> bs = BitStream(b'Jurjen')
   1909     >>> bs.readBytes(2)
   1910     b'Ju'
   1911     >>> bs.read(6) #r=01110010
   1912     50
   1913     >>> bs
   1914     BitStream(pos=2:6)
   1915     >>> bs.peek(5)  #j=01101010
   1916     9
   1917     >>> bs.readBytes(2)
   1918     Traceback (most recent call last):
   1919         ...
   1920     ValueError: readBytes: need byte boundary
   1921     """,
   1922 
   1923 'Symbol': """
   1924     >>> a=Symbol(MetablockLengthAlphabet(),5)
   1925     >>> len(a)
   1926     2
   1927     >>> int(a)
   1928     5
   1929     >>> a.bitPattern()
   1930     '01'
   1931     >>> a.value(200000)
   1932     200001
   1933     >>> a.explanation(300000)
   1934     'data length: 493e0h+1=300001'
   1935     """,
   1936 
   1937 'RangeDecoder': """
   1938     >>> a=RangeDecoder(bitLength=3)
   1939     >>> len(a)
   1940     8
   1941     >>> a.name='t'
   1942     >>> list(a)
   1943     [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
   1944     >>> a[2]
   1945     Symbol(t, 2)
   1946     >>> a.bitPattern(4)
   1947     '100'
   1948     >>> a.length(2)
   1949     3
   1950     >>> a.decodePeek(15)
   1951     (3, Symbol(t, 7))
   1952     >>>
   1953 
   1954     """,
   1955 
   1956 'PrefixDecoder': """
   1957     >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
   1958     >>> len(a)
   1959     4
   1960     >>> a.name='t'
   1961     >>> list(a)
   1962     [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
   1963     >>> a.decodePeek(22)
   1964     (1, Symbol(t, 1))
   1965     >>> a.decodePeek(27)
   1966     (3, Symbol(t, 3))
   1967     >>> a.length(1)
   1968     1
   1969     >>> a.length(4)
   1970     3
   1971     """,
   1972 
   1973 'Code': """
   1974     >>> a=Code('t',alphabetSize=10)
   1975     >>> len(a)
   1976     10
   1977     >>> a.showCode()
   1978     0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
   1979     >>> a.setLength({2:1,3:2,5:3,6:3})
   1980     >>> a.showCode()
   1981       0:2  01:3 011:5 111:6
   1982     >>> len(a)
   1983     4
   1984     >>> def callback(i): return 'call{}back'.format(i)
   1985     >>> a=Code('t',callback=callback,bitLength=3)
   1986     >>> a[6].explanation()
   1987     'call6back'
   1988     """,
   1989 
   1990 'WithExtra': """
   1991     >>> class A(WithExtra):
   1992     ...    extraTable = [0,1,1,2,2]
   1993     >>> a=A('t',alphabetSize=5)
   1994     >>> a[1]
   1995     Symbol(t, 1)
   1996     >>> a.extraBits(2)
   1997     1
   1998     >>> a.mnemonic(4)
   1999     '4'
   2000     >>> a.readTupleAndExtra(BitStream(b'\x5b'))
   2001     (3, Symbol(t, 3), 2, 3)
   2002     """,
   2003 
   2004 'BoolCode': """
   2005     >>> BoolCode('test')[0].explanation()
   2006     '0: False'
   2007     """,
   2008 
   2009 'Enumerator': """
   2010     >>> class A(Enumerator):
   2011     ...    extraTable = [0,1,1,2,2]
   2012     ...    value0=3
   2013     >>> a=A(alphabetLength=5)
   2014     >>> a.value(3)
   2015     Traceback (most recent call last):
   2016         ...
   2017     TypeError: value() missing 1 required positional argument: 'extra'
   2018     >>> a.explanation(3,4)
   2019     'xx 011: 8-11; 8+4=12'
   2020     """,
   2021 
   2022 'WindowSizeAlphabet': """
   2023     >>> windowSizeAlphabet = WindowSizeAlphabet()
   2024     >>> windowSizeAlphabet[0]
   2025     Traceback (most recent call last):
   2026         ...
   2027     ValueError: No symbol WindowSizeAlphabet[0]
   2028     >>> len(windowSizeAlphabet)
   2029     16
   2030     >>> windowSizeAlphabet[21]
   2031     Symbol(WSIZE, 21)
   2032     >>> windowSizeAlphabet[21].bitPattern()
   2033     '1001'
   2034     >>> windowSizeAlphabet[21].extraBits()
   2035     0
   2036     >>> windowSizeAlphabet[21].index
   2037     21
   2038     >>> windowSizeAlphabet[10].value()
   2039     1008
   2040     >>> windowSizeAlphabet[10].explanation()
   2041     'windowsize=(1<<10)-16=1008'
   2042     >>> windowSizeAlphabet.showCode()
   2043           0:65520    1100001:16368    1110001:32752       0011:262128
   2044     0000001:131056   0010001:None        1001:2097136     1011:4194288
   2045     1000001:4080     1010001:8176        0101:524272      0111:1048560
   2046     0100001:1008     0110001:2032        1101:8388592     1111:16777200
   2047     """,
   2048 
   2049 'TypeCountAlphabet': """
   2050     >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
   2051     >>> len(typeCountAlphabet)
   2052     9
   2053     >>> typeCountAlphabet[3]
   2054     Symbol(BT#, 3)
   2055     >>> typeCountAlphabet[9]
   2056     Traceback (most recent call last):
   2057         ...
   2058     ValueError: No symbol TypeCountAlphabet[9]
   2059     >>> print(typeCountAlphabet[3])
   2060     xx,0101
   2061     >>> typeCountAlphabet[8].value(127)
   2062     256
   2063     >>> typeCountAlphabet[4].explanation(2)
   2064     'xxx,0111: 11 bananas'
   2065     >>> typeCountAlphabet[0].explanation()
   2066     '0: 1 banana'
   2067     """,
   2068 
   2069 'DistanceParamAlphabet': """
   2070     >>> dpa = DistanceParamAlphabet()
   2071     >>> dpa.showCode()
   2072     00:PF0 01:PF1 10:PF2 11:PF3
   2073     >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
   2074     (2, Symbol(DIST, 1), 4, 10)
   2075     >>> dpa.explanation(2, 5)
   2076     '2 postfix bits and 0101<<2=20 direct codes'
   2077     """,
   2078 
   2079 'LiteralAlphabet': """
   2080     >>> LiteralAlphabet(-1).showCode()   #doctest: +ELLIPSIS
   2081     00000000:\\x00 00110100:4    01101000:h    10011100:\\x9c 11010000:\\xd0
   2082     00000001:\\x01 00110101:5    01101001:i    10011101:\\x9d 11010001:\\xd1
   2083     00000010:\\x02 00110110:6    01101010:j    10011110:\\x9e 11010010:\\xd2
   2084     ...
   2085     00101111:/    01100011:c    10010111:\\x97 11001011:\\xcb 11111111:\\xff
   2086     00110000:0    01100100:d    10011000:\\x98 11001100:\\xcc
   2087     00110001:1    01100101:e    10011001:\\x99 11001101:\\xcd
   2088     00110010:2    01100110:f    10011010:\\x9a 11001110:\\xce
   2089     00110011:3    01100111:g    10011011:\\x9b 11001111:\\xcf
   2090     """,
   2091 
   2092 'BlockCountAlphabet': """
   2093     >>> bc=BlockCountAlphabet('BCL')
   2094     >>> len(bc)
   2095     26
   2096     >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
   2097     >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
   2098     'Block count: xx 00000: 1-4; 1+2=3'
   2099     >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
   2100     'Block count: xxx 00110: 33-40; 33+0=33'
   2101     >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
   2102     'Block count: xxxxxx 10001: 305-368; 305+28=333'
   2103     >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
   2104     'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
   2105     """,
   2106 
   2107 'Layout': """
   2108     >>> olleke.pos = 0
   2109     >>> l = Layout(olleke)
   2110     >>> l.verboseRead(WindowSizeAlphabet())
   2111     0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
   2112     4194288
   2113     >>> l.verboseRead(BoolCode('LAST', description="Last block"))
   2114                               1     LAST    Last block: 1: True
   2115     True
   2116     >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
   2117                              0      EMPTY   Empty block: 0: False
   2118     False
   2119     >>> l.verboseRead(MetablockLengthAlphabet())
   2120     0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
   2121     47
   2122     >>> olleke.pos = 76
   2123     >>> l = Layout(olleke)
   2124     >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
   2125     000a  82                10|1100 D0      10[15*x]-3
   2126     >>> x.explanation(0x86a3)
   2127     '10[1000011010100011]-3: [0]+100000'
   2128     """,
   2129 
   2130 'olleke': """
   2131     >>> olleke.pos = 0
   2132     >>> try: Layout(olleke).processStream()
   2133     ... except NotImplementedError: pass
   2134     ... #doctest: +REPORT_NDIFF
   2135     addr  hex               binary context explanation
   2136     -----------------------Stream header------------------------
   2137     0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
   2138     ======================Metablock header======================
   2139                               1     LAST    Last block: 1: True
   2140                              0      EMPTY   Empty block: 0: False
   2141     0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
   2142     -------------------Block type descriptors-------------------
   2143     0003  00                      0 BT#L    0: 1 literal block type
   2144                                  0  BT#I    0: 1 insert&copy block type
   2145                                 0   BT#D    0: 1 distance block type
   2146     ------------------Distance code parameters------------------
   2147     0004  44               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
   2148     -----------------------Context modes------------------------
   2149                          10         LC0     Context mode for type 0: 2(UTF8)
   2150     ------------------------Context maps------------------------
   2151                         0           LT#     0: 1 literal prefix tree
   2152                        0            DT#     0: 1 distance prefix tree
   2153     ---------------------Prefix code lists----------------------
   2154                      10             PFX     L0 is complex with lengths 3,4,0,5,17...
   2155     0005  4f                    1|0 ##L0    len 3: coded with 3 bits
   2156                             0111    ##L0    len 4: coded with 1 bits
   2157                           10        ##L0    unused: coded with 3 bits
   2158     0006  d6                    0|0 ##L0    len 5: skipped
   2159                              011    ##L0    zero xxx: coded with 2 bits
   2160     ***** Lengths for L0 will be coded as:
   2161       0:len 4     01:zero xxx 011:unused   111:len 3
   2162     0007  95                1|11,01 #L0     7+3 unused
   2163                            0        #L0     Length for \\n is 4 bits
   2164                      001,01         #L0     1+3 unused
   2165     0008  44                010,0|1 #L0     total 19+2 unused
   2166                            0        #L0     Length for " " is 4 bits
   2167                           0         #L0     Length for ! is 4 bits
   2168     0009  cb                011,|01 #L0     3+3 unused
   2169                      |110,01        #L0     total 35+6 unused
   2170     000a  82                      0 #L0     Length for K is 4 bits
   2171                             000,01  #L0     0+3 unused
   2172                            0        #L0     Length for O is 4 bits
   2173     000b  4d                   01|1 #L0     symbol P unused
   2174                             011     #L0     symbol Q unused
   2175                            0        #L0     Length for R is 4 bits
   2176     000c  88                000,|01 #L0     0+3 unused
   2177                      |100,01        #L0     total 11+4 unused
   2178     000d  b6                      0 #L0     Length for b is 4 bits
   2179                                011  #L0     symbol c unused
   2180                             011     #L0     symbol d unused
   2181     000e  27                   11|1 #L0     Length for e is 3 bits
   2182                          010,01     #L0     2+3 unused
   2183                        |0           #L0     Length for k is 4 bits
   2184     000f  1f                    111 #L0     Length for l is 3 bits
   2185                              011    #L0     symbol m unused
   2186                             0       #L0     Length for n is 4 bits
   2187                           |0        #L0     Length for o is 4 bits
   2188     0010  c1                 000,01 #L0     0+3 unused
   2189                             0       #L0     Length for s is 4 bits
   2190     0011  b4                   0|11 #L0     symbol t unused
   2191                               0     #L0     Length for u is 4 bits
   2192     End of table. Prefix code L0:
   2193      000:e   0010:\\n  0110:!   0001:O   0101:b   0011:n   0111:s
   2194      100:l   1010:" " 1110:K   1001:R   1101:k   1011:o   1111:u
   2195                          11,01      PFX     IC0 is simple with 4 code words
   2196     0012  2a                |2Ah|10 IC0     ? bits: I5C4
   2197     0013  b5 ec              00|B5h IC0     ? bits: I6+xC7
   2198     0015  22            0010|111011 IC0     ? bits: I8+xC5
   2199     0016  8c            001100|0010 IC0     ? bits: I0C14+xx
   2200                        0            SHAPE   False: lengths 2,2,2,2
   2201     0017  74                 10,0|1 PFX     D0 is simple with 3 code words
   2202     0018  a6                0|01110 D0      1 bit: 2last-3
   2203                       010011        D0      2 bits: 11xx-3
   2204     0019  aa                01010|1 D0      2 bits: 11xxx-3
   2205     ====================Meta block contents=====================
   2206                        |1,01        IC0     Literal: 9, copy: 5
   2207     001a  41                   0001 0,0=L0  O
   2208                             100     0,48=L0 l
   2209     001b  a2                   10|0 0,62=L0 l
   2210                             000     0,63=L0 e
   2211     001c  a1                  1|101 0,59=L0 k
   2212                            000      0,63=L0 e
   2213                       |1010         0,59=L0 " "
   2214     001d  b5                   0101 0,11=L0 b
   2215                           |1011     0,60=L0 o
   2216     001e  24                      0 0,3=D0  Distance: 2last-3=8
   2217                                             Seen before: "lleke"
   2218                               0,10  IC0     Literal: 6, copy: 7
   2219                          |0010      0,59=L0 \\n
   2220     001f  89                   1001 0,7=L0  R
   2221                             000     0,52=L0 e
   2222     0020  fa                  010|1 0,58=L0 b
   2223                           1111      0,63=L0 u
   2224     0021  eb                  011|1 0,59=L0 s
   2225                          11,01      0,3=D0  Absolute value: 12 (pos 8)
   2226                                             Seen before: "olleke\\n"
   2227     0022  db                 01,1|1 IC0     Literal: 0, copy: 15
   2228                       |110,11       0,3=D0  Absolute value: 27 (pos 0)
   2229                                             Seen before: "Olleke bolleke\\n"
   2230     0023  f8                     00 IC0     Literal: 5, copy: 4
   2231                              1110   0,7=L0  K
   2232     0024  2c                  00|11 0,52=L0 n
   2233                           1011      0,62=L0 o
   2234     0025  0d                   1|00 0,59=L0 l
   2235                            0110     0,63=L0 !
   2236     """,
   2237 
   2238 'file': """
   2239     >>> try: Layout(BitStream(
   2240     ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
   2241     ...     .read())).processStream()
   2242     ... except NotImplementedError: pass
   2243     addr  hex               binary context explanation
   2244     -----------------------Stream header------------------------
   2245     0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
   2246     ======================Metablock header======================
   2247                               1     LAST    Last block: 1: True
   2248                              0      EMPTY   Empty block: 0: False
   2249     0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
   2250     -------------------Block type descriptors-------------------
   2251     0003  00                      0 BT#L    0: 1 literal block type
   2252                                  0  BT#I    0: 1 insert&copy block type
   2253                                 0   BT#D    0: 1 distance block type
   2254     ------------------Distance code parameters------------------
   2255     0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
   2256     -----------------------Context modes------------------------
   2257                          10         LC0     Context mode for type 0: 2(UTF8)
   2258     ------------------------Context maps------------------------
   2259                         0           LT#     0: 1 literal prefix tree
   2260                        0            DT#     0: 1 distance prefix tree
   2261     ---------------------Prefix code lists----------------------
   2262     0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
   2263     0006  b2              0|1011000 L0      1 bit: X
   2264     0007  ea              0|1011001 L0      1 bit: Y
   2265                      01,01          PFX     IC0 is simple with 2 code words
   2266     0008  81            0000001|111 IC0     1 bit: I1C9&D=0
   2267     0009  47 02             0|47h|1 IC0     1 bit: I1C9
   2268                        00,01        PFX     D0 is simple with 1 code word
   2269     000b  8a                010|000 D0      0 bits: 10x-3
   2270     ====================Meta block contents=====================
   2271                            1        IC0     Literal: 1, copy: 9
   2272                           0         0,0=L0  X
   2273                       0,()          0,3=D0  Absolute value: 1 (pos 0)
   2274                                             Seen before: "XXXXXXXXX"
   2275                      0              IC0     Literal: 1, copy: 9, same distance
   2276                    |1               0,54=L0 Y
   2277                                             Seen before: "YYYYYYYYY"
   2278     """,
   2279 
   2280 'XY': """
   2281     >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
   2282     ... except NotImplementedError: pass
   2283     addr  hex               binary context explanation
   2284     -----------------------Stream header------------------------
   2285     0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
   2286     ======================Metablock header======================
   2287                               1     LAST    Last block: 1: True
   2288                              0      EMPTY   Empty block: 0: False
   2289     0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
   2290     -------------------Block type descriptors-------------------
   2291     0003  00                      0 BT#L    0: 1 literal block type
   2292                                  0  BT#I    0: 1 insert&copy block type
   2293                                 0   BT#D    0: 1 distance block type
   2294     ------------------Distance code parameters------------------
   2295     0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
   2296     -----------------------Context modes------------------------
   2297                          10         LC0     Context mode for type 0: 2(UTF8)
   2298     ------------------------Context maps------------------------
   2299                         0           LT#     0: 1 literal prefix tree
   2300                        0            DT#     0: 1 distance prefix tree
   2301     ---------------------Prefix code lists----------------------
   2302     0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
   2303     0006  b2              0|1011000 L0      1 bit: X
   2304     0007  82              0|1011001 L0      1 bit: Y
   2305                      00,01          PFX     IC0 is simple with 1 code word
   2306     0008  84            0000100|100 IC0     0 bits: I4C6&D=0
   2307     0009  00                 00,0|1 PFX     D0 is simple with 1 code word
   2308     000a  e0                0|00000 D0      0 bits: last
   2309     ====================Meta block contents=====================
   2310                           ()        IC0     Literal: 4, copy: 6, same distance
   2311                          0          0,0=L0  X
   2312                         0           0,52=L0 X
   2313                        0            0,54=L0 X
   2314                       0             0,54=L0 X
   2315                                             Seen before: "XXXXXX"
   2316                     ()              IC0     Literal: 4, copy: 6, same distance
   2317                    1                0,54=L0 Y
   2318                   1                 0,54=L0 Y
   2319                 |1                  0,54=L0 Y
   2320     000b  01                      1 0,54=L0 Y
   2321                                             Seen before: "YYYYYY"
   2322     """,
   2323 
   2324 'empty': """
   2325     >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
   2326     ... except NotImplementedError: pass
   2327     addr  hex               binary context explanation
   2328     -----------------------Stream header------------------------
   2329     0000  81                0000001 WSIZE   windowsize=(1<<17)-16=131056
   2330     ======================Metablock header======================
   2331                           |1        LAST    Last block: 1: True
   2332     0001  16                      0 EMPTY   Empty block: 0: False
   2333                                 11  MLEN    11: empty block
   2334                                0    RSVD    Reserved (must be zero)
   2335     0002  00           000000|00,01 SKIP    skip length: 0h+1=1
   2336                     |00             SKIP    2 bits ignored
   2337     Skipping to 4
   2338     """,
   2339 
   2340 }
   2341 
   2342 if __name__=='__main__':
   2343     import sys
   2344     if len(sys.argv)>1:
   2345         l = Layout(BitStream(open(sys.argv[1],'rb').read()))
   2346         l.processStream()
   2347     else:
   2348         sys.path.append("h:/Persoonlijk/bin")
   2349         try:
   2350             import brotli
   2351             open('brotlidump.br', 'wb').write(
   2352                 brotli.compress(
   2353                     open('brotlidump.py', 'r').read()
   2354                 ))
   2355             olleke = BitStream(brotli.compress(
   2356                 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
   2357         except ImportError: pass
   2358         import doctest
   2359         doctest.testmod(optionflags=doctest.REPORT_NDIFF
   2360             #|doctest.FAIL_FAST
   2361             )
   2362