Home | History | Annotate | Download | only in Lib
      1 #! /usr/bin/env python3
      2 
      3 """Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings"""
      4 
      5 # Modified 04-Oct-1995 by Jack Jansen to use binascii module
      6 # Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
      7 # Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
      8 
      9 import re
     10 import struct
     11 import binascii
     12 
     13 
# Public names exported by "from base64 import *".  The deprecated aliases
# encodestring() and decodestring() defined further down are not listed.
__all__ = [
    # Legacy interface exports traditional RFC 2045 Base64 encodings
    'encode', 'decode', 'encodebytes', 'decodebytes',
    # Generalized interface for other encodings
    'b64encode', 'b64decode', 'b32encode', 'b32decode',
    'b16encode', 'b16decode',
    # Base85 and Ascii85 encodings
    'b85encode', 'b85decode', 'a85encode', 'a85decode',
    # Standard Base64 encoding
    'standard_b64encode', 'standard_b64decode',
    # Some common Base64 alternatives.  As referenced by RFC 3548, see thread
    # starting at:
    #
    # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html
    'urlsafe_b64encode', 'urlsafe_b64decode',
    ]
     30 
     31 
     32 bytes_types = (bytes, bytearray)  # Types acceptable as binary data
     33 
     34 def _bytes_from_decode_data(s):
     35     if isinstance(s, str):
     36         try:
     37             return s.encode('ascii')
     38         except UnicodeEncodeError:
     39             raise ValueError('string argument should contain only ASCII characters')
     40     if isinstance(s, bytes_types):
     41         return s
     42     try:
     43         return memoryview(s).tobytes()
     44     except TypeError:
     45         raise TypeError("argument should be a bytes-like object or ASCII "
     46                         "string, not %r" % s.__class__.__name__) from None
     47 
     48 
     49 # Base64 encoding/decoding uses binascii
     50 
     51 def b64encode(s, altchars=None):
     52     """Encode the bytes-like object s using Base64 and return a bytes object.
     53 
     54     Optional altchars should be a byte string of length 2 which specifies an
     55     alternative alphabet for the '+' and '/' characters.  This allows an
     56     application to e.g. generate url or filesystem safe Base64 strings.
     57     """
     58     encoded = binascii.b2a_base64(s, newline=False)
     59     if altchars is not None:
     60         assert len(altchars) == 2, repr(altchars)
     61         return encoded.translate(bytes.maketrans(b'+/', altchars))
     62     return encoded
     63 
     64 
     65 def b64decode(s, altchars=None, validate=False):
     66     """Decode the Base64 encoded bytes-like object or ASCII string s.
     67 
     68     Optional altchars must be a bytes-like object or ASCII string of length 2
     69     which specifies the alternative alphabet used instead of the '+' and '/'
     70     characters.
     71 
     72     The result is returned as a bytes object.  A binascii.Error is raised if
     73     s is incorrectly padded.
     74 
     75     If validate is False (the default), characters that are neither in the
     76     normal base-64 alphabet nor the alternative alphabet are discarded prior
     77     to the padding check.  If validate is True, these non-alphabet characters
     78     in the input result in a binascii.Error.
     79     """
     80     s = _bytes_from_decode_data(s)
     81     if altchars is not None:
     82         altchars = _bytes_from_decode_data(altchars)
     83         assert len(altchars) == 2, repr(altchars)
     84         s = s.translate(bytes.maketrans(altchars, b'+/'))
     85     if validate and not re.match(b'^[A-Za-z0-9+/]*={0,2}$', s):
     86         raise binascii.Error('Non-base64 digit found')
     87     return binascii.a2b_base64(s)
     88 
     89 
def standard_b64encode(s):
    """Encode bytes-like object s using the standard Base64 alphabet.

    This is b64encode(s) called without alternative characters.  The result
    is returned as a bytes object.
    """
    return b64encode(s)
     96 
def standard_b64decode(s):
    """Decode bytes encoded with the standard Base64 alphabet.

    Argument s is a bytes-like object or ASCII string to decode.  The result
    is returned as a bytes object.  A binascii.Error is raised if the input
    is incorrectly padded.  Characters that are not in the standard alphabet
    are discarded prior to the padding check.

    This is b64decode(s) called with its default arguments (no alternative
    characters, no validation).
    """
    return b64decode(s)
    106 
    107 
# Byte translation tables between the standard alphabet ('+', '/') and the
# URL-safe alphabet ('-', '_'); built once at import time.
_urlsafe_encode_translation = bytes.maketrans(b'+/', b'-_')
_urlsafe_decode_translation = bytes.maketrans(b'-_', b'+/')
    110 
    111 def urlsafe_b64encode(s):
    112     """Encode bytes using the URL- and filesystem-safe Base64 alphabet.
    113 
    114     Argument s is a bytes-like object to encode.  The result is returned as a
    115     bytes object.  The alphabet uses '-' instead of '+' and '_' instead of
    116     '/'.
    117     """
    118     return b64encode(s).translate(_urlsafe_encode_translation)
    119 
    120 def urlsafe_b64decode(s):
    121     """Decode bytes using the URL- and filesystem-safe Base64 alphabet.
    122 
    123     Argument s is a bytes-like object or ASCII string to decode.  The result
    124     is returned as a bytes object.  A binascii.Error is raised if the input
    125     is incorrectly padded.  Characters that are not in the URL-safe base-64
    126     alphabet, and are not a plus '+' or slash '/', are discarded prior to the
    127     padding check.
    128 
    129     The alphabet uses '-' instead of '+' and '_' instead of '/'.
    130     """
    131     s = _bytes_from_decode_data(s)
    132     s = s.translate(_urlsafe_decode_translation)
    133     return b64decode(s)
    134 
    135 
    136 
    137 # Base32 encoding/decoding must be done in Python
# The 32-character Base32 alphabet; a character's index is its 5-bit value.
_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
# Lookup tables, created on first use by b32encode() and b32decode():
# _b32tab2 holds every two-character pair of the alphabet (encoding),
# _b32rev maps a character's byte value back to its 5-bit value (decoding).
_b32tab2 = None
_b32rev = None
    141 
    142 def b32encode(s):
    143     """Encode the bytes-like object s using Base32 and return a bytes object.
    144     """
    145     global _b32tab2
    146     # Delay the initialization of the table to not waste memory
    147     # if the function is never called
    148     if _b32tab2 is None:
    149         b32tab = [bytes((i,)) for i in _b32alphabet]
    150         _b32tab2 = [a + b for a in b32tab for b in b32tab]
    151         b32tab = None
    152 
    153     if not isinstance(s, bytes_types):
    154         s = memoryview(s).tobytes()
    155     leftover = len(s) % 5
    156     # Pad the last quantum with zero bits if necessary
    157     if leftover:
    158         s = s + b'\0' * (5 - leftover)  # Don't use += !
    159     encoded = bytearray()
    160     from_bytes = int.from_bytes
    161     b32tab2 = _b32tab2
    162     for i in range(0, len(s), 5):
    163         c = from_bytes(s[i: i + 5], 'big')
    164         encoded += (b32tab2[c >> 30] +           # bits 1 - 10
    165                     b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20
    166                     b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30
    167                     b32tab2[c & 0x3ff]           # bits 31 - 40
    168                    )
    169     # Adjust for any leftover partial quanta
    170     if leftover == 1:
    171         encoded[-6:] = b'======'
    172     elif leftover == 2:
    173         encoded[-4:] = b'===='
    174     elif leftover == 3:
    175         encoded[-3:] = b'==='
    176     elif leftover == 4:
    177         encoded[-1:] = b'='
    178     return bytes(encoded)
    179 
    180 def b32decode(s, casefold=False, map01=None):
    181     """Decode the Base32 encoded bytes-like object or ASCII string s.
    182 
    183     Optional casefold is a flag specifying whether a lowercase alphabet is
    184     acceptable as input.  For security purposes, the default is False.
    185 
    186     RFC 3548 allows for optional mapping of the digit 0 (zero) to the
    187     letter O (oh), and for optional mapping of the digit 1 (one) to
    188     either the letter I (eye) or letter L (el).  The optional argument
    189     map01 when not None, specifies which letter the digit 1 should be
    190     mapped to (when map01 is not None, the digit 0 is always mapped to
    191     the letter O).  For security purposes the default is None, so that
    192     0 and 1 are not allowed in the input.
    193 
    194     The result is returned as a bytes object.  A binascii.Error is raised if
    195     the input is incorrectly padded or if there are non-alphabet
    196     characters present in the input.
    197     """
    198     global _b32rev
    199     # Delay the initialization of the table to not waste memory
    200     # if the function is never called
    201     if _b32rev is None:
    202         _b32rev = {v: k for k, v in enumerate(_b32alphabet)}
    203     s = _bytes_from_decode_data(s)
    204     if len(s) % 8:
    205         raise binascii.Error('Incorrect padding')
    206     # Handle section 2.4 zero and one mapping.  The flag map01 will be either
    207     # False, or the character to map the digit 1 (one) to.  It should be
    208     # either L (el) or I (eye).
    209     if map01 is not None:
    210         map01 = _bytes_from_decode_data(map01)
    211         assert len(map01) == 1, repr(map01)
    212         s = s.translate(bytes.maketrans(b'01', b'O' + map01))
    213     if casefold:
    214         s = s.upper()
    215     # Strip off pad characters from the right.  We need to count the pad
    216     # characters because this will tell us how many null bytes to remove from
    217     # the end of the decoded string.
    218     l = len(s)
    219     s = s.rstrip(b'=')
    220     padchars = l - len(s)
    221     # Now decode the full quanta
    222     decoded = bytearray()
    223     b32rev = _b32rev
    224     for i in range(0, len(s), 8):
    225         quanta = s[i: i + 8]
    226         acc = 0
    227         try:
    228             for c in quanta:
    229                 acc = (acc << 5) + b32rev[c]
    230         except KeyError:
    231             raise binascii.Error('Non-base32 digit found') from None
    232         decoded += acc.to_bytes(5, 'big')
    233     # Process the last, partial quanta
    234     if padchars:
    235         acc <<= 5 * padchars
    236         last = acc.to_bytes(5, 'big')
    237         if padchars == 1:
    238             decoded[-5:] = last[:-1]
    239         elif padchars == 3:
    240             decoded[-5:] = last[:-2]
    241         elif padchars == 4:
    242             decoded[-5:] = last[:-3]
    243         elif padchars == 6:
    244             decoded[-5:] = last[:-4]
    245         else:
    246             raise binascii.Error('Incorrect padding')
    247     return bytes(decoded)
    248 
    249 
    250 
    251 # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
    252 # lowercase.  The RFC also recommends against accepting input case
    253 # insensitively.
    254 def b16encode(s):
    255     """Encode the bytes-like object s using Base16 and return a bytes object.
    256     """
    257     return binascii.hexlify(s).upper()
    258 
    259 
    260 def b16decode(s, casefold=False):
    261     """Decode the Base16 encoded bytes-like object or ASCII string s.
    262 
    263     Optional casefold is a flag specifying whether a lowercase alphabet is
    264     acceptable as input.  For security purposes, the default is False.
    265 
    266     The result is returned as a bytes object.  A binascii.Error is raised if
    267     s is incorrectly padded or if there are non-alphabet characters present
    268     in the input.
    269     """
    270     s = _bytes_from_decode_data(s)
    271     if casefold:
    272         s = s.upper()
    273     if re.search(b'[^0-9A-F]', s):
    274         raise binascii.Error('Non-base16 digit found')
    275     return binascii.unhexlify(s)
    276 
    277 #
    278 # Ascii85 encoding/decoding
    279 #
    280 
# Ascii85 lookup tables, created on first use by a85encode(): _a85chars holds
# the 85 single-character byte strings, _a85chars2 every two-character pair.
_a85chars = None
_a85chars2 = None
# Start and end markers that frame data in the Adobe variant of Ascii85.
_A85START = b"<~"
_A85END = b"~>"
    285 
    286 def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    287     # Helper function for a85encode and b85encode
    288     if not isinstance(b, bytes_types):
    289         b = memoryview(b).tobytes()
    290 
    291     padding = (-len(b)) % 4
    292     if padding:
    293         b = b + b'\0' * padding
    294     words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)
    295 
    296     chunks = [b'z' if foldnuls and not word else
    297               b'y' if foldspaces and word == 0x20202020 else
    298               (chars2[word // 614125] +
    299                chars2[word // 85 % 7225] +
    300                chars[word % 85])
    301               for word in words]
    302 
    303     if padding and not pad:
    304         if chunks[-1] == b'z':
    305             chunks[-1] = chars[0] * 5
    306         chunks[-1] = chunks[-1][:-padding]
    307 
    308     return b''.join(chunks)
    309 
    310 def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    311     """Encode bytes-like object b using Ascii85 and return a bytes object.
    312 
    313     foldspaces is an optional flag that uses the special short sequence 'y'
    314     instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
    315     feature is not supported by the "standard" Adobe encoding.
    316 
    317     wrapcol controls whether the output should have newline (b'\\n') characters
    318     added to it. If this is non-zero, each output line will be at most this
    319     many characters long.
    320 
    321     pad controls whether the input is padded to a multiple of 4 before
    322     encoding. Note that the btoa implementation always pads.
    323 
    324     adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    325     which is used by the Adobe implementation.
    326     """
    327     global _a85chars, _a85chars2
    328     # Delay the initialization of tables to not waste memory
    329     # if the function is never called
    330     if _a85chars is None:
    331         _a85chars = [bytes((i,)) for i in range(33, 118)]
    332         _a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]
    333 
    334     result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)
    335 
    336     if adobe:
    337         result = _A85START + result
    338     if wrapcol:
    339         wrapcol = max(2 if adobe else 1, wrapcol)
    340         chunks = [result[i: i + wrapcol]
    341                   for i in range(0, len(result), wrapcol)]
    342         if adobe:
    343             if len(chunks[-1]) + 2 > wrapcol:
    344                 chunks.append(b'')
    345         result = b'\n'.join(chunks)
    346     if adobe:
    347         result += _A85END
    348 
    349     return result
    350 
def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    """Decode the Ascii85 encoded bytes-like object or ASCII string b.

    foldspaces is a flag that specifies whether the 'y' short sequence should be
    accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
    not supported by the "standard" Adobe encoding.

    adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    is framed with <~ and ~>). When it is true the input must end with ~>;
    a leading <~ is stripped if present.

    ignorechars should be a byte string containing characters to ignore from the
    input. This should only contain whitespace characters; the default is
    space, tab, newline, carriage return and vertical tab.

    The result is returned as a bytes object.

    A ValueError is raised if adobe is true and the end marker is missing,
    if a 5-character group encodes a value that does not fit in 32 bits, if
    'z' (or 'y' with foldspaces) appears inside a group, or if the input
    contains a character that is neither an Ascii85 digit nor ignorable.
    """
    b = _bytes_from_decode_data(b)
    if adobe:
        if not b.endswith(_A85END):
            raise ValueError(
                "Ascii85 encoded byte sequences must end "
                "with {!r}".format(_A85END)
                )
        if b.startswith(_A85START):
            b = b[2:-2]  # Strip off start/end markers
        else:
            b = b[:-2]
    #
    # We have to go through this stepwise, so as to ignore spaces and handle
    # special short sequences
    #
    packI = struct.Struct('!I').pack
    decoded = []
    decoded_append = decoded.append
    # Digits of the group currently being collected (at most 5).
    curr = []
    curr_append = curr.append
    curr_clear = curr.clear
    # Four b'u' characters (the highest digit) are appended so that a
    # trailing partial group is completed and decoded; the surplus output
    # bytes are removed after the loop.
    for x in b + b'u' * 4:
        if b'!'[0] <= x <= b'u'[0]:
            curr_append(x)
            if len(curr) == 5:
                # A full group: convert its five base-85 digits to a word.
                acc = 0
                for x in curr:
                    acc = 85 * acc + (x - 33)
                try:
                    decoded_append(packI(acc))
                except struct.error:
                    # The value does not fit into an unsigned 32-bit word.
                    raise ValueError('Ascii85 overflow') from None
                curr_clear()
        elif x == b'z'[0]:
            # 'z' stands for four zero bytes and is only valid between groups.
            if curr:
                raise ValueError('z inside Ascii85 5-tuple')
            decoded_append(b'\0\0\0\0')
        elif foldspaces and x == b'y'[0]:
            # 'y' stands for four spaces and is only valid between groups.
            if curr:
                raise ValueError('y inside Ascii85 5-tuple')
            decoded_append(b'\x20\x20\x20\x20')
        elif x in ignorechars:
            # Skip whitespace
            continue
        else:
            raise ValueError('Non-Ascii85 digit found: %c' % x)

    result = b''.join(decoded)
    # curr now holds whatever is left of the four appended b'u' characters;
    # the number of those that were consumed tells how many output bytes
    # came from padding.
    padding = 4 - len(curr)
    if padding:
        # Throw away the extra padding
        result = result[:-padding]
    return result
    420 
    421 # The following code is originally taken (with permission) from Mercurial
    422 
# The 85-character base85 alphabet; a character's index is its digit value.
_b85alphabet = (b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~")
# Lookup tables, created on first use: _b85chars / _b85chars2 by b85encode()
# (single characters and all two-character pairs), _b85dec by b85decode()
# (byte value -> digit value, None for bytes outside the alphabet).
_b85chars = None
_b85chars2 = None
_b85dec = None
    428 
    429 def b85encode(b, pad=False):
    430     """Encode bytes-like object b in base85 format and return a bytes object.
    431 
    432     If pad is true, the input is padded with b'\\0' so its length is a multiple of
    433     4 bytes before encoding.
    434     """
    435     global _b85chars, _b85chars2
    436     # Delay the initialization of tables to not waste memory
    437     # if the function is never called
    438     if _b85chars is None:
    439         _b85chars = [bytes((i,)) for i in _b85alphabet]
    440         _b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
    441     return _85encode(b, _b85chars, _b85chars2, pad)
    442 
def b85decode(b):
    """Decode the base85-encoded bytes-like object or ASCII string b

    The result is returned as a bytes object.

    A ValueError is raised if the input contains a character outside the
    base85 alphabet, or if a 5-character group encodes a value that does not
    fit in 32 bits.
    """
    global _b85dec
    # Delay the initialization of tables to not waste memory
    # if the function is never called
    if _b85dec is None:
        _b85dec = [None] * 256
        for i, c in enumerate(_b85alphabet):
            _b85dec[c] = i

    b = _bytes_from_decode_data(b)
    # Complete a trailing partial group with b'~' (the last character of the
    # alphabet); the output bytes this adds are removed at the end.
    padding = (-len(b)) % 5
    b = b + b'~' * padding
    out = []
    packI = struct.Struct('!I').pack
    for i in range(0, len(b), 5):
        chunk = b[i:i + 5]
        acc = 0
        try:
            for c in chunk:
                acc = acc * 85 + _b85dec[c]
        except TypeError:
            # A None table entry was used in the arithmetic, i.e. the chunk
            # contains a character outside the alphabet.  Locate it so the
            # error can report its position.
            for j, c in enumerate(chunk):
                if _b85dec[c] is None:
                    raise ValueError('bad base85 character at position %d'
                                    % (i + j)) from None
            # No invalid character found: propagate the original TypeError.
            raise
        try:
            out.append(packI(acc))
        except struct.error:
            # The value does not fit into an unsigned 32-bit word.
            raise ValueError('base85 overflow in hunk starting at byte %d'
                             % i) from None

    result = b''.join(out)
    if padding:
        result = result[:-padding]
    return result
    483 
    484 # Legacy interface.  This code could be cleaned up since I don't believe
    485 # binascii has any line length limitations.  It just doesn't seem worth it
    486 # though.  The files should be opened in binary mode.
    487 
    488 MAXLINESIZE = 76 # Excluding the CRLF
    489 MAXBINSIZE = (MAXLINESIZE//4)*3
    490 
    491 def encode(input, output):
    492     """Encode a file; input and output are binary files."""
    493     while True:
    494         s = input.read(MAXBINSIZE)
    495         if not s:
    496             break
    497         while len(s) < MAXBINSIZE:
    498             ns = input.read(MAXBINSIZE-len(s))
    499             if not ns:
    500                 break
    501             s += ns
    502         line = binascii.b2a_base64(s)
    503         output.write(line)
    504 
    505 
    506 def decode(input, output):
    507     """Decode a file; input and output are binary files."""
    508     while True:
    509         line = input.readline()
    510         if not line:
    511             break
    512         s = binascii.a2b_base64(line)
    513         output.write(s)
    514 
    515 def _input_type_check(s):
    516     try:
    517         m = memoryview(s)
    518     except TypeError as err:
    519         msg = "expected bytes-like object, not %s" % s.__class__.__name__
    520         raise TypeError(msg) from err
    521     if m.format not in ('c', 'b', 'B'):
    522         msg = ("expected single byte elements, not %r from %s" %
    523                                           (m.format, s.__class__.__name__))
    524         raise TypeError(msg)
    525     if m.ndim != 1:
    526         msg = ("expected 1-D data, not %d-D data from %s" %
    527                                           (m.ndim, s.__class__.__name__))
    528         raise TypeError(msg)
    529 
    530 
    531 def encodebytes(s):
    532     """Encode a bytestring into a bytes object containing multiple lines
    533     of base-64 data."""
    534     _input_type_check(s)
    535     pieces = []
    536     for i in range(0, len(s), MAXBINSIZE):
    537         chunk = s[i : i + MAXBINSIZE]
    538         pieces.append(binascii.b2a_base64(chunk))
    539     return b"".join(pieces)
    540 
def encodestring(s):
    """Legacy alias of encodebytes().

    Emits a DeprecationWarning and then returns encodebytes(s).
    """
    import warnings
    # stacklevel 2 attributes the warning to the caller of encodestring().
    warnings.warn("encodestring() is a deprecated alias since 3.1, "
                  "use encodebytes()",
                  DeprecationWarning, 2)
    return encodebytes(s)
    548 
    549 
def decodebytes(s):
    """Decode a bytestring of base-64 data into a bytes object.

    The argument is first validated with _input_type_check(), which raises
    TypeError for unsuitable input, and then decoded with
    binascii.a2b_base64().
    """
    _input_type_check(s)
    return binascii.a2b_base64(s)
    554 
def decodestring(s):
    """Legacy alias of decodebytes().

    Emits a DeprecationWarning and then returns decodebytes(s).
    """
    import warnings
    # stacklevel 2 attributes the warning to the caller of decodestring().
    warnings.warn("decodestring() is a deprecated alias since Python 3.1, "
                  "use decodebytes()",
                  DeprecationWarning, 2)
    return decodebytes(s)
    562 
    563 
    564 # Usable as a script...
    565 def main():
    566     """Small main program"""
    567     import sys, getopt
    568     try:
    569         opts, args = getopt.getopt(sys.argv[1:], 'deut')
    570     except getopt.error as msg:
    571         sys.stdout = sys.stderr
    572         print(msg)
    573         print("""usage: %s [-d|-e|-u|-t] [file|-]
    574         -d, -u: decode
    575         -e: encode (default)
    576         -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0])
    577         sys.exit(2)
    578     func = encode
    579     for o, a in opts:
    580         if o == '-e': func = encode
    581         if o == '-d': func = decode
    582         if o == '-u': func = decode
    583         if o == '-t': test(); return
    584     if args and args[0] != '-':
    585         with open(args[0], 'rb') as f:
    586             func(f, sys.stdout.buffer)
    587     else:
    588         func(sys.stdin.buffer, sys.stdout.buffer)
    589 
    590 
def test():
    """Round-trip a sample string through encodebytes() and decodebytes().

    Prints the repr of the original, the encoded and the decoded data, and
    asserts that the decoded data equals the original.
    """
    s0 = b"Aladdin:open sesame"
    print(repr(s0))
    s1 = encodebytes(s0)
    print(repr(s1))
    s2 = decodebytes(s1)
    print(repr(s2))
    assert s0 == s2
    599 
    600 
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
    603