Home | History | Annotate | Download | only in Lib
      1 #! /usr/bin/env python3
      2 
      3 """Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings"""
      4 
      5 # Modified 04-Oct-1995 by Jack Jansen to use binascii module
      6 # Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
      7 # Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
      8 
      9 import re
     10 import struct
     11 import binascii
     12 
     13 
# Public names exported by ``from base64 import *``.
__all__ = [
    # Legacy interface exports traditional RFC 2045 Base64 encodings
    'encode', 'decode', 'encodebytes', 'decodebytes',
    # Generalized interface for other encodings
    'b64encode', 'b64decode', 'b32encode', 'b32decode',
    'b16encode', 'b16decode',
    # Base85 and Ascii85 encodings
    'b85encode', 'b85decode', 'a85encode', 'a85decode',
    # Standard Base64 encoding
    'standard_b64encode', 'standard_b64decode',
    # Some common Base64 alternatives.  As referenced by RFC 3548, see thread
    # starting at:
    #
    # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html
    'urlsafe_b64encode', 'urlsafe_b64decode',
    ]
     30 
     31 
     32 bytes_types = (bytes, bytearray)  # Types acceptable as binary data
     33 
     34 def _bytes_from_decode_data(s):
     35     if isinstance(s, str):
     36         try:
     37             return s.encode('ascii')
     38         except UnicodeEncodeError:
     39             raise ValueError('string argument should contain only ASCII characters')
     40     if isinstance(s, bytes_types):
     41         return s
     42     try:
     43         return memoryview(s).tobytes()
     44     except TypeError:
     45         raise TypeError("argument should be a bytes-like object or ASCII "
     46                         "string, not %r" % s.__class__.__name__) from None
     47 
     48 
     49 # Base64 encoding/decoding uses binascii
     50 
     51 def b64encode(s, altchars=None):
     52     """Encode the bytes-like object s using Base64 and return a bytes object.
     53 
     54     Optional altchars should be a byte string of length 2 which specifies an
     55     alternative alphabet for the '+' and '/' characters.  This allows an
     56     application to e.g. generate url or filesystem safe Base64 strings.
     57     """
     58     encoded = binascii.b2a_base64(s, newline=False)
     59     if altchars is not None:
     60         assert len(altchars) == 2, repr(altchars)
     61         return encoded.translate(bytes.maketrans(b'+/', altchars))
     62     return encoded
     63 
     64 
     65 def b64decode(s, altchars=None, validate=False):
     66     """Decode the Base64 encoded bytes-like object or ASCII string s.
     67 
     68     Optional altchars must be a bytes-like object or ASCII string of length 2
     69     which specifies the alternative alphabet used instead of the '+' and '/'
     70     characters.
     71 
     72     The result is returned as a bytes object.  A binascii.Error is raised if
     73     s is incorrectly padded.
     74 
     75     If validate is False (the default), characters that are neither in the
     76     normal base-64 alphabet nor the alternative alphabet are discarded prior
     77     to the padding check.  If validate is True, these non-alphabet characters
     78     in the input result in a binascii.Error.
     79     """
     80     s = _bytes_from_decode_data(s)
     81     if altchars is not None:
     82         altchars = _bytes_from_decode_data(altchars)
     83         assert len(altchars) == 2, repr(altchars)
     84         s = s.translate(bytes.maketrans(altchars, b'+/'))
     85     if validate and not re.match(b'^[A-Za-z0-9+/]*={0,2}$', s):
     86         raise binascii.Error('Non-base64 digit found')
     87     return binascii.a2b_base64(s)
     88 
     89 
     90 def standard_b64encode(s):
     91     """Encode bytes-like object s using the standard Base64 alphabet.
     92 
     93     The result is returned as a bytes object.
     94     """
     95     return b64encode(s)
     96 
     97 def standard_b64decode(s):
     98     """Decode bytes encoded with the standard Base64 alphabet.
     99 
    100     Argument s is a bytes-like object or ASCII string to decode.  The result
    101     is returned as a bytes object.  A binascii.Error is raised if the input
    102     is incorrectly padded.  Characters that are not in the standard alphabet
    103     are discarded prior to the padding check.
    104     """
    105     return b64decode(s)
    106 
    107 
    108 _urlsafe_encode_translation = bytes.maketrans(b'+/', b'-_')
    109 _urlsafe_decode_translation = bytes.maketrans(b'-_', b'+/')
    110 
    111 def urlsafe_b64encode(s):
    112     """Encode bytes using the URL- and filesystem-safe Base64 alphabet.
    113 
    114     Argument s is a bytes-like object to encode.  The result is returned as a
    115     bytes object.  The alphabet uses '-' instead of '+' and '_' instead of
    116     '/'.
    117     """
    118     return b64encode(s).translate(_urlsafe_encode_translation)
    119 
    120 def urlsafe_b64decode(s):
    121     """Decode bytes using the URL- and filesystem-safe Base64 alphabet.
    122 
    123     Argument s is a bytes-like object or ASCII string to decode.  The result
    124     is returned as a bytes object.  A binascii.Error is raised if the input
    125     is incorrectly padded.  Characters that are not in the URL-safe base-64
    126     alphabet, and are not a plus '+' or slash '/', are discarded prior to the
    127     padding check.
    128 
    129     The alphabet uses '-' instead of '+' and '_' instead of '/'.
    130     """
    131     s = _bytes_from_decode_data(s)
    132     s = s.translate(_urlsafe_decode_translation)
    133     return b64decode(s)
    134 
    135 
    136 
    137 # Base32 encoding/decoding must be done in Python
    138 _b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
    139 _b32tab2 = None
    140 _b32rev = None
    141 
    142 def b32encode(s):
    143     """Encode the bytes-like object s using Base32 and return a bytes object.
    144     """
    145     global _b32tab2
    146     # Delay the initialization of the table to not waste memory
    147     # if the function is never called
    148     if _b32tab2 is None:
    149         b32tab = [bytes((i,)) for i in _b32alphabet]
    150         _b32tab2 = [a + b for a in b32tab for b in b32tab]
    151         b32tab = None
    152 
    153     if not isinstance(s, bytes_types):
    154         s = memoryview(s).tobytes()
    155     leftover = len(s) % 5
    156     # Pad the last quantum with zero bits if necessary
    157     if leftover:
    158         s = s + b'\0' * (5 - leftover)  # Don't use += !
    159     encoded = bytearray()
    160     from_bytes = int.from_bytes
    161     b32tab2 = _b32tab2
    162     for i in range(0, len(s), 5):
    163         c = from_bytes(s[i: i + 5], 'big')
    164         encoded += (b32tab2[c >> 30] +           # bits 1 - 10
    165                     b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20
    166                     b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30
    167                     b32tab2[c & 0x3ff]           # bits 31 - 40
    168                    )
    169     # Adjust for any leftover partial quanta
    170     if leftover == 1:
    171         encoded[-6:] = b'======'
    172     elif leftover == 2:
    173         encoded[-4:] = b'===='
    174     elif leftover == 3:
    175         encoded[-3:] = b'==='
    176     elif leftover == 4:
    177         encoded[-1:] = b'='
    178     return bytes(encoded)
    179 
    180 def b32decode(s, casefold=False, map01=None):
    181     """Decode the Base32 encoded bytes-like object or ASCII string s.
    182 
    183     Optional casefold is a flag specifying whether a lowercase alphabet is
    184     acceptable as input.  For security purposes, the default is False.
    185 
    186     RFC 3548 allows for optional mapping of the digit 0 (zero) to the
    187     letter O (oh), and for optional mapping of the digit 1 (one) to
    188     either the letter I (eye) or letter L (el).  The optional argument
    189     map01 when not None, specifies which letter the digit 1 should be
    190     mapped to (when map01 is not None, the digit 0 is always mapped to
    191     the letter O).  For security purposes the default is None, so that
    192     0 and 1 are not allowed in the input.
    193 
    194     The result is returned as a bytes object.  A binascii.Error is raised if
    195     the input is incorrectly padded or if there are non-alphabet
    196     characters present in the input.
    197     """
    198     global _b32rev
    199     # Delay the initialization of the table to not waste memory
    200     # if the function is never called
    201     if _b32rev is None:
    202         _b32rev = {v: k for k, v in enumerate(_b32alphabet)}
    203     s = _bytes_from_decode_data(s)
    204     if len(s) % 8:
    205         raise binascii.Error('Incorrect padding')
    206     # Handle section 2.4 zero and one mapping.  The flag map01 will be either
    207     # False, or the character to map the digit 1 (one) to.  It should be
    208     # either L (el) or I (eye).
    209     if map01 is not None:
    210         map01 = _bytes_from_decode_data(map01)
    211         assert len(map01) == 1, repr(map01)
    212         s = s.translate(bytes.maketrans(b'01', b'O' + map01))
    213     if casefold:
    214         s = s.upper()
    215     # Strip off pad characters from the right.  We need to count the pad
    216     # characters because this will tell us how many null bytes to remove from
    217     # the end of the decoded string.
    218     l = len(s)
    219     s = s.rstrip(b'=')
    220     padchars = l - len(s)
    221     # Now decode the full quanta
    222     decoded = bytearray()
    223     b32rev = _b32rev
    224     for i in range(0, len(s), 8):
    225         quanta = s[i: i + 8]
    226         acc = 0
    227         try:
    228             for c in quanta:
    229                 acc = (acc << 5) + b32rev[c]
    230         except KeyError:
    231             raise binascii.Error('Non-base32 digit found') from None
    232         decoded += acc.to_bytes(5, 'big')
    233     # Process the last, partial quanta
    234     if l % 8 or padchars not in {0, 1, 3, 4, 6}:
    235         raise binascii.Error('Incorrect padding')
    236     if padchars and decoded:
    237         acc <<= 5 * padchars
    238         last = acc.to_bytes(5, 'big')
    239         leftover = (43 - 5 * padchars) // 8  # 1: 4, 3: 3, 4: 2, 6: 1
    240         decoded[-5:] = last[:leftover]
    241     return bytes(decoded)
    242 
    243 
    244 # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
    245 # lowercase.  The RFC also recommends against accepting input case
    246 # insensitively.
    247 def b16encode(s):
    248     """Encode the bytes-like object s using Base16 and return a bytes object.
    249     """
    250     return binascii.hexlify(s).upper()
    251 
    252 
    253 def b16decode(s, casefold=False):
    254     """Decode the Base16 encoded bytes-like object or ASCII string s.
    255 
    256     Optional casefold is a flag specifying whether a lowercase alphabet is
    257     acceptable as input.  For security purposes, the default is False.
    258 
    259     The result is returned as a bytes object.  A binascii.Error is raised if
    260     s is incorrectly padded or if there are non-alphabet characters present
    261     in the input.
    262     """
    263     s = _bytes_from_decode_data(s)
    264     if casefold:
    265         s = s.upper()
    266     if re.search(b'[^0-9A-F]', s):
    267         raise binascii.Error('Non-base16 digit found')
    268     return binascii.unhexlify(s)
    269 
    270 #
    271 # Ascii85 encoding/decoding
    272 #
    273 
    274 _a85chars = None
    275 _a85chars2 = None
    276 _A85START = b"<~"
    277 _A85END = b"~>"
    278 
    279 def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    280     # Helper function for a85encode and b85encode
    281     if not isinstance(b, bytes_types):
    282         b = memoryview(b).tobytes()
    283 
    284     padding = (-len(b)) % 4
    285     if padding:
    286         b = b + b'\0' * padding
    287     words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)
    288 
    289     chunks = [b'z' if foldnuls and not word else
    290               b'y' if foldspaces and word == 0x20202020 else
    291               (chars2[word // 614125] +
    292                chars2[word // 85 % 7225] +
    293                chars[word % 85])
    294               for word in words]
    295 
    296     if padding and not pad:
    297         if chunks[-1] == b'z':
    298             chunks[-1] = chars[0] * 5
    299         chunks[-1] = chunks[-1][:-padding]
    300 
    301     return b''.join(chunks)
    302 
    303 def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    304     """Encode bytes-like object b using Ascii85 and return a bytes object.
    305 
    306     foldspaces is an optional flag that uses the special short sequence 'y'
    307     instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
    308     feature is not supported by the "standard" Adobe encoding.
    309 
    310     wrapcol controls whether the output should have newline (b'\\n') characters
    311     added to it. If this is non-zero, each output line will be at most this
    312     many characters long.
    313 
    314     pad controls whether the input is padded to a multiple of 4 before
    315     encoding. Note that the btoa implementation always pads.
    316 
    317     adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    318     which is used by the Adobe implementation.
    319     """
    320     global _a85chars, _a85chars2
    321     # Delay the initialization of tables to not waste memory
    322     # if the function is never called
    323     if _a85chars is None:
    324         _a85chars = [bytes((i,)) for i in range(33, 118)]
    325         _a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]
    326 
    327     result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)
    328 
    329     if adobe:
    330         result = _A85START + result
    331     if wrapcol:
    332         wrapcol = max(2 if adobe else 1, wrapcol)
    333         chunks = [result[i: i + wrapcol]
    334                   for i in range(0, len(result), wrapcol)]
    335         if adobe:
    336             if len(chunks[-1]) + 2 > wrapcol:
    337                 chunks.append(b'')
    338         result = b'\n'.join(chunks)
    339     if adobe:
    340         result += _A85END
    341 
    342     return result
    343 
    344 def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    345     """Decode the Ascii85 encoded bytes-like object or ASCII string b.
    346 
    347     foldspaces is a flag that specifies whether the 'y' short sequence should be
    348     accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
    349     not supported by the "standard" Adobe encoding.
    350 
    351     adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    352     is framed with <~ and ~>).
    353 
    354     ignorechars should be a byte string containing characters to ignore from the
    355     input. This should only contain whitespace characters, and by default
    356     contains all whitespace characters in ASCII.
    357 
    358     The result is returned as a bytes object.
    359     """
    360     b = _bytes_from_decode_data(b)
    361     if adobe:
    362         if not b.endswith(_A85END):
    363             raise ValueError(
    364                 "Ascii85 encoded byte sequences must end "
    365                 "with {!r}".format(_A85END)
    366                 )
    367         if b.startswith(_A85START):
    368             b = b[2:-2]  # Strip off start/end markers
    369         else:
    370             b = b[:-2]
    371     #
    372     # We have to go through this stepwise, so as to ignore spaces and handle
    373     # special short sequences
    374     #
    375     packI = struct.Struct('!I').pack
    376     decoded = []
    377     decoded_append = decoded.append
    378     curr = []
    379     curr_append = curr.append
    380     curr_clear = curr.clear
    381     for x in b + b'u' * 4:
    382         if b'!'[0] <= x <= b'u'[0]:
    383             curr_append(x)
    384             if len(curr) == 5:
    385                 acc = 0
    386                 for x in curr:
    387                     acc = 85 * acc + (x - 33)
    388                 try:
    389                     decoded_append(packI(acc))
    390                 except struct.error:
    391                     raise ValueError('Ascii85 overflow') from None
    392                 curr_clear()
    393         elif x == b'z'[0]:
    394             if curr:
    395                 raise ValueError('z inside Ascii85 5-tuple')
    396             decoded_append(b'\0\0\0\0')
    397         elif foldspaces and x == b'y'[0]:
    398             if curr:
    399                 raise ValueError('y inside Ascii85 5-tuple')
    400             decoded_append(b'\x20\x20\x20\x20')
    401         elif x in ignorechars:
    402             # Skip whitespace
    403             continue
    404         else:
    405             raise ValueError('Non-Ascii85 digit found: %c' % x)
    406 
    407     result = b''.join(decoded)
    408     padding = 4 - len(curr)
    409     if padding:
    410         # Throw away the extra padding
    411         result = result[:-padding]
    412     return result
    413 
    414 # The following code is originally taken (with permission) from Mercurial
    415 
    416 _b85alphabet = (b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    417                 b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~")
    418 _b85chars = None
    419 _b85chars2 = None
    420 _b85dec = None
    421 
    422 def b85encode(b, pad=False):
    423     """Encode bytes-like object b in base85 format and return a bytes object.
    424 
    425     If pad is true, the input is padded with b'\\0' so its length is a multiple of
    426     4 bytes before encoding.
    427     """
    428     global _b85chars, _b85chars2
    429     # Delay the initialization of tables to not waste memory
    430     # if the function is never called
    431     if _b85chars is None:
    432         _b85chars = [bytes((i,)) for i in _b85alphabet]
    433         _b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
    434     return _85encode(b, _b85chars, _b85chars2, pad)
    435 
    436 def b85decode(b):
    437     """Decode the base85-encoded bytes-like object or ASCII string b
    438 
    439     The result is returned as a bytes object.
    440     """
    441     global _b85dec
    442     # Delay the initialization of tables to not waste memory
    443     # if the function is never called
    444     if _b85dec is None:
    445         _b85dec = [None] * 256
    446         for i, c in enumerate(_b85alphabet):
    447             _b85dec[c] = i
    448 
    449     b = _bytes_from_decode_data(b)
    450     padding = (-len(b)) % 5
    451     b = b + b'~' * padding
    452     out = []
    453     packI = struct.Struct('!I').pack
    454     for i in range(0, len(b), 5):
    455         chunk = b[i:i + 5]
    456         acc = 0
    457         try:
    458             for c in chunk:
    459                 acc = acc * 85 + _b85dec[c]
    460         except TypeError:
    461             for j, c in enumerate(chunk):
    462                 if _b85dec[c] is None:
    463                     raise ValueError('bad base85 character at position %d'
    464                                     % (i + j)) from None
    465             raise
    466         try:
    467             out.append(packI(acc))
    468         except struct.error:
    469             raise ValueError('base85 overflow in hunk starting at byte %d'
    470                              % i) from None
    471 
    472     result = b''.join(out)
    473     if padding:
    474         result = result[:-padding]
    475     return result
    476 
    477 # Legacy interface.  This code could be cleaned up since I don't believe
    478 # binascii has any line length limitations.  It just doesn't seem worth it
    479 # though.  The files should be opened in binary mode.
    480 
    481 MAXLINESIZE = 76 # Excluding the CRLF
    482 MAXBINSIZE = (MAXLINESIZE//4)*3
    483 
    484 def encode(input, output):
    485     """Encode a file; input and output are binary files."""
    486     while True:
    487         s = input.read(MAXBINSIZE)
    488         if not s:
    489             break
    490         while len(s) < MAXBINSIZE:
    491             ns = input.read(MAXBINSIZE-len(s))
    492             if not ns:
    493                 break
    494             s += ns
    495         line = binascii.b2a_base64(s)
    496         output.write(line)
    497 
    498 
    499 def decode(input, output):
    500     """Decode a file; input and output are binary files."""
    501     while True:
    502         line = input.readline()
    503         if not line:
    504             break
    505         s = binascii.a2b_base64(line)
    506         output.write(s)
    507 
    508 def _input_type_check(s):
    509     try:
    510         m = memoryview(s)
    511     except TypeError as err:
    512         msg = "expected bytes-like object, not %s" % s.__class__.__name__
    513         raise TypeError(msg) from err
    514     if m.format not in ('c', 'b', 'B'):
    515         msg = ("expected single byte elements, not %r from %s" %
    516                                           (m.format, s.__class__.__name__))
    517         raise TypeError(msg)
    518     if m.ndim != 1:
    519         msg = ("expected 1-D data, not %d-D data from %s" %
    520                                           (m.ndim, s.__class__.__name__))
    521         raise TypeError(msg)
    522 
    523 
    524 def encodebytes(s):
    525     """Encode a bytestring into a bytes object containing multiple lines
    526     of base-64 data."""
    527     _input_type_check(s)
    528     pieces = []
    529     for i in range(0, len(s), MAXBINSIZE):
    530         chunk = s[i : i + MAXBINSIZE]
    531         pieces.append(binascii.b2a_base64(chunk))
    532     return b"".join(pieces)
    533 
    534 def encodestring(s):
    535     """Legacy alias of encodebytes()."""
    536     import warnings
    537     warnings.warn("encodestring() is a deprecated alias since 3.1, "
    538                   "use encodebytes()",
    539                   DeprecationWarning, 2)
    540     return encodebytes(s)
    541 
    542 
    543 def decodebytes(s):
    544     """Decode a bytestring of base-64 data into a bytes object."""
    545     _input_type_check(s)
    546     return binascii.a2b_base64(s)
    547 
    548 def decodestring(s):
    549     """Legacy alias of decodebytes()."""
    550     import warnings
    551     warnings.warn("decodestring() is a deprecated alias since Python 3.1, "
    552                   "use decodebytes()",
    553                   DeprecationWarning, 2)
    554     return decodebytes(s)
    555 
    556 
    557 # Usable as a script...
    558 def main():
    559     """Small main program"""
    560     import sys, getopt
    561     try:
    562         opts, args = getopt.getopt(sys.argv[1:], 'deut')
    563     except getopt.error as msg:
    564         sys.stdout = sys.stderr
    565         print(msg)
    566         print("""usage: %s [-d|-e|-u|-t] [file|-]
    567         -d, -u: decode
    568         -e: encode (default)
    569         -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0])
    570         sys.exit(2)
    571     func = encode
    572     for o, a in opts:
    573         if o == '-e': func = encode
    574         if o == '-d': func = decode
    575         if o == '-u': func = decode
    576         if o == '-t': test(); return
    577     if args and args[0] != '-':
    578         with open(args[0], 'rb') as f:
    579             func(f, sys.stdout.buffer)
    580     else:
    581         func(sys.stdin.buffer, sys.stdout.buffer)
    582 
    583 
    584 def test():
    585     s0 = b"Aladdin:open sesame"
    586     print(repr(s0))
    587     s1 = encodebytes(s0)
    588     print(repr(s1))
    589     s2 = decodebytes(s1)
    590     print(repr(s2))
    591     assert s0 == s2
    592 
    593 
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()
    596