Home | History | Annotate | Download | only in Compiler
      1 #
      2 #   Cython -- encoding related tools
      3 #
      4 
      5 import re
      6 import sys
      7 
      8 if sys.version_info[0] >= 3:
      9     _unicode, _str, _bytes = str, str, bytes
     10     IS_PYTHON3 = True
     11 else:
     12     _unicode, _str, _bytes = unicode, str, str
     13     IS_PYTHON3 = False
     14 
     15 empty_bytes = _bytes()
     16 empty_unicode = _unicode()
     17 
     18 join_bytes = empty_bytes.join
     19 
     20 class UnicodeLiteralBuilder(object):
     21     """Assemble a unicode string.
     22     """
     23     def __init__(self):
     24         self.chars = []
     25 
     26     def append(self, characters):
     27         if isinstance(characters, _bytes):
     28             # this came from a Py2 string literal in the parser code
     29             characters = characters.decode("ASCII")
     30         assert isinstance(characters, _unicode), str(type(characters))
     31         self.chars.append(characters)
     32 
     33     if sys.maxunicode == 65535:
     34         def append_charval(self, char_number):
     35             if char_number > 65535:
     36                 # wide Unicode character on narrow platform => replace
     37                 # by surrogate pair
     38                 char_number -= 0x10000
     39                 self.chars.append( unichr((char_number // 1024) + 0xD800) )
     40                 self.chars.append( unichr((char_number  % 1024) + 0xDC00) )
     41             else:
     42                 self.chars.append( unichr(char_number) )
     43     else:
     44         def append_charval(self, char_number):
     45             self.chars.append( unichr(char_number) )
     46 
     47     def append_uescape(self, char_number, escape_string):
     48         self.append_charval(char_number)
     49 
     50     def getstring(self):
     51         return EncodedString(u''.join(self.chars))
     52 
     53     def getstrings(self):
     54         return (None, self.getstring())
     55 
     56 
     57 class BytesLiteralBuilder(object):
     58     """Assemble a byte string or char value.
     59     """
     60     def __init__(self, target_encoding):
     61         self.chars = []
     62         self.target_encoding = target_encoding
     63 
     64     def append(self, characters):
     65         if isinstance(characters, _unicode):
     66             characters = characters.encode(self.target_encoding)
     67         assert isinstance(characters, _bytes), str(type(characters))
     68         self.chars.append(characters)
     69 
     70     def append_charval(self, char_number):
     71         self.chars.append( unichr(char_number).encode('ISO-8859-1') )
     72 
     73     def append_uescape(self, char_number, escape_string):
     74         self.append(escape_string)
     75 
     76     def getstring(self):
     77         # this *must* return a byte string!
     78         s = BytesLiteral(join_bytes(self.chars))
     79         s.encoding = self.target_encoding
     80         return s
     81 
     82     def getchar(self):
     83         # this *must* return a byte string!
     84         return self.getstring()
     85 
     86     def getstrings(self):
     87         return (self.getstring(), None)
     88 
     89 class StrLiteralBuilder(object):
     90     """Assemble both a bytes and a unicode representation of a string.
     91     """
     92     def __init__(self, target_encoding):
     93         self._bytes   = BytesLiteralBuilder(target_encoding)
     94         self._unicode = UnicodeLiteralBuilder()
     95 
     96     def append(self, characters):
     97         self._bytes.append(characters)
     98         self._unicode.append(characters)
     99 
    100     def append_charval(self, char_number):
    101         self._bytes.append_charval(char_number)
    102         self._unicode.append_charval(char_number)
    103 
    104     def append_uescape(self, char_number, escape_string):
    105         self._bytes.append(escape_string)
    106         self._unicode.append_charval(char_number)
    107 
    108     def getstrings(self):
    109         return (self._bytes.getstring(), self._unicode.getstring())
    110 
    111 
    112 class EncodedString(_unicode):
    113     # unicode string subclass to keep track of the original encoding.
    114     # 'encoding' is None for unicode strings and the source encoding
    115     # otherwise
    116     encoding = None
    117 
    118     def __deepcopy__(self, memo):
    119         return self
    120 
    121     def byteencode(self):
    122         assert self.encoding is not None
    123         return self.encode(self.encoding)
    124 
    125     def utf8encode(self):
    126         assert self.encoding is None
    127         return self.encode("UTF-8")
    128 
    129     @property
    130     def is_unicode(self):
    131         return self.encoding is None
    132 
    133     def contains_surrogates(self):
    134         return string_contains_surrogates(self)
    135 
    136 
    137 def string_contains_surrogates(ustring):
    138     """
    139     Check if the unicode string contains surrogate code points
    140     on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    141     Unicode, i.e. characters that would be spelled as two
    142     separate code units on a narrow platform.
    143     """
    144     for c in map(ord, ustring):
    145         if c > 65535:  # can only happen on wide platforms
    146             return True
    147         if 0xD800 <= c <= 0xDFFF:
    148             return True
    149     return False
    150 
    151 
    152 class BytesLiteral(_bytes):
    153     # bytes subclass that is compatible with EncodedString
    154     encoding = None
    155 
    156     def __deepcopy__(self, memo):
    157         return self
    158 
    159     def byteencode(self):
    160         if IS_PYTHON3:
    161             return _bytes(self)
    162         else:
    163             # fake-recode the string to make it a plain bytes object
    164             return self.decode('ISO-8859-1').encode('ISO-8859-1')
    165 
    166     def utf8encode(self):
    167         assert False, "this is not a unicode string: %r" % self
    168 
    169     def __str__(self):
    170         """Fake-decode the byte string to unicode to support %
    171         formatting of unicode strings.
    172         """
    173         return self.decode('ISO-8859-1')
    174 
    175     is_unicode = False
    176 
    177 
    178 char_from_escape_sequence = {
    179     r'\a' : u'\a',
    180     r'\b' : u'\b',
    181     r'\f' : u'\f',
    182     r'\n' : u'\n',
    183     r'\r' : u'\r',
    184     r'\t' : u'\t',
    185     r'\v' : u'\v',
    186     }.get
    187 
    188 _c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
    189 
    190 
    191 def _to_escape_sequence(s):
    192     if s in '\n\r\t':
    193         return repr(s)[1:-1]
    194     elif s == '"':
    195         return r'\"'
    196     elif s == '\\':
    197         return r'\\'
    198     else:
    199         # within a character sequence, oct passes much better than hex
    200         return ''.join(['\\%03o' % ord(c) for c in s])
    201 
    202 
    203 def _build_specials_replacer():
    204     subexps = []
    205     replacements = {}
    206     for special in _c_special:
    207         regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
    208         subexps.append(regexp)
    209         replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    210     sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
    211     def replace_specials(m):
    212         return replacements[m.group(1)]
    213     def replace(s):
    214         return sub(replace_specials, s)
    215     return replace
    216 
    217 _replace_specials = _build_specials_replacer()
    218 
    219 
    220 def escape_char(c):
    221     if IS_PYTHON3:
    222         c = c.decode('ISO-8859-1')
    223     if c in '\n\r\t\\':
    224         return repr(c)[1:-1]
    225     elif c == "'":
    226         return "\\'"
    227     n = ord(c)
    228     if n < 32 or n > 127:
    229         # hex works well for characters
    230         return "\\x%02X" % n
    231     else:
    232         return c
    233 
    234 def escape_byte_string(s):
    235     """Escape a byte string so that it can be written into C code.
    236     Note that this returns a Unicode string instead which, when
    237     encoded as ISO-8859-1, will result in the correct byte sequence
    238     being written.
    239     """
    240     s = _replace_specials(s)
    241     try:
    242         return s.decode("ASCII") # trial decoding: plain ASCII => done
    243     except UnicodeDecodeError:
    244         pass
    245     if IS_PYTHON3:
    246         s_new = bytearray()
    247         append, extend = s_new.append, s_new.extend
    248         for b in s:
    249             if b >= 128:
    250                 extend(('\\%3o' % b).encode('ASCII'))
    251             else:
    252                 append(b)
    253         return s_new.decode('ISO-8859-1')
    254     else:
    255         l = []
    256         append = l.append
    257         for c in s:
    258             o = ord(c)
    259             if o >= 128:
    260                 append('\\%3o' % o)
    261             else:
    262                 append(c)
    263         return join_bytes(l).decode('ISO-8859-1')
    264 
    265 def split_string_literal(s, limit=2000):
    266     # MSVC can't handle long string literals.
    267     if len(s) < limit:
    268         return s
    269     else:
    270         start = 0
    271         chunks = []
    272         while start < len(s):
    273             end = start + limit
    274             if len(s) > end-4 and '\\' in s[end-4:end]:
    275                 end -= 4 - s[end-4:end].find('\\') # just before the backslash
    276                 while s[end-1] == '\\':
    277                     end -= 1
    278                     if end == start:
    279                         # must have been a long line of backslashes
    280                         end = start + limit - (limit % 2) - 4
    281                         break
    282             chunks.append(s[start:end])
    283             start = end
    284         return '""'.join(chunks)
    285 
    286 def encode_pyunicode_string(s):
    287     """Create Py_UNICODE[] representation of a given unicode string.
    288     """
    289     s = map(ord, s) + [0]
    290 
    291     if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
    292         utf16, utf32 = [], s
    293         for code_point in s:
    294             if code_point >= 0x10000:  # outside of BMP
    295                 high, low = divmod(code_point - 0x10000, 1024)
    296                 utf16.append(high + 0xD800)
    297                 utf16.append(low + 0xDC00)
    298             else:
    299                 utf16.append(code_point)
    300     else:
    301         utf16, utf32 = s, []
    302         for code_unit in s:
    303             if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
    304                 high, low = utf32[-1], code_unit
    305                 utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
    306             else:
    307                 utf32.append(code_unit)
    308 
    309     if utf16 == utf32:
    310         utf16 = []
    311     return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))
    312