Home | History | Annotate | Download | only in encodings
      1 # -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Loewis.
"""
      6 
      7 import codecs
      8 
      9 ##################### Encoding #####################################
     10 
     11 def segregate(str):
     12     """3.1 Basic code point segregation"""
     13     base = []
     14     extended = {}
     15     for c in str:
     16         if ord(c) < 128:
     17             base.append(c)
     18         else:
     19             extended[c] = 1
     20     extended = extended.keys()
     21     extended.sort()
     22     return "".join(base).encode("ascii"),extended
     23 
     24 def selective_len(str, max):
     25     """Return the length of str, considering only characters below max."""
     26     res = 0
     27     for c in str:
     28         if ord(c) < max:
     29             res += 1
     30     return res
     31 
     32 def selective_find(str, char, index, pos):
     33     """Return a pair (index, pos), indicating the next occurrence of
     34     char in str. index is the position of the character considering
     35     only ordinals up to and including char, and pos is the position in
     36     the full string. index/pos is the starting position in the full
     37     string."""
     38 
     39     l = len(str)
     40     while 1:
     41         pos += 1
     42         if pos == l:
     43             return (-1, -1)
     44         c = str[pos]
     45         if c == char:
     46             return index+1, pos
     47         elif c < char:
     48             index += 1
     49 
     50 def insertion_unsort(str, extended):
     51     """3.2 Insertion unsort coding"""
     52     oldchar = 0x80
     53     result = []
     54     oldindex = -1
     55     for c in extended:
     56         index = pos = -1
     57         char = ord(c)
     58         curlen = selective_len(str, char)
     59         delta = (curlen+1) * (char - oldchar)
     60         while 1:
     61             index,pos = selective_find(str,c,index,pos)
     62             if index == -1:
     63                 break
     64             delta += index - oldindex
     65             result.append(delta-1)
     66             oldindex = index
     67             delta = 0
     68         oldchar = char
     69 
     70     return result
     71 
     72 def T(j, bias):
     73     # Punycode parameters: tmin = 1, tmax = 26, base = 36
     74     res = 36 * (j + 1) - bias
     75     if res < 1: return 1
     76     if res > 26: return 26
     77     return res
     78 
     79 digits = "abcdefghijklmnopqrstuvwxyz0123456789"
     80 def generate_generalized_integer(N, bias):
     81     """3.3 Generalized variable-length integers"""
     82     result = []
     83     j = 0
     84     while 1:
     85         t = T(j, bias)
     86         if N < t:
     87             result.append(digits[N])
     88             return result
     89         result.append(digits[t + ((N - t) % (36 - t))])
     90         N = (N - t) // (36 - t)
     91         j += 1
     92 
     93 def adapt(delta, first, numchars):
     94     if first:
     95         delta //= 700
     96     else:
     97         delta //= 2
     98     delta += delta // numchars
     99     # ((base - tmin) * tmax) // 2 == 455
    100     divisions = 0
    101     while delta > 455:
    102         delta = delta // 35 # base - tmin
    103         divisions += 36
    104     bias = divisions + (36 * delta // (delta + 38))
    105     return bias
    106 
    107 
    108 def generate_integers(baselen, deltas):
    109     """3.4 Bias adaptation"""
    110     # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    111     result = []
    112     bias = 72
    113     for points, delta in enumerate(deltas):
    114         s = generate_generalized_integer(delta, bias)
    115         result.extend(s)
    116         bias = adapt(delta, points==0, baselen+points+1)
    117     return "".join(result)
    118 
    119 def punycode_encode(text):
    120     base, extended = segregate(text)
    121     base = base.encode("ascii")
    122     deltas = insertion_unsort(text, extended)
    123     extended = generate_integers(len(base), deltas)
    124     if base:
    125         return base + "-" + extended
    126     return extended
    127 
    128 ##################### Decoding #####################################
    129 
    130 def decode_generalized_number(extended, extpos, bias, errors):
    131     """3.3 Generalized variable-length integers"""
    132     result = 0
    133     w = 1
    134     j = 0
    135     while 1:
    136         try:
    137             char = ord(extended[extpos])
    138         except IndexError:
    139             if errors == "strict":
    140                 raise UnicodeError, "incomplete punicode string"
    141             return extpos + 1, None
    142         extpos += 1
    143         if 0x41 <= char <= 0x5A: # A-Z
    144             digit = char - 0x41
    145         elif 0x30 <= char <= 0x39:
    146             digit = char - 22 # 0x30-26
    147         elif errors == "strict":
    148             raise UnicodeError("Invalid extended code point '%s'"
    149                                % extended[extpos])
    150         else:
    151             return extpos, None
    152         t = T(j, bias)
    153         result += digit * w
    154         if digit < t:
    155             return extpos, result
    156         w = w * (36 - t)
    157         j += 1
    158 
    159 
    160 def insertion_sort(base, extended, errors):
    161     """3.2 Insertion unsort coding"""
    162     char = 0x80
    163     pos = -1
    164     bias = 72
    165     extpos = 0
    166     while extpos < len(extended):
    167         newpos, delta = decode_generalized_number(extended, extpos,
    168                                                   bias, errors)
    169         if delta is None:
    170             # There was an error in decoding. We can't continue because
    171             # synchronization is lost.
    172             return base
    173         pos += delta+1
    174         char += pos // (len(base) + 1)
    175         if char > 0x10FFFF:
    176             if errors == "strict":
    177                 raise UnicodeError, ("Invalid character U+%x" % char)
    178             char = ord('?')
    179         pos = pos % (len(base) + 1)
    180         base = base[:pos] + unichr(char) + base[pos:]
    181         bias = adapt(delta, (extpos == 0), len(base))
    182         extpos = newpos
    183     return base
    184 
    185 def punycode_decode(text, errors):
    186     pos = text.rfind("-")
    187     if pos == -1:
    188         base = ""
    189         extended = text
    190     else:
    191         base = text[:pos]
    192         extended = text[pos+1:]
    193     base = unicode(base, "ascii", errors)
    194     extended = extended.upper()
    195     return insertion_sort(base, extended, errors)
    196 
    197 ### Codec APIs
    198 
    199 class Codec(codecs.Codec):
    200 
    201     def encode(self,input,errors='strict'):
    202         res = punycode_encode(input)
    203         return res, len(input)
    204 
    205     def decode(self,input,errors='strict'):
    206         if errors not in ('strict', 'replace', 'ignore'):
    207             raise UnicodeError, "Unsupported error handling "+errors
    208         res = punycode_decode(input, errors)
    209         return res, len(input)
    210 
    211 class IncrementalEncoder(codecs.IncrementalEncoder):
    212     def encode(self, input, final=False):
    213         return punycode_encode(input)
    214 
    215 class IncrementalDecoder(codecs.IncrementalDecoder):
    216     def decode(self, input, final=False):
    217         if self.errors not in ('strict', 'replace', 'ignore'):
    218             raise UnicodeError, "Unsupported error handling "+self.errors
    219         return punycode_decode(input, self.errors)
    220 
    221 class StreamWriter(Codec,codecs.StreamWriter):
    222     pass
    223 
    224 class StreamReader(Codec,codecs.StreamReader):
    225     pass
    226 
    227 ### encodings module API
    228 
    229 def getregentry():
    230     return codecs.CodecInfo(
    231         name='punycode',
    232         encode=Codec().encode,
    233         decode=Codec().decode,
    234         incrementalencoder=IncrementalEncoder,
    235         incrementaldecoder=IncrementalDecoder,
    236         streamwriter=StreamWriter,
    237         streamreader=StreamReader,
    238     )
    239