# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Loewis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation

    Return a pair (base, extended).  base holds the characters of str whose
    ordinal is below 128, in their original order, encoded as ASCII.
    extended is the sorted list of the distinct remaining characters.
    """
    base = []
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            extended.add(c)
    return "".join(base).encode("ascii"), sorted(extended)

def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    return sum(1 for c in str if ord(c) < max)

def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string.

    The search starts at pos+1.  (-1, -1) is returned when the end of
    str is reached without finding char."""

    l = len(str)
    while True:
        pos += 1
        if pos == l:
            return (-1, -1)
        c = str[pos]
        if c == char:
            return index+1, pos
        elif c < char:
            # Only characters smaller than char advance the selective index.
            index += 1

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Return the list of deltas for str, given the sorted list of distinct
    non-basic characters (extended) as produced by segregate().
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # Skipping from oldchar to char costs one full pass over the
        # (curlen+1) possible insertion points per code point skipped.
        delta = (curlen+1) * (char - oldchar)
        while True:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1:
        return 1
    if res > 26:
        return 26
    return res

digits = "abcdefghijklmnopqrstuvwxyz0123456789"

def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Return the list of digit characters (taken from digits) encoding N
    for the given bias.
    """
    result = []
    j = 0
    while True:
        t = T(j, bias)
        if N < t:
            # Final digit: a value below the threshold terminates the number.
            result.append(digits[N])
            return result
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    """Return the bias to use for the next delta.

    delta is the delta just processed, first is true for the very first
    delta (which is scaled by damp = 700 instead of 2), and numchars is
    the number of code points handled so far.
    """
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35 # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias


def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer, adapting
    the bias after each one, and return the concatenated digit string.
    baselen is the number of basic code points in the input.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = []
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points==0, baselen+points+1)
    return "".join(result)

def punycode_encode(text):
    """Return the Punycode encoding of text.

    The result is the basic (ASCII) characters, a "-" delimiter and the
    encoded deltas; when there are no basic characters only the encoded
    deltas are returned.
    """
    # segregate() already returns base encoded as ASCII, so it is not
    # encoded a second time here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one number from extended, starting at index extpos.  Only the
    digits A-Z and 0-9 are recognised.

    Return a pair (newpos, value) where newpos is the index following the
    consumed digits.  With errors other than "strict", value is None when
    the input ends prematurely or contains an invalid digit.

    Raises UnicodeError for those two conditions when errors is "strict".
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending character,
            # so it lives at extpos-1.  Indexing extended[extpos] reported
            # the wrong character, or raised IndexError at end of input.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1


def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in extended one at a time and insert the resulting
    characters into base, returning the completed string.

    With errors other than "strict", decoding stops at the first
    undecodable delta and the string built so far is returned; a code
    point above U+10FFFF is replaced by "?".

    Raises UnicodeError for a code point above U+10FFFF when errors is
    "strict".
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Every full pass over the len(base)+1 insertion points moves on
        # to the next code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    """Return the unicode string encoded by the Punycode string text.

    text is split at its last "-": the part before it holds the basic
    characters, the part after it the encoded deltas.  Without a "-" the
    whole of text is treated as encoded deltas.
    """
    pos = text.rfind("-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = text[:pos]
        extended = text[pos+1:]
    base = unicode(base, "ascii", errors)
    # The digit decoder only recognises upper-case letters.
    extended = extended.upper()
    return insertion_sort(base, extended, errors)

### Codec APIs

class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self,input,errors='strict'):
        """Return (encoded input, number of characters consumed).

        errors is accepted for interface compatibility but not used.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):
        """Return (decoded input, length of input).

        Raises UnicodeError if errors is not 'strict', 'replace' or
        'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)

class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each call encodes its input independently."""

    def encode(self, input, final=False):
        return punycode_encode(input)

class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder; each call decodes its input independently."""

    def decode(self, input, final=False):
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )