# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1: all four code points are treated as label separators.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as a byte string and as a unicode string.
ace_prefix = "xn--"
uace_prefix = u"xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is processed in four steps: map (characters in table B.1 are
    dropped, the rest go through the B.2 mapping), normalize (NFKC, using
    the Unicode 3.2.0 database), prohibit (tables C.1.2, C.2.2 and C.3-C.9)
    and the bidirectional checks.

    Returns the prepared unicode label.
    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The tests below do not depend on *which* RandALCat character was
    # found, so run them once instead of once per RandALCat character;
    # repeating them made long right-to-left labels cost quadratic time.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label


def ToASCII(label):
    """Convert one label to its ASCII (ACE) form.

    A label that is already pure ASCII is returned unchanged apart from
    the encoding step; otherwise it is nameprepped and, if still not
    ASCII, punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeError if the resulting label is empty or longer than 63
    characters, if the nameprepped label already starts with the ACE
    prefix, or if nameprep rejects the label.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToUnicode(label):
    """Convert one label to its unicode form.

    Labels without the ACE prefix are returned as unicode unchanged (after
    nameprep if they were not pure ASCII).  Labels with the prefix are
    punycode-decoded, and the result is only accepted if ToASCII() maps it
    back to the (lower-cased) input.

    Raises UnicodeError if the label is not ASCII after nameprep or if the
    decoded label does not round-trip.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode a dotted name label by label with ToASCII().

        Returns (encoded_name, len(input)).  A trailing separator in the
        input is preserved as a trailing '.'.  Only errors='strict' is
        supported; anything else raises UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        if labels and len(labels[-1]) == 0:
            # An empty last label means the name ended with a separator.
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''
        result = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(result)+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted name label by label with ToUnicode().

        Returns (decoded_name, len(input)).  A trailing separator in the
        input is preserved as a trailing u'.'.  Only errors='strict' is
        supported; anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Decoded only for validation: raises if input is not ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = u'.'
            del labels[-1]
        else:
            trailing_dot = u''

        result = [ToUnicode(label) for label in labels]

        return u".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of `input` as possible.

        Returns (encoded, consumed) where `consumed` is the number of
        input characters that were encoded.  Unless `final` is true, the
        last label is left unconsumed because it may still be incomplete.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Byte string, like the '.' assigned below: a unicode u'' here
        # would promote the whole encoded result to unicode.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator before this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of `input` as possible.

        Returns (decoded, consumed) where `consumed` is the number of
        input items that were decoded.  Unless `final` is true, the last
        label is left unconsumed because it may still be incomplete.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Decoded only for validation: raises if input is not ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator before this label.
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the CodecInfo record that registers this codec as 'idna'."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )