Home | History | Annotate | Download | only in encodings
      1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
      2 
      3 import stringprep, re, codecs
      4 from unicodedata import ucd_3_2_0 as unicodedata
      5 
      6 # IDNA section 3.1
      7 dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
      8 
      9 # IDNA section 5
     10 ace_prefix = "xn--"
     11 uace_prefix = unicode(ace_prefix, "ascii")
     12 
     13 # This assumes query strings, so AllowUnassigned is true
     14 def nameprep(label):
     15     # Map
     16     newlabel = []
     17     for c in label:
     18         if stringprep.in_table_b1(c):
     19             # Map to nothing
     20             continue
     21         newlabel.append(stringprep.map_table_b2(c))
     22     label = u"".join(newlabel)
     23 
     24     # Normalize
     25     label = unicodedata.normalize("NFKC", label)
     26 
     27     # Prohibit
     28     for c in label:
     29         if stringprep.in_table_c12(c) or \
     30            stringprep.in_table_c22(c) or \
     31            stringprep.in_table_c3(c) or \
     32            stringprep.in_table_c4(c) or \
     33            stringprep.in_table_c5(c) or \
     34            stringprep.in_table_c6(c) or \
     35            stringprep.in_table_c7(c) or \
     36            stringprep.in_table_c8(c) or \
     37            stringprep.in_table_c9(c):
     38             raise UnicodeError("Invalid character %r" % c)
     39 
     40     # Check bidi
     41     RandAL = map(stringprep.in_table_d1, label)
     42     for c in RandAL:
     43         if c:
     44             # There is a RandAL char in the string. Must perform further
     45             # tests:
     46             # 1) The characters in section 5.8 MUST be prohibited.
     47             # This is table C.8, which was already checked
     48             # 2) If a string contains any RandALCat character, the string
     49             # MUST NOT contain any LCat character.
     50             if filter(stringprep.in_table_d2, label):
     51                 raise UnicodeError("Violation of BIDI requirement 2")
     52 
     53             # 3) If a string contains any RandALCat character, a
     54             # RandALCat character MUST be the first character of the
     55             # string, and a RandALCat character MUST be the last
     56             # character of the string.
     57             if not RandAL[0] or not RandAL[-1]:
     58                 raise UnicodeError("Violation of BIDI requirement 3")
     59 
     60     return label
     61 
     62 def ToASCII(label):
     63     try:
     64         # Step 1: try ASCII
     65         label = label.encode("ascii")
     66     except UnicodeError:
     67         pass
     68     else:
     69         # Skip to step 3: UseSTD3ASCIIRules is false, so
     70         # Skip to step 8.
     71         if 0 < len(label) < 64:
     72             return label
     73         raise UnicodeError("label empty or too long")
     74 
     75     # Step 2: nameprep
     76     label = nameprep(label)
     77 
     78     # Step 3: UseSTD3ASCIIRules is false
     79     # Step 4: try ASCII
     80     try:
     81         label = label.encode("ascii")
     82     except UnicodeError:
     83         pass
     84     else:
     85         # Skip to step 8.
     86         if 0 < len(label) < 64:
     87             return label
     88         raise UnicodeError("label empty or too long")
     89 
     90     # Step 5: Check ACE prefix
     91     if label.startswith(uace_prefix):
     92         raise UnicodeError("Label starts with ACE prefix")
     93 
     94     # Step 6: Encode with PUNYCODE
     95     label = label.encode("punycode")
     96 
     97     # Step 7: Prepend ACE prefix
     98     label = ace_prefix + label
     99 
    100     # Step 8: Check size
    101     if 0 < len(label) < 64:
    102         return label
    103     raise UnicodeError("label empty or too long")
    104 
    105 def ToUnicode(label):
    106     # Step 1: Check for ASCII
    107     if isinstance(label, str):
    108         pure_ascii = True
    109     else:
    110         try:
    111             label = label.encode("ascii")
    112             pure_ascii = True
    113         except UnicodeError:
    114             pure_ascii = False
    115     if not pure_ascii:
    116         # Step 2: Perform nameprep
    117         label = nameprep(label)
    118         # It doesn't say this, but apparently, it should be ASCII now
    119         try:
    120             label = label.encode("ascii")
    121         except UnicodeError:
    122             raise UnicodeError("Invalid character in IDN label")
    123     # Step 3: Check for ACE prefix
    124     if not label.startswith(ace_prefix):
    125         return unicode(label, "ascii")
    126 
    127     # Step 4: Remove ACE prefix
    128     label1 = label[len(ace_prefix):]
    129 
    130     # Step 5: Decode using PUNYCODE
    131     result = label1.decode("punycode")
    132 
    133     # Step 6: Apply ToASCII
    134     label2 = ToASCII(result)
    135 
    136     # Step 7: Compare the result of step 6 with the one of step 3
    137     # label2 will already be in lower case.
    138     if label.lower() != label2:
    139         raise UnicodeError("IDNA does not round-trip", label, label2)
    140 
    141     # Step 8: return the result of step 5
    142     return result
    143 
    144 ### Codec APIs
    145 
    146 class Codec(codecs.Codec):
    147     def encode(self,input,errors='strict'):
    148 
    149         if errors != 'strict':
    150             # IDNA is quite clear that implementations must be strict
    151             raise UnicodeError("unsupported error handling "+errors)
    152 
    153         if not input:
    154             return "", 0
    155 
    156         result = []
    157         labels = dots.split(input)
    158         if labels and len(labels[-1])==0:
    159             trailing_dot = '.'
    160             del labels[-1]
    161         else:
    162             trailing_dot = ''
    163         for label in labels:
    164             result.append(ToASCII(label))
    165         # Join with U+002E
    166         return ".".join(result)+trailing_dot, len(input)
    167 
    168     def decode(self,input,errors='strict'):
    169 
    170         if errors != 'strict':
    171             raise UnicodeError("Unsupported error handling "+errors)
    172 
    173         if not input:
    174             return u"", 0
    175 
    176         # IDNA allows decoding to operate on Unicode strings, too.
    177         if isinstance(input, unicode):
    178             labels = dots.split(input)
    179         else:
    180             # Must be ASCII string
    181             input = str(input)
    182             unicode(input, "ascii")
    183             labels = input.split(".")
    184 
    185         if labels and len(labels[-1]) == 0:
    186             trailing_dot = u'.'
    187             del labels[-1]
    188         else:
    189             trailing_dot = u''
    190 
    191         result = []
    192         for label in labels:
    193             result.append(ToUnicode(label))
    194 
    195         return u".".join(result)+trailing_dot, len(input)
    196 
    197 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    198     def _buffer_encode(self, input, errors, final):
    199         if errors != 'strict':
    200             # IDNA is quite clear that implementations must be strict
    201             raise UnicodeError("unsupported error handling "+errors)
    202 
    203         if not input:
    204             return ("", 0)
    205 
    206         labels = dots.split(input)
    207         trailing_dot = u''
    208         if labels:
    209             if not labels[-1]:
    210                 trailing_dot = '.'
    211                 del labels[-1]
    212             elif not final:
    213                 # Keep potentially unfinished label until the next call
    214                 del labels[-1]
    215                 if labels:
    216                     trailing_dot = '.'
    217 
    218         result = []
    219         size = 0
    220         for label in labels:
    221             result.append(ToASCII(label))
    222             if size:
    223                 size += 1
    224             size += len(label)
    225 
    226         # Join with U+002E
    227         result = ".".join(result) + trailing_dot
    228         size += len(trailing_dot)
    229         return (result, size)
    230 
    231 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    232     def _buffer_decode(self, input, errors, final):
    233         if errors != 'strict':
    234             raise UnicodeError("Unsupported error handling "+errors)
    235 
    236         if not input:
    237             return (u"", 0)
    238 
    239         # IDNA allows decoding to operate on Unicode strings, too.
    240         if isinstance(input, unicode):
    241             labels = dots.split(input)
    242         else:
    243             # Must be ASCII string
    244             input = str(input)
    245             unicode(input, "ascii")
    246             labels = input.split(".")
    247 
    248         trailing_dot = u''
    249         if labels:
    250             if not labels[-1]:
    251                 trailing_dot = u'.'
    252                 del labels[-1]
    253             elif not final:
    254                 # Keep potentially unfinished label until the next call
    255                 del labels[-1]
    256                 if labels:
    257                     trailing_dot = u'.'
    258 
    259         result = []
    260         size = 0
    261         for label in labels:
    262             result.append(ToUnicode(label))
    263             if size:
    264                 size += 1
    265             size += len(label)
    266 
    267         result = u".".join(result) + trailing_dot
    268         size += len(trailing_dot)
    269         return (result, size)
    270 
    271 class StreamWriter(Codec,codecs.StreamWriter):
    272     pass
    273 
    274 class StreamReader(Codec,codecs.StreamReader):
    275     pass
    276 
    277 ### encodings module API
    278 
    279 def getregentry():
    280     return codecs.CodecInfo(
    281         name='idna',
    282         encode=Codec().encode,
    283         decode=Codec().decode,
    284         incrementalencoder=IncrementalEncoder,
    285         incrementaldecoder=IncrementalDecoder,
    286         streamwriter=StreamWriter,
    287         streamreader=StreamReader,
    288     )
    289