"""
iri2uri

Converts an IRI to a URI.

"""
__author__ = "Joe Gregorio (joe (at] bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = []
__version__ = "1.0.0"
__license__ = "MIT"
__history__ = """
"""

try:
    import urlparse  # Python 2
except ImportError:
    # Python 3 moved the module; keep the historical module-level name.
    import urllib.parse as urlparse

try:
    _text_type = unicode  # Python 2 text type
except NameError:
    _text_type = str  # Python 3: str is the text type


# Convert an IRI to a URI following the rules in RFC 3987
#
# The characters we need to encode and escape are defined in the spec:
#
# iprivate =  %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
#         / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
#         / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
#         / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
#         / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
#         / %xD0000-DFFFD / %xE1000-EFFFD

# Inclusive (low, high) code point ranges, merged from 'ucschar' and
# 'iprivate'.  The list MUST stay sorted by 'low': encode() stops scanning
# at the first range that starts above the code point being tested.
escape_range = [
    (0xA0, 0xD7FF),
    (0xE000, 0xF8FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
    (0x10000, 0x1FFFD),
    (0x20000, 0x2FFFD),
    (0x30000, 0x3FFFD),
    (0x40000, 0x4FFFD),
    (0x50000, 0x5FFFD),
    (0x60000, 0x6FFFD),
    (0x70000, 0x7FFFD),
    (0x80000, 0x8FFFD),
    (0x90000, 0x9FFFD),
    (0xA0000, 0xAFFFD),
    (0xB0000, 0xBFFFD),
    (0xC0000, 0xCFFFD),
    (0xD0000, 0xDFFFD),
    (0xE1000, 0xEFFFD),
    (0xF0000, 0xFFFFD),
    (0x100000, 0x10FFFD),
]


def encode(c):
    """Return the percent-encoded form of the single character *c*.

    If the code point of *c* falls inside one of the ranges listed in
    ``escape_range``, *c* is encoded as UTF-8 and every resulting octet is
    written as ``%XX`` (upper-case hex).  Any other character is returned
    unchanged.
    """
    i = ord(c)
    for low, high in escape_range:
        if i < low:
            # Ranges are sorted, so no later range can contain i either.
            break
        if i <= high:
            # bytearray yields integer octets on both Python 2 and 3,
            # whereas iterating the encoded string directly yields 1-char
            # strings on Python 2 and ints on Python 3.
            octets = bytearray(c.encode('utf-8'))
            return "".join("%%%02X" % octet for octet in octets)
    return c


def iri2uri(uri):
    """Convert an IRI to a URI.

    Note that IRIs must be passed in as unicode (text) strings.  That is,
    do not utf-8 encode the IRI before passing it into the function: any
    value that is not a text string is returned unchanged.

    The authority component is converted with the 'idna' codec; every
    character of the reassembled URI that is in 'ucschar' or 'iprivate' is
    then replaced by its percent-encoded UTF-8 octets.  The 'idna' codec
    may raise UnicodeError for an authority it cannot encode.
    """
    if isinstance(uri, _text_type):
        (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
        # The IDNA result is pure ASCII.  Decode it back to text so that
        # urlunsplit() is not handed a mix of bytes and text on Python 3.
        authority = authority.encode('idna').decode('ascii')
        # For each character in 'ucschar' or 'iprivate'
        #  1. encode as utf-8
        #  2. then %-encode each octet of that utf-8
        uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
        uri = "".join([encode(c) for c in uri])
    return uri


if __name__ == "__main__":
    import unittest

    class Test(unittest.TestCase):

        def test_uris(self):
            """Test that URIs are invariant under the transformation."""
            invariant = [
                u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
                u"http://www.ietf.org/rfc/rfc2396.txt",
                u"ldap://[2001:db8::7]/c=GB?objectClass?one",
                u"mailto:John.Doe (at] example.com",
                u"news:comp.infosystems.www.servers.unix",
                u"tel:+1-816-555-1212",
                u"telnet://192.0.2.16:80/",
                u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2"]
            for uri in invariant:
                self.assertEqual(uri, iri2uri(uri))

        def test_iri(self):
            """ Test that the right type of escaping is done for each part of the URI."""
            self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
            self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
            self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
            self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
            self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))

    unittest.main()