""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> codecs.CodecInfo object
    The getregentry() API must return a CodecInfo object with encoder,
    decoder, incrementalencoder, incrementaldecoder, streamwriter and
    streamreader attributes which adhere to the Python Codec Interface
    Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs
from encodings import aliases
try:
    import __builtin__
except ImportError:
    # Fall back to the "builtins" module where "__builtin__" does not
    # exist.  Only the hasattr() probe in normalize_encoding() uses it.
    import builtins as __builtin__

# Maps the encoding name passed to search_function() to its CodecInfo
# entry, or to None when no codec module was found for that name.
_cache = {}
# Sentinel that distinguishes "never looked up" from a cached None.
_unknown = '--unknown--'
_import_tail = ['*']

# Characters that survive normalization unchanged: the dot (used for
# Python package names) and the ASCII digits and letters.
_norm_keep_chars = ('.'
                    '0123456789'
                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                    'abcdefghijklmnopqrstuvwxyz')
# 256-entry translation table for str.translate(): kept characters map
# to themselves, every other character maps to a space.  Built from
# _norm_keep_chars so that its length is always exactly 256.
_norm_encoding_map = ''.join(
    c if c in _norm_keep_chars else ' ' for c in map(chr, range(256)))
_aliases = aliases.aliases


class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry."""
    pass


def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. ' -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.
    if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')
    # Every separator became a space; split() drops leading, trailing
    # and repeated runs so that each run yields a single underscore.
    return '_'.join(encoding.translate(_norm_encoding_map).split())


def _codecinfo_from_tuple(entry, mod):

    """ Convert a legacy registry tuple into a codecs.CodecInfo object.

        entry is the sequence returned by mod.getregentry(); it must
        hold between 4 and 7 items.  The first two items must be
        callable, items 3 to 6 must each be callable or None, and the
        optional 7th item is the codec name.  A missing or None name is
        replaced by the module name with its leading package stripped.

        Raises CodecRegistryError if the entry has the wrong length or
        contains non-callable codec functions.

    """
    if not 4 <= len(entry) <= 7:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register' %
            (mod.__name__, mod.__file__))
    # Work on a tuple copy so that a list returned by the codec module
    # is never modified in place.
    entry = tuple(entry)
    required_ok = (hasattr(entry[0], '__call__') and
                   hasattr(entry[1], '__call__'))
    optional_ok = True
    for item in entry[2:6]:
        if item is not None and not hasattr(item, '__call__'):
            optional_ok = False
            break
    if not (required_ok and optional_ok):
        raise CodecRegistryError(
            'incompatible codecs in module "%s" (%s)' %
            (mod.__name__, mod.__file__))
    if len(entry) < 7 or entry[6] is None:
        # Rebuild from the first six slots: simply appending the name
        # would produce an 8-item tuple when a 7-item entry carries
        # None in the name slot.
        name = mod.__name__.split(".", 1)[1]
        entry = entry[:6] + (None,) * (6 - len(entry)) + (name,)
    return codecs.CodecInfo(*entry)


def search_function(encoding):

    """ Codec search function for the "encodings" package.

        Returns the codecs.CodecInfo entry for encoding, or None if no
        suitable codec module could be imported.  Both outcomes are
        cached under the encoding name exactly as it was passed in.

        Raises CodecRegistryError if the codec module was found but its
        getregentry() result is not a usable registry entry.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        mod = None

    try:
        # This also fails with AttributeError when mod is None.
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        entry = _codecinfo_from_tuple(entry, mod)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                # modname is the name under which the import succeeded.
                _aliases[alias] = modname

    # Return the registry entry
    return entry

# Register the search_function in the Python codec registry
codecs.register(search_function)