Home | History | Annotate | Download | only in markdown
      1 # markdown/html4.py
      2 #
      3 # Add html4 serialization to older versions of ElementTree
      4 # Taken from ElementTree 1.3 preview with slight modifications
      5 #
      6 # Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
      7 #
      8 # fredrik (at] pythonware.com
      9 # http://www.pythonware.com
     10 #
     11 # --------------------------------------------------------------------
     12 # The ElementTree toolkit is
     13 #
     14 # Copyright (c) 1999-2007 by Fredrik Lundh
     15 #
     16 # By obtaining, using, and/or copying this software and/or its
     17 # associated documentation, you agree that you have read, understood,
     18 # and will comply with the following terms and conditions:
     19 #
     20 # Permission to use, copy, modify, and distribute this software and
     21 # its associated documentation for any purpose and without fee is
     22 # hereby granted, provided that the above copyright notice appears in
     23 # all copies, and that both that copyright notice and this permission
     24 # notice appear in supporting documentation, and that the name of
     25 # Secret Labs AB or the author not be used in advertising or publicity
     26 # pertaining to distribution of the software without specific, written
     27 # prior permission.
     28 #
     29 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
     30 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
     31 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
     32 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
     33 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
     34 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
     35 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
     36 # OF THIS SOFTWARE.
     37 # --------------------------------------------------------------------
     38 
     39 
     40 import markdown
# Module-level aliases for the ElementTree names used by the serializer
# code in this file, taken from the etree implementation exposed as
# markdown.etree.
ElementTree = markdown.etree.ElementTree
QName = markdown.etree.QName
Comment = markdown.etree.Comment
PI = markdown.etree.PI
ProcessingInstruction = markdown.etree.ProcessingInstruction
     46 
     47 HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
     48               "img", "input", "isindex", "link", "meta" "param")
     49 
     50 try:
     51     HTML_EMPTY = set(HTML_EMPTY)
     52 except NameError:
     53     pass
     54 
# Maps namespace URIs to the prefix to use for them.  _namespaces()
# consults this table when a URI has no prefix assigned yet, before
# falling back to a generated "ns%d" prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # Dublin Core
    "http://purl.org/dc/elements/1.1/": "dc",
}
     67 
     68 
     69 def _raise_serialization_error(text):
     70     raise TypeError(
     71         "cannot serialize %r (type %s)" % (text, type(text).__name__)
     72         )
     73 
     74 def _encode(text, encoding):
     75     try:
     76         return text.encode(encoding, "xmlcharrefreplace")
     77     except (TypeError, AttributeError):
     78         _raise_serialization_error(text)
     79 
     80 def _escape_cdata(text, encoding):
     81     # escape character data
     82     try:
     83         # it's worth avoiding do-nothing calls for strings that are
     84         # shorter than 500 character, or so.  assume that's, by far,
     85         # the most common case in most applications.
     86         if "&" in text:
     87             text = text.replace("&", "&")
     88         if "<" in text:
     89             text = text.replace("<", "&lt;")
     90         if ">" in text:
     91             text = text.replace(">", "&gt;")
     92         return text.encode(encoding, "xmlcharrefreplace")
     93     except (TypeError, AttributeError):
     94         _raise_serialization_error(text)
     95 
     96 
     97 def _escape_attrib(text, encoding):
     98     # escape attribute value
     99     try:
    100         if "&" in text:
    101             text = text.replace("&", "&amp;")
    102         if "<" in text:
    103             text = text.replace("<", "&lt;")
    104         if ">" in text:
    105             text = text.replace(">", "&gt;")
    106         if "\"" in text:
    107             text = text.replace("\"", "&quot;")
    108         if "\n" in text:
    109             text = text.replace("\n", "&#10;")
    110         return text.encode(encoding, "xmlcharrefreplace")
    111     except (TypeError, AttributeError):
    112         _raise_serialization_error(text)
    113 
    114 def _escape_attrib_html(text, encoding):
    115     # escape attribute value
    116     try:
    117         if "&" in text:
    118             text = text.replace("&", "&amp;")
    119         if ">" in text:
    120             text = text.replace(">", "&gt;")
    121         if "\"" in text:
    122             text = text.replace("\"", "&quot;")
    123         return text.encode(encoding, "xmlcharrefreplace")
    124     except (TypeError, AttributeError):
    125         _raise_serialization_error(text)
    126 
    127 
    128 def _serialize_html(write, elem, encoding, qnames, namespaces):
    129     tag = elem.tag
    130     text = elem.text
    131     if tag is Comment:
    132         write("<!--%s-->" % _escape_cdata(text, encoding))
    133     elif tag is ProcessingInstruction:
    134         write("<?%s?>" % _escape_cdata(text, encoding))
    135     else:
    136         tag = qnames[tag]
    137         if tag is None:
    138             if text:
    139                 write(_escape_cdata(text, encoding))
    140             for e in elem:
    141                 _serialize_html(write, e, encoding, qnames, None)
    142         else:
    143             write("<" + tag)
    144             items = elem.items()
    145             if items or namespaces:
    146                 items.sort() # lexical order
    147                 for k, v in items:
    148                     if isinstance(k, QName):
    149                         k = k.text
    150                     if isinstance(v, QName):
    151                         v = qnames[v.text]
    152                     else:
    153                         v = _escape_attrib_html(v, encoding)
    154                     # FIXME: handle boolean attributes
    155                     write(" %s=\"%s\"" % (qnames[k], v))
    156                 if namespaces:
    157                     items = namespaces.items()
    158                     items.sort(key=lambda x: x[1]) # sort on prefix
    159                     for v, k in items:
    160                         if k:
    161                             k = ":" + k
    162                         write(" xmlns%s=\"%s\"" % (
    163                             k.encode(encoding),
    164                             _escape_attrib(v, encoding)
    165                             ))
    166             write(">")
    167             tag = tag.lower()
    168             if text:
    169                 if tag == "script" or tag == "style":
    170                     write(_encode(text, encoding))
    171                 else:
    172                     write(_escape_cdata(text, encoding))
    173             for e in elem:
    174                 _serialize_html(write, e, encoding, qnames, None)
    175             if tag not in HTML_EMPTY:
    176                 write("</" + tag + ">")
    177     if elem.tail:
    178         write(_escape_cdata(elem.tail, encoding))
    179 
    180 def write_html(root, f,
    181           # keyword arguments
    182           encoding="us-ascii",
    183           default_namespace=None):
    184     assert root is not None
    185     if not hasattr(f, "write"):
    186         f = open(f, "wb")
    187     write = f.write
    188     if not encoding:
    189         encoding = "us-ascii"
    190     qnames, namespaces = _namespaces(
    191             root, encoding, default_namespace
    192             )
    193     _serialize_html(
    194                 write, root, encoding, qnames, namespaces
    195                 )
    196 
    197 # --------------------------------------------------------------------
    198 # serialization support
    199 
    200 def _namespaces(elem, encoding, default_namespace=None):
    201     # identify namespaces used in this tree
    202 
    203     # maps qnames to *encoded* prefix:local names
    204     qnames = {None: None}
    205 
    206     # maps uri:s to prefixes
    207     namespaces = {}
    208     if default_namespace:
    209         namespaces[default_namespace] = ""
    210 
    211     def encode(text):
    212         return text.encode(encoding)
    213 
    214     def add_qname(qname):
    215         # calculate serialized qname representation
    216         try:
    217             if qname[:1] == "{":
    218                 uri, tag = qname[1:].split("}", 1)
    219                 prefix = namespaces.get(uri)
    220                 if prefix is None:
    221                     prefix = _namespace_map.get(uri)
    222                     if prefix is None:
    223                         prefix = "ns%d" % len(namespaces)
    224                     if prefix != "xml":
    225                         namespaces[uri] = prefix
    226                 if prefix:
    227                     qnames[qname] = encode("%s:%s" % (prefix, tag))
    228                 else:
    229                     qnames[qname] = encode(tag) # default element
    230             else:
    231                 if default_namespace:
    232                     # FIXME: can this be handled in XML 1.0?
    233                     raise ValueError(
    234                         "cannot use non-qualified names with "
    235                         "default_namespace option"
    236                         )
    237                 qnames[qname] = encode(qname)
    238         except TypeError:
    239             _raise_serialization_error(qname)
    240 
    241     # populate qname and namespaces table
    242     try:
    243         iterate = elem.iter
    244     except AttributeError:
    245         iterate = elem.getiterator # cET compatibility
    246     for elem in iterate():
    247         tag = elem.tag
    248         if isinstance(tag, QName) and tag.text not in qnames:
    249             add_qname(tag.text)
    250         elif isinstance(tag, basestring):
    251             if tag not in qnames:
    252                 add_qname(tag)
    253         elif tag is not None and tag is not Comment and tag is not PI:
    254             _raise_serialization_error(tag)
    255         for key, value in elem.items():
    256             if isinstance(key, QName):
    257                 key = key.text
    258             if key not in qnames:
    259                 add_qname(key)
    260             if isinstance(value, QName) and value.text not in qnames:
    261                 add_qname(value.text)
    262         text = elem.text
    263         if isinstance(text, QName) and text.text not in qnames:
    264             add_qname(text.text)
    265     return qnames, namespaces
    266 
    267 def to_html_string(element, encoding=None):
    268     class dummy:
    269         pass
    270     data = []
    271     file = dummy()
    272     file.write = data.append
    273     write_html(ElementTree(element).getroot(),file,encoding)
    274     return "".join(data)
    275