Home | History | Annotate | Download | only in markdown
      1 # markdown is released under the BSD license
      2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
      3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
      4 # Copyright 2004 Manfred Stienstra (the original version)
      5 # 
      6 # All rights reserved.
      7 # 
      8 # Redistribution and use in source and binary forms, with or without
      9 # modification, are permitted provided that the following conditions are met:
     10 # 
     11 # *   Redistributions of source code must retain the above copyright
     12 #     notice, this list of conditions and the following disclaimer.
     13 # *   Redistributions in binary form must reproduce the above copyright
     14 #     notice, this list of conditions and the following disclaimer in the
     15 #     documentation and/or other materials provided with the distribution.
     16 # *   Neither the name of the <organization> nor the
     17 #     names of its contributors may be used to endorse or promote products
     18 #     derived from this software without specific prior written permission.
     19 # 
     20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
     21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
     24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30 # POSSIBILITY OF SUCH DAMAGE.
     31 
     32 
     33 """
     34 Python Markdown
     35 ===============
     36 
     37 Python Markdown converts Markdown to HTML and can be used as a library or
     38 called from the command line.
     39 
     40 ## Basic usage as a module:
     41 
     42     import markdown
     43     html = markdown.markdown(your_text_string)
     44 
     45 See <http://packages.python.org/Markdown/> for more
     46 information and instructions on how to extend the functionality of
     47 Python Markdown.  Read that before you try modifying this file.
     48 
     49 ## Authors and License
     50 
     51 Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
     52 maintained  by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
     53 Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
     54 
Contact: markdown@freewisdom.org
     56 
     57 Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
     58 Copyright 200? Django Software Foundation (OrderedDict implementation)
     59 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
     60 Copyright 2004 Manfred Stienstra (the original version)
     61 
     62 License: BSD (see LICENSE for details).
     63 """
     64 
     65 from __future__ import absolute_import
     66 from __future__ import unicode_literals
     67 from .__version__ import version, version_info
     68 import re
     69 import codecs
     70 import sys
     71 import logging
     72 from . import util
     73 from .preprocessors import build_preprocessors
     74 from .blockprocessors import build_block_parser
     75 from .treeprocessors import build_treeprocessors
     76 from .inlinepatterns import build_inlinepatterns
     77 from .postprocessors import build_postprocessors
     78 from .extensions import Extension
     79 from .serializers import to_html_string, to_xhtml_string
     80 
     81 __all__ = ['Markdown', 'markdown', 'markdownFromFile']
     82 
     83 logger = logging.getLogger('MARKDOWN')
     84 
     85 
     86 class Markdown(object):
     87     """Convert Markdown to HTML."""
     88 
     89     doc_tag = "div"     # Element used to wrap document - later removed
     90 
     91     option_defaults = {
     92         'html_replacement_text' : '[HTML_REMOVED]',
     93         'tab_length'            : 4,
     94         'enable_attributes'     : True,
     95         'smart_emphasis'        : True,
     96         'lazy_ol'               : True,
     97     }
     98 
     99     output_formats = {
    100         'html'  : to_html_string,
    101         'html4' : to_html_string,
    102         'html5' : to_html_string,
    103         'xhtml' : to_xhtml_string,
    104         'xhtml1': to_xhtml_string,
    105         'xhtml5': to_xhtml_string,
    106     }
    107 
    108     ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
    109                     '(', ')', '>', '#', '+', '-', '.', '!']
    110 
    111     def __init__(self, *args, **kwargs):
    112         """
    113         Creates a new Markdown instance.
    114 
    115         Keyword arguments:
    116 
    117         * extensions: A list of extensions.
    118            If they are of type string, the module mdx_name.py will be loaded.
    119            If they are a subclass of markdown.Extension, they will be used
    120            as-is.
    121         * extension_configs: Configuration settingis for extensions.
    122         * output_format: Format of output. Supported formats are:
    123             * "xhtml1": Outputs XHTML 1.x. Default.
    124             * "xhtml5": Outputs XHTML style tags of HTML 5
    125             * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
    126             * "html4": Outputs HTML 4
    127             * "html5": Outputs HTML style tags of HTML 5
    128             * "html": Outputs latest supported version of HTML (currently HTML 4).
    129             Note that it is suggested that the more specific formats ("xhtml1"
    130             and "html4") be used as "xhtml" or "html" may change in the future
    131             if it makes sense at that time.
    132         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
    133         * html_replacement_text: Text used when safe_mode is set to "replace".
    134         * tab_length: Length of tabs in the source. Default: 4
    135         * enable_attributes: Enable the conversion of attributes. Default: True
    136         * smart_emphasis: Treat `_connected_words_` intelegently Default: True
    137         * lazy_ol: Ignore number of first item of ordered lists. Default: True
    138 
    139         """
    140 
    141         # For backward compatibility, loop through old positional args
    142         pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
    143         c = 0
    144         for arg in args:
    145             if pos[c] not in kwargs:
    146                 kwargs[pos[c]] = arg
    147             c += 1
    148             if c == len(pos):
    149                 # ignore any additional args
    150                 break
    151 
    152         # Loop through kwargs and assign defaults
    153         for option, default in self.option_defaults.items():
    154             setattr(self, option, kwargs.get(option, default))
    155 
    156         self.safeMode = kwargs.get('safe_mode', False)
    157         if self.safeMode and 'enable_attributes' not in kwargs:
    158             # Disable attributes in safeMode when not explicitly set
    159             self.enable_attributes = False
    160 
    161         self.registeredExtensions = []
    162         self.docType = ""
    163         self.stripTopLevelTags = True
    164 
    165         self.build_parser()
    166 
    167         self.references = {}
    168         self.htmlStash = util.HtmlStash()
    169         self.set_output_format(kwargs.get('output_format', 'xhtml1'))
    170         self.registerExtensions(extensions=kwargs.get('extensions', []),
    171                                 configs=kwargs.get('extension_configs', {}))
    172         self.reset()
    173 
    174     def build_parser(self):
    175         """ Build the parser from the various parts. """
    176         self.preprocessors = build_preprocessors(self)
    177         self.parser = build_block_parser(self)
    178         self.inlinePatterns = build_inlinepatterns(self)
    179         self.treeprocessors = build_treeprocessors(self)
    180         self.postprocessors = build_postprocessors(self)
    181         return self
    182 
    183     def registerExtensions(self, extensions, configs):
    184         """
    185         Register extensions with this instance of Markdown.
    186 
    187         Keyword arguments:
    188 
    189         * extensions: A list of extensions, which can either
    190            be strings or objects.  See the docstring on Markdown.
    191         * configs: A dictionary mapping module names to config options.
    192 
    193         """
    194         for ext in extensions:
    195             if isinstance(ext, util.string_type):
    196                 ext = self.build_extension(ext, configs.get(ext, []))
    197             if isinstance(ext, Extension):
    198                 ext.extendMarkdown(self, globals())
    199             elif ext is not None:
    200                 raise TypeError(
    201                     'Extension "%s.%s" must be of type: "markdown.Extension"'
    202                     % (ext.__class__.__module__, ext.__class__.__name__))
    203 
    204         return self
    205 
    206     def build_extension(self, ext_name, configs = []):
    207         """Build extension by name, then return the module.
    208 
    209         The extension name may contain arguments as part of the string in the
    210         following format: "extname(key1=value1,key2=value2)"
    211 
    212         """
    213 
    214         # Parse extensions config params (ignore the order)
    215         configs = dict(configs)
    216         pos = ext_name.find("(") # find the first "("
    217         if pos > 0:
    218             ext_args = ext_name[pos+1:-1]
    219             ext_name = ext_name[:pos]
    220             pairs = [x.split("=") for x in ext_args.split(",")]
    221             configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
    222 
    223         # Setup the module name
    224         module_name = ext_name
    225         if '.' not in ext_name:
    226             module_name = '.'.join(['third_party.markdown.extensions', ext_name])
    227 
    228         # Try loading the extension first from one place, then another
    229         try: # New style (markdown.extensons.<extension>)
    230             module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
    231         except ImportError:
    232             module_name_old_style = '_'.join(['mdx', ext_name])
    233             try: # Old style (mdx_<extension>)
    234                 module = __import__(module_name_old_style)
    235             except ImportError as e:
    236                 message = "Failed loading extension '%s' from '%s' or '%s'" \
    237                     % (ext_name, module_name, module_name_old_style)
    238                 e.args = (message,) + e.args[1:]
    239                 raise
    240 
    241         # If the module is loaded successfully, we expect it to define a
    242         # function called makeExtension()
    243         try:
    244             return module.makeExtension(configs.items())
    245         except AttributeError as e:
    246             message = e.args[0]
    247             message = "Failed to initiate extension " \
    248                       "'%s': %s" % (ext_name, message)
    249             e.args = (message,) + e.args[1:]
    250             raise
    251 
    252     def registerExtension(self, extension):
    253         """ This gets called by the extension """
    254         self.registeredExtensions.append(extension)
    255         return self
    256 
    257     def reset(self):
    258         """
    259         Resets all state variables so that we can start with a new text.
    260         """
    261         self.htmlStash.reset()
    262         self.references.clear()
    263 
    264         for extension in self.registeredExtensions:
    265             if hasattr(extension, 'reset'):
    266                 extension.reset()
    267 
    268         return self
    269 
    270     def set_output_format(self, format):
    271         """ Set the output format for the class instance. """
    272         self.output_format = format.lower()
    273         try:
    274             self.serializer = self.output_formats[self.output_format]
    275         except KeyError as e:
    276             valid_formats = list(self.output_formats.keys())
    277             valid_formats.sort()
    278             message = 'Invalid Output Format: "%s". Use one of %s.' \
    279                        % (self.output_format, 
    280                           '"' + '", "'.join(valid_formats) + '"')
    281             e.args = (message,) + e.args[1:]
    282             raise
    283         return self
    284 
    285     def convert(self, source):
    286         """
    287         Convert markdown to serialized XHTML or HTML.
    288 
    289         Keyword arguments:
    290 
    291         * source: Source text as a Unicode string.
    292 
    293         Markdown processing takes place in five steps:
    294 
    295         1. A bunch of "preprocessors" munge the input text.
    296         2. BlockParser() parses the high-level structural elements of the
    297            pre-processed text into an ElementTree.
    298         3. A bunch of "treeprocessors" are run against the ElementTree. One
    299            such treeprocessor runs InlinePatterns against the ElementTree,
    300            detecting inline markup.
    301         4. Some post-processors are run against the text after the ElementTree
    302            has been serialized into text.
    303         5. The output is written to a string.
    304 
    305         """
    306 
    307         # Fixup the source text
    308         if not source.strip():
    309             return ''  # a blank unicode string
    310 
    311         try:
    312             source = util.text_type(source)
    313         except UnicodeDecodeError as e:
    314             # Customise error message while maintaining original trackback
    315             e.reason += '. -- Note: Markdown only accepts unicode input!'
    316             raise
    317 
    318         # Split into lines and run the line preprocessors.
    319         self.lines = source.split("\n")
    320         for prep in self.preprocessors.values():
    321             self.lines = prep.run(self.lines)
    322 
    323         # Parse the high-level elements.
    324         root = self.parser.parseDocument(self.lines).getroot()
    325 
    326         # Run the tree-processors
    327         for treeprocessor in self.treeprocessors.values():
    328             newRoot = treeprocessor.run(root)
    329             if newRoot:
    330                 root = newRoot
    331 
    332         # Serialize _properly_.  Strip top-level tags.
    333         output = self.serializer(root)
    334         if self.stripTopLevelTags:
    335             try:
    336                 start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
    337                 end = output.rindex('</%s>'%self.doc_tag)
    338                 output = output[start:end].strip()
    339             except ValueError:
    340                 if output.strip().endswith('<%s />'%self.doc_tag):
    341                     # We have an empty document
    342                     output = ''
    343                 else:
    344                     # We have a serious problem
    345                     raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())
    346 
    347         # Run the text post-processors
    348         for pp in self.postprocessors.values():
    349             output = pp.run(output)
    350 
    351         return output.strip()
    352 
    353     def convertFile(self, input=None, output=None, encoding=None):
    354         """Converts a markdown file and returns the HTML as a unicode string.
    355 
    356         Decodes the file using the provided encoding (defaults to utf-8),
    357         passes the file content to markdown, and outputs the html to either
    358         the provided stream or the file with provided name, using the same
    359         encoding as the source file. The 'xmlcharrefreplace' error handler is
    360         used when encoding the output.
    361 
    362         **Note:** This is the only place that decoding and encoding of unicode
    363         takes place in Python-Markdown.  (All other code is unicode-in /
    364         unicode-out.)
    365 
    366         Keyword arguments:
    367 
    368         * input: File object or path. Reads from stdin if `None`.
    369         * output: File object or path. Writes to stdout if `None`.
    370         * encoding: Encoding of input and output files. Defaults to utf-8.
    371 
    372         """
    373 
    374         encoding = encoding or "utf-8"
    375 
    376         # Read the source
    377         if input:
    378             if isinstance(input, util.string_type):
    379                 input_file = codecs.open(input, mode="r", encoding=encoding)
    380             else:
    381                 input_file = codecs.getreader(encoding)(input)
    382             text = input_file.read()
    383             input_file.close()
    384         else:
    385             text = sys.stdin.read()
    386             if not isinstance(text, util.text_type):
    387                 text = text.decode(encoding)
    388 
    389         text = text.lstrip('\ufeff') # remove the byte-order mark
    390 
    391         # Convert
    392         html = self.convert(text)
    393 
    394         # Write to file or stdout
    395         if output:
    396             if isinstance(output, util.string_type):
    397                 output_file = codecs.open(output, "w",
    398                                           encoding=encoding,
    399                                           errors="xmlcharrefreplace")
    400                 output_file.write(html)
    401                 output_file.close()
    402             else:
    403                 writer = codecs.getwriter(encoding)
    404                 output_file = writer(output, errors="xmlcharrefreplace")
    405                 output_file.write(html)
    406                 # Don't close here. User may want to write more.
    407         else:
    408             # Encode manually and write bytes to stdout. 
    409             html = html.encode(encoding, "xmlcharrefreplace")
    410             try:
    411                 # Write bytes directly to buffer (Python 3).
    412                 sys.stdout.buffer.write(html)
    413             except AttributeError:
    414                 # Probably Python 2, which works with bytes by default.
    415                 sys.stdout.write(html)
    416 
    417         return self
    418 
    419 
    420 """
    421 EXPORTED FUNCTIONS
    422 =============================================================================
    423 
These are the two functions we really mean to export: markdown() and
markdownFromFile().
    426 """
    427 
    428 def markdown(text, *args, **kwargs):
    429     """Convert a markdown string to HTML and return HTML as a unicode string.
    430 
    431     This is a shortcut function for `Markdown` class to cover the most
    432     basic use case.  It initializes an instance of Markdown, loads the
    433     necessary extensions and runs the parser on the given text.
    434 
    435     Keyword arguments:
    436 
    437     * text: Markdown formatted text as Unicode or ASCII string.
    438     * Any arguments accepted by the Markdown class.
    439 
    440     Returns: An HTML document as a string.
    441 
    442     """
    443     md = Markdown(*args, **kwargs)
    444     return md.convert(text)
    445 
    446 
    447 def markdownFromFile(*args, **kwargs):
    448     """Read markdown code from a file and write it to a file or a stream.
    449 
    450     This is a shortcut function which initializes an instance of Markdown,
    451     and calls the convertFile method rather than convert.
    452 
    453     Keyword arguments:
    454 
    455     * input: a file name or readable object.
    456     * output: a file name or writable object.
    457     * encoding: Encoding of input and output.
    458     * Any arguments accepted by the Markdown class.
    459 
    460     """
    461     # For backward compatibility loop through positional args
    462     pos = ['input', 'output', 'extensions', 'encoding']
    463     c = 0
    464     for arg in args:
    465         if pos[c] not in kwargs:
    466             kwargs[pos[c]] = arg
    467         c += 1
    468         if c == len(pos):
    469             break
    470 
    471     md = Markdown(**kwargs)
    472     md.convertFile(kwargs.get('input', None),
    473                    kwargs.get('output', None),
    474                    kwargs.get('encoding', None))
    475 
    476