Home | History | Annotate | Download | only in markdown
      1 """
      2 Python Markdown
      3 ===============
      4 
      5 Python Markdown converts Markdown to HTML and can be used as a library or
      6 called from the command line.
      7 
      8 ## Basic usage as a module:
      9 
     10     import markdown
    md = markdown.Markdown()
     12     html = md.convert(your_text_string)
     13 
     14 ## Basic use from the command line:
     15 
     16     markdown source.txt > destination.html
     17 
     18 Run "markdown --help" to see more options.
     19 
     20 ## Extensions
     21 
     22 See <http://www.freewisdom.org/projects/python-markdown/> for more
     23 information and instructions on how to extend the functionality of
     24 Python Markdown.  Read that before you try modifying this file.
     25 
     26 ## Authors and License
     27 
     28 Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
     29 maintained  by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
     30 Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
     31 
Contact: markdown@freewisdom.org
     33 
     34 Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
     35 Copyright 200? Django Software Foundation (OrderedDict implementation)
     36 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
     37 Copyright 2004 Manfred Stienstra (the original version)
     38 
     39 License: BSD (see docs/LICENSE for details).
     40 """
     41 
# Release identifier of this module, once as a dotted string and once as a
# tuple whose last item is a textual release status.
version = "2.0.3"
version_info = (2,0,3, "Final")
     44 
     45 import re
     46 import codecs
     47 import sys
     48 import warnings
     49 import logging
     50 from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
     51 
     52 
     53 """
     54 CONSTANTS
     55 =============================================================================
     56 """
     57 
     58 """
     59 Constants you might want to modify
     60 -----------------------------------------------------------------------------
     61 """
     62 
# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
# Names of the HTML tags treated as block level; consulted by isBlockLevel().
# NOTE(review): the pattern is not anchored and isBlockLevel() applies it
# with .match(), so a tag that merely *starts* with one of these names
# (e.g. "param" or "link") also matches -- confirm whether that is intended.
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  "|script|noscript|form|fieldset|iframe|math"
                                  "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
                                  "|tr|th|td")
DOC_TAG = "div"     # Element used to wrap document - later removed

# Placeholders
# Markdown.convert() strips STX and ETX from the input text before parsing,
# so these markers can only be introduced by the processing pipeline itself.
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
# Template with one "%s" slot between the prefix and the closing ETX.
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
# Presumably stands in for "&" until the "amp_substitute" postprocessor
# runs -- TODO confirm against postprocessors.AndSubstitutePostprocessor.
AMP_SUBSTITUTE = STX+"amp"+ETX
     82 
     83 
     84 """
     85 Constants you probably do not need to change
     86 -----------------------------------------------------------------------------
     87 """
     88 
# Pairs of (first, last) characters delimiting the Unicode blocks of the
# right-to-left scripts named in the comments below.
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
                     # Hebrew (0590-05FF), Arabic (0600-06FF),
                     # Syriac (0700-074F), Arabic supplement (0750-077F),
                     # Thaana (0780-07BF), Nko (07C0-07FF).
                    (u'\u2D30', u'\u2D7F'), # Tifinagh
                    )
     95 
     96 
     97 """
     98 AUXILIARY GLOBAL FUNCTIONS
     99 =============================================================================
    100 """
    101 
    102 
    103 def message(level, text):
    104     """ A wrapper method for logging debug messages. """
    105     logger =  logging.getLogger('MARKDOWN')
    106     if logger.handlers:
    107         # The logger is configured
    108         logger.log(level, text)
    109         if level > WARN:
    110             sys.exit(0)
    111     elif level > WARN:
    112         raise MarkdownException, text
    113     else:
    114         warnings.warn(text, MarkdownWarning)
    115 
    116 
    117 def isBlockLevel(tag):
    118     """Check if the tag is a block level HTML tag."""
    119     return BLOCK_LEVEL_ELEMENTS.match(tag)
    120 
    121 """
    122 MISC AUXILIARY CLASSES
    123 =============================================================================
    124 """
    125 
    126 class AtomicString(unicode):
    127     """A string which should not be further processed."""
    128     pass
    129 
    130 
    131 class MarkdownException(Exception):
    132     """ A Markdown Exception. """
    133     pass
    134 
    135 
    136 class MarkdownWarning(Warning):
    137     """ A Markdown Warning. """
    138     pass
    139 
    140 
    141 """
    142 OVERALL DESIGN
    143 =============================================================================
    144 
Markdown processing takes place in five steps:
    146 
    147 1. A bunch of "preprocessors" munge the input text.
    148 2. BlockParser() parses the high-level structural elements of the
    149    pre-processed text into an ElementTree.
    150 3. A bunch of "treeprocessors" are run against the ElementTree. One such
    151    treeprocessor runs InlinePatterns against the ElementTree, detecting inline
    152    markup.
    153 4. Some post-processors are run against the text after the ElementTree has
    154    been serialized into text.
    155 5. The output is written to a string.
    156 
    157 Those steps are put together by the Markdown() class.
    158 
    159 """
    160 
    161 import preprocessors
    162 import blockprocessors
    163 import treeprocessors
    164 import inlinepatterns
    165 import postprocessors
    166 import blockparser
    167 import etree_loader
    168 import odict
    169 
    170 # Extensions should use "markdown.etree" instead of "etree" (or do `from
    171 # markdown import etree`).  Do not import it by yourself.
    172 
    173 etree = etree_loader.importETree()
    174 
    175 # Adds the ability to output html4
    176 import html4
    177 
    178 
    179 class Markdown:
    180     """Convert Markdown to HTML."""
    181 
    182     def __init__(self,
    183                  extensions=[],
    184                  extension_configs={},
    185                  safe_mode = False, 
    186                  output_format=DEFAULT_OUTPUT_FORMAT):
    187         """
    188         Creates a new Markdown instance.
    189 
    190         Keyword arguments:
    191 
    192         * extensions: A list of extensions.
    193            If they are of type string, the module mdx_name.py will be loaded.
    194            If they are a subclass of markdown.Extension, they will be used
    195            as-is.
    196         * extension-configs: Configuration setting for extensions.
    197         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
    198         * output_format: Format of output. Supported formats are:
    199             * "xhtml1": Outputs XHTML 1.x. Default.
    200             * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
    201             * "html4": Outputs HTML 4
    202             * "html": Outputs latest supported version of HTML (currently HTML 4).
    203             Note that it is suggested that the more specific formats ("xhtml1" 
    204             and "html4") be used as "xhtml" or "html" may change in the future
    205             if it makes sense at that time. 
    206 
    207         """
    208         
    209         self.safeMode = safe_mode
    210         self.registeredExtensions = []
    211         self.docType = ""
    212         self.stripTopLevelTags = True
    213 
    214         # Preprocessors
    215         self.preprocessors = odict.OrderedDict()
    216         self.preprocessors["html_block"] = \
    217                 preprocessors.HtmlBlockPreprocessor(self)
    218         self.preprocessors["reference"] = \
    219                 preprocessors.ReferencePreprocessor(self)
    220         # footnote preprocessor will be inserted with "<reference"
    221 
    222         # Block processors - ran by the parser
    223         self.parser = blockparser.BlockParser()
    224         self.parser.blockprocessors['empty'] = \
    225                 blockprocessors.EmptyBlockProcessor(self.parser)
    226         self.parser.blockprocessors['indent'] = \
    227                 blockprocessors.ListIndentProcessor(self.parser)
    228         self.parser.blockprocessors['code'] = \
    229                 blockprocessors.CodeBlockProcessor(self.parser)
    230         self.parser.blockprocessors['hashheader'] = \
    231                 blockprocessors.HashHeaderProcessor(self.parser)
    232         self.parser.blockprocessors['setextheader'] = \
    233                 blockprocessors.SetextHeaderProcessor(self.parser)
    234         self.parser.blockprocessors['hr'] = \
    235                 blockprocessors.HRProcessor(self.parser)
    236         self.parser.blockprocessors['olist'] = \
    237                 blockprocessors.OListProcessor(self.parser)
    238         self.parser.blockprocessors['ulist'] = \
    239                 blockprocessors.UListProcessor(self.parser)
    240         self.parser.blockprocessors['quote'] = \
    241                 blockprocessors.BlockQuoteProcessor(self.parser)
    242         self.parser.blockprocessors['paragraph'] = \
    243                 blockprocessors.ParagraphProcessor(self.parser)
    244 
    245 
    246         #self.prePatterns = []
    247 
    248         # Inline patterns - Run on the tree
    249         self.inlinePatterns = odict.OrderedDict()
    250         self.inlinePatterns["backtick"] = \
    251                 inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE)
    252         self.inlinePatterns["escape"] = \
    253                 inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE)
    254         self.inlinePatterns["reference"] = \
    255             inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self)
    256         self.inlinePatterns["link"] = \
    257                 inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self)
    258         self.inlinePatterns["image_link"] = \
    259                 inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self)
    260         self.inlinePatterns["image_reference"] = \
    261             inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self)
    262         self.inlinePatterns["autolink"] = \
    263             inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self)
    264         self.inlinePatterns["automail"] = \
    265             inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self)
    266         self.inlinePatterns["linebreak2"] = \
    267             inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br')
    268         self.inlinePatterns["linebreak"] = \
    269             inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br')
    270         self.inlinePatterns["html"] = \
    271                 inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self)
    272         self.inlinePatterns["entity"] = \
    273                 inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self)
    274         self.inlinePatterns["not_strong"] = \
    275                 inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE)
    276         self.inlinePatterns["strong_em"] = \
    277             inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em')
    278         self.inlinePatterns["strong"] = \
    279             inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong')
    280         self.inlinePatterns["emphasis"] = \
    281             inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em')
    282         self.inlinePatterns["emphasis2"] = \
    283             inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em')
    284         # The order of the handlers matters!!!
    285 
    286 
    287         # Tree processors - run once we have a basic parse.
    288         self.treeprocessors = odict.OrderedDict()
    289         self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self)
    290         self.treeprocessors["prettify"] = \
    291                 treeprocessors.PrettifyTreeprocessor(self)
    292 
    293         # Postprocessors - finishing touches.
    294         self.postprocessors = odict.OrderedDict()
    295         self.postprocessors["raw_html"] = \
    296                 postprocessors.RawHtmlPostprocessor(self)
    297         self.postprocessors["amp_substitute"] = \
    298                 postprocessors.AndSubstitutePostprocessor()
    299         # footnote postprocessor will be inserted with ">amp_substitute"
    300 
    301         # Map format keys to serializers
    302         self.output_formats = {
    303             'html'  : html4.to_html_string, 
    304             'html4' : html4.to_html_string,
    305             'xhtml' : etree.tostring, 
    306             'xhtml1': etree.tostring,
    307         }
    308 
    309         self.references = {}
    310         self.htmlStash = preprocessors.HtmlStash()
    311         self.registerExtensions(extensions = extensions,
    312                                 configs = extension_configs)
    313         self.set_output_format(output_format)
    314         self.reset()
    315 
    316     def registerExtensions(self, extensions, configs):
    317         """
    318         Register extensions with this instance of Markdown.
    319 
    320         Keyword aurguments:
    321 
    322         * extensions: A list of extensions, which can either
    323            be strings or objects.  See the docstring on Markdown.
    324         * configs: A dictionary mapping module names to config options.
    325 
    326         """
    327         for ext in extensions:
    328             if isinstance(ext, basestring):
    329                 ext = load_extension(ext, configs.get(ext, []))
    330             if isinstance(ext, Extension):
    331                 try:
    332                     ext.extendMarkdown(self, globals())
    333                 except NotImplementedError, e:
    334                     message(ERROR, e)
    335             else:
    336                 message(ERROR, 'Extension "%s.%s" must be of type: "markdown.Extension".' \
    337                     % (ext.__class__.__module__, ext.__class__.__name__))
    338 
    339     def registerExtension(self, extension):
    340         """ This gets called by the extension """
    341         self.registeredExtensions.append(extension)
    342 
    343     def reset(self):
    344         """
    345         Resets all state variables so that we can start with a new text.
    346         """
    347         self.htmlStash.reset()
    348         self.references.clear()
    349 
    350         for extension in self.registeredExtensions:
    351             extension.reset()
    352 
    353     def set_output_format(self, format):
    354         """ Set the output format for the class instance. """
    355         try:
    356             self.serializer = self.output_formats[format.lower()]
    357         except KeyError:
    358             message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \
    359                                % (format, self.output_formats.keys()))
    360 
    361     def convert(self, source):
    362         """
    363         Convert markdown to serialized XHTML or HTML.
    364 
    365         Keyword arguments:
    366 
    367         * source: Source text as a Unicode string.
    368 
    369         """
    370 
    371         # Fixup the source text
    372         if not source.strip():
    373             return u""  # a blank unicode string
    374         try:
    375             source = unicode(source)
    376         except UnicodeDecodeError:
    377             message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
    378             return u""
    379 
    380         source = source.replace(STX, "").replace(ETX, "")
    381         source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
    382         source = re.sub(r'\n\s+\n', '\n\n', source)
    383         source = source.expandtabs(TAB_LENGTH)
    384 
    385         # Split into lines and run the line preprocessors.
    386         self.lines = source.split("\n")
    387         for prep in self.preprocessors.values():
    388             self.lines = prep.run(self.lines)
    389 
    390         # Parse the high-level elements.
    391         root = self.parser.parseDocument(self.lines).getroot()
    392 
    393         # Run the tree-processors
    394         for treeprocessor in self.treeprocessors.values():
    395             newRoot = treeprocessor.run(root)
    396             if newRoot:
    397                 root = newRoot
    398 
    399         # Serialize _properly_.  Strip top-level tags.
    400         output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
    401         if self.stripTopLevelTags:
    402             try:
    403                 start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
    404                 end = output.rindex('</%s>'%DOC_TAG)
    405                 output = output[start:end].strip()
    406             except ValueError:
    407                 if output.strip().endswith('<%s />'%DOC_TAG):
    408                     # We have an empty document
    409                     output = ''
    410                 else:
    411                     # We have a serious problem
    412                     message(CRITICAL, 'Failed to strip top level tags.')
    413 
    414         # Run the text post-processors
    415         for pp in self.postprocessors.values():
    416             output = pp.run(output)
    417 
    418         return output.strip()
    419 
    420     def convertFile(self, input=None, output=None, encoding=None):
    421         """Converts a markdown file and returns the HTML as a unicode string.
    422 
    423         Decodes the file using the provided encoding (defaults to utf-8),
    424         passes the file content to markdown, and outputs the html to either
    425         the provided stream or the file with provided name, using the same
    426         encoding as the source file.
    427 
    428         **Note:** This is the only place that decoding and encoding of unicode
    429         takes place in Python-Markdown.  (All other code is unicode-in /
    430         unicode-out.)
    431 
    432         Keyword arguments:
    433 
    434         * input: Name of source text file.
    435         * output: Name of output file. Writes to stdout if `None`.
    436         * encoding: Encoding of input and output files. Defaults to utf-8.
    437 
    438         """
    439 
    440         encoding = encoding or "utf-8"
    441 
    442         # Read the source
    443         input_file = codecs.open(input, mode="r", encoding=encoding)
    444         text = input_file.read()
    445         input_file.close()
    446         text = text.lstrip(u'\ufeff') # remove the byte-order mark
    447 
    448         # Convert
    449         html = self.convert(text)
    450 
    451         # Write to file or stdout
    452         if isinstance(output, (str, unicode)):
    453             output_file = codecs.open(output, "w", encoding=encoding)
    454             output_file.write(html)
    455             output_file.close()
    456         else:
    457             output.write(html.encode(encoding))
    458 
    459 
    460 """
    461 Extensions
    462 -----------------------------------------------------------------------------
    463 """
    464 
    465 class Extension:
    466     """ Base class for extensions to subclass. """
    467     def __init__(self, configs = {}):
    468         """Create an instance of an Extention.
    469 
    470         Keyword arguments:
    471 
    472         * configs: A dict of configuration setting used by an Extension.
    473         """
    474         self.config = configs
    475 
    476     def getConfig(self, key):
    477         """ Return a setting for the given key or an empty string. """
    478         if key in self.config:
    479             return self.config[key][0]
    480         else:
    481             return ""
    482 
    483     def getConfigInfo(self):
    484         """ Return all config settings as a list of tuples. """
    485         return [(key, self.config[key][1]) for key in self.config.keys()]
    486 
    487     def setConfig(self, key, value):
    488         """ Set a config setting for `key` with the given `value`. """
    489         self.config[key][0] = value
    490 
    491     def extendMarkdown(self, md, md_globals):
    492         """
    493         Add the various proccesors and patterns to the Markdown Instance.
    494 
    495         This method must be overriden by every extension.
    496 
    497         Keyword arguments:
    498 
    499         * md: The Markdown instance.
    500 
    501         * md_globals: Global variables in the markdown module namespace.
    502 
    503         """
    504         raise NotImplementedError, 'Extension "%s.%s" must define an "extendMarkdown"' \
    505             'method.' % (self.__class__.__module__, self.__class__.__name__)
    506 
    507 
    508 def load_extension(ext_name, configs = []):
    509     """Load extension by name, then return the module.
    510 
    511     The extension name may contain arguments as part of the string in the
    512     following format: "extname(key1=value1,key2=value2)"
    513 
    514     """
    515 
    516     # Parse extensions config params (ignore the order)
    517     configs = dict(configs)
    518     pos = ext_name.find("(") # find the first "("
    519     if pos > 0:
    520         ext_args = ext_name[pos+1:-1]
    521         ext_name = ext_name[:pos]
    522         pairs = [x.split("=") for x in ext_args.split(",")]
    523         configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
    524 
    525     # Setup the module names
    526     ext_module = 'markdown.extensions'
    527     module_name_new_style = '.'.join([ext_module, ext_name])
    528     module_name_old_style = '_'.join(['mdx', ext_name])
    529 
    530     # Try loading the extention first from one place, then another
    531     try: # New style (markdown.extensons.<extension>)
    532         module = __import__(module_name_new_style, {}, {}, [ext_module])
    533     except ImportError:
    534         try: # Old style (mdx.<extension>)
    535             module = __import__(module_name_old_style)
    536         except ImportError:
    537            message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
    538                % (ext_name, module_name_new_style, module_name_old_style))
    539            # Return None so we don't try to initiate none-existant extension
    540            return None
    541 
    542     # If the module is loaded successfully, we expect it to define a
    543     # function called makeExtension()
    544     try:
    545         return module.makeExtension(configs.items())
    546     except AttributeError:
    547         message(CRITICAL, "Failed to initiate extension '%s'" % ext_name)
    548 
    549 
    550 def load_extensions(ext_names):
    551     """Loads multiple extensions"""
    552     extensions = []
    553     for ext_name in ext_names:
    554         extension = load_extension(ext_name)
    555         if extension:
    556             extensions.append(extension)
    557     return extensions
    558 
    559 
    560 """
    561 EXPORTED FUNCTIONS
    562 =============================================================================
    563 
    564 Those are the two functions we really mean to export: markdown() and
    565 markdownFromFile().
    566 """
    567 
    568 def markdown(text,
    569              extensions = [],
    570              safe_mode = False,
    571              output_format = DEFAULT_OUTPUT_FORMAT):
    572     """Convert a markdown string to HTML and return HTML as a unicode string.
    573 
    574     This is a shortcut function for `Markdown` class to cover the most
    575     basic use case.  It initializes an instance of Markdown, loads the
    576     necessary extensions and runs the parser on the given text.
    577 
    578     Keyword arguments:
    579 
    580     * text: Markdown formatted text as Unicode or ASCII string.
    581     * extensions: A list of extensions or extension names (may contain config args).
    582     * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".
    583     * output_format: Format of output. Supported formats are:
    584         * "xhtml1": Outputs XHTML 1.x. Default.
    585         * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
    586         * "html4": Outputs HTML 4
    587         * "html": Outputs latest supported version of HTML (currently HTML 4).
    588         Note that it is suggested that the more specific formats ("xhtml1" 
    589         and "html4") be used as "xhtml" or "html" may change in the future
    590         if it makes sense at that time. 
    591 
    592     Returns: An HTML document as a string.
    593 
    594     """
    595     md = Markdown(extensions=load_extensions(extensions),
    596                   safe_mode=safe_mode, 
    597                   output_format=output_format)
    598     return md.convert(text)
    599 
    600 
    601 def markdownFromFile(input = None,
    602                      output = None,
    603                      extensions = [],
    604                      encoding = None,
    605                      safe_mode = False,
    606                      output_format = DEFAULT_OUTPUT_FORMAT):
    607     """Read markdown code from a file and write it to a file or a stream."""
    608     md = Markdown(extensions=load_extensions(extensions), 
    609                   safe_mode=safe_mode,
    610                   output_format=output_format)
    611     md.convertFile(input, output, encoding)
    612 
    613 
    614 
    615