"""
Python Markdown
===============

Python Markdown converts Markdown to HTML and can be used as a library or
called from the command line.

## Basic usage as a module:

    import markdown
    md = markdown.Markdown()
    html = md.convert(your_text_string)

## Basic use from the command line:

    markdown source.txt > destination.html

Run "markdown --help" to see more options.

## Extensions

See <http://www.freewisdom.org/projects/python-markdown/> for more
information and instructions on how to extend the functionality of
Python Markdown.  Read that before you try modifying this file.

## Authors and License

Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
maintained  by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).

Contact: markdown (at] freewisdom.org

Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 200? Django Software Foundation (OrderedDict implementation)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see docs/LICENSE for details).
"""

version = "2.0.3"
version_info = (2,0,3, "Final")

import re
import codecs
import sys
import warnings
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL


"""
CONSTANTS
=============================================================================
"""

"""
Constants you might want to modify
-----------------------------------------------------------------------------
"""

# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
# NOTE: the pattern is unanchored; used with .match() it accepts any tag
# that merely *starts with* one of the names below (see isBlockLevel()).
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  "|script|noscript|form|fieldset|iframe|math"
                                  "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
                                  "|tr|th|td")
DOC_TAG = "div"     # Element used to wrap document - later removed

# Placeholders
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX


"""
Constants you probably do not need to change
-----------------------------------------------------------------------------
"""

RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
                     # Hebrew (0590-05FF), Arabic (0600-06FF),
                     # Syriac (0700-074F), Arabic supplement (0750-077F),
                     # Thaana (0780-07BF), Nko (07C0-07FF).
                    (u'\u2D30', u'\u2D7F'), # Tifinagh
                    )


"""
AUXILIARY GLOBAL FUNCTIONS
=============================================================================
"""


def message(level, text):
    """Report `text` at the given logging `level`.

    Behaviour depends on whether the 'MARKDOWN' logger has handlers:

    * Handlers configured: the message is logged, and for any level above
      WARN the process exits through `sys.exit`.
    * No handlers, level above WARN: a `MarkdownException` is raised.
    * No handlers, level WARN or below: a `MarkdownWarning` is issued
      through the `warnings` module.

    Keyword arguments:

    * level: One of the `logging` levels (DEBUG, INFO, WARN, ERROR, CRITICAL).
    * text: The message to report.

    """
    logger = logging.getLogger('MARKDOWN')
    if logger.handlers:
        # The logger is configured
        logger.log(level, text)
        if level > WARN:
            # NOTE(review): exits with status 0 although an error was just
            # logged -- confirm whether a non-zero status is wanted here.
            sys.exit(0)
    elif level > WARN:
        # Call form instead of the Python-2-only "raise Class, value".
        raise MarkdownException(text)
    else:
        warnings.warn(text, MarkdownWarning)


def isBlockLevel(tag):
    """Check if the tag is a block level HTML tag.

    Returns the result of `BLOCK_LEVEL_ELEMENTS.match(tag)`: a match object
    (truthy) or None.  Because the pattern is unanchored, a tag that only
    begins with one of the listed names is also reported as block level.

    """
    return BLOCK_LEVEL_ELEMENTS.match(tag)

"""
MISC AUXILIARY CLASSES
=============================================================================
"""

class AtomicString(unicode):
    """A string which should not be further processed."""
    pass


class MarkdownException(Exception):
    """ A Markdown Exception. """
    pass


class MarkdownWarning(Warning):
    """ A Markdown Warning. """
    pass


"""
OVERALL DESIGN
=============================================================================

Markdown processing takes place in five steps:

1. A bunch of "preprocessors" munge the input text.
2. BlockParser() parses the high-level structural elements of the
   pre-processed text into an ElementTree.
3. A bunch of "treeprocessors" are run against the ElementTree. One such
   treeprocessor runs InlinePatterns against the ElementTree, detecting inline
   markup.
4. Some post-processors are run against the text after the ElementTree has
   been serialized into text.
5. The output is written to a string.

Those steps are put together by the Markdown() class.

"""

import preprocessors
import blockprocessors
import treeprocessors
import inlinepatterns
import postprocessors
import blockparser
import etree_loader
import odict

# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`).  Do not import it by yourself.
172 173 etree = etree_loader.importETree() 174 175 # Adds the ability to output html4 176 import html4 177 178 179 class Markdown: 180 """Convert Markdown to HTML.""" 181 182 def __init__(self, 183 extensions=[], 184 extension_configs={}, 185 safe_mode = False, 186 output_format=DEFAULT_OUTPUT_FORMAT): 187 """ 188 Creates a new Markdown instance. 189 190 Keyword arguments: 191 192 * extensions: A list of extensions. 193 If they are of type string, the module mdx_name.py will be loaded. 194 If they are a subclass of markdown.Extension, they will be used 195 as-is. 196 * extension-configs: Configuration setting for extensions. 197 * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". 198 * output_format: Format of output. Supported formats are: 199 * "xhtml1": Outputs XHTML 1.x. Default. 200 * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). 201 * "html4": Outputs HTML 4 202 * "html": Outputs latest supported version of HTML (currently HTML 4). 203 Note that it is suggested that the more specific formats ("xhtml1" 204 and "html4") be used as "xhtml" or "html" may change in the future 205 if it makes sense at that time. 
206 207 """ 208 209 self.safeMode = safe_mode 210 self.registeredExtensions = [] 211 self.docType = "" 212 self.stripTopLevelTags = True 213 214 # Preprocessors 215 self.preprocessors = odict.OrderedDict() 216 self.preprocessors["html_block"] = \ 217 preprocessors.HtmlBlockPreprocessor(self) 218 self.preprocessors["reference"] = \ 219 preprocessors.ReferencePreprocessor(self) 220 # footnote preprocessor will be inserted with "<reference" 221 222 # Block processors - ran by the parser 223 self.parser = blockparser.BlockParser() 224 self.parser.blockprocessors['empty'] = \ 225 blockprocessors.EmptyBlockProcessor(self.parser) 226 self.parser.blockprocessors['indent'] = \ 227 blockprocessors.ListIndentProcessor(self.parser) 228 self.parser.blockprocessors['code'] = \ 229 blockprocessors.CodeBlockProcessor(self.parser) 230 self.parser.blockprocessors['hashheader'] = \ 231 blockprocessors.HashHeaderProcessor(self.parser) 232 self.parser.blockprocessors['setextheader'] = \ 233 blockprocessors.SetextHeaderProcessor(self.parser) 234 self.parser.blockprocessors['hr'] = \ 235 blockprocessors.HRProcessor(self.parser) 236 self.parser.blockprocessors['olist'] = \ 237 blockprocessors.OListProcessor(self.parser) 238 self.parser.blockprocessors['ulist'] = \ 239 blockprocessors.UListProcessor(self.parser) 240 self.parser.blockprocessors['quote'] = \ 241 blockprocessors.BlockQuoteProcessor(self.parser) 242 self.parser.blockprocessors['paragraph'] = \ 243 blockprocessors.ParagraphProcessor(self.parser) 244 245 246 #self.prePatterns = [] 247 248 # Inline patterns - Run on the tree 249 self.inlinePatterns = odict.OrderedDict() 250 self.inlinePatterns["backtick"] = \ 251 inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE) 252 self.inlinePatterns["escape"] = \ 253 inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE) 254 self.inlinePatterns["reference"] = \ 255 inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self) 256 self.inlinePatterns["link"] = \ 257 
inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self) 258 self.inlinePatterns["image_link"] = \ 259 inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self) 260 self.inlinePatterns["image_reference"] = \ 261 inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self) 262 self.inlinePatterns["autolink"] = \ 263 inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self) 264 self.inlinePatterns["automail"] = \ 265 inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self) 266 self.inlinePatterns["linebreak2"] = \ 267 inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br') 268 self.inlinePatterns["linebreak"] = \ 269 inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br') 270 self.inlinePatterns["html"] = \ 271 inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self) 272 self.inlinePatterns["entity"] = \ 273 inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self) 274 self.inlinePatterns["not_strong"] = \ 275 inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE) 276 self.inlinePatterns["strong_em"] = \ 277 inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em') 278 self.inlinePatterns["strong"] = \ 279 inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong') 280 self.inlinePatterns["emphasis"] = \ 281 inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em') 282 self.inlinePatterns["emphasis2"] = \ 283 inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em') 284 # The order of the handlers matters!!! 285 286 287 # Tree processors - run once we have a basic parse. 288 self.treeprocessors = odict.OrderedDict() 289 self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self) 290 self.treeprocessors["prettify"] = \ 291 treeprocessors.PrettifyTreeprocessor(self) 292 293 # Postprocessors - finishing touches. 
294 self.postprocessors = odict.OrderedDict() 295 self.postprocessors["raw_html"] = \ 296 postprocessors.RawHtmlPostprocessor(self) 297 self.postprocessors["amp_substitute"] = \ 298 postprocessors.AndSubstitutePostprocessor() 299 # footnote postprocessor will be inserted with ">amp_substitute" 300 301 # Map format keys to serializers 302 self.output_formats = { 303 'html' : html4.to_html_string, 304 'html4' : html4.to_html_string, 305 'xhtml' : etree.tostring, 306 'xhtml1': etree.tostring, 307 } 308 309 self.references = {} 310 self.htmlStash = preprocessors.HtmlStash() 311 self.registerExtensions(extensions = extensions, 312 configs = extension_configs) 313 self.set_output_format(output_format) 314 self.reset() 315 316 def registerExtensions(self, extensions, configs): 317 """ 318 Register extensions with this instance of Markdown. 319 320 Keyword aurguments: 321 322 * extensions: A list of extensions, which can either 323 be strings or objects. See the docstring on Markdown. 324 * configs: A dictionary mapping module names to config options. 325 326 """ 327 for ext in extensions: 328 if isinstance(ext, basestring): 329 ext = load_extension(ext, configs.get(ext, [])) 330 if isinstance(ext, Extension): 331 try: 332 ext.extendMarkdown(self, globals()) 333 except NotImplementedError, e: 334 message(ERROR, e) 335 else: 336 message(ERROR, 'Extension "%s.%s" must be of type: "markdown.Extension".' \ 337 % (ext.__class__.__module__, ext.__class__.__name__)) 338 339 def registerExtension(self, extension): 340 """ This gets called by the extension """ 341 self.registeredExtensions.append(extension) 342 343 def reset(self): 344 """ 345 Resets all state variables so that we can start with a new text. 346 """ 347 self.htmlStash.reset() 348 self.references.clear() 349 350 for extension in self.registeredExtensions: 351 extension.reset() 352 353 def set_output_format(self, format): 354 """ Set the output format for the class instance. 
""" 355 try: 356 self.serializer = self.output_formats[format.lower()] 357 except KeyError: 358 message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \ 359 % (format, self.output_formats.keys())) 360 361 def convert(self, source): 362 """ 363 Convert markdown to serialized XHTML or HTML. 364 365 Keyword arguments: 366 367 * source: Source text as a Unicode string. 368 369 """ 370 371 # Fixup the source text 372 if not source.strip(): 373 return u"" # a blank unicode string 374 try: 375 source = unicode(source) 376 except UnicodeDecodeError: 377 message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') 378 return u"" 379 380 source = source.replace(STX, "").replace(ETX, "") 381 source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" 382 source = re.sub(r'\n\s+\n', '\n\n', source) 383 source = source.expandtabs(TAB_LENGTH) 384 385 # Split into lines and run the line preprocessors. 386 self.lines = source.split("\n") 387 for prep in self.preprocessors.values(): 388 self.lines = prep.run(self.lines) 389 390 # Parse the high-level elements. 391 root = self.parser.parseDocument(self.lines).getroot() 392 393 # Run the tree-processors 394 for treeprocessor in self.treeprocessors.values(): 395 newRoot = treeprocessor.run(root) 396 if newRoot: 397 root = newRoot 398 399 # Serialize _properly_. Strip top-level tags. 
400 output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8")) 401 if self.stripTopLevelTags: 402 try: 403 start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2 404 end = output.rindex('</%s>'%DOC_TAG) 405 output = output[start:end].strip() 406 except ValueError: 407 if output.strip().endswith('<%s />'%DOC_TAG): 408 # We have an empty document 409 output = '' 410 else: 411 # We have a serious problem 412 message(CRITICAL, 'Failed to strip top level tags.') 413 414 # Run the text post-processors 415 for pp in self.postprocessors.values(): 416 output = pp.run(output) 417 418 return output.strip() 419 420 def convertFile(self, input=None, output=None, encoding=None): 421 """Converts a markdown file and returns the HTML as a unicode string. 422 423 Decodes the file using the provided encoding (defaults to utf-8), 424 passes the file content to markdown, and outputs the html to either 425 the provided stream or the file with provided name, using the same 426 encoding as the source file. 427 428 **Note:** This is the only place that decoding and encoding of unicode 429 takes place in Python-Markdown. (All other code is unicode-in / 430 unicode-out.) 431 432 Keyword arguments: 433 434 * input: Name of source text file. 435 * output: Name of output file. Writes to stdout if `None`. 436 * encoding: Encoding of input and output files. Defaults to utf-8. 
437 438 """ 439 440 encoding = encoding or "utf-8" 441 442 # Read the source 443 input_file = codecs.open(input, mode="r", encoding=encoding) 444 text = input_file.read() 445 input_file.close() 446 text = text.lstrip(u'\ufeff') # remove the byte-order mark 447 448 # Convert 449 html = self.convert(text) 450 451 # Write to file or stdout 452 if isinstance(output, (str, unicode)): 453 output_file = codecs.open(output, "w", encoding=encoding) 454 output_file.write(html) 455 output_file.close() 456 else: 457 output.write(html.encode(encoding)) 458 459 460 """ 461 Extensions 462 ----------------------------------------------------------------------------- 463 """ 464 465 class Extension: 466 """ Base class for extensions to subclass. """ 467 def __init__(self, configs = {}): 468 """Create an instance of an Extention. 469 470 Keyword arguments: 471 472 * configs: A dict of configuration setting used by an Extension. 473 """ 474 self.config = configs 475 476 def getConfig(self, key): 477 """ Return a setting for the given key or an empty string. """ 478 if key in self.config: 479 return self.config[key][0] 480 else: 481 return "" 482 483 def getConfigInfo(self): 484 """ Return all config settings as a list of tuples. """ 485 return [(key, self.config[key][1]) for key in self.config.keys()] 486 487 def setConfig(self, key, value): 488 """ Set a config setting for `key` with the given `value`. """ 489 self.config[key][0] = value 490 491 def extendMarkdown(self, md, md_globals): 492 """ 493 Add the various proccesors and patterns to the Markdown Instance. 494 495 This method must be overriden by every extension. 496 497 Keyword arguments: 498 499 * md: The Markdown instance. 500 501 * md_globals: Global variables in the markdown module namespace. 502 503 """ 504 raise NotImplementedError, 'Extension "%s.%s" must define an "extendMarkdown"' \ 505 'method.' 
% (self.__class__.__module__, self.__class__.__name__) 506 507 508 def load_extension(ext_name, configs = []): 509 """Load extension by name, then return the module. 510 511 The extension name may contain arguments as part of the string in the 512 following format: "extname(key1=value1,key2=value2)" 513 514 """ 515 516 # Parse extensions config params (ignore the order) 517 configs = dict(configs) 518 pos = ext_name.find("(") # find the first "(" 519 if pos > 0: 520 ext_args = ext_name[pos+1:-1] 521 ext_name = ext_name[:pos] 522 pairs = [x.split("=") for x in ext_args.split(",")] 523 configs.update([(x.strip(), y.strip()) for (x, y) in pairs]) 524 525 # Setup the module names 526 ext_module = 'markdown.extensions' 527 module_name_new_style = '.'.join([ext_module, ext_name]) 528 module_name_old_style = '_'.join(['mdx', ext_name]) 529 530 # Try loading the extention first from one place, then another 531 try: # New style (markdown.extensons.<extension>) 532 module = __import__(module_name_new_style, {}, {}, [ext_module]) 533 except ImportError: 534 try: # Old style (mdx.<extension>) 535 module = __import__(module_name_old_style) 536 except ImportError: 537 message(WARN, "Failed loading extension '%s' from '%s' or '%s'" 538 % (ext_name, module_name_new_style, module_name_old_style)) 539 # Return None so we don't try to initiate none-existant extension 540 return None 541 542 # If the module is loaded successfully, we expect it to define a 543 # function called makeExtension() 544 try: 545 return module.makeExtension(configs.items()) 546 except AttributeError: 547 message(CRITICAL, "Failed to initiate extension '%s'" % ext_name) 548 549 550 def load_extensions(ext_names): 551 """Loads multiple extensions""" 552 extensions = [] 553 for ext_name in ext_names: 554 extension = load_extension(ext_name) 555 if extension: 556 extensions.append(extension) 557 return extensions 558 559 560 """ 561 EXPORTED FUNCTIONS 562 
============================================================================= 563 564 Those are the two functions we really mean to export: markdown() and 565 markdownFromFile(). 566 """ 567 568 def markdown(text, 569 extensions = [], 570 safe_mode = False, 571 output_format = DEFAULT_OUTPUT_FORMAT): 572 """Convert a markdown string to HTML and return HTML as a unicode string. 573 574 This is a shortcut function for `Markdown` class to cover the most 575 basic use case. It initializes an instance of Markdown, loads the 576 necessary extensions and runs the parser on the given text. 577 578 Keyword arguments: 579 580 * text: Markdown formatted text as Unicode or ASCII string. 581 * extensions: A list of extensions or extension names (may contain config args). 582 * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". 583 * output_format: Format of output. Supported formats are: 584 * "xhtml1": Outputs XHTML 1.x. Default. 585 * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). 586 * "html4": Outputs HTML 4 587 * "html": Outputs latest supported version of HTML (currently HTML 4). 588 Note that it is suggested that the more specific formats ("xhtml1" 589 and "html4") be used as "xhtml" or "html" may change in the future 590 if it makes sense at that time. 591 592 Returns: An HTML document as a string. 593 594 """ 595 md = Markdown(extensions=load_extensions(extensions), 596 safe_mode=safe_mode, 597 output_format=output_format) 598 return md.convert(text) 599 600 601 def markdownFromFile(input = None, 602 output = None, 603 extensions = [], 604 encoding = None, 605 safe_mode = False, 606 output_format = DEFAULT_OUTPUT_FORMAT): 607 """Read markdown code from a file and write it to a file or a stream.""" 608 md = Markdown(extensions=load_extensions(extensions), 609 safe_mode=safe_mode, 610 output_format=output_format) 611 md.convertFile(input, output, encoding) 612 613 614 615