Home | History | Annotate | Download | only in markdown
      1 """
CORE MARKDOWN BLOCKPARSER
=============================================================================

This parser handles basic parsing of Markdown blocks.  It doesn't concern itself
with inline elements such as **bold** or *italics*, but rather just catches
blocks, lists, quotes, etc.

The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.
     12 
     13 """
     14 
     15 import re
     16 import markdown
     17 
     18 class BlockProcessor:
     19     """ Base class for block processors. 
     20     
     21     Each subclass will provide the methods below to work with the source and
     22     tree. Each processor will need to define it's own ``test`` and ``run``
     23     methods. The ``test`` method should return True or False, to indicate
     24     whether the current block should be processed by this processor. If the
     25     test passes, the parser will call the processors ``run`` method.
     26 
     27     """
     28 
     29     def __init__(self, parser=None):
     30         self.parser = parser
     31 
     32     def lastChild(self, parent):
     33         """ Return the last child of an etree element. """
     34         if len(parent):
     35             return parent[-1]
     36         else:
     37             return None
     38 
     39     def detab(self, text):
     40         """ Remove a tab from the front of each line of the given text. """
     41         newtext = []
     42         lines = text.split('\n')
     43         for line in lines:
     44             if line.startswith(' '*markdown.TAB_LENGTH):
     45                 newtext.append(line[markdown.TAB_LENGTH:])
     46             elif not line.strip():
     47                 newtext.append('')
     48             else:
     49                 break
     50         return '\n'.join(newtext), '\n'.join(lines[len(newtext):])
     51 
     52     def looseDetab(self, text, level=1):
     53         """ Remove a tab from front of lines but allowing dedented lines. """
     54         lines = text.split('\n')
     55         for i in range(len(lines)):
     56             if lines[i].startswith(' '*markdown.TAB_LENGTH*level):
     57                 lines[i] = lines[i][markdown.TAB_LENGTH*level:]
     58         return '\n'.join(lines)
     59 
     60     def test(self, parent, block):
     61         """ Test for block type. Must be overridden by subclasses. 
     62         
     63         As the parser loops through processors, it will call the ``test`` method
     64         on each to determine if the given block of text is of that type. This
     65         method must return a boolean ``True`` or ``False``. The actual method of
     66         testing is left to the needs of that particular block type. It could 
     67         be as simple as ``block.startswith(some_string)`` or a complex regular
     68         expression. As the block type may be different depending on the parent
     69         of the block (i.e. inside a list), the parent etree element is also 
     70         provided and may be used as part of the test.
     71 
     72         Keywords:
     73         
     74         * ``parent``: A etree element which will be the parent of the block.
     75         * ``block``: A block of text from the source which has been split at 
     76             blank lines.
     77         """
     78         pass
     79 
     80     def run(self, parent, blocks):
     81         """ Run processor. Must be overridden by subclasses. 
     82         
     83         When the parser determines the appropriate type of a block, the parser
     84         will call the corresponding processor's ``run`` method. This method
     85         should parse the individual lines of the block and append them to
     86         the etree. 
     87 
     88         Note that both the ``parent`` and ``etree`` keywords are pointers
     89         to instances of the objects which should be edited in place. Each
     90         processor must make changes to the existing objects as there is no
     91         mechanism to return new/different objects to replace them.
     92 
     93         This means that this method should be adding SubElements or adding text
     94         to the parent, and should remove (``pop``) or add (``insert``) items to
     95         the list of blocks.
     96 
     97         Keywords:
     98 
     99         * ``parent``: A etree element which is the parent of the current block.
    100         * ``blocks``: A list of all remaining blocks of the document.
    101         """
    102         pass
    103 
    104 
    105 class ListIndentProcessor(BlockProcessor):
    106     """ Process children of list items. 
    107     
    108     Example:
    109         * a list item
    110             process this part
    111 
    112             or this part
    113 
    114     """
    115 
    116     INDENT_RE = re.compile(r'^(([ ]{%s})+)'% markdown.TAB_LENGTH)
    117     ITEM_TYPES = ['li']
    118     LIST_TYPES = ['ul', 'ol']
    119 
    120     def test(self, parent, block):
    121         return block.startswith(' '*markdown.TAB_LENGTH) and \
    122                 not self.parser.state.isstate('detabbed') and  \
    123                 (parent.tag in self.ITEM_TYPES or \
    124                     (len(parent) and parent[-1] and \
    125                         (parent[-1].tag in self.LIST_TYPES)
    126                     )
    127                 )
    128 
    129     def run(self, parent, blocks):
    130         block = blocks.pop(0)
    131         level, sibling = self.get_level(parent, block)
    132         block = self.looseDetab(block, level)
    133 
    134         self.parser.state.set('detabbed')
    135         if parent.tag in self.ITEM_TYPES:
    136             # The parent is already a li. Just parse the child block.
    137             self.parser.parseBlocks(parent, [block])
    138         elif sibling.tag in self.ITEM_TYPES:
    139             # The sibling is a li. Use it as parent.
    140             self.parser.parseBlocks(sibling, [block])
    141         elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
    142             # The parent is a list (``ol`` or ``ul``) which has children.
    143             # Assume the last child li is the parent of this block.
    144             if sibling[-1].text:
    145                 # If the parent li has text, that text needs to be moved to a p
    146                 block = '%s\n\n%s' % (sibling[-1].text, block)
    147                 sibling[-1].text = ''
    148             self.parser.parseChunk(sibling[-1], block)
    149         else:
    150             self.create_item(sibling, block)
    151         self.parser.state.reset()
    152 
    153     def create_item(self, parent, block):
    154         """ Create a new li and parse the block with it as the parent. """
    155         li = markdown.etree.SubElement(parent, 'li')
    156         self.parser.parseBlocks(li, [block])
    157  
    158     def get_level(self, parent, block):
    159         """ Get level of indent based on list level. """
    160         # Get indent level
    161         m = self.INDENT_RE.match(block)
    162         if m:
    163             indent_level = len(m.group(1))/markdown.TAB_LENGTH
    164         else:
    165             indent_level = 0
    166         if self.parser.state.isstate('list'):
    167             # We're in a tightlist - so we already are at correct parent.
    168             level = 1
    169         else:
    170             # We're in a looselist - so we need to find parent.
    171             level = 0
    172         # Step through children of tree to find matching indent level.
    173         while indent_level > level:
    174             child = self.lastChild(parent)
    175             if child and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES):
    176                 if child.tag in self.LIST_TYPES:
    177                     level += 1
    178                 parent = child
    179             else:
    180                 # No more child levels. If we're short of indent_level,
    181                 # we have a code block. So we stop here.
    182                 break
    183         return level, parent
    184 
    185 
    186 class CodeBlockProcessor(BlockProcessor):
    187     """ Process code blocks. """
    188 
    189     def test(self, parent, block):
    190         return block.startswith(' '*markdown.TAB_LENGTH)
    191     
    192     def run(self, parent, blocks):
    193         sibling = self.lastChild(parent)
    194         block = blocks.pop(0)
    195         theRest = ''
    196         if sibling and sibling.tag == "pre" and len(sibling) \
    197                     and sibling[0].tag == "code":
    198             # The previous block was a code block. As blank lines do not start
    199             # new code blocks, append this block to the previous, adding back
    200             # linebreaks removed from the split into a list.
    201             code = sibling[0]
    202             block, theRest = self.detab(block)
    203             code.text = markdown.AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
    204         else:
    205             # This is a new codeblock. Create the elements and insert text.
    206             pre = markdown.etree.SubElement(parent, 'pre')
    207             code = markdown.etree.SubElement(pre, 'code')
    208             block, theRest = self.detab(block)
    209             code.text = markdown.AtomicString('%s\n' % block.rstrip())
    210         if theRest:
    211             # This block contained unindented line(s) after the first indented 
    212             # line. Insert these lines as the first block of the master blocks
    213             # list for future processing.
    214             blocks.insert(0, theRest)
    215 
    216 
    217 class BlockQuoteProcessor(BlockProcessor):
    218 
    219     RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
    220 
    221     def test(self, parent, block):
    222         return bool(self.RE.search(block))
    223 
    224     def run(self, parent, blocks):
    225         block = blocks.pop(0)
    226         m = self.RE.search(block)
    227         if m:
    228             before = block[:m.start()] # Lines before blockquote
    229             # Pass lines before blockquote in recursively for parsing forst.
    230             self.parser.parseBlocks(parent, [before])
    231             # Remove ``> `` from begining of each line.
    232             block = '\n'.join([self.clean(line) for line in 
    233                             block[m.start():].split('\n')])
    234         sibling = self.lastChild(parent)
    235         if sibling and sibling.tag == "blockquote":
    236             # Previous block was a blockquote so set that as this blocks parent
    237             quote = sibling
    238         else:
    239             # This is a new blockquote. Create a new parent element.
    240             quote = markdown.etree.SubElement(parent, 'blockquote')
    241         # Recursively parse block with blockquote as parent.
    242         self.parser.parseChunk(quote, block)
    243 
    244     def clean(self, line):
    245         """ Remove ``>`` from beginning of a line. """
    246         m = self.RE.match(line)
    247         if line.strip() == ">":
    248             return ""
    249         elif m:
    250             return m.group(2)
    251         else:
    252             return line
    253 
    254 class OListProcessor(BlockProcessor):
    255     """ Process ordered list blocks. """
    256 
    257     TAG = 'ol'
    258     # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    259     RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    260     # Detect items on secondary lines. they can be of either list type.
    261     CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    262     # Detect indented (nested) items of either type
    263     INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
    264 
    265     def test(self, parent, block):
    266         return bool(self.RE.match(block))
    267 
    268     def run(self, parent, blocks):
    269         # Check fr multiple items in one block.
    270         items = self.get_items(blocks.pop(0))
    271         sibling = self.lastChild(parent)
    272         if sibling and sibling.tag in ['ol', 'ul']:
    273             # Previous block was a list item, so set that as parent
    274             lst = sibling
    275             # make sure previous item is in a p.
    276             if len(lst) and lst[-1].text and not len(lst[-1]):
    277                 p = markdown.etree.SubElement(lst[-1], 'p')
    278                 p.text = lst[-1].text
    279                 lst[-1].text = ''
    280             # parse first block differently as it gets wrapped in a p.
    281             li = markdown.etree.SubElement(lst, 'li')
    282             self.parser.state.set('looselist')
    283             firstitem = items.pop(0)
    284             self.parser.parseBlocks(li, [firstitem])
    285             self.parser.state.reset()
    286         else:
    287             # This is a new list so create parent with appropriate tag.
    288             lst = markdown.etree.SubElement(parent, self.TAG)
    289         self.parser.state.set('list')
    290         # Loop through items in block, recursively parsing each with the
    291         # appropriate parent.
    292         for item in items:
    293             if item.startswith(' '*markdown.TAB_LENGTH):
    294                 # Item is indented. Parse with last item as parent
    295                 self.parser.parseBlocks(lst[-1], [item])
    296             else:
    297                 # New item. Create li and parse with it as parent
    298                 li = markdown.etree.SubElement(lst, 'li')
    299                 self.parser.parseBlocks(li, [item])
    300         self.parser.state.reset()
    301 
    302     def get_items(self, block):
    303         """ Break a block into list items. """
    304         items = []
    305         for line in block.split('\n'):
    306             m = self.CHILD_RE.match(line)
    307             if m:
    308                 # This is a new item. Append
    309                 items.append(m.group(3))
    310             elif self.INDENT_RE.match(line):
    311                 # This is an indented (possibly nested) item.
    312                 if items[-1].startswith(' '*markdown.TAB_LENGTH):
    313                     # Previous item was indented. Append to that item.
    314                     items[-1] = '%s\n%s' % (items[-1], line)
    315                 else:
    316                     items.append(line)
    317             else:
    318                 # This is another line of previous item. Append to that item.
    319                 items[-1] = '%s\n%s' % (items[-1], line)
    320         return items
    321 
    322 
    323 class UListProcessor(OListProcessor):
    324     """ Process unordered list blocks. """
    325 
    326     TAG = 'ul'
    327     RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
    328 
    329 
    330 class HashHeaderProcessor(BlockProcessor):
    331     """ Process Hash Headers. """
    332 
    333     # Detect a header at start of any line in block
    334     RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
    335 
    336     def test(self, parent, block):
    337         return bool(self.RE.search(block))
    338 
    339     def run(self, parent, blocks):
    340         block = blocks.pop(0)
    341         m = self.RE.search(block)
    342         if m:
    343             before = block[:m.start()] # All lines before header
    344             after = block[m.end():]    # All lines after header
    345             if before:
    346                 # As the header was not the first line of the block and the
    347                 # lines before the header must be parsed first,
    348                 # recursively parse this lines as a block.
    349                 self.parser.parseBlocks(parent, [before])
    350             # Create header using named groups from RE
    351             h = markdown.etree.SubElement(parent, 'h%d' % len(m.group('level')))
    352             h.text = m.group('header').strip()
    353             if after:
    354                 # Insert remaining lines as first block for future parsing.
    355                 blocks.insert(0, after)
    356         else:
    357             # This should never happen, but just in case...
    358             message(CRITICAL, "We've got a problem header!")
    359 
    360 
    361 class SetextHeaderProcessor(BlockProcessor):
    362     """ Process Setext-style Headers. """
    363 
    364     # Detect Setext-style header. Must be first 2 lines of block.
    365     RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
    366 
    367     def test(self, parent, block):
    368         return bool(self.RE.match(block))
    369 
    370     def run(self, parent, blocks):
    371         lines = blocks.pop(0).split('\n')
    372         # Determine level. ``=`` is 1 and ``-`` is 2.
    373         if lines[1].startswith('='):
    374             level = 1
    375         else:
    376             level = 2
    377         h = markdown.etree.SubElement(parent, 'h%d' % level)
    378         h.text = lines[0].strip()
    379         if len(lines) > 2:
    380             # Block contains additional lines. Add to  master blocks for later.
    381             blocks.insert(0, '\n'.join(lines[2:]))
    382 
    383 
    384 class HRProcessor(BlockProcessor):
    385     """ Process Horizontal Rules. """
    386 
    387     RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
    388     # Detect hr on any line of a block.
    389     SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
    390     # Match a hr on a single line of text.
    391     MATCH_RE = re.compile(r'^%s$' % RE)
    392 
    393     def test(self, parent, block):
    394         return bool(self.SEARCH_RE.search(block))
    395 
    396     def run(self, parent, blocks):
    397         lines = blocks.pop(0).split('\n')
    398         prelines = []
    399         # Check for lines in block before hr.
    400         for line in lines:
    401             m = self.MATCH_RE.match(line)
    402             if m:
    403                 break
    404             else:
    405                 prelines.append(line)
    406         if len(prelines):
    407             # Recursively parse lines before hr so they get parsed first.
    408             self.parser.parseBlocks(parent, ['\n'.join(prelines)])
    409         # create hr
    410         hr = markdown.etree.SubElement(parent, 'hr')
    411         # check for lines in block after hr.
    412         lines = lines[len(prelines)+1:]
    413         if len(lines):
    414             # Add lines after hr to master blocks for later parsing.
    415             blocks.insert(0, '\n'.join(lines))
    416 
    417 
    418 class EmptyBlockProcessor(BlockProcessor):
    419     """ Process blocks and start with an empty line. """
    420 
    421     # Detect a block that only contains whitespace 
    422     # or only whitespace on the first line.
    423     RE = re.compile(r'^\s*\n')
    424 
    425     def test(self, parent, block):
    426         return bool(self.RE.match(block))
    427 
    428     def run(self, parent, blocks):
    429         block = blocks.pop(0)
    430         m = self.RE.match(block)
    431         if m:
    432             # Add remaining line to master blocks for later.
    433             blocks.insert(0, block[m.end():])
    434             sibling = self.lastChild(parent)
    435             if sibling and sibling.tag == 'pre' and sibling[0] and \
    436                     sibling[0].tag == 'code':
    437                 # Last block is a codeblock. Append to preserve whitespace.
    438                 sibling[0].text = markdown.AtomicString('%s/n/n/n' % sibling[0].text )
    439 
    440 
    441 class ParagraphProcessor(BlockProcessor):
    442     """ Process Paragraph blocks. """
    443 
    444     def test(self, parent, block):
    445         return True
    446 
    447     def run(self, parent, blocks):
    448         block = blocks.pop(0)
    449         if block.strip():
    450             # Not a blank block. Add to parent, otherwise throw it away.
    451             if self.parser.state.isstate('list'):
    452                 # The parent is a tight-list. Append to parent.text
    453                 if parent.text:
    454                     parent.text = '%s\n%s' % (parent.text, block)
    455                 else:
    456                     parent.text = block.lstrip()
    457             else:
    458                 # Create a regular paragraph
    459                 p = markdown.etree.SubElement(parent, 'p')
    460                 p.text = block.lstrip()
    461