Home | History | Annotate | Download | only in markdown
      1 # markdown is released under the BSD license
      2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
      3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
      4 # Copyright 2004 Manfred Stienstra (the original version)
      5 # 
      6 # All rights reserved.
      7 # 
      8 # Redistribution and use in source and binary forms, with or without
      9 # modification, are permitted provided that the following conditions are met:
     10 # 
     11 # *   Redistributions of source code must retain the above copyright
     12 #     notice, this list of conditions and the following disclaimer.
     13 # *   Redistributions in binary form must reproduce the above copyright
     14 #     notice, this list of conditions and the following disclaimer in the
     15 #     documentation and/or other materials provided with the distribution.
     16 # *   Neither the name of the <organization> nor the
     17 #     names of its contributors may be used to endorse or promote products
     18 #     derived from this software without specific prior written permission.
     19 # 
     20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
     21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
     24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30 # POSSIBILITY OF SUCH DAMAGE.
     31 
     32 
     33 """
     34 PRE-PROCESSORS
     35 =============================================================================
     36 
     37 Preprocessors work on source text before we start doing anything too
     38 complicated. 
     39 """
     40 
     41 from __future__ import absolute_import
     42 from __future__ import unicode_literals
     43 from . import util
     44 from . import odict
     45 import re
     46 
     47 
     48 def build_preprocessors(md_instance, **kwargs):
     49     """ Build the default set of preprocessors used by Markdown. """
     50     preprocessors = odict.OrderedDict()
     51     preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
     52     if md_instance.safeMode != 'escape':
     53         preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
     54     preprocessors["reference"] = ReferencePreprocessor(md_instance)
     55     return preprocessors
     56 
     57 
     58 class Preprocessor(util.Processor):
     59     """
     60     Preprocessors are run after the text is broken into lines.
     61 
     62     Each preprocessor implements a "run" method that takes a pointer to a
     63     list of lines of the document, modifies it as necessary and returns
     64     either the same pointer or a pointer to a new list.
     65 
     66     Preprocessors must extend markdown.Preprocessor.
     67 
     68     """
     69     def run(self, lines):
     70         """
     71         Each subclass of Preprocessor should override the `run` method, which
     72         takes the document as a list of strings split by newlines and returns
     73         the (possibly modified) list of lines.
     74 
     75         """
     76         pass
     77 
     78 
     79 class NormalizeWhitespace(Preprocessor):
     80     """ Normalize whitespace for consistant parsing. """
     81 
     82     def run(self, lines):
     83         source = '\n'.join(lines)
     84         source = source.replace(util.STX, "").replace(util.ETX, "")
     85         source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
     86         source = source.expandtabs(self.markdown.tab_length)
     87         source = re.sub(r'(?<=\n) +\n', '\n', source)
     88         return source.split('\n')
     89 
     90 
     91 class HtmlBlockPreprocessor(Preprocessor):
     92     """Remove html blocks from the text and store them for later retrieval."""
     93 
     94     right_tag_patterns = ["</%s>", "%s>"]
     95     attrs_pattern = r"""
     96         \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
     97         |                                                         # OR 
     98         \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
     99         |                                                         # OR
    100         \s+(?P<attr2>[^>"'/= ]+)                                  # attr
    101         """
    102     left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    103     attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    104     left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    105     markdown_in_raw = False
    106 
    107     def _get_left_tag(self, block):
    108         m = self.left_tag_re.match(block)
    109         if m:
    110             tag = m.group('tag')
    111             raw_attrs = m.group('attrs')
    112             attrs = {}
    113             if raw_attrs:
    114                 for ma in self.attrs_re.finditer(raw_attrs):
    115                     if ma.group('attr'):
    116                         if ma.group('value'):
    117                             attrs[ma.group('attr').strip()] = ma.group('value')
    118                         else:
    119                             attrs[ma.group('attr').strip()] = ""
    120                     elif ma.group('attr1'):
    121                         if ma.group('value1'):
    122                             attrs[ma.group('attr1').strip()] = ma.group('value1')
    123                         else:
    124                             attrs[ma.group('attr1').strip()] = ""
    125                     elif ma.group('attr2'):
    126                         attrs[ma.group('attr2').strip()] = ""
    127             return tag, len(m.group(0)), attrs
    128         else:
    129             tag = block[1:].split(">", 1)[0].lower()
    130             return tag, len(tag)+2, {}
    131 
    132     def _recursive_tagfind(self, ltag, rtag, start_index, block):
    133         while 1:
    134             i = block.find(rtag, start_index)
    135             if i == -1:
    136                 return -1
    137             j = block.find(ltag, start_index) 
    138             # if no ltag, or rtag found before another ltag, return index
    139             if (j > i or j == -1):
    140                 return i + len(rtag)
    141             # another ltag found before rtag, use end of ltag as starting
    142             # point and search again
    143             j = block.find('>', j)
    144             start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
    145             if start_index == -1:
    146                 # HTML potentially malformed- ltag has no corresponding 
    147                 # rtag
    148                 return -1
    149 
    150     def _get_right_tag(self, left_tag, left_index, block):
    151         for p in self.right_tag_patterns:
    152             tag = p % left_tag
    153             i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
    154             if i > 2:
    155                 return tag.lstrip("<").rstrip(">"), i
    156         return block.rstrip()[-left_index:-1].lower(), len(block)
    157     
    158     def _equal_tags(self, left_tag, right_tag):
    159         if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
    160             return True
    161         if ("/" + left_tag) == right_tag:
    162             return True
    163         if (right_tag == "--" and left_tag == "--"):
    164             return True
    165         elif left_tag == right_tag[1:] \
    166             and right_tag[0] == "/":
    167             return True
    168         else:
    169             return False
    170 
    171     def _is_oneliner(self, tag):
    172         return (tag in ['hr', 'hr/'])
    173 
    174     def run(self, lines):
    175         text = "\n".join(lines)
    176         new_blocks = []
    177         text = text.rsplit("\n\n")
    178         items = []
    179         left_tag = ''
    180         right_tag = ''
    181         in_tag = False # flag
    182 
    183         while text:
    184             block = text[0]
    185             if block.startswith("\n"):
    186                 block = block[1:]
    187             text = text[1:]
    188 
    189             if block.startswith("\n"):
    190                 block = block[1:]
    191 
    192             if not in_tag:
    193                 if block.startswith("<") and len(block.strip()) > 1:
    194 
    195                     if block[1] == "!":
    196                         # is a comment block
    197                         left_tag, left_index, attrs  = "--", 2, {}
    198                     else:
    199                         left_tag, left_index, attrs = self._get_left_tag(block)
    200                     right_tag, data_index = self._get_right_tag(left_tag, 
    201                                                                 left_index,
    202                                                                 block)
    203                     # keep checking conditions below and maybe just append
    204                     
    205                     if data_index < len(block) \
    206                         and (util.isBlockLevel(left_tag)
    207                         or left_tag == '--'): 
    208                         text.insert(0, block[data_index:])
    209                         block = block[:data_index]
    210 
    211                     if not (util.isBlockLevel(left_tag) \
    212                         or block[1] in ["!", "?", "@", "%"]):
    213                         new_blocks.append(block)
    214                         continue
    215 
    216                     if self._is_oneliner(left_tag):
    217                         new_blocks.append(block.strip())
    218                         continue
    219 
    220                     if block.rstrip().endswith(">") \
    221                         and self._equal_tags(left_tag, right_tag):
    222                         if self.markdown_in_raw and 'markdown' in attrs.keys():
    223                             start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
    224                                            '', block[:left_index])
    225                             end = block[-len(right_tag)-2:]
    226                             block = block[left_index:-len(right_tag)-2]
    227                             new_blocks.append(
    228                                 self.markdown.htmlStash.store(start))
    229                             new_blocks.append(block)
    230                             new_blocks.append(
    231                                 self.markdown.htmlStash.store(end))
    232                         else:
    233                             new_blocks.append(
    234                                 self.markdown.htmlStash.store(block.strip()))
    235                         continue
    236                     else: 
    237                         # if is block level tag and is not complete
    238 
    239                         if util.isBlockLevel(left_tag) or left_tag == "--" \
    240                             and not block.rstrip().endswith(">"):
    241                             items.append(block.strip())
    242                             in_tag = True
    243                         else:
    244                             new_blocks.append(
    245                             self.markdown.htmlStash.store(block.strip()))
    246 
    247                         continue
    248 
    249                 new_blocks.append(block)
    250 
    251             else:
    252                 items.append(block)
    253 
    254                 right_tag, data_index = self._get_right_tag(left_tag, 0, block)
    255 
    256                 if self._equal_tags(left_tag, right_tag):
    257                     # if find closing tag
    258                     
    259                     if data_index < len(block):
    260                         # we have more text after right_tag
    261                         items[-1] = block[:data_index]
    262                         text.insert(0, block[data_index:])
    263 
    264                     in_tag = False
    265                     if self.markdown_in_raw and 'markdown' in attrs.keys():
    266                         start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
    267                                        '', items[0][:left_index])
    268                         items[0] = items[0][left_index:]
    269                         end = items[-1][-len(right_tag)-2:]
    270                         items[-1] = items[-1][:-len(right_tag)-2]
    271                         new_blocks.append(
    272                             self.markdown.htmlStash.store(start))
    273                         new_blocks.extend(items)
    274                         new_blocks.append(
    275                             self.markdown.htmlStash.store(end))
    276                     else:
    277                         new_blocks.append(
    278                             self.markdown.htmlStash.store('\n\n'.join(items)))
    279                     items = []
    280 
    281         if items:
    282             if self.markdown_in_raw and 'markdown' in attrs.keys():
    283                 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
    284                                '', items[0][:left_index])
    285                 items[0] = items[0][left_index:]
    286                 end = items[-1][-len(right_tag)-2:]
    287                 items[-1] = items[-1][:-len(right_tag)-2]
    288                 new_blocks.append(
    289                     self.markdown.htmlStash.store(start))
    290                 new_blocks.extend(items)
    291                 if end.strip():
    292                     new_blocks.append(
    293                         self.markdown.htmlStash.store(end))
    294             else:
    295                 new_blocks.append(
    296                     self.markdown.htmlStash.store('\n\n'.join(items)))
    297             #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
    298             new_blocks.append('\n')
    299 
    300         new_text = "\n\n".join(new_blocks)
    301         return new_text.split("\n")
    302 
    303 
    304 class ReferencePreprocessor(Preprocessor):
    305     """ Remove reference definitions from text and store for later use. """
    306 
    307     TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    308     RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    309     TITLE_RE = re.compile(r'^%s$' % TITLE)
    310 
    311     def run (self, lines):
    312         new_text = [];
    313         while lines:
    314             line = lines.pop(0)
    315             m = self.RE.match(line)
    316             if m:
    317                 id = m.group(1).strip().lower()
    318                 link = m.group(2).lstrip('<').rstrip('>')
    319                 t = m.group(5) or m.group(6) or m.group(7)
    320                 if not t:
    321                     # Check next line for title
    322                     tm = self.TITLE_RE.match(lines[0])
    323                     if tm:
    324                         lines.pop(0)
    325                         t = tm.group(2) or tm.group(3) or tm.group(4)
    326                 self.markdown.references[id] = (link, t)
    327             else:
    328                 new_text.append(line)
    329 
    330         return new_text #+ "\n"
    331