Home | History | Annotate | Download | only in markdown
      1 
      2 """
      3 PRE-PROCESSORS
      4 =============================================================================
      5 
      6 Preprocessors work on source text before we start doing anything too
      7 complicated. 
      8 """
      9 
     10 import re
     11 import markdown
     12 
# Common prefix of every stashed-HTML placeholder: the markdown.STX marker
# followed by the literal tag "wzxhzdk:".
HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
# %-format template for a complete placeholder: prefix, an integer slot
# ("%d", filled with the stash counter by HtmlStash.store) and markdown.ETX.
HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
     15 
     16 class Processor:
     17     def __init__(self, markdown_instance=None):
     18         if markdown_instance:
     19             self.markdown = markdown_instance
     20 
     21 class Preprocessor (Processor):
     22     """
     23     Preprocessors are run after the text is broken into lines.
     24 
     25     Each preprocessor implements a "run" method that takes a pointer to a
     26     list of lines of the document, modifies it as necessary and returns
     27     either the same pointer or a pointer to a new list.
     28 
     29     Preprocessors must extend markdown.Preprocessor.
     30 
     31     """
     32     def run(self, lines):
     33         """
     34         Each subclass of Preprocessor should override the `run` method, which
     35         takes the document as a list of strings split by newlines and returns
     36         the (possibly modified) list of lines.
     37 
     38         """
     39         pass
     40 
     41 class HtmlStash:
     42     """
     43     This class is used for stashing HTML objects that we extract
     44     in the beginning and replace with place-holders.
     45     """
     46 
     47     def __init__ (self):
     48         """ Create a HtmlStash. """
     49         self.html_counter = 0 # for counting inline html segments
     50         self.rawHtmlBlocks=[]
     51 
     52     def store(self, html, safe=False):
     53         """
     54         Saves an HTML segment for later reinsertion.  Returns a
     55         placeholder string that needs to be inserted into the
     56         document.
     57 
     58         Keyword arguments:
     59 
     60         * html: an html segment
     61         * safe: label an html segment as safe for safemode
     62 
     63         Returns : a placeholder string
     64 
     65         """
     66         self.rawHtmlBlocks.append((html, safe))
     67         placeholder = HTML_PLACEHOLDER % self.html_counter
     68         self.html_counter += 1
     69         return placeholder
     70 
     71     def reset(self):
     72         self.html_counter = 0
     73         self.rawHtmlBlocks = []
     74 
     75 
     76 class HtmlBlockPreprocessor(Preprocessor):
     77     """Remove html blocks from the text and store them for later retrieval."""
     78 
     79     right_tag_patterns = ["</%s>", "%s>"]
     80 
     81     def _get_left_tag(self, block):
     82         return block[1:].replace(">", " ", 1).split()[0].lower()
     83 
     84     def _get_right_tag(self, left_tag, block):
     85         for p in self.right_tag_patterns:
     86             tag = p % left_tag
     87             i = block.rfind(tag)
     88             if i > 2:
     89                 return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
     90         return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
     91 
     92     def _equal_tags(self, left_tag, right_tag):
     93         if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
     94             return True
     95         if ("/" + left_tag) == right_tag:
     96             return True
     97         if (right_tag == "--" and left_tag == "--"):
     98             return True
     99         elif left_tag == right_tag[1:] \
    100             and right_tag[0] != "<":
    101             return True
    102         else:
    103             return False
    104 
    105     def _is_oneliner(self, tag):
    106         return (tag in ['hr', 'hr/'])
    107 
    108     def run(self, lines):
    109         text = "\n".join(lines)
    110         new_blocks = []
    111         text = text.split("\n\n")
    112         items = []
    113         left_tag = ''
    114         right_tag = ''
    115         in_tag = False # flag
    116 
    117         while text:
    118             block = text[0]
    119             if block.startswith("\n"):
    120                 block = block[1:]
    121             text = text[1:]
    122 
    123             if block.startswith("\n"):
    124                 block = block[1:]
    125 
    126             if not in_tag:
    127                 if block.startswith("<"):
    128                     left_tag = self._get_left_tag(block)
    129                     right_tag, data_index = self._get_right_tag(left_tag, block)
    130 
    131                     if block[1] == "!":
    132                         # is a comment block
    133                         left_tag = "--"
    134                         right_tag, data_index = self._get_right_tag(left_tag, block)
    135                         # keep checking conditions below and maybe just append
    136                     
    137                     if data_index < len(block) \
    138                         and markdown.isBlockLevel(left_tag): 
    139                         text.insert(0, block[data_index:])
    140                         block = block[:data_index]
    141 
    142                     if not (markdown.isBlockLevel(left_tag) \
    143                         or block[1] in ["!", "?", "@", "%"]):
    144                         new_blocks.append(block)
    145                         continue
    146 
    147                     if self._is_oneliner(left_tag):
    148                         new_blocks.append(block.strip())
    149                         continue
    150 
    151                     if block.rstrip().endswith(">") \
    152                         and self._equal_tags(left_tag, right_tag):
    153                         new_blocks.append(
    154                             self.markdown.htmlStash.store(block.strip()))
    155                         continue
    156                     else: #if not block[1] == "!":
    157                         # if is block level tag and is not complete
    158 
    159                         if markdown.isBlockLevel(left_tag) or left_tag == "--" \
    160                             and not block.rstrip().endswith(">"):
    161                             items.append(block.strip())
    162                             in_tag = True
    163                         else:
    164                             new_blocks.append(
    165                             self.markdown.htmlStash.store(block.strip()))
    166 
    167                         continue
    168 
    169                 new_blocks.append(block)
    170 
    171             else:
    172                 items.append(block.strip())
    173 
    174                 right_tag, data_index = self._get_right_tag(left_tag, block)
    175 
    176                 if self._equal_tags(left_tag, right_tag):
    177                     # if find closing tag
    178                     in_tag = False
    179                     new_blocks.append(
    180                         self.markdown.htmlStash.store('\n\n'.join(items)))
    181                     items = []
    182 
    183         if items:
    184             new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
    185             new_blocks.append('\n')
    186 
    187         new_text = "\n\n".join(new_blocks)
    188         return new_text.split("\n")
    189 
    190 
    191 class ReferencePreprocessor(Preprocessor):
    192     """ Remove reference definitions from text and store for later use. """
    193 
    194     RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
    195 
    196     def run (self, lines):
    197         new_text = [];
    198         for line in lines:
    199             m = self.RE.match(line)
    200             if m:
    201                 id = m.group(2).strip().lower()
    202                 t = m.group(4).strip()  # potential title
    203                 if not t:
    204                     self.markdown.references[id] = (m.group(3), t)
    205                 elif (len(t) >= 2
    206                       and (t[0] == t[-1] == "\""
    207                            or t[0] == t[-1] == "\'"
    208                            or (t[0] == "(" and t[-1] == ")") ) ):
    209                     self.markdown.references[id] = (m.group(3), t[1:-1])
    210                 else:
    211                     new_text.append(line)
    212             else:
    213                 new_text.append(line)
    214 
    215         return new_text #+ "\n"
    216