1 # markdown is released under the BSD license 2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later) 3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 4 # Copyright 2004 Manfred Stienstra (the original version) 5 # 6 # All rights reserved. 7 # 8 # Redistribution and use in source and binary forms, with or without 9 # modification, are permitted provided that the following conditions are met: 10 # 11 # * Redistributions of source code must retain the above copyright 12 # notice, this list of conditions and the following disclaimer. 13 # * Redistributions in binary form must reproduce the above copyright 14 # notice, this list of conditions and the following disclaimer in the 15 # documentation and/or other materials provided with the distribution. 16 # * Neither the name of the <organization> nor the 17 # names of its contributors may be used to endorse or promote products 18 # derived from this software without specific prior written permission. 19 # 20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY 21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT 24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 # POSSIBILITY OF SUCH DAMAGE. 31 32 33 """ 34 PRE-PROCESSORS 35 ============================================================================= 36 37 Preprocessors work on source text before we start doing anything too 38 complicated. 39 """ 40 41 from __future__ import absolute_import 42 from __future__ import unicode_literals 43 from . import util 44 from . import odict 45 import re 46 47 48 def build_preprocessors(md_instance, **kwargs): 49 """ Build the default set of preprocessors used by Markdown. """ 50 preprocessors = odict.OrderedDict() 51 preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance) 52 if md_instance.safeMode != 'escape': 53 preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance) 54 preprocessors["reference"] = ReferencePreprocessor(md_instance) 55 return preprocessors 56 57 58 class Preprocessor(util.Processor): 59 """ 60 Preprocessors are run after the text is broken into lines. 61 62 Each preprocessor implements a "run" method that takes a pointer to a 63 list of lines of the document, modifies it as necessary and returns 64 either the same pointer or a pointer to a new list. 65 66 Preprocessors must extend markdown.Preprocessor. 67 68 """ 69 def run(self, lines): 70 """ 71 Each subclass of Preprocessor should override the `run` method, which 72 takes the document as a list of strings split by newlines and returns 73 the (possibly modified) list of lines. 74 75 """ 76 pass 77 78 79 class NormalizeWhitespace(Preprocessor): 80 """ Normalize whitespace for consistant parsing. """ 81 82 def run(self, lines): 83 source = '\n'.join(lines) 84 source = source.replace(util.STX, "").replace(util.ETX, "") 85 source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" 86 source = source.expandtabs(self.markdown.tab_length) 87 source = re.sub(r'(?<=\n) +\n', '\n', source) 88 return source.split('\n') 89 90 91 class HtmlBlockPreprocessor(Preprocessor): 92 """Remove html blocks from the text and store them for later retrieval.""" 93 94 right_tag_patterns = ["</%s>", "%s>"] 95 attrs_pattern = r""" 96 \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value" 97 | # OR 98 \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value 99 | # OR 100 \s+(?P<attr2>[^>"'/= ]+) # attr 101 """ 102 left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern 103 attrs_re = re.compile(attrs_pattern, re.VERBOSE) 104 left_tag_re = re.compile(left_tag_pattern, re.VERBOSE) 105 markdown_in_raw = False 106 107 def _get_left_tag(self, block): 108 m = self.left_tag_re.match(block) 109 if m: 110 tag = m.group('tag') 111 raw_attrs = m.group('attrs') 112 attrs = {} 113 if raw_attrs: 114 for ma in self.attrs_re.finditer(raw_attrs): 115 if ma.group('attr'): 116 if ma.group('value'): 117 attrs[ma.group('attr').strip()] = ma.group('value') 118 else: 119 attrs[ma.group('attr').strip()] = "" 120 elif ma.group('attr1'): 121 if ma.group('value1'): 122 attrs[ma.group('attr1').strip()] = ma.group('value1') 123 else: 124 attrs[ma.group('attr1').strip()] = "" 125 elif ma.group('attr2'): 126 attrs[ma.group('attr2').strip()] = "" 127 return tag, len(m.group(0)), attrs 128 else: 129 tag = block[1:].split(">", 1)[0].lower() 130 return tag, len(tag)+2, {} 131 132 def _recursive_tagfind(self, ltag, rtag, start_index, block): 133 while 1: 134 i = block.find(rtag, start_index) 135 if i == -1: 136 return -1 137 j = block.find(ltag, start_index) 138 # if no ltag, or rtag found before another ltag, return index 139 if (j > i or j == -1): 140 return i + len(rtag) 141 # another ltag found before rtag, use end of ltag as starting 142 # point and search again 143 j = block.find('>', j) 144 start_index = self._recursive_tagfind(ltag, rtag, j + 1, block) 145 if start_index == -1: 146 # HTML potentially malformed- ltag has no corresponding 147 # rtag 148 return -1 149 150 def _get_right_tag(self, left_tag, left_index, block): 151 for p in self.right_tag_patterns: 152 tag = p % left_tag 153 i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block) 154 if i > 2: 155 return tag.lstrip("<").rstrip(">"), i 156 return block.rstrip()[-left_index:-1].lower(), len(block) 157 158 def _equal_tags(self, left_tag, right_tag): 159 if left_tag[0] in ['?', '@', '%']: # handle PHP, etc. 160 return True 161 if ("/" + left_tag) == right_tag: 162 return True 163 if (right_tag == "--" and left_tag == "--"): 164 return True 165 elif left_tag == right_tag[1:] \ 166 and right_tag[0] == "/": 167 return True 168 else: 169 return False 170 171 def _is_oneliner(self, tag): 172 return (tag in ['hr', 'hr/']) 173 174 def run(self, lines): 175 text = "\n".join(lines) 176 new_blocks = [] 177 text = text.rsplit("\n\n") 178 items = [] 179 left_tag = '' 180 right_tag = '' 181 in_tag = False # flag 182 183 while text: 184 block = text[0] 185 if block.startswith("\n"): 186 block = block[1:] 187 text = text[1:] 188 189 if block.startswith("\n"): 190 block = block[1:] 191 192 if not in_tag: 193 if block.startswith("<") and len(block.strip()) > 1: 194 195 if block[1] == "!": 196 # is a comment block 197 left_tag, left_index, attrs = "--", 2, {} 198 else: 199 left_tag, left_index, attrs = self._get_left_tag(block) 200 right_tag, data_index = self._get_right_tag(left_tag, 201 left_index, 202 block) 203 # keep checking conditions below and maybe just append 204 205 if data_index < len(block) \ 206 and (util.isBlockLevel(left_tag) 207 or left_tag == '--'): 208 text.insert(0, block[data_index:]) 209 block = block[:data_index] 210 211 if not (util.isBlockLevel(left_tag) \ 212 or block[1] in ["!", "?", "@", "%"]): 213 new_blocks.append(block) 214 continue 215 216 if self._is_oneliner(left_tag): 217 new_blocks.append(block.strip()) 218 continue 219 220 if block.rstrip().endswith(">") \ 221 and self._equal_tags(left_tag, right_tag): 222 if self.markdown_in_raw and 'markdown' in attrs.keys(): 223 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 224 '', block[:left_index]) 225 end = block[-len(right_tag)-2:] 226 block = block[left_index:-len(right_tag)-2] 227 new_blocks.append( 228 self.markdown.htmlStash.store(start)) 229 new_blocks.append(block) 230 new_blocks.append( 231 self.markdown.htmlStash.store(end)) 232 else: 233 new_blocks.append( 234 self.markdown.htmlStash.store(block.strip())) 235 continue 236 else: 237 # if is block level tag and is not complete 238 239 if util.isBlockLevel(left_tag) or left_tag == "--" \ 240 and not block.rstrip().endswith(">"): 241 items.append(block.strip()) 242 in_tag = True 243 else: 244 new_blocks.append( 245 self.markdown.htmlStash.store(block.strip())) 246 247 continue 248 249 new_blocks.append(block) 250 251 else: 252 items.append(block) 253 254 right_tag, data_index = self._get_right_tag(left_tag, 0, block) 255 256 if self._equal_tags(left_tag, right_tag): 257 # if find closing tag 258 259 if data_index < len(block): 260 # we have more text after right_tag 261 items[-1] = block[:data_index] 262 text.insert(0, block[data_index:]) 263 264 in_tag = False 265 if self.markdown_in_raw and 'markdown' in attrs.keys(): 266 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 267 '', items[0][:left_index]) 268 items[0] = items[0][left_index:] 269 end = items[-1][-len(right_tag)-2:] 270 items[-1] = items[-1][:-len(right_tag)-2] 271 new_blocks.append( 272 self.markdown.htmlStash.store(start)) 273 new_blocks.extend(items) 274 new_blocks.append( 275 self.markdown.htmlStash.store(end)) 276 else: 277 new_blocks.append( 278 self.markdown.htmlStash.store('\n\n'.join(items))) 279 items = [] 280 281 if items: 282 if self.markdown_in_raw and 'markdown' in attrs.keys(): 283 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 284 '', items[0][:left_index]) 285 items[0] = items[0][left_index:] 286 end = items[-1][-len(right_tag)-2:] 287 items[-1] = items[-1][:-len(right_tag)-2] 288 new_blocks.append( 289 self.markdown.htmlStash.store(start)) 290 new_blocks.extend(items) 291 if end.strip(): 292 new_blocks.append( 293 self.markdown.htmlStash.store(end)) 294 else: 295 new_blocks.append( 296 self.markdown.htmlStash.store('\n\n'.join(items))) 297 #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) 298 new_blocks.append('\n') 299 300 new_text = "\n\n".join(new_blocks) 301 return new_text.split("\n") 302 303 304 class ReferencePreprocessor(Preprocessor): 305 """ Remove reference definitions from text and store for later use. """ 306 307 TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*' 308 RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL) 309 TITLE_RE = re.compile(r'^%s$' % TITLE) 310 311 def run (self, lines): 312 new_text = []; 313 while lines: 314 line = lines.pop(0) 315 m = self.RE.match(line) 316 if m: 317 id = m.group(1).strip().lower() 318 link = m.group(2).lstrip('<').rstrip('>') 319 t = m.group(5) or m.group(6) or m.group(7) 320 if not t: 321 # Check next line for title 322 tm = self.TITLE_RE.match(lines[0]) 323 if tm: 324 lines.pop(0) 325 t = tm.group(2) or tm.group(3) or tm.group(4) 326 self.markdown.references[id] = (link, t) 327 else: 328 new_text.append(line) 329 330 return new_text #+ "\n" 331