# markdown is released under the BSD license
# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the <organization> nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
31 32 33 """ 34 INLINE PATTERNS 35 ============================================================================= 36 37 Inline patterns such as *emphasis* are handled by means of auxiliary 38 objects, one per pattern. Pattern objects must be instances of classes 39 that extend markdown.Pattern. Each pattern object uses a single regular 40 expression and needs support the following methods: 41 42 pattern.getCompiledRegExp() # returns a regular expression 43 44 pattern.handleMatch(m) # takes a match object and returns 45 # an ElementTree element or just plain text 46 47 All of python markdown's built-in patterns subclass from Pattern, 48 but you can add additional patterns that don't. 49 50 Also note that all the regular expressions used by inline must 51 capture the whole block. For this reason, they all start with 52 '^(.*)' and end with '(.*)!'. In case with built-in expression 53 Pattern takes care of adding the "^(.*)" and "(.*)!". 54 55 Finally, the order in which regular expressions are applied is very 56 important - e.g. if we first replace http://.../ links with <a> tags 57 and _then_ try to replace inline html, we would end up with a mess. 58 So, we apply the expressions in the following order: 59 60 * escape and backticks have to go before everything else, so 61 that we can preempt any markdown patterns by escaping them. 62 63 * then we handle auto-links (must be done before inline html) 64 65 * then we handle inline HTML. At this point we will simply 66 replace all inline HTML strings with a placeholder and add 67 the actual HTML to a hash. 68 69 * then inline images (must be done before links) 70 71 * then bracketed links, first regular then reference-style 72 73 * finally we apply strong and emphasis 74 """ 75 76 from __future__ import absolute_import 77 from __future__ import unicode_literals 78 from . import util 79 from . 
import odict 80 import re 81 try: 82 from urllib.parse import urlparse, urlunparse 83 except ImportError: 84 from urlparse import urlparse, urlunparse 85 try: 86 from html import entities 87 except ImportError: 88 import htmlentitydefs as entities 89 90 91 def build_inlinepatterns(md_instance, **kwargs): 92 """ Build the default set of inline patterns for Markdown. """ 93 inlinePatterns = odict.OrderedDict() 94 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE) 95 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance) 96 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance) 97 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance) 98 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance) 99 inlinePatterns["image_reference"] = \ 100 ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance) 101 inlinePatterns["short_reference"] = \ 102 ReferencePattern(SHORT_REF_RE, md_instance) 103 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance) 104 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance) 105 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br') 106 if md_instance.safeMode != 'escape': 107 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance) 108 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance) 109 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE) 110 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em') 111 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong') 112 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em') 113 if md_instance.smart_emphasis: 114 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em') 115 else: 116 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em') 117 return inlinePatterns 118 119 """ 120 The actual regular expressions for patterns 121 ----------------------------------------------------------------------------- 122 """ 123 
# Building blocks shared by the link/image expressions below.
NOBRACKET = r'[^\]\[]*'
# Bracketed text allowing up to six levels of nested [] pairs.
BRK = (r'\[('
       + (NOBRACKET + r'(\[')*6
       + (NOBRACKET + r'\])*')*6
       + NOBRACKET + r')\]')
# Negative lookbehind: not preceded by "!" (i.e. not an image).
NOIMG = r'(?<!\!)'

# NOTE: Pattern wraps each expression as "^(.*?)<expr>(.*?)$", so group
# numbers inside these expressions start at 2 (hence the "\2" backrefs).
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'  # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                  # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'               # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                    # _emphasis_
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'           # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                    # [Google]
# Raw string: "\s" and "\[" are regex escapes, not string escapes.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'     # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                     # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'  # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'                      # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'           # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'                         # &amp;
LINE_BREAK_RE = r'  \n'                          # two spaces at end of line


def dequote(string):
    """Remove quotes from around a string.

    A matching pair of double quotes or of single quotes at both ends is
    stripped; any other string is returned unchanged.

    """
    if ((string.startswith('"') and string.endswith('"'))
            or (string.startswith("'") and string.endswith("'"))):
        return string[1:-1]
    else:
        return string

# Raw string: "\{" and "\}" are regex escapes, not string escapes.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")  # {@id=123}


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: The text to scan for `{@name=value}` definitions.
    * parent: The element that receives each attribute via `parent.set()`.

    Returns `text` with every attribute definition removed.

    """
    def attributeCallback(match):
        # Newlines inside the value are folded into spaces.
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Replace the definition with nothing, explicitly.
        return ''
    return ATTR_RE.sub(attributeCallback, text)


"""
The pattern classes
-----------------------------------------------------------------------------
"""


class Pattern(object):
    """Base class that inline patterns subclass."""

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional Markdown instance; when given it is
          stored as `self.markdown`.

        """
        self.pattern = pattern
        # Wrap the expression so that it captures the whole block: group 1
        # is the text before the match and the last group the text after.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Placeholders are replaced by the stashed string, or by the text
        content of the stashed element. `text` is returned unchanged when
        the pattern has no Markdown instance or no 'inline' treeprocessor
        is registered.

        """
        # Patterns built without a markdown instance have no stash.
        md = getattr(self, 'markdown', None)
        if md is None:
            return text
        try:
            stash = md.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def itertext(el):
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if not isinstance(tag, util.string_type) and tag is not None:
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail

        def get_stash(m):
            stash_id = m.group(1)
            if stash_id in stash:
                value = stash.get(stash_id)
                if isinstance(value, util.string_type):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(itertext(value))
            # NOTE(review): unknown ids fall through and return None, as
            # before - confirm that dropping the placeholder is intended.
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class SimpleTextPattern(Pattern):
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        text = m.group(2)
        if text == util.INLINE_PLACEHOLDER_PREFIX:
            return None
        return text


class EscapePattern(Pattern):
    """ Return an escaped character. """

    def handleMatch(self, m):
        char = m.group(2)
        if char in self.markdown.ESCAPED_CHARS:
            # Encode the character's code point between STX/ETX markers.
            return '%s%s%s' % (util.STX, ord(char), util.ETX)
        else:
            # Not an escapable character: keep the backslash.
            return '\\%s' % char


class SimpleTagPattern(Pattern):
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        el = util.etree.Element(self.tag)
        el.text = m.group(3)
        return el


class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m):
        return util.etree.Element(self.tag)


class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        el = util.etree.Element(self.tag)
        el.text = util.AtomicString(m.group(3).strip())
        return el


class DoubleTagPattern(SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc. `tag` is given as "tag1,tag2".

    """
    def handleMatch(self, m):
        tag1, tag2 = self.tag.split(",")
        el1 = util.etree.Element(tag1)
        el2 = util.etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        return el1


class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m):
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Unlike `Pattern.unescape`, stashed values are passed through the
        Markdown instance's serializer. `text` is returned unchanged when
        the pattern has no Markdown instance or no 'inline' treeprocessor
        is registered.

        """
        # Patterns built without a markdown instance have no stash.
        md = getattr(self, 'markdown', None)
        if md is None:
            return text
        try:
            stash = md.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            stash_id = m.group(1)
            value = stash.get(stash_id)
            if value is not None:
                try:
                    return md.serializer(value)
                except Exception:
                    # Value could not be serialized: emit it as text,
                    # prefixed with a backslash.
                    return r'\%s' % value
            # NOTE(review): unknown ids fall through and return None, as
            # before - confirm that dropping the placeholder is intended.

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Strip the angle brackets of the [text](<url>) form.
            if href[0] == "<":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url (with spaces replaced by "%20"), or an empty string
        when safe mode is on and the url is rejected.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parsed[0], parsed[1]

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:
            # This should not happen. Treat as suspect.
            return ''

        for part in parsed[2:]:
            if ":" in part:
                # A colon in "path", "parameters", "query" or "fragment"
                # is suspect.
                return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)


class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """
    def handleMatch(self, m):
        el = util.etree.Element("img")
        # First whitespace-separated token is the source; any remaining
        # tokens form the title.
        src_parts = m.group(9).split()
        if src_parts:
            src = src_parts[0]
            if src[0] == "<" and src[-1] == ">":
                src = src[1:-1]
            el.set('src', self.sanitize_url(self.unescape(src)))
        else:
            el.set('src', "")
        if len(src_parts) > 1:
            el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

        if self.markdown.enable_attributes:
            truealt = handleAttributes(m.group(2), el)
        else:
            truealt = m.group(2)

        el.set('alt', self.unescape(truealt))
        return el


class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            # The expression has no id group (short reference form).
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Google]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.markdown.references:  # ignore undefined refs
            return None
        href, title = self.markdown.references[ref_id]

        text = m.group(2)
        return self.makeTag(href, title, text)

    def makeTag(self, href, title, text):
        """ Return an `a` element for the resolved reference. """
        el = util.etree.Element('a')

        el.set('href', self.sanitize_url(href))
        if title:
            el.set('title', title)

        el.text = text
        return el


class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """ Return an `img` element for the resolved reference. """
        el = util.etree.Element("img")
        el.set("src", self.sanitize_url(href))
        if title:
            el.set("title", title)

        if self.markdown.enable_attributes:
            text = handleAttributes(text, el)

        el.set("alt", self.unescape(text))
        return el


class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        el = util.etree.Element("a")
        el.set('href', self.unescape(m.group(2)))
        el.text = util.AtomicString(m.group(2))
        return el


class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        el = util.etree.Element('a')
        email = self.unescape(m.group(2))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        # Link text: every character written as an entity reference.
        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        # href: every character written as a numeric character reference.
        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el
