Home | History | Annotate | Download | only in markdown
      1 # markdown is released under the BSD license
      2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
      3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
      4 # Copyright 2004 Manfred Stienstra (the original version)
      5 # 
      6 # All rights reserved.
      7 # 
      8 # Redistribution and use in source and binary forms, with or without
      9 # modification, are permitted provided that the following conditions are met:
     10 # 
     11 # *   Redistributions of source code must retain the above copyright
     12 #     notice, this list of conditions and the following disclaimer.
     13 # *   Redistributions in binary form must reproduce the above copyright
     14 #     notice, this list of conditions and the following disclaimer in the
     15 #     documentation and/or other materials provided with the distribution.
     16 # *   Neither the name of the <organization> nor the
     17 #     names of its contributors may be used to endorse or promote products
     18 #     derived from this software without specific prior written permission.
     19 # 
     20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
     21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
     24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30 # POSSIBILITY OF SUCH DAMAGE.
     31 
     32 
     33 """
     34 INLINE PATTERNS
     35 =============================================================================
     36 
     37 Inline patterns such as *emphasis* are handled by means of auxiliary
     38 objects, one per pattern.  Pattern objects must be instances of classes
     39 that extend markdown.Pattern.  Each pattern object uses a single regular
     40 expression and needs to support the following methods:
     41 
     42     pattern.getCompiledRegExp() # returns a regular expression
     43 
     44     pattern.handleMatch(m) # takes a match object and returns
     45                            # an ElementTree element or just plain text
     46 
     47 All of python markdown's built-in patterns subclass from Pattern,
     48 but you can add additional patterns that don't.
     49 
     50 Also note that all the regular expressions used by inline must
     51 capture the whole block.  For this reason, they all start with
     52 '^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
     53 Pattern takes care of adding the "^(.*?)" and "(.*?)$".
     54 
     55 Finally, the order in which regular expressions are applied is very
     56 important - e.g. if we first replace http://.../ links with <a> tags
     57 and _then_ try to replace inline html, we would end up with a mess.
     58 So, we apply the expressions in the following order:
     59 
     60 * escape and backticks have to go before everything else, so
     61   that we can preempt any markdown patterns by escaping them.
     62 
     63 * then we handle auto-links (must be done before inline html)
     64 
     65 * then we handle inline HTML.  At this point we will simply
     66   replace all inline HTML strings with a placeholder and add
     67   the actual HTML to a hash.
     68 
     69 * then inline images (must be done before links)
     70 
     71 * then bracketed links, first regular then reference-style
     72 
     73 * finally we apply strong and emphasis
     74 """
     75 
     76 from __future__ import absolute_import
     77 from __future__ import unicode_literals
     78 from . import util
     79 from . import odict
     80 import re
     81 try:
     82     from urllib.parse import urlparse, urlunparse
     83 except ImportError:
     84     from urlparse import urlparse, urlunparse
     85 try:
     86     from html import entities
     87 except ImportError:
     88     import htmlentitydefs as entities
     89 
     90 
     91 def build_inlinepatterns(md_instance, **kwargs):
     92     """ Build the default set of inline patterns for Markdown. """
     93     inlinePatterns = odict.OrderedDict()
     94     inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
     95     inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
     96     inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
     97     inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
     98     inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
     99     inlinePatterns["image_reference"] = \
    100             ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)
    101     inlinePatterns["short_reference"] = \
    102             ReferencePattern(SHORT_REF_RE, md_instance)
    103     inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
    104     inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
    105     inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
    106     if md_instance.safeMode != 'escape':
    107         inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
    108     inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
    109     inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
    110     inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
    111     inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
    112     inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
    113     if md_instance.smart_emphasis:
    114         inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
    115     else:
    116         inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
    117     return inlinePatterns
    118 
    119 """
    120 The actual regular expressions for patterns
    121 -----------------------------------------------------------------------------
    122 """
    123 
    124 NOBRACKET = r'[^\]\[]*'
    125 BRK = ( r'\[('
    126         + (NOBRACKET + r'(\[')*6
    127         + (NOBRACKET+ r'\])*')*6
    128         + NOBRACKET + r')\]' )
    129 NOIMG = r'(?<!\!)'
    130 
    131 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
    132 ESCAPE_RE = r'\\(.)'                             # \<
    133 EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
    134 STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
    135 STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
    136 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
    137 EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
    138 LINK_RE = NOIMG + BRK + \
    139 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
    140 # [text](url) or [text](<url>) or [text](url "title")
    141 
    142 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
    143 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
    144 REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
    145 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
    146 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
    147 NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
    148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
    149 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me (at] example.com>
    150 
    151 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
    152 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
    153 LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
    154 
    155 
    156 def dequote(string):
    157     """Remove quotes from around a string."""
    158     if ( ( string.startswith('"') and string.endswith('"'))
    159          or (string.startswith("'") and string.endswith("'")) ):
    160         return string[1:-1]
    161     else:
    162         return string
    163 
    164 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
    165 
    166 def handleAttributes(text, parent):
    167     """Set values of an element based on attribute definitions ({@id=123})."""
    168     def attributeCallback(match):
    169         parent.set(match.group(1), match.group(2).replace('\n', ' '))
    170     return ATTR_RE.sub(attributeCallback, text)
    171 
    172 
    173 """
    174 The pattern classes
    175 -----------------------------------------------------------------------------
    176 """
    177 
    178 class Pattern(object):
    179     """Base class that inline patterns subclass. """
    180 
    181     def __init__(self, pattern, markdown_instance=None):
    182         """
    183         Create an instant of an inline pattern.
    184 
    185         Keyword arguments:
    186 
    187         * pattern: A regular expression that matches a pattern
    188 
    189         """
    190         self.pattern = pattern
    191         self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, 
    192                                       re.DOTALL | re.UNICODE)
    193 
    194         # Api for Markdown to pass safe_mode into instance
    195         self.safe_mode = False
    196         if markdown_instance:
    197             self.markdown = markdown_instance
    198 
    199     def getCompiledRegExp(self):
    200         """ Return a compiled regular expression. """
    201         return self.compiled_re
    202 
    203     def handleMatch(self, m):
    204         """Return a ElementTree element from the given match.
    205 
    206         Subclasses should override this method.
    207 
    208         Keyword arguments:
    209 
    210         * m: A re match object containing a match of the pattern.
    211 
    212         """
    213         pass
    214 
    215     def type(self):
    216         """ Return class name, to define pattern type """
    217         return self.__class__.__name__
    218 
    219     def unescape(self, text):
    220         """ Return unescaped text given text with an inline placeholder. """
    221         try:
    222             stash = self.markdown.treeprocessors['inline'].stashed_nodes
    223         except KeyError:
    224             return text
    225         def itertext(el):
    226             ' Reimplement Element.itertext for older python versions '
    227             tag = el.tag
    228             if not isinstance(tag, util.string_type) and tag is not None:
    229                 return
    230             if el.text:
    231                 yield el.text
    232             for e in el:
    233                 for s in itertext(e):
    234                     yield s
    235                 if e.tail:
    236                     yield e.tail
    237         def get_stash(m):
    238             id = m.group(1)
    239             if id in stash:
    240                 value = stash.get(id)
    241                 if isinstance(value, util.string_type):
    242                     return value
    243                 else:
    244                     # An etree Element - return text content only
    245                     return ''.join(itertext(value)) 
    246         return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
    247 
    248 
    249 class SimpleTextPattern(Pattern):
    250     """ Return a simple text of group(2) of a Pattern. """
    251     def handleMatch(self, m):
    252         text = m.group(2)
    253         if text == util.INLINE_PLACEHOLDER_PREFIX:
    254             return None
    255         return text
    256 
    257 
    258 class EscapePattern(Pattern):
    259     """ Return an escaped character. """
    260 
    261     def handleMatch(self, m):
    262         char = m.group(2)
    263         if char in self.markdown.ESCAPED_CHARS:
    264             return '%s%s%s' % (util.STX, ord(char), util.ETX)
    265         else:
    266             return '\\%s' % char
    267 
    268 
    269 class SimpleTagPattern(Pattern):
    270     """
    271     Return element of type `tag` with a text attribute of group(3)
    272     of a Pattern.
    273 
    274     """
    275     def __init__ (self, pattern, tag):
    276         Pattern.__init__(self, pattern)
    277         self.tag = tag
    278 
    279     def handleMatch(self, m):
    280         el = util.etree.Element(self.tag)
    281         el.text = m.group(3)
    282         return el
    283 
    284 
    285 class SubstituteTagPattern(SimpleTagPattern):
    286     """ Return an element of type `tag` with no children. """
    287     def handleMatch (self, m):
    288         return util.etree.Element(self.tag)
    289 
    290 
    291 class BacktickPattern(Pattern):
    292     """ Return a `<code>` element containing the matching text. """
    293     def __init__ (self, pattern):
    294         Pattern.__init__(self, pattern)
    295         self.tag = "code"
    296 
    297     def handleMatch(self, m):
    298         el = util.etree.Element(self.tag)
    299         el.text = util.AtomicString(m.group(3).strip())
    300         return el
    301 
    302 
    303 class DoubleTagPattern(SimpleTagPattern):
    304     """Return a ElementTree element nested in tag2 nested in tag1.
    305 
    306     Useful for strong emphasis etc.
    307 
    308     """
    309     def handleMatch(self, m):
    310         tag1, tag2 = self.tag.split(",")
    311         el1 = util.etree.Element(tag1)
    312         el2 = util.etree.SubElement(el1, tag2)
    313         el2.text = m.group(3)
    314         return el1
    315 
    316 
    317 class HtmlPattern(Pattern):
    318     """ Store raw inline html and return a placeholder. """
    319     def handleMatch (self, m):
    320         rawhtml = self.unescape(m.group(2))
    321         place_holder = self.markdown.htmlStash.store(rawhtml)
    322         return place_holder
    323 
    324     def unescape(self, text):
    325         """ Return unescaped text given text with an inline placeholder. """
    326         try:
    327             stash = self.markdown.treeprocessors['inline'].stashed_nodes
    328         except KeyError:
    329             return text
    330         def get_stash(m):
    331             id = m.group(1)
    332             value = stash.get(id)
    333             if value is not None:
    334                 try:
    335                     return self.markdown.serializer(value)
    336                 except:
    337                     return '\%s' % value
    338             
    339         return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
    340 
    341 
    342 class LinkPattern(Pattern):
    343     """ Return a link element from the given match. """
    344     def handleMatch(self, m):
    345         el = util.etree.Element("a")
    346         el.text = m.group(2)
    347         title = m.group(13)
    348         href = m.group(9)
    349 
    350         if href:
    351             if href[0] == "<":
    352                 href = href[1:-1]
    353             el.set("href", self.sanitize_url(self.unescape(href.strip())))
    354         else:
    355             el.set("href", "")
    356 
    357         if title:
    358             title = dequote(self.unescape(title)) 
    359             el.set("title", title)
    360         return el
    361 
    362     def sanitize_url(self, url):
    363         """
    364         Sanitize a url against xss attacks in "safe_mode".
    365 
    366         Rather than specifically blacklisting `javascript:alert("XSS")` and all
    367         its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
    368         safe url formats. Most urls contain a network location, however some
    369         are known not to (i.e.: mailto links). Script urls do not contain a
    370         location. Additionally, for `javascript:...`, the scheme would be
    371         "javascript" but some aliases will appear to `urlparse()` to have no
    372         scheme. On top of that relative links (i.e.: "foo/bar.html") have no
    373         scheme. Therefore we must check "path", "parameters", "query" and
    374         "fragment" for any literal colons. We don't check "scheme" for colons
    375         because it *should* never have any and "netloc" must allow the form:
    376         `username:password@host:port`.
    377 
    378         """
    379         url = url.replace(' ', '%20')
    380         if not self.markdown.safeMode:
    381             # Return immediately bipassing parsing.
    382             return url
    383         
    384         try:
    385             scheme, netloc, path, params, query, fragment = url = urlparse(url)
    386         except ValueError:
    387             # Bad url - so bad it couldn't be parsed.
    388             return ''
    389         
    390         locless_schemes = ['', 'mailto', 'news']
    391         allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
    392         if scheme not in allowed_schemes:
    393             # Not a known (allowed) scheme. Not safe.
    394             return ''
    395             
    396         if netloc == '' and scheme not in locless_schemes:
    397             # This should not happen. Treat as suspect.
    398             return ''
    399 
    400         for part in url[2:]:
    401             if ":" in part:
    402                 # A colon in "path", "parameters", "query" or "fragment" is suspect.
    403                 return ''
    404 
    405         # Url passes all tests. Return url as-is.
    406         return urlunparse(url)
    407 
    408 class ImagePattern(LinkPattern):
    409     """ Return a img element from the given match. """
    410     def handleMatch(self, m):
    411         el = util.etree.Element("img")
    412         src_parts = m.group(9).split()
    413         if src_parts:
    414             src = src_parts[0]
    415             if src[0] == "<" and src[-1] == ">":
    416                 src = src[1:-1]
    417             el.set('src', self.sanitize_url(self.unescape(src)))
    418         else:
    419             el.set('src', "")
    420         if len(src_parts) > 1:
    421             el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
    422 
    423         if self.markdown.enable_attributes:
    424             truealt = handleAttributes(m.group(2), el)
    425         else:
    426             truealt = m.group(2)
    427 
    428         el.set('alt', self.unescape(truealt))
    429         return el
    430 
    431 class ReferencePattern(LinkPattern):
    432     """ Match to a stored reference and return link element. """
    433 
    434     NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
    435 
    436     def handleMatch(self, m):
    437         try:
    438             id = m.group(9).lower()
    439         except IndexError:
    440             id = None
    441         if not id:
    442             # if we got something like "[Google][]" or "[Goggle]"
    443             # we'll use "google" as the id
    444             id = m.group(2).lower()
    445 
    446         # Clean up linebreaks in id
    447         id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
    448         if not id in self.markdown.references: # ignore undefined refs
    449             return None
    450         href, title = self.markdown.references[id]
    451 
    452         text = m.group(2)
    453         return self.makeTag(href, title, text)
    454 
    455     def makeTag(self, href, title, text):
    456         el = util.etree.Element('a')
    457 
    458         el.set('href', self.sanitize_url(href))
    459         if title:
    460             el.set('title', title)
    461 
    462         el.text = text
    463         return el
    464 
    465 
    466 class ImageReferencePattern(ReferencePattern):
    467     """ Match to a stored reference and return img element. """
    468     def makeTag(self, href, title, text):
    469         el = util.etree.Element("img")
    470         el.set("src", self.sanitize_url(href))
    471         if title:
    472             el.set("title", title)
    473 
    474         if self.markdown.enable_attributes:
    475             text = handleAttributes(text, el)
    476 
    477         el.set("alt", self.unescape(text))
    478         return el
    479 
    480 
    481 class AutolinkPattern(Pattern):
    482     """ Return a link Element given an autolink (`<http://example/com>`). """
    483     def handleMatch(self, m):
    484         el = util.etree.Element("a")
    485         el.set('href', self.unescape(m.group(2)))
    486         el.text = util.AtomicString(m.group(2))
    487         return el
    488 
    489 class AutomailPattern(Pattern):
    490     """
    491     Return a mailto link Element given an automail link (`<foo@example.com>`).
    492     """
    493     def handleMatch(self, m):
    494         el = util.etree.Element('a')
    495         email = self.unescape(m.group(2))
    496         if email.startswith("mailto:"):
    497             email = email[len("mailto:"):]
    498 
    499         def codepoint2name(code):
    500             """Return entity definition by code, or the code if not defined."""
    501             entity = entities.codepoint2name.get(code)
    502             if entity:
    503                 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
    504             else:
    505                 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
    506 
    507         letters = [codepoint2name(ord(letter)) for letter in email]
    508         el.text = util.AtomicString(''.join(letters))
    509 
    510         mailto = "mailto:" + email
    511         mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
    512                           ord(letter) for letter in mailto])
    513         el.set('href', mailto)
    514         return el
    515 
    516