Home | History | Annotate | Download | only in markdown
      1 """
      2 INLINE PATTERNS
      3 =============================================================================
      4 
      5 Inline patterns such as *emphasis* are handled by means of auxiliary
      6 objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:
      9 
     10     pattern.getCompiledRegExp() # returns a regular expression
     11 
     12     pattern.handleMatch(m) # takes a match object and returns
     13                            # an ElementTree element or just plain text
     14 
     15 All of python markdown's built-in patterns subclass from Pattern,
     16 but you can add additional patterns that don't.
     17 
Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*?)$".
     22 
     23 Finally, the order in which regular expressions are applied is very
     24 important - e.g. if we first replace http://.../ links with <a> tags
     25 and _then_ try to replace inline html, we would end up with a mess.
     26 So, we apply the expressions in the following order:
     27 
     28 * escape and backticks have to go before everything else, so
     29   that we can preempt any markdown patterns by escaping them.
     30 
     31 * then we handle auto-links (must be done before inline html)
     32 
     33 * then we handle inline HTML.  At this point we will simply
     34   replace all inline HTML strings with a placeholder and add
     35   the actual HTML to a hash.
     36 
     37 * then inline images (must be done before links)
     38 
     39 * then bracketed links, first regular then reference-style
     40 
     41 * finally we apply strong and emphasis
     42 """
     43 
     44 import markdown
     45 import re
     46 from urlparse import urlparse, urlunparse
     47 import sys
     48 if sys.version >= "3.0":
     49     from html import entities as htmlentitydefs
     50 else:
     51     import htmlentitydefs
     52 
     53 """
     54 The actual regular expressions for patterns
     55 -----------------------------------------------------------------------------
     56 """
     57 
     58 NOBRACKET = r'[^\]\[]*'
     59 BRK = ( r'\[('
     60         + (NOBRACKET + r'(\[')*6
     61         + (NOBRACKET+ r'\])*')*6
     62         + NOBRACKET + r')\]' )
     63 NOIMG = r'(?<!\!)'
     64 
     65 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
     66 ESCAPE_RE = r'\\(.)'                             # \<
     67 EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
     68 STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
     69 STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
     70 
     71 if markdown.SMART_EMPHASIS:
     72     EMPHASIS_2_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)'        # _emphasis_
     73 else:
     74     EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
     75 
     76 LINK_RE = NOIMG + BRK + \
     77 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)'''
     78 # [text](url) or [text](<url>)
     79 
     80 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
     81 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
     82 REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]'           # [Google][3]
     83 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
     84 NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
     85 AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # <http://www.123.com>
     86 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me (at] example.com>
     87 
     88 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
     89 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
     90 LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
     91 LINE_BREAK_2_RE = r'  $'                    # two spaces at end of text
     92 
     93 
     94 def dequote(string):
     95     """Remove quotes from around a string."""
     96     if ( ( string.startswith('"') and string.endswith('"'))
     97          or (string.startswith("'") and string.endswith("'")) ):
     98         return string[1:-1]
     99     else:
    100         return string
    101 
    102 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
    103 
    104 def handleAttributes(text, parent):
    105     """Set values of an element based on attribute definitions ({@id=123})."""
    106     def attributeCallback(match):
    107         parent.set(match.group(1), match.group(2).replace('\n', ' '))
    108     return ATTR_RE.sub(attributeCallback, text)
    109 
    110 
    111 """
    112 The pattern classes
    113 -----------------------------------------------------------------------------
    114 """
    115 
    116 class Pattern:
    117     """Base class that inline patterns subclass. """
    118 
    119     def __init__ (self, pattern, markdown_instance=None):
    120         """
    121         Create an instant of an inline pattern.
    122 
    123         Keyword arguments:
    124 
    125         * pattern: A regular expression that matches a pattern
    126 
    127         """
    128         self.pattern = pattern
    129         self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)
    130 
    131         # Api for Markdown to pass safe_mode into instance
    132         self.safe_mode = False
    133         if markdown_instance:
    134             self.markdown = markdown_instance
    135 
    136     def getCompiledRegExp (self):
    137         """ Return a compiled regular expression. """
    138         return self.compiled_re
    139 
    140     def handleMatch(self, m):
    141         """Return a ElementTree element from the given match.
    142 
    143         Subclasses should override this method.
    144 
    145         Keyword arguments:
    146 
    147         * m: A re match object containing a match of the pattern.
    148 
    149         """
    150         pass
    151 
    152     def type(self):
    153         """ Return class name, to define pattern type """
    154         return self.__class__.__name__
    155 
    156 BasePattern = Pattern # for backward compatibility
    157 
    158 class SimpleTextPattern (Pattern):
    159     """ Return a simple text of group(2) of a Pattern. """
    160     def handleMatch(self, m):
    161         text = m.group(2)
    162         if text == markdown.INLINE_PLACEHOLDER_PREFIX:
    163             return None
    164         return text
    165 
    166 class SimpleTagPattern (Pattern):
    167     """
    168     Return element of type `tag` with a text attribute of group(3)
    169     of a Pattern.
    170 
    171     """
    172     def __init__ (self, pattern, tag):
    173         Pattern.__init__(self, pattern)
    174         self.tag = tag
    175 
    176     def handleMatch(self, m):
    177         el = markdown.etree.Element(self.tag)
    178         el.text = m.group(3)
    179         return el
    180 
    181 
    182 class SubstituteTagPattern (SimpleTagPattern):
    183     """ Return a eLement of type `tag` with no children. """
    184     def handleMatch (self, m):
    185         return markdown.etree.Element(self.tag)
    186 
    187 
    188 class BacktickPattern (Pattern):
    189     """ Return a `<code>` element containing the matching text. """
    190     def __init__ (self, pattern):
    191         Pattern.__init__(self, pattern)
    192         self.tag = "code"
    193 
    194     def handleMatch(self, m):
    195         el = markdown.etree.Element(self.tag)
    196         el.text = markdown.AtomicString(m.group(3).strip())
    197         return el
    198 
    199 
    200 class DoubleTagPattern (SimpleTagPattern):
    201     """Return a ElementTree element nested in tag2 nested in tag1.
    202 
    203     Useful for strong emphasis etc.
    204 
    205     """
    206     def handleMatch(self, m):
    207         tag1, tag2 = self.tag.split(",")
    208         el1 = markdown.etree.Element(tag1)
    209         el2 = markdown.etree.SubElement(el1, tag2)
    210         el2.text = m.group(3)
    211         return el1
    212 
    213 
    214 class HtmlPattern (Pattern):
    215     """ Store raw inline html and return a placeholder. """
    216     def handleMatch (self, m):
    217         rawhtml = m.group(2)
    218         inline = True
    219         place_holder = self.markdown.htmlStash.store(rawhtml)
    220         return place_holder
    221 
    222 
    223 class LinkPattern (Pattern):
    224     """ Return a link element from the given match. """
    225     def handleMatch(self, m):
    226         el = markdown.etree.Element("a")
    227         el.text = m.group(2)
    228         title = m.group(11)
    229         href = m.group(9)
    230 
    231         if href:
    232             if href[0] == "<":
    233                 href = href[1:-1]
    234             el.set("href", self.sanitize_url(href.strip()))
    235         else:
    236             el.set("href", "")
    237 
    238         if title:
    239             title = dequote(title) #.replace('"', "&quot;")
    240             el.set("title", title)
    241         return el
    242 
    243     def sanitize_url(self, url):
    244         """
    245         Sanitize a url against xss attacks in "safe_mode".
    246 
    247         Rather than specifically blacklisting `javascript:alert("XSS")` and all
    248         its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
    249         safe url formats. Most urls contain a network location, however some
    250         are known not to (i.e.: mailto links). Script urls do not contain a
    251         location. Additionally, for `javascript:...`, the scheme would be
    252         "javascript" but some aliases will appear to `urlparse()` to have no
    253         scheme. On top of that relative links (i.e.: "foo/bar.html") have no
    254         scheme. Therefore we must check "path", "parameters", "query" and
    255         "fragment" for any literal colons. We don't check "scheme" for colons
    256         because it *should* never have any and "netloc" must allow the form:
    257         `username:password@host:port`.
    258 
    259         """
    260         locless_schemes = ['', 'mailto', 'news']
    261         scheme, netloc, path, params, query, fragment = url = urlparse(url)
    262         safe_url = False
    263         if netloc != '' or scheme in locless_schemes:
    264             safe_url = True
    265 
    266         for part in url[2:]:
    267             if ":" in part:
    268                 safe_url = False
    269 
    270         if self.markdown.safeMode and not safe_url:
    271             return ''
    272         else:
    273             return urlunparse(url)
    274 
    275 class ImagePattern(LinkPattern):
    276     """ Return a img element from the given match. """
    277     def handleMatch(self, m):
    278         el = markdown.etree.Element("img")
    279         src_parts = m.group(9).split()
    280         if src_parts:
    281             src = src_parts[0]
    282             if src[0] == "<" and src[-1] == ">":
    283                 src = src[1:-1]
    284             el.set('src', self.sanitize_url(src))
    285         else:
    286             el.set('src', "")
    287         if len(src_parts) > 1:
    288             el.set('title', dequote(" ".join(src_parts[1:])))
    289 
    290         if markdown.ENABLE_ATTRIBUTES:
    291             truealt = handleAttributes(m.group(2), el)
    292         else:
    293             truealt = m.group(2)
    294 
    295         el.set('alt', truealt)
    296         return el
    297 
    298 class ReferencePattern(LinkPattern):
    299     """ Match to a stored reference and return link element. """
    300     def handleMatch(self, m):
    301         if m.group(9):
    302             id = m.group(9).lower()
    303         else:
    304             # if we got something like "[Google][]"
    305             # we'll use "google" as the id
    306             id = m.group(2).lower()
    307 
    308         if not id in self.markdown.references: # ignore undefined refs
    309             return None
    310         href, title = self.markdown.references[id]
    311 
    312         text = m.group(2)
    313         return self.makeTag(href, title, text)
    314 
    315     def makeTag(self, href, title, text):
    316         el = markdown.etree.Element('a')
    317 
    318         el.set('href', self.sanitize_url(href))
    319         if title:
    320             el.set('title', title)
    321 
    322         el.text = text
    323         return el
    324 
    325 
    326 class ImageReferencePattern (ReferencePattern):
    327     """ Match to a stored reference and return img element. """
    328     def makeTag(self, href, title, text):
    329         el = markdown.etree.Element("img")
    330         el.set("src", self.sanitize_url(href))
    331         if title:
    332             el.set("title", title)
    333         el.set("alt", text)
    334         return el
    335 
    336 
    337 class AutolinkPattern (Pattern):
    338     """ Return a link Element given an autolink (`<http://example/com>`). """
    339     def handleMatch(self, m):
    340         el = markdown.etree.Element("a")
    341         el.set('href', m.group(2))
    342         el.text = markdown.AtomicString(m.group(2))
    343         return el
    344 
    345 class AutomailPattern (Pattern):
    346     """
    347     Return a mailto link Element given an automail link (`<foo@example.com>`).
    348     """
    349     def handleMatch(self, m):
    350         el = markdown.etree.Element('a')
    351         email = m.group(2)
    352         if email.startswith("mailto:"):
    353             email = email[len("mailto:"):]
    354 
    355         def codepoint2name(code):
    356             """Return entity definition by code, or the code if not defined."""
    357             entity = htmlentitydefs.codepoint2name.get(code)
    358             if entity:
    359                 return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity)
    360             else:
    361                 return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)
    362 
    363         letters = [codepoint2name(ord(letter)) for letter in email]
    364         el.text = markdown.AtomicString(''.join(letters))
    365 
    366         mailto = "mailto:" + email
    367         mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' %
    368                           ord(letter) for letter in mailto])
    369         el.set('href', mailto)
    370         return el
    371 
    372