# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
import svn_constants

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment

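# Illustrative example (not part of the server flow): _SplitAnchor simply
# partitions a URL into (path, fragment), e.g.
#   _SplitAnchor('extensions/tabs.html#method-create')
# returns ('extensions/tabs.html', 'method-create').
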
def _Process(path, renderer):
  '''Renders the page at |path| using |renderer| and processes its contents.
  Returns a |Page| namedtuple with fields for the HTTP status code of the
  page render, the hrefs of all links that occurred on the page, all of the
  anchors on the page (ids and names), and all links that contain an anchor
  component.

  If a non-html page renders successfully, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base, _ = path.rsplit('/', 1)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)

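# A minimal sketch of exercising _Process with a stub renderer. The real
# |renderer| is supplied by the doc server; the Response shape here is a
# hypothetical stand-in carrying just the fields _Process touches.
#   Response = namedtuple('Response', 'status, content')
#   def stub_renderer(path):
#     return Response(200, '<a href="#top">top</a>')
#   _Process('extensions/index.html', stub_renderer)
#     == Page(200, [], set(), ['#top'])
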
class _ContentParser(HTMLParser):
  '''Parses an html file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for hrefs that: start with a space, contain
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])

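# Illustrative usage of _ContentParser (hypothetical input): feeding a
# fragment of html shows which attributes end up where.
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#create">x</a><h2 id="intro">Intro</h2>')
#   parser.links   == ['tabs.html#create']
#   parser.anchors == set(['intro'])
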
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. These include broken links (links
  whose target page 404s or redirects, or whose target anchor doesn't exist)
  and orphaned pages (pages that no other page links to).
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          # Format explicitly; print(a, b, ...) would print a tuple under
          # Python 2.
          print '%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follows redirects until a non-redirecting page is reached, starting
    at |starting_url|.

    Returns a tuple of: the status of rendering |starting_url|, the final
    url, and a list of the pages reached including |starting_url|. If no
    redirection occurred, or more than |limit| redirects were followed,
    returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

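  # Illustrative (hypothetical paths): if 'apps' 301-redirects to
  # 'apps/about_apps.html', which then renders with a 200, then
  #   self._FollowRedirections('apps')
  # returns (301, 'apps/about_apps.html', ['apps', 'apps/about_apps.html']).
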
  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Finds all broken links on a page and creates appropriate notes
    describing why they are broken (broken anchor, target redirects, etc).
    |page| is the current page being checked and is the result of rendering
    |url|. |pages| is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

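  # Each broken-link entry produced above (and returned by GetBrokenLinks
  # below) is a (status, url, link, note) 4-tuple; hypothetical example:
  #   (404, 'extensions/tabs.html', 'extensions/tabz.html',
  #    'target page not found')
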
  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, link, notes.
    '''
    broken_links = []

    # Iterate over a snapshot of the keys; the categorizer's lookups can add
    # default entries to the |self._pages| defaultdict.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages reachable from |root_pages|.
    Returns the pages that are valid on the server but are not part of the
    connected component containing |root_pages|. These pages are orphans
    and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]

def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| as a human-readable string, one link per line,
  grouped by target.
  '''
  def fixed_width(string, width):
    return string.ljust(width)

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress long runs of links sharing a target into a single message.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
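
# Putting it together (a hedged sketch; the file_system and renderer wiring
# is server-specific, and the root page shown is hypothetical):
#   detector = LinkErrorDetector(file_system, renderer,
#                                svn_constants.PUBLIC_TEMPLATE_PATH,
#                                root_pages=('extensions/index.html',))
#   broken = detector.GetBrokenLinks()
#   if broken:
#     print StringifyBrokenLinks(broken)
#   orphans = detector.GetOrphanedPages()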
    297