# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
from path_util import AssertIsDirectory


Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
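
# For example, a successfully rendered page might be represented as
# Page(200, ['extensions/storage.html'], set(['method-get']),
#      ['extensions/tabs.html#events']) (all values hypothetical).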


def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment
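
# For example, _SplitAnchor('extensions/tabs.html#method-get') returns
# ('extensions/tabs.html', 'method-get') and _SplitAnchor('#top') returns
# ('', 'top').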


def _Process(path, renderer):
  '''Render the page at |path| using |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the HTTP status code
  of the page render, the hrefs of all links that occur on the page, all of
  the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-HTML page renders successfully, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
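
# A hedged sketch of calling _Process (|renderer| is any callable matching
# the description above; the path is hypothetical):
#   page = _Process('extensions/tabs.html', renderer)
#   # page.links might be ['extensions/windows.html'], page.anchors a set of
#   # ids/names, and page.anchor_refs links like 'extensions/tabs.html#events'.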


class _ContentParser(HTMLParser):
  '''Parse an HTML file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for hrefs that: start with a space, contain
      # just a '.' (period), contain Python templating code, are an absolute
      # URL, are a zip file, or execute JavaScript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
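
# A minimal sketch of the parser in isolation:
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#create">Tabs</a><h2 id="usage">Usage</h2>')
#   parser.links is then ['tabs.html#create'] and parser.anchors set(['usage']).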


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links whose
  target page 404s or whose target anchor doesn't exist, and pages that no
  other page links to.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full HTML page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes
    for the orphaned page search.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting HTML to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )
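    # A hedged illustration of the mapping (the exact behavior lives in
    # CreateURLsFromPaths): with the ('docs/static/', 'static/') entry, a
    # file at docs/static/css/site.css would presumably be served at the URL
    # static/css/site.css.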

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a URL derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirections until a non-redirecting page is reached. Start at
    |starting_url|, which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final URL,
    and a list of the pages reached, including |starting_url|. If no
    redirection occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached
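
  # A hedged sketch of the return value: if 'extensions/storage.html' (a
  # hypothetical path) 301-redirected to 'extensions/storage', this would
  # return (301, 'extensions/storage',
  #         ['extensions/storage.html', 'extensions/storage']).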

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links
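
  # Each entry of the returned list is a 4-tuple; a hypothetical example:
  #   (404, 'extensions/tabs.html', 'extensions/windows.htm',
  #    'target page not found')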

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url, page in self._pages.items():
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links
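
  # A hedged usage sketch (the arguments here are assumptions, not a
  # documented recipe):
  #   detector = LinkErrorDetector(file_system, renderer,
  #                                'docs/templates/public/',
  #                                ('extensions/index.html',))
  #   print(StringifyBrokenLinks(detector.GetBrokenLinks()))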

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the pages in
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]
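
  # A hedged sketch: GetOrphanedPages() might return e.g.
  # ['extensions/some_unlinked_page.html']; the path is hypothetical.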


def StringifyBrokenLinks(broken_links):
  '''Formats a list of broken links as a human-readable string.
  '''
  # The max() calls below fail on an empty sequence.
  if not broken_links:
    return ''

  def fixed_width(string, width):
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for _, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress runs of many links that share a target into a single message.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
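
# A hedged sketch of StringifyBrokenLinks output (paths hypothetical):
#   extensions/tabs.html -> extensions/windows.htm    target page not found
#   apps/about_apps.html -> apps/about_apps.html#nav  target anchor not found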
    307