Home | History | Annotate | Download | only in js
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Local modifications to this file are described in the README.chromium
      6 // file.
      7 
      8 var dbg = (typeof console !== 'undefined') ? function(s) {
      9     console.log("Readability: " + s);
     10 } : function() {};
     11 
     12 /*
     13  * Readability. An Arc90 Lab Experiment.
     14  * Website: http://lab.arc90.com/experiments/readability
     15  * Source:  http://code.google.com/p/arc90labs-readability
     16  *
     17  * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
     18  *
     19  * Copyright (c) 2010 Arc90 Inc
     20  * Readability is licensed under the Apache License, Version 2.0.
     21 **/
     22 var readability = {
     23     readStyle: "style-newspaper",
     24     readSize: "size-medium",
     25     readMargin: "margin-wide",
     26 
     27     distilledHTML: '',
     28     distilledArticleContent: null,
     29     nextPageLink: '',
     30 
     31     version:                '1.7.1',
     32     iframeLoads:             0,
     33     convertLinksToFootnotes: false,
     34     reversePageScroll:       false, /* If they hold shift and hit space, scroll up */
     35     frameHack:               false, /**
     36                                       * The frame hack is to workaround a firefox bug where if you
     37                                       * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
     38                                       * So we fake a scrollbar in the wrapping div.
     39                                      **/
     40     biggestFrame:            false,
     41     flags:                   0x1 | 0x2 | 0x4,   /* Start with all flags set. */
     42 
     43     /* constants */
     44     FLAG_STRIP_UNLIKELYS:     0x1,
     45     FLAG_WEIGHT_CLASSES:      0x2,
     46     FLAG_CLEAN_CONDITIONALLY: 0x4,
     47 
     48     maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
     49     parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
     50     pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
     51 
     52     /**
     53      * All of the regular expressions in use within readability.
     54      * Defined up here so we don't instantiate them repeatedly in loops.
     55      **/
     56     regexps: {
     57         unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
     58         okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
     59         positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     60         negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
     61         extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
     62         divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
     63         replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
     64         replaceFonts:          /<(\/?)font[^>]*>/gi,
     65         trim:                  /^\s+|\s+$/g,
     66         normalize:             /\s{2,}/g,
     67         killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
     68         videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
     69         skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
     70         nextLink:              /(next|weiter|continue|>([^\|]|$)|([^\|]|$))/i, // Match: next, continue, >, >>,  but not >|, | as those usually mean last.
     71         prevLink:              /(prev|earl|old|new|<|)/i
     72     },
     73 
     74     /**
     75      * Runs readability.
     76      *
     77      * Workflow:
     78      *  1. Prep the document by removing script tags, css, etc.
     79      *  2. Build readability's DOM tree.
     80      *  3. Grab the article content from the current dom tree.
     81      *  4. Replace the current DOM tree with the new one.
     82      *  5. Read peacefully.
     83      *
     84      * @return void
     85      **/
     86     init: function() {
     87         /* Before we do anything, remove all scripts that are not readability. */
     88         window.onload = window.onunload = function() {};
     89 
     90         readability.removeScripts(document);
     91 
     92         /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
     93         readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
     94 
     95         /* Pull out any possible next page link first */
     96         readability.nextPageLink = readability.findNextPageLink(document.body);
     97 
     98         /* We handle processing of nextPage from C++ set nextPageLink to null */
     99         var nextPageLink = null;
    100 
    101         readability.prepDocument();
    102 
    103         /* Build readability's DOM tree */
    104         var overlay        = document.createElement("DIV");
    105         var innerDiv       = document.createElement("DIV");
    106         var articleTools   = readability.getArticleTools();
    107         var articleTitleText   = readability.getArticleTitle();
    108         var articleContent = readability.grabArticle();
    109 
    110         if(!articleContent) {
    111             articleContent    = document.createElement("DIV");
    112             articleContent.id = "readability-content";
    113             articleContent.innerHTML = [
    114                 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
    115                 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""),
    116                 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
    117             ].join('');
    118 
    119             nextPageLink = null;
    120         }
    121 
    122         overlay.id              = "readOverlay";
    123         innerDiv.id             = "readInner";
    124 
    125         /* Apply user-selected styling */
    126         document.body.className = readability.readStyle;
    127         document.dir            = readability.getSuggestedDirection(articleTitleText);
    128 
    129         if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){
    130             overlay.className = readability.readStyle + " rdbTypekit";
    131         } else {
    132             overlay.className = readability.readStyle;
    133         }
    134         innerDiv.className    = readability.readMargin + " " + readability.readSize;
    135 
    136         if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
    137             readability.convertLinksToFootnotes = true;
    138         }
    139 
    140         readability.distilledHTML = articleContent.innerHTML;
    141 
    142         if(readability.frameHack) {
    143             var readOverlay = document.getElementById('readOverlay');
    144             readOverlay.style.height = '100%';
    145             readOverlay.style.overflow = 'auto';
    146         }
    147 
    148         /**
    149          * If someone tries to use Readability on a site's root page, give them a warning about usage.
    150         **/
    151         if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
    152             articleContent.style.display = "none";
    153             var rootWarning = document.createElement('p');
    154                 rootWarning.id = "readability-warning";
    155                 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
    156                 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
    157 
    158             innerDiv.insertBefore( rootWarning, articleContent );
    159         }
    160 
    161         readability.postProcessContent(articleContent);
    162 
    163         window.scrollTo(0, 0);
    164 
    165         if (nextPageLink) {
    166             /**
    167              * Append any additional pages after a small timeout so that people
    168              * can start reading without having to wait for this to finish processing.
    169             **/
    170             window.setTimeout(function() {
    171                 readability.appendNextPage(nextPageLink);
    172             }, 500);
    173         }
    174 
    175         /** Smooth scrolling **/
    176         document.onkeydown = function(e) {
    177             var code = (window.event) ? event.keyCode : e.keyCode;
    178             if (code === 16) {
    179                 readability.reversePageScroll = true;
    180                 return;
    181             }
    182 
    183             if (code === 32) {
    184                 readability.curScrollStep = 0;
    185                 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
    186 
    187                 if(readability.reversePageScroll) {
    188                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
    189                 }
    190                 else {
    191                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
    192                 }
    193 
    194                 return false;
    195             }
    196         };
    197 
    198         document.onkeyup = function(e) {
    199             var code = (window.event) ? event.keyCode : e.keyCode;
    200             if (code === 16) {
    201                 readability.reversePageScroll = false;
    202                 return;
    203             }
    204         };
    205     },
    206 
    207     /**
    208      * Run any post-process modifications to article content as necessary.
    209      *
    210      * @param Element
    211      * @return void
    212     **/
    213     postProcessContent: function(articleContent) {
    214         if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
    215             readability.addFootnotes(articleContent);
    216         }
    217 
    218         readability.fixImageFloats(articleContent);
    219     },
    220 
    221     /**
    222      * Some content ends up looking ugly if the image is too large to be floated.
    223      * If the image is wider than a threshold (currently 55%), no longer float it,
    224      * center it instead.
    225      *
    226      * @param Element
    227      * @return void
    228     **/
    229     fixImageFloats: function (articleContent) {
    230         var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
    231             images              = articleContent.getElementsByTagName('img');
    232 
    233         for(var i=0, il = images.length; i < il; i+=1) {
    234             var image = images[i];
    235 
    236             if(image.offsetWidth > imageWidthThreshold) {
    237                 image.className += " blockImage";
    238             }
    239         }
    240     },
    241 
    242     /**
    243      * Get the article tools Element that has buttons like reload, print.
    244      *
    245      * @return void
    246      **/
    247     getArticleTools: function () {
    248         var articleTools = document.createElement("DIV");
    249 
    250         articleTools.id        = "readTools";
    251         articleTools.innerHTML =
    252             "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
    253             "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
    254             "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
    255 
    256         return articleTools;
    257     },
    258 
    259     /**
    260      * retuns the suggested direction of the string
    261      *
    262      * @return "rtl" || "ltr"
    263      **/
    264     getSuggestedDirection: function(text) {
    265         function sanitizeText() {
    266             return text.replace(/@\w+/, "");
    267         }
    268 
    269         function countMatches(match) {
    270             var matches = text.match(new RegExp(match, "g"));
    271             return matches !== null ? matches.length : 0;
    272         }
    273 
    274         function isRTL() {
    275             var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
    276             var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
    277 
    278             // if 20% of chars are Hebrew or Arbic then direction is rtl
    279             return  (count_heb + count_arb) * 100 / text.length > 20;
    280         }
    281 
    282         text  = sanitizeText(text);
    283         return isRTL() ? "rtl" : "ltr";
    284     },
    285 
    286     /**
    287      * Get the article title as an H1.
    288      *
    289      * @return void
    290      **/
    291     getArticleTitle: function () {
    292         var curTitle = "",
    293             origTitle = "";
    294 
    295         try {
    296             curTitle = origTitle = document.title;
    297             if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
    298                 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
    299             }
    300         }
    301         catch(e) {}
    302 
    303         if(curTitle.match(/ [\|\-] /))
    304         {
    305             curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
    306 
    307             if(curTitle.split(' ').length < 3) {
    308                 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
    309             }
    310         }
    311         else if(curTitle.indexOf(': ') !== -1)
    312         {
    313             curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
    314 
    315             if(curTitle.split(' ').length < 3) {
    316                 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
    317             }
    318         }
    319         else if(curTitle.length > 150 || curTitle.length < 15)
    320         {
    321             var hOnes = document.getElementsByTagName('h1');
    322             if(hOnes.length === 1)
    323             {
    324                 curTitle = readability.getInnerText(hOnes[0]);
    325             }
    326         }
    327 
    328         curTitle = curTitle.replace( readability.regexps.trim, "" );
    329 
    330         if(curTitle.split(' ').length <= 4) {
    331             curTitle = origTitle;
    332         }
    333         return curTitle;
    334     },
    335 
    336     /**
    337      * Prepare the HTML document for readability to scrape it.
    338      * This includes things like stripping javascript, CSS, and handling terrible markup.
    339      *
    340      * @return void
    341      **/
    342     prepDocument: function () {
    343         /**
    344          * In some cases a body element can't be found (if the HTML is totally hosed for example)
    345          * so we create a new body node and append it to the document.
    346          */
    347         if(document.body === null)
    348         {
    349             var body = document.createElement("body");
    350             try {
    351                 document.body = body;
    352             }
    353             catch(e) {
    354                 document.documentElement.appendChild(body);
    355                 dbg(e);
    356             }
    357         }
    358 
    359         document.body.id = "readabilityBody";
    360 
    361         var frames = document.getElementsByTagName('frame');
    362         if(frames.length > 0)
    363         {
    364             var bestFrame = null;
    365             var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
    366             var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
    367             for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
    368             {
    369                 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
    370                 var canAccessFrame = false;
    371                 try {
    372                     var frameBody = frames[frameIndex].contentWindow.document.body;
    373                     canAccessFrame = true;
    374                 }
    375                 catch(eFrames) {
    376                     dbg(eFrames);
    377                 }
    378 
    379                 if(frameSize > biggestFrameSize) {
    380                     biggestFrameSize         = frameSize;
    381                     readability.biggestFrame = frames[frameIndex];
    382                 }
    383 
    384                 if(canAccessFrame && frameSize > bestFrameSize)
    385                 {
    386                     readability.frameHack = true;
    387 
    388                     bestFrame = frames[frameIndex];
    389                     bestFrameSize = frameSize;
    390                 }
    391             }
    392 
    393             if(bestFrame)
    394             {
    395                 var newBody = document.createElement('body');
    396                 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
    397                 newBody.style.overflow = 'scroll';
    398                 document.body = newBody;
    399 
    400                 var frameset = document.getElementsByTagName('frameset')[0];
    401                 if(frameset) {
    402                     frameset.parentNode.removeChild(frameset); }
    403             }
    404         }
    405 
    406         /* Remove all stylesheets */
    407         for (var k=0;k < document.styleSheets.length; k+=1) {
    408             if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
    409                 document.styleSheets[k].disabled = true;
    410             }
    411         }
    412 
    413         /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
    414         var styleTags = document.getElementsByTagName("style");
    415         for (var st=0;st < styleTags.length; st+=1) {
    416             styleTags[st].textContent = "";
    417         }
    418 
    419         /* Turn all double br's into p's */
    420         /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
    421         readability.replaceDoubleBrsWithPs(document.body);
    422         readability.replaceFontsWithSpans(document.body);
    423     },
    424 
    425 
    426     /**
    427      * Prepare the article node for display. Clean out any inline styles,
    428      * iframes, forms, strip extraneous <p> tags, etc.
    429      *
    430      * @param Element
    431      * @return void
    432      **/
    433     prepArticle: function (articleContent) {
    434         readability.cleanStyles(articleContent);
    435         readability.killBreaks(articleContent);
    436 
    437         /* Clean out junk from the article content */
    438         readability.cleanConditionally(articleContent, "form");
    439         readability.clean(articleContent, "object");
    440         readability.clean(articleContent, "h1");
    441 
    442         /**
    443          * If there is only one h2, they are probably using it
    444          * as a header and not a subheader, so remove it since we already have a header.
    445         ***/
    446         if(articleContent.getElementsByTagName('h2').length === 1) {
    447             readability.clean(articleContent, "h2");
    448         }
    449         readability.clean(articleContent, "iframe");
    450 
    451         readability.cleanHeaders(articleContent);
    452 
    453         /* Do these last as the previous stuff may have removed junk that will affect these */
    454         readability.cleanConditionally(articleContent, "table");
    455         readability.cleanConditionally(articleContent, "ul");
    456         readability.cleanConditionally(articleContent, "div");
    457 
    458         /* Remove extra paragraphs */
    459         var articleParagraphs = articleContent.getElementsByTagName('p');
    460         for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
    461             var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
    462             var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
    463             var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
    464 
    465             if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
    466                 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
    467             }
    468         }
    469 
    470         try {
    471             readability.replaceBrsWithPs(articleContent);
    472         }
    473         catch (e) {
    474             dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
    475         }
    476     },
    477 
    478     /**
    479      * Initialize a node with the readability object. Also checks the
    480      * className/id for special names to add to its score.
    481      *
    482      * @param Element
    483      * @return void
    484     **/
    485     initializeNode: function (node) {
    486         node.readability = {"contentScore": 0};
    487 
    488         switch(node.tagName) {
    489             case 'DIV':
    490                 node.readability.contentScore += 5;
    491                 break;
    492 
    493             case 'PRE':
    494             case 'TD':
    495             case 'BLOCKQUOTE':
    496                 node.readability.contentScore += 3;
    497                 break;
    498 
    499             case 'ADDRESS':
    500             case 'OL':
    501             case 'UL':
    502             case 'DL':
    503             case 'DD':
    504             case 'DT':
    505             case 'LI':
    506             case 'FORM':
    507                 node.readability.contentScore -= 3;
    508                 break;
    509 
    510             case 'H1':
    511             case 'H2':
    512             case 'H3':
    513             case 'H4':
    514             case 'H5':
    515             case 'H6':
    516             case 'TH':
    517                 node.readability.contentScore -= 5;
    518                 break;
    519         }
    520 
    521         node.readability.contentScore += readability.getClassWeight(node);
    522     },
    523 
    524     /***
    525      * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    526      *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
    527      *
    528      * @param page a document to run upon. Needs to be a full document, complete with body.
    529      * @return Element
    530     **/
    531     grabArticle: function (pageToClone) {
    532         var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
    533             isPaging = (page !== null) ? true: false;
    534 
    535         var page = null;
    536         // Never work on the actual page.
    537         if (isPaging) {
    538             page = document.body.cloneNode(true);
    539         } else {
    540             page = pageToClone.cloneNode(true);
    541         }
    542 
    543         var allElements = page.getElementsByTagName('*');
    544 
    545         /**
    546          * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
    547          * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
    548          *
    549          * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
    550          * TODO: Shouldn't this be a reverse traversal?
    551         **/
    552         var node = null;
    553         var nodesToScore = [];
    554         for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
    555             /* Remove unlikely candidates */
    556             if (stripUnlikelyCandidates) {
    557                 var unlikelyMatchString = node.className + node.id;
    558                 if (
    559                     (
    560                         unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
    561                         unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
    562                         node.tagName !== "BODY"
    563                     )
    564                 )
    565                 {
    566                     dbg("Removing unlikely candidate - " + unlikelyMatchString);
    567                     node.parentNode.removeChild(node);
    568                     nodeIndex-=1;
    569                     continue;
    570                 }
    571             }
    572 
    573             if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
    574                 nodesToScore[nodesToScore.length] = node;
    575             }
    576 
    577             /* Turn all divs that don't have children block level elements into p's */
    578             if (node.tagName === "DIV") {
    579                 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
    580                     var newNode = document.createElement('p');
    581                     try {
    582                         readability.moveNodeInnards(node, newNode);
    583                         node.parentNode.replaceChild(newNode, node);
    584                         nodeIndex-=1;
    585 
    586                         nodesToScore[nodesToScore.length] = node;
    587                     }
    588                     catch(e) {
    589                         dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
    590                     }
    591                 }
    592                 else
    593                 {
    594                     /* EXPERIMENTAL */
    595                     for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
    596                         var childNode = node.childNodes[i];
    597                         if(childNode.nodeType === 3) { // Node.TEXT_NODE
    598                             var p = document.createElement('p');
    599                             var t = document.createTextNode(childNode.nodeValue);
    600                             p.appendChild(t);
    601                             p.style.display = 'inline';
    602                             p.className = 'readability-styled';
    603                             childNode.parentNode.replaceChild(p, childNode);
    604                         }
    605                     }
    606                 }
    607             }
    608         }
    609 
    610         /**
    611          * Loop through all paragraphs, and assign a score to them based on how content-y they look.
    612          * Then add their score to their parent node.
    613          *
    614          * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
    615         **/
    616         var candidates = [];
    617         for (var pt=0; pt < nodesToScore.length; pt+=1) {
    618             var parentNode      = nodesToScore[pt].parentNode;
    619             var grandParentNode = parentNode ? parentNode.parentNode : null;
    620             var innerText       = readability.getInnerText(nodesToScore[pt]);
    621 
    622             if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
    623                 continue;
    624             }
    625 
    626             /* If this paragraph is less than 25 characters, don't even count it. */
    627             if(innerText.length < 25) {
    628                 continue; }
    629 
    630             /* Initialize readability data for the parent. */
    631             if(typeof parentNode.readability === 'undefined') {
    632                 readability.initializeNode(parentNode);
    633                 candidates.push(parentNode);
    634             }
    635 
    636             /* Initialize readability data for the grandparent. */
    637             if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
    638                 readability.initializeNode(grandParentNode);
    639                 candidates.push(grandParentNode);
    640             }
    641 
    642             var contentScore = 0;
    643 
    644             /* Add a point for the paragraph itself as a base. */
    645             contentScore+=1;
    646 
    647             /* Add points for any commas within this paragraph */
    648             contentScore += innerText.split(',').length;
    649 
    650             /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
    651             contentScore += Math.min(Math.floor(innerText.length / 100), 3);
    652 
    653             /* Add the score to the parent. The grandparent gets half. */
    654             parentNode.readability.contentScore += contentScore;
    655 
    656             if(grandParentNode) {
    657                 grandParentNode.readability.contentScore += contentScore/2;
    658             }
    659         }
    660 
    661         /**
    662          * After we've calculated scores, loop through all of the possible candidate nodes we found
    663          * and find the one with the highest score.
    664         **/
    665         var topCandidate = null;
    666         for(var c=0, cl=candidates.length; c < cl; c+=1)
    667         {
    668             /**
    669              * Scale the final candidates score based on link density. Good content should have a
    670              * relatively small link density (5% or less) and be mostly unaffected by this operation.
    671             **/
    672             candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
    673 
    674             dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
    675 
    676             if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
    677                 topCandidate = candidates[c]; }
    678         }
    679 
    680         /**
    681          * If we still have no top candidate, just use the body as a last resort.
    682          * We also have to copy the body node so it is something we can modify.
    683          **/
    684         if (topCandidate === null || topCandidate.tagName === "BODY")
    685         {
    686             topCandidate = document.createElement("DIV");
    687             readability.replaceNodeInnards(page, topCandidate);
    688             page.appendChild(topCandidate);
    689             readability.initializeNode(topCandidate);
    690         }
    691 
    692         /**
    693          * Now that we have the top candidate, look through its siblings for content that might also be related.
    694          * Things like preambles, content split by ads that we removed, etc.
    695         **/
    696         var articleContent        = document.createElement("DIV");
    697         if (isPaging) {
    698             articleContent.id     = "readability-content";
    699         }
    700         var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
    701         var siblingNodes          = topCandidate.parentNode.childNodes;
    702 
    703 
    704         for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
    705             var siblingNode = siblingNodes[s];
    706             var append      = false;
    707 
    708             /**
    709              * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
    710              * Example of error visible here: http://www.esquire.com/features/honesty0707
    711             **/
    712             if(!siblingNode) {
    713                 continue;
    714             }
    715 
    716             dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
    717             dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
    718 
    719             if(siblingNode === topCandidate)
    720             {
    721                 append = true;
    722             }
    723 
    724             var contentBonus = 0;
    725             /* Give a bonus if sibling nodes and top candidates have the exact same classname */
    726             if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
    727                 contentBonus += topCandidate.readability.contentScore * 0.2;
    728             }
    729 
    730             if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
    731             {
    732                 append = true;
    733             }
    734 
    735             if(siblingNode.nodeName === "P") {
    736                 var linkDensity = readability.getLinkDensity(siblingNode);
    737                 var nodeContent = readability.getInnerText(siblingNode);
    738                 var nodeLength  = nodeContent.length;
    739 
    740                 if(nodeLength > 80 && linkDensity < 0.25)
    741                 {
    742                     append = true;
    743                 }
    744                 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
    745                 {
    746                     append = true;
    747                 }
    748             }
    749 
    750             if(append) {
    751                 dbg("Appending node: " + siblingNode);
    752 
    753                 var nodeToAppend = null;
    754                 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
    755                     /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
    756 
    757                     dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
    758                     nodeToAppend = document.createElement("DIV");
    759                     try {
    760                         nodeToAppend.id = siblingNode.id;
    761                         readability.moveNodeInnards(siblingNode, nodeToAppend);
    762                     }
    763                     catch(er) {
    764                         dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
    765                         nodeToAppend = siblingNode;
    766                         s-=1;
    767                         sl-=1;
    768                     }
    769                 } else {
    770                     nodeToAppend = siblingNode;
    771                     s-=1;
    772                     sl-=1;
    773                 }
    774 
    775                 /* To ensure a node does not interfere with readability styles, remove its classnames */
    776                 nodeToAppend.className = "";
    777 
    778                 /* Append sibling and subtract from our list because it removes the node when you append to another node */
    779                 articleContent.appendChild(nodeToAppend);
    780             }
    781         }
    782 
    783         /**
    784          * So we have all of the content that we need. Now we clean it up for presentation.
    785         **/
    786         readability.distilledArticleContent = articleContent.cloneNode(true);
    787         //readability.prepArticle(articleContent);
    788 
    789         if (readability.curPageNum === 1) {
    790             var newNode = document.createElement('div');
    791             newNode.id = "readability-page-1";
    792             newNode.setAttribute("class", "page");
    793             readability.moveNodeInnards(articleContent, newNode);
    794             articleContent.appendChild(newNode);
    795         }
    796 
    797         /**
    798          * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
    799          * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
    800          * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
    801          * finding the -right- content.
    802         **/
    803         if(readability.getInnerText(articleContent, false).length < 250) {
    804             if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
    805                 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
    806                 return readability.grabArticle(document.body);
    807             }
    808             else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
    809                 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
    810                 return readability.grabArticle(document.body);
    811             }
    812             else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
    813                 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
    814                 return readability.grabArticle(document.body);
    815             } else {
    816                 return null;
    817             }
    818         }
    819 
    820         return articleContent;
    821     },
    822 
    823     /**
    824      * Removes script tags from the document.
    825      *
    826      * @param Element
    827     **/
    828     removeScripts: function (doc) {
    829         var scripts = doc.getElementsByTagName('script');
    830         for(var i = scripts.length-1; i >= 0; i-=1)
    831         {
    832             if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
    833             {
    834                 scripts[i].nodeValue="";
    835                 scripts[i].removeAttribute('src');
    836                 if (scripts[i].parentNode) {
    837                         scripts[i].parentNode.removeChild(scripts[i]);
    838                 }
    839             }
    840         }
    841     },
    842 
    843     /**
    844      * Get the inner text of a node - cross browser compatibly.
    845      * This also strips out any excess whitespace to be found.
    846      *
    847      * @param Element
    848      * @return string
    849     **/
    850     getInnerText: function (e, normalizeSpaces) {
    851         var textContent    = "";
    852 
    853         if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
    854             return "";
    855         }
    856 
    857         normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
    858 
    859         if (navigator.appName === "Microsoft Internet Explorer") {
    860             textContent = e.innerText.replace( readability.regexps.trim, "" ); }
    861         else {
    862             textContent = e.textContent.replace( readability.regexps.trim, "" ); }
    863 
    864         if(normalizeSpaces) {
    865             return textContent.replace( readability.regexps.normalize, " "); }
    866         else {
    867             return textContent; }
    868     },
    869 
    870     /**
    871      * Get the number of times a string s appears in the node e.
    872      *
    873      * @param Element
    874      * @param string - what to split on. Default is ","
    875      * @return number (integer)
    876     **/
    877     getCharCount: function (e,s) {
    878         s = s || ",";
    879         return readability.getInnerText(e).split(s).length-1;
    880     },
    881 
    882     /**
    883      * Remove the style attribute on every e and under.
    884      * TODO: Test if getElementsByTagName(*) is faster.
    885      *
    886      * @param Element
    887      * @return void
    888     **/
    889     cleanStyles: function (e) {
    890         e = e || document;
    891         var cur = e.firstChild;
    892 
    893         if(!e) {
    894             return; }
    895 
    896         // Remove any root styles, if we're able.
    897         if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
    898             e.removeAttribute('style'); }
    899 
    900         // Go until there are no more child nodes
    901         while ( cur !== null ) {
    902             if ( cur.nodeType === 1 ) {
    903                 // Remove style attribute(s) :
    904                 if(cur.className !== "readability-styled") {
    905                     cur.removeAttribute("style");
    906                 }
    907                 readability.cleanStyles( cur );
    908             }
    909             cur = cur.nextSibling;
    910         }
    911     },
    912 
    913     /**
    914      * Get the density of links as a percentage of the content
    915      * This is the amount of text that is inside a link divided by the total text in the node.
    916      *
    917      * @param Element
    918      * @return number (float)
    919     **/
    920     getLinkDensity: function (e) {
    921         var links      = e.getElementsByTagName("a");
    922         var textLength = readability.getInnerText(e).length;
    923         var linkLength = 0;
    924         for(var i=0, il=links.length; i<il;i+=1)
    925         {
    926             linkLength += readability.getInnerText(links[i]).length;
    927         }
    928 
    929         return linkLength / textLength;
    930     },
    931 
    932     /**
    933      * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
    934      *
    935      * @author Dan Lacy
    936      * @return string the base url
    937     **/
    938     findBaseUrl: function () {
    939         var noUrlParams     = window.location.pathname.split("?")[0],
    940             urlSlashes      = noUrlParams.split("/").reverse(),
    941             cleanedSegments = [],
    942             possibleType    = "";
    943 
    944         for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
    945             var segment = urlSlashes[i];
    946 
    947             // Split off and save anything that looks like a file type.
    948             if (segment.indexOf(".") !== -1) {
    949                 possibleType = segment.split(".")[1];
    950 
    951                 /* If the type isn't alpha-only, it's probably not actually a file extension. */
    952                 if(!possibleType.match(/[^a-zA-Z]/)) {
    953                     segment = segment.split(".")[0];
    954                 }
    955             }
    956 
    957             /**
    958              * EW-CMS specific segment replacement. Ugly.
    959              * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
    960             **/
    961             if(segment.indexOf(',00') !== -1) {
    962                 segment = segment.replace(',00', '');
    963             }
    964 
    965             // If our first or second segment has anything looking like a page number, remove it.
    966             if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
    967                 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
    968             }
    969 
    970 
    971             var del = false;
    972 
    973             /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
    974             if (i < 2 && segment.match(/^\d{1,2}$/)) {
    975                 del = true;
    976             }
    977 
    978             /* If this is the first segment and it's just "index", remove it. */
    979             if(i === 0 && segment.toLowerCase() === "index") {
    980                 del = true;
    981             }
    982 
    983 
    984             /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
    985             if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
    986                 del = true;
    987             }
    988 
    989             /* If it's not marked for deletion, push it to cleanedSegments. */
    990             if (!del) {
    991                 cleanedSegments.push(segment);
    992             }
    993         }
    994 
    995         // This is our final, cleaned, base article URL.
    996         return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
    997     },
    998 
    999     /**
   1000      * Look for any paging links that may occur within the document.
   1001      *
   1002      * @param body
   1003      * @return string|null the href of the most likely next-page link, or null if no link scores at least 50
   1004     **/
   1005     findNextPageLink: function (elem) {
   1006         var possiblePages = {},
   1007             allLinks = elem.getElementsByTagName('a'),
   1008             articleBaseUrl = readability.findBaseUrl();
   1009 
   1010         /**
   1011          * Loop through all links, looking for hints that they may be next-page links.
   1012          * Things like having "page" in their textContent, className or id, or being a child
   1013          * of a node with a page-y className or id.
   1014          *
   1015          * Also possible: levenshtein distance? longest common subsequence?
   1016          *
   1017          * After we do that, assign each page a score, and
   1018         **/
   1019         for(var i = 0, il = allLinks.length; i < il; i+=1) {
   1020             var link     = allLinks[i],
   1021                 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
   1022 
   1023             /* If we've already seen this page, ignore it */
   1024             if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
   1025                 continue;
   1026             }
   1027 
   1028             /* If it's on a different domain, skip it. */
   1029             if(window.location.host !== linkHref.split(/\/+/g)[1]) {
   1030                 continue;
   1031             }
   1032 
   1033             var linkText = readability.getInnerText(link);
   1034 
   1035             /* If the linkText looks like it's not the next page, skip it. */
   1036             if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
   1037                 continue;
   1038             }
   1039 
   1040             /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
   1041             var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
   1042             if(!linkHrefLeftover.match(/\d/)) {
   1043                 continue;
   1044             }
   1045 
   1046             if(!(linkHref in possiblePages)) {
   1047                 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
   1048             } else {
   1049                 possiblePages[linkHref].linkText += ' | ' + linkText;
   1050             }
   1051 
   1052             var linkObj = possiblePages[linkHref];
   1053 
   1054             /**
   1055              * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
   1056              * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
   1057             **/
   1058             if(linkHref.indexOf(articleBaseUrl) !== 0) {
   1059                 linkObj.score -= 25;
   1060             }
   1061 
   1062             var linkData = linkText + ' ' + link.className + ' ' + link.id;
   1063             if(linkData.match(readability.regexps.nextLink)) {
   1064                 linkObj.score += 50;
   1065             }
   1066             if(linkData.match(/pag(e|ing|inat)/i)) {
   1067                 linkObj.score += 25;
   1068             }
   1069             if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or  in the text,
   1070                 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
   1071                 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
   1072                     linkObj.score -= 65;
   1073                 }
   1074             }
   1075             if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
   1076                 linkObj.score -= 50;
   1077             }
   1078             if(linkData.match(readability.regexps.prevLink)) {
   1079                 linkObj.score -= 200;
   1080             }
   1081 
   1082             /* If a parentNode contains page or paging or paginat */
   1083             var parentNode = link.parentNode,
   1084                 positiveNodeMatch = false,
   1085                 negativeNodeMatch = false;
   1086             while(parentNode) {
   1087                 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
   1088                 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
   1089                     positiveNodeMatch = true;
   1090                     linkObj.score += 25;
   1091                 }
   1092                 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
   1093                     /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
   1094                     if(!parentNodeClassAndId.match(readability.regexps.positive)) {
   1095                         linkObj.score -= 25;
   1096                         negativeNodeMatch = true;
   1097                     }
   1098                 }
   1099 
   1100                 parentNode = parentNode.parentNode;
   1101             }
   1102 
   1103             /**
   1104              * If the URL looks like it has paging in it, add to the score.
   1105              * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
   1106             **/
   1107             if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
   1108                 linkObj.score += 25;
   1109             }
   1110 
   1111             /* If the URL contains negative values, give a slight decrease. */
   1112             if (linkHref.match(readability.regexps.extraneous)) {
   1113                 linkObj.score -= 15;
   1114             }
   1115 
   1116             /**
   1117              * Minor punishment to anything that doesn't match our current URL.
   1118              * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
   1119              *       Dan, can you show me a counterexample where this is necessary?
   1120              * if (linkHref.indexOf(window.location.href) !== 0) {
   1121              *    linkObj.score -= 1;
   1122              * }
   1123             **/
   1124 
   1125             /**
   1126              * If the link text can be parsed as a number, give it a minor bonus, with a slight
   1127              * bias towards lower numbered pages. This is so that pages that might not have 'next'
   1128              * in their text can still get scored, and sorted properly by score.
   1129             **/
   1130             var linkTextAsNumber = parseInt(linkText, 10);
   1131             if(linkTextAsNumber) {
   1132                 // Punish 1 since we're either already there, or it's probably before what we want anyways.
   1133                 if (linkTextAsNumber === 1) {
   1134                     linkObj.score -= 10;
   1135                 }
   1136                 else {
   1137                     // Todo: Describe this better
   1138                     linkObj.score += Math.max(0, 10 - linkTextAsNumber);
   1139                 }
   1140             }
   1141         }
   1142 
   1143         /**
   1144          * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
   1145          * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
   1146         **/
   1147         var topPage = null;
   1148         for(var page in possiblePages) {
   1149             if(possiblePages.hasOwnProperty(page)) {
   1150                 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
   1151                     topPage = possiblePages[page];
   1152                 }
   1153             }
   1154         }
   1155 
   1156         if(topPage) {
   1157             var nextHref = topPage.href.replace(/\/$/,'');
   1158 
   1159             dbg('NEXT PAGE IS ' + nextHref);
   1160             readability.parsedPages[nextHref] = true;
   1161             return nextHref;
   1162         }
   1163         else {
   1164             return null;
   1165         }
   1166     },
   1167 
   1168     createLinkDiv: function(link) {
   1169         var divNode = document.createElement('div');
   1170         var aNode = document.createElement('a');
   1171         var tNode = document.createTextNode('View Next Page');
   1172         divNode.setAttribute('style', 'text-align: center');
   1173         aNode.setAttribute('href', link);
   1174         aNode.appendChild(tNode);
   1175         divNode.appendChild(aNode);
   1176         return divNode;
   1177     },
   1178 
   1179     xhr: function () {
   1180         if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
   1181             return new XMLHttpRequest();
   1182         }
   1183         else {
   1184             try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
   1185             try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
   1186             try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
   1187         }
   1188 
   1189         return false;
   1190     },
   1191 
   1192     successfulRequest: function (request) {
   1193         return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
   1194     },
   1195 
   1196     ajax: function (url, options) {
   1197         var request = readability.xhr();
   1198 
   1199         function respondToReadyState(readyState) {
   1200             if (request.readyState === 4) {
   1201                 if (readability.successfulRequest(request)) {
   1202                     if (options.success) { options.success(request); }
   1203                 }
   1204                 else {
   1205                     if (options.error) { options.error(request); }
   1206                 }
   1207             }
   1208         }
   1209 
   1210         if (typeof options === 'undefined') { options = {}; }
   1211 
   1212         request.onreadystatechange = respondToReadyState;
   1213 
   1214         request.open('get', url, true);
   1215         request.setRequestHeader('Accept', 'text/html');
   1216 
   1217         try {
   1218             request.send(options.postBody);
   1219         }
   1220         catch (e) {
   1221             if (options.error) { options.error(); }
   1222         }
   1223 
   1224         return request;
   1225     },
   1226 
   1227     /**
   1228      * Make an AJAX request for each page and append it to the document.
   1229     **/
   1230     curPageNum: 1,
   1231 
   1232     appendNextPage: function (nextPageLink) {
   1233         readability.curPageNum+=1;
   1234 
   1235         var articlePage       = document.createElement("DIV");
   1236         articlePage.id        = 'readability-page-' + readability.curPageNum;
   1237         articlePage.className = 'page';
   1238         articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
   1239 
   1240         document.getElementById("readability-content").appendChild(articlePage);
   1241 
   1242         if(readability.curPageNum > readability.maxPages) {
   1243             var linkDiv = readability.createLinkDiv(nextPageLink);
   1244 
   1245             articlePage.appendChild(linkDiv);
   1246             return;
   1247         }
   1248 
   1249         /**
   1250          * Now that we've built the article page DOM element, get the page content
   1251          * asynchronously and load the cleaned content into the div we created for it.
   1252         **/
   1253         (function(pageUrl, thisPage) {
   1254             readability.ajax(pageUrl, {
   1255                 success: function(r) {
   1256 
   1257                     /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
   1258                     var eTag = r.getResponseHeader('ETag');
   1259                     if(eTag) {
   1260                         if(eTag in readability.pageETags) {
   1261                             dbg("Exact duplicate page found via ETag. Aborting.");
   1262                             articlePage.style.display = 'none';
   1263                             return;
   1264                         } else {
   1265                             readability.pageETags[eTag] = 1;
   1266                         }
   1267                     }
   1268 
   1269                     // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
   1270                     var page = document.createElement("DIV");
   1271 
   1272                     /**
   1273                      * Do some preprocessing to our HTML to make it ready for appending.
   1274                      *  Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
   1275                      *  Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
   1276                      *  Turn all double br's into p's - was handled by prepDocument in the original view.
   1277                      *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
   1278                     **/
   1279                     var pageInnards = r.responseXML;
   1280                     readability.removeScripts(pageInnards);
   1281                     readability.replaceNoscriptsWithPs(pageInnards);
   1282                     readability.replaceDoubleBrsWithPs(pageInnards);
   1283                     readability.replaceFontsWithSpans(pageInnards);
   1284                     page.appendChild(pageInnards);
   1285 
   1286 
   1287                     /**
   1288                      * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
   1289                     **/
   1290                     readability.flags = 0x1 | 0x2 | 0x4;
   1291 
   1292                     var nextPageLink = readability.findNextPageLink(page),
   1293                         content      =  readability.grabArticle(page);
   1294 
   1295                     if(!content) {
   1296                         dbg("No content found in page to append. Aborting.");
   1297                         return;
   1298                     }
   1299 
   1300                     /**
   1301                      * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
   1302                      * Compare it against all of the the previous document's we've gotten. If the previous
   1303                      * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
   1304                     **/
   1305                     var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
   1306                     if(firstP && firstP.innerHTML.length > 100) {
   1307                         for(var i=1; i <= readability.curPageNum; i+=1) {
   1308                             var rPage = document.getElementById('readability-page-' + i);
   1309                             if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
   1310                                 dbg('Duplicate of page ' + i + ' - skipping.');
   1311                                 articlePage.style.display = 'none';
   1312                                 readability.parsedPages[pageUrl] = true;
   1313                                 return;
   1314                             }
   1315                         }
   1316                     }
   1317 
   1318                     readability.removeScripts(content);
   1319 
   1320                     readability.moveNodeInnards(content, thisPage);
   1321 
   1322                     /**
   1323                      * After the page has rendered, post process the content. This delay is necessary because,
   1324                      * in webkit at least, offsetWidth is not set in time to determine image width. We have to
   1325                      * wait a little bit for reflow to finish before we can fix floating images.
   1326                     **/
   1327                     window.setTimeout(
   1328                         function() { readability.postProcessContent(thisPage); },
   1329                         500
   1330                     );
   1331 
   1332                     if(nextPageLink) {
   1333                         readability.appendNextPage(nextPageLink);
   1334                     }
   1335                 }
   1336             });
   1337         }(nextPageLink, articlePage));
   1338     },
   1339 
   1340     /**
     * Get an element's class/id weight. Uses regular expressions to tell if this
     * element looks good or bad.
   1343      *
   1344      * @param Element
   1345      * @return number (Integer)
   1346     **/
   1347     getClassWeight: function (e) {
   1348         if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
   1349             return 0;
   1350         }
   1351 
   1352         var weight = 0;
   1353 
   1354         /* Look for a special classname */
   1355         if (typeof(e.className) === 'string' && e.className !== '')
   1356         {
   1357             if(e.className.search(readability.regexps.negative) !== -1) {
   1358                 weight -= 25; }
   1359 
   1360             if(e.className.search(readability.regexps.positive) !== -1) {
   1361                 weight += 25; }
   1362         }
   1363 
   1364         /* Look for a special ID */
   1365         if (typeof(e.id) === 'string' && e.id !== '')
   1366         {
   1367             if(e.id.search(readability.regexps.negative) !== -1) {
   1368                 weight -= 25; }
   1369 
   1370             if(e.id.search(readability.regexps.positive) !== -1) {
   1371                 weight += 25; }
   1372         }
   1373 
   1374         return weight;
   1375     },
   1376 
   1377     nodeIsVisible: function (node) {
   1378         return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
   1379     },
   1380 
   1381     /**
   1382      * Remove extraneous break tags from a node.
   1383      *
   1384      * @param Element
   1385      * @return void
   1386      **/
   1387     killBreaks: function (e) {
   1388         var allElements = e.getElementsByTagName('*');
   1389         while (i < allElements.length) {
   1390             readability.deleteExtraBreaks(allElements[i]);
   1391             i++;
   1392         }
   1393     },
   1394 
   1395     /**
   1396      * Clean a node of all elements of type "tag".
   1397      * (Unless it's a youtube/vimeo video. People love movies.)
   1398      *
   1399      * @param Element
   1400      * @param string tag to clean
   1401      * @return void
   1402      **/
   1403     clean: function (e, tag) {
   1404         var targetList = e.getElementsByTagName( tag );
   1405         var isEmbed    = (tag === 'object' || tag === 'embed');
   1406 
   1407         for (var y=targetList.length-1; y >= 0; y-=1) {
   1408             /* Allow youtube and vimeo videos through as people usually want to see those. */
   1409             if(isEmbed) {
   1410                 var attributeValues = "";
   1411                 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
   1412                     attributeValues += targetList[y].attributes[i].value + '|';
   1413                 }
   1414 
   1415                 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
   1416                 if (attributeValues.search(readability.regexps.videos) !== -1) {
   1417                     continue;
   1418                 }
   1419 
   1420                 /* Then check the elements inside this element for the same. */
   1421                 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
   1422                     continue;
   1423                 }
   1424 
   1425             }
   1426 
   1427             targetList[y].parentNode.removeChild(targetList[y]);
   1428         }
   1429     },
   1430 
   1431     /**
   1432      * Clean an element of all tags of type "tag" if they look fishy.
   1433      * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   1434      *
   1435      * @return void
   1436      **/
   1437     cleanConditionally: function (e, tag) {
   1438 
   1439         if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
   1440             return;
   1441         }
   1442 
   1443         var tagsList      = e.getElementsByTagName(tag);
   1444         var curTagsLength = tagsList.length;
   1445 
   1446         /**
   1447          * Gather counts for other typical elements embedded within.
   1448          * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
   1449          *
   1450          * TODO: Consider taking into account original contentScore here.
   1451         **/
   1452         for (var i=curTagsLength-1; i >= 0; i-=1) {
   1453             var weight = readability.getClassWeight(tagsList[i]);
   1454             var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
   1455 
   1456             dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
   1457 
   1458             if(weight+contentScore < 0)
   1459             {
   1460                 tagsList[i].parentNode.removeChild(tagsList[i]);
   1461             }
   1462             else if ( readability.getCharCount(tagsList[i],',') < 10) {
   1463                 /**
   1464                  * If there are not very many commas, and the number of
   1465                  * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
   1466                 **/
   1467                 var p      = tagsList[i].getElementsByTagName("p").length;
   1468                 var img    = tagsList[i].getElementsByTagName("img").length;
   1469                 var li     = tagsList[i].getElementsByTagName("li").length-100;
   1470                 var input  = tagsList[i].getElementsByTagName("input").length;
   1471 
   1472                 var embedCount = 0;
   1473                 var embeds     = tagsList[i].getElementsByTagName("embed");
   1474                 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
   1475                     if (embeds[ei].src.search(readability.regexps.videos) === -1) {
   1476                       embedCount+=1;
   1477                     }
   1478                 }
   1479 
   1480                 var linkDensity   = readability.getLinkDensity(tagsList[i]);
   1481                 var contentLength = readability.getInnerText(tagsList[i]).length;
   1482                 var toRemove      = false;
   1483 
   1484                 if ( img > p ) {
   1485                     toRemove = true;
   1486                 } else if(li > p && tag !== "ul" && tag !== "ol") {
   1487                     toRemove = true;
   1488                 } else if( input > Math.floor(p/3) ) {
   1489                     toRemove = true;
   1490                 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
   1491                     toRemove = true;
   1492                 } else if(weight < 25 && linkDensity > 0.2) {
   1493                     toRemove = true;
   1494                 } else if(weight >= 25 && linkDensity > 0.5) {
   1495                     toRemove = true;
   1496                 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
   1497                     toRemove = true;
   1498                 }
   1499 
   1500                 if(toRemove) {
   1501                     tagsList[i].parentNode.removeChild(tagsList[i]);
   1502                 }
   1503             }
   1504         }
   1505     },
   1506 
   1507     /**
   1508      * Clean out spurious headers from an Element. Checks things like classnames and link density.
   1509      *
   1510      * @param Element
   1511      * @return void
   1512     **/
   1513     cleanHeaders: function (e) {
   1514         for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
   1515             var headers = e.getElementsByTagName('h' + headerIndex);
   1516             for (var i=headers.length-1; i >=0; i-=1) {
   1517                 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
   1518                     headers[i].parentNode.removeChild(headers[i]);
   1519                 }
   1520             }
   1521         }
   1522     },
   1523 
   1524     flagIsActive: function(flag) {
   1525         return (readability.flags & flag) > 0;
   1526     },
   1527 
   1528     addFlag: function(flag) {
   1529         readability.flags = readability.flags | flag;
   1530     },
   1531 
   1532     removeFlag: function(flag) {
   1533         readability.flags = readability.flags & ~flag;
   1534     },
   1535 
   1536     // Removes the children of |src| and appends them to |dest|.
   1537     moveNodeInnards: function(src, dest) {
   1538         try {
   1539             while (src.firstChild) {
   1540                 dest.appendChild(src.removeChild(src.firstChild));
   1541             }
   1542         } catch (e) {}
   1543     },
   1544 
   1545     // Returns true if the node is a whitespace text node.
   1546     isWhitespaceNode: function(node) {
   1547         if (node.nodeType == Node.TEXT_NODE) {
   1548             if (node.data.trim().length == 0) {
   1549                return true;
   1550             }
   1551         }
   1552         return false;
   1553     },
   1554 
   1555     // Returns true if the node is a <BR>.
   1556     isBrNode: function(node) {
   1557         return (node.tagName === 'BR');
   1558     },
   1559 
   1560 
   1561     // Returns the last <BR> node in a sequence of <BR> nodes that are only
   1562     // separated by whitespace, or null if there are not at least two <BR> tags
   1563     // in the sibling chain starting with |node|. Returns the second such <BR>
   1564     // node if |restrictToTwo| is true.
   1565     isMultipleBr: function(node, restrictToTwo) {
   1566         var lastBr = null;
   1567         if (!readability.isBrNode(node)) {
   1568             return lastBr;
   1569         }
   1570         var curr = node.nextSibling;
   1571         while (curr) {
   1572             if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
   1573                 lastBr = curr;
   1574                 curr = curr.nextSibling;
   1575                 if (restrictToTwo) {
   1576                     if (readability.isBrNode(lastBr)) {
   1577                         return lastBr;
   1578                     }
   1579                 }
   1580                 continue;
   1581             }
   1582             break;
   1583         }
   1584         return lastBr;
   1585     },
   1586 
   1587     // Removes all <BR> nodes except one and whitespace in between in a series
   1588     // of <BR> nodes.
   1589     deleteExtraBreaks: function(node) {
   1590         var lastBr = readability.isMultipleBr(node, false);
   1591         var ret = false;
   1592         while (lastBr && lastBr != node) {
   1593             var toRemove = lastBr;
   1594             lastBr = lastBr.previousSibling;
   1595             toRemove.parentNode.removeChild(toRemove);
   1596             ret = true;
   1597         }
   1598         return ret;
   1599     },
   1600 
   1601     // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
   1602     // <P> node, and makes all next siblings of that pair children of <P>, up
   1603     // until the next pair of <BR> nodes is reached.
   1604     replaceDoubleBrWithP: function(node) {
   1605         // Check that we are starting with a BR.
   1606         var second = readability.isMultipleBr(node, true);
   1607         if (!second) {
   1608             return;
   1609         }
   1610         // Make all next siblings of the second BR into children of a P.
   1611         var p = document.createElement('p');
   1612         var curr = second.nextSibling;
   1613         while (curr) {
   1614             if (readability.isMultipleBr(curr, true)) {
   1615                 break;
   1616             }
   1617             var next = curr.nextSibling;
   1618             p.appendChild(curr.parentNode.removeChild(curr));
   1619             curr = next;
   1620         }
   1621         var ret = curr;
   1622 
   1623         // Remove all nodes between the first and second BR.
   1624         curr = node.nextSibling;
   1625         while (curr && curr != second) {
   1626             var next = curr.nextSibling;
   1627             curr.parentNode.removeChild(curr);
   1628             curr = next;
   1629         }
   1630         // Remove the second BR.
   1631         second.parentNode.removeChild(second);
   1632         // Replace the first BR with the P.
   1633         node.parentNode.replaceChild(p, node);
   1634 
   1635         return ret;
   1636     },
   1637 
   1638     // Returns true if the NodeList contains a double <BR>.
   1639     hasDoubleBr: function(nodeList) {
   1640         for (var i = 0; i < nodeList.length; nodeList++) {
   1641             if (readability.isMultipleBr(nodeList[i], true)) {
   1642                 return true;
   1643             }
   1644         }
   1645         return false;
   1646     },
   1647 
   1648     // Replaces double <BR> tags with <P> tags.
   1649     replaceDoubleBrsWithPs: function(node) {
   1650         var allElements = node.getElementsByTagName('BR');
   1651         var node = null;
   1652         while (allElements && allElements.length > 0 &&
   1653                readability.hasDoubleBr(allElements)) {
   1654             for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
   1655                 var next = node;
   1656                 while (next = readability.replaceDoubleBrWithP(next));
   1657             }
   1658             allElements = document.body.getElementsByTagName('BR');
   1659         }
   1660     },
   1661 
   1662 
   1663     // Replaces a BR and the whitespace that follows it with a P.
   1664     replaceBrWithP: function(node) {
   1665         if (!readability.isBrNode(node)) {
   1666             return;
   1667         }
   1668         var p = document.createElement('p');
   1669         var curr = node.nextSibling;
   1670         while (curr && !isBrNode(curr)) {
   1671             var next = curr.nextSibling;
   1672             if (readability.isWhitespaceNode(curr)) {
   1673                 curr.parentNode.removeChild(curr);
   1674             } else {
   1675                 p.appendChild(curr.parentNode.removeChild(curr));
   1676             }
   1677             curr = next;
   1678         }
   1679         node.parentNode.replaceChild(p, node);
   1680         return curr;
   1681     },
   1682 
   1683     // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
   1684     // children of the <P>.
   1685     replaceBrsWithPs: function(node) {
   1686         var allElements = node.getElementsByTagName('BR');
   1687         var node = null;
   1688         while (allElements && allElements.length > 0) {
   1689             for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
   1690                 var next = node;
   1691                 while (next = readability.replaceBrWithP(next));
   1692             }
   1693             allElements = document.body.getElementsByTagName('BR');
   1694         }
   1695     },
   1696 
   1697     // Replaces any tag with any other tag.
   1698     replaceTagsWithTags: function(node, srcTag, destTag) {
   1699         var allElements = node.getElementsByTagName(srcTag);
   1700         for (var i = 0; i < allElements.length; i++) {
   1701             var dest = document.createElement(destTag);
   1702             readability.moveNodeInnards(allElements[i], dest);
   1703             allElements[i].parentNode.replaceChild(dest, allElements[i]);
   1704         }
   1705     },
   1706 
   1707     // Replaces all <noscript> tags with <p> tags.
   1708     replaceNoscriptsWithPs: function(node) {
   1709         readability.replaceTagsWithTags(node, 'noscript', 'p');
   1710     },
   1711 
   1712     // Replaces all <font> tags with <span> tags.
   1713     replaceFontsWithSpans: function(node) {
   1714         readability.replaceTagsWithTags(node, 'font', 'span');
   1715     },
   1716 
   1717     // Returns a list of image URLs in the distilled article.
   1718     getImages : function() {
   1719         var images = document.getElementsByTagName('img');
   1720         var result = new Array(images.length);
   1721         dbg("Number of images: " + images.length);
   1722         for(i = 0; i < images.length; i++) {
   1723             result[i] = images[i].src;
   1724             dbg("Image: " + result[i]);
   1725         }
   1726         return result;
   1727     },
   1728 
   1729     // Returns the distilled article HTML from the page(s).
   1730     getDistilledArticleHTML : function() {
   1731         return readability.distilledHTML;
   1732     },
   1733 
   1734     // Returns the next page of this article.
   1735     getNextPageLink : function() {
   1736         return readability.nextPageLink;
   1737     }
   1738 };
   1739