Home | History | Annotate | Download | only in doc
      1 // Copyright 2005 Google Inc.
      2 // All Rights Reserved.
      3 //
      4 // msamuel (a] google.com
      5 
      6 // Usage:
      7 // 1) include this source file in an html page via
      8 // <script type=text/javascript src=prettify.js></script>
      9 // 2) define style rules.  See the example page for examples.
     10 // 3) mark the <pre> and <code> tags in your source with class=prettyprint.
     11 //    You can also use the (html deprecated) <xmp> tag, but the pretty printer
     12 //    needs to do more substantial DOM manipulations to support that, so some
     13 //    css styles may not be preserved.
     14 
     15 // Change log:
     16 // cbeust, 2006/08/22
     17 //   Java annotations (start with "@") are now captured as literals ("lit")
     18 // 
     19 
     20 var PR_keywords = new Object();
     21 /** initialize the keyword list for our target languages. */
     22 (function () {
     23   var CPP_KEYWORDS = (
     24     "bool break case catch char class const const_cast continue default " +
     25     "delete deprecated dllexport dllimport do double dynamic_cast else enum " +
     26     "explicit extern false float for friend goto if inline int long mutable " +
     27     "naked namespace new noinline noreturn nothrow novtable operator private " +
     28     "property protected public register reinterpret_cast return selectany " +
     29     "short signed sizeof static static_cast struct switch template this " +
     30     "thread throw true try typedef typeid typename union unsigned using " +
     31     "declaration, using directive uuid virtual void volatile while typeof");
     32   var JAVA_KEYWORDS = (
     33     "abstract default goto package synchronized boolean do if private this " +
     34     "break double implements protected throw byte else import public throws " +
     35     "case enum instanceof return transient catch extends int short try char " +
     36     "final interface static void class finally long strictfp volatile const " +
     37     "float native super while continue for new switch");
     38   var PYTHON_KEYWORDS = (
     39     "and assert break class continue def del elif else except exec finally " +
     40     "for from global if import in is lambda not or pass print raise return " +
     41     "try while yield");
     42   var JSCRIPT_KEYWORDS = (
     43     "abstract boolean break byte case catch char class const continue " +
     44     "debugger default delete do double else enum export extends false final " +
     45     "finally float for function goto if implements import in instanceof int " +
     46     "interface long native new null package private protected public return " +
     47     "short static super switch synchronized this throw throws transient " +
     48     "true try typeof var void volatile while with NaN Infinity");
     49   var PERL_KEYWORDS = (
     50     "foreach require sub unless until use elsif BEGIN END");
     51   var SH_KEYWORDS = (
     52     "if then do else fi end");
     53   var KEYWORDS = [CPP_KEYWORDS, JAVA_KEYWORDS, PYTHON_KEYWORDS,
     54                   JSCRIPT_KEYWORDS, PERL_KEYWORDS, SH_KEYWORDS];
     55   for (var k = 0; k < KEYWORDS.length; k++) {
     56     var kw = KEYWORDS[k].split(' ');
     57     for (var i = 0; i < kw.length; i++) {
     58       if (kw[i]) { PR_keywords[kw[i]] = true; }
     59     }
     60   }
     61 }).call(this);
     62 
     63 // token style names.  correspond to css classes
     64 /** token style for a string literal */
     65 var PR_STRING = 'str';
     66 /** token style for a keyword */
     67 var PR_KEYWORD = 'kwd';
     68 /** token style for a comment */
     69 var PR_COMMENT = 'com';
     70 /** token style for a type */
     71 var PR_TYPE = 'typ';
     72 /** token style for a literal value.  e.g. 1, null, true. */
     73 var PR_LITERAL = 'lit';
     74 /** token style for a punctuation string. */
     75 var PR_PUNCTUATION = 'pun';
     76 /** token style for a punctuation string. */
     77 var PR_PLAIN = 'pln';
     78 
     79 /** token style for an sgml tag. */
     80 var PR_TAG = 'tag';
     81 /** token style for a markup declaration such as a DOCTYPE. */
     82 var PR_DECLARATION = 'dec';
     83 /** token style for embedded source. */
     84 var PR_SOURCE = 'src';
     85 /** token style for an sgml attribute name. */
     86 var PR_ATTRIB_NAME = 'atn';
     87 /** token style for an sgml attribute value. */
     88 var PR_ATTRIB_VALUE = 'atv';
     89 
     90 /** the position of the end of a token during.  A division of a string into
     91   * n tokens can be represented as a series n - 1 token ends, as long as
     92   * runs of whitespace warrant their own token.
     93   * @private
     94   */
     95 function PR_TokenEnd(end, style) {
     96   if (undefined === style) { throw new Error('BAD'); }
     97   if ('number' != typeof(end)) { throw new Error('BAD'); }
     98   this.end = end;
     99   this.style = style;
    100 }
    101 PR_TokenEnd.prototype.toString = function () {
    102   return '[PR_TokenEnd ' + this.end +
    103     (this.style ? ':' + this.style : '') + ']';
    104 };
    105 
    106 
    107 /** a chunk of text with a style.  These are used to represent both the output
    108   * from the lexing functions as well as intermediate results.
    109   * @constructor
    110   * @param token the token text
    111   * @param style one of the token styles defined in designdoc-template, or null
    112   *   for a styleless token, such as an embedded html tag.
    113   * @private
    114   */
    115 function PR_Token(token, style) {
    116   if (undefined === style) { throw new Error('BAD'); }
    117   this.token = token;
    118   this.style = style;
    119 }
    120 
    121 PR_Token.prototype.toString = function () {
    122   return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
    123 };
    124 
    125 
    126 /** a helper class that decodes common html entities used to escape source and
    127   * markup punctuation characters in html.
    128   * @constructor
    129   * @private
    130   */
    131 function PR_DecodeHelper() {
    132   this.next = 0;
    133   this.ch = '\0';
    134 }
    135 
    136 PR_DecodeHelper.prototype.decode = function (s, i) {
    137   var next = i + 1;
    138   var ch = s.charAt(i);
    139   if ('&' == ch) {
    140     var semi = s.indexOf(';', next);
    141     if (semi >= 0 && semi < next + 4) {
    142       var entityName = s.substring(next, semi).toLowerCase();
    143       next = semi + 1;
    144       if ('lt' == entityName) {
    145         ch = '<';
    146       } else if ('gt' == entityName) {
    147         ch = '>';
    148       } else if ('quot' == entityName) {
    149         ch = '"';
    150       } else if ('apos' == entityName) {
    151         ch = '\'';
    152       } else if ('amp' == entityName) {
    153         ch = '&';
    154       } else {
    155         next = i + 1;
    156       }
    157     }
    158   }
    159   this.next = next;
    160   this.ch = ch;
    161   return this.ch;
    162 }
    163 
    164 
    165 // some string utilities
    166 function PR_isWordChar(ch) {
    167   return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    168 }
    169 
    170 function PR_isIdentifierStart(ch) {
    171   return PR_isWordChar(ch) || ch == '_' || ch == '$' || ch == '@';
    172 }
    173 
    174 function PR_isIdentifierPart(ch) {
    175   return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
    176 }
    177 
    178 function PR_isSpaceChar(ch) {
    179   return "\t \r\n".indexOf(ch) >= 0;
    180 }
    181 
    182 function PR_isDigitChar(ch) {
    183   return ch >= '0' && ch <= '9';
    184 }
    185 
    186 function PR_trim(s) {
    187   var i = 0, j = s.length - 1;
    188   while (i <= j && PR_isSpaceChar(s.charAt(i))) { ++i; }
    189   while (j > i && PR_isSpaceChar(s.charAt(j))) { --j; }
    190   return s.substring(i, j + 1);
    191 }
    192 
    193 function PR_startsWith(s, prefix) {
    194   return s.length >= prefix.length && prefix == s.substring(0, prefix.length);
    195 }
    196 
    197 function PR_endsWith(s, suffix) {
    198   return s.length >= suffix.length &&
    199          suffix == s.substring(s.length - suffix.length, s.length);
    200 }
    201 
    202 /** true iff prefix matches the first prefix characters in chars[0:len].
    203   * @private
    204   */
    205 function PR_prefixMatch(chars, len, prefix) {
    206   if (len < prefix.length) { return false; }
    207   for (var i = 0, n = prefix.length; i < n; ++i) {
    208     if (prefix.charAt(i) != chars[i]) { return false; }
    209   }
    210   return true;
    211 }
    212 
    213 /** used to convert html special characters embedded in XMP tags into html. */
    214 function PR_textToHtml(str) {
    215   return str.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    216 }
    217 
    218 
    219 /** split markup into chunks of html tags (style null) and
    220   * plain text (style {@link #PR_PLAIN}).
    221   *
    222   * @param s a String of html.
    223   * @return an Array of PR_Tokens of style PR_PLAIN and null.
    224   * @private
    225   */
    226 function PR_chunkify(s) {
    227   var chunks = new Array();
    228   var state = 0;
    229   var start = 0;
    230   var pos = -1;
    231   for (var i = 0, n = s.length; i < n; ++i) {
    232     var ch = s.charAt(i);
    233     switch (state) {
    234       case 0:
    235         if ('<' == ch) { state = 1; }
    236         break;
    237       case 1:
    238         pos = i - 1;
    239         if ('/' == ch) { state = 2; }
    240         else if (PR_isWordChar(ch)) { state = 3; }
    241         else if ('<' == ch) { state = 1; }
    242         else { state = 0; }
    243         break;
    244       case 2:
    245         if (PR_isWordChar(ch)) { state = 3; }
    246         else if ('<' == ch) { state = 1; }
    247         else { state = 0; }
    248         break;
    249       case 3:
    250         if ('>' == ch) {
    251           if (pos > start) {
    252             chunks.push(new PR_Token(s.substring(start, pos), PR_PLAIN));
    253           }
    254           chunks.push(new PR_Token(s.substring(pos, i + 1), null));
    255           start = i + 1;
    256           pos = -1;
    257           state = 0;
    258         }
    259         break;
    260     }
    261   }
    262   if (s.length > start) {
    263     chunks.push(new PR_Token(s.substring(start, s.length), PR_PLAIN));
    264   }
    265   return chunks;
    266 }
    267 
    268 /** splits chunks around entities.
    269   * @private
    270   */
    271 function PR_splitEntities(chunks) {
    272   var chunksOut = new Array();
    273   var state = 0;
    274   for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    275     var chunk = chunks[ci];
    276     if (PR_PLAIN != chunk.style) {
    277       chunksOut.push(chunk);
    278       continue;
    279     }
    280     var s = chunk.token;
    281     var pos = 0;
    282     var start;
    283     for (var i = 0; i < s.length; ++i) {
    284       var ch = s.charAt(i);
    285       switch (state) {
    286         case 0:
    287           if ('&' == ch) { state = 1; }
    288           break;
    289         case 1:
    290           if ('#' == ch || PR_isWordChar(ch)) {
    291             start = i - 1;
    292             state = 2;
    293           } else {
    294             state = 0;
    295           }
    296           break;
    297         case 2:
    298           if (';' == ch) {
    299             if (start > pos) {
    300               chunksOut.push(
    301                   new PR_Token(s.substring(pos, start), chunk.style));
    302             }
    303             chunksOut.push(new PR_Token(s.substring(start, i + 1), null));
    304             pos = i + 1;
    305             state = 0;
    306           }
    307           break;
    308       }
    309     }
    310     if (s.length > pos) {
    311       chunksOut.push(pos ?
    312                      new PR_Token(s.substring(pos, s.length), chunk.style) :
    313                      chunk);
    314     }
    315   }
    316   return chunksOut;
    317 }
    318 
    319 /** walk the tokenEnds list and the chunk list in parallel to generate a list
    320   * of split tokens.
    321   * @private
    322   */
    323 function PR_splitChunks(chunks, tokenEnds) {
    324   var tokens = new Array();  // the output
    325 
    326   var ci = 0;  // index into chunks
    327   // position of beginning of amount written so far in absolute space.
    328   var posAbs = 0;
    329   // position of amount written so far in chunk space
    330   var posChunk = 0;
    331 
    332   // current chunk
    333   var chunk = new PR_Token('', null);
    334 
    335   for (var ei = 0, ne = tokenEnds.length; ei < ne; ++ei) {
    336     var tokenEnd = tokenEnds[ei];
    337     var end = tokenEnd.end;
    338 
    339     var tokLen = end - posAbs;
    340     var remainingInChunk = chunk.token.length - posChunk;
    341     while (remainingInChunk <= tokLen) {
    342       if (remainingInChunk > 0) {
    343         tokens.push(
    344             new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
    345                          null == chunk.style ? null : tokenEnd.style));
    346       }
    347       posAbs += remainingInChunk;
    348       posChunk = 0;
    349       if (ci < chunks.length) { chunk = chunks[ci++]; }
    350 
    351       tokLen = end - posAbs;
    352       remainingInChunk = chunk.token.length - posChunk;
    353     }
    354 
    355     if (tokLen) {
    356       tokens.push(
    357           new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
    358                        tokenEnd.style));
    359       posAbs += tokLen;
    360       posChunk += tokLen;
    361     }
    362   }
    363 
    364   return tokens;
    365 }
    366 
    367 /** splits markup tokens into declarations, tags, and source chunks.
    368   * @private
    369   */
    370 function PR_splitMarkup(chunks) {
    371   // A state machine to split out declarations, tags, etc.
    372   // This state machine deals with absolute space in the text, indexed by k,
    373   // and position in the current chunk, indexed by pos and tokenStart to
    374   // generate a list of the ends of tokens.
    375   // Absolute space is calculated by considering the chunks as appended into
    376   // one big string, as they were before being split.
    377 
    378   // Known failure cases
    379   // Server side scripting sections such as <?...?> in attributes.
    380   // i.e. <span class="<? foo ?>">
    381   // Handling this would require a stack, and we don't use PHP.
    382 
    383   // The output: a list of pairs of PR_TokenEnd instances
    384   var tokenEnds = new Array();
    385 
    386   var state = 0;  // FSM state variable
    387   var k = 0;  // position in absolute space of the start of the current chunk
    388   var tokenStart = -1;  // the start of the current token
    389 
    390   // Try to find a closing tag for any open <style> or <script> tags
    391   // We can't do this at a later stage because then the following case
    392   // would fail:
    393   // <script>document.writeln('<!--');</script>
    394 
    395   // We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
    396   // can check whether to enter into a no scripting section when the tag ends.
    397   var tokenChars = new Array(12);
    398   var tokenCharsI = 0;
    399   // if non null, the tag prefix that we need to see to break out.
    400   var endScriptTag = null;
    401   var decodeHelper = new PR_DecodeHelper();
    402 
    403   for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    404     var chunk = chunks[ci];
    405     if (PR_PLAIN != chunk.style) {
    406       k += chunk.token.length;
    407       continue;
    408     }
    409 
    410     var s = chunk.token;
    411     var pos = 0;  // the position past the last character processed so far in s
    412 
    413     for (var i = 0, n = s.length; i < n; /* i = next at bottom */) {
    414       decodeHelper.decode(s, i);
    415       var ch = decodeHelper.ch;
    416       var next = decodeHelper.next;
    417 
    418       var tokenStyle = null;
    419       switch (state) {
    420         case 0:
    421           if ('<' == ch) { state = 1; }
    422           break;
    423         case 1:
    424           tokenCharsI = 0;
    425           if ('/' == ch) {  // only consider close tags if we're in script/style
    426             state = 7;
    427           } else if (null == endScriptTag) {
    428             if ('!' == ch) {
    429               state = 2;
    430             } else if (PR_isWordChar(ch)) {
    431               state = 8;
    432             } else if ('?' == ch) {
    433               state = 9;
    434             } else if ('%' == ch) {
    435               state = 11;
    436             } else if ('<' != ch) {
    437               state = 0;
    438             }
    439           } else if ('<' != ch) {
    440             state = 0;
    441           }
    442           break;
    443         case 2:
    444           if ('-' == ch) {
    445             state = 4;
    446           } else if (PR_isWordChar(ch)) {
    447             state = 3;
    448           } else if ('<' == ch) {
    449             state = 1;
    450           } else {
    451             state = 0;
    452           }
    453           break;
    454         case 3:
    455           if ('>' == ch) {
    456             state = 0;
    457             tokenStyle = PR_DECLARATION;
    458           }
    459           break;
    460         case 4:
    461           if ('-' == ch) { state = 5; }
    462           break;
    463         case 5:
    464           if ('-' == ch) { state = 6; }
    465           break;
    466         case 6:
    467           if ('>' == ch) {
    468             state = 0;
    469             tokenStyle = PR_COMMENT;
    470           } else if ('-' == ch) {
    471             state = 6;
    472           } else {
    473             state = 4;
    474           }
    475           break;
    476         case 7:
    477           if (PR_isWordChar(ch)) {
    478             state = 8;
    479           } else if ('<' == ch) {
    480             state = 1;
    481           } else {
    482             state = 0;
    483           }
    484           break;
    485         case 8:
    486           if ('>' == ch) {
    487             state = 0;
    488             tokenStyle = PR_TAG;
    489           }
    490           break;
    491         case 9:
    492           if ('?' == ch) { state = 10; }
    493           break;
    494         case 10:
    495           if ('>' == ch) {
    496             state = 0;
    497             tokenStyle = PR_SOURCE;
    498           } else if ('?' != ch) {
    499             state = 9;
    500           }
    501           break;
    502         case 11:
    503           if ('%' == ch) { state = 12; }
    504           break;
    505         case 12:
    506           if ('>' == ch) {
    507             state = 0;
    508             tokenStyle = PR_SOURCE;
    509           } else if ('%' != ch) {
    510             state = 11;
    511           }
    512           break;
    513       }
    514 
    515       if (tokenCharsI < tokenChars.length) {
    516         tokenChars[tokenCharsI++] = ch.toLowerCase();
    517       }
    518       if (1 == state) { tokenStart = k + i; }
    519       i = next;
    520       if (tokenStyle != null) {
    521         if (null != tokenStyle) {
    522           if (endScriptTag) {
    523             if (PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag)) {
    524               endScriptTag = null;
    525             }
    526           } else {
    527             if (PR_prefixMatch(tokenChars, tokenCharsI, 'script')) {
    528               endScriptTag = '/script';
    529             } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'style')) {
    530               endScriptTag = '/style';
    531             } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'xmp')) {
    532               endScriptTag = '/xmp';
    533             }
    534           }
    535           // disallow the tag if endScriptTag is set and this was not an open
    536           // tag.
    537           if (endScriptTag && tokenCharsI && '/' == tokenChars[0]) {
    538             tokenStyle = null;
    539           }
    540         }
    541         if (null != tokenStyle) {
    542           tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
    543           tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
    544         }
    545       }
    546     }
    547     k += chunk.token.length;
    548   }
    549   tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));
    550 
    551   return tokenEnds;
    552 }
    553 
    554 /** splits the given string into comment, string, and "other" tokens.
    555   * @return an array of PR_Tokens with style in
    556   *   (PR_STRING, PR_COMMENT, PR_PLAIN, null)
    557   *   The result array may contain spurious zero length tokens.  Ignore them.
    558   *
    559   * @private
    560   */
    561 function PR_splitStringAndCommentTokens(chunks) {
    562   // a state machine to split out comments, strings, and other stuff
    563   var tokenEnds = new Array();  // positions of ends of tokens in absolute space
    564   var state = 0;  // FSM state variable
    565   var delim = -1;  // string delimiter
    566   var k = 0;  // absolute position of beginning of current chunk
    567   for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    568     var chunk = chunks[ci];
    569     var s = chunk.token;
    570     if (PR_PLAIN == chunk.style) {
    571       for (var i = 0, n = s.length; i < n; ++i) {
    572         var ch = s.charAt(i);
    573         if (0 == state) {
    574           if (ch == '"' || ch == '\'' || ch == '`') {
    575             tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
    576             state = 1;
    577             delim = ch;
    578           } else if (ch == '/') {
    579             state = 3;
    580           } else if (ch == '#') {
    581             tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
    582             state = 4;
    583           }
    584         } else if (1 == state) {
    585           if (ch == delim) {
    586             state = 0;
    587             tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_STRING));
    588           } else if (ch == '\\') {
    589             state = 2;
    590           }
    591         } else if (2 == state) {
    592           state = 1;
    593         } else if (3 == state) {
    594           if (ch == '/') {
    595             state = 4;
    596             tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
    597           } else if (ch == '*') {
    598             state = 5;
    599             tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
    600           } else {
    601             state = 0;
    602             // next loop will reenter state 0 without same value of i, so
    603             // ch will be reconsidered as start of new token.
    604             --i;
    605           }
    606         } else if (4 == state) {
    607           if (ch == '\r' || ch == '\n') {
    608             state = 0;
    609             tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
    610           }
    611         } else if (5 == state) {
    612           if (ch == '*') {
    613             state = 6;
    614           }
    615         } else if (6 == state) {
    616           if (ch == '/') {
    617             state = 0;
    618             tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_COMMENT));
    619           } else if (ch != '*') {
    620             state = 5;
    621           }
    622         }
    623       }
    624     }
    625     k += s.length;
    626   }
    627   tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));  // a token ends at the end
    628 
    629   return PR_splitChunks(chunks, tokenEnds);
    630 }
    631 
    632 /** used by lexSource to split a non string, non comment token.
    633   * @private
    634   */
    635 function PR_splitNonStringNonCommentToken(s, outlist) {
    636   var pos = 0;
    637   var state = 0;
    638   for (var i = 0; i <= s.length; i++) {
    639     var ch = s.charAt(i);
    640     // the next state.
    641     // if set to -1 then it will cause a reentry to state 0 without consuming
    642     // another character.
    643     var nstate = state;
    644 
    645     if (i == s.length) {
    646       // nstate will not be equal to state, so it will append the token
    647       nstate = -2;
    648     } else {
    649       switch (state) {
    650       case 0:  // whitespace state
    651         if (PR_isIdentifierStart(ch)) {
    652           nstate = 1;
    653         } else if (PR_isDigitChar(ch)) {
    654           nstate = 2;
    655         } else if (!PR_isSpaceChar(ch)) {
    656           nstate = 3;
    657         }
    658         if (nstate && pos < i) {
    659           var t = s.substring(pos, i);
    660           outlist.push(new PR_Token(t, PR_PLAIN));
    661           pos = i;
    662         }
    663         break;
    664       case 1:  // identifier state
    665         if (!PR_isIdentifierPart(ch)) {
    666           nstate = -1;
    667         }
    668         break;
    669       case 2:  // number literal state
    670         // handle numeric literals like
    671         // 0x7f 300UL 100_000
    672 
    673         // this does not treat floating point values as a single literal
    674         //   0.1 and 3e-6
    675         // are each split into multiple tokens
    676         if (!(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_')) {
    677           nstate = -1;
    678         }
    679         break;
    680       case 3:  // punctuation state
    681         if (PR_isIdentifierStart(ch) || PR_isDigitChar(ch) ||
    682             PR_isSpaceChar(ch)) {
    683           nstate = -1;
    684         }
    685         break;
    686       }
    687     }
    688 
    689     if (nstate != state) {
    690       if (nstate < 0) {
    691         if (i > pos) {
    692           var t = s.substring(pos, i);
    693           var ch0 = t.charAt(0);
    694           var style;
    695           if (PR_isIdentifierStart(ch0)) {
    696             if (PR_keywords[t]) {
    697               style = PR_KEYWORD;
    698             }
    699             else if (ch0 == '@') {
    700               style = PR_LITERAL;
    701             } else {
    702               // Treat any word that starts with an uppercase character and
    703               // contains at least one lowercase character as a type, or
    704               // ends with _t.
    705               // This works perfectly for Java, pretty well for C++, and
    706               // passably for Python.  The _t catches C structs.
    707               var isType = false;
    708               if (ch0 >= 'A' && ch0 <= 'Z') {
    709                 for (var j = 1; j < t.length; j++) {
    710                   var ch1 = t.charAt(j);
    711                   if (ch1 >= 'a' && ch1 <= 'z') {
    712                     isType = true;
    713                     break;
    714                   }
    715                 }
    716                 if (!isType && t.length >= 2 &&
    717                     t.substring(t.length - 2) == '_t') {
    718                   isType = true;
    719                 }
    720               }
    721               style = isType ? PR_TYPE : PR_PLAIN;
    722             }
    723           } else if (PR_isDigitChar(ch0)) {
    724             style = PR_LITERAL;
    725           } else if (!PR_isSpaceChar(ch0)) {
    726             style = PR_PUNCTUATION;
    727           } else {
    728             style = PR_PLAIN;
    729           }
    730           pos = i;
    731           outlist.push(new PR_Token(t, style));
    732         }
    733 
    734         state = 0;
    735         if (nstate == -1) {
    736           // don't increment.  This allows us to use state 0 to redispatch based
    737           // on the current character.
    738           i--;
    739           continue;
    740         }
    741       }
    742       state = nstate;
    743     }
    744   }
    745 }
    746 
    747 /** split a group of chunks of markup.
    748   * @private
    749   */
    750 function PR_tokenizeMarkup(chunks) {
    751   if (!(chunks && chunks.length)) { return chunks; }
    752 
    753   var tokenEnds = PR_splitMarkup(chunks);
    754   return PR_splitChunks(chunks, tokenEnds);
    755 }
    756 
    757 /** split tags attributes and their values out from the tag name, and
    758   * recursively lex source chunks.
    759   * @private
    760   */
    761 function PR_splitTagAttributes(tokens) {
    762   var tokensOut = new Array();
    763   var state = 0;
    764   var stateStyle = PR_TAG;
    765   var delim = null;  // attribute delimiter for quoted value state.
    766   var decodeHelper = new PR_DecodeHelper();
    767   for (var ci = 0; ci < tokens.length; ++ci) {
    768     var tok = tokens[ci];
    769     if (PR_TAG == tok.style) {
    770       var s = tok.token;
    771       var start = 0;
    772       for (var i = 0; i < s.length; /* i = next at bottom */) {
    773         decodeHelper.decode(s, i);
    774         var ch = decodeHelper.ch;
    775         var next = decodeHelper.next;
    776 
    777         var emitEnd = null;  // null or position of end of chunk to emit.
    778         var nextStyle = null;  // null or next value of stateStyle
    779         if (ch == '>') {
    780           if (PR_TAG != stateStyle) {
    781             emitEnd = i;
    782             nextStyle = PR_TAG;
    783           }
    784         } else {
    785           switch (state) {
    786             case 0:
    787               if ('<' == ch) { state = 1; }
    788               break;
    789             case 1:
    790               if (PR_isSpaceChar(ch)) { state = 2; }
    791               break;
    792             case 2:
    793               if (!PR_isSpaceChar(ch)) {
    794                 nextStyle = PR_ATTRIB_NAME;
    795                 emitEnd = i;
    796                 state = 3;
    797               }
    798               break;
    799             case 3:
    800               if ('=' == ch) {
    801                 emitEnd = i;
    802                 nextStyle = PR_TAG;
    803                 state = 5;
    804               } else if (PR_isSpaceChar(ch)) {
    805                 emitEnd = i;
    806                 nextStyle = PR_TAG;
    807                 state = 4;
    808               }
    809               break;
    810             case 4:
    811               if ('=' == ch) {
    812                 state = 5;
    813               } else if (!PR_isSpaceChar(ch)) {
    814                 emitEnd = i;
    815                 nextStyle = PR_ATTRIB_NAME;
    816                 state = 3;
    817               }
    818               break;
    819             case 5:
    820               if ('"' == ch || '\'' == ch) {
    821                 emitEnd = i;
    822                 nextStyle = PR_ATTRIB_VALUE;
    823                 state = 6;
    824                 delim = ch;
    825               } else if (!PR_isSpaceChar(ch)) {
    826                 emitEnd = i;
    827                 nextStyle = PR_ATTRIB_VALUE;
    828                 state = 7;
    829               }
    830               break;
    831             case 6:
    832               if (ch == delim) {
    833                 emitEnd = next;
    834                 nextStyle = PR_TAG;
    835                 state = 2;
    836               }
    837               break;
    838             case 7:
    839               if (PR_isSpaceChar(ch)) {
    840                 emitEnd = i;
    841                 nextStyle = PR_TAG;
    842                 state = 2;
    843               }
    844               break;
    845           }
    846         }
    847         if (emitEnd) {
    848           if (emitEnd > start) {
    849             tokensOut.push(
    850                 new PR_Token(s.substring(start, emitEnd), stateStyle));
    851             start = emitEnd;
    852           }
    853           stateStyle = nextStyle;
    854         }
    855         i = next;
    856       }
    857       if (s.length > start) {
    858         tokensOut.push(new PR_Token(s.substring(start, s.length), stateStyle));
    859       }
    860     } else {
    861       if (tok.style) {
    862         state = 0;
    863         stateStyle = PR_TAG;
    864       }
    865       tokensOut.push(tok);
    866     }
    867   }
    868   return tokensOut;
    869 }
    870 
    871 /** identify regions of markup that are really source code, and recursivley
    872   * lex them.
    873   * @private
    874   */
    875 function PR_splitSourceNodes(tokens) {
    876   var tokensOut = new Array();
    877   // when we see a <script> tag, store '/' here so that we know to end the
    878   // source processing
    879   var endScriptTag = null;
    880   var decodeHelper = new PR_DecodeHelper();
    881 
    882   var sourceChunks = null;
    883 
    884   for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    885     var tok = tokens[ci];
    886     if (null == tok.style) {
    887       tokens.push(tok);
    888       continue;
    889     }
    890 
    891     var s = tok.token;
    892 
    893     if (null == endScriptTag) {
    894       if (PR_SOURCE == tok.style) {
    895         // split off any starting and trailing <?, <%
    896         if ('<' == decodeHelper.decode(s, 0)) {
    897           decodeHelper.decode(s, decodeHelper.next);
    898           if ('%' == decodeHelper.ch || '?' == decodeHelper.ch) {
    899             endScriptTag = decodeHelper.ch;
    900             tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
    901                                         PR_TAG));
    902             s = s.substring(decodeHelper.next, s.length);
    903           }
    904         }
    905       } else if (PR_TAG == tok.style) {
    906         if ('<' == decodeHelper.decode(s, 0) &&
    907             '/' != s.charAt(decodeHelper.next)) {
    908           var tagContent = s.substring(decodeHelper.next).toLowerCase();
    909           // FIXME(msamuel): this does not mirror exactly the code in
    910           // in PR_splitMarkup that defers splitting tags inside script and
    911           // style blocks.
    912           if (PR_startsWith(tagContent, 'script') ||
    913               PR_startsWith(tagContent, 'style') ||
    914               PR_startsWith(tagContent, 'xmp')) {
    915             endScriptTag = '/';
    916           }
    917         }
    918       }
    919     }
    920 
    921     if (null != endScriptTag) {
    922       var endTok = null;
    923       if (PR_SOURCE == tok.style) {
    924         if (endScriptTag == '%' || endScriptTag == '?') {
    925           var pos = s.lastIndexOf(endScriptTag);
    926           if (pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
    927               s.length == decodeHelper.next) {
    928             endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
    929             s = s.substring(0, pos);
    930           }
    931         }
    932         if (null == sourceChunks) { sourceChunks = new Array(); }
    933         sourceChunks.push(new PR_Token(s, PR_PLAIN));
    934       } else if (PR_PLAIN == tok.style) {
    935         if (null == sourceChunks) { sourceChunks = new Array(); }
    936         sourceChunks.push(tok);
    937       } else if (PR_TAG == tok.style) {
    938         // if it starts with </ then it must be the end tag.
    939         if ('<' == decodeHelper.decode(tok.token, 0) &&
    940             tok.token.length > decodeHelper.next &&
    941             '/' == decodeHelper.decode(tok.token, decodeHelper.next)) {
    942           endTok = tok;
    943         } else {
    944           tokensOut.push(tok);
    945         }
    946       } else {
    947         if (sourceChunks) {
    948           sourceChunks.push(tok);
    949         } else {
    950           // push remaining tag and attribute tokens from the opening tag
    951           tokensOut.push(tok);
    952         }
    953       }
    954       if (endTok) {
    955         if (sourceChunks) {
    956           var sourceTokens = PR_lexSource(sourceChunks);
    957           tokensOut.push(new PR_Token('<span class=embsrc>', null));
    958           for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
    959             tokensOut.push(sourceTokens[si]);
    960           }
    961           tokensOut.push(new PR_Token('</span>', null));
    962           sourceChunks = null;
    963         }
    964         tokensOut.push(endTok);
    965         endScriptTag = null;
    966       }
    967     } else {
    968       tokensOut.push(tok);
    969     }
    970   }
    971   return tokensOut;
    972 }
    973 
    974 /** splits the quotes from an attribute value.
    975   * ['"foo"'] -> ['"', 'foo', '"']
    976   * @private
    977   */
    978 function PR_splitAttributeQuotes(tokens) {
    979   var firstPlain = null, lastPlain = null;
    980   for (var i = 0; i < tokens.length; ++i) {
    981     if (PR_PLAIN = tokens[i].style) {
    982       firstPlain = i;
    983       break;
    984     }
    985   }
    986   for (var i = tokens.length; --i >= 0;) {
    987     if (PR_PLAIN = tokens[i].style) {
    988       lastPlain = i;
    989       break;
    990     }
    991   }
    992   if (null == firstPlain) { return tokens; }
    993 
    994   var decodeHelper = new PR_DecodeHelper();
    995   var fs = tokens[firstPlain].token;
    996   var fc = decodeHelper.decode(fs, 0);
    997   if ('"' != fc && '\'' != fc) {
    998     return tokens;
    999   }
   1000   var fpos = decodeHelper.next;
   1001 
   1002   var ls = tokens[lastPlain].token;
   1003   var lpos = ls.lastIndexOf('&');
   1004   if (lpos < 0) { lpos = ls.length - 1; }
   1005   var lc = decodeHelper.decode(ls, lpos);
   1006   if (lc != fc || decodeHelper.next != ls.length) {
   1007     lc = null;
   1008     lpos = ls.length;
   1009   }
   1010 
   1011   var tokensOut = new Array();
   1012   for (var i = 0; i < firstPlain; ++i) {
   1013     tokensOut.push(tokens[i]);
   1014   }
   1015   tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
   1016   if (lastPlain == firstPlain) {
   1017     tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
   1018   } else {
   1019     tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
   1020     for (var i = firstPlain + 1; i < lastPlain; ++i) {
   1021       tokensOut.push(tokens[i]);
   1022     }
   1023     if (lc) {
   1024       tokens.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
   1025     } else {
   1026       tokens.push(tokens[lastPlain]);
   1027     }
   1028   }
   1029   if (lc) {
   1030     tokensOut.push(new PR_Token(ls.substring(lpos, ls.length), PR_PLAIN));
   1031   }
   1032   for (var i = lastPlain + 1; i < tokens.length; ++i) {
   1033     tokensOut.push(tokens[i]);
   1034   }
   1035   return tokensOut;
   1036 }
   1037 
   1038 /** identify attribute values that really contain source code and recursively
   1039   * lex them.
   1040   * @private
   1041   */
   1042 function PR_splitSourceAttributes(tokens) {
   1043   var tokensOut = new Array();
   1044 
   1045   var sourceChunks = null;
   1046   var inSource = false;
   1047   var name = '';
   1048 
   1049   for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
   1050     var tok = tokens[ci];
   1051     var outList = tokensOut;
   1052     if (PR_TAG == tok.style) {
   1053       if (inSource) {
   1054         inSource = false;
   1055         name = '';
   1056         if (sourceChunks) {
   1057           tokensOut.push(new PR_Token('<span class=embsrc>', null));
   1058           var sourceTokens =
   1059             PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
   1060           for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
   1061             tokensOut.push(sourceTokens[si]);
   1062           }
   1063           tokensOut.push(new PR_Token('</span>', null));
   1064           sourceChunks = null;
   1065         }
   1066       } else if (name && tok.token.indexOf('=') >= 0) {
   1067         var nameLower = name.toLowerCase();
   1068         if (PR_startsWith(nameLower, 'on') || 'style' == nameLower) {
   1069           inSource = true;
   1070         }
   1071       } else {
   1072         name = '';
   1073       }
   1074     } else if (PR_ATTRIB_NAME == tok.style) {
   1075       name += tok.token;
   1076     } else if (PR_ATTRIB_VALUE == tok.style) {
   1077       if (inSource) {
   1078         if (null == sourceChunks) { sourceChunks = new Array(); }
   1079         outList = sourceChunks;
   1080         tok = new PR_Token(tok.token, PR_PLAIN);
   1081       }
   1082     } else {
   1083       if (sourceChunks) {
   1084         outList = sourceChunks;
   1085       }
   1086     }
   1087     outList.push(tok);
   1088   }
   1089   return tokensOut;
   1090 }
   1091 
   1092 /** returns a list of PR_Token objects given chunks of source code.
   1093   *
   1094   * This code assumes that < tokens are html escaped, but " are not.
   1095   * It will do a resonable job with <, but will not recognize an &quot;
   1096   * as starting a string.
   1097   *
   1098   * This code treats ", ', and ` as string delimiters, and \ as a string escape.
   1099   * It does not recognize double delimiter escapes, or perl's qq() style
   1100   * strings.
   1101   *
   1102   * It recognizes C, C++, and shell style comments.
   1103   *
   1104   * @param chunks PR_Tokens with style in (null, PR_PLAIN)
   1105   */
   1106 function PR_lexSource(chunks) {
   1107   // positions of ends of tokens in order
   1108   var tokensIn = PR_splitStringAndCommentTokens(chunks);
   1109 
   1110   // split entities out of so that we know to treat them as single units.
   1111   tokensIn = PR_splitEntities(tokensIn);
   1112 
   1113   // split non comment|string tokens on whitespace and word boundaries
   1114   var tokensOut = new Array();
   1115   for (var i = 0; i < tokensIn.length; ++i) {
   1116     var tok = tokensIn[i];
   1117     var t = tok.token;
   1118     var s = tok.style;
   1119 
   1120     if (PR_PLAIN == s) {
   1121       PR_splitNonStringNonCommentToken(t, tokensOut);
   1122       continue;
   1123     }
   1124     tokensOut.push(tok);
   1125   }
   1126 
   1127   return tokensOut;
   1128 }
   1129 
   1130 /** returns a list of PR_Token objects given a string of markup.
   1131   *
   1132   * This code assumes that < tokens are html escaped, but " are not.
   1133   * It will do a resonable job with <, but will not recognize an &quot;
   1134   * as starting a string.
   1135   *
   1136   * This code recognizes a number of constructs.
   1137   * <!-- ... --> comment
   1138   * <!\w ... >   declaration
   1139   * <\w ... >    tag
   1140   * </\w ... >   tag
   1141   * <?...?>      embedded source
   1142   * &[#\w]...;   entity
   1143   *
   1144   * It does not recognizes %foo; entities.
   1145   *
   1146   * It will recurse into any <style>, <script>, and on* attributes using
   1147   * PR_lexSource.
   1148   */
   1149 function PR_lexMarkup(chunks) {
   1150   // This function works as follows:
   1151   // 1) Start by splitting the markup into text and tag chunks
   1152   //    Input:  String s
   1153   //    Output: List<PR_Token> where style in (PR_PLAIN, null)
   1154   // 2) Then split the text chunks further into comments, declarations,
   1155   //    tags, etc.
   1156   //    After each split, consider whether the token is the start of an
   1157   //    embedded source section, i.e. is an open <script> tag.  If it is,
   1158   //    find the corresponding close token, and don't bother to lex in between.
   1159   //    Input:  List<String>
   1160   //    Output: List<PR_Token> with style in (PR_TAG, PR_PLAIN, PR_SOURCE, null)
   1161   // 3) Finally go over each tag token and split out attribute names and values.
   1162   //    Input:  List<PR_Token>
   1163   //    Output: List<PR_Token> where style in
   1164   //            (PR_TAG, PR_PLAIN, PR_SOURCE, NAME, VALUE, null)
   1165   var tokensOut = PR_tokenizeMarkup(chunks);
   1166   tokensOut = PR_splitTagAttributes(tokensOut);
   1167   tokensOut = PR_splitSourceNodes(tokensOut);
   1168   tokensOut = PR_splitSourceAttributes(tokensOut);
   1169   return tokensOut;
   1170 }
   1171 
   1172 /** classify the string as either source or markup and lex appropriately. */
   1173 function PR_lexOne(s) {
   1174   var chunks = PR_chunkify(s);
   1175   // treat it as markup if the first non whitespace character is a < and the
   1176   // last non-whitespace character is a >
   1177   var isMarkup = false;
   1178   for (var i = 0; i < chunks.length; ++i) {
   1179     if (PR_PLAIN == chunks[i].style) {
   1180       if (PR_startsWith(PR_trim(chunks[i].token), '&lt;')) {
   1181         for (var j = chunks.length; --j >= 0;) {
   1182           if (PR_PLAIN == chunks[j].style) {
   1183             isMarkup = PR_endsWith(PR_trim(chunks[j].token), '&gt;');
   1184             break;
   1185           }
   1186         }
   1187       }
   1188       break;
   1189     }
   1190   }
   1191   return isMarkup ? PR_lexMarkup(chunks) : PR_lexSource(chunks);
   1192 }
   1193 
   1194 /** pretty print a chunk of code.
   1195   *
   1196   * @param s code as html
   1197   * @return code as html, but prettier
   1198   */
   1199 function prettyPrintOne(s) {
   1200   try {
   1201     var tokens = PR_lexOne(s);
   1202     var out = '';
   1203     var lastStyle = null;
   1204     for (var i = 0; i < tokens.length; i++) {
   1205       var t = tokens[i];
   1206       if (t.style != lastStyle) {
   1207         if (lastStyle != null) {
   1208           out += '</span>';
   1209         }
   1210         if (t.style != null) {
   1211           out += '<span class=' + t.style + '>';
   1212         }
   1213         lastStyle = t.style;
   1214       }
   1215       var html = t.token;
   1216       if (null != t.style) {
   1217         // This interacts badly with the wiki which introduces paragraph tags
   1218         // int pre blocks for some strange reason.
   1219         // It's necessary for IE though which seems to lose the preformattedness
   1220         // of <pre> tags when their innerHTML is assigned.
   1221         html = html.replace(/(?:\r\n?)|\n/g, '<br>').replace(/  /g, '&nbsp; ');
   1222       }
   1223       out += html;
   1224     }
   1225     if (lastStyle != null) {
   1226       out += '</span>';
   1227     }
   1228     return out;
   1229   } catch (e) {
   1230     //alert(e.stack);  // DISABLE in production
   1231     return s;
   1232   }
   1233 }
   1234 
   1235 /** find all the < pre > and < code > tags in the DOM with class=prettyprint and
   1236   * prettify them.
   1237   */
   1238 function prettyPrint() {
   1239   // fetch a list of nodes to rewrite
   1240   var codeSegments = [
   1241       document.getElementsByTagName('pre'),
   1242       document.getElementsByTagName('code'),
   1243       document.getElementsByTagName('xmp') ];
   1244   var elements = [];
   1245   for (var i = 0; i < codeSegments.length; ++i) {
   1246     for (var j = 0; j < codeSegments[i].length; ++j) {
   1247       elements.push(codeSegments[i][j]);
   1248     }
   1249   }
   1250   codeSegments = null;
   1251 
   1252   // the loop is broken into a series of continuations to make sure that we
   1253   // don't make the browser unresponsive when rewriting a large page.
   1254   var k = 0;
   1255 
   1256   function doWork() {
   1257     var endTime = new Date().getTime() + 250;
   1258     for (; k < elements.length && new Date().getTime() < endTime; k++) {
   1259       var cs = elements[k];
   1260       if (cs.className && cs.className.indexOf('prettyprint') >= 0) {
   1261 
   1262         // make sure this is not nested in an already prettified element
   1263         var nested = false;
   1264         for (var p = cs.parentNode; p != null; p = p.parentNode) {
   1265           if ((p.tagName == 'pre' || p.tagName == 'code' ||
   1266                p.tagName == 'xmp') &&
   1267               p.className && p.className.indexOf('prettyprint') >= 0) {
   1268             nested = true;
   1269             break;
   1270           }
   1271         }
   1272         if (!nested) {
   1273           // XMP tags contain unescaped entities so require special handling.
   1274           var isRawContent = 'XMP' == cs.tagName;
   1275 
   1276           // fetch the content as a snippet of properly escaped HTML
   1277           var content = cs.innerHTML;
   1278           if (isRawContent) {
   1279             content = PR_textToHtml(content);
   1280           }
   1281 
   1282           // do the pretty printing
   1283           var newContent = prettyPrintOne(content);
   1284 
   1285           // push the prettified html back into the tag.
   1286           if (!isRawContent) {
   1287             // just replace the old html with the new
   1288             cs.innerHTML = newContent;
   1289           } else {
   1290             // we need to change the tag to a <pre> since <xmp>s do not allow
   1291             // embedded tags such as the span tags used to attach styles to
   1292             // sections of source code.
   1293             var pre = document.createElement('PRE');
   1294             for (var i = 0; i < cs.attributes.length; ++i) {
   1295               var a = cs.attributes[i];
   1296               if (a.specified) {
   1297                 pre.setAttribute(a.name, a.value);
   1298               }
   1299             }
   1300             pre.innerHTML = newContent;
   1301             // remove the old
   1302             cs.parentNode.replaceChild(pre, cs);
   1303           }
   1304         }
   1305       }
   1306     }
   1307     if (k < elements.length) {
   1308       // finish up in a continuation
   1309       setTimeout(doWork, 250);
   1310     }
   1311   }
   1312 
   1313   doWork();
   1314 }
   1315