Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import com.google.common.collect.ImmutableMap;
     32 
     33 /**
     34  * Utilities for decoding HTML entities, e.g., {@code &}.
     35  */
     36 class HtmlEntities {
     37 
     38   /**
     39    * Decodes any HTML entity at the given location.  This handles both named and
     40    * numeric entities.
     41    *
     42    * @param html HTML text.
     43    * @param offset the position of the sequence to decode.
     44    * @param limit the last position in chars that could be part of the sequence
     45    *    to decode.
     46    * @return The offset after the end of the decoded sequence and the decoded
     47    *    code-point or code-unit packed into a long.
     48    *    The first 32 bits are the offset, and the second 32 bits are a
     49    *    code-point or a code-unit.
     50    */
     51   public static long decodeEntityAt(String html, int offset, int limit) {
     52     char ch = html.charAt(offset);
     53     if ('&' != ch) {
     54       return ((offset + 1L) << 32) | ch;
     55     }
     56 
     57     int entityLimit = Math.min(limit, offset + 10);
     58     int end = -1;
     59     int tail = -1;
     60     if (entityLimit == limit) {
     61       // Assume a broken entity that ends at the end until shown otherwise.
     62       end = tail = entityLimit;
     63     }
     64     entityloop:
     65     for (int i = offset + 1; i < entityLimit; ++i) {
     66       switch (html.charAt(i)) {
     67         case ';':  // An unbroken entity.
     68           end = i;
     69           tail = end + 1;
     70           break entityloop;
     71         case '#':
     72         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     73         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     74         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     75         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     76         case 'Y': case 'Z':
     77         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     78         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     79         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     80         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     81         case 'y': case 'z':
     82         case '0': case '1': case '2': case '3': case '4': case '5':
     83         case '6': case '7': case '8': case '9':
     84           break;
     85         case '=':
     86           // An equal sign after an entity missing a closing semicolon should
     87           // never have the semicolon inserted since that causes trouble with
     88           // parameters in partially encoded URLs.
     89           return ((offset + 1L) << 32) | '&';
     90         default:  // A possible broken entity.
     91           end = i;
     92           tail = i;
     93           break entityloop;
     94       }
     95     }
     96     if (end < 0 || offset + 2 >= end) {
     97       return ((offset + 1L) << 32) | '&';
     98     }
     99     // Now we know where the entity ends, and that there is at least one
    100     // character in the entity name
    101     char ch1 = html.charAt(offset + 1);
    102     char ch2 = html.charAt(offset + 2);
    103     int codepoint = -1;
    104     if ('#' == ch1) {
    105       // numeric entity
    106       if ('x' == ch2 || 'X' == ch2) {
    107         if (end == offset + 3) {  // No digits
    108           return ((offset + 1L) << 32) | '&';
    109         }
    110         codepoint = 0;
    111         // hex literal
    112         digloop:
    113         for (int i = offset + 3; i < end; ++i) {
    114           char digit = html.charAt(i);
    115           switch (digit & 0xfff8) {
    116             case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
    117               int decDig = digit & 0xf;
    118               if (decDig < 10) {
    119                 codepoint = (codepoint << 4) | decDig;
    120               } else {
    121                 codepoint = -1;
    122                 break digloop;
    123               }
    124               break;
    125             // ASCII 65-70 and 97-102 are 'A'-'Z' && 'a'-'z'
    126             case 0x40: case 0x60:
    127               int hexDig = (digit & 0x7);
    128               if (hexDig != 0 && hexDig < 7) {
    129                 codepoint = (codepoint << 4) | (hexDig + 9);
    130               } else {
    131                 codepoint = -1;
    132                 break digloop;
    133               }
    134               break;
    135             default:
    136               codepoint = -1;
    137               break digloop;
    138           }
    139         }
    140         if (codepoint > Character.MAX_CODE_POINT) {
    141           codepoint = 0xfffd;  // Unknown.
    142         }
    143       } else {
    144         codepoint = 0;
    145         // decimal literal
    146         digloop:
    147         for (int i = offset + 2; i < end; ++i) {
    148           char digit = html.charAt(i);
    149           switch (digit & 0xfff8) {
    150             case 0x30: case 0x38: // ASCII 48-57 are '0'-'9'
    151               int decDig = digit - '0';
    152               if (decDig < 10) {
    153                 codepoint = (codepoint * 10) + decDig;
    154               } else {
    155                 codepoint = -1;
    156                 break digloop;
    157               }
    158               break;
    159             default:
    160               codepoint = -1;
    161               break digloop;
    162           }
    163         }
    164         if (codepoint > Character.MAX_CODE_POINT) {
    165           codepoint = 0xfffd;  // Unknown.
    166         }
    167       }
    168     } else {
    169       Trie t = ENTITY_TRIE;
    170       for (int i = offset + 1; i < end; ++i) {
    171         char nameChar = html.charAt(i);
    172         t = t.lookup(nameChar);
    173         if (t == null) { break; }
    174       }
    175       if (t == null) {
    176         t = ENTITY_TRIE;
    177         for (int i = offset + 1; i < end; ++i) {
    178           char nameChar = html.charAt(i);
    179           if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
    180           t = t.lookup(nameChar);
    181           if (t == null) { break; }
    182         }
    183       }
    184       if (t != null && t.isTerminal()) {
    185         codepoint = t.getValue();
    186       }
    187     }
    188     if (codepoint < 0) {
    189       return ((offset + 1L) << 32) | '&';
    190     } else {
    191       return (((long) tail) << 32) | codepoint;
    192     }
    193   }
    194 
    195 //  /** A possible entity name like "amp" or "gt". */
    196 //  public static boolean isEntityName(String name) {
    197 //    Trie t = ENTITY_TRIE;
    198 //    int n = name.length();
    199 //
    200 //    // Treat AMP the same amp, but not Amp.
    201 //    boolean isUcase = true;
    202 //    for (int i = 0; i < n; ++i) {
    203 //      char ch = name.charAt(i);
    204 //      if (!('A' <= ch && ch <= 'Z')) {
    205 //        isUcase = false;
    206 //        break;
    207 //      }
    208 //    }
    209 //
    210 //    if (isUcase) { name = Strings.toLowerCase(name); }
    211 //
    212 //    for (int i = 0; i < n; ++i) {
    213 //      t = t.lookup(name.charAt(i));
    214 //      if (t == null) { return false; }
    215 //    }
    216 //    return t.isTerminal();
    217 //  }
    218 
    219   /** A trie that maps entity names to codepoints. */
    220   public static final Trie ENTITY_TRIE = new Trie(
    221       ImmutableMap.<String, Integer>builder()
    222     // C0 Controls and Basic Latin
    223       .put("quot", Integer.valueOf('"'))
    224       .put("amp", Integer.valueOf('&'))
    225       .put("lt", Integer.valueOf('<'))
    226       .put("gt", Integer.valueOf('>'))
    227 
    228     // XML 1.0
    229       .put("apos", Integer.valueOf('\''))
    230 
    231     // HTML4 entities
    232       .put("nbsp", Integer.valueOf('\u00a0'))
    233       .put("iexcl", Integer.valueOf('\u00a1'))
    234       .put("cent", Integer.valueOf('\u00a2'))
    235       .put("pound", Integer.valueOf('\u00a3'))
    236       .put("curren", Integer.valueOf('\u00a4'))
    237       .put("yen", Integer.valueOf('\u00a5'))
    238       .put("brvbar", Integer.valueOf('\u00a6'))
    239       .put("sect", Integer.valueOf('\u00a7'))
    240       .put("uml", Integer.valueOf('\u00a8'))
    241       .put("copy", Integer.valueOf('\u00a9'))
    242       .put("ordf", Integer.valueOf('\u00aa'))
    243       .put("laquo", Integer.valueOf('\u00ab'))
    244       .put("not", Integer.valueOf('\u00ac'))
    245       .put("shy", Integer.valueOf('\u00ad'))
    246       .put("reg", Integer.valueOf('\u00ae'))
    247       .put("macr", Integer.valueOf('\u00af'))
    248       .put("deg", Integer.valueOf('\u00b0'))
    249       .put("plusmn", Integer.valueOf('\u00b1'))
    250       .put("sup2", Integer.valueOf('\u00b2'))
    251       .put("sup3", Integer.valueOf('\u00b3'))
    252       .put("acute", Integer.valueOf('\u00b4'))
    253       .put("micro", Integer.valueOf('\u00b5'))
    254       .put("para", Integer.valueOf('\u00b6'))
    255       .put("middot", Integer.valueOf('\u00b7'))
    256       .put("cedil", Integer.valueOf('\u00b8'))
    257       .put("sup1", Integer.valueOf('\u00b9'))
    258       .put("ordm", Integer.valueOf('\u00ba'))
    259       .put("raquo", Integer.valueOf('\u00bb'))
    260       .put("frac14", Integer.valueOf('\u00bc'))
    261       .put("frac12", Integer.valueOf('\u00bd'))
    262       .put("frac34", Integer.valueOf('\u00be'))
    263       .put("iquest", Integer.valueOf('\u00bf'))
    264       .put("Agrave", Integer.valueOf('\u00c0'))
    265       .put("Aacute", Integer.valueOf('\u00c1'))
    266       .put("Acirc", Integer.valueOf('\u00c2'))
    267       .put("Atilde", Integer.valueOf('\u00c3'))
    268       .put("Auml", Integer.valueOf('\u00c4'))
    269       .put("Aring", Integer.valueOf('\u00c5'))
    270       .put("AElig", Integer.valueOf('\u00c6'))
    271       .put("Ccedil", Integer.valueOf('\u00c7'))
    272       .put("Egrave", Integer.valueOf('\u00c8'))
    273       .put("Eacute", Integer.valueOf('\u00c9'))
    274       .put("Ecirc", Integer.valueOf('\u00ca'))
    275       .put("Euml", Integer.valueOf('\u00cb'))
    276       .put("Igrave", Integer.valueOf('\u00cc'))
    277       .put("Iacute", Integer.valueOf('\u00cd'))
    278       .put("Icirc", Integer.valueOf('\u00ce'))
    279       .put("Iuml", Integer.valueOf('\u00cf'))
    280       .put("ETH", Integer.valueOf('\u00d0'))
    281       .put("Ntilde", Integer.valueOf('\u00d1'))
    282       .put("Ograve", Integer.valueOf('\u00d2'))
    283       .put("Oacute", Integer.valueOf('\u00d3'))
    284       .put("Ocirc", Integer.valueOf('\u00d4'))
    285       .put("Otilde", Integer.valueOf('\u00d5'))
    286       .put("Ouml", Integer.valueOf('\u00d6'))
    287       .put("times", Integer.valueOf('\u00d7'))
    288       .put("Oslash", Integer.valueOf('\u00d8'))
    289       .put("Ugrave", Integer.valueOf('\u00d9'))
    290       .put("Uacute", Integer.valueOf('\u00da'))
    291       .put("Ucirc", Integer.valueOf('\u00db'))
    292       .put("Uuml", Integer.valueOf('\u00dc'))
    293       .put("Yacute", Integer.valueOf('\u00dd'))
    294       .put("THORN", Integer.valueOf('\u00de'))
    295       .put("szlig", Integer.valueOf('\u00df'))
    296       .put("agrave", Integer.valueOf('\u00e0'))
    297       .put("aacute", Integer.valueOf('\u00e1'))
    298       .put("acirc", Integer.valueOf('\u00e2'))
    299       .put("atilde", Integer.valueOf('\u00e3'))
    300       .put("auml", Integer.valueOf('\u00e4'))
    301       .put("aring", Integer.valueOf('\u00e5'))
    302       .put("aelig", Integer.valueOf('\u00e6'))
    303       .put("ccedil", Integer.valueOf('\u00e7'))
    304       .put("egrave", Integer.valueOf('\u00e8'))
    305       .put("eacute", Integer.valueOf('\u00e9'))
    306       .put("ecirc", Integer.valueOf('\u00ea'))
    307       .put("euml", Integer.valueOf('\u00eb'))
    308       .put("igrave", Integer.valueOf('\u00ec'))
    309       .put("iacute", Integer.valueOf('\u00ed'))
    310       .put("icirc", Integer.valueOf('\u00ee'))
    311       .put("iuml", Integer.valueOf('\u00ef'))
    312       .put("eth", Integer.valueOf('\u00f0'))
    313       .put("ntilde", Integer.valueOf('\u00f1'))
    314       .put("ograve", Integer.valueOf('\u00f2'))
    315       .put("oacute", Integer.valueOf('\u00f3'))
    316       .put("ocirc", Integer.valueOf('\u00f4'))
    317       .put("otilde", Integer.valueOf('\u00f5'))
    318       .put("ouml", Integer.valueOf('\u00f6'))
    319       .put("divide", Integer.valueOf('\u00f7'))
    320       .put("oslash", Integer.valueOf('\u00f8'))
    321       .put("ugrave", Integer.valueOf('\u00f9'))
    322       .put("uacute", Integer.valueOf('\u00fa'))
    323       .put("ucirc", Integer.valueOf('\u00fb'))
    324       .put("uuml", Integer.valueOf('\u00fc'))
    325       .put("yacute", Integer.valueOf('\u00fd'))
    326       .put("thorn", Integer.valueOf('\u00fe'))
    327       .put("yuml", Integer.valueOf('\u00ff'))
    328 
    329     // Latin Extended-B
    330       .put("fnof", Integer.valueOf('\u0192'))
    331 
    332     // Greek
    333       .put("Alpha", Integer.valueOf('\u0391'))
    334       .put("Beta", Integer.valueOf('\u0392'))
    335       .put("Gamma", Integer.valueOf('\u0393'))
    336       .put("Delta", Integer.valueOf('\u0394'))
    337       .put("Epsilon", Integer.valueOf('\u0395'))
    338       .put("Zeta", Integer.valueOf('\u0396'))
    339       .put("Eta", Integer.valueOf('\u0397'))
    340       .put("Theta", Integer.valueOf('\u0398'))
    341       .put("Iota", Integer.valueOf('\u0399'))
    342       .put("Kappa", Integer.valueOf('\u039a'))
    343       .put("Lambda", Integer.valueOf('\u039b'))
    344       .put("Mu", Integer.valueOf('\u039c'))
    345       .put("Nu", Integer.valueOf('\u039d'))
    346       .put("Xi", Integer.valueOf('\u039e'))
    347       .put("Omicron", Integer.valueOf('\u039f'))
    348       .put("Pi", Integer.valueOf('\u03a0'))
    349       .put("Rho", Integer.valueOf('\u03a1'))
    350       .put("Sigma", Integer.valueOf('\u03a3'))
    351       .put("Tau", Integer.valueOf('\u03a4'))
    352       .put("Upsilon", Integer.valueOf('\u03a5'))
    353       .put("Phi", Integer.valueOf('\u03a6'))
    354       .put("Chi", Integer.valueOf('\u03a7'))
    355       .put("Psi", Integer.valueOf('\u03a8'))
    356       .put("Omega", Integer.valueOf('\u03a9'))
    357 
    358       .put("alpha", Integer.valueOf('\u03b1'))
    359       .put("beta", Integer.valueOf('\u03b2'))
    360       .put("gamma", Integer.valueOf('\u03b3'))
    361       .put("delta", Integer.valueOf('\u03b4'))
    362       .put("epsilon", Integer.valueOf('\u03b5'))
    363       .put("zeta", Integer.valueOf('\u03b6'))
    364       .put("eta", Integer.valueOf('\u03b7'))
    365       .put("theta", Integer.valueOf('\u03b8'))
    366       .put("iota", Integer.valueOf('\u03b9'))
    367       .put("kappa", Integer.valueOf('\u03ba'))
    368       .put("lambda", Integer.valueOf('\u03bb'))
    369       .put("mu", Integer.valueOf('\u03bc'))
    370       .put("nu", Integer.valueOf('\u03bd'))
    371       .put("xi", Integer.valueOf('\u03be'))
    372       .put("omicron", Integer.valueOf('\u03bf'))
    373       .put("pi", Integer.valueOf('\u03c0'))
    374       .put("rho", Integer.valueOf('\u03c1'))
    375       .put("sigmaf", Integer.valueOf('\u03c2'))
    376       .put("sigma", Integer.valueOf('\u03c3'))
    377       .put("tau", Integer.valueOf('\u03c4'))
    378       .put("upsilon", Integer.valueOf('\u03c5'))
    379       .put("phi", Integer.valueOf('\u03c6'))
    380       .put("chi", Integer.valueOf('\u03c7'))
    381       .put("psi", Integer.valueOf('\u03c8'))
    382       .put("omega", Integer.valueOf('\u03c9'))
    383       .put("thetasym", Integer.valueOf('\u03d1'))
    384       .put("upsih", Integer.valueOf('\u03d2'))
    385       .put("piv", Integer.valueOf('\u03d6'))
    386 
    387     // General Punctuation
    388       .put("bull", Integer.valueOf('\u2022'))
    389       .put("hellip", Integer.valueOf('\u2026'))
    390       .put("prime", Integer.valueOf('\u2032'))
    391       .put("Prime", Integer.valueOf('\u2033'))
    392       .put("oline", Integer.valueOf('\u203e'))
    393       .put("frasl", Integer.valueOf('\u2044'))
    394 
    395     // Letterlike Symbols
    396       .put("weierp", Integer.valueOf('\u2118'))
    397       .put("image", Integer.valueOf('\u2111'))
    398       .put("real", Integer.valueOf('\u211c'))
    399       .put("trade", Integer.valueOf('\u2122'))
    400       .put("alefsym", Integer.valueOf('\u2135'))
    401 
    402     // Arrows
    403       .put("larr", Integer.valueOf('\u2190'))
    404       .put("uarr", Integer.valueOf('\u2191'))
    405       .put("rarr", Integer.valueOf('\u2192'))
    406       .put("darr", Integer.valueOf('\u2193'))
    407       .put("harr", Integer.valueOf('\u2194'))
    408       .put("crarr", Integer.valueOf('\u21b5'))
    409       .put("lArr", Integer.valueOf('\u21d0'))
    410       .put("uArr", Integer.valueOf('\u21d1'))
    411       .put("rArr", Integer.valueOf('\u21d2'))
    412       .put("dArr", Integer.valueOf('\u21d3'))
    413       .put("hArr", Integer.valueOf('\u21d4'))
    414 
    415     // Mathematical Operators
    416       .put("forall", Integer.valueOf('\u2200'))
    417       .put("part", Integer.valueOf('\u2202'))
    418       .put("exist", Integer.valueOf('\u2203'))
    419       .put("empty", Integer.valueOf('\u2205'))
    420       .put("nabla", Integer.valueOf('\u2207'))
    421       .put("isin", Integer.valueOf('\u2208'))
    422       .put("notin", Integer.valueOf('\u2209'))
    423       .put("ni", Integer.valueOf('\u220b'))
    424       .put("prod", Integer.valueOf('\u220f'))
    425       .put("sum", Integer.valueOf('\u2211'))
    426       .put("minus", Integer.valueOf('\u2212'))
    427       .put("lowast", Integer.valueOf('\u2217'))
    428       .put("radic", Integer.valueOf('\u221a'))
    429       .put("prop", Integer.valueOf('\u221d'))
    430       .put("infin", Integer.valueOf('\u221e'))
    431       .put("ang", Integer.valueOf('\u2220'))
    432       .put("and", Integer.valueOf('\u2227'))
    433       .put("or", Integer.valueOf('\u2228'))
    434       .put("cap", Integer.valueOf('\u2229'))
    435       .put("cup", Integer.valueOf('\u222a'))
    436       .put("int", Integer.valueOf('\u222b'))
    437       .put("there4", Integer.valueOf('\u2234'))
    438       .put("sim", Integer.valueOf('\u223c'))
    439       .put("cong", Integer.valueOf('\u2245'))
    440       .put("asymp", Integer.valueOf('\u2248'))
    441       .put("ne", Integer.valueOf('\u2260'))
    442       .put("equiv", Integer.valueOf('\u2261'))
    443       .put("le", Integer.valueOf('\u2264'))
    444       .put("ge", Integer.valueOf('\u2265'))
    445       .put("sub", Integer.valueOf('\u2282'))
    446       .put("sup", Integer.valueOf('\u2283'))
    447       .put("nsub", Integer.valueOf('\u2284'))
    448       .put("sube", Integer.valueOf('\u2286'))
    449       .put("supe", Integer.valueOf('\u2287'))
    450       .put("oplus", Integer.valueOf('\u2295'))
    451       .put("otimes", Integer.valueOf('\u2297'))
    452       .put("perp", Integer.valueOf('\u22a5'))
    453       .put("sdot", Integer.valueOf('\u22c5'))
    454 
    455     // Miscellaneous Technical
    456       .put("lceil", Integer.valueOf('\u2308'))
    457       .put("rceil", Integer.valueOf('\u2309'))
    458       .put("lfloor", Integer.valueOf('\u230a'))
    459       .put("rfloor", Integer.valueOf('\u230b'))
    460       .put("lang", Integer.valueOf('\u2329'))
    461       .put("rang", Integer.valueOf('\u232a'))
    462 
    463     // Geometric Shapes
    464       .put("loz", Integer.valueOf('\u25ca'))
    465 
    466     // Miscellaneous Symbols
    467       .put("spades", Integer.valueOf('\u2660'))
    468       .put("clubs", Integer.valueOf('\u2663'))
    469       .put("hearts", Integer.valueOf('\u2665'))
    470       .put("diams", Integer.valueOf('\u2666'))
    471 
    472     // Latin Extended-A
    473       .put("OElig", Integer.valueOf('\u0152'))
    474       .put("oelig", Integer.valueOf('\u0153'))
    475       .put("Scaron", Integer.valueOf('\u0160'))
    476       .put("scaron", Integer.valueOf('\u0161'))
    477       .put("Yuml", Integer.valueOf('\u0178'))
    478 
    479     // Spacing Modifier Letters
    480       .put("circ", Integer.valueOf('\u02c6'))
    481       .put("tilde", Integer.valueOf('\u02dc'))
    482 
    483     // General Punctuation
    484       .put("ensp", Integer.valueOf('\u2002'))
    485       .put("emsp", Integer.valueOf('\u2003'))
    486       .put("thinsp", Integer.valueOf('\u2009'))
    487       .put("zwnj", Integer.valueOf('\u200c'))
    488       .put("zwj", Integer.valueOf('\u200d'))
    489       .put("lrm", Integer.valueOf('\u200e'))
    490       .put("rlm", Integer.valueOf('\u200f'))
    491       .put("ndash", Integer.valueOf('\u2013'))
    492       .put("mdash", Integer.valueOf('\u2014'))
    493       .put("lsquo", Integer.valueOf('\u2018'))
    494       .put("rsquo", Integer.valueOf('\u2019'))
    495       .put("sbquo", Integer.valueOf('\u201a'))
    496       .put("ldquo", Integer.valueOf('\u201c'))
    497       .put("rdquo", Integer.valueOf('\u201d'))
    498       .put("bdquo", Integer.valueOf('\u201e'))
    499       .put("dagger", Integer.valueOf('\u2020'))
    500       .put("Dagger", Integer.valueOf('\u2021'))
    501       .put("permil", Integer.valueOf('\u2030'))
    502       .put("lsaquo", Integer.valueOf('\u2039'))
    503       .put("rsaquo", Integer.valueOf('\u203a'))
    504       .put("euro", Integer.valueOf('\u20ac'))
    505       .build());
    506 
    507   private HtmlEntities() { /* uninstantiable */ }
    508 }
    509