1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package html provides functions for escaping and unescaping HTML text. 6 package html 7 8 import ( 9 "strings" 10 "unicode/utf8" 11 ) 12 13 type writer interface { 14 WriteString(string) (int, error) 15 } 16 17 // These replacements permit compatibility with old numeric entities that 18 // assumed Windows-1252 encoding. 19 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference 20 var replacementTable = [...]rune{ 21 '\u20AC', // First entry is what 0x80 should be replaced with. 22 '\u0081', 23 '\u201A', 24 '\u0192', 25 '\u201E', 26 '\u2026', 27 '\u2020', 28 '\u2021', 29 '\u02C6', 30 '\u2030', 31 '\u0160', 32 '\u2039', 33 '\u0152', 34 '\u008D', 35 '\u017D', 36 '\u008F', 37 '\u0090', 38 '\u2018', 39 '\u2019', 40 '\u201C', 41 '\u201D', 42 '\u2022', 43 '\u2013', 44 '\u2014', 45 '\u02DC', 46 '\u2122', 47 '\u0161', 48 '\u203A', 49 '\u0153', 50 '\u009D', 51 '\u017E', 52 '\u0178', // Last entry is 0x9F. 53 // 0x00->'\uFFFD' is handled programmatically. 54 // 0x0D->'\u000D' is a no-op. 55 } 56 57 // unescapeEntity reads an entity like "<" from b[src:] and writes the 58 // corresponding "<" to b[dst:], returning the incremented dst and src cursors. 59 // Precondition: b[src] == '&' && dst <= src. 60 // attribute should be true if parsing an attribute value. 61 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { 62 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference 63 64 // i starts at 1 because we already know that s[0] == '&'. 65 i, s := 1, b[src:] 66 67 if len(s) <= 1 { 68 b[dst] = b[src] 69 return dst + 1, src + 1 70 } 71 72 if s[i] == '#' { 73 if len(s) <= 3 { // We need to have at least "&#.". 74 b[dst] = b[src] 75 return dst + 1, src + 1 76 } 77 i++ 78 c := s[i] 79 hex := false 80 if c == 'x' || c == 'X' { 81 hex = true 82 i++ 83 } 84 85 x := '\x00' 86 for i < len(s) { 87 c = s[i] 88 i++ 89 if hex { 90 if '0' <= c && c <= '9' { 91 x = 16*x + rune(c) - '0' 92 continue 93 } else if 'a' <= c && c <= 'f' { 94 x = 16*x + rune(c) - 'a' + 10 95 continue 96 } else if 'A' <= c && c <= 'F' { 97 x = 16*x + rune(c) - 'A' + 10 98 continue 99 } 100 } else if '0' <= c && c <= '9' { 101 x = 10*x + rune(c) - '0' 102 continue 103 } 104 if c != ';' { 105 i-- 106 } 107 break 108 } 109 110 if i <= 3 { // No characters matched. 111 b[dst] = b[src] 112 return dst + 1, src + 1 113 } 114 115 if 0x80 <= x && x <= 0x9F { 116 // Replace characters from Windows-1252 with UTF-8 equivalents. 117 x = replacementTable[x-0x80] 118 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { 119 // Replace invalid characters with the replacement character. 120 x = '\uFFFD' 121 } 122 123 return dst + utf8.EncodeRune(b[dst:], x), src + i 124 } 125 126 // Consume the maximum number of characters possible, with the 127 // consumed characters matching one of the named references. 128 129 for i < len(s) { 130 c := s[i] 131 i++ 132 // Lower-cased characters are more common in entities, so we check for them first. 133 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { 134 continue 135 } 136 if c != ';' { 137 i-- 138 } 139 break 140 } 141 142 entityName := string(s[1:i]) 143 if entityName == "" { 144 // No-op. 145 } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { 146 // No-op. 147 } else if x := entity[entityName]; x != 0 { 148 return dst + utf8.EncodeRune(b[dst:], x), src + i 149 } else if x := entity2[entityName]; x[0] != 0 { 150 dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) 151 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i 152 } else if !attribute { 153 maxLen := len(entityName) - 1 154 if maxLen > longestEntityWithoutSemicolon { 155 maxLen = longestEntityWithoutSemicolon 156 } 157 for j := maxLen; j > 1; j-- { 158 if x := entity[entityName[:j]]; x != 0 { 159 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 160 } 161 } 162 } 163 164 dst1, src1 = dst+i, src+i 165 copy(b[dst:dst1], b[src:src1]) 166 return dst1, src1 167 } 168 169 // unescape unescapes b's entities in-place, so that "a<b" becomes "a<b". 170 func unescape(b []byte) []byte { 171 for i, c := range b { 172 if c == '&' { 173 dst, src := unescapeEntity(b, i, i, false) 174 for src < len(b) { 175 c := b[src] 176 if c == '&' { 177 dst, src = unescapeEntity(b, dst, src, false) 178 } else { 179 b[dst] = c 180 dst, src = dst+1, src+1 181 } 182 } 183 return b[0:dst] 184 } 185 } 186 return b 187 } 188 189 var htmlEscaper = strings.NewReplacer( 190 `&`, "&", 191 `'`, "'", // "'" is shorter than "'" and apos was not in HTML until HTML5. 192 `<`, "<", 193 `>`, ">", 194 `"`, """, // """ is shorter than """. 195 ) 196 197 // EscapeString escapes special characters like "<" to become "<". It 198 // escapes only five such characters: <, >, &, ' and ". 199 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't 200 // always true. 201 func EscapeString(s string) string { 202 return htmlEscaper.Replace(s) 203 } 204 205 // UnescapeString unescapes entities like "<" to become "<". It unescapes a 206 // larger range of entities than EscapeString escapes. For example, "á" 207 // unescapes to "", as does "á" and "&xE1;". 208 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't 209 // always true. 210 func UnescapeString(s string) string { 211 if !strings.Contains(s, "&") { 212 return s 213 } 214 return string(unescape([]byte(s))) 215 } 216