Home | History | Annotate | Download | only in html
      1 // Copyright 2010 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package html provides functions for escaping and unescaping HTML text.
      6 package html
      7 
      8 import (
      9 	"strings"
     10 	"unicode/utf8"
     11 )
     12 
     13 type writer interface {
     14 	WriteString(string) (int, error)
     15 }
     16 
     17 // These replacements permit compatibility with old numeric entities that
     18 // assumed Windows-1252 encoding.
     19 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
     20 var replacementTable = [...]rune{
     21 	'\u20AC', // First entry is what 0x80 should be replaced with.
     22 	'\u0081',
     23 	'\u201A',
     24 	'\u0192',
     25 	'\u201E',
     26 	'\u2026',
     27 	'\u2020',
     28 	'\u2021',
     29 	'\u02C6',
     30 	'\u2030',
     31 	'\u0160',
     32 	'\u2039',
     33 	'\u0152',
     34 	'\u008D',
     35 	'\u017D',
     36 	'\u008F',
     37 	'\u0090',
     38 	'\u2018',
     39 	'\u2019',
     40 	'\u201C',
     41 	'\u201D',
     42 	'\u2022',
     43 	'\u2013',
     44 	'\u2014',
     45 	'\u02DC',
     46 	'\u2122',
     47 	'\u0161',
     48 	'\u203A',
     49 	'\u0153',
     50 	'\u009D',
     51 	'\u017E',
     52 	'\u0178', // Last entry is 0x9F.
     53 	// 0x00->'\uFFFD' is handled programmatically.
     54 	// 0x0D->'\u000D' is a no-op.
     55 }
     56 
     57 // unescapeEntity reads an entity like "<" from b[src:] and writes the
     58 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
     59 // Precondition: b[src] == '&' && dst <= src.
     60 // attribute should be true if parsing an attribute value.
     61 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
     62 	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
     63 
     64 	// i starts at 1 because we already know that s[0] == '&'.
     65 	i, s := 1, b[src:]
     66 
     67 	if len(s) <= 1 {
     68 		b[dst] = b[src]
     69 		return dst + 1, src + 1
     70 	}
     71 
     72 	if s[i] == '#' {
     73 		if len(s) <= 3 { // We need to have at least "&#.".
     74 			b[dst] = b[src]
     75 			return dst + 1, src + 1
     76 		}
     77 		i++
     78 		c := s[i]
     79 		hex := false
     80 		if c == 'x' || c == 'X' {
     81 			hex = true
     82 			i++
     83 		}
     84 
     85 		x := '\x00'
     86 		for i < len(s) {
     87 			c = s[i]
     88 			i++
     89 			if hex {
     90 				if '0' <= c && c <= '9' {
     91 					x = 16*x + rune(c) - '0'
     92 					continue
     93 				} else if 'a' <= c && c <= 'f' {
     94 					x = 16*x + rune(c) - 'a' + 10
     95 					continue
     96 				} else if 'A' <= c && c <= 'F' {
     97 					x = 16*x + rune(c) - 'A' + 10
     98 					continue
     99 				}
    100 			} else if '0' <= c && c <= '9' {
    101 				x = 10*x + rune(c) - '0'
    102 				continue
    103 			}
    104 			if c != ';' {
    105 				i--
    106 			}
    107 			break
    108 		}
    109 
    110 		if i <= 3 { // No characters matched.
    111 			b[dst] = b[src]
    112 			return dst + 1, src + 1
    113 		}
    114 
    115 		if 0x80 <= x && x <= 0x9F {
    116 			// Replace characters from Windows-1252 with UTF-8 equivalents.
    117 			x = replacementTable[x-0x80]
    118 		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
    119 			// Replace invalid characters with the replacement character.
    120 			x = '\uFFFD'
    121 		}
    122 
    123 		return dst + utf8.EncodeRune(b[dst:], x), src + i
    124 	}
    125 
    126 	// Consume the maximum number of characters possible, with the
    127 	// consumed characters matching one of the named references.
    128 
    129 	for i < len(s) {
    130 		c := s[i]
    131 		i++
    132 		// Lower-cased characters are more common in entities, so we check for them first.
    133 		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
    134 			continue
    135 		}
    136 		if c != ';' {
    137 			i--
    138 		}
    139 		break
    140 	}
    141 
    142 	entityName := string(s[1:i])
    143 	if entityName == "" {
    144 		// No-op.
    145 	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
    146 		// No-op.
    147 	} else if x := entity[entityName]; x != 0 {
    148 		return dst + utf8.EncodeRune(b[dst:], x), src + i
    149 	} else if x := entity2[entityName]; x[0] != 0 {
    150 		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
    151 		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
    152 	} else if !attribute {
    153 		maxLen := len(entityName) - 1
    154 		if maxLen > longestEntityWithoutSemicolon {
    155 			maxLen = longestEntityWithoutSemicolon
    156 		}
    157 		for j := maxLen; j > 1; j-- {
    158 			if x := entity[entityName[:j]]; x != 0 {
    159 				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
    160 			}
    161 		}
    162 	}
    163 
    164 	dst1, src1 = dst+i, src+i
    165 	copy(b[dst:dst1], b[src:src1])
    166 	return dst1, src1
    167 }
    168 
    169 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
    170 func unescape(b []byte) []byte {
    171 	for i, c := range b {
    172 		if c == '&' {
    173 			dst, src := unescapeEntity(b, i, i, false)
    174 			for src < len(b) {
    175 				c := b[src]
    176 				if c == '&' {
    177 					dst, src = unescapeEntity(b, dst, src, false)
    178 				} else {
    179 					b[dst] = c
    180 					dst, src = dst+1, src+1
    181 				}
    182 			}
    183 			return b[0:dst]
    184 		}
    185 	}
    186 	return b
    187 }
    188 
    189 var htmlEscaper = strings.NewReplacer(
    190 	`&`, "&amp;",
    191 	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
    192 	`<`, "&lt;",
    193 	`>`, "&gt;",
    194 	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
    195 )
    196 
    197 // EscapeString escapes special characters like "<" to become "&lt;". It
    198 // escapes only five such characters: <, >, &, ' and ".
    199 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
    200 // always true.
    201 func EscapeString(s string) string {
    202 	return htmlEscaper.Replace(s)
    203 }
    204 
    205 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
    206 // larger range of entities than EscapeString escapes. For example, "&aacute;"
    207 // unescapes to "", as does "&#225;" and "&xE1;".
    208 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
    209 // always true.
    210 func UnescapeString(s string) string {
    211 	if !strings.Contains(s, "&") {
    212 		return s
    213 	}
    214 	return string(unescape([]byte(s)))
    215 }
    216