Home | History | Annotate | Download | only in strconv
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:generate go run makeisprint.go -output isprint.go
      6 
      7 package strconv
      8 
      9 import "unicode/utf8"
     10 
// lowerhex holds the lowercase hexadecimal digits; indexing it with a value
// in the range 0..15 yields the corresponding digit character.
const lowerhex = "0123456789abcdef"
     12 
     13 func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
     14 	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
     15 }
     16 
     17 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
     18 	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
     19 }
     20 
     21 func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
     22 	buf = append(buf, quote)
     23 	for width := 0; len(s) > 0; s = s[width:] {
     24 		r := rune(s[0])
     25 		width = 1
     26 		if r >= utf8.RuneSelf {
     27 			r, width = utf8.DecodeRuneInString(s)
     28 		}
     29 		if width == 1 && r == utf8.RuneError {
     30 			buf = append(buf, `\x`...)
     31 			buf = append(buf, lowerhex[s[0]>>4])
     32 			buf = append(buf, lowerhex[s[0]&0xF])
     33 			continue
     34 		}
     35 		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
     36 	}
     37 	buf = append(buf, quote)
     38 	return buf
     39 }
     40 
     41 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
     42 	buf = append(buf, quote)
     43 	if !utf8.ValidRune(r) {
     44 		r = utf8.RuneError
     45 	}
     46 	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
     47 	buf = append(buf, quote)
     48 	return buf
     49 }
     50 
     51 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
     52 	var runeTmp [utf8.UTFMax]byte
     53 	if r == rune(quote) || r == '\\' { // always backslashed
     54 		buf = append(buf, '\\')
     55 		buf = append(buf, byte(r))
     56 		return buf
     57 	}
     58 	if ASCIIonly {
     59 		if r < utf8.RuneSelf && IsPrint(r) {
     60 			buf = append(buf, byte(r))
     61 			return buf
     62 		}
     63 	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
     64 		n := utf8.EncodeRune(runeTmp[:], r)
     65 		buf = append(buf, runeTmp[:n]...)
     66 		return buf
     67 	}
     68 	switch r {
     69 	case '\a':
     70 		buf = append(buf, `\a`...)
     71 	case '\b':
     72 		buf = append(buf, `\b`...)
     73 	case '\f':
     74 		buf = append(buf, `\f`...)
     75 	case '\n':
     76 		buf = append(buf, `\n`...)
     77 	case '\r':
     78 		buf = append(buf, `\r`...)
     79 	case '\t':
     80 		buf = append(buf, `\t`...)
     81 	case '\v':
     82 		buf = append(buf, `\v`...)
     83 	default:
     84 		switch {
     85 		case r < ' ':
     86 			buf = append(buf, `\x`...)
     87 			buf = append(buf, lowerhex[byte(r)>>4])
     88 			buf = append(buf, lowerhex[byte(r)&0xF])
     89 		case r > utf8.MaxRune:
     90 			r = 0xFFFD
     91 			fallthrough
     92 		case r < 0x10000:
     93 			buf = append(buf, `\u`...)
     94 			for s := 12; s >= 0; s -= 4 {
     95 				buf = append(buf, lowerhex[r>>uint(s)&0xF])
     96 			}
     97 		default:
     98 			buf = append(buf, `\U`...)
     99 			for s := 28; s >= 0; s -= 4 {
    100 				buf = append(buf, lowerhex[r>>uint(s)&0xF])
    101 			}
    102 		}
    103 	}
    104 	return buf
    105 }
    106 
    107 // Quote returns a double-quoted Go string literal representing s. The
    108 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
    109 // control characters and non-printable characters as defined by
    110 // IsPrint.
    111 func Quote(s string) string {
    112 	return quoteWith(s, '"', false, false)
    113 }
    114 
    115 // AppendQuote appends a double-quoted Go string literal representing s,
    116 // as generated by Quote, to dst and returns the extended buffer.
    117 func AppendQuote(dst []byte, s string) []byte {
    118 	return appendQuotedWith(dst, s, '"', false, false)
    119 }
    120 
    121 // QuoteToASCII returns a double-quoted Go string literal representing s.
    122 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
    123 // non-ASCII characters and non-printable characters as defined by IsPrint.
    124 func QuoteToASCII(s string) string {
    125 	return quoteWith(s, '"', true, false)
    126 }
    127 
    128 // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
    129 // as generated by QuoteToASCII, to dst and returns the extended buffer.
    130 func AppendQuoteToASCII(dst []byte, s string) []byte {
    131 	return appendQuotedWith(dst, s, '"', true, false)
    132 }
    133 
    134 // QuoteToGraphic returns a double-quoted Go string literal representing s.
    135 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
    136 // non-ASCII characters and non-printable characters as defined by IsGraphic.
    137 func QuoteToGraphic(s string) string {
    138 	return quoteWith(s, '"', false, true)
    139 }
    140 
    141 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
    142 // as generated by QuoteToGraphic, to dst and returns the extended buffer.
    143 func AppendQuoteToGraphic(dst []byte, s string) []byte {
    144 	return appendQuotedWith(dst, s, '"', false, true)
    145 }
    146 
    147 // QuoteRune returns a single-quoted Go character literal representing the
    148 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
    149 // for control characters and non-printable characters as defined by IsPrint.
    150 func QuoteRune(r rune) string {
    151 	return quoteRuneWith(r, '\'', false, false)
    152 }
    153 
    154 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
    155 // as generated by QuoteRune, to dst and returns the extended buffer.
    156 func AppendQuoteRune(dst []byte, r rune) []byte {
    157 	return appendQuotedRuneWith(dst, r, '\'', false, false)
    158 }
    159 
    160 // QuoteRuneToASCII returns a single-quoted Go character literal representing
    161 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
    162 // \u0100) for non-ASCII characters and non-printable characters as defined
    163 // by IsPrint.
    164 func QuoteRuneToASCII(r rune) string {
    165 	return quoteRuneWith(r, '\'', true, false)
    166 }
    167 
    168 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
    169 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
    170 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
    171 	return appendQuotedRuneWith(dst, r, '\'', true, false)
    172 }
    173 
    174 // QuoteRuneToGraphic returns a single-quoted Go character literal representing
    175 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
    176 // \u0100) for non-ASCII characters and non-printable characters as defined
    177 // by IsGraphic.
    178 func QuoteRuneToGraphic(r rune) string {
    179 	return quoteRuneWith(r, '\'', false, true)
    180 }
    181 
    182 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
    183 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
    184 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
    185 	return appendQuotedRuneWith(dst, r, '\'', false, true)
    186 }
    187 
    188 // CanBackquote reports whether the string s can be represented
    189 // unchanged as a single-line backquoted string without control
    190 // characters other than tab.
    191 func CanBackquote(s string) bool {
    192 	for len(s) > 0 {
    193 		r, wid := utf8.DecodeRuneInString(s)
    194 		s = s[wid:]
    195 		if wid > 1 {
    196 			if r == '\ufeff' {
    197 				return false // BOMs are invisible and should not be quoted.
    198 			}
    199 			continue // All other multibyte runes are correctly encoded and assumed printable.
    200 		}
    201 		if r == utf8.RuneError {
    202 			return false
    203 		}
    204 		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
    205 			return false
    206 		}
    207 	}
    208 	return true
    209 }
    210 
    211 func unhex(b byte) (v rune, ok bool) {
    212 	c := rune(b)
    213 	switch {
    214 	case '0' <= c && c <= '9':
    215 		return c - '0', true
    216 	case 'a' <= c && c <= 'f':
    217 		return c - 'a' + 10, true
    218 	case 'A' <= c && c <= 'F':
    219 		return c - 'A' + 10, true
    220 	}
    221 	return
    222 }
    223 
    224 // UnquoteChar decodes the first character or byte in the escaped string
    225 // or character literal represented by the string s.
    226 // It returns four values:
    227 //
    228 //	1) value, the decoded Unicode code point or byte value;
    229 //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
    230 //	3) tail, the remainder of the string after the character; and
    231 //	4) an error that will be nil if the character is syntactically valid.
    232 //
    233 // The second argument, quote, specifies the type of literal being parsed
    234 // and therefore which escaped quote character is permitted.
    235 // If set to a single quote, it permits the sequence \' and disallows unescaped '.
    236 // If set to a double quote, it permits \" and disallows unescaped ".
    237 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
    238 func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
    239 	// easy cases
    240 	switch c := s[0]; {
    241 	case c == quote && (quote == '\'' || quote == '"'):
    242 		err = ErrSyntax
    243 		return
    244 	case c >= utf8.RuneSelf:
    245 		r, size := utf8.DecodeRuneInString(s)
    246 		return r, true, s[size:], nil
    247 	case c != '\\':
    248 		return rune(s[0]), false, s[1:], nil
    249 	}
    250 
    251 	// hard case: c is backslash
    252 	if len(s) <= 1 {
    253 		err = ErrSyntax
    254 		return
    255 	}
    256 	c := s[1]
    257 	s = s[2:]
    258 
    259 	switch c {
    260 	case 'a':
    261 		value = '\a'
    262 	case 'b':
    263 		value = '\b'
    264 	case 'f':
    265 		value = '\f'
    266 	case 'n':
    267 		value = '\n'
    268 	case 'r':
    269 		value = '\r'
    270 	case 't':
    271 		value = '\t'
    272 	case 'v':
    273 		value = '\v'
    274 	case 'x', 'u', 'U':
    275 		n := 0
    276 		switch c {
    277 		case 'x':
    278 			n = 2
    279 		case 'u':
    280 			n = 4
    281 		case 'U':
    282 			n = 8
    283 		}
    284 		var v rune
    285 		if len(s) < n {
    286 			err = ErrSyntax
    287 			return
    288 		}
    289 		for j := 0; j < n; j++ {
    290 			x, ok := unhex(s[j])
    291 			if !ok {
    292 				err = ErrSyntax
    293 				return
    294 			}
    295 			v = v<<4 | x
    296 		}
    297 		s = s[n:]
    298 		if c == 'x' {
    299 			// single-byte string, possibly not UTF-8
    300 			value = v
    301 			break
    302 		}
    303 		if v > utf8.MaxRune {
    304 			err = ErrSyntax
    305 			return
    306 		}
    307 		value = v
    308 		multibyte = true
    309 	case '0', '1', '2', '3', '4', '5', '6', '7':
    310 		v := rune(c) - '0'
    311 		if len(s) < 2 {
    312 			err = ErrSyntax
    313 			return
    314 		}
    315 		for j := 0; j < 2; j++ { // one digit already; two more
    316 			x := rune(s[j]) - '0'
    317 			if x < 0 || x > 7 {
    318 				err = ErrSyntax
    319 				return
    320 			}
    321 			v = (v << 3) | x
    322 		}
    323 		s = s[2:]
    324 		if v > 255 {
    325 			err = ErrSyntax
    326 			return
    327 		}
    328 		value = v
    329 	case '\\':
    330 		value = '\\'
    331 	case '\'', '"':
    332 		if c != quote {
    333 			err = ErrSyntax
    334 			return
    335 		}
    336 		value = rune(c)
    337 	default:
    338 		err = ErrSyntax
    339 		return
    340 	}
    341 	tail = s
    342 	return
    343 }
    344 
    345 // Unquote interprets s as a single-quoted, double-quoted,
    346 // or backquoted Go string literal, returning the string value
    347 // that s quotes.  (If s is single-quoted, it would be a Go
    348 // character literal; Unquote returns the corresponding
    349 // one-character string.)
    350 func Unquote(s string) (string, error) {
    351 	n := len(s)
    352 	if n < 2 {
    353 		return "", ErrSyntax
    354 	}
    355 	quote := s[0]
    356 	if quote != s[n-1] {
    357 		return "", ErrSyntax
    358 	}
    359 	s = s[1 : n-1]
    360 
    361 	if quote == '`' {
    362 		if contains(s, '`') {
    363 			return "", ErrSyntax
    364 		}
    365 		if contains(s, '\r') {
    366 			// -1 because we know there is at least one \r to remove.
    367 			buf := make([]byte, 0, len(s)-1)
    368 			for i := 0; i < len(s); i++ {
    369 				if s[i] != '\r' {
    370 					buf = append(buf, s[i])
    371 				}
    372 			}
    373 			return string(buf), nil
    374 		}
    375 		return s, nil
    376 	}
    377 	if quote != '"' && quote != '\'' {
    378 		return "", ErrSyntax
    379 	}
    380 	if contains(s, '\n') {
    381 		return "", ErrSyntax
    382 	}
    383 
    384 	// Is it trivial? Avoid allocation.
    385 	if !contains(s, '\\') && !contains(s, quote) {
    386 		switch quote {
    387 		case '"':
    388 			return s, nil
    389 		case '\'':
    390 			r, size := utf8.DecodeRuneInString(s)
    391 			if size == len(s) && (r != utf8.RuneError || size != 1) {
    392 				return s, nil
    393 			}
    394 		}
    395 	}
    396 
    397 	var runeTmp [utf8.UTFMax]byte
    398 	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
    399 	for len(s) > 0 {
    400 		c, multibyte, ss, err := UnquoteChar(s, quote)
    401 		if err != nil {
    402 			return "", err
    403 		}
    404 		s = ss
    405 		if c < utf8.RuneSelf || !multibyte {
    406 			buf = append(buf, byte(c))
    407 		} else {
    408 			n := utf8.EncodeRune(runeTmp[:], c)
    409 			buf = append(buf, runeTmp[:n]...)
    410 		}
    411 		if quote == '\'' && len(s) != 0 {
    412 			// single-quoted must be single character
    413 			return "", ErrSyntax
    414 		}
    415 	}
    416 	return string(buf), nil
    417 }
    418 
    419 // contains reports whether the string contains the byte c.
    420 func contains(s string, c byte) bool {
    421 	for i := 0; i < len(s); i++ {
    422 		if s[i] == c {
    423 			return true
    424 		}
    425 	}
    426 	return false
    427 }
    428 
    429 // bsearch16 returns the smallest i such that a[i] >= x.
    430 // If there is no such i, bsearch16 returns len(a).
    431 func bsearch16(a []uint16, x uint16) int {
    432 	i, j := 0, len(a)
    433 	for i < j {
    434 		h := i + (j-i)/2
    435 		if a[h] < x {
    436 			i = h + 1
    437 		} else {
    438 			j = h
    439 		}
    440 	}
    441 	return i
    442 }
    443 
    444 // bsearch32 returns the smallest i such that a[i] >= x.
    445 // If there is no such i, bsearch32 returns len(a).
    446 func bsearch32(a []uint32, x uint32) int {
    447 	i, j := 0, len(a)
    448 	for i < j {
    449 		h := i + (j-i)/2
    450 		if a[h] < x {
    451 			i = h + 1
    452 		} else {
    453 			j = h
    454 		}
    455 	}
    456 	return i
    457 }
    458 
    459 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
    460 // to give the same answer. It allows this package not to depend on unicode,
    461 // and therefore not pull in all the Unicode tables. If the linker were better
    462 // at tossing unused tables, we could get rid of this implementation.
    463 // That would be nice.
    464 
    465 // IsPrint reports whether the rune is defined as printable by Go, with
    466 // the same definition as unicode.IsPrint: letters, numbers, punctuation,
    467 // symbols and ASCII space.
    468 func IsPrint(r rune) bool {
    469 	// Fast check for Latin-1
    470 	if r <= 0xFF {
    471 		if 0x20 <= r && r <= 0x7E {
    472 			// All the ASCII is printable from space through DEL-1.
    473 			return true
    474 		}
    475 		if 0xA1 <= r && r <= 0xFF {
    476 			// Similarly for  through ...
    477 			return r != 0xAD // ...except for the bizarre soft hyphen.
    478 		}
    479 		return false
    480 	}
    481 
    482 	// Same algorithm, either on uint16 or uint32 value.
    483 	// First, find first i such that isPrint[i] >= x.
    484 	// This is the index of either the start or end of a pair that might span x.
    485 	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
    486 	// If we find x in a range, make sure x is not in isNotPrint list.
    487 
    488 	if 0 <= r && r < 1<<16 {
    489 		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
    490 		i := bsearch16(isPrint, rr)
    491 		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
    492 			return false
    493 		}
    494 		j := bsearch16(isNotPrint, rr)
    495 		return j >= len(isNotPrint) || isNotPrint[j] != rr
    496 	}
    497 
    498 	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
    499 	i := bsearch32(isPrint, rr)
    500 	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
    501 		return false
    502 	}
    503 	if r >= 0x20000 {
    504 		return true
    505 	}
    506 	r -= 0x10000
    507 	j := bsearch16(isNotPrint, uint16(r))
    508 	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
    509 }
    510 
    511 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
    512 // characters include letters, marks, numbers, punctuation, symbols, and
    513 // spaces, from categories L, M, N, P, S, and Zs.
    514 func IsGraphic(r rune) bool {
    515 	if IsPrint(r) {
    516 		return true
    517 	}
    518 	return isInGraphicList(r)
    519 }
    520 
    521 // isInGraphicList reports whether the rune is in the isGraphic list. This separation
    522 // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
    523 // Should be called only if IsPrint fails.
    524 func isInGraphicList(r rune) bool {
    525 	// We know r must fit in 16 bits - see makeisprint.go.
    526 	if r > 0xFFFF {
    527 		return false
    528 	}
    529 	rr := uint16(r)
    530 	i := bsearch16(isGraphic, rr)
    531 	return i < len(isGraphic) && rr == isGraphic[i]
    532 }
    533