Home | History | Annotate | Download | only in width
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:generate stringer -type=Kind
      6 //go:generate go run gen.go gen_common.go gen_trieval.go
      7 
      8 // Package width provides functionality for handling different widths in text.
      9 //
     10 // Wide characters behave like ideographs; they tend to allow line breaks after
     11 // each character and remain upright in vertical text layout. Narrow characters
     12 // are kept together in words or runs that are rotated sideways in vertical text
     13 // layout.
     14 //
     15 // For more information, see http://unicode.org/reports/tr11/.
     16 package width // import "golang_org/x/text/width"
     17 
     18 import (
     19 	"unicode/utf8"
     20 
     21 	"golang_org/x/text/transform"
     22 )
     23 
     24 // TODO
     25 // 1) Reduce table size by compressing blocks.
     26 // 2) API proposition for computing display length
     27 //    (approximation, fixed pitch only).
     28 // 3) Implement display length.
     29 
// Kind indicates the type of width property as defined in
// http://unicode.org/reports/tr11/.
//
// The possible values are the constants declared below, starting at Neutral.
type Kind int
     32 
const (
	// Neutral characters do not occur in legacy East Asian character sets.
	Neutral Kind = iota

	// EastAsianAmbiguous characters can be sometimes wide and sometimes
	// narrow and require additional information not contained in the character
	// code to further resolve their width.
	EastAsianAmbiguous

	// EastAsianWide characters are wide in their usual form. They occur only
	// in the context of East Asian typography. These runes may have explicit
	// halfwidth counterparts.
	EastAsianWide

	// EastAsianNarrow characters are narrow in their usual form. They often
	// have fullwidth counterparts.
	EastAsianNarrow

	// Note: there exist Narrow runes that do not have fullwidth or wide
	// counterparts, despite what the definition says (e.g. U+27E6).

	// EastAsianFullwidth characters have a compatibility decomposition of type
	// wide that maps to a narrow counterpart.
	EastAsianFullwidth

	// EastAsianHalfwidth characters have a compatibility decomposition of type
	// narrow that maps to a wide or ambiguous counterpart, plus U+20A9 WON
	// SIGN.
	EastAsianHalfwidth

	// Note: there exist runes that have a halfwidth counterpart but that are
	// classified as Ambiguous, rather than wide (e.g. U+2190).
)
     66 
     67 // TODO: the generated tries need to return size 1 for invalid runes for the
     68 // width to be computed correctly (each byte should render width 1)
     69 
// trie is the package-level width trie consulted by Lookup, LookupString and
// LookupRune.
var trie = newWidthTrie(0)
     71 
     72 // Lookup reports the Properties of the first rune in b and the number of bytes
     73 // of its UTF-8 encoding.
     74 func Lookup(b []byte) (p Properties, size int) {
     75 	v, sz := trie.lookup(b)
     76 	return Properties{elem(v), b[sz-1]}, sz
     77 }
     78 
     79 // LookupString reports the Properties of the first rune in s and the number of
     80 // bytes of its UTF-8 encoding.
     81 func LookupString(s string) (p Properties, size int) {
     82 	v, sz := trie.lookupString(s)
     83 	return Properties{elem(v), s[sz-1]}, sz
     84 }
     85 
     86 // LookupRune reports the Properties of rune r.
     87 func LookupRune(r rune) Properties {
     88 	var buf [4]byte
     89 	n := utf8.EncodeRune(buf[:], r)
     90 	v, _ := trie.lookup(buf[:n])
     91 	last := byte(r)
     92 	if r >= utf8.RuneSelf {
     93 		last = 0x80 + byte(r&0x3f)
     94 	}
     95 	return Properties{elem(v), last}
     96 }
     97 
// Properties provides access to width properties of a rune.
type Properties struct {
	// elem is the value the width trie returned for the rune.
	elem elem
	// last is the final byte of the rune's UTF-8 encoding; Folded, Narrow and
	// Wide XOR it into the inverse-mapping data to reconstruct the variant.
	last byte
}
    103 
    104 func (e elem) kind() Kind {
    105 	return Kind(e >> typeShift)
    106 }
    107 
    108 // Kind returns the Kind of a rune as defined in Unicode TR #11.
    109 // See http://unicode.org/reports/tr11/ for more details.
    110 func (p Properties) Kind() Kind {
    111 	return p.elem.kind()
    112 }
    113 
    114 // Folded returns the folded variant of a rune or 0 if the rune is canonical.
    115 func (p Properties) Folded() rune {
    116 	if p.elem&tagNeedsFold != 0 {
    117 		buf := inverseData[byte(p.elem)]
    118 		buf[buf[0]] ^= p.last
    119 		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
    120 		return r
    121 	}
    122 	return 0
    123 }
    124 
    125 // Narrow returns the narrow variant of a rune or 0 if the rune is already
    126 // narrow or doesn't have a narrow variant.
    127 func (p Properties) Narrow() rune {
    128 	if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) {
    129 		buf := inverseData[byte(p.elem)]
    130 		buf[buf[0]] ^= p.last
    131 		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
    132 		return r
    133 	}
    134 	return 0
    135 }
    136 
    137 // Wide returns the wide variant of a rune or 0 if the rune is already
    138 // wide or doesn't have a wide variant.
    139 func (p Properties) Wide() rune {
    140 	if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) {
    141 		buf := inverseData[byte(p.elem)]
    142 		buf[buf[0]] ^= p.last
    143 		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
    144 		return r
    145 	}
    146 	return 0
    147 }
    148 
    149 // TODO for Properties:
    150 // - Add Fullwidth/Halfwidth or Inverted methods for computing variants
    151 // mapping.
    152 // - Add width information (including information on non-spacing runes).
    153 
// Transformer implements the transform.Transformer interface.
//
// It wraps a transform.SpanningTransformer; Reset, Transform and Span delegate
// to it, while Bytes and String apply the Transformer to a whole value.
type Transformer struct {
	// t is the underlying transformer that performs the actual work.
	t transform.SpanningTransformer
}
    158 
    159 // Reset implements the transform.Transformer interface.
    160 func (t Transformer) Reset() { t.t.Reset() }
    161 
    162 // Transform implements the transform.Transformer interface.
    163 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    164 	return t.t.Transform(dst, src, atEOF)
    165 }
    166 
    167 // Span implements the transform.SpanningTransformer interface.
    168 func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) {
    169 	return t.t.Span(src, atEOF)
    170 }
    171 
    172 // Bytes returns a new byte slice with the result of applying t to b.
    173 func (t Transformer) Bytes(b []byte) []byte {
    174 	b, _, _ = transform.Bytes(t, b)
    175 	return b
    176 }
    177 
    178 // String returns a string with the result of applying t to s.
    179 func (t Transformer) String(s string) string {
    180 	s, _, _ = transform.String(t, s)
    181 	return s
    182 }
    183 
    184 var (
    185 	// Fold is a transform that maps all runes to their canonical width.
    186 	//
    187 	// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
    188 	// provide a more generic folding mechanism.
    189 	Fold Transformer = Transformer{foldTransform{}}
    190 
    191 	// Widen is a transform that maps runes to their wide variant, if
    192 	// available.
    193 	Widen Transformer = Transformer{wideTransform{}}
    194 
    195 	// Narrow is a transform that maps runes to their narrow variant, if
    196 	// available.
    197 	Narrow Transformer = Transformer{narrowTransform{}}
    198 )
    199 
    200 // TODO: Consider the following options:
    201 // - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
    202 //   generalized variant of this.
    203 // - Consider a wide Won character to be the default width (or some generalized
    204 //   variant of this).
    205 // - Filter the set of characters that gets converted (the preferred approach is
    206 //   to allow applying filters to transforms).
    207