Home | History | Annotate | Download | only in bidirule
      1 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
      2 
      3 // Copyright 2016 The Go Authors. All rights reserved.
      4 // Use of this source code is governed by a BSD-style
      5 // license that can be found in the LICENSE file.
      6 
      7 // Package bidirule implements the Bidi Rule defined by RFC 5893.
      8 //
      9 // This package is under development. The API may change without notice and
     10 // without preserving backward compatibility.
     11 package bidirule
     12 
     13 import (
     14 	"errors"
     15 	"unicode/utf8"
     16 
     17 	"golang_org/x/text/transform"
     18 	"golang_org/x/text/unicode/bidi"
     19 )
     20 
     21 // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
     22 // Internationalized Domain Names for Applications (IDNA)
     23 //
     24 // A label is an individual component of a domain name.  Labels are usually
     25 // shown separated by dots; for example, the domain name "www.example.com" is
     26 // composed of three labels: "www", "example", and "com".
     27 //
     28 // An RTL label is a label that contains at least one character of class R, AL,
     29 // or AN. An LTR label is any label that is not an RTL label.
     30 //
     31 // A "Bidi domain name" is a domain name that contains at least one RTL label.
     32 //
     33 //  The following guarantees can be made based on the above:
     34 //
     35 //  o  In a domain name consisting of only labels that satisfy the rule,
     36 //     the requirements of Section 3 are satisfied.  Note that even LTR
     37 //     labels and pure ASCII labels have to be tested.
     38 //
     39 //  o  In a domain name consisting of only LDH labels (as defined in the
     40 //     Definitions document [RFC5890]) and labels that satisfy the rule,
     41 //     the requirements of Section 3 are satisfied as long as a label
     42 //     that starts with an ASCII digit does not come after a
     43 //     right-to-left label.
     44 //
     45 //  No guarantee is given for other combinations.
     46 
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
// It is the error that Span, and through it Transform, reports when the
// input violates the rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
     49 
     50 type ruleState uint8
     51 
     52 const (
     53 	ruleInitial ruleState = iota
     54 	ruleLTR
     55 	ruleLTRFinal
     56 	ruleRTL
     57 	ruleRTLFinal
     58 	ruleInvalid
     59 )
     60 
// ruleTransition is one edge of the Bidi Rule state machine. The edge is
// taken when the bit for the current character's Bidi class is set in mask.
type ruleTransition struct {
	next ruleState // state entered when the edge is taken
	mask uint16    // set of Bidi classes, one bit per class (1 << class)
}
     65 
// transitions holds, for every ruleState, the two outgoing edges that advance
// and advanceString test in order for each character. A character whose class
// matches neither mask moves the machine to ruleInvalid.
var transitions = [...][2]ruleTransition{
	// [2.1] The first character must be a character with Bidi property L, R, or
	// AL. If it has the R or AL property, it is an RTL label; if it has the L
	// property, it is an LTR label.
	ruleInitial: {
		{ruleLTRFinal, 1 << bidi.L},
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
	},
	ruleRTL: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3]
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleRTLFinal: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		// NSM is included here so that trailing NSM keeps the final state.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3] and NSM.
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	ruleLTR: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6].
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleLTRFinal: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		// NSM is included here so that trailing NSM keeps the final state.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6] and NSM.
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	// Both masks are empty, so once the machine is in ruleInvalid no
	// character can move it out again.
	ruleInvalid: {
		{ruleInvalid, 0},
		{ruleInvalid, 0},
	},
}
    123 
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa.
//
// exclusiveRTL is the pair of class bits that may not both be set in
// Transformer.seen; advance and advanceString reject the input as soon as
// they are.
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
    127 
    128 // From RFC 5893
    129 // An RTL label is a label that contains at least one character of type
    130 // R, AL, or AN.
    131 //
    132 // An LTR label is any label that is not an RTL label.
    133 
    134 // Direction reports the direction of the given label as defined by RFC 5893.
    135 // The Bidi Rule does not have to be applied to labels of the category
    136 // LeftToRight.
    137 func Direction(b []byte) bidi.Direction {
    138 	for i := 0; i < len(b); {
    139 		e, sz := bidi.Lookup(b[i:])
    140 		if sz == 0 {
    141 			i++
    142 		}
    143 		c := e.Class()
    144 		if c == bidi.R || c == bidi.AL || c == bidi.AN {
    145 			return bidi.RightToLeft
    146 		}
    147 		i += sz
    148 	}
    149 	return bidi.LeftToRight
    150 }
    151 
    152 // DirectionString reports the direction of the given label as defined by RFC
    153 // 5893. The Bidi Rule does not have to be applied to labels of the category
    154 // LeftToRight.
    155 func DirectionString(s string) bidi.Direction {
    156 	for i := 0; i < len(s); {
    157 		e, sz := bidi.LookupString(s[i:])
    158 		if sz == 0 {
    159 			i++
    160 			continue
    161 		}
    162 		c := e.Class()
    163 		if c == bidi.R || c == bidi.AL || c == bidi.AN {
    164 			return bidi.RightToLeft
    165 		}
    166 		i += sz
    167 	}
    168 	return bidi.LeftToRight
    169 }
    170 
    171 // Valid reports whether b conforms to the BiDi rule.
    172 func Valid(b []byte) bool {
    173 	var t Transformer
    174 	if n, ok := t.advance(b); !ok || n < len(b) {
    175 		return false
    176 	}
    177 	return t.isFinal()
    178 }
    179 
    180 // ValidString reports whether s conforms to the BiDi rule.
    181 func ValidString(s string) bool {
    182 	var t Transformer
    183 	if n, ok := t.advanceString(s); !ok || n < len(s) {
    184 		return false
    185 	}
    186 	return t.isFinal()
    187 }
    188 
    189 // New returns a Transformer that verifies that input adheres to the Bidi Rule.
    190 func New() *Transformer {
    191 	return &Transformer{}
    192 }
    193 
// Transformer implements transform.Transformer. It carries the state of the
// Bidi Rule check across calls; the zero value is ready to use and is what
// Reset restores.
type Transformer struct {
	state  ruleState // current state of the rule state machine
	hasRTL bool      // NOTE(review): not referenced by any code visible in this file — confirm it is still needed
	seen   uint16    // bit set (1 << class) of every Bidi class observed so far
}
    200 
    201 // A rule can only be violated for "Bidi Domain names", meaning if one of the
    202 // following categories has been observed.
    203 func (t *Transformer) isRTL() bool {
    204 	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
    205 	return t.seen&isRTL != 0
    206 }
    207 
    208 func (t *Transformer) isFinal() bool {
    209 	return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
    210 }
    211 
    212 // Reset implements transform.Transformer.
    213 func (t *Transformer) Reset() { *t = Transformer{} }
    214 
    215 // Transform implements transform.Transformer. This Transformer has state and
    216 // needs to be reset between uses.
    217 func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    218 	if len(dst) < len(src) {
    219 		src = src[:len(dst)]
    220 		atEOF = false
    221 		err = transform.ErrShortDst
    222 	}
    223 	n, err1 := t.Span(src, atEOF)
    224 	copy(dst, src[:n])
    225 	if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
    226 		err = err1
    227 	}
    228 	return n, n, err
    229 }
    230 
    231 // Span returns the first n bytes of src that conform to the Bidi rule.
    232 func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
    233 	if t.state == ruleInvalid && t.isRTL() {
    234 		return 0, ErrInvalid
    235 	}
    236 	n, ok := t.advance(src)
    237 	switch {
    238 	case !ok:
    239 		err = ErrInvalid
    240 	case n < len(src):
    241 		if !atEOF {
    242 			err = transform.ErrShortSrc
    243 			break
    244 		}
    245 		err = ErrInvalid
    246 	case !t.isFinal():
    247 		err = ErrInvalid
    248 	}
    249 	return n, err
    250 }
    251 
    252 // Precomputing the ASCII values decreases running time for the ASCII fast path
    253 // by about 30%.
    254 var asciiTable [128]bidi.Properties
    255 
    256 func init() {
    257 	for i := range asciiTable {
    258 		p, _ := bidi.LookupRune(rune(i))
    259 		asciiTable[i] = p
    260 	}
    261 }
    262 
    263 func (t *Transformer) advance(s []byte) (n int, ok bool) {
    264 	var e bidi.Properties
    265 	var sz int
    266 	for n < len(s) {
    267 		if s[n] < utf8.RuneSelf {
    268 			e, sz = asciiTable[s[n]], 1
    269 		} else {
    270 			e, sz = bidi.Lookup(s[n:])
    271 			if sz <= 1 {
    272 				if sz == 1 {
    273 					// We always consider invalid UTF-8 to be invalid, even if
    274 					// the string has not yet been determined to be RTL.
    275 					// TODO: is this correct?
    276 					return n, false
    277 				}
    278 				return n, true // incomplete UTF-8 encoding
    279 			}
    280 		}
    281 		// TODO: using CompactClass would result in noticeable speedup.
    282 		// See unicode/bidi/prop.go:Properties.CompactClass.
    283 		c := uint16(1 << e.Class())
    284 		t.seen |= c
    285 		if t.seen&exclusiveRTL == exclusiveRTL {
    286 			t.state = ruleInvalid
    287 			return n, false
    288 		}
    289 		switch tr := transitions[t.state]; {
    290 		case tr[0].mask&c != 0:
    291 			t.state = tr[0].next
    292 		case tr[1].mask&c != 0:
    293 			t.state = tr[1].next
    294 		default:
    295 			t.state = ruleInvalid
    296 			if t.isRTL() {
    297 				return n, false
    298 			}
    299 		}
    300 		n += sz
    301 	}
    302 	return n, true
    303 }
    304 
    305 func (t *Transformer) advanceString(s string) (n int, ok bool) {
    306 	var e bidi.Properties
    307 	var sz int
    308 	for n < len(s) {
    309 		if s[n] < utf8.RuneSelf {
    310 			e, sz = asciiTable[s[n]], 1
    311 		} else {
    312 			e, sz = bidi.LookupString(s[n:])
    313 			if sz <= 1 {
    314 				if sz == 1 {
    315 					return n, false // invalid UTF-8
    316 				}
    317 				return n, true // incomplete UTF-8 encoding
    318 			}
    319 		}
    320 		// TODO: using CompactClass results in noticeable speedup.
    321 		// See unicode/bidi/prop.go:Properties.CompactClass.
    322 		c := uint16(1 << e.Class())
    323 		t.seen |= c
    324 		if t.seen&exclusiveRTL == exclusiveRTL {
    325 			t.state = ruleInvalid
    326 			return n, false
    327 		}
    328 		switch tr := transitions[t.state]; {
    329 		case tr[0].mask&c != 0:
    330 			t.state = tr[0].next
    331 		case tr[1].mask&c != 0:
    332 			t.state = tr[1].next
    333 		default:
    334 			t.state = ruleInvalid
    335 			if t.isRTL() {
    336 				return n, false
    337 			}
    338 		}
    339 		n += sz
    340 	}
    341 	return n, true
    342 }
    343