Home | History | Annotate | Download | only in bidi
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package bidi
      6 
      7 import (
      8 	"container/list"
      9 	"fmt"
     10 	"sort"
     11 )
     12 
     13 // This file contains a port of the reference implementation of the
     14 // Bidi Parentheses Algorithm:
     15 // http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
     16 //
     17 // The implementation in this file covers definitions BD14-BD16 and rule N0
     18 // of UAX#9.
     19 //
     20 // Some preprocessing is done for each rune before data is passed to this
     21 // algorithm:
     22 //  - opening and closing brackets are identified
     23 //  - a bracket pair type, like '(' and ')' is assigned a unique identifier that
     24 //    is identical for the opening and closing bracket. It is left to do these
     25 //    mappings.
     26 //  - The BPA algorithm requires that bracket characters that are canonical
     27 //    equivalents of each other be able to be substituted for each other.
     28 //    It is the responsibility of the caller to do this canonicalization.
     29 //
     30 // In implementing BD16, this implementation departs slightly from the "logical"
     31 // algorithm defined in UAX#9. In particular, the stack referenced there
     32 // supports operations that go beyond a "basic" stack. An equivalent
     33 // implementation based on a linked list is used here.
     34 
     35 // Bidi_Paired_Bracket_Type
     36 // BD14. An opening paired bracket is a character whose
     37 // Bidi_Paired_Bracket_Type property value is Open.
     38 //
     39 // BD15. A closing paired bracket is a character whose
     40 // Bidi_Paired_Bracket_Type property value is Close.
     41 type bracketType byte
     42 
     43 const (
     44 	bpNone bracketType = iota
     45 	bpOpen
     46 	bpClose
     47 )
     48 
     49 // bracketPair holds a pair of index values for opening and closing bracket
     50 // location of a bracket pair.
     51 type bracketPair struct {
     52 	opener int
     53 	closer int
     54 }
     55 
     56 func (b *bracketPair) String() string {
     57 	return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
     58 }
     59 
     60 // bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
     61 type bracketPairs []bracketPair
     62 
     63 func (b bracketPairs) Len() int           { return len(b) }
     64 func (b bracketPairs) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
     65 func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
     66 
     67 // resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
     68 //
     69 // For each rune, it takes the indexes into the original string, the class the
     70 // bracket type (in pairTypes) and the bracket identifier (pairValues). It also
     71 // takes the direction type for the start-of-sentence and the embedding level.
     72 //
     73 // The identifiers for bracket types are the rune of the canonicalized opening
     74 // bracket for brackets (open or close) or 0 for runes that are not brackets.
     75 func resolvePairedBrackets(s *isolatingRunSequence) {
     76 	p := bracketPairer{
     77 		sos:              s.sos,
     78 		openers:          list.New(),
     79 		codesIsolatedRun: s.types,
     80 		indexes:          s.indexes,
     81 	}
     82 	dirEmbed := L
     83 	if s.level&1 != 0 {
     84 		dirEmbed = R
     85 	}
     86 	p.locateBrackets(s.p.pairTypes, s.p.pairValues)
     87 	p.resolveBrackets(dirEmbed, s.p.initialTypes)
     88 }
     89 
     90 type bracketPairer struct {
     91 	sos Class // direction corresponding to start of sequence
     92 
     93 	// The following is a restatement of BD 16 using non-algorithmic language.
     94 	//
     95 	// A bracket pair is a pair of characters consisting of an opening
     96 	// paired bracket and a closing paired bracket such that the
     97 	// Bidi_Paired_Bracket property value of the former equals the latter,
     98 	// subject to the following constraints.
     99 	// - both characters of a pair occur in the same isolating run sequence
    100 	// - the closing character of a pair follows the opening character
    101 	// - any bracket character can belong at most to one pair, the earliest possible one
    102 	// - any bracket character not part of a pair is treated like an ordinary character
    103 	// - pairs may nest properly, but their spans may not overlap otherwise
    104 
    105 	// Bracket characters with canonical decompositions are supposed to be
    106 	// treated as if they had been normalized, to allow normalized and non-
    107 	// normalized text to give the same result. In this implementation that step
    108 	// is pushed out to the caller. The caller has to ensure that the pairValue
    109 	// slices contain the rune of the opening bracket after normalization for
    110 	// any opening or closing bracket.
    111 
    112 	openers *list.List // list of positions for opening brackets
    113 
    114 	// bracket pair positions sorted by location of opening bracket
    115 	pairPositions bracketPairs
    116 
    117 	codesIsolatedRun []Class // directional bidi codes for an isolated run
    118 	indexes          []int   // array of index values into the original string
    119 
    120 }
    121 
    122 // matchOpener reports whether characters at given positions form a matching
    123 // bracket pair.
    124 func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
    125 	return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
    126 }
    127 
    128 const maxPairingDepth = 63
    129 
    130 // locateBrackets locates matching bracket pairs according to BD16.
    131 //
    132 // This implementation uses a linked list instead of a stack, because, while
    133 // elements are added at the front (like a push) they are not generally removed
    134 // in atomic 'pop' operations, reducing the benefit of the stack archetype.
    135 func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
    136 	// traverse the run
    137 	// do that explicitly (not in a for-each) so we can record position
    138 	for i, index := range p.indexes {
    139 
    140 		// look at the bracket type for each character
    141 		if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
    142 			// continue scanning
    143 			continue
    144 		}
    145 		switch pairTypes[index] {
    146 		case bpOpen:
    147 			// check if maximum pairing depth reached
    148 			if p.openers.Len() == maxPairingDepth {
    149 				p.openers.Init()
    150 				return
    151 			}
    152 			// remember opener location, most recent first
    153 			p.openers.PushFront(i)
    154 
    155 		case bpClose:
    156 			// see if there is a match
    157 			count := 0
    158 			for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
    159 				count++
    160 				opener := elem.Value.(int)
    161 				if p.matchOpener(pairValues, opener, i) {
    162 					// if the opener matches, add nested pair to the ordered list
    163 					p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
    164 					// remove up to and including matched opener
    165 					for ; count > 0; count-- {
    166 						p.openers.Remove(p.openers.Front())
    167 					}
    168 					break
    169 				}
    170 			}
    171 			sort.Sort(p.pairPositions)
    172 			// if we get here, the closing bracket matched no openers
    173 			// and gets ignored
    174 		}
    175 	}
    176 }
    177 
    178 // Bracket pairs within an isolating run sequence are processed as units so
    179 // that both the opening and the closing paired bracket in a pair resolve to
    180 // the same direction.
    181 //
    182 // N0. Process bracket pairs in an isolating run sequence sequentially in
    183 // the logical order of the text positions of the opening paired brackets
    184 // using the logic given below. Within this scope, bidirectional types EN
    185 // and AN are treated as R.
    186 //
    187 // Identify the bracket pairs in the current isolating run sequence
    188 // according to BD16. For each bracket-pair element in the list of pairs of
    189 // text positions:
    190 //
    191 // a Inspect the bidirectional types of the characters enclosed within the
    192 // bracket pair.
    193 //
    194 // b If any strong type (either L or R) matching the embedding direction is
    195 // found, set the type for both brackets in the pair to match the embedding
    196 // direction.
    197 //
    198 // o [ e ] o -> o e e e o
    199 //
    200 // o [ o e ] -> o e o e e
    201 //
    202 // o [ NI e ] -> o e NI e e
    203 //
    204 // c Otherwise, if a strong type (opposite the embedding direction) is
    205 // found, test for adjacent strong types as follows: 1 First, check
    206 // backwards before the opening paired bracket until the first strong type
    207 // (L, R, or sos) is found. If that first preceding strong type is opposite
    208 // the embedding direction, then set the type for both brackets in the pair
    209 // to that type. 2 Otherwise, set the type for both brackets in the pair to
    210 // the embedding direction.
    211 //
    212 // o [ o ] e -> o o o o e
    213 //
    214 // o [ o NI ] o -> o o o NI o o
    215 //
    216 // e [ o ] o -> e e o e o
    217 //
    218 // e [ o ] e -> e e o e e
    219 //
    220 // e ( o [ o ] NI ) e -> e e o o o o NI e e
    221 //
    222 // d Otherwise, do not set the type for the current bracket pair. Note that
    223 // if the enclosed text contains no strong types the paired brackets will
    224 // both resolve to the same level when resolved individually using rules N1
    225 // and N2.
    226 //
    227 // e ( NI ) o -> e ( NI ) o
    228 
    229 // getStrongTypeN0 maps character's directional code to strong type as required
    230 // by rule N0.
    231 //
    232 // TODO: have separate type for "strong" directionality.
    233 func (p *bracketPairer) getStrongTypeN0(index int) Class {
    234 	switch p.codesIsolatedRun[index] {
    235 	// in the scope of N0, number types are treated as R
    236 	case EN, AN, AL, R:
    237 		return R
    238 	case L:
    239 		return L
    240 	default:
    241 		return ON
    242 	}
    243 }
    244 
    245 // classifyPairContent reports the strong types contained inside a Bracket Pair,
    246 // assuming the given embedding direction.
    247 //
    248 // It returns ON if no strong type is found. If a single strong type is found,
    249 // it returns this this type. Otherwise it returns the embedding direction.
    250 //
    251 // TODO: use separate type for "strong" directionality.
    252 func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
    253 	dirOpposite := ON
    254 	for i := loc.opener + 1; i < loc.closer; i++ {
    255 		dir := p.getStrongTypeN0(i)
    256 		if dir == ON {
    257 			continue
    258 		}
    259 		if dir == dirEmbed {
    260 			return dir // type matching embedding direction found
    261 		}
    262 		dirOpposite = dir
    263 	}
    264 	// return ON if no strong type found, or class opposite to dirEmbed
    265 	return dirOpposite
    266 }
    267 
    268 // classBeforePair determines which strong types are present before a Bracket
    269 // Pair. Return R or L if strong type found, otherwise ON.
    270 func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
    271 	for i := loc.opener - 1; i >= 0; i-- {
    272 		if dir := p.getStrongTypeN0(i); dir != ON {
    273 			return dir
    274 		}
    275 	}
    276 	// no strong types found, return sos
    277 	return p.sos
    278 }
    279 
    280 // assignBracketType implements rule N0 for a single bracket pair.
    281 func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
    282 	// rule "N0, a", inspect contents of pair
    283 	dirPair := p.classifyPairContent(loc, dirEmbed)
    284 
    285 	// dirPair is now L, R, or N (no strong type found)
    286 
    287 	// the following logical tests are performed out of order compared to
    288 	// the statement of the rules but yield the same results
    289 	if dirPair == ON {
    290 		return // case "d" - nothing to do
    291 	}
    292 
    293 	if dirPair != dirEmbed {
    294 		// case "c": strong type found, opposite - check before (c.1)
    295 		dirPair = p.classBeforePair(loc)
    296 		if dirPair == dirEmbed || dirPair == ON {
    297 			// no strong opposite type found before - use embedding (c.2)
    298 			dirPair = dirEmbed
    299 		}
    300 	}
    301 	// else: case "b", strong type found matching embedding,
    302 	// no explicit action needed, as dirPair is already set to embedding
    303 	// direction
    304 
    305 	// set the bracket types to the type found
    306 	p.setBracketsToType(loc, dirPair, initialTypes)
    307 }
    308 
    309 func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
    310 	p.codesIsolatedRun[loc.opener] = dirPair
    311 	p.codesIsolatedRun[loc.closer] = dirPair
    312 
    313 	for i := loc.opener + 1; i < loc.closer; i++ {
    314 		index := p.indexes[i]
    315 		if initialTypes[index] != NSM {
    316 			break
    317 		}
    318 		p.codesIsolatedRun[i] = dirPair
    319 	}
    320 
    321 	for i := loc.closer + 1; i < len(p.indexes); i++ {
    322 		index := p.indexes[i]
    323 		if initialTypes[index] != NSM {
    324 			break
    325 		}
    326 		p.codesIsolatedRun[i] = dirPair
    327 	}
    328 }
    329 
    330 // resolveBrackets implements rule N0 for a list of pairs.
    331 func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
    332 	for _, loc := range p.pairPositions {
    333 		p.assignBracketType(loc, dirEmbed, initialTypes)
    334 	}
    335 }
    336