Home | History | Annotate | Download | only in patches
      1 Index: source/data/brkitr/word.txt
      2 ===================================================================
      3 --- source/data/brkitr/word.txt	(revision 259715)
      4 +++ source/data/brkitr/word.txt	(working copy)
      5 @@ -35,10 +35,16 @@
      6  $ALetter            = [\p{Word_Break = ALetter}];
      7  $Single_Quote       = [\p{Word_Break = Single_Quote}];
      8  $Double_Quote       = [\p{Word_Break = Double_Quote}];
      9 -$MidNumLet          = [\p{Word_Break = MidNumLet}];
     10 +# Move the two full stop characters (U+002E and U+FF0E) from $MidNumLet to
     11 +# $MidNum so that a hostname is broken into its components, at the cost of
     12 +# also breaking abbreviations such as 'e.g.' and 'i.e.'.
     13 +# $MidNumLet is used in rules 6/7 (the rules of interest) and in rules 11/12.
     14 +# Because it is OR'd with $MidNum in rules 11/12, those rules are unaffected,
     15 +# while rules 6/7 revert to the older behavior that we want.
     16 +$MidNumLet    = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
     17  $MidLetter          = [\p{Word_Break = MidLetter}];
     18 -$MidNum             = [\p{Word_Break = MidNum}];
     19 -$Numeric            = [\p{Word_Break = Numeric}];
     20 +$MidNum       = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
     21 +$Numeric      = [\p{Word_Break = Numeric}[\uff10-\uff19]]; # also includes fullwidth digits (U+FF10-U+FF19)
     22  $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
     23  
     24  $Han                = [:Han:];
     25 Index: source/data/brkitr/line.txt
     26 ===================================================================
     27 --- source/data/brkitr/line.txt	(revision 259715)
     28 +++ source/data/brkitr/line.txt	(working copy)
     29 @@ -12,9 +12,8 @@
     30  #         This is only because of a limitation of ICU break engine implementation,
     31  #         not because the older behavior is desirable.
     32  
     33 -#
     34 -#  Character Classes defined by TR 14.
     35 -#
     36 +# CHROME: 1. Apply the line_ja.txt small kana rules in all locales ($CJ moves from $NS to $ID).
     37 +#         2. Adjust AL, CL, ID, IS, and OP to handle comma variants and U+23B4/U+23B5 consistently.
     38  
     39  !!chain;
     40  !!LBCMNoChain;
     41 @@ -57,14 +56,14 @@
     42  #
     43  
     44  $AI = [:LineBreak =  Ambiguous:];
     45 -$AL = [:LineBreak =  Alphabetic:];
     46 +$AL = [[:LineBreak =  Alphabetic:] - [\u23B4\u23B5]];
     47  $BA = [:LineBreak =  Break_After:];
     48  $BB = [:LineBreak =  Break_Before:];
     49  $BK = [:LineBreak =  Mandatory_Break:];
     50  $B2 = [:LineBreak =  Break_Both:];
     51  $CB = [:LineBreak =  Contingent_Break:];
     52  $CJ = [:LineBreak =  Conditional_Japanese_Starter:];
     53 -$CL = [:LineBreak =  Close_Punctuation:];
     54 +$CL = [[:LineBreak =  Close_Punctuation:] [\uFE51\uFE10\u23B5]];
     55  $CM = [:LineBreak =  Combining_Mark:];
     56  $CP = [:LineBreak =  Close_Parenthesis:];
     57  $CR = [:LineBreak =  Carriage_Return:];
     58 @@ -74,16 +73,16 @@
     59  $HY = [:LineBreak =  Hyphen:];
     60  $H2 = [:LineBreak =  H2:];
     61  $H3 = [:LineBreak =  H3:];
     62 -$ID = [:LineBreak =  Ideographic:];
     63 +$ID = [[[:LineBreak =  Ideographic:] $CJ] - [\uFE51]];
     64  $IN = [:LineBreak =  Inseperable:];
     65 -$IS = [:LineBreak =  Infix_Numeric:];
     66 +$IS = [[:LineBreak =  Infix_Numeric:] - [\uFE10]];
     67  $JL = [:LineBreak =  JL:];
     68  $JV = [:LineBreak =  JV:];
     69  $JT = [:LineBreak =  JT:];
     70  $LF = [:LineBreak =  Line_Feed:];
     71  $NL = [:LineBreak =  Next_Line:];
     72 -$NS = [[:LineBreak =  Nonstarter:] $CJ];
     73 +$NS = [:LineBreak =  Nonstarter:];
     74  $NU = [:LineBreak =  Numeric:];
     75 -$OP = [:LineBreak =  Open_Punctuation:];
     76 +$OP = [[:LineBreak =  Open_Punctuation:] \u23B4];
     77  $PO = [:LineBreak =  Postfix_Numeric:];
     78  $PR = [:LineBreak =  Prefix_Numeric:];
     79