Home | History | Annotate | Download | only in translit
      1 # Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html#License
      2 #
      3 # Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.
      4 # Regex for recognizing RFC 4646 well-formed tags
      5 # http://www.rfc-editor.org/rfc/rfc4646.txt
      6 # http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21
      7 
      8 # The structure requires no forward references, so it reverses the order.
      9 # It uses Java/Perl syntax instead of the old ABNF
     10 # The uppercase comments are fragments copied from RFC 4646
     11 
     12 # Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.
     13 
     14 $alpha  = [a-z] ;   # ALPHA -- lowercase only, uppercase is covered by the (?i) flag in $root
     15 $digit  = [0-9] ;   # DIGIT
     16 $alphanum   = [a-z 0-9] ;   # ALPHA / DIGIT
     17 $x  = x ;   # private use singleton -- the prefix of $privateUse
     18 $singleton = [a-w y-z 0-9] ; # other singleton -- any alphanum except x (RFC 4646 singleton includes DIGIT)
     19 $s  = [-_] ; # separator between subtags -- this lenient form accepts [-_] -- a strict parser would use [-] only
     20 
     21 # Now do the components. The structure is slightly different to allow for capturing the right components.
     22 # The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.
     23 
     24 $language   = $alpha{2,8} | $alpha{2,3} $s $alpha{3};   # 2*8ALPHA, or 2*3ALPHA followed by one 3ALPHA extlang
     25             
     26    # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA  --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8} 
     27    # We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).
     28 
     29 # Note: extlang invalid in Unicode language tags
     30 
     31 $script = $alpha{4} ;   # script subtag: 4ALPHA
     32 
     33 $region = $alpha{2} | $digit{3} ;    # region subtag: 2ALPHA / 3DIGIT
     34 
     35 $variant    = (?: $alphanum{5,8} | $digit $alphanum{3} ) ;  # variant subtag: 5*8alphanum / (DIGIT 3alphanum)
     36 
     37 $extension  = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum)) -- the "-" is widened to $s here
     38 
     39 $privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum)) -- the "-" is widened to $s here
     40 
     41 # Define certain grandfathered codes, since otherwise the regex is pretty useless.
     42 # Since these are limited, this is safe even with later changes to the registry --
     43 # the only oddity is that it might change the type of the tag, and thus
     44 # the results from the capturing groups.
     45 # http://www.iana.org/assignments/language-subtag-registry
     46 # Note that these have to be compared case insensitively, requiring (?i) below.
     47 
     48 $grandfathered  = en $s GB $s oed    # en-GB-oed
     49       | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )    # i-ami through i-tsu
     50       | no $s (?: bok | nyn )    # no-bok, no-nyn
     51       | sgn $s (?: BE $s (?: fr | nl) | CH $s de )    # sgn-BE-fr, sgn-BE-nl, sgn-CH-de
     52       | zh $s min $s nan;    # zh-min-nan
     53 
     54 # old:         | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
     55 # For well-formedness, we don't need the ones that would otherwise pass.
     56 # For validity, they need to be checked.
     57 
     58 # $grandfatheredWellFormed = (?:
     59 #         art $s lojban
     60 #     | cel $s gaulish
     61 #     | zh $s (?: guoyu | hakka | xiang )
     62 # );
     63 
     64 # Unicode locales: but we are shifting to a compatible form
     65 # $keyvalue = (?: $alphanum+ \= $alphanum+);
     66 # $keywords = ($keyvalue (?: \; $keyvalue)*);
     67 
     68 # We separate items that we want to capture as a single group
     69 
     70 $variantList   = $variant (?: $s $variant )* ; # one or more variants joined by $s, captured as one group in $langtag
     71 $extensionList = $extension (?: $s $extension )* ;   # one or more extensions joined by $s, captured as one group in $langtag
     72 
     73 $langtag = (?: ( $language )    # captures, in order: language, script, region, variant list, extension list, private use
     74       (?: $s ( $script ) )? 40%
     75       (?: $s ( $region ) )? 40%
     76       (?: $s ( $variantList ) )? 10%
     77       (?: $s ( $extensionList ) )? 5%
     78       (?: $s ( $privateUse ) )? 5%);
     79 
     80 # Here is the final breakdown, with capturing groups for each of these components
     81 # The variants, extensions, grandfathered, and private-use may have interior '-'
     82  
     83 $root = (?i) # case-insensitive -- the character classes are lowercase-only and $grandfathered mixes cases
     84   (?:    # NOTE(review): the N% suffixes on the alternatives look like weights consumed by the tool -- confirm
     85       $langtag 90%
     86     | ( $privateUse ) 5%
     87     | ( $grandfathered ) 5%)
     88 #    (?: \@ $keywords )? 5%
     89     ;
     90