Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10           New API code Copyright (c) 2016-2018 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 /* This module contains an internal function that is used to match a Unicode
     42 extended grapheme sequence. It is used by both pcre2_match() and
     43 pcre2_dfa_match(). However, it is called only when Unicode support is being
     44 compiled. Nevertheless, we provide a dummy function when there is no Unicode
     45 support, because some compilers do not like functionless source files. */
     46 
     47 
     48 #ifdef HAVE_CONFIG_H
     49 #include "config.h"
     50 #endif
     51 
     52 
     53 #include "pcre2_internal.h"
     54 
     55 
     56 /* Dummy function */
     57 
     58 #ifndef SUPPORT_UNICODE
     59 PCRE2_SPTR
     60 PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
     61   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
     62 {
     63 (void)c;
     64 (void)eptr;
     65 (void)start_subject;
     66 (void)end_subject;
     67 (void)utf;
     68 (void)xcount;
     69 return NULL;
     70 }
     71 #else
     72 
     73 
     74 /*************************************************
     75 *      Match an extended grapheme sequence       *
     76 *************************************************/
     77 
     78 /*
     79 Arguments:
     80   c              the first character
     81   eptr           pointer to next character
     82   start_subject  pointer to start of subject
     83   end_subject    pointer to end of subject
     84   utf            TRUE if in UTF mode
     85   xcount         pointer to count of additional characters,
     86                    or NULL if count not needed
     87 
     88 Returns:         pointer after the end of the sequence
     89 */
     90 
     91 PCRE2_SPTR
     92 PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
     93   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
     94 {
     95 int lgb = UCD_GRAPHBREAK(c);
     96 
     97 while (eptr < end_subject)
     98   {
     99   int rgb;
    100   int len = 1;
    101   if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
    102   rgb = UCD_GRAPHBREAK(c);
    103   if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
    104 
    105   /* Not breaking between Regional Indicators is allowed only if there
    106   are an even number of preceding RIs. */
    107 
    108   if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
    109     {
    110     int ricount = 0;
    111     PCRE2_SPTR bptr = eptr - 1;
    112     if (utf) BACKCHAR(bptr);
    113 
    114     /* bptr is pointing to the left-hand character */
    115 
    116     while (bptr > start_subject)
    117       {
    118       bptr--;
    119       if (utf)
    120         {
    121         BACKCHAR(bptr);
    122         GETCHAR(c, bptr);
    123         }
    124       else
    125       c = *bptr;
    126       if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
    127       ricount++;
    128       }
    129     if ((ricount & 1) != 0) break;  /* Grapheme break required */
    130     }
    131 
    132   /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
    133   allows any number of them before a following Extended_Pictographic. */
    134 
    135   if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
    136        lgb != ucp_gbExtended_Pictographic)
    137     lgb = rgb;
    138 
    139   eptr += len;
    140   if (xcount != NULL) *xcount += 1;
    141   }
    142 
    143 return eptr;
    144 }
    145 
    146 #endif  /* SUPPORT_UNICODE */
    147 
    148 /* End of pcre2_extuni.c */
    149