Home | History | Annotate | Download | only in dist2
      1 #! /bin/sh
      2 
      3 # Script for testing regular expressions with perl to check that PCRE2 handles
      4 # them the same. The Perl code has to have "use utf8" and "require Encode" at
      5 # the start when running UTF-8 tests, but *not* for non-utf8 tests. (The
      6 # "require" would actually be OK for non-utf8-tests, but is not always
      7 # installed, so this way the script will always run for these tests.)
      8 #
      9 # The desired effect is achieved by making this a shell script that passes the
     10 # Perl script to Perl through a pipe. If the first argument is "-utf8", a
     11 # suitable prefix is set up.
     12 #
     13 # The remaining arguments, if any, are passed to Perl. They are an input file
     14 # and an output file. If there is one argument, the output is written to
     15 # STDOUT. If Perl receives no arguments, it opens /dev/tty as input, and writes
     16 # output to STDOUT. (I haven't found a way of getting it to use STDIN, because
     17 # of the contorted piping input.)
     18 
     19 perl=perl
     20 prefix=''
     21 if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
     22   prefix="use utf8; require Encode;"
     23   shift
     24 fi
     25 
     26 
     27 # The Perl script that follows has a similar specification to pcre2test, and so
     28 # can be given identical input, except that input patterns can be followed only
     29 # by Perl's lower case modifiers and certain other pcre2test modifiers that are
     30 # either handled or ignored:
     31 #
     32 #   aftertext          interpreted as "print $' afterwards"
     33 #   afteralltext       ignored
     34 #   dupnames           ignored (Perl always allows)
     35 #   mark               ignored
     36 #   no_auto_possess    ignored
     37 #   no_start_optimize  ignored
     38 #   ucp                sets Perl's /u modifier
     39 #   utf                invoke UTF-8 functionality
     40 #
     41 # The data lines must not have any pcre2test modifiers. They are processed as
     42 # Perl double-quoted strings, so if they contain " $ or @ characters, these
     43 # have to be escaped. For this reason, all such characters in the
     44 # Perl-compatible testinput1 and testinput4 files are escaped so that they can
     45 # be used for perltest as well as for pcre2test. The output from this script
     46 # should be same as from pcre2test, apart from the initial identifying banner.
     47 #
     48 # The other testinput files are not suitable for feeding to perltest.sh,
     49 # because they make use of the special modifiers that pcre2test uses for
     50 # testing features of PCRE2. Some of these files also contain malformed regular
     51 # expressions, in order to check that PCRE2 diagnoses them correctly.
     52 
     53 (echo "$prefix" ; cat <<'PERLEND'
     54 
     55 # Function for turning a string into a string of printing chars.
     56 
     57 sub pchars {
     58 my($t) = "";
     59 if ($utf8)
     60   {
     61   @p = unpack('U*', $_[0]);
     62   foreach $c (@p)
     63     {
     64     if ($c >= 32 && $c < 127) { $t .= chr $c; }
     65       else { $t .= sprintf("\\x{%02x}", $c);
     66       }
     67     }
     68   }
     69 else
     70   {
     71   foreach $c (split(//, $_[0]))
     72     {
     73     if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
     74       else { $t .= sprintf("\\x%02x", ord $c); }
     75     }
     76   }
     77 $t;
     78 }
     79 
     80 
     81 # Read lines from a named file or stdin and write to a named file or stdout;
     82 # lines consist of a regular expression, in delimiters and optionally followed
     83 # by options, followed by a set of test data, terminated by an empty line.
     84 
     85 # Sort out the input and output files
     86 
     87 if (@ARGV > 0)
     88   {
     89   open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
     90   $infile = "INFILE";
     91   $interact = 0;
     92   }
     93 else
     94   {
     95   open(INFILE, "</dev/tty") || die "Failed to open /dev/tty\n";
     96   $infile = "INFILE";
     97   $interact = 1;
     98   }
     99 
    100 if (@ARGV > 1)
    101   {
    102   open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
    103   $outfile = "OUTFILE";
    104   }
    105 else { $outfile = "STDOUT"; }
    106 
    107 printf($outfile "Perl $] Regular Expressions\n\n");
    108 
    109 # Main loop
    110 
    111 NEXT_RE:
    112 for (;;)
    113   {
    114   printf "  re> " if $interact;
    115   last if ! ($_ = <$infile>);
    116   printf $outfile "$_" if ! $interact;
    117   next if ($_ =~ /^\s*$/ || $_ =~ /^#/);
    118 
    119   $pattern = $_;
    120 
    121   while ($pattern !~ /^\s*(.).*\1/s)
    122     {
    123     printf "    > " if $interact;
    124     last if ! ($_ = <$infile>);
    125     printf $outfile "$_" if ! $interact;
    126     $pattern .= $_;
    127     }
    128 
    129   chomp($pattern);
    130   $pattern =~ s/\s+$//;
    131 
    132   # Split the pattern from the modifiers and adjust them as necessary.
    133 
    134   $pattern =~ /^\s*((.).*\2)(.*)$/s;
    135   $pat = $1;
    136   $mod = $3;
    137 
    138   # The private "aftertext" modifier means "print $' afterwards".
    139 
    140   $showrest = ($mod =~ s/aftertext,?//);
    141 
    142   # "allaftertext" is used by pcre2test to print remainders after captures
    143 
    144   $mod =~ s/allaftertext,?//;
    145 
    146   # Detect utf
    147 
    148   $utf8 = $mod =~ s/utf,?//;
    149 
    150   # Remove "dupnames".
    151 
    152   $mod =~ s/dupnames,?//;
    153 
    154   # Remove "mark" (asks pcre2test to check MARK data) */
    155 
    156   $mod =~ s/mark,?//;
    157 
    158   # "ucp" asks pcre2test to set PCRE2_UCP; change this to /u for Perl
    159 
    160   $mod =~ s/ucp,?/u/;
    161 
    162   # Remove "no_auto_possess" and "no_start_optimize" (disable PCRE2 optimizations)
    163 
    164   $mod =~ s/no_auto_possess,?//;
    165   $mod =~ s/no_start_optimize,?//;
    166 
    167   # Add back retained modifiers and check that the pattern is valid.
    168 
    169   $mod =~ s/,//g;
    170   $pattern = "$pat$mod";
    171   eval "\$_ =~ ${pattern}";
    172   if ($@)
    173     {
    174     printf $outfile "Error: $@";
    175     if (! $interact)
    176       {
    177       for (;;)
    178         {
    179         last if ! ($_ = <$infile>);
    180         last if $_ =~ /^\s*$/;
    181         }
    182       }
    183     next NEXT_RE;
    184     }
    185 
    186   # If the /g modifier is present, we want to put a loop round the matching;
    187   # otherwise just a single "if".
    188 
    189   $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";
    190 
    191   # If the pattern is actually the null string, Perl uses the most recently
    192   # executed (and successfully compiled) regex is used instead. This is a
    193   # nasty trap for the unwary! The PCRE2 test suite does contain null strings
    194   # in places - if they are allowed through here all sorts of weird and
    195   # unexpected effects happen. To avoid this, we replace such patterns with
    196   # a non-null pattern that has the same effect.
    197 
    198   $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
    199 
    200   # Read data lines and test them
    201 
    202   for (;;)
    203     {
    204     printf "data> " if $interact;
    205     last NEXT_RE if ! ($_ = <$infile>);
    206     chomp;
    207     printf $outfile "%s", "$_\n" if ! $interact;
    208 
    209     s/\s+$//;  # Remove trailing space
    210     s/^\s+//;  # Remove leading space
    211 
    212     last if ($_ eq "");
    213     next if $_ =~ /^\\=(?:\s|$)/;   # Comment line
    214 
    215     $x = eval "\"$_\"";   # To get escapes processed
    216 
    217     # Empty array for holding results, ensure $REGERROR and $REGMARK are
    218     # unset, then do the matching.
    219 
    220     @subs = ();
    221 
    222     $pushes = "push \@subs,\$&;" .
    223          "push \@subs,\$1;" .
    224          "push \@subs,\$2;" .
    225          "push \@subs,\$3;" .
    226          "push \@subs,\$4;" .
    227          "push \@subs,\$5;" .
    228          "push \@subs,\$6;" .
    229          "push \@subs,\$7;" .
    230          "push \@subs,\$8;" .
    231          "push \@subs,\$9;" .
    232          "push \@subs,\$10;" .
    233          "push \@subs,\$11;" .
    234          "push \@subs,\$12;" .
    235          "push \@subs,\$13;" .
    236          "push \@subs,\$14;" .
    237          "push \@subs,\$15;" .
    238          "push \@subs,\$16;" .
    239          "push \@subs,\$'; }";
    240 
    241     undef $REGERROR;
    242     undef $REGMARK;
    243 
    244     eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
    245 
    246     if ($@)
    247       {
    248       printf $outfile "Error: $@\n";
    249       next NEXT_RE;
    250       }
    251     elsif (scalar(@subs) == 0)
    252       {
    253       printf $outfile "No match";
    254       if (defined $REGERROR && $REGERROR != 1)
    255         { printf $outfile (", mark = %s", &pchars($REGERROR)); }
    256       printf $outfile "\n";
    257       }
    258     else
    259       {
    260       while (scalar(@subs) != 0)
    261         {
    262         printf $outfile (" 0: %s\n", &pchars($subs[0]));
    263         printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
    264         $last_printed = 0;
    265         for ($i = 1; $i <= 16; $i++)
    266           {
    267           if (defined $subs[$i])
    268             {
    269             while ($last_printed++ < $i-1)
    270               { printf $outfile ("%2d: <unset>\n", $last_printed); }
    271             printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
    272             $last_printed = $i;
    273             }
    274           }
    275         splice(@subs, 0, 18);
    276         }
    277 
    278       # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
    279       # set and the input pattern was a UTF-8 string. We can, however, force
    280       # it to be so marked.
    281 
    282       if (defined $REGMARK && $REGMARK != 1)
    283         {
    284         $xx = $REGMARK;
    285         $xx = Encode::decode_utf8($xx) if $utf8;
    286         printf $outfile ("MK: %s\n", &pchars($xx));
    287         }
    288       }
    289     }
    290   }
    291 
    292 # printf $outfile "\n";
    293 
    294 PERLEND
    295 ) | $perl - $@
    296 
    297 # End
    298