Home | History | Annotate | Download | only in dist2
      1 #! /bin/sh
      2 
      3 # Script for testing regular expressions with perl to check that PCRE2 handles
      4 # them the same. If the first argument to this script is "-w", Perl is also
      5 # called with "-w", which turns on its warning mode.
      6 #
      7 # The Perl code has to have "use utf8" and "require Encode" at the start when
      8 # running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would
      9 # actually be OK for non-utf8-tests, but is not always installed, so this way
     10 # the script will always run for these tests.)
     11 #
     12 # The desired effect is achieved by making this a shell script that passes the
     13 # Perl script to Perl through a pipe. If the first argument (possibly after
     14 # removing "-w") is "-utf8", a suitable prefix is set up.
     15 #
     16 # The remaining arguments, if any, are passed to Perl. They are an input file
     17 # and an output file. If there is one argument, the output is written to
     18 # STDOUT. If Perl receives no arguments, it opens /dev/tty as input, and writes
     19 # output to STDOUT. (I haven't found a way of getting it to use STDIN, because
     20 # of the contorted piping input.)
     21 
     22 perl=perl
     23 perlarg=''
     24 prefix=''
     25 
     26 if [ $# -gt 0 -a "$1" = "-w" ] ; then
     27   perlarg="-w"
     28   shift
     29 fi
     30 
     31 if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
     32   prefix="use utf8; require Encode;"
     33   shift
     34 fi
     35 
     36 
     37 # The Perl script that follows has a similar specification to pcre2test, and so
     38 # can be given identical input, except that input patterns can be followed only
     39 # by Perl's lower case modifiers and certain other pcre2test modifiers that are
     40 # either handled or ignored:
     41 #
     42 #   aftertext          interpreted as "print $' afterwards"
     43 #   afteralltext       ignored
     44 #   dupnames           ignored (Perl always allows)
     45 #   jitstack           ignored
     46 #   mark               show mark information
     47 #   no_auto_possess    ignored
     48 #   no_start_optimize  insert (??{""}) at pattern start (disables optimizing)
     49 #  -no_start_optimize  ignored
     50 #   subject_literal    does not process subjects for escapes
     51 #   ucp                sets Perl's /u modifier
     52 #   utf                invoke UTF-8 functionality
     53 #
     54 # Comment lines are ignored. The #pattern command can be used to set modifiers
     55 # that will be added to each subsequent pattern, after any modifiers it may
     56 # already have. NOTE: this is different to pcre2test where #pattern sets
     57 # defaults which can be overridden on individual patterns. The #subject command
     58 # may be used to set or unset a default "mark" modifier for data lines. This is
     59 # the only use of #subject that is supported. The #perltest, #forbid_utf, and
     60 # #newline_default commands, which are needed in the relevant pcre2test files,
     61 # are ignored. Any other #-command is ignored, with a warning message.
     62 #
     63 # The data lines must not have any pcre2test modifiers. Unless
     64 # "subject_literal" is on the pattern, data lines are processed as
     65 # Perl double-quoted strings, so if they contain " $ or @ characters, these
     66 # have to be escaped. For this reason, all such characters in the
     67 # Perl-compatible testinput1 and testinput4 files are escaped so that they can
     68 # be used for perltest as well as for pcre2test. The output from this script
     69 # should be same as from pcre2test, apart from the initial identifying banner.
     70 #
     71 # The other testinput files are not suitable for feeding to perltest.sh,
     72 # because they make use of the special modifiers that pcre2test uses for
     73 # testing features of PCRE2. Some of these files also contain malformed regular
     74 # expressions, in order to check that PCRE2 diagnoses them correctly.
     75 
     76 (echo "$prefix" ; cat <<'PERLEND'
     77 
     78 # Function for turning a string into a string of printing chars.
     79 
     80 sub pchars {
     81 my($t) = "";
     82 if ($utf8)
     83   {
     84   @p = unpack('U*', $_[0]);
     85   foreach $c (@p)
     86     {
     87     if ($c >= 32 && $c < 127) { $t .= chr $c; }
     88       else { $t .= sprintf("\\x{%02x}", $c);
     89       }
     90     }
     91   }
     92 else
     93   {
     94   foreach $c (split(//, $_[0]))
     95     {
     96     if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
     97       else { $t .= sprintf("\\x%02x", ord $c); }
     98     }
     99   }
    100 $t;
    101 }
    102 
    103 
    104 # Read lines from a named file or stdin and write to a named file or stdout;
    105 # lines consist of a regular expression, in delimiters and optionally followed
    106 # by options, followed by a set of test data, terminated by an empty line.
    107 
    108 # Sort out the input and output files
    109 
    110 if (@ARGV > 0)
    111   {
    112   open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
    113   $infile = "INFILE";
    114   $interact = 0;
    115   }
    116 else
    117   {
    118   open(INFILE, "</dev/tty") || die "Failed to open /dev/tty\n";
    119   $infile = "INFILE";
    120   $interact = 1;
    121   }
    122 
    123 if (@ARGV > 1)
    124   {
    125   open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
    126   $outfile = "OUTFILE";
    127   }
    128 else { $outfile = "STDOUT"; }
    129 
    130 printf($outfile "Perl $] Regular Expressions\n\n");
    131 
    132 # Main loop
    133 
    134 NEXT_RE:
    135 for (;;)
    136   {
    137   printf "  re> " if $interact;
    138   last if ! ($_ = <$infile>);
    139   printf $outfile "$_" if ! $interact;
    140   next if ($_ =~ /^\s*$/ || $_ =~ /^#[\s!]/);
    141 
    142   # A few of pcre2test's #-commands are supported, or just ignored. Any others
    143   # cause an error.
    144 
    145   if ($_ =~ /^#pattern(.*)/)
    146     {
    147     $extra_modifiers = $1;
    148     chomp($extra_modifiers);
    149     $extra_modifiers =~ s/\s+$//;
    150     next;
    151     }
    152   elsif ($_ =~ /^#subject(.*)/)
    153     {
    154     $mod = $1;
    155     chomp($mod);
    156     $mod =~ s/\s+$//;
    157     if ($mod =~ s/(-?)mark,?//)
    158       {
    159       $minus = $1;
    160       $default_show_mark = ($minus =~ /^$/);
    161       }
    162     if ($mod !~ /^\s*$/)
    163       {
    164       printf $outfile "** Warning: \"$mod\" in #subject ignored\n";
    165       }
    166     next;
    167     }
    168   elsif ($_ =~ /^#/)
    169     {
    170     if ($_ !~ /^#newline_default|^#perltest|^#forbid_utf/)
    171       {
    172       printf $outfile "** Warning: #-command ignored: %s", $_;
    173       }
    174     next;
    175     }
    176 
    177   $pattern = $_;
    178 
    179   while ($pattern !~ /^\s*(.).*\1/s)
    180     {
    181     printf "    > " if $interact;
    182     last if ! ($_ = <$infile>);
    183     printf $outfile "$_" if ! $interact;
    184     $pattern .= $_;
    185     }
    186 
    187   chomp($pattern);
    188   $pattern =~ s/\s+$//;
    189 
    190   # Split the pattern from the modifiers and adjust them as necessary.
    191 
    192   $pattern =~ /^\s*((.).*\2)(.*)$/s;
    193   $pat = $1;
    194   $del = $2;
    195   $mod = "$3,$extra_modifiers";
    196   $mod =~ s/^,\s*//;
    197 
    198   # The private "aftertext" modifier means "print $' afterwards".
    199 
    200   $showrest = ($mod =~ s/aftertext,?//);
    201 
    202   # The "subject_literal" modifer disables escapes in subjects.
    203 
    204   $subject_literal = ($mod =~ s/subject_literal,?//);
    205 
    206   # "allaftertext" is used by pcre2test to print remainders after captures
    207 
    208   $mod =~ s/allaftertext,?//;
    209 
    210   # Detect utf
    211 
    212   $utf8 = $mod =~ s/utf,?//;
    213 
    214   # Remove "dupnames".
    215 
    216   $mod =~ s/dupnames,?//;
    217 
    218   # Remove "jitstack".
    219 
    220   $mod =~ s/jitstack=\d+,?//;
    221 
    222   # The "mark" modifier requests checking of MARK data */
    223 
    224   $show_mark = $default_show_mark | ($mod =~ s/mark,?//);
    225 
    226   # "ucp" asks pcre2test to set PCRE2_UCP; change this to /u for Perl
    227 
    228   $mod =~ s/ucp,?/u/;
    229 
    230   # Remove "no_auto_possess".
    231 
    232   $mod =~ s/no_auto_possess,?//;
    233 
    234   # Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
    235   # optimization by inserting (??{""}) at the start of the pattern. We may
    236   # also encounter -no_start_optimize from a #pattern setting.
    237 
    238   $mod =~ s/-no_start_optimize,?//;
    239   if ($mod =~ s/no_start_optimize,?//) { $pat =~ s/$del/$del(??{""})/; }
    240 
    241   # Add back retained modifiers and check that the pattern is valid.
    242 
    243   $mod =~ s/,//g;
    244   $pattern = "$pat$mod";
    245   eval "\$_ =~ ${pattern}";
    246   if ($@)
    247     {
    248     printf $outfile "Error: $@";
    249     if (! $interact)
    250       {
    251       for (;;)
    252         {
    253         last if ! ($_ = <$infile>);
    254         last if $_ =~ /^\s*$/;
    255         }
    256       }
    257     next NEXT_RE;
    258     }
    259 
    260   # If the /g modifier is present, we want to put a loop round the matching;
    261   # otherwise just a single "if".
    262 
    263   $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";
    264 
    265   # If the pattern is actually the null string, Perl uses the most recently
    266   # executed (and successfully compiled) regex is used instead. This is a
    267   # nasty trap for the unwary! The PCRE2 test suite does contain null strings
    268   # in places - if they are allowed through here all sorts of weird and
    269   # unexpected effects happen. To avoid this, we replace such patterns with
    270   # a non-null pattern that has the same effect.
    271 
    272   $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
    273 
    274   # Read data lines and test them
    275 
    276   for (;;)
    277     {
    278     printf "data> " if $interact;
    279     last NEXT_RE if ! ($_ = <$infile>);
    280     chomp;
    281     printf $outfile "%s", "$_\n" if ! $interact;
    282 
    283     s/\s+$//;  # Remove trailing space
    284     s/^\s+//;  # Remove leading space
    285 
    286     last if ($_ eq "");
    287     next if $_ =~ /^\\=(?:\s|$)/;   # Comment line
    288 
    289     if ($subject_literal)
    290       {
    291       $x = $_;
    292       }
    293     else
    294       {
    295       $x = eval "\"$_\"";   # To get escapes processed
    296       }
    297 
    298     # Empty array for holding results, ensure $REGERROR and $REGMARK are
    299     # unset, then do the matching.
    300 
    301     @subs = ();
    302 
    303     $pushes = "push \@subs,\$&;" .
    304          "push \@subs,\$1;" .
    305          "push \@subs,\$2;" .
    306          "push \@subs,\$3;" .
    307          "push \@subs,\$4;" .
    308          "push \@subs,\$5;" .
    309          "push \@subs,\$6;" .
    310          "push \@subs,\$7;" .
    311          "push \@subs,\$8;" .
    312          "push \@subs,\$9;" .
    313          "push \@subs,\$10;" .
    314          "push \@subs,\$11;" .
    315          "push \@subs,\$12;" .
    316          "push \@subs,\$13;" .
    317          "push \@subs,\$14;" .
    318          "push \@subs,\$15;" .
    319          "push \@subs,\$16;" .
    320          "push \@subs,\$'; }";
    321 
    322     undef $REGERROR;
    323     undef $REGMARK;
    324 
    325     eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
    326 
    327     if ($@)
    328       {
    329       printf $outfile "Error: $@\n";
    330       next NEXT_RE;
    331       }
    332     elsif (scalar(@subs) == 0)
    333       {
    334       printf $outfile "No match";
    335       if ($show_mark && defined $REGERROR && $REGERROR != 1)
    336         { printf $outfile (", mark = %s", &pchars($REGERROR)); }
    337       printf $outfile "\n";
    338       }
    339     else
    340       {
    341       while (scalar(@subs) != 0)
    342         {
    343         printf $outfile (" 0: %s\n", &pchars($subs[0]));
    344         printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
    345         $last_printed = 0;
    346         for ($i = 1; $i <= 16; $i++)
    347           {
    348           if (defined $subs[$i])
    349             {
    350             while ($last_printed++ < $i-1)
    351               { printf $outfile ("%2d: <unset>\n", $last_printed); }
    352             printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
    353             $last_printed = $i;
    354             }
    355           }
    356         splice(@subs, 0, 18);
    357         }
    358 
    359       # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
    360       # set and the input pattern was a UTF-8 string. We can, however, force
    361       # it to be so marked.
    362 
    363       if ($show_mark && defined $REGMARK && $REGMARK != 1)
    364         {
    365         $xx = $REGMARK;
    366         $xx = Encode::decode_utf8($xx) if $utf8;
    367         printf $outfile ("MK: %s\n", &pchars($xx));
    368         }
    369       }
    370     }
    371   }
    372 
    373 # printf $outfile "\n";
    374 
    375 PERLEND
    376 ) | $perl $perlarg - $@
    377 
    378 # End
    379