Home | History | Annotate | Download | only in dist
      1 #! /usr/bin/env perl
      2 
      3 # Program for testing regular expressions with perl to check that PCRE handles
      4 # them the same. This version needs to have "use utf8" at the start for running
      5 # the UTF-8 tests, but *not* for the other tests. The only way I've found for
      6 # doing this is to cat this line in explicitly in the RunPerlTest script. I've
      7 # also used this method to supply "require Encode" for the UTF-8 tests, so that
      8 # the main test will still run where Encode is not installed.
      9 
     10 #use utf8;
     11 #require Encode;
     12 
     13 # Function for turning a string into a string of printing chars.
     14 
     15 sub pchars {
     16 my($t) = "";
     17 
     18 if ($utf8)
     19   {
     20   @p = unpack('U*', $_[0]);
     21   foreach $c (@p)
     22     {
     23     if ($c >= 32 && $c < 127) { $t .= chr $c; }
     24       else { $t .= sprintf("\\x{%02x}", $c);
     25       }
     26     }
     27   }
     28 else
     29   {
     30   foreach $c (split(//, $_[0]))
     31     {
     32     if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
     33       else { $t .= sprintf("\\x%02x", ord $c); }
     34     }
     35   }
     36 
     37 $t;
     38 }
     39 
     40 
     41 # Read lines from named file or stdin and write to named file or stdout; lines
     42 # consist of a regular expression, in delimiters and optionally followed by
     43 # options, followed by a set of test data, terminated by an empty line.
     44 
     45 # Sort out the input and output files
     46 
     47 if (@ARGV > 0)
     48   {
     49   open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
     50   $infile = "INFILE";
     51   }
     52 else { $infile = "STDIN"; }
     53 
     54 if (@ARGV > 1)
     55   {
     56   open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
     57   $outfile = "OUTFILE";
     58   }
     59 else { $outfile = "STDOUT"; }
     60 
     61 printf($outfile "Perl $] Regular Expressions\n\n");
     62 
     63 # Main loop
     64 
     65 NEXT_RE:
     66 for (;;)
     67   {
     68   printf "  re> " if $infile eq "STDIN";
     69   last if ! ($_ = <$infile>);
     70   printf $outfile "$_" if $infile ne "STDIN";
     71   next if ($_ =~ /^\s*$/ || $_ =~ /^< forbid/);
     72 
     73   $pattern = $_;
     74 
     75   while ($pattern !~ /^\s*(.).*\1/s)
     76     {
     77     printf "    > " if $infile eq "STDIN";
     78     last if ! ($_ = <$infile>);
     79     printf $outfile "$_" if $infile ne "STDIN";
     80     $pattern .= $_;
     81     }
     82 
     83   chomp($pattern);
     84   $pattern =~ s/\s+$//;
     85 
     86   # The private /+ modifier means "print $' afterwards".
     87 
     88   $showrest = ($pattern =~ s/\+(?=[a-zA-Z]*$)//);
     89 
     90   # A doubled version is used by pcretest to print remainders after captures
     91 
     92   $pattern =~ s/\+(?=[a-zA-Z]*$)//;
     93 
     94   # Remove /8 from a UTF-8 pattern.
     95 
     96   $utf8 = $pattern =~ s/8(?=[a-zA-Z]*$)//;
     97 
     98   # Remove /J from a pattern with duplicate names.
     99 
    100   $pattern =~ s/J(?=[a-zA-Z]*$)//;
    101 
    102   # Remove /K from a pattern (asks pcretest to check MARK data) */
    103 
    104   $pattern =~ s/K(?=[a-zA-Z]*$)//;
    105 
    106   # /W asks pcretest to set PCRE_UCP; change this to /u for Perl
    107 
    108   $pattern =~ s/W(?=[a-zA-Z]*$)/u/;
    109 
    110   # Remove /S or /SS from a pattern (asks pcretest to study or not to study)
    111 
    112   $pattern =~ s/S(?=[a-zA-Z]*$)//g;
    113 
    114   # Remove /Y and /O from a pattern (disable PCRE optimizations)
    115 
    116   $pattern =~ s/[YO](?=[a-zA-Z]*$)//;
    117 
    118   # Check that the pattern is valid
    119 
    120   eval "\$_ =~ ${pattern}";
    121   if ($@)
    122     {
    123     printf $outfile "Error: $@";
    124     if ($infile != "STDIN")
    125       {
    126       for (;;)
    127         {
    128         last if ! ($_ = <$infile>);
    129         last if $_ =~ /^\s*$/;
    130         }
    131       }
    132     next NEXT_RE;
    133     }
    134 
    135   # If the /g modifier is present, we want to put a loop round the matching;
    136   # otherwise just a single "if".
    137 
    138   $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";
    139 
    140   # If the pattern is actually the null string, Perl uses the most recently
    141   # executed (and successfully compiled) regex is used instead. This is a
    142   # nasty trap for the unwary! The PCRE test suite does contain null strings
    143   # in places - if they are allowed through here all sorts of weird and
    144   # unexpected effects happen. To avoid this, we replace such patterns with
    145   # a non-null pattern that has the same effect.
    146 
    147   $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
    148 
    149   # Read data lines and test them
    150 
    151   for (;;)
    152     {
    153     printf "data> " if $infile eq "STDIN";
    154     last NEXT_RE if ! ($_ = <$infile>);
    155     chomp;
    156     printf $outfile "$_\n" if $infile ne "STDIN";
    157 
    158     s/\s+$//;  # Remove trailing space
    159     s/^\s+//;  # Remove leading space
    160     s/\\Y//g;  # Remove \Y (pcretest flag to set PCRE_NO_START_OPTIMIZE)
    161 
    162     last if ($_ eq "");
    163     $x = eval "\"$_\"";   # To get escapes processed
    164 
    165     # Empty array for holding results, ensure $REGERROR and $REGMARK are
    166     # unset, then do the matching.
    167 
    168     @subs = ();
    169 
    170     $pushes = "push \@subs,\$&;" .
    171          "push \@subs,\$1;" .
    172          "push \@subs,\$2;" .
    173          "push \@subs,\$3;" .
    174          "push \@subs,\$4;" .
    175          "push \@subs,\$5;" .
    176          "push \@subs,\$6;" .
    177          "push \@subs,\$7;" .
    178          "push \@subs,\$8;" .
    179          "push \@subs,\$9;" .
    180          "push \@subs,\$10;" .
    181          "push \@subs,\$11;" .
    182          "push \@subs,\$12;" .
    183          "push \@subs,\$13;" .
    184          "push \@subs,\$14;" .
    185          "push \@subs,\$15;" .
    186          "push \@subs,\$16;" .
    187          "push \@subs,\$'; }";
    188 
    189     undef $REGERROR;
    190     undef $REGMARK;
    191 
    192     eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
    193 
    194     if ($@)
    195       {
    196       printf $outfile "Error: $@\n";
    197       next NEXT_RE;
    198       }
    199     elsif (scalar(@subs) == 0)
    200       {
    201       printf $outfile "No match";
    202       if (defined $REGERROR && $REGERROR != 1)
    203         { printf $outfile (", mark = %s", &pchars($REGERROR)); }
    204       printf $outfile "\n";
    205       }
    206     else
    207       {
    208       while (scalar(@subs) != 0)
    209         {
    210         printf $outfile (" 0: %s\n", &pchars($subs[0]));
    211         printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
    212         $last_printed = 0;
    213         for ($i = 1; $i <= 16; $i++)
    214           {
    215           if (defined $subs[$i])
    216             {
    217             while ($last_printed++ < $i-1)
    218               { printf $outfile ("%2d: <unset>\n", $last_printed); }
    219             printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
    220             $last_printed = $i;
    221             }
    222           }
    223         splice(@subs, 0, 18);
    224         }
    225 
    226       # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
    227       # set and the input pattern was a UTF-8 string. We can, however, force
    228       # it to be so marked.
    229 
    230       if (defined $REGMARK && $REGMARK != 1)
    231         {
    232         $xx = $REGMARK;
    233         $xx = Encode::decode_utf8($xx) if $utf8;
    234         printf $outfile ("MK: %s\n", &pchars($xx));
    235         }
    236       }
    237     }
    238   }
    239 
    240 # printf $outfile "\n";
    241 
    242 # End
    243