Home | History | Annotate | Download | only in dist2
      1 #! /usr/bin/perl -w
      2 
      3 # Script to turn PCRE2 man pages into HTML
      4 
      5 
      6 # Subroutine to handle font changes and other escapes
      7 
      8 sub do_line {
      9 my($s) = $_[0];
     10 
     11 $s =~ s/</&#60;/g;                   # Deal with < and >
     12 $s =~ s/>/&#62;/g;
     13 $s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
     14 $s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;
     15 $s =~ s"\\e"\\"g;
     16 $s =~ s/(?<=Copyright )\(c\)/&copy;/g;
     17 $s;
     18 }
     19 
     20 # Subroutine to ensure not in a paragraph
     21 
     22 sub end_para {
     23 if ($inpara)
     24   {
     25   print TEMP "</PRE>\n" if ($inpre);
     26   print TEMP "</P>\n";
     27   }
     28 $inpara = $inpre = 0;
     29 $wrotetext = 0;
     30 }
     31 
     32 # Subroutine to start a new paragraph
     33 
     34 sub new_para {
     35 &end_para();
     36 print TEMP "<P>\n";
     37 $inpara = 1;
     38 }
     39 
     40 
     41 # Main program
     42 
     43 $innf = 0;
     44 $inpara = 0;
     45 $inpre = 0;
     46 $wrotetext = 0;
     47 $toc = 0;
     48 $ref = 1;
     49 
     50 while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
     51   {
     52   $toc = 1 if $ARGV[0] eq "-toc";
     53   shift;
     54   }
     55 
     56 # Initial output to STDOUT
     57 
     58 print <<End ;
     59 <html>
     60 <head>
     61 <title>$ARGV[0] specification</title>
     62 </head>
     63 <body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
     64 <h1>$ARGV[0] man page</h1>
     65 <p>
     66 Return to the <a href="index.html">PCRE2 index page</a>.
     67 </p>
     68 <p>
     69 This page is part of the PCRE2 HTML documentation. It was generated
     70 automatically from the original man page. If there is any nonsense in it,
     71 please consult the man page, in case the conversion went wrong.
     72 <br>
     73 End
     74 
     75 print "<ul>\n" if ($toc);
     76 
     77 open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n";
     78 
     79 while (<STDIN>)
     80   {
     81   # Handle lines beginning with a dot
     82 
     83   if (/^\./)
     84     {
     85     # Some of the PCRE2 man pages used to contain instances of .br. However,
     86     # they should have all been removed because they cause trouble in some
     87     # (other) automated systems that translate man pages to HTML. Complain if
     88     # we find .br or .in (another macro that is deprecated).
     89 
     90     if (/^\.br/ || /^\.in/)
     91       {
     92       print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
     93       print STDERR "*** $_\n";
     94       die "*** Processing abandoned\n";
     95       }
     96 
     97     # Instead of .br, relevent "literal" sections are enclosed in .nf/.fi.
     98 
     99     elsif (/^\.nf/)
    100       {
    101       $innf = 1;
    102       }
    103 
    104     elsif (/^\.fi/)
    105       {
    106       $innf = 0;
    107       }
    108 
    109     # Handling .sp is subtle. If it is inside a literal section, do nothing if
    110     # the next line is a non literal text line; similarly, if not inside a
    111     # literal section, do nothing if a literal follows, unless we are inside
    112     # a .nf/.fi section or about to enter one. The point being that the <pre>
    113     # and </pre> that delimit literal sections will do the spacing. Always skip
    114     # if no previous output.
    115 
    116     elsif (/^\.sp/)
    117       {
    118       if ($wrotetext)
    119         {
    120         $_ = <STDIN>;
    121         if ($inpre)
    122           {
    123           print TEMP "\n" if (/^[\s.]/);
    124           }
    125         else
    126           {
    127           print TEMP "<br>\n<br>\n" if ($innf || /^\.nf/ || !/^[\s.]/);
    128           }
    129         redo;    # Now process the lookahead line we just read
    130         }
    131       }
    132     elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
    133       {
    134       &new_para();
    135       }
    136     elsif (/^\.SH\s*("?)(.*)\1/)
    137       {
    138       # Ignore the NAME section
    139       if ($2 =~ /^NAME\b/)
    140         {
    141         <STDIN>;
    142         next;
    143         }
    144 
    145       &end_para();
    146       my($title) = &do_line($2);
    147       if ($toc)
    148         {
    149         printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
    150           $ref, $ref);
    151         printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
    152           $ref);
    153         $ref++;
    154         }
    155       else
    156         {
    157         print TEMP "<br><b>\n$title\n</b><br>\n";
    158         }
    159       }
    160     elsif (/^\.SS\s*("?)(.*)\1/)
    161       {
    162       &end_para();
    163       my($title) = &do_line($2);
    164       print TEMP "<br><b>\n$title\n</b><br>\n";
    165       }
    166     elsif (/^\.B\s*(.*)/)
    167       {
    168       &new_para() if (!$inpara);
    169       $_ = &do_line($1);
    170       s/"(.*?)"/$1/g;
    171       print TEMP "<b>$_</b>\n";
    172       $wrotetext = 1;
    173       }
    174     elsif (/^\.I\s*(.*)/)
    175       {
    176       &new_para() if (!$inpara);
    177       $_ = &do_line($1);
    178       s/"(.*?)"/$1/g;
    179       print TEMP "<i>$_</i>\n";
    180       $wrotetext = 1;
    181       }
    182 
    183     # A comment that starts "HREF" takes the next line as a name that
    184     # is turned into a hyperlink, using the text given, which might be
    185     # in a special font. If it ends in () or (digits) or punctuation, they
    186     # aren't part of the link.
    187 
    188     elsif (/^\.\\"\s*HREF/)
    189       {
    190       $_=<STDIN>;
    191       chomp;
    192       $_ = &do_line($_);
    193       $_ =~ s/\s+$//;
    194       $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
    195       print TEMP "<a href=\"$1.html\">$_</a>\n";
    196       }
    197 
    198     # A comment that starts "HTML" inserts literal HTML
    199 
    200     elsif (/^\.\\"\s*HTML\s*(.*)/)
    201       {
    202       print TEMP $1;
    203       }
    204 
    205     # A comment that starts < inserts that HTML at the end of the
    206     # *next* input line - so as not to get a newline between them.
    207 
    208     elsif (/^\.\\"\s*(<.*>)/)
    209       {
    210       my($markup) = $1;
    211       $_=<STDIN>;
    212       chomp;
    213       $_ = &do_line($_);
    214       $_ =~ s/\s+$//;
    215       print TEMP "$_$markup\n";
    216       }
    217 
    218     # A comment that starts JOIN joins the next two lines together, with one
    219     # space between them. Then that line is processed. This is used in some
    220     # displays where two lines are needed for the "man" version. JOINSH works
    221     # the same, except that it assumes this is a shell command, so removes
    222     # continuation backslashes.
    223 
    224     elsif (/^\.\\"\s*JOIN(SH)?/)
    225       {
    226       my($one,$two);
    227       $one = <STDIN>;
    228       $two = <STDIN>;
    229       $one =~ s/\s*\\e\s*$// if (defined($1));
    230       chomp($one);
    231       $two =~ s/^\s+//;
    232       $_ = "$one $two";
    233       redo;            # Process the joined lines
    234       }
    235 
    236     # .EX/.EE are used in the pcre2demo page to bracket the entire program,
    237     # which is unmodified except for turning backslash into "\e".
    238 
    239     elsif (/^\.EX\s*$/)
    240       {
    241       print TEMP "<PRE>\n";
    242       while (<STDIN>)
    243         {
    244         last if /^\.EE\s*$/;
    245         s/\\e/\\/g;
    246         s/&/&amp;/g;
    247         s/</&lt;/g;
    248         s/>/&gt;/g;
    249         print TEMP;
    250         }
    251       }
    252 
    253     # Ignore anything not recognized
    254 
    255     next;
    256     }
    257 
    258   # Line does not begin with a dot. Replace blank lines with new paragraphs
    259 
    260   if (/^\s*$/)
    261     {
    262     &end_para() if ($wrotetext);
    263     next;
    264     }
    265 
    266   # Convert fonts changes and output an ordinary line. Ensure that indented
    267   # lines are marked as literal.
    268 
    269   $_ = &do_line($_);
    270   &new_para() if (!$inpara);
    271 
    272   if (/^\s/)
    273     {
    274     if (!$inpre)
    275       {
    276       print TEMP "<pre>\n";
    277       $inpre = 1;
    278       }
    279     }
    280   elsif ($inpre)
    281     {
    282     print TEMP "</pre>\n";
    283     $inpre = 0;
    284     }
    285 
    286   # Add <br> to the end of a non-literal line if we are within .nf/.fi
    287 
    288   $_ .= "<br>\n" if (!$inpre && $innf);
    289 
    290   print TEMP;
    291   $wrotetext = 1;
    292   }
    293 
    294 # The TOC, if present, will have been written - terminate it
    295 
    296 print "</ul>\n" if ($toc);
    297 
    298 # Copy the remainder to the standard output
    299 
    300 close(TEMP);
    301 open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
    302 
    303 print while (<TEMP>);
    304 
    305 print <<End ;
    306 <p>
    307 Return to the <a href="index.html">PCRE2 index page</a>.
    308 </p>
    309 End
    310 
    311 close(TEMP);
    312 unlink("/tmp/$$");
    313 
    314 # End
    315