#! /usr/bin/perl -w

# Script to turn PCRE man pages into HTML.
#
# The man page source is read from STDIN and the HTML is written to STDOUT.
# Leading command-line arguments that start with "-" are options; only "-toc"
# is recognized (it makes section headings into a linked table of contents),
# any other option is silently skipped. The first remaining argument is used
# as the page name in the HTML title and top-level heading.

use strict;
use warnings;

# Conversion state shared between the main loop and the paragraph helpers.

my $innf      = 0;    # True between .nf and .fi
my $inpara    = 0;    # True while a <P> is open
my $inpre     = 0;    # True while a <pre> opened for indented text is open
my $wrotetext = 0;    # True once text has been written since the last end_para()
my $toc       = 0;    # True when -toc was given
my $ref       = 1;    # Number used for the next TOC/SEC anchor pair

# The page body is accumulated in memory while the table of contents (if any)
# is written straight to STDOUT; the body is appended once the input is
# exhausted. An in-memory filehandle is used so that no predictable file name
# under /tmp is ever created.

my $body = '';
my $temp;


# Subroutine to handle font changes and other escapes.
#
# Argument: one line (or fragment) of man page text.
# Returns:  the text with "<" and ">" turned into HTML entities, \fI...\fR and
#           \fB...\fR (or \fP) font changes turned into <i> and <b> elements,
#           "\e" turned into a backslash, and "(c)" after "Copyright " turned
#           into the copyright entity.

sub do_line {
    my ($s) = @_;

    # The angle brackets must be escaped before any tags are inserted,
    # otherwise the tags themselves would be escaped.
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s{\\fI(.*?)\\f[RP]}{<i>$1</i>}g;
    $s =~ s{\\fB(.*?)\\f[RP]}{<b>$1</b>}g;
    $s =~ s{\\e}{\\}g;
    $s =~ s/(?<=Copyright )\(c\)/&copy;/g;
    return $s;
}


# Subroutine to ensure not in a paragraph. Closes any open <pre> and <P> and
# resets the paragraph state, including the "text written" flag.

sub end_para {
    if ($inpara) {
        print {$temp} "</PRE>\n" if $inpre;
        print {$temp} "</P>\n";
    }
    $inpara = $inpre = 0;
    $wrotetext = 0;
    return;
}


# Subroutine to start a new paragraph, closing the current one first.

sub new_para {
    end_para();
    print {$temp} "<P>\n";
    $inpara = 1;
    return;
}


# Main program

sub main {
    # Note: inside a subroutine a bare "shift" would act on @_, so @ARGV has
    # to be named explicitly.
    while (@ARGV && $ARGV[0] =~ /^-/) {
        $toc = 1 if $ARGV[0] eq "-toc";
        shift @ARGV;
    }

    # An absent page name is rendered as an empty string.
    my $page = @ARGV ? $ARGV[0] : '';

    # Initial output to STDOUT

    print <<"End";
<html>
<head>
<title>$page specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$page man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
End

    print "<ul>\n" if $toc;

    open($temp, '>', \$body)
        or die "Can't open in-memory buffer for output: $!\n";

    LINE:
    while (<STDIN>) {

        # Handle lines beginning with a dot

        if (/^\./) {

            # Some of the PCRE man pages used to contain instances of .br.
            # However, they should have all been removed because they cause
            # trouble in some (other) automated systems that translate man
            # pages to HTML. Complain if we find .br or .in (another macro
            # that is deprecated).

            if (/^\.br/ || /^\.in/) {
                print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
                print STDERR "*** $_\n";
                die "*** Processing abandoned\n";
            }

            # Instead of .br, relevant "literal" sections are enclosed in
            # .nf/.fi.

            elsif (/^\.nf/) {
                $innf = 1;
            }

            elsif (/^\.fi/) {
                $innf = 0;
            }

            # Handling .sp is subtle. If it is inside a literal section, do
            # nothing if the next line is a non literal text line; similarly,
            # if not inside a literal section, do nothing if a literal
            # follows, unless we are inside a .nf/.fi section. The point being
            # that the <pre> and </pre> that delimit literal sections will do
            # the spacing. Always skip if no previous output.

            elsif (/^\.sp/) {
                if ($wrotetext) {
                    $_ = <STDIN>;

                    # At end of input behave as if an empty line followed;
                    # the blank-line handling below then closes the paragraph.
                    $_ = '' unless defined $_;

                    if ($inpre) {
                        print {$temp} "\n" if /^[\s.]/;
                    }
                    else {
                        print {$temp} "<br>\n<br>\n" if $innf || !/^[\s.]/;
                    }
                    redo LINE;    # Now process the lookahead line we just read
                }
            }

            # /^\.P/ also covers .PP; all three start a new paragraph.

            elsif (/^\.TP/ || /^\.PP/ || /^\.P/) {
                new_para();
            }

            elsif (/^\.SH\s*("?)(.*)\1/) {
                my $heading = $2;

                # Ignore the NAME section: the heading and the single line
                # that follows it are dropped.
                if ($heading =~ /^NAME\b/) {
                    <STDIN>;
                    next LINE;
                }

                end_para();
                my $title = do_line($heading);
                if ($toc) {
                    # The title goes through %s so that a "%" in a heading
                    # cannot be taken as a conversion specification.
                    printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
                        $ref, $ref, $title);
                    printf {$temp} "<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
                        $ref, $title;
                    $ref++;
                }
                else {
                    print {$temp} "<br><b>\n$title\n</b><br>\n";
                }
            }

            elsif (/^\.SS\s*("?)(.*)\1/) {
                my $heading = $2;
                end_para();
                my $title = do_line($heading);
                print {$temp} "<br><b>\n$title\n</b><br>\n";
            }

            # .B and .I put their argument in bold or italic, with any
            # double quotes around quoted parts removed.

            elsif (/^\.B\s*(.*)/) {
                my $text = $1;
                new_para() if !$inpara;
                $_ = do_line($text);
                s/"(.*?)"/$1/g;
                print {$temp} "<b>$_</b>\n";
                $wrotetext = 1;
            }

            elsif (/^\.I\s*(.*)/) {
                my $text = $1;
                new_para() if !$inpara;
                $_ = do_line($text);
                s/"(.*?)"/$1/g;
                print {$temp} "<i>$_</i>\n";
                $wrotetext = 1;
            }

            # A comment that starts "HREF" takes the next line as a name that
            # is turned into a hyperlink, using the text given, which might be
            # in a special font. If it ends in () or (digits) or punctuation,
            # they aren't part of the link.

            elsif (/^\.\\"\s*HREF/) {
                $_ = <STDIN>;
                $_ = '' unless defined $_;
                chomp;
                $_ = do_line($_);
                s/\s+$//;
                if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/) {
                    print {$temp} "<a href=\"$1.html\">$_</a>\n";
                }
                else {
                    # Without a match there is no link target; emit the text
                    # unlinked rather than reuse a stale capture.
                    print STDERR "*** Cannot derive a link target from HREF text: $_\n";
                    print {$temp} "$_\n";
                }
            }

            # A comment that starts "HTML" inserts literal HTML

            elsif (/^\.\\"\s*HTML\s*(.*)/) {
                print {$temp} $1;
            }

            # A comment that starts < inserts that HTML at the end of the
            # *next* input line - so as not to get a newline between them.

            elsif (/^\.\\"\s*(<.*>)/) {
                my $markup = $1;
                $_ = <STDIN>;
                $_ = '' unless defined $_;
                chomp;
                $_ = do_line($_);
                s/\s+$//;
                print {$temp} "$_$markup\n";
            }

            # A comment that starts JOIN joins the next two lines together,
            # with one space between them. Then that line is processed. This
            # is used in some displays where two lines are needed for the
            # "man" version. JOINSH works the same, except that it assumes
            # this is a shell command, so removes continuation backslashes.

            elsif (/^\.\\"\s*JOIN(SH)?/) {
                my $is_shell = defined $1;
                my $one = <STDIN>;
                my $two = <STDIN>;
                $one = '' unless defined $one;
                $two = '' unless defined $two;
                $one =~ s/\s*\\e\s*$// if $is_shell;
                chomp($one);
                $two =~ s/^\s+//;
                $_ = "$one $two";
                redo LINE;    # Process the joined lines
            }

            # .EX/.EE are used in the pcredemo page to bracket the entire
            # program, which is unmodified except for turning backslash into
            # "\e".

            elsif (/^\.EX\s*$/) {
                print {$temp} "<PRE>\n";
                while (<STDIN>) {
                    last if /^\.EE\s*$/;
                    s/\\e/\\/g;
                    s/&/&amp;/g;    # Must come first: the next two add "&"
                    s/</&lt;/g;
                    s/>/&gt;/g;
                    print {$temp} $_;
                }

                # Close the element so that whatever follows is not rendered
                # as preformatted text.
                print {$temp} "</PRE>\n";
            }

            # Ignore anything not recognized

            next LINE;
        }

        # Line does not begin with a dot. Replace blank lines with new
        # paragraphs

        if (/^\s*$/) {
            end_para() if $wrotetext;
            next LINE;
        }

        # Convert fonts changes and output an ordinary line. Ensure that
        # indented lines are marked as literal.

        $_ = do_line($_);
        new_para() if !$inpara;

        if (/^\s/) {
            if (!$inpre) {
                print {$temp} "<pre>\n";
                $inpre = 1;
            }
        }
        elsif ($inpre) {
            print {$temp} "</pre>\n";
            $inpre = 0;
        }

        # Add <br> to the end of a non-literal line if we are within .nf/.fi

        $_ .= "<br>\n" if !$inpre && $innf;

        print {$temp} $_;
        $wrotetext = 1;
    }

    # The TOC, if present, will have been written - terminate it

    print "</ul>\n" if $toc;

    # Copy the accumulated body to the standard output

    close($temp) or die "Can't close in-memory buffer: $!\n";
    print $body;

    print <<"End";
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
End

    return;
}

# Run the conversion only when executed as a script, so that the subroutines
# above can be loaded from another file without consuming STDIN.

main() unless caller;

# End