1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22. 2 Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software Foundation, 16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 17 18 /* GNU fmt was written by Ross Paterson <rap (at) doc.ic.ac.uk>. */ 19 20 #include "sed.h" 21 22 #include <stdio.h> 23 #include <string.h> 24 #include <ctype.h> 25 #include <sys/types.h> 26 27 #if HAVE_LIMITS_H 28 # include <limits.h> 29 #endif 30 31 #ifndef UINT_MAX 32 # define UINT_MAX ((unsigned int) ~(unsigned int) 0) 33 #endif 34 35 #ifndef INT_MAX 36 # define INT_MAX ((int) (UINT_MAX >> 1)) 37 #endif 38 39 /* The following parameters represent the program's idea of what is 40 "best". Adjust to taste, subject to the caveats given. */ 41 42 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving 43 room for optimization. */ 44 #define LEEWAY 7 45 46 /* Costs and bonuses are expressed as the equivalent departure from the 47 optimal line length, multiplied by 10. e.g. assigning something a 48 cost of 50 means that it is as bad as a line 5 characters too short 49 or too long. The definition of SHORT_COST(n) should not be changed. 50 However, EQUIV(n) may need tuning. */ 51 52 typedef long COST; 53 54 #define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1))) 55 56 #define SQR(n) ((n) * (n)) 57 #define EQUIV(n) SQR ((COST) (n)) 58 59 /* Cost of a filled line n chars longer or shorter than best_width. */ 60 #define SHORT_COST(n) EQUIV ((n) * 10) 61 62 /* Cost of the difference between adjacent filled lines. */ 63 #define RAGGED_COST(n) (SHORT_COST (n) / 2) 64 65 /* Basic cost per line. */ 66 #define LINE_COST EQUIV (70) 67 68 /* Cost of breaking a line after the first word of a sentence, where 69 the length of the word is N. */ 70 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2)) 71 72 /* Cost of breaking a line before the last word of a sentence, where 73 the length of the word is N. */ 74 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2)) 75 76 /* Bonus for breaking a line at the end of a sentence. */ 77 #define SENTENCE_BONUS EQUIV (50) 78 79 /* Cost of breaking a line after a period not marking end of a sentence. 80 With the definition of sentence we are using (borrowed from emacs, see 81 get_line()) such a break would then look like a sentence break. Hence 82 we assign a very high cost -- it should be avoided unless things are 83 really bad. */ 84 #define NOBREAK_COST EQUIV (600) 85 86 /* Bonus for breaking a line before open parenthesis. */ 87 #define PAREN_BONUS EQUIV (40) 88 89 /* Bonus for breaking a line after other punctuation. */ 90 #define PUNCT_BONUS EQUIV(40) 91 92 /* Credit for breaking a long paragraph one line later. */ 93 #define LINE_CREDIT EQUIV(3) 94 95 /* Size of paragraph buffer in words. Longer paragraphs are handled 96 neatly (cf. flush_paragraph()), so there's little to gain by making 97 these larger. */ 98 #define MAXWORDS 1000 99 100 #define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++) 101 102 /* Extra ctype(3)-style macros. */ 103 104 #define isopen(c) (strchr ("([`'\"", (c)) != NULL) 105 #define isclose(c) (strchr (")]'\"", (c)) != NULL) 106 #define isperiod(c) (strchr (".?!", (c)) != NULL) 107 108 /* Size of a tab stop, for expansion on input and re-introduction on 109 output. */ 110 #define TABWIDTH 8 111 112 /* Word descriptor structure. */ 113 114 typedef struct Word WORD; 115 116 struct Word 117 { 118 119 /* Static attributes determined during input. */ 120 121 const char *text; /* the text of the word */ 122 short length; /* length of this word */ 123 short space; /* the size of the following space */ 124 unsigned paren:1; /* starts with open paren */ 125 unsigned period:1; /* ends in [.?!])* */ 126 unsigned punct:1; /* ends in punctuation */ 127 unsigned final:1; /* end of sentence */ 128 129 /* The remaining fields are computed during the optimization. */ 130 131 short line_length; /* length of the best line starting here */ 132 COST best_cost; /* cost of best paragraph starting here */ 133 WORD *next_break; /* break which achieves best_cost */ 134 }; 135 136 /* Forward declarations. */ 137 138 static bool get_paragraph P_ ((void)); 139 static int get_line P_ ((int c)); 140 static int get_space P_ ((int c)); 141 static int copy_rest P_ ((int c)); 142 static bool same_para P_ ((int c)); 143 static void flush_paragraph P_ ((void)); 144 static void fmt_paragraph P_ ((void)); 145 static void check_punctuation P_ ((WORD *w)); 146 static COST base_cost P_ ((WORD *this)); 147 static COST line_cost P_ ((WORD *next, int len)); 148 static void put_paragraph P_ ((WORD *finish)); 149 static void put_line P_ ((WORD *w, int indent)); 150 static void put_word P_ ((WORD *w)); 151 static void put_space P_ ((int space)); 152 153 /* Option values. */ 154 155 /* User-supplied maximum line width (default WIDTH). The only output 156 lines 157 longer than this will each comprise a single word. */ 158 static int max_width; 159 160 /* Space for the paragraph text. */ 161 static const char *parabuf; 162 163 /* End of space for the paragraph text. */ 164 static const char *end_of_parabuf; 165 166 /* The file on which we output */ 167 static FILE *outfile; 168 169 /* Values derived from the option values. */ 170 171 /* The preferred width of text lines, set to LEEWAY % less than max_width. */ 172 static int best_width; 173 174 /* Dynamic variables. */ 175 176 /* Start column of the character most recently read from the input file. */ 177 static int in_column; 178 179 /* Start column of the next character to be written to stdout. */ 180 static int out_column; 181 182 /* The words of a paragraph -- longer paragraphs are handled neatly 183 (cf. flush_paragraph()). */ 184 static WORD words[MAXWORDS]; 185 186 /* A pointer into the above word array, indicating the first position 187 after the last complete word. Sometimes it will point at an incomplete 188 word. */ 189 static WORD *word_limit; 190 191 /* Indentation of the first line of the current paragraph. */ 192 static int first_indent; 193 194 /* Indentation of other lines of the current paragraph */ 195 static int other_indent; 196 197 /* The last character read from the input file. */ 198 static int next_char; 199 200 /* If nonzero, the length of the last line output in the current 201 paragraph, used to charge for raggedness at the split point for long 202 paragraphs chosen by fmt_paragraph(). */ 203 static int last_line_length; 204 205 /* read file F and send formatted output to stdout. */ 206 207 void 208 fmt (const char *line, const char *line_end, int max_length, FILE *output_file) 209 { 210 parabuf = line; 211 end_of_parabuf = line_end; 212 outfile = output_file; 213 214 max_width = max_length; 215 best_width = max_width * (201 - 2 * LEEWAY) / 200; 216 217 in_column = 0; 218 other_indent = 0; 219 next_char = GETC(); 220 while (get_paragraph ()) 221 { 222 fmt_paragraph (); 223 put_paragraph (word_limit); 224 } 225 } 226 227 /* Read a paragraph from input file F. A paragraph consists of a 228 maximal number of non-blank (excluding any prefix) lines 229 with the same indent. 230 231 Return false if end-of-file was encountered before the start of a 232 paragraph, else true. */ 233 234 static bool 235 get_paragraph () 236 { 237 register int c; 238 239 last_line_length = 0; 240 c = next_char; 241 242 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */ 243 244 while (c == '\n' || c == EOF) 245 { 246 c = copy_rest (c); 247 if (c == EOF) 248 { 249 next_char = EOF; 250 return false; 251 } 252 putc ('\n', outfile); 253 c = GETC(); 254 } 255 256 /* Got a suitable first line for a paragraph. */ 257 258 first_indent = in_column; 259 word_limit = words; 260 c = get_line (c); 261 262 /* Read rest of paragraph. */ 263 264 other_indent = in_column; 265 while (same_para (c) && in_column == other_indent) 266 c = get_line (c); 267 268 (word_limit - 1)->period = (word_limit - 1)->final = true; 269 next_char = c; 270 return true; 271 } 272 273 /* Copy to the output a blank line. In the latter, C is \n or EOF. 274 Return the character (\n or EOF) ending the line. */ 275 276 static int 277 copy_rest (register int c) 278 { 279 out_column = 0; 280 while (c != '\n' && c != EOF) 281 { 282 putc (c, outfile); 283 c = GETC(); 284 } 285 return c; 286 } 287 288 /* Return true if a line whose first non-blank character after the 289 prefix (if any) is C could belong to the current paragraph, 290 otherwise false. */ 291 292 static bool 293 same_para (register int c) 294 { 295 return (c != '\n' && c != EOF); 296 } 297 298 /* Read a line from the input data given first non-blank character C 299 after the prefix, and the following indent, and break it into words. 300 A word is a maximal non-empty string of non-white characters. A word 301 ending in [.?!]["')\]]* and followed by end-of-line or at least two 302 spaces ends a sentence, as in emacs. 303 304 Return the first non-blank character of the next line. */ 305 306 static int 307 get_line (register int c) 308 { 309 int start; 310 register WORD *end_of_word; 311 312 end_of_word = &words[MAXWORDS - 2]; 313 314 do 315 { /* for each word in a line */ 316 317 /* Scan word. */ 318 319 word_limit->text = parabuf - 1; 320 do 321 c = GETC(); 322 while (c != EOF && !ISSPACE (c)); 323 word_limit->length = parabuf - word_limit->text - (c != EOF); 324 in_column += word_limit->length; 325 326 check_punctuation (word_limit); 327 328 /* Scan inter-word space. */ 329 330 start = in_column; 331 c = get_space (c); 332 word_limit->space = in_column - start; 333 word_limit->final = (c == EOF 334 || (word_limit->period 335 && (c == '\n' || word_limit->space > 1))); 336 if (c == '\n' || c == EOF) 337 word_limit->space = word_limit->final ? 2 : 1; 338 if (word_limit == end_of_word) 339 flush_paragraph (); 340 word_limit++; 341 if (c == EOF) 342 { 343 in_column = first_indent; 344 return EOF; 345 } 346 } 347 while (c != '\n'); 348 349 in_column = 0; 350 c = GETC(); 351 return get_space (c); 352 } 353 354 /* Read blank characters from the input data, starting with C, and keeping 355 in_column up-to-date. Return first non-blank character. */ 356 357 static int 358 get_space (register int c) 359 { 360 for (;;) 361 { 362 if (c == ' ') 363 in_column++; 364 else if (c == '\t') 365 in_column = (in_column / TABWIDTH + 1) * TABWIDTH; 366 else 367 return c; 368 c = GETC(); 369 } 370 } 371 372 /* Set extra fields in word W describing any attached punctuation. */ 373 374 static void 375 check_punctuation (register WORD *w) 376 { 377 register const char *start, *finish; 378 379 start = w->text; 380 finish = start + (w->length - 1); 381 w->paren = isopen (*start); 382 w->punct = ISPUNCT (*finish); 383 while (isclose (*finish) && finish > start) 384 finish--; 385 w->period = isperiod (*finish); 386 } 387 388 /* Flush part of the paragraph to make room. This function is called on 389 hitting the limit on the number of words or characters. */ 390 391 static void 392 flush_paragraph (void) 393 { 394 WORD *split_point; 395 register WORD *w; 396 COST best_break; 397 398 /* - format what you have so far as a paragraph, 399 - find a low-cost line break near the end, 400 - output to there, 401 - make that the start of the paragraph. */ 402 403 fmt_paragraph (); 404 405 /* Choose a good split point. */ 406 407 split_point = word_limit; 408 best_break = MAXCOST; 409 for (w = words->next_break; w != word_limit; w = w->next_break) 410 { 411 if (w->best_cost - w->next_break->best_cost < best_break) 412 { 413 split_point = w; 414 best_break = w->best_cost - w->next_break->best_cost; 415 } 416 if (best_break <= MAXCOST - LINE_CREDIT) 417 best_break += LINE_CREDIT; 418 } 419 put_paragraph (split_point); 420 421 /* Copy words from split_point down to word -- we use memmove because 422 the source and target may overlap. */ 423 424 memmove ((char *) words, (char *) split_point, 425 (word_limit - split_point + 1) * sizeof (WORD)); 426 word_limit -= split_point - words; 427 } 428 429 /* Compute the optimal formatting for the whole paragraph by computing 430 and remembering the optimal formatting for each suffix from the empty 431 one to the whole paragraph. */ 432 433 static void 434 fmt_paragraph (void) 435 { 436 register WORD *start, *w; 437 register int len; 438 register COST wcost, best; 439 int saved_length; 440 441 word_limit->best_cost = 0; 442 saved_length = word_limit->length; 443 word_limit->length = max_width; /* sentinel */ 444 445 for (start = word_limit - 1; start >= words; start--) 446 { 447 best = MAXCOST; 448 len = start == words ? first_indent : other_indent; 449 450 /* At least one word, however long, in the line. */ 451 452 w = start; 453 len += w->length; 454 do 455 { 456 w++; 457 458 /* Consider breaking before w. */ 459 460 wcost = line_cost (w, len) + w->best_cost; 461 if (start == words && last_line_length > 0) 462 wcost += RAGGED_COST (len - last_line_length); 463 if (wcost < best) 464 { 465 best = wcost; 466 start->next_break = w; 467 start->line_length = len; 468 } 469 len += (w - 1)->space + w->length; /* w > start >= words */ 470 } 471 while (len < max_width); 472 start->best_cost = best + base_cost (start); 473 } 474 475 word_limit->length = saved_length; 476 } 477 478 /* Return the constant component of the cost of breaking before the 479 word THIS. */ 480 481 static COST 482 base_cost (register WORD *this) 483 { 484 register COST cost; 485 486 cost = LINE_COST; 487 488 if (this > words) 489 { 490 if ((this - 1)->period) 491 { 492 if ((this - 1)->final) 493 cost -= SENTENCE_BONUS; 494 else 495 cost += NOBREAK_COST; 496 } 497 else if ((this - 1)->punct) 498 cost -= PUNCT_BONUS; 499 else if (this > words + 1 && (this - 2)->final) 500 cost += WIDOW_COST ((this - 1)->length); 501 } 502 503 if (this->paren) 504 cost -= PAREN_BONUS; 505 else if (this->final) 506 cost += ORPHAN_COST (this->length); 507 508 return cost; 509 } 510 511 /* Return the component of the cost of breaking before word NEXT that 512 depends on LEN, the length of the line beginning there. */ 513 514 static COST 515 line_cost (register WORD *next, register int len) 516 { 517 register int n; 518 register COST cost; 519 520 if (next == word_limit) 521 return 0; 522 n = best_width - len; 523 cost = SHORT_COST (n); 524 if (next->next_break != word_limit) 525 { 526 n = len - next->line_length; 527 cost += RAGGED_COST (n); 528 } 529 return cost; 530 } 531 532 /* Output to stdout a paragraph from word up to (but not including) 533 FINISH, which must be in the next_break chain from word. */ 534 535 static void 536 put_paragraph (register WORD *finish) 537 { 538 register WORD *w; 539 540 put_line (words, first_indent); 541 for (w = words->next_break; w != finish; w = w->next_break) 542 put_line (w, other_indent); 543 } 544 545 /* Output to stdout the line beginning with word W, beginning in column 546 INDENT, including the prefix (if any). */ 547 548 static void 549 put_line (register WORD *w, int indent) 550 { 551 register WORD *endline; 552 out_column = 0; 553 put_space (indent); 554 555 endline = w->next_break - 1; 556 for (; w != endline; w++) 557 { 558 put_word (w); 559 put_space (w->space); 560 } 561 put_word (w); 562 last_line_length = out_column; 563 putc ('\n', outfile); 564 } 565 566 /* Output to stdout the word W. */ 567 568 static void 569 put_word (register WORD *w) 570 { 571 register const char *s; 572 register int n; 573 574 s = w->text; 575 for (n = w->length; n != 0; n--) 576 putc (*s++, outfile); 577 out_column += w->length; 578 } 579 580 /* Output to stdout SPACE spaces, or equivalent tabs. */ 581 582 static void 583 put_space (int space) 584 { 585 out_column += space; 586 while (space--) 587 putc (' ', outfile); 588 } 589