Home | History | Annotate | Download | only in sed
      1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22.
      2    Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
      3 
      4    This program is free software; you can redistribute it and/or modify
      5    it under the terms of the GNU General Public License as published by
      6    the Free Software Foundation; either version 3, or (at your option)
      7    any later version.
      8 
      9    This program is distributed in the hope that it will be useful,
     10    but WITHOUT ANY WARRANTY; without even the implied warranty of
     11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12    GNU General Public License for more details.
     13 
     14    You should have received a copy of the GNU General Public License
     15    along with this program; if not, write to the Free Software Foundation,
     16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     17 
     18 /* GNU fmt was written by Ross Paterson <rap (at) doc.ic.ac.uk>.  */
     19 
     20 #include "sed.h"
     21 
     22 #include <stdio.h>
     23 #include <string.h>
     24 #include <ctype.h>
     25 #include <sys/types.h>
     26 
     27 #if HAVE_LIMITS_H
     28 # include <limits.h>
     29 #endif
     30 
     31 #ifndef UINT_MAX
     32 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
     33 #endif
     34 
     35 #ifndef INT_MAX
     36 # define INT_MAX ((int) (UINT_MAX >> 1))
     37 #endif
     38 
     39 /* The following parameters represent the program's idea of what is
     40    "best".  Adjust to taste, subject to the caveats given.  */
     41 
     42 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
     43    room for optimization.  */
     44 #define	LEEWAY	7
     45 
     46 /* Costs and bonuses are expressed as the equivalent departure from the
     47    optimal line length, multiplied by 10.  e.g. assigning something a
     48    cost of 50 means that it is as bad as a line 5 characters too short
     49    or too long.  The definition of SHORT_COST(n) should not be changed.
     50    However, EQUIV(n) may need tuning.  */
     51 
     52 typedef long COST;
     53 
     54 #define	MAXCOST	(~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
     55 
     56 #define	SQR(n)		((n) * (n))
     57 #define	EQUIV(n)	SQR ((COST) (n))
     58 
     59 /* Cost of a filled line n chars longer or shorter than best_width.  */
     60 #define	SHORT_COST(n)	EQUIV ((n) * 10)
     61 
     62 /* Cost of the difference between adjacent filled lines.  */
     63 #define	RAGGED_COST(n)	(SHORT_COST (n) / 2)
     64 
     65 /* Basic cost per line.  */
     66 #define	LINE_COST	EQUIV (70)
     67 
     68 /* Cost of breaking a line after the first word of a sentence, where
     69    the length of the word is N.  */
     70 #define	WIDOW_COST(n)	(EQUIV (200) / ((n) + 2))
     71 
     72 /* Cost of breaking a line before the last word of a sentence, where
     73    the length of the word is N.  */
     74 #define	ORPHAN_COST(n)	(EQUIV (150) / ((n) + 2))
     75 
     76 /* Bonus for breaking a line at the end of a sentence.  */
     77 #define	SENTENCE_BONUS	EQUIV (50)
     78 
     79 /* Cost of breaking a line after a period not marking end of a sentence.
     80    With the definition of sentence we are using (borrowed from emacs, see
     81    get_line()) such a break would then look like a sentence break.  Hence
     82    we assign a very high cost -- it should be avoided unless things are
     83    really bad.  */
     84 #define	NOBREAK_COST	EQUIV (600)
     85 
     86 /* Bonus for breaking a line before open parenthesis.  */
     87 #define	PAREN_BONUS	EQUIV (40)
     88 
     89 /* Bonus for breaking a line after other punctuation.  */
     90 #define	PUNCT_BONUS	EQUIV(40)
     91 
     92 /* Credit for breaking a long paragraph one line later.  */
     93 #define	LINE_CREDIT	EQUIV(3)
     94 
     95 /* Size of paragraph buffer in words.  Longer paragraphs are handled
     96    neatly (cf. flush_paragraph()), so there's little to gain by making
     97    these larger.  */
     98 #define	MAXWORDS	1000
     99 
    100 #define GETC()          (parabuf == end_of_parabuf ? EOF : *parabuf++)
    101 
    102 /* Extra ctype(3)-style macros.  */
    103 
    104 #define	isopen(c)	(strchr ("([`'\"", (c)) != NULL)
    105 #define	isclose(c)	(strchr (")]'\"", (c)) != NULL)
    106 #define	isperiod(c)	(strchr (".?!", (c)) != NULL)
    107 
    108 /* Size of a tab stop, for expansion on input and re-introduction on
    109    output.  */
    110 #define	TABWIDTH	8
    111 
    112 /* Word descriptor structure.  */
    113 
    114 typedef struct Word WORD;
    115 
    116 struct Word
    117   {
    118 
    119     /* Static attributes determined during input.  */
    120 
    121     const char *text;		/* the text of the word */
    122     short length;		/* length of this word */
    123     short space;		/* the size of the following space */
    124     unsigned paren:1;		/* starts with open paren */
    125     unsigned period:1;		/* ends in [.?!])* */
    126     unsigned punct:1;		/* ends in punctuation */
    127     unsigned final:1;		/* end of sentence */
    128 
    129     /* The remaining fields are computed during the optimization.  */
    130 
    131     short line_length;		/* length of the best line starting here */
    132     COST best_cost;		/* cost of best paragraph starting here */
    133     WORD *next_break;		/* break which achieves best_cost */
    134   };
    135 
    136 /* Forward declarations.  */
    137 
    138 static bool get_paragraph P_ ((void));
    139 static int get_line P_ ((int c));
    140 static int get_space P_ ((int c));
    141 static int copy_rest P_ ((int c));
    142 static bool same_para P_ ((int c));
    143 static void flush_paragraph P_ ((void));
    144 static void fmt_paragraph P_ ((void));
    145 static void check_punctuation P_ ((WORD *w));
    146 static COST base_cost P_ ((WORD *this));
    147 static COST line_cost P_ ((WORD *next, int len));
    148 static void put_paragraph P_ ((WORD *finish));
    149 static void put_line P_ ((WORD *w, int indent));
    150 static void put_word P_ ((WORD *w));
    151 static void put_space P_ ((int space));
    152 
    153 /* Option values.  */
    154 
    155 /* User-supplied maximum line width (default WIDTH).  The only output
    156    lines
    157    longer than this will each comprise a single word.  */
    158 static int max_width;
    159 
    160 /* Space for the paragraph text.  */
    161 static const char *parabuf;
    162 
    163 /* End of space for the paragraph text.  */
    164 static const char *end_of_parabuf;
    165 
    166 /* The file on which we output */
    167 static FILE *outfile;
    168 
    169 /* Values derived from the option values.  */
    170 
    171 /* The preferred width of text lines, set to LEEWAY % less than max_width.  */
    172 static int best_width;
    173 
    174 /* Dynamic variables.  */
    175 
    176 /* Start column of the character most recently read from the input file.  */
    177 static int in_column;
    178 
    179 /* Start column of the next character to be written to stdout.  */
    180 static int out_column;
    181 
    182 /* The words of a paragraph -- longer paragraphs are handled neatly
    183    (cf. flush_paragraph()).  */
    184 static WORD words[MAXWORDS];
    185 
    186 /* A pointer into the above word array, indicating the first position
    187    after the last complete word.  Sometimes it will point at an incomplete
    188    word.  */
    189 static WORD *word_limit;
    190 
    191 /* Indentation of the first line of the current paragraph.  */
    192 static int first_indent;
    193 
    194 /* Indentation of other lines of the current paragraph */
    195 static int other_indent;
    196 
    197 /* The last character read from the input file.  */
    198 static int next_char;
    199 
    200 /* If nonzero, the length of the last line output in the current
    201    paragraph, used to charge for raggedness at the split point for long
    202    paragraphs chosen by fmt_paragraph().  */
    203 static int last_line_length;
    204 
    205 /* read file F and send formatted output to stdout.  */
    206 
    207 void
    208 fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
    209 {
    210   parabuf = line;
    211   end_of_parabuf = line_end;
    212   outfile = output_file;
    213 
    214   max_width = max_length;
    215   best_width = max_width * (201 - 2 * LEEWAY) / 200;
    216 
    217   in_column = 0;
    218   other_indent = 0;
    219   next_char = GETC();
    220   while (get_paragraph ())
    221     {
    222       fmt_paragraph ();
    223       put_paragraph (word_limit);
    224     }
    225 }
    226 
    227 /* Read a paragraph from input file F.  A paragraph consists of a
    228    maximal number of non-blank (excluding any prefix) lines
    229    with the same indent.
    230 
    231    Return false if end-of-file was encountered before the start of a
    232    paragraph, else true.  */
    233 
    234 static bool
    235 get_paragraph ()
    236 {
    237   register int c;
    238 
    239   last_line_length = 0;
    240   c = next_char;
    241 
    242   /* Scan (and copy) blank lines, and lines not introduced by the prefix.  */
    243 
    244   while (c == '\n' || c == EOF)
    245     {
    246       c = copy_rest (c);
    247       if (c == EOF)
    248 	{
    249 	  next_char = EOF;
    250 	  return false;
    251 	}
    252       putc ('\n', outfile);
    253       c = GETC();
    254     }
    255 
    256   /* Got a suitable first line for a paragraph.  */
    257 
    258   first_indent = in_column;
    259   word_limit = words;
    260   c = get_line (c);
    261 
    262   /* Read rest of paragraph.  */
    263 
    264   other_indent = in_column;
    265   while (same_para (c) && in_column == other_indent)
    266     c = get_line (c);
    267 
    268   (word_limit - 1)->period = (word_limit - 1)->final = true;
    269   next_char = c;
    270   return true;
    271 }
    272 
    273 /* Copy to the output a blank line.  In the latter, C is \n or EOF.
    274    Return the character (\n or EOF) ending the line.  */
    275 
    276 static int
    277 copy_rest (register int c)
    278 {
    279   out_column = 0;
    280   while (c != '\n' && c != EOF)
    281     {
    282       putc (c, outfile);
    283       c = GETC();
    284     }
    285   return c;
    286 }
    287 
    288 /* Return true if a line whose first non-blank character after the
    289    prefix (if any) is C could belong to the current paragraph,
    290    otherwise false.  */
    291 
    292 static bool
    293 same_para (register int c)
    294 {
    295   return (c != '\n' && c != EOF);
    296 }
    297 
    298 /* Read a line from the input data given first non-blank character C
    299    after the prefix, and the following indent, and break it into words.
    300    A word is a maximal non-empty string of non-white characters.  A word
    301    ending in [.?!]["')\]]* and followed by end-of-line or at least two
    302    spaces ends a sentence, as in emacs.
    303 
    304    Return the first non-blank character of the next line.  */
    305 
    306 static int
    307 get_line (register int c)
    308 {
    309   int start;
    310   register WORD *end_of_word;
    311 
    312   end_of_word = &words[MAXWORDS - 2];
    313 
    314   do
    315     {				/* for each word in a line */
    316 
    317       /* Scan word.  */
    318 
    319       word_limit->text = parabuf - 1;
    320       do
    321 	c = GETC();
    322       while (c != EOF && !ISSPACE (c));
    323       word_limit->length = parabuf - word_limit->text - (c != EOF);
    324       in_column += word_limit->length;
    325 
    326       check_punctuation (word_limit);
    327 
    328       /* Scan inter-word space.  */
    329 
    330       start = in_column;
    331       c = get_space (c);
    332       word_limit->space = in_column - start;
    333       word_limit->final = (c == EOF
    334 			   || (word_limit->period
    335 			       && (c == '\n' || word_limit->space > 1)));
    336       if (c == '\n' || c == EOF)
    337 	word_limit->space = word_limit->final ? 2 : 1;
    338       if (word_limit == end_of_word)
    339 	flush_paragraph ();
    340       word_limit++;
    341       if (c == EOF)
    342 	{
    343 	  in_column = first_indent;
    344 	  return EOF;
    345 	}
    346     }
    347   while (c != '\n');
    348 
    349   in_column = 0;
    350   c = GETC();
    351   return get_space (c);
    352 }
    353 
    354 /* Read blank characters from the input data, starting with C, and keeping
    355    in_column up-to-date.  Return first non-blank character.  */
    356 
    357 static int
    358 get_space (register int c)
    359 {
    360   for (;;)
    361     {
    362       if (c == ' ')
    363 	in_column++;
    364       else if (c == '\t')
    365 	in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
    366       else
    367 	return c;
    368       c = GETC();
    369     }
    370 }
    371 
    372 /* Set extra fields in word W describing any attached punctuation.  */
    373 
    374 static void
    375 check_punctuation (register WORD *w)
    376 {
    377   register const char *start, *finish;
    378 
    379   start = w->text;
    380   finish = start + (w->length - 1);
    381   w->paren = isopen (*start);
    382   w->punct = ISPUNCT (*finish);
    383   while (isclose (*finish) && finish > start)
    384     finish--;
    385   w->period = isperiod (*finish);
    386 }
    387 
    388 /* Flush part of the paragraph to make room.  This function is called on
    389    hitting the limit on the number of words or characters.  */
    390 
    391 static void
    392 flush_paragraph (void)
    393 {
    394   WORD *split_point;
    395   register WORD *w;
    396   COST best_break;
    397 
    398   /* - format what you have so far as a paragraph,
    399      - find a low-cost line break near the end,
    400      - output to there,
    401      - make that the start of the paragraph.  */
    402 
    403   fmt_paragraph ();
    404 
    405   /* Choose a good split point.  */
    406 
    407   split_point = word_limit;
    408   best_break = MAXCOST;
    409   for (w = words->next_break; w != word_limit; w = w->next_break)
    410     {
    411       if (w->best_cost - w->next_break->best_cost < best_break)
    412 	{
    413 	  split_point = w;
    414 	  best_break = w->best_cost - w->next_break->best_cost;
    415 	}
    416       if (best_break <= MAXCOST - LINE_CREDIT)
    417 	best_break += LINE_CREDIT;
    418     }
    419   put_paragraph (split_point);
    420 
    421   /* Copy words from split_point down to word -- we use memmove because
    422      the source and target may overlap.  */
    423 
    424   memmove ((char *) words, (char *) split_point,
    425 	 (word_limit - split_point + 1) * sizeof (WORD));
    426   word_limit -= split_point - words;
    427 }
    428 
    429 /* Compute the optimal formatting for the whole paragraph by computing
    430    and remembering the optimal formatting for each suffix from the empty
    431    one to the whole paragraph.  */
    432 
    433 static void
    434 fmt_paragraph (void)
    435 {
    436   register WORD *start, *w;
    437   register int len;
    438   register COST wcost, best;
    439   int saved_length;
    440 
    441   word_limit->best_cost = 0;
    442   saved_length = word_limit->length;
    443   word_limit->length = max_width;	/* sentinel */
    444 
    445   for (start = word_limit - 1; start >= words; start--)
    446     {
    447       best = MAXCOST;
    448       len = start == words ? first_indent : other_indent;
    449 
    450       /* At least one word, however long, in the line.  */
    451 
    452       w = start;
    453       len += w->length;
    454       do
    455 	{
    456 	  w++;
    457 
    458 	  /* Consider breaking before w.  */
    459 
    460 	  wcost = line_cost (w, len) + w->best_cost;
    461 	  if (start == words && last_line_length > 0)
    462 	    wcost += RAGGED_COST (len - last_line_length);
    463 	  if (wcost < best)
    464 	    {
    465 	      best = wcost;
    466 	      start->next_break = w;
    467 	      start->line_length = len;
    468 	    }
    469 	  len += (w - 1)->space + w->length;	/* w > start >= words */
    470 	}
    471       while (len < max_width);
    472       start->best_cost = best + base_cost (start);
    473     }
    474 
    475   word_limit->length = saved_length;
    476 }
    477 
    478 /* Return the constant component of the cost of breaking before the
    479    word THIS.  */
    480 
    481 static COST
    482 base_cost (register WORD *this)
    483 {
    484   register COST cost;
    485 
    486   cost = LINE_COST;
    487 
    488   if (this > words)
    489     {
    490       if ((this - 1)->period)
    491 	{
    492 	  if ((this - 1)->final)
    493 	    cost -= SENTENCE_BONUS;
    494 	  else
    495 	    cost += NOBREAK_COST;
    496 	}
    497       else if ((this - 1)->punct)
    498 	cost -= PUNCT_BONUS;
    499       else if (this > words + 1 && (this - 2)->final)
    500 	cost += WIDOW_COST ((this - 1)->length);
    501     }
    502 
    503   if (this->paren)
    504     cost -= PAREN_BONUS;
    505   else if (this->final)
    506     cost += ORPHAN_COST (this->length);
    507 
    508   return cost;
    509 }
    510 
    511 /* Return the component of the cost of breaking before word NEXT that
    512    depends on LEN, the length of the line beginning there.  */
    513 
    514 static COST
    515 line_cost (register WORD *next, register int len)
    516 {
    517   register int n;
    518   register COST cost;
    519 
    520   if (next == word_limit)
    521     return 0;
    522   n = best_width - len;
    523   cost = SHORT_COST (n);
    524   if (next->next_break != word_limit)
    525     {
    526       n = len - next->line_length;
    527       cost += RAGGED_COST (n);
    528     }
    529   return cost;
    530 }
    531 
    532 /* Output to stdout a paragraph from word up to (but not including)
    533    FINISH, which must be in the next_break chain from word.  */
    534 
    535 static void
    536 put_paragraph (register WORD *finish)
    537 {
    538   register WORD *w;
    539 
    540   put_line (words, first_indent);
    541   for (w = words->next_break; w != finish; w = w->next_break)
    542     put_line (w, other_indent);
    543 }
    544 
    545 /* Output to stdout the line beginning with word W, beginning in column
    546    INDENT, including the prefix (if any).  */
    547 
    548 static void
    549 put_line (register WORD *w, int indent)
    550 {
    551   register WORD *endline;
    552   out_column = 0;
    553   put_space (indent);
    554 
    555   endline = w->next_break - 1;
    556   for (; w != endline; w++)
    557     {
    558       put_word (w);
    559       put_space (w->space);
    560     }
    561   put_word (w);
    562   last_line_length = out_column;
    563   putc ('\n', outfile);
    564 }
    565 
    566 /* Output to stdout the word W.  */
    567 
    568 static void
    569 put_word (register WORD *w)
    570 {
    571   register const char *s;
    572   register int n;
    573 
    574   s = w->text;
    575   for (n = w->length; n != 0; n--)
    576     putc (*s++, outfile);
    577   out_column += w->length;
    578 }
    579 
    580 /* Output to stdout SPACE spaces, or equivalent tabs.  */
    581 
    582 static void
    583 put_space (int space)
    584 {
    585   out_column += space;
    586   while (space--)
    587     putc (' ', outfile);
    588 }
    589