Home | History | Annotate | Download | only in posix
      1 /* sed.c - stream editor. Thing that does s/// and other stuff.
      2  *
      3  * Copyright 2014 Rob Landley <rob (at) landley.net>
      4  *
      5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
      6  *
      7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
      8  * but N and s///
      9  * TODO: make y// handle unicode
     10  * TODO: handle error return from emit(), error_msg/exit consistently
     11  *       What's the right thing to do for -i when write fails? Skip to next?
     12 
     13 USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
     14 
     15 config SED
     16   bool "sed"
     17   default y
     18   help
     19     usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
     20 
     21     Stream editor. Apply one or more editing SCRIPTs to each line of input
     22     (from FILE or stdin) producing output (by default to stdout).
     23 
     24     -e	add SCRIPT to list
     25     -f	add contents of SCRIPT_FILE to list
     26     -i	Edit each file in place.
     27     -n	No default output. (Use the p command to output matched lines.)
     28     -r	Use extended regular expression syntax.
     29     -E	Alias for -r.
     30     -s	Treat input files separately (implied by -i)
     31 
     32     A SCRIPT is a series of one or more COMMANDs separated by newlines or
     33     semicolons. All -e SCRIPTs are concatenated together as if separated
     34     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
     35     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
     36 
     37     Each COMMAND may be preceded by an address which limits the command to
     38     apply only to the specified line(s). Commands without an address apply to
     39     every line. Addresses are of the form:
     40 
     41       [ADDRESS[,ADDRESS]]COMMAND
     42 
     43     The ADDRESS may be a decimal line number (starting at 1), a /regular
     44     expression/ within a pair of forward slashes, or the character "$" which
     45     matches the last line of input. (In -s or -i mode this matches the last
     46     line of each file, otherwise just the last line of the last file.) A single
     47     address matches one line, a pair of comma separated addresses match
     48     everything from the first address to the second address (inclusive). If
     49     both addresses are regular expressions, more than one range of lines in
     50     each file can match.
     51 
     52     REGULAR EXPRESSIONS in sed are started and ended by the same character
     53     (traditionally / but anything except a backslash or a newline works).
     54     Backslashes may be used to escape the delimiter if it occurs in the
     55     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
     56     and unicode). An empty regex repeats the previous one. ADDRESS regexes
     57     (above) require the first delimiter to be escaped with a backslash when
     58     it isn't a forward slash (to distinguish it from the COMMANDs below).
     59 
     60     Sed mostly operates on individual lines one at a time. It reads each line,
     61     processes it, and either writes it to the output or discards it before
     62     reading the next line. Sed can remember one additional line in a separate
     63     buffer (using the h, H, g, G, and x commands), and can read the next line
     64     of input early (using the n and N command), but other than that command
     65     scripts operate on individual lines of text.
     66 
     67     Each COMMAND starts with a single character. The following commands take
     68     no arguments:
     69 
     70       {  Start a new command block, continuing until a corresponding "}".
     71          Command blocks may nest. If the block has an address, commands within
     72          the block are only run for lines within the block's address range.
     73 
     74       }  End command block (this command cannot have an address)
     75 
     76       d  Delete this line and move on to the next one
     77          (ignores remaining COMMANDs)
     78 
     79       D  Delete one line of input and restart command SCRIPT (same as "d"
     80          unless you've glued lines together with "N" or similar)
     81 
     82       g  Get remembered line (overwriting current line)
     83 
     84       G  Get remembered line (appending to current line)
     85 
     86       h  Remember this line (overwriting remembered line)
     87 
     88       H  Remember this line (appending to remembered line, if any)
     89 
     90       l  Print line, escaping \abfrtv (but not newline), octal escaping other
     91          nonprintable characters, wrapping lines to terminal width with a
     92          backslash, and appending $ to actual end of line.
     93 
     94       n  Print default output and read next line, replacing current line
     95          (If no next line available, quit processing script)
     96 
     97       N  Append next line of input to this line, separated by a newline
     98          (This advances the line counter for address matching and "=", if no
     99          next line available quit processing script without default output)
    100 
    101       p  Print this line
    102 
    103       P  Print this line up to first newline (from "N")
    104 
    105       q  Quit (print default output, no more commands processed or lines read)
    106 
    107       x  Exchange this line with remembered line (overwrite in both directions)
    108 
    109       =  Print the current line number (followed by a newline)
    110 
    111     The following commands (may) take an argument. The "text" arguments (to
    112     the "a", "i", and "c" commands) may end with an unescaped "\" to append
    113     the next line (for which leading whitespace is not skipped), and also
    114     treat ";" as a literal character (use "\;" instead).
    115 
    116       a [text]   Append text to output before attempting to read next line
    117 
    118       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
    119 
    120       c [text]   Delete line, output text at end of matching address range
    121                  (ignores remaining COMMANDs)
    122 
    123       i [text]   Print text
    124 
    125       r [file]   Append contents of file to output before attempting to read
    126                  next line.
    127 
    128       s/S/R/F    Search for regex S, replace matched text with R using flags F.
    129                  The first character after the "s" (anything but newline or
    130                  backslash) is the delimiter, escape with \ to use normally.
    131 
    132                  The replacement text may contain "&" to substitute the matched
    133                  text (escape it with backslash for a literal &), or \1 through
    134                  \9 to substitute a parenthetical subexpression in the regex.
    135                  You can also use the normal backslash escapes such as \n and
    136                  a backslash at the end of the line appends the next line.
    137 
    138                  The flags are:
    139 
    140                  [0-9]    A number, substitute only that occurrence of pattern
    141                  g        Global, substitute all occurrences of pattern
    142                  i        Ignore case when matching
    143                  p        Print the line if match was found and replaced
    144                  w [file] Write (append) line to file if match replaced
    145 
    146       t [label]  Test, jump to :label only if an "s" command found a match in
    147                  this line since last test (replacing with same text counts)
    148 
    149       T [label]  Test false, jump only if "s" hasn't found a match.
    150 
    151       w [file]   Write (append) line to file
    152 
    153       y/old/new/ Change each character in 'old' to corresponding character
    154                  in 'new' (with standard backslash escapes, delimiter can be
    155                  any repeated character except \ or \n)
    156 
    157       : [label]  Labeled target for jump commands
    158 
    159       #  Comment, ignore rest of this line of SCRIPT
    160 
    161     Deviations from posix: allow extended regular expressions with -r,
    162     editing in place with -i, separate with -s, printf escapes in text, line
    163     continuations, semicolons after all commands, 2-address anywhere an
    164     address is allowed, "T" command, multiline continuations for [abc],
    165     \; to end [abc] argument before end of line.
    166 */
    167 
    168 #define FOR_sed
    169 #include "toys.h"
    170 
    171 GLOBALS(
    172   struct arg_list *f;
    173   struct arg_list *e;
    174 
    175   // processed pattern list
    176   struct double_list *pattern;
    177 
    178   char *nextline, *remember;
    179   void *restart, *lastregex;
    180   long nextlen, rememberlen, count;
    181   int fdout, noeol;
    182   unsigned xx;
    183 )
    184 
    185 // Linked list of parsed sed commands. Offset fields indicate location where
    186 // regex or string starts, ala offset+(char *)struct, because we remalloc()
    187 // these to expand them for multiline inputs, and pointers would have to be
    188 // individually adjusted.
    189 
    190 struct sedcmd {
    191   struct sedcmd *next, *prev;
    192 
    193   // Begin and end of each match
    194   long lmatch[2]; // line number of match
    195   int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
    196   int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
    197   unsigned not, hit;
    198   unsigned sflags; // s///flag bits: i=1, g=2, p=4
    199   char c; // action
    200 };
    201 
    202 // Write out line with potential embedded NUL, handling eol/noeol
    203 static int emit(char *line, long len, int eol)
    204 {
    205   int l, old = line[len];
    206 
    207   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
    208   TT.noeol = !eol;
    209   if (eol) line[len++] = '\n';
    210   if (!len) return 0;
    211   l = writeall(TT.fdout, line, len);
    212   if (eol) line[len-1] = old;
    213   if (l != len) {
    214     perror_msg("short write");
    215 
    216     return 1;
    217   }
    218 
    219   return 0;
    220 }
    221 
    222 // Extend allocation to include new string, with newline between if newlen<0
    223 
    224 static char *extend_string(char **old, char *new, int oldlen, int newlen)
    225 {
    226   int newline = newlen < 0;
    227   char *s;
    228 
    229   if (newline) newlen = -newlen;
    230   s = *old = xrealloc(*old, oldlen+newlen+newline+1);
    231   if (newline) s[oldlen++] = '\n';
    232   memcpy(s+oldlen, new, newlen);
    233   s[oldlen+newlen] = 0;
    234 
    235   return s+oldlen+newlen+1;
    236 }
    237 
    238 // An empty regex repeats the previous one
    239 static void *get_regex(void *trump, int offset)
    240 {
    241   if (!offset) {
    242     if (!TT.lastregex) error_exit("no previous regex");
    243     return TT.lastregex;
    244   }
    245 
    246   return TT.lastregex = offset+(char *)trump;
    247 }
    248 
    249 // Apply pattern to line from input file
    250 static void process_line(char **pline, long plen)
    251 {
    252   struct append {
    253     struct append *next, *prev;
    254     int file;
    255     char *str;
    256   } *append = 0;
    257   char *line = TT.nextline;
    258   long len = TT.nextlen;
    259   struct sedcmd *command;
    260   int eol = 0, tea = 0;
    261 
    262   // Grab next line for deferred processing (EOF detection: we get a NULL
    263   // pline at EOF to flush last line). Note that only end of _last_ input
    264   // file matches $ (unless we're doing -i).
    265   TT.nextline = 0;
    266   TT.nextlen = 0;
    267   if (pline) {
    268     TT.nextline = *pline;
    269     TT.nextlen = plen;
    270     *pline = 0;
    271   }
    272 
    273   if (!line || !len) return;
    274   if (line[len-1] == '\n') line[--len] = eol++;
    275   TT.count++;
    276 
    277   // The restart-1 is because we added one to make sure it wasn't NULL,
    278   // otherwise N as last command would restart script
    279   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
    280   TT.restart = 0;
    281 
    282   while (command) {
    283     char *str, c = command->c;
    284 
    285     // Have we got a line or regex matching range for this rule?
    286     if (*command->lmatch || *command->rmatch) {
    287       int miss = 0;
    288       long lm;
    289 
    290       // In a match that might end?
    291       if (command->hit) {
    292         if (!(lm = command->lmatch[1])) {
    293           if (!command->rmatch[1]) command->hit = 0;
    294           else {
    295             void *rm = get_regex(command, command->rmatch[1]);
    296 
    297             // regex match end includes matching line, so defer deactivation
    298             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
    299           }
    300         } else if (lm > 0 && lm < TT.count) command->hit = 0;
    301 
    302       // Start a new match?
    303       } else {
    304         if (!(lm = *command->lmatch)) {
    305           void *rm = get_regex(command, *command->rmatch);
    306 
    307           if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
    308         } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
    309 
    310         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
    311       }
    312 
    313       // Didn't match?
    314       lm = !(command->hit ^ command->not);
    315 
    316       // Deferred disable from regex end match
    317       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
    318 
    319       if (lm) {
    320         // Handle skipping curly bracket command group
    321         if (c == '{') {
    322           int curly = 1;
    323 
    324           while (curly) {
    325             command = command->next;
    326             if (command->c == '{') curly++;
    327             if (command->c == '}') curly--;
    328           }
    329         }
    330         command = command->next;
    331         continue;
    332       }
    333     }
    334 
    335     // A deleted line can still update line match state for later commands
    336     if (!line) {
    337       command = command->next;
    338       continue;
    339     }
    340 
    341     // Process command
    342 
    343     if (c=='a' || c=='r') {
    344       struct append *a = xzalloc(sizeof(struct append));
    345       if (command->arg1) a->str = command->arg1+(char *)command;
    346       a->file = c=='r';
    347       dlist_add_nomalloc((void *)&append, (void *)a);
    348     } else if (c=='b' || c=='t' || c=='T') {
    349       int t = tea;
    350 
    351       if (c != 'b') tea = 0;
    352       if (c=='b' || t^(c=='T')) {
    353         if (!command->arg1) break;
    354         str = command->arg1+(char *)command;
    355         for (command = (void *)TT.pattern; command; command = command->next)
    356           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
    357             break;
    358         if (!command) error_exit("no :%s", str);
    359       }
    360     } else if (c=='c') {
    361       str = command->arg1+(char *)command;
    362       if (!command->hit) emit(str, strlen(str), 1);
    363       free(line);
    364       line = 0;
    365       continue;
    366     } else if (c=='d') {
    367       free(line);
    368       line = 0;
    369       continue;
    370     } else if (c=='D') {
    371       // Delete up to \n or end of buffer
    372       str = line;
    373       while ((str-line)<len) if (*(str++) == '\n') break;
    374       len -= str - line;
    375       memmove(line, str, len);
    376 
    377       // if "delete" blanks line, disable further processing
    378       // otherwise trim and restart script
    379       if (!len) {
    380         free(line);
    381         line = 0;
    382       } else {
    383         line[len] = 0;
    384         command = (void *)TT.pattern;
    385       }
    386       continue;
    387     } else if (c=='g') {
    388       free(line);
    389       line = xstrdup(TT.remember);
    390       len = TT.rememberlen;
    391     } else if (c=='G') {
    392       line = xrealloc(line, len+TT.rememberlen+2);
    393       line[len++] = '\n';
    394       memcpy(line+len, TT.remember, TT.rememberlen);
    395       line[len += TT.rememberlen] = 0;
    396     } else if (c=='h') {
    397       free(TT.remember);
    398       TT.remember = xstrdup(line);
    399       TT.rememberlen = len;
    400     } else if (c=='H') {
    401       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
    402       TT.remember[TT.rememberlen++] = '\n';
    403       memcpy(TT.remember+TT.rememberlen, line, len);
    404       TT.remember[TT.rememberlen += len] = 0;
    405     } else if (c=='i') {
    406       str = command->arg1+(char *)command;
    407       emit(str, strlen(str), 1);
    408     } else if (c=='l') {
    409       int i, x, off;
    410 
    411       if (!TT.xx) {
    412         terminal_size(&TT.xx, 0);
    413         if (!TT.xx) TT.xx = 80;
    414         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
    415         if (TT.xx > 4) TT.xx -= 4;
    416       }
    417 
    418       for (i = off = 0; i<len; i++) {
    419         if (off >= TT.xx) {
    420           toybuf[off++] = '\\';
    421           emit(toybuf, off, 1);
    422           off = 0;
    423         }
    424         x = stridx("\\\a\b\f\r\t\v", line[i]);
    425         if (x != -1) {
    426           toybuf[off++] = '\\';
    427           toybuf[off++] = "\\abfrtv"[x];
    428         } else if (line[i] >= ' ') toybuf[off++] = line[i];
    429         else off += sprintf(toybuf+off, "\\%03o", line[i]);
    430       }
    431       toybuf[off++] = '$';
    432       emit(toybuf, off, 1);
    433     } else if (c=='n') {
    434       TT.restart = command->next+1;
    435 
    436       break;
    437     } else if (c=='N') {
    438       // Can't just grab next line because we could have multiple N and
    439       // we need to actually read ahead to get N;$p EOF detection right.
    440       if (pline) {
    441         TT.restart = command->next+1;
    442         extend_string(&line, TT.nextline, len, -TT.nextlen);
    443         free(TT.nextline);
    444         TT.nextline = line;
    445         TT.nextlen += len + 1;
    446         line = 0;
    447       }
    448 
    449       // Pending append goes out right after N
    450       goto done;
    451     } else if (c=='p' || c=='P') {
    452       char *l = (c=='P') ? strchr(line, '\n') : 0;
    453 
    454       if (emit(line, l ? l-line : len, eol)) break;
    455     } else if (c=='q') {
    456       if (pline) *pline = (void *)1;
    457       free(TT.nextline);
    458       TT.nextline = 0;
    459       TT.nextlen = 0;
    460 
    461       break;
    462     } else if (c=='s') {
    463       char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
    464       regmatch_t *match = (void *)toybuf;
    465       regex_t *reg = get_regex(command, command->arg1);
    466       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
    467 
    468       // Find match in remaining line (up to remaining len)
    469       while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
    470         mflags = REG_NOTBOL;
    471 
    472         // Zero length matches don't count immediately after a previous match
    473         mlen = match[0].rm_eo-match[0].rm_so;
    474         if (!mlen && !zmatch) {
    475           if (!rlen--) break;
    476           rline++;
    477           zmatch++;
    478           continue;
    479         } else zmatch = 0;
    480 
    481         // If we're replacing only a specific match, skip if this isn't it
    482         off = command->sflags>>3;
    483         if (off && off != ++count) {
    484           rline += match[0].rm_eo;
    485           rlen -= match[0].rm_eo;
    486 
    487           continue;
    488         }
    489         // The fact getline() can allocate unbounded amounts of memory is
    490         // a bigger issue, but while we're here check for integer overflow
    491         if (match[0].rm_eo > INT_MAX) perror_exit(0);
    492 
    493         // newlen = strlen(new) but with \1 and & and printf escapes
    494         for (off = newlen = 0; new[off]; off++) {
    495           int cc = -1;
    496 
    497           if (new[off] == '&') cc = 0;
    498           else if (new[off] == '\\') cc = new[++off] - '0';
    499           if (cc < 0 || cc > 9) {
    500             newlen++;
    501             continue;
    502           }
    503           newlen += match[cc].rm_eo-match[cc].rm_so;
    504         }
    505 
    506         // Allocate new size, copy start/end around match. (Can't extend in
    507         // place because backrefs may refer to text after it's overwritten.)
    508         len += newlen-mlen;
    509         swap = xmalloc(len+1);
    510         rswap = swap+(rline-line)+match[0].rm_so;
    511         memcpy(swap, line, (rline-line)+match[0].rm_so);
    512         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
    513 
    514         // copy in new replacement text
    515         for (off = mlen = 0; new[off]; off++) {
    516           int cc = 0, ll;
    517 
    518           if (new[off] == '\\') {
    519             cc = new[++off] - '0';
    520             if (cc<0 || cc>9) {
    521               if (!(rswap[mlen++] = unescape(new[off])))
    522                 rswap[mlen-1] = new[off];
    523 
    524               continue;
    525             } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
    526           } else if (new[off] != '&') {
    527             rswap[mlen++] = new[off];
    528 
    529             continue;
    530           }
    531 
    532           ll = match[cc].rm_eo-match[cc].rm_so;
    533           memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
    534           mlen += ll;
    535         }
    536 
    537         rline = rswap+newlen;
    538         free(line);
    539         line = swap;
    540 
    541         // Stop after first substitution unless we have flag g
    542         if (!(command->sflags & 2)) break;
    543       }
    544 
    545       if (mflags) {
    546         // flag p
    547         if (command->sflags & 4) emit(line, len, eol);
    548 
    549         tea = 1;
    550         if (command->w) goto writenow;
    551       }
    552     } else if (c=='w') {
    553       int fd, noeol;
    554       char *name;
    555 
    556 writenow:
    557       // Swap out emit() context
    558       fd = TT.fdout;
    559       noeol = TT.noeol;
    560 
    561       // We save filehandle and newline status before filename
    562       name = command->w + (char *)command;
    563       memcpy(&TT.fdout, name, 4);
    564       name += 4;
    565       TT.noeol = *(name++);
    566 
    567       // write, then save/restore context
    568       if (emit(line, len, eol))
    569         perror_exit("w '%s'", command->arg1+(char *)command);
    570       *(--name) = TT.noeol;
    571       TT.noeol = noeol;
    572       TT.fdout = fd;
    573     } else if (c=='x') {
    574       long swap = TT.rememberlen;
    575 
    576       str = TT.remember;
    577       TT.remember = line;
    578       line = str;
    579       TT.rememberlen = len;
    580       len = swap;
    581     } else if (c=='y') {
    582       char *from, *to = (char *)command;
    583       int i, j;
    584 
    585       from = to+command->arg1;
    586       to += command->arg2;
    587 
    588       for (i = 0; i < len; i++) {
    589         j = stridx(from, line[i]);
    590         if (j != -1) line[i] = to[j];
    591       }
    592     } else if (c=='=') {
    593       sprintf(toybuf, "%ld", TT.count);
    594       emit(toybuf, strlen(toybuf), 1);
    595     }
    596 
    597     command = command->next;
    598   }
    599 
    600   if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
    601 
    602 done:
    603   if (dlist_terminate(append)) while (append) {
    604     struct append *a = append->next;
    605 
    606     if (append->file) {
    607       int fd = open(append->str, O_RDONLY);
    608 
    609       // Force newline if noeol pending
    610       if (fd != -1) {
    611         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
    612         TT.noeol = 0;
    613         xsendfile(fd, TT.fdout);
    614         close(fd);
    615       }
    616     } else if (append->str) emit(append->str, strlen(append->str), 1);
    617     else emit(line, 0, 0);
    618     free(append);
    619     append = a;
    620   }
    621   free(line);
    622 }
    623 
    624 // Callback called on each input file
    625 static void do_sed(int fd, char *name)
    626 {
    627   int i = toys.optflags & FLAG_i;
    628   char *tmp;
    629 
    630   if (i) {
    631     struct sedcmd *command;
    632 
    633     if (!fd && !strcmp(name, "-")) {
    634       error_msg("-i on stdin");
    635       return;
    636     }
    637     TT.fdout = copy_tempfile(fd, name, &tmp);
    638     TT.count = 0;
    639     for (command = (void *)TT.pattern; command; command = command->next)
    640       command->hit = 0;
    641   }
    642   do_lines(fd, process_line);
    643   if (i) {
    644     process_line(0, 0);
    645     replace_tempfile(-1, TT.fdout, &tmp);
    646     TT.fdout = 1;
    647     TT.nextline = 0;
    648     TT.nextlen = TT.noeol = 0;
    649   }
    650 }
    651 
    652 // Copy chunk of string between two delimiters, converting printf escapes.
    653 // returns processed copy of string (0 if error), *pstr advances to next
    654 // unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
    655 // if regxex, ignore delimiter in [ranges]
    656 static char *unescape_delimited_string(char **pstr, char *delim)
    657 {
    658   char *to, *from, mode = 0, d;
    659 
    660   // Grab leading delimiter (if necessary), allocate space for new string
    661   from = *pstr;
    662   if (!delim || !*delim) {
    663     if (!(d = *(from++))) return 0;
    664     if (d == '\\') d = *(from++);
    665     if (!d || d == '\\') return 0;
    666     if (delim) *delim = d;
    667   } else d = *delim;
    668   to = delim = xmalloc(strlen(*pstr)+1);
    669 
    670   while (mode || *from != d) {
    671     if (!*from) return 0;
    672 
    673     // delimiter in regex character range doesn't count
    674     if (*from == '[') {
    675       if (!mode) {
    676         mode = ']';
    677         if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
    678       } else if (mode == ']' && strchr(".=:", from[1])) {
    679         *(to++) = *(from++);
    680         mode = *from;
    681       }
    682     } else if (*from == mode) {
    683       if (mode == ']') mode = 0;
    684       else {
    685         *(to++) = *(from++);
    686         mode = ']';
    687       }
    688     // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    689     // but the perl build does it, so we need to filter it out.
    690     } else if (mode && *from == '-' && from[-1] == from[1]) {
    691       from+=2;
    692       continue;
    693     } else if (*from == '\\') {
    694       if (!from[1]) return 0;
    695 
    696       // Check escaped end delimiter before printf style escapes.
    697       if (from[1] == d) from++;
    698       else if (from[1]=='\\') *(to++) = *(from++);
    699       else {
    700         char c = unescape(from[1]);
    701 
    702         if (c) {
    703           *(to++) = c;
    704           from+=2;
    705           continue;
    706         } else if (!mode) *(to++) = *(from++);
    707       }
    708     }
    709     *(to++) = *(from++);
    710   }
    711   *to = 0;
    712   *pstr = from+1;
    713 
    714   return delim;
    715 }
    716 
    717 // Translate pattern strings into command structures. Each command structure
    718 // is a single allocation (which requires some math and remalloc at times).
    719 static void parse_pattern(char **pline, long len)
    720 {
    721   struct sedcmd *command = (void *)TT.pattern;
    722   char *line, *reg, c, *errstart;
    723   int i;
    724 
    725   line = errstart = pline ? *pline : "";
    726   if (len && line[len-1]=='\n') line[--len] = 0;
    727 
    728   // Append this line to previous multiline command? (hit indicates type.)
    729   // During parsing "hit" stores data about line continuations, but in
    730   // process_line() it means the match range attached to this command
    731   // is active, so processing the continuation must zero it again.
    732   if (command && command->prev->hit) {
    733     // Remove half-finished entry from list so remalloc() doesn't confuse it
    734     TT.pattern = TT.pattern->prev;
    735     command = dlist_pop(&TT.pattern);
    736     c = command->c;
    737     reg = (char *)command;
    738     reg += command->arg1 + strlen(reg + command->arg1);
    739 
    740     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
    741     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
    742     // a unicode character.
    743     if (command->hit < 256) goto resume_s;
    744     else goto resume_a;
    745   }
    746 
    747   // Loop through commands in this line.
    748 
    749   command = 0;
    750   for (;;) {
    751     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
    752 
    753     // If there's no more data on this line, return.
    754     for (;;) {
    755       while (isspace(*line) || *line == ';') line++;
    756       if (*line == '#') while (*line && *line != '\n') line++;
    757       else break;
    758     }
    759     if (!*line) return;
    760 
    761     // We start by writing data into toybuf. Later we'll allocate the
    762     // exact size needed and copy the data over.
    763 
    764     errstart = line;
    765     memset(toybuf, 0, sizeof(struct sedcmd));
    766     command = (void *)toybuf;
    767     reg = toybuf + sizeof(struct sedcmd);
    768 
    769     // Parse address range (if any)
    770     for (i = 0; i < 2; i++) {
    771       if (*line == ',') line++;
    772       else if (i) break;
    773 
    774       if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
    775       else if (*line == '$') {
    776         command->lmatch[i] = -1;
    777         line++;
    778       } else if (*line == '/' || *line == '\\') {
    779         char *s = line;
    780 
    781         if (!(s = unescape_delimited_string(&line, 0))) goto error;
    782         if (!*s) command->rmatch[i] = 0;
    783         else {
    784           xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
    785           command->rmatch[i] = reg-toybuf;
    786           reg += sizeof(regex_t);
    787         }
    788         free(s);
    789       } else break;
    790     }
    791 
    792     while (isspace(*line)) line++;
    793     if (!*line) break;
    794 
    795     while (*line == '!') {
    796       command->not = 1;
    797       line++;
    798     }
    799     while (isspace(*line)) line++;
    800 
    801     c = command->c = *(line++);
    802     if (strchr("}:", c) && i) break;
    803     if (strchr("aiqr=", c) && i>1) break;
    804 
    805     // Add step to pattern
    806     command = xmemdup(toybuf, reg-toybuf);
    807     reg = (reg-toybuf) + (char *)command;
    808 
    809     // Parse arguments by command type
    810     if (c == '{') TT.nextlen++;
    811     else if (c == '}') {
    812       if (!TT.nextlen--) break;
    813     } else if (c == 's') {
    814       char *end, delim = 0;
    815 
    816       // s/pattern/replacement/flags
    817 
    818       // line continuations use arg1 (back at the start of the function),
    819       // so let's fill out arg2 first (since the regex part can't be multiple
    820       // lines) and swap them back later.
    821 
    822       // get pattern (just record, we parse it later)
    823       command->arg2 = reg - (char *)command;
    824       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
    825         goto error;
    826 
    827       reg += sizeof(regex_t);
    828       command->arg1 = reg-(char *)command;
    829       command->hit = delim;
    830 resume_s:
    831       // get replacement - don't replace escapes yet because \1 and \& need
    832       // processing later, after we replace \\ with \ we can't tell \\1 from \1
    833       end = line;
    834       while (*end != command->hit) {
    835         if (!*end) goto error;
    836         if (*end++ == '\\') {
    837           if (!*end || *end == '\n') {
    838             end[-1] = '\n';
    839             break;
    840           }
    841           end++;
    842         }
    843       }
    844 
    845       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
    846       line = end;
    847       // line continuation? (note: '\n' can't be a valid delim).
    848       if (*line == command->hit) command->hit = 0;
    849       else {
    850         if (!*line) continue;
    851         reg--;
    852         line++;
    853         goto resume_s;
    854       }
    855 
    856       // swap arg1/arg2 so they're back in order arguments occur.
    857       i = command->arg1;
    858       command->arg1 = command->arg2;
    859       command->arg2 = i;
    860 
    861       // get flags
    862       for (line++; *line; line++) {
    863         long l;
    864 
    865         if (isspace(*line) && *line != '\n') continue;
    866 
    867         if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
    868         else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
    869           command->sflags |= l << 3;
    870           line--;
    871         } else break;
    872       }
    873 
    874       // We deferred actually parsing the regex until we had the s///i flag
    875       // allocating the space was done by extend_string() above
    876       if (!*TT.remember) command->arg1 = 0;
    877       else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
    878         ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
    879       free(TT.remember);
    880       TT.remember = 0;
    881       if (*line == 'w') {
    882         line++;
    883         goto writenow;
    884       }
    885     } else if (c == 'w') {
    886       int fd, delim;
    887       char *cc;
    888 
    889       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
    890       // eol status, and to retain the filename for error messages, we'd need
    891       // to go up to arg5 just for this. Compromise: dynamically allocate the
    892       // filehandle and eol status.
    893 
    894 writenow:
    895       while (isspace(*line)) line++;
    896       if (!*line) goto error;
    897       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
    898       delim = *cc;
    899       *cc = 0;
    900       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
    901       *cc = delim;
    902 
    903       command->w = reg - (char *)command;
    904       command = xrealloc(command, command->w+(cc-line)+6);
    905       reg = command->w + (char *)command;
    906 
    907       memcpy(reg, &fd, 4);
    908       reg += 4;
    909       *(reg++) = 0;
    910       memcpy(reg, line, delim);
    911       reg += delim;
    912       *(reg++) = 0;
    913 
    914       line = cc;
    915       if (delim) line += 2;
    916     } else if (c == 'y') {
    917       char *s, delim = 0;
    918       int len;
    919 
    920       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
    921       command->arg1 = reg-(char *)command;
    922       len = strlen(s);
    923       reg = extend_string((void *)&command, s, reg-(char *)command, len);
    924       free(s);
    925       command->arg2 = reg-(char *)command;
    926       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
    927       if (len != strlen(s)) goto error;
    928       reg = extend_string((void *)&command, s, reg-(char*)command, len);
    929       free(s);
    930     } else if (strchr("abcirtTw:", c)) {
    931       int end;
    932 
    933       // trim leading spaces
    934       while (isspace(*line) && *line != '\n') line++;
    935 
    936       // Resume logic differs from 's' case because we don't add a newline
    937       // unless it's after something, so we add it on return instead.
    938 resume_a:
    939       command->hit = 0;
    940 
    941       // btT: end with space or semicolon, aicrw continue to newline.
    942       if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
    943         // Argument's optional for btT
    944         if (strchr("btT", c)) continue;
    945         else if (!command->arg1) break;
    946       }
    947 
    948       // Extend allocation to include new string. We use offsets instead of
    949       // pointers so realloc() moving stuff doesn't break things. Ok to write
    950       // \n over NUL terminator because call to extend_string() adds it back.
    951       if (!command->arg1) command->arg1 = reg - (char*)command;
    952       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
    953       else if (!pline) {
    954         command->arg1 = 0;
    955         continue;
    956       }
    957       reg = extend_string((void *)&command, line, reg - (char *)command, end);
    958 
    959       // Recopy data to remove escape sequences and handle line continuation.
    960       if (strchr("aci", c)) {
    961         reg -= end+1;
    962         for (i = end; i; i--) {
    963           if ((*reg++ = *line++)=='\\') {
    964 
    965             // escape at end of line: resume if -e escaped literal newline,
    966             // else request callback and resume with next line
    967             if (!--i) {
    968               *--reg = 0;
    969               if (*line) {
    970                 line++;
    971                 goto resume_a;
    972               }
    973               command->hit = 256;
    974               break;
    975             }
    976             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
    977             line++;
    978           }
    979         }
    980         *reg = 0;
    981       } else line += end;
    982 
    983     // Commands that take no arguments
    984     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
    985   }
    986 
    987 error:
    988   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
    989 }
    990 
    991 void sed_main(void)
    992 {
    993   struct arg_list *al;
    994   char **args = toys.optargs;
    995 
    996   // Lie to autoconf when it asks stupid questions, so configure regexes
    997   // that look for "GNU sed version %f" greater than some old buggy number
    998   // don't fail us for not matching their narrow expectations.
    999   if (toys.optflags & FLAG_version) {
   1000     xprintf("This is not GNU sed version 9.0\n");
   1001     return;
   1002   }
   1003 
   1004   // Handling our own --version means we handle our own --help too.
   1005   if (toys.optflags&FLAG_help) help_exit(0);
   1006 
   1007   // Parse pattern into commands.
   1008 
   1009   // If no -e or -f, first argument is the pattern.
   1010   if (!TT.e && !TT.f) {
   1011     if (!*toys.optargs) error_exit("no pattern");
   1012     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
   1013   }
   1014 
   1015   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
   1016   // so handle all -e, then all -f. (At least the behavior's consistent.)
   1017 
   1018   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
   1019   for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
   1020   parse_pattern(0, 0);
   1021   dlist_terminate(TT.pattern);
   1022   if (TT.nextlen) error_exit("no }");
   1023 
   1024   TT.fdout = 1;
   1025   TT.remember = xstrdup("");
   1026 
   1027   // Inflict pattern upon input files. Long version because !O_CLOEXEC
   1028   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed);
   1029 
   1030   if (!(toys.optflags & FLAG_i)) process_line(0, 0);
   1031 
   1032   // todo: need to close fd when done for TOYBOX_FREE?
   1033 }
   1034