Home | History | Annotate | Download | only in posix
      1 /* sed.c - stream editor. Thing that does s/// and other stuff.
      2  *
      3  * Copyright 2014 Rob Landley <rob (at) landley.net>
      4  *
      5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
      6  *
      7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
      8  * but N and s///
      9  * TODO: make y// handle unicode, unicode delimiters
     10  * TODO: handle error return from emit(), error_msg/exit consistently
     11  *       What's the right thing to do for -i when write fails? Skip to next?
     12  * test '//q' with no previous regex, also repeat previous regex?
     13 
     14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
     15 
     16 config SED
     17   bool "sed"
     18   default y
     19   help
     20     usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
     21 
     22     Stream editor. Apply one or more editing SCRIPTs to each line of input
     23     (from FILE or stdin) producing output (by default to stdout).
     24 
     25     -e	Add SCRIPT to list
     26     -f	Add contents of SCRIPT_FILE to list
     27     -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
     28     -n	No default output (use the p command to output matched lines)
     29     -r	Use extended regular expression syntax
     30     -E	POSIX alias for -r
     31     -s	Treat input files separately (implied by -i)
     32     -z	Use \0 rather than \n as the input line separator
     33 
     34     A SCRIPT is a series of one or more COMMANDs separated by newlines or
     35     semicolons. All -e SCRIPTs are concatenated together as if separated
     36     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
     37     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
     38 
     39     Each COMMAND may be preceded by an address which limits the command to
     40     apply only to the specified line(s). Commands without an address apply to
     41     every line. Addresses are of the form:
     42 
     43       [ADDRESS[,ADDRESS]]COMMAND
     44 
     45     The ADDRESS may be a decimal line number (starting at 1), a /regular
     46     expression/ within a pair of forward slashes, or the character "$" which
     47     matches the last line of input. (In -s or -i mode this matches the last
     48     line of each file, otherwise just the last line of the last file.) A single
     49     address matches one line, a pair of comma separated addresses match
     50     everything from the first address to the second address (inclusive). If
     51     both addresses are regular expressions, more than one range of lines in
     52     each file can match.
     53 
     54     REGULAR EXPRESSIONS in sed are started and ended by the same character
     55     (traditionally / but anything except a backslash or a newline works).
     56     Backslashes may be used to escape the delimiter if it occurs in the
     57     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
     58     and unicode). An empty regex repeats the previous one. ADDRESS regexes
     59     (above) require the first delimiter to be escaped with a backslash when
     60     it isn't a forward slash (to distinguish it from the COMMANDs below).
     61 
     62     Sed mostly operates on individual lines one at a time. It reads each line,
     63     processes it, and either writes it to the output or discards it before
     64     reading the next line. Sed can remember one additional line in a separate
     65     buffer (using the h, H, g, G, and x commands), and can read the next line
     66     of input early (using the n and N command), but other than that command
     67     scripts operate on individual lines of text.
     68 
     69     Each COMMAND starts with a single character. The following commands take
     70     no arguments:
     71 
     72       {  Start a new command block, continuing until a corresponding "}".
     73          Command blocks may nest. If the block has an address, commands within
     74          the block are only run for lines within the block's address range.
     75 
     76       }  End command block (this command cannot have an address)
     77 
     78       d  Delete this line and move on to the next one
     79          (ignores remaining COMMANDs)
     80 
     81       D  Delete one line of input and restart command SCRIPT (same as "d"
     82          unless you've glued lines together with "N" or similar)
     83 
     84       g  Get remembered line (overwriting current line)
     85 
     86       G  Get remembered line (appending to current line)
     87 
     88       h  Remember this line (overwriting remembered line)
     89 
     90       H  Remember this line (appending to remembered line, if any)
     91 
     92       l  Print line, escaping \abfrtv (but not newline), octal escaping other
     93          nonprintable characters, wrapping lines to terminal width with a
     94          backslash, and appending $ to actual end of line.
     95 
     96       n  Print default output and read next line, replacing current line
     97          (If no next line available, quit processing script)
     98 
     99       N  Append next line of input to this line, separated by a newline
    100          (This advances the line counter for address matching and "=", if no
    101          next line available quit processing script without default output)
    102 
    103       p  Print this line
    104 
    105       P  Print this line up to first newline (from "N")
    106 
    107       q  Quit (print default output, no more commands processed or lines read)
    108 
    109       x  Exchange this line with remembered line (overwrite in both directions)
    110 
    111       =  Print the current line number (followed by a newline)
    112 
    113     The following commands (may) take an argument. The "text" arguments (to
    114     the "a", "c", and "i" commands) may end with an unescaped "\" to append
    115     the next line (for which leading whitespace is not skipped), and also
    116     treat ";" as a literal character (use "\;" instead).
    117 
    118       a [text]   Append text to output before attempting to read next line
    119 
    120       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
    121 
    122       c [text]   Delete line, output text at end of matching address range
    123                  (ignores remaining COMMANDs)
    124 
    125       i [text]   Print text
    126 
    127       r [file]   Append contents of file to output before attempting to read
    128                  next line.
    129 
    130       s/S/R/F    Search for regex S, replace matched text with R using flags F.
    131                  The first character after the "s" (anything but newline or
    132                  backslash) is the delimiter, escape with \ to use normally.
    133 
    134                  The replacement text may contain "&" to substitute the matched
    135                  text (escape it with backslash for a literal &), or \1 through
    136                  \9 to substitute a parenthetical subexpression in the regex.
    137                  You can also use the normal backslash escapes such as \n and
    138                  a backslash at the end of the line appends the next line.
    139 
    140                  The flags are:
    141 
    142                  [0-9]    A number, substitute only that occurrence of pattern
    143                  g        Global, substitute all occurrences of pattern
    144                  i        Ignore case when matching
    145                  p        Print the line if match was found and replaced
    146                  w [file] Write (append) line to file if match replaced
    147 
    148       t [label]  Test, jump to :label only if an "s" command found a match in
    149                  this line since last test (replacing with same text counts)
    150 
    151       T [label]  Test false, jump only if "s" hasn't found a match.
    152 
    153       w [file]   Write (append) line to file
    154 
    155       y/old/new/ Change each character in 'old' to corresponding character
    156                  in 'new' (with standard backslash escapes, delimiter can be
    157                  any repeated character except \ or \n)
    158 
    159       : [label]  Labeled target for jump commands
    160 
    161       #  Comment, ignore rest of this line of SCRIPT
    162 
    163     Deviations from POSIX: allow extended regular expressions with -r,
    164     editing in place with -i, separate with -s, NUL-separated input with -z,
    165     printf escapes in text, line continuations, semicolons after all commands,
    166     2-address anywhere an address is allowed, "T" command, multiline
    167     continuations for [aci], \; to end [aci] argument before end of line.
    168 */
    169 
    170 #define FOR_sed
    171 #include "toys.h"
    172 
    173 GLOBALS(
    174   char *i;                     // -i argument: backup suffix appended to the file name (may be NULL/empty)
    175   struct arg_list *f, *e;      // presumably the collected -f and -e arguments (filled by option parsing)
    176 
    177   // Parsed script: list of struct sedcmd (cast from double_list where used)
    178   struct double_list *pattern;
    179 
    180   char *nextline, *remember;   // lookahead input line; hold buffer for h/H/g/G/x (also a parse-time temporary)
    181   void *restart, *lastregex;   // command to resume at after n/N (stored +1); most recently used regex
    182   long nextlen, rememberlen, count; // byte lengths of the two buffers; current input line number
    183   int fdout, noeol;            // emit() output fd; nonzero when the last emit() omitted its newline
    184   unsigned xx;                 // cached wrap width for the 'l' command (0 until first use)
    185   char delim;                  // line delimiter handed to do_lines()
    186 )
    187 
    188 // One parsed sed command, kept in a linked list. Variable-size data (compiled
    189 // regexes, argument strings) is stored after the struct in the same
    190 // allocation and located by byte offset, ala offset+(char *)struct, because
    191 // these get remalloc()ed for multiline inputs and pointers would go stale.
    192 
    193 struct sedcmd {
    194   struct sedcmd *next, *prev;
    195 
    196   // Address range: index 0 is the start address, index 1 the end address
    197   long lmatch[2]; // line number of match (-1 means "$", 0 means not a line address)
    198   int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p), 0 if none
    199   int arg1, arg2, w; // offsets of up to two arguments, plus the w record (fd, noeol byte, filename)
    200   unsigned not, hit; // not: address negated with "!"; hit: range currently active (parse-time: continuation info)
    201   unsigned sflags; // s///flag bits: i=1, g=2, p=4; occurrence number is stored in sflags>>3
    202   char c; // action (the command character)
    203 };
    204 
    205 // Write out line with potential embedded NUL, handling eol/noeol
    206 static int emit(char *line, long len, int eol)
    207 {
    208   int l, old = line[len];
    209 
    210   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
    211   TT.noeol = !eol;
    212   if (eol) line[len++] = '\n';
    213   if (!len) return 0;
    214   l = writeall(TT.fdout, line, len);
    215   if (eol) line[len-1] = old;
    216   if (l != len) {
    217     perror_msg("short write");
    218 
    219     return 1;
    220   }
    221 
    222   return 0;
    223 }
    224 
    // Grow *old (currently oldlen bytes) and append a copy of new to it.
    // A negative newlen means "insert a newline separator first, then append
    // -newlen bytes". The result is NUL terminated and *old is updated to the
    // (possibly moved) allocation. Returns a pointer just past the terminator.
    static char *extend_string(char **old, char *new, int oldlen, int newlen)
    {
      int sep = 0;
      char *buf, *tail;

      if (newlen < 0) {
        sep = 1;
        newlen = -newlen;
      }

      buf = xrealloc(*old, oldlen+newlen+sep+1);
      *old = buf;

      // Append at the end of the existing data
      tail = buf+oldlen;
      if (sep) *tail++ = '\n';
      memcpy(tail, new, newlen);
      tail += newlen;
      *tail = 0;

      return tail+1;
    }
    240 
    241 // An empty regex repeats the previous one
    242 static void *get_regex(void *trump, int offset)
    243 {
    244   if (!offset) {
    245     if (!TT.lastregex) error_exit("no previous regex");
    246     return TT.lastregex;
    247   }
    248 
    249   return TT.lastregex = offset+(char *)trump;
    250 }
    251 
    252 // Apply pattern to line from input file
    253 static void sed_line(char **pline, long plen)
    254 {
    255   struct append {
    256     struct append *next, *prev;
    257     int file;
    258     char *str;
    259   } *append = 0;
    260   char *line = TT.nextline;
    261   long len = TT.nextlen;
    262   struct sedcmd *command;
    263   int eol = 0, tea = 0;
    264 
    265   // Ignore EOF for all files before last unless -i
    266   if (!pline && !FLAG(i)) return;
    267 
    268   // Grab next line for deferred processing (EOF detection: we get a NULL
    269   // pline at EOF to flush last line). Note that only end of _last_ input
    270   // file matches $ (unless we're doing -i).
    271   TT.nextline = 0;
    272   TT.nextlen = 0;
    273   if (pline) {
    274     TT.nextline = *pline;
    275     TT.nextlen = plen;
    276     *pline = 0;
    277   }
    278 
    279   if (!line || !len) return;
    280   if (line[len-1] == '\n') line[--len] = eol++;
    281   TT.count++;
    282 
    283   // The restart-1 is because we added one to make sure it wasn't NULL,
    284   // otherwise N as last command would restart script
    285   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
    286   TT.restart = 0;
    287 
    288   while (command) {
    289     char *str, c = command->c;
    290 
    291     // Have we got a line or regex matching range for this rule?
    292     if (*command->lmatch || *command->rmatch) {
    293       int miss = 0;
    294       long lm;
    295 
    296       // In a match that might end?
    297       if (command->hit) {
    298         if (!(lm = command->lmatch[1])) {
    299           if (!command->rmatch[1]) command->hit = 0;
    300           else {
    301             void *rm = get_regex(command, command->rmatch[1]);
    302 
    303             // regex match end includes matching line, so defer deactivation
    304             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
    305           }
    306         } else if (lm > 0 && lm < TT.count) command->hit = 0;
    307 
    308       // Start a new match?
    309       } else {
    310         if (!(lm = *command->lmatch)) {
    311           void *rm = get_regex(command, *command->rmatch);
    312 
    313           if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
    314         } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
    315 
    316         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
    317       }
    318 
    319       // Didn't match?
    320       lm = !(command->hit ^ command->not);
    321 
    322       // Deferred disable from regex end match
    323       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
    324 
    325       if (lm) {
    326         // Handle skipping curly bracket command group
    327         if (c == '{') {
    328           int curly = 1;
    329 
    330           while (curly) {
    331             command = command->next;
    332             if (command->c == '{') curly++;
    333             if (command->c == '}') curly--;
    334           }
    335         }
    336         command = command->next;
    337         continue;
    338       }
    339     }
    340 
    341     // A deleted line can still update line match state for later commands
    342     if (!line) {
    343       command = command->next;
    344       continue;
    345     }
    346 
    347     // Process command
    348 
    349     if (c=='a' || c=='r') {
    350       struct append *a = xzalloc(sizeof(struct append));
    351       if (command->arg1) a->str = command->arg1+(char *)command;
    352       a->file = c=='r';
    353       dlist_add_nomalloc((void *)&append, (void *)a);
    354     } else if (c=='b' || c=='t' || c=='T') {
    355       int t = tea;
    356 
    357       if (c != 'b') tea = 0;
    358       if (c=='b' || t^(c=='T')) {
    359         if (!command->arg1) break;
    360         str = command->arg1+(char *)command;
    361         for (command = (void *)TT.pattern; command; command = command->next)
    362           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
    363             break;
    364         if (!command) error_exit("no :%s", str);
    365       }
    366     } else if (c=='c') {
    367       str = command->arg1+(char *)command;
    368       if (!command->hit) emit(str, strlen(str), 1);
    369       free(line);
    370       line = 0;
    371       continue;
    372     } else if (c=='d') {
    373       free(line);
    374       line = 0;
    375       continue;
    376     } else if (c=='D') {
    377       // Delete up to \n or end of buffer
    378       str = line;
    379       while ((str-line)<len) if (*(str++) == '\n') break;
    380       len -= str - line;
    381       memmove(line, str, len);
    382 
    383       // if "delete" blanks line, disable further processing
    384       // otherwise trim and restart script
    385       if (!len) {
    386         free(line);
    387         line = 0;
    388       } else {
    389         line[len] = 0;
    390         command = (void *)TT.pattern;
    391       }
    392       continue;
    393     } else if (c=='g') {
    394       free(line);
    395       line = xstrdup(TT.remember);
    396       len = TT.rememberlen;
    397     } else if (c=='G') {
    398       line = xrealloc(line, len+TT.rememberlen+2);
    399       line[len++] = '\n';
    400       memcpy(line+len, TT.remember, TT.rememberlen);
    401       line[len += TT.rememberlen] = 0;
    402     } else if (c=='h') {
    403       free(TT.remember);
    404       TT.remember = xstrdup(line);
    405       TT.rememberlen = len;
    406     } else if (c=='H') {
    407       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
    408       TT.remember[TT.rememberlen++] = '\n';
    409       memcpy(TT.remember+TT.rememberlen, line, len);
    410       TT.remember[TT.rememberlen += len] = 0;
    411     } else if (c=='i') {
    412       str = command->arg1+(char *)command;
    413       emit(str, strlen(str), 1);
    414     } else if (c=='l') {
    415       int i, x, off;
    416 
    417       if (!TT.xx) {
    418         terminal_size(&TT.xx, 0);
    419         if (!TT.xx) TT.xx = 80;
    420         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
    421         if (TT.xx > 4) TT.xx -= 4;
    422       }
    423 
    424       for (i = off = 0; i<len; i++) {
    425         if (off >= TT.xx) {
    426           toybuf[off++] = '\\';
    427           emit(toybuf, off, 1);
    428           off = 0;
    429         }
    430         x = stridx("\\\a\b\f\r\t\v", line[i]);
    431         if (x != -1) {
    432           toybuf[off++] = '\\';
    433           toybuf[off++] = "\\abfrtv"[x];
    434         } else if (line[i] >= ' ') toybuf[off++] = line[i];
    435         else off += sprintf(toybuf+off, "\\%03o", line[i]);
    436       }
    437       toybuf[off++] = '$';
    438       emit(toybuf, off, 1);
    439     } else if (c=='n') {
    440       TT.restart = command->next+1;
    441 
    442       break;
    443     } else if (c=='N') {
    444       // Can't just grab next line because we could have multiple N and
    445       // we need to actually read ahead to get N;$p EOF detection right.
    446       if (pline) {
    447         TT.restart = command->next+1;
    448         extend_string(&line, TT.nextline, len, -TT.nextlen);
    449         free(TT.nextline);
    450         TT.nextline = line;
    451         TT.nextlen += len + 1;
    452         line = 0;
    453       }
    454 
    455       // Pending append goes out right after N
    456       goto done;
    457     } else if (c=='p' || c=='P') {
    458       char *l = (c=='P') ? strchr(line, '\n') : 0;
    459 
    460       if (emit(line, l ? l-line : len, eol)) break;
    461     } else if (c=='q') {
    462       if (pline) *pline = (void *)1;
    463       free(TT.nextline);
    464       TT.nextline = 0;
    465       TT.nextlen = 0;
    466 
    467       break;
    468     } else if (c=='s') {
    469       char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
    470       regmatch_t *match = (void *)toybuf;
    471       regex_t *reg = get_regex(command, command->arg1);
    472       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
    473 
    474       // Find match in remaining line (up to remaining len)
    475       while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
    476         mflags = REG_NOTBOL;
    477 
    478         // Zero length matches don't count immediately after a previous match
    479         mlen = match[0].rm_eo-match[0].rm_so;
    480         if (!mlen && !zmatch) {
    481           if (!rlen--) break;
    482           rline++;
    483           zmatch++;
    484           continue;
    485         } else zmatch = 0;
    486 
    487         // If we're replacing only a specific match, skip if this isn't it
    488         off = command->sflags>>3;
    489         if (off && off != ++count) {
    490           rline += match[0].rm_eo;
    491           rlen -= match[0].rm_eo;
    492 
    493           continue;
    494         }
    495         // The fact getline() can allocate unbounded amounts of memory is
    496         // a bigger issue, but while we're here check for integer overflow
    497         if (match[0].rm_eo > INT_MAX) perror_exit(0);
    498 
    499         // newlen = strlen(new) but with \1 and & and printf escapes
    500         for (off = newlen = 0; new[off]; off++) {
    501           int cc = -1;
    502 
    503           if (new[off] == '&') cc = 0;
    504           else if (new[off] == '\\') cc = new[++off] - '0';
    505           if (cc < 0 || cc > 9) {
    506             newlen++;
    507             continue;
    508           }
    509           newlen += match[cc].rm_eo-match[cc].rm_so;
    510         }
    511 
    512         // Allocate new size, copy start/end around match. (Can't extend in
    513         // place because backrefs may refer to text after it's overwritten.)
    514         len += newlen-mlen;
    515         swap = xmalloc(len+1);
    516         rswap = swap+(rline-line)+match[0].rm_so;
    517         memcpy(swap, line, (rline-line)+match[0].rm_so);
    518         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
    519 
    520         // copy in new replacement text
    521         for (off = mlen = 0; new[off]; off++) {
    522           int cc = 0, ll;
    523 
    524           if (new[off] == '\\') {
    525             cc = new[++off] - '0';
    526             if (cc<0 || cc>9) {
    527               if (!(rswap[mlen++] = unescape(new[off])))
    528                 rswap[mlen-1] = new[off];
    529 
    530               continue;
    531             } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
    532           } else if (new[off] != '&') {
    533             rswap[mlen++] = new[off];
    534 
    535             continue;
    536           }
    537 
    538           if (match[cc].rm_so == -1) ll = 0; // Empty match.
    539           else {
    540             ll = match[cc].rm_eo-match[cc].rm_so;
    541             memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
    542           }
    543           mlen += ll;
    544         }
    545 
    546         rline = rswap+newlen;
    547         free(line);
    548         line = swap;
    549 
    550         // Stop after first substitution unless we have flag g
    551         if (!(command->sflags & 2)) break;
    552       }
    553 
    554       if (mflags) {
    555         // flag p
    556         if (command->sflags & 4) emit(line, len, eol);
    557 
    558         tea = 1;
    559         if (command->w) goto writenow;
    560       }
    561     } else if (c=='w') {
    562       int fd, noeol;
    563       char *name;
    564 
    565 writenow:
    566       // Swap out emit() context
    567       fd = TT.fdout;
    568       noeol = TT.noeol;
    569 
    570       // We save filehandle and newline status before filename
    571       name = command->w + (char *)command;
    572       memcpy(&TT.fdout, name, 4);
    573       name += 4;
    574       TT.noeol = *(name++);
    575 
    576       // write, then save/restore context
    577       if (emit(line, len, eol))
    578         perror_exit("w '%s'", command->arg1+(char *)command);
    579       *(--name) = TT.noeol;
    580       TT.noeol = noeol;
    581       TT.fdout = fd;
    582     } else if (c=='x') {
    583       long swap = TT.rememberlen;
    584 
    585       str = TT.remember;
    586       TT.remember = line;
    587       line = str;
    588       TT.rememberlen = len;
    589       len = swap;
    590     } else if (c=='y') {
    591       char *from, *to = (char *)command;
    592       int i, j;
    593 
    594       from = to+command->arg1;
    595       to += command->arg2;
    596 
    597       for (i = 0; i < len; i++) {
    598         j = stridx(from, line[i]);
    599         if (j != -1) line[i] = to[j];
    600       }
    601     } else if (c=='=') {
    602       sprintf(toybuf, "%ld", TT.count);
    603       if (emit(toybuf, strlen(toybuf), 1)) break;
    604     }
    605 
    606     command = command->next;
    607   }
    608 
    609   if (line && !FLAG(n)) emit(line, len, eol);
    610 
    611 done:
    612   if (dlist_terminate(append)) while (append) {
    613     struct append *a = append->next;
    614 
    615     if (append->file) {
    616       int fd = open(append->str, O_RDONLY);
    617 
    618       // Force newline if noeol pending
    619       if (fd != -1) {
    620         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
    621         TT.noeol = 0;
    622         xsendfile(fd, TT.fdout);
    623         close(fd);
    624       }
    625     } else if (append->str) emit(append->str, strlen(append->str), 1);
    626     else emit(line, 0, 0);
    627     free(append);
    628     append = a;
    629   }
    630   free(line);
    631 }
    632 
    633 // Callback called on each input file
    634 static void do_sed_file(int fd, char *name)
    635 {
    636   char *tmp;
    637 
    638   if (FLAG(i)) {
    639     struct sedcmd *command;
    640 
    641     if (!fd) return error_msg("-i on stdin");
    642     TT.fdout = copy_tempfile(fd, name, &tmp);
    643     TT.count = 0;
    644     for (command = (void *)TT.pattern; command; command = command->next)
    645       command->hit = 0;
    646   }
    647   do_lines(fd, TT.delim, sed_line);
    648   if (FLAG(i)) {
    649     if (TT.i && *TT.i) {
    650       char *s = xmprintf("%s%s", name, TT.i);
    651 
    652       xrename(name, s);
    653       free(s);
    654     }
    655     replace_tempfile(-1, TT.fdout, &tmp);
    656     TT.fdout = 1;
    657     TT.nextline = 0;
    658     TT.nextlen = TT.noeol = 0;
    659   }
    660 }
    661 
    // Copy chunk of string between two delimiters, converting printf escapes.
    //
    // pstr:  in: start of the text, out: first character after the closing
    //        delimiter (only updated on success).
    // delim: if NULL or pointing at 0, the first character of the text is the
    //        delimiter (a leading backslash is skipped first) and is stored back
    //        through a non-NULL delim. Otherwise *delim is the delimiter and the
    //        text starts immediately.
    //
    // A delimiter inside a regex [bracket expression] does not end the string.
    // Returns a newly allocated processed copy (caller frees), or 0 on malformed
    // input (missing delimiter, unterminated string, trailing backslash).
    static char *unescape_delimited_string(char **pstr, char *delim)
    {
      char *to, *from, *out, mode = 0, d;

      // Grab leading delimiter (if necessary)
      from = *pstr;
      if (!delim || !*delim) {
        if (!(d = *(from++))) return 0;
        if (d == '\\') d = *(from++);
        if (!d || d == '\\') return 0;
        if (delim) *delim = d;
      } else d = *delim;

      // Allocate space for new string, sized to the input
      to = out = xmalloc(strlen(*pstr)+1);

      // mode is 0 outside a bracket expression, ']' inside one, or one of
      // '.', '=', ':' while inside a nested [. .] [= =] [: :] element.
      while (mode || *from != d) {
        if (!*from) goto fail;

        // delimiter in regex character range doesn't count
        if (*from == '[') {
          if (!mode) {
            mode = ']';
            // A leading '-' or ']' is a literal member, copy it through now
            if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
          } else if (mode == ']' && strchr(".=:", from[1])) {
            *(to++) = *(from++);
            mode = *from;
          }
        } else if (*from == mode) {
          if (mode == ']') mode = 0;
          else {
            *(to++) = *(from++);
            mode = ']';
          }
        // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
        // but the perl build does it, so we need to filter it out.
        } else if (mode && *from == '-' && from[-1] == from[1]) {
          from+=2;
          continue;
        } else if (*from == '\\') {
          if (!from[1]) goto fail;

          // Check escaped end delimiter before printf style escapes.
          if (from[1] == d) from++;
          else if (from[1]=='\\') *(to++) = *(from++);
          else {
            char c = unescape(from[1]);

            if (c) {
              *(to++) = c;
              from+=2;
              continue;
            // Unknown escape outside brackets: keep the backslash too
            } else if (!mode) *(to++) = *(from++);
          }
        }
        *(to++) = *(from++);
      }
      *to = 0;
      *pstr = from+1;

      return out;

    fail:
      // Don't leak the partial copy on malformed input
      free(out);

      return 0;
    }
    726 
    727 // Translate pattern strings into command structures. Each command structure
    728 // is a single allocation (which requires some math and remalloc at times).
    729 static void parse_pattern(char **pline, long len)
    730 {
    731   struct sedcmd *command = (void *)TT.pattern;
    732   char *line, *reg, c, *errstart;
    733   int i;
    734 
    735   line = errstart = pline ? *pline : "";
    736   if (len && line[len-1]=='\n') line[--len] = 0;
    737 
    738   // Append this line to previous multiline command? (hit indicates type.)
    739   // During parsing "hit" stores data about line continuations, but in
    740   // sed_line() it means the match range attached to this command
    741   // is active, so processing the continuation must zero it again.
    742   if (command && command->prev->hit) {
    743     // Remove half-finished entry from list so remalloc() doesn't confuse it
    744     TT.pattern = TT.pattern->prev;
    745     command = dlist_pop(&TT.pattern);
    746     c = command->c;
    747     reg = (char *)command;
    748     reg += command->arg1 + strlen(reg + command->arg1);
    749 
    750     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
    751     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
    752     // a unicode character.
    753     if (command->hit < 256) goto resume_s;
    754     else goto resume_a;
    755   }
    756 
    757   // Loop through commands in this line.
    758 
    759   command = 0;
    760   for (;;) {
    761     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
    762 
    763     // If there's no more data on this line, return.
    764     for (;;) {
    765       while (isspace(*line) || *line == ';') line++;
    766       if (*line == '#') while (*line && *line != '\n') line++;
    767       else break;
    768     }
    769     if (!*line) return;
    770 
    771     // We start by writing data into toybuf. Later we'll allocate the
    772     // ex
    773 
    774     errstart = line;
    775     memset(toybuf, 0, sizeof(struct sedcmd));
    776     command = (void *)toybuf;
    777     reg = toybuf + sizeof(struct sedcmd);
    778 
    779     // Parse address range (if any)
    780     for (i = 0; i < 2; i++) {
    781       if (*line == ',') line++;
    782       else if (i) break;
    783 
    784       if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
    785       else if (*line == '$') {
    786         command->lmatch[i] = -1;
    787         line++;
    788       } else if (*line == '/' || *line == '\\') {
    789         char *s = line;
    790 
    791         if (!(s = unescape_delimited_string(&line, 0))) goto error;
    792         if (!*s) command->rmatch[i] = 0;
    793         else {
    794           xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
    795           command->rmatch[i] = reg-toybuf;
    796           reg += sizeof(regex_t);
    797         }
    798         free(s);
    799       } else break;
    800     }
    801 
    802     while (isspace(*line)) line++;
    803     if (!*line) break;
    804 
    805     while (*line == '!') {
    806       command->not = 1;
    807       line++;
    808     }
    809     while (isspace(*line)) line++;
    810 
    811     c = command->c = *(line++);
    812     if (strchr("}:", c) && i) break;
    813     if (strchr("aiqr=", c) && i>1) break;
    814 
    815     // Add step to pattern
    816     command = xmemdup(toybuf, reg-toybuf);
    817     reg = (reg-toybuf) + (char *)command;
    818 
    819     // Parse arguments by command type
    820     if (c == '{') TT.nextlen++;
    821     else if (c == '}') {
    822       if (!TT.nextlen--) break;
    823     } else if (c == 's') {
    824       char *end, delim = 0;
    825 
    826       // s/pattern/replacement/flags
    827 
    828       // line continuations use arg1 (back at the start of the function),
    829       // so let's fill out arg2 first (since the regex part can't be multiple
    830       // lines) and swap them back later.
    831 
    832       // get pattern (just record, we parse it later)
    833       command->arg2 = reg - (char *)command;
    834       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
    835         goto error;
    836 
    837       reg += sizeof(regex_t);
    838       command->arg1 = reg-(char *)command;
    839       command->hit = delim;
    840 resume_s:
    841       // get replacement - don't replace escapes yet because \1 and \& need
    842       // processing later, after we replace \\ with \ we can't tell \\1 from \1
    843       end = line;
    844       while (*end != command->hit) {
    845         if (!*end) goto error;
    846         if (*end++ == '\\') {
    847           if (!*end || *end == '\n') {
    848             end[-1] = '\n';
    849             break;
    850           }
    851           end++;
    852         }
    853       }
    854 
    855       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
    856       line = end;
    857       // line continuation? (note: '\n' can't be a valid delim).
    858       if (*line == command->hit) command->hit = 0;
    859       else {
    860         if (!*line) continue;
    861         reg--;
    862         line++;
    863         goto resume_s;
    864       }
    865 
    866       // swap arg1/arg2 so they're back in order arguments occur.
    867       i = command->arg1;
    868       command->arg1 = command->arg2;
    869       command->arg2 = i;
    870 
    871       // get flags
    872       for (line++; *line; line++) {
    873         long l;
    874 
    875         if (isspace(*line) && *line != '\n') continue;
    876 
    877         if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
    878         else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
    879           command->sflags |= l << 3;
    880           line--;
    881         } else break;
    882       }
    883 
    884       // We deferred actually parsing the regex until we had the s///i flag
    885       // allocating the space was done by extend_string() above
    886       if (!*TT.remember) command->arg1 = 0;
    887       else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
    888         (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
    889       free(TT.remember);
    890       TT.remember = 0;
    891       if (*line == 'w') {
    892         line++;
    893         goto writenow;
    894       }
    895     } else if (c == 'w') {
    896       int fd, delim;
    897       char *cc;
    898 
    899       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
    900       // eol status, and to retain the filename for error messages, we'd need
    901       // to go up to arg5 just for this. Compromise: dynamically allocate the
    902       // filehandle and eol status.
    903 
    904 writenow:
    905       while (isspace(*line)) line++;
    906       if (!*line) goto error;
    907       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
    908       delim = *cc;
    909       *cc = 0;
    910       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
    911       *cc = delim;
    912 
    913       command->w = reg - (char *)command;
    914       command = xrealloc(command, command->w+(cc-line)+6);
    915       reg = command->w + (char *)command;
    916 
    917       memcpy(reg, &fd, 4);
    918       reg += 4;
    919       *(reg++) = 0;
    920       memcpy(reg, line, delim);
    921       reg += delim;
    922       *(reg++) = 0;
    923 
    924       line = cc;
    925       if (delim) line += 2;
    926     } else if (c == 'y') {
    927       char *s, delim = 0;
    928       int len;
    929 
    930       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
    931       command->arg1 = reg-(char *)command;
    932       len = strlen(s);
    933       reg = extend_string((void *)&command, s, reg-(char *)command, len);
    934       free(s);
    935       command->arg2 = reg-(char *)command;
    936       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
    937       if (len != strlen(s)) goto error;
    938       reg = extend_string((void *)&command, s, reg-(char*)command, len);
    939       free(s);
    940     } else if (strchr("abcirtTw:", c)) {
    941       int end;
    942 
    943       // trim leading spaces
    944       while (isspace(*line) && *line != '\n') line++;
    945 
    946       // Resume logic differs from 's' case because we don't add a newline
    947       // unless it's after something, so we add it on return instead.
    948 resume_a:
    949       command->hit = 0;
    950 
    951       // btT: end with space or semicolon, aicrw continue to newline.
    952       if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) {
    953         // Argument's optional for btT
    954         if (strchr("btT", c)) continue;
    955         else if (!command->arg1) break;
    956       }
    957 
    958       // Extend allocation to include new string. We use offsets instead of
    959       // pointers so realloc() moving stuff doesn't break things. Ok to write
    960       // \n over NUL terminator because call to extend_string() adds it back.
    961       if (!command->arg1) command->arg1 = reg - (char*)command;
    962       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
    963       else if (!pline) {
    964         command->arg1 = 0;
    965         continue;
    966       }
    967       reg = extend_string((void *)&command, line, reg - (char *)command, end);
    968 
    969       // Recopy data to remove escape sequences and handle line continuation.
    970       if (strchr("aci", c)) {
    971         reg -= end+1;
    972         for (i = end; i; i--) {
    973           if ((*reg++ = *line++)=='\\') {
    974 
    975             // escape at end of line: resume if -e escaped literal newline,
    976             // else request callback and resume with next line
    977             if (!--i) {
    978               *--reg = 0;
    979               if (*line) {
    980                 line++;
    981                 goto resume_a;
    982               }
    983               command->hit = 256;
    984               break;
    985             }
    986             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
    987             line++;
    988           }
    989         }
    990         *reg = 0;
    991       } else line += end;
    992 
    993     // Commands that take no arguments
    994     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
    995   }
    996 
    997 error:
    998   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
    999 }
   1000 
   1001 void sed_main(void)
   1002 {
   1003   struct arg_list *al;
   1004   char **args = toys.optargs;
   1005 
   1006   if (!FLAG(z)) TT.delim = '\n';
   1007 
   1008   // Lie to autoconf when it asks stupid questions, so configure regexes
   1009   // that look for "GNU sed version %f" greater than some old buggy number
   1010   // don't fail us for not matching their narrow expectations.
   1011   if (FLAG(version)) {
   1012     xprintf("This is not GNU sed version 9.0\n");
   1013     return;
   1014   }
   1015 
   1016   // Handling our own --version means we handle our own --help too.
   1017   if (FLAG(help)) help_exit(0);
   1018 
   1019   // Parse pattern into commands.
   1020 
   1021   // If no -e or -f, first argument is the pattern.
   1022   if (!TT.e && !TT.f) {
   1023     if (!*toys.optargs) error_exit("no pattern");
   1024     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
   1025   }
   1026 
   1027   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
   1028   // so handle all -e, then all -f. (At least the behavior's consistent.)
   1029 
   1030   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
   1031   parse_pattern(0, 0);
   1032   for (al = TT.f; al; al = al->next)
   1033     do_lines(xopenro(al->arg), TT.delim, parse_pattern);
   1034   dlist_terminate(TT.pattern);
   1035   if (TT.nextlen) error_exit("no }");
   1036 
   1037   TT.fdout = 1;
   1038   TT.remember = xstrdup("");
   1039 
   1040   // Inflict pattern upon input files. Long version because !O_CLOEXEC
   1041   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
   1042 
   1043   // Provide EOF flush at end of cumulative input for non-i mode.
   1044   if (!FLAG(i)) {
   1045     toys.optflags |= FLAG_i;
   1046     sed_line(0, 0);
   1047   }
   1048 
   1049   // todo: need to close fd when done for TOYBOX_FREE?
   1050 }
   1051