1 /* sed.c - stream editor. Thing that does s/// and other stuff. 2 * 3 * Copyright 2014 Rob Landley <rob (at) landley.net> 4 * 5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html 6 * 7 * TODO: lines > 2G could wrap signed int length counters. Not just getline() 8 * but N and s/// 9 * TODO: make y// handle unicode, unicode delimiters 10 * TODO: handle error return from emit(), error_msg/exit consistently 11 * What's the right thing to do for -i when write fails? Skip to next? 12 * test '//q' with no previous regex, also repeat previous regex? 13 14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP)) 15 16 config SED 17 bool "sed" 18 default y 19 help 20 usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...] 21 22 Stream editor. Apply one or more editing SCRIPTs to each line of input 23 (from FILE or stdin) producing output (by default to stdout). 24 25 -e Add SCRIPT to list 26 -f Add contents of SCRIPT_FILE to list 27 -i Edit each file in place (-iEXT keeps backup file with extension EXT) 28 -n No default output (use the p command to output matched lines) 29 -r Use extended regular expression syntax 30 -E POSIX alias for -r 31 -s Treat input files separately (implied by -i) 32 -z Use \0 rather than \n as the input line separator 33 34 A SCRIPT is a series of one or more COMMANDs separated by newlines or 35 semicolons. All -e SCRIPTs are concatenated together as if separated 36 by newlines, followed by all lines from -f SCRIPT_FILEs, in order. 37 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT. 38 39 Each COMMAND may be preceded by an address which limits the command to 40 apply only to the specified line(s). Commands without an address apply to 41 every line. 
Addresses are of the form: 42 43 [ADDRESS[,ADDRESS]]COMMAND 44 45 The ADDRESS may be a decimal line number (starting at 1), a /regular 46 expression/ within a pair of forward slashes, or the character "$" which 47 matches the last line of input. (In -s or -i mode this matches the last 48 line of each file, otherwise just the last line of the last file.) A single 49 address matches one line, a pair of comma separated addresses match 50 everything from the first address to the second address (inclusive). If 51 both addresses are regular expressions, more than one range of lines in 52 each file can match. 53 54 REGULAR EXPRESSIONS in sed are started and ended by the same character 55 (traditionally / but anything except a backslash or a newline works). 56 Backslashes may be used to escape the delimiter if it occurs in the 57 regex, and for the usual printf escapes (\abcefnrtv and octal, hex, 58 and unicode). An empty regex repeats the previous one. ADDRESS regexes 59 (above) require the first delimiter to be escaped with a backslash when 60 it isn't a forward slash (to distinguish it from the COMMANDs below). 61 62 Sed mostly operates on individual lines one at a time. It reads each line, 63 processes it, and either writes it to the output or discards it before 64 reading the next line. Sed can remember one additional line in a separate 65 buffer (using the h, H, g, G, and x commands), and can read the next line 66 of input early (using the n and N command), but other than that command 67 scripts operate on individual lines of text. 68 69 Each COMMAND starts with a single character. The following commands take 70 no arguments: 71 72 { Start a new command block, continuing until a corresponding "}". 73 Command blocks may nest. If the block has an address, commands within 74 the block are only run for lines within the block's address range. 
75 76 } End command block (this command cannot have an address) 77 78 d Delete this line and move on to the next one 79 (ignores remaining COMMANDs) 80 81 D Delete one line of input and restart command SCRIPT (same as "d" 82 unless you've glued lines together with "N" or similar) 83 84 g Get remembered line (overwriting current line) 85 86 G Get remembered line (appending to current line) 87 88 h Remember this line (overwriting remembered line) 89 90 H Remember this line (appending to remembered line, if any) 91 92 l Print line, escaping \abfrtv (but not newline), octal escaping other 93 nonprintable characters, wrapping lines to terminal width with a 94 backslash, and appending $ to actual end of line. 95 96 n Print default output and read next line, replacing current line 97 (If no next line available, quit processing script) 98 99 N Append next line of input to this line, separated by a newline 100 (This advances the line counter for address matching and "=", if no 101 next line available quit processing script without default output) 102 103 p Print this line 104 105 P Print this line up to first newline (from "N") 106 107 q Quit (print default output, no more commands processed or lines read) 108 109 x Exchange this line with remembered line (overwrite in both directions) 110 111 = Print the current line number (followed by a newline) 112 113 The following commands (may) take an argument. The "text" arguments (to 114 the "a", "b", and "c" commands) may end with an unescaped "\" to append 115 the next line (for which leading whitespace is not skipped), and also 116 treat ";" as a literal character (use "\;" instead). 
117 118 a [text] Append text to output before attempting to read next line 119 120 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT) 121 122 c [text] Delete line, output text at end of matching address range 123 (ignores remaining COMMANDs) 124 125 i [text] Print text 126 127 r [file] Append contents of file to output before attempting to read 128 next line. 129 130 s/S/R/F Search for regex S, replace matched text with R using flags F. 131 The first character after the "s" (anything but newline or 132 backslash) is the delimiter, escape with \ to use normally. 133 134 The replacement text may contain "&" to substitute the matched 135 text (escape it with backslash for a literal &), or \1 through 136 \9 to substitute a parenthetical subexpression in the regex. 137 You can also use the normal backslash escapes such as \n and 138 a backslash at the end of the line appends the next line. 139 140 The flags are: 141 142 [0-9] A number, substitute only that occurrence of pattern 143 g Global, substitute all occurrences of pattern 144 i Ignore case when matching 145 p Print the line if match was found and replaced 146 w [file] Write (append) line to file if match replaced 147 148 t [label] Test, jump to :label only if an "s" command found a match in 149 this line since last test (replacing with same text counts) 150 151 T [label] Test false, jump only if "s" hasn't found a match. 

    w [file]   Write (append) line to file

    y/old/new/ Change each character in 'old' to corresponding character
               in 'new' (with standard backslash escapes, delimiter can be
               any repeated character except \ or \n)

    : [label]  Labeled target for jump commands

    #  Comment, ignore rest of this line of SCRIPT

    Deviations from POSIX: allow extended regular expressions with -r,
    editing in place with -i, separate with -s, NUL-separated input with -z,
    printf escapes in text, line continuations, semicolons after all commands,
    2-address anywhere an address is allowed, "T" command, multiline
    continuations for [abc], \; to end [abc] argument before end of line.
*/

#define FOR_sed
#include "toys.h"

// Per-command global state, reached through TT.
GLOBALS(
  // Option arguments: -i backup suffix (appended to the file name when
  // non-empty), then the -f script file and -e script string lists.
  char *i;
  struct arg_list *f, *e;

  // processed pattern list
  struct double_list *pattern;

  // nextline/nextlen: input line held back one step by sed_line() so the
  // last line can be detected. (nextlen doubles as the open '{' counter
  // while parse_pattern() runs.)
  // remember/rememberlen: buffer used by the g, G, h, H and x commands.
  // (remember doubles as scratch space for the s/// regex text during parsing.)
  char *nextline, *remember;
  // restart: command to resume at after n/N, stored +1 so it is never NULL.
  // lastregex: most recently used regex, reused by an empty regex.
  void *restart, *lastregex;
  // count: current input line number.
  long nextlen, rememberlen, count;
  // fdout: where emit() writes. noeol: a newline is still owed on fdout.
  int fdout, noeol;
  // Wrap width for the l command (0 until first use).
  unsigned xx;
  // Input line delimiter handed to do_lines() ('\n', or 0 with -z).
  char delim;
)

// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we realloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted.
192 193 struct sedcmd { 194 struct sedcmd *next, *prev; 195 196 // Begin and end of each match 197 long lmatch[2]; // line number of match 198 int rmatch[2]; // offset of regex struct for prefix matches (/abc/,/def/p) 199 int arg1, arg2, w; // offset of two arguments per command, plus s//w filename 200 unsigned not, hit; 201 unsigned sflags; // s///flag bits: i=1, g=2, p=4 202 char c; // action 203 }; 204 205 // Write out line with potential embedded NUL, handling eol/noeol 206 static int emit(char *line, long len, int eol) 207 { 208 int l, old = line[len]; 209 210 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1; 211 TT.noeol = !eol; 212 if (eol) line[len++] = '\n'; 213 if (!len) return 0; 214 l = writeall(TT.fdout, line, len); 215 if (eol) line[len-1] = old; 216 if (l != len) { 217 perror_msg("short write"); 218 219 return 1; 220 } 221 222 return 0; 223 } 224 225 // Extend allocation to include new string, with newline between if newlen<0 226 227 static char *extend_string(char **old, char *new, int oldlen, int newlen) 228 { 229 int newline = newlen < 0; 230 char *s; 231 232 if (newline) newlen = -newlen; 233 s = *old = xrealloc(*old, oldlen+newlen+newline+1); 234 if (newline) s[oldlen++] = '\n'; 235 memcpy(s+oldlen, new, newlen); 236 s[oldlen+newlen] = 0; 237 238 return s+oldlen+newlen+1; 239 } 240 241 // An empty regex repeats the previous one 242 static void *get_regex(void *trump, int offset) 243 { 244 if (!offset) { 245 if (!TT.lastregex) error_exit("no previous regex"); 246 return TT.lastregex; 247 } 248 249 return TT.lastregex = offset+(char *)trump; 250 } 251 252 // Apply pattern to line from input file 253 static void sed_line(char **pline, long plen) 254 { 255 struct append { 256 struct append *next, *prev; 257 int file; 258 char *str; 259 } *append = 0; 260 char *line = TT.nextline; 261 long len = TT.nextlen; 262 struct sedcmd *command; 263 int eol = 0, tea = 0; 264 265 // Ignore EOF for all files before last unless -i 266 if (!pline && !FLAG(i)) 
return; 267 268 // Grab next line for deferred processing (EOF detection: we get a NULL 269 // pline at EOF to flush last line). Note that only end of _last_ input 270 // file matches $ (unless we're doing -i). 271 TT.nextline = 0; 272 TT.nextlen = 0; 273 if (pline) { 274 TT.nextline = *pline; 275 TT.nextlen = plen; 276 *pline = 0; 277 } 278 279 if (!line || !len) return; 280 if (line[len-1] == '\n') line[--len] = eol++; 281 TT.count++; 282 283 // The restart-1 is because we added one to make sure it wasn't NULL, 284 // otherwise N as last command would restart script 285 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern; 286 TT.restart = 0; 287 288 while (command) { 289 char *str, c = command->c; 290 291 // Have we got a line or regex matching range for this rule? 292 if (*command->lmatch || *command->rmatch) { 293 int miss = 0; 294 long lm; 295 296 // In a match that might end? 297 if (command->hit) { 298 if (!(lm = command->lmatch[1])) { 299 if (!command->rmatch[1]) command->hit = 0; 300 else { 301 void *rm = get_regex(command, command->rmatch[1]); 302 303 // regex match end includes matching line, so defer deactivation 304 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1; 305 } 306 } else if (lm > 0 && lm < TT.count) command->hit = 0; 307 308 // Start a new match? 309 } else { 310 if (!(lm = *command->lmatch)) { 311 void *rm = get_regex(command, *command->rmatch); 312 313 if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++; 314 } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++; 315 316 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1; 317 } 318 319 // Didn't match? 
320 lm = !(command->hit ^ command->not); 321 322 // Deferred disable from regex end match 323 if (miss || command->lmatch[1] == TT.count) command->hit = 0; 324 325 if (lm) { 326 // Handle skipping curly bracket command group 327 if (c == '{') { 328 int curly = 1; 329 330 while (curly) { 331 command = command->next; 332 if (command->c == '{') curly++; 333 if (command->c == '}') curly--; 334 } 335 } 336 command = command->next; 337 continue; 338 } 339 } 340 341 // A deleted line can still update line match state for later commands 342 if (!line) { 343 command = command->next; 344 continue; 345 } 346 347 // Process command 348 349 if (c=='a' || c=='r') { 350 struct append *a = xzalloc(sizeof(struct append)); 351 if (command->arg1) a->str = command->arg1+(char *)command; 352 a->file = c=='r'; 353 dlist_add_nomalloc((void *)&append, (void *)a); 354 } else if (c=='b' || c=='t' || c=='T') { 355 int t = tea; 356 357 if (c != 'b') tea = 0; 358 if (c=='b' || t^(c=='T')) { 359 if (!command->arg1) break; 360 str = command->arg1+(char *)command; 361 for (command = (void *)TT.pattern; command; command = command->next) 362 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str)) 363 break; 364 if (!command) error_exit("no :%s", str); 365 } 366 } else if (c=='c') { 367 str = command->arg1+(char *)command; 368 if (!command->hit) emit(str, strlen(str), 1); 369 free(line); 370 line = 0; 371 continue; 372 } else if (c=='d') { 373 free(line); 374 line = 0; 375 continue; 376 } else if (c=='D') { 377 // Delete up to \n or end of buffer 378 str = line; 379 while ((str-line)<len) if (*(str++) == '\n') break; 380 len -= str - line; 381 memmove(line, str, len); 382 383 // if "delete" blanks line, disable further processing 384 // otherwise trim and restart script 385 if (!len) { 386 free(line); 387 line = 0; 388 } else { 389 line[len] = 0; 390 command = (void *)TT.pattern; 391 } 392 continue; 393 } else if (c=='g') { 394 free(line); 395 line = xstrdup(TT.remember); 396 len = 
TT.rememberlen; 397 } else if (c=='G') { 398 line = xrealloc(line, len+TT.rememberlen+2); 399 line[len++] = '\n'; 400 memcpy(line+len, TT.remember, TT.rememberlen); 401 line[len += TT.rememberlen] = 0; 402 } else if (c=='h') { 403 free(TT.remember); 404 TT.remember = xstrdup(line); 405 TT.rememberlen = len; 406 } else if (c=='H') { 407 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2); 408 TT.remember[TT.rememberlen++] = '\n'; 409 memcpy(TT.remember+TT.rememberlen, line, len); 410 TT.remember[TT.rememberlen += len] = 0; 411 } else if (c=='i') { 412 str = command->arg1+(char *)command; 413 emit(str, strlen(str), 1); 414 } else if (c=='l') { 415 int i, x, off; 416 417 if (!TT.xx) { 418 terminal_size(&TT.xx, 0); 419 if (!TT.xx) TT.xx = 80; 420 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10; 421 if (TT.xx > 4) TT.xx -= 4; 422 } 423 424 for (i = off = 0; i<len; i++) { 425 if (off >= TT.xx) { 426 toybuf[off++] = '\\'; 427 emit(toybuf, off, 1); 428 off = 0; 429 } 430 x = stridx("\\\a\b\f\r\t\v", line[i]); 431 if (x != -1) { 432 toybuf[off++] = '\\'; 433 toybuf[off++] = "\\abfrtv"[x]; 434 } else if (line[i] >= ' ') toybuf[off++] = line[i]; 435 else off += sprintf(toybuf+off, "\\%03o", line[i]); 436 } 437 toybuf[off++] = '$'; 438 emit(toybuf, off, 1); 439 } else if (c=='n') { 440 TT.restart = command->next+1; 441 442 break; 443 } else if (c=='N') { 444 // Can't just grab next line because we could have multiple N and 445 // we need to actually read ahead to get N;$p EOF detection right. 446 if (pline) { 447 TT.restart = command->next+1; 448 extend_string(&line, TT.nextline, len, -TT.nextlen); 449 free(TT.nextline); 450 TT.nextline = line; 451 TT.nextlen += len + 1; 452 line = 0; 453 } 454 455 // Pending append goes out right after N 456 goto done; 457 } else if (c=='p' || c=='P') { 458 char *l = (c=='P') ? strchr(line, '\n') : 0; 459 460 if (emit(line, l ? 
l-line : len, eol)) break; 461 } else if (c=='q') { 462 if (pline) *pline = (void *)1; 463 free(TT.nextline); 464 TT.nextline = 0; 465 TT.nextlen = 0; 466 467 break; 468 } else if (c=='s') { 469 char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap; 470 regmatch_t *match = (void *)toybuf; 471 regex_t *reg = get_regex(command, command->arg1); 472 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen; 473 474 // Find match in remaining line (up to remaining len) 475 while (!regexec0(reg, rline, rlen, 10, match, mflags)) { 476 mflags = REG_NOTBOL; 477 478 // Zero length matches don't count immediately after a previous match 479 mlen = match[0].rm_eo-match[0].rm_so; 480 if (!mlen && !zmatch) { 481 if (!rlen--) break; 482 rline++; 483 zmatch++; 484 continue; 485 } else zmatch = 0; 486 487 // If we're replacing only a specific match, skip if this isn't it 488 off = command->sflags>>3; 489 if (off && off != ++count) { 490 rline += match[0].rm_eo; 491 rlen -= match[0].rm_eo; 492 493 continue; 494 } 495 // The fact getline() can allocate unbounded amounts of memory is 496 // a bigger issue, but while we're here check for integer overflow 497 if (match[0].rm_eo > INT_MAX) perror_exit(0); 498 499 // newlen = strlen(new) but with \1 and & and printf escapes 500 for (off = newlen = 0; new[off]; off++) { 501 int cc = -1; 502 503 if (new[off] == '&') cc = 0; 504 else if (new[off] == '\\') cc = new[++off] - '0'; 505 if (cc < 0 || cc > 9) { 506 newlen++; 507 continue; 508 } 509 newlen += match[cc].rm_eo-match[cc].rm_so; 510 } 511 512 // Allocate new size, copy start/end around match. (Can't extend in 513 // place because backrefs may refer to text after it's overwritten.) 
514 len += newlen-mlen; 515 swap = xmalloc(len+1); 516 rswap = swap+(rline-line)+match[0].rm_so; 517 memcpy(swap, line, (rline-line)+match[0].rm_so); 518 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1); 519 520 // copy in new replacement text 521 for (off = mlen = 0; new[off]; off++) { 522 int cc = 0, ll; 523 524 if (new[off] == '\\') { 525 cc = new[++off] - '0'; 526 if (cc<0 || cc>9) { 527 if (!(rswap[mlen++] = unescape(new[off]))) 528 rswap[mlen-1] = new[off]; 529 530 continue; 531 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc); 532 } else if (new[off] != '&') { 533 rswap[mlen++] = new[off]; 534 535 continue; 536 } 537 538 if (match[cc].rm_so == -1) ll = 0; // Empty match. 539 else { 540 ll = match[cc].rm_eo-match[cc].rm_so; 541 memcpy(rswap+mlen, rline+match[cc].rm_so, ll); 542 } 543 mlen += ll; 544 } 545 546 rline = rswap+newlen; 547 free(line); 548 line = swap; 549 550 // Stop after first substitution unless we have flag g 551 if (!(command->sflags & 2)) break; 552 } 553 554 if (mflags) { 555 // flag p 556 if (command->sflags & 4) emit(line, len, eol); 557 558 tea = 1; 559 if (command->w) goto writenow; 560 } 561 } else if (c=='w') { 562 int fd, noeol; 563 char *name; 564 565 writenow: 566 // Swap out emit() context 567 fd = TT.fdout; 568 noeol = TT.noeol; 569 570 // We save filehandle and newline status before filename 571 name = command->w + (char *)command; 572 memcpy(&TT.fdout, name, 4); 573 name += 4; 574 TT.noeol = *(name++); 575 576 // write, then save/restore context 577 if (emit(line, len, eol)) 578 perror_exit("w '%s'", command->arg1+(char *)command); 579 *(--name) = TT.noeol; 580 TT.noeol = noeol; 581 TT.fdout = fd; 582 } else if (c=='x') { 583 long swap = TT.rememberlen; 584 585 str = TT.remember; 586 TT.remember = line; 587 line = str; 588 TT.rememberlen = len; 589 len = swap; 590 } else if (c=='y') { 591 char *from, *to = (char *)command; 592 int i, j; 593 594 from = to+command->arg1; 595 to += command->arg2; 
596 597 for (i = 0; i < len; i++) { 598 j = stridx(from, line[i]); 599 if (j != -1) line[i] = to[j]; 600 } 601 } else if (c=='=') { 602 sprintf(toybuf, "%ld", TT.count); 603 if (emit(toybuf, strlen(toybuf), 1)) break; 604 } 605 606 command = command->next; 607 } 608 609 if (line && !FLAG(n)) emit(line, len, eol); 610 611 done: 612 if (dlist_terminate(append)) while (append) { 613 struct append *a = append->next; 614 615 if (append->file) { 616 int fd = open(append->str, O_RDONLY); 617 618 // Force newline if noeol pending 619 if (fd != -1) { 620 if (TT.noeol) xwrite(TT.fdout, "\n", 1); 621 TT.noeol = 0; 622 xsendfile(fd, TT.fdout); 623 close(fd); 624 } 625 } else if (append->str) emit(append->str, strlen(append->str), 1); 626 else emit(line, 0, 0); 627 free(append); 628 append = a; 629 } 630 free(line); 631 } 632 633 // Callback called on each input file 634 static void do_sed_file(int fd, char *name) 635 { 636 char *tmp; 637 638 if (FLAG(i)) { 639 struct sedcmd *command; 640 641 if (!fd) return error_msg("-i on stdin"); 642 TT.fdout = copy_tempfile(fd, name, &tmp); 643 TT.count = 0; 644 for (command = (void *)TT.pattern; command; command = command->next) 645 command->hit = 0; 646 } 647 do_lines(fd, TT.delim, sed_line); 648 if (FLAG(i)) { 649 if (TT.i && *TT.i) { 650 char *s = xmprintf("%s%s", name, TT.i); 651 652 xrename(name, s); 653 free(s); 654 } 655 replace_tempfile(-1, TT.fdout, &tmp); 656 TT.fdout = 1; 657 TT.nextline = 0; 658 TT.nextlen = TT.noeol = 0; 659 } 660 } 661 662 // Copy chunk of string between two delimiters, converting printf escapes. 663 // returns processed copy of string (0 if error), *pstr advances to next 664 // unused char. 
// if delim (or *delim) is 0 uses/saves starting char as delimiter
// if regex, ignore delimiter in [ranges]
//
// pstr: in/out cursor into the script text. Only advanced on success.
// delim: NULL, or pointer to the delimiter character. When it points to 0
//   the delimiter is read from the start of the string (optionally preceded
//   by a backslash) and stored back through the pointer.
// Returns a freshly allocated string the caller must free(), or 0 when the
// input ends before the closing delimiter (nothing is leaked in that case).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Output is never longer than the input
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, and '.', '='
  // or ':' inside a [. .] [= =] [: :] element of a bracket expression.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // Check from[1] first: strchr() also matches the terminating NUL
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }

    // The branches above may have consumed a character: don't copy the
    // terminator and walk off the end of an unterminated string.
    if (!*from) goto fail;
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  free(out);

  return 0;
}

// Translate pattern strings into command structures. Each command structure
// is a single allocation (which requires some math and realloc at times).
729 static void parse_pattern(char **pline, long len) 730 { 731 struct sedcmd *command = (void *)TT.pattern; 732 char *line, *reg, c, *errstart; 733 int i; 734 735 line = errstart = pline ? *pline : ""; 736 if (len && line[len-1]=='\n') line[--len] = 0; 737 738 // Append this line to previous multiline command? (hit indicates type.) 739 // During parsing "hit" stores data about line continuations, but in 740 // sed_line() it means the match range attached to this command 741 // is active, so processing the continuation must zero it again. 742 if (command && command->prev->hit) { 743 // Remove half-finished entry from list so remalloc() doesn't confuse it 744 TT.pattern = TT.pattern->prev; 745 command = dlist_pop(&TT.pattern); 746 c = command->c; 747 reg = (char *)command; 748 reg += command->arg1 + strlen(reg + command->arg1); 749 750 // Resume parsing for 'a' or 's' command. (Only two that can do this.) 751 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be 752 // a unicode character. 753 if (command->hit < 256) goto resume_s; 754 else goto resume_a; 755 } 756 757 // Loop through commands in this line. 758 759 command = 0; 760 for (;;) { 761 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command); 762 763 // If there's no more data on this line, return. 764 for (;;) { 765 while (isspace(*line) || *line == ';') line++; 766 if (*line == '#') while (*line && *line != '\n') line++; 767 else break; 768 } 769 if (!*line) return; 770 771 // We start by writing data into toybuf. 
Later we'll allocate the 772 // ex 773 774 errstart = line; 775 memset(toybuf, 0, sizeof(struct sedcmd)); 776 command = (void *)toybuf; 777 reg = toybuf + sizeof(struct sedcmd); 778 779 // Parse address range (if any) 780 for (i = 0; i < 2; i++) { 781 if (*line == ',') line++; 782 else if (i) break; 783 784 if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0); 785 else if (*line == '$') { 786 command->lmatch[i] = -1; 787 line++; 788 } else if (*line == '/' || *line == '\\') { 789 char *s = line; 790 791 if (!(s = unescape_delimited_string(&line, 0))) goto error; 792 if (!*s) command->rmatch[i] = 0; 793 else { 794 xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r)); 795 command->rmatch[i] = reg-toybuf; 796 reg += sizeof(regex_t); 797 } 798 free(s); 799 } else break; 800 } 801 802 while (isspace(*line)) line++; 803 if (!*line) break; 804 805 while (*line == '!') { 806 command->not = 1; 807 line++; 808 } 809 while (isspace(*line)) line++; 810 811 c = command->c = *(line++); 812 if (strchr("}:", c) && i) break; 813 if (strchr("aiqr=", c) && i>1) break; 814 815 // Add step to pattern 816 command = xmemdup(toybuf, reg-toybuf); 817 reg = (reg-toybuf) + (char *)command; 818 819 // Parse arguments by command type 820 if (c == '{') TT.nextlen++; 821 else if (c == '}') { 822 if (!TT.nextlen--) break; 823 } else if (c == 's') { 824 char *end, delim = 0; 825 826 // s/pattern/replacement/flags 827 828 // line continuations use arg1 (back at the start of the function), 829 // so let's fill out arg2 first (since the regex part can't be multiple 830 // lines) and swap them back later. 
831 832 // get pattern (just record, we parse it later) 833 command->arg2 = reg - (char *)command; 834 if (!(TT.remember = unescape_delimited_string(&line, &delim))) 835 goto error; 836 837 reg += sizeof(regex_t); 838 command->arg1 = reg-(char *)command; 839 command->hit = delim; 840 resume_s: 841 // get replacement - don't replace escapes yet because \1 and \& need 842 // processing later, after we replace \\ with \ we can't tell \\1 from \1 843 end = line; 844 while (*end != command->hit) { 845 if (!*end) goto error; 846 if (*end++ == '\\') { 847 if (!*end || *end == '\n') { 848 end[-1] = '\n'; 849 break; 850 } 851 end++; 852 } 853 } 854 855 reg = extend_string((void *)&command, line, reg-(char *)command,end-line); 856 line = end; 857 // line continuation? (note: '\n' can't be a valid delim). 858 if (*line == command->hit) command->hit = 0; 859 else { 860 if (!*line) continue; 861 reg--; 862 line++; 863 goto resume_s; 864 } 865 866 // swap arg1/arg2 so they're back in order arguments occur. 
867 i = command->arg1; 868 command->arg1 = command->arg2; 869 command->arg2 = i; 870 871 // get flags 872 for (line++; *line; line++) { 873 long l; 874 875 if (isspace(*line) && *line != '\n') continue; 876 877 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l; 878 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) { 879 command->sflags |= l << 3; 880 line--; 881 } else break; 882 } 883 884 // We deferred actually parsing the regex until we had the s///i flag 885 // allocating the space was done by extend_string() above 886 if (!*TT.remember) command->arg1 = 0; 887 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember, 888 (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE)); 889 free(TT.remember); 890 TT.remember = 0; 891 if (*line == 'w') { 892 line++; 893 goto writenow; 894 } 895 } else if (c == 'w') { 896 int fd, delim; 897 char *cc; 898 899 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and 900 // eol status, and to retain the filename for error messages, we'd need 901 // to go up to arg5 just for this. Compromise: dynamically allocate the 902 // filehandle and eol status. 
903 904 writenow: 905 while (isspace(*line)) line++; 906 if (!*line) goto error; 907 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break; 908 delim = *cc; 909 *cc = 0; 910 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644); 911 *cc = delim; 912 913 command->w = reg - (char *)command; 914 command = xrealloc(command, command->w+(cc-line)+6); 915 reg = command->w + (char *)command; 916 917 memcpy(reg, &fd, 4); 918 reg += 4; 919 *(reg++) = 0; 920 memcpy(reg, line, delim); 921 reg += delim; 922 *(reg++) = 0; 923 924 line = cc; 925 if (delim) line += 2; 926 } else if (c == 'y') { 927 char *s, delim = 0; 928 int len; 929 930 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 931 command->arg1 = reg-(char *)command; 932 len = strlen(s); 933 reg = extend_string((void *)&command, s, reg-(char *)command, len); 934 free(s); 935 command->arg2 = reg-(char *)command; 936 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 937 if (len != strlen(s)) goto error; 938 reg = extend_string((void *)&command, s, reg-(char*)command, len); 939 free(s); 940 } else if (strchr("abcirtTw:", c)) { 941 int end; 942 943 // trim leading spaces 944 while (isspace(*line) && *line != '\n') line++; 945 946 // Resume logic differs from 's' case because we don't add a newline 947 // unless it's after something, so we add it on return instead. 948 resume_a: 949 command->hit = 0; 950 951 // btT: end with space or semicolon, aicrw continue to newline. 952 if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) { 953 // Argument's optional for btT 954 if (strchr("btT", c)) continue; 955 else if (!command->arg1) break; 956 } 957 958 // Extend allocation to include new string. We use offsets instead of 959 // pointers so realloc() moving stuff doesn't break things. Ok to write 960 // \n over NUL terminator because call to extend_string() adds it back. 
961 if (!command->arg1) command->arg1 = reg - (char*)command; 962 else if (*(command->arg1+(char *)command)) *(reg++) = '\n'; 963 else if (!pline) { 964 command->arg1 = 0; 965 continue; 966 } 967 reg = extend_string((void *)&command, line, reg - (char *)command, end); 968 969 // Recopy data to remove escape sequences and handle line continuation. 970 if (strchr("aci", c)) { 971 reg -= end+1; 972 for (i = end; i; i--) { 973 if ((*reg++ = *line++)=='\\') { 974 975 // escape at end of line: resume if -e escaped literal newline, 976 // else request callback and resume with next line 977 if (!--i) { 978 *--reg = 0; 979 if (*line) { 980 line++; 981 goto resume_a; 982 } 983 command->hit = 256; 984 break; 985 } 986 if (!(reg[-1] = unescape(*line))) reg[-1] = *line; 987 line++; 988 } 989 } 990 *reg = 0; 991 } else line += end; 992 993 // Commands that take no arguments 994 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break; 995 } 996 997 error: 998 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line); 999 } 1000 1001 void sed_main(void) 1002 { 1003 struct arg_list *al; 1004 char **args = toys.optargs; 1005 1006 if (!FLAG(z)) TT.delim = '\n'; 1007 1008 // Lie to autoconf when it asks stupid questions, so configure regexes 1009 // that look for "GNU sed version %f" greater than some old buggy number 1010 // don't fail us for not matching their narrow expectations. 1011 if (FLAG(version)) { 1012 xprintf("This is not GNU sed version 9.0\n"); 1013 return; 1014 } 1015 1016 // Handling our own --version means we handle our own --help too. 1017 if (FLAG(help)) help_exit(0); 1018 1019 // Parse pattern into commands. 1020 1021 // If no -e or -f, first argument is the pattern. 1022 if (!TT.e && !TT.f) { 1023 if (!*toys.optargs) error_exit("no pattern"); 1024 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++); 1025 } 1026 1027 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah" 1028 // so handle all -e, then all -f. 
(At least the behavior's consistent.) 1029 1030 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg)); 1031 parse_pattern(0, 0); 1032 for (al = TT.f; al; al = al->next) 1033 do_lines(xopenro(al->arg), TT.delim, parse_pattern); 1034 dlist_terminate(TT.pattern); 1035 if (TT.nextlen) error_exit("no }"); 1036 1037 TT.fdout = 1; 1038 TT.remember = xstrdup(""); 1039 1040 // Inflict pattern upon input files. Long version because !O_CLOEXEC 1041 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file); 1042 1043 // Provide EOF flush at end of cumulative input for non-i mode. 1044 if (!FLAG(i)) { 1045 toys.optflags |= FLAG_i; 1046 sed_line(0, 0); 1047 } 1048 1049 // todo: need to close fd when done for TOYBOX_FREE? 1050 } 1051