/* sed.c - stream editor. Thing that does s/// and other stuff.
 *
 * Copyright 2014 Rob Landley <rob (at) landley.net>
 *
 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
 *
 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
 * but N and s///
 * TODO: make y// handle unicode
 * TODO: handle error return from emit(), error_msg/exit consistently
 *       What's the right thing to do for -i when write fails? Skip to next?

USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))

config SED
  bool "sed"
  default y
  help
    usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]

    Stream editor. Apply one or more editing SCRIPTs to each line of input
    (from FILE or stdin) producing output (by default to stdout).

    -e  add SCRIPT to list
    -f  add contents of SCRIPT_FILE to list
    -i  Edit each file in place.
    -n  No default output. (Use the p command to output matched lines.)
    -r  Use extended regular expression syntax.
    -E  Alias for -r.
    -s  Treat input files separately (implied by -i)

    A SCRIPT is a series of one or more COMMANDs separated by newlines or
    semicolons. All -e SCRIPTs are concatenated together as if separated
    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.

    Each COMMAND may be preceded by an address which limits the command to
    apply only to the specified line(s). Commands without an address apply to
    every line. Addresses are of the form:

      [ADDRESS[,ADDRESS]]COMMAND

    The ADDRESS may be a decimal line number (starting at 1), a /regular
    expression/ within a pair of forward slashes, or the character "$" which
    matches the last line of input. (In -s or -i mode this matches the last
    line of each file, otherwise just the last line of the last file.)
    A single address matches one line, a pair of comma separated addresses
    match everything from the first address to the second address
    (inclusive). If both addresses are regular expressions, more than one
    range of lines in each file can match.

    REGULAR EXPRESSIONS in sed are started and ended by the same character
    (traditionally / but anything except a backslash or a newline works).
    Backslashes may be used to escape the delimiter if it occurs in the
    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
    and unicode). An empty regex repeats the previous one. ADDRESS regexes
    (above) require the first delimiter to be escaped with a backslash when
    it isn't a forward slash (to distinguish it from the COMMANDs below).

    Sed mostly operates on individual lines one at a time. It reads each line,
    processes it, and either writes it to the output or discards it before
    reading the next line. Sed can remember one additional line in a separate
    buffer (using the h, H, g, G, and x commands), and can read the next line
    of input early (using the n and N commands), but other than that command
    scripts operate on individual lines of text.

    Each COMMAND starts with a single character. The following commands take
    no arguments:

      {  Start a new command block, continuing until a corresponding "}".
         Command blocks may nest. If the block has an address, commands within
         the block are only run for lines within the block's address range.

      }  End command block (this command cannot have an address)

      d  Delete this line and move on to the next one
         (ignores remaining COMMANDs)

      D  Delete one line of input and restart command SCRIPT (same as "d"
         unless you've glued lines together with "N" or similar)

      g  Get remembered line (overwriting current line)

      G  Get remembered line (appending to current line)

      h  Remember this line (overwriting remembered line)

      H  Remember this line (appending to remembered line, if any)

      l  Print line, escaping \abfrtv (but not newline), octal escaping other
         nonprintable characters, wrapping lines to terminal width with a
         backslash, and appending $ to actual end of line.

      n  Print default output and read next line, replacing current line
         (If no next line available, quit processing script)

      N  Append next line of input to this line, separated by a newline
         (This advances the line counter for address matching and "=", if no
         next line available quit processing script without default output)

      p  Print this line

      P  Print this line up to first newline (from "N")

      q  Quit (print default output, no more commands processed or lines read)

      x  Exchange this line with remembered line (overwrite in both directions)

      =  Print the current line number (followed by a newline)

    The following commands (may) take an argument. The "text" arguments (to
    the "a", "c", and "i" commands) may end with an unescaped "\" to append
    the next line (for which leading whitespace is not skipped), and also
    treat ";" as a literal character (use "\;" instead).

      a [text]   Append text to output before attempting to read next line

      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)

      c [text]   Delete line, output text at end of matching address range
                 (ignores remaining COMMANDs)

      i [text]   Print text

      r [file]   Append contents of file to output before attempting to read
                 next line.

      s/S/R/F    Search for regex S, replace matched text with R using flags F.
                 The first character after the "s" (anything but newline or
                 backslash) is the delimiter, escape with \ to use normally.

                 The replacement text may contain "&" to substitute the matched
                 text (escape it with backslash for a literal &), or \1 through
                 \9 to substitute a parenthetical subexpression in the regex.
                 You can also use the normal backslash escapes such as \n and
                 a backslash at the end of the line appends the next line.

                 The flags are:

                 [0-9]    A number, substitute only that occurrence of pattern
                 g        Global, substitute all occurrences of pattern
                 i        Ignore case when matching
                 p        Print the line if match was found and replaced
                 w [file] Write (append) line to file if match replaced

      t [label]  Test, jump to :label only if an "s" command found a match in
                 this line since last test (replacing with same text counts)

      T [label]  Test false, jump only if "s" hasn't found a match.

      w [file]   Write (append) line to file

      y/old/new/ Change each character in 'old' to corresponding character
                 in 'new' (with standard backslash escapes, delimiter can be
                 any repeated character except \ or \n)

      : [label]  Labeled target for jump commands

      #  Comment, ignore rest of this line of SCRIPT

    Deviations from posix: allow extended regular expressions with -r,
    editing in place with -i, separate with -s, printf escapes in text, line
    continuations, semicolons after all commands, 2-address anywhere an
    address is allowed, "T" command, multiline continuations for [abc],
    \; to end [abc] argument before end of line.
*/

#define FOR_sed
#include "toys.h"

// Per-command global state, accessed through TT.
GLOBALS(
  // Script sources: sed_main() parses each TT.e entry as a script string and
  // reads each TT.f entry as a script file.
  struct arg_list *f;
  struct arg_list *e;

  // processed pattern list
  struct double_list *pattern;

  // nextline: line read ahead but not yet processed (see process_line()).
  // remember: the "remembered line" used by g/G/h/H/x. While the script is
  // being parsed it temporarily holds the regex text of an s/// command.
  char *nextline, *remember;
  // restart: command to resume at after n/N, stored +1 so it is never NULL.
  // lastregex: most recently used regex, reused when a regex is empty.
  void *restart, *lastregex;
  // nextlen/rememberlen: byte lengths of nextline/remember (nextlen doubles
  // as the count of unclosed "{" during parsing). count: current line number.
  long nextlen, rememberlen, count;
  // fdout: where emit() writes. noeol: set when the last emitted line had no
  // trailing newline, so one is written before the next output.
  int fdout, noeol;
  // Cached wrap width for the "l" command (0 until first use).
  unsigned xx;
)

// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we remalloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted.
189 190 struct sedcmd { 191 struct sedcmd *next, *prev; 192 193 // Begin and end of each match 194 long lmatch[2]; // line number of match 195 int rmatch[2]; // offset of regex struct for prefix matches (/abc/,/def/p) 196 int arg1, arg2, w; // offset of two arguments per command, plus s//w filename 197 unsigned not, hit; 198 unsigned sflags; // s///flag bits: i=1, g=2, p=4 199 char c; // action 200 }; 201 202 // Write out line with potential embedded NUL, handling eol/noeol 203 static int emit(char *line, long len, int eol) 204 { 205 int l, old = line[len]; 206 207 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1; 208 TT.noeol = !eol; 209 if (eol) line[len++] = '\n'; 210 if (!len) return 0; 211 l = writeall(TT.fdout, line, len); 212 if (eol) line[len-1] = old; 213 if (l != len) { 214 perror_msg("short write"); 215 216 return 1; 217 } 218 219 return 0; 220 } 221 222 // Extend allocation to include new string, with newline between if newlen<0 223 224 static char *extend_string(char **old, char *new, int oldlen, int newlen) 225 { 226 int newline = newlen < 0; 227 char *s; 228 229 if (newline) newlen = -newlen; 230 s = *old = xrealloc(*old, oldlen+newlen+newline+1); 231 if (newline) s[oldlen++] = '\n'; 232 memcpy(s+oldlen, new, newlen); 233 s[oldlen+newlen] = 0; 234 235 return s+oldlen+newlen+1; 236 } 237 238 // An empty regex repeats the previous one 239 static void *get_regex(void *trump, int offset) 240 { 241 if (!offset) { 242 if (!TT.lastregex) error_exit("no previous regex"); 243 return TT.lastregex; 244 } 245 246 return TT.lastregex = offset+(char *)trump; 247 } 248 249 // Apply pattern to line from input file 250 static void process_line(char **pline, long plen) 251 { 252 struct append { 253 struct append *next, *prev; 254 int file; 255 char *str; 256 } *append = 0; 257 char *line = TT.nextline; 258 long len = TT.nextlen; 259 struct sedcmd *command; 260 int eol = 0, tea = 0; 261 262 // Grab next line for deferred processing (EOF detection: we get a NULL 
263 // pline at EOF to flush last line). Note that only end of _last_ input 264 // file matches $ (unless we're doing -i). 265 TT.nextline = 0; 266 TT.nextlen = 0; 267 if (pline) { 268 TT.nextline = *pline; 269 TT.nextlen = plen; 270 *pline = 0; 271 } 272 273 if (!line || !len) return; 274 if (line[len-1] == '\n') line[--len] = eol++; 275 TT.count++; 276 277 // The restart-1 is because we added one to make sure it wasn't NULL, 278 // otherwise N as last command would restart script 279 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern; 280 TT.restart = 0; 281 282 while (command) { 283 char *str, c = command->c; 284 285 // Have we got a line or regex matching range for this rule? 286 if (*command->lmatch || *command->rmatch) { 287 int miss = 0; 288 long lm; 289 290 // In a match that might end? 291 if (command->hit) { 292 if (!(lm = command->lmatch[1])) { 293 if (!command->rmatch[1]) command->hit = 0; 294 else { 295 void *rm = get_regex(command, command->rmatch[1]); 296 297 // regex match end includes matching line, so defer deactivation 298 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1; 299 } 300 } else if (lm > 0 && lm < TT.count) command->hit = 0; 301 302 // Start a new match? 303 } else { 304 if (!(lm = *command->lmatch)) { 305 void *rm = get_regex(command, *command->rmatch); 306 307 if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++; 308 } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++; 309 310 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1; 311 } 312 313 // Didn't match? 
314 lm = !(command->hit ^ command->not); 315 316 // Deferred disable from regex end match 317 if (miss || command->lmatch[1] == TT.count) command->hit = 0; 318 319 if (lm) { 320 // Handle skipping curly bracket command group 321 if (c == '{') { 322 int curly = 1; 323 324 while (curly) { 325 command = command->next; 326 if (command->c == '{') curly++; 327 if (command->c == '}') curly--; 328 } 329 } 330 command = command->next; 331 continue; 332 } 333 } 334 335 // A deleted line can still update line match state for later commands 336 if (!line) { 337 command = command->next; 338 continue; 339 } 340 341 // Process command 342 343 if (c=='a' || c=='r') { 344 struct append *a = xzalloc(sizeof(struct append)); 345 if (command->arg1) a->str = command->arg1+(char *)command; 346 a->file = c=='r'; 347 dlist_add_nomalloc((void *)&append, (void *)a); 348 } else if (c=='b' || c=='t' || c=='T') { 349 int t = tea; 350 351 if (c != 'b') tea = 0; 352 if (c=='b' || t^(c=='T')) { 353 if (!command->arg1) break; 354 str = command->arg1+(char *)command; 355 for (command = (void *)TT.pattern; command; command = command->next) 356 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str)) 357 break; 358 if (!command) error_exit("no :%s", str); 359 } 360 } else if (c=='c') { 361 str = command->arg1+(char *)command; 362 if (!command->hit) emit(str, strlen(str), 1); 363 free(line); 364 line = 0; 365 continue; 366 } else if (c=='d') { 367 free(line); 368 line = 0; 369 continue; 370 } else if (c=='D') { 371 // Delete up to \n or end of buffer 372 str = line; 373 while ((str-line)<len) if (*(str++) == '\n') break; 374 len -= str - line; 375 memmove(line, str, len); 376 377 // if "delete" blanks line, disable further processing 378 // otherwise trim and restart script 379 if (!len) { 380 free(line); 381 line = 0; 382 } else { 383 line[len] = 0; 384 command = (void *)TT.pattern; 385 } 386 continue; 387 } else if (c=='g') { 388 free(line); 389 line = xstrdup(TT.remember); 390 len = 
TT.rememberlen; 391 } else if (c=='G') { 392 line = xrealloc(line, len+TT.rememberlen+2); 393 line[len++] = '\n'; 394 memcpy(line+len, TT.remember, TT.rememberlen); 395 line[len += TT.rememberlen] = 0; 396 } else if (c=='h') { 397 free(TT.remember); 398 TT.remember = xstrdup(line); 399 TT.rememberlen = len; 400 } else if (c=='H') { 401 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2); 402 TT.remember[TT.rememberlen++] = '\n'; 403 memcpy(TT.remember+TT.rememberlen, line, len); 404 TT.remember[TT.rememberlen += len] = 0; 405 } else if (c=='i') { 406 str = command->arg1+(char *)command; 407 emit(str, strlen(str), 1); 408 } else if (c=='l') { 409 int i, x, off; 410 411 if (!TT.xx) { 412 terminal_size(&TT.xx, 0); 413 if (!TT.xx) TT.xx = 80; 414 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10; 415 if (TT.xx > 4) TT.xx -= 4; 416 } 417 418 for (i = off = 0; i<len; i++) { 419 if (off >= TT.xx) { 420 toybuf[off++] = '\\'; 421 emit(toybuf, off, 1); 422 off = 0; 423 } 424 x = stridx("\\\a\b\f\r\t\v", line[i]); 425 if (x != -1) { 426 toybuf[off++] = '\\'; 427 toybuf[off++] = "\\abfrtv"[x]; 428 } else if (line[i] >= ' ') toybuf[off++] = line[i]; 429 else off += sprintf(toybuf+off, "\\%03o", line[i]); 430 } 431 toybuf[off++] = '$'; 432 emit(toybuf, off, 1); 433 } else if (c=='n') { 434 TT.restart = command->next+1; 435 436 break; 437 } else if (c=='N') { 438 // Can't just grab next line because we could have multiple N and 439 // we need to actually read ahead to get N;$p EOF detection right. 440 if (pline) { 441 TT.restart = command->next+1; 442 extend_string(&line, TT.nextline, len, -TT.nextlen); 443 free(TT.nextline); 444 TT.nextline = line; 445 TT.nextlen += len + 1; 446 line = 0; 447 } 448 449 // Pending append goes out right after N 450 goto done; 451 } else if (c=='p' || c=='P') { 452 char *l = (c=='P') ? strchr(line, '\n') : 0; 453 454 if (emit(line, l ? 
l-line : len, eol)) break; 455 } else if (c=='q') { 456 if (pline) *pline = (void *)1; 457 free(TT.nextline); 458 TT.nextline = 0; 459 TT.nextlen = 0; 460 461 break; 462 } else if (c=='s') { 463 char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap; 464 regmatch_t *match = (void *)toybuf; 465 regex_t *reg = get_regex(command, command->arg1); 466 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen; 467 468 // Find match in remaining line (up to remaining len) 469 while (!regexec0(reg, rline, rlen, 10, match, mflags)) { 470 mflags = REG_NOTBOL; 471 472 // Zero length matches don't count immediately after a previous match 473 mlen = match[0].rm_eo-match[0].rm_so; 474 if (!mlen && !zmatch) { 475 if (!rlen--) break; 476 rline++; 477 zmatch++; 478 continue; 479 } else zmatch = 0; 480 481 // If we're replacing only a specific match, skip if this isn't it 482 off = command->sflags>>3; 483 if (off && off != ++count) { 484 rline += match[0].rm_eo; 485 rlen -= match[0].rm_eo; 486 487 continue; 488 } 489 // The fact getline() can allocate unbounded amounts of memory is 490 // a bigger issue, but while we're here check for integer overflow 491 if (match[0].rm_eo > INT_MAX) perror_exit(0); 492 493 // newlen = strlen(new) but with \1 and & and printf escapes 494 for (off = newlen = 0; new[off]; off++) { 495 int cc = -1; 496 497 if (new[off] == '&') cc = 0; 498 else if (new[off] == '\\') cc = new[++off] - '0'; 499 if (cc < 0 || cc > 9) { 500 newlen++; 501 continue; 502 } 503 newlen += match[cc].rm_eo-match[cc].rm_so; 504 } 505 506 // Allocate new size, copy start/end around match. (Can't extend in 507 // place because backrefs may refer to text after it's overwritten.) 
508 len += newlen-mlen; 509 swap = xmalloc(len+1); 510 rswap = swap+(rline-line)+match[0].rm_so; 511 memcpy(swap, line, (rline-line)+match[0].rm_so); 512 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1); 513 514 // copy in new replacement text 515 for (off = mlen = 0; new[off]; off++) { 516 int cc = 0, ll; 517 518 if (new[off] == '\\') { 519 cc = new[++off] - '0'; 520 if (cc<0 || cc>9) { 521 if (!(rswap[mlen++] = unescape(new[off]))) 522 rswap[mlen-1] = new[off]; 523 524 continue; 525 } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc); 526 } else if (new[off] != '&') { 527 rswap[mlen++] = new[off]; 528 529 continue; 530 } 531 532 ll = match[cc].rm_eo-match[cc].rm_so; 533 memcpy(rswap+mlen, rline+match[cc].rm_so, ll); 534 mlen += ll; 535 } 536 537 rline = rswap+newlen; 538 free(line); 539 line = swap; 540 541 // Stop after first substitution unless we have flag g 542 if (!(command->sflags & 2)) break; 543 } 544 545 if (mflags) { 546 // flag p 547 if (command->sflags & 4) emit(line, len, eol); 548 549 tea = 1; 550 if (command->w) goto writenow; 551 } 552 } else if (c=='w') { 553 int fd, noeol; 554 char *name; 555 556 writenow: 557 // Swap out emit() context 558 fd = TT.fdout; 559 noeol = TT.noeol; 560 561 // We save filehandle and newline status before filename 562 name = command->w + (char *)command; 563 memcpy(&TT.fdout, name, 4); 564 name += 4; 565 TT.noeol = *(name++); 566 567 // write, then save/restore context 568 if (emit(line, len, eol)) 569 perror_exit("w '%s'", command->arg1+(char *)command); 570 *(--name) = TT.noeol; 571 TT.noeol = noeol; 572 TT.fdout = fd; 573 } else if (c=='x') { 574 long swap = TT.rememberlen; 575 576 str = TT.remember; 577 TT.remember = line; 578 line = str; 579 TT.rememberlen = len; 580 len = swap; 581 } else if (c=='y') { 582 char *from, *to = (char *)command; 583 int i, j; 584 585 from = to+command->arg1; 586 to += command->arg2; 587 588 for (i = 0; i < len; i++) { 589 j = stridx(from, line[i]); 
590 if (j != -1) line[i] = to[j]; 591 } 592 } else if (c=='=') { 593 sprintf(toybuf, "%ld", TT.count); 594 emit(toybuf, strlen(toybuf), 1); 595 } 596 597 command = command->next; 598 } 599 600 if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol); 601 602 done: 603 if (dlist_terminate(append)) while (append) { 604 struct append *a = append->next; 605 606 if (append->file) { 607 int fd = open(append->str, O_RDONLY); 608 609 // Force newline if noeol pending 610 if (fd != -1) { 611 if (TT.noeol) xwrite(TT.fdout, "\n", 1); 612 TT.noeol = 0; 613 xsendfile(fd, TT.fdout); 614 close(fd); 615 } 616 } else if (append->str) emit(append->str, strlen(append->str), 1); 617 else emit(line, 0, 0); 618 free(append); 619 append = a; 620 } 621 free(line); 622 } 623 624 // Callback called on each input file 625 static void do_sed(int fd, char *name) 626 { 627 int i = toys.optflags & FLAG_i; 628 char *tmp; 629 630 if (i) { 631 struct sedcmd *command; 632 633 if (!fd && !strcmp(name, "-")) { 634 error_msg("-i on stdin"); 635 return; 636 } 637 TT.fdout = copy_tempfile(fd, name, &tmp); 638 TT.count = 0; 639 for (command = (void *)TT.pattern; command; command = command->next) 640 command->hit = 0; 641 } 642 do_lines(fd, process_line); 643 if (i) { 644 process_line(0, 0); 645 replace_tempfile(-1, TT.fdout, &tmp); 646 TT.fdout = 1; 647 TT.nextline = 0; 648 TT.nextlen = TT.noeol = 0; 649 } 650 } 651 652 // Copy chunk of string between two delimiters, converting printf escapes. 653 // returns processed copy of string (0 if error), *pstr advances to next 654 // unused char. 
if delim (or *delim) is 0 uses/saves starting char as delimiter 655 // if regxex, ignore delimiter in [ranges] 656 static char *unescape_delimited_string(char **pstr, char *delim) 657 { 658 char *to, *from, mode = 0, d; 659 660 // Grab leading delimiter (if necessary), allocate space for new string 661 from = *pstr; 662 if (!delim || !*delim) { 663 if (!(d = *(from++))) return 0; 664 if (d == '\\') d = *(from++); 665 if (!d || d == '\\') return 0; 666 if (delim) *delim = d; 667 } else d = *delim; 668 to = delim = xmalloc(strlen(*pstr)+1); 669 670 while (mode || *from != d) { 671 if (!*from) return 0; 672 673 // delimiter in regex character range doesn't count 674 if (*from == '[') { 675 if (!mode) { 676 mode = ']'; 677 if (from[1]=='-' || from[1]==']') *(to++) = *(from++); 678 } else if (mode == ']' && strchr(".=:", from[1])) { 679 *(to++) = *(from++); 680 mode = *from; 681 } 682 } else if (*from == mode) { 683 if (mode == ']') mode = 0; 684 else { 685 *(to++) = *(from++); 686 mode = ']'; 687 } 688 // Length 1 range (X-X with same X) is "undefined" and makes regcomp err, 689 // but the perl build does it, so we need to filter it out. 690 } else if (mode && *from == '-' && from[-1] == from[1]) { 691 from+=2; 692 continue; 693 } else if (*from == '\\') { 694 if (!from[1]) return 0; 695 696 // Check escaped end delimiter before printf style escapes. 697 if (from[1] == d) from++; 698 else if (from[1]=='\\') *(to++) = *(from++); 699 else { 700 char c = unescape(from[1]); 701 702 if (c) { 703 *(to++) = c; 704 from+=2; 705 continue; 706 } else if (!mode) *(to++) = *(from++); 707 } 708 } 709 *(to++) = *(from++); 710 } 711 *to = 0; 712 *pstr = from+1; 713 714 return delim; 715 } 716 717 // Translate pattern strings into command structures. Each command structure 718 // is a single allocation (which requires some math and remalloc at times). 
719 static void parse_pattern(char **pline, long len) 720 { 721 struct sedcmd *command = (void *)TT.pattern; 722 char *line, *reg, c, *errstart; 723 int i; 724 725 line = errstart = pline ? *pline : ""; 726 if (len && line[len-1]=='\n') line[--len] = 0; 727 728 // Append this line to previous multiline command? (hit indicates type.) 729 // During parsing "hit" stores data about line continuations, but in 730 // process_line() it means the match range attached to this command 731 // is active, so processing the continuation must zero it again. 732 if (command && command->prev->hit) { 733 // Remove half-finished entry from list so remalloc() doesn't confuse it 734 TT.pattern = TT.pattern->prev; 735 command = dlist_pop(&TT.pattern); 736 c = command->c; 737 reg = (char *)command; 738 reg += command->arg1 + strlen(reg + command->arg1); 739 740 // Resume parsing for 'a' or 's' command. (Only two that can do this.) 741 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be 742 // a unicode character. 743 if (command->hit < 256) goto resume_s; 744 else goto resume_a; 745 } 746 747 // Loop through commands in this line. 748 749 command = 0; 750 for (;;) { 751 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command); 752 753 // If there's no more data on this line, return. 754 for (;;) { 755 while (isspace(*line) || *line == ';') line++; 756 if (*line == '#') while (*line && *line != '\n') line++; 757 else break; 758 } 759 if (!*line) return; 760 761 // We start by writing data into toybuf. 
Later we'll allocate the 762 // ex 763 764 errstart = line; 765 memset(toybuf, 0, sizeof(struct sedcmd)); 766 command = (void *)toybuf; 767 reg = toybuf + sizeof(struct sedcmd); 768 769 // Parse address range (if any) 770 for (i = 0; i < 2; i++) { 771 if (*line == ',') line++; 772 else if (i) break; 773 774 if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0); 775 else if (*line == '$') { 776 command->lmatch[i] = -1; 777 line++; 778 } else if (*line == '/' || *line == '\\') { 779 char *s = line; 780 781 if (!(s = unescape_delimited_string(&line, 0))) goto error; 782 if (!*s) command->rmatch[i] = 0; 783 else { 784 xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED); 785 command->rmatch[i] = reg-toybuf; 786 reg += sizeof(regex_t); 787 } 788 free(s); 789 } else break; 790 } 791 792 while (isspace(*line)) line++; 793 if (!*line) break; 794 795 while (*line == '!') { 796 command->not = 1; 797 line++; 798 } 799 while (isspace(*line)) line++; 800 801 c = command->c = *(line++); 802 if (strchr("}:", c) && i) break; 803 if (strchr("aiqr=", c) && i>1) break; 804 805 // Add step to pattern 806 command = xmemdup(toybuf, reg-toybuf); 807 reg = (reg-toybuf) + (char *)command; 808 809 // Parse arguments by command type 810 if (c == '{') TT.nextlen++; 811 else if (c == '}') { 812 if (!TT.nextlen--) break; 813 } else if (c == 's') { 814 char *end, delim = 0; 815 816 // s/pattern/replacement/flags 817 818 // line continuations use arg1 (back at the start of the function), 819 // so let's fill out arg2 first (since the regex part can't be multiple 820 // lines) and swap them back later. 
821 822 // get pattern (just record, we parse it later) 823 command->arg2 = reg - (char *)command; 824 if (!(TT.remember = unescape_delimited_string(&line, &delim))) 825 goto error; 826 827 reg += sizeof(regex_t); 828 command->arg1 = reg-(char *)command; 829 command->hit = delim; 830 resume_s: 831 // get replacement - don't replace escapes yet because \1 and \& need 832 // processing later, after we replace \\ with \ we can't tell \\1 from \1 833 end = line; 834 while (*end != command->hit) { 835 if (!*end) goto error; 836 if (*end++ == '\\') { 837 if (!*end || *end == '\n') { 838 end[-1] = '\n'; 839 break; 840 } 841 end++; 842 } 843 } 844 845 reg = extend_string((void *)&command, line, reg-(char *)command,end-line); 846 line = end; 847 // line continuation? (note: '\n' can't be a valid delim). 848 if (*line == command->hit) command->hit = 0; 849 else { 850 if (!*line) continue; 851 reg--; 852 line++; 853 goto resume_s; 854 } 855 856 // swap arg1/arg2 so they're back in order arguments occur. 
857 i = command->arg1; 858 command->arg1 = command->arg2; 859 command->arg2 = i; 860 861 // get flags 862 for (line++; *line; line++) { 863 long l; 864 865 if (isspace(*line) && *line != '\n') continue; 866 867 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l; 868 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) { 869 command->sflags |= l << 3; 870 line--; 871 } else break; 872 } 873 874 // We deferred actually parsing the regex until we had the s///i flag 875 // allocating the space was done by extend_string() above 876 if (!*TT.remember) command->arg1 = 0; 877 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember, 878 ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE)); 879 free(TT.remember); 880 TT.remember = 0; 881 if (*line == 'w') { 882 line++; 883 goto writenow; 884 } 885 } else if (c == 'w') { 886 int fd, delim; 887 char *cc; 888 889 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and 890 // eol status, and to retain the filename for error messages, we'd need 891 // to go up to arg5 just for this. Compromise: dynamically allocate the 892 // filehandle and eol status. 
893 894 writenow: 895 while (isspace(*line)) line++; 896 if (!*line) goto error; 897 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break; 898 delim = *cc; 899 *cc = 0; 900 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644); 901 *cc = delim; 902 903 command->w = reg - (char *)command; 904 command = xrealloc(command, command->w+(cc-line)+6); 905 reg = command->w + (char *)command; 906 907 memcpy(reg, &fd, 4); 908 reg += 4; 909 *(reg++) = 0; 910 memcpy(reg, line, delim); 911 reg += delim; 912 *(reg++) = 0; 913 914 line = cc; 915 if (delim) line += 2; 916 } else if (c == 'y') { 917 char *s, delim = 0; 918 int len; 919 920 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 921 command->arg1 = reg-(char *)command; 922 len = strlen(s); 923 reg = extend_string((void *)&command, s, reg-(char *)command, len); 924 free(s); 925 command->arg2 = reg-(char *)command; 926 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 927 if (len != strlen(s)) goto error; 928 reg = extend_string((void *)&command, s, reg-(char*)command, len); 929 free(s); 930 } else if (strchr("abcirtTw:", c)) { 931 int end; 932 933 // trim leading spaces 934 while (isspace(*line) && *line != '\n') line++; 935 936 // Resume logic differs from 's' case because we don't add a newline 937 // unless it's after something, so we add it on return instead. 938 resume_a: 939 command->hit = 0; 940 941 // btT: end with space or semicolon, aicrw continue to newline. 942 if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) { 943 // Argument's optional for btT 944 if (strchr("btT", c)) continue; 945 else if (!command->arg1) break; 946 } 947 948 // Extend allocation to include new string. We use offsets instead of 949 // pointers so realloc() moving stuff doesn't break things. Ok to write 950 // \n over NUL terminator because call to extend_string() adds it back. 
951 if (!command->arg1) command->arg1 = reg - (char*)command; 952 else if (*(command->arg1+(char *)command)) *(reg++) = '\n'; 953 else if (!pline) { 954 command->arg1 = 0; 955 continue; 956 } 957 reg = extend_string((void *)&command, line, reg - (char *)command, end); 958 959 // Recopy data to remove escape sequences and handle line continuation. 960 if (strchr("aci", c)) { 961 reg -= end+1; 962 for (i = end; i; i--) { 963 if ((*reg++ = *line++)=='\\') { 964 965 // escape at end of line: resume if -e escaped literal newline, 966 // else request callback and resume with next line 967 if (!--i) { 968 *--reg = 0; 969 if (*line) { 970 line++; 971 goto resume_a; 972 } 973 command->hit = 256; 974 break; 975 } 976 if (!(reg[-1] = unescape(*line))) reg[-1] = *line; 977 line++; 978 } 979 } 980 *reg = 0; 981 } else line += end; 982 983 // Commands that take no arguments 984 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break; 985 } 986 987 error: 988 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line); 989 } 990 991 void sed_main(void) 992 { 993 struct arg_list *al; 994 char **args = toys.optargs; 995 996 // Lie to autoconf when it asks stupid questions, so configure regexes 997 // that look for "GNU sed version %f" greater than some old buggy number 998 // don't fail us for not matching their narrow expectations. 999 if (toys.optflags & FLAG_version) { 1000 xprintf("This is not GNU sed version 9.0\n"); 1001 return; 1002 } 1003 1004 // Handling our own --version means we handle our own --help too. 1005 if (toys.optflags&FLAG_help) help_exit(0); 1006 1007 // Parse pattern into commands. 1008 1009 // If no -e or -f, first argument is the pattern. 1010 if (!TT.e && !TT.f) { 1011 if (!*toys.optargs) error_exit("no pattern"); 1012 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++); 1013 } 1014 1015 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah" 1016 // so handle all -e, then all -f. (At least the behavior's consistent.) 
1017 1018 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg)); 1019 for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern); 1020 parse_pattern(0, 0); 1021 dlist_terminate(TT.pattern); 1022 if (TT.nextlen) error_exit("no }"); 1023 1024 TT.fdout = 1; 1025 TT.remember = xstrdup(""); 1026 1027 // Inflict pattern upon input files. Long version because !O_CLOEXEC 1028 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed); 1029 1030 if (!(toys.optflags & FLAG_i)) process_line(0, 0); 1031 1032 // todo: need to close fd when done for TOYBOX_FREE? 1033 } 1034