Home | History | Annotate | Download | only in sed
      1 /*  GNU SED, a batch stream editor.
      2     Copyright (C) 1989,90,91,92,93,94,95,98,99,2002,2003
      3     Free Software Foundation, Inc.
      4 
      5     This program is free software; you can redistribute it and/or modify
      6     it under the terms of the GNU General Public License as published by
      7     the Free Software Foundation; either version 3, or (at your option)
      8     any later version.
      9 
     10     This program is distributed in the hope that it will be useful,
     11     but WITHOUT ANY WARRANTY; without even the implied warranty of
     12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13     GNU General Public License for more details.
     14 
     15     You should have received a copy of the GNU General Public License
     16     along with this program; if not, write to the Free Software
     17     Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
     18 
     19 #ifdef HAVE_CONFIG_H
     20 #include "config.h"
     21 #endif
     22 
     23 #include "basicdefs.h"
     24 #include "regex.h"
     25 
     26 #ifndef BOOTSTRAP
     27 #include <stdio.h>
     28 #include "unlocked-io.h"
     29 #endif
     30 
     31 #include "utils.h"
     32 
     33 /* Struct vector is used to describe a compiled sed program. */
     34 struct vector {
     35   struct sed_cmd *v;	/* a dynamically allocated array */
     36   size_t v_allocated;	/* ... number slots allocated */
     37   size_t v_length;	/* ... number of slots in use */
     38 };
     39 
     40 /* This structure tracks files used by sed so that they may all be
     41    closed cleanly at normal program termination.  A flag is kept that tells
     42    if a missing newline was encountered, so that it is added on the
     43    next line and the two lines are not concatenated.  */
     44 struct output {
     45   char *name;
     46   bool missing_newline;
     47   FILE *fp;
     48   struct output *link;
     49 };
     50 
     51 struct text_buf {
     52   char *text;
     53   size_t text_length;
     54 };
     55 
     56 struct regex {
     57   regex_t pattern;
     58   int flags;
     59   size_t sz;
     60   char re[1];
     61 };
     62 
     63 enum replacement_types {
     64   REPL_ASIS = 0,
     65   REPL_UPPERCASE = 1,
     66   REPL_LOWERCASE = 2,
     67   REPL_UPPERCASE_FIRST = 4,
     68   REPL_LOWERCASE_FIRST = 8,
     69   REPL_MODIFIERS = REPL_UPPERCASE_FIRST | REPL_LOWERCASE_FIRST,
     70 
     71   /* These are given to aid in debugging */
     72   REPL_UPPERCASE_UPPERCASE = REPL_UPPERCASE_FIRST | REPL_UPPERCASE,
     73   REPL_UPPERCASE_LOWERCASE = REPL_UPPERCASE_FIRST | REPL_LOWERCASE,
     74   REPL_LOWERCASE_UPPERCASE = REPL_LOWERCASE_FIRST | REPL_UPPERCASE,
     75   REPL_LOWERCASE_LOWERCASE = REPL_LOWERCASE_FIRST | REPL_LOWERCASE
     76 };
     77 
     78 enum text_types {
     79   TEXT_BUFFER,
     80   TEXT_REPLACEMENT,
     81   TEXT_REGEX
     82 };
     83 
     84 enum posixicity_types {
     85   POSIXLY_EXTENDED,	/* with GNU extensions */
     86   POSIXLY_CORRECT,	/* with POSIX-compatible GNU extensions */
     87   POSIXLY_BASIC		/* pedantically POSIX */
     88 };
     89 
     90 enum addr_state {
     91   RANGE_INACTIVE,	/* never been active */
     92   RANGE_ACTIVE,		/* between first and second address */
     93   RANGE_CLOSED		/* like RANGE_INACTIVE, but range has ended once */
     94 };
     95 
     96 enum addr_types {
     97   ADDR_IS_NULL,		/* null address */
     98   ADDR_IS_REGEX,	/* a.addr_regex is valid */
     99   ADDR_IS_NUM,		/* a.addr_number is valid */
    100   ADDR_IS_NUM_MOD,	/* a.addr_number is valid, addr_step is modulo */
    101   ADDR_IS_STEP,		/* address is +N (only valid for addr2) */
    102   ADDR_IS_STEP_MOD,	/* address is ~N (only valid for addr2) */
    103   ADDR_IS_LAST		/* address is $ */
    104 };
    105 
    106 struct addr {
    107   enum addr_types addr_type;
    108   countT addr_number;
    109   countT addr_step;
    110   struct regex *addr_regex;
    111 };
    112 
    113 
    114 struct replacement {
    115   char *prefix;
    116   size_t prefix_length;
    117   int subst_id;
    118   enum replacement_types repl_type;
    119   struct replacement *next;
    120 };
    121 
    122 struct subst {
    123   struct regex *regx;
    124   struct replacement *replacement;
    125   countT numb;		/* if >0, only substitute for match number "numb" */
    126   struct output *outf;	/* 'w' option given */
    127   unsigned global : 1;	/* 'g' option given */
    128   unsigned print : 2;	/* 'p' option given (before/after eval) */
    129   unsigned eval : 1;	/* 'e' option given */
    130   unsigned max_id : 4;  /* maximum backreference on the RHS */
    131 };
    132 
    133 #ifdef REG_PERL
    134 /* This is the structure we store register match data in.  See
    135    regex.texinfo for a full description of what registers match.  */
    136 struct re_registers
    137 {
    138   unsigned num_regs;
    139   regoff_t *start;
    140   regoff_t *end;
    141 };
    142 #endif
    143 
    144 
    145 
    146 struct sed_cmd {
    147   struct addr *a1;	/* save space: usually is NULL */
    148   struct addr *a2;
    149 
    150   /* See description the enum, above.  */
    151   enum addr_state range_state;
    152 
    153   /* Non-zero if command is to be applied to non-matches. */
    154   char addr_bang;
    155 
    156   /* The actual command character. */
    157   char cmd;
    158 
    159   /* auxiliary data for various commands */
    160   union {
    161     /* This structure is used for a, i, and c commands. */
    162     struct text_buf cmd_txt;
    163 
    164     /* This is used for the l, q and Q commands. */
    165     int int_arg;
    166 
    167     /* This is used for the {}, b, and t commands. */
    168     countT jump_index;
    169 
    170     /* This is used for the r command. */
    171     char *fname;
    172 
    173     /* This is used for the hairy s command. */
    174     struct subst *cmd_subst;
    175 
    176     /* This is used for the w command. */
    177     struct output *outf;
    178 
    179     /* This is used for the R command. */
    180     FILE *fp;
    181 
    182     /* This is used for the y command. */
    183     unsigned char *translate;
    184     char **translatemb;
    185   } x;
    186 };
    187 
    188 
    189 
    190 void bad_prog P_((const char *why));
    192 size_t normalize_text P_((char *text, size_t len, enum text_types buftype));
    193 struct vector *compile_string P_((struct vector *, char *str, size_t len));
    194 struct vector *compile_file P_((struct vector *, const char *cmdfile));
    195 void check_final_program P_((struct vector *));
    196 void rewind_read_files P_((void));
    197 void finish_program P_((struct vector *));
    198 
    199 struct regex *compile_regex P_((struct buffer *b, int flags, int needed_sub));
    200 int match_regex P_((struct regex *regex,
    201 		    char *buf, size_t buflen, size_t buf_start_offset,
    202 		    struct re_registers *regarray, int regsize));
    203 #ifdef DEBUG_LEAKS
    204 void release_regex P_((struct regex *));
    205 #endif
    206 
    207 int process_files P_((struct vector *, char **argv));
    208 
    209 int main P_((int, char **));
    210 
    211 extern void fmt P_ ((const char *line, const char *line_end, int max_length, FILE *output_file));
    212 
    213 extern int extended_regexp_flags;
    214 
    215 /* If set, fflush(stdout) on every line output. */
    216 extern bool unbuffered_output;
    217 
    218 /* If set, don't write out the line unless explicitly told to. */
    219 extern bool no_default_output;
    220 
    221 /* If set, reset line counts on every new file. */
    222 extern bool separate_files;
    223 
    224 /* If set, follow symlinks when invoked with -i option */
    225 extern bool follow_symlinks;
    226 
    227 /* Do we need to be pedantically POSIX compliant? */
    228 extern enum posixicity_types posixicity;
    229 
    230 /* How long should the `l' command's output line be? */
    231 extern countT lcmd_out_line_len;
    232 
    233 /* How do we edit files in-place? (we don't if NULL) */
    234 extern char *in_place_extension;
    235 
    236 /* The mode to use to read files, either "rt" or "rb".  */
    237 extern char *read_mode;
    238 
    239 /* Should we use EREs? */
    240 extern bool use_extended_syntax_p;
    241 
    242 /* Declarations for multibyte character sets.  */
    243 extern int mb_cur_max;
    244 extern bool is_utf8;
    245 
    246 #ifdef HAVE_MBRTOWC
    247 #ifdef HAVE_BTOWC
    248 #define MBRTOWC(pwc, s, n, ps) \
    249   (mb_cur_max == 1 ? \
    250    (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
    251    mbrtowc ((pwc), (s), (n), (ps)))
    252 
    253 #define WCRTOMB(s, wc, ps) \
    254   (mb_cur_max == 1 ? \
    255    (*(s) = wctob ((wint_t) (wc)), 1) : \
    256    wcrtomb ((s), (wc), (ps)))
    257 #else
    258 #define MBRTOWC(pwc, s, n, ps) \
    259   mbrtowc ((pwc), (s), (n), (ps))
    260 
    261 #define WCRTOMB(s, wc, ps) \
    262   wcrtomb ((s), (wc), (ps))
    263 #endif
    264 
    265 #define MBSINIT(s) \
    266   (mb_cur_max == 1 ? 1 : mbsinit ((s)))
    267 
    268 #define MBRLEN(s, n, ps) \
    269   (mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps))
    270 
    271 #define BRLEN(ch, ps) \
    272   (mb_cur_max == 1 ? 1 : brlen (ch, ps))
    273 
    274 #else
    275 #define MBSINIT(s) 1
    276 #define MBRLEN(s, n, ps) 1
    277 #define BRLEN(ch, ps) 1
    278 #endif
    279 
    280 extern int brlen P_ ((int ch, mbstate_t *ps));
    281 extern void initialize_mbcs P_ ((void));
    282 
    283