Home | History | Annotate | Download | only in awk
      1 # Copyright (C) 2010 The Android Open Source Project
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #      http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 #
     15 
     16 # Tiny XML parser implementation in awk.
     17 #
     18 # This file is not meant to be used directly, instead copy the
     19 # functions it defines here into your own script then specialize
     20 # it appropriately.
     21 #
     22 
     23 # See further below for usage instructions and implementation details.
     24 #
     25 
     26 # ---------------------------- cut here ---------------------------
     27 
     28 function xml_event () {
     29     RS=">";
     30     XML_TAG=XML_TYPE="";
     31     split("", XML_ATTR);
     32     while ( 1 ) {
     33         if (_xml_closing) { # delayed direct tag closure
     34             XML_TAG = _xml_closing;
     35             XML_TYPE = "END";
     36             _xml_closing = "";
     37             _xml_exit(XML_TAG);
     38             return 1;
     39         }
     40         if (getline <= 0) return 0; # read new input line
     41         _xml_p = index($0, "<"); # get start marker
     42         if (_xml_p == 0) return 0; # end of file (or malformed input)
     43         $0 = substr($0, _xml_p) # remove anything before '<'
     44         # ignore CData / Comments / Processing instructions / Declarations
     45         if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
     46             _xml_in_section("<!--", "--") ||
     47             _xml_in_section("<\\?", "\\?") ||
     48             _xml_in_section("<!", "")) {
     49             continue;
     50         }
     51         if (substr($0, 1, 2) == "</") { # is it a closing tag ?
     52             XML_TYPE = "END";
     53             $0 = substr($0, 3);
     54         } else { # nope, it's an opening one
     55             XML_TYPE = "BEGIN";
     56             $0 = substr($0, 2);
     57         }
     58         XML_TAG = $0
     59         sub("[ \n\t/].*$", "", XML_TAG);  # extract tag name
     60         XML_TAG = toupper(XML_TAG);       # uppercase it
     61         if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
     62             _xml_panic("Invalid tag name: " XML_TAG);
     63         if (XML_TYPE == "BEGIN") {  # update reverse path
     64             _xml_enter(XML_TAG);
     65         } else {
     66             _xml_exit(XML_TAG);
     67         }
     68         sub("[^ \n\t]*[ \n\t]*", "", $0); # get rid of tag and spaces
     69         while ($0) { # process attributes
     70             if ($0 == "/") {  # deal with direct closing tag, e.g. </foo>
     71                 _xml_closing = XML_TAG; # record delayed tag closure.
     72                 break
     73             }
     74             _xml_attrib = $0;
     75             sub(/=.*$/,"",_xml_attrib);  # extract attribute name
     76             sub(/^[^=]*/,"",$0);         # remove it from record
     77             _xml_attrib = tolower(_xml_attrib);
     78             if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
     79                 _xml_panic("Invalid attribute name: " _xml_attrib);
     80             if (substr($0,1,2) == "=\"") { # value is ="something"
     81                 _xml_value = substr($0,3);
     82                 sub(/".*$/,"",_xml_value);
     83                 sub(/^="[^"]*"/,"",$0);
     84             } else if (substr($0,1,2) == "='") { # value is ='something'
     85                 _xml_value = substr($0,3);
     86                 sub(/'.*$/,"",_xml_value);
     87                 sub(/^='[^']*'/,"",$0);
     88             } else {
     89                 _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
     90             }
     91             XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
     92             sub(/^[ \t\n]*/,"",$0); # get rid of remaining leading spaces
     93         }
     94         return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
     95     }
     96 }
     97 
     98 function _xml_panic (msg) {
     99     print msg > "/dev/stderr"
    100     exit(1)
    101 }
    102 
    103 function _xml_in_section (sec_begin, sec_end) {
    104     if (!match( $0, "^" sec_begin )) return 0;
    105     while (!match($0, sec_end "$")) {
    106         if (getline <= 0) _xml_panic("Unexpected EOF: " ERRNO);
    107     }
    108     return 1;
    109 }
    110 
    111 function _xml_enter (tag) {
    112     XML_RPATH = tag "/" XML_RPATH;
    113 }
    114 
    115 function _xml_exit (tag) {
    116     _xml_p = index(XML_RPATH, "/");
    117     _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    118     if (_xml_expected != XML_TAG)
    119         _xml_panic("Unexpected close tag: " XML_TAG ", expecting " _xml_expected);
    120     XML_RPATH = substr(XML_RPATH, _xml_p+1);
    121 }
    122 
    123 # ---------------------------- cut here ---------------------------
    124 
    125 # USAGE:
    126 #
    127 # The functions provided here are used to extract the tags and attributes of a
    128 # given XML file. They do not support extraction of data, CDATA, comments,
    129 # processing instructions and declarations at all.
    130 #
    131 # You should use this from the BEGIN {} action of your awk script (it will
    132 # not work from an END {} action).
    133 #
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1).
    138 #
# After each successful xml_event() call, the following variables will be set:
    140 #
#    XML_TYPE:  type of event: "BEGIN" means an opening tag, "END" a
    142 #               closing one.
    143 #
    144 #    XML_TAG:   name of the tag, always in UPPERCASE!
    145 #
    146 #    XML_ATTR:  a map of attributes for the type. Only set for "BEGIN" types.
    147 #               all attribute names are in lowercase.
    148 #
    149 #               beware: values are *not* unescaped !
    150 #
    151 #    XML_RPATH: the _reversed_ element path, using "/" as a separator.
    152 #               if you are within the <manifest><application> tag, then
    153 #               it will be set to "APPLICATION/MANIFEST/"
    154 #               (note the trailing slash).
    155 #
    156 
    157 # This is a simple example that dumps the output of the parsing.
    158 #
    159 BEGIN {
    160     while ( xml_event() ) {
    161         printf "XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH;
    162         if (XML_TYPE == "BEGIN") {
    163             for (attr in XML_ATTR) {
    164                 printf " %s='%s'", attr, XML_ATTR[attr];
    165             }
    166         }
    167         printf "\n";
    168     }
    169 }
    170 
    171 # IMPLEMENTATION DETAILS:
    172 #
    173 # 1. '>' as the record separator:
    174 #
    175 # RS is set to '>' to use this character as the record separator, instead of
    176 # the default '\n'. This means that something like the following:
    177 #
    178 #   <foo><bar attrib="value">stuff</bar></foo>
    179 #
    180 # will be translated into the following successive 'records':
    181 #
    182 #  <foo
    183 #  <bar attrib="value"
    184 #  stuff</bar
    185 #  </foo
    186 #
    187 # Note that the '>' is never part of the records and thus will not be matched.
    188 # If the record does not contain a single '<', the input is either
    189 # malformed XML, or we reached the end of file with data after the last
    190 # '>'.
    191 #
    192 # Newlines in the original input are kept in the records as-is.
    193 #
    194 # 2. Getting rid of unwanted stuff:
    195 #
    196 # We don't need any of the data within elements, so we get rid of them by
    197 # simply ignoring anything before the '<' in the current record. This is
    198 # done with code like this:
    199 #
    200 #     p = index($0, "<");       # get index of '<'
    201 #     if (p == 0) -> return 0;  # malformed input or end of file
#     $0 = substr($0, p);       # remove anything before the '<' in record
    203 #
    204 # We also want to ignore certain sections like CDATA, comments, declarations,
    205 # etc.. These begin with a certain pattern and end with another one, e.g.
    206 # "<!--" and "-->" for comments. This is handled by the _xml_in_section()
    207 # function that accepts two patterns as input:
    208 #
    209 #    sec_begin: is the pattern for the start of the record.
    210 #    sec_end:   is the pattern for the end of the record (minus trailing '>').
    211 #
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
    214 #
    215 #  <!-- A comment with an embedded > right here ! -->
    216 #
    217 # will be decomposed into two records:
    218 #
    219 #   "<!-- A comment with an embedded "
    220 #   " right here ! --"
    221 #
    222 # The function deals with this case, and exits when such a section is not
    223 # properly terminated in the input.
    224 #
    225 # _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
    226 #
    227 # 3. Extracting the tag name:
    228 #
    229 # </foo> is a closing tag, and <foo> an opening tag, this is handled
    230 # by the following code:
    231 #
    232 #       if (substr($0, 1, 2) == "</") {
    233 #           XML_TYPE = "END";
    234 #           $0 = substr($0, 3);
    235 #       } else {
    236 #           XML_TYPE = "BEGIN";
    237 #           $0 = substr($0, 2);
    238 #       }
    239 #
    240 # which defines XML_TYPE, and removes the leading "</" or "<" from the record.
    241 # The tag is later extracted and converted to uppercase with:
    242 #
    243 #       XML_TAG = $0                      # copy record
    244 #       sub("[ \n\t/].*$", "", XML_TAG);  # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);       # convert to uppercase
    246 #       # validate tag
    247 #       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
    248 #
    249 # Then the record is purged from the tag name and the spaces after it:
    250 #
    251 #       # get rid of tag and spaces after it in $0
    252 #       sub("[^ \n\t]*[ \n\t]*", "", $0);
    253 #
    254 # 4. Maintaining XML_RPATH:
    255 #
    256 # The _xml_enter() and _xml_exit() functions are called to maintain the
    257 # XML_RPATH variable when entering and exiting specific tags. _xml_exit()
    258 # will also validate the input, checking proper tag enclosure (or exit(1)
    259 # in case of error).
    260 #
    261 #       if (XML_TYPE == "BEGIN") {
    262 #           _xml_enter(XML_TAG);
    263 #       } else {
    264 #           _xml_exit(XML_TAG);
    265 #       }
    266 #
    267 # 5. Extracting attributes:
    268 #
    269 # A loop is implemented to parse attributes, the idea is to get the attribute
    270 # name, which is always followed by a '=' character:
    271 #
    272 #           _xml_attrib = $0;              # copy record.
    273 #           sub(/=.*$/,"",_xml_attrib);    # get rid of '=' and anything after.
    274 #           sub(/^[^=]*/,"",$0);           # remove attribute name from $0
    275 #           _xml_attrib = tolower(_xml_attrib);
    276 #           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
    277 #               _xml_panic("Invalid attribute name: " _xml_attrib);
    278 #
    279 # Now get the value, which is enclosed by either (") or (')
    280 #
    281 #          if (substr($0,1,2) == "=\"") {        # if $0 begins with ="
    282 #               _xml_value = substr($0,3);       # extract value
    283 #               sub(/".*$/,"",_xml_value);  
    284 #               sub(/^="[^"]*"/,"",$0);          # remove it from $0
    285 #           } else if (substr($0,1,2) == "='") { # if $0 begins with ='
    286 #               _xml_value = substr($0,3);       # extract value
    287 #               sub(/'.*$/,"",_xml_value);
    288 #               sub(/^='[^']*'/,"",$0);          # remove it from $0
    289 #           } else {
    290 #               -> panic (malformed input)
    291 #           }
    292 #
    293 # After that, we simply store the value into the XML_ATTR associative
    294 # array, and cleanup $0 from leading spaces:
    295 #
    296 #           XML_ATTR[_xml_attrib] = _xml_value;
    297 #           sub(/^[ \t\n]*/,"",$0);
    298 #
    299 #
    300 # 6. Handling direct tag closure:
    301 #
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
    306 #
    307 # We do this by setting the special _xml_closing variable, as in:
    308 #
    309 #          if ($0 == "/") {
    310 #               # record a delayed tag closure for the next call
    311 #               _xml_closing = XML_TAG;
    312 #               break
    313 #           }
    314 #
    315 # This variable is checked at the start of xml_event() like this:
    316 #
    317 #       # delayed tag closure - see below
    318 #       if (_xml_closing) {
    319 #           XML_TAG = _xml_closing;
    320 #           XML_TYPE = "END";
    321 #           _xml_closing = "";
    322 #           _xml_exit(XML_TAG);
    323 #           return 1;
    324 #       }
    325 #
    326 # Note the call to _xml_exit() to update XML_RPATH here.
    327 #
    328