# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Tiny XML parser implementation in awk.
#
# This file is not meant to be used directly, instead copy the
# functions it defines here into your own script then specialize
# it appropriately.
#

# See further below for usage instructions and implementation details.
#

# ---------------------------- cut here ---------------------------

# Read the next XML 'event' (an opening or closing tag) from the input.
#
# On success, returns 1 with these globals set:
#   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one
#   XML_TAG    the tag name, in uppercase
#   XML_ATTR   attribute name (lowercase) -> raw (not unescaped) value
#   XML_RPATH  reversed element path, maintained by _xml_enter/_xml_exit
#
# Returns 0 when getline fails or when a record contains no '<' (end of
# input, or malformed input). Calls _xml_panic(), which exits with status 1,
# on an invalid tag name, attribute name, attribute value or tag nesting.
#
# A self-closing tag (<foo/>, <foo /> or <foo a="b"/>) yields a "BEGIN" event
# now and an "END" event on the next call, through the _xml_closing global.
function xml_event () {
    RS=">";                                 # one record per '>'-terminated chunk
    XML_TAG=XML_TYPE="";
    split("", XML_ATTR);                    # clear the attribute map
    while ( 1 ) {
        if (_xml_closing) {                 # delayed direct tag closure
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if (getline <= 0) return 0;         # read new input record
        _xml_p = index($0, "<");            # get start marker
        if (_xml_p == 0) return 0;          # end of file (or malformed input)
        $0 = substr($0, _xml_p)             # remove anything before '<'
        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }
        if (substr($0, 1, 2) == "</") {     # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else {                            # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }
        XML_TAG = $0
        sub("[ \r\n\t/].*$", "", XML_TAG);  # extract tag name
        XML_TAG = toupper(XML_TAG);         # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
            _xml_panic("Invalid tag name: " XML_TAG);
        if (XML_TYPE == "BEGIN") {          # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }
        # Get rid of the tag name and the spaces after it. The name stops at
        # '/' as well as at whitespace, so that the slash of a self-closing
        # tag written without a space (<foo/>) is kept for the loop below.
        sub("^[^ \r\n\t/]*[ \r\n\t]*", "", $0);
        # Compare against "" explicitly rather than testing truthiness, so
        # that leftover text such as "0" cannot be mistaken for "nothing left".
        while ($0 != "") {                  # process attributes
            if ($0 == "/") {  # deal with direct closing tag, e.g. <foo/>
                _xml_closing = XML_TAG;     # record delayed tag closure.
                break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);     # extract attribute name
            sub(/^[^=]*/,"",$0);            # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            if (substr($0,1,2) == "=\"") {  # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                sub(/^="[^"]*"/,"",$0);
            } else if (substr($0,1,2) == "='") { # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                sub(/^='[^']*'/,"",$0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value;  # store attribute name/value
            sub(/^[ \t\r\n]*/,"",$0); # get rid of remaining leading spaces
        }
        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}

# Print msg on standard error and terminate the script with exit status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr"
    exit(1)
}

# Skip an ignorable section (CDATA, comment, processing instruction, ...).
#
#   sec_begin: regular expression the current record must start with.
#   sec_end:   regular expression the last record of the section must end
#              with (the trailing '>' is never part of a record).
#
# Returns 0 without reading anything if $0 does not start with sec_begin.
# Otherwise reads further records until one ends with sec_end and returns 1.
# Panics if the input ends first (ERRNO is set by gawk; other awks leave it
# empty).
function _xml_in_section (sec_begin, sec_end) {
    if (!match( $0, "^" sec_begin )) return 0;
    while (!match($0, sec_end "$")) {
        if (getline <= 0) _xml_panic("Unexpected EOF: " ERRNO);
    }
    return 1;
}

# Record that we entered element 'tag' by prepending "tag/" to XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = tag "/" XML_RPATH;
}

# Record that we left element 'tag': check that it is the innermost open
# element (the first component of XML_RPATH), panic if it is not, then drop
# that component from XML_RPATH.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}

# ---------------------------- cut here ---------------------------

# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1)
#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" -> mean an opening tag, "END" a
#               closing one.
#
#    XML_TAG:   name of the tag, always in UPPERCASE!
#
#    XML_ATTR:  a map of attributes for the type. Only set for "BEGIN" types.
#               all attribute names are in lowercase.
#
#               beware: values are *not* unescaped !
#
#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
#               if you are within the <manifest><application> tag, then
#               it will be set to "APPLICATION/MANIFEST/"
#               (note the trailing slash).
#

# This is a simple example that dumps the output of the parsing.
#
# Example driver: print one line per XML event, giving its type, tag and
# reversed path, followed by the attributes of opening tags.
BEGIN {
    while (xml_event() != 0) {
        printf("XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH)
        if (XML_TYPE == "BEGIN")
            for (name in XML_ATTR)
                printf(" %s='%s'", name, XML_ATTR[name])
        printf("\n")
    }
}

# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
#   <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
#  <foo
#  <bar attrib="value"
#  stuff</bar
#  </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
#     p = index($0, "<");         # get index of '<'
#     if (p == 0) -> return 0;    # malformed input or end of file
#     $0 = substr($0, p);         # remove anything before the '<' in record
#
# Note that the '<' itself is kept, since it is needed to recognize the kind
# of record that follows.
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc. These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
#    sec_begin: is the pattern for the start of the record.
#    sec_end:   is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
#  <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
#   "<!-- A comment with an embedded "
#   " right here ! --"
#
# The function deals with this case, and exits when such a section is not
# properly terminated in the input.
#
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
#
# 3. Extracting the tag name:
#
# </foo> is a closing tag, and <foo> an opening tag, this is handled
# by the following code:
#
#       if (substr($0, 1, 2) == "</") {
#           XML_TYPE = "END";
#           $0 = substr($0, 3);
#       } else {
#           XML_TYPE = "BEGIN";
#           $0 = substr($0, 2);
#       }
#
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
# The tag is later extracted and converted to uppercase with:
#
#       XML_TAG = $0                          # copy record
#       sub("[ \r\n\t/].*$", "", XML_TAG);    # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);           # convert to uppercase
#       # validate tag
#       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
#
# Then the record is purged of the tag name and the spaces after it:
#
#       # get rid of tag and spaces after it in $0
#       sub("[^ \r\n\t]*[ \r\n\t]*", "", $0);
#
# 4. Maintaining XML_RPATH:
#
# The _xml_enter() and _xml_exit() functions are called to maintain the
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
# will also validate the input, checking proper tag enclosure (or exit(1)
# in case of error).
#
#       if (XML_TYPE == "BEGIN") {
#           _xml_enter(XML_TAG);
#       } else {
#           _xml_exit(XML_TAG);
#       }
#
# 5. Extracting attributes:
#
# A loop is implemented to parse attributes, the idea is to get the attribute
# name, which is always followed by a '=' character:
#
#       _xml_attrib = $0;                 # copy record.
#       sub(/=.*$/,"",_xml_attrib);       # get rid of '=' and anything after.
#       sub(/^[^=]*/,"",$0);              # remove attribute name from $0
#       _xml_attrib = tolower(_xml_attrib);
#       if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
#           _xml_panic("Invalid attribute name: " _xml_attrib);
#
# Now get the value, which is enclosed by either (") or (')
#
#       if (substr($0,1,2) == "=\"") {        # if $0 begins with ="
#           _xml_value = substr($0,3);        # extract value
#           sub(/".*$/,"",_xml_value);
#           sub(/^="[^"]*"/,"",$0);           # remove it from $0
#       } else if (substr($0,1,2) == "='") {  # if $0 begins with ='
#           _xml_value = substr($0,3);        # extract value
#           sub(/'.*$/,"",_xml_value);
#           sub(/^='[^']*'/,"",$0);           # remove it from $0
#       } else {
#           -> panic (malformed input)
#       }
#
# After that, we simply store the value into the XML_ATTR associative
# array, and cleanup $0 from leading spaces:
#
#       XML_ATTR[_xml_attrib] = _xml_value;
#       sub(/^[ \t\r\n]*/,"",$0);
#
#
# 6. Handling direct tag closure:
#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
#
# We do this by setting the special _xml_closing variable, as in:
#
#       if ($0 == "/") {
#           # record a delayed tag closure for the next call
#           _xml_closing = XML_TAG;
#           break
#       }
#
# This variable is checked at the start of xml_event() like this:
#
#       # delayed tag closure - see below
#       if (_xml_closing) {
#           XML_TAG = _xml_closing;
#           XML_TYPE = "END";
#           _xml_closing = "";
#           _xml_exit(XML_TAG);
#           return 1;
#       }
#
# Note the call to _xml_exit() to update XML_RPATH here.
#
