# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Tiny XML parser implementation in awk.
#
# This file is not meant to be used directly, instead copy the
# functions it defines here into your own script then specialize
# it appropriately.
#
# See further below for usage instructions and implementation details.
#

# ---------------------------- cut here ---------------------------

# Read the input up to the next XML tag and report it as an event.
#
# Returns 1 when an event was found, 0 when the end of input is reached
# (or when the remaining input contains no '<' at all).
#
# On success the following globals are set:
#   XML_TYPE   "BEGIN" for an opening tag, "END" for a closing one
#   XML_TAG    tag name, converted to uppercase
#   XML_ATTR   attribute map (lowercase names), cleared on every call
#   XML_RPATH  reversed element path, maintained by _xml_enter/_xml_exit
#
# Malformed input makes the script exit(1) through _xml_panic().
#
function xml_event () {
    RS = ">";
    XML_TAG = XML_TYPE = "";
    split("", XML_ATTR);                    # portable way to empty an array

    while ( 1 ) {
        if (_xml_closing) {                 # delayed direct tag closure
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }

        if (getline <= 0)
            return 0;                       # read new input record

        _xml_p = index($0, "<");            # get start marker
        if (_xml_p == 0)
            return 0;                       # end of file (or malformed input)

        $0 = substr($0, _xml_p)             # remove anything before '<'

        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }

        if (substr($0, 1, 2) == "</") {     # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else {                            # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }

        XML_TAG = $0
        sub("[ \n\t/].*$", "", XML_TAG);    # extract tag name
        XML_TAG = toupper(XML_TAG);         # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )   # validate it
            _xml_panic("Invalid tag name: " XML_TAG);

        if (XML_TYPE == "BEGIN") {          # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }

        # Get rid of the tag name and the spaces after it. The '/' is
        # excluded from the name on purpose: for a bare "<foo/>" the record
        # is "foo/" and the trailing '/' must survive so that the loop below
        # can record the delayed closure.
        sub("^[^ \n\t/]*[ \n\t]*", "", $0);

        # Process attributes. The test is an explicit string comparison:
        # a plain "while ($0)" would stop on a remainder that looks like
        # the number zero instead of reporting it as malformed.
        while ($0 != "") {
            if ($0 == "/") {                # deal with direct closing tag, e.g. <foo/>
                _xml_closing = XML_TAG;     # record delayed tag closure.
                break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);     # extract attribute name
            sub(/^[^=]*/,"",$0);            # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )    # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);

            if (substr($0,1,2) == "=\"") {          # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                sub(/^="[^"]*"/,"",$0);
            } else if (substr($0,1,2) == "='") {    # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                sub(/^='[^']*'/,"",$0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value;     # store attribute name/value
            sub(/^[ \t\n]*/,"",$0);         # get rid of remaining leading spaces
        }
        return 1;   # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}

# Print 'msg' on stderr and terminate the script with exit status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr"
    exit(1)
}

# Skip an ignorable section (CDATA, comment, processing instruction, ...).
#
#   sec_begin: regular expression matching the start of the record.
#   sec_end:   regular expression matching the end of the record
#              (minus the trailing '>', which is the record separator).
#
# Returns 0 if the current record does not start with sec_begin. Otherwise
# keeps reading records until one ends with sec_end and returns 1; panics
# if the input ends before that.
#
function _xml_in_section (sec_begin, sec_end) {
    if (!match( $0, "^" sec_begin ))
        return 0;
    while (!match($0, sec_end "$")) {
        if (getline <= 0)
            _xml_panic("Unexpected EOF: " ERRNO);
    }
    return 1;
}

# Push 'tag' in front of the reversed element path XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = tag "/" XML_RPATH;
}

# Pop the innermost element from XML_RPATH, after checking that it is 'tag'.
# Panics when the close tag does not match the innermost open element.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}

# ---------------------------- cut here ---------------------------

# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1)
#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" -> mean an opening tag, "END" a
#               closing one.
#
#    XML_TAG:   name of the tag, always in UPPERCASE!
#
#    XML_ATTR:  a map of attributes for the tag. Only set for "BEGIN" types.
#               all attribute names are in lowercase.
#
#               beware: values are *not* unescaped !
#
#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
#               if you are within the <manifest><application> tag, then
#               it will be set to "APPLICATION/MANIFEST/"
#               (note the trailing slash).
#

# This is a simple example that dumps the output of the parsing.
#
BEGIN {
    while ( xml_event() ) {
        printf "XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH;
        if (XML_TYPE == "BEGIN") {
            for (attr in XML_ATTR) {
                printf " %s='%s'", attr, XML_ATTR[attr];
            }
        }
        printf "\n";
    }
}

# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
#   <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
#  <foo
#  <bar attrib="value"
#  stuff</bar
#  </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
#     p = index($0, "<");       # get index of '<'
#     if (p == 0) -> return 0;  # malformed input or end of file
#     $0 = substr($0, p);       # remove anything before the '<' in record
#
# Note that the '<' itself is kept at this point: it is needed to recognize
# ignorable sections and closing tags, and is only removed in step 3.
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc. These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
#    sec_begin: is the pattern for the start of the record.
#    sec_end:   is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
#  <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
#   "<!-- A comment with an embedded "
#   " right here ! --"
#
# The function deals with this case, and exits when such a section is not
# properly terminated in the input.
#
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
#
# 3. Extracting the tag name:
#
# </foo> is a closing tag, and <foo> an opening tag, this is handled
# by the following code:
#
#       if (substr($0, 1, 2) == "</") {
#           XML_TYPE = "END";
#           $0 = substr($0, 3);
#       } else {
#           XML_TYPE = "BEGIN";
#           $0 = substr($0, 2);
#       }
#
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
# The tag is later extracted and converted to uppercase with:
#
#       XML_TAG = $0                      # copy record
#       sub("[ \n\t/].*$", "", XML_TAG);  # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);       # convert to uppercase
#       # validate tag
#       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
#
# Then the record is purged from the tag name and the spaces after it:
#
#       # get rid of tag and spaces after it in $0
#       sub("^[^ \n\t/]*[ \n\t]*", "", $0);
#
# The '/' is not considered part of the tag name here, so that the record
# for <foo/> (i.e. "foo/") is reduced to a single "/" (see step 6).
#
# 4. Maintaining XML_RPATH:
#
# The _xml_enter() and _xml_exit() functions are called to maintain the
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
# will also validate the input, checking proper tag enclosure (or exit(1)
# in case of error).
#
#       if (XML_TYPE == "BEGIN") {
#           _xml_enter(XML_TAG);
#       } else {
#           _xml_exit(XML_TAG);
#       }
#
# 5. Extracting attributes:
#
# A loop is implemented to parse attributes, the idea is to get the attribute
# name, which is always followed by a '=' character:
#
#           _xml_attrib = $0;               # copy record.
#           sub(/=.*$/,"",_xml_attrib);     # get rid of '=' and anything after.
#           sub(/^[^=]*/,"",$0);            # remove attribute name from $0
#           _xml_attrib = tolower(_xml_attrib);
#           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
#               _xml_panic("Invalid attribute name: " _xml_attrib);
#
# Now get the value, which is enclosed by either (") or (')
#
#           if (substr($0,1,2) == "=\"") {          # if $0 begins with ="
#               _xml_value = substr($0,3);          # extract value
#               sub(/".*$/,"",_xml_value);
#               sub(/^="[^"]*"/,"",$0);             # remove it from $0
#           } else if (substr($0,1,2) == "='") {    # if $0 begins with ='
#               _xml_value = substr($0,3);          # extract value
#               sub(/'.*$/,"",_xml_value);
#               sub(/^='[^']*'/,"",$0);             # remove it from $0
#           } else {
#               -> panic (malformed input)
#           }
#
# After that, we simply store the value into the XML_ATTR associative
# array, and cleanup $0 from leading spaces:
#
#           XML_ATTR[_xml_attrib] = _xml_value;
#           sub(/^[ \t\n]*/,"",$0);
#
#
# 6. Handling direct tag closure:
#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
#
# We do this by setting the special _xml_closing variable, as in:
#
#           if ($0 == "/") {
#               # record a delayed tag closure for the next call
#               _xml_closing = XML_TAG;
#               break
#           }
#
# This variable is checked at the start of xml_event() like this:
#
#           # delayed tag closure - see below
#           if (_xml_closing) {
#               XML_TAG = _xml_closing;
#               XML_TYPE = "END";
#               _xml_closing = "";
#               _xml_exit(XML_TAG);
#               return 1;
#           }
#
# Note the call to _xml_exit() to update XML_RPATH here.
#