# xsxp.tcl --
#
###Abstract
# Extremely Simple XML Parser
#
# This is pretty lame, but I needed something like this for S3,
# and at the time, TclDOM would not work with the new 8.5 Tcl
# due to version number problems.
#
# In addition, this is a pure-value implementation. There is no
# garbage to clean up in the event of a thrown error, for example.
# This simplifies the code for sufficiently small XML documents,
# which is what Amazon's S3 guarantees.
#
###Copyright
# Copyright (c) 2006 Darren New.
# All Rights Reserved.
# NO WARRANTIES OF ANY TYPE ARE PROVIDED.
# COPYING OR USE INDEMNIFIES THE AUTHOR IN ALL WAYS.
# See the license terms in LICENSE.txt
#
###Revision String
# SCCS: %Z% %M% %I% %E% %U%
# xsxp::parse $xml
# Returns a parsed XML, or PXML. A pxml is a list.
# The first element is the name of the tag.
# The second element is a list of name/value pairs of the
# associated attribues, if any.
# The third thru final values are recursively PXML values.
# If the first element (element zero, that is) is "%PCDATA",
# then the attributes will be emtpy and the third element
# will be the text of the element.
# xsxp::fetch $pxml $path ?$part?
# $pxml is a parsed XML, as returned from xsxp::parse.
# $path is a list of elements. Each element is the name of
# a child to look up, optionally followed by a hash ("#")
# and a string of digits. An emtpy list or an initial empty
# element selects $pxml. If no hash sign is present, the
# behavior is as if "#0" had been appended to that element.
# An element of $path scans the children at the indicated
# level for the n'th instance of a child whose tag matches
# the part of the element before the hash sign. If an element
# is simply "#" followed by digits, that indexed child is
# selected, regardless of the tags in the children. So
# an element of #3 will always select the fourth child
# of the node under consideration.
# $part defaults to %ALL. It can be one of the following:
# %ALL - returns the entire selected element.
# %TAGNAME - returns lindex 0 of the selected element.
# %ATTRIBUTES - returns lindex 1 of the selected element.
# %CHILDREN - returns lrange 2 through end of the selected element,
# resulting in a list of elements being returned.
# %PCDATA - returns a concatenation of all the bodies of
# direct children of this node whose tag is %PCDATA.
# Throws an error if no such children are found. That
# is, part=%PCDATA means return the textual content found
# in that node but not its children nodes.
# %PCDATA? - like %PCDATA, but returns an empty string if
# no PCDATA is found.
# xsxp::fetchall $pxml_list $path ?$part?
# Iterates over each PXML in $pxml_list, selecting the indicated
# path from it, building a new list with the selected data, and
# returning that new list. For example, $pxml_list might be
# the %CHILDREN of a particular element, and the $path and $part
# might select from each child a sub-element in which we're interested.
# xsxp::only $pxml $tagname
# Iterates over the direct children of $pxml and selects only
# those with $tagname as their tag. Returns a list of matching
# elements.
# xsxp::prettyprint $pxml
# Outputs to stdout a nested-list notation of the parsed XML.
package require xml
package provide xsxp 1.0
namespace eval xsxp {
variable Stack
variable Cur
proc Characterdatacommand {characterdata} {
variable Cur
# puts "characterdatacommand $characterdata"
set x [list %PCDATA {} $characterdata]
lappend Cur $x
}
proc Elementstartcommand {name attlist args} {
# puts "elementstart $name {$attlist} $args"
variable Stack
variable Cur
lappend Stack $Cur
set Cur [list $name $attlist]
}
proc Elementendcommand {args} {
# puts "elementend $args"
variable Stack
variable Cur
set x [lindex $Stack end]
lappend x $Cur
set Cur $x
set Stack [lrange $Stack 0 end-1]
}
proc parse {xml} {
variable Cur
variable Stack
set Cur {}
set Stack {}
set parser [::xml::parser \
-characterdatacommand [namespace code Characterdatacommand] \
-elementstartcommand [namespace code Elementstartcommand] \
-elementendcommand [namespace code Elementendcommand] \
-ignorewhitespace 1 -final 1
]
$parser parse $xml
$parser free
# The following line is needed because the close of the last element
# appends the outermost element to the item on the top of the stack.
# Since there's nothing on the top of the stack at the close of the
# last element, we append the current element to an empty list.
# In essence, since we don't really have a terminating condition
# on the recursion, an empty stack is still treated like an element.
set Cur [lindex $Cur 0]
set Cur [Normalize $Cur]
return $Cur
}
proc Normalize {pxml} {
# This iterates over pxml recursively, finding entries that
# start with multiple %PCDATA elements, and coalesces their
# content, so if an element contains only %PCDATA, it is
# guaranteed to have only one child.
# Not really necessary, given definition of part=%PCDATA
# However, it makes pretty-prints nicer (for AWS at least)
# and ends up with smaller lists. I have no idea why they
# would put quotes around an MD5 hash in hex, tho.
set dupl 1
while {$dupl} {
set first [lindex $pxml 2]
set second [lindex $pxml 3]
if {[lindex $first 0] eq "%PCDATA" && [lindex $second 0] eq "%PCDATA"} {
set repl [list %PCDATA {} [lindex $first 2][lindex $second 2]]
set pxml [lreplace $pxml 2 3 $repl]
} else {
set dupl 0
for {set i 2} {$i < [llength $pxml]} {incr i} {
set pxml [lreplace $pxml $i $i [Normalize [lindex $pxml $i]]]
}
}
}
return $pxml
}
proc prettyprint {pxml {chan stdout} {indent 0}} {
puts -nonewline $chan [string repeat " " $indent]
if {[lindex $pxml 0] eq "%PCDATA"} {
puts $chan "%PCDATA: [lindex $pxml 2]"
return
}
puts -nonewline $chan "[lindex $pxml 0]"
foreach {name val} [lindex $pxml 1] {
puts -nonewline $chan " $name='$val'"
}
puts $chan ""
foreach node [lrange $pxml 2 end] {
prettyprint $node $chan [expr $indent+1]
}
}
proc fetch {pxml path {part %ALL}} {
set path [string trim $path /]
if {-1 != [string first / $path]} {
set path [split $path /]
}
foreach element $path {
if {$pxml eq ""} {return ""}
foreach {tag count} [split $element #] {
if {$tag ne ""} {
if {$count eq ""} {set count 0}
set pxml [lrange $pxml 2 end]
while {0 <= $count && 0 != [llength $pxml]} {
if {$tag eq [lindex $pxml 0 0]} {
incr count -1
if {$count < 0} {
# We're done. Go on to next element.
set pxml [lindex $pxml 0]
} else {
# Not done yet. Throw this away.
set pxml [lrange $pxml 1 end]
}
} else {
# Not what we want.
set pxml [lrange $pxml 1 end]
}
}
} else { # tag eq ""
if {$count eq ""} {
# Just select whole $pxml
} else {
set pxml [lindex $pxml [expr {2+$count}]]
}
}
break
} ; # done the foreach [split] loop
} ; # done all the elements.
if {$part eq "%ALL"} {return $pxml}
if {$part eq "%ATTRIBUTES"} {return [lindex $pxml 1]}
if {$part eq "%TAGNAME"} {return [lindex $pxml 0]}
if {$part eq "%CHILDREN"} {return [lrange $pxml 2 end]}
if {$part eq "%PCDATA" || $part eq "%PCDATA?"} {
set res "" ; set found 0
foreach elem [lrange $pxml 2 end] {
if {"%PCDATA" eq [lindex $elem 0]} {
append res [lindex $elem 2]
set found 1
}
}
if {$found || $part eq "%PCDATA?"} {
return $res
} else {
error "xsxp::fetch did not find requested PCDATA"
}
}
return $pxml ; # Don't know what he's after
}
proc only {pxml tag} {
set res {}
foreach element [lrange $pxml 2 end] {
if {[lindex $element 0] eq $tag} {
lappend res $element
}
}
return $res
}
proc fetchall {pxml_list path {part %ALL}} {
set res [list]
foreach pxml $pxml_list {
lappend res [fetch $pxml $path $part]
}
return $res
}
}
namespace export xsxp parse prettyprint fetch