Attachment "htmlparse.tcl.diff" to
ticket [823346ffff]
added by
mic42
2003-10-14 17:42:55.
Index: htmlparse.tcl
===================================================================
RCS file: /cvsroot/tcllib/tcllib/modules/htmlparse/htmlparse.tcl,v
retrieving revision 1.12
diff -u -b -r1.12 htmlparse.tcl
--- htmlparse.tcl 15 Aug 2003 03:51:30 -0000 1.12
+++ htmlparse.tcl 14 Oct 2003 10:36:14 -0000
@@ -36,15 +36,15 @@
# I. Standard escapes. (ISO latin-1 esc's are in a different table)
- array set escapes {
- lt < gt > amp & quot \" copy \xa9
+ array set tmp {
+ lt < gt > quot \" copy \xa9
reg \xae ob \x7b cb \x7d nbsp \xa0
- bsl \\
+ bsl \\ amp &
} ; # " make the emacs highlighting code happy.
# II. ISO Latin-1 escape codes
- array set escapes {
+ array set tmp {
nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4
yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9
ordf \xaa laquo \xab not \xac shy \xad reg \xae
@@ -67,6 +67,15 @@
yuml \xff
}
+ set escapes [list]
+ foreach esc [array names tmp] {
+ # create both valid forms, ";"-terminated first so string map prefers it; backslashes in the value are doubled so the later subst in mapEscapes keeps them literal (&bsl;)
+ lappend escapes "&$esc;" [string map {\\ \\\\} $tmp($esc)] "&$esc" [string map {\\ \\\\} $tmp($esc)]
+ }
+ # and add the Tcl special chars
+ lappend escapes \] \\\] \[ \\\[ \$ \\\$ \\ \\\\
+ unset tmp
+
# Internal cache for the foreach variable-lists and the
# substitution strings used to split a HTML string into
# incrementally handleable scripts. This should reduce the
@@ -274,24 +283,25 @@
# Handle incomplete HTML (Recognize incomplete tag at end, buffer
# it up for the next call).
- if {[regexp -- {[^<]*(<[^>]*)$} [lindex "\{$html\}" end] -> trailer]} {
+ set end [lindex \{$html\} end]
+ if {[set idx [string last < $end]] > [string last > $end]} {
+
if {$incvar == {}} {
return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
}
# upvar $incvar incomplete -- Already done, s.a.
- set incomplete $trailer
- set html [string range $html 0 [expr {[string last "<" $html] - 1}]]
+ set incomplete [string range $end $idx end]
+ set html [string range $end 0 [expr {$idx - 1}]]
+
} else {
set incomplete ""
}
# Convert the HTML string into a script.
- set w " \t\r\n" ;# white space
- set exp <(/?)([CClass ^$w>]+)[CClass $w]*([CClass ^>]*)>
set sub "\}\n$cmd {\\2} {\\1} {\\3} \{"
- regsub -all -- $exp $html $sub html
+ regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html
# The value of queue now determines wether we process the HTML by
# ourselves (queue is empty) or if we generate a list of scripts
@@ -429,6 +439,7 @@
# their actual characters.
proc ::htmlparse::mapEscapes {html} {
+ variable escapes
# Find HTML escape characters of the form &xxx;
if { ! [string match "*&*" $html] } {
@@ -436,60 +447,13 @@
return $html
}
- regsub -all -- {([][$\\])} $html {\\\1} new
- regsub -all -- {&#([0-9][0-9]?[0-9]?);?} $new {[format %c [scan \1 %d tmp;set tmp]]} new
- regsub -all -- {&([a-zA-Z]+);?} $new {[DoMap \1]} new
+ set new [string map $escapes $html]
+ regsub -all -- {&[a-zA-Z]+;?} $new {?} new ;# any named entity still present is unknown: replace the whole name, not just its first letter
+ regsub -all -- {&#([0-9][0-9]?[0-9]?);?} $new {[format %c [scan \1 %d]]} new
return [subst $new]
-}
-
-# htmlparse::CClass --
-#
-# Internal helper command used by '::htmlparse::parse' while
-# transforming the HTML string. Makes it easier to declare
-# character classes in a ""-bounded string without traipsing
-# into quoting hell.
-#
-# Arguments:
-# x A set of characters.
-#
-# Side Effects:
-# None.
-#
-# Results:
-# Returns a regular expression for the specified character
-# class.
-proc ::htmlparse::CClass {x} {
- return "\[$x\]"
}
-# htmlparse::DoMap --
-#
-# Internal helper command. Takes a the body of a single escape
-# sequence (i.e. the string without the sourounding & and ;) and
-# returns the associated actual character. Used by
-# '::htmlparse::mapEscapes' to do the real work.
-#
-# Arguments:
-# text The body of the escape sequence to convert.
-#
-# unknown Optional. Defaults to '?'. The string to return if the
-# escape sequence is not known to the command.
-#
-# Side Effects:
-# None.
-#
-# Results:
-# None.
-
-proc ::htmlparse::DoMap {text {unknown ?}} {
- # Convert an HTML escape sequence into a character.
-
- variable escapes
- set result $unknown
- catch {set result $escapes($text)}
- return $result
-}
# htmlparse::2tree --
#