Tk Library Source Code

Artifact [ff98adad78]
Login

Artifact ff98adad78fcd6ff3cb6228b738f69d7b119c064:

Attachment "htmlparse.tcl.diff" to ticket [823346ffff] added by mic42 2003-10-14 17:42:55.
Index: htmlparse.tcl
===================================================================
RCS file: /cvsroot/tcllib/tcllib/modules/htmlparse/htmlparse.tcl,v
retrieving revision 1.12
diff -u -b -r1.12 htmlparse.tcl
--- htmlparse.tcl	15 Aug 2003 03:51:30 -0000	1.12
+++ htmlparse.tcl	14 Oct 2003 10:36:14 -0000
@@ -36,15 +36,15 @@
 
     # I. Standard escapes. (ISO latin-1 esc's are in a different table)
 
-    array set escapes {
-	lt <   gt >   amp &   quot \"   copy \xa9
+    array set tmp {
+    lt <   gt >  quot \"   copy \xa9
 	reg \xae   ob \x7b   cb \x7d   nbsp \xa0
-	bsl \\
+    bsl \\\\ amp &#38; 
     } ; # " make the emacs highlighting code happy.
 
     # II. ISO Latin-1 escape codes
 
-    array set escapes {
+    array set tmp {
 	nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4
 	yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9
 	ordf \xaa laquo \xab not \xac shy \xad reg \xae
@@ -67,6 +67,15 @@
 	yuml \xff
     }
 
+    set escapes [list]
+    foreach esc [array names tmp] {
+        # List "&name;" ahead of "&name": string map tries keys in list order, so the longer form must come first.
+        lappend escapes "&$esc;" $tmp($esc) "&$esc" $tmp($esc)
+    }
+    # Backslash-quote the Tcl special chars ] [ $ \ so the [subst] in mapEscapes leaves them literal.
+    lappend escapes \] \\\] \[ \\\[ \$ \\\$ \\ \\\\ 
+    unset tmp
+
     # Internal cache for the foreach variable-lists and the
     # substitution strings used to split a HTML string into
     # incrementally handleable scripts. This should reduce the
@@ -274,24 +283,25 @@
     # Handle incomplete HTML (Recognize incomplete tag at end, buffer
     # it up for the next call).
 
-    if {[regexp -- {[^<]*(<[^>]*)$} [lindex "\{$html\}" end] -> trailer]} {
+	set end [lindex \{$html\} end]
+    if {[set idx [string last < $end]] > [string last > $end]} {
+
 	if {$incvar == {}} {
 	    return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
 	}
 
 	#  upvar $incvar incomplete -- Already done, s.a.
-	set incomplete $trailer
-	set html       [string range $html 0 [expr {[string last "<" $html] - 1}]]
+    set incomplete   [string range $end $idx end]
+    set html       [string range $end 0 [expr {$idx - 1}]]
+	
     } else {
 	set incomplete ""
     }
 
     # Convert the HTML string into a script.
 
-    set w " \t\r\n"	;# white space
-    set exp <(/?)([CClass ^$w>]+)[CClass $w]*([CClass ^>]*)>
     set sub "\}\n$cmd {\\2} {\\1} {\\3} \{"
-    regsub -all -- $exp $html $sub html
+    regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html
 
     # The value of queue now determines wether we process the HTML by
     # ourselves (queue is empty) or if we generate a list of  scripts
@@ -429,6 +439,7 @@
 #	their actual characters.
 
 proc ::htmlparse::mapEscapes {html} {
+    variable escapes
     # Find HTML escape characters of the form &xxx;
 
     if { ! [string match "*&*" $html] } {
@@ -436,60 +447,13 @@
 	return $html
     }
 
-    regsub -all -- {([][$\\])} $html {\\\1} new
-    regsub -all -- {&#([0-9][0-9]?[0-9]?);?} $new {[format %c [scan \1 %d tmp;set tmp]]} new
-    regsub -all -- {&([a-zA-Z]+);?} $new {[DoMap \1]} new
+    set new [string map $escapes $html]
+    regsub -all -- {&[a-zA-Z]+;?} $new {?} new ;# any named escape still left is unknown: replace the whole name with ?
+    regsub -all -- {&#([0-9][0-9]?[0-9]?);?} $new {[format %c [scan \1 %d]]} new
     return [subst $new]
-}
-
-# htmlparse::CClass --
-#
-#	Internal helper command used by '::htmlparse::parse' while
-#	transforming the HTML string. Makes it easier to declare
-#	character classes in a ""-bounded string without traipsing
-#	into quoting hell.
-#
-# Arguments:
-#	x	A set of characters.
-#
-# Side Effects:
-#	None.
-#
-# Results:
-#	Returns a regular expression for the specified character
-#	class.

-proc ::htmlparse::CClass {x} {
-    return "\[$x\]"
 }

-# htmlparse::DoMap --
-#
-#	Internal helper command. Takes a the body of a single escape
-#	sequence (i.e. the string without the sourounding & and ;) and
-#	returns the associated actual character. Used by
-#	'::htmlparse::mapEscapes' to do the real work.
-#
-# Arguments:
-#	text	The body of the escape sequence to convert.
-#
-#	unknown	Optional. Defaults to '?'. The string to return if the
-#		escape sequence is not known to the command.
-#
-# Side Effects:
-#	None.
-#
-# Results:
-#	None.
-
-proc ::htmlparse::DoMap {text {unknown ?}} {
-    # Convert an HTML escape sequence into a character.
-
-    variable escapes
-    set result $unknown
-    catch {set result $escapes($text)}
-    return $result
-}

 # htmlparse::2tree --
 #