Attachment "list.tcl" to
ticket [708502ffff]
added by
kennykb
2003-03-24 03:33:34.
#----------------------------------------------------------------------
#
# list.tcl --
#
# Definitions for extended processing of Tcl lists.
#
# Copyright (c) 2003 by Kevin B. Kenny. All rights reserved.
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# RCS: @(#) $Id: $
#
#----------------------------------------------------------------------
package require Tcl 8.0
namespace eval ::struct { namespace eval list {} }
namespace eval ::struct::list {
namespace export longestCommonSubsequence
namespace export longestCommonSubsequence2
}
# Do a compatibility version of [lset] for pre-8.4 versions of Tcl.
# This version does not do multi-arg [lset]!
if { [package vcompare [package provide Tcl] 8.4] < 0 } {
proc ::struct::list::K { x y } { set x }
proc ::struct::list::lset { var index arg } {
upvar 1 $var list
set list [lreplace [K $list [set list {}]] $index $index $arg]
}
}
# ::struct::list::longestCommonSubsequence --
#
# Computes the longest common subsequence of two lists.
#
# Parameters:
# sequence1, sequence2 -- Two lists to compare.
# maxOccurs -- If provided, causes the procedure to ignore
# lines that appear more than $maxOccurs times
# in the second sequence. See below for a discussion.
# Results:
# Returns a list of two lists of equal length.
# The first sublist is of indices into sequence1, and the
# second sublist is of indices into sequence2. Each corresponding
# pair of indices corresponds to equal elements in the sequences;
# the sequence returned is the longest possible.
#
# Side effects:
# None.
#
# Notes:
#
# While this procedure is quite rapid for many tasks of file
# comparison, its performance degrades severely if the second list
# contains many equal elements (as, for instance, when using this
# procedure to compare two files, a quarter of whose lines are blank.
# This drawback is intrinsic to the algorithm used (see the References
# for details). One approach to dealing with this problem that is
# sometimes effective in practice is arbitrarily to exclude elements
# that appear more than a certain number of times. This number is
# provided as the 'maxOccurs' parameter. If frequent lines are
# excluded in this manner, they will not appear in the common subsequence
# that is computed; the result will be the longest common subsequence
# of infrequent elements.
#
# The procedure struct::list::longestCommonSubsequence2
# functions as a wrapper around this procedure; it computes the longest
# common subsequence of infrequent elements, and then subdivides the
# subsequences that lie between the matches to approximate the true
# longest common subsequence.
#
# References:
# J. W. Hunt and M. D. McIlroy, "An algorithm for differential
# file comparison," Comp. Sci. Tech. Rep. #41, Bell Telephone
# Laboratories (1976). Available on the Web at the second
# author's personal site: http://www.cs.dartmouth.edu/~doug/
proc ::struct::list::longestCommonSubsequence { sequence1
sequence2
{ maxOccurs 0x7fffffff }
} {
# Construct a set of equivalence classes of lines in file 2
set index 0
foreach string $sequence2 {
lappend eqv($string) $index
incr index
}
# K holds descriptions of the common subsequences.
# Initially, there is one common subsequence of length 0,
# with a fence saying that it includes line -1 of both files.
# The maximum subsequence length is 0; position 0 of
# K holds a fence carrying the line following the end
# of both files.
lappend K [list -1 -1 {}]
lappend K [list [llength $sequence1] [llength $sequence2] {}]
set k 0
# Walk through the first file, letting i be the index of the line and
# string be the line itself.
set i 0
foreach string $sequence1 {
# Consider each possible corresponding index j in the second file.
if { [info exists eqv($string)]
&& [llength $eqv($string)] <= $maxOccurs } {
# c is the candidate match most recently found, and r is the
# length of the corresponding subsequence.
set r 0
set c [lindex $K 0]
foreach j $eqv($string) {
# Perform a binary search to find a candidate common
# subsequence to which may be appended this match.
set max $k
set min $r
set s [expr { $k + 1 }]
while { $max >= $min } {
set mid [expr { ( $max + $min ) / 2 }]
set bmid [lindex [lindex $K $mid] 1]
if { $j == $bmid } {
break
} elseif { $j < $bmid } {
set max [expr {$mid - 1}]
} else {
set s $mid
set min [expr { $mid + 1 }]
}
}
# Go to the next match point if there is no suitable
# candidate.
if { $j == [lindex [lindex $K $mid] 1] || $s > $k} {
continue
}
# s is the sequence length of the longest sequence
# to which this match point may be appended. Make
# a new candidate match and store the old one in K
# Set r to the length of the new candidate match.
set newc [list $i $j [lindex $K $s]]
if { $r >= 0 } {
lset K $r $c
}
set c $newc
set r [expr { $s + 1 }]
# If we've extended the length of the longest match,
# we're done; move the fence.
if { $s >= $k } {
lappend K [lindex $K end]
incr k
break
}
}
# Put the last candidate into the array
lset K $r $c
}
incr i
}
# Package the common subsequence in a convenient form
set seta {}
set setb {}
set q [lindex $K $k]
for { set i 0 } { $i < $k } {incr i } {
lappend seta {}
lappend setb {}
}
while { [lindex $q 0] >= 0 } {
incr k -1
lset seta $k [lindex $q 0]
lset setb $k [lindex $q 1]
set q [lindex $q 2]
}
return [list $seta $setb]
}
# ::struct::list::longestCommonSubsequence2 --
#
# Derives an approximation to the longest common subsequence
# of two lists.
#
# Parameters:
# sequence1, sequence2 - Lists to be compared
# maxOccurs - Parameter for imprecise matching - see below.
#
# Results:
# Returns a list of two lists of equal length.
# The first sublist is of indices into sequence1, and the
# second sublist is of indices into sequence2. Each corresponding
# pair of indices corresponds to equal elements in the sequences;
# the sequence returned is an approximation to the longest possible.
#
# Side effects:
# None.
#
# Notes:
# This procedure acts as a wrapper around the companion procedure
# struct::list::longestCommonSubsequence and accepts the same
# parameters. It first computes the longest common subsequence of
# elements that occur no more than $maxOccurs times in the
# second list. Using that subsequence to align the two lists,
# it then tries to augment the subsequence by computing the true
# longest common subsequences of the sublists between matched pairs.
proc ::struct::list::longestCommonSubsequence2 { sequence1
sequence2
{ maxOccurs 0x7fffffff} } {
# Derive the longest common subsequence of elements that occur at
# most $maxOccurs times
foreach { l1 l2 } \
[longestCommonSubsequence $sequence1 $sequence2 $maxOccurs] {
break
}
# Walk through the match points in the sequence just derived.
set result1 {}
set result2 {}
set n1 0
set n2 0
foreach i1 $l1 i2 $l2 {
if { $i1 != $n1 && $i2 != $n2 } {
# The match points indicate that there are unmatched
# elements lying between them in both input sequences.
# Extract the unmatched elements and perform precise
# longest-common-subsequence analysis on them.
set subl1 [lrange $sequence1 $n1 [expr { $i1 - 1 }]]
set subl2 [lrange $sequence2 $n2 [expr { $i2 - 1 }]]
foreach { m1 m2 } [longestCommonSubsequence $subl1 $subl2] break
foreach j1 $m1 j2 $m2 {
lappend result1 [expr { $j1 + $n1 }]
lappend result2 [expr { $j2 + $n2 }]
}
}
# Add the current match point to the result
lappend result1 $i1
lappend result2 $i2
set n1 [expr { $i1 + 1 }]
set n2 [expr { $i2 + 1 }]
}
# If there are unmatched elements after the last match in both files,
# perform precise longest-common-subsequence matching on them and
# add the result to our return.
if { $n1 < [llength $sequence1] && $n2 < [llength $sequence2] } {
set subl1 [lrange $sequence1 $n1 end]
set subl2 [lrange $sequence2 $n2 end]
foreach { m1 m2 } [longestCommonSubsequence $subl1 $subl2] break
foreach j1 $m1 j2 $m2 {
lappend result1 [expr { $j1 + $n1 }]
lappend result2 [expr { $j2 + $n2 }]
}
}
return [list $result1 $result2]
}