Tk Library Source Code

Artifact [3690b1ad46]
Login

Artifact 3690b1ad4649414f841504920edc64ed0f31f77a:

Attachment "list.tcl" to ticket [708502ffff] added by kennykb 2003-03-24 03:33:34.
#----------------------------------------------------------------------
#
# list.tcl --
#
#	Definitions for extended processing of Tcl lists.
#
# Copyright (c) 2003 by Kevin B. Kenny.  All rights reserved.
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
# 
# RCS: @(#) $Id: $
#
#----------------------------------------------------------------------

package require Tcl 8.0

namespace eval ::struct { namespace eval list {} }

namespace eval ::struct::list { 
    namespace export longestCommonSubsequence 
    namespace export longestCommonSubsequence2
}

# Do a compatibility version of [lset] for pre-8.4 versions of Tcl.
# This version does not do multi-arg [lset]!

if { [package vcompare [package provide Tcl] 8.4] < 0 } {
    proc ::struct::list::K { x y } { set x }
    proc ::struct::list::lset { var index arg } {
	upvar 1 $var list
	set list [lreplace [K $list [set list {}]] $index $index $arg]
    }
}

# ::struct::list::longestCommonSubsequence --
#
#       Computes the longest common subsequence of two lists.
#
# Parameters:
#       sequence1, sequence2 -- Two lists to compare.
#	maxOccurs -- If provided, causes the procedure to ignore
#		     lines that appear more than $maxOccurs times
#		     in the second sequence.  See below for a discussion.
# Results:
#       Returns a list of two lists of equal length. 
#       The first sublist is of indices into sequence1, and the
#       second sublist is of indices into sequence2.  Each corresponding
#       pair of indices corresponds to equal elements in the sequences;
#       the sequence returned is the longest possible.
#
# Side effects:
#       None.
#
# Notes:
#
#	While this procedure is quite rapid for many tasks of file
# comparison, its performance degrades severely if the second list
# contains many equal elements (as, for instance, when using this
# procedure to compare two files, a quarter of whose lines are blank.
# This drawback is intrinsic to the algorithm used (see the References
# for details).  One approach to dealing with this problem that is
# sometimes effective in practice is arbitrarily to exclude elements
# that appear more than a certain number of times.  This number is
# provided as the 'maxOccurs' parameter.  If frequent lines are
# excluded in this manner, they will not appear in the common subsequence
# that is computed; the result will be the longest common subsequence
# of infrequent elements.
#
#	The procedure struct::list::longestCommonSubsequence2
# functions as a wrapper around this procedure; it computes the longest
# common subsequence of infrequent elements, and then subdivides the
# subsequences that lie between the matches to approximate the true
# longest common subsequence.
#
# References:
#	J. W. Hunt and M. D. McIlroy, "An algorithm for differential 
#	file comparison," Comp. Sci. Tech. Rep. #41, Bell Telephone 
#	Laboratories (1976). Available on the Web at the second
#	author's personal site: http://www.cs.dartmouth.edu/~doug/

proc ::struct::list::longestCommonSubsequence { sequence1
						sequence2
						{ maxOccurs 0x7fffffff } 
				  } {
    
    # Construct a set of equivalence classes of lines in file 2 
    
    set index 0
    foreach string $sequence2 {
	lappend eqv($string) $index
	incr index
    }
    
    # K holds descriptions of the common subsequences.
    # Initially, there is one common subsequence of length 0,
    # with a fence saying that it includes line -1 of both files.
    # The maximum subsequence length is 0; position 0 of
    # K holds a fence carrying the line following the end
    # of both files.
    
    lappend K [list -1 -1 {}]
    lappend K [list [llength $sequence1] [llength $sequence2] {}]
    set k 0
    
    # Walk through the first file, letting i be the index of the line and
    # string be the line itself.
    
    set i 0
    foreach string $sequence1 {
	
	# Consider each possible corresponding index j in the second file.
	
	if { [info exists eqv($string)]
	     && [llength $eqv($string)] <= $maxOccurs } {

	    # c is the candidate match most recently found, and r is the
	    # length of the corresponding subsequence.
	    
	    set r 0
	    set c [lindex $K 0]
	    
	    foreach j $eqv($string) {
		
		# Perform a binary search to find a candidate common
		# subsequence to which may be appended this match.
		
		set max $k
		set min $r
		set s [expr { $k + 1 }]
		while { $max >= $min } {
		    set mid [expr { ( $max + $min ) / 2 }]
		    set bmid [lindex [lindex $K $mid] 1]
		    if { $j == $bmid } {
			break
		    } elseif { $j < $bmid } {
			set max [expr {$mid - 1}]
		    } else {
			set s $mid
			set min [expr { $mid + 1 }]
		    }
		}
		
		# Go to the next match point if there is no suitable
		# candidate.
		
		if { $j == [lindex [lindex $K $mid] 1] || $s > $k} {
		    continue
		}
		
		# s is the sequence length of the longest sequence
		# to which this match point may be appended. Make
		# a new candidate match and store the old one in K
		# Set r to the length of the new candidate match.
		
		set newc [list $i $j [lindex $K $s]]
		if { $r >= 0 } {
		    lset K $r $c
		}
		set c $newc
		set r [expr { $s + 1 }]
		
		# If we've extended the length of the longest match,
		# we're done; move the fence.
		
		if { $s >= $k } {
		    lappend K [lindex $K end]
		    incr k
		    break
		}
		
	    }
	    
	    # Put the last candidate into the array
	    
	    lset K $r $c
	    
	}
	
	incr i
	
    }
    
    # Package the common subsequence in a convenient form
    
    set seta {}
    set setb {}
    set q [lindex $K $k]

    for { set i 0 } { $i < $k } {incr i } {
	lappend seta {}
	lappend setb {}
    }
    while { [lindex $q 0] >= 0 } {
	incr k -1
	lset seta $k [lindex $q 0]
	lset setb $k [lindex $q 1]
	set q [lindex $q 2]
    }
    
    return [list $seta $setb]
    
}

# ::struct::list::longestCommonSubsequence2 --
#
#	Derives an approximation to the longest common subsequence
#	of two lists.
#
# Parameters:
#	sequence1, sequence2 - Lists to be compared
#	maxOccurs - Parameter for imprecise matching - see below.
#
# Results:
#       Returns a list of two lists of equal length. 
#       The first sublist is of indices into sequence1, and the
#       second sublist is of indices into sequence2.  Each corresponding
#       pair of indices corresponds to equal elements in the sequences;
#       the sequence returned is an approximation to the longest possible.
#
# Side effects:
#       None.
#
# Notes:
#	This procedure acts as a wrapper around the companion procedure
#	struct::list::longestCommonSubsequence and accepts the same
#	parameters.  It first computes the longest common subsequence of
#	elements that occur no more than $maxOccurs times in the
#	second list.  Using that subsequence to align the two lists,
#	it then tries to augment the subsequence by computing the true
#	longest common subsequences of the sublists between matched pairs.

proc ::struct::list::longestCommonSubsequence2 { sequence1
						 sequence2
						 { maxOccurs 0x7fffffff} } {

    # Derive the longest common subsequence of elements that occur at
    # most $maxOccurs times

    foreach { l1 l2 } \
	[longestCommonSubsequence $sequence1 $sequence2 $maxOccurs] {
	    break
	}

    # Walk through the match points in the sequence just derived.

    set result1 {}
    set result2 {}
    set n1 0
    set n2 0
    foreach i1 $l1 i2 $l2 {
	if { $i1 != $n1 && $i2 != $n2 } {

	    # The match points indicate that there are unmatched
	    # elements lying between them in both input sequences.
	    # Extract the unmatched elements and perform precise
	    # longest-common-subsequence analysis on them.

	    set subl1 [lrange $sequence1 $n1 [expr { $i1 - 1 }]]
	    set subl2 [lrange $sequence2 $n2 [expr { $i2 - 1 }]]
	    foreach { m1 m2 } [longestCommonSubsequence $subl1 $subl2] break
	    foreach j1 $m1 j2 $m2 {
		lappend result1 [expr { $j1 + $n1 }]
		lappend result2 [expr { $j2 + $n2 }]
	    }
	}

	# Add the current match point to the result

	lappend result1 $i1
	lappend result2 $i2
	set n1 [expr { $i1 + 1 }]
	set n2 [expr { $i2 + 1 }]
    }

    # If there are unmatched elements after the last match in both files,
    # perform precise longest-common-subsequence matching on them and
    # add the result to our return.

    if { $n1 < [llength $sequence1] && $n2 < [llength $sequence2] } {
	set subl1 [lrange $sequence1 $n1 end]
	set subl2 [lrange $sequence2 $n2 end]
	foreach { m1 m2 } [longestCommonSubsequence $subl1 $subl2] break
	foreach j1 $m1 j2 $m2 {
	    lappend result1 [expr { $j1 + $n1 }]
	    lappend result2 [expr { $j2 + $n2 }]
	}
    }

    return [list $result1 $result2]

}