Attachment "toregexp.dtx" to
ticket [1735601fff]
added by
lars_h
2007-08-11 22:38:04.
%
% \iffalse (driver block)
%<*driver>
\documentclass{tclldoc}
\usepackage{amsmath}
\newcommand{\Tcl}{\Tcllogo}
\CodelineIndex
\setcounter{IndexColumns}{2}
\begin{document}
\DocInput{toregexp.dtx}
\PrintIndex
\end{document}
%</driver>
% \fi
%
% \title{Analysis of finite automata}
% \author{Lars Hellstr\"om}
% \date{2007-07-23--}
% \maketitle
%
% \begin{abstract}
% By Kleene's analysis theorem for regular languages, every regular
% language (language recoginised by a finite automaton) can be
% described by a regular expression. The following implements a
% procedure for doing this, using the encodings of finite automata
% and regular expressions employed in the \textsf{grammar::fa}
% package.
% \end{abstract}
%
%
% \begin{tcl}
%<*pkg>
namespace eval grammar::fa::op {}
% \end{tcl}
% \setnamespace{grammar::fa::op}
%
%
% \section{Theory}
%
% A language (in the formal sense, i.e., as a set of finite words)
% can be regarded as an associative formal power series over the
% two-element boolean algebra: a word has coefficient $1$ if it is
% included in the language and $0$ otherwise. This point of view is
% useful because the fundamental operations in a regular expression
% corresponds directly to arithmetic operations on the formal power
% series:
% \begin{itemize}
% \item
% The union (\verb"|") of two languages is the sum (boolean sum,
% i.e., `or') of the corresponding power series, since \(0+0=0\)
% and \(1+0=0+1=1+1=1\).
% \item
% The catenation of two languages is the product of the
% corresponding power series, since a word is included in the
% catenated language if and only if it can be split into two
% words where the first belongs to the first language and the
% second belongs to the second language.
% \item
% The star of a language $L$ turns out to be given by the
% geometric series, so that
% \begin{equation}
% L^* = \sum_{n=0}^\infty L^n = 1 + L + LL + LLL + \dotsb
% \end{equation}
% \end{itemize}
%
% An automaton can in this setting be regarded as a triplet
% $(\mathbf{s}, T, \mathbf{f})$ of matrices, where $\mathbf{s}$ is
% the vector of start states (an entry is $1$ if this is a start
% state and $0$ otherwise), $\mathbf{f}$ is the vector of final
% (accepting) states, and $T$ is the matrix of state transitions,
% where $T_{i,j}$ is the language of words which cause a direct
% transition from state $i$ to state $j$. For practical automatons,
% the transition matrix elements are typically sums of degree $0$ or
% $1$, but it is formally possible to regard more complicated
% transitions as elementary. Conversely, the language recognised by
% such an automaton can be written as
% \begin{equation}
% \mathbf{i}^{\mathrm{T}} T^* \mathbf{f}
% \end{equation}
% where \(T^* = \sum_{n=0}^\infty T^n\) and matrix multiplication has
% its usual definition (although over the boolean semiring).
%
% In this form, it is possible to eliminate a state $j$ from the
% automaton by adding to any entry $T_{i,k}$ the expression
% $T_{i,j} T_{j,j}^* T_{j,k}$ corresponding to a transition from $i$
% to $k$ via $j$.
%
% It is also convenient to consider an \emph{extended transition
% matrix} which incorporates the start and final state vectors like
% so:
% \begin{equation*}
% \begin{pmatrix}
% 0 & 0 & \mathbf{s}^{\mathrm{T}} \\
% 0 & 0 & 0 \\
% 0 & \mathbf{f} & T
% \end{pmatrix}
% \end{equation*}
% This is the matrix for an equivalent automaton where there is one
% distinguished start state (the first state) with epsilon-transitions
% to the states in $\mathbf{s}$ and one distinguished final state
% (the second state) to which there are epsilon-transitions from all
% states in $\mathbf{f}$. After all original states are eliminated
% from this matrix, the language can be read off in $T_{1,2}$.
%
%
% \section{Implementation}
%
% Since the \textsf{grammar::fa} package does not seem to have any
% mechanism for quering all symbols that may cause a transition from
% one state to another, the easiest approach for generating the
% matrix form of an automaton actually seems to be to start from the
% serialised automaton format.
%
%
% \begin{proc}{ser_to_ematrix}
% This procedure converts the serialisation of a finite automaton
% to an extended transition matrix with regular expression
% elements. The call syntax is
% \begin{quote}
% |grammar::fa::op::ser_to_ematrix| \word{serialisation}
% \end{quote}
% and the return value is a matrix of side $n$, where $n-2$ is the
% number of states in the automaton.
% \begin{tcl}
proc grammar::fa::op::ser_to_ematrix {ser} {
if {[lindex $ser 0] ne "grammar::fa"} then {
error "Expected grammar::fa automaton serialisation"
}
set stateL {}
set n 2; foreach {state des} [lindex $ser 2] {
lappend stateL $state
set N($state) $n
incr n
}
set row0 {}
for {set k 0} {$k<$n} {incr k} {lappend row0 [list |]}
set res [list $row0 $row0]
foreach {from des} [lindex $ser 2] {
set row [lrange $row0 0 1]
if {[lindex $des 0]} then {lset res 0 $N($from) [list .]}
if {[lindex $des 1]} then {lset row 1 [list .]}
foreach to $stateL {set S($to) [list |]}
foreach {symbol targetL} [lindex $des 2] {
if {$symbol eq ""} then {
set atom [list .]
} else {
set atom [list S $symbol]
}
foreach to $targetL {lappend S($to) $atom}
}
foreach to $stateL {
if {[llength $S($to)] == 2} then {
lappend row [lindex $S($to) 1]
} else {
lappend row $S($to)
}
}
lappend res $row
}
return $res
}
% \end{tcl}
% \end{proc}
%
%
% \begin{proc}{matrix_drop_state}
% This procedure removes the last state from the (extended)
% transition matrix of an automaton while preserving the language
% by adding transitions between existing states. The call syntax is
% \begin{quote}
% |grammar::fa::op::matrix_drop_state| \word{matrix}
% \word{simplifier}\regopt
% \end{quote}
% and the return value is the modified matrix, whose side has
% shrunk by one. The \word{simplifier} is the namespace to use for
% simplifying regular expressions when they are built; it defaults
% to |re1|.
%
% \begin{tcl}
proc grammar::fa::op::matrix_drop_state {T_in {ns re1}} {
set sumcmd ${ns}::|
set prodcmd ${ns}::.
set T1 {}
set lastcol {}
foreach row $T_in {
lappend T1 [lreplace $row end end]
lappend lastcol [lindex $row end]
}
set lastrow [lindex $T1 end]
set T1 [lreplace $T1 end end]
set b [${ns}::* [lindex $lastcol end]]
set lastcol [lreplace $lastcol end end]
set res {}
foreach row $T1 a $lastcol {
set newrow {}
foreach pos $row c $lastrow {
lappend newrow [$sumcmd $pos [$prodcmd $a $b $c]]
}
lappend res $newrow
}
return $res
}
% \end{tcl}
% \end{proc}
%
%
% \begin{proc}{toRegexp}
% This procedure takes the serialisation of a FA as argument and
% returns a regular expression for the language it accepts.
%
% \begin{tcl}
proc grammar::fa::op::toRegexp {ser} {
set ET [ser_to_ematrix $ser]
while {[llength $ET] > 2} {
set ET [matrix_drop_state $ET]
}
return [lindex $ET 0 1]
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{toRegexp2}
% This procedure takes the serialisation of a FA as argument and
% returns a regular expression for the language it accepts. It uses
% the |re2| simplifier for regular expressions.
%
% \begin{tcl}
proc grammar::fa::op::toRegexp2 {ser} {
set ET [ser_to_ematrix $ser]
while {[llength $ET] > 2} {
set ET [matrix_drop_state $ET re2]
}
return [lindex $ET 0 1]
}
% \end{tcl}
% \end{proc}
%
%
%
%
%
% \section{Simplifying regular expressions}
%
% A crucial component in making the results reasonable is to simplify
% the regular expressions generated.
%
%
% \subsection{Trivial simplification}
%
% The |re1| namespace houses a
% data-is-code style simplification system, which only looks at the
% root operation in an expression.
% \begin{tcl}
namespace eval grammar::fa::op::re1 {
namespace export | . {\*}
}
% \end{tcl}
% \setnamespace{grammar::fa::op::re1}
%
% \begin{proc}{|}
% This procedure performs basic simplification of regular
% expression sums. The call syntax is
% \begin{quote}
% \verb"|" \word{regular expression}\regstar
% \end{quote}
% and the return value is a regular expression equivalent to the
% sum of the given regular expressions.
% \begin{tcl}
proc grammar::fa::op::re1::| {args} {
set L {}
foreach re $args {
switch -- [lindex $re 0] "|" {
foreach term [lrange $re 1 end] {lappend L $term}
} default {
lappend L $re
}
}
set L [lsort -unique $L]
if {[llength $L] == 1} then {
return [lindex $L 0]
} else {
return [linsert $L 0 |]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{.}
% This procedure performs basic simplification of regular
% expression products. The call syntax is
% \begin{quote}
% \verb"." \word{regular expression}\regstar
% \end{quote}
% and the return value is a regular expression equivalent to the
% product of the given regular expressions.
% \begin{tcl}
proc grammar::fa::op::re1::. {args} {
set L {}
foreach re $args {
switch -- [lindex $re 0] "." {
foreach term [lrange $re 1 end] {lappend L $term}
} "|" {
if {[llength $re] == 1} then {return $re}
lappend L $re
} default {
lappend L $re
}
}
if {[llength $L] == 1} then {
return [lindex $L 0]
} else {
return [linsert $L 0 .]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{*}
% This procedure performs basic simplification of regular
% expression products. The call syntax is
% \begin{quote}
% \verb"*" \word{regular expression}
% \end{quote}
% and the return value is a regular expression equivalent to the
% Kleene closure of the \word{regular expression}.
% \begin{tcl}
proc grammar::fa::op::re1::* {re} {
switch -- [lindex $re 0] "|" - "." {
if {[llength $re] == 1} then {
return [list .]
} else {
return [list * $re]
}
} "*" {
return $re
} default {
return [list * $re]
}
}
% \end{tcl}
% \end{proc}
%
%
% \subsection{Simplification with distribution}
%
% The |re2| set of simplifications is similar to the |re1| set, but
% it also distributes sums over products. The sum and star operations
% are the same as in |re1|.
% \begin{tcl}
namespace eval grammar::fa::op::re2 {
namespace import [namespace parent]::re1::|\
[namespace parent]::re1::\\*
}
% \end{tcl}
% \setnamespace{grammar::fa::op::re2}
%
% \begin{proc}{.}
% This procedure performs two simplifications of regular
% expression products: it flattens factors that are themselves
% produces and expands factors that are sums. Each sum encountered
% generates a recursive call to this procedure.
%
% The call syntax is
% \begin{quote}
% \verb"." \word{regular expression}\regstar
% \end{quote}
% and the return value is a regular expression equivalent to the
% product of the given regular expressions.
% \begin{tcl}
proc grammar::fa::op::re2::. {args} {
set L {}
set n -1; foreach re $args {incr n
switch -- [lindex $re 0] "." {
foreach term [lrange $re 1 end] {lappend L $term}
} "|" {
set res [list |]
set L2 [lreplace $args 0 $n]
foreach term [lrange $re 1 end] {
lappend res [eval [list .] $L [list $term] $L2]
}
return [eval $res]
} default {
lappend L $re
}
}
if {[llength $L] == 1} then {
return [lindex $L 0]
} else {
return [linsert $L 0 .]
}
}
% \end{tcl}
% \end{proc}
%
%
% \subsection{Removing nullary operations}
%
% At the time of writing, the |fromRegex| method of |grammar::fa|
% cannot cope with nullary sums or product. When such things are
% generated by |toRegexp|, it is usually in the form of an empty
% string branch of an alternation:
% \begin{quote}
% \verb"| . "\word{regexp}\regplus
% \end{quote}
% Such constructions can alternatively be coded using |?|, like so:
% \begin{quote}
% \verb"? {| "\word{regexp}\regplus\verb"}"
% \end{quote}
%
% \begin{tcl}
namespace eval grammar::fa::op::nonnull {}
% \end{tcl}
% \setnamespace{grammar::fa::op::nonnull}
%
% The procedures in this namespace tries to transform a regular
% expression to a form without nullary \verb"|" or |.|. For
% expressions not involving |!| or |&| (which are unimplemented),
% this fails precisely when the language is $0$ or $1$.
%
% \begin{proc}{|}
% The basic form of all procedures are: evaluate all sub-REs (so
% that these are nonnull if possible), then look at the surface
% types and simplify what you can. In the case of choice, this
% includes checking for empty branches.
% \begin{tcl}
proc grammar::fa::op::nonnull::| {args} {
set also_empty false
set res [list |]
foreach branch $args {
set RE [eval $branch]
if {[lindex $RE 0] eq "?"} then {
% \end{tcl}
% |?| branches are treated as special cases, since what is inside
% them could be another choice.
% \begin{tcl}
set also_empty true
set RE [lindex $RE 1]
}
% \end{tcl}
% Choices only need to be flattened, since the subbranches has
% already been simplified during the |eval $branch|.
% \begin{tcl}
switch -- [lindex $RE 0] "|" {
eval [lreplace $RE 0 0 lappend res]
% \end{tcl}
% Concatenations are the main case to examine, since the whole
% point of the exercise is to get rid of branches that are $1$.
% \begin{tcl}
} "." {
if {[llength $RE] == 1} then {
set also_empty true
} else {
lappend res $RE
}
} default {
lappend res $RE
}
}
% \end{tcl}
% The |res| is OK if |also_empty| is false. It is also OK if there
% is a |*| branch, since that covers the empty string case.
% Otherwise |res| must be transformed, and it is not too much work
% to optimise the cases for no branch and one branch (other than
% the empty one).
% \begin{tcl}
if {!$also_empty} then {return $res}
foreach branch [lrange $res 1 end] {
if {[lindex $branch 0] eq "*"} then {return $res}
}
if {[llength $res] == 1} then {
return [list .]
} elseif {[llength $res] == 2} then {
return [lreplace $res 0 0 ?]
} else {
return [list ? $res]
}
}
% \end{tcl}
% \end{proc}
%
%
% \begin{proc}{.}
% The corresponding simplification of concatenation expressions is
% simpler, because it only has detect nullary choices (the $0$
% language).
% \begin{tcl}
proc grammar::fa::op::nonnull::. {args} {
set res [list .]
foreach branch $args {
set RE [eval $branch]
switch -- [lindex $RE 0] "|" {
if {[llength $RE] == 1} then {return $RE}
lappend res $RE
} "." {
eval [lreplace $RE 0 0 lappend res]
} default {
lappend res $RE
}
}
return $res
}
% \end{tcl}
% \end{proc}
%
%
% \begin{proc}{*}
% Repetitions may sometimes be combined.
% \begin{tcl}
proc grammar::fa::op::nonnull::* {sub} {
set RE [eval $sub]
switch -- [lindex $RE 0] "*" - "?" - "+" {
return [lreplace $RE 0 0 *]
} default {
return [list * $RE]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{+}
% Repetitions may sometimes be combined.
% \begin{tcl}
proc grammar::fa::op::nonnull::+ {sub} {
set RE [eval $sub]
switch -- [lindex $RE 0] "+" {
return $RE
} "*" - "?" {
return [lreplace $RE 0 0 *]
} default {
return [list * $RE]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{?}
% Optional subexpressions may similarly be combined.
% \begin{tcl}
proc grammar::fa::op::nonnull::? {sub} {
set RE [eval $sub]
switch -- [lindex $RE 0] "?" - "*" {
return $RE
} "+" {
return [lreplace $RE 0 0 *]
} default {
return [list ? $RE]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{S}
% Nothing happens to symbols.
% \begin{tcl}
proc grammar::fa::op::nonnull::S {name} {return [list S $name]}
% \end{tcl}
% \end{proc}
%
%
% \subsection{Simplifying regular expressions}
%
% \setnamespace{grammar::fa::op}
%
% \begin{proc}{simplifyRegexp}
% This procedure tries to simplify a regular expression by
% converting it to an automaton, minimizing that, and then covert
% it back. If the result is shorter than the original regexp then
% that is taken, otherwise the original is retained. The procedure
% is then invoked recursively on all subexpressions.
%
% \begin{tcl}
proc grammar::fa::op::simplifyRegexp {RE0} {
set RE1 [namespace inscope nonnull $RE0]
if {[lindex $RE1 0] eq "S" || $RE1 eq "." || $RE1 eq "|"} then {
return $RE1
}
set tmp [grammar::fa %AUTO% fromRegex $RE1]
$tmp minimize
set RE1 [toRegexp [$tmp serialize]]
$tmp destroy
if {[string length $RE1] < [string length $RE0]} then {
set RE0 $RE1
}
if {[lindex $RE0 0] eq "S"} then {return $RE0}
set res [lrange $RE0 0 0]
foreach branch [lrange $RE0 1 end] {
lappend res [simplifyRegexp $branch]
}
return $res
}
% \end{tcl}
% \end{proc}
%
% \subsection{Conversion to infix form}
%
% A different kind of simplification would be to convert a
% \textsf{grammar::fa} regular expression to the ordinary infix form,
% as used by e.g.~the \Tcl\ |regexp| command. Not all constructions
% have an easy translation (language intersection is one example, as
% it is not the same as a lookahead constraint), but the only
% essential construction that lacks such an expression is the $0$
% language.
%
% \begin{tcl}
namespace eval grammar::fa::op::tclre {}
% \end{tcl}
% \setnamespace{grammar::fa::op::tclre}
%
% The commands in the |tclre| namespace have the syntax
% \begin{quote}
% \meta{regexp} \word{symbol-dict}
% \end{quote}
% and they return a list of the form
% \begin{quote}
% \word{syntactic type} \word{\Tcl-regexp}
% \end{quote}
% where \word{syntactic type} is one of
% \begin{description}
% \item[sum]
% Regexp can be used as operand of \verb"|", but not of any other
% operation.
% \item[prod]
% Regexp can be used as operatnd of \verb"|" or concatenation,
% but not of any other operation.
% \item[atom]
% Regexp is an atom.
% \item[char]
% Regexp is a char. This has been distinguished to simplify
% generating bracket expressions.
% \item[class]
% Regexp has the form |[[:|\meta{name}|:]]|, i.e., a character
% class bracket expression. It can be placed in a bracket
% expression if the first and last char is removed.
% \end{description}
% The \word{symbol-dict} is a dictionary mapping symbol names to
% `\word{syntactic type} \word{\Tcl-regexp}' pairs. If a symbol
% occurring in the regexp is not listed in this dictionary then
% single-character symbols are considered to designate themselves
% whereas multiple-character symbols are considered to be a character
% class name.
%
% \begin{proc}{S}
% This procedure converts symbol nodes to regular expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::S {name dict} {
array set A $dict
if {[info exists A($name)]} then {
return $A($name)
} elseif {[string length $name] == 1} then {
if {[regexp {[\\\[\]{}.()*+?^$]} $name]} then {
return [list char \\$name]
} else {
return [list char $name]
}
} else {
return [list class "\[\[:${name}:\]\]"]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{.}
% This procedure converts product nodes to regular expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::. {args} {
set suffix [lrange $args end end]
set L {}
foreach factor [lrange $args 0 end-1] {
set pair [eval $factor $suffix]
switch -- [lindex $pair 0] "sum" {
lappend L ([lindex $pair 1])
} default {
lappend L [lindex $pair 1]
}
}
return [list prod [join $L ""]]
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{*}
% This procedure converts Kleene star nodes to regular expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::* {re dict} {
set pair [eval $re [list $dict]]
switch -- [lindex $pair 0] "sum" - "prod" {
return [list prod "([lindex $pair 1])*"]
} default {
return [list prod "[lindex $pair 1]*"]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{+}
% This procedure converts one-or-more nodes to regular expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::+ {re dict} {
set pair [eval $re [list $dict]]
switch -- [lindex $pair 0] "sum" - "prod" {
return [list prod "([lindex $pair 1])+"]
} default {
return [list prod "[lindex $pair 1]+"]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{?}
% This procedure converts zero-or-one nodes to regular expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::? {re dict} {
set pair [eval $re [list $dict]]
switch -- [lindex $pair 0] "sum" - "prod" {
return [list prod "([lindex $pair 1])?"]
} default {
return [list prod "[lindex $pair 1]?"]
}
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{|}
% This procedure converts branch nodes to regular expressions. It
% attempts to collect characters and character classes in bracket
% expressions.
% \begin{tcl}
proc grammar::fa::op::tclre::| {args} {
set suffix [lrange $args end end]
set charL {}
set classL {}
set prodL {}
foreach factor [lrange $args 0 end-1] {
set pair [eval $factor $suffix]
switch -- [lindex $pair 0] "char" {
lappend charL [lindex $pair 1]
} "class" {
lappend classL [string range [lindex $pair 1] 1 end-1]
} default {
lappend prodL [lindex $pair 1]
}
}
if {[llength $charL]>1 || [llength $classL]>0} then {
while {[set n [lsearch $charL -]] >= 0} {
lset charL $n {\-}
}
set bracket "\[[join $charL ""][join $classL ""]\]"
if {![llength $prodL]} then {
return [list atom $bracket]
}
lappend prodL $bracket
} else {
eval [list lappend prodL] $charL
}
return [list sum [join $prodL |]]
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{&}
% This procedure throws an error.
% \begin{tcl}
proc grammar::fa::op::tclre::& {args} {
error "Cannot express language intersection in Tcl-RE's"
}
% \end{tcl}
% \end{proc}
%
% \begin{proc}{!}
% This procedure throws an error.
% \begin{tcl}
proc grammar::fa::op::tclre::! {args} {
error "Cannot express language complementation in Tcl-RE's"
}
% \end{tcl}
% \end{proc}
%
%
\endinput