Tk Library Source Code

Artifact [206651a7d1]
Login

Artifact 206651a7d1a47c7a5dcbd789d631842fe5c66044:

Attachment "toregexp.dtx" to ticket [1735601fff] added by lars_h 2007-08-11 22:38:04.
% 
% \iffalse (driver block)
%<*driver>
\documentclass{tclldoc}
\usepackage{amsmath}
\newcommand{\Tcl}{\Tcllogo}
\CodelineIndex
\setcounter{IndexColumns}{2}
\begin{document}
\DocInput{toregexp.dtx}
\PrintIndex
\end{document}
%</driver>
% \fi
% 
% \title{Analysis of finite automata}
% \author{Lars Hellstr\"om}
% \date{2007-07-23--}
% \maketitle
% 
% \begin{abstract}
%   By Kleene's analysis theorem for regular languages, every regular 
%   language (language recoginised by a finite automaton) can be 
%   described by a regular expression. The following implements a 
%   procedure for doing this, using the encodings of finite automata 
%   and regular expressions employed in the \textsf{grammar::fa} 
%   package.
% \end{abstract}
% 
% 
% \begin{tcl}
%<*pkg>
namespace eval grammar::fa::op {}
% \end{tcl}
% \setnamespace{grammar::fa::op}
% 
% 
% \section{Theory}
% 
% A language (in the formal sense, i.e., as a set of finite words) 
% can be regarded as an associative formal power series over the 
% two-element boolean algebra: a word has coefficient $1$ if it is 
% included in the language and $0$ otherwise. This point of view is 
% useful because the fundamental operations in a regular expression 
% corresponds directly to arithmetic operations on the formal power 
% series:
% \begin{itemize}
%   \item
%     The union (\verb"|") of two languages is the sum (boolean sum, 
%     i.e., `or') of the corresponding power series, since \(0+0=0\) 
%     and \(1+0=0+1=1+1=1\).
%   \item
%     The catenation of two languages is the product of the 
%     corresponding power series, since a word is included in the 
%     catenated language if and only if it can be split into two 
%     words where the first belongs to the first language and the 
%     second belongs to the second language.
%   \item
%     The star of a language $L$ turns out to be given by the 
%     geometric series, so that
%     \begin{equation}
%       L^* = \sum_{n=0}^\infty L^n = 1 + L + LL + LLL + \dotsb
%     \end{equation}
% \end{itemize}
% 
% An automaton can in this setting be regarded as a triplet 
% $(\mathbf{s}, T, \mathbf{f})$ of matrices, where $\mathbf{s}$ is 
% the vector of start states (an entry is $1$ if this is a start 
% state and $0$ otherwise), $\mathbf{f}$ is the vector of final 
% (accepting) states, and $T$ is the matrix of state transitions, 
% where $T_{i,j}$ is the language of words which cause a direct 
% transition from state $i$ to state $j$. For practical automatons, 
% the transition matrix elements are typically sums of degree $0$ or 
% $1$, but it is formally possible to regard more complicated 
% transitions as elementary. Conversely, the language recognised by 
% such an automaton can be written as
% \begin{equation}
%   \mathbf{i}^{\mathrm{T}} T^* \mathbf{f}
% \end{equation}
% where \(T^* = \sum_{n=0}^\infty T^n\) and matrix multiplication has 
% its usual definition (although over the boolean semiring).
% 
% In this form, it is possible to eliminate a state $j$ from the 
% automaton by adding to any entry $T_{i,k}$ the expression 
% $T_{i,j} T_{j,j}^* T_{j,k}$ corresponding to a transition from $i$ 
% to $k$ via $j$.
% 
% It is also convenient to consider an \emph{extended transition 
% matrix} which incorporates the start and final state vectors like 
% so:
% \begin{equation*}
%   \begin{pmatrix}
%     0 & 0 & \mathbf{s}^{\mathrm{T}} \\
%     0 & 0 & 0 \\
%     0 & \mathbf{f} & T
%   \end{pmatrix}
% \end{equation*}
% This is the matrix for an equivalent automaton where there is one 
% distinguished start state (the first state) with epsilon-transitions 
% to the states in $\mathbf{s}$ and one distinguished final state 
% (the second state) to which there are epsilon-transitions from all 
% states in $\mathbf{f}$. After all original states are eliminated 
% from this matrix, the language can be read off in $T_{1,2}$.
% 
% 
% \section{Implementation}
% 
% Since the \textsf{grammar::fa} package does not seem to have any 
% mechanism for quering all symbols that may cause a transition from 
% one state to another, the easiest approach for generating the 
% matrix form of an automaton actually seems to be to start from the 
% serialised automaton format.
% 
% 
% \begin{proc}{ser_to_ematrix}
%   This procedure converts the serialisation of a finite automaton 
%   to an extended transition matrix with regular expression 
%   elements. The call syntax is
%   \begin{quote}
%     |grammar::fa::op::ser_to_ematrix| \word{serialisation}
%   \end{quote}
%   and the return value is a matrix of side $n$, where $n-2$ is the 
%   number of states in the automaton.
%   \begin{tcl}
proc grammar::fa::op::ser_to_ematrix {ser} {
   if {[lindex $ser 0] ne "grammar::fa"} then {
      error "Expected grammar::fa automaton serialisation"
   }
   set stateL {}
   set n 2; foreach {state des} [lindex $ser 2] {
      lappend stateL $state
      set N($state) $n
      incr n
   }
   set row0 {}
   for {set k 0} {$k<$n} {incr k} {lappend row0 [list |]}
   set res [list $row0 $row0]
   foreach {from des} [lindex $ser 2] {
      set row [lrange $row0 0 1]
      if {[lindex $des 0]} then {lset res 0 $N($from) [list .]}
      if {[lindex $des 1]} then {lset row 1 [list .]}
      foreach to $stateL {set S($to) [list |]}
      foreach {symbol targetL} [lindex $des 2] {
         if {$symbol eq ""} then {
            set atom [list .]
         } else {
            set atom [list S $symbol]
         }
         foreach to $targetL {lappend S($to) $atom}
      }
      foreach to $stateL {
         if {[llength $S($to)] == 2} then {
            lappend row [lindex $S($to) 1]
         } else {
            lappend row $S($to)
         }
      }
      lappend res $row
   }
   return $res
}
%   \end{tcl}
% \end{proc}
% 
% 
% \begin{proc}{matrix_drop_state}
%   This procedure removes the last state from the (extended) 
%   transition matrix of an automaton while preserving the language 
%   by adding transitions between existing states. The call syntax is
%   \begin{quote}
%     |grammar::fa::op::matrix_drop_state| \word{matrix}
%     \word{simplifier}\regopt
%   \end{quote}
%   and the return value is the modified matrix, whose side has 
%   shrunk by one. The \word{simplifier} is the namespace to use for 
%   simplifying regular expressions when they are built; it defaults 
%   to |re1|.
%   
%   \begin{tcl}
proc grammar::fa::op::matrix_drop_state {T_in {ns re1}} {
   set sumcmd ${ns}::|
   set prodcmd ${ns}::.
   set T1 {}
   set lastcol {}
   foreach row $T_in {
      lappend T1 [lreplace $row end end]
      lappend lastcol [lindex $row end]
   }
   set lastrow [lindex $T1 end]
   set T1 [lreplace $T1 end end]
   set b [${ns}::* [lindex $lastcol end]]
   set lastcol [lreplace $lastcol end end]
   set res {}
   foreach row $T1 a $lastcol {
      set newrow {}
      foreach pos $row c $lastrow {
         lappend newrow [$sumcmd $pos [$prodcmd $a $b $c]]
      }
      lappend res $newrow
   }
   return $res
}
%   \end{tcl}
% \end{proc}
% 
% 
% \begin{proc}{toRegexp}
%   This procedure takes the serialisation of a FA as argument and 
%   returns a regular expression for the language it accepts.
%   
%   \begin{tcl}
proc grammar::fa::op::toRegexp {ser} {
   set ET [ser_to_ematrix $ser]
   while {[llength $ET] > 2} {
      set ET [matrix_drop_state $ET]
   }
   return [lindex $ET 0 1]
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{toRegexp2}
%   This procedure takes the serialisation of a FA as argument and 
%   returns a regular expression for the language it accepts. It uses 
%   the |re2| simplifier for regular expressions.
%   
%   \begin{tcl}
proc grammar::fa::op::toRegexp2 {ser} {
   set ET [ser_to_ematrix $ser]
   while {[llength $ET] > 2} {
      set ET [matrix_drop_state $ET re2]
   }
   return [lindex $ET 0 1]
}
%   \end{tcl}
% \end{proc}
% 
% 
% 
% 
% 
% \section{Simplifying regular expressions}
% 
% A crucial component in making the results reasonable is to simplify 
% the regular expressions generated. 
% 
% 
% \subsection{Trivial simplification}
% 
% The |re1| namespace houses a 
% data-is-code style simplification system, which only looks at the 
% root operation in an expression.
% \begin{tcl}
namespace eval grammar::fa::op::re1 {
   namespace export | . {\*}
}
% \end{tcl}
% \setnamespace{grammar::fa::op::re1}
% 
% \begin{proc}{|}
%   This procedure performs basic simplification of regular 
%   expression sums. The call syntax is
%   \begin{quote}
%     \verb"|" \word{regular expression}\regstar
%   \end{quote}
%   and the return value is a regular expression equivalent to the 
%   sum of the given regular expressions.
%   \begin{tcl}
proc grammar::fa::op::re1::| {args} {
   set L {}
   foreach re $args {
      switch -- [lindex $re 0] "|" {
         foreach term [lrange $re 1 end] {lappend L $term}
      } default {
         lappend L $re
      }
   }
   set L [lsort -unique $L]
   if {[llength $L] == 1} then {
      return [lindex $L 0]
   } else {
      return [linsert $L 0 |]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{.}
%   This procedure performs basic simplification of regular 
%   expression products. The call syntax is
%   \begin{quote}
%     \verb"." \word{regular expression}\regstar
%   \end{quote}
%   and the return value is a regular expression equivalent to the 
%   product of the given regular expressions.
%   \begin{tcl}
proc grammar::fa::op::re1::. {args} {
   set L {}
   foreach re $args {
      switch -- [lindex $re 0] "." {
         foreach term [lrange $re 1 end] {lappend L $term}
      } "|" {
         if {[llength $re] == 1} then {return $re}
         lappend L $re
      } default {
         lappend L $re
      }
   }
   if {[llength $L] == 1} then {
      return [lindex $L 0]
   } else {
      return [linsert $L 0 .]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{*}
%   This procedure performs basic simplification of regular 
%   expression products. The call syntax is
%   \begin{quote}
%     \verb"*" \word{regular expression}
%   \end{quote}
%   and the return value is a regular expression equivalent to the 
%   Kleene closure of the \word{regular expression}.
%   \begin{tcl}
proc grammar::fa::op::re1::* {re} {
   switch -- [lindex $re 0] "|" - "." {
      if {[llength $re] == 1} then {
         return [list .]
      } else {
         return [list * $re]
      }
   } "*" {
      return $re
   } default {
      return [list * $re]
   }
}
%   \end{tcl}
% \end{proc}
% 
% 
% \subsection{Simplification with distribution}
% 
% The |re2| set of simplifications is similar to the |re1| set, but 
% it also distributes sums over products. The sum and star operations 
% are the same as in |re1|.
% \begin{tcl}
namespace eval grammar::fa::op::re2 {
   namespace import [namespace parent]::re1::|\
     [namespace parent]::re1::\\*
}
% \end{tcl}
% \setnamespace{grammar::fa::op::re2}
% 
% \begin{proc}{.}
%   This procedure performs two simplifications of regular 
%   expression products: it flattens factors that are themselves 
%   produces and expands factors that are sums. Each sum encountered 
%   generates a recursive call to this procedure.
%   
%   The call syntax is
%   \begin{quote}
%     \verb"." \word{regular expression}\regstar
%   \end{quote}
%   and the return value is a regular expression equivalent to the 
%   product of the given regular expressions.
%   \begin{tcl}
proc grammar::fa::op::re2::. {args} {
   set L {}
   set n -1; foreach re $args {incr n
      switch -- [lindex $re 0] "." {
         foreach term [lrange $re 1 end] {lappend L $term}
      } "|" {
         set res [list |]
         set L2 [lreplace $args 0 $n]
         foreach term [lrange $re 1 end] {
            lappend res [eval [list .] $L [list $term] $L2]
         }
         return [eval $res]
      } default {
         lappend L $re
      }
   }
   if {[llength $L] == 1} then {
      return [lindex $L 0]
   } else {
      return [linsert $L 0 .]
   }
}
%   \end{tcl}
% \end{proc}
% 
% 
% \subsection{Removing nullary operations}
% 
% At the time of writing, the |fromRegex| method of |grammar::fa| 
% cannot cope with nullary sums or product. When such things are 
% generated by |toRegexp|, it is usually in the form of an empty 
% string branch of an alternation:
% \begin{quote}
%   \verb"| . "\word{regexp}\regplus
% \end{quote}
% Such constructions can alternatively be coded using |?|, like so:
% \begin{quote}
%   \verb"? {| "\word{regexp}\regplus\verb"}"
% \end{quote}
% 
% \begin{tcl}
namespace eval grammar::fa::op::nonnull {}
% \end{tcl}
% \setnamespace{grammar::fa::op::nonnull}
% 
% The procedures in this namespace tries to transform a regular 
% expression to a form without nullary \verb"|" or |.|. For 
% expressions not involving |!| or |&| (which are unimplemented), 
% this fails precisely when the language is $0$ or $1$.
% 
% \begin{proc}{|}
%   The basic form of all procedures are: evaluate all sub-REs (so 
%   that these are nonnull if possible), then look at the surface 
%   types and simplify what you can. In the case of choice, this 
%   includes checking for empty branches.
%   \begin{tcl}
proc grammar::fa::op::nonnull::| {args} {
   set also_empty false
   set res [list |]
   foreach branch $args {
      set RE [eval $branch]
      if {[lindex $RE 0] eq "?"} then {
%   \end{tcl}
%   |?| branches are treated as special cases, since what is inside 
%   them could be another choice.
%   \begin{tcl}
         set also_empty true
         set RE [lindex $RE 1]
      }
%   \end{tcl}
%   Choices only need to be flattened, since the subbranches has 
%   already been simplified during the |eval $branch|.
%   \begin{tcl}
      switch -- [lindex $RE 0] "|" {
         eval [lreplace $RE 0 0 lappend res]
%   \end{tcl}
%   Concatenations are the main case to examine, since the whole 
%   point of the exercise is to get rid of branches that are $1$.
%   \begin{tcl}
      } "." {
         if {[llength $RE] == 1} then {
            set also_empty true
         } else {
            lappend res $RE
         }
      } default {
         lappend res $RE
      }
   }
%   \end{tcl}
%   The |res| is OK if |also_empty| is false. It is also OK if there 
%   is a |*| branch, since that covers the empty string case. 
%   Otherwise |res| must be transformed, and it is not too much work 
%   to optimise the cases for no branch and one branch (other than 
%   the empty one).
%   \begin{tcl}
   if {!$also_empty} then {return $res}
   foreach branch [lrange $res 1 end] {
      if {[lindex $branch 0] eq "*"} then {return $res}
   }
   if {[llength $res] == 1} then {
      return [list .]
   } elseif {[llength $res] == 2} then {
      return [lreplace $res 0 0 ?]
   } else {
      return [list ? $res]
   }
}
%   \end{tcl}
% \end{proc}
% 
% 
% \begin{proc}{.}
%   The corresponding simplification of concatenation expressions is 
%   simpler, because it only has detect nullary choices (the $0$ 
%   language).
%   \begin{tcl}
proc grammar::fa::op::nonnull::. {args} {
   set res [list .]
   foreach branch $args {
      set RE [eval $branch]
      switch -- [lindex $RE 0] "|" {
         if {[llength $RE] == 1} then {return $RE}
         lappend res $RE
      } "." {
         eval [lreplace $RE 0 0 lappend res]
      } default {
         lappend res $RE
      }
   }
   return $res
}
%   \end{tcl}
% \end{proc}
% 
% 
% \begin{proc}{*}
%   Repetitions may sometimes be combined.
%   \begin{tcl}
proc grammar::fa::op::nonnull::* {sub} {
   set RE [eval $sub]
   switch -- [lindex $RE 0] "*" - "?" - "+" {
      return [lreplace $RE 0 0 *]
   } default {
      return [list * $RE]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{+}
%   Repetitions may sometimes be combined.
%   \begin{tcl}
proc grammar::fa::op::nonnull::+ {sub} {
   set RE [eval $sub]
   switch -- [lindex $RE 0] "+" {
      return $RE
   } "*" - "?" {
      return [lreplace $RE 0 0 *]
   } default {
      return [list * $RE]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{?}
%   Optional subexpressions may similarly be combined.
%   \begin{tcl}
proc grammar::fa::op::nonnull::? {sub} {
   set RE [eval $sub]
   switch -- [lindex $RE 0] "?" - "*" {
      return $RE
   } "+" {
      return [lreplace $RE 0 0 *]
   } default {
      return [list ? $RE]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{S}
%   Nothing happens to symbols.
%   \begin{tcl}
proc grammar::fa::op::nonnull::S {name} {return [list S $name]}
%   \end{tcl}
% \end{proc}
% 
% 
% \subsection{Simplifying regular expressions}
% 
% \setnamespace{grammar::fa::op}
% 
% \begin{proc}{simplifyRegexp}
%   This procedure tries to simplify a regular expression by 
%   converting it to an automaton, minimizing that, and then covert 
%   it back. If the result is shorter than the original regexp then 
%   that is taken, otherwise the original is retained. The procedure 
%   is then invoked recursively on all subexpressions.
%   
%   \begin{tcl}
proc grammar::fa::op::simplifyRegexp {RE0} {
   set RE1 [namespace inscope nonnull $RE0]
   if {[lindex $RE1 0] eq "S" || $RE1 eq "." || $RE1 eq "|"} then {
      return $RE1
   }
   set tmp [grammar::fa %AUTO% fromRegex $RE1]
   $tmp minimize
   set RE1 [toRegexp [$tmp serialize]]
   $tmp destroy
   if {[string length $RE1] < [string length $RE0]} then {
      set RE0 $RE1
   }
   if {[lindex $RE0 0] eq "S"} then {return $RE0}
   set res [lrange $RE0 0 0]
   foreach branch [lrange $RE0 1 end] {
      lappend res [simplifyRegexp $branch]
   }
   return $res
}
%   \end{tcl}
% \end{proc}
% 
% \subsection{Conversion to infix form}
% 
% A different kind of simplification would be to convert a 
% \textsf{grammar::fa} regular expression to the ordinary infix form, 
% as used by e.g.~the \Tcl\ |regexp| command. Not all constructions 
% have an easy translation (language intersection is one example, as 
% it is not the same as a lookahead constraint), but the only 
% essential construction that lacks such an expression is the $0$ 
% language.
% 
% \begin{tcl}
namespace eval grammar::fa::op::tclre {}
% \end{tcl}
% \setnamespace{grammar::fa::op::tclre}
% 
% The commands in the |tclre| namespace have the syntax
% \begin{quote}
%   \meta{regexp} \word{symbol-dict}
% \end{quote}
% and they return a list of the form
% \begin{quote}
%   \word{syntactic type} \word{\Tcl-regexp}
% \end{quote}
% where \word{syntactic type} is one of
% \begin{description}
%   \item[sum]
%     Regexp can be used as operand of \verb"|", but not of any other 
%     operation.
%   \item[prod]
%     Regexp can be used as operatnd of \verb"|" or concatenation, 
%     but not of any other operation.
%   \item[atom]
%     Regexp is an atom.
%   \item[char]
%     Regexp is a char. This has been distinguished to simplify 
%     generating bracket expressions.
%   \item[class]
%     Regexp has the form |[[:|\meta{name}|:]]|, i.e., a character 
%     class bracket expression. It can be placed in a bracket 
%     expression if the first and last char is removed. 
% \end{description}
% The \word{symbol-dict} is a dictionary mapping symbol names to 
% `\word{syntactic type} \word{\Tcl-regexp}' pairs. If a symbol 
% occurring in the regexp is not listed in this dictionary then 
% single-character symbols are considered to designate themselves 
% whereas multiple-character symbols are considered to be a character 
% class name.
% 
% \begin{proc}{S}
%   This procedure converts symbol nodes to regular expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::S {name dict} {
   array set A $dict
   if {[info exists A($name)]} then {
      return $A($name)
   } elseif {[string length $name] == 1} then {
      if {[regexp {[\\\[\]{}.()*+?^$]} $name]} then {
         return [list char \\$name]
      } else {
         return [list char $name]
      }
   } else {
      return [list class "\[\[:${name}:\]\]"]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{.}
%   This procedure converts product nodes to regular expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::. {args} {
   set suffix [lrange $args end end]
   set L {}
   foreach factor [lrange $args 0 end-1] {
      set pair [eval $factor $suffix]
      switch -- [lindex $pair 0] "sum" {
         lappend L ([lindex $pair 1])
      } default {
         lappend L [lindex $pair 1]
      }
   }
   return [list prod [join $L ""]]
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{*}
%   This procedure converts Kleene star nodes to regular expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::* {re dict} {
   set pair [eval $re [list $dict]]
   switch -- [lindex $pair 0] "sum" - "prod" {
      return [list prod "([lindex $pair 1])*"]
   } default {
      return [list prod "[lindex $pair 1]*"]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{+}
%   This procedure converts one-or-more nodes to regular expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::+ {re dict} {
   set pair [eval $re [list $dict]]
   switch -- [lindex $pair 0] "sum" - "prod" {
      return [list prod "([lindex $pair 1])+"]
   } default {
      return [list prod "[lindex $pair 1]+"]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{?}
%   This procedure converts zero-or-one nodes to regular expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::? {re dict} {
   set pair [eval $re [list $dict]]
   switch -- [lindex $pair 0] "sum" - "prod" {
      return [list prod "([lindex $pair 1])?"]
   } default {
      return [list prod "[lindex $pair 1]?"]
   }
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{|}
%   This procedure converts branch nodes to regular expressions. It 
%   attempts to collect characters and character classes in bracket 
%   expressions.
%   \begin{tcl}
proc grammar::fa::op::tclre::| {args} {
   set suffix [lrange $args end end]
   set charL {}
   set classL {}
   set prodL {}
   foreach factor [lrange $args 0 end-1] {
      set pair [eval $factor $suffix]
      switch -- [lindex $pair 0] "char" {
         lappend charL [lindex $pair 1]
      } "class" {
         lappend classL [string range [lindex $pair 1] 1 end-1]
      } default {
         lappend prodL [lindex $pair 1]
      }
   }
   if {[llength $charL]>1 || [llength $classL]>0} then {
      while {[set n [lsearch $charL -]] >= 0} {
         lset charL $n {\-}
      }
      set bracket "\[[join $charL ""][join $classL ""]\]"
      if {![llength $prodL]} then {
         return [list atom $bracket]
      }
      lappend prodL $bracket
   } else {
     eval [list lappend prodL] $charL
   }
   return [list sum [join $prodL |]]
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{&}
%   This procedure throws an error.
%   \begin{tcl}
proc grammar::fa::op::tclre::& {args} {
   error "Cannot express language intersection in Tcl-RE's"
}
%   \end{tcl}
% \end{proc}
% 
% \begin{proc}{!}
%   This procedure throws an error.
%   \begin{tcl}
proc grammar::fa::op::tclre::! {args} {
   error "Cannot express language complementation in Tcl-RE's"
}
%   \end{tcl}
% \end{proc}
% 
% 
\endinput