Tcl Library Source Code

Artifact [3f68b5b204]
Login

Artifact 3f68b5b2048c6c79028b7eac157c5199fe1ada97:


#####
#
# "BibTeX parser"
# http://wiki.tcl.tk/13719
#
# Tcl code harvested on:   7 Mar 2005, 23:55 GMT
# Wiki page last updated: ???
#
#####

# bibtex.tcl --
#
#      A basic parser for BibTeX bibliography databases.
#
# Copyright (c) 2005 Neil Madden.
# Copyright (c) 2005 Andreas Kupries.
# License: Tcl/BSD style.

### NOTES
###
### Need commands to introspect parser state. Especially the string
### map (for testing of 'addStrings', should be useful in general as
### well).

# ### ### ### ######### ######### #########
## Requisites

package require Tcl 8.4
package require cmdline

# ### ### ### ######### ######### #########
## Implementation: Public API

namespace eval ::bibtex {}

# bibtex::parse --
#
#	Parse a bibtex file.
#
# parse ?options? ?bibtex?

proc ::bibtex::parse {args} {
    variable data
    variable id

    # Argument processing
    if {[llength $args] < 1} {
	set err "[lindex [info level 0] 0] ?options? ?bibtex?"
	return -code error "wrong # args: should be \"$err\""
    }

    array set state {}
    GetOptions $args state

    # Initialize the parser state from the options, fill in default
    # values, and handle the input according the specified mode.

    set token bibtex[incr id]
    foreach {k v} [array get state] {
	set data($token,$k) $v
    }

    if {$state(stream)} {
	# Text not in memory
	if {!$state(bg)} {
	    # Text from a channel, no async processing. We read everything
	    # into memory and the handle it as before.

	    set blockmode [fconfigure $state(-channel) -blocking]
	    fconfigure $state(-channel) -blocking 1
	    set data($token,buffer) [read $state(-channel)]
	    fconfigure $state(-channel) -blocking $blockmode

	    # Tell upcoming processing that the text is in memory.
	    set state(stream) 0
	} else {
	    # Text from a channel, and processing is async. Create an
	    # event handler for the incoming data.

	    set data($token,done) 0
	    fileevent $state(-channel) readable \
		    [list ::bibtex::ReadChan $token]

	    # Initialize the parser internal result buffer if we use plain
	    # -command, and not the SAX api.
	    if {!$state(sax)} {
		set data($token,result) {}
	    }
	}
    }

    # Initialize the string mappings (none known), and the result
    # accumulator.
    set data($token,strings) {}
    set data($token,result)  {}

    if {!$state(stream)} {
	ParseRecords $token 1
	if {$state(sax)} {
	    set result $token
	} else {
	    set result $data($token,result)
	    destroy $token
	}
	return $result
    }

    # Assert: Processing is in background.
    return $token
}

# Cleanup a parser, cancelling any callbacks etc.

proc ::bibtex::destroy {token} {
    variable data

    if {![info exists data($token,stream)]} {
	return -code error "Illegal bibtex parser \"$token\""
    }
    if {$data($token,stream)} {
	fileevent $data($token,-channel) readable {}
    }

    array unset data $token,*
    return
}


proc ::bibtex::wait {token} {
    variable data

    if {![info exists data($token,stream)]} {
	return -code error "Illegal bibtex parser \"$token\""
    }
    vwait ::bibtex::data($token,done)
    return
}

# bibtex::addStrings --
#
#	Add strings to the map for a particular parser. All strings are
#	expanded at parse time.

proc ::bibtex::addStrings {token strings} {
    variable data
    eval [linsert $strings 0 lappend data($token,strings)]
    return
}

# ### ### ### ######### ######### #########
## Implementation: Private utility routines

proc ::bibtex::AddRecord {token type key recdata} {
    variable data
    lappend  data($token,result) [list $type $key $recdata]
    return
}

proc ::bibtex::GetOptions {argv statevar} {
    upvar 1 $statevar state

    # Basic processing of the argument list
    # and the options found therein.

    set opts [lrange [::cmdline::GetOptionDefaults {
	{command.arg         {}}
	{channel.arg         {}}
	{recordcommand.arg   {}}
	{preamblecommand.arg {}}
	{stringcommand.arg   {}}
	{commentcommand.arg  {}}
	{progresscommand.arg {}}
    } result] 2 end] ;# Remove ? and help.

    set argc [llength $argv]
    while {[set err [::cmdline::getopt argv $opts opt arg]]} {
	if {$err < 0} {
	    set olist ""
	    foreach o [lsort $opts] {
		if {[string match *.arg $o]} {
		    set o [string range $o 0 end-4]
		}
		lappend olist -$o
	    }
	    return -code error "bad option \"$opt\",\
		    should be one of\
		    [linsert [join $olist ", "] end-1 or]"
	}
	set state(-$opt) $arg
    }

    # Check the information gained so far
    # for inconsistencies and/or missing
    # pieces.

    set sax [expr {
	[info exists state(-recordcommand)]   ||
	[info exists state(-preamblecommand)] ||
	[info exists state(-stringcommand)]   ||
	[info exists state(-commentcommand)]  ||
	[info exists state(-progresscommand)]
    }] ; # {}

    set bg [info exists state(-command)]

    if {$sax && $bg} {
	# Sax callbacks and channel completion callback exclude each
	# other.
	return -code error "The options -command and -TYPEcommand exclude each other"
    }

    set stream [info exists state(-channel)]

    if {$stream} {
	# Channel is present, a text is not allowed.
	if {[llength $argv]} {
	    return -code error "Option -channel and text exclude each other"
	}

	# The channel has to exist as well.
	if {[lsearch -exact [file channels] $state(-channel)] < 0} {
	    return -code error "Illegal channel handle \"$state(-channel)\""
	}
    } else {
	# Channel is not present, we have to have a text, and only
	# exactly one. And a general -command callback is not allowed.

	if {![llength $argv]} {
	    return -code error "Neither -channel nor text specified"
	} elseif {[llength $argv] > 1} {
	    return -code error "wrong # args: [lindex [info level 1] 0] ?options? ?bibtex?"
	}

	# Channel completion callback is not allowed if we are not
	# reading from a channel.

	if {$bg} {
	    return -code error "Option -command and text exclude each other"
	}

	set state(buffer) [lindex $argv 0]
    }

    set state(stream) $stream
    set state(sax)    $sax
    set state(bg)     [expr {$sax || $bg}]

    if {![info exists state(-stringcommand)]} {
	set state(-stringcommand) [list ::bibtex::addStrings]
    }
    if {![info exists state(-recordcommand)] && (!$sax)} {
	set state(-recordcommand) [list ::bibtex::AddRecord]
    }
    return
}

proc ::bibtex::Callback {token type args} {
    variable data

    #puts stdout "Callback ($token $type ($args))"

    if {[info exists data($token,-${type}command)]} {
	eval $data($token,-${type}command) [linsert $args 0 $token]
    }
    return
}

proc ::bibtex::ReadChan {token} {
    variable data

    # Read the waiting characters into our buffer and process
    # them. The records are saved either through a user supplied
    # record callback, or the standard callback for our non-sax
    # processing.

    set    chan $data($token,-channel)
    append data($token,buffer) [read $chan]

    if {[eof $chan]} {
	# Final processing. In non-SAX mode we have to deliver the
	# completed result before destroying the parser.

	ParseRecords $token 1
	set data($token,done) 1
	if {!$data($token,sax)} {
	    Callback $token {} $data($token,result)
	}
	return
    }

    # Processing of partial data.

    ParseRecords $token 0
    return
}

proc ::bibtex::Tidy {str} {
    return [string tolower [string trim $str]]
}

proc ::bibtex::ParseRecords {token eof} {
    # A rough BibTeX grammar (case-insensitive):
    #
    # Database      ::= (Junk '@' Entry)*
    # Junk          ::= .*?
    # Entry         ::= Record
    #               |   Comment
    #               |   String
    #               |   Preamble
    # Comment       ::= "comment" [^\n]* \n         -- ignored
    # String        ::= "string" '{' Field* '}'
    # Preamble      ::= "preamble" '{' .* '}'       -- (balanced)
    # Record        ::= Type '{' Key ',' Field* '}'
    #               |   Type '(' Key ',' Field* ')' -- not handled
    # Type          ::= Name
    # Key           ::= Name
    # Field         ::= Name '=' Value
    # Name          ::= [^\s\"#%'(){}]*
    # Value         ::= [0-9]+
    #               |   '"' ([^'"']|\\'"')* '"'
    #               |   '{' .* '}'                  -- (balanced)

    # " - Fixup emacs hilit confusion from the grammar above.
    variable data
    set bibtex $data($token,buffer)

    # Split at each @ character which is at the beginning of a line,
    # modulo whitespace. This is a heuristic to distinguish the @'s
    # starting a new record from the @'s occuring inside a record, as
    # part of email addresses. Empty pices at beginning or end are
    # stripped before the split.

    regsub -line -all {^[\n\r\f\t ]*@} $bibtex \000 bibtex
    set db [split [string trim $bibtex \000] \000]

    if {$eof} {
	set total [llength $db]
	set step  [expr {double($total) / 100.0}]
	set istep [expr {$step > 1 ? int($step) : 1}]
	set count 0
    } else {
	if {[llength $db] < 2} {
	    # Nothing to process, or data which ay be incomplete.
	    return
	}

	set data($token,buffer) [lindex $db end]
	set db                  [lrange $db 0 end-1]

	# Fake progress meter.
	set count -1
    }

    foreach block $db {
	if {$count < 0} {
	    Callback $token progress -1
	} elseif {([incr count] % $istep) == 0} {
	    Callback $token progress [expr {int($count / $step)}]
	}
	if {[regexp -nocase {\s*comment([^\n])*\n(.*)} $block \
		-> cmnt rest]} {
	    # Are @comments blocks, or just 1 line?
	    # Does anyone care?
	    Callback $token comment $cmnt

	} elseif {[regexp -nocase {\s*string[^\{]*\{(.*)\}[^\}]*} \
		$block -> rest]} {
	    # string macro defs
	    Callback $token string [ParseBlock $rest]

	} elseif {[regexp -nocase {\s*preamble[^\{]*\{(.*)\}[^\}]*} \
		$block -> rest]} {
	    Callback $token preamble $rest

	} elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} \
		$block -> type key rest]} {
	    # Do any @string mappings (these are case insensitive)
	    set rest [string map -nocase $data($token,strings) $rest]
	    Callback $token record [Tidy $type] [string trim $key] \
		    [ParseBlock $rest]
	} else {
	    ## FUTURE: Use a logger.
	    puts stderr "Skipping: $block"
	}
    }
}

proc ::bibtex::ParseBlock {block} {
    set ret   [list]
    set index 0
    while {
	[regexp -start $index -indices -- \
		{(\S+)[^=]*=(.*)} $block -> key rest]
    } {
	foreach {ks ke} $key break
	set k [Tidy [string range $block $ks $ke]]
	foreach {rs re} $rest break
	foreach {v index} \
		[ParseBibString $rs [string range $block $rs $re]] \
		break
	lappend ret $k $v
    }
    return $ret
}

proc ::bibtex::ParseBibString {index str} {
    set count 0
    set retstr ""
    set escape 0
    set string 0
    foreach char [split $str ""] {
	incr index
	if {$escape} {
	    set escape 0
	} else {
	    if {$char eq "\{"} {
		incr count
		continue
	    } elseif {$char eq "\}"} {
		incr count -1
		if {$count < 0} {incr index -1; break}
		continue
	    } elseif {$char eq ","} {
		if {$count == 0} break
	    } elseif {$char eq "\\"} {
		set escape 1
		continue
	    } elseif {$char eq "\""} {
		# Managing the count ensures that comma inside of a
		# string is not considered as the end of the field.
		if {!$string} {
		    incr count
		    set string 1
		} else {
		    incr count -1
		    set string 0
		}
		continue
	    }
	    # else: Nothing
	}
	append retstr $char
    }
    regsub -all {\s+} $retstr { } retstr
    return [list [string trim $retstr] $index]
}


# ### ### ### ######### ######### #########
## Internal. Package configuration and state.

namespace eval bibtex {
    # Counter for the generation of parser tokens.
    variable id 0

    # State of all parsers. Keys for each parser are prefixed with the
    # parser token.
    variable  data
    array set data {}

    # Keys and their meaning (listed without token prefix)
    ##
    # buffer
    # eof
    # channel    <-\/- Difference ?
    # strings      |
    # -async       |
    # -blocksize   |
    # -channel   <-/
    # -recordcommand   -- callback for each record
    # -preamblecommand -- callback for @preamble blocks
    # -stringcommand   -- callback for @string macros
    # -commentcommand  -- callback for @comment blocks
    # -progresscommand -- callback to indicate progress of parse
    ##
}

# ### ### ### ######### ######### #########
## Ready to go
package provide bibtex 0.5
# EOF