Index: generic/tclEncoding.c ================================================================== --- generic/tclEncoding.c +++ generic/tclEncoding.c @@ -2384,11 +2384,16 @@ * mode, so that they get converted to 0xC080. */ *dst++ = *src++; } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { + && (UCHAR(src[1]) == 0x80) + && ( + !(flags & TCL_ENCODING_MODIFIED) + || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) + )) + { /* * If in input mode, and -strict is specified: This is an error. */ if (flags & TCL_ENCODING_MODIFIED) { result = TCL_CONVERT_SYNTAX; Index: generic/tclIO.c ================================================================== --- generic/tclIO.c +++ generic/tclIO.c @@ -4654,18 +4654,20 @@ Channel *chanPtr = (Channel *) chan; ChannelState *statePtr = chanPtr->state; /* State info for channel */ ChannelBuffer *bufPtr; int inEofChar, skip, copiedTotal, oldFlags, oldRemoved; - int oldLength; + int reportError = 0; + size_t oldLength; Tcl_Encoding encoding; char *dst, *dstEnd, *eol, *eof; Tcl_EncodingState oldState; if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { UpdateInterest(chanPtr); Tcl_SetErrno(EILSEQ); + ResetFlag(statePtr, CHANNEL_ENCODING_ERROR); return TCL_INDEX_NONE; } if (CheckChannelErrors(statePtr, TCL_READABLE) != 0) { return TCL_INDEX_NONE; @@ -4936,10 +4938,23 @@ copiedTotal = -1; ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR); goto done; } goto gotEOL; + } else if (gs.bytesWrote == 0 + && GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { + /* Set eol to the position that caused the encoding error, and then + * coninue to gotEOL, which stores the data that was decoded + * without error to objPtr. This allows the caller to do something + * useful with the data decoded so far, and also results in the + * position of the file being the first byte that was not + * succesfully decoded, allowing further processing at exactly that + * point, if desired. + */ + eol = dstEnd; + reportError = 1; + goto gotEOL; } dst = dstEnd; } /* @@ -4979,11 +4994,20 @@ */ Tcl_SetObjLength(objPtr, eol - objPtr->bytes); CommonGetsCleanup(chanPtr); ResetFlag(statePtr, CHANNEL_BLOCKED); - copiedTotal = gs.totalChars + gs.charsWrote - skip; + if (reportError) { + ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); + /* reset CHANNEL_ENCODING_ERROR to afford a chance to reconfigure + * the channel and try again + */ + Tcl_SetErrno(EILSEQ); + copiedTotal = -1; + } else { + copiedTotal = gs.totalChars + gs.charsWrote - skip; + } goto done; /* * Couldn't get a complete line. This only happens if we get a error * reading from the channel or we are non-blocking and there wasn't an EOL @@ -6022,12 +6046,13 @@ TclGetString(objPtr); } } if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest not needed here? */ UpdateInterest(chanPtr); + Tcl_SetErrno(EILSEQ); return -1; } /* * Early out when next read will see eofchar. @@ -6039,11 +6064,11 @@ if (GotFlag(statePtr, CHANNEL_STICKY_EOF)) { SetFlag(statePtr, CHANNEL_EOF); assert(statePtr->inputEncodingFlags & TCL_ENCODING_END); assert(!GotFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR)); - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest not needed here? */ UpdateInterest(chanPtr); return 0; } /* @@ -6053,11 +6078,11 @@ if (GotFlag(statePtr, CHANNEL_EOF)) { statePtr->inputEncodingFlags |= TCL_ENCODING_START; } ResetFlag(statePtr, CHANNEL_BLOCKED|CHANNEL_EOF); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest not needed here? */ UpdateInterest(chanPtr); return 0; } /* @@ -6084,11 +6109,11 @@ } else { copiedNow = ReadChars(statePtr, objPtr, toRead, &factor); } /* - * If the current buffer is empty recycle it. + * Recycle current buffer if empty. */ bufPtr = statePtr->inQueueHead; if (IsBufferEmpty(bufPtr)) { ChannelBuffer *nextPtr = bufPtr->nextPtr; @@ -6097,10 +6122,28 @@ statePtr->inQueueHead = nextPtr; if (nextPtr == NULL) { statePtr->inQueueTail = NULL; } } + + /* + * If CHANNEL_ENCODING_ERROR and CHANNEL_STICKY_EOF are both set, + * then CHANNEL_ENCODING_ERROR was caused by data that occurred + * after the EOF character was encountered, so it doesn't count as + * a real error. + */ + + if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) + && !GotFlag(statePtr, CHANNEL_STICKY_EOF) + && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { + /* Channel is synchronous. Return an error so that callers + * like [read] can return an error. + */ + Tcl_SetErrno(EILSEQ); + copied = -1; + goto finish; + } } if (copiedNow < 0) { if (GotFlag(statePtr, CHANNEL_EOF)) { break; @@ -6125,10 +6168,11 @@ copied += copiedNow; toRead -= copiedNow; } } +finish: /* * Failure to fill a channel buffer may have left channel reporting a * "blocked" state, but so long as we fulfilled the request here, the * caller does not consider us blocked. */ @@ -6803,15 +6847,18 @@ if (srcStart + srcLen == eof) { /* * EOF character was seen in EOL translated range. Leave current file * position pointing at the EOF character, but don't store the EOF * character in the output string. + * + * If CHANNEL_ENCODING_ERROR is set, it can only be because of data + * encountered after the EOF character, so it is nonsense. Unset it. */ SetFlag(statePtr, CHANNEL_EOF | CHANNEL_STICKY_EOF); statePtr->inputEncodingFlags |= TCL_ENCODING_END; - ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR); + ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); } } /* *---------------------------------------------------------------------- Index: generic/tclIOCmd.c ================================================================== --- generic/tclIOCmd.c +++ generic/tclIOCmd.c @@ -293,11 +293,11 @@ Tcl_Obj *const objv[]) /* Argument objects. */ { Tcl_Channel chan; /* The channel to read from. */ int lineLen; /* Length of line just read. */ int mode; /* Mode in which channel is opened. */ - Tcl_Obj *linePtr, *chanObjPtr; + Tcl_Obj *linePtr, *chanObjPtr, *resultDictPtr, *returnOptsPtr; int code = TCL_OK; if ((objc != 2) && (objc != 3)) { Tcl_WrongNumArgs(interp, 1, objv, "channelId ?varName?"); return TCL_ERROR; @@ -316,11 +316,10 @@ TclChannelPreserve(chan); TclNewObj(linePtr); lineLen = Tcl_GetsObj(chan, linePtr); if (lineLen < 0) { if (!Tcl_Eof(chan) && !Tcl_InputBlocked(chan)) { - Tcl_DecrRefCount(linePtr); /* * TIP #219. * Capture error messages put by the driver into the bypass area * and put them into the regular interpreter result. Fall back to @@ -330,11 +329,18 @@ if (!TclChanCaughtErrorBypass(interp, chan)) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + resultDictPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) + , linePtr); + returnOptsPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) + , resultDictPtr); code = TCL_ERROR; + Tcl_SetReturnOptions(interp, returnOptsPtr); goto done; } lineLen = TCL_INDEX_NONE; } if (objc == 3) { @@ -379,11 +385,11 @@ Tcl_Channel chan; /* The channel to read from. */ int newline, i; /* Discard newline at end? */ int toRead; /* How many bytes to read? */ int charactersRead; /* How many characters were read? */ int mode; /* Mode in which channel is opened. */ - Tcl_Obj *resultPtr, *chanObjPtr; + Tcl_Obj *resultPtr, *resultDictPtr, *returnOptsPtr, *chanObjPtr; if ((objc != 2) && (objc != 3)) { Interp *iPtr; argerror: @@ -468,12 +474,19 @@ if (!TclChanCaughtErrorBypass(interp, chan)) { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + resultDictPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) + , resultPtr); + returnOptsPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) + , resultDictPtr); TclChannelRelease(chan); Tcl_DecrRefCount(resultPtr); + Tcl_SetReturnOptions(interp, returnOptsPtr); return TCL_ERROR; } /* * If requested, remove the last newline in the channel if at EOF. Index: tests/io.test ================================================================== --- tests/io.test +++ tests/io.test @@ -1545,23 +1545,47 @@ fconfigure $f -encoding utf-8 -buffersize 10 set in [read $f] close $f scan [string index $in end] %c } 160 -test io-12.9 {ReadChars: multibyte chars split} -body { - set f [open $path(test1) w] - fconfigure $f -translation binary - puts -nonewline $f [string repeat a 9]\xC2 - close $f - set f [open $path(test1)] - fconfigure $f -encoding utf-8 -buffersize 10 - set in [read $f] - close $f - scan [string index $in end] %c -} -cleanup { - catch {close $f} -} -result 194 + + +apply [list {} { + set template { + test io-12.9.@variant@ {ReadChars: multibyte chars split, default (strict)} -body { + set res {} + set f [open $path(test1) w] + fconfigure $f -translation binary + puts -nonewline $f [string repeat a 9]\xC2 + close $f + set f [open $path(test1)] + fconfigure $f -encoding utf-8 @strict@ -buffersize 10 + set status [catch {read $f} cres copts] + set in [dict get $copts -result] + lappend res $in + lappend res $status $cres + set status [catch {read $f} cres copts] + set in [dict get $copts -result] + lappend res $in + lappend res $status $cres + set res + } -cleanup { + catch {close $f} + } -match glob -result {{read aaaaaaaaa} 1\ + {error reading "*": illegal byte sequence}\ + {read {}} 1 {error reading "*": illegal byte sequence}} + } + + # strict encoding may be the default in Tcl 9, but in 8 it is not + foreach variant {encodingstrict} strict {{-strictencoding 1}} { + set script [string map [ + list @variant@ $variant @strict@ $strict] $template] + uplevel 1 $script + } +} [namespace current]] + + test io-12.10 {ReadChars: multibyte chars split} -body { set f [open $path(test1) w] fconfigure $f -translation binary puts -nonewline $f [string repeat a 9]\xC2 close $f @@ -9054,16 +9078,16 @@ puts -nonewline $f A\x81 flush $f seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] binary scan $d H* hd - lappend hd [catch {read $f} msg] - close $f - lappend hd $msg + lappend hd $status $cres } -cleanup { + close $f removeFile io-75.6 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { set fn [makeFile {} io-75.7] @@ -9073,42 +9097,75 @@ puts -nonewline $f A\xA1\x1A flush $f seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] binary scan $d H* hd lappend hd [eof $f] - lappend hd [catch {read $f} msg] - lappend hd $msg + lappend hd $status + lappend hd $cres fconfigure $f -encoding iso8859-1 lappend hd [read $f];# We changed encoding, so now we can read the \xA1 close $f set hd } -cleanup { removeFile io-75.7 } -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} -test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.8.incomplete { + incomplete uft-8 char after eof char is not an error (-strictencoding 1) +} -setup { + set hd {} set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary - # \x81 is invalid in utf-8, but since \x1A comes first, -eofchar takes precedence. + # \x81 is invalid and also incomplete utf-8 data, but because the eof + # character \x1A appears first, it's not an error. puts -nonewline $f A\x1A\x81 flush $f seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 } -body { set d [read $f] binary scan $d H* hd lappend hd [eof $f] + # there should be no error on additional reads lappend hd [read $f] close $f set hd } -cleanup { removeFile io-75.8 } -result {41 1 {}} + + +test io-75.8.invalid {invalid utf-8 after eof char is not an error (-strictencoding 1)} -setup { + set res {} + set fn [makeFile {} io-75.8] + set f [open $fn w+] + fconfigure $f -encoding binary + # \xc0\x80 is invalid utf-8 data, but because the eof character \x1A + # appears first, it's not an error. + puts -nonewline $f A\x1a\xc0\x80 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 +} -body { + set d [read $f] + foreach char [split $d {}] { + lappend res [format %x [scan $char %c]] + } + lappend res [eof $f] + # there should be no error on additional reads + lappend res [read $f] + close $f + set res +} -cleanup { + removeFile io-75.8 +} -result {41 1 {}} + test io-75.9 {unrepresentable character write passes and is replaced by ?} -setup { set fn [makeFile {} io-75.9] set f [open $fn w+] fconfigure $f -encoding iso8859-1 -strictencoding 1 @@ -9120,31 +9177,53 @@ } -cleanup { close $f removeFile io-75.9 } -match glob -result [list {A} {error writing "*": illegal byte sequence}] -# Incomplete sequence test. -# This error may IMHO only be detected with the close. -# But the read already returns the incomplete sequence. + test io-75.10 {incomplete multibyte encoding read is ignored} -setup { set fn [makeFile {} io-75.10] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\xC0 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none + fconfigure $f -encoding utf-8 -strictencoding 0 -buffering none } -body { set d [read $f] close $f binary scan $d H* hd set hd } -cleanup { removeFile io-75.10 } -result 41c0 -# The current result returns the orphan byte as byte. -# This may be expected due to special utf-8 handling. + + +test io-75.10_strict {incomplete multibyte encoding read is an error} -setup { + set res {} + set fn [makeFile {} io-75.10] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\xC0 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -strictencoding 1 -buffering none +} -body { + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] + binary scan $d H* hd + lappend res $hd $cres + chan configure $f -encoding iso8859-1 + set d [read $f] + binary scan $d H* hd + lappend res $hd + close $f + return $res +} -cleanup { + removeFile io-75.10 +} -match glob -result {41 {error reading "*": illegal byte sequence} c0} + # As utf-8 has a special treatment in multi-byte decoding, also test another # one. test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { set fn [makeFile {} io-75.11] @@ -9153,29 +9232,53 @@ # In shiftjis, \x81 starts a two-byte sequence. # But 2nd byte \xFF is not allowed puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" \ + -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] binary scan $d H* hd - lappend hd [catch {set d [read $f]} msg] - lappend hd $msg + lappend hd $status + lappend hd $cres } -cleanup { close $f removeFile io-75.11 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -test io-75.12 {invalid utf-8 encoding read is ignored} -setup { + +test io-75.12 {invalid utf-8 encoding read is an error} -setup { + set res {} + set fn [makeFile {} io-75.12] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf \ + -strictencoding 1 +} -body { + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] + close $f + binary scan $d H* hd + lappend res $hd $status $cres + return $res +} -cleanup { + removeFile io-75.12 +} -match glob -result {41 1 {error reading "*": illegal byte sequence}} +test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { set fn [makeFile {} io-75.12] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf + fconfigure $f -encoding utf-8 -buffering none -eofchar {} \ + -translation lf -strictencoding 0 } -body { set d [read $f] close $f binary scan $d H* hd set hd @@ -9190,21 +9293,72 @@ puts -nonewline $f "A\x81" flush $f seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] binary scan $d H* hd - lappend hd [catch {read $f} msg] + lappend hd $status close $f - lappend hd $msg + lappend hd $cres } -cleanup { removeFile io-75.13 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -# ### ### ### ######### ######### ######### +test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after error} -setup { + set res {} + set fn [makeFile {} io-75.14] + set f [open $fn w+] + fconfigure $f -encoding binary + # \xc0 is invalid in utf-8 + puts -nonewline $f a\nb\xc0\nc\n + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf -strictencoding 1 +} -body { + lappend res [gets $f] + set status [catch {gets $f} cres copts] + lappend res $status $cres + chan configure $f -strictencoding 0 + lappend res [gets $f] + lappend res [gets $f] + close $f + return $res +} -cleanup { + removeFile io-75.14 +} -match glob -result {a 1 {error reading "*": illegal byte sequence} bÀ c} + +test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { + set res {} + set fn [makeFile {} io-75.15] + set chan [open $fn w+] + fconfigure $chan -encoding binary + # This is not valid UTF-8 + puts $chan hello\nAB\xc0\x40CD\nEFG + close $chan +} -body { + #Now try to read it with [gets] + set chan [open $fn] + fconfigure $chan -encoding utf-8 -strictencoding 1 + lappend res [gets $chan] + set status [catch {gets $chan} cres copts] + lappend res $status $cres + set status [catch {gets $chan} cres copts] + lappend res $status $cres + lappend res [dict get $copts -result] + chan configur $chan -encoding binary + foreach char [split [read $chan 2] {}] { + lappend res [format %x [scan $char %c]] + } + return $res +} -cleanup { + close $chan + removeFile io-75.15 +} -match glob -result {hello 1 {error reading "*": illegal byte sequence}\ + 1 {error reading "*": illegal byte sequence} {read AB} c0 40} test io-76.0 {channel modes} -setup { set datafile [makeFile {some characters} dummy] set f [open $datafile r]