Tcl Source Code

Check-in [369ac8bf18]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implement unicode toXXXX congruent to string toXXXX
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | tip-726-plus | tip-726-tclunidata-equivalence-proof
Files: files | file ages | folders
SHA3-256: 369ac8bf188d604a633ddfb9b14e384cac0cbf1f9fd1eb1f9fc05491bf15ca83
User & Date: apnadkarni 2025-07-25 17:34:27.996
Context
2025-07-25
17:46
Eliminate use of tclUniData.c check-in: 1dcc7678c8 user: apnadkarni tags: tip-726-plus
17:34
Implement unicode toXXXX congruent to string toXXXX check-in: 369ac8bf18 user: apnadkarni tags: tip-726-plus, tip-726-tclunidata-equivalence-proof
16:05
Disable tests for bug 1ecea011 check-in: 13c2631e68 user: apnadkarni tags: tip-726-plus
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclCmdMZ.c.
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524

    /*
     * When entering here, result == 1 and failat == 0.
     */

    switch (index) {
    case STR_IS_ALNUM:
	int Tcl_Utf8procIsAlnum(int ch);
	chcomp = Tcl_Utf8procIsAlnum;
	break;
    case STR_IS_ALPHA:
	int Tcl_Utf8procIsAlpha(int ch);
	chcomp = Tcl_Utf8procIsAlpha;
	break;
    case STR_IS_ASCII:
	chcomp = UniCharIsAscii;
	break;
    case STR_IS_BOOL:
    case STR_IS_TRUE:
    case STR_IS_FALSE:







|
|


|
|







5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524

    /*
     * When entering here, result == 1 and failat == 0.
     */

    switch (index) {
    case STR_IS_ALNUM:
	int Tcl_UniCharIsAlnum(int ch);
	chcomp = Tcl_UniCharIsAlnum;
	break;
    case STR_IS_ALPHA:
	int Tcl_UniCharIsAlpha(int ch);
	chcomp = Tcl_UniCharIsAlpha;
	break;
    case STR_IS_ASCII:
	chcomp = UniCharIsAscii;
	break;
    case STR_IS_BOOL:
    case STR_IS_TRUE:
    case STR_IS_FALSE:
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
	    }
	} else if ((objPtr->internalRep.wideValue != 0)
		? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) {
	    result = 0;
	}
	break;
    case STR_IS_CONTROL:
	int Tcl_Utf8procIsControl(int ch);
	chcomp = Tcl_Utf8procIsControl;
	break;
    case STR_IS_DICT: {
	int dresult;
	Tcl_Size dsize;

	dresult = Tcl_DictObjSize(interp, objPtr, &dsize);
	Tcl_ResetResult(interp);







|
|







5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
	    }
	} else if ((objPtr->internalRep.wideValue != 0)
		? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) {
	    result = 0;
	}
	break;
    case STR_IS_CONTROL:
	int Tcl_UniCharIsControl(int ch);
	chcomp = Tcl_UniCharIsControl;
	break;
    case STR_IS_DICT: {
	int dresult;
	Tcl_Size dsize;

	dresult = Tcl_DictObjSize(interp, objPtr, &dsize);
	Tcl_ResetResult(interp);
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
		    break;
		}
	    }
	}
	break;
    }
    case STR_IS_DIGIT:
	int Tcl_Utf8procIsDigit(int ch);
	chcomp = Tcl_Utf8procIsDigit;
	break;

    case STR_IS_DOUBLE: {
	if (TclHasInternalRep(objPtr, &tclDoubleType) ||
		TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;







|
|







5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
		    break;
		}
	    }
	}
	break;
    }
    case STR_IS_DIGIT:
	int Tcl_UniCharIsDigit(int ch);
	chcomp = Tcl_UniCharIsDigit;
	break;

    case STR_IS_DOUBLE: {
	if (TclHasInternalRep(objPtr, &tclDoubleType) ||
		TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
		result = 0;
		TclFreeInternalRep(objPtr);
	    }
	}
	break;
    }
    case STR_IS_GRAPH:
	int Tcl_Utf8procIsGraph(int ch);
	chcomp = Tcl_Utf8procIsGraph;
	break;
    case STR_IS_INT:
    case STR_IS_ENTIER:
	if (TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}







|
|







5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
		result = 0;
		TclFreeInternalRep(objPtr);
	    }
	}
	break;
    }
    case STR_IS_GRAPH:
	int Tcl_UniCharIsGraph(int ch);
	chcomp = Tcl_UniCharIsGraph;
	break;
    case STR_IS_INT:
    case STR_IS_ENTIER:
	if (TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
		    break;
		}
	    }
	}
	result = 0;
	break;
    case STR_IS_LOWER:
	int Tcl_Utf8procIsLower(int ch);
	chcomp = Tcl_Utf8procIsLower;
	break;
    case STR_IS_PRINT:
	int Tcl_Utf8procIsPrint(int ch);
	chcomp = Tcl_Utf8procIsPrint;
	break;
    case STR_IS_PUNCT:
	int Tcl_Utf8procIsPunct(int ch);
	chcomp = Tcl_Utf8procIsPunct;
	break;
    case STR_IS_SPACE:
	int Tcl_Utf8procIsSpace(int ch);
	chcomp = Tcl_Utf8procIsSpace;
	break;
    case STR_IS_UPPER:
	int Tcl_Utf8procIsUpper(int ch);
	chcomp = Tcl_Utf8procIsUpper;
	break;
    case STR_IS_WORD:
	int Tcl_Utf8procIsWordChar(int ch);
	chcomp = Tcl_Utf8procIsWordChar;
	break;
    case STR_IS_XDIGIT:
	chcomp = UniCharIsHexDigit;
	break;
    default:
	TCL_UNREACHABLE();
    }







|
|


|
|


|
|


|
|


|
|


|
|







5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
		    break;
		}
	    }
	}
	result = 0;
	break;
    case STR_IS_LOWER:
	int Tcl_UniCharIsLower(int ch);
	chcomp = Tcl_UniCharIsLower;
	break;
    case STR_IS_PRINT:
	int Tcl_UniCharIsPrint(int ch);
	chcomp = Tcl_UniCharIsPrint;
	break;
    case STR_IS_PUNCT:
	int Tcl_UniCharIsPunct(int ch);
	chcomp = Tcl_UniCharIsPunct;
	break;
    case STR_IS_SPACE:
	int Tcl_UniCharIsSpace(int ch);
	chcomp = Tcl_UniCharIsSpace;
	break;
    case STR_IS_UPPER:
	int Tcl_UniCharIsUpper(int ch);
	chcomp = Tcl_UniCharIsUpper;
	break;
    case STR_IS_WORD:
	int Tcl_UniCharIsWordChar(int ch);
	chcomp = Tcl_UniCharIsWordChar;
	break;
    case STR_IS_XDIGIT:
	chcomp = UniCharIsHexDigit;
	break;
    default:
	TCL_UNREACHABLE();
    }
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
	length2 = TclUtfToUniChar(string, &ucs4);
	Tcl_ListObjAppendElement(interp, categoriesObj,
	    Tcl_NewStringObj(utf8proc_category_string(ucs4), -1));
    }
    Tcl_SetObjResult(interp, categoriesObj);
    return TCL_OK;
}

/*
 * TclUnicodeNormalizeCmd --
 *
 *	This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd"
 *	commands. See the user documentation for details on what it does.
 *
 * Results:







|







5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
	length2 = TclUtfToUniChar(string, &ucs4);
	Tcl_ListObjAppendElement(interp, categoriesObj,
	    Tcl_NewStringObj(utf8proc_category_string(ucs4), -1));
    }
    Tcl_SetObjResult(interp, categoriesObj);
    return TCL_OK;
}

/*
 * TclUnicodeNormalizeCmd --
 *
 *	This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd"
 *	commands. See the user documentation for details on what it does.
 *
 * Results:
5930
5931
5932
5933
5934
5935
5936










































































































































































































5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954



5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
	    &ds) == NULL) {
	return TCL_ERROR;
    }

    Tcl_DStringResult(interp, &ds);
    return TCL_OK;
}











































































































































































































/*
 * TclInitUnicodeCmd --
 *
 *	This procedure creates the "unicode" Tcl ensemble command. See user
 *	documentation for details on implemented commands.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	Stores the result in the interpreter result.
 */
Tcl_Command
TclInitUnicodeCmd(
    Tcl_Interp *interp)
{
    static const EnsembleImplMap unicodeImplMap[] = {



	{"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0},
	{"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0},
	{"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0},
	{"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0},
	{"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0},
	{"category", TclUnicodeCategoryCmd, NULL, NULL, NULL, 0},
	{NULL, NULL, NULL, NULL, NULL, 0}
    };
    return TclMakeEnsemble(interp, "unicode", unicodeImplMap);
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


















>
>
>




|
|













5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
	    &ds) == NULL) {
	return TCL_ERROR;
    }

    Tcl_DStringResult(interp, &ds);
    return TCL_OK;
}

static int
TclUnicodeToUpperCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    Tcl_Size length1, length2;
    const char *string1;
    char *string2;

    if (objc < 2 || objc > 4) {
	Tcl_WrongNumArgs(interp, 1, objv, "string ?first? ?last?");
	return TCL_ERROR;
    }

    string1 = TclGetStringFromObj(objv[1], &length1);

    if (objc == 2) {
	Tcl_Obj *resultPtr = Tcl_NewStringObj(string1, length1);

	length1 = Tcl_Utf8procUtfToUpper(TclGetString(resultPtr));
	Tcl_SetObjLength(resultPtr, length1);
	Tcl_SetObjResult(interp, resultPtr);
    } else {
	Tcl_Size first, last;
	const char *start, *end;
	Tcl_Obj *resultPtr;

	length1 = Tcl_NumUtfChars(string1, length1) - 1;
	if (TclGetIntForIndexM(interp,objv[2],length1, &first) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (first < 0) {
	    first = 0;
	}
	last = first;

	if ((objc == 4) && (TclGetIntForIndexM(interp, objv[3], length1,
		&last) != TCL_OK)) {
	    return TCL_ERROR;
	}

	if (last >= length1) {
	    last = length1;
	}
	if (last < first) {
	    Tcl_SetObjResult(interp, objv[1]);
	    return TCL_OK;
	}

	string1 = TclGetStringFromObj(objv[1], &length1);
	start = Tcl_UtfAtIndex(string1, first);
	end = Tcl_UtfAtIndex(start, last - first + 1);
	resultPtr = Tcl_NewStringObj(string1, end - string1);
	string2 = TclGetString(resultPtr) + (start - string1);

	length2 = Tcl_Utf8procUtfToUpper(string2);
	Tcl_SetObjLength(resultPtr, length2 + (start - string1));

	Tcl_AppendToObj(resultPtr, end, -1);
	Tcl_SetObjResult(interp, resultPtr);
    }

    return TCL_OK;
}

static int
TclUnicodeToLowerCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    Tcl_Size length1, length2;
    const char *string1;
    char *string2;

    if (objc < 2 || objc > 4) {
	Tcl_WrongNumArgs(interp, 1, objv, "string ?first? ?last?");
	return TCL_ERROR;
    }

    string1 = TclGetStringFromObj(objv[1], &length1);

    if (objc == 2) {
	Tcl_Obj *resultPtr = Tcl_NewStringObj(string1, length1);

	length1 = Tcl_Utf8procUtfToLower(TclGetString(resultPtr));
	Tcl_SetObjLength(resultPtr, length1);
	Tcl_SetObjResult(interp, resultPtr);
    } else {
	Tcl_Size first, last;
	const char *start, *end;
	Tcl_Obj *resultPtr;

	length1 = Tcl_NumUtfChars(string1, length1) - 1;
	if (TclGetIntForIndexM(interp,objv[2],length1, &first) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (first < 0) {
	    first = 0;
	}
	last = first;

	if ((objc == 4) && (TclGetIntForIndexM(interp, objv[3], length1,
		&last) != TCL_OK)) {
	    return TCL_ERROR;
	}

	if (last >= length1) {
	    last = length1;
	}
	if (last < first) {
	    Tcl_SetObjResult(interp, objv[1]);
	    return TCL_OK;
	}

	string1 = TclGetStringFromObj(objv[1], &length1);
	start = Tcl_UtfAtIndex(string1, first);
	end = Tcl_UtfAtIndex(start, last - first + 1);
	resultPtr = Tcl_NewStringObj(string1, end - string1);
	string2 = TclGetString(resultPtr) + (start - string1);

	length2 = Tcl_Utf8procUtfToLower(string2);
	Tcl_SetObjLength(resultPtr, length2 + (start - string1));

	Tcl_AppendToObj(resultPtr, end, -1);
	Tcl_SetObjResult(interp, resultPtr);
    }

    return TCL_OK;
}

static int
TclUnicodeToTitleCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    Tcl_Size length1, length2;
    const char *string1;
    char *string2;
    Tcl_Size Tcl_Utf8procToTitle(char *str);

    if (objc < 2 || objc > 4) {
	Tcl_WrongNumArgs(interp, 1, objv, "string ?first? ?last?");
	return TCL_ERROR;
    }

    string1 = TclGetStringFromObj(objv[1], &length1);

    if (objc == 2) {
	Tcl_Obj *resultPtr = Tcl_NewStringObj(string1, length1);

	length1 = Tcl_Utf8procUtfToTitle(TclGetString(resultPtr));
	Tcl_SetObjLength(resultPtr, length1);
	Tcl_SetObjResult(interp, resultPtr);
    } else {
	Tcl_Size first, last;
	const char *start, *end;
	Tcl_Obj *resultPtr;

	length1 = Tcl_NumUtfChars(string1, length1) - 1;
	if (TclGetIntForIndexM(interp,objv[2],length1, &first) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (first < 0) {
	    first = 0;
	}
	last = first;

	if ((objc == 4) && (TclGetIntForIndexM(interp, objv[3], length1,
		&last) != TCL_OK)) {
	    return TCL_ERROR;
	}

	if (last >= length1) {
	    last = length1;
	}
	if (last < first) {
	    Tcl_SetObjResult(interp, objv[1]);
	    return TCL_OK;
	}

	string1 = TclGetStringFromObj(objv[1], &length1);
	start = Tcl_UtfAtIndex(string1, first);
	end = Tcl_UtfAtIndex(start, last - first + 1);
	resultPtr = Tcl_NewStringObj(string1, end - string1);
	string2 = TclGetString(resultPtr) + (start - string1);

	length2 = Tcl_Utf8procUtfToTitle(string2);
	Tcl_SetObjLength(resultPtr, length2 + (start - string1));

	Tcl_AppendToObj(resultPtr, end, -1);
	Tcl_SetObjResult(interp, resultPtr);
    }

    return TCL_OK;
}

/*
 * TclInitUnicodeCmd --
 *
 *	This procedure creates the "unicode" Tcl ensemble command. See user
 *	documentation for details on implemented commands.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	Stores the result in the interpreter result.
 */
Tcl_Command
TclInitUnicodeCmd(
    Tcl_Interp *interp)
{
    static const EnsembleImplMap unicodeImplMap[] = {
	{"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0},
	{"category", TclUnicodeCategoryCmd, NULL, NULL, NULL, 0},
	{"tolower", TclUnicodeToLowerCmd, NULL, NULL, NULL, 0},
	{"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0},
	{"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0},
	{"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0},
	{"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0},
	{"totitle", TclUnicodeToTitleCmd, NULL, NULL, NULL, 0},
	{"toupper", TclUnicodeToUpperCmd, NULL, NULL, NULL, 0},
	{NULL, NULL, NULL, NULL, NULL, 0}
    };
    return TclMakeEnsemble(interp, "unicode", unicodeImplMap);
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */
Changes to generic/tclUtf.c.
1382
1383
1384
1385
1386
1387
1388


































1389
1390
1391
1392
1393
1394
1395
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}



































/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfToLower --
 *
 *	Convert uppercase characters to lowercase characters in a UTF string







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}
Tcl_Size
Tcl_Utf8procUtfToUpper(
    char *str)			/* String to convert in place. */
{
    int ch, upChar;
    char *src, *dst;
    Tcl_Size len;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	upChar = Tcl_Utf8procUniCharToUpper(ch);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(upChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfToLower --
 *
 *	Convert uppercase characters to lowercase characters in a UTF string
1418
1419
1420
1421
1422
1423
1424



































1425
1426
1427
1428
1429
1430
1431
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = Tcl_UniCharToLower(ch);




































	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(lowChar)) {







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = Tcl_UniCharToLower(ch);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}

Tcl_Size
Tcl_Utf8procUtfToLower(
    char *str)			/* String to convert in place. */
{
    int ch, lowChar;
    char *src, *dst;
    Tcl_Size len;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = Tcl_Utf8procUniCharToLower(ch);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(lowChar)) {
1489
1490
1491
1492
1493
1494
1495














































1496
1497
1498
1499
1500
1501
1502
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = ch;
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}















































	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = ch;
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}
Tcl_Size
Tcl_Utf8procUtfToTitle(
    char *str)			/* String to convert in place. */
{
    int ch, titleChar, lowChar;
    char *src, *dst;
    Tcl_Size len;

    /*
     * Capitalize the first character and then lowercase the rest of the
     * characters until we get to a null.
     */

    src = dst = str;

    if (*src) {
	len = TclUtfToUniChar(src, &ch);
	titleChar = Tcl_Utf8procUniCharToTitle(ch);

	if (len < TclUtfCount(titleChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += len;
    }
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = ch;
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_Utf8procUniCharToLower(lowChar);
	}

	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
1820
1821
1822
1823
1824
1825
1826










1827
1828
1829
1830
1831
1832
1833
	if (GetCaseType(info) & 0x04) {
	    ch -= GetDelta(info);
	}
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}











/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToLower --
 *
 *	Compute the lowercase equivalent of the given Unicode character.







>
>
>
>
>
>
>
>
>
>







1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
	if (GetCaseType(info) & 0x04) {
	    ch -= GetDelta(info);
	}
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}
int
Tcl_Utf8procUniCharToUpper(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	ch = utf8proc_toupper(ch);
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToLower --
 *
 *	Compute the lowercase equivalent of the given Unicode character.
1848
1849
1850
1851
1852
1853
1854










1855
1856
1857
1858
1859
1860
1861
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if ((mode & 0x02) && (mode != 0x7)) {
	    ch += GetDelta(info);
	}










    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}

/*
 *----------------------------------------------------------------------







>
>
>
>
>
>
>
>
>
>







1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	int info = GetUniCharInfo(ch);
	int mode = GetCaseType(info);

	if ((mode & 0x02) && (mode != 0x7)) {
	    ch += GetDelta(info);
	}
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}
int
Tcl_Utf8procUniCharToLower(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	ch = utf8proc_tolower(ch);
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}

/*
 *----------------------------------------------------------------------
1888
1889
1890
1891
1892
1893
1894










1895
1896
1897
1898
1899
1900
1901

	    if (mode != 0x7) {
		ch += ((mode & 0x4) ? -1 : 1);
	    }
	} else if (mode == 0x4) {
	    ch -= GetDelta(info);
	}










    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}

/*
 *----------------------------------------------------------------------







>
>
>
>
>
>
>
>
>
>







2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046

	    if (mode != 0x7) {
		ch += ((mode & 0x4) ? -1 : 1);
	    }
	} else if (mode == 0x4) {
	    ch -= GetDelta(info);
	}
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}
int
Tcl_Utf8procUniCharToTitle(
    int ch)			/* Unicode character to convert. */
{
    if (!UNICODE_OUT_OF_RANGE(ch)) {
	ch = utf8proc_totitle(ch);
    }
    /* Clear away extension bits, if any */
    return ch & 0x1FFFFF;
}

/*
 *----------------------------------------------------------------------
Changes to tests/unicodeNormalize.test.
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

    # Standard arg number tests
    test unicode-badargs-0 {unicode no args} -returnCodes error -body {
        unicode
    } -result {wrong # args: should be "unicode subcommand ?arg ...?"}
    test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
        unicode foo
    } -result {unknown or ambiguous subcommand "foo": must be tonfc, tonfd, tonfkc, or tonfkd}

    variable cmd
    foreach cmd {tonfc tonfd tonfkc tonfkd} {
        test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
            unicode $cmd
        } -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
        test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {







|







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

    # Standard arg number tests
    test unicode-badargs-0 {unicode no args} -returnCodes error -body {
        unicode
    } -result {wrong # args: should be "unicode subcommand ?arg ...?"}
    test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
        unicode foo
    } -result {unknown or ambiguous subcommand "foo": must be category, is, tonfc, tonfd, tonfkc, or tonfkd}

    variable cmd
    foreach cmd {tonfc tonfd tonfkc tonfkd} {
        test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
            unicode $cmd
        } -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
        test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {
Changes to tests/unicodeProperties.test.
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

















122
123
124
125
126
    # Compatibility tests between the string and unicode commands.
	proc testStringUnicodeCompatibility {class} {
        set mismatches "is $class mismatches:"
        foreach codePoint [lseq 0 $::tcltests::ucd::maxCodepoint] {
            set ch [format %c $codePoint]
            if {[string is $class $ch] != [unicode is $class $ch]} {
                append mismatches " " U+[format %x $codePoint]
                break
            }
        }
        return $mismatches
    }

    foreach class {alpha alnum control digit graph lower print space upper wordchar} {
    	test string-vs-unicode-is-$class "string is $class vs unicode" -body {
            testStringUnicodeCompatibility $class
		} -result "is $class mismatches:"
    }

















}

::tcltest::cleanupTests
namespace delete unicode::test
return







<










>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





104
105
106
107
108
109
110

111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
    # Compatibility tests between the string and unicode commands.
	proc testStringUnicodeCompatibility {class} {
        set mismatches "is $class mismatches:"
        foreach codePoint [lseq 0 $::tcltests::ucd::maxCodepoint] {
            set ch [format %c $codePoint]
            if {[string is $class $ch] != [unicode is $class $ch]} {
                append mismatches " " U+[format %x $codePoint]

            }
        }
        return $mismatches
    }

    foreach class {alpha alnum control digit graph lower print space upper wordchar} {
    	test string-vs-unicode-is-$class "string is $class vs unicode" -body {
            testStringUnicodeCompatibility $class
		} -result "is $class mismatches:"
    }

	proc testStringUnicodeCaseConvertCompatibility {tocase} {
        set mismatches "$tocase mismatches:"
        foreach codePoint [lseq 0 $::tcltests::ucd::maxCodepoint] {
            set ch [format %c $codePoint]
            if {[string $tocase $ch] != [unicode $tocase $ch]} {
                append mismatches " " U+[format %x $codePoint]
            break
            }
        }
        return $mismatches
    }
    foreach tocase {tolower toupper totitle} {
    	test string-vs-unicode-$tocase "string $tocase vs unicode" -body {
            testStringUnicodeCaseConvertCompatibility $tocase
		} -result "$tocase mismatches:"
    }
}

::tcltest::cleanupTests
namespace delete unicode::test
return