Tcl Source Code

Check-in [b00ff15275]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add C API Tcl_UtfToNormalized
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | tip-726
Files: files | file ages | folders
SHA3-256: b00ff15275e85fd520ea0a7213438d0aaf7a2e700073599de59a4f71135c5832
User & Date: apnadkarni 2025-07-27 00:59:37.628
Context
2025-07-27
08:07
Add tests Tcl_UtfToNormalized API check-in: 228a4cdf87 user: apnadkarni tags: tip-726
00:59
Add C API Tcl_UtfToNormalized check-in: b00ff15275 user: apnadkarni tags: tip-726
2025-07-26
06:14
Merge tip-726-plus to eliminate use of tclUniData.c. check-in: 0a1c3bc2fb user: apnadkarni tags: tip-726
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tcl.decls.
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400






2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
declare 694 {
    int Tcl_ListObjRange(Tcl_Interp *interp, Tcl_Obj *objPtr,
	    Tcl_Size start, Tcl_Size end, Tcl_Obj **resultPtrPtr)
}

# TIP 726
declare 695 {
    const char *Tcl_UtfToNormalizedDString(Tcl_Interp *interp,
	    const char *bytes, Tcl_Size length,
	    Tcl_UnicodeNormalizationForm normForm, int profile,
	    Tcl_DString *dsPtr)
}







# ----- BASELINE -- FOR -- 9.1.0 ----- #

declare 696 {
    void TclUnusedStubEntry(void)
}

##############################################################################

# Define the platform specific public Tcl interface. These functions are only
# available on the designated platform.







|




>
>
>
>
>
>



|







2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
declare 694 {
    int Tcl_ListObjRange(Tcl_Interp *interp, Tcl_Obj *objPtr,
	    Tcl_Size start, Tcl_Size end, Tcl_Obj **resultPtrPtr)
}

# TIP 726
declare 695 {
    int Tcl_UtfToNormalizedDString(Tcl_Interp *interp,
	    const char *bytes, Tcl_Size length,
	    Tcl_UnicodeNormalizationForm normForm, int profile,
	    Tcl_DString *dsPtr)
}
declare 696 {
    int Tcl_UtfToNormalized(Tcl_Interp *interp,
	    const char *bytes, Tcl_Size length,
	    Tcl_UnicodeNormalizationForm normForm, int profile,
	    char *bufPtr, Tcl_Size bufLen, Tcl_Size *lengthPtr)
}

# ----- BASELINE -- FOR -- 9.1.0 ----- #

declare 697 {
    void TclUnusedStubEntry(void)
}

##############################################################################

# Define the platform specific public Tcl interface. These functions are only
# available on the designated platform.
Changes to generic/tclCmdMZ.c.
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
	Tcl_WrongNumArgs(interp, 1, objv, "?-profile PROFILE? STRING");
	return TCL_ERROR;
    }

    Tcl_DString ds;
    if (Tcl_UtfToNormalizedDString(interp, Tcl_GetString(objv[objc - 1]),
	    TCL_INDEX_NONE, (Tcl_UnicodeNormalizationForm)clientData, profile,
	    &ds) == NULL) {
	return TCL_ERROR;
    }

    Tcl_DStringResult(interp, &ds);
    return TCL_OK;
}








|







5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
	Tcl_WrongNumArgs(interp, 1, objv, "?-profile PROFILE? STRING");
	return TCL_ERROR;
    }

    Tcl_DString ds;
    if (Tcl_UtfToNormalizedDString(interp, Tcl_GetString(objv[objc - 1]),
	    TCL_INDEX_NONE, (Tcl_UnicodeNormalizationForm)clientData, profile,
	    &ds) != TCL_OK) {
	return TCL_ERROR;
    }

    Tcl_DStringResult(interp, &ds);
    return TCL_OK;
}

Changes to generic/tclDecls.h.
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894






1895
1896
1897
1898
1899
1900
1901
				Tcl_Obj *const objv[],
				Tcl_Obj **resultPtrPtr);
/* 694 */
EXTERN int		Tcl_ListObjRange(Tcl_Interp *interp, Tcl_Obj *objPtr,
				Tcl_Size start, Tcl_Size end,
				Tcl_Obj **resultPtrPtr);
/* 695 */
EXTERN const char *	Tcl_UtfToNormalizedDString(Tcl_Interp *interp,
				const char *bytes, Tcl_Size length,
				Tcl_UnicodeNormalizationForm normForm,
				int profile, Tcl_DString *dsPtr);
/* 696 */






EXTERN void		TclUnusedStubEntry(void);

typedef struct {
    const struct TclPlatStubs *tclPlatStubs;
    const struct TclIntStubs *tclIntStubs;
    const struct TclIntPlatStubs *tclIntPlatStubs;
} TclStubHooks;







|




>
>
>
>
>
>







1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
				Tcl_Obj *const objv[],
				Tcl_Obj **resultPtrPtr);
/* 694 */
EXTERN int		Tcl_ListObjRange(Tcl_Interp *interp, Tcl_Obj *objPtr,
				Tcl_Size start, Tcl_Size end,
				Tcl_Obj **resultPtrPtr);
/* 695 */
EXTERN int		Tcl_UtfToNormalizedDString(Tcl_Interp *interp,
				const char *bytes, Tcl_Size length,
				Tcl_UnicodeNormalizationForm normForm,
				int profile, Tcl_DString *dsPtr);
/* 696 */
EXTERN int		Tcl_UtfToNormalized(Tcl_Interp *interp,
				const char *bytes, Tcl_Size length,
				Tcl_UnicodeNormalizationForm normForm,
				int profile, char *bufPtr, Tcl_Size bufLen,
				Tcl_Size *lengthPtr);
/* 697 */
EXTERN void		TclUnusedStubEntry(void);

typedef struct {
    const struct TclPlatStubs *tclPlatStubs;
    const struct TclIntStubs *tclIntStubs;
    const struct TclIntPlatStubs *tclIntPlatStubs;
} TclStubHooks;
2595
2596
2597
2598
2599
2600
2601
2602

2603
2604
2605
2606
2607
2608
2609
2610
    Tcl_Obj * (*tcl_NewWideUIntObj) (Tcl_WideUInt wideValue); /* 688 */
    void (*tcl_SetWideUIntObj) (Tcl_Obj *objPtr, Tcl_WideUInt uwideValue); /* 689 */
    int (*tcl_IsEmpty) (Tcl_Obj *obj); /* 690 */
    const char * (*tcl_GetEncodingNameForUser) (Tcl_DString *bufPtr); /* 691 */
    int (*tcl_ListObjReverse) (Tcl_Interp *interp, Tcl_Obj *objPtr, Tcl_Obj **resultPtrPtr); /* 692 */
    int (*tcl_ListObjRepeat) (Tcl_Interp *interp, Tcl_Size repeatCount, Tcl_Size objc, Tcl_Obj *const objv[], Tcl_Obj **resultPtrPtr); /* 693 */
    int (*tcl_ListObjRange) (Tcl_Interp *interp, Tcl_Obj *objPtr, Tcl_Size start, Tcl_Size end, Tcl_Obj **resultPtrPtr); /* 694 */
    const char * (*tcl_UtfToNormalizedDString) (Tcl_Interp *interp, const char *bytes, Tcl_Size length, Tcl_UnicodeNormalizationForm normForm, int profile, Tcl_DString *dsPtr); /* 695 */

    void (*tclUnusedStubEntry) (void); /* 696 */
} TclStubs;

extern const TclStubs *tclStubsPtr;

#ifdef __cplusplus
}
#endif







|
>
|







2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
    Tcl_Obj * (*tcl_NewWideUIntObj) (Tcl_WideUInt wideValue); /* 688 */
    void (*tcl_SetWideUIntObj) (Tcl_Obj *objPtr, Tcl_WideUInt uwideValue); /* 689 */
    int (*tcl_IsEmpty) (Tcl_Obj *obj); /* 690 */
    const char * (*tcl_GetEncodingNameForUser) (Tcl_DString *bufPtr); /* 691 */
    int (*tcl_ListObjReverse) (Tcl_Interp *interp, Tcl_Obj *objPtr, Tcl_Obj **resultPtrPtr); /* 692 */
    int (*tcl_ListObjRepeat) (Tcl_Interp *interp, Tcl_Size repeatCount, Tcl_Size objc, Tcl_Obj *const objv[], Tcl_Obj **resultPtrPtr); /* 693 */
    int (*tcl_ListObjRange) (Tcl_Interp *interp, Tcl_Obj *objPtr, Tcl_Size start, Tcl_Size end, Tcl_Obj **resultPtrPtr); /* 694 */
    int (*tcl_UtfToNormalizedDString) (Tcl_Interp *interp, const char *bytes, Tcl_Size length, Tcl_UnicodeNormalizationForm normForm, int profile, Tcl_DString *dsPtr); /* 695 */
    int (*tcl_UtfToNormalized) (Tcl_Interp *interp, const char *bytes, Tcl_Size length, Tcl_UnicodeNormalizationForm normForm, int profile, char *bufPtr, Tcl_Size bufLen, Tcl_Size *lengthPtr); /* 696 */
    void (*tclUnusedStubEntry) (void); /* 697 */
} TclStubs;

extern const TclStubs *tclStubsPtr;

#ifdef __cplusplus
}
#endif
3938
3939
3940
3941
3942
3943
3944


3945
3946
3947
3948
3949
3950
3951
3952
3953
	(tclStubsPtr->tcl_ListObjReverse) /* 692 */
#define Tcl_ListObjRepeat \
	(tclStubsPtr->tcl_ListObjRepeat) /* 693 */
#define Tcl_ListObjRange \
	(tclStubsPtr->tcl_ListObjRange) /* 694 */
#define Tcl_UtfToNormalizedDString \
	(tclStubsPtr->tcl_UtfToNormalizedDString) /* 695 */


#define TclUnusedStubEntry \
	(tclStubsPtr->tclUnusedStubEntry) /* 696 */

#endif /* defined(USE_TCL_STUBS) */

/* !END!: Do not edit above this line. */

#undef TclUnusedStubEntry








>
>

|







3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
	(tclStubsPtr->tcl_ListObjReverse) /* 692 */
#define Tcl_ListObjRepeat \
	(tclStubsPtr->tcl_ListObjRepeat) /* 693 */
#define Tcl_ListObjRange \
	(tclStubsPtr->tcl_ListObjRange) /* 694 */
#define Tcl_UtfToNormalizedDString \
	(tclStubsPtr->tcl_UtfToNormalizedDString) /* 695 */
#define Tcl_UtfToNormalized \
	(tclStubsPtr->tcl_UtfToNormalized) /* 696 */
#define TclUnusedStubEntry \
	(tclStubsPtr->tclUnusedStubEntry) /* 697 */

#endif /* defined(USE_TCL_STUBS) */

/* !END!: Do not edit above this line. */

#undef TclUnusedStubEntry

Changes to generic/tclEncoding.c.
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655

4656
4657
4658
4659

4660
4661
4662
4663
4664

4665
4666
4667

4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685

4686
4687
4688
4689
4690

4691
4692
4693
4694
4695


4696
4697
4698
4699
4700
4701
4702

4703
4704
4705
4706
4707
4708
4709
4710
4711
4712

4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724

4725
4726
4727














































4728














































































4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(interp, objPtr,
		Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}

/*
 * Tcl_UtfToNormalizedDString --
 *
 *	Converts the passed string to a Unicode normalization form storing
 *	it in dsPtr.
 *
 * Results:
 *	Pointer to content of dsPtr on success, and NULL on error.
 *
 */
const char *
Tcl_UtfToNormalizedDString(
    Tcl_Interp *interp, /* Used for error messages. May be NULL */
    const char *bytes,  /* Operand encoded in Tcl internal UTF8 */
    Tcl_Size length,	/* Length bytes[], or -1 if NUL terminated */

    Tcl_UnicodeNormalizationForm normForm, /* TCL_{NFC,NFD,NFKC,NFKC} */
    int profile,        /* TCL_ENCODING_PROFILE_{STRICT,REPLACE} */
    Tcl_DString *dsPtr) /* Converted output string in Tcl internal
			   UTF8 encoding. Init'ed by function */

{
    Tcl_DStringInit(dsPtr);

    if (profile != TCL_ENCODING_PROFILE_REPLACE &&
	profile != TCL_ENCODING_PROFILE_STRICT) {

	Tcl_SetObjResult(interp,
	    Tcl_ObjPrintf("Invalid value %d passed for encoding profile.",
		profile));

	return NULL;
    }

    utf8proc_option_t options = UTF8PROC_STABLE;
    switch (normForm) {
    case TCL_NFC:
	options |= UTF8PROC_COMPOSE;
	break;
    case TCL_NFD:
	options |= UTF8PROC_DECOMPOSE;
	break;
    case TCL_NFKC:
	options |= UTF8PROC_COMPOSE | UTF8PROC_COMPAT;
	break;
    case TCL_NFKD:
	options |= UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;
	break;
    default:

	Tcl_SetObjResult(interp,
	    Tcl_ObjPrintf("Invalid value %d passed for normalization form.",
		normForm));
	return NULL;
    }


    Tcl_Encoding encoding = Tcl_GetEncoding(interp, "utf-8");
    if (encoding == NULL) {
	return NULL;
    }



    int result;
    Tcl_DString dsExt;
    result = Tcl_UtfToExternalDStringEx(interp, encoding, bytes, length,
	profile, &dsExt, NULL);
    /* !!! dsExt needs to be freed even in case of error returns */


    if (result == TCL_OK) {
	utf8proc_uint8_t *normUtf8;
        utf8proc_ssize_t normLength;
	Tcl_Size dsLength = Tcl_DStringLength(&dsExt);
	const utf8proc_uint8_t *dsStr =
	    (utf8proc_uint8_t *)Tcl_DStringValue(&dsExt);
	normLength = utf8proc_map_custom(dsStr, dsLength, &normUtf8,
	    options, NULL, NULL);

	if (normLength < 0) {

            const char *errorMsg = utf8proc_errmsg(normLength);
            Tcl_SetObjResult(
                interp, Tcl_NewStringObj(
                    errorMsg ? errorMsg : "Unicode normalization failed.", -1));
            result = TCL_ERROR;
        } else {
            /* Convert standard UTF8 to internal UTF8 */
            assert(normUtf8);
            result = Tcl_ExternalToUtfDStringEx(interp, encoding,
                (const char *)normUtf8, normLength, profile, dsPtr, NULL);
            free(normUtf8);
        }

    }

    Tcl_DStringFree(&dsExt);














































    Tcl_FreeEncoding(encoding);














































































    return result == TCL_OK ? Tcl_DStringValue(dsPtr) : NULL;
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */







|
<
<
<
<
<
<
<
<
<
<
|
|
|

|
>

|
<
<
>

<
<


>
|
|
|
>
|

















>
|
|
|
<
|
>
|
<
<
<
|
>
>
|


|



>

<
|
<
<
|
<
|


>
|
|
|
|
<
<
<
<
<
<
|
|
>



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|









4633
4634
4635
4636
4637
4638
4639
4640










4641
4642
4643
4644
4645
4646
4647
4648


4649
4650


4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679

4680
4681
4682



4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694

4695


4696

4697
4698
4699
4700
4701
4702
4703
4704






4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(interp, objPtr,
		Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}











static utf8proc_ssize_t
TclUtfNormalize(
    Tcl_Interp *interp,	/* Error messages. May be NULL */
    const char *bytes,  /* Operand encoded in Tcl internal UTF8 */
    Tcl_Size numBytes, /* Length bytes[], or -1 if NUL terminated */
    Tcl_Encoding encoding, /* Encoding - must be UTF-8. Caller passed for reuse */
    Tcl_UnicodeNormalizationForm normForm, /* TCL_{NFC,NFD,NFKC,NFKC} */
    int profile,                  /* TCL_ENCODING_PROFILE_{STRICT,REPLACE} */


    utf8proc_uint8_t **bufPtrPtr) /* On success, output length excluding nul */
{


    if (profile != TCL_ENCODING_PROFILE_REPLACE &&
	profile != TCL_ENCODING_PROFILE_STRICT) {
	if (interp) {
	    Tcl_SetObjResult(interp,
		Tcl_ObjPrintf("Invalid value %d passed for encoding profile.",
		    profile));
	}
	return -1;
    }

    utf8proc_option_t options = UTF8PROC_STABLE;
    switch (normForm) {
    case TCL_NFC:
	options |= UTF8PROC_COMPOSE;
	break;
    case TCL_NFD:
	options |= UTF8PROC_DECOMPOSE;
	break;
    case TCL_NFKC:
	options |= UTF8PROC_COMPOSE | UTF8PROC_COMPAT;
	break;
    case TCL_NFKD:
	options |= UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;
	break;
    default:
	if (interp) {
	    Tcl_SetObjResult(interp,
		Tcl_ObjPrintf("Invalid value %d passed for normalization form.",
		    normForm));

	}
	return -1;
    }




    if (numBytes < 0) {
	numBytes = -1;
    }
    int result;
    Tcl_DString dsExt;
    result = Tcl_UtfToExternalDStringEx(interp, encoding, bytes, numBytes,
	profile, &dsExt, NULL);
    /* !!! dsExt needs to be freed even in case of error returns */

    utf8proc_ssize_t normLength = -1;
    if (result == TCL_OK) {

	normLength =


	    utf8proc_map_custom((utf8proc_uint8_t *)Tcl_DStringValue(&dsExt),

		Tcl_DStringLength(&dsExt), bufPtrPtr, options, NULL, NULL);

	if (normLength < 0) {
	    if (interp) {
		const char *errorMsg = utf8proc_errmsg(normLength);
		Tcl_SetObjResult(interp,
		    Tcl_NewStringObj(
			errorMsg ? errorMsg : "Unicode normalization failed.",






			-1));
	    }
	}
    }

    Tcl_DStringFree(&dsExt);
    return normLength;
}

/*
 * Tcl_UtfToNormalizedDString --
 *
 *	Converts the passed string to a Unicode normalization form storing
 *	it in dsPtr.
 *
 * Results:
 *	A standard Tcl error code.
 *
 * Side effects:
 *      The output string is stored in dsPtr, which is initialized.
 */
int
Tcl_UtfToNormalizedDString(
    Tcl_Interp *interp, /* Used for error messages. May be NULL */
    const char *bytes,  /* Operand encoded in Tcl internal UTF8 */
    Tcl_Size numBytes,	/* Length of bytes[], or -1 if NUL terminated */
    Tcl_UnicodeNormalizationForm normForm, /* TCL_{NFC,NFD,NFKC,NFKC} */
    int profile,        /* TCL_ENCODING_PROFILE_{STRICT,REPLACE} */
    Tcl_DString *dsPtr) /* Converted output string in Tcl internal
			   UTF8 encoding. Init'ed by function */
{
    Tcl_DStringInit(dsPtr);
    Tcl_Encoding encoding = Tcl_GetEncoding(interp, "utf-8");
    if (encoding == NULL) {
	return TCL_ERROR;
    }

    utf8proc_uint8_t *normUtf8;
    utf8proc_ssize_t normLength;

    normLength = TclUtfNormalize(interp, bytes, numBytes, encoding, normForm,
	profile, &normUtf8);
    if (normLength >= 0) {
	assert(normUtf8);
	/* Convert standard UTF8 to internal UTF8 */
	int result = Tcl_ExternalToUtfDStringEx(interp, encoding,
	    (const char *)normUtf8, normLength, profile, dsPtr, NULL);
	if (result != TCL_OK) {
	    normLength = -1;
	}
	free(normUtf8); /* NOT Tcl_Free! */
    }
    Tcl_FreeEncoding(encoding);
    return normLength >= 0 ? TCL_OK : TCL_ERROR;
}

/*
 * Tcl_UtfToNormalized --
 *
 *	Converts the passed string to a Unicode normalization form storing
 *	it in the caller provided buffer.
 *
 * Results:
 *	A standard Tcl error code.
 *
 * Side effects:
 *      The output string is stored in bufPtr.
 */
int
Tcl_UtfToNormalized(
    Tcl_Interp *interp, /* Used for error messages. May be NULL */
    const char *bytes,  /* Operand encoded in Tcl internal UTF8 */
    Tcl_Size numBytes,  /* Length of bytes[], or -1 if NUL terminated */
    Tcl_UnicodeNormalizationForm normForm, /* TCL_{NFC,NFD,NFKC,NFKC} */
    int profile,        /* TCL_ENCODING_PROFILE_{STRICT,REPLACE} */
    char *bufPtr,       /* Pointer to output buffer. Must not be NULL.  */
    Tcl_Size bufLen,    /* Size of bufPtr storage. */
    Tcl_Size *lengthPtr) /* Length of the output string in bytes excluding
			   the trailing NUL byte. May be NULL */
{
    Tcl_Encoding encoding = Tcl_GetEncoding(interp, "utf-8");
    if (encoding == NULL) {
	return TCL_ERROR;
    }

    utf8proc_uint8_t *normUtf8;
    utf8proc_ssize_t normLength;
    normLength = TclUtfNormalize(interp, bytes, numBytes, encoding, normForm,
	profile, &normUtf8);
    Tcl_FreeEncoding(encoding);
    if (normLength < 0) {
	return TCL_ERROR;
    }
    assert(normUtf8);

    /* Convert standard UTF8 to internal UTF8 */
    int result;
    utf8proc_uint8_t *from = normUtf8;
    utf8proc_uint8_t *fromEnd = from + normLength;
    char *to = (utf8proc_uint8_t *)bufPtr;
    char *toEnd = to + bufLen - 1; /* -1 for trailing NUL */
    while (from < fromEnd && to < toEnd) {
	/*
	 * The only difference between UTF-8 and internal UTF-8 is that
	 * internal UTF-8 does not allow NUL bytes in the middle of the string.
	 */
	if (*from) {
	    *to++ = *from++;
	} else {
	    *to++ = 0xC0; /* NUL byte in internal UTF-8 is encoded as C080 */
	    if (to == toEnd) {
		break;
	    }
	    *to++ = 0x80;
	    from++;
	}
    }
    if (from < fromEnd) {
	Tcl_SetObjResult(interp,
	    Tcl_NewStringObj("Output buffer too small", -1));
	result = TCL_CONVERT_NOSPACE;
    } else {
	assert(to < (bufPtr + bufLen - 1));
	*to = '\0'; /* NUL terminate the output */
	if (lengthPtr) {
	    *lengthPtr = to-bufPtr;
	}
	result = TCL_OK;
    }

    free(normUtf8); /* NOT Tcl_Free! */
    return result;
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */
Changes to generic/tclStubInit.c.
1498
1499
1500
1501
1502
1503
1504

1505
1506
1507
1508
    Tcl_SetWideUIntObj, /* 689 */
    Tcl_IsEmpty, /* 690 */
    Tcl_GetEncodingNameForUser, /* 691 */
    Tcl_ListObjReverse, /* 692 */
    Tcl_ListObjRepeat, /* 693 */
    Tcl_ListObjRange, /* 694 */
    Tcl_UtfToNormalizedDString, /* 695 */

    TclUnusedStubEntry, /* 696 */
};

/* !END!: Do not edit above this line. */







>
|



1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
    Tcl_SetWideUIntObj, /* 689 */
    Tcl_IsEmpty, /* 690 */
    Tcl_GetEncodingNameForUser, /* 691 */
    Tcl_ListObjReverse, /* 692 */
    Tcl_ListObjRepeat, /* 693 */
    Tcl_ListObjRange, /* 694 */
    Tcl_UtfToNormalizedDString, /* 695 */
    Tcl_UtfToNormalized, /* 696 */
    TclUnusedStubEntry, /* 697 */
};

/* !END!: Do not edit above this line. */
Changes to generic/tclTest.c.
322
323
324
325
326
327
328

329
330
331
332
333
334
335
336
static Tcl_FSOpenFileChannelProc SimpleOpenFileChannel;
static Tcl_FSListVolumesProc SimpleListVolumes;
static Tcl_FSPathInFilesystemProc SimplePathInFilesystem;
static Tcl_Obj *	SimpleRedirect(Tcl_Obj *pathPtr);
static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory;
static Tcl_ObjCmdProc	TestUtfNextCmd;
static Tcl_ObjCmdProc	TestUtfPrevCmd;

static Tcl_ObjCmdProc	TestUtfNormalizeCmd;
static Tcl_ObjCmdProc	TestNumUtfCharsCmd;
static Tcl_ObjCmdProc	TestGetUniCharCmd;
static Tcl_ObjCmdProc	TestFindFirstCmd;
static Tcl_ObjCmdProc	TestFindLastCmd;
static Tcl_ObjCmdProc	TestHashSystemHashCmd;
static Tcl_ObjCmdProc	TestGetIntForIndexCmd;
static Tcl_ObjCmdProc	TestLutilCmd;







>
|







322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
static Tcl_FSOpenFileChannelProc SimpleOpenFileChannel;
static Tcl_FSListVolumesProc SimpleListVolumes;
static Tcl_FSPathInFilesystemProc SimplePathInFilesystem;
static Tcl_Obj *	SimpleRedirect(Tcl_Obj *pathPtr);
static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory;
static Tcl_ObjCmdProc	TestUtfNextCmd;
static Tcl_ObjCmdProc	TestUtfPrevCmd;
static Tcl_ObjCmdProc	TestUtfToNormalizedDStringCmd;
static Tcl_ObjCmdProc	TestUtfToNormalizedCmd;
static Tcl_ObjCmdProc	TestNumUtfCharsCmd;
static Tcl_ObjCmdProc	TestGetUniCharCmd;
static Tcl_ObjCmdProc	TestFindFirstCmd;
static Tcl_ObjCmdProc	TestFindLastCmd;
static Tcl_ObjCmdProc	TestHashSystemHashCmd;
static Tcl_ObjCmdProc	TestGetIntForIndexCmd;
static Tcl_ObjCmdProc	TestLutilCmd;
734
735
736
737
738
739
740
741
742


743
744
745
746
747
748
749
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testinterpresolver", TestInterpResolverCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testapplylambda", TestApplyLambdaCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testlutil", TestLutilCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testutfnormalize", TestUtfNormalizeCmd, NULL,
	    NULL);


#if defined(_WIN32)
    Tcl_CreateObjCommand(interp, "testhandlecount", TestHandleCountCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testappverifierpresent",
	    TestAppVerifierPresentCmd, NULL, NULL);
#endif








|
|
>
>







735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testinterpresolver", TestInterpResolverCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testapplylambda", TestApplyLambdaCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testlutil", TestLutilCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testutftonormalized",
	TestUtfToNormalizedCmd, NULL, NULL);
    Tcl_CreateObjCommand(interp, "testutftonormalizeddstring",
	TestUtfToNormalizedDStringCmd, NULL, NULL);
#if defined(_WIN32)
    Tcl_CreateObjCommand(interp, "testhandlecount", TestHandleCountCmd,
	    NULL, NULL);
    Tcl_CreateObjCommand(interp, "testappverifierpresent",
	    TestAppVerifierPresentCmd, NULL, NULL);
#endif

9026
9027
9028
9029
9030
9031
9032
9033
9034
9035

















































































9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
    if (l2Obj) {
	Tcl_DecrRefCount(l2Obj);
    }
    return ret;
}

/*
 * TestUtfNormalizeCmd --
 *
 *	This procedure implements the "testutfnormalize" command which

















































































 *	provides a raw interface to the Tcl_UtfToNormalizedDString API.
 *      objv[1] - input byte array encoded in Tcl internal UTF-8. Use
 *		  teststringbytes to construct.
 *	objv[2] - normForm value to pass to Tcl_UtfToNormalizeDString
 *	objv[3] - profile value to pass to Tcl_UtfToNormalizeDString
 *	objv[4] - (optional) length to pass to Tcl_UtfToNormalizeDString. If
 *		  not present, length of objv[1] is used.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	The interpreter result is set to the raw bytes output of the
 *	Tcl_UtfToNormalizeDString call.
 *
 *----------------------------------------------------------------------
 */
static int
TestUtfNormalizeCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Arguments. */
{
    if (objc != 4 && objc != 5) {
	Tcl_WrongNumArgs(interp, 1, objv, "BYTES NORMALFORM PROFILE ?LENGTH?");







|

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



|
|
|







|




|







9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
    if (l2Obj) {
	Tcl_DecrRefCount(l2Obj);
    }
    return ret;
}

/*
 * TestUtfToNormalizedCmd --
 *
 *	This procedure implements the "testutftonormalized" command which
 *	provides a raw interface to the Tcl_UtfToNormalized API.
 *      objv[1] - input byte array encoded in Tcl internal UTF-8. Use
 *		  teststringbytes to construct.
 *	objv[2] - normForm value to pass to Tcl_UtfToNormalized
 *	objv[3] - profile value to pass to Tcl_UtfToNormalized
 *	objv[4] - buffer length to pass to Tcl_UtfToNormalized.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	The interpreter result is set to the raw bytes output of the
 *	Tcl_UtfToNormalized call.
 *
 *----------------------------------------------------------------------
 */
static int
TestUtfToNormalizedCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Arguments. */
{
    if (objc != 5 && objc != 6) {
	Tcl_WrongNumArgs(interp, 1, objv, "BYTES NORMALFORM PROFILE ?LENGTH? BUFLENGTH");
	return TCL_ERROR;
    }
    Tcl_Size bufLen, len, slen;
    unsigned char *s = Tcl_GetBytesFromObj(interp, objv[1], &slen);
    if (s == NULL) {
	return TCL_ERROR;
    }
    int normForm, profile;
    if (Tcl_GetIntFromObj(interp, objv[2], &normForm) != TCL_OK ||
	Tcl_GetIntFromObj(interp, objv[3], &profile) != TCL_OK) {
	return TCL_ERROR;
    }
    if (Tcl_GetSizeIntFromObj(interp, objv[objc-1], &bufLen) != TCL_OK) {
	return TCL_ERROR;
    }
    if (objc == 5) {
	len = slen;
    } else {
	if (Tcl_GetSizeIntFromObj(interp, objv[4], &len) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (len > slen) {
	    Tcl_SetObjResult(interp,
		Tcl_ObjPrintf(
		    "Passed length %" TCL_SIZE_MODIFIER
		    "d is greater than string length %" TCL_SIZE_MODIFIER
		    "d.", len, slen));
	    return TCL_ERROR;
	}
    }
    int result;
    char buffer[20] = {0x80};
    char *bufPtr;
    Tcl_Size bufStored = 0;
    if (bufLen > sizeof(buffer)) {
	bufPtr = (char *)Tcl_Alloc(bufLen);
    } else {
	bufPtr = buffer;
    }
    result = Tcl_UtfToNormalized(interp, (char *) s, len,
	(Tcl_UnicodeNormalizationForm)normForm, profile, bufPtr, bufLen, &bufStored);
    if (result == TCL_OK) {
	/* Return as raw bytes, not string */
	Tcl_SetObjResult(interp,
	    Tcl_NewByteArrayObj(bufPtr, bufStored));
    }
    if (bufPtr != buffer) {
	Tcl_Free(bufPtr);
    }
    return result;
}

/*
 * TestUtfToNormalizedDStringCmd --
 *
 *	This procedure implements the "testutftonormalizedstring" command which
 *	provides a raw interface to the Tcl_UtfToNormalizedDString API.
 *      objv[1] - input byte array encoded in Tcl internal UTF-8. Use
 *		  teststringbytes to construct.
 *	objv[2] - normForm value to pass to Tcl_UtfToNormalizedDString
 *	objv[3] - profile value to pass to Tcl_UtfToNormalizedDString
 *	objv[4] - (optional) length to pass to Tcl_UtfToNormalizedDString. If
 *		  not present, length of objv[1] is used.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	The interpreter result is set to the raw bytes output of the
 *	Tcl_UtfToNormalizedDString call.
 *
 *----------------------------------------------------------------------
 */
static int
TestUtfToNormalizedDStringCmd(
    TCL_UNUSED(void *),
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Arguments. */
{
    if (objc != 4 && objc != 5) {
	Tcl_WrongNumArgs(interp, 1, objv, "BYTES NORMALFORM PROFILE ?LENGTH?");
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089

9090
9091
9092
9093
9094
9095
9096
9097

9098

9099
9100
9101
9102
9103
9104
9105
9106
    if (Tcl_GetIntFromObj(interp, objv[2], &normForm) != TCL_OK ||
	Tcl_GetIntFromObj(interp, objv[3], &profile) != TCL_OK) {
	return TCL_ERROR;
    }
    if (objc == 4) {
	len = slen;
    } else {
	if (Tcl_GetSizeIntFromObj(interp, objv[2], &len) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (len > slen) {
	    Tcl_SetObjResult(interp,
		Tcl_ObjPrintf(
		    "Passed length %" TCL_SIZE_MODIFIER
		    "d is greater than string length %" TCL_SIZE_MODIFIER
		    "d.", len, slen));
	    return TCL_ERROR;
	}
    }
    const char *bytes;
    Tcl_DString ds;

    bytes = Tcl_UtfToNormalizedDString(interp, (char *) s, len,
	(Tcl_UnicodeNormalizationForm)normForm, profile, &ds);
    if (bytes == NULL) {
	return TCL_ERROR;
    }
    /* Return as raw bytes, not string */
    Tcl_SetObjResult(interp,
	Tcl_NewByteArrayObj((unsigned char *)Tcl_DStringValue(&ds), Tcl_DStringLength(&ds)));

    Tcl_DStringFree(&ds);

    return TCL_OK;
}

#ifdef _WIN32
/*
 *----------------------------------------------------------------------
 *
 * TestHandleCountCmd --







|











<

>
|

|
<
<
|
|
|
>
|
>
|







9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171

9172
9173
9174
9175
9176


9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
    if (Tcl_GetIntFromObj(interp, objv[2], &normForm) != TCL_OK ||
	Tcl_GetIntFromObj(interp, objv[3], &profile) != TCL_OK) {
	return TCL_ERROR;
    }
    if (objc == 4) {
	len = slen;
    } else {
	if (Tcl_GetSizeIntFromObj(interp, objv[5], &len) != TCL_OK) {
	    return TCL_ERROR;
	}
	if (len > slen) {
	    Tcl_SetObjResult(interp,
		Tcl_ObjPrintf(
		    "Passed length %" TCL_SIZE_MODIFIER
		    "d is greater than string length %" TCL_SIZE_MODIFIER
		    "d.", len, slen));
	    return TCL_ERROR;
	}
    }

    Tcl_DString ds;
    int result;
    result = Tcl_UtfToNormalizedDString(interp, (char *) s, len,
	(Tcl_UnicodeNormalizationForm)normForm, profile, &ds);
    if (result == TCL_OK) {


	/* Return as raw bytes, not string */
	Tcl_SetObjResult(interp,
	    Tcl_NewByteArrayObj((unsigned char *)Tcl_DStringValue(&ds),
		Tcl_DStringLength(&ds)));
	Tcl_DStringFree(&ds);
    }
    return result;
}

#ifdef _WIN32
/*
 *----------------------------------------------------------------------
 *
 * TestHandleCountCmd --
Changes to tests/unicodeNormalize.test.
266
267
268
269
270
271
272

273
274
275
276
277
278
279
280
281
282

283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
        lassign $testCase lineno chars nfc nfd nfkc nfkd
        set bytes [teststringbytes $chars]
        foreach profile {strict replace} {
            foreach normForm {nfc nfd nfkc nfkd} {
                test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
                    "Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
                    -body {

                        testutfnormalize $bytes $normEnums($normForm) $profileFlags($profile)
                    } -result [teststringbytes [set $normForm]]
            }
        }
    }

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
            "Tcl_UtfToNormalizedDString for $normForm passed nul character" \
            -body {

                testutfnormalize [teststringbytes \0] $normEnums($normForm) $profileFlags($profile)
            } -result \xC0\x80
    }

    # Tcl_UtfToNormalizedDString error cases

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
            "Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
            -body {
                testutfnormalize abc $normEnums($normForm) $profileFlags(tcl8)
            } -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error

        if {0} {
            # TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
            # no way to test normalization of invalid UTF-8. Enable this test
            # once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
            test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
                "Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
                -body {
                    testutfnormalize [binary decode hex EFBF7F] $normEnums($normForm) $profileFlags(strict)
                } -result {} -returnCodes error
        }
    }

    test Tcl_UtfToNormalizedDString-invalid-normalization-form \
            "Tcl_UtfToNormalizedDString invalid value for normalization form" \
            -body {
                testutfnormalize abc 4 $profileFlags(strict)
            } -result {Invalid value 4 passed for normalization form.} -returnCodes error

}

::tcltest::cleanupTests
namespace delete unicode::test
return







>
|









>
|









|









|







|







266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
        lassign $testCase lineno chars nfc nfd nfkc nfkd
        set bytes [teststringbytes $chars]
        foreach profile {strict replace} {
            foreach normForm {nfc nfd nfkc nfkd} {
                test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
                    "Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
                    -body {
                        testutftonormalizeddstring $bytes $normEnums($normForm) \
					        $profileFlags($profile)
                    } -result [teststringbytes [set $normForm]]
            }
        }
    }

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
            "Tcl_UtfToNormalizedDString for $normForm passed nul character" \
            -body {
                testutftonormalizeddstring [teststringbytes \0] \
				$normEnums($normForm) $profileFlags($profile)
            } -result \xC0\x80
    }

    # Tcl_UtfToNormalizedDString error cases

    foreach normForm {nfc nfd nfkc nfkd} {
        test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
            "Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
            -body {
                testutftonormalizeddstring abc $normEnums($normForm) $profileFlags(tcl8)
            } -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error

        if {0} {
            # TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
            # no way to test normalization of invalid UTF-8. Enable this test
            # once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
            test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
                "Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
                -body {
                    testutftonormalizeddstring [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict)
                } -result {} -returnCodes error
        }
    }

    test Tcl_UtfToNormalizedDString-invalid-normalization-form \
            "Tcl_UtfToNormalizedDString invalid value for normalization form" \
            -body {
                testutftonormalizeddstring abc 4 $profileFlags(strict)
            } -result {Invalid value 4 passed for normalization form.} -returnCodes error

}

::tcltest::cleanupTests
namespace delete unicode::test
return