Tcl Source Code

Check-in [a614b04af1]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:introduces fast hash algorithm for canonical numeric objects (all wide integer ranges covered now), removing CompareVarKeys (replaced with TclCompareObjKeys)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | tip-534-sebres-fast-number-hash
Files: files | file ages | folders
SHA3-256: a614b04af19e51076df0be819f24470bb472b6ba242aac60789345f920fe6ac7
User & Date: sebres 2019-05-17 15:18:42
Context
2019-05-17
17:43
code review (typo fixed, compiler compat, etc) + more test cases (hashing of not canonical form of i... check-in: 0c26cd8178 user: sebres tags: tip-534-sebres-fast-number-hash
15:18
introduces fast hash algorithm for canonical numeric objects (all wide integer ranges covered now), ... check-in: a614b04af1 user: sebres tags: tip-534-sebres-fast-number-hash
10:57
merge 8.7 check-in: a4a961564b user: sebres tags: tip-534-sebres-fast-number-hash
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclObj.c.

4161
4162
4163
4164
4165
4166
4167























4168
4169
4170
4171
4172
4173
4174
....
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204



4205
4206






4207
4208
4209

4210
4211
4212









4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226

4227
4228



4229

4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
....
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
....
4363
4364
4365
4366
4367
4368
4369



































4370








4371

































































4372
4373

4374
4375
4376
4377
4378
4379
4380
    hPtr->key.objPtr = objPtr;
    Tcl_IncrRefCount(objPtr);
    hPtr->clientData = NULL;

    return hPtr;
}
 























/*
 *----------------------------------------------------------------------
 *
 * TclCompareObjKeys --
 *
 *	Compares two Tcl_Obj * keys.
 *
................................................................................
 */

int
TclCompareObjKeys(
    void *keyPtr,		/* New key to compare. */
    Tcl_HashEntry *hPtr)	/* Existing key to compare. */
{
    Tcl_Obj *objPtr1 = keyPtr;
    Tcl_Obj *objPtr2 = (Tcl_Obj *) hPtr->key.oneWordValue;
    register const char *p1, *p2;
    register size_t l1, l2;

    /* Optimisation for comparing small integers */
    if (objPtr1->typePtr == &tclIntType && objPtr1->bytes == NULL && objPtr2->typePtr == &tclIntType && objPtr2->bytes == NULL) {
        return objPtr1->internalRep.wideValue == objPtr2->internalRep.wideValue;
    }

    /*
     * If the object pointers are the same then they match.
     * OPT: this comparison was moved to the caller

       if (objPtr1 == objPtr2) return 1;



    */







    /*
     * Don't use Tcl_GetStringFromObj as it would prevent l1 and l2 being
     * in a register.

     */

    p1 = TclGetString(objPtr1);









    l1 = objPtr1->length;
    p2 = TclGetString(objPtr2);
    l2 = objPtr2->length;

    /*
     * Only compare if the string representations are of the same length.
     */

    if (l1 == l2) {
	for (;; p1++, p2++, l1--) {
	    if (*p1 != *p2) {
		break;
	    }
	    if (l1 == 0) {

		return 1;
	    }



	}

    }

    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
 * TclFreeObjEntry --
 *
................................................................................
    void *keyPtr)		/* Key from which to compute hash value. */
{
    Tcl_Obj *objPtr = keyPtr;
    TCL_HASH_TYPE result = 0;
    int length;
    const char *string;

    /* Special case: we can compute the hash of integers numerically. */
    if (objPtr->typePtr == &tclIntType && objPtr->bytes == NULL) {
        const Tcl_WideInt objValue = objPtr->internalRep.wideValue;
        register
        Tcl_WideUInt value = (Tcl_WideUInt) objValue;

        if (objValue < 0) { /* wrap to positive (remove sign) */
            value = (Tcl_WideUInt) -objValue;
        }

#ifndef TCL_WIDE_INT_IS_LONG
        /* 
         * For the performance reasons we should try convert small integers
         * as unsigned long if it is safe to cast in.
         * Important: consider sign, so avoid UB by wide -0x8000000000000000.
         */
        if (value-1 < ((Tcl_WideUInt)ULONG_MAX)) {
            register unsigned long lvalue = (unsigned long)value;

            /* important: use do-cycle, because value could be 0 */
            do {
                result += (result << 3) + (lvalue % 10 + '0');
                lvalue /= 10;
            } while (lvalue);

            if (objValue < 0) { /* negative, sign as char */
                result += (result << 3) + '-';
            }
            return result;
        }
#endif

        /* important: use do-cycle, because value could be 0 */
        do {
            result += (result << 3) + (value % 10 + '0');
            value /= 10;
        } while (value);

        if (objValue < 0) { /* negative, sign as char */
            result += (result << 3) + '-';
        }
        return result;
    }

    string = TclGetString(objPtr);
    length = objPtr->length;

    /*
     * I tried a zillion different hash functions and asked many other people
     * for advice. Many people had their own favorite functions, all
     * different, but no-one had much idea why they were good ones. I chose
     * the one below (multiply by 9 and add new character) because of the
     * following reasons:
     *
................................................................................
     *
     * See also HashStringKey in tclHash.c.
     * See also HashString in tclLiteral.c.
     *
     * See [tcl-Feature Request #2958832]
     */




































    string += length;








    while (length--) {

































































        result += (result << 3) + (unsigned char)(*--string);
    }

    return result;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetCommandFromObj --






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|
|
<
|
<
<
<
<
<




|
|
>
>
>
|

>
>
>
>
>
>
|
<
<
>
|

<
>
>
>
>
>
>
>
>
>

<



|


|
|
|
|
|
|
>
|
|
>
>
>
|
>

<
|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|

>







4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
....
4206
4207
4208
4209
4210
4211
4212
4213
4214

4215





4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233


4234
4235
4236

4237
4238
4239
4240
4241
4242
4243
4244
4245
4246

4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267

4268
4269
4270
4271
4272
4273
4274
4275
....
4318
4319
4320
4321
4322
4323
4324















































4325
4326
4327
4328
4329
4330
4331
....
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
    hPtr->key.objPtr = objPtr;
    Tcl_IncrRefCount(objPtr);
    hPtr->clientData = NULL;

    return hPtr;
}
 
static inline int
IntObjIsCanonical(
    Tcl_Obj *objPtr) 
{
    const char *p = objPtr->bytes;

    /* 
     * Canonical integers are:
     *   without or with simple representation (like 0..9)
     *   not hex, octal and not prefixed with + (consider minus-char as sign)
     *   and does not contains spaces
     */
    return (
	!p || objPtr->length == 1 /* simplest cases */
	|| (
	  (
	      (*p > '0') /* not 0x..., 0o..., 00..., space... */
	   || (*p == '-' && p[1] > '0') /* not -0 and -space... */
	  )
	  && (p[objPtr->length-1] >= '0') /* not ...space */
	)
    );
}
/*
 *----------------------------------------------------------------------
 *
 * TclCompareObjKeys --
 *
 *	Compares two Tcl_Obj * keys.
 *
................................................................................
 */

int
TclCompareObjKeys(
    void *keyPtr,		/* New key to compare. */
    Tcl_HashEntry *hPtr)	/* Existing key to compare. */
{
    register Tcl_Obj *objPtr1 = keyPtr;
    register Tcl_Obj *objPtr2 = hPtr->key.objPtr;

    register int l1, l2;






    /*
     * If the object pointers are the same then they match.
     * OPT: this comparison was moved to the caller
     *
     * if (objPtr1 == objPtr2) return 1;
     *
     * Normally we don't need to get strings, because if it is expected, it is
     * already done in TclHashObjKey, this is also covered by assert below.
     */

    /* Optimisation for comparing integer objects */

    if (objPtr1->typePtr == &tclIntType && objPtr2->typePtr == &tclIntType) {
	if (objPtr1->internalRep.wideValue != objPtr2->internalRep.wideValue) {
	    return 0;
	}
	/* 


	 * Integers are equal, so check it is canonical ...
	 */


	if (IntObjIsCanonical(objPtr1) && IntObjIsCanonical(objPtr2)) {
	    return 1;
	}

	/*
	 * Compare its string representations.
	 */
    }

    l1 = objPtr1->length;

    l2 = objPtr2->length;

    /*
     * Only compare string representations of the same length.
     */

    if (l1 != l2) {
        return 0;
    } else {
	register const char *p1 = objPtr1->bytes, *p2 = objPtr2->bytes;

	assert(p1 != NULL && p2 != NULL);
	if (!l1) {
	    return 0;
	}
	do {
	    if (*p1++ != *p2++) {
		return 0;
	    }
	} while (--l1);
    }

    return 1;
}
 
/*
 *----------------------------------------------------------------------
 *
 * TclFreeObjEntry --
 *
................................................................................
    void *keyPtr)		/* Key from which to compute hash value. */
{
    Tcl_Obj *objPtr = keyPtr;
    TCL_HASH_TYPE result = 0;
    int length;
    const char *string;
















































    /*
     * I tried a zillion different hash functions and asked many other people
     * for advice. Many people had their own favorite functions, all
     * different, but no-one had much idea why they were good ones. I chose
     * the one below (multiply by 9 and add new character) because of the
     * following reasons:
     *
................................................................................
     *
     * See also HashStringKey in tclHash.c.
     * See also HashString in tclLiteral.c.
     *
     * See [tcl-Feature Request #2958832]
     */

    /*
     * TIP #534, use fast integer hashing if it is canonical
     */
    if (objPtr->typePtr == &tclIntType && IntObjIsCanonical(objPtr)) {
	Tcl_WideUInt num = objPtr->internalRep.wideValue;
	/* remove sign and hash it differently */
	if (objPtr->internalRep.wideValue < 0) {
	    num = -num;
	    result = (TCL_HASH_TYPE)'-' << 31; /* 45<<31 == 0x(x64?16:0)80000000 */
	}
    #if ((TCL_HASH_TYPE)-1) > 0xffffffff
	/* unsigned 64-bit as unsigned 64-bit integer */
	result += (TCL_HASH_TYPE)objPtr->internalRep.wideValue;
    #else
	/* unsigned 64-bit as sum of parts in 32-bit unsigned */
	result += (TCL_HASH_TYPE)(num / 1000000000)
		+ (TCL_HASH_TYPE)(num % 1000000000);
    #endif
	return result;
    }

    /*
     * Hash string considering numeric (TIP #534), if it looks like a number
     * use fastest string to number conversion, thereby we don't care about
     * possible non-numeric characters, because it is just a hash value.
     */
    result = 0;
    string = TclGetString(objPtr);
    length = objPtr->length;

    if (!length) { return result; }

    if (*string == '-') {
	result = '-';
	string++; length--;
	if (!length) {
	    return result;
	}
    }

    if (length <= 19 && *string <= '9' && *string >= '0') {
    #if ((TCL_HASH_TYPE)-1) > 0xffffffff
        /* hash is 64-bit, assume compiled as x64 */
        Tcl_WideUInt num = 0;
	switch (length) {
	    /* signed 64-bit int is max 19 chars = (+/-)9223372036854775807L */
	    case 19:  num += (*string++ - '0') * 1000000000000000000;
	    case 18:  num += (*string++ - '0') * 100000000000000000;
	    case 17:  num += (*string++ - '0') * 10000000000000000;
	    case 16:  num += (*string++ - '0') * 1000000000000000;
	    case 15:  num += (*string++ - '0') * 100000000000000;
	    case 14:  num += (*string++ - '0') * 10000000000000;
	    case 13:  num += (*string++ - '0') * 1000000000000;
	    case 12:  num += (*string++ - '0') * 100000000000;
	    case 11:  num += (*string++ - '0') * 10000000000;
	    /* signed 32-bit int is max 10 chars = (+/-)2147483647 */
	    case 10:  num += (*string++ - '0') * 1000000000;
	    case  9:  num += (*string++ - '0') * 100000000;
	    case  8:  num += (*string++ - '0') * 10000000;
	    case  7:  num += (*string++ - '0') * 1000000;
	    case  6:  num += (*string++ - '0') * 100000;
	    case  5:  num += (*string++ - '0') * 10000;
	    case  4:  num += (*string++ - '0') * 1000;
	    case  3:  num += (*string++ - '0') * 100;
	    case  2:  num += (*string++ - '0') * 10;
	    case  1:  num += (*string++ - '0');
	}
	/* result considering sign (if result is '-', "negate" numeric) */
	result <<= 31; /* 45<<31 == 0x(x64?16:0)80000000 */
	result += (TCL_HASH_TYPE)num;
    #else 
        /* 32-bit hash (int calculation is faster) */
        unsigned int hnm = 0;
        unsigned int lnm = 0;
	switch (length) {
	    /* high part of hash (wide / 1000000000) */
	    case 19:  hnm += (*string++ - '0') * 100000000 * 10;
	    case 18:  hnm += (*string++ - '0') * 100000000;
	    case 17:  hnm += (*string++ - '0') * 10000000;
	    case 16:  hnm += (*string++ - '0') * 1000000;
	    case 15:  hnm += (*string++ - '0') * 100000;
	    case 14:  hnm += (*string++ - '0') * 10000;
	    case 13:  hnm += (*string++ - '0') * 1000;
	    case 12:  hnm += (*string++ - '0') * 100;
	    case 11:  hnm += (*string++ - '0') * 10;
	    case 10:  hnm += (*string++ - '0');
	    /* low part of hash (wide % 1000000000) */
	    case  9:  lnm += (*string++ - '0') * 100000000;
	    case  8:  lnm += (*string++ - '0') * 10000000;
	    case  7:  lnm += (*string++ - '0') * 1000000;
	    case  6:  lnm += (*string++ - '0') * 100000;
	    case  5:  lnm += (*string++ - '0') * 10000;
	    case  4:  lnm += (*string++ - '0') * 1000;
	    case  3:  lnm += (*string++ - '0') * 100;
	    case  2:  lnm += (*string++ - '0') * 10;
	    case  1:  lnm += (*string++ - '0');
	}
	/* result considering sign (if result is '-', "negate" numeric) */
	result <<= 31; /* 45<<31 == 0x(x64?16:0)80000000 */
	result += (TCL_HASH_TYPE)hnm + (TCL_HASH_TYPE)lnm;
    #endif

        return result;
    }

    /* 
     * Fast string hashing (non-numeric)
     */
    result = (result << 3) + UCHAR(*string);
    while (--length) {
	result += (result << 3) + UCHAR(*++string);
    }

    return result;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetCommandFromObj --

Changes to generic/tclVar.c.

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
....
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
/*
 * Prototypes for the variable hash key methods.
 */

static Tcl_HashEntry *	AllocVarEntry(Tcl_HashTable *tablePtr, void *keyPtr);
static void		FreeVarEntry(Tcl_HashEntry *hPtr);
static int		CompareVarKeys(void *keyPtr, Tcl_HashEntry *hPtr);

static const Tcl_HashKeyType tclVarHashKeyType = {
    TCL_HASH_KEY_TYPE_VERSION,	/* version */
    0,				/* flags */
    TclHashObjKey,		/* hashKeyProc */
    CompareVarKeys,		/* compareKeysProc */
    AllocVarEntry,		/* allocEntryProc */
    FreeVarEntry		/* freeEntryProc */
};

static inline Var *	VarHashCreateVar(TclVarHashTable *tablePtr,
			    Tcl_Obj *key, int *newPtr);
static inline Var *	VarHashFirstVar(TclVarHashTable *tablePtr,
................................................................................
    } else {
	VarHashInvalidateEntry(varPtr);
	TclSetVarUndefined(varPtr);
	VarHashRefCount(varPtr)--;
    }
    Tcl_DecrRefCount(objPtr);
}

static int
CompareVarKeys(
    void *keyPtr,		/* New key to compare. */
    Tcl_HashEntry *hPtr)	/* Existing key to compare. */
{
    Tcl_Obj *objPtr1 = keyPtr;
    Tcl_Obj *objPtr2 = hPtr->key.objPtr;
    register const char *p1, *p2;
    register int l1, l2;

    /* Optimisation for comparing small integers */
    if (objPtr1->typePtr == &tclIntType && objPtr1->bytes == NULL && objPtr2->typePtr == &tclIntType && objPtr2->bytes == NULL) {
        return objPtr1->internalRep.wideValue == objPtr2->internalRep.wideValue;
    }

    /*
     * If the object pointers are the same then they match.
     * OPT: this comparison was moved to the caller
     *
     * if (objPtr1 == objPtr2) return 1;
     */

    /*
     * Don't use Tcl_GetStringFromObj as it would prevent l1 and l2 being in a
     * register.
     */

    p1 = TclGetString(objPtr1);
    l1 = objPtr1->length;
    p2 = TclGetString(objPtr2);
    l2 = objPtr2->length;

    /*
     * Only compare string representations of the same length.
     */

    return ((l1 == l2) && !memcmp(p1, p2, l1));
}
 
/*----------------------------------------------------------------------
 *
 * ArrayDefaultCmd --
 *
 *	This function implements the 'array default' Tcl command.
 *	Refer to the user documentation for details on what it does.






<





|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







22
23
24
25
26
27
28

29
30
31
32
33
34
35
36
37
38
39
40
41
....
6587
6588
6589
6590
6591
6592
6593







































6594
6595
6596
6597
6598
6599
6600
/*
 * Prototypes for the variable hash key methods.
 */

static Tcl_HashEntry *	AllocVarEntry(Tcl_HashTable *tablePtr, void *keyPtr);
static void		FreeVarEntry(Tcl_HashEntry *hPtr);


static const Tcl_HashKeyType tclVarHashKeyType = {
    TCL_HASH_KEY_TYPE_VERSION,	/* version */
    0,				/* flags */
    TclHashObjKey,		/* hashKeyProc */
    TclCompareObjKeys,	/* compareKeysProc */
    AllocVarEntry,		/* allocEntryProc */
    FreeVarEntry		/* freeEntryProc */
};

static inline Var *	VarHashCreateVar(TclVarHashTable *tablePtr,
			    Tcl_Obj *key, int *newPtr);
static inline Var *	VarHashFirstVar(TclVarHashTable *tablePtr,
................................................................................
    } else {
	VarHashInvalidateEntry(varPtr);
	TclSetVarUndefined(varPtr);
	VarHashRefCount(varPtr)--;
    }
    Tcl_DecrRefCount(objPtr);
}







































 
/*----------------------------------------------------------------------
 *
 * ArrayDefaultCmd --
 *
 *	This function implements the 'array default' Tcl command.
 *	Refer to the user documentation for details on what it does.