1 DJBX33A算法原理
DJBX33A (Daniel J. Bernstein, Times 33 with Addition) 哈希算法速度非常快,并且分类非常好(冲突小,分布均匀),是比较理想的字符串哈希算法,目前被广泛应用在多个软件项目中,例如:PHP、Python、Apache、Nginx 和 BerkeleyDB 等。
DJBX33A算法简单实现:
/*
 * Compute the DJBX33A ("times 33 with addition") hash of a byte string.
 *
 * str: pointer to the first of len bytes to hash; no NUL terminator is
 *      required because exactly len bytes are read.
 * len: number of bytes to read from str.
 *
 * Returns the hash obtained by starting from 0 and applying
 * hash = hash * 33 + byte for every byte, so an empty input hashes to 0.
 * The arithmetic is unsigned and therefore wraps on overflow.
 */
unsigned long djbx33a_hash(const char *str, size_t len)
{
    unsigned long hash = 0U;

    for (size_t i = 0; i < len; ++i) {
        /*
         * Convert through unsigned char: plain char may be signed, and
         * casting it straight to unsigned long would sign-extend bytes
         * >= 0x80, making the result differ between platforms.
         *
         * hash * 33 is the same as ((hash << 5) + hash).
         */
        hash = hash * 33 + (unsigned char)str[i];
    }
    return hash;
}
2 DJBX33A算法典型实现
2.1 PHP(zend_string.h)
/*
 * DJBX33A string hash: seeded with 5381, then hash = hash * 33 + byte for
 * each of the len bytes at str. The most significant bit of the result is
 * always set, so the returned value is never zero.
 */
static zend_always_inline zend_ulong zend_inline_hash_func(const char *str, size_t len)
{
	zend_ulong hash = Z_UL(5381);
	size_t blocks = len >> 3; /* number of complete 8-byte groups */
	size_t tail = len & 7;    /* bytes left over after those groups */

	/* Main loop, unrolled eight times; hash * 33 == (hash << 5) + hash. */
	while (blocks-- != 0) {
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
		hash = hash * 33 + *str++;
	}

	/* Remaining 0..7 bytes: enter at `tail` and fall through to the end. */
	switch (tail) {
		case 7: hash = hash * 33 + *str++; /* fallthrough... */
		case 6: hash = hash * 33 + *str++; /* fallthrough... */
		case 5: hash = hash * 33 + *str++; /* fallthrough... */
		case 4: hash = hash * 33 + *str++; /* fallthrough... */
		case 3: hash = hash * 33 + *str++; /* fallthrough... */
		case 2: hash = hash * 33 + *str++; /* fallthrough... */
		case 1: hash = hash * 33 + *str++; break;
		case 0: break;
EMPTY_SWITCH_DEFAULT_CASE()
	}

	/* Hash value can't be zero, so we always set the high bit */
#if SIZEOF_ZEND_LONG == 8
	return hash | Z_UL(0x8000000000000000);
#elif SIZEOF_ZEND_LONG == 4
	return hash | Z_UL(0x80000000);
#else
# error "Unknown SIZEOF_ZEND_LONG"
#endif
}
/* Associated type and macro definitions, taken from other files */
/* zend_ulong is a 32-bit unsigned integer in this excerpt. */
typedef uint32_t zend_ulong;
/* Z_UL(i) forwards its argument to UINT32_C. */
#define Z_UL(i) UINT32_C(i)
/* UINT32_C(c) pastes the suffix U onto the literal c, making it unsigned. */
#define UINT32_C(c) c ## U
1
其中,DJBX33A 算法哈希(Hash)初始值为 zend_ulong hash = Z_UL(5381),该函数注释如下:
/*
* DJBX33A (Daniel J. Bernstein, Times 33 with Addition)
*
* This is Daniel J. Bernstein's popular `times 33' hash function as
* posted by him years ago on comp.lang.c. It basically uses a function
* like ``hash(i) = hash(i-1) * 33 + str[i]''. This is one of the best
* known hash functions for strings. Because it is both computed very
* fast and distributes very well.
*
* The magic of number 33, i.e. why it works better than many other
* constants, prime or not, has never been adequately explained by
* anyone. So I try an explanation: if one experimentally tests all
* multipliers between 1 and 256 (as RSE did now) one detects that even
* numbers are not useable at all. The remaining 128 odd numbers
* (except for the number 1) work more or less all equally well. They
* all distribute in an acceptable way and this way fill a hash table
* with an average percent of approx. 86%.
*
* If one compares the Chi^2 values of the variants, the number 33 not
* even has the best value. But the number 33 and a few other equally
* good numbers like 17, 31, 63, 127 and 129 have nevertheless a great
* advantage to the remaining numbers in the large set of possible
* multipliers: their multiply operation can be replaced by a faster
* operation based on just one shift plus either a single addition
* or subtraction operation. And because a hash function has to both
* distribute good _and_ has to be very fast to compute, those few
* numbers should be preferred and seems to be the reason why Daniel J.
* Bernstein also preferred it.
*
*
* -- Ralf S. Engelschall <rse@engelschall.com>
*/
static zend_always_inline zend_ulong zend_inline_hash_func(const char *str, size_t len)
{
...
}
2.2 Apache(apr_hash.c)
/*
 * Default key hash: folds every key byte into `hash` using
 * hash = hash * 33 + byte, starting from the caller-supplied value.
 *
 * char_key: the key bytes.
 * klen:     in/out key length. When *klen equals APR_HASH_KEY_STRING the
 *           key is read up to (not including) its terminating NUL and *klen
 *           is overwritten with the number of bytes read; otherwise exactly
 *           *klen bytes are read and *klen is left unchanged.
 * hash:     initial hash value.
 *
 * Returns the resulting hash.
 */
static unsigned int hashfunc_default(const char *char_key, apr_ssize_t *klen,
                                     unsigned int hash)
{
    const unsigned char *const start = (const unsigned char *)char_key;
    const unsigned char *cur = start;

    /*
     * This is the popular `times 33' hash algorithm which is used by
     * perl and also appears in Berkeley DB. This is one of the best
     * known hash functions for strings because it is both computed
     * very fast and distributes very well.
     *
     * The originator may be Dan Bernstein but the code in Berkeley DB
     * cites Chris Torek as the source. The best citation I have found
     * is "Chris Torek, Hash function for text in C, Usenet message
     * <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich
     * Salz's USENIX 1992 paper about INN which can be found at
     * <http://citeseer.nj.nec.com/salz92internetnews.html>.
     *
     * The magic of number 33, i.e. why it works better than many other
     * constants, prime or not, has never been adequately explained by
     * anyone. So I try an explanation: if one experimentally tests all
     * multipliers between 1 and 256 (as I did while writing a low-level
     * data structure library some time ago) one detects that even
     * numbers are not useable at all. The remaining 128 odd numbers
     * (except for the number 1) work more or less all equally well.
     * They all distribute in an acceptable way and this way fill a hash
     * table with an average percent of approx. 86%.
     *
     * If one compares the chi^2 values of the variants (see
     * Bob Jenkins ``Hashing Frequently Asked Questions'' at
     * http://burtleburtle.net/bob/hash/hashfaq.html for a description
     * of chi^2), the number 33 not even has the best value. But the
     * number 33 and a few other equally good numbers like 17, 31, 63,
     * 127 and 129 have nevertheless a great advantage to the remaining
     * numbers in the large set of possible multipliers: their multiply
     * operation can be replaced by a faster operation based on just one
     * shift plus either a single addition or subtraction operation. And
     * because a hash function has to both distribute good _and_ has to
     * be very fast to compute, those few numbers should be preferred.
     *
     * -- Ralf S. Engelschall <rse@engelschall.com>
     */

    /* Explicit length: consume exactly *klen bytes and leave *klen alone. */
    if (*klen != APR_HASH_KEY_STRING) {
        apr_ssize_t remaining = *klen;

        while (remaining != 0) {
            hash = hash * 33 + *cur;
            cur++;
            remaining--;
        }
        return hash;
    }

    /* NUL-terminated key: hash up to the terminator, then report the length. */
    while (*cur != 0) {
        hash = hash * 33 + *cur;
        cur++;
    }
    *klen = cur - start;
    return hash;
}
/* Call site: the seed stored in the table (ht->seed) is supplied as the
 * initial hash value, and klen is passed by address. */
hash = hashfunc_default(key, &klen, ht->seed);
1
2.3 BerkeleyDB(src\hash\hash_func.c)
/* DJBX33A algorithm
 * __ham_func4 --
 * Chris Torek's hash function. Although this function performs only
 * slightly worse than __ham_func5 on strings, it performs horribly on
 * numbers.
 *
 * Hashes the len bytes at key as h = h * 33 + byte, starting from h = 0,
 * and returns h. A zero-length key returns 0. The dbp argument does not
 * take part in the computation.
 *
 * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
 */
u_int32_t
__ham_func4(dbp, key, len)
DB *dbp;
const void *key;
u_int32_t len;
{
const u_int8_t *k;
u_int32_t h, loop;
/* NOTE(review): COMPQUIET presumably only silences an unused-parameter
 * warning for dbp -- confirm against its definition. */
if (dbp != NULL)
COMPQUIET(dbp, NULL);
/* Must bail out here: with len == 0 the loop below would start with
 * loop == 0 and --loop would wrap around instead of terminating. */
if (len == 0)
return (0);
/* HASH4a is one "times 31" step ((h << 5) - h), HASH4b is one "times 33"
 * step ((h << 5) + h); HASH4 selects the times-33 variant. Each step also
 * advances k to the next byte. */
#define HASH4a h = (h << 5) - h + *k++;
#define HASH4b h = (h << 5) + h + *k++;
#define HASH4 HASH4b
h = 0;
k = key;
/* Number of passes through the 8-step loop body, i.e. len / 8 rounded up. */
loop = (len + 8 - 1) >> 3;
/* Duff's device: the switch jumps into the middle of the unrolled do/while
 * so that the first pass performs only len % 8 steps (or all 8 when len is
 * a multiple of 8); every later pass performs 8 steps. */
switch (len & (8 - 1)) {
case 0:
do {
HASH4;
case 7:
HASH4;
case 6:
HASH4;
case 5:
HASH4;
case 4:
HASH4;
case 3:
HASH4;
case 2:
HASH4;
case 1:
HASH4;
} while (--loop);
}
return (h);
}
2.4 Python(pyhash.c)
/*
 * Hash the len bytes starting at src.
 *
 * Empty input returns 0. When Py_HASH_CUTOFF is positive, inputs shorter
 * than the cutoff are hashed inline with DJBX33A (seed 5381), then XORed
 * with the length and with _Py_HashSecret.djbx33a.suffix; every other input
 * is passed to PyHash_Func.hash. A result of -1 is replaced by -2.
 */
Py_hash_t
_Py_HashBytes(const void *src, Py_ssize_t len)
{
    Py_hash_t result;

    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        return 0;
    }

#ifdef Py_HASH_STATS
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
#endif

#if Py_HASH_CUTOFF > 0
    if (len < Py_HASH_CUTOFF) {
        /* Optimize hashing of very small strings with inline DJBX33A. */
        const unsigned char *cur = src;
        Py_uhash_t acc = 5381; /* DJBX33A starts with 5381 */

        /* Enter at the case matching len and fall through, one step per byte;
           acc * 33 + *cur == ((acc << 5) + acc) + *cur */
        switch(len) {
            case 7: acc = acc * 33 + *cur++; /* fallthrough */
            case 6: acc = acc * 33 + *cur++; /* fallthrough */
            case 5: acc = acc * 33 + *cur++; /* fallthrough */
            case 4: acc = acc * 33 + *cur++; /* fallthrough */
            case 3: acc = acc * 33 + *cur++; /* fallthrough */
            case 2: acc = acc * 33 + *cur++; /* fallthrough */
            case 1: acc = acc * 33 + *cur++; break;
            default:
                Py_UNREACHABLE();
        }
        acc ^= len;
        acc ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
        result = (Py_hash_t)acc;
    }
    else
#endif /* Py_HASH_CUTOFF */
        result = PyHash_Func.hash(src, len);

    /* -1 is never returned; it is mapped to -2. */
    return (result == -1) ? -2 : result;
}
/* Supporting definitions for the function above (the ordering suggests they
 * were collected from several places -- Py_ssize_t is used before it is
 * declared here). */
/* Py_hash_t is an alias of Py_ssize_t, which in turn is an alias of ssize_t. */
typedef Py_ssize_t Py_hash_t;
typedef ssize_t Py_ssize_t;
/* ssize_t is __int64 when MS_WIN64 is defined, otherwise a _W64-qualified int. */
#ifdef MS_WIN64
typedef __int64 ssize_t;
#else
typedef _W64 int ssize_t;
#endif
/* Py_UNREACHABLE() expands to a call to abort(). */
#define Py_UNREACHABLE() abort()
/* Describes a hash function implementation; all members are const. */
typedef struct {
/* The hash routine itself: takes a buffer pointer and a length, returns Py_hash_t. */
Py_hash_t (*const hash)(const void *, Py_ssize_t);
/* Name string (presumably the algorithm's name -- confirm). */
const char *name;
/* NOTE(review): presumably the hash output width in bits -- confirm. */
const int hash_bits;
/* NOTE(review): presumably the seed width in bits -- confirm. */
const int seed_bits;
} PyHash_FuncDef;
3 DJBX33A算法相似实现
Tokyo Cabinet、Nginx 等软件项目通过改变每次相乘的倍数(31、37)获得与 DJBX33A 相似的哈希函数。
3.1 Tokyo Cabinet内存数据库
/* tctdb.c */
/* Compute the 16-bit hash value of a record's primary key.
   `pkbuf' specifies the pointer to the region of the primary key.
   `pksiz' specifies the size of the region of the primary key; it must be positive.
   The return value is the low 16 bits of a 32-bit hash that starts at 19780211
   and is updated as hash = hash * 37 + byte for every byte of the key. */
static uint16_t tctdbidxhash(const char *pkbuf, int pksiz){
  assert(pkbuf && pksiz && pksiz >= 0);
  const uint8_t *cur = (const uint8_t *)pkbuf;
  uint32_t acc = 19780211;
  for(int left = pksiz; left != 0; left--){
    acc = acc * 37 + *cur++;
  }
  /* narrowing to uint16_t keeps only the low 16 bits */
  return acc;
}
/* tchdb.c */
/* Compute the bucket index of a record, plus a secondary 8-bit hash.
   `hdb' specifies the hash database object; its `bnum' member is the modulus.
   `kbuf' specifies the pointer to the region of the key.
   `ksiz' specifies the size of the region of the key.
   `hp' specifies the pointer to the variable into which the second hash value is assigned.
   The return value is the bucket index.
   The index hash walks the key front to back (seed 19780211, multiplier 37);
   the second hash walks it back to front (seed 751, multiplier 31, XOR). */
static uint64_t tchdbbidx(TCHDB *hdb, const char *kbuf, int ksiz, uint8_t *hp){
  assert(hdb && kbuf && ksiz >= 0 && hp);
  const uint8_t *head = (const uint8_t *)kbuf;
  const uint8_t *tail = head + ksiz;
  uint64_t bucket = 19780211;
  uint32_t second = 751;
  for(int left = ksiz; left != 0; left--){
    bucket = bucket * 37 + *head++;
    second = (second * 31) ^ *--tail;
  }
  /* only the low 8 bits of the second hash are stored */
  *hp = second;
  return bucket % hdb->bnum;
}
3.2 Nginx服务器
/* ngx_hash.c */
/*
 * Hash the len bytes at data by folding each byte into the running key with
 * ngx_hash(), starting from 0. Returns the final key; an empty input yields 0.
 */
ngx_uint_t
ngx_hash_key(u_char *data, size_t len)
{
    ngx_uint_t   key = 0;
    u_char      *cur = data;
    u_char      *end = data + len;

    while (cur != end) {
        key = ngx_hash(key, *cur);
        cur++;
    }

    return key;
}
/* ngx_hash.h */
/*
 * One step of the multiply-by-31 hash: key * 31 + c, computed as ngx_uint_t.
 * Both arguments are parenthesized so that compound expressions such as
 * ngx_hash(a + b, c & m) expand with the intended precedence.
 */
#define ngx_hash(key, c) ((ngx_uint_t) (key) * 31 + (c))
7