hash

今天看代码的时候注意了一下 hash 以及看了下读书时候的笔记, 主要就是说: hash桶大小以及hash 算法

1、目前hash桶的大小都是素数(和2倍相近的一个素数)

设有一个哈希函数
H( c ) = c % N;
当N取一个合数时,最简单的例子是取2^n,比如说取2^3=8,这时候
H( 11100(二进制) ) = H( 28 ) = 4
H( 10100(二进制) ) = H( 20 )= 4
这时候c的二进制从第4位开始(从右向左数)的高位就“失效”了,也就是说,无论c的第4位及更高位取什么值,只要低3位相同,H( c )的值就一样.这时候这些高位根本不参与H( c )的运算,这样H( c )就无法完整地反映c的特性,增大了导致冲突的几率.

 7ul,          23ul, \
  53ul,         97ul,         193ul,       389ul,       769ul,      \
  1543ul,       3079ul,       6151ul,      12289ul,     24593ul,    \
  49157ul,      98317ul,      196613ul,    393241ul,    786433ul,   \
  1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul, \
  50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,\
  1610612741ul, 3221225473ul, 4294967291ul  \

 但是也有一个问题:如果桶的大小 n 是2的幂(n = 2^k), 那么(n - 1) & hash实际上就是计算出 key 在 tab 中的索引位置; 因为此时满足:(n - 1) & hash = hash % n, 而素数大小的桶只能用取模运算

  • &运算速度快,至少比%取模运算快

2、hash 算法:

1、DJB Hash 也叫 Times 33 哈希算法

Times33的算法很简单,就是不断的乘33;hash(i) = hash(i-1) * 33 + str[i]    Times33在效率和随机性两方面上俱佳;

/*
 * DJB ("times 33") string hash.
 *
 * Starts from the seed 5381 and, for each of the first `length` bytes of
 * `str`, multiplies the running value by 33 and adds the byte. All arithmetic
 * is done in unsigned int, so it wraps around on overflow.
 *
 * Returns the accumulated hash; this is 5381 when `length` is 0.
 */
unsigned int DJBHash(const char* str, unsigned int length){
   const char *cursor = str;
   unsigned int remaining = length;
   unsigned int acc = 5381;

   while (remaining != 0)
   {
      /* acc * 33 is the same value as (acc << 5) + acc, i.e. acc * 32 + acc. */
      acc = acc * 33u + (*cursor);
      ++cursor;
      --remaining;
   }
   return acc;
}

Kernighan 和 Ritchie 在《The C Programming Language》提出BKDR Hash,采用 31、131、1313、13131、131313 等数值作为种子(乘数)计算hash;这个就是数学家的事了

MurmurHash算法:高运算性能,低碰撞率

/*
 * The following hash function is based on MurmurHash64A(), placed into the
 * public domain by Austin Appleby.  See http://murmurhash.googlepages.com/ for
 * details.
 */
/*
 * Hash `len` bytes starting at `key` into a 64-bit value, mixed with `seed`.
 *
 * The input is consumed as whole 64-bit words first; the remaining
 * (len & 7) bytes are then folded in one at a time. `key` must be 8-byte
 * aligned, which is checked with assert().
 */
JEMALLOC_INLINE uint64_t
hash(const void *key, size_t len, uint64_t seed)
{
    const uint64_t m = UINT64_C(0xc6a4a7935bd1e995);
    const int r = 47;
    const uint64_t *words = (const uint64_t *)key;
    const size_t nwords = len / 8;
    const size_t ntail = len & 7;
    const unsigned char *tail;
    uint64_t h = seed ^ (len * m);
    size_t i;

    assert(((uintptr_t)key & 0x7) == 0);

    /* Body: scramble each full word and fold it into the state. */
    for (i = 0; i < nwords; i++) {
        uint64_t k = words[i];

        k *= m;
        k ^= k >> r;
        k *= m;

        h ^= k;
        h *= m;
    }

    /*
     * Tail: byte j of the leftover bytes lands at bit position 8 * j. The
     * extra multiply only happens when at least one tail byte exists.
     */
    tail = (const unsigned char *)(words + nwords);
    if (ntail != 0) {
        for (i = ntail; i > 0; i--)
            h ^= ((uint64_t)tail[i - 1]) << (8 * (i - 1));
        h *= m;
    }

    /* Final avalanche. */
    h ^= h >> r;
    h *= m;
    h ^= h >> r;

    return (h);
}

常见的散列算法有:CRC-32、MD5、SHA-1,SM3,以及广泛使用的 SHA-2(SHA-224、SHA-256、SHA-384、SHA-512)

   目前使用dpdk 以及内核进行三层转发等时, 发现dpdk 、内核 目前使用jhash以及 hash_long比较多,

hash_long实现:

#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

/*
 * Knuth recommends primes in approximately golden ratio to the maximum
 * integer representable by a machine word for multiplicative hashing.
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * These primes are chosen to be bit-sparse, that is operations on
 * them can use shifts and additions instead of multiplications for
 * machines where multiplications are slow.
 */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * Bit-sparse multipliers, each written as a short sum/difference of powers
 * of two. hash_32() multiplies by the 32-bit one; the 64-bit one is the
 * value reproduced by the shift-and-add sequence in hash_64()'s non-64-bit
 * branch.
 */
/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL

/*
 * Pick the word-sized variants: hash_long() forwards to hash_32() or
 * hash_64() and GOLDEN_RATIO_PRIME aliases the matching constant, depending
 * on BITS_PER_LONG. Any other word size is a build error.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * The above primes are actively bad for hashing, since they are
 * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
 * real problems. Besides, the "prime" part is pointless for the
 * multiplicative hash.
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.
 *
 * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
 * (See Knuth vol 3, section 6.4, exercise 9.)
 */
/*
 * In this header GOLDEN_RATIO_64 is the multiplier used by hash_64() when
 * BITS_PER_LONG == 64. GOLDEN_RATIO_32 is defined but not referenced by the
 * functions below; hash_32() still multiplies by GOLDEN_RATIO_PRIME_32.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

/*
 * hash_64 - reduce a 64-bit value to a @bits-bit hash
 * @val: value to hash
 * @bits: number of result bits to keep
 *
 * Multiplies @val by a 64-bit constant and returns the top @bits bits of the
 * (wrapping) 64-bit product.
 *
 * NOTE(review): the final shift is by (64 - bits), so @bits == 0 would shift
 * by the full width of the type, which is undefined in C. Callers presumably
 * always pass 1..64 - confirm.
 */
static __always_inline u64 hash_64(u64 val, unsigned int bits)
{
    u64 hash = val;

#if BITS_PER_LONG == 64
    /* Native 64-bit multiply by the golden-ratio constant. */
    hash = hash * GOLDEN_RATIO_64;
#else
    /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
    /*
     * Shift-and-add expansion of a multiply. n steps through val << 18,
     * << 51, << 54, << 57, << 61 and << 63, so the result is
     * val * (1 - 2^18 - 2^51 + 2^54 - 2^57 + 2^61 + 2^63), which is the
     * GOLDEN_RATIO_PRIME_64 multiplier. That is a different constant from
     * GOLDEN_RATIO_64 above, so the two branches do not produce the same
     * hash values.
     */
    u64 n = hash;
    n <<= 18;
    hash -= n;
    n <<= 33;
    hash -= n;
    n <<= 3;
    hash += n;
    n <<= 3;
    hash -= n;
    n <<= 4;
    hash += n;
    n <<= 2;
    hash += n;
#endif

    /* High bits are more random, so use them. */
    return hash >> (64 - bits);
}

/*
 * hash_32 - reduce a 32-bit value to a @bits-bit hash
 * @val: value to hash
 * @bits: number of result bits to keep
 *
 * Multiplies @val by GOLDEN_RATIO_PRIME_32, truncates the product to 32 bits
 * and returns its top @bits bits.
 */
static inline u32 hash_32(u32 val, unsigned int bits)
{
    /* Number of low-order bits to discard; the high bits are better mixed. */
    const unsigned int drop = 32 - bits;
    /* The compiler may turn this multiply into shifts and adds. */
    u32 product = val * GOLDEN_RATIO_PRIME_32;

    return product >> drop;
}

/*
 * hash_ptr - hash a pointer value down to @bits bits
 * @ptr: pointer whose address is hashed (it is never dereferenced)
 * @bits: number of result bits, passed through to hash_long()
 */
static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
{
    unsigned long addr = (unsigned long)ptr;

    return hash_long(addr, bits);
}

static inline u32 hash32_ptr(const void *ptr)
{
    unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
    val ^= (val >> 32);
#endif
    return (u32)val;
}

#endif /* _LINUX_HASH_H */
View Code

jhash 实现:

#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * http://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>

/* Best hash sizes are of power of two */
/*
 * jhash_size(n) is the table size for an n-bit hash, i.e. 1 << n as a u32.
 * NOTE(review): n is presumably always below 32, since shifting a 32-bit
 * value by its full width is undefined - confirm against callers.
 */
#define jhash_size(n)   ((u32)1<<(n))
/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
/* jhash_mask(n) is jhash_size(n) - 1: the low n bits set. */
#define jhash_mask(n)   (jhash_size(n)-1)

/* __jhash_mix -- mix 3 32-bit values reversibly. */
/*
 * The three arguments are modified in place, so they must be assignable
 * u32 lvalues, and each is evaluated many times. Every row subtracts one
 * word from another, XORs in a rotation of that word, and then adds a
 * second word to it.
 *
 * The expansion is a bare { ... } block rather than do { } while (0), so the
 * usual trailing semicolon becomes an extra empty statement; take care when
 * using it as the unbraced body of an if that has an else.
 */
#define __jhash_mix(a, b, c)            \
{                        \
    a -= c;  a ^= rol32(c, 4);  c += b;    \
    b -= a;  b ^= rol32(a, 6);  a += c;    \
    c -= b;  c ^= rol32(b, 8);  b += a;    \
    a -= c;  a ^= rol32(c, 16); c += b;    \
    b -= a;  b ^= rol32(a, 19); a += c;    \
    c -= b;  c ^= rol32(b, 4);  b += a;    \
}

/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
/*
 * All three arguments are modified in place (they must be assignable u32
 * lvalues and are evaluated repeatedly); callers in this header use only c
 * afterwards. Each row XORs one word into another and then subtracts a
 * rotation of it.
 *
 * Like __jhash_mix, this expands to a bare { ... } block, so a trailing
 * semicolon adds an empty statement after it.
 */
#define __jhash_final(a, b, c)            \
{                        \
    c ^= b; c -= rol32(b, 14);        \
    a ^= c; a -= rol32(c, 11);        \
    b ^= a; b -= rol32(a, 25);        \
    c ^= b; c -= rol32(b, 16);        \
    a ^= c; a -= rol32(c, 4);        \
    b ^= a; b -= rol32(a, 14);        \
    c ^= b; c -= rol32(b, 24);        \
}

/* An arbitrary initial parameter */
/*
 * Added, together with the key length and the caller's initval, to form the
 * starting state in jhash(), jhash2() and the jhash_Nwords() helpers.
 */
#define JHASH_INITVAL        0xdeadbeef

/* jhash - hash an arbitrary key
 * @k: sequence of bytes as key
 * @length: the length of the key, in bytes
 * @initval: the previous hash, or an arbitrary value
 *
 * The generic version, hashes an arbitrary sequence of bytes.
 * No alignment or length assumptions are made about the input key.
 *
 * The key is consumed in 12-byte blocks (three 32-bit words read in CPU byte
 * order) while more than 12 bytes remain; the final 1..12 bytes are added
 * byte by byte and then passed through __jhash_final(). A zero-length key
 * skips the final mix and returns the initial state unchanged.
 *
 * Returns the hash value of the key. The result depends on endianness.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
    u32 a, b, c;
    const u8 *k = key;

    /* Set up the internal state */
    a = b = c = JHASH_INITVAL + length + initval;

    /* All but the last block: affect some 32 bits of (a,b,c) */
    while (length > 12) {
        a += __get_unaligned_cpu32(k);
        b += __get_unaligned_cpu32(k + 4);
        c += __get_unaligned_cpu32(k + 8);
        __jhash_mix(a, b, c);
        length -= 12;
        k += 12;
    }
    /* Last block: affect all 32 bits of (c) */
    /* All the case statements fall through */
    /*
     * Entering at case N adds bytes k[N-1] down to k[0]. Within each word the
     * lowest-indexed byte goes into the least significant position: bytes
     * 0..3 feed a, 4..7 feed b and 8..11 feed c.
     */
    switch (length) {
    case 12: c += (u32)k[11]<<24;
    case 11: c += (u32)k[10]<<16;
    case 10: c += (u32)k[9]<<8;
    case 9:  c += k[8];
    case 8:  b += (u32)k[7]<<24;
    case 7:  b += (u32)k[6]<<16;
    case 6:  b += (u32)k[5]<<8;
    case 5:  b += k[4];
    case 4:  a += (u32)k[3]<<24;
    case 3:  a += (u32)k[2]<<16;
    case 2:  a += (u32)k[1]<<8;
    case 1:  a += k[0];
         __jhash_final(a, b, c);
    case 0: /* Nothing left to add */
        break;
    }

    return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Words are mixed three at a time while more than three remain; the last
 * one to three words are added and run through __jhash_final(). An empty
 * key returns the initial state without a final mix.
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
    /* Starting state: the key length is counted in bytes (length * 4). */
    const u32 seed = JHASH_INITVAL + (length<<2) + initval;
    u32 a = seed;
    u32 b = seed;
    u32 c = seed;

    /* Bulk of the key: full three-word groups, always leaving 1..3 words. */
    for (; length > 3; length -= 3, k += 3) {
        a += k[0];
        b += k[1];
        c += k[2];
        __jhash_mix(a, b, c);
    }

    /* Nothing left to add: no final mix. */
    if (length == 0)
        return c;

    /* Remaining 1..3 words, highest index first. */
    if (length == 3)
        c += k[2];
    if (length >= 2)
        b += k[1];
    a += k[0];
    __jhash_final(a, b, c);

    return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 *
 * Adds @initval to each of the three words, applies __jhash_final() and
 * returns the resulting third word. Callers with fewer than three words
 * pass 0 for the unused ones.
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
    u32 x = a + initval;
    u32 y = b + initval;
    u32 z = c + initval;

    __jhash_final(x, y, z);

    return z;
}

/* jhash_3words - hash three u32 words; the seed includes the 12-byte length. */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
    const u32 seed = initval + JHASH_INITVAL + (3 << 2);

    return __jhash_nwords(a, b, c, seed);
}

/* jhash_2words - hash two u32 words; the seed includes the 8-byte length. */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
    const u32 seed = initval + JHASH_INITVAL + (2 << 2);

    return __jhash_nwords(a, b, 0, seed);
}

/* jhash_1word - hash one u32 word; the seed includes the 4-byte length. */
static inline u32 jhash_1word(u32 a, u32 initval)
{
    const u32 seed = initval + JHASH_INITVAL + (1 << 2);

    return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */
View Code

 

posted @ 2020-07-30 23:52  codestacklinuxer  阅读(194)  评论(0编辑  收藏  举报