【Hash】字符串哈希

Hash 的核心思想在于,将输入映射到一个值域较小、可以方便比较的范围,典型的用法就是将资源紧张的设备中的不定长字符串转化为定长整数,以达到节省空间的目的

如:printf("This is a string.")  =>  printf("0x12345678")   // 理想哈希算法可将不同的字符串转化为唯一的整数

 

常用hash算法:

  • 00 - RS 哈希函数
unsigned int RSHash(const char* str, unsigned int length)
{
   unsigned int b    = 378551;
   unsigned int a    = 63689;
   unsigned int hash = 0;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = hash * a + (*str);
      a    = a * b;
   }

   return hash;
}
  • 01 - JS 哈希函数 
unsigned int JSHash(const char* str, unsigned int length)
{
   unsigned int hash = 1315423911;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash ^= ((hash << 5) + (*str) + (hash >> 2));
   }

   return hash;
}
  • 02 - PJW 哈希函数 
unsigned int PJWHash(const char* str, unsigned int length)
{
   const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
   const unsigned int ThreeQuarters     = (unsigned int)((BitsInUnsignedInt  * 3) / 4);
   const unsigned int OneEighth         = (unsigned int)(BitsInUnsignedInt / 8);
   const unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
   unsigned int hash = 0;
   unsigned int test = 0;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = (hash << OneEighth) + (*str);

      if ((test = hash & HighBits) != 0)
      {
         hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
      }
   }

   return hash;
}
  • 03 - ELF哈希函数  
unsigned int ELFHash(const char* str, unsigned int length)
{
   unsigned int hash = 0;
   unsigned int x    = 0;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = (hash << 4) + (*str);

      if ((x = hash & 0xF0000000L) != 0)
      {
         hash ^= (x >> 24);
      }

      hash &= ~x;
   }

   return hash;
}
  • 04 - BKDR 哈希函数
unsigned int BKDRHash(const char* str, unsigned int length)
{
   unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */
   unsigned int hash = 0;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = (hash * seed) + (*str);
   }

   return hash;
}
  • 05 - SDBM哈希函数  
unsigned int SDBMHash(const char* str, unsigned int length)
{
   unsigned int hash = 0;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = (*str) + (hash << 6) + (hash << 16) - hash;
   }

   return hash;
}
  • 06 - DJB哈希函数  
unsigned int DJBHash(const char* str, unsigned int length)
{
   unsigned int hash = 5381;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = ((hash << 5) + hash) + (*str);
   }

   return hash;
}
  • 07 - DEK哈希函数  
unsigned int DEKHash(const char* str, unsigned int length)
{
   unsigned int hash = len;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash = ((hash << 5) ^ (hash >> 27)) ^ (*str);
   }

   return hash;
}
  • 08 - AP 哈希函数  
unsigned int APHash(const char* str, unsigned int length)
{
   unsigned int hash = 0xAAAAAAAA;
   unsigned int i    = 0;

   for (i = 0; i < length; ++str, ++i)
   {
      hash ^= ((i & 1) == 0) ? (  (hash <<  7) ^ (*str) * (hash >> 3)) :
                               (~((hash << 11) + ((*str) ^ (hash >> 5))));
   }

   return hash;
}

 

各hash算法冲突率测评

Hash函数 数据1 数据2 数据3 数据4 数据1得分 数据2得分 数据3得分 数据4得分 平均分
BKDRHash 2 0 4774 481 96.55 100 90.95 82.05 92.64
APHash 2 3 4754 493 96.55 88.46 100 51.28 86.28
DJBHash 2 2 4975 474 96.55 92.31 0 100 83.43
JSHash 1 4 4761 506 100 84.62 96.83 17.95 81.94
RSHash 1 0 4861 505 100 100 51.58 20.51 75.96
SDBMHash 3 2 4849 504 93.1 92.31 57.01 23.08 72.41
PJWHash 30 26 4878 513 0 0 43.89 0 21.95
ELFHash 30 26 4878 513 0 0 43.89 0 21.95

数据1:为100000个字母和数字组成的随机串哈希冲突个数

数据2:为100000个有意义的英文句子哈希冲突个数

数据3:为数据1的哈希值与 1000003(大素数)求模后存储到线性表中冲突的个数

数据4:为数据1的哈希值与10000019(更大素数)求模后存储到线性表中冲突的个数。

经过比较,得出以上平均得分。平均数为平方平均数。可以发现,BKDRHash无论是在实际效果还是编码实现中,效果都是最突出的。APHash也是较为优秀的算法。DJBHash,JSHash,RSHash与SDBMHash各有千秋。

PJWHash与ELFHash效果最差,但得分相似,其算法本质是相似的。

 

参考:

    http://www.partow.net/programming/hashfunctions/

    https://www.cnblogs.com/uvsjoh/archive/2012/03/27/2420120.html

posted @ 2022-08-03 15:39  壹点灵异  阅读(126)  评论(0编辑  收藏  举报