hash编码
常用的字符串Hash函数还有ELFHash,APHash等等,都是十分简单有效的方法。这些函数使用位运算使得每一个字符都对最后的函数值产生影响。另外还有以MD5和SHA1为代表的密码学杂凑函数,这类函数的设计目标是让碰撞在计算上难以找到(不过MD5和SHA1如今都已被构造出实际碰撞,不应再用于安全场景)。
常用字符串哈希函数有 BKDRHash,APHash,DJBHash,JSHash,RSHash,SDBMHash,PJWHash,ELFHash等等。对于以上几种哈希函数,我对其进行了一个小小的评测。
Hash函数 | 数据1 | 数据2 | 数据3 | 数据4 | 数据1得分 | 数据2得分 | 数据3得分 | 数据4得分 | 平均分 |
BKDRHash | 2 | 0 | 4774 | 481 | 96.55 | 100 | 90.95 | 82.05 | 92.64 |
APHash | 2 | 3 | 4754 | 493 | 96.55 | 88.46 | 100 | 51.28 | 86.28 |
DJBHash | 2 | 2 | 4975 | 474 | 96.55 | 92.31 | 0 | 100 | 83.43 |
JSHash | 1 | 4 | 4761 | 506 | 100 | 84.62 | 96.83 | 17.95 | 81.94 |
RSHash | 1 | 0 | 4861 | 505 | 100 | 100 | 51.58 | 20.51 | 75.96 |
SDBMHash | 3 | 2 | 4849 | 504 | 93.1 | 92.31 | 57.01 | 23.08 | 72.41 |
PJWHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
ELFHash | 30 | 26 | 4878 | 513 | 0 | 0 | 43.89 | 0 | 21.95 |
其中数据1为100000个字母和数字组成的随机串哈希冲突个数。数据2为100000个有意义的英文句子哈希冲突个数。数据3为数据1的哈希值与 1000003(大素数)求模后存储到线性表中冲突的个数。数据4为数据1的哈希值与10000019(更大素数)求模后存储到线性表中冲突的个数。
经过比较,得出以上平均得分。平均数为平方平均数。可以发现,BKDRHash无论是在实际效果还是编码实现上,都是最突出的。APHash也是较为优秀的算法。DJBHash,JSHash,RSHash与SDBMHash各有千秋。PJWHash与ELFHash效果最差,且两者得分完全相同,因为其算法本质上是相同的。
HashFun.h
#ifndef HASHFUN_H
#define HASHFUN_H

// Collection of classic string hash functions.
//
// Every function takes a pointer to a NUL-terminated string, walks it one
// character at a time and returns the accumulated value masked with
// 0x7FFFFFFF (the top bit of a 32-bit result is cleared).
// Characters are read through a plain `char`, so bytes >= 0x80 contribute
// according to the platform's `char` signedness.
// The functions are `inline` so that this header can be included from more
// than one translation unit.

// SDBM Hash Function
inline unsigned int SDBMHash(const char *str)
{
    unsigned int hash = 0;

    while (*str)
    {
        // equivalent to: hash = 65599*hash + (*str++);
        hash = (*str++) + (hash << 6) + (hash << 16) - hash;
    }

    return (hash & 0x7FFFFFFF);
}

// RS Hash Function
inline unsigned int RSHash(const char *str)
{
    const unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;

    while (*str)
    {
        hash = hash * a + (*str++);
        a *= b; // the multiplier itself changes after every character
    }

    return (hash & 0x7FFFFFFF);
}

// JS Hash Function
inline unsigned int JSHash(const char *str)
{
    unsigned int hash = 1315423911;

    while (*str)
    {
        hash ^= ((hash << 5) + (*str++) + (hash >> 2));
    }

    return (hash & 0x7FFFFFFF);
}

// P. J. Weinberger Hash Function
inline unsigned int PJWHash(const char *str)
{
    // All shift amounts are derived from the width of unsigned int.
    const unsigned int BitsInUnsignedInt = sizeof(unsigned int) * 8;
    const unsigned int ThreeQuarters = (BitsInUnsignedInt * 3) / 4;
    const unsigned int OneEighth = BitsInUnsignedInt / 8;
    // Mask selecting the topmost OneEighth bits.
    const unsigned int HighBits = 0xFFFFFFFFu << (BitsInUnsignedInt - OneEighth);
    unsigned int hash = 0;
    unsigned int test = 0;

    while (*str)
    {
        hash = (hash << OneEighth) + (*str++);
        if ((test = hash & HighBits) != 0)
        {
            // Fold the bits that reached the top back into the low part,
            // then clear the top bits.
            hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
        }
    }

    return (hash & 0x7FFFFFFF);
}

// ELF Hash Function
inline unsigned int ELFHash(const char *str)
{
    unsigned int hash = 0;
    unsigned int x = 0;

    while (*str)
    {
        hash = (hash << 4) + (*str++);
        if ((x = hash & 0xF0000000L) != 0)
        {
            // Same folding step as PJWHash, with the constants written out.
            hash ^= (x >> 24);
            hash &= ~x;
        }
    }

    return (hash & 0x7FFFFFFF);
}

// BKDR Hash Function
inline unsigned int BKDRHash(const char *str)
{
    const unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
    unsigned int hash = 0;

    while (*str)
    {
        hash = hash * seed + (*str++);
    }

    return (hash & 0x7FFFFFFF);
}

// DJB Hash Function
inline unsigned int DJBHash(const char *str)
{
    unsigned int hash = 5381;

    while (*str)
    {
        // hash = hash * 33 + c
        hash += (hash << 5) + (*str++);
    }

    return (hash & 0x7FFFFFFF);
}

// AP Hash Function
inline unsigned int APHash(const char *str)
{
    unsigned int hash = 0;

    // Characters at even and odd positions are mixed with different formulas.
    for (int i = 0; *str; i++)
    {
        if ((i & 1) == 0)
        {
            hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
        }
        else
        {
            hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
        }
    }

    return (hash & 0x7FFFFFFF);
}

#endif // HASHFUN_H
以上为转载内容。以下是自己测试Hash算法效果的程序。
main.cpp
#include<iostream>
#include<fstream> #include<cstdlib> #include<cstring> #include<ctime> #include"HashFun.h" using namespace std; /* 从Dict.txt读入英文单词表,总共24018个单词 测试哈希函数的分布情况 */ void ReadDict(char ** dict,int& len) { fstream dictFile; dictFile.open("Dict.txt",ios::in); if(!dictFile) { cout<<"OpenFailed."<<endl; exit(-1);} char tmpString[50]; len=0; int sl; while(!dictFile.eof()) { dictFile.getline(tmpString,50); sl=strlen(tmpString); if(sl>0 && tmpString[sl-1]=='\r') { tmpString[sl-1]='\0'; --sl; } if(sl>0) { dict[len]=new char[sl]; if(dict[len]==NULL) { cout<<"Memory fail."<<endl;exit(-1);} strcpy(dict[len],tmpString); //cout<<dict[len]<<endl; len++; } } dictFile.close(); } const int MaxHashLen=49999; void Analise(unsigned int HashHeight[]) { const unsigned int AreaNum=20; unsigned int AreaHeight[AreaNum]; const unsigned int MaxRecordHeight=10; //记录各自出现高度超过10的放在一起统计 unsigned int NumOfHeight[MaxRecordHeight+1]; memset(AreaHeight,0,sizeof(AreaHeight)); memset(NumOfHeight,0, sizeof(NumOfHeight)); unsigned int step = MaxHashLen/AreaNum+1,wordNum=0,MaxHeight=0; for(int i=0;i<MaxHashLen;i++) { wordNum+=HashHeight[i]; if( HashHeight[i]>=MaxRecordHeight ) NumOfHeight[MaxRecordHeight]++; else NumOfHeight[HashHeight[i]]++; if(HashHeight[i]>MaxHeight) MaxHeight=HashHeight[i]; AreaHeight[i/step]+=HashHeight[i]; } int searchtime=0; for(unsigned int i=1;i<MaxRecordHeight;i++) searchtime+=(i+1)*i*NumOfHeight[i]; //有i*NumOfHeight[i]个的单词在高度为i的链表里 searchtime+=((MaxRecordHeight+MaxHeight)/2+1)*(MaxRecordHeight+MaxHeight)/2*NumOfHeight[MaxRecordHeight]; searchtime/=2; cout<<"--------Hash分布如下"<<endl; for(unsigned int i=0;i<AreaNum;i++) cout<<"[ "<<i*step<<" , "<<(i+1)* step <<" ]: "<<AreaHeight[i]<<endl; cout<<"--------链表高度情况"<<endl; for(unsigned int i=0;i<MaxRecordHeight;i++) cout<<"高度为: "<<i<< "的链表个数为 " <<NumOfHeight[i]<<endl; cout<<"高度大于 "<<MaxRecordHeight<< "的链表个数为 " <<NumOfHeight[MaxRecordHeight]<<endl; cout<<"--------链表最高高度:"<<MaxHeight<<endl; cout<<"--------单词表个数为:"<<wordNum<<endl; 
cout<<"--------平均链表高度:"<<float(wordNum)/(MaxHashLen-NumOfHeight[0])<<endl; cout<<"--------链表的利用率:"<<1-float(NumOfHeight[0])/MaxHashLen<<endl; cout<<"--------平均查找次数:"<<1.0*searchtime/wordNum<<endl; } int main() { char *dict[24100]; //存储单词表 int wordNum; unsigned int HashHeight[MaxHashLen],hashvalue; memset(HashHeight,0,sizeof(HashHeight)); ReadDict(dict,wordNum); unsigned int startTime,curTime; startTime=clock(); for(int i=0;i<wordNum;i++) { hashvalue = SDBMHash( dict[i] ) % MaxHashLen; // hashvalue = RSHash( dict[i] ) % MaxHashLen; // hashvalue = JSHash( dict[i] ) % MaxHashLen; // hashvalue = PJWHash( dict[i] ) % MaxHashLen; // hashvalue = ELFHash( dict[i] ) % MaxHashLen; // hashvalue = BKDRHash( dict[i] ) % MaxHashLen; // hashvalue = DJBHash( dict[i] ) % MaxHashLen; // hashvalue = APHash( dict[i] ) % MaxHashLen; HashHeight[ hashvalue ]++; //cout<<hashvalue<<endl; } curTime=clock(); Analise(HashHeight); cout<<"cost time:"<<(curTime-startTime)/1000 <<"ms"<<endl; //for(int i=0;i<MaxHashLen;i++) // cout<<HashHeight[i]<<endl; return 0; } |
我的测试说明
测试的内容,是从英汉词典Dict.txt读取24018个单词,将哈希值对49999求余,观察Hash算法的散列情况
英汉词典Dict.txt在这里下载:http://wenku.baidu.com/view/4ff90a9851e79b8968022695.html
测试结果如下:
方法 | 最高高度 | 平均查找次数 |
BKDRHash | 6 | 1.2383 |
SDBMHash | 5 | 1.2387 |
JSHash | 6 | 1.2397 |
DJBHash | 5 | 1.2403 |
RSHash | 5 | 1.2417 |
PJWHash | 5 | 1.2427 |
ELFHash | 5 | 1.2427 |
APHash | 6 | 1.2436 |
转自:
http://apps.hi.baidu.com/share/detail/39630021
http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html