hash编码

常用的字符串Hash函数还有ELFHash，APHash等等，都是十分简单有效的方法。这些函数使用位运算使得每一个字符都对最后的函数值产生影响。另外还有以MD5和SHA1为代表的密码学杂凑函数，其设计目标是使碰撞极难被找到（不过目前MD5与SHA1均已被发现实际碰撞，不宜再用于安全场合）。

常用字符串哈希函数有 BKDRHash，APHash，DJBHash，JSHash，RSHash，SDBMHash，PJWHash，ELFHash等等。对于以上几种哈希函数，我对其进行了一个小小的评测。

Hash函数	数据1	数据2	数据3	数据4	数据1得分	数据2得分	数据3得分	数据4得分	平均分
BKDRHash	2	0	4774	481	96.55	100	90.95	82.05	92.64
APHash	2	3	4754	493	96.55	88.46	100	51.28	86.28
DJBHash	2	2	4975	474	96.55	92.31	0	100	83.43
JSHash	1	4	4761	506	100	84.62	96.83	17.95	81.94
RSHash	1	0	4861	505	100	100	51.58	20.51	75.96
SDBMHash	3	2	4849	504	93.1	92.31	57.01	23.08	72.41
PJWHash	30	26	4878	513	0	0	43.89	0	21.95
ELFHash	30	26	4878	513	0	0	43.89	0	21.95

其中数据1为100000个字母和数字组成的随机串哈希冲突个数。数据2为100000个有意义的英文句子哈希冲突个数。数据3为数据1的哈希值与 1000003(大素数)求模后存储到线性表中冲突的个数。数据4为数据1的哈希值与10000019(更大素数)求模后存储到线性表中冲突的个数。

经过比较，得出以上平均得分。平均数为平方平均数。可以发现，BKDRHash无论是在实际效果还是编码实现上，都是最突出的。APHash也是较为优秀的算法。DJBHash、JSHash、RSHash与SDBMHash各有千秋。PJWHash与ELFHash效果最差，且二者得分完全相同，因为其算法本质上是一样的。

HashFun.h

// SDBM Hash Function
// Hashes the NUL-terminated string `str` and returns the accumulator
// masked with 0x7FFFFFFF (top bit cleared).
unsigned int SDBMHash(char *str)
{
    unsigned int acc = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        // 65599 == (1 << 6) + (1 << 16) - 1, so this multiply is the same
        // unsigned arithmetic as (acc << 6) + (acc << 16) - acc.
        acc = acc * 65599u + *p;
    }

    return acc & 0x7FFFFFFF;
}

// RS Hash Function
// Multiplicative hash whose multiplier itself changes after every
// character: it starts at 63689 and is scaled by 378551 each step.
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int RSHash(char *str)
{
    const unsigned int growth = 378551;
    unsigned int multiplier   = 63689;
    unsigned int acc          = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = acc * multiplier + *p;
        multiplier *= growth;   // unsigned wrap-around is intended
    }

    return acc & 0x7FFFFFFF;
}

// JS Hash Function
// Starts from the constant 1315423911 and, for each character, XORs the
// accumulator with (acc << 5) + character + (acc >> 2).
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int JSHash(char *str)
{
    unsigned int acc = 1315423911;
    const char *cursor = str;

    while (*cursor != '\0')
    {
        const unsigned int mixed = (acc << 5) + *cursor + (acc >> 2);
        acc ^= mixed;
        ++cursor;
    }

    return acc & 0x7FFFFFFF;
}

// P. J. Weinberger Hash Function
// Shifts the accumulator left by one eighth of the word width per
// character; whenever bits land in the top eighth of the word they are
// folded back into the low part and cleared.
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int PJWHash(char *str)
{
    const unsigned int wordBits  = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    const unsigned int foldShift = (wordBits * 3) / 4;   // three quarters of the word
    const unsigned int stepShift = wordBits / 8;         // one eighth of the word
    const unsigned int topMask   = static_cast<unsigned int>(0xFFFFFFFF) << (wordBits - stepShift);
    unsigned int acc = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = (acc << stepShift) + *p;

        const unsigned int spill = acc & topMask;
        if (spill != 0)
        {
            // Fold the spilled top bits down, then clear the top eighth.
            acc = (acc ^ (spill >> foldShift)) & ~topMask;
        }
    }

    return acc & 0x7FFFFFFF;
}

// ELF Hash Function
// Shifts the accumulator left by 4 bits per character; any bits that
// reach the mask 0xF0000000 are XORed back in (shifted right by 24) and
// then cleared. Returns the accumulator masked with 0x7FFFFFFF.
unsigned int ELFHash(char *str)
{
    unsigned int acc = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = (acc << 4) + *p;

        const unsigned int top = acc & 0xF0000000u;
        if (top == 0)
        {
            continue;   // nothing spilled into the top nibble
        }
        acc ^= (top >> 24);
        acc &= ~top;
    }

    return acc & 0x7FFFFFFF;
}

// BKDR Hash Function
// Polynomial hash: acc = acc * seed + character for every character.
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int BKDRHash(char *str)
{
    // Other seeds of the same family: 31 131 1313 13131 131313 etc..
    const unsigned int seed = 131;
    unsigned int acc = 0;

    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = acc * seed + *p;
    }

    return acc & 0x7FFFFFFF;
}

// DJB Hash Function
// Starts from 5381 and computes acc = acc * 33 + character for every
// character, with the multiply written as a shift plus an add.
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int DJBHash(char *str)
{
    unsigned int acc = 5381;
    const char *cursor = str;

    while (*cursor != '\0')
    {
        acc = ((acc << 5) + acc) + *cursor;   // (acc << 5) + acc == acc * 33
        ++cursor;
    }

    return acc & 0x7FFFFFFF;
}

// AP Hash Function
// Alternates between two mixing steps depending on whether the character
// sits at an even or an odd position (0-based) in the string.
// Returns the accumulator masked with 0x7FFFFFFF.
unsigned int APHash(char *str)
{
    unsigned int acc = 0;
    bool evenPosition = true;   // position 0 is even

    for (const char *p = str; *p != '\0'; ++p)
    {
        const unsigned int ch = static_cast<unsigned int>(*p);

        if (evenPosition)
        {
            acc ^= (acc << 7) ^ ch ^ (acc >> 3);
        }
        else
        {
            acc ^= ~((acc << 11) ^ ch ^ (acc >> 5));
        }
        evenPosition = !evenPosition;
    }

    return acc & 0x7FFFFFFF;
}

以上为转载内容，以下是自己测试Hash算法效果的函数。

main.cpp

#include<cstdlib>
#include<cstring>
#include<ctime>
#include<fstream>
#include<iostream>
#include<string>
#include"HashFun.h"
using namespace std;
/*
  从Dict.txt读入英文单词表，总共24018个单词
  测试哈希函数的分布情况

*/
/*
 * Reads "Dict.txt" from the current directory, one word per line, and stores
 * a heap-allocated NUL-terminated copy of every non-empty line in dict[].
 *
 *   dict - caller-provided array that receives the word pointers. No capacity
 *          is passed in, so the caller must make it large enough for every
 *          non-empty line of the file. Each entry is allocated with new[] and
 *          is owned by the caller (release with delete[]).
 *   len  - set to the number of words stored.
 *
 * A trailing '\r' (CRLF line ending) is stripped and empty lines are skipped.
 * If the file cannot be opened, prints "OpenFailed." and exits with status -1.
 */
void ReadDict(char ** dict,int& len)
{
    std::ifstream dictFile("Dict.txt");
    if (!dictFile)
    {
        std::cout << "OpenFailed." << std::endl;
        std::exit(-1);
    }

    len = 0;
    std::string line;
    // Looping on the result of std::getline stops cleanly at end of file, and
    // std::string imposes no line-length limit, so a long line can never leave
    // the stream stuck in a failed state.
    while (std::getline(dictFile, line))
    {
        if (!line.empty() && line[line.size() - 1] == '\r')
        {
            line.erase(line.size() - 1);
        }
        if (line.empty())
        {
            continue;
        }

        // +1 for the terminating NUL that strcpy writes.
        char *word = new char[line.size() + 1];
        std::strcpy(word, line.c_str());
        dict[len] = word;
        ++len;
    }
}

// Number of slots in the hash table under test.
const int MaxHashLen=49999;

/*
 * Prints distribution statistics for a chained hash table to std::cout.
 *
 *   HashHeight - array of MaxHashLen entries; HashHeight[i] is the number of
 *                words that hashed to slot i (the chain height of that slot).
 *
 * Reported figures:
 *   - words per region, with the table split into 20 equal index regions;
 *   - how many slots have chain height 0..9, plus one combined count for
 *     every slot whose height is 10 or more;
 *   - the tallest chain, the total word count, the average height of the
 *     non-empty chains and the fraction of slots in use;
 *   - the average number of probes for a successful lookup. A chain of
 *     height h costs 1 + 2 + ... + h = h*(h+1)/2 probes in total, and this
 *     is summed exactly for every slot.
 *
 * The averages are reported as 0 when the table holds no words.
 */
void Analise(unsigned int HashHeight[])
{
    const unsigned int AreaNum = 20;
    const unsigned int MaxRecordHeight = 10;   // heights >= this share one histogram bin
    const unsigned int step = MaxHashLen / AreaNum + 1;   // slots per region

    unsigned int AreaHeight[AreaNum];
    unsigned int NumOfHeight[MaxRecordHeight + 1];
    std::memset(AreaHeight, 0, sizeof(AreaHeight));
    std::memset(NumOfHeight, 0, sizeof(NumOfHeight));

    unsigned int wordNum = 0;
    unsigned int MaxHeight = 0;
    unsigned long long totalProbes = 0;

    for (int i = 0; i < MaxHashLen; i++)
    {
        const unsigned int height = HashHeight[i];

        wordNum += height;
        NumOfHeight[height >= MaxRecordHeight ? MaxRecordHeight : height]++;
        if (height > MaxHeight)
            MaxHeight = height;
        AreaHeight[i / step] += height;

        // Each of the `height` words in this chain sits at depth 1..height.
        totalProbes += static_cast<unsigned long long>(height) * (height + 1ULL) / 2;
    }

    std::cout << "--------Hash分布如下" << std::endl;
    for (unsigned int i = 0; i < AreaNum; i++)
        std::cout << "[ " << i * step << " , " << (i + 1) * step << " ]: " << AreaHeight[i] << std::endl;

    std::cout << "--------链表高度情况" << std::endl;
    for (unsigned int i = 0; i < MaxRecordHeight; i++)
        std::cout << "高度为: " << i << "的链表个数为 " << NumOfHeight[i] << std::endl;
    // This last bin also contains chains whose height is exactly MaxRecordHeight.
    std::cout << "高度大于 " << MaxRecordHeight << "的链表个数为 " << NumOfHeight[MaxRecordHeight] << std::endl;

    const unsigned int usedSlots = MaxHashLen - NumOfHeight[0];
    const float averageHeight = usedSlots != 0 ? float(wordNum) / usedSlots : 0.0f;
    const double averageProbes = wordNum != 0 ? static_cast<double>(totalProbes) / wordNum : 0.0;

    std::cout << "--------链表最高高度：" << MaxHeight << std::endl;
    std::cout << "--------单词表个数为：" << wordNum << std::endl;
    std::cout << "--------平均链表高度：" << averageHeight << std::endl;
    std::cout << "--------链表的利用率：" << 1 - float(NumOfHeight[0]) / MaxHashLen << std::endl;
    std::cout << "--------平均查找次数：" << averageProbes << std::endl;
}

/*
 * Benchmark driver: loads the word list with ReadDict, hashes every word into
 * a table of MaxHashLen slots, prints the distribution report via Analise and
 * then the CPU time spent in the hashing loop.
 */
int main()
{
    // Word list storage. ReadDict receives no capacity, so Dict.txt must not
    // contain more non-empty lines than this array has entries.
    char *dict[24100];
    int wordNum = 0;
    unsigned int HashHeight[MaxHashLen];
    std::memset(HashHeight, 0, sizeof(HashHeight));
    ReadDict(dict, wordNum);

    const std::clock_t startTime = std::clock();
    for (int i = 0; i < wordNum; i++)
    {
        // Enable exactly one of the lines below to choose the function under test.
        const unsigned int hashvalue = SDBMHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = RSHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = JSHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = PJWHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = ELFHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = BKDRHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = DJBHash( dict[i] ) % MaxHashLen;
//        const unsigned int hashvalue = APHash( dict[i] ) % MaxHashLen;
        HashHeight[ hashvalue ]++;
        //std::cout<<hashvalue<<std::endl;
    }
    const std::clock_t curTime = std::clock();

    Analise(HashHeight);

    // clock() counts in units of 1/CLOCKS_PER_SEC seconds, and that constant
    // differs between platforms, so convert explicitly to milliseconds.
    const double elapsedMs = static_cast<double>(curTime - startTime) * 1000.0 / CLOCKS_PER_SEC;
    std::cout << "cost time：" << static_cast<long>(elapsedMs) << "ms" << std::endl;

    //for(int i=0;i<MaxHashLen;i++)
      //  std::cout<<HashHeight[i]<<std::endl;

    // Every word was allocated with new[] by ReadDict; release them here.
    for (int i = 0; i < wordNum; i++)
    {
        delete[] dict[i];
    }
    return 0;
}

我的测试说明

测试的内容是：从英汉词典Dict.txt读取24018个单词，将哈希值对49999求余，观察各Hash算法的散列情况。

英汉词典Dict.txt在这里下载：http://wenku.baidu.com/view/4ff90a9851e79b8968022695.html

测试结果如下：

方法	最高高度	平均查找次数
BKDRHash	6	1.2383
SDBMHash	5	1.2387
JSHash	6	1.2397
DJBHash	5	1.2403
RSHash	5	1.2417
PJWHash	5	1.2427
ELFHash	5	1.2427
APHash	6	1.2436

转自：

http://apps.hi.baidu.com/share/detail/39630021

http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html

posted on 2011-07-10 11:41 馒头_雪狼阅读(1324) 评论(2) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

馒头_雪狼

hash编码

导航

公告