实现哈希表

哈希表原理:输入(key, value) -> 通过散列函数生成hashkey -> 将(key,value)放入hashkey对应的bucket

散列函数满足以下的条件
1、对输入值运算,得到一个固定长度的摘要(Hash value);
2、不同的输入值可能对应同样的输出值;
3、散列函数的输出值尽量接近均匀分布,即输出值y的分布函数F(y)=y/m, m为散列函数的最大值;
4、x的微小变化可以使f(x)发生非常大的变化,即所谓“雪崩效应”(Avalanche effect),即|df(x)/dx| >> 1;


哈希冲突(Hash collision)是无法避免的。哈希冲突的处理方法:
1, 链地址法

//HashTable.h
// Shorthand for unsigned int; the HashTable code uses it for bucket indices.
typedef unsigned int UINT;

// One entry of a bucket's singly linked collision chain.
class Node{
public:
// Constructor taking the entry's key and the string to store; defined out of line.
Node(int key, const string & str);
    // Key this entry was inserted under.
    int key;
    // String stored for the key.
    string value;
    // Next entry in the same chain; 0 marks the end of the chain.
    Node * next;
};

// Fixed-size hash table from int keys to strings. Collisions are resolved by
// separate chaining: each of the SIZE buckets heads a singly linked list of Nodes.
// NOTE(review): no destructor is declared here, so nodes allocated by Insert are
// never released by this class.
class HashTable{
public:
    // Creates a table whose buckets are all empty.
    HashTable();
    // Appends (key, value) to the end of the chain of key's bucket.
    // Entries already stored under the same key are left in place.
    bool Insert(int key, const string & value);
    // Reports whether at least one entry carries this key.
    bool Find(int key);
    // Returns a reference to the value stored under key; the key must already be present.
    string & operator[](int key);
private:
    // Chain-walking helper behind the public Insert; node addresses a link of a bucket's chain.
    bool Insert(Node ** node, int key, const string & value);
    // Returns the first entry stored under key, or 0 when there is none.
    // Declared without the "HashTable::" prefix: a qualified name on an in-class
    // member declaration is ill-formed in standard C++ and rejected by GCC/Clang.
    Node * FindNode(int key);
    // Hash function: maps a key to a bucket index in [0, SIZE).
    unsigned int hasher(int key);
    // Number of buckets.
    enum{SIZE = 100};
    // Bucket heads; 0 means the bucket is empty.
    Node * nodes[SIZE];
};

//HashTable.cpp
// Builds a detached chain entry: stores the key, copies the string, and has no successor yet.
Node::Node(int Key, const string & str)
    : key(Key),
      value(str),
      next(0)
{
}

// Starts with every bucket empty (a null head pointer).
HashTable::HashTable(){
    for(int bucket = 0; bucket < SIZE; ++bucket){
        nodes[bucket] = 0;
    }
}

// Simplest possible hash: the magnitude of the key modulo the bucket count.
// Returns a bucket index in [0, SIZE).
unsigned int HashTable::hasher(int key){
    // Take the magnitude in unsigned arithmetic. abs(INT_MIN) is not representable
    // as an int (undefined behavior), and a negative remainder converted to
    // unsigned would index far outside the bucket array.
    unsigned int magnitude = static_cast<unsigned int>(key);
    if(key < 0){
        magnitude = 0u - magnitude;
    }
    return magnitude % SIZE;
}

// Stores (key, value) at the end of the chain of key's bucket.
// Duplicate keys are not detected: a second insert with the same key adds a
// second entry behind the first one.
// Returns true (the chain helper reports success once the node is linked in).
bool HashTable::Insert(int key, const std::string &value){
    UINT adr = hasher(key);
    // Start the chain walk at the bucket head itself, so the empty-bucket case
    // and the collision case share one path and every path returns a value.
    // (The original empty-bucket branch fell off the end without returning.)
    return Insert(&nodes[adr], key, value);
}

// Walks the chain starting at the link `next` points to and hangs a new node
// holding (key, value) on the first null link. Always returns true.
bool HashTable::Insert(Node * * next, int key, const string & value){
    Node * * link = next;
    while(*link != 0){
        link = &(*link)->next;
    }
    *link = new Node(key, value);
    return true;
}

bool HashTable::Find(int key){
    UINT adr = hasher(key);
    Node *  node = nodes[adr];
    if(node == 0){
        return false;
    }else{
        do{
            if(node->key == key){
                return true;
            }else{
                node = node->next;
            }
        }while(node != 0);
        return false;
    }
}

// Returns the first entry in key's bucket chain whose key matches, or 0 if
// the chain is empty or holds no such key.
Node * HashTable::FindNode(int key){
    Node * cur = nodes[hasher(key)];
    while(cur != 0 && cur->key != key){
        cur = cur->next;
    }
    return cur;
}

// Returns a mutable reference to the string stored under key.
// The key must already be present: a missing key trips the assert below, and
// when assertions are compiled out (NDEBUG) the null pointer is dereferenced.
string & HashTable::operator[](int key){
    Node * node = FindNode(key);
    assert(node != 0);
    return node->value; 
}

//main.cpp
int main()
{
    HashTable ht;
    ht.Insert(1, "you");
    string value = ht[1];
    cout << value << endl;
    ht.Insert(101, "girl");
    value = ht[101];
    cout << value << endl;
    ht.Insert(201, "boy");
    value = ht[201];
    cout << value << endl;
    ht[201] = "man";
    cout << ht[201] << endl;
    cin.get();
    return 0;
}

 2, 开放地址法
为每个Hash值,建立一个Hash桶(Bucket),哈希桶的个数是固定的,桶的容量也是固定的。
好处是查表的最大开销是可以确定的,因为最多处理的冲突数是确定的,所以算法的时间复杂度为O(1)+O(m),其中m为Hash桶容量。
坏处是新建的表项可能会由于冲突过多,而不能装入Hash表中。
http://www.360doc.com/content/13/0108/16/8363527_258987810.shtml

3,线性探测再散列

//StringHash.h
#define MAXTABLELEN 1024    // default size of the hash index table
typedef struct  _HASHTABLE  {    // one slot of the hash index table
    // First check hash of the string occupying this slot.
    long nHashA;        
    // Second check hash of the string occupying this slot.
    long nHashB;        
    // True once the slot is occupied.
    bool bExists;    
}HASHTABLE, *PHASHTABLE ; 
// Set of strings kept in a fixed-length hash index table. Each slot stores two
// check hashes of a string rather than the string itself.
// NOTE(review): the class holds a raw HASHTABLE pointer and declares a destructor
// but no copy constructor or copy assignment — confirm instances are never copied.
class StringHash  {
public: 
    // nTableLength: number of slots in the hash index table.
    StringHash(const long nTableLength = MAXTABLELEN); 
    ~StringHash(void);   
private: 
    // Lookup table read by HashString; 0x500 entries.
    unsigned long cryptTable[0x500];  
    unsigned long m_tablelength;    // length of the hash index table
    // The hash index table itself (m_tablelength slots).
    HASHTABLE *m_HashIndexTable;
    void InitCryptTable(); // fills cryptTable before any hashing is done
    unsigned long HashString(const string &lpszString, unsigned long dwHashType); // hash function; dwHashType selects the hash variant
public:  
    // Records url in the table; returns false when no free slot is left.
    bool Hash(string url);  
    unsigned long Hashed(string url); // checks whether url was hashed before: returns its slot if so, otherwise -1 (as an unsigned long)
};  

//StringHash.c
// Builds the crypt table and allocates an index table of nTableLength empty slots.
// A non-positive nTableLength falls back to MAXTABLELEN: a length of 0 would make
// Hash()/Hashed() evaluate "nHash % 0", and a negative length would wrap around
// when stored in the unsigned m_tablelength and when passed to new[].
StringHash::StringHash(const long nTableLength )  {
    InitCryptTable();
    const long slotCount = (nTableLength > 0) ? nTableLength : MAXTABLELEN;
    m_tablelength = slotCount;
    m_HashIndexTable = new HASHTABLE[slotCount];
    // The index is a long, matching the bound: an int counter could overflow
    // before reaching a length larger than INT_MAX.
    for ( long i = 0; i < slotCount; i++ ) {
        m_HashIndexTable[i].nHashA = -1;
        m_HashIndexTable[i].nHashB = -1;
        m_HashIndexTable[i].bExists = false;
    }
}
// Releases the index table, if there is one, and resets the bookkeeping fields.
StringHash::~StringHash(void)  {
    if ( m_HashIndexTable == NULL ) {
        return;
    }
    delete [] m_HashIndexTable;
    m_HashIndexTable = NULL;
    m_tablelength = 0;
}
// Fills all 0x500 entries of cryptTable from a fixed-seed pseudo-random sequence.
// Entries are generated column by column: for each of the 0x100 columns, five
// values are written 0x100 entries apart.
void StringHash::InitCryptTable(){
    unsigned long seed = 0x00100001;
    for ( unsigned long column = 0; column < 0x100; ++column ) {
        for ( unsigned long row = 0; row < 5; ++row ) {
            // Two generator steps per entry: the first supplies the high
            // 16 bits, the second the low 16 bits.
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long high = (seed & 0xFFFF) << 0x10;
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long low = (seed & 0xFFFF);
            cryptTable[row * 0x100 + column] = ( high | low );
        }
    }
}
// Hashes lpszString case-insensitively (each byte goes through toupper).
// dwHashType selects which 0x100-entry section of cryptTable is used, so
// different values yield independent hashes of the same string. It must be
// below 5, because the index (dwHashType << 8) + ch has to stay inside the
// 0x500-entry table.
// Hashing stops at the first NUL byte of the string.
unsigned long StringHash::HashString(const string& lpszString, unsigned long dwHashType){
    // View the bytes as unsigned so characters above 0x7F give a non-negative
    // table offset. The original "const_cast(...)" named no target type and
    // did not compile; no constness needs to be removed for read-only access.
    const unsigned char *key = reinterpret_cast<const unsigned char *>(lpszString.c_str());
    unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
    while(*key != 0){
        const int ch = toupper(*key++);
        seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}
// Looks lpszString up in the index table.
// Returns the slot whose two check hashes match, or -1 (converted to
// unsigned long) when an empty slot is reached or the whole table was probed.
unsigned long StringHash::Hashed(string lpszString){
    // Three hashes per string: one picks the starting slot, the other two act
    // as a fingerprint. Different strings colliding on all three hashes is
    // considered practically impossible.
    const unsigned long HASH_OFFSET = 0;
    const unsigned long HASH_A = 1;
    const unsigned long HASH_B = 2;

    const unsigned long home = HashString(lpszString, HASH_OFFSET) % m_tablelength;
    const unsigned long checkA = HashString(lpszString, HASH_A);
    const unsigned long checkB = HashString(lpszString, HASH_B);

    unsigned long probe = home;
    do {
        const HASHTABLE &slot = m_HashIndexTable[probe];
        if ( !slot.bExists ) {
            break;  // an unused slot ends the probe sequence
        }
        if ( slot.nHashA == checkA && slot.nHashB == checkB ) {
            return probe;
        }
        probe = (probe + 1) % m_tablelength;
    } while ( probe != home );  // stop after one full lap

    return -1; // not found
}
// Records lpszString in the index table using linear probing from its home slot.
// Returns true once the string's check hashes are stored in a free slot, or
// false when every slot is already occupied.
bool StringHash::Hash(string lpszString)  {
    const unsigned long HASH_OFFSET = 0;
    const unsigned long HASH_A = 1;
    const unsigned long HASH_B = 2;

    const unsigned long home = HashString(lpszString, HASH_OFFSET) % m_tablelength;
    const unsigned long checkA = HashString(lpszString, HASH_A);
    const unsigned long checkB = HashString(lpszString, HASH_B);

    // Visit each slot at most once, starting at the home slot and wrapping around.
    for ( unsigned long step = 0; step < m_tablelength; ++step ) {
        HASHTABLE &slot = m_HashIndexTable[(home + step) % m_tablelength];
        if ( slot.bExists ) {
            continue;
        }
        slot.bExists = true;
        slot.nHashA = checkA;
        slot.nHashB = checkB;
        return true;
    }
    return false;  // a full lap found no free slot
}

注解:字符串的哈希函数
你可以在数组中为每个字符串保存它的哈希值:查找时先计算目标字符串的哈希值,再与已存储的哈希值进行比较;如果有匹配的哈希值,再通过字符串比较进行匹配验证。这种方法叫索引,根据数组的大小以及字符串的平均长度,查找速度可以提升约100倍。

// Deliberately simple string hash: starting from a fixed seed, shift the
// running value left by one bit and add the next character, until the NUL
// terminator is reached.
unsigned long HashString(char *lpszString){
    unsigned long hash = 0xf1e2d3c4;
    for (const char *cursor = lpszString; *cursor != 0; ++cursor) {
        hash = (hash << 1) + *cursor;
    }
    return hash;
}

上面代码中的散列算法在遍历字符串过程中,将哈希值左移一位,然后加上字符值。它会在较低的数据范围内产生相对可预测的输出,从而可能会产生大量冲突。
MPQ格式,使用了一种非常复杂的散列算法(如下所示),产生完全不可预测的哈希值,这个算法十分有效,这就是所谓的单向散列算法。

// Table-driven, case-insensitive string hash. dwHashType selects which
// 0x100-entry section of cryptTable is mixed in, so one string can be given
// several independent hash values.
unsigned long HashString(char *lpszFileName, unsigned long dwHashType){
    unsigned long seed1 = 0x7FED7FED;
    unsigned long seed2 = 0xEEEEEEEE;
    // Walk the bytes as unsigned values up to the NUL terminator.
    for (unsigned char *key = reinterpret_cast<unsigned char *>(lpszFileName); *key != 0; ++key) {
        const int ch = toupper(*key);
        seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}

 

posted on 2015-10-01 09:57  joannae  阅读(352)  评论(0)  编辑  收藏  举报

导航