实现哈希表
哈希表原理:输入(key, value) -> 通过散列函数生成hashkey -> 将(key,value)放入hashkey对应的bucket
散列函数满足以下的条件:
1、对输入值运算,得到一个固定长度的摘要(Hash value);
2、不同的输入值可能对应同样的输出值;
3、散列函数的输出值尽量接近均匀分布,即输出值y的分布函数F(y)=y/m, m为散列函数的最大值;
4、x的微小变化可以使f(x)发生非常大的变化,即所谓“雪崩效应”(Avalanche effect),即|df(x)/dx| >> 1;
哈希冲突(Hash collision)是无法避免的。哈希冲突的处理方法:
1, 链地址法
//HashTable.h
#include <string>

typedef unsigned int UINT;

// One (key, value) entry in a bucket's singly linked collision chain.
class Node {
public:
    Node(int key, const std::string& str);

    int key;
    std::string value;
    Node* next;  // next entry in the same bucket; 0 marks the end of the chain
};

// Hash table from int keys to strings with a fixed number of buckets.
// Collisions are resolved by separate chaining: every bucket is a linked list
// of Node objects owned by the table.
class HashTable {
public:
    HashTable();
    // Copies are deep: each table owns its own nodes.
    HashTable(const HashTable& other);
    HashTable& operator=(const HashTable& other);
    ~HashTable();

    // Stores (key, value). Returns true when a new entry was added and false
    // when the key is already present (the stored value is left untouched).
    bool Insert(int key, const std::string& value);

    // Returns true when an entry with this key exists.
    bool Find(int key);

    // Returns a reference to the value stored under key.
    // The key must exist; this is checked with assert().
    std::string& operator[](int key);

private:
    bool Insert(Node** node, int key, const std::string& value);
    Node* FindNode(int key);
    unsigned int hasher(int key);  // hash function
    void Clear();
    void CopyFrom(const HashTable& other);

    enum { SIZE = 100 };
    Node* nodes[SIZE];
};

//HashTable.cpp
#include <cassert>

Node::Node(int Key, const std::string& str) : key(Key), value(str), next(0) {}

HashTable::HashTable() {
    for (int i = 0; i < SIZE; ++i) {
        nodes[i] = 0;
    }
}

HashTable::HashTable(const HashTable& other) {
    for (int i = 0; i < SIZE; ++i) {
        nodes[i] = 0;
    }
    try {
        CopyFrom(other);
    } catch (...) {
        // The destructor does not run for a partially constructed object,
        // so release whatever was copied before the failure.
        Clear();
        throw;
    }
}

HashTable& HashTable::operator=(const HashTable& other) {
    if (this != &other) {
        // Build the copy first so *this stays intact if an allocation throws,
        // then exchange bucket heads; the temporary frees the old chains.
        HashTable copy(other);
        for (int i = 0; i < SIZE; ++i) {
            Node* previous = nodes[i];
            nodes[i] = copy.nodes[i];
            copy.nodes[i] = previous;
        }
    }
    return *this;
}

HashTable::~HashTable() {
    Clear();
}

// Deletes every node and leaves all buckets empty.
void HashTable::Clear() {
    for (int i = 0; i < SIZE; ++i) {
        Node* node = nodes[i];
        while (node != 0) {
            Node* following = node->next;
            delete node;
            node = following;
        }
        nodes[i] = 0;
    }
}

// Appends copies of other's entries, bucket by bucket, preserving chain order.
// Expects every bucket of *this to be empty.
void HashTable::CopyFrom(const HashTable& other) {
    for (int i = 0; i < SIZE; ++i) {
        Node** tail = &nodes[i];
        for (const Node* source = other.nodes[i]; source != 0; source = source->next) {
            *tail = new Node(source->key, source->value);
            tail = &(*tail)->next;
        }
    }
}

// The simplest possible hash function: |key| modulo the bucket count.
// The magnitude is computed in unsigned arithmetic because abs(INT_MIN)
// is undefined behaviour.
unsigned int HashTable::hasher(int key) {
    const unsigned int bits = static_cast<unsigned int>(key);
    const unsigned int magnitude = key < 0 ? 0u - bits : bits;
    return magnitude % SIZE;
}

bool HashTable::Insert(int key, const std::string& value) {
    return Insert(&nodes[hasher(key)], key, value);
}

// Walks the chain that starts at *link and appends a new node at its end.
// Returns false without modifying the chain when the key is already stored.
bool HashTable::Insert(Node** link, int key, const std::string& value) {
    while (*link != 0) {
        if ((*link)->key == key) {
            return false;
        }
        link = &(*link)->next;
    }
    *link = new Node(key, value);
    return true;
}

bool HashTable::Find(int key) {
    return FindNode(key) != 0;
}

// Returns the node holding key, or 0 when the key is not stored.
Node* HashTable::FindNode(int key) {
    for (Node* node = nodes[hasher(key)]; node != 0; node = node->next) {
        if (node->key == key) {
            return node;
        }
    }
    return 0;
}

std::string& HashTable::operator[](int key) {
    Node* node = FindNode(key);
    assert(node != 0);
    return node->value;
}

//main.cpp
#include <iostream>

int main() {
    HashTable ht;

    // Keys 1, 101 and 201 all land in bucket 1, so this exercises chaining.
    ht.Insert(1, "you");
    std::string value = ht[1];
    std::cout << value << std::endl;

    ht.Insert(101, "girl");
    value = ht[101];
    std::cout << value << std::endl;

    ht.Insert(201, "boy");
    value = ht[201];
    std::cout << value << std::endl;

    ht[201] = "man";
    std::cout << ht[201] << std::endl;

    std::cin.get();
    return 0;
}
2, 开放地址法
为每个Hash值,建立一个Hash桶(Bucket),哈希桶的个数是固定的,桶的容量也是固定的。
好处是查表的最大开销是可以确定的,因为最多处理的冲突数是确定的,所以算法的时间复杂度为O(1)+O(m),其中m为Hash桶容量。
坏处是新建的表项可能会由于冲突过多,而不能装入Hash表中。
http://www.360doc.com/content/13/0108/16/8363527_258987810.shtml
3,线性探测再散列
//StringHash.h
#include <cctype>
#include <string>

#define MAXTABLELEN 1024  // default number of slots in the hash index table

// One slot of the hash index table. Only two check hashes are stored, never
// the string itself.
typedef struct _HASHTABLE {
    long nHashA;
    long nHashB;
    bool bExists;  // true once the slot has been claimed
} HASHTABLE, *PHASHTABLE;

// Set of strings (compared case-insensitively via toupper) stored in a
// fixed-size table that resolves collisions by linear probing.
// A string is identified by three hashes: one selects the starting slot and
// the other two are stored in the slot and compared on lookup.
class StringHash {
public:
    // nTableLength is the number of slots; a non-positive value falls back to
    // MAXTABLELEN so the table is never empty (an empty table would make the
    // "% m_tablelength" in the probes a division by zero).
    StringHash(const long nTableLength = MAXTABLELEN);
    ~StringHash(void);

private:
    // The object owns m_HashIndexTable; copying would delete it twice, so
    // copying is disabled (declared, never defined).
    StringHash(const StringHash&);
    StringHash& operator=(const StringHash&);

    unsigned long cryptTable[0x500];
    unsigned long m_tablelength;  // number of slots in the hash index table
    HASHTABLE* m_HashIndexTable;

    void InitCryptTable();  // fills cryptTable
    // Hash function; dwHashType selects which 0x100-entry section of
    // cryptTable is used.
    unsigned long HashString(const std::string& lpszString, unsigned long dwHashType);

public:
    // Adds url to the table. Returns true when url is stored after the call
    // (newly added or already present) and false when the table is full.
    bool Hash(std::string url);

    // Returns the slot index of url, or (unsigned long)-1 when it is not stored.
    unsigned long Hashed(std::string url);
};

//StringHash.c
StringHash::StringHash(const long nTableLength) {
    InitCryptTable();
    m_tablelength = nTableLength > 0 ? static_cast<unsigned long>(nTableLength)
                                     : MAXTABLELEN;
    m_HashIndexTable = new HASHTABLE[m_tablelength];
    for (unsigned long i = 0; i < m_tablelength; ++i) {
        m_HashIndexTable[i].nHashA = -1;
        m_HashIndexTable[i].nHashB = -1;
        m_HashIndexTable[i].bExists = false;
    }
}

StringHash::~StringHash(void) {
    // Release the index table.
    delete[] m_HashIndexTable;
    m_HashIndexTable = 0;
    m_tablelength = 0;
}

void StringHash::InitCryptTable() {
    unsigned long seed = 0x00100001;
    for (unsigned long index1 = 0; index1 < 0x100; index1++) {
        // Entry index1 of each of the five 0x100-entry sections.
        unsigned long index2 = index1;
        for (unsigned long i = 0; i < 5; i++, index2 += 0x100) {
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long temp1 = (seed & 0xFFFF) << 0x10;
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long temp2 = (seed & 0xFFFF);
            cryptTable[index2] = (temp1 | temp2);
        }
    }
}

unsigned long StringHash::HashString(const std::string& lpszString, unsigned long dwHashType) {
    // Read the characters as unsigned so that ch is a valid toupper() argument
    // and a non-negative table offset.
    const unsigned char* key =
        reinterpret_cast<const unsigned char*>(lpszString.c_str());
    unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
    while (*key != 0) {
        const int ch = std::toupper(*key++);
        seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}

unsigned long StringHash::Hashed(std::string lpszString) {
    // Two different strings colliding on all three hashes is vanishingly unlikely.
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    const unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    const unsigned long nHashA = HashString(lpszString, HASH_A);
    const unsigned long nHashB = HashString(lpszString, HASH_B);
    const unsigned long nHashStart = nHash % m_tablelength;
    unsigned long nHashPos = nHashStart;
    while (m_HashIndexTable[nHashPos].bExists) {
        const HASHTABLE& slot = m_HashIndexTable[nHashPos];
        if (static_cast<unsigned long>(slot.nHashA) == nHashA &&
            static_cast<unsigned long>(slot.nHashB) == nHashB) {
            return nHashPos;
        }
        nHashPos = (nHashPos + 1) % m_tablelength;
        if (nHashPos == nHashStart) {
            break;  // probed every slot
        }
    }
    return static_cast<unsigned long>(-1);  // not found
}

bool StringHash::Hash(std::string lpszString) {
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    const unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    const unsigned long nHashA = HashString(lpszString, HASH_A);
    const unsigned long nHashB = HashString(lpszString, HASH_B);
    const unsigned long nHashStart = nHash % m_tablelength;
    unsigned long nHashPos = nHashStart;
    while (m_HashIndexTable[nHashPos].bExists) {
        const HASHTABLE& slot = m_HashIndexTable[nHashPos];
        // Slots are never released, so an earlier copy of this string can only
        // sit before the first free slot of its probe sequence.
        if (static_cast<unsigned long>(slot.nHashA) == nHashA &&
            static_cast<unsigned long>(slot.nHashB) == nHashB) {
            return true;  // already stored; do not waste a second slot
        }
        nHashPos = (nHashPos + 1) % m_tablelength;
        if (nHashPos == nHashStart) {
            return false;  // wrapped around: no free slot is left
        }
    }
    m_HashIndexTable[nHashPos].bExists = true;
    m_HashIndexTable[nHashPos].nHashA = static_cast<long>(nHashA);
    m_HashIndexTable[nHashPos].nHashB = static_cast<long>(nHashB);
    return true;
}
注解:字符串的哈希函数
你可以在字符串数组之外,再保存每个字符串对应的哈希值;查找时先计算目标字符串的哈希值,再与已经存储的哈希值逐一比较。如果有匹配的哈希值,再通过字符串比较进行确认。这种方法叫索引,根据数组的大小以及字符串的平均长度,查找速度大约可以提高100倍。
// Simple shift-and-add string hash: starting from a fixed seed, every
// character doubles the running hash and is then added to it.
// lpszString must point to a NUL-terminated string.
unsigned long HashString(char *lpszString)
{
    unsigned long hash = 0xf1e2d3c4;
    for (char *cursor = lpszString; *cursor != 0; ++cursor) {
        hash = (hash << 1) + *cursor;
    }
    return hash;
}
上面代码中的散列算法在遍历字符串过程中,将哈希值左移一位,然后加上字符值。它会在较低的数据范围内产生相对可预测的输出,从而可能会产生大量冲突。
MPQ格式使用了一种非常复杂的散列算法(如下所示),它产生的哈希值完全不可预测,十分有效;这就是所谓的单向散列算法(无法由哈希值反推出原始输入)。
// MPQ-style string hash. Each character is upper-cased with toupper() and
// mixed into two running seeds using cryptTable (defined outside this
// function); dwHashType picks the 0x100-entry section of the table.
// lpszFileName must point to a NUL-terminated string. Returns the first seed.
unsigned long HashString(char *lpszFileName, unsigned long dwHashType)
{
    unsigned long seed1 = 0x7FED7FED;
    unsigned long seed2 = 0xEEEEEEEE;
    const unsigned long sectionBase = dwHashType << 8;

    // Walk the bytes as unsigned char so every value passed to toupper() is
    // non-negative.
    for (unsigned char *cursor = reinterpret_cast<unsigned char *>(lpszFileName);
         *cursor != 0; ++cursor) {
        const int ch = toupper(*cursor);
        seed1 = cryptTable[sectionBase + ch] ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}