POCO C++库学习和分析 -- 哈希

1. Hash概论

在理解Poco中的Hash代码之前，首先需要了解一下Hash的基本理论。下面的这些内容和教科书上的内容并没有太大的差别。

1.1 定义

  下面这几段来自于百度百科：
  Hash：一般翻译做"散列"，也有直接音译为"哈希"的，就是把任意长度的输入（又叫做预映射， pre-image），通过散列算法，变换成固定长度的输出，该输出就是散列值。这种转换是一种压缩映射，也就是，散列值的空间通常远小于输入的空间，不同的输入可能会散列成相同的输出，而不可能从散列值来唯一的确定输入值。简单的说就是一种将任意长度的消息压缩到某一固定长度的消息摘要的函数。
  Hash table：散列表，也叫哈希表，是根据关键码值(Key value)而直接进行访问的数据结构。也就是说，它通过把关键码值映射到表中一个位置来访问记录，以加快查找的速度。这个映射函数叫做散列函数，存放记录的数组叫做散列表。
        * 若结构中存在关键字和K相等的记录，则必定存储在f(K）的位置上。由此，不需比较便可直接取得所查记录。这个对应关系f称为散列函数(Hash function），按这个思想建立的表为散列表。
          * 对不同的关键字可能得到同一散列地址，即key1≠key2，而f(key1)=f(key2），这种现象称冲突。具有相同函数值的关键字对该散列函数来说称做同义词。
          * 综上所述，根据散列函数H(key）和处理冲突的方法将一组关键字映象到一个有限的连续的地址集（区间）上，并以关键字在地址集中的“象”，作为这条记录在表中的存储位置，这种表便称为散列表，这一映象过程称为散列造表或散列，所得的存储位置称散列地址。这个现象也叫散列桶，在散列桶中，只能通过顺序的方式来查找，一般只需要查找三次就可以找到。科学家计算过，当重载因子不超过75%，查找效率最高。
    * 若对于关键字集合中的任一个关键字，经散列函数映象到地址集合中任何一个地址的概率是相等的，则称此类散列函数为均匀散列函数（Uniform Hash function），这就是使关键字经过散列函数得到一个“随机的地址”，从而减少冲突。

1.2 Hash table查找效率

    对于Hash table而言，理论上查找效率为O(1)。但在现实世界中，查找的过程存在冲突现象。产生的冲突少，查找效率就高，产生的冲突多，查找效率就低。因此，影响产生冲突多少的因素，也就是影响查找效率的因素。影响产生冲突多少有以下三个因素：
    1. 散列函数是否均匀；
    2. 处理冲突的方法；
    3. 散列表的装填因子。
    散列表的装填因子定义为：α= 填入表中的元素个数 / 散列表的长度
    实际上，散列表的平均查找长度是装填因子α的函数，只是不同处理冲突的方法有不同的函数。

1.3 Poco中的Hash内容

Poco中的hash内容主要关注于Hash表的应用。下面是Poco中相关于Hash的类图：

    我们看到Poco的Hash内容主要被分成3部分：
    1. Hash函数。Poco提供了一组Hash函数，用于生成hash值。同时提供了模板类HashFunction，通过仿函数提供对任意数据结构生成hash值的功能。
    2. Hash table(哈希表)。Poco中实现了3种哈希表，分别是SimpleHashTable, HashTable,LinearHashTable。
    3. 在哈希表上的应用，封装出hash map和hash set。

2. Hash函数

Hash函数是解决hash冲突的第一个要素。
Poco中提供了一组Hash函数，用于产生hash值。其定义如下：

inline std::size_t hash(Int8 n)
	/// Hash of an Int8: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

inline std::size_t hash(UInt8 n)
	/// Hash of a UInt8: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

inline std::size_t hash(Int16 n)
	/// Hash of an Int16: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}


inline std::size_t hash(UInt16 n)
	/// Hash of a UInt16: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

inline std::size_t hash(Int32 n)
	/// Hash of an Int32: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

inline std::size_t hash(UInt32 n)
	/// Hash of a UInt32: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}


inline std::size_t hash(Int64 n)
	/// Hash of an Int64: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

inline std::size_t hash(UInt64 n)
	/// Hash of a UInt64: the value is converted to std::size_t
	/// and multiplied by the constant 2654435761.
{
	const std::size_t widened = static_cast<std::size_t>(n);
	return widened * 2654435761U;
}

std::size_t hash(const std::string& str)
	/// Hash of a string. Starting from 0, every character is folded in as
	/// h = (h * 0xf4243) ^ ch, in order from first to last character.
	/// An empty string hashes to 0.
{
	std::size_t acc = 0;
	for (std::string::size_type idx = 0; idx < str.size(); ++idx)
	{
		// Multiplication binds tighter than XOR, so scale first, then mix.
		acc = (acc * 0xf4243) ^ str[idx];
	}
	return acc;
}

这里就不对hash函数做过多叙述了，下面列出一些其他的常用hash函数。网上有专门的论述，并对不同的hash函数效果做了比较，有兴趣的话可以google一下。
附：各种哈希函数的C语言程序代码

unsigned int SDBMHash(char *str)
{
	// SDBM string hash over a NUL-terminated string.
	// Each step computes digest = ch + digest * 65599, spelled with shifts
	// (64 + 65536 - 1 == 65599).
	unsigned int digest = 0;
	for (char *cursor = str; *cursor != '\0'; ++cursor)
	{
		digest = (*cursor) + (digest << 6) + (digest << 16) - digest;
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// RS Hash 
unsigned int RSHash(char *str)
{
	// RS string hash over a NUL-terminated string: digest = digest * multiplier + ch,
	// where the multiplier itself is scaled by a fixed step after every character.
	const unsigned int step = 378551;
	unsigned int multiplier = 63689;
	unsigned int digest = 0;
	for (; *str != '\0'; ++str)
	{
		digest = digest * multiplier + (*str);
		multiplier *= step;
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// JS Hash 
unsigned int JSHash(char *str)
{
	// JS string hash over a NUL-terminated string. Starts from a fixed seed and
	// XORs in (digest << 5) + ch + (digest >> 2) for every character.
	unsigned int digest = 1315423911;
	for (; *str != '\0'; ++str)
	{
		const unsigned int mix = (digest << 5) + (*str) + (digest >> 2);
		digest ^= mix;
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// P. J. Weinberger Hash 
unsigned int PJWHash(char *str)
{
	// P. J. Weinberger string hash over a NUL-terminated string.
	// All shift widths are derived from the bit width of unsigned int.
	const unsigned int bitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
	const unsigned int threeQuarters = (unsigned int)((bitsInUnsignedInt * 3) / 4);
	const unsigned int oneEighth = (unsigned int)(bitsInUnsignedInt / 8);
	// Mask selecting the top oneEighth bits of an unsigned int.
	const unsigned int highBits = (unsigned int)(0xFFFFFFFF) << (bitsInUnsignedInt - oneEighth);
	unsigned int digest = 0;
	for (; *str != '\0'; ++str)
	{
		digest = (digest << oneEighth) + (*str);
		const unsigned int overflow = digest & highBits;
		if (overflow != 0)
		{
			// Fold the bits that reached the top back into the low part, then clear them.
			digest = (digest ^ (overflow >> threeQuarters)) & (~highBits);
		}
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// ELF Hash 
unsigned int ELFHash(char *str)
{
	// ELF string hash over a NUL-terminated string: shift in 4 bits per character
	// and fold the top nibble (bits 28-31) back into the low bits whenever it is set.
	unsigned int digest = 0;
	for (; *str != '\0'; ++str)
	{
		digest = (digest << 4) + (*str);
		const unsigned int top = digest & 0xF0000000L;
		if (top != 0)
		{
			digest ^= (top >> 24);
			digest &= ~top;
		}
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// BKDR Hash 
unsigned int BKDRHash(char *str)
{
	// BKDR string hash over a NUL-terminated string: digest = digest * seed + ch.
	const unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
	unsigned int digest = 0;
	for (char *cursor = str; *cursor != '\0'; ++cursor)
	{
		digest = digest * seed + (*cursor);
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// DJB Hash 
unsigned int DJBHash(char *str)
{
	// DJB string hash over a NUL-terminated string. Starts from 5381 and adds
	// (digest << 5) + ch to the digest, i.e. digest = digest * 33 + ch.
	unsigned int digest = 5381;
	for (char *cursor = str; *cursor != '\0'; ++cursor)
	{
		digest = digest + ((digest << 5) + (*cursor));
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


// AP Hash 
unsigned int APHash(char *str)
{
	// AP string hash over a NUL-terminated string. Characters at even positions
	// (0, 2, ...) and odd positions are mixed in with two different formulas.
	unsigned int digest = 0;
	int position = 0;
	while (*str)
	{
		const unsigned int mix = ((position & 1) == 0)
			? ((digest << 7) ^ (*str) ^ (digest >> 3))
			: (~((digest << 11) ^ (*str) ^ (digest >> 5)));
		digest ^= mix;
		++str;
		++position;
	}
	// Mask off the top bit so the result fits in 31 bits.
	return digest & 0x7FFFFFFF;
}


unsigned int hash(char *str)
{
	// Simple multiplicative string hash over a NUL-terminated string:
	// h = 31 * h + byte, with every byte read as unsigned char.
	// Returns 0 for an empty string. The full unsigned int range is used
	// (no masking of the top bit).
	//
	// The former `register` specifiers were dropped: the keyword was removed
	// in C++17 and has no effect on the generated code.
	unsigned int h = 0;
	for (const unsigned char *p = (const unsigned char *)str; *p; p++)
		h = 31 * h + *p;
	return h;
}

// PHP中出现的字符串Hash函数
static unsigned long hashpjw(char *arKey, unsigned int nKeyLength)
{
	// PJW-style hash over exactly nKeyLength bytes starting at arKey
	// (the key does not need to be NUL-terminated).
	unsigned long digest = 0;
	char *const limit = arKey + nKeyLength;

	for (; arKey < limit; ++arKey) {
		digest = (digest << 4) + *arKey;
		const unsigned long top = digest & 0xF0000000;
		if (top) {
			// Fold bits 28-31 into the low bits, then clear them.
			digest = digest ^ (top >> 24);
			digest = digest ^ top;
		}
	}
	return digest;
}

// OpenSSL中出现的字符串Hash函数
unsigned long lh_strhash(char *str)
{
	// Hashes a NUL-terminated string by consuming it in unsigned-short-sized
	// units and XOR-ing unit i, shifted left by (i mod 16), into the result.
	// Returns 0 for a NULL pointer and for an empty string.
	//
	// The number of units is (strlen + 1) / 2, so for odd lengths the last
	// unit includes the terminating NUL byte. Because units are read in
	// native byte order, the result depends on the platform's endianness.
	int i,l;
	unsigned long ret=0;
	unsigned short unit;

	if (str == NULL) return(0);
	l=(strlen(str)+1)/2;
	for (i=0; i<l; i++)
	{
		// memcpy instead of dereferencing an (unsigned short *) cast of the
		// char buffer: avoids misaligned and type-punned reads.
		memcpy(&unit, str + i*sizeof(unit), sizeof(unit));
		ret^=(unit<<(i&0x0f));
	}
	return(ret);
}

/* The following hash seems to work very well on normal text strings
* no collisions on /usr/dict/words and it distributes on %2^n quite
* well, not as good as MD5, but still good.
*/
unsigned long lh_strhash(const char *c)
{
	// Hashes a NUL-terminated string. Returns 0 for NULL or an empty string.
	//
	// For every character: v combines a running counter (0x100, 0x200, ...)
	// with the character, r (0..15) is derived from v, the accumulator is
	// rotated left by r within 32 bits, and v*v is XOR-ed in.
	// The final value folds the upper half onto the lower half.
	unsigned long ret=0;
	long n;
	unsigned long v;
	int r;


	if ((c == NULL) || (*c == '\0'))
		return(ret);
	/*
	unsigned char b[16];
	MD5(c,strlen(c),b);
	return(b[0]|(b[1]<<8)|(b[2]<<16)|(b[3]<<24));
	*/


	n=0x100;
	while (*c)
	{
		v=n|(*c);
		n+=0x100;
		r= (int)((v>>2)^v)&0x0f;
		/* 32-bit rotate left by r; the mask below drops bits above bit 31 */
		ret=(ret<<r)|(ret>>(32-r));
		ret&=0xFFFFFFFFL;
		ret^=v*v;
		c++;
	}
	return((ret>>16)^ret);
}

// MySql中出现的字符串Hash函数
#ifndef NEW_HASH_FUNCTION

/* Calc hashvalue for a key */
/*
 * Hashes `length` bytes starting at `key`.
 * Two running values are kept: nr (the hash, starting at 1) and nr2 (starting
 * at 4, increased by 3 per byte). Each byte is multiplied by ((nr & 63) + nr2),
 * added to (nr << 8), and the sum is XOR-ed into nr.
 *
 * The `register` specifier was dropped: it was removed in C++17 and does not
 * influence the generated code.
 */
static uint calc_hashnr(const byte *key,uint length)
{
	uint nr=1, nr2=4;
	while (length--)
	{
		nr^= (((nr & 63)+nr2)*((uint) (uchar) *key++))+ (nr << 8);
		nr2+=3;
	}
	return((uint) nr);
}


/*
 * Calc hashvalue for a key, case independently.
 * Same mixing as calc_hashnr, but every byte is passed through toupper()
 * before it is folded into the hash.
 *
 * The `register` specifier was dropped: it was removed in C++17 and does not
 * influence the generated code.
 */
static uint calc_hashnr_caseup(const byte *key,uint length)
{
	uint nr=1, nr2=4;
	while (length--)
	{
		nr^= (((nr & 63)+nr2)*((uint) (uchar) toupper(*key++)))+ (nr << 8);
		nr2+=3;
	}
	return((uint) nr);
}

#else

/*
* Fowler/Noll/Vo hash
*
* The basis of the hash algorithm was taken from an idea sent by email to the
* IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and
* Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com)
* later improved on their algorithm.
*
* The magic is in the interesting relationship between the special prime
* 16777619 (2^24 + 403) and 2^32 and 2^8.
*
* This hash produces the fewest collisions of any function that we've seen so
* far, and works well on both numbers and strings.
*/

uint calc_hashnr(const byte *key, uint len)
{
	/* Hashes len bytes starting at key: multiply by 16777619, then XOR in the byte. */
	uint digest = 0;
	const byte *const limit = key + len;
	while (key < limit)
	{
		digest *= 16777619;
		digest ^= (uint) *(uchar*) key;
		++key;
	}
	return digest;
}

uint calc_hashnr_caseup(const byte *key, uint len)
{
	/* Like calc_hashnr, but each byte goes through toupper() before it is XOR-ed in. */
	uint digest = 0;
	const byte *const limit = key + len;
	while (key < limit)
	{
		digest *= 16777619;
		digest ^= (uint) (uchar) toupper(*key);
		++key;
	}
	return digest;
}
#endif

3. Hash 表

    我们接下去分析Poco中Hash表的实现。Poco中实现了3种哈希表，分别是SimpleHashTable, HashTable,LinearHashTable。它们的实现对应了当出现冲突时，解决冲突的不同方法。首先我们看一下通用的解决方法。
    1. 线性探测。当出现碰撞时，顺序依次查询后续位置，直到找到空位。《利用线性探测法构造散列表》
    2. 双重散列法。当使用第一个散列Hash函数，出现碰撞时，用第二个散列函数去寻找空位
    3. 拉链法。出现碰撞的时候，使用list存储碰撞数据
    4. 线性哈希，linear hash。立刻分裂或者延迟分裂。通过分裂，控制桶的高度，每次分裂时，会重新散列碰撞元素。《linear hashing》

SimpleHashTable的实现对应了方法1；HashTable对应了方法3；LinearHashTable对应了方法4。

3.1 SimpleHashTable

从类图里我们看到，SimpleHashTable是一个HashEntry容器，内部定义如下：

std::vector<HashEntry*> _entries

当插入新数据时，首先根据hash值，计算空位，然后存储；如果发现冲突，顺着计算的hash值按地址顺序依次寻找空位；如_entries容器无空位，则抛出异常。

UInt32 insert(const Key& key, const Value& value)
/// Hashes the key, stores the key/value pair through insertRaw()
/// and returns the computed hash value.
/// Exceptions thrown by insertRaw() propagate to the caller.
{
	const UInt32 hashValue = hash(key);
	insertRaw(key, hashValue, value);
	return hashValue;
}

Value& insertRaw(const Key& key, UInt32 hsh, const Value& value)
/// Stores a new entry for key and returns a reference to the stored value.
/// If the slot at hsh is free it is used directly; otherwise following
/// slots (modulo _capacity) are probed one by one until a free slot is found.
/// Throws an ExistsException if an equal key is met while probing and a
/// PoolOverflowException if more than _capacity slots were probed.
{
	UInt32 slot = hsh;
	if (_entries[slot])
	{
		// Slot is taken: probe linearly, starting at the occupied slot itself.
		const UInt32 start = hsh;
		for (; _entries[hsh % _capacity]; ++hsh)
		{
			if (_entries[hsh % _capacity]->key == key)
				throw ExistsException();
			if (hsh - start > _capacity)
				throw PoolOverflowException("SimpleHashTable full");
		}
		slot = hsh % _capacity;
	}
	_entries[slot] = new HashEntry(key, value);
	_size++;
	return _entries[slot]->value;
}

SimpleHashTable进行搜索时，策略也一致。

const Value& get(const Key& key) const
/// Looks up key by hashing it and delegating to getRaw().
/// Throws an exception if the value does not exist.
{
	return getRaw(key, hash(key));
}

const Value& getRaw(const Key& key, UInt32 hsh) const
/// Probes the slots starting at hsh (modulo _capacity) and returns the
/// value stored for key.
/// Throws an InvalidArgumentException if an empty slot is reached or
/// more than _capacity slots were probed without finding the key.
{
	const UInt32 start = hsh;
	for (;; ++hsh)
	{
		const UInt32 slot = hsh % _capacity;
		// An empty slot ends the probe sequence: the key cannot be further on.
		if (!_entries[slot])
			throw InvalidArgumentException("value not found");
		if (_entries[slot]->key == key)
			return _entries[slot]->value;
		if (hsh - start > _capacity)
			throw InvalidArgumentException("value not found");
	}
}

SimpleHashTable没有提供删除数据的接口，只适用于数据量不大的简单应用。

3.2 HashTable

HashTable是拉链法的一个变种。当冲突数据发生时，存储的容器是map而不是list。其内部容器定义为：

HashEntryMap** _entries;

同map相比，它实际上是把一个大map分成了很多个小map，通过hash方法寻找到小map，再通过map的find函数寻找具体数据。其插入和搜索数据函数如下：

UInt32 insert(const Key& key, const Value& value)
/// Computes the hash of key, inserts the key/value pair via insertRaw()
/// and returns that hash.
/// Exceptions thrown by insertRaw() propagate to the caller.
{
	const UInt32 bucketHash = hash(key);
	insertRaw(key, bucketHash, value);
	return bucketHash;
}


Value& insertRaw(const Key& key, UInt32 hsh, const Value& value)
/// Inserts the key/value pair into the map stored at _entries[hsh],
/// creating that map first if it does not exist yet, and returns a
/// reference to the stored value.
/// Throws an InvalidArgumentException if the key is already present.
{
	HashEntryMap*& bucket = _entries[hsh];
	if (!bucket)
		bucket = new HashEntryMap();

	typedef std::pair<typename HashEntryMap::iterator, bool> InsertResult;
	InsertResult outcome(bucket->insert(std::make_pair(key, value)));
	if (!outcome.second)
		throw InvalidArgumentException("HashTable::insert, key already exists.");
	_size++;
	return outcome.first->second;
}


const Value& get(const Key& key) const
/// Hashes key and returns the value found by getRaw().
/// Throws an exception if the value does not exist.
{
	return getRaw(key, hash(key));
}


const Value& getRaw(const Key& key, UInt32 hsh) const
/// Returns the value stored for key in the map at _entries[hsh].
/// Throws an InvalidArgumentException if that map does not exist
/// or does not contain the key.
{
	HashEntryMap* bucket = _entries[hsh];
	if (bucket)
	{
		ConstIterator pos = bucket->find(key);
		if (pos != bucket->end())
			return pos->second;
	}
	throw InvalidArgumentException("key not found");
}

HashTable支持remove操作。

3.3 LinearHashTable

LinearHashTable按照解决冲突的方法4实现。它内部的容器为vector<vector<Value>>,同时还存在两个控制量_split和_front：

std::size_t _split;
std::size_t _front;
vector<vector<Value>> _buckets;

它的插入操作如下：

std::pair<Iterator, bool> insert(const Value& value)
/// Inserts an element into the table.
///
/// If the element already exists in the table,
/// a pair(iterator, false) with iterator pointing to the 
/// existing element is returned.
/// Otherwise, the element is inserted an a 
/// pair(iterator, true) with iterator
/// pointing to the new element is returned.
{
	std::size_t hash = _hash(value);
	std::size_t addr = bucketAddressForHash(hash);
	BucketVecIterator it(_buckets.begin() + addr);
	BucketIterator buckIt(std::find(it->begin(), it->end(), value));
	if (buckIt == it->end())
	{
		split();
		addr = bucketAddressForHash(hash);
		it = _buckets.begin() + addr;
		buckIt = it->insert(it->end(), value);
		++_size;
		return std::make_pair(Iterator(it, _buckets.end(), buckIt), true);
	}
	else
	{
		return std::make_pair(Iterator(it, _buckets.end(), buckIt), false);
	}
}

其中split函数是所有操作的关键：

void split()
/// Adds one bucket to the table and redistributes the elements of the
/// bucket at index _split.
{
	// Once _split has caught up with _front, restart splitting at bucket 0,
	// double _front and reserve room for _front*2 buckets.
	if (_split == _front)
	{
		_split = 0;
		_front *= 2;
		_buckets.reserve(_front*2);
	}
	Bucket tmp;
	// Append a new, empty bucket (a copy of the still-empty tmp).
	_buckets.push_back(tmp);
	// Take over the contents of bucket _split; that bucket is left empty.
	_buckets[_split].swap(tmp);
	// _split is advanced before the loop, so bucketAddress() below is
	// evaluated with the updated value.
	++_split;
	for (BucketIterator it = tmp.begin(); it != tmp.end(); ++it)
	{
		using std::swap;
		std::size_t addr = bucketAddress(*it);
		// Append a default-constructed Value and swap the element into it
		// instead of copying the element.
		_buckets[addr].push_back(Value());
		swap(*it, _buckets[addr].back());
	}
}

从上面的代码中我们可以看到，在每次插入新元素的时候，都会增加一个新的桶，并对桶_buckets[_split]进行重新散列；在_split == _front时，会把_buckets的容积扩大一倍。通过动态的增加桶的数量，这种方法降低了每个桶的高度，从而保证了搜索的效率。

4. HashMap和HashSet

HashMap和HashSet是在LinearHashTable上的封装，使接口同std::map和std::set相类似，使用时非常的简单。下面来看一个例子：

#include "Poco/HashMap.h"
#include <cassert>
#include <utility>

using Poco::HashMap;

// Example: fills a HashMap<int, int> with N entries (i -> i*2), looks every
// key up again, and finally checks that inserting an existing key is rejected.
int main()
{
	const int N = 1000; // number of entries used by the example
	typedef HashMap<int, int> IntMap;
	IntMap hm;
	
	// First insertion of each key must succeed and be visible immediately.
	for (int i = 0; i < N; ++i)
	{
		std::pair<IntMap::Iterator, bool> res = hm.insert(IntMap::ValueType(i, i*2));
		assert (res.second);
		assert (res.first->first == i);
		assert (res.first->second == i*2);
		IntMap::Iterator it = hm.find(i);
		assert (it != hm.end());
		assert (it->second == i*2);
	}		
	
	assert (!hm.empty());
	
	// All keys are still present after the table has grown.
	for (int i = 0; i < N; ++i)
	{
		IntMap::Iterator it = hm.find(i);
		assert (it != hm.end());
		assert (it->first == i);
		assert (it->second == i*2);
	}
	
	// Inserting an existing key returns false and keeps the stored value.
	for (int i = 0; i < N; ++i)
	{
		std::pair<IntMap::Iterator, bool> res = hm.insert(IntMap::ValueType(i, 0));
		assert (!res.second);
		assert (res.first->second == i*2);
	}	
	return 0;
}

posted @ 2013-03-22 12:10 在天与地之间阅读(988) 评论(0) 编辑收藏举报

刷新页面返回顶部

在天与地之间

POCO C++库学习和分析 -- 哈希

POCO C++库学习和分析 -- 哈希

1. Hash概论

1.1 定义

1.2 Hash table查找效率

1.3 Poco中的Hash内容

2. Hash函数

3. Hash 表

3.1 SimpleHashTable

3.2 HashTable

3.3 LinearHashTable

4. HashMap和HashSet

公告