【散列】杜鹃散列详情与C++实现代码

导引

在球-箱问题中，如果将N项随机抛入N个箱子中，那么含球最多的箱子的期望球数为Θ(logN/log logN)。

如果在每次投掷中随机选取两个箱子，并将被投项投入（在那一刻）较空的那个箱子中，则最大箱子的球数只是Θ(log logN)，这是一个显著更小的数。利用这一思想的一种做法就是杜鹃散列(cuckoo hashing)。

概念

在杜鹃散列中，假设我们有N项，我们保持两个散列表，每个都多于半空，并且我们有两个独立的散列函数，它们可将每一项分配给每个表中的一个位置。

杜鹃散列保持下述不变性：一项总是被存储在它的两个位置之一中。

杜鹃散列的好处包括最坏情形下常数时间的查找和删除、避免懒惰删除和额外的数据，以及并行处理的可能。但杜鹃散列对散列函数的选择非常敏感；此外，推荐使用较小的装填因子或多于两个的散列函数。

实现

杜鹃散列表常常作为拥有两个(或更多的)散列函数的一个大表来实现，这些散列函数探测整个大表。有些变种做法是：如果第二个散列表中存在可用的位置，就立即把该项置入第二个散列表，而不是开始一连串的逐出替换。
杜鹃散列算法本身很简单：要想插入新项x，首先确认它不在表中。然后使用第一个散列函数，而如果这（第一）个表位置是空的，则该项即可置入。

代码

 // Generic HashFamily interface for cuckoo hashing: it describes the members a
 // hash-family type hands to the cuckoo hash table. Only declarations appear
 // here; no definitions are visible in this file.
 // NOTE(review): HashTable in this file regenerates functions by calling
 // `generateNewFuntions` (sic), not the `generateNewFunctions` spelling
 // declared below -- confirm which spelling is intended.
template<typename AnyType>
class CuckooHashFamily {
public:
	// Hash x using the function selected by `which`; HashTable passes
	// indices in [0, getNumberOfFunctions()).
	size_t hash(const AnyType& x, int which)const;
	// Number of hash functions in the family.
	int getNumberOfFunctions();
	// Replace the current functions with newly chosen ones (presumably at
	// random -- no definition is visible here).
	void generateNewFunctions();
};
 
/**
 * Informal family of string hash functions for cuckoo hashing.
 *
 * The family holds `count` polynomial hash functions. Function number
 * `which` computes hashVal = multiplier[which] * hashVal + ch over the
 * characters of the string, in unsigned (wrapping) size_t arithmetic.
 * The multipliers come from the member UniformRandom generator and are
 * replaced by generateNewFunctions().
 *
 * @tparam count  number of hash functions in the family; must be positive
 */
template<int count>
class StringHashFamily {
	static_assert(count > 0, "StringHashFamily needs at least one hash function");

private:
	std::vector<int> MULTIPLIERS;	// one multiplier per hash function
	UniformRandom r;				// source of the multipliers

public:
	// Builds the family and picks an initial set of multipliers.
	StringHashFamily() :MULTIPLIERS(count) {
		generateNewFunctions();
	}

	// Returns the number of hash functions in the family (the template argument).
	int getNumberOfFunctions()const {
		return count;
	}

	// Draws a new multiplier for every function from r.nextInt().
	void generateNewFunctions() {
		for (auto& mult : MULTIPLIERS)
			mult = r.nextInt();
	}

	// Misspelled legacy name, kept because existing callers (the cuckoo
	// HashTable in this file) invoke it under this spelling.
	void generateNewFuntions() {
		generateNewFunctions();
	}

	/**
	 * Hashes x with the function selected by `which`.
	 *
	 * @param x      string to hash
	 * @param which  index of the hash function, in [0, count)
	 * @return the raw hash value; the caller reduces it to a table index
	 */
	size_t hash(const std::string& x, int which)const {
		// Same int -> size_t conversion the mixed-type expression performed
		// implicitly; spelled out so the wrapping arithmetic is visible.
		const size_t multiplier = static_cast<size_t>(MULTIPLIERS[which]);
		size_t hashVal = 0;
		for (auto ch : x)
			hashVal = multiplier * hashVal + ch;
		return hashVal;
	}
};
 
// Cuckoo hash table. The HashFamily template argument supplies the hash
// functions, so any number of functions can be used. Every stored item lives
// in one of the slots its hash functions map it to, which lets a lookup or a
// removal probe at most numHashFunctions slots.
template<typename AnyType, typename HashFamily>
class HashTable {
private:
	// One table slot: the element plus a flag saying whether the slot is in use.
	struct HashEntry {
		AnyType element;
		bool isActive;

		HashEntry(const AnyType& e = AnyType(), bool a = false)
			:element{ e }, isActive{ a } {}
		HashEntry(AnyType&& e, bool a = false)
			:element{ std::move(e) }, isActive{ a } {}
	};

	/**
	 * Insertion routine (copying overload): copies xx and forwards to the
	 * rvalue overload, which may evict and re-place items.
	 */
	bool insertHelper1(const AnyType& xx) {
		AnyType x = xx;
		return insertHelper1(std::move(x));
	}

	/**
	 * Insertion routine with random eviction.
	 *
	 * Each round first tries every candidate slot of x. If all are occupied,
	 * a randomly chosen candidate slot is picked (trying a few times not to
	 * re-evict the slot used in the previous round), its occupant is swapped
	 * with x, and the evicted occupant becomes the item to place. After
	 * COUNT_LIMIT evictions the table is rehashed with new hash functions;
	 * after more than ALLOWED_REHASHES rehashes it is expanded instead.
	 *
	 * @return true once the pending item has been placed (always, eventually)
	 */
	bool insertHelper1(AnyType&& x) {
		const int COUNT_LIMIT = 100;

		while (true) {
			int lastPos = -1;
			int pos = -1;

			for (int count = 0; count < COUNT_LIMIT; ++count) {
				// Try every candidate slot, not just the last one.
				for (int i = 0; i < numHashFunctions; ++i) {
					pos = static_cast<int>(myhash(x, i));

					if (!isActive(pos)) {
						array[pos] = HashEntry{ std::move(x), true };
						++currentSize;
						return true;
					}
				}

				// No candidate slot is free: evict a random occupant.
				// nextInt(n) presumably yields an index in [0, n) -- confirm
				// against UniformRandom.
				int attempts = 0;
				do {
					pos = static_cast<int>(myhash(x, r.nextInt(numHashFunctions)));
				} while (pos == lastPos && attempts++ < 5);

				lastPos = pos;
				std::swap(x, array[pos].element);	// x is now the evicted item
			}

			// Too many evictions in a row: change the hash functions, or grow
			// the table if changing them has already been tried too often.
			if (++rehashes > ALLOWED_REHASHES) {
				expand();		// make the table bigger
				rehashes = 0;	// reset the rehash counter
			}
			else
				rehash();		// same size, new hash functions
		}
	}

	// True when currentPos is a real index (not -1) whose slot is in use.
	bool isActive(int currentPos)const {
		return currentPos != -1 && array[currentPos].isActive;
	}

	/**
	 * Computes the table index of x under hash function number `which`:
	 * the family's hash value reduced modulo the table size.
	 */
	size_t myhash(const AnyType& x, int which)const {
		return hashFunctions.hash(x, which) % array.size();
	}

	/**
	 * Probes every candidate slot of x.
	 * @return the index of the active slot holding x, or -1 if there is none
	 */
	int findPos(const AnyType& x)const {
		for (int i = 0; i < numHashFunctions; ++i) {
			const int pos = static_cast<int>(myhash(x, i));

			if (isActive(pos) && array[pos].element == x)
				return pos;
		}
		return -1;
	}

	/**
	 * Grows the table by a factor of 1 / MAX_LOAD (rounded by nextPrime),
	 * keeping the current hash functions.
	 */
	void expand() {
		rehash(static_cast<int>(array.size() / MAX_LOAD));
	}

	/**
	 * Keeps the table size but picks new hash functions, then re-inserts
	 * every item under them.
	 */
	void rehash() {
		hashFunctions.generateNewFuntions();
		rehash(static_cast<int>(array.size()));
	}

	/**
	 * Replaces the table with an empty one of nextPrime(newSize) slots and
	 * re-inserts every active item of the old table.
	 */
	void rehash(int newSize) {
		// Build the empty table first, then swap it in; this avoids copying
		// the old table. Default-constructed entries are inactive.
		std::vector<HashEntry> oldArray(nextPrime(newSize));
		oldArray.swap(array);	// array: new empty table, oldArray: old contents

		// Move the old items across.
		currentSize = 0;
		for (auto& entry : oldArray)
			if (entry.isActive)
				insert(std::move(entry.element));
	}

	static constexpr double MAX_LOAD = 0.4;	// maximum load factor
	static const int ALLOWED_REHASHES = 5;	// rehashes allowed before expanding

	std::vector<HashEntry> array;	// the slots
	int currentSize;				// number of active slots
	int numHashFunctions;			// cached hashFunctions.getNumberOfFunctions()
	int rehashes;					// rehashes since the last expansion
	UniformRandom r;				// picks the slot to evict
	HashFamily hashFunctions;

public:
	// Creates an empty table with nextPrime(size) slots.
	explicit HashTable(int size = 101) :array(nextPrime(size)) {
		numHashFunctions = hashFunctions.getNumberOfFunctions();
		rehashes = 0;
		makeEmpty();
	}

	// Empties the table by marking every slot inactive; capacity is unchanged.
	void makeEmpty() {
		currentSize = 0;
		for (auto& entry : array)
			entry.isActive = false;
	}

	/**
	 * Lookup routine.
	 * @return true if x is in the table
	 */
	bool contains(const AnyType& x)const {
		return findPos(x) != -1;
	}

	/**
	 * Removes x from the table by marking its slot inactive.
	 * @return true if x was found and removed
	 */
	bool remove(const AnyType& x) {
		const int currentPos = findPos(x);
		if (!isActive(currentPos))
			return false;

		array[currentPos].isActive = false;
		--currentSize;
		return true;
	}

	/**
	 * Public insertion (copying overload).
	 * @return false if x is already present, true once it has been inserted
	 */
	bool insert(const AnyType& x) {
		if (contains(x))
			return false;

		// Grow before inserting once the load factor reaches MAX_LOAD.
		if (currentSize >= array.size() * MAX_LOAD)
			expand();

		return insertHelper1(x);
	}

	/**
	 * Public insertion (moving overload).
	 * @return false if x is already present, true once it has been inserted
	 */
	bool insert(AnyType&& x) {
		if (contains(x))
			return false;

		// Grow before inserting once the load factor reaches MAX_LOAD.
		if (currentSize >= array.size() * MAX_LOAD)
			expand();

		return insertHelper1(std::move(x));
	}

	// Number of items currently stored.
	int size() const
	{
		return currentSize;
	}

	// Number of slots in the table.
	int capacity() const
	{
		return static_cast<int>(array.size());
	}
};

posted @ 2022-10-18 08:47 aw11 阅读(135) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 【散列】散列表HashTable

· 【散列】散列表HashTable分离链接法类模板的实现

· 散列（hash）

· 数据结构：7种哈希散列算法，你知道几个？

· 数据结构与算法分析——C语言描述（第5章散列）

阅读排行：
· 阿里最新开源QwQ-32B，效果媲美deepseek-r1满血版，部署成本又又又降低了！
· 单线程的Redis速度为什么快？
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决：字节Trae VS Cursor，谁才是开发者新宠？
· 展开说说关于C#中ORM框架的用法！

公告

昵称： aw11
园龄： 2年6个月
粉丝： 0
关注： 1

+加关注

2025年3月

日

一

二

三

四

五

六

随笔分类 (3)

随笔档案 (58)

2022年10月(58)

幺幺的算法地

Don't fight for the prize, practice hard fist

【散列】杜鹃散列详情与C++实现代码

导引

概念

实现

代码

公告

搜索

最新随笔

随笔分类 (3)

随笔档案 (58)

阅读排行榜

	// Generic HashFamily interface for cuckoo hashing: it describes the members
	// a hash-family type hands to the cuckoo hash table. Only declarations
	// appear here; no definitions are visible in this file.
	// NOTE(review): HashTable in this file regenerates functions by calling
	// `generateNewFuntions` (sic), not the `generateNewFunctions` spelling
	// declared below -- confirm which spelling is intended.
	template<typename AnyType>
	class CuckooHashFamily {
	public:
	// Hash x using the function selected by `which`; HashTable passes
	// indices in [0, getNumberOfFunctions()).
	size_t hash(const AnyType& x, int which)const;
	// Number of hash functions in the family.
	int getNumberOfFunctions();
	// Replace the current functions with newly chosen ones (presumably at
	// random -- no definition is visible here).
	void generateNewFunctions();
	};

	/**
	 * Informal family of string hash functions for cuckoo hashing.
	 *
	 * The family holds `count` polynomial hash functions. Function number
	 * `which` computes hashVal = multiplier[which] * hashVal + ch over the
	 * characters of the string, in unsigned (wrapping) size_t arithmetic.
	 * The multipliers come from the member UniformRandom generator and are
	 * replaced by generateNewFunctions().
	 *
	 * @tparam count  number of hash functions in the family; must be positive
	 */
	template<int count>
	class StringHashFamily {
		static_assert(count > 0, "StringHashFamily needs at least one hash function");

	private:
		std::vector<int> MULTIPLIERS;	// one multiplier per hash function
		UniformRandom r;				// source of the multipliers

	public:
		// Builds the family and picks an initial set of multipliers.
		StringHashFamily() :MULTIPLIERS(count) {
			generateNewFunctions();
		}

		// Returns the number of hash functions in the family (the template argument).
		int getNumberOfFunctions()const {
			return count;
		}

		// Draws a new multiplier for every function from r.nextInt().
		void generateNewFunctions() {
			for (auto& mult : MULTIPLIERS)
				mult = r.nextInt();
		}

		// Misspelled legacy name, kept because existing callers (the cuckoo
		// HashTable in this file) invoke it under this spelling.
		void generateNewFuntions() {
			generateNewFunctions();
		}

		/**
		 * Hashes x with the function selected by `which`.
		 *
		 * @param x      string to hash
		 * @param which  index of the hash function, in [0, count)
		 * @return the raw hash value; the caller reduces it to a table index
		 */
		size_t hash(const std::string& x, int which)const {
			// Same int -> size_t conversion the mixed-type expression performed
			// implicitly; spelled out so the wrapping arithmetic is visible.
			const size_t multiplier = static_cast<size_t>(MULTIPLIERS[which]);
			size_t hashVal = 0;
			for (auto ch : x)
				hashVal = multiplier * hashVal + ch;
			return hashVal;
		}
	};

	// Cuckoo hash table. The HashFamily template argument supplies the hash
	// functions, so any number of functions can be used. Every stored item
	// lives in one of the slots its hash functions map it to, which lets a
	// lookup or a removal probe at most numHashFunctions slots.
	template<typename AnyType, typename HashFamily>
	class HashTable {
	private:
		// One table slot: the element plus a flag saying whether the slot is in use.
		struct HashEntry {
			AnyType element;
			bool isActive;

			HashEntry(const AnyType& e = AnyType(), bool a = false)
				:element{ e }, isActive{ a } {}
			HashEntry(AnyType&& e, bool a = false)
				:element{ std::move(e) }, isActive{ a } {}
		};

		/**
		 * Insertion routine (copying overload): copies xx and forwards to the
		 * rvalue overload, which may evict and re-place items.
		 */
		bool insertHelper1(const AnyType& xx) {
			AnyType x = xx;
			return insertHelper1(std::move(x));
		}

		/**
		 * Insertion routine with random eviction.
		 *
		 * Each round first tries every candidate slot of x. If all are
		 * occupied, a randomly chosen candidate slot is picked (trying a few
		 * times not to re-evict the slot used in the previous round), its
		 * occupant is swapped with x, and the evicted occupant becomes the
		 * item to place. After COUNT_LIMIT evictions the table is rehashed
		 * with new hash functions; after more than ALLOWED_REHASHES rehashes
		 * it is expanded instead.
		 *
		 * @return true once the pending item has been placed (always, eventually)
		 */
		bool insertHelper1(AnyType&& x) {
			const int COUNT_LIMIT = 100;

			while (true) {
				int lastPos = -1;
				int pos = -1;

				for (int count = 0; count < COUNT_LIMIT; ++count) {
					// Try every candidate slot, not just the last one.
					for (int i = 0; i < numHashFunctions; ++i) {
						pos = static_cast<int>(myhash(x, i));

						if (!isActive(pos)) {
							array[pos] = HashEntry{ std::move(x), true };
							++currentSize;
							return true;
						}
					}

					// No candidate slot is free: evict a random occupant.
					// nextInt(n) presumably yields an index in [0, n) --
					// confirm against UniformRandom.
					int attempts = 0;
					do {
						pos = static_cast<int>(myhash(x, r.nextInt(numHashFunctions)));
					} while (pos == lastPos && attempts++ < 5);

					lastPos = pos;
					std::swap(x, array[pos].element);	// x is now the evicted item
				}

				// Too many evictions in a row: change the hash functions, or
				// grow the table if changing them has been tried too often.
				if (++rehashes > ALLOWED_REHASHES) {
					expand();		// make the table bigger
					rehashes = 0;	// reset the rehash counter
				}
				else
					rehash();		// same size, new hash functions
			}
		}

		// True when currentPos is a real index (not -1) whose slot is in use.
		bool isActive(int currentPos)const {
			return currentPos != -1 && array[currentPos].isActive;
		}

		/**
		 * Computes the table index of x under hash function number `which`:
		 * the family's hash value reduced modulo the table size.
		 */
		size_t myhash(const AnyType& x, int which)const {
			return hashFunctions.hash(x, which) % array.size();
		}

		/**
		 * Probes every candidate slot of x.
		 * @return the index of the active slot holding x, or -1 if there is none
		 */
		int findPos(const AnyType& x)const {
			for (int i = 0; i < numHashFunctions; ++i) {
				const int pos = static_cast<int>(myhash(x, i));

				if (isActive(pos) && array[pos].element == x)
					return pos;
			}
			return -1;
		}

		/**
		 * Grows the table by a factor of 1 / MAX_LOAD (rounded by nextPrime),
		 * keeping the current hash functions.
		 */
		void expand() {
			rehash(static_cast<int>(array.size() / MAX_LOAD));
		}

		/**
		 * Keeps the table size but picks new hash functions, then re-inserts
		 * every item under them.
		 */
		void rehash() {
			hashFunctions.generateNewFuntions();
			rehash(static_cast<int>(array.size()));
		}

		/**
		 * Replaces the table with an empty one of nextPrime(newSize) slots and
		 * re-inserts every active item of the old table.
		 */
		void rehash(int newSize) {
			// Build the empty table first, then swap it in; this avoids
			// copying the old table. Default-constructed entries are inactive.
			std::vector<HashEntry> oldArray(nextPrime(newSize));
			oldArray.swap(array);	// array: new empty table, oldArray: old contents

			// Move the old items across.
			currentSize = 0;
			for (auto& entry : oldArray)
				if (entry.isActive)
					insert(std::move(entry.element));
		}

		static constexpr double MAX_LOAD = 0.4;	// maximum load factor
		static const int ALLOWED_REHASHES = 5;	// rehashes allowed before expanding

		std::vector<HashEntry> array;	// the slots
		int currentSize;				// number of active slots
		int numHashFunctions;			// cached hashFunctions.getNumberOfFunctions()
		int rehashes;					// rehashes since the last expansion
		UniformRandom r;				// picks the slot to evict
		HashFamily hashFunctions;

	public:
		// Creates an empty table with nextPrime(size) slots.
		explicit HashTable(int size = 101) :array(nextPrime(size)) {
			numHashFunctions = hashFunctions.getNumberOfFunctions();
			rehashes = 0;
			makeEmpty();
		}

		// Empties the table by marking every slot inactive; capacity is unchanged.
		void makeEmpty() {
			currentSize = 0;
			for (auto& entry : array)
				entry.isActive = false;
		}

		/**
		 * Lookup routine.
		 * @return true if x is in the table
		 */
		bool contains(const AnyType& x)const {
			return findPos(x) != -1;
		}

		/**
		 * Removes x from the table by marking its slot inactive.
		 * @return true if x was found and removed
		 */
		bool remove(const AnyType& x) {
			const int currentPos = findPos(x);
			if (!isActive(currentPos))
				return false;

			array[currentPos].isActive = false;
			--currentSize;
			return true;
		}

		/**
		 * Public insertion (copying overload).
		 * @return false if x is already present, true once it has been inserted
		 */
		bool insert(const AnyType& x) {
			if (contains(x))
				return false;

			// Grow before inserting once the load factor reaches MAX_LOAD.
			if (currentSize >= array.size() * MAX_LOAD)
				expand();

			return insertHelper1(x);
		}

		/**
		 * Public insertion (moving overload).
		 * @return false if x is already present, true once it has been inserted
		 */
		bool insert(AnyType&& x) {
			if (contains(x))
				return false;

			// Grow before inserting once the load factor reaches MAX_LOAD.
			if (currentSize >= array.size() * MAX_LOAD)
				expand();

			return insertHelper1(std::move(x));
		}

		// Number of items currently stored.
		int size() const
		{
			return currentSize;
		}

		// Number of slots in the table.
		int capacity() const
		{
			return static_cast<int>(array.size());
		}
	};