C#源码(十二) HashSet
基础介绍
仓库地址
https://github.com/dotnet/runtime/
我本地的项目位置
C:\project\SourceCode\runtime-5.0.0-preview.3.20214.6\src\libraries\System.Collections
实现原理和Dictionary差不多,都是链地址法解决冲突。
Dictionary 有Key Value
HashSet只有Value
实际容器为Slot[] m_slots;
/// <summary>
/// One entry of the set's backing array: a stored value, its cached hash code,
/// and the index of the next entry.
/// </summary>
internal struct Slot
{
    /// <summary>Lower 31 bits of the hash code; -1 if the slot is unused.</summary>
    internal int hashCode;

    /// <summary>Index of the next entry; -1 if this is the last one.</summary>
    internal int next;

    /// <summary>The stored element.</summary>
    internal T value;
}
HashSet操作元素的时间复杂度接近O(1)
定义int[] m_buckets 数组来保存元素在实际容器Slot[] m_slots 中的位置(桶里存的是下标加1,0表示该桶为空)
即 Value 保存在 m_slots[m_buckets[hashCode % m_buckets.Length] - 1].value,其中 hashCode 是取低31位后的哈希值;发生冲突时再沿 next 继续查找
容器长度为质数
质数只能被1和自身整除
减少位置冲突
数据已满时添加数据会自动扩容,新容量为不小于当前容量2倍的质数(见下文 HashHelpers.ExpandPrime)
新建一个该大小的容器
数据拷贝过去 重新计算位置
使用优化点
已知容器大小的情况 直接初始化对应大小
自定义元素类型可以实现IEqualityComparer<T>,以便更高效地判断相等和获取HashCode
哈希函数
当位置冲突时使用Slot.next记录同一个桶中下一个元素的下标,也就是拉链法解决冲突。
hashCode = value == null ? 0 : InternalGetHashCode(comparer.GetHashCode(value));
这里comparer就是IEqualityComparer<T>? comparer = _comparer;可以是默认的,也可以构造函数传入
InternalGetHashCode方法如下
/// <summary>
/// Computes the non-negative hash code used to place <paramref name="item"/> in a bucket.
/// </summary>
/// <param name="item">The element to hash; a null element hashes to 0.</param>
/// <param name="comparer">Comparer that supplies the hash code; when null, the element's own <c>GetHashCode</c> is used.</param>
/// <returns>The hash code masked with <c>Lower31BitMask</c>, or 0 for a null element.</returns>
private static int InternalGetHashCode(T item, IEqualityComparer<T>? comparer)
{
    if (item == null)
    {
        return 0;
    }

    int rawHash = comparer == null
        ? item.GetHashCode()
        : comparer.GetHashCode(item);

    return rawHash & Lower31BitMask;
}
最后通过hashCode对桶长度求余获取bucket
bucket = hashCode % _buckets!.Length;
内部AddIfNotPresent方法
/// <summary>
/// Adds <paramref name="value"/> to the set unless an equal element is already stored.
/// </summary>
/// <param name="value">The value to add.</param>
/// <returns><c>true</c> if the value was added; <c>false</c> if an equal element was already present.</returns>
/// <exception cref="InvalidOperationException">
/// More entries were visited in one bucket chain than the slot array can hold, i.e. the chain loops.
/// </exception>
private bool AddIfNotPresent(T value)
{
    if (_buckets == null)
    {
        Initialize(0);
    }

    int hashCode = InternalGetHashCode(value);
    int bucket = hashCode % _buckets.Length;
    int collisionCount = 0;

    // Keep a local reference to the slot array while walking the chain.
    Slot[] slots = _slots;

    // Buckets store (slot index + 1), so an empty bucket gives i == -1 and the loop body never runs.
    // Otherwise follow the next links until a negative index ends the chain.
    for (int i = _buckets[bucket] - 1; i >= 0; i = slots[i].next)
    {
        // An equal element is already stored: nothing to add.
        if (slots[i].hashCode == hashCode && _comparer.Equals(slots[i].value, value))
        {
            return false;
        }

        // More steps than there are slots means the chain cannot be a simple list.
        if (collisionCount >= slots.Length)
        {
            // The chain of entries forms a loop, which means a concurrent update has happened.
            throw new InvalidOperationException(
                "Operations that change non-concurrent collections must have exclusive access. " +
                "A concurrent update was performed on this collection and corrupted its state.");
        }

        collisionCount++;
    }

    // Pick the slot to write: reuse a freed slot if there is one, otherwise append at _lastIndex.
    int index;
    if (_freeList >= 0)
    {
        index = _freeList;
        _freeList = slots[index].next;
    }
    else
    {
        if (_lastIndex == slots.Length)
        {
            IncreaseCapacity();

            // The arrays change during resize, so re-read the slots and recompute the bucket.
            slots = _slots;
            bucket = hashCode % _buckets.Length;
        }

        index = _lastIndex;
        _lastIndex++;
    }

    // Store the element and link it in as the new head of its bucket chain.
    slots[index].hashCode = hashCode;
    slots[index].value = value;
    slots[index].next = _buckets[bucket] - 1;
    _buckets[bucket] = index + 1;
    _count++;
    _version++;

    return true;
}
HashHelpers辅助类
这里HashHelpers是用来求素数和获取下一次扩容的大小的辅助类,里面有一个数组存放基础素数,如果容量超过已有素数,会通过数学的方法计算出需要的素数。
/// <summary>
/// Helpers for choosing prime table sizes: a table of precomputed primes, a primality test,
/// and the growth policy used when a hash table expands.
/// </summary>
public class HashHelpers
{
    public const uint HashCollisionThreshold = 100;

    // This is the maximum prime smaller than Array.MaxArrayLength
    public const int MaxPrimeArrayLength = 0x7FEFFFFD;

    public const int HashPrime = 101;

    private static readonly int[] s_primes =
    {
        3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
        1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
        17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
        187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
        1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369
    };

    /// <summary>
    /// Determines whether <paramref name="candidate"/> is a prime number.
    /// </summary>
    /// <param name="candidate">The number to test.</param>
    /// <returns><c>true</c> if the number is prime; otherwise <c>false</c>. Values below 2 are never prime.</returns>
    public static bool IsPrime(int candidate)
    {
        // Primes start at 2. Rejecting smaller values up front also keeps negative
        // numbers away from Math.Sqrt and stops 1 from being reported as prime.
        if (candidate < 2)
        {
            return false;
        }

        // The only even prime is 2.
        if ((candidate & 1) == 0)
        {
            return candidate == 2;
        }

        // Trial division by odd numbers from 3 up to the square root; any factor above
        // the square root pairs with one below it, so larger divisors need no check.
        int limit = (int)Math.Sqrt(candidate);
        for (int divisor = 3; divisor <= limit; divisor += 2)
        {
            if ((candidate % divisor) == 0)
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns a prime that is at least <paramref name="min"/>.
    /// </summary>
    /// <param name="min">The minimum acceptable value.</param>
    /// <returns>
    /// The first entry of the precomputed table that is &gt;= <paramref name="min"/>; beyond the table,
    /// the first odd prime p &gt;= <paramref name="min"/> for which (p - 1) is not a multiple of
    /// <see cref="HashPrime"/>; or <paramref name="min"/> itself if the search finds none.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="min"/> is negative.</exception>
    public static int GetPrime(int min)
    {
        if (min < 0)
        {
            throw new ArgumentException("Capacity overflowed and went negative.", nameof(min));
        }

        // Try the precomputed table first.
        foreach (int prime in s_primes)
        {
            if (prime >= min)
            {
                return prime;
            }
        }

        // Outside of our predefined table. Compute the hard way.
        // (min | 1) forces an odd starting point, and stepping by 2 skips every even number.
        for (int i = (min | 1); i < int.MaxValue; i += 2)
        {
            if (IsPrime(i) && ((i - 1) % HashPrime != 0))
            {
                return i;
            }
        }

        return min;
    }

    /// <summary>
    /// Returns the size a hash table of <paramref name="oldSize"/> should grow to.
    /// </summary>
    /// <param name="oldSize">The current table size.</param>
    /// <returns>
    /// A prime that is at least twice <paramref name="oldSize"/>, capped at
    /// <see cref="MaxPrimeArrayLength"/> when doubling would exceed it.
    /// </returns>
    public static int ExpandPrime(int oldSize)
    {
        int newSize = 2 * oldSize;

        // Allow the hashtables to grow to maximum possible size (~2G elements) before encountering capacity overflow.
        // Note that this check works even when _items.Length overflowed thanks to the (uint) cast
        if ((uint)newSize > MaxPrimeArrayLength && MaxPrimeArrayLength > oldSize)
        {
            return MaxPrimeArrayLength;
        }

        return GetPrime(newSize);
    }
}
对外公有方法
/// <summary>
/// Adds every element of <paramref name="other"/> that is not already in this set.
/// </summary>
/// <param name="other">The collection to merge into this set.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void UnionWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    foreach (T item in other)
    {
        AddIfNotPresent(item);
    }
}

/// <summary>
/// Removes from this set every element that appears in <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection of elements to remove.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void ExceptWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // This is already the empty set; nothing to remove.
    // NOTE(review): AddIfNotPresent increments _count while this reads m_count —
    // confirm both identifiers refer to the same counter.
    if (m_count == 0)
    {
        return;
    }

    // Special case if other is this; a set minus itself is the empty set.
    if (other == this)
    {
        Clear();
        return;
    }

    // Remove every element in other from this.
    foreach (T element in other)
    {
        Remove(element);
    }
}

/// <summary>
/// Modifies this set in place into its symmetric difference with <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void SymmetricExceptWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // If this set is empty, then the symmetric difference is other.
    if (m_count == 0)
    {
        UnionWith(other);
        return;
    }

    // Special case this; the symmetric difference of a set with itself is the empty set.
    if (other == this)
    {
        Clear();
        return;
    }

    // If other is a HashSet, it has unique elements according to its equality comparer,
    // but if they're using different equality comparers, then assumption of uniqueness
    // will fail. So first check if other is a hashset using the same equality comparer;
    // symmetric except is a lot faster and avoids bit array allocations if we can assume
    // uniqueness.
    if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
    {
        SymmetricExceptWithUniqueHashSet(otherAsSet);
    }
    else
    {
        SymmetricExceptWithEnumerable(other);
    }
}
交集和子集等判断
/// <summary>
/// Modifies this set in place so that it keeps only the elements also found in <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to intersect with.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void IntersectWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // Nothing stored, so there is nothing to remove.
    // NOTE(review): AddIfNotPresent increments _count while this reads m_count —
    // confirm both identifiers refer to the same counter.
    if (m_count == 0)
    {
        return;
    }

    if (other is ICollection<T> otherAsCollection)
    {
        // Intersecting with an empty collection leaves nothing.
        if (otherAsCollection.Count == 0)
        {
            Clear();
            return;
        }

        // Faster if other is a hashset using same equality comparer; so check
        // that other is a hashset using the same equality comparer.
        if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
        {
            IntersectWithMyHashSetWithSameEC(otherAsSet);
            return;
        }
    }

    IntersectWithEnumerable(other);
}

/// <summary>
/// Removes every element of this set that <paramref name="other"/> does not contain.
/// Called by <c>IntersectWith</c> once it has checked that both sets use equal comparers.
/// </summary>
/// <param name="other">The set to intersect with.</param>
private void IntersectWithMyHashSetWithSameEC(MyHashSet<T> other)
{
    for (int i = 0; i < _lastIndex; i++)
    {
        // Slots with a negative hash code are skipped.
        if (_slots[i].hashCode >= 0)
        {
            T item = _slots[i].value;
            if (!other.Contains(item))
            {
                Remove(item);
            }
        }
    }
}

/// <summary>
/// Iterate over other. If contained in this, mark an element in bit array corresponding to
/// its position in _slots. If anything is unmarked (in bit array), remove it.
/// This attempts to allocate on the stack, if below StackAllocThreshold.
/// The method is unsafe because the stack-allocated buffer is accessed through a pointer.
/// </summary>
/// <param name="other">The sequence to intersect with.</param>
private unsafe void IntersectWithEnumerable(IEnumerable<T> other)
{
    // Keep track of current last index; don't want to move past the end of our bit array
    // (could happen if another thread is modifying the collection).
    int originalLastIndex = _lastIndex;
    int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex);

    // Back the BitHelper with stack memory up to StackAllocThreshold ints, otherwise with a heap array.
    BitHelper bitHelper;
    if (intArrayLength <= StackAllocThreshold)
    {
        int* bitArrayPtr = stackalloc int[intArrayLength];
        bitHelper = new BitHelper(bitArrayPtr, intArrayLength);
    }
    else
    {
        int[] bitArray = new int[intArrayLength];
        bitHelper = new BitHelper(bitArray, intArrayLength);
    }

    // Mark if contains: find index of item in slots array and mark corresponding element in bit array.
    foreach (T item in other)
    {
        int index = InternalIndexOf(item);
        if (index >= 0)
        {
            bitHelper.MarkBit(index);
        }
    }

    // If anything unmarked, remove it. Perf can be optimized here if BitHelper had a
    // FindFirstUnmarked method.
    for (int i = 0; i < originalLastIndex; i++)
    {
        if (_slots[i].hashCode >= 0 && !bitHelper.IsMarked(i))
        {
            Remove(_slots[i].value);
        }
    }
}
/// <summary>
/// Determines whether this set is a subset of <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <returns><c>true</c> if this set is a subset of <paramref name="other"/>; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public bool IsSubsetOf(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The empty set is a subset of any set.
    // NOTE(review): AddIfNotPresent increments _count while this reads m_count —
    // confirm both identifiers refer to the same counter.
    if (m_count == 0)
    {
        return true;
    }

    // Faster if other has unique elements according to this equality comparer; so check
    // that other is a hashset using the same equality comparer.
    if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
    {
        // If this has more elements than other, it can't be a subset.
        if (m_count > otherAsSet.Count)
        {
            return false;
        }

        // Already checked that we're using same equality comparer. Simply check that
        // each element in this is contained in other.
        return IsSubsetOfHashSetWithSameEC(otherAsSet);
    }

    ElementCount result = CheckUniqueAndUnfoundElements(other, false);

    // NOTE(review): unfoundCount is presumably a count and never negative, which would make
    // the second condition always true — confirm against CheckUniqueAndUnfoundElements.
    return result.uniqueCount == m_count && result.unfoundCount >= 0;
}