C#源码(十二) HashSet

基础介绍

仓库地址

https://github.com/dotnet/runtime/

我本地的项目位置

C:\project\SourceCode\runtime-5.0.0-preview.3.20214.6\src\libraries\System.Collections

实现原理和Dictionary差不多,都是链地址法解决冲突。

Dictionary 有Key Value

HashSet只有Value

实际容器为Slot[] m_slots;

// One entry of the set's backing array. Entries that share a bucket are chained through `next`.
internal struct Slot 
{
  internal int hashCode;      // Lower 31 bits of hash code, -1 if unused
  internal int next;          // Index of next entry, -1 if last
  internal T value;           // The stored element
}

HashSet操作元素的时间复杂度接近O(1)

定义int[] m_buckets 数组来保存元素在实际容器Slot[] m_slots 位置

即 Value 保存在以 m_slots[m_buckets[hashCode % m_buckets.Length] - 1] 为头的链上(m_buckets 中存的是"下标 + 1",0 表示空桶;hashCode 取哈希值的低31位),沿 Slot.next 依次查找

容器长度为质数

质数只能被1和自身整除

减少位置冲突

数据已满时再添加数据会自动扩容:新容量是不小于当前容量2倍的质数(见下文 HashHelpers.ExpandPrime)

新建一个容量为该质数(约为原来2倍)的容器

数据拷贝过去 重新计算位置

使用优化点

已知容器大小的情况 直接初始化对应大小

对于自定义元素类型,可以实现 IEqualityComparer<T> 并通过构造函数传入,从而更高效地判断相等和获取 HashCode

 

哈希函数

当位置冲突时使用Slot.next保存数据,也就是拉链法解决冲突。

hashCode = value == null ? 0 : InternalGetHashCode(comparer.GetHashCode(value));

这里comparer就是IEqualityComparer<T>? comparer = _comparer;可以是默认的,也可以构造函数传入

InternalGetHashCode方法如下

/// <summary>
/// Computes the hash code used to pick a bucket for <paramref name="item"/>.
/// </summary>
/// <param name="item">Element to hash; a null element always hashes to 0.</param>
/// <param name="comparer">Comparer that supplies the hash code; when null, the element's own <c>GetHashCode</c> is used.</param>
/// <returns>The hash code masked with <c>Lower31BitMask</c>, or 0 for a null element.</returns>
private static int InternalGetHashCode(T item, IEqualityComparer<T>? comparer)
{
    if (item != null)
    {
        int rawHash = comparer != null ? comparer.GetHashCode(item) : item.GetHashCode();
        return rawHash & Lower31BitMask;
    }

    return 0;
}

最后通过hashCode对桶长度求余获取bucket

bucket = hashCode % _buckets!.Length;

 

内部AddIfNotPresent方法

/// <summary>
/// Adds <paramref name="value"/> to the set unless an equal element is already stored.
/// </summary>
/// <param name="value">The element to add.</param>
/// <returns>true if the element was added; false if an equal element was already present.</returns>
/// <exception cref="InvalidOperationException">
/// More chain links were followed than there are slots, which the code treats as a concurrent update.
/// </exception>
private bool AddIfNotPresent(T value)
{
    if (_buckets == null)
    {
        Initialize(0);
    }

    int hashCode = InternalGetHashCode(value);
    int bucket = hashCode % _buckets.Length;
    int collisionCount = 0;
    // Keep a local reference to the slot array.
    Slot[] slots = _slots;

    // Walk the chain for this bucket. _buckets stores (slot index + 1), so subtracting 1 gives the
    // first slot to inspect, or -1 for an empty bucket, in which case the loop body never runs.
    for (int i = _buckets[bucket] - 1; i >= 0; i = slots[i].next)
    {
        // An equal element is already stored.
        if (slots[i].hashCode == hashCode && _comparer.Equals(slots[i].value, value))
        {
            return false;
        }

        // More links were followed than there are slots.
        if (collisionCount >= slots.Length)
        {
            // The chain of entries forms a loop, which means a concurrent update has happened.
            throw new InvalidOperationException( );
        }
        collisionCount++;
    }

    int index;

    // Take a slot from the free list if one is available.
    if (_freeList >= 0)
    {
        index = _freeList;
        _freeList = slots[index].next;
    }
    else
    {
        if (_lastIndex == slots.Length)
        {
            IncreaseCapacity();
            // this will change during resize
            slots = _slots;
            bucket = hashCode % _buckets.Length;
        }
        index = _lastIndex;
        _lastIndex++;
    }
    // Store the element and make it the new head of the bucket's chain.
    slots[index].hashCode = hashCode;
    slots[index].value = value;
    slots[index].next = _buckets[bucket] - 1;
    _buckets[bucket] = index + 1;
    _count++;
    _version++;

    return true;
}

 

HashHelpers辅助类

这里HashHelpers是用来求素数和获取下一次扩容的大小的辅助类,里面有一个数组存放基础素数,如果容量超过已有素数,会通过数学的方法计算出需要的素数。

/// <summary>
/// Helper routines for sizing hash tables: primality testing, finding a prime capacity,
/// and computing the capacity to grow to.
/// </summary>
public class HashHelpers
{
    public const uint HashCollisionThreshold = 100;

    // This is the maximum prime smaller than Array.MaxArrayLength
    public const int MaxPrimeArrayLength = 0x7FEFFFFD;
    public const int HashPrime = 101;

    // Ascending table of primes that GetPrime consults before falling back to a search.
    private static readonly int[] s_primes =
    {
        3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
        1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
        17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
        187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
        1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369
    };

    /// <summary>
    /// Determines whether <paramref name="candidate"/> is a prime number.
    /// </summary>
    /// <param name="candidate">The number to test.</param>
    /// <returns>true if the number is prime; otherwise false (including for 0, 1 and negative values).</returns>
    public static bool IsPrime(int candidate)
    {
        // Numbers below 2 are never prime; without this guard 1 and negative odd values
        // would fall through the trial-division loop and be reported as prime.
        if (candidate < 2)
        {
            return false;
        }

        // The only even prime is 2.
        if ((candidate & 1) == 0)
        {
            return candidate == 2;
        }

        // Trial division by odd numbers up to the square root: any larger factor would
        // have to pair with a smaller one that has already been tried.
        int limit = (int)Math.Sqrt(candidate);
        for (int divisor = 3; divisor <= limit; divisor += 2)
        {
            if ((candidate % divisor) == 0)
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns a prime that is at least <paramref name="min"/>.
    /// </summary>
    /// <param name="min">The smallest acceptable value.</param>
    /// <returns>
    /// The first table entry that is &gt;= <paramref name="min"/>; beyond the table, the first odd prime p &gt;= min
    /// for which (p - 1) is not a multiple of <see cref="HashPrime"/>; or <paramref name="min"/> itself if the search finds none.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="min"/> is negative.</exception>
    public static int GetPrime(int min)
    {
        if (min < 0)
        {
            throw new ArgumentException("Capacity must be non-negative.", nameof(min));
        }

        // Look in the precomputed table first.
        foreach (int prime in s_primes)
        {
            if (prime >= min)
            {
                return prime;
            }
        }

        // Outside of our predefined table. Compute the hard way.
        // OR-ing with 1 makes the start odd (unchanged or +1); stepping by 2 skips even numbers.
        for (int i = (min | 1); i < int.MaxValue; i += 2)
        {
            if (IsPrime(i) && ((i - 1) % HashPrime != 0))
            {
                return i;
            }
        }

        return min;
    }

    /// <summary>
    /// Returns the size of the hash table to grow to.
    /// </summary>
    /// <param name="oldSize">The current size.</param>
    /// <returns>
    /// A prime that is at least twice <paramref name="oldSize"/>, or <see cref="MaxPrimeArrayLength"/>
    /// when doubling would exceed it and the old size is still below it.
    /// </returns>
    public static int ExpandPrime(int oldSize)
    {
        // Explicitly unchecked: the range test below depends on wrap-around, so this must
        // not throw when the containing project is compiled with overflow checking enabled.
        int newSize = unchecked(2 * oldSize);

        // Allow the hashtables to grow to maximum possible size (~2G elements) before encountering capacity overflow.
        // Note that this check works even when the doubled size overflowed thanks to the (uint) cast
        if ((uint)newSize > MaxPrimeArrayLength && MaxPrimeArrayLength > oldSize)
        {
            return MaxPrimeArrayLength;
        }

        return GetPrime(newSize);
    }
}

 

对外公有方法

/// <summary>
/// Adds every element of <paramref name="other"/> that is not already present in this set.
/// </summary>
/// <param name="other">The collection whose elements are merged into this set.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void UnionWith(IEnumerable<T> other)
{
    _ = other ?? throw new ArgumentNullException(nameof(other));

    using (IEnumerator<T> enumerator = other.GetEnumerator())
    {
        while (enumerator.MoveNext())
        {
            // The return value is ignored: duplicates are simply skipped.
            AddIfNotPresent(enumerator.Current);
        }
    }
}

/// <summary>
/// Removes from this set every element that appears in <paramref name="other"/>.
/// </summary>
/// <param name="other">The elements to remove.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void ExceptWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // This is already the empty set; nothing to remove.
    // NOTE(review): other snippets in this file name the count field _count; confirm which one is declared.
    if (m_count == 0)
    {
        return;
    }

    // Special case if other is this; a set minus itself is the empty set.
    if (other == this)
    {
        Clear();
        return;
    }

    // Remove every element in other from this.
    foreach (T element in other)
    {
        Remove(element);
    }
}

/// <summary>
/// Modifies this set in place; per the inline comments, the result is the symmetric difference
/// with <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void SymmetricExceptWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // If this set is empty, then the symmetric difference is other.
    // NOTE(review): other snippets in this file name the count field _count; confirm which one is declared.
    if (m_count == 0)
    {
        UnionWith(other);
        return;
    }

    // Special case this; the symmetric difference of a set with itself is the empty set.
    if (other == this)
    {
        Clear();
        return;
    }

    // If other is a HashSet, it has unique elements according to its equality comparer,
    // but if they're using different equality comparers, then assumption of uniqueness
    // will fail. So first check if other is a hashset using the same equality comparer;
    // symmetric except is a lot faster and avoids bit array allocations if we can assume
    // uniqueness.
    if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
    {
        SymmetricExceptWithUniqueHashSet(otherAsSet);
    }
    else
    {
        SymmetricExceptWithEnumerable(other);
    }
}
对外公有方法

 

交集和子集等判断

/// <summary>
/// Intersects this set with <paramref name="other"/>, modifying this set in place.
/// </summary>
/// <param name="other">The collection to intersect with.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void IntersectWith(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // Intersection of the empty set with anything is empty.
    // NOTE(review): other snippets in this file name the count field _count; confirm which one is declared.
    if (m_count == 0)
    {
        return;
    }

    if (other is ICollection<T> otherAsCollection)
    {
        // Intersection with an empty collection is empty.
        if (otherAsCollection.Count == 0)
        {
            Clear();
            return;
        }

        // Faster if other is a hashset using same equality comparer; so check
        // that other is a hashset using the same equality comparer.
        if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
        {
            // Call the helper under the name it is actually defined with in this file.
            IntersectWithMyHashSetWithSameEC(otherAsSet);
            return;
        }
    }

    IntersectWithEnumerable(other);
}

/// <summary>
/// Removes from this set every element that <paramref name="other"/> does not contain.
/// </summary>
/// <param name="other">The set to intersect with.</param>
private void IntersectWithMyHashSetWithSameEC(MyHashSet<T> other)
{
    for (int slotIndex = 0; slotIndex < _lastIndex; slotIndex++)
    {
        // A negative hash code marks a slot that holds no element.
        if (_slots[slotIndex].hashCode < 0)
        {
            continue;
        }

        T candidate = _slots[slotIndex].value;
        if (other.Contains(candidate))
        {
            continue;
        }

        Remove(candidate);
    }
}

/// <summary>
/// Iterate over other. If contained in this, mark an element in bit array corresponding to
/// its position in _slots. If anything is unmarked (in bit array), remove it.
/// This attempts to allocate on the stack, if below StackAllocThreshold.
/// The method is marked unsafe because the small-size path uses a stackalloc'ed int pointer.
/// </summary>
/// <param name="other">The elements to keep; anything in this set not found in it is removed.</param>
private unsafe void IntersectWithEnumerable(IEnumerable<T> other)
{
    // keep track of current last index; don't want to move past the end of our bit array
    // (could happen if another thread is modifying the collection)
    int originalLastIndex = _lastIndex;
    int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex);

    BitHelper bitHelper;
    // Pick the BitHelper backing storage by size: stackalloc for small bit arrays, a managed array otherwise.
    if (intArrayLength <= StackAllocThreshold)
    {
        int* bitArrayPtr = stackalloc int[intArrayLength];
        bitHelper = new BitHelper(bitArrayPtr, intArrayLength);
    }
    else
    {
        int[] bitArray = new int[intArrayLength];
        bitHelper = new BitHelper(bitArray, intArrayLength);
    }

    // mark if contains: find index of in slots array and mark corresponding element in bit array
    foreach (T item in other)
    {
        int index = InternalIndexOf(item);
        if (index >= 0)
        {
            bitHelper.MarkBit(index);
        }
    }

    // if anything unmarked, remove it. Perf can be optimized here if BitHelper had a 
    // FindFirstUnmarked method.
    for (int i = 0; i < originalLastIndex; i++)
    {
        if (_slots[i].hashCode >= 0 && !bitHelper.IsMarked(i))
        {
            Remove(_slots[i].value);
        }
    }
}
求和other交集
/// <summary>
/// Determines whether this set is a subset of <paramref name="other"/>.
/// </summary>
/// <param name="other">The collection to compare against.</param>
/// <returns>true if this set is a subset of <paramref name="other"/>; otherwise false.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public bool IsSubsetOf(IEnumerable<T> other)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The empty set is a subset of any set.
    // NOTE(review): other snippets in this file name the count field _count; confirm which one is declared.
    if (m_count == 0)
    {
        return true;
    }

    // Faster if other has unique elements according to this equality comparer; so check
    // that other is a hashset using the same equality comparer.
    if (other is MyHashSet<T> otherAsSet && AreEqualityComparersEqual(this, otherAsSet))
    {
        // If this has more elements then it can't be a subset.
        if (m_count > otherAsSet.Count)
        {
            return false;
        }

        // Already checked that we're using same equality comparer. Simply check that
        // each element in this is contained in other.
        return IsSubsetOfHashSetWithSameEC(otherAsSet);
    }

    // General case: compare the returned uniqueCount against this set's own count.
    ElementCount result = CheckUniqueAndUnfoundElements(other, false);
    return (result.uniqueCount == m_count && result.unfoundCount >= 0);
}
是否为other的子集

 

posted @ 2020-07-22 14:16  SeedQi  阅读(723)  评论(0编辑  收藏  举报