使用两次Hash的Hash表——Twice_Hash_Map

先回顾一下hash表的相关内容，STL里面的unordered_map和map。

使用unordered_map，通过hash函数，将key映射到一个位置，如果这个位置原本没有值，那么就可以将这个key作为唯一的下标来访问这个位置。
但是如果这个位置已经被占了，那么就需要比较这个key和已经存在的下标是否一样。所以unordered_map需要提供key的hash和等于函数。

由于是使用hash得到的值作为下标，所以会出现不同的key得到相同的下标值，称为冲突。处理冲突的方法，一种是使用一个链表，保存这个位置的key。
每次hash到这个位置，就逐次比较链表里的key，以确定这个key是否已经存在。另一种方法是使用一个有序二叉树，可以使得查找的效率更高。当然需要提供比较函数。

STL中的map就是一个用来查找的有序二叉树，查找所需要的比较次数是O(log2(N))，N为已经存在的节点数。而unordered_map的时间复杂度是一次hash的时间加上与冲突项的比较次数，平均为O(1)。

需要注意的是
1，实际的时间效率。
一般情况下，会认为unordered_map比map快。在hash函数较好时，冲突数比较少，unordered_map会快。当需要处理的key很多，hash函数也不是那么好的时候，冲突次数增加，会导致unordered_map效率降低。所以当需要处理大规模的自定义数据，发现unordered_map比较慢的时候，可以试试map。
2， map所需要的小于比较
map在插入或查找key的时候，使用的是小于比较。例如插入key，与root节点比较，如果小于root节点，那么就向左查找，如果大于，就向右查找。否则就是等于。所以map只需要小于比较。
3. unordered_map的自动扩容，
unordered_map基于数组，当冲突数太大的时候，可以将数组扩大，将hash值对一个更大的质数取余得到下标，可以减少一部分冲突。所以自己提供hash函数的时候，不需要自己使用质数将hash值取余。
4. 对于自定义数据的hash
当需要自己写hash函数的时候，就会为hash的效率担忧。网上有各种对字符串和整数的hash方法，如何结合这些方法来实现自定义数据的hash，还是需要好好设计的。一种比较简单的方法是，将自定义数据转化为字符串，比如u16string,u32string。STL提供了这些字符串的hash，等于，小于操作，所以这是非常简单的方法。但是无论用于map还是unordered_map，都需要保证不同的数据转化得到的字符串互不相同。另一种方法是使用两次或多次hash。STL的hash表只接受一个hash函数，得到一个下标。如果自己使用两次hash，例如
hash = hash_0 * PRIME + hash_1，把这个组合的hash值传给unordered_map，而自己所使用的PRIME又恰好与unordered_map内部取余所用的质数冲突，效果就会很糟糕。那么最好就是自己实现一个类似于unordered_map的hash表，直接使用这个组合的hash值作为下标。

一个自定义的使用两个hash函数，并且使用有序二叉树处理冲突的hash表的实现及简单测试如下，

hashmap.h

/*
 * hashmap.h
 *
 *  Created on: Dec 26, 2012
 *      Author: chjd
 */

#ifndef HASHMAP_H_
#define HASHMAP_H_

#include <vector>
#include <map>
#include <utility>
#include <random>
using std::vector;
using std::map;
using std::pair;

#include <algorithm>
using std::max;

#include <cassert>

/**
 * Hash table addressed by the combination of two hash functions.
 *
 * A key is stored in the bucket with index
 *     (HashFn2(key) % ROW_PRIME) * COL_PRIME + (HashFn(key) % COL_PRIME)
 * inside a flat array of ROW_PRIME * COL_PRIME buckets. Every bucket ("cell")
 * is an ordered std::map, so keys that collide on both hashes are told apart
 * with LessFn.
 *
 * Template parameters:
 *   _KeyT, _ValueT  key type and mapped type
 *   _HashFn         first hash functor (reduced modulo COL_PRIME)
 *   _HashFn2        second hash functor (reduced modulo ROW_PRIME)
 *   _EqualFn        exported as EqualFn; not used by this implementation
 *   _LessFn         ordering functor of the per-bucket std::map
 */
template<typename _KeyT,typename _ValueT,typename _HashFn,typename _HashFn2,typename _EqualFn,typename _LessFn>
class HashMap
{
public:
    typedef _KeyT KeyT;
    typedef _ValueT ValueT;
    typedef _HashFn HashFn;
    typedef _HashFn2 HashFn2;
    typedef _EqualFn EqualFn;
    typedef _LessFn LessFn;

    typedef std::map<KeyT,ValueT,LessFn> CellT;
    typedef std::map<size_t,CellT> DenseDataT;
    typedef std::vector<CellT> ArrayDataT;

    // pair<const KeyT,ValueT>
    typedef typename CellT::value_type KeyValuePairT;
    typedef ArrayDataT DataT;

private:
    static const unsigned int littlePrimeSize = 7;
    static const unsigned int bigPrimeSize = 13;
    static const size_t maxBit = 24;    // 24 for 32bit memory space and 27bit for 64bit memory space
    static const size_t maxTotal = 13-2;    // WEnLarge() stops growing row_pindex+col_pindex here
    static const size_t maxCellSize = 253;  // bucket size above which a rehash is requested
    // Largest accepted start level: keeps 2+level inside littlePrimeTable and
    // (row_pindex + col_pindex) - row_pindex inside bigPrimeTable after a rehash.
    static const size_t maxLevel = 3;

    size_t ROW_PRIME;
    size_t COL_PRIME;
    size_t row_pindex;   // index of ROW_PRIME in littlePrimeTable
    size_t col_pindex;   // index of COL_PRIME in bigPrimeTable
    size_t entrySize;    // ROW_PRIME * COL_PRIME
    size_t entryLoad;    // not empty entry
    size_t cellLoad;     // size of the largest cell seen since the last ReInit()

    size_t cnt;          // insert() calls since the last rehash check

    DataT data;

// for rehash prime_index
    typedef std::uniform_int_distribution<> Dis;
    typedef std::mt19937 Gen;
    Gen gen_pindex;      // default-constructed, i.e. default-seeded
    Dis dis_pindex;

    size_t littlePrimeTable[littlePrimeSize] = {
            5, 11, 23, 47, 97, 193, 397
    };

    size_t bigPrimeTable[bigPrimeSize] = {
            1021,            /* 10 bit */
            2027,            /* 11 bit */
            4013,            /* 12 bit */
            8191,            /* 13 bit */
            16127,            /* 14 bit */
            31321,            /* 15 bit */
            66047,            /* 16 bit */
            131071,            /* 17 bit */
            263167,            /* 18 bit */
            524287,            /* 19 bit */
            1046527,        /* 20 bit */
            2097593,        /* 21 bit */
            3626149,        /* 22 bit */
    };
// Larger entries that are currently disabled:
//            8388587,        /* 23 bit */
//            16769023,        /* 24 bit */
//            33554347,        /* 25 bit */
//            67108721,        /* 26 bit */
//            134217613,        /* 27 bit */
//            253450711,        /* 28 bit */
//            433494437,        /* 29 bit */
//            1073676287,        /* 30 bit */
//            2147483647,        /* 31 bit */
//            2971215073,        /* 32 bit */


    // Combines both hashes of key into a bucket index in [0, entrySize).
    size_t FinalHash(const KeyT& key) const
    {
        HashFn hashFn;
        HashFn2 hashFn2;
        const size_t hash = hashFn(key)%COL_PRIME;
        const size_t hash2 = hashFn2(key)%ROW_PRIME;
        return (hash2 * COL_PRIME + hash);
    }

    // Selects the start primes for the given level (clamped to maxLevel so the
    // table indices stay valid) and allocates the empty bucket array.
    void Init(size_t level=0)
    {
        if(level>maxLevel)
            level = maxLevel;
        row_pindex = 2+level;
        col_pindex = 4+level;
        dis_pindex = Dis(2, littlePrimeSize-2);
        ReInit();
        data = DataT(entrySize);
    }

    // Loads the primes selected by row_pindex/col_pindex and resets all
    // statistics; the bucket array itself is not touched.
    void ReInit()
    {
        ROW_PRIME = littlePrimeTable[row_pindex];
        COL_PRIME = bigPrimeTable[col_pindex];
        entrySize = ROW_PRIME*COL_PRIME;
        entryLoad = 0;
        cellLoad = 0;
        cnt = 0;
    }

    // Increases total by one when more than 95% of the buckets are occupied
    // and total has not reached maxTotal yet.
    void WEnLarge(size_t& total)
    {
        if(entryLoad*1.0/entrySize>0.95 && total<maxTotal)
        {
            total += 1;
        }
    }

    // Advances row_pindex by a random step (modulo littlePrimeSize) and gives
    // the remainder of total to col_pindex.
    void UpdatePrimeIndex(size_t total)
    {
        row_pindex = (row_pindex + dis_pindex(gen_pindex))%littlePrimeSize;
        col_pindex = total - row_pindex;
    }

    // Called once per insert(). On every maxCellSize-th call it checks whether
    // the largest bucket exceeds maxCellSize; if so it picks new primes,
    // resets the statistics and returns true, and the caller must then run
    // ReHash().
    bool WReHash()
    {
        cnt++;
        if(cnt<maxCellSize)
            return false;
        cnt = 0;
        if(cellLoad<=maxCellSize)
            return false;

        size_t total = row_pindex+col_pindex;
        WEnLarge(total);
        UpdatePrimeIndex(total);
        ReInit();
        return true;
    }

    // Moves every stored pair into a fresh bucket array laid out with the
    // primes chosen by the preceding WReHash().
    void ReHash()
    {
        DataT t_data = DataT(entrySize);
        for(const CellT& cell : data)
        {
            for(const KeyValuePairT& kv : cell)
            {
                const bool inserted = insert_key(t_data,kv).second;
                assert(inserted);
                (void)inserted;    // keeps NDEBUG builds warning-free
            }
        }
        data.swap(t_data);
    }

    // Inserts pairValue into t_data and updates entryLoad/cellLoad.
    // Returns a copy of the stored pair (the already present one when the key
    // exists) and whether an insertion took place.
    std::pair<KeyValuePairT,bool> insert_key(DataT& t_data,const KeyValuePairT& pairValue)
    {
        CellT& entry = t_data[FinalHash(pairValue.first)];
        if(entry.empty())
            entryLoad++;
        const std::pair<typename CellT::iterator,bool> it = entry.insert(pairValue);
        cellLoad = std::max<size_t>(entry.size(),cellLoad);
        // Built explicitly: an unqualified make_pair is only found through ADL
        // and therefore fails for non-std key/value types such as int.
        return std::pair<KeyValuePairT,bool>(*it.first,it.second);
    }

public:
    // Creates an empty table. level selects larger start primes; values above
    // maxLevel are clamped.
    HashMap(size_t level=0)
    {
        Init(level);
    }

    // Looks key up; on success copies the mapped value into value and returns
    // true, otherwise leaves value untouched and returns false.
    bool find(const KeyT& key,ValueT& value) const
    {
        const CellT& entry = data[FinalHash(key)];
        const typename CellT::const_iterator cell_it = entry.find(key);
        if(cell_it==entry.end())
            return false;

        value = cell_it->second;
        return true;
    }

    // Returns a reference to the value mapped to key, inserting a
    // value-initialized one when the key is absent. Load statistics are kept
    // up to date; this call never triggers a rehash.
    ValueT& operator [] (const KeyT& key)
    {
        CellT& entry = data[FinalHash(key)];
        if(entry.empty())
            entryLoad++;
        ValueT& value = entry[key];
        cellLoad = std::max<size_t>(entry.size(),cellLoad);
        return value;
    }

    // Inserts pairValue unless its key is already present, rehashing first if
    // WReHash() requests it. Returns a copy of the stored pair and whether the
    // insertion took place.
    std::pair<KeyValuePairT,bool> insert(KeyValuePairT pairValue)
    {
        if(WReHash())
            ReHash();
        return insert_key(data,pairValue);
    }

    // Returns (occupied buckets / bucket count, largest bucket / maxCellSize).
    std::pair<double,double> LoadFactor() const
    {
        const double entry =  entryLoad * 1.0 / entrySize;
        const double cell = cellLoad * 1.0 / maxCellSize;
        return std::pair<double,double>(entry,cell);
    }
};


/*
 * Converting arbitrary data to a string so that it can be used as a map key.
 *
 * int: u32string
 * address / size_t (32bit or 64bit): u16string, one char16_t per 16 bits
 *
 *     N = sizeof(addr) >> 1;            // number of 16-bit units in the address
 *     str = u16string(N, char16_t());
 *     mask = size_t(0xFFFF);
 *     tmp_addr = (size_t)addr;
 *     for (i = 0; i < N; i++) {
 *         str[i] = char16_t(tmp_addr & mask);
 *         tmp_addr = tmp_addr >> 16;
 *     }
 */

#endif /* HASHMAP_H_ */

测试相关的文件test.h和test.cpp,

View Code

/*
 * test.h
 *
 *  Created on: Dec 27, 2012
 *      Author: chjd
 */

#ifndef TEST_H_
#define TEST_H_


#include <string>
using std::string;

#include <unordered_map>
using std::unordered_map;

#include <iostream>
using std::cout;
using std::endl;

#include <random>

#include "hashmap.h"

#include <functional>
using std::hash;

#include <ctime>
using std::clock;
using std::clock_t;

#include <sparsehash/dense_hash_map>
using google::dense_hash_map;

// JS hash over a NUL-terminated C string; the result is masked to 31 bits.
class JsHash
{
public:
    size_t operator()(const char* str) const
    {
        size_t acc = 1315423911;
        for (const char* p = str; *p != '\0'; ++p)
        {
            // Mix the current character with shifted copies of the state.
            const size_t mixed = (acc << 5) + (*p) + (acc >> 2);
            acc ^= mixed;
        }
        return acc & 0x7FFFFFFF;
    }
};

// First hash functor for string keys: forwards to the standard library hash.
class HashStr
{
public:
    size_t operator()(const string& s) const
    {
        return hash<string>()(s);
    }
};

// Second hash functor for string keys: the JS hash (seed 1315423911, result
// masked to 31 bits) applied to every byte of the string.
//
// The hash runs over s.size() bytes instead of going through c_str(), so a
// key containing '\0' is hashed completely rather than only up to its first
// NUL byte. For strings without embedded NUL bytes the value is the same as
// hashing the C string.
class HashStr2
{
public:
    size_t operator()(const std::string& s) const
    {
        size_t h = 1315423911;
        for (std::string::size_type i = 0; i < s.size(); ++i)
        {
            h ^= ((h << 5) + s[i] + (h >> 2));
        }
        return (h & 0x7FFFFFFF);
    }
};

// Equality functor for string keys.
class EqualStr
{
public:
    bool operator()(const string& a, const string& b) const
    {
        const bool same = (a.compare(b) == 0);
        return same;
    }
};

// Ordering functor for string keys (lexicographic "less than").
class LessStr
{
public:
    bool operator()(const string& a, const string& b) const
    {
        const bool before = (a.compare(b) < 0);
        return before;
    }
};

// Container under test: the two-hash HashMap with string keys, using HashStr
// as the first hash, HashStr2 as the second, and LessStr inside the buckets.
typedef HashMap<string, size_t, HashStr, HashStr2, EqualStr, LessStr> SHashMap;
// Containers it is compared against: std::unordered_map, std::map and
// google::dense_hash_map, all mapping string to size_t.
typedef unordered_map<string, size_t> SUMap;
typedef map<string,size_t> SMap;
typedef dense_hash_map<string, size_t, HashStr,EqualStr> DMap;

// Random distribution and engine types used by the tests to generate keys.
typedef std::uniform_int_distribution<> DisUniInt;
typedef std::mt19937 GenInt;



#endif /* TEST_H_ */

View Code

/*
 * test.cpp
 *
 *  Created on: Dec 27, 2012
 *      Author: chjd
 */

#include "test.h"

// Fills s with a random string: the length is drawn from disl and every
// character from disa, both using gen. Characters are written from the last
// position towards the first.
void GenString(DisUniInt& disl, DisUniInt& disa, GenInt& gen, string& s)
{
    const int length = disl(gen);
    s = string(length, char(0));
    for (string::reverse_iterator pos = s.rbegin(); pos != s.rend(); ++pos)
    {
        *pos = char(disa(gen));
    }
}

int TestSHashMap(size_t N)
{
    SHashMap table;
    DisUniInt disl(5, 32);
    DisUniInt disa(int('a'), int('z'));
    GenInt gen;
    size_t cnt = 0;
    clock_t t0 = clock();
    for (size_t i = 0; i < N; i++)
    {
        string s;
        GenString(disl, disa, gen, s);
//        cout << s << endl;
        auto ok = table.insert(make_pair(s, i));
        if (ok.second)
            cnt += 1;
/*        else
        {
            cout << ok.first.first << " -> " << ok.first.second << endl;
        }
*/    }
    clock_t t1 = clock();
    double t2 = (t1 - t0) * 1.0 / CLOCKS_PER_SEC;
    cout << "time:" << t2 << endl;
    pair<double,double> factor = table.LoadFactor();
    cout << "entry:" << factor.first << "\t cell:" << factor.second << endl;
    cout << "cnt:" << cnt << endl;
    return cnt;
}

int TestUMap(size_t N)
{
    SUMap table;
    DisUniInt disl(5, 32);
    DisUniInt disa(0, 255);
    GenInt gen;
    size_t cnt = 0;
    clock_t t0 = clock();
    for (size_t i = 0; i < N; i++)
    {
        string s;
        GenString(disl, disa, gen, s);
        auto ok = table.insert(make_pair(s, i));
        if (ok.second)
            cnt++;
    }
    clock_t t1 = clock();
    double t2 = (t1 - t0) * 1.0 / CLOCKS_PER_SEC;
    cout << "time:" << t2 << endl;
    cout << "cnt:" << cnt << endl;
    return cnt;
}

int TestSMap(size_t N)
{
    SMap table;
    DisUniInt disl(5, 32);
    DisUniInt disa(0, 255);
    GenInt gen;
    size_t cnt = 0;
    clock_t t0 = clock();
    for (size_t i = 0; i < N; i++)
    {
        string s;
        GenString(disl, disa, gen, s);
        auto ok = table.insert(make_pair(s, i));
        if (ok.second)
            cnt++;
    }
    clock_t t1 = clock();
    double t2 = (t1 - t0) * 1.0 / CLOCKS_PER_SEC;
    cout << "time:" << t2 << endl;
    cout << "cnt:" << cnt << endl;
    return cnt;
}

int TestDMap(size_t N)
{
    DMap table;
    DisUniInt disl(5, 32);
    DisUniInt disa(0, 255);
    GenInt gen;
    size_t cnt = 0;
    clock_t t0 = clock();
    table.set_empty_key(string());
    for (size_t i = 0; i < N; i++)
    {
        string s;
        GenString(disl, disa, gen, s);
        auto ok = table.insert(make_pair(s, i));
        if (ok.second)
            cnt++;

    }
    clock_t t1 = clock();
    double t2 = (t1 - t0) * 1.0 / CLOCKS_PER_SEC;
    cout << "time:" << t2 << endl;
    cout << "cnt:" << cnt << endl;
    return cnt;
}

bool TestCorrect()
{
    size_t N = 10000000;
    SUMap table_0;
    SHashMap table_1;
    DisUniInt disl(5, 32);
    DisUniInt disa(0, 255);
    GenInt gen;
    size_t cnt_0 = 0, cnt_1=0;
    for (size_t i = 0; i < N; i++)
    {
        string s;
        GenString(disl, disa, gen, s);
        auto ok = table_0.insert(make_pair(s, i));
        if (ok.second)
            cnt_0++;
        auto ok_1 = table_1.insert(make_pair(s,i));
        if(ok_1.second)
            cnt_1++;
    }
    cout << "cnt:" << cnt_0 << "\t" << cnt_1 << endl;
    return cnt_0==cnt_1;
}

// Runs the correctness comparison and reports its outcome through the exit
// status: 0 when both containers agree, 1 otherwise. The benchmark calls are
// kept commented out.
int main()
{
//    size_t N = 100000;
    const bool ok = TestCorrect();
/*
    TestSHashMap(N);
    TestUMap(N);
    TestSMap(N);
    TestDMap(N);
*/
    return ok ? 0 : 1;
}

测试比较了自己实现的HashMap，STL的unordered_map，map，和google dense_hash_map。在数据比较小的时候，测试结果还好，当数据比较多的时候，会比unordered_map慢一些。应该是扩容的部分没写好吧。

目前就先这样了。

posted @ 2012-12-28 13:04 Frandy.CH 阅读(1568) 评论(0) 编辑收藏举报

刷新页面返回顶部

Frandy.CH

使用两次Hash的Hash表——Twice_Hash_Map

公告