Python 2.7的字典实现简化版(C语言)

这是一个能自动调整大小的哈希字典,外部接口实现了下列功能.

1.字典级别:

创建字典 dict_new

归零字典 dict_clear

2.键值级别:

查找 dict_search

强制查找 dict_force_search

更新 dict_update

添加 dict_add

删除 dict_del

所谓强制查找就是假如key不存在,那么它将先在字典中添加这个key,值设置为默认值,再返回这个值的指针.

由于键和值都是以空指针(void *)定义的,所以在处理一些简单的值类型时(如int)显得繁琐了些(比如排序用的比较函数_valcmp),但好处是更加灵活了:比如稍作修改(valuedup和get_default_value)就可以处理值为字符串的情况.

C确实很快,但繁重的内存管理果然名不虚传.这个简单的字典要求:

1.键(me_key)和值(me_value)的指针所指向的堆内存区域能够直接用free释放,如果这些区域还包含另一些堆指针,那么可能会出问题.

2.只需传递缓冲数据(main中的keybuf和valuebuf)给键值函数,函数内部会根据情况申请或释放内存,或不做任何处理.

 

为方便处理,words文本格式要求每行一个词语.

/* Pure C simple version of python 2.7.8 hash table */
/* Sample usage: see main() */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
/* Starting table size (number of slots); larger sizes are reached by
 * repeatedly doubling this value. */
#define PyDict_MINSIZE 8
/* Number of bits the perturb value is shifted right after each probe step. */
#define PERTURB_SHIFT 5
/* True when the filled slots (active + dummy) reach two thirds of the table
 * size, where the table size is ma_mask + 1. */
#define NEED_RESIZE(mp) ((mp)->ma_fill * 3 >= ((mp)->ma_mask + 1) * 2)

/* Keys and values are handled as untyped pointers: PyObject * is void *. */
typedef void PyObject;

/* One slot of the hash table. */
typedef struct {
    size_t me_hash;         /* hash of the key stored in this slot */
    PyObject *me_key;       /* NULL: never used; dummy: deleted; else the key */
    PyObject *me_value;     /* non-NULL only while the slot holds an entry */
} PyDictEntry;

/* The dictionary: a table of slots plus the callbacks used to hash, compare
 * and copy keys and values. */
typedef struct _dictobject PyDictObject;
struct _dictobject {
    size_t ma_fill;  /* # Active + # Dummy */
    size_t ma_used;  /* # Active */
    size_t ma_mask;  /* table size - 1; ANDed with a probe index to pick a slot */
    PyDictEntry *ma_table;  /* array of ma_mask + 1 slots */
    /* Returns the hash of a key. */
    size_t(*ma_keyhash)(PyObject *key);
    /* Returns 0 when the two keys are equal. */
    int(*ma_keycmp)(PyObject *key1, PyObject *key2);
    /* Returns the copy of a key that gets stored in the table, or NULL on
     * failure; stored keys are released with free(). */
    PyObject *(*ma_keydup)(PyObject *key);
    /* Returns the copy of a value that gets stored in the table, or NULL on
     * failure; stored values are released with free(). */
    PyObject *(*ma_valuedup)(PyObject *value);
    /* Returns the value given to a key created by dict_force_search(). */
    PyObject *(*ma_default)(void);
};

/* Object whose address is used as the dummy key that fills deleted entries:
 * dict_del() stores `dummy` in me_key, and lookdict() keeps probing past such
 * slots instead of stopping as it does at a never-used (NULL) slot. */
static PyDictEntry _dummy_struct;
#define dummy (&_dummy_struct)

/*
 * Default key hash. The key is read as a NUL-terminated string; starting
 * from 5381, every character c is folded in as hash * 33 + c.
 */
static size_t
keyhash(PyObject *_key)
{
    const char *cursor = (const char *)_key;
    size_t acc = 5381;

    while (*cursor != '\0') {
        acc = acc * 33 + *cursor;
        cursor++;
    }
    return acc;
}

/*
 * Default key comparison. Both keys are read as NUL-terminated strings.
 *
 * Returns 0 when the strings are equal and non-zero otherwise. The sign of a
 * non-zero result follows strcmp(), i.e. bytes are compared as unsigned char,
 * so it no longer depends on whether plain char is signed on the platform.
 */
static int
keycmp(PyObject *_key1, PyObject *_key2)
{
    return strcmp((const char *)_key1, (const char *)_key2);
}

/*
 * Default key copier. The key is read as a NUL-terminated string.
 *
 * Returns a malloc()ed copy that the caller releases with free(), or NULL
 * when memory is exhausted. Written with malloc/memcpy rather than strdup(),
 * which is not part of ISO C before C23.
 */
static PyObject *
keydup(PyObject *key)
{
    size_t size = strlen((const char *)key) + 1;    /* include the NUL */
    char *copy = malloc(size);

    if (copy == NULL)
        return NULL;
    memcpy(copy, key, size);
    return copy;
}

/*
 * Default value copier. The value is read as a single size_t.
 *
 * Returns a malloc()ed copy that the caller releases with free(), or NULL
 * when memory is exhausted (callers already test for NULL).
 */
static PyObject *
valuedup(PyObject *_value)
{
    size_t *copy = malloc(sizeof *copy);

    if (copy == NULL)
        return NULL;
    *copy = *(const size_t *)_value;
    return copy;
}

/*
 * Default-value factory used for keys created by dict_force_search().
 *
 * Returns a malloc()ed size_t initialised to 0 that the caller releases with
 * free(), or NULL when memory is exhausted (callers already test for NULL).
 */
static PyObject *
get_default_value(void)
{
    size_t *value = malloc(sizeof *value);

    if (value == NULL)
        return NULL;
    *value = 0;
    return value;
}

/*
 * Create an empty dictionary.
 *
 * ma_size      requested minimum number of slots; the table size is
 *              PyDict_MINSIZE doubled until it is >= ma_size.
 * ma_keyhash, ma_keycmp, ma_keydup, ma_valuedup, ma_default
 *              callbacks for hashing, comparing and copying keys/values and
 *              for producing default values; passing NULL (0) for any of
 *              them selects the built-in string-key / size_t-value version.
 *
 * Returns the new dictionary, or NULL when ma_size is too large to represent
 * or when memory is exhausted. Nothing is leaked on failure.
 */
PyDictObject *
dict_new_custom(size_t ma_size,
                size_t(*ma_keyhash)(PyObject *key),
                int(*ma_keycmp)(PyObject *key1, PyObject *key2),
                PyObject * (*ma_keydup)(PyObject *key),
                PyObject * (*ma_valuedup)(PyObject *value),
                PyObject * (*ma_default)(void))
{
    PyDictObject *mp;
    size_t newsize;

    for (newsize = PyDict_MINSIZE;
            newsize < ma_size && newsize > 0;
            newsize <<= 1)
        ;
    /* The doubling wrapped around to 0: no table size can satisfy ma_size. */
    if (newsize == 0)
        return NULL;

    mp = malloc(sizeof *mp);
    if (mp == NULL)
        return NULL;
    /* calloc zero-fills the slots and rejects an overflowing total size. */
    mp->ma_table = calloc(newsize, sizeof *mp->ma_table);
    if (mp->ma_table == NULL) {
        free(mp);
        return NULL;
    }
    mp->ma_mask = newsize - 1;
    mp->ma_fill = mp->ma_used = 0;
    mp->ma_keyhash = ma_keyhash ? ma_keyhash : keyhash;
    mp->ma_keycmp = ma_keycmp ? ma_keycmp : keycmp;
    mp->ma_keydup = ma_keydup ? ma_keydup : keydup;
    mp->ma_valuedup = ma_valuedup ? ma_valuedup : valuedup;
    mp->ma_default = ma_default ? ma_default : get_default_value;
    return mp;
}

/*
 * Create an empty dictionary with the minimum table size and every callback
 * left at its built-in default. Returns whatever dict_new_custom() returns.
 */
PyDictObject *
dict_new(void)
{
    PyDictObject *fresh = dict_new_custom(0, NULL, NULL, NULL, NULL, NULL);

    return fresh;
}

/*
 * Internal basic search routine, used by the other functions.
 *
 * Probes the table for `key`, whose hash is `hash`, and returns a pointer to
 * a slot (never NULL):
 *   - the slot that already holds the key, when one is found;
 *   - otherwise the first dummy (deleted) slot met along the probe sequence,
 *     or, when none was met, the never-used slot that ended the search.
 * Callers distinguish the cases through me_value, which is non-NULL only for
 * a slot holding an entry.
 *
 * The probe loop only ends on a match or on a never-used slot
 * (me_key == NULL).
 */
static PyDictEntry *
lookdict(PyDictObject *mp, PyObject *key, size_t hash)
{
    size_t i;
    size_t perturb;
    PyDictEntry *freeslot;  /* first dummy slot seen so far, or NULL */
    size_t mask = mp->ma_mask;
    PyDictEntry *ep0 = mp->ma_table;
    PyDictEntry *ep;
    /* First probe: the slot selected by the low bits of the hash. */
    i = (size_t)hash & mask;
    ep = &ep0[i];
    /* Never-used slot, or the very same key pointer: done. */
    if (ep->me_key == NULL || ep->me_key == key)
        return ep;
    if (ep->me_key == dummy)
        freeslot = ep;
    /* Compare hashes first so the key comparison only runs on a hash match. */
    else if (ep->me_hash == hash
             && mp->ma_keycmp(ep->me_key, key) == 0)
        return ep;
    else
        freeslot = NULL;
    /* Collision: follow the probe sequence; perturb starts as the full hash
     * and is shifted right by PERTURB_SHIFT after every step. */
    for (perturb = hash;; perturb >>= PERTURB_SHIFT) {
        i = (i << 2) + i + perturb + 1;     /* i = 5 * i + perturb + 1 */
        ep = &ep0[i & mask];
        /* Key is absent: prefer reusing a dummy slot seen earlier. */
        if (ep->me_key == NULL)
            return freeslot == NULL ? ep : freeslot;
        /* The dummy test keeps ma_keycmp from being called on the sentinel. */
        if (ep->me_key == key
                || (ep->me_hash == hash
                    && ep->me_key != dummy
                    && mp->ma_keycmp(ep->me_key, key) == 0))
            return ep;
        if (ep->me_key == dummy && freeslot == NULL)
            freeslot = ep;
    }
    assert(0);          /* NOT REACHED */
    return 0;
}

/*
 * Faster lookup for a table that contains no dummy keys.
 *
 * Walks the probe sequence for `hash` and returns the first slot that is
 * never-used (me_key == NULL), holds the same key pointer, or holds a key
 * with the same hash for which ma_keycmp() reports equality.
 */
static PyDictEntry *
lookdict_nodummy(PyDictObject *mp, PyObject *key, size_t hash)
{
    PyDictEntry *const slots = mp->ma_table;
    const size_t mask = mp->ma_mask;
    size_t idx = hash & mask;
    size_t perturb = hash;

    for (;;) {
        PyDictEntry *slot = &slots[idx & mask];

        if (slot->me_key == NULL)
            return slot;
        if (slot->me_key == key)
            return slot;
        if (slot->me_hash == hash
                && mp->ma_keycmp(slot->me_key, key) == 0)
            return slot;

        /* Advance to the next probe position. */
        idx = idx * 5 + perturb + 1;
        perturb >>= PERTURB_SHIFT;
    }
}

/*
 * Internal fast insert for a table that contains no dummy keys.
 *
 * Stores key/hash/value in the first never-used slot on the probe sequence
 * for `hash`, without comparing keys or copying anything, and counts the new
 * entry in both ma_fill and ma_used.
 */
static void
insertdict_clean(PyDictObject *mp, PyObject *key, size_t hash, PyObject *value)
{
    PyDictEntry *const slots = mp->ma_table;
    const size_t mask = mp->ma_mask;
    size_t idx = hash & mask;
    size_t perturb = hash;
    PyDictEntry *slot = &slots[idx];

    while (slot->me_key != NULL) {
        idx = idx * 5 + perturb + 1;
        slot = &slots[idx & mask];
        perturb >>= PERTURB_SHIFT;
    }

    slot->me_hash = hash;
    slot->me_key = key;
    slot->me_value = value;
    mp->ma_used += 1;
    mp->ma_fill += 1;
}

/*
 * Restructure the table by allocating a new table and reinserting all
 * items again.  When entries have been deleted, the new table may
 * actually be smaller than the old one.
 *
 * minused  the new table size is the smallest PyDict_MINSIZE * 2^k that is
 *          strictly greater than this value.
 *
 * Returns 0 on success. Returns -1, leaving the dictionary untouched, when
 * the requested size cannot be represented or memory is exhausted.
 */
static int
dict_resize(PyDictObject *mp, size_t minused)
{
    size_t newsize;
    size_t remaining;
    PyDictEntry *oldtable = mp->ma_table;
    PyDictEntry *newtable;
    PyDictEntry *ep;

    /* Find the smallest table size > minused. */
    for (newsize = PyDict_MINSIZE;
            newsize <= minused && newsize > 0;
            newsize <<= 1)
        ;
    /* The doubling wrapped around to 0: a zero-slot table with an all-ones
     * mask would be indexed out of bounds, so refuse instead. */
    if (newsize == 0)
        return -1;
    /* Get zero-filled space for a new table; calloc also rejects an
     * overflowing total size. */
    newtable = calloc(newsize, sizeof *newtable);
    if (newtable == NULL)
        return -1;
    mp->ma_table = newtable;
    mp->ma_mask = newsize - 1;
    /* insertdict_clean() recounts the entries as it reinserts them. */
    remaining = mp->ma_used;
    mp->ma_used = 0;
    mp->ma_fill = 0;
    for (ep = oldtable; remaining > 0; ep++) {
        /* only active entries; dummies are dropped */
        if (ep->me_value != NULL) {
            remaining--;
            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->me_value);
        }
    }
    free(oldtable);
    return 0;
}

/*
 * Look up `key` (must not be NULL) and return the value pointer stored in
 * the slot that lookdict() selects; that pointer is NULL when the slot holds
 * no entry.
 */
PyObject *
dict_search(PyDictObject *mp, PyObject *key)
{
    assert(key);
    return lookdict(mp, key, mp->ma_keyhash(key))->me_value;
}

/* Return 1 when dict_search() finds a value for `key`, 0 otherwise. */
int
dict_contain(PyDictObject *mp, PyObject *key)
{
    return dict_search(mp, key) != NULL;
}

/*
 * Add a key that is not yet in the dictionary.
 *
 * key, value  caller-owned buffers (must not be NULL); the dictionary stores
 *             its own copies made with ma_keydup / ma_valuedup.
 *
 * Precondition (asserted): the key is not already present.
 *
 * Returns 0 on success. Returns -1 when copying the key or the value fails,
 * in which case the dictionary is left exactly as it was. Also returns -1
 * when the entry was stored but the follow-up dict_resize() failed.
 */
int
dict_add(PyDictObject *mp, PyObject *key, PyObject *value)
{
    assert(key);
    assert(value);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);
    /*only for non-existing keys*/
    assert(ep->me_value == NULL);

    /* Build both copies before touching the slot: overwriting me_key early
     * would erase a dummy marker (cutting probe chains) on failure, or leave
     * a freed key pointer behind. */
    PyObject *new_key = mp->ma_keydup(key);
    if (new_key == NULL)
        return -1;
    PyObject *new_value = mp->ma_valuedup(value);
    if (new_value == NULL) {
        free(new_key);
        return -1;
    }

    /* A reused dummy slot is already counted in ma_fill. */
    if (ep->me_key == NULL)
        mp->ma_fill++;
    mp->ma_used++;
    ep->me_key = new_key;
    ep->me_value = new_value;
    ep->me_hash = hash;
    if (NEED_RESIZE(mp))
        return dict_resize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
    return 0;
}

/*
 * Replace the value of a key that is already in the dictionary.
 *
 * key, value  caller-owned buffers (must not be NULL); the dictionary stores
 *             its own copy of the value made with ma_valuedup.
 *
 * Precondition (asserted): the key is present.
 *
 * Returns 0 on success. Returns -1 when copying the value fails, in which
 * case the old value stays in place.
 */
int
dict_update(PyDictObject *mp, PyObject *key, PyObject *value)
{
    assert(key);
    assert(value);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);
    /*only for existing keys*/
    assert(ep->me_value != NULL);

    /* Copy first and swap only on success, so a failed copy neither leaks
     * the old value nor turns the active entry into one with a NULL value. */
    PyObject *new_value = mp->ma_valuedup(value);
    if (new_value == NULL)
        return -1;
    free(ep->me_value);
    ep->me_value = new_value;
    return 0;
}

/*
 * Remove a key that is in the dictionary (asserted) and free the stored key
 * and value. The slot is marked with the dummy key and its value is set to
 * NULL; only ma_used is decremented. Always returns 0.
 */
int
dict_del(PyDictObject *mp, PyObject *key)
{
    assert(key);
    PyDictEntry *victim = lookdict(mp, key, mp->ma_keyhash(key));
    /* deleting is only defined for keys that exist */
    assert(victim->me_value != NULL);

    PyObject *stale_key = victim->me_key;
    PyObject *stale_value = victim->me_value;

    victim->me_key = dummy;
    victim->me_value = NULL;
    mp->ma_used -= 1;

    free(stale_key);
    free(stale_value);
    return 0;
}

/*
 * Look up `key` (must not be NULL), creating it first when it is missing.
 *
 * A missing key is stored as a copy made with ma_keydup, paired with a fresh
 * value from ma_default.
 *
 * Returns the stored value pointer, through which the caller may modify the
 * value in place. Returns NULL when the key had to be created and copying
 * the key or producing the default value failed; the dictionary is then left
 * exactly as it was.
 */
PyObject *
dict_force_search(PyDictObject *mp, PyObject *key)
{
    assert(key);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);
    if (ep->me_value == NULL) {
        /* Build key and value before touching the slot: overwriting me_key
         * early would erase a dummy marker (cutting probe chains) on
         * failure, or leave a freed key pointer behind. */
        PyObject *new_key = mp->ma_keydup(key);
        if (new_key == NULL)
            return NULL;
        PyObject *new_value = mp->ma_default();
        if (new_value == NULL) {
            free(new_key);
            return NULL;
        }
        /* A reused dummy slot is already counted in ma_fill. */
        if (ep->me_key == NULL)
            mp->ma_fill++;
        mp->ma_used++;
        ep->me_key = new_key;
        ep->me_value = new_value;
        ep->me_hash = hash;
        /* Re-locate the entry only after a successful resize: a failed
         * resize leaves the table, and therefore ep, untouched, and that
         * table may still hold dummies that lookdict_nodummy() cannot
         * handle. */
        if (NEED_RESIZE(mp)
                && dict_resize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used) == 0)
            ep = lookdict_nodummy(mp, key, hash);
    }
    return ep->me_value;
}

/*
 * Empty the dictionary: free every stored key and value, zero all slots and
 * reset the entry counters. The table keeps its current size and is not
 * freed; the dictionary can be used again afterwards.
 */
void
dict_clear(PyDictObject *mp)
{
    PyDictEntry *table = mp->ma_table;
    assert(table != NULL);
    size_t used = mp->ma_used;
    /* Nothing active and nothing deleted: the table is already all zeros. */
    if (mp->ma_fill == 0)
        return;
    PyDictEntry *ep;
    for (ep = table; used > 0; ep++) {
        /*only free active entry, this is different from Python 2.7*/
        if (ep->me_value != NULL) {
            used--;
            free(ep->me_key);
            free(ep->me_value);
        }
    }
    memset(table, 0, sizeof(PyDictEntry) * (mp->ma_mask + 1));
    /* Without this, later scans that count down from ma_used would look for
     * entries that no longer exist and run off the end of the table. */
    mp->ma_used = 0;
    mp->ma_fill = 0;
}

/* Return the number of active entries (ma_used). */
size_t
dict_len(PyDictObject *mp)
{
    const size_t active = mp->ma_used;

    return active;
}

/*helper function for sorting a PyDictEntry by its value*/
static int
_valcmp(const void *a, const void *b)
{
    return *(size_t *)(*(PyDictEntry *)a).me_value > *(size_t *)(*
            (PyDictEntry *)b).me_value ? -1 : 1;
}

/*
 * Print every key/value pair to stdout as "key<TAB>value", in descending
 * order of value. Keys are printed as strings and values as size_t.
 *
 * Works on a temporary copy of the active slots, so the dictionary itself is
 * not modified. Prints nothing when the dictionary is empty; reports to
 * stderr and prints nothing when the temporary copy cannot be allocated.
 */
static void
print_all_by_value_desc(PyDictObject *mp)
{
    PyDictEntry *ep;
    PyDictEntry *temp_table;
    size_t i = 0, used = mp->ma_used;

    /* Avoid a zero-size allocation, whose result may legitimately be NULL. */
    if (used == 0)
        return;
    temp_table = malloc(sizeof *temp_table * used);
    if (temp_table == NULL) {
        fputs("print_all_by_value_desc: out of memory\n", stderr);
        return;
    }
    /* Collect the active entries; stop once all of them have been seen. */
    for (ep = mp->ma_table; used > 0; ep++) {
        if (ep->me_value != NULL) {
            used--;
            temp_table[i++] = *ep;
        }
    }
    used = mp->ma_used;
    qsort(temp_table, used, sizeof(temp_table[0]), _valcmp);
    /* %zu is the conversion that matches size_t. */
    for (i = 0; i < used; i++)
        fprintf(stdout, "%s\t%zu\n", (char *)temp_table[i].me_key,
                *(size_t *)temp_table[i].me_value);
    free(temp_table);
}

/*
 * Debug dump in table order: prints "key<TAB>value<TAB>hash" for each active
 * entry and a notice for each dummy slot met before the last active entry.
 * Keys are printed as strings and values as size_t.
 */
void printd(PyDictObject *mp)
{
    PyDictEntry *ep;
    size_t used = mp->ma_used;
    for (ep = mp->ma_table; used > 0; ep++) {
        if (ep->me_value) {
            used--;
            /* %zu is the conversion that matches size_t. */
            fprintf(stdout, "%s\t%zu\t%zu\n", (char *)ep->me_key,
                    *(size_t *)ep->me_value, ep->me_hash);
        } else if (ep->me_key == dummy) {
            fprintf(stdout, "it is a dummy key! it's hash is %zu\n", ep->me_hash);
        }
    }
}

/*scan words from stdin, print total amount for each word by DESC order*/
int main(void)
{
    //PyDictObject *mp = dict_new_custom(32, 0, 0, 0, 0, 0);
    PyDictObject *mp = dict_new();
    FILE *fp;
    fp = fopen("words", "r");
    char keybuf[100];
    size_t valuebuf[] = { 1 };
    size_t *vp;
    /*    while (fscanf(stdin, "%s", keybuf) == 1) {
    if (dict_contain(mp, keybuf)) {
    vp = dict_search(mp, keybuf);
    *vp += 1;
    } else
    dict_add(mp, keybuf, valuebuf);
    }*/
    while (fscanf(fp, "%s", keybuf) == 1) {
        vp = dict_force_search(mp, keybuf);
        *vp += 1;
    }

    print_all_by_value_desc(mp);
    //printd(mp);
    dict_clear(mp);
    fclose(fp);
    free(mp);
    return 0;
}
posted @ 2014-07-22 01:55  LisPythoniC  阅读(1931)  评论(0编辑  收藏  举报