Python 2.7的字典实现简化版(C语言)
这是一个能自动调整大小的哈希字典,外部接口实现了下列功能.
1.字典级别:
创建字典 dict_new
归零字典 dict_clear
2.键值级别:
查找 dict_search
强制查找 dict_force_search
更新 dict_update
添加 dict_add
删除 dict_del
所谓强制查找就是假如key不存在,那么它将先在字典中添加这个key,值设置为默认值,再返回这个值的指针.
由于键和值都是以void指针(无类型指针)定义的,所以在处理一些简单的值类型时(如int),显得繁琐了些(比如_valcmp),但好处是更加灵活了,比如稍作修改(valuedup和get_default_value)就可以处理值为字符串的情况.
C确实很快,但繁重的内存管理果然名不虚传.这个简单的字典要求:
1.键(me_key)和值(me_value)的指针所指向的堆内存区域能够直接用free释放,如果这些区域还包含另一些堆指针,那么可能会出问题.
2.只需传递缓冲数据(main中的keybuf和valuebuf)给键值函数,函数内部会根据情况申请或释放内存,或不做任何处理.
为方便处理,words文本格式要求每行一个词语.
/*
 * Pure C simplified version of the Python 2.7.8 hash table (dict).
 * Sample usage: see main().
 *
 * The table uses open addressing.  Keys and values are opaque heap pointers
 * owned by the dictionary: they are produced by the dictionary's dup/default
 * callbacks and released with free().
 *
 * Define PYDICT_NO_MAIN to compile the dictionary without the demo main()
 * (for example when this file is pulled into a test harness).
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#define PyDict_MINSIZE 8
#define PERTURB_SHIFT 5
/* True once the table is at least two-thirds full (active + dummy slots). */
#define NEED_RESIZE(mp) ((mp)->ma_fill * 3 >= ((mp)->ma_mask + 1) * 2)

typedef void PyObject;

/* One slot of the table. */
typedef struct {
    size_t me_hash;      /* cached hash of me_key */
    PyObject *me_key;    /* NULL: never used; dummy: deleted; else an owned key */
    PyObject *me_value;  /* non-NULL only for active entries */
} PyDictEntry;

typedef struct _dictobject PyDictObject;
struct _dictobject {
    size_t ma_fill;  /* # Active + # Dummy */
    size_t ma_used;  /* # Active */
    size_t ma_mask;  /* table size - 1; the table size is a power of two */
    PyDictEntry *ma_table;
    size_t(*ma_keyhash)(PyObject *key);
    int(*ma_keycmp)(PyObject *key1, PyObject *key2);
    PyObject *(*ma_keydup)(PyObject *key);
    PyObject *(*ma_valuedup)(PyObject *value);
    PyObject *(*ma_default)(void);
};

/* Object used as dummy key to fill deleted entries */
static PyDictEntry _dummy_struct;
#define dummy (&_dummy_struct)

/* Default key hash: treats the key as a NUL-terminated string (hash * 33 + c). */
static size_t
keyhash(PyObject *_key)
{
    const char *key = (const char *)_key;
    size_t hash = 5381;

    for (; *key; key++)
        hash = ((hash << 5) + hash) + *key; /* hash * 33 + c */
    return hash;
}

/* Default key comparison: 0 when both NUL-terminated strings are equal. */
static int
keycmp(PyObject *_key1, PyObject *_key2)
{
    return strcmp((const char *)_key1, (const char *)_key2);
}

/*
 * Default key duplication: heap copy of a NUL-terminated string, or NULL when
 * out of memory.  Written with malloc/memcpy because strdup is not part of
 * ISO C11.
 */
static PyObject *
keydup(PyObject *key)
{
    size_t len = strlen((const char *)key) + 1;
    char *copy = malloc(len);

    if (copy != NULL)
        memcpy(copy, key, len);
    return copy;
}

/* Default value duplication: heap copy of a size_t, or NULL when out of memory. */
static PyObject *
valuedup(PyObject *_value)
{
    size_t *value = malloc(sizeof *value);

    if (value != NULL)
        *value = *(size_t *)_value;
    return value;
}

/* Default value for dict_force_search: a heap size_t set to 0, or NULL on OOM. */
static PyObject *
get_default_value(void)
{
    size_t *value = malloc(sizeof *value);

    if (value != NULL)
        *value = 0;
    return value;
}

/*
 * Create a dictionary whose table has at least ma_size slots (rounded up to a
 * power of two, minimum PyDict_MINSIZE).  Any callback passed as NULL/0 is
 * replaced by the string-key / size_t-value default defined above.
 * Returns NULL when the size overflows or memory cannot be allocated.
 */
PyDictObject *
dict_new_custom(size_t ma_size,
    size_t(*ma_keyhash)(PyObject *key),
    int(*ma_keycmp)(PyObject *key1, PyObject *key2),
    PyObject * (*ma_keydup)(PyObject *key),
    PyObject * (*ma_valuedup)(PyObject *value),
    PyObject * (*ma_default)(void))
{
    size_t newsize;

    for (newsize = PyDict_MINSIZE;
         newsize < ma_size && newsize > 0;
         newsize <<= 1)
        ;
    if (newsize == 0)   /* the shift overflowed */
        return NULL;

    PyDictObject *mp = malloc(sizeof *mp);
    if (mp == NULL)
        return NULL;

    /* Zeroed slots mean "never used" (me_key == NULL). */
    PyDictEntry *newtable = calloc(newsize, sizeof *newtable);
    if (newtable == NULL) {
        free(mp);       /* do not leak the header when the table fails */
        return NULL;
    }

    mp->ma_table = newtable;
    mp->ma_mask = newsize - 1;
    mp->ma_fill = mp->ma_used = 0;
    mp->ma_keyhash = ma_keyhash ? ma_keyhash : keyhash;
    mp->ma_keycmp = ma_keycmp ? ma_keycmp : keycmp;
    mp->ma_keydup = ma_keydup ? ma_keydup : keydup;
    mp->ma_valuedup = ma_valuedup ? ma_valuedup : valuedup;
    mp->ma_default = ma_default ? ma_default : get_default_value;
    return mp;
}

/* Create a dictionary with the minimum table size and all default callbacks. */
PyDictObject *
dict_new(void)
{
    return dict_new_custom(0, 0, 0, 0, 0, 0);
}

/*
 * Internal basic search, used by the other functions.
 * Returns the entry holding key when it is present.  Otherwise returns the
 * first dummy slot met along the probe sequence, or, if none was met, the
 * never-used slot that ended the probe.  The loop only stops at a matching or
 * never-used slot, so the table must always contain at least one of those.
 */
static PyDictEntry *
lookdict(PyDictObject *mp, PyObject *key, size_t hash)
{
    size_t i;
    size_t perturb;
    PyDictEntry *freeslot;
    size_t mask = mp->ma_mask;
    PyDictEntry *ep0 = mp->ma_table;
    PyDictEntry *ep;

    i = hash & mask;
    ep = &ep0[i];
    if (ep->me_key == NULL || ep->me_key == key)
        return ep;

    if (ep->me_key == dummy)
        freeslot = ep;
    else if (ep->me_hash == hash && mp->ma_keycmp(ep->me_key, key) == 0)
        return ep;
    else
        freeslot = NULL;

    for (perturb = hash;; perturb >>= PERTURB_SHIFT) {
        i = (i << 2) + i + perturb + 1;
        ep = &ep0[i & mask];
        if (ep->me_key == NULL)
            return freeslot == NULL ? ep : freeslot;
        /* Never hand the dummy marker to the key comparison callback. */
        if (ep->me_key == key
            || (ep->me_hash == hash
                && ep->me_key != dummy
                && mp->ma_keycmp(ep->me_key, key) == 0))
            return ep;
        if (ep->me_key == dummy && freeslot == NULL)
            freeslot = ep;
    }
}

/*
 * Faster search for tables that contain no dummy key.  It does not test for
 * the dummy marker, so it must only be called on such tables (for instance
 * right after a successful dict_resize).
 */
static PyDictEntry *
lookdict_nodummy(PyDictObject *mp, PyObject *key, size_t hash)
{
    size_t i;
    size_t perturb;
    size_t mask = mp->ma_mask;
    PyDictEntry *ep0 = mp->ma_table;
    PyDictEntry *ep;

    i = hash & mask;
    ep = &ep0[i];
    if (ep->me_key == NULL
        || ep->me_key == key
        || (ep->me_hash == hash && mp->ma_keycmp(ep->me_key, key) == 0))
        return ep;

    for (perturb = hash;; perturb >>= PERTURB_SHIFT) {
        i = (i << 2) + i + perturb + 1;
        ep = &ep0[i & mask];
        if (ep->me_key == NULL
            || ep->me_key == key
            || (ep->me_hash == hash && mp->ma_keycmp(ep->me_key, key) == 0))
            return ep;
    }
}

/*
 * Internal fast insert for a table without dummy keys and for a key that is
 * not in the table yet: walks the probe sequence to the first never-used slot
 * and stores key/hash/value there.  Takes over the pointers as they are; no
 * duplication happens here.
 */
static void
insertdict_clean(PyDictObject *mp, PyObject *key, size_t hash, PyObject *value)
{
    size_t i;
    size_t perturb;
    size_t mask = mp->ma_mask;
    PyDictEntry *ep0 = mp->ma_table;
    PyDictEntry *ep;

    i = hash & mask;
    ep = &ep0[i];
    for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
        i = (i << 2) + i + perturb + 1;
        ep = &ep0[i & mask];
    }
    mp->ma_fill++;
    mp->ma_used++;
    ep->me_key = key;
    ep->me_hash = hash;
    ep->me_value = value;
}

/*
 * Restructure the table by allocating a new table and reinserting all items
 * again.  When entries have been deleted, the new table may actually be
 * smaller than the old one.
 * Returns 0 on success; -1 (dictionary left untouched) when the size
 * overflows or the new table cannot be allocated.
 */
static int
dict_resize(PyDictObject *mp, size_t minused)
{
    size_t newsize;
    PyDictEntry *oldtable, *newtable, *ep;

    oldtable = mp->ma_table;

    /* Find the smallest table size > minused. */
    for (newsize = PyDict_MINSIZE;
         newsize <= minused && newsize > 0;
         newsize <<= 1)
        ;
    if (newsize == 0)   /* the shift overflowed */
        return -1;

    /* Get space for a new, zeroed table. */
    newtable = calloc(newsize, sizeof *newtable);
    if (newtable == NULL)
        return -1;

    mp->ma_table = newtable;
    mp->ma_mask = newsize - 1;

    size_t used = mp->ma_used;
    mp->ma_used = 0;
    mp->ma_fill = 0;

    /* Stop as soon as every active entry has been moved. */
    for (ep = oldtable; used > 0; ep++) {
        /* only active entry */
        if (ep->me_value != NULL) {
            used--;
            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->me_value);
        }
    }

    free(oldtable);
    return 0;
}

/* Minimum size requested from dict_resize when the table has to grow. */
static size_t
dict_growth_target(const PyDictObject *mp)
{
    return (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used;
}

/*
 * Look up key.  Returns the stored value pointer (still owned by the
 * dictionary), or NULL when the key is absent.
 */
PyObject *
dict_search(PyDictObject *mp, PyObject *key)
{
    assert(key);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);
    return ep->me_value;
}

/* Returns 1 when key is present, 0 otherwise. */
int
dict_contain(PyDictObject *mp, PyObject *key)
{
    return dict_search(mp, key) ? 1 : 0;
}

/*
 * Add a key that is NOT yet in the dictionary (asserted).  Key and value are
 * copied with the dictionary's dup callbacks.
 * Returns 0 on success, -1 when a copy fails (dictionary unchanged) or when
 * the follow-up resize fails (the pair has been stored nevertheless).
 */
int
dict_add(PyDictObject *mp, PyObject *key, PyObject *value)
{
    assert(key);
    assert(value);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);

    /*only for non-existing keys*/
    assert(ep->me_value == NULL);

    /* Build both copies before touching the slot so that a failure cannot
       destroy a dummy marker or leave a freed key behind. */
    PyObject *newkey = mp->ma_keydup(key);
    if (newkey == NULL)
        return -1;
    PyObject *newvalue = mp->ma_valuedup(value);
    if (newvalue == NULL) {
        free(newkey);
        return -1;
    }

    if (ep->me_key == NULL)     /* reusing a dummy slot does not change fill */
        mp->ma_fill++;
    mp->ma_used++;
    ep->me_key = newkey;
    ep->me_value = newvalue;
    ep->me_hash = hash;

    if (NEED_RESIZE(mp))
        return dict_resize(mp, dict_growth_target(mp));
    return 0;
}

/*
 * Replace the value of a key that IS in the dictionary (asserted).
 * Returns 0 on success, -1 when the value cannot be copied; in that case the
 * old value stays in place.
 */
int
dict_update(PyDictObject *mp, PyObject *key, PyObject *value)
{
    assert(key);
    assert(value);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);

    /*only for existing keys*/
    assert(ep->me_value != NULL);

    PyObject *newvalue = mp->ma_valuedup(value);
    if (newvalue == NULL)
        return -1;
    free(ep->me_value);
    ep->me_value = newvalue;
    return 0;
}

/*
 * Delete a key that IS in the dictionary (asserted).  The slot becomes a
 * dummy so that probe sequences running through it stay intact; ma_fill is
 * therefore not decremented.  Always returns 0.
 */
int
dict_del(PyDictObject *mp, PyObject *key)
{
    assert(key);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);

    /*only for existing keys*/
    assert(ep->me_value != NULL);

    free(ep->me_key);
    free(ep->me_value);
    ep->me_key = dummy;
    ep->me_value = NULL;
    mp->ma_used--;
    return 0;
}

/*
 * Look up key; when it is absent, first insert it with the value produced by
 * the dictionary's default callback.  Returns the stored value pointer, or
 * NULL when the key or the default value cannot be allocated (dictionary
 * unchanged in that case).
 */
PyObject *
dict_force_search(PyDictObject *mp, PyObject *key)
{
    assert(key);
    size_t hash = mp->ma_keyhash(key);
    PyDictEntry *ep = lookdict(mp, key, hash);

    if (ep->me_value == NULL) {
        /* Allocate everything before modifying the slot (see dict_add). */
        PyObject *newkey = mp->ma_keydup(key);
        if (newkey == NULL)
            return NULL;
        PyObject *newvalue = mp->ma_default();
        if (newvalue == NULL) {
            free(newkey);
            return NULL;
        }

        if (ep->me_key == NULL)
            mp->ma_fill++;
        mp->ma_used++;
        ep->me_key = newkey;
        ep->me_value = newvalue;
        ep->me_hash = hash;

        /* A successful resize moves the entry into a dummy-free table, so
           find it again there.  After a failed resize the old table, and
           with it ep, is still valid and may still contain dummies. */
        if (NEED_RESIZE(mp) && dict_resize(mp, dict_growth_target(mp)) == 0)
            ep = lookdict_nodummy(mp, key, hash);
    }
    return ep->me_value;
}

/*
 * Remove every entry.  Keys and values of active entries are freed, all slots
 * are reset to "never used" and the counters go back to zero.  The table
 * keeps its current size.
 */
void
dict_clear(PyDictObject *mp)
{
    PyDictEntry *table = mp->ma_table;
    assert(table != NULL);

    if (mp->ma_fill == 0)
        return;

    size_t used = mp->ma_used;
    PyDictEntry *ep;
    for (ep = table; used > 0; ep++) {
        /*only free active entry, this is different from Python 2.7*/
        if (ep->me_value != NULL) {
            used--;
            free(ep->me_key);
            free(ep->me_value);
        }
    }
    memset(table, 0, sizeof(PyDictEntry) * (mp->ma_mask + 1));
    /* Without this reset dict_len() and every "used > 0" scan would keep
       working with the stale counts of the emptied table. */
    mp->ma_fill = 0;
    mp->ma_used = 0;
}

/*
 * Destroy a dictionary: frees all entries, the table and the dictionary
 * object itself.  Passing NULL is allowed and does nothing.
 */
void
dict_free(PyDictObject *mp)
{
    if (mp == NULL)
        return;
    dict_clear(mp);
    free(mp->ma_table);
    free(mp);
}

/* Number of active entries. */
size_t
dict_len(PyDictObject *mp)
{
    return mp->ma_used;
}

/*
 * qsort helper ordering PyDictEntry items by their size_t value, largest
 * first.  Returns 0 for equal values so that the ordering is consistent.
 */
static int
_valcmp(const void *a, const void *b)
{
    size_t va = *(const size_t *)((const PyDictEntry *)a)->me_value;
    size_t vb = *(const size_t *)((const PyDictEntry *)b)->me_value;

    if (va > vb)
        return -1;
    if (va < vb)
        return 1;
    return 0;
}

/*
 * Print "key<TAB>value" lines to stdout in descending value order.  Keys are
 * printed as strings and values as size_t, i.e. this matches the default
 * callbacks.  Prints nothing when the dictionary is empty or when the
 * temporary array cannot be allocated.
 */
static void
print_all_by_value_desc(PyDictObject *mp)
{
    size_t used = mp->ma_used;
    if (used == 0)
        return;

    PyDictEntry *temp_table = malloc(sizeof *temp_table * used);
    if (temp_table == NULL)
        return;

    /* Copy the active entries out of the table. */
    PyDictEntry *ep;
    size_t i = 0;
    size_t remaining = used;
    for (ep = mp->ma_table; remaining > 0; ep++) {
        if (ep->me_value != NULL) {
            remaining--;
            temp_table[i++] = *ep;
        }
    }

    qsort(temp_table, used, sizeof(temp_table[0]), _valcmp);
    for (i = 0; i < used; i++)
        fprintf(stdout, "%s\t%zu\n", (char *)temp_table[i].me_key,
            *(size_t *)temp_table[i].me_value);
    free(temp_table);
}

/*
 * Debug dump in table order: "key<TAB>value<TAB>hash" for active entries and
 * a notice for each dummy slot met before the last active entry.
 */
void
printd(PyDictObject *mp)
{
    PyDictEntry *ep;
    size_t used = mp->ma_used;

    for (ep = mp->ma_table; used > 0; ep++) {
        if (ep->me_value) {
            used--;
            fprintf(stdout, "%s\t%zu\t%zu\n", (char *)ep->me_key,
                *(size_t *)ep->me_value, ep->me_hash);
        }
        else if (ep->me_key == dummy) {
            fprintf(stdout, "it is a dummy key! it's hash is %zu\n", ep->me_hash);
        }
    }
}

#ifndef PYDICT_NO_MAIN
/*
 * Count the whitespace-separated words of the file "words" and print the
 * totals in descending order.
 *
 * The same count can be written with the explicit API: when
 * dict_contain(mp, word) is true, increment *dict_search(mp, word),
 * otherwise dict_add(mp, word, &one) with a size_t one = 1.
 */
int
main(void)
{
    PyDictObject *mp = dict_new();
    if (mp == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    FILE *fp = fopen("words", "r");
    if (fp == NULL) {
        perror("words");
        dict_free(mp);
        return EXIT_FAILURE;
    }

    char keybuf[100];
    int status = EXIT_SUCCESS;

    /* %99s leaves room for the terminating NUL in keybuf. */
    while (fscanf(fp, "%99s", keybuf) == 1) {
        size_t *vp = dict_force_search(mp, keybuf);
        if (vp == NULL) {
            fprintf(stderr, "out of memory\n");
            status = EXIT_FAILURE;
            break;
        }
        *vp += 1;
    }

    if (status == EXIT_SUCCESS)
        print_all_by_value_desc(mp);

    fclose(fp);
    dict_free(mp);
    return status;
}
#endif /* PYDICT_NO_MAIN */