LD-sketch源码阅读
util.h
IP转换函数,将二进制的IP地址转化为点分十进制的形式。
/**
 * Convert a 32-bit IPv4 address to dotted-decimal text.
 *
 * The least-significant byte of `ip` is printed first, which yields the
 * usual left-to-right octet order for a network-order address loaded on a
 * little-endian host.
 *
 * @param ip   IPv4 address (in network order)
 * @param addr caller-supplied output buffer; the longest possible result is
 *             "255.255.255.255", so it must hold at least 16 bytes
 * @return addr, now holding the NUL-terminated string
 */
static inline char* ip2a(uint32_t ip, char* addr) {
    /* The octets are unsigned values, so the conversion must be %u (not %d);
     * the casts make the argument type match the specifier exactly. */
    sprintf(addr, "%u.%u.%u.%u",
            (unsigned int)(ip & 0xff),
            (unsigned int)((ip >> 8) & 0xff),
            (unsigned int)((ip >> 16) & 0xff),
            (unsigned int)((ip >> 24) & 0xff));
    return addr;
}
hash.hpp/cpp
mangle函数
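注意:这里的 mangle 函数与 C++ 编译器的名称修饰(name mangling,即不同编译器或不同编译器版本编译后的符号名称可能不同、从而影响二进制兼容的那个概念)无关。它是对 key 本身做的一次变换:把 key 的字节按大端顺序拼成一个整数,乘以常数 2083697005 并只保留低 32 位,再按小端顺序逐字节写回 ret_key。
(或是使用 seq-hash,测量 heavy hitter / heavy changer,提升了内存使用率,减少了计算开销,最关键的是可以恢复流的信息:
把一个 key 值按照位分为几个段,分别 hash 进一个二维表里面,最后再把这些二维表的值合并出这个 key)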
/**
 * Scramble a key by multiplying it with a fixed constant.
 *
 * The key bytes are packed big-endian (key[0] is the most significant byte)
 * into an integer, multiplied by 2083697005, truncated to the low 32 bits
 * and written back little-endian into ret_key.
 *
 * Because of the 32-bit mask, only the last four key bytes influence the
 * result and ret_key[4..nbytes-1] are always zero.
 *
 * @param key     input key, nbytes long
 * @param ret_key output buffer, nbytes long
 * @param nbytes  key length in bytes; must not exceed 8, the width of the
 *                64-bit accumulator
 */
void mangle(const unsigned char* key, unsigned char* ret_key,
            int nbytes) {
    unsigned long long new_key = 0;
    int i;

    for (i = 0; i < nbytes; ++i) {
        /* Widen before shifting: a bare unsigned char promotes to int, and
         * shifting an int by 32 or more bits (or into its sign bit) is
         * undefined behaviour. */
        new_key |= (unsigned long long)key[nbytes - i - 1] << (i * 8);
    }

    new_key = (new_key * 2083697005) & (0xffffffff);

    for (i = 0; i < nbytes; ++i) {
        ret_key[i] = (unsigned char)((new_key >> (i * 8)) & 0xff);
    }
}
GenHashSeed函数
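首次调用时用 rand() 生成一个全局的基础种子 seed(之后不再改变);每次调用把 seed + index 当作 8 字节数据交给 AwareHash,并把得到的哈希值作为与 index 对应的种子返回。注意:mangle 的输出 x 在这里并没有被使用。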
uint64_t seed = 0;
uint64_t GenHashSeed(int index) {
/*
if (index == 0) {
srand(0);
}
*/
if (seed == 0) {
seed = rand();
}
uint64_t x, y = seed + index;
mangle((const unsigned char*)&y, (unsigned char*)&x, 8);
return AwareHash((uint8_t*)&y, 8, 388650253, 388650319, 1176845762);
}
AwareHash模块
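接受参数:原数据、原数据长度。哈希值的初始值为 388650253;对数据中的每个字节,先把哈希值乘以 scale(388650319),再加上该字节,然后指针向后移动一个字节,数据有多长就循环几次。数据中心的流量是非常巨大的,要对如此巨大的流量产生随机分布的哈希值,中间结果也会非常巨大——这里依赖的正是 unsigned int 的溢出回绕(对 2^N 取模,N 为 unsigned int 的位数),这在 C/C++ 中是有定义的行为,并不是错误,所以在自己的虚拟机上把数值调小了几倍还是会“溢出”。最后把结果与 hardener(1176845762)做异或处理后返回。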
/**
 * Multiply-and-add hash over a byte string.
 *
 * Starting from 388650253, every input byte is folded in as
 * acc = acc * 388650319 + byte (unsigned arithmetic, wrapping on overflow);
 * the final accumulator is XOR-ed with 1176845762.
 *
 * @param data the binary to be hashed
 * @param n the length of binary to be hashed
 * @return the hash value
 */
static unsigned int AwareHash(const unsigned char* data, unsigned int n) {
    const unsigned int scale = 388650319;
    const unsigned int hardener = 1176845762;
    unsigned int acc = 388650253;

    for (unsigned int pos = 0; pos < n; ++pos) {
        acc = acc * scale + data[pos];
    }

    return acc ^ hardener;
}
LDSketch.hpp/cpp
LDSketch更新函数,对一个sketch插入键值对。
可在前部分使用 mangle 函数 encode key 值(该调用目前被注释掉了)。对每一行 j,利用 LDSketch_find 函数找到 key 对应的列 k,再通过 dyn_tbl_update 把键值对更新到桶 tbl[j*w+k] 内。
/*
 * Sequential hashing algorithm - encode keys (a.k.a. update step)
 *
 * For every row of the sketch, locate the column the key hashes to and
 * feed (key, val) to the bucket at tbl[row * w + column].
 * Key mangling before insertion is currently disabled.
 */
void LDSketch_update(LDSketch_t* sk, unsigned char* key, long long val) {
    int row;

    for (row = 0; row < sk->h; ++row) {
        /* column of this key within the current row */
        int col = LDSketch_find(sk, key, 0, sk->lgn - 1, row);

        dyn_tbl_update(sk->tbl[row * sk->w + col], key, val);
    }
}
LD_Sketch的find函数
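计算 key 在 sketch 某一行中对应的桶下标。先把 key 的副本 key_str 初始化为 0,再把 key 的第 start_bit 到 end_bit 位复制到副本当中:剩余位数不少于 8 时按整字节复制,不足 8 位时逐位复制(高位在前)。操作数 oper 等于行数 h 乘以 sketch 的编号 tbl_id 再加上传入的行号 row_no,并把 oper 写入副本的第 lgn/8 字节处。最后对副本的前 lgn/8 + sizeof(unsigned int) 个字节计算 AwareHash,再对列数 w 取模,返回这个 key 所在的桶下标。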
/**
 * Compute the bucket index of a key for one row of the sketch.
 *
 * Bits start_bit..end_bit of the key are copied into a zeroed scratch
 * buffer, a per-row operator word is stored at byte offset lgn/8, and the
 * first lgn/8 + sizeof(unsigned int) bytes are hashed with AwareHash and
 * reduced modulo tbl->w.
 *
 * @param tbl       sketch providing h, tbl_id, lgn and w
 * @param key       key bytes; read up to byte index end_bit/8
 * @param start_bit first bit to copy (bit 0 is the MSB of key[0])
 * @param end_bit   last bit to copy, inclusive
 * @param row_no    row index mixed into the hash input
 * @return bucket index in [0, tbl->w)
 */
unsigned int LDSketch_find(LDSketch_t* tbl, const unsigned char* key, int start_bit,
int end_bit, int row_no) {
// scratch hash input: copied key bits followed by the operator word
unsigned char key_str[50]; // assume n/8 + 4 <= 50
unsigned int oper;
char bit;
unsigned int ret_bucket;
int i;
// set the key string
memset(key_str, 0, sizeof(key_str));
i = start_bit; // start_bit == 0 in all cases
while (i <= end_bit) {
// at least 8 bits remain: copy the whole byte that contains bit i
if (end_bit - i + 1 >= 8) {
key_str[i/8] = key[i/8];
i += 8;
} else {
// fewer than 8 bits remain: copy one bit at a time, MSB first
// (the mask selects bit 7 - i%8 of the byte)
bit = (key[i/8] & (1 << (8 - ((i%8) + 1)))) > 0 ? 1 : 0;
key_str[i/8] |= (bit << (8 - ((i%8) + 1)));
i++;
}
}
// set the operator and add it to key string
//oper = part_no * tbl->[part_no] + array_no;
oper = tbl->h * tbl->tbl_id + row_no;
//oper = row_no;
// The operator is stored in host byte order at byte offset lgn/8.
// NOTE(review): if lgn is not a multiple of 8 and end_bit == lgn - 1, this
// overwrites the partially filled byte written by the bit loop - confirm
// lgn is always byte-aligned.
memcpy(key_str + tbl->lgn/8, &oper, sizeof(unsigned int));
/*
// hash
MD5_CTX md5;
unsigned char digest[16];
MD5_Init(&md5);
MD5_Update(&md5, key_str, tbl->n/8 + sizeof(unsigned int));
MD5_Final(digest, &md5);
memcpy(&ret_bucket, digest, sizeof(unsigned int)); // take 1st 4 bytes
ret_bucket = (ret_bucket % tbl->K);
*/
// hash the key bytes plus the operator word and reduce to a column index
ret_bucket = AwareHash(key_str,
(unsigned int)(tbl->lgn/8 + sizeof(unsigned int))) % (tbl->w);
// return
return ret_bucket;
}
dyn_tbl.hpp/cpp
关键词
关键字的定义,最大长度为13字节。
/// Size in bytes of the fixed key buffer inside dyn_tbl_key_t.
#define MAX_KEYLEN 13
/**
 * Structure of key
 *
 * Fixed-size byte buffer used as the key type of the per-bucket map.
 * NOTE(review): the hash and equality functors process all MAX_KEYLEN
 * bytes, so bytes beyond the real key length presumably have to be zeroed
 * consistently by whoever fills the key - confirm against the callers.
 */
typedef struct dyn_tbl_key_s {
/// 13-byte key
unsigned char key[MAX_KEYLEN];
} dyn_tbl_key_t;
哈希键值
定义一个哈希函数的对象。返回该键值的哈希值
/**
 * Hash functor for dyn_tbl_key_t
 *
 * Hashes all MAX_KEYLEN bytes of the key with AwareHash.
 */
typedef struct {
    /// function-call operator: returns the hash of k
    long operator() (const dyn_tbl_key_t &k) const {
        unsigned char* bytes = (unsigned char*)k.key;
        return AwareHash(bytes, MAX_KEYLEN);
    }
} dyn_tbl_key_hash;
比较函数
定义一个比较函数,判断两个字串是否相等
/**
 * Equality functor for dyn_tbl_key_t
 *
 * Two keys are equal when all MAX_KEYLEN bytes match.
 */
typedef struct {
    /// function-call operator: true when x and y hold identical bytes
    bool operator() (const dyn_tbl_key_t &x, const dyn_tbl_key_t &y) const {
        const int diff = memcmp(x.key, y.key, MAX_KEYLEN);
        return diff == 0;
    }
} dyn_tbl_key_eq;
桶的数据结构
包括A(i,j),V(i,j),L(i,j),e(i,j),T,maximum.
/**
 * Bucket structure
 *
 * One bucket of the sketch: a map from key to counter plus the bookkeeping
 * values kept alongside it. Because it contains a std::unordered_map, this
 * is not a plain-old-data type: instances need their constructor and
 * destructor to run.
 */
typedef struct dyn_tbl_s {
/// associative array: A(i,j), mapping each tracked key to its counter
std::unordered_map<dyn_tbl_key_t, long long, dyn_tbl_key_hash, dyn_tbl_key_eq> array;
/// total sum: V(i,j)
long long total;
/// maximum length of counters allowed, exceeding this value would trigger expansion: l(i, j)
unsigned int max_len;
/// total number of decrement: e(i, j)
unsigned int decrement;
/// expansion parameter: T
long long T;
/// maximum sum among keys, to speed up detection
long long max_value;
/***********************
* read only members
***********************/
/// length of keys (compared against 32 when printing, so presumably in bits)
unsigned int lgn;
} dyn_tbl_t;
初始化
有参传参,无参置零
dyn_tbl_t* dyn_tbl_init(unsigned int length, int lgn, long long T) {
dyn_tbl_t* ret = (dyn_tbl_t*)calloc(1, sizeof(dyn_tbl_t));
ret->lgn = lgn;
ret->max_len = length;
ret->decrement = 0;
ret->total = 0;
ret->T = T;
ret->max_value = 0;
return ret;
}
destroy
free掉这个桶的内存空间
void dyn_tbl_destroy(dyn_tbl_t* dyn_tbl) {
free(dyn_tbl);
}
重置函数
将桶的参数置零
/**
 * Empty a bucket: drop every stored key and zero the running statistics
 * (decrement, total, max_value). max_len, T and lgn are left as they are.
 */
void dyn_tbl_reset(dyn_tbl_t* dyn_tbl) {
    dyn_tbl_t& bucket = *dyn_tbl;

    bucket.array.clear();
    bucket.max_value = 0;
    bucket.total = 0;
    bucket.decrement = 0;
}
复制函数
把源桶的关联数组 array 以及 decrement、total、max_len、max_value 复制到目标桶;T 和 lgn 不会被复制。
/**
 * Copy the contents of one bucket into another.
 *
 * Copies the key/counter map together with decrement, total, max_len and
 * max_value. The destination's T and lgn are not touched.
 *
 * @param dyn_tbl_from source bucket
 * @param dyn_tbl_to   destination bucket
 */
void dyn_tbl_copy(dyn_tbl_t* dyn_tbl_from, dyn_tbl_t* dyn_tbl_to) {
    dyn_tbl_t& src = *dyn_tbl_from;
    dyn_tbl_t& dst = *dyn_tbl_to;

    dst.array = src.array;
    dst.decrement = src.decrement;
    dst.total = src.total;
    dst.max_len = src.max_len;
    dst.max_value = src.max_value;
}
输出函数
输出桶内的值,将值输出到文件中。主要输出的是桶内的IP地址的值。
/**
 * Dump the contents of a bucket to a text file.
 *
 * The first line is "length: <number of keys>". Each following line holds
 * one key and its counter: when lgn == 32 the first 4 key bytes are printed
 * as one IP address, otherwise the first 8 key bytes are printed as two IP
 * addresses.
 *
 * Terminates the process with exit(-1) if the file cannot be opened.
 *
 * @param dyn_tbl bucket to print
 * @param output  path of the file to (over)write
 */
void dyn_tbl_print(dyn_tbl_t* dyn_tbl, const char* output) {
    FILE* fp;

    // open a file
    if ((fp = fopen(output, "w")) == NULL) {
        fprintf(stderr, "ERR: cannot open %s\n", output);
        exit(-1);
    }

    unsigned int len = dyn_tbl->array.size();
    fprintf(fp, "length: %u\n", len);

    for (auto it = dyn_tbl->array.begin(); it != dyn_tbl->array.end(); ++it) {
        const dyn_tbl_key_t& key = it->first;
        char addr1[30];
        char addr2[30];

        // key.key is a byte array with no alignment guarantee, so the
        // addresses are extracted with memcpy rather than by dereferencing
        // a casted unsigned int pointer (misaligned / aliasing-unsafe).
        unsigned int first;
        memcpy(&first, key.key, sizeof(first));

        if (dyn_tbl->lgn == 32) {
            fprintf(fp, "%s %lld\n", ip2a(first, addr1), it->second);
        }
        else {
            unsigned int second;
            memcpy(&second, key.key + sizeof(first), sizeof(second));
            fprintf(fp, "%s %s %lld\n", ip2a(first, addr1), ip2a(second, addr2), it->second);
        }
    }

    // close the file
    fclose(fp);
}
返回这个桶内的长度
/**
 * Number of keys currently stored in the bucket.
 */
int dyn_tbl_length(dyn_tbl_t* dyn_tbl) {
    const int entries = dyn_tbl->array.size();
    return entries;
}