LD-sketch源码阅读

util.h

IP转换函数,将二进制的IP地址转化为点分十进制的形式。

/*
 * Convert IP (in network order) to string.
 *
 * Writes the dotted-decimal form of `ip` into `addr` and returns `addr`.
 * The least-significant byte of `ip` is printed first, which matches a
 * network-order address loaded on a little-endian host.
 *
 * `addr` must have room for at least 16 bytes ("255.255.255.255" + NUL);
 * the size is not checked here.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a non-inlined call can fail to link.
 */
static inline char* ip2a(uint32_t ip, char* addr) {
    /* The operands are unsigned, so the conversion must be %u, not %d. */
    sprintf(addr, "%u.%u.%u.%u",
            (unsigned int)(ip & 0xff),
            (unsigned int)((ip >> 8) & 0xff),
            (unsigned int)((ip >> 16) & 0xff),
            (unsigned int)((ip >> 24) & 0xff));
    return addr;
}

hash.hpp/cpp

mangle函数

这里的mangle并不是C++编译器的名称修饰(name mangling),而是对key做一次打乱:把key的各字节按大端序拼成一个整数,乘以常数2083697005后只保留低32位,再按小端序逐字节写回ret_key。
(也可能与seq-hash有关:seq-hash用于测量heavy hitter和heavy changer,提升了内存使用率,减少了计算开销,最关键的是可以恢复流的信息。
它把一个key值按照位分为几个段,分别hash进一个二维表里面,最后再把这些二维表的值合并出这个key。)

/*
 * Scramble a key by multiplying it with a fixed odd constant.
 *
 * The bytes of `key` are read as a big-endian integer (key[nbytes-1] is the
 * least-significant byte), multiplied by 2083697005 and reduced to the low
 * 32 bits. The result is written to `ret_key` least-significant byte first.
 *
 * Because of the 32-bit mask only the last four bytes of `key` influence
 * the result, and ret_key[4..nbytes-1] are always written as zero.
 *
 * @param key     input key, `nbytes` bytes long
 * @param ret_key output buffer, `nbytes` bytes long
 * @param nbytes  key length in bytes
 */
void mangle(const unsigned char* key, unsigned char* ret_key,
		int nbytes) {
	unsigned long long new_key = 0;
	int i;
	/* Widen each byte before shifting: shifting the promoted int by 32 or
	 * more bits (or into its sign bit) is undefined behaviour. Bytes beyond
	 * the eighth would be shifted out of the 64-bit accumulator entirely. */
	for (i = 0; i < nbytes && i < 8; ++i) {
		new_key |= (unsigned long long)key[nbytes-i-1] << (i * 8);
	}
	new_key = (new_key * 2083697005ULL) & 0xffffffffULL;
	for (i = 0; i < nbytes; ++i) {
		/* Positions past the 64-bit accumulator are zero by definition. */
		ret_key[i] = (i < 8) ? (unsigned char)((new_key >> (i * 8)) & 0xff) : 0;
	}
}

GenHashSeed函数

返回第index个哈希种子:全局seed为0时先用rand()初始化,然后对seed + index做一次哈希并返回结果。

/* Base value shared by all generated seeds; 0 means "not drawn yet". */
uint64_t seed = 0;

/*
 * Derive the hash seed for position `index`.
 *
 * On the first call (while `seed` is still 0) the base value is drawn from
 * rand(). srand() is not called here, so the sequence depends on whatever
 * state the caller left the generator in.
 *
 * NOTE(review): the mangled copy is computed but never read; the returned
 * hash is taken over the un-mangled value. Left as-is to keep the generated
 * seeds unchanged.
 */
uint64_t GenHashSeed(int index) {
    if (!seed) {
        seed = rand();
    }

    uint64_t input = seed + index;
    uint64_t mangled;
    mangle((const unsigned char*)&input, (unsigned char*)&mangled, 8);

    return AwareHash((uint8_t*)&input, 8, 388650253, 388650319, 1176845762);
}

AwareHash模块

接受参数:原数据、原数据长度。哈希值的初始值为388650253;每处理一个字节,先把哈希值乘以规模(scale)388650319,再加上当前字节,然后指针后移一位,数据有多长就循环多少次。数据中心的流量是非常巨大的,要对如此巨大的流量产生随机分布的哈希值,中间结果也将非常巨大,也难怪在自己的虚拟机上,把数值调小了几倍还是溢出——不过这里用的是unsigned int,溢出只是按2^32取模回绕,属于预期行为。最后将结果与1176845762做异或。

/**
 * Multiplicative byte hash.
 *
 * Starting from a fixed initial value, every input byte is folded in as
 * acc = acc * scale + byte (unsigned arithmetic, wrapping on overflow);
 * the final accumulator is XOR-ed with a fixed constant.
 *
 * @param data the binary to be hashed
 * @param n the length of binary to be hashed
 * @return the hash value
 */
static unsigned int AwareHash(const unsigned char* data, unsigned int n) {
	const unsigned int scale = 388650319;
	const unsigned int hardener = 1176845762;
	unsigned int acc = 388650253;
	unsigned int idx;

	for (idx = 0; idx < n; ++idx) {
		acc = acc * scale + data[idx];
	}
	return acc ^ hardener;
}

LDSketch.hpp/cpp

LDSketch更新函数,对一个sketch插入键值对。

可在前部分使用mangle函数encode key值(源码中该调用已被注释掉)。对每一行,利用LDSketch_find函数找到key所在的桶,再用dyn_tbl_update把键值更新到该桶内。

/*
 * Sequential hashing algorithm - encode keys (a.k.a. update step)
 *
 * For every row 0 .. sk->h - 1, looks up the bucket index of `key` with
 * LDSketch_find() over bits 0 .. sk->lgn - 1 and passes (key, val) to
 * dyn_tbl_update() for the bucket stored at row * sk->w + index.
 * The key is used as given; it is not mangled first.
 */
void LDSketch_update(LDSketch_t* sk, unsigned char* key, long long val) {
	int row = 0;

	while (row < sk->h) {
		int bucket = LDSketch_find(sk, key, 0, sk->lgn - 1, row);
		dyn_tbl_update(sk->tbl[row*sk->w+bucket], key, val);
		++row;
	}
}

LD_Sketch的find函数

计算一个key在sketch某一行中所落的桶号。先把key的副本初始化为0,再把key的第start_bit位到第end_bit位复制进去:剩余位数不少于8时整字节复制,不足8位时逐位复制。operator(oper)等于行数h乘以sketch的编号tbl_id再加上行号row_no,它被写到副本的第lgn/8字节处。最后把这个副本和长度(lgn/8 + sizeof(unsigned int))交给AwareHash,哈希值对宽度w取模后作为桶号返回。

/*
 * Compute the bucket index of `key` within row `row_no`.
 *
 * Bits start_bit .. end_bit of `key` are copied into a zeroed scratch
 * buffer, the per-row operator (tbl->h * tbl->tbl_id + row_no) is written
 * at byte offset tbl->lgn/8, and the first tbl->lgn/8 + sizeof(unsigned int)
 * bytes are hashed with AwareHash(). The hash modulo tbl->w is returned.
 * (An MD5-based hash was used here previously and has been disabled.)
 */
unsigned int LDSketch_find(LDSketch_t* tbl, const unsigned char* key, int start_bit,
		int end_bit, int row_no) {
	unsigned char buf[50];		// assume n/8 + 4 <= 50
	unsigned int oper;
	int pos = start_bit;		// start_bit == 0 in all cases

	memset(buf, 0, sizeof(buf));

	// while at least 8 bits remain, copy whole bytes
	for (; end_bit - pos + 1 >= 8; pos += 8) {
		buf[pos/8] = key[pos/8];
	}

	// fewer than 8 bits remain: copy them one at a time, MSB first
	for (; pos <= end_bit; ++pos) {
		const int mask = 1 << (8 - ((pos%8) + 1));
		if ((key[pos/8] & mask) > 0) {
			buf[pos/8] |= mask;
		}
	}

	// place the operator right behind the key bytes
	oper = tbl->h * tbl->tbl_id + row_no;
	memcpy(buf + tbl->lgn/8, &oper, sizeof(oper));

	return AwareHash(buf,
			(unsigned int)(tbl->lgn/8 + sizeof(unsigned int))) % (tbl->w);
}

dyn_tbl.hpp/cpp

关键词

关键字的定义,最大长度为13字节。

/// Size in bytes of the fixed-length key buffer
#define MAX_KEYLEN 13

/**
 * Structure of key
 *
 * A fixed-size byte buffer; the hash and equality functors for this type
 * always operate on all MAX_KEYLEN bytes.
 */
typedef struct dyn_tbl_key_s {
    /// 13-byte key
    unsigned char key[MAX_KEYLEN];
} dyn_tbl_key_t;

哈希键值

定义一个哈希函数的对象。返回该键值的哈希值

/**
 * Hash functor for dyn_tbl_key_t
 */
typedef struct {
    /// hash all MAX_KEYLEN bytes of the key with AwareHash
    long operator() (const dyn_tbl_key_t &k) const {
        unsigned char* bytes = (unsigned char*)k.key;
        return AwareHash(bytes, MAX_KEYLEN);
    }
} dyn_tbl_key_hash;

比较函数

定义一个比较函数,判断两个字串是否相等

/**
 * Equality functor for dyn_tbl_key_t
 */
typedef struct {
    /// two keys are equal when all MAX_KEYLEN bytes match
    bool operator() (const dyn_tbl_key_t &x, const dyn_tbl_key_t &y) const {
        const int diff = memcmp(x.key, y.key, MAX_KEYLEN);
        return diff == 0;
    }
} dyn_tbl_key_eq;

桶的数据结构

包括A(i,j),V(i,j),l(i,j),e(i,j),T,max_value,以及只读的key长度lgn。

/**
 * Bucket structure
 *
 * Holds a per-key counter map plus the bookkeeping values of one bucket.
 * Because it contains a std::unordered_map, this is a non-trivial C++ type:
 * the map member needs its constructor and destructor to run.
 */
typedef struct dyn_tbl_s {

    /// associative array: A(i,j), mapping a key to its counter
    std::unordered_map<dyn_tbl_key_t, long long, dyn_tbl_key_hash, dyn_tbl_key_eq> array;

    /// total sum: V(i,j)
    long long total;

    /// maximum length of counters allowed, exceeding this value would trigger expansion: l(i, j)
    unsigned int max_len;

    /// total number of decrement: e(i, j)
    unsigned int decrement;

    /// expansion parameter: T
    long long T;

    /// maximum sum among keys, to speed up detection
    long long max_value;

    /***********************
     * read only members
     ***********************/
    /// length of keys (dyn_tbl_print treats the value 32 as a single IPv4 address)
    unsigned int lgn;
} dyn_tbl_t;

初始化

有参传参,无参置零

/**
 * Allocate and initialise a bucket.
 *
 * @param length initial value of max_len
 * @param lgn    key length stored in the bucket
 * @param T      expansion parameter stored in the bucket
 * @return the new bucket, or NULL if the allocation fails
 */
dyn_tbl_t* dyn_tbl_init(unsigned int length, int lgn, long long T) {
    void* mem = calloc(1, sizeof(dyn_tbl_t));
    if (mem == NULL) {
        return NULL;
    }

    // calloc() only yields zeroed bytes; it never runs the constructor of
    // the std::unordered_map member. Construct the object in place
    // (placement new, declared in <new>) so the map is in a valid state.
    // The storage itself still comes from calloc().
    dyn_tbl_t* ret = new (mem) dyn_tbl_t();

    ret->lgn = lgn;
    ret->max_len = length;
    ret->decrement = 0;
    ret->total = 0;
    ret->T = T;
    ret->max_value = 0;
    return ret;
}

destroy

free掉这个桶的内存空间

/**
 * Release a bucket.
 *
 * @param dyn_tbl bucket to release; NULL is accepted and ignored
 */
void dyn_tbl_destroy(dyn_tbl_t* dyn_tbl) {
    if (dyn_tbl == NULL) {
        return;
    }
    // free() does not run C++ destructors; destroy the object explicitly
    // so the unordered_map member releases its nodes before the storage
    // is handed back.
    dyn_tbl->~dyn_tbl_t();
    free(dyn_tbl);
}

重置函数

将桶的计数状态清零:清空关联数组,并把decrement、total、max_value置零;max_len、T和lgn保持不变。

/**
 * Clear the counting state of a bucket.
 *
 * Empties the counter map and zeroes max_value, total and decrement.
 * max_len, T and lgn are not touched.
 */
void dyn_tbl_reset(dyn_tbl_t* dyn_tbl) {
    dyn_tbl_t& bucket = *dyn_tbl;

    bucket.max_value = 0;
    bucket.total = 0;
    bucket.decrement = 0;
    bucket.array.clear();
}

复制函数

将桶的参数复制

/**
 * Copy the state of one bucket into another.
 *
 * Copies the counter map, decrement, total, max_len and max_value.
 * T and lgn of the destination are left as they are.
 */
void dyn_tbl_copy(dyn_tbl_t* dyn_tbl_from, dyn_tbl_t* dyn_tbl_to) {
    dyn_tbl_t& src = *dyn_tbl_from;
    dyn_tbl_t& dst = *dyn_tbl_to;

    dst.max_value = src.max_value;
    dst.max_len = src.max_len;
    dst.total = src.total;
    dst.decrement = src.decrement;
    dst.array = src.array;
}

输出函数

输出桶内的值,将值输出到文件中。主要输出的是桶内的IP地址的值。

/**
 * Dump the contents of a bucket to a text file.
 *
 * Writes "length: <count>" followed by one line per stored key. When
 * lgn == 32 the first four key bytes are printed as one IPv4 address;
 * otherwise the first eight key bytes are printed as two addresses.
 * Each line ends with the key's counter value.
 *
 * Terminates the process with exit(-1) if the file cannot be opened.
 *
 * @param dyn_tbl bucket to print
 * @param output  path of the file to (over)write
 */
void dyn_tbl_print(dyn_tbl_t* dyn_tbl, const char* output) {
    // open a file
    FILE* fp = fopen(output, "w");
    if (fp == NULL) {
        fprintf(stderr, "ERR: cannot open %s\n", output);
        exit(-1);
    }

    unsigned int len = dyn_tbl->array.size();
    fprintf(fp, "length: %u\n", len);
    for (auto it = dyn_tbl->array.begin(); it != dyn_tbl->array.end(); ++it) {
        const dyn_tbl_key_t& key = it->first;

        // The key is a plain byte array with no alignment guarantee, so the
        // addresses are copied out with memcpy instead of being read
        // through a cast pointer.
        uint32_t ip1;
        char addr1[30];
        memcpy(&ip1, key.key, sizeof(ip1));

        if (dyn_tbl->lgn == 32) {
            fprintf(fp, "%s %lld\n", ip2a(ip1, addr1), it->second);
        }
        else {
            uint32_t ip2;
            char addr2[30];
            memcpy(&ip2, key.key + sizeof(ip1), sizeof(ip2));
            fprintf(fp, "%s %s %lld\n", ip2a(ip1, addr1), ip2a(ip2, addr2), it->second);
        }
    }

    // close the file
    fclose(fp);
}

返回这个桶内的长度

/**
 * Number of keys currently stored in the bucket's counter map.
 */
int dyn_tbl_length(dyn_tbl_t* dyn_tbl) {
    const auto count = dyn_tbl->array.size();
    return static_cast<int>(count);
}
posted @ 2018-11-19 19:40  范加索尔拉  阅读(937)  评论(0编辑  收藏  举报