# 测试3000万数据统计输出 (Test: tallying and writing statistics for 30 million records)

import hashlib


def gen_test_data(count=30000000, path="./md5.txt"):
    """Write ``count`` MD5-derived integers to ``path``, one per line.

    Line ``i`` (0-based) holds the MD5 digest of the string
    ``'adsf' + str(i)``, interpreted as a hexadecimal number and written
    in decimal.  Every 10000 records the number of distinct values
    generated so far is printed as a progress indicator.

    Args:
        count: Number of records to generate.  Defaults to the 30 million
            the script was written for.
        path: Output file; it is created or truncated.

    Returns:
        None.
    """
    # Only the number of distinct values is ever reported, so a set is
    # enough; the values are never looked up again.
    seen = set()
    i = 0
    # The context manager guarantees the data is flushed and the handle is
    # released even if generation is interrupted.
    with open(path, "w") as out:
        # A while loop keeps memory flat on Python 2, where range() would
        # materialize the whole sequence.
        while i < count:
            # md5 needs bytes on Python 3; encoding is a no-op for this
            # ASCII-only payload on Python 2.
            digest = hashlib.md5(("adsf" + str(i)).encode("utf-8")).hexdigest()
            n = int(digest, 16)
            seen.add(n)
            if i % 10000 == 0:
                print(len(seen))
            out.write("%d\n" % n)
            i += 1


def test(in_path="./md5.txt", out_path="./result.txt"):
    """Tally repeated lines of ``in_path`` and log running counts to ``out_path``.

    For each input line, one output line is written containing the input
    value, a tab, and the number of times that value has been seen so far
    (including the current occurrence).  The 0-based index of the current
    line is printed every 10000 lines as a progress indicator.

    Args:
        in_path: Text file to read, one value per line.
        out_path: Result file; it is created or truncated.

    Returns:
        None.

    Raises:
        IOError/OSError: If ``in_path`` cannot be opened for reading or
            ``out_path`` cannot be opened for writing.
    """
    counts = {}
    # Both handles are closed (and the output flushed) on any exit path.
    with open(in_path) as src, open(out_path, "w") as dst:
        for i, line in enumerate(src):
            # Drop the line terminator so the count stays on the same output
            # line as its value, and so a final line without a trailing
            # newline is tallied together with identical earlier lines.
            value = line.rstrip("\r\n")
            # The tally is keyed by the MD5 of the value (as an integer),
            # not by the value itself; md5 needs bytes on Python 3.
            key = int(hashlib.md5(value.encode("utf-8")).hexdigest(), 16)
            seen = counts.get(key, 0) + 1
            counts[key] = seen
            dst.write("%s\t%d\n" % (value, seen))

            if i % 10000 == 0:
                print(i)


# Call gen_test_data() once beforehand to create ./md5.txt; test() reads it.
# The guard keeps the (long-running) tally from starting on a plain import.
if __name__ == "__main__":
    test()





# gen_test_data()

 

# posted on 2017-11-01 20:31  学而知之者  阅读(149)  评论(0)  编辑  收藏  举报