# Test: counting and writing out statistics for 30 million records
import hashlib

# How often (in records) the loops print a progress figure.
PROGRESS_INTERVAL = 10000


def gen_test_data(count=30000000, path="./md5.txt"):
    """Write ``count`` MD5-derived integers to ``path``, one per line.

    Record ``i`` is the MD5 digest of the text ``'adsf' + str(i)``
    interpreted as a base-16 integer and written in decimal.  Every value
    is also stored in an in-memory dict, whose size is printed every
    ``PROGRESS_INTERVAL`` records.

    Args:
        count: Number of records to generate.
        path: Destination file; an existing file is overwritten.
    """
    seen = {}
    # The context manager guarantees the file is flushed and closed.
    with open(path, "w") as out:
        i = 0
        while i < count:
            # md5() requires bytes on Python 3; encoding is harmless on 2.
            digest = hashlib.md5(("adsf" + str(i)).encode("utf-8"))
            n = int(digest.hexdigest(), 16)
            seen[n] = i
            if i % PROGRESS_INTERVAL == 0:
                print(len(seen))
            out.write("%d\n" % n)
            i += 1


def test(in_path="./md5.txt", out_path="./result.txt"):
    """Count repeated lines of ``in_path`` and write running totals.

    Lines are keyed by the integer value of their MD5 digest (line
    terminator excluded).  For every input line, one output record
    ``<line>\\t<occurrences so far>`` is written to ``out_path``.  The index
    of the current line is printed every ``PROGRESS_INTERVAL`` lines.

    Args:
        in_path: File to read, one value per line.
        out_path: File receiving the tab-separated records; overwritten.
    """
    counts = {}
    # Binary mode lets the raw bytes be hashed directly on Python 2 and 3.
    with open(in_path, "rb") as src, open(out_path, "wb") as dst:
        for i, raw in enumerate(src):
            # Drop the terminator so the count stays on the same output
            # line as its value and a final unterminated line still matches.
            value = raw.rstrip(b"\r\n")
            key = int(hashlib.md5(value).hexdigest(), 16)
            occurrences = counts.get(key, 0) + 1
            counts[key] = occurrences
            dst.write(value + b"\t" + str(occurrences).encode("ascii") + b"\n")
            if i % PROGRESS_INTERVAL == 0:
                print(i)


# Despite its name this is a benchmark routine, not a unit test; keep
# pytest from collecting it.
test.__test__ = False


if __name__ == "__main__":
    # Call gen_test_data() once beforehand to create ./md5.txt.
    test()