Hadoop 蓄水池抽样 分布式抽样
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Hadoop streaming mapper: emit a uniform random sample of the stdin lines.

Each mapper reads its input split line by line from stdin, keeps a
fixed-size reservoir of at most K lines (Algorithm R), and writes the
reservoir to stdout once the input is exhausted.
"""
import sys
import os
import random

# Sizing note carried over from the original comments: the job had 1565
# input splits and 3000000 / 1565.0 ~= 1916.93, rounded up to 2000 lines per
# mapper.  (3000000 is presumably the total sample size wanted -- confirm.)
K = 2000


def reservoir_sample(lines, k=K, rng=random):
    """Return a uniform random sample of at most *k* stripped lines.

    Args:
        lines: iterable of strings; each one is ``strip()``-ed before being
            stored, as the original script did.
        k: reservoir capacity.  Values <= 0 yield an empty sample.
        rng: object providing ``randint(a, b)`` (inclusive bounds); defaults
            to the ``random`` module.  Pass ``random.Random(seed)`` for
            reproducible results.

    Returns:
        A list of ``min(k, number_of_lines)`` lines.  The first k lines fill
        the reservoir in order; afterwards every line of the input has the
        same probability of being in the result.
    """
    pool = []
    for seen, line in enumerate(lines):
        line = line.strip()
        if seen < k:
            # Reservoir not full yet: keep everything.
            pool.append(line)
            continue
        # `seen` lines came before this one, so it is line number seen + 1
        # and must be kept with probability k / (seen + 1).  Drawing one
        # slot uniformly from [0, seen] does exactly that: k of the
        # seen + 1 outcomes fall inside the reservoir, and the outcome also
        # selects which entry is evicted.  (The previous code accepted with
        # probability 1 / seen, which under-samples late lines.)
        slot = rng.randint(0, seen)
        if slot < k:
            pool[slot] = line
    return pool


def main():
    """Sample stdin and write the reservoir to stdout, one line per record."""
    pool = reservoir_sample(sys.stdin)
    # An empty split produces no output at all instead of one blank record.
    if pool:
        sys.stdout.write(os.linesep.join(pool) + "\n")


if __name__ == "__main__":
    main()