spark和flink中计算topN的方法
一、SPARK
其中top算子底层调用的是takeOrdered算子,takeOrdered算子底层使用的是优先队列(BoundedPriorityQueue),首先进入的是mapPartitions,在每个分区内维护有界优先队列,然后使用reduce将各分区的结果进行合并
- sortBy + take
// Word count followed by a full descending sort on the count, then take the
// first 3. Note: sortBy performs a global shuffle-sort of every key, which is
// more expensive than top/takeOrdered when only a small N is needed.
val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
val lines: RDD[String] = sc.textFile(url.getPath)
val counts = lines
  .flatMap(_.split("\\s+"))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
counts
  .sortBy(_._2, ascending = false)
  .take(3)
  .foreach(println)
- top
// Word count, then top(3) with an ordering on the count. Spark's top delegates
// to takeOrdered with the reversed ordering, so each partition only keeps a
// bounded priority queue of N elements — no global sort is required.
val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
val lines: RDD[String] = sc.textFile(url.getPath)
val wordCounts = lines
  .flatMap(_.split("\\s+"))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
wordCounts
  .top(3)(Ordering.by(_._2))
  .foreach(println)
- takeOrdered
// Word count, then takeOrdered(3) with a descending ordering on the count.
// Fix: use Ordering.reverse instead of negating the key. Negation
// (Ordering.by(o => -o._2)) is subtly incorrect at Int.MinValue, because
// -Int.MinValue overflows back to Int.MinValue and silently breaks the
// ordering; .reverse is both safe and more idiomatic.
val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
val lines: RDD[String] = sc.textFile(url.getPath)
lines
  .flatMap(_.split("\\s+"))
  .map(_ -> 1)
  .reduceByKey(_ + _)
  .takeOrdered(3)(Ordering.by[(String, Int), Int](_._2).reverse)
  .foreach(println)
二、FLINK
- TreeMap
/**
 * Flink streaming job: per-word counts over a sliding window, then a global
 * Top-N over a tumbling window using a TreeMap as a bounded, sorted buffer.
 */
public class TopNWithStateStat {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        // Parallelism 1 so the windowAll/Top-N stage sees the full stream.
        env.setParallelism(1);
        // Source: whitespace-separated words from a socket on node:9999.
        DataStreamSource<String> lines = env.socketTextStream("node", 9999);
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordcount = lines
            .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
                    String[] arr = line.split("\\s+");
                    for (String word : arr) {
                        collector.collect(Tuple2.of(word, 1));
                    }
                }
            })
            .keyBy(t -> t.f0)
            // Per-word count over a 5-minute window sliding every 30 seconds.
            .window(SlidingProcessingTimeWindows.of(Time.minutes(5), Time.seconds(30)))
            .sum(1);
        wordcount
            // Non-keyed window: gathers all word counts for the global Top-N.
            .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(30)))
            .process(new TopNWithTreeMap(3))
            .printToErr();
        env.execute();
    }

    /**
     * Computes the Top-N word counts of each window.
     * Keeps a TreeMap keyed by count, ordered descending, and trims it to at
     * most {@code n} entries as elements arrive.
     */
    public static class TopNWithTreeMap extends ProcessAllWindowFunction<Tuple2<String, Integer>, String, TimeWindow> {
        // How many top entries to emit per window.
        private Integer n = 3;
        public TopNWithTreeMap(Integer n) { this.n = n; }
        @Override
        public void process(Context context, Iterable<Tuple2<String, Integer>> iterable, Collector<String> collector) throws Exception {
            TreeMap<Integer, Tuple2<String, Integer>> treeMap = new TreeMap<Integer, Tuple2<String, Integer>>(
                new Comparator<Integer>() {
                    // Descending order on count. NOTE(review): this comparator
                    // never returns 0, so tied counts are stored as separate
                    // entries instead of overwriting each other — apparently
                    // intentional to keep ties, but it violates the Comparator
                    // contract (compare(a,b) and compare(b,a) can both be 1),
                    // making order among equal counts insertion-dependent and
                    // TreeMap lookups for existing keys unreliable — confirm.
                    @Override
                    public int compare(Integer y, Integer x) { return (x < y) ? -1 : 1; }
                });
            Iterator<Tuple2<String, Integer>> iterator = iterable.iterator();
            while (iterator.hasNext()) {
                Tuple2<String, Integer> t = iterator.next();
                treeMap.put(t.f1, t);
                // Map is descending, so the last entry is the smallest count;
                // evict it to keep at most n entries.
                if (treeMap.size() > n) {
                    treeMap.pollLastEntry();
                }
            }
            // Emit "word:count" from highest to lowest count.
            for (Map.Entry<Integer, Tuple2<String, Integer>> entry : treeMap.entrySet()) {
                Tuple2<String, Integer> value = entry.getValue();
                collector.collect(String.format("%s:%d", value.f0, value.f1));
            }
        }
    }
}
- 小顶堆
/**
 * Computes the Top-N word counts of each window using a bounded min-heap.
 * The heap keeps the n largest counts seen so far; whenever it grows past n,
 * the current minimum (the heap head) is evicted.
 */
public static class TopNWithQueue extends ProcessAllWindowFunction<Tuple2<String, Integer>, String, TimeWindow> {
    // How many top entries to emit per window.
    private Integer n = 3;

    public TopNWithQueue(Integer n) {
        this.n = n;
    }

    @Override
    public void process(Context context, Iterable<Tuple2<String, Integer>> iterable, Collector<String> collector) throws Exception {
        // Min-heap ordered by count. Integer.compare returns 0 on ties,
        // honoring the Comparator contract (the original
        // `(o1.f1 < o2.f1) ? -1 : 1` never returned 0).
        PriorityQueue<Tuple2<String, Integer>> heap =
                new PriorityQueue<>((o1, o2) -> Integer.compare(o1.f1, o2.f1));
        for (Tuple2<String, Integer> t : iterable) {
            heap.add(t);
            if (heap.size() > n) {
                heap.poll(); // evict current minimum, keeping the n largest
            }
        }
        // Iterating a PriorityQueue yields no particular order, so drain it
        // with poll() to emit results deterministically (ascending by count).
        while (!heap.isEmpty()) {
            Tuple2<String, Integer> t = heap.poll();
            // BUG FIX: the original format string "%s-%s:%d" had three
            // specifiers but only two arguments, which throws
            // MissingFormatArgumentException at runtime. Use "%s:%d" to
            // match TopNWithTreeMap's output format.
            collector.collect(String.format("%s:%d", t.f0, t.f1));
        }
    }
}