spark和flink中计算topN的方法

一、SPARK

  其中,top 算子底层调用的是 takeOrdered 算子;takeOrdered 底层使用的是有界优先队列(BoundedPriorityQueue):首先通过 mapPartitions 在每个分区内求出局部 topN,然后使用 reduce 将各分区的结果合并。

  • sortBy + take
    // Word count followed by a descending sort on the count; the first three
    // (word, count) pairs are brought to the driver and printed.
    val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
    val lines: RDD[String] = sc.textFile(url.getPath)
    val counts = lines
       .flatMap(line => line.split("\\s+"))
       .map(word => (word, 1))
       .reduceByKey((a, b) => a + b)
    counts
       .sortBy(pair => pair._2, ascending = false)
       .take(3)
       .foreach(pair => println(pair))
  • top
    // Word count, then top(3) with an ordering on the count, so the three
    // pairs with the largest counts are returned and printed.
    val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
    val lines: RDD[String] = sc.textFile(url.getPath)
    val counts = lines
       .flatMap(line => line.split("\\s+"))
       .map(word => (word, 1))
       .reduceByKey((a, b) => a + b)
    val byCount = Ordering.by[(String, Int), Int](pair => pair._2)
    counts
       .top(3)(byCount)
       .foreach(pair => println(pair))
            
  • takeOrdered
    // Word count, then takeOrdered(3) with the count negated in the ordering,
    // so the "smallest" three under that ordering are the largest counts.
    val url: URL = Launcher.getClass.getClassLoader.getResource("word.dat")
    val lines: RDD[String] = sc.textFile(url.getPath)
    val counts = lines
       .flatMap(line => line.split("\\s+"))
       .map(word => (word, 1))
       .reduceByKey((a, b) => a + b)
    val byNegatedCount = Ordering.by[(String, Int), Int](pair => -pair._2)
    counts
       .takeOrdered(3)(byNegatedCount)
       .foreach(pair => println(pair))

     

二、FLINK

  • TreeMap
    /**
     * Word-count job: reads lines from a socket, counts words over a sliding
     * processing-time window, and prints the three highest counts of every
     * 30-second all-window to stderr.
     */
    public class TopNWithStateStat {
        public static void main(String[] args) throws Exception {

            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
            env.setParallelism(1);

            DataStreamSource<String> lines = env.socketTextStream("node", 9999);

            // Per-word counts over a 5-minute window that slides every 30 seconds.
            SingleOutputStreamOperator<Tuple2<String, Integer>> wordcount = lines
                    .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                        @Override
                        public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
                            // Split on runs of whitespace and emit (word, 1) for each token.
                            for (String word : line.split("\\s+")) {
                                collector.collect(Tuple2.of(word, 1));
                            }
                        }
                    })
                    .keyBy(t -> t.f0)
                    .window(SlidingProcessingTimeWindows.of(Time.minutes(5), Time.seconds(30)))
                    .sum(1);

            // Collect the per-word counts into one non-keyed window and rank them.
            wordcount
                    .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(30)))
                    .process(new TopNWithTreeMap(3))
                    .printToErr();


            env.execute();
        }

        /**
         * Emits the {@code n} elements with the highest count ({@code f1}) in a
         * window, formatted as {@code word:count}, highest count first.
         *
         * <p>Elements are kept in a {@link TreeMap} keyed by count in descending
         * order. Elements sharing a count live in the same bucket in arrival
         * order, so ties never overwrite each other and the comparator can honor
         * the {@link Comparator} contract (it returns 0 for equal keys).
         */
        public static class TopNWithTreeMap extends ProcessAllWindowFunction<Tuple2<String, Integer>, String, TimeWindow> {

            /** Maximum number of elements emitted per window. */
            private final int n;

            /**
             * @param n how many top elements to emit per window; must not be null
             */
            public TopNWithTreeMap(Integer n) {
                this.n = n;
            }

            /**
             * Ranks the window's elements by count and emits at most {@code n} of them.
             *
             * @param context   window context (unused)
             * @param iterable  all (word, count) elements of the window
             * @param collector receives one {@code word:count} string per retained element
             */
            @Override
            public void process(Context context, Iterable<Tuple2<String, Integer>> iterable, Collector<String> collector) throws Exception {
                // Fully-qualified List/ArrayList: the file's import block is not
                // visible here, so no new import is assumed.
                TreeMap<Integer, java.util.List<Tuple2<String, Integer>>> buckets =
                        new TreeMap<>(Comparator.<Integer>reverseOrder());
                int kept = 0;

                for (Tuple2<String, Integer> t : iterable) {
                    buckets.computeIfAbsent(t.f1, k -> new java.util.ArrayList<>()).add(t);
                    kept++;
                    if (kept > n) {
                        // Evict the newest element of the lowest count so that
                        // memory stays bounded by n elements.
                        java.util.List<Tuple2<String, Integer>> lowest = buckets.lastEntry().getValue();
                        lowest.remove(lowest.size() - 1);
                        if (lowest.isEmpty()) {
                            buckets.pollLastEntry();
                        }
                        kept--;
                    }
                }

                // Descending by count; arrival order within the same count.
                for (java.util.List<Tuple2<String, Integer>> bucket : buckets.values()) {
                    for (Tuple2<String, Integer> value : bucket) {
                        collector.collect(String.format("%s:%d", value.f0, value.f1));
                    }
                }
            }
        }
    }
  • 小顶堆
    /**
     * Emits the {@code n} elements with the highest count ({@code f1}) in a
     * window, formatted as {@code word:count}, highest count first.
     *
     * <p>A min-heap bounded to {@code n} elements is used: whenever it grows
     * past {@code n}, the element with the smallest count is discarded.
     */
    public static class TopNWithQueue extends ProcessAllWindowFunction<Tuple2<String, Integer>, String, TimeWindow> {

            /** Maximum number of elements emitted per window. */
            private final int n;

            /**
             * @param n how many top elements to emit per window; must not be null
             */
            public TopNWithQueue(Integer n) {
                this.n = n;
            }

            /**
             * Ranks the window's elements by count and emits at most {@code n} of them.
             *
             * @param context   window context (unused)
             * @param iterable  all (word, count) elements of the window
             * @param collector receives one {@code word:count} string per retained element
             */
            @Override
            public void process(Context context, Iterable<Tuple2<String, Integer>> iterable, Collector<String> collector) throws Exception {
                // Integer.compare returns 0 for equal counts, as the Comparator contract requires.
                PriorityQueue<Tuple2<String, Integer>> heap =
                        new PriorityQueue<>((a, b) -> Integer.compare(a.f1, b.f1));

                for (Tuple2<String, Integer> t : iterable) {
                    heap.add(t);
                    if (heap.size() > n) {
                        // Head of the min-heap is the smallest count: drop it.
                        heap.poll();
                    }
                }

                // Iterating a PriorityQueue gives no particular order, so drain it
                // instead: poll() yields ascending counts, and filling the array
                // from the back produces descending output.
                String[] ranked = new String[heap.size()];
                for (int i = ranked.length - 1; i >= 0; i--) {
                    Tuple2<String, Integer> t = heap.poll();
                    // Two specifiers for two arguments; the former "%s-%s:%d"
                    // had three and threw MissingFormatArgumentException.
                    ranked[i] = String.format("%s:%d", t.f0, t.f1);
                }
                for (String line : ranked) {
                    collector.collect(line);
                }
            }
        }

     

posted @ 2021-12-13 14:44  Shydow  阅读(282)  评论(0编辑  收藏  举报