Spark 数据分析:分组取 TopN

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
package com.swust.seltop;
 
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
 
import java.util.*;
 
/**
 *
 * @author 雪瞳
 * @Slogan 时钟尚且前行,人怎能再此止步!
 * @Function 分组取TopN
 *
 */
public class SortTopN {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("top");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        jsc.setLogLevel("Error");
 
        String inputPath = "./data/top.txt";
        JavaRDD<String> input = jsc.textFile(inputPath,1);
        //top10类
        JavaPairRDD<String, Integer> pairRDD = input.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                // 14 cat1 cat1
                String[] splits = line.split(" ");
                Tuple2<String, Integer> tp = new Tuple2<>(splits[0]+"\t"+splits[1]+"\t"+splits[2], Integer.parseInt(splits[0]));
                return tp;
            }
        });
        //为每一个分区创建一个本地 top10列表
        JavaRDD<SortedMap<Integer, String>> singleTop10 = pairRDD.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, SortedMap<Integer, String>>() {
            @Override
            public Iterator<SortedMap<Integer, String>> call(Iterator<Tuple2<String, Integer>> iterator) throws Exception {
                SortedMap<Integer, String> top = new TreeMap<>();
                while (iterator.hasNext()) {
                    Tuple2<String, Integer> next = iterator.next();
                    top.put(next._2, next._1);
                    //保留正序前10
                    if (top.size() > 10) {
                        top.remove(top.firstKey());
                    }
                }
                List<SortedMap<Integer, String>> list = Collections.singletonList(top);
                return list.iterator();
            }
        });
        //收集所有本地的top10 列表
        List<SortedMap<Integer, String>> singleResult = singleTop10.collect();
        SortedMap<Integer,String> finalResult = new TreeMap<>();
        for (SortedMap<Integer, String> elements : singleResult){
            //遍历map并将数据存储到finalResult内
            Set<Map.Entry<Integer, String>> entries = elements.entrySet();
            for (Map.Entry<Integer,String> entry:entries){
                finalResult.put(entry.getKey(),entry.getValue());
            }
 
            if (finalResult.size()>10){
                finalResult.remove(finalResult.firstKey());
            }
        }
        //输出结果
        for (Map.Entry<Integer,String> entry : finalResult.entrySet()){
            System.err.println(entry.getKey()+"------"+entry.getValue());
        }
        // 替代方案 使用reduce进行数据迭代
        /*singleTop10.reduce(new Function2<SortedMap<Integer, String>, SortedMap<Integer, String>, SortedMap<Integer, String>>() {
            @Override
            public SortedMap<Integer, String> call(SortedMap<Integer, String> sm1, SortedMap<Integer, String> sm2) throws Exception {
                SortedMap<Integer,String> top10 = new TreeMap<>();
                for (Map.Entry<Integer,String> entry : sm1.entrySet()){
                    top10.put(entry.getKey(),entry.getValue());
                    if (top10.size()>10){
                        top10.remove(top10.firstKey());
                    }
                }
                for (Map.Entry<Integer,String> entry : sm2.entrySet()){
                    top10.put(entry.getKey(),entry.getValue());
                    if (top10.size()>10){
                        top10.remove(top10.firstKey());
                    }
                }
                return top10;
            }
        });*/
 
    }
}

  

posted @ 雪瞳  阅读(370)  评论(0)  编辑  收藏  举报
编辑推荐:
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
· 现代计算机视觉入门之:什么是图片特征编码
· .NET 9 new features-C#13新的锁类型和语义
阅读排行:
· Sdcb Chats 技术博客:数据库 ID 选型的曲折之路 - 从 Guid 到自增 ID,再到
· 语音处理 开源项目 EchoSharp
· 《HelloGitHub》第 106 期
· Spring AI + Ollama 实现 deepseek-r1 的API服务和调用
· 使用 Dify + LLM 构建精确任务处理应用
点击右上角即可分享
微信分享提示