Spark: reading a file from HDFS, running WordCount, and writing the result back to HDFS

The example below reads a text file from HDFS with textFile, splits each line into words, counts the words with a mapToPair/reduceByKey pipeline, and writes the formatted counts back to HDFS with saveAsTextFile. As written it targets the older Spark 1.x Java API (note the Iterable-returning FlatMapFunction).

package iie.udps.example.operator.spark;
 
import scala.Tuple2;
 
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import java.util.Arrays;
import java.util.regex.Pattern;
 
/**
 * A WordCount example that uses the Spark framework to read a file from HDFS
 * and write the word counts back to HDFS.
 *
 * Submit command: spark-submit --class
 * iie.udps.example.operator.spark.TextFileSparkTest --master yarn-cluster
 * /tmp/sparkTest.jar hdfs://192.168.8.101/test/words
 * hdfs://192.168.8.101/test/spark/out
 *
 * @author xiaodongfang
 *
 */
public final class TextFileSparkTest {
    private static final Pattern SPACE = Pattern.compile(" ");
 
    @SuppressWarnings("serial")
    public static void main(String[] args) throws Exception {
 
        if (args.length < 2) {
            System.err.println("Usage: TextFileSparkTest <input> <output>");
            System.exit(1);
        }
        String inputSparkFile = args[0];
        String outputSparkFile = args[1];
 
        SparkConf sparkConf = new SparkConf().setAppName("SparkWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
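        // Read the input file from HDFS; the second argument is the minimum number of partitions.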
        JavaRDD<String> lines = ctx.textFile(inputSparkFile, 1);
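        // Split each line on single spaces to get the individual words.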
        JavaRDD<String> words = lines
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String s) {
                        return Arrays.asList(SPACE.split(s));
                    }
                });
 
        // Emit a (word, 1) pair for every word.
        JavaPairRDD<String, Integer> ones = words
                .mapToPair(new PairFunction<String, String, Integer>() {
 
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                });
         
 
        // Add up the counts for each distinct word.
        JavaPairRDD<String, Integer> counts = ones
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });
 
        // Uppercase each word, format it as "WORD: count", and save the result to HDFS.
        counts.map(new Function<Tuple2<String, Integer>, String>() {
            @Override
            public String call(Tuple2<String, Integer> arg0) throws Exception {
                return arg0._1.toUpperCase() + ": " + arg0._2;
            }
        }).saveAsTextFile(outputSparkFile);
 
        ctx.stop();
    }
}
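
For comparison, here is the same job written against the lambda-friendly Java API of newer Spark releases. This is a minimal sketch assuming a Spark 2.x-or-later dependency (there, FlatMapFunction returns an Iterator instead of an Iterable, and jobs are submitted with --master yarn rather than yarn-cluster); the class name LambdaSparkTest is made up for the illustration.

package iie.udps.example.operator.spark;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public final class LambdaSparkTest {
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: LambdaSparkTest <input> <output>");
            System.exit(1);
        }

        SparkConf sparkConf = new SparkConf().setAppName("SparkWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        JavaRDD<String> lines = ctx.textFile(args[0]);
        // Spark 2.x flatMap expects an Iterator rather than an Iterable.
        JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
        JavaPairRDD<String, Integer> counts = words
                .mapToPair(s -> new Tuple2<>(s, 1))
                .reduceByKey((i1, i2) -> i1 + i2);
        // Same output format as the version above: one "WORD: count" per line.
        counts.map(t -> t._1.toUpperCase() + ": " + t._2).saveAsTextFile(args[1]);

        ctx.stop();
    }
}

Either version requires that the output path not exist yet: Hadoop's output format refuses to overwrite it and fails the job with a FileAlreadyExistsException, and the counts are written as part-* files inside that directory.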

  
