Reading an HDFS File Encoded in GB2312 with Spark and Converting It to UTF-8
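Spark's ctx.textFile() decodes every line as UTF-8, so reading a GB2312 file with it produces garbled text. The workaround shown below reads the file with newAPIHadoopFile, which returns the raw bytes wrapped in a Hadoop Text object, and decodes those bytes as GB2312 by hand; when the results are later written with saveAsTextFile, the strings come out as UTF-8. The key step, extracted from the full program:

    // Decode the raw bytes held by a Hadoop Text value as GB2312.
    // getBytes() returns the backing buffer, which can be longer than
    // the actual content, so the valid length must be passed explicitly.
    String line = new String(text.getBytes(), 0, text.getLength(), "GB2312");

(In practice, files labeled GB2312 often contain characters from the larger GBK set, so decoding with "GBK", a superset of GB2312, can be more forgiving.)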

package iie.udps.example.operator.spark;
 
import scala.Tuple2;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
 
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.regex.Pattern;
 
/**
 * Reads an HDFS file encoded in GB2312 with Spark, converts it to UTF-8,
 * and runs a WordCount over it.
 *
 * Example submit command:
 * spark-submit --class iie.udps.example.operator.spark.SparkChangeTextCharsetTest
 * --master yarn-cluster /tmp/sparkTest.jar hdfs://192.168.8.101/test/words
 * hdfs://192.168.8.101/test/spark/out
 */
public final class SparkChangeTextCharsetTest {
    // Words in the input file are separated by commas.
    private static final Pattern COMMA = Pattern.compile(",");
 
    @SuppressWarnings("serial")
    public static void main(String[] args) throws Exception {
 
        if (args.length < 2) {
            System.err.println("Usage: SparkChangeTextCharsetTest <input> <output>");
            System.exit(1);
        }
        String inputSparkFile = args[0];
        String outputSparkFile = args[1];
 
        SparkConf sparkConf = new SparkConf().setAppName("SparkWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        Configuration conf = new Configuration();
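        // Read via newAPIHadoopFile rather than ctx.textFile(): textFile()
        // decodes the bytes as UTF-8 and would garble GB2312 content, while
        // this yields (byte offset, raw Text) pairs we can decode ourselves.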
        JavaPairRDD<LongWritable, Text> contents = ctx.newAPIHadoopFile(
                inputSparkFile, TextInputFormat.class, LongWritable.class,
                Text.class, conf);
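        // Decode each line's raw bytes as GB2312. Text.getBytes() returns
        // the backing buffer, which may be longer than the actual content,
        // so the valid length from getLength() must be passed explicitly.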
        JavaRDD<String> lines = contents
                .map(new Function<Tuple2<LongWritable, Text>, String>() {
 
                    public String call(Tuple2<LongWritable, Text> x) {
                        String line = null;
                        try {
                            line = new String(x._2().getBytes(), 0, x._2()
                                    .getLength(), "GB2312");
                        } catch (UnsupportedEncodingException e) {
                            e.printStackTrace();
                        }
                        return line;
                    }
                });
        // Optional: keep only lines containing a given substring, e.g.:
        // JavaRDD<String> changeLines = lines
        //         .filter(new Function<String, Boolean>() {
        //             public Boolean call(String s) {
        //                 return s.contains("234");
        //             }
        //         });
        // Split each decoded line into words on commas.
        JavaRDD<String> words = lines
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String s) {
                        return Arrays.asList(COMMA.split(s));
                    }
                });

        // Pair each word with an initial count of 1.
        JavaPairRDD<String, Integer> ones = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                });

        // Sum the counts for each distinct word.
        JavaPairRDD<String, Integer> counts = ones
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });

        // Format the results; saveAsTextFile writes strings as UTF-8,
        // which completes the GB2312-to-UTF-8 conversion.
        counts.map(new Function<Tuple2<String, Integer>, String>() {
            @Override
            public String call(Tuple2<String, Integer> arg0) throws Exception {
                return arg0._1.toUpperCase() + ": " + arg0._2;
            }
        }).saveAsTextFile(outputSparkFile);
 
        ctx.stop();
    }
}
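For reference, on Spark 2.x and later the same approach can be written much more compactly with Java lambdas; the main API difference is that FlatMapFunction.call returns an Iterator rather than an Iterable. A minimal sketch, not part of the original program (the class name is hypothetical):

    package iie.udps.example.operator.spark;

    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;
    import scala.Tuple2;

    // Hypothetical Spark 2.x version of the same GB2312-to-UTF-8 word count.
    public final class SparkChangeTextCharsetLambda {
        public static void main(String[] args) throws Exception {
            JavaSparkContext ctx = new JavaSparkContext(
                    new SparkConf().setAppName("SparkWordCount"));
            ctx.newAPIHadoopFile(args[0], TextInputFormat.class,
                            LongWritable.class, Text.class, new Configuration())
                    // Decode the raw Text bytes as GB2312, as in the 1.x version.
                    .map(t -> new String(t._2().getBytes(), 0,
                            t._2().getLength(), "GB2312"))
                    // Spark 2.x flatMap expects an Iterator, not an Iterable.
                    .flatMap(s -> Arrays.asList(s.split(",")).iterator())
                    .mapToPair(w -> new Tuple2<>(w, 1))
                    .reduceByKey(Integer::sum)
                    .map(t -> t._1.toUpperCase() + ": " + t._2)
                    .saveAsTextFile(args[1]);
            ctx.stop();
        }
    }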

  
