Spark 基于 key 排序的 WordCount
java
1 /**
2 * 根据单词次数排序的wordcount
3 * @author Tele
4 *
5 */
6 public class SortWordCount {
7 private static SparkConf conf = new SparkConf().setMaster("local").setAppName("sortwordcount");
8 private static JavaSparkContext jsc = new JavaSparkContext(conf);
9 private static String path = "D:\\inputword\\result.txt";
10
11 public static <U> void main(String[] args) {
12 JavaRDD<String> rdd = jsc.textFile(path);
13
14 /*
15 * JavaRDD<String> lines = rdd.flatMap(new FlatMapFunction<String,String>() {
16 *
17 * private static final long serialVersionUID = 1L;
18 *
19 * @Override public Iterator<String> call(String t) throws Exception { return
20 * Arrays.asList(t.split(" ")).iterator(); } });
21 *
22 * JavaPairRDD<String, Integer> tuples = lines.mapToPair(new
23 * PairFunction<String,String,Integer>() {
24 *
25 * private static final long serialVersionUID = 1L;
26 *
27 * @Override public Tuple2<String,Integer> call(String t) throws Exception {
28 * return new Tuple2<String,Integer>(t,1); } });
29 */
30
31 JavaPairRDD<String, Integer> tuples = rdd.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
32
33 private static final long serialVersionUID = 1L;
34
35 @Override
36 public Iterator<Tuple2<String, Integer>> call(String t) throws Exception {
37 Stream<Tuple2<String, Integer>> stream = Arrays.asList(t.split(" ")).stream()
38 .map(i -> new Tuple2<>(i, 1));
39 return stream.iterator();
40 }
41 });
42
43 JavaPairRDD<String, Integer> wc = tuples.reduceByKey(new Function2<Integer, Integer, Integer>() {
44
45 private static final long serialVersionUID = 1L;
46
47 @Override
48 public Integer call(Integer v1, Integer v2) throws Exception {
49 return v1 + v2;
50 }
51 });
52
53 // 将词频与单词互换位置
54 JavaPairRDD<Integer, String> cw = wc.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
55
56 private static final long serialVersionUID = 1L;
57
58 @Override
59 public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
60 return new Tuple2<Integer, String>(t._2, t._1);
61 }
62 });
63
64 JavaPairRDD<Integer, String> result = cw.sortByKey(false);
65 result.foreach(new VoidFunction<Tuple2<Integer, String>>() {
66
67 private static final long serialVersionUID = 1L;
68
69 @Override
70 public void call(Tuple2<Integer, String> t) throws Exception {
71 System.out.println(t._2 + "----" + t._1);
72 }
73 });
74
75 // 也可以在排序完毕后换成单词-词频的形式
76 /*
77 * JavaPairRDD<String, Integer> result = cw.sortByKey(false).mapToPair(new
78 * PairFunction<Tuple2<Integer,String>,String,Integer>() {
79 *
80 * private static final long serialVersionUID = 1L;
81 *
82 * @Override public Tuple2<String,Integer> call(Tuple2<Integer, String> t)
83 * throws Exception { return new Tuple2<String,Integer>(t._2,t._1); } });
84 *
85 * result.foreach(new VoidFunction<Tuple2<String,Integer>>() {
86 *
87 * private static final long serialVersionUID = 1L;
88 *
89 * @Override public void call(Tuple2<String, Integer> t) throws Exception {
90 * System.out.println(t._1 + "-------" + t._2); } });
91 */
92
93 jsc.close();
94 }
95 }
scala
/**
 * Word count whose output is ordered by how often each word occurs, most
 * frequent first.
 */
object SortWordCount {
  /** Input file used when no path is supplied on the command line. */
  private val DefaultPath = "D:\\inputword\\result.txt"

  /**
   * Runs the word-count job and prints one `word-----count` line per
   * distinct word.
   *
   * @param args optional; when present, `args(0)` replaces the default input path
   */
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse(DefaultPath)
    val conf = new SparkConf().setMaster("local").setAppName("sortwordcount")
    val sc = new SparkContext(conf)

    try {
      val wordcount = sc.textFile(path, 1)
        .flatMap(_.split(" "))
        // Blank lines and repeated spaces yield empty tokens; do not count them.
        .filter(_.nonEmpty)
        .map((_, 1))
        .reduceByKey(_ + _)

      // Sort by the count (second element), descending, in a single partition;
      // this replaces the manual swap / sortByKey / swap-back chain.
      wordcount
        .sortBy(_._2, ascending = false, numPartitions = 1)
        .foreach(t => println(t._1 + "-----" + t._2))
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}