Spark 用Scala和Java分别实现wordcount
Scala
import org.apache.spark.{SparkConf, SparkContext}

/** Word-count example: reads a text file, splits every line on single spaces,
  * counts how often each token occurs and prints the `(word, count)` pairs.
  */
object wordcount {

  /** Runs the word count on a local Spark master using all available cores.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wc_java").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val lines = sc.textFile("H:/server.properties")
      val words = lines.flatMap(line => line.split(" "))
      val pairs = words.map(word => (word, 1))
      val counts = pairs.reduceByKey(_ + _)
      // Bring the results to the driver once and print them there. Calling
      // foreach(println) directly on the RDD prints on the executors, which is
      // only visible on the driver console when running in local mode.
      counts.collect().foreach(println)
    } finally {
      // Always release the context, even if the job fails.
      sc.stop()
    }
  }
}
Java
import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import scala.Tuple2; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class WordCountJava { public static void main(String[] args) { //创建SparkConf对象 SparkConf conf = new SparkConf(); conf.setAppName("WordCountJava2"); conf.setMaster("local"); //创建java sc JavaSparkContext sc = new JavaSparkContext(conf); //加载文本文件 JavaRDD<String> rdd1 = sc.textFile("d:/scala//test.txt"); //压扁 JavaRDD<String> rdd2 = rdd1.flatMap(new FlatMapFunction<String, String>() { public Iterator<String> call(String s) throws Exception { List<String> list = new ArrayList<String>(); String[] arr = s.split(" "); for(String ss :arr){ list.add(ss); } return list.iterator(); } }); //映射,word -> (word,1) JavaPairRDD<String,Integer> rdd3 = rdd2.mapToPair(new PairFunction<String, String, Integer>() { public Tuple2<String, Integer> call(String s) throws Exception { return new Tuple2<String, Integer>(s,1); } }); //reduce化简 JavaPairRDD<String,Integer> rdd4 = rdd3.reduceByKey(new Function2<Integer, Integer, Integer>() { public Integer call(Integer v1, Integer v2) throws Exception { return v1 + v2; } }); // List<Tuple2<String,Integer>> list = rdd4.collect(); for(Tuple2<String, Integer> t : list){ System.out.println(t._1() + " : " + t._2()); } } }