Spark WordCount Example (Java version)
First, create a Maven project for Spark; I am using Eclipse here.
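If you prefer the command line to the Eclipse wizard, a quickstart project with the same coordinates as the POM in step 4 can be generated with the Maven archetype plugin (a minimal sketch; adjust the coordinates as needed):

    mvn archetype:generate \
        -DgroupId=com.mengyao \
        -DartifactId=spark.java \
        -DarchetypeArtifactId=maven-archetype-quickstart \
        -DinteractiveMode=false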
1. Write the WordCountApp code
package com.mengyao.spark.java.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Spark WordCount program
 * @author mengyao
 */
public class WordCountApp {

    public static void main(String[] args) {

        /**
         * 1. Create a SparkConf object holding the application's configuration.
         */
        SparkConf conf = new SparkConf()
                // Set the application name
                .setAppName(WordCountApp.class.getSimpleName());

        /**
         * 2. Create the SparkContext. Java applications use JavaSparkContext; Scala applications use SparkContext.
         *    In Spark, the SparkContext connects to the cluster and creates RDDs, accumulators, and broadcast variables.
         *    The master setting determines which TaskScheduler is created (the lower-level scheduler; the higher-level one is the DAGScheduler):
         *    - setMaster("local") creates a local scheduler backend;
         *    - setMaster("spark://...") creates a SparkDeploySchedulerBackend, whose start() method launches a client that connects to the Spark cluster.
         */
        JavaSparkContext sc = new JavaSparkContext(conf);

        /**
         * 3. textFile is defined on SparkContext:
         *    def textFile(path: String): JavaRDD[String] = sc.textFile(path)
         *    It reads a text file from HDFS, from a node-local path, or from any Hadoop-supported file system,
         *    and returns a JavaRDD[String] with one element per line.
         */
        JavaRDD<String> lines = sc.textFile("hdfs://soy1:9000/mapreduces/word.txt");

        /**
         * 4. Split each line into words.
         *    flatMap is a transformation (its argument implements FlatMapFunction) that returns every word of every line.
         */
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = -3243665984299496473L;
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split("\t"));
            }
        });

        /**
         * 5. Give each word an initial count of 1.
         *    mapToPair is a transformation (its argument implements PairFunction; the three type parameters of
         *    PairFunction<String, String, Integer> are <input word, Tuple2 key, Tuple2 value>) that returns a new RDD, a JavaPairRDD.
         */
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = -7879847028195817507L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        /**
         * 6. Count how many times each word occurs.
         *    reduceByKey is a transformation (its argument implements Function2) that reduces the values of each key and
         *    returns a JavaPairRDD whose tuples have the word as key and the total count as value.
         */
        JavaPairRDD<String, Integer> wordCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = -4171349401750495688L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        /**
         * 7. Use the foreach action to print the result and trigger job submission.
         *    In Spark, transformations only describe the computation; it is an action that ultimately submits the job.
         */
        wordCount.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = -5926812153234798612L;
            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1 + ":" + wordCount._2);
            }
        });

        /**
         * 8. Write the result to a file system.
         *    HDFS:
         *    With the new API (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat):
         *      wordCount.saveAsNewAPIHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, TextOutputFormat.class, new Configuration());
         *    With the old API (org.apache.hadoop.mapred.JobConf; org.apache.hadoop.mapred.OutputFormat):
         *      wordCount.saveAsHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, OutputFormat.class, new JobConf(new Configuration()));
         *    With the default TextOutputFormat to HDFS (mind HDFS write permissions; if needed run: hdfs dfs -chmod -R 777 /spark):
         *      wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
         */
        wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");

        /**
         * 9. Close the SparkContext and finish the job.
         */
        sc.close();
    }
}
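For reference, the same pipeline can be written much more compactly with Java 8 lambda expressions. This is only a sketch: it assumes the project is compiled with source/target 1.8 rather than the 1.6 configured in the POM below, and the class name WordCountLambdaApp is hypothetical.

package com.mengyao.spark.java.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Hypothetical class name; functionally equivalent to WordCountApp, written with Java 8 lambdas.
public class WordCountLambdaApp {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(WordCountLambdaApp.class.getSimpleName());
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> wordCount = sc
                .textFile("hdfs://soy1:9000/mapreduces/word.txt")
                .flatMap(line -> Arrays.asList(line.split("\t")))  // split each line into words
                .mapToPair(word -> new Tuple2<>(word, 1))          // pair each word with an initial count of 1
                .reduceByKey((v1, v2) -> v1 + v2);                 // sum the counts per word

        wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
        sc.close();
    }
}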
2. Package the application into a jar and upload it to the cluster
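With the maven-assembly-plugin bound to the package phase as shown in the POM of step 4, the dependency-bundled jar is built with:

    mvn clean package
    # produces target/spark.java-0.0.1-SNAPSHOT-jar-with-dependencies.jar,
    # which is then copied to the cluster (e.g. to /usr/local/apps/ as used below)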
3. Submit the Spark application to the cluster with spark/bin/spark-submit (running in yarn-cluster mode)
bin/spark-submit --class com.mengyao.spark.java.core.WordCountApp --master yarn-cluster --num-executors 3 --driver-memory 512m --executor-cores 3 /usr/local/apps/spark.java-0.0.1-SNAPSHOT-jar-with-dependencies.jar
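For a quick local smoke test before submitting to YARN, only the master and resource flags need to change (a sketch; the HDFS paths hard-coded in the job must still be reachable from the machine you run this on):

    bin/spark-submit --class com.mengyao.spark.java.core.WordCountApp --master local[2] /usr/local/apps/spark.java-0.0.1-SNAPSHOT-jar-with-dependencies.jar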
4. The POM file is as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.mengyao</groupId>
    <artifactId>spark.java</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>spark.java</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit.version>4.10</junit.version>
        <spark.version>1.3.0</spark.version>
        <hadoop.version>2.4.1</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/java</sourceDirectory>
        <testSourceDirectory>src/main/test</testSourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass></mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>1.2.1</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>exec</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <executable>java</executable>
                    <includeProjectDependencies>true</includeProjectDependencies>
                    <includePluginDependencies>false</includePluginDependencies>
                    <classpathScope>compile</classpathScope>
                    <mainClass>com.mengyao.spark.java.core.WordCountApp</mainClass>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.6</source>
                    <target>1.6</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
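Note that the <mainClass> element in the assembly plugin's manifest is left empty; that is fine here because spark-submit receives the entry point via --class. If you also want the fat jar to be runnable on its own with java -jar, you could optionally fill it in, for example:

    <archive>
        <manifest>
            <mainClass>com.mengyao.spark.java.core.WordCountApp</mainClass>
        </manifest>
    </archive>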
Additionally, the snippet below creates an RDD from SparkSession.sparkContext(), calling the Scala API from Java (it uses the Spark 2.x SparkSession entry point rather than the JavaSparkContext used above):

SparkSession spark = SparkSession.builder()
        .master("local[2]")
        .appName("ALSCase")
        .getOrCreate();

// Create an RDD from SparkSession.sparkContext() (calling the Scala API from Java)
JavaRDD<Rating> ratingsRDD = spark.sparkContext()
        .parallelize(JavaConverters.asScalaBufferConverter(exampleData).asScala().seq(), 1, ClassManifestFactory.classType(String.class))
        .toJavaRDD()
        .map(Rating::parseRating);
System.out.println(ratingsRDD.count());
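The snippet above references a Rating class with a static parseRating(String) method and an exampleData list of strings, neither of which is shown. A minimal sketch of that hypothetical Rating bean, assuming the usual userId::movieId::rating::timestamp line format used by the Spark ML ALS example data, might look like this; exampleData would then simply be a List<String> of such lines.

import java.io.Serializable;

// Hypothetical bean assumed by the snippet above; the field layout is an assumption.
public class Rating implements Serializable {
    private final int userId;
    private final int movieId;
    private final float rating;
    private final long timestamp;

    public Rating(int userId, int movieId, float rating, long timestamp) {
        this.userId = userId;
        this.movieId = movieId;
        this.rating = rating;
        this.timestamp = timestamp;
    }

    public int getUserId() { return userId; }
    public int getMovieId() { return movieId; }
    public float getRating() { return rating; }
    public long getTimestamp() { return timestamp; }

    // Parses a line such as "0::2::3.0::1424380312" into a Rating.
    public static Rating parseRating(String str) {
        String[] fields = str.split("::");
        if (fields.length != 4) {
            throw new IllegalArgumentException("Each line must contain 4 '::'-separated fields");
        }
        return new Rating(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]),
                Float.parseFloat(fields[2]), Long.parseLong(fields[3]));
    }
}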