Spark WordCount Example (Java Version)

First, create a Maven project for Spark; I am using Eclipse here.

1. Write the WordCountApp code

package com.mengyao.spark.java.core;

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Spark WordCount program
 * @author mengyao
 *
 */
public class WordCountApp {

    public static void main(String[] args) {
        
        /**
         * 1. Create the SparkConf object and set the Spark application's configuration
         */
        SparkConf conf = new SparkConf()
                // Set the Spark application name
                .setAppName(WordCountApp.class.getSimpleName());
        /**
         * 2. Create the SparkContext object. Java programs use JavaSparkContext; Scala programs use SparkContext.
         * In Spark, the SparkContext connects to the Spark cluster and creates RDDs, accumulators, broadcast variables, etc.
         * The master setting determines which TaskScheduler backend is created (the lower-level scheduler; the
         * higher-level scheduler is the DAGScheduler), for example:
         *         setMaster("local") creates a local scheduler backend;
         *         setMaster("spark://...") creates a SparkDeploySchedulerBackend, whose start() launches a client that connects to the Spark cluster.
         * Note: the master is not set in code here; it is supplied via --master on the spark-submit command line (see step 3).
         */
        JavaSparkContext sc = new JavaSparkContext(conf);
        
        /**
         * 3. The textFile method provided by sc is defined in terms of the underlying SparkContext:
         *         def textFile(path: String): JavaRDD[String] = sc.textFile(path)
         * It reads a text file from HDFS, from a local path available on every cluster node, or from any
         * Hadoop-supported file system, and returns a JavaRDD[String] with one element per line of text.
         */
        JavaRDD<String> lines = sc.textFile("hdfs://soy1:9000/mapreduces/word.txt");

        /**
         * 4. Split each line of text into individual words
         * lines calls the flatMap transformation (its argument implements the FlatMapFunction interface)
         * and returns every word of every line.
         */
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>(){
            private static final long serialVersionUID = -3243665984299496473L;
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split("\t"));
            }
        });
        
        /**
         * 5. Tag each word with an initial count of 1
         * words calls the mapToPair transformation (its argument implements the PairFunction interface; the
         * three type parameters of PairFunction<String, String, Integer> are <input word, Tuple2 key, Tuple2 value>)
         * and returns a new RDD, a JavaPairRDD.
         */
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = -7879847028195817507L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });
        
        /**
         * 6. Count the occurrences of each distinct word
         * pairs calls the reduceByKey transformation (its argument implements the Function2 interface) to
         * reduce the values of each key, returning a JavaPairRDD in which every Tuple's key is a word and
         * its value is the total count for that word.
         */
        JavaPairRDD<String, Integer> wordCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = -4171349401750495688L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1+v2;
            }
        });
        
        /**
         * 7. Use the foreach action to trigger the job
         * In Spark, transformations only describe the computation; a job is not submitted until an action
         * (such as foreach) is invoked.
         */
        wordCount.foreach(new VoidFunction<Tuple2<String,Integer>>() {
            private static final long serialVersionUID = -5926812153234798612L;
            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1+":"+wordCount._2);
            }
        });
        
        /**
         * 8. Write the result to the file system
         *         HDFS:
         *             Using the new Hadoop API (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat):
         *                 wordCount.saveAsNewAPIHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, TextOutputFormat.class, new Configuration());
         *             Using the old Hadoop API (org.apache.hadoop.mapred.JobConf; org.apache.hadoop.mapred.OutputFormat):
         *                 wordCount.saveAsHadoopFile("hdfs://ns1/spark/wordcount", Text.class, IntWritable.class, OutputFormat.class, new JobConf(new Configuration()));
         *             Using the default text output to HDFS (check HDFS write permissions; if they are missing, run: hdfs dfs -chmod -R 777 /spark):
         *                 wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
         */
        wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
        
        /**
         * 9. Close the SparkContext and end the application
         */
        sc.close();
        
    }

}
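
For comparison, the same pipeline can be written much more compactly with Java 8 lambdas. This requires raising the maven-compiler-plugin source/target in the POM to 1.8; the class name WordCountLambdaApp is just an illustrative choice, not part of the original project. A minimal sketch:

package com.mengyao.spark.java.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

/** The same WordCount pipeline written with Java 8 lambdas (illustrative sketch). */
public class WordCountLambdaApp {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(WordCountLambdaApp.class.getSimpleName());
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("hdfs://soy1:9000/mapreduces/word.txt");
        JavaPairRDD<String, Integer> wordCount = lines
                // In Spark 1.x, flatMap expects an Iterable; in Spark 2.x it expects an Iterator
                .flatMap(line -> Arrays.asList(line.split("\t")))
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((v1, v2) -> v1 + v2);

        wordCount.saveAsTextFile("hdfs://soy1:9000/spark/wordCount");
        sc.close();
    }
}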

 

2. Package the application into a jar and upload it to the cluster
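
With the maven-assembly-plugin configured in the POM (step 4), the fat jar can be built from the project root with a standard Maven command:

mvn clean package

This produces target/spark.java-0.0.1-SNAPSHOT-jar-with-dependencies.jar, which is the jar referenced by the spark-submit command in step 3.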

 

3. Submit the Spark application to the cluster with the spark/bin/spark-submit tool (run mode: yarn-cluster)

bin/spark-submit \
  --class com.mengyao.spark.java.core.WordCountApp \
  --master yarn-cluster \
  --num-executors 3 \
  --driver-memory 512m \
  --executor-cores 3 \
  /usr/local/apps/spark.java-0.0.1-SNAPSHOT-jar-with-dependencies.jar
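
Because the application runs in yarn-cluster mode, the System.out.println output from the foreach step shows up in the executors' stdout logs (viewable through the YARN/Spark UI), not in the shell that ran spark-submit. The saved result can be checked directly on HDFS, for example:

hdfs dfs -cat hdfs://soy1:9000/spark/wordCount/part-*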

 

4. The POM file is as follows:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.mengyao</groupId>
  <artifactId>spark.java</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>spark.java</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <junit.version>4.10</junit.version>
    <spark.version>1.3.0</spark.version>
    <hadoop.version>2.4.1</hadoop.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>${junit.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>
  
  <build>
    <sourceDirectory>src/main/java</sourceDirectory>
    <testSourceDirectory>src/main/test</testSourceDirectory>
    
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass></mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>1.2.1</version>
        <executions>
          <execution>
            <goals>
              <goal>exec</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <executable>java</executable>
          <includeProjectDependencies>true</includeProjectDependencies>
          <includePluginDependencies>false</includePluginDependencies>
          <classpathScope>compile</classpathScope>
          <mainClass>com.mengyao.spark.java.core.WordCountApp</mainClass>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>

    </plugins>
  </build>
</project>

 

Appendix: creating a JavaRDD from a SparkSession (Spark 2.x) by calling the Scala API from Java:

SparkSession spark = SparkSession.builder()
        .master("local[2]")
        .appName("ALSCase")
        .getOrCreate();

// Create an RDD from SparkSession.sparkContext() (calling the Scala API from Java)
JavaRDD<Rating> ratingsRDD = spark.sparkContext()
        .parallelize(JavaConverters.asScalaBufferConverter(exampleData).asScala().seq(), 1, ClassManifestFactory.classType(String.class))
        .toJavaRDD()
        .map(Rating::parseRating);
System.out.println(ratingsRDD.count());
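
A simpler alternative to going through JavaConverters and ClassManifestFactory is to wrap the underlying SparkContext in a JavaSparkContext and use the Java API directly. A minimal sketch, assuming the same exampleData (a java.util.List<String>) and the Rating.parseRating helper referenced above:

// Wrap the Scala SparkContext to get the Java-friendly API
JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());

JavaRDD<Rating> ratingsRDD2 = jsc.parallelize(exampleData, 1)
        .map(Rating::parseRating);
System.out.println(ratingsRDD2.count());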
