spark map flatmap collect方法

spark rdd的转化方法

rdd作为抽象分布式数据集,提供了常见的算子:map、flatMap属于转化(transformation)操作,collect属于行动(action)操作

map和flatMap方法区别

flatMap会把每个输入元素映射成0个或多个输出元素,再把所有结果扁平化到同一个RDD中,因此返回的元素个数可能比原来多。
map返回的元素个数跟原来是一样的:每个输入元素经过处理后仍然对应一个输出元素(例如一行文本切分后仍作为一个整体返回)

collect返回

collect会把rdd中的所有元素拉取到driver端,返回一个List,打印出来的形式类似数组

例子

package com.learn.hadoop.spark.doc.analysis.chpater.rdd;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

import java.util.Arrays;
import java.util.Iterator;

/**
 * Demonstrates the difference between {@code map} and {@code flatMap} on a
 * {@link JavaRDD}, and shows what {@code collect} returns.
 *
 * <p>{@code map} output (one element per input line):
 * <pre>
 * [hello, spark, world]
 * [hello, java, world]
 * [hello, python, world]
 * </pre>
 *
 * <p>{@code flatMap} output (one element per word):
 * <pre>
 * hello
 * spark
 * world
 * hello
 * java
 * world
 * hello
 * python
 * world
 * </pre>
 *
 * <p>{@code collect} output:
 * <pre>
 * [[hello, spark, world], [hello, java, world], [hello, python, world]]
 * </pre>
 */
public class RddTest02 {
    /**
     * Runs the demo on a local Spark master and prints the results of
     * {@code flatMap}, {@code map} and {@code collect} to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("RddTest02");

        // try-with-resources stops the Spark context even if a job fails.
        try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) {
            JavaRDD<String> rdd = sc.parallelize(
                    Arrays.asList("hello spark world ", "hello java world", "hello python world"));

            // flatMap: every line is split into words and the words are flattened
            // into a single RDD, so the result has more elements than the input.
            JavaRDD<String> flatmap = rdd.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
            System.out.println("flat map:");
            // Keep the lambda form: the method reference System.out::println would
            // capture the PrintStream instance, which is not serializable.
            flatmap.foreach(s -> System.out.println(s));

            // map: every line is turned into exactly one element (the string form
            // of its word list), so the element count is unchanged.
            System.out.println("map");
            JavaRDD<String> rddmap = rdd.map(s -> Arrays.asList(s.split(" ")).toString());
            rddmap.foreach(s -> System.out.println(s));

            // collect returns a List; printing it shows an array-like "[a, b, c]" form.
            System.out.println("test collect");
            System.out.println(rddmap.collect());
        }
    }
}

posted @ 2020-03-03 10:36  碧海潮心  阅读(674)  评论(0编辑  收藏  举报