Spark Operators (Part 2)
1.collect
* With the foreach action, the RDD's elements are processed out on the remote cluster.
* With collect, the data distributed across the remote cluster is pulled back to the driver as a local collection.
* collect is not recommended for large datasets: it consumes a lot of network bandwidth and can overwhelm the driver.
package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class Collect {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Collect").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("wo am xues", "ni shi la ji", "zha zs", "zha df");
        JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);

        JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                // Split each line on spaces, then flatten the pieces into the result RDD.
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        // collect() pulls all elements back to the driver as a local List.
        List<String> str = result.collect();
        for (String s : str) {
            System.out.println(s);
        }
    }
}
2.count
* count is an action: it returns a value (the number of elements in the RDD) to the driver.
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * count is an action: it returns the number of elements in the RDD to the driver.
 */
public class Count {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Count").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);

        long num = javaRDD.count();
        System.out.println(num);
    }
}
3.filter
* filter keeps only the elements for which the function returns true.
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class Filter {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Filter").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);

        JavaRDD<Integer> result = javaRDD.filter(new Function<Integer, Boolean>() {
            @Override
            public Boolean call(Integer value) throws Exception {
                // Only elements for which this returns true are kept.
                return value % 2 == 0;
            }
        });

        result.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
4.coalesce
package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * coalesce returns a new RDD whose number of partitions is the value passed in.
 *
 * Reducing the partition count (for example from 100 down to 10) creates a narrow
 * dependency, so no shuffle is performed.
 *
 * Collapsing many partitions down to a single one usually calls for shuffle = true,
 * so the data is redistributed; this is used less often because it moves data
 * across the network.
 *
 * coalesce can not only reduce the number of partitions but also increase it;
 * to increase the partition count, shuffle must be set to true.
 */
public class Coalesce {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
        JavaRDD<Integer> numbers = javaSparkContext.parallelize(list, 6);

        // After a filter, some partitions may hold very little data, which easily
        // leads to data skew; coalesce is typically used to merge such partitions.
        JavaRDD<String> result = numbers.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<Integer> values) throws Exception {
                List<String> out = new ArrayList<String>();
                while (values.hasNext()) {
                    out.add(partitionIndex + " " + values.next());
                }
                return out.iterator();
            }
        }, true); // the second argument is preservesPartitioning

        result.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });

        // Merge the 6 partitions into 3; without shuffle this is a narrow dependency.
        JavaRDD<String> coalesced = result.coalesce(3);
        JavaRDD<String> labelled = coalesced.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<String> values) throws Exception {
                List<String> out = new ArrayList<String>();
                while (values.hasNext()) {
                    out.add(values.next() + " " + partitionIndex);
                }
                return out.iterator();
            }
        }, true);

        labelled.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
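The example above only shrinks the partition count. As a minimal sketch of the other direction described in the comment (assuming the result RDD with 6 partitions from the example above; the target count 12 is arbitrary), growing the partition count with coalesce requires shuffle = true:

// shuffle = true is required for coalesce to increase the partition count.
JavaRDD<String> widened = result.coalesce(12, true);
System.out.println(widened.getNumPartitions()); // prints 12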
5.flatMap
package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * flatMap first applies a map to each element, then flattens the results.
 */
public class FlatMap {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("wo am xues", "ni shi la ji", "zha zs", "zha df");
        JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);

        JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                // Split each line on spaces, then flatten the words into the result RDD.
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        result.foreach(new VoidFunction<String>() {
            @Override
            public void call(String word) throws Exception {
                System.out.println(word);
            }
        });
    }
}
6.mapPartitions
package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartition {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("MapPartition").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("wo", "shi", "lal", "woowj", "dffdds");
        JavaRDD<String> javaRDD = sparkContext.parallelize(list);

        // mapPartitions hands an entire partition to the function at once,
        // which then processes it through an iterator.
        JavaRDD<String> result = javaRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
            @Override
            public Iterator<String> call(Iterator<String> partition) throws Exception {
                List<String> out = new ArrayList<String>();
                while (partition.hasNext()) {
                    out.add(partition.next());
                }
                return out.iterator();
            }
        });

        result.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
7.MapPartitionsWithIndex
package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * mapPartitionsWithIndex also provides the index of the partition being processed.
 */
public class MapPartitionsWithIndex {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("MapPartitionsWithIndex").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("kang", "wang", "ddd", "kang1", "wang2", "ddd3");
        JavaRDD<String> javaRDD = sparkContext.parallelize(list, 4);

        JavaRDD<String> javaRDD2 = javaRDD.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<String> values) throws Exception {
                // Tag every element with the index of the partition it lives in.
                List<String> out = new ArrayList<String>();
                while (values.hasNext()) {
                    out.add(values.next() + " " + partitionIndex);
                }
                return out.iterator();
            }
        }, true); // the second argument is preservesPartitioning

        javaRDD2.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
8.Reduce
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Reduce {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7);
        JavaRDD<Integer> num = javaSparkContext.parallelize(list);

        // reduce aggregates the elements pairwise until a single value remains.
        int sum = num.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
            }
        });

        System.out.println("Final result: " + sum);
    }
}
9.ReduceByKey
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/*
 * reduceByKey is a shuffle operation.
 *
 * A shuffle has a map side and a reduce side.
 *
 * In Spark, reduceByKey applies a combiner on the map side: values are
 * pre-aggregated locally before being sent over the network, which reduces
 * network traffic and makes the operation more efficient.
 */
public class ReduceByKey {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("ReduceByKey").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Tuple2<String, Integer>> list = Arrays.asList(
                new Tuple2<String, Integer>("kang", 100),
                new Tuple2<String, Integer>("kang", 100),
                new Tuple2<String, Integer>("kang", 100),
                new Tuple2<String, Integer>("kang", 100),
                new Tuple2<String, Integer>("wang", 100));

        JavaPairRDD<String, Integer> num = javaSparkContext.parallelizePairs(list);

        // Sum the values of each key.
        JavaPairRDD<String, Integer> result = num.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
            }
        });

        result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> tuple) throws Exception {
                System.out.println(tuple);
            }
        });
    }
}
10.RePartition
package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * repartition can either increase or decrease the number of partitions of an RDD.
 * coalesce is often described as only reducing the partition count (strictly
 * speaking that is not quite true: with shuffle = true it can also increase it).
 *
 * A classic use case: when Spark SQL loads data from Hive, the number of
 * partitions of the resulting RDD is decided by the number of HDFS blocks of the
 * underlying files; that default cannot be configured directly.
 *
 * Sometimes the automatic partition count is too small; repartitioning the RDD
 * raises the degree of parallelism.
 *
 * In general, when only reducing the number of partitions, coalesce is preferred
 * because it can avoid a shuffle.
 */
public class RePartition {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("RePartition").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
        JavaRDD<Integer> numbers = javaSparkContext.parallelize(list);

        JavaRDD<String> result = numbers.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<Integer> values) throws Exception {
                List<String> out = new ArrayList<String>();
                while (values.hasNext()) {
                    out.add(values.next() + " " + partitionIndex);
                }
                return out.iterator();
            }
        }, true);

        result.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });

        // RDDs are immutable: repartition returns a new RDD, so keep the return value.
        JavaRDD<String> repartitioned = result.repartition(5);

        JavaRDD<String> result2 = repartitioned.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<String> values) throws Exception {
                List<String> out = new ArrayList<String>();
                while (values.hasNext()) {
                    out.add(values.next() + " " + partitionIndex);
                }
                return out.iterator();
            }
        }, true);

        result2.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
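A hedged sketch of the Hive scenario described in the comment (the table name and target partition count are hypothetical; assumes a SparkSession built with Hive support, which is a newer API than the SparkConf style used above):

// Hypothetical example: the partition count after the load follows the HDFS block
// count, so we repartition to raise parallelism before further processing.
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder()
        .appName("RepartitionHive")
        .enableHiveSupport()
        .getOrCreate();
JavaRDD<Row> rows = spark.sql("SELECT * FROM some_table").javaRDD();
JavaRDD<Row> tuned = rows.repartition(200); // hypothetical target partition count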
11.Sample
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * sample draws a random subset of the RDD. The fraction argument (for example 0.3)
 * means roughly 30% of the elements are sampled; with the 5 words used here that
 * might return only 1 element. The fraction is approximate, not exact.
 *
 * The number of partitions can also be set; sampling is applied per partition,
 * so roughly wordNum * fraction elements are drawn from each partition.
 */
public class Sample {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Sample").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("kwang", "wang", "is", "a", "student");
        JavaRDD<String> num = javaSparkContext.parallelize(list);

        // First argument: false means sampling without replacement (no duplicates in
        // the result); true means with replacement (duplicates are possible).
        // An optional third argument is the random seed: if omitted, a seed is chosen
        // automatically, so the result differs between runs; with a fixed seed the
        // result is reproducible, which is mainly useful for testing.
        num.sample(false, 0.8).foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
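The seeded variant mentioned in the comment looks like this (a minimal sketch, assuming the num RDD from the example above; the seed 42 is arbitrary):

// Reproducible sampling: the same seed always yields the same subset.
num.sample(false, 0.5, 42L).foreach(new VoidFunction<String>() {
    @Override
    public void call(String value) throws Exception {
        System.out.println(value);
    }
});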
12.Take
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * take(n) returns the first n elements of the RDD to the driver.
 */
public class Take {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Take").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("kang", "wang", "lala");
        JavaRDD<String> result = javaSparkContext.parallelize(list);

        // Fetch the first two elements as a local List.
        List<String> name = result.take(2);
        for (String value : name) {
            System.out.println(value);
        }
    }
}
13.TakeSample
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * take fetches elements; sample fetches elements at random.
 *
 * takeSample combines the two: it randomly samples the RDD and returns exactly
 * the requested number of elements to the driver.
 */
public class TakeSample {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("TakeSample").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("kang1", "wang1", "lala1",
                "kang2", "wang2", "lala2", "kang3", "wang3", "lala3");
        JavaRDD<String> result = javaSparkContext.parallelize(list);

        // First argument: with or without replacement; second: how many elements to
        // return; an optional third argument is the random seed. The same seed gives
        // the same elements every run; without a seed the result changes each run.
        List<String> value = result.takeSample(false, 3);
        for (String v : value) {
            System.out.println(v);
        }
    }
}
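The seeded overload mentioned in the comment can be used like this (sketch, assuming the result RDD above; the seed 42 is arbitrary):

// With a fixed seed, the same 3 elements come back on every run.
List<String> fixed = result.takeSample(false, 3, 42L);
System.out.println(fixed);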
14.Union
package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * union merges two RDDs into one without performing a shuffle. If both inputs
 * start with two partitions, the result has four partitions.
 */
public class Union {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Union").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<String> list1 = Arrays.asList("kang1", "wang1", "lala1", "kang2", "wang2", "lala2");
        List<String> list2 = Arrays.asList("kang3", "wang3", "lala3");

        JavaRDD<String> result1 = javaSparkContext.parallelize(list1, 2);
        JavaRDD<String> result2 = javaSparkContext.parallelize(list2, 2);

        JavaRDD<String> unionre = result1.union(result2);

        unionre.foreach(new VoidFunction<String>() {
            @Override
            public void call(String value) throws Exception {
                System.out.println(value);
            }
        });
    }
}
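To check the partition-count claim in the comment, the counts can be printed directly (a small sketch, assuming the RDDs from the example above):

// Each input has 2 partitions; union simply concatenates them, so 2 + 2 = 4.
System.out.println(result1.getNumPartitions()); // 2
System.out.println(result2.getNumPartitions()); // 2
System.out.println(unionre.getNumPartitions()); // 4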