java操作spark1.2.0
虽然官方更推荐使用Scala,但还是用Java试一下
package org.admln.java7OperateSpark;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Word count written against the Spark Java API.
 *
 * <p>Reads a text file from HDFS, splits every line on single spaces, counts
 * how often each token occurs, prints the counts on the driver and also saves
 * them back to HDFS.
 */
public class OperateSpark {
    /** Delimiter used to split a line into words (a single space). */
    private static final Pattern SPACE = Pattern.compile(" ");

    /**
     * Runs the word-count job.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Initialise the Spark configuration and context.
        SparkConf sparkConf = new SparkConf()
                .setAppName("JavaWordCount")
                .setMaster("spark://hadoop:7077");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        // Stop the context even when one of the actions below throws, so the
        // application does not stay registered with the master.
        try {
            // textFile also has an overload that takes a minimum number of
            // partitions as second argument; the single-argument form is used here.
            JavaRDD<String> lines =
                    ctx.textFile("hdfs://hadoop:8020/in/spark/javaOperateSpark/wordcount.txt");

            // Split each line into words.
            JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) {
                    return Arrays.asList(SPACE.split(s));
                }
            });

            // Turn every word into a (word, 1) key/value pair.
            JavaPairRDD<String, Integer> ones =
                    words.mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String t) {
                            return new Tuple2<String, Integer>(t, 1);
                        }
                    });

            // Sum the ones per word. The result feeds two actions (collect and
            // saveAsTextFile), so cache it to avoid recomputing the lineage twice.
            JavaPairRDD<String, Integer> counts =
                    ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer v1, Integer v2) {
                            return v1 + v2;
                        }
                    }).cache();

            // Print the result on the driver.
            List<Tuple2<String, Integer>> output = counts.collect();
            for (Tuple2<String, Integer> tuple : output) {
                System.out.println(tuple._1() + ":" + tuple._2());
            }

            // Persist the same result to HDFS.
            counts.saveAsTextFile("hdfs://hadoop:8020/out/spark/javaOperateSpark2/");
        } finally {
            ctx.stop();
        }
    }
}
运行的时候出现了错误
eclipse中为:
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.hash.HashFunction.hashInt(I)Lcom/google/common/hash/HashCode; at org.apache.spark.util.collection.OpenHashSet.org$apache$spark$util$collection$OpenHashSet$$hashcode(OpenHashSet.scala:261) at org.apache.spark.util.collection.OpenHashSet$mcI$sp.getPos$mcI$sp(OpenHashSet.scala:165) at org.apache.spark.util.collection.OpenHashSet$mcI$sp.contains$mcI$sp(OpenHashSet.scala:102) at org.apache.spark.util.SizeEstimator$$anonfun$visitArray$2.apply$mcVI$sp(SizeEstimator.scala:214) at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:141) at org.apache.spark.util.SizeEstimator$.visitArray(SizeEstimator.scala:210) at org.apache.spark.util.SizeEstimator$.visitSingleObject(SizeEstimator.scala:169) at org.apache.spark.util.SizeEstimator$.org$apache$spark$util$SizeEstimator$$estimate(SizeEstimator.scala:161) at org.apache.spark.util.SizeEstimator$.estimate(SizeEstimator.scala:155) at org.apache.spark.util.collection.SizeTracker$class.takeSample(SizeTracker.scala:78) at org.apache.spark.util.collection.SizeTracker$class.afterUpdate(SizeTracker.scala:70) at org.apache.spark.util.collection.SizeTrackingVector.$plus$eq(SizeTrackingVector.scala:31) at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:249) at org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:136) at org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:114) at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:787) at org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:638) at org.apache.spark.storage.BlockManager.putSingle(BlockManager.scala:992) at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:98) at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84) at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34) at 
org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29) at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62) at org.apache.spark.SparkContext.broadcast(SparkContext.scala:945) at org.apache.spark.SparkContext.hadoopFile(SparkContext.scala:695) at org.apache.spark.SparkContext.textFile(SparkContext.scala:540) at org.apache.spark.api.java.JavaSparkContext.textFile(JavaSparkContext.scala:184) at org.admln.java7OperateSpark.OperateSpark.main(OperateSpark.java:27)
shell中为:
Exception in thread "main" java.lang.VerifyError: class org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$AddBlockRequestProto overrides final method getUnknownFields.()Lcom/google/protobuf/UnknownFieldSet; at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(ClassLoader.java:800) at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142) at java.net.URLClassLoader.defineClass(URLClassLoader.java:449) ... ... at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:358) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
从shell端的报错(getUnknownFields相关的VerifyError)可以看到,是spark自带的protobuf版本和hadoop所用的protobuf版本冲突了
默认spark1.2.0的protobuf版本为2.4.1(pom.xml中的protobuf.version)
而hadoop2.2.0的为protobuf2.5.0
所以修改spark中pom.xml后重新编译生成部署包(花费一个多小时)
再运行的话shell端成功。但是eclipse端仍然报那个错误
这是因为我用的是maven引用的spark包,存在guava版本冲突:默认解析到的guava版本较旧,缺少报错中提到的HashFunction.hashInt方法
单独加一个依赖
<dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> <version>14.0.1</version> </dependency>
然后eclipse提交的话不报错了,不过任务一直循环不执行,报告资源不够
WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient memory
然后把核数加到2,内存加到1500M,可是仍然报
INFO SparkDeploySchedulerBackend: Granted executor ID app-20150111003236-0000/3 on hostPort hadoop:34766 with 2 cores, 512.0 MB RAM
也就是说核数改了,但是执行内存改不了,不知道为什么,还有就是同样的程序shell端提交就正常执行,eclipse外部提交就报内存不足
改驱动的内存也不行。
我推测有两种可能的原因
1.spark的BUG,SPARK_DRIVER_MEMORY变量默认是512M,但是外部修改不生效;
2.centos的资源和本机windows的资源混乱了,因为我看到了
ERROR SparkDeploySchedulerBackend: Asked to remove non-existent executor 2
的错误,我本机是4核,虚拟机是2核。
不知道为什么网上没有eclipse提交的示例,应该要不就是本身就不支持,会和客户端资源混乱,要不就是还没人摸透。
欲为大树,何与草争;心若不动,风又奈何。