Spark assembly jar:
spark-assembly-1.5.2-hadoop2.6.0.jar
http://blog.csdn.net/ronaldo4511/article/details/53035494
http://doc.okbase.net/qindongliang1922/archive/172752.html
name := "spark-hello"

version := "1.0"

scalaVersion := "2.11.7"

// Use the company's internal Nexus mirror
resolvers += "Local Maven Repository" at "http://dev.bizbook-inc.com:8083/nexus/content/groups/public/"

// Resolve only against the internal repository (skip Maven Central)
externalResolvers := Resolver.withDefaultResolvers(resolvers.value, mavenCentral = false)

// Hadoop dependency
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.1"

// Spark core dependency
libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "1.4.1"

// Spark SQL dependency
libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "1.4.1"

// Java servlet dependency
libraryDependencies += "javax.servlet" % "javax.servlet-api" % "3.0.1"
demo1: Reading data from HDFS with Scala:
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark reads data from HDFS
 */
def readDataFromHDFS(): Unit = {
  // Run in standalone mode, submitting to the remote Spark cluster
  val conf = new SparkConf().setMaster("spark://h1:7077").setAppName("load hdfs data")
  // jarPaths: path to this project's packaged jar, defined elsewhere
  conf.setJars(Seq(jarPaths))
  // Obtain a Spark context
  val sc = new SparkContext(conf)
  val textFile = sc.textFile("hdfs://h1:8020/user/webmaster/crawldb/etl_monitor/part-m-00000")
  // Fetch the first record
  // val data = textFile.first()
  // println(data)
  // Iterate and print; collect() brings the data back to the driver,
  // so the lines are printed locally (fine for small files)
  textFile.collect().foreach(println)
  sc.stop()
}
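For quick experiments without a cluster, the same read can be done with a local master, which removes the need for setJars. This is a minimal sketch under that assumption; the HDFS path is the one from the demo above, while the object name and the local[*] master are my own choices:

import org.apache.spark.{SparkConf, SparkContext}

object LocalReadDemo {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark inside this JVM, so no jar shipping is needed
    val conf = new SparkConf().setMaster("local[*]").setAppName("load hdfs data (local test)")
    val sc = new SparkContext(conf)
    val textFile = sc.textFile("hdfs://h1:8020/user/webmaster/crawldb/etl_monitor/part-m-00000")
    // Print a small sample rather than the whole file
    textFile.take(5).foreach(println)
    println(s"total lines: ${textFile.count()}")
    sc.stop()
  }
}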
libraryDependencies+="org.apache.spark"%"spark-sql_2.11"%"2.1.0"%"provided"
libraryDependencies+="org.apache.spark"%"spark-mllib_2.11"%"2.1.0"%"provided"
libraryDependencies+="org.apache.spark"%"spark-graphx_2.11"%"2.1.0"%"provided"
libraryDependencies+="org.apache.spark"%"spark-streaming_2.11"%"2.1.0"%"provided"
libraryDependencies+="com.github.scopt"%"scopt_2.11"%"3.5.0"
libraryDependencies+="org.apache.spark"%"spark-streaming-kafka-0-8_2.11"%"2.1.0"
libraryDependencies+="org.apache.spark"%"spark-streaming-flume_2.11"%"2.1.0"
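Since spark-streaming-kafka-0-8 is on this list, a minimal word count against the 0.8 receiver-based connector could look like the sketch below. The ZooKeeper quorum, consumer group, and topic name are placeholders, not values from the original post:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaWordCount {
  def main(args: Array[String]): Unit = {
    // At least two local threads: one for the receiver, one for processing
    val conf = new SparkConf().setMaster("local[2]").setAppName("kafka word count")
    val ssc = new StreamingContext(conf, Seconds(5))
    // zkQuorum / group / topic are placeholders; one receiver thread per topic
    val lines = KafkaUtils
      .createStream(ssc, "h1:2181", "demo-group", Map("demo-topic" -> 1))
      .map(_._2) // the stream yields (key, message) pairs; keep the message
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

Note that because the Spark modules above are marked "provided", they are not on the runtime classpath of a plain sbt run; the job is normally launched through spark-submit, which supplies those jars.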