spark
spark sql基础操作
SQLContext 可能需要自己创建（注意：在 Spark 2.x 中 SQLContext 已被标记为过时，推荐改用 SparkSession，下文出现的 deprecation warning 即由此而来）。
var sqlContext=new org.apache.spark.sql.SQLContext(sc)  // 注意 sql 与 SQLContext 之间必须用点号 . 连接，写成空格会报错（见下文错误示例）
sc 是指 spark-shell 启动时自动创建的 SparkContext 对象。
scala> val textFile=sc.textFile("file:///usr/local/spark/README.md")
textFile: org.apache.spark.rdd.RDD[String] = file:///usr/local/spark/README.md MapPartitionsRDD[1] at textFile at <console>:24
scala> textFile.count
res0: Long = 104
scala> textFile.first
res1: String = # Apache Spark
scala> textFile.filter(line=>line.contains("Spark")).count
res2: Long = 20
scala> var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
<console>:23: error: not found: value sqlContext
var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
^
scala>
scala> var sqlContext=new org.apache.spark.sql SQLContext(sc)
<console>:24: error: type sql is not a member of package org.apache.spark
var sqlContext=new org.apache.spark.sql SQLContext(sc)
^
scala> var sqlContext=new org.apache.spark.sql.SQLContext(sc)
warning: there was one deprecation warning; re-run with -deprecation for details
sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@576a8b00
scala> var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
18/06/21 22:33:50 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
18/06/21 22:33:51 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException
18/06/21 22:33:54 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> df.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
scala> df.select("name").show
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
scala> df.select(df("name"),de("age")+1).show//age +1
<console>:29: error: not found: value de
df.select(df("name"),de("age")+1).show//age +1
^
scala> df.select(df("name"),df("age")+1).show//age +1
+-------+---------+
| name|(age + 1)|
+-------+---------+
|Michael| null|
| Andy| 31|
| Justin| 20|
+-------+---------+
scala> df.filter(df("age")>21).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
scala> df.groupBy("age").count().show
+----+-----+
| age|count|
+----+-----+
| 19| 1|
|null| 1|
| 30| 1|
+----+-----+
scala> df.registerTempTable("people")
warning: there was one deprecation warning; re-run with -deprecation for details
scala> val result=sqlContext.sql("select name,age from people")
result: org.apache.spark.sql.DataFrame = [name: string, age: bigint]
scala> result.show
+-------+----+
| name| age|
+-------+----+
|Michael|null|
| Andy| 30|
| Justin| 19|
+-------+----+