spark sql 的基本用法（python）

1、初始化：

　　a)声明：

　　　　from pyspark import SparkContext, SparkConf

　　　　from pyspark.sql import HiveContext, Row

　　　　from pyspark.sql import SQLContext, Row

　　　　from pyspark.sql.types import IntegerType

　　b)创建SQL上下文环境：

　　　　sc = SparkContext(conf = SparkConf().setAppName('local'))

　　　　sqlContext = SQLContext(sc)

2、数据输入：

　　a)读取json文件：

　　　　data = sqlContext.read.json("testweet.json")

　　b)list数据转换：

　　　　datalist = sc.parallelize([Row(name = "holden", favourite = "coffee"), Row(name = "golden", favourite = "tea")])

　　　　data = sqlContext.createDataFrame(datalist)

　　　　data的类型是DataFrame。

3、DataFrame操作：

　　　　data. printSchema()：输出结构信息；

　　　　data.select(“col_name”).show()：输出列col_name；

　　　　data.filter(data.col_name > num).show()：输出列col_name中大于num的部分；

　　　　data.groupBy(“col_name”).count().show()：输出排序后的列col_name，并给出每个值的出现次数。

4、注册为临时表，使用sql语句查询：

　　a)注册：

　　　　data.registerTempTable("table") #table为表名

　　b)sql查询：

　　　　ds = sqlContext.sql("""SELECT * FROM table""")

　　c)自定义函数：

　　　　sqlContext.registerFunction("strLen", lambda x: len(x), IntegerType()) #自定义函数

　　　　strLen ds = sqlContext.sql("""SELECT strLen(col) FROM table LIMIT 10""")

5、完整代码

（1）

 1     sc = SparkContext(conf = SparkConf().setAppName('local'))
 2     sqlContext = SQLContext(sc)
 3 
 4     hpRdd = sc.parallelize([Row(name = "holden", favourite = "coffee"), Row(name = "golden", favourite = "tea")])
 5     hpSchemaRdd = sqlContext.createDataFrame(hpRdd)
 6     hpSchemaRdd.select('name').show()
 7 
 8     hpSchemaRdd.registerTempTable("happypeople")
 9     hpselect = sqlContext.sql("""SELECT * FROM happypeople WHERE favourite = 'tea'""")
10     print hpselect.collect()
11 
12     sc.stop()

View Code

（2）

    sc = SparkContext(conf = SparkConf().setAppName('local'))
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)

    #读取并查询json数据
    input = sqlContext.read.json("testweet.json")
    input.printSchema()    #输出json的结构信息：各项key及value的取值类型

    input.select("retweetCount").show()        #选取指定列输出
    input.filter(input.retweetCount == 0).show()    #筛选出retweetCount值为0的记录输出

    input.registerTempTable("tweets")    #将输入（SchemaRDD）注册为临时表
    topTweets = sqlContext.sql("""SELECT text,  retweetCount FROM tweets ORDER BY retweetCount LIMIT 10""")
    topTweets = sqlContext.sql("""SELECT text,  retweetCount FROM tweets WHERE retweetCount > 10""")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText

View Code

发表于 2017-04-24 15:46 zz.fish 阅读(847) 评论(0) 收藏举报

公告