Loading JSON data into Hive
- Check the Hive databases
hive> show databases;
OK
bigdata
default
Time taken: 0.592 seconds, Fetched: 2 row(s)
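The bigdata database used by the job below already exists here; if it did not, it could be created first (a minimal sketch):
hive> CREATE DATABASE IF NOT EXISTS bigdata;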
- Upload the JSON data to the server
[root@master data]# pwd
/home/data
[root@master data]# ls
mini.json
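Since spark.read.json is used below without the multiLine option, mini.json is expected to be line-delimited JSON: one object per line. Based on the columns the script selects, each record should carry at least the following fields (this record and its values are purely illustrative, not taken from the real file):
{"storeProvince": "广东省", "storeID": 1744, "receivable": 22.5, "dateTS": "1563758583000", "payType": "alipay"}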
- Write the code
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import StringType
if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("SparkSQL Example").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions", "2").\
        config("hive.metastore.uris", "thrift://hadoop01:9083").\
        enableHiveSupport().\
        getOrCreate()

    # Read the JSON data: drop rows whose storeProvince is missing or the literal string 'null',
    # filter out abnormally large orders, and keep only the columns we need
    df = spark.read.format("json").load("/home/data/mini.json").\
        dropna(thresh=1, subset=['storeProvince']).\
        filter("storeProvince != 'null'").\
        filter("receivable < 10000").\
        select("storeProvince", "storeID", "receivable", "dateTS", "payType")
    df.show(truncate=True)
    # Requirement 1: total sales (receivable) per province
    province_sale_df = df.groupBy("storeProvince").sum("receivable").\
        withColumnRenamed("sum(receivable)", "money").\
        withColumn("money", F.round("money", 2)).\
        orderBy("money", ascending=False)
    province_sale_df.show(truncate=False)

    # Write the result to the bigdata database in Hive as a Parquet table
    province_sale_df.write.mode("overwrite").saveAsTable("bigdata.province_sale", "parquet")
    # Requirement 2: in the top-3 provinces by sales, count the stores with daily sales over 1000
    top3_province_df = province_sale_df.limit(3).select("storeProvince").\
        withColumnRenamed("storeProvince", "top3_province")
    top3_province_df_joined = df.join(top3_province_df,
                                      on=df['storeProvince'] == top3_province_df['top3_province'])
    top3_province_df.show()
    # The joined DataFrame is reused by the requirements below, so cache it
    top3_province_df_joined.persist(StorageLevel.MEMORY_AND_DISK)

    # dateTS is a millisecond timestamp; take the first 10 digits (seconds) to derive the day
    province_hot_store_count_df = top3_province_df_joined.groupBy("storeProvince", "storeID",
            F.from_unixtime(df['dateTS'].substr(1, 10), "yyyy-MM-dd").alias("day")).\
        sum("receivable").withColumnRenamed("sum(receivable)", "money").\
        filter("money > 1000").\
        dropDuplicates(subset=["storeID"]).\
        groupBy("storeProvince").count()

    # Write out to Hive
    province_hot_store_count_df.write.mode("overwrite").saveAsTable("bigdata.province_hot_store_count", "parquet")
    # Requirement 3: average single-order amount per province in the top-3 provinces
    top3_province_order_avg_df = top3_province_df_joined.groupBy("storeProvince").\
        avg("receivable").\
        withColumnRenamed("avg(receivable)", "money").\
        withColumn("money", F.round("money", 2)).\
        orderBy("money", ascending=False)
    top3_province_order_avg_df.show(truncate=False)
    top3_province_order_avg_df.write.mode("overwrite").saveAsTable("bigdata.top3_province_order_avg", "parquet")
    # Requirement 4: share of each payment type per province in the top-3 provinces
    top3_province_df_joined.createTempView("province_pay")

    def udf_func(percent):
        # Format a ratio as a percentage string, e.g. 0.1234 -> '12.34%'
        return str(round(percent * 100, 2)) + "%"

    my_udf = F.udf(udf_func, StringType())
    pay_type_df = spark.sql("""
        SELECT storeProvince, payType, (COUNT(payType) / total) AS percent FROM
        (SELECT storeProvince, payType, count(1) OVER(PARTITION BY storeProvince) AS total FROM province_pay) AS sub
        GROUP BY storeProvince, payType, total
    """).withColumn("percent", my_udf("percent"))
    pay_type_df.show()
    pay_type_df.write.mode("overwrite").saveAsTable("bigdata.pay_type", "parquet")
    top3_province_df_joined.unpersist()
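    # Added step (not in the original script): stop the SparkSession once all tables are written,
    # so the job releases its resources cleanly
    spark.stop()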
- Verify that the tables were created in Hive
hive> use bigdata;
hive> desc province_sale;
hive> desc province_hot_store_count;
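To spot-check the contents as well, the result tables can be queried directly, for example:
hive> SELECT * FROM province_sale;
hive> SELECT * FROM province_hot_store_count;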
- Upload the code to the server and run it
[root@master day29]# pwd
/tmp/pycharm_project_115/day29
[root@master day29]# spark-submit main.py
- Error: file not found. When the job is submitted with spark-submit on the cluster, the path /home/data/mini.json is resolved against the cluster's default filesystem (HDFS) rather than the local disk, and the file does not exist there.
- Solution
# Upload the /home/data directory (which contains mini.json) to the /tmp directory on HDFS
[root@master day29]# hdfs dfs -put /home/data/ /tmp
# Check that the upload succeeded
[root@master day29]# hdfs dfs -ls -R /
drwx-wx-wx - root supergroup 0 2027-12-04 15:44 /tmp
drwxr-xr-x - root supergroup 0 2027-12-04 15:44 /tmp/data
-rw-r--r-- 1 root supergroup 124380564 2027-12-04 15:44 /tmp/data/mini.json
# Change the file path in the code to the HDFS path
[root@master day29]# pwd
/tmp/pycharm_project_115/day29
[root@master day29]# vim main.py
df = spark.read.format("json").load("/tmp/data/mini.json").\
# The job now runs successfully
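To make the read independent of the cluster's default filesystem setting, the fully qualified HDFS URI can be used instead; the NameNode host and port below are assumptions and must match the cluster's fs.defaultFS:
df = spark.read.format("json").load("hdfs://master:8020/tmp/data/mini.json").\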