
Reading JSON data into Hive

  • View the Hive databases
hive> show databases;
OK
bigdata
default
Time taken: 0.592 seconds, Fetched: 2 row(s)
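Note: the Spark code below connects to the Hive metastore over Thrift (thrift://hadoop01:9083), so the metastore service must already be running; a minimal sketch, assuming Hive is installed on hadoop01:
[root@hadoop01 ~]# nohup hive --service metastore &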
  • Upload the JSON data to the server
[root@master data]# pwd
/home/data
[root@master data]# ls
mini.json
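For reference, each line of mini.json is one JSON record. A hypothetical record with the five fields the code below selects (all values here are made up for illustration; dateTS is assumed to be a millisecond timestamp):
{"storeProvince": "湖南省", "storeID": 1001, "receivable": 128.0, "dateTS": "1563758400000", "payType": "alipay"}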
  • Write the code
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import StringType

if __name__ == '__main__':
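    # Build a Spark session with Hive support; hive.metastore.uris must point at a
    # running Hive metastore service (thrift://hadoop01:9083 here).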
    spark = SparkSession.builder.\
        appName("SparkSQL Example").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions", "2").\
        config("hive.metastore.uris", "thrift://hadoop01:9083").\
        enableHiveSupport().\
        getOrCreate()
    # Read the JSON data and clean it: drop rows with no storeProvince, drop literal
    # 'null' province strings, and filter out abnormally large receivable values.
    df = spark.read.format("json").load("/home/data/mini.json").\
        dropna(thresh=1, subset=['storeProvince']).\
        filter("storeProvince != 'null'").\
        filter("receivable < 10000").\
        select("storeProvince", "storeID", "receivable", "dateTS", "payType")
    df.show(truncate=True)
    # Requirement 1: total sales (receivable) per province
    province_sale_df = df.groupBy("storeProvince").sum("receivable").\
        withColumnRenamed("sum(receivable)", "money").\
        withColumn("money", F.round("money", 2)).\
        orderBy("money", ascending=False)
    province_sale_df.show(truncate=False)

    # Write the result to the bigdata database in Hive (stored as parquet)
    province_sale_df.write.mode("overwrite").saveAsTable("bigdata.province_sale", "parquet")

    # Requirement 2: keep only the top-3 provinces by sales and count their hot stores
    top3_province_df = province_sale_df.limit(3).select("storeProvince").withColumnRenamed("storeProvince", "top3_province")
    top3_province_df_joined = df.join(top3_province_df, on = df['storeProvince'] == top3_province_df['top3_province'])
    top3_province_df.show()
    top3_province_df_joined.persist(StorageLevel.MEMORY_AND_DISK)
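    # Per top-3 province, count the stores that have at least one day with daily sales
    # over 1000. dateTS appears to be a millisecond timestamp, so the first 10 characters
    # (substr below) give seconds for from_unixtime.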
    province_hot_store_count_df = top3_province_df_joined.groupBy("storeProvince", "storeID",
                                    F.from_unixtime(df['dateTS'].substr(0, 10), "yyyy-MM-dd").alias("day")).\
        sum("receivable").withColumnRenamed("sum(receivable)", "money").\
        filter("money > 1000").\
        dropDuplicates(subset=["storeID"]).\
        groupBy("storeProvince").count()
    # Write the result to Hive
    province_hot_store_count_df.write.mode("overwrite").saveAsTable("bigdata.province_hot_store_count", "parquet")

    # Requirement 3: average order amount per top-3 province
    top3_province_order_avg_df = top3_province_df_joined.groupBy("storeProvince").\
        avg("receivable").\
        withColumnRenamed("avg(receivable)", "money").\
        withColumn("money", F.round("money", 2)).\
        orderBy("money", ascending=False)
    top3_province_order_avg_df.show(truncate=False)
    top3_province_order_avg_df.write.mode("overwrite").saveAsTable("bigdata.top3_province_order_avg", "parquet")

    # Requirement 4: payment-type share per top-3 province
    top3_province_df_joined.createTempView("province_pay")
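    # UDF that formats a ratio in [0, 1] as a percentage string, e.g. 0.1234 -> "12.34%"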
    def udf_func(percent):
        return str(round(percent * 100, 2)) + "%"
    my_udf = F.udf(udf_func, StringType())
    pay_type_df = spark.sql("""
        SELECT storeProvince, payType, (COUNT(payType) / total) AS percent FROM
        (SELECT storeProvince, payType, count(1) OVER(PARTITION BY storeProvince) AS total FROM province_pay) AS sub
        GROUP BY storeProvince, payType, total
    """).withColumn("percent", my_udf("percent"))
    pay_type_df.show()
    pay_type_df.write.mode("overwrite").saveAsTable("bigdata.pay_type", "parquet")
    top3_province_df_joined.unpersist()
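As a side note (not how the original code does it), the percentage formatting could also be done with built-in column functions instead of a Python UDF, which avoids the per-row Python call; a minimal sketch reusing the same SQL query:
# Format the ratio as a percentage string using built-in functions only
pay_type_df = spark.sql("""
    SELECT storeProvince, payType, (COUNT(payType) / total) AS percent FROM
    (SELECT storeProvince, payType, count(1) OVER(PARTITION BY storeProvince) AS total FROM province_pay) AS sub
    GROUP BY storeProvince, payType, total
""").withColumn("percent", F.concat(F.round(F.col("percent") * 100, 2).cast("string"), F.lit("%")))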
  • Verify that the tables were created in Hive
hive> use bigdata;
hive> desc province_sale;
hive> desc province_hot_store_count;
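Once the job has been submitted (next step) and finishes, the table contents can be spot-checked as well, for example:
hive> select * from province_sale limit 3;
hive> select * from province_hot_store_count;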
  • Upload the code to the server and run it
[root@master day29]# pwd
/tmp/pycharm_project_115/day29
[root@master day29]# spark-submit main.py
  • Error: file not found. The load path has no scheme, so Spark most likely resolved /home/data/mini.json against the cluster's default filesystem (HDFS) rather than the local disk.

  • Solution: upload the file to HDFS and point the code at the HDFS path

# Upload the /home/data directory (containing mini.json) to /tmp on HDFS
[root@master day29]# hdfs dfs -put /home/data/ /tmp
# Check that the upload succeeded
[root@master day29]# hdfs dfs -ls -R /
drwx-wx-wx   - root supergroup          0 2027-12-04 15:44 /tmp
drwxr-xr-x   - root supergroup          0 2027-12-04 15:44 /tmp/data
-rw-r--r--   1 root supergroup  124380564 2027-12-04 15:44 /tmp/data/mini.json

# Change the path in the code to the HDFS path
[root@master day29]# pwd
/tmp/pycharm_project_115/day29
[root@master day29]# vim main.py
    df = spark.read.format("json").load("/tmp/data/mini.json").\

# The job now runs successfully
posted @ 2023-12-04 23:30  DogLeftover