寒假生活指导25

#coding:utf8
# Three ways to create a DataFrame
import pandas as pd
from  pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
if __name__ == '__main__':
    # Tutorial script: each section below is a stand-alone example of building
    # a Spark DataFrame. Only the JSON and CSV reader examples at the bottom
    # are active; the earlier examples are kept commented out for reference.

    # ---------------------------------------------------------------------
    # Example: RDD -> DataFrame with createDataFrame and a list of column names
    # ---------------------------------------------------------------------
    # spark=SparkSession.builder.appName("create_df").master("local[*]").getOrCreate()
    #
    # sc = spark.sparkContext
    # rdd=sc.textFile("../../data/input/sql/input/people.txt").\
    #     map(lambda x:x.split(',')).\
    #     map(lambda x:[x[0],int(x[1])])
    #
    # df=spark.createDataFrame(rdd,schema=['name','age'])
    # df.printSchema()
    # df.show()
    #
    # df.createTempView("tt")
    # spark.sql("select * from tt where age<30").show()

    # ---------------------------------------------------------------------
    # Example: RDD -> DataFrame with createDataFrame and a StructType schema
    # ---------------------------------------------------------------------
    # spark=SparkSession.builder.\
    #     appName("create_df").\
    #     config("spark.sql.shuffle.partitions","4").\
    #     getOrCreate()
    #
    # sc=spark.sparkContext
    # rdd=sc.textFile("../../data/input/sql/input/stu_score.txt").\
    #     map(lambda x:x.split(',')).\
    #     map(lambda x:(int(x[0]),x[1],int(x[2])))
    #
    # schema=StructType().\
    #     add("id",IntegerType(),nullable=False). \
    #     add("name", StringType(), nullable=True). \
    #     add("score", IntegerType(), nullable=False)
    #
    # df=spark.createDataFrame(rdd,schema=schema)
    # df.printSchema()
    # df.show()
    #
    # df.createTempView("score")
    # df2=spark.sql("select * from score where name = '数学' ")
    # df2.show()

    # ---------------------------------------------------------------------
    # Example: convert an RDD to a DataFrame with rdd.toDF
    # ---------------------------------------------------------------------
    # spark=SparkSession.builder.\
    #     appName("create_df").\
    #     config("spark.sql.shuffle.partitions","4").\
    #     getOrCreate()
    #
    # sc=spark.sparkContext
    #
    # rdd=sc.textFile("../../data/input/sql/input/stu_score.txt").\
    #     map(lambda x:x.split(',')).\
    #     map(lambda x:(int(x[0]),x[1],int(x[2])))
    #
    # schema=StructType(). \
    #     add("id",IntegerType(),nullable=False). \
    #     add("name", StringType(), nullable=True). \
    #     add("score", IntegerType(), nullable=False)
    #
    # df=rdd.toDF(['id','subject','score'])
    # df.printSchema()
    # df.show()
    #
    # df=rdd.toDF(schema=schema)
    # df.printSchema()
    # df.show()

    # ---------------------------------------------------------------------
    # Example: create a DataFrame from a pandas DataFrame
    # ---------------------------------------------------------------------
    # spark =SparkSession.builder.\
    #     appName("create_df").\
    #     master("local[*]").\
    #     getOrCreate()
    # sc=spark.sparkContext
    # pdf=pd.DataFrame({
    #     "id":[1,2,3],
    #     "name":["张大仙","吕德华","夫赖"],
    #     "age":[30,30,30]
    # })
    # df=spark.createDataFrame(pdf)
    # df.printSchema()
    # df.show()

    # ---------------------------------------------------------------------
    # Active examples: create DataFrames with the DataFrameReader API
    # ---------------------------------------------------------------------
    spark = (
        SparkSession.builder
        .appName("test")
        .master("local[*]")
        .config("spark.sql.shuffle.partitions", 2)
        .getOrCreate()
    )

    # Stop the session even if one of the reads below raises, so the local
    # Spark context is always released.
    try:
        # Example: read a text file into a DataFrame (single "name" column)
        # schema=StructType().add("name",StringType(),nullable=True)
        # df=spark.read.format("text")\
        #     .schema(schema=schema)\
        #     .load("../../data/input/sql/input/people.txt")
        # df.show()

        # Read a JSON file into a DataFrame; no schema is supplied here.
        df = spark.read.format("json").load(
            "../../data/input/sql/input/people.json"
        )
        df.printSchema()
        df.show()

        # Read a CSV file into a DataFrame: ';' is the field separator, the
        # first line is not treated as a header, and the column names/types
        # come from the explicit DDL schema string.
        df2 = (
            spark.read.format("csv")
            .option("sep", ";")
            .option("header", False)
            .option("encoding", "utf-8")
            .schema("name STRING,age INT,job STRING")
            .load("../../data/input/sql/input/people.csv")
        )
        df2.printSchema()
        df2.show()
    finally:
        spark.stop()

创建DataFrame对象

 

posted @ 2024-02-02 13:56  一个小虎牙  阅读(1)  评论(0)  编辑  收藏  举报