# 寒假生活指导25 (Winter vacation study guide, lesson 25)
# coding:utf8
# Three ways to create a DataFrame, plus reading text/json/csv files into DataFrames.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType

if __name__ == '__main__':
    # --- Method 1: createDataFrame from an RDD with a plain column-name list ---
    # spark = SparkSession.builder.appName("create_df").master("local[*]").getOrCreate()
    # sc = spark.sparkContext
    # rdd = sc.textFile("../../data/input/sql/input/people.txt"). \
    #     map(lambda x: x.split(',')). \
    #     map(lambda x: [x[0], int(x[1])])
    # df = spark.createDataFrame(rdd, schema=['name', 'age'])
    # df.printSchema()
    # df.show()
    # df.createTempView("tt")
    # spark.sql("select * from tt where age<30").show()

    # --- Method 2: createDataFrame with an explicit StructType schema ---
    # spark = SparkSession.builder. \
    #     appName("create_df"). \
    #     config("spark.sql.shuffle.partitions", "4"). \
    #     getOrCreate()
    # sc = spark.sparkContext
    # rdd = sc.textFile("../../data/input/sql/input/stu_score.txt"). \
    #     map(lambda x: x.split(',')). \
    #     map(lambda x: (int(x[0]), x[1], int(x[2])))
    # schema = StructType(). \
    #     add("id", IntegerType(), nullable=False). \
    #     add("name", StringType(), nullable=True). \
    #     add("score", IntegerType(), nullable=False)
    # df = spark.createDataFrame(rdd, schema=schema)
    # df.printSchema()
    # df.show()
    # df.createTempView("score")
    # df2 = spark.sql("select * from score where name = '数学' ")
    # df2.show()

    # --- Method 3: convert an RDD to a DataFrame with toDF() ---
    # (uses the same spark / rdd / schema setup as method 2)
    # df = rdd.toDF(['id', 'subject', 'score'])
    # df.printSchema()
    # df.show()
    # df = rdd.toDF(schema=schema)
    # df.printSchema()
    # df.show()

    # --- Method 4: create a DataFrame from a pandas DataFrame ---
    # spark = SparkSession.builder. \
    #     appName("create_df"). \
    #     master("local[*]"). \
    #     getOrCreate()
    # sc = spark.sparkContext
    # pdf = pd.DataFrame({
    #     "id": [1, 2, 3],
    #     "name": ["张大仙", "吕德华", "夫赖"],
    #     "age": [30, 30, 30]
    # })
    # df = spark.createDataFrame(pdf)
    # df.printSchema()
    # df.show()

    # Active example: read files into DataFrame objects.
    spark = SparkSession.builder. \
        appName("test"). \
        master("local[*]"). \
        config("spark.sql.shuffle.partitions", 2). \
        getOrCreate()

    # Read a text file into a single-column DataFrame.
    # schema = StructType().add("name", StringType(), nullable=True)
    # df = spark.read.format("text") \
    #     .schema(schema=schema) \
    #     .load("../../data/input/sql/input/people.txt")
    # df.show()

    # Read a json file; the schema is inferred from the json keys.
    df = spark.read.format("json"). \
        load("../../data/input/sql/input/people.json")
    df.printSchema()
    df.show()

    # Read a csv file with an explicit DDL-style schema string.
    df2 = spark.read.format("csv") \
        .option("sep", ";") \
        .option("header", False) \
        .option("encoding", "utf-8") \
        .schema("name STRING,age INT,job STRING") \
        .load("../../data/input/sql/input/people.csv")
    df2.printSchema()
    df2.show()
# 创建DataFrame对象 (creating DataFrame objects)