2.4 Summary
Day 15
Today I continued with the exercises: reading CSV and Parquet files into DataFrames, using the DSL-style API, and running SQL against temporary views.
```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # 0. Build the SparkSession entry-point object
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Read a CSV file
    df = spark.read.format("csv").\
        option("sep", ";").\
        option("header", True).\
        option("encoding", "utf-8").\
        schema("name STRING, age INT, job STRING").\
        load("../data/input/sql/people.csv")
    df.printSchema()
    df.show()
```
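The original script also imported StructType, StringType, and IntegerType, which hints at the other way to define a schema besides a DDL string. A minimal sketch of the equivalent programmatic schema (same column names and file path as above):

```python
from pyspark.sql.types import StructType, StringType, IntegerType

# Equivalent schema built from StructType objects instead of a DDL string
schema = StructType().\
    add("name", StringType(), nullable=True).\
    add("age", IntegerType(), nullable=True).\
    add("job", StringType(), nullable=True)

df = spark.read.format("csv").\
    option("sep", ";").\
    option("header", True).\
    schema(schema).\
    load("../data/input/sql/people.csv")
```

The StructType form is more verbose but lets you control nullability per column; the DDL string is the quicker option for simple schemas.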
```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # 0. Build the SparkSession entry-point object
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Read a Parquet file
    df = spark.read.format("parquet").load("../data/input/sql/users.parquet")
    df.printSchema()
    df.show()
```
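Parquet files embed their own schema, which is why no schema() call is needed here. For completeness, a minimal sketch of writing a DataFrame back out as Parquet (the output path is made up for illustration):

```python
# Write the DataFrame back out as Parquet; mode("overwrite") replaces existing output
# (the path ../data/output/users_copy is hypothetical)
df.write.mode("overwrite").format("parquet").save("../data/output/users_copy")
```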
```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # 0. Build the SparkSession entry-point object
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    df = spark.read.format("csv").\
        schema("id INT, subject STRING, score INT").\
        load("../data/input/sql/stu_score.txt")

    # Getting Column objects
    id_column = df['id']
    subject_column = df['subject']

    # DSL-style demo
    df.select(["id", "subject"]).show()
    df.select("id", "subject").show()
    df.select(id_column, subject_column).show()

    # filter API
    df.filter("score < 99").show()
    df.filter(df['score'] < 99).show()

    # where API
    df.where("score < 99").show()
    df.where(df['score'] < 99).show()

    # groupBy API
    df.groupBy("subject").count().show()
    df.groupBy(df['subject']).count().show()

    # df.groupBy returns a GroupedData object, not a DataFrame.
    # GroupedData is a grouped data structure with APIs for aggregating the groups,
    # just like SQL's GROUP BY followed by an aggregate: sum, avg, count, min, max.
    # Calling an aggregation method on GroupedData returns a DataFrame again;
    # GroupedData is only a transitional object on the way to a DataFrame result.
    r = df.groupBy("subject")
```
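The script stops at `r = df.groupBy("subject")` without using the GroupedData object. To illustrate the point made in the comments, that GroupedData only becomes a DataFrame again after an aggregation, here is a minimal sketch continuing from `r`:

```python
# Each aggregation call on GroupedData returns a DataFrame
r.count().show()        # number of rows per subject
r.avg("score").show()   # average score per subject
r.max("score").show()   # highest score per subject
```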
```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # 0. Build the SparkSession entry-point object
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    df = spark.read.format("csv").\
        schema("id INT, subject STRING, score INT").\
        load("../data/input/sql/stu_score.txt")

    # Register temporary views (tables)
    df.createTempView("score")             # register a temporary view
    df.createOrReplaceTempView("score_2")  # register or replace a temporary view
    df.createGlobalTempView("score_3")     # register a global temporary view;
    # global temp views must be queried with the global_temp. prefix

    # SQL statements are executed through the SparkSession's sql API
    spark.sql("SELECT subject, COUNT(*) AS cnt FROM score GROUP BY subject").show()
    spark.sql("SELECT subject, COUNT(*) AS cnt FROM score_2 GROUP BY subject").show()
    spark.sql("SELECT subject, COUNT(*) AS cnt FROM global_temp.score_3 GROUP BY subject").show()
```
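The difference between the view types: `createTempView` and `createOrReplaceTempView` are scoped to the current SparkSession, while a global temp view lives in the `global_temp` database and is visible to any session in the same application. A minimal sketch of that cross-session behavior, continuing from the script above:

```python
# A new session cannot see session-scoped views, but it can see global ones
new_session = spark.newSession()
new_session.sql("SELECT COUNT(*) FROM global_temp.score_3").show()  # works
# new_session.sql("SELECT COUNT(*) FROM score")  # would fail: view not found
```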