RDD to DataFrame

Build a DataFrame from an RDD of split lines by passing a column-name list to createDataFrame, then write it out in several formats:
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("boye").getOrCreate()
sc = spark.sparkContext

textFile = sc.textFile("file:///usr/local/test/urls")
rdd = textFile.map(lambda x: x.split("\t")).filter(lambda x: len(x) == 2)  # keep only well-formed two-field lines
df = spark.createDataFrame(rdd, schema=["rowkey", "url"])

df.write.format("json").mode("overwrite").save("file:///usr/local/test/outPut")  # save the data as JSON
df.write.save(path='/usr/local/test/csv', format='csv', mode='overwrite', sep='\t')  # save as a tab-separated CSV file
df.write.mode("overwrite").saveAsTable("ss")  # persist as a managed table
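With a bare name list, createDataFrame infers the column types from the data. When the types matter, an explicit StructType schema can be passed instead; a minimal sketch, reusing the same assumed input path and tab-separated layout as above:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName("boye").getOrCreate()

# Declare the two columns explicitly instead of letting Spark infer them.
schema = StructType([
    StructField("rowkey", StringType(), True),
    StructField("url", StringType(), True),
])

rdd = spark.sparkContext.textFile("file:///usr/local/test/urls") \
    .map(lambda x: x.split("\t")) \
    .filter(lambda x: len(x) == 2)

df = spark.createDataFrame(rdd, schema=schema)
df.printSchema()  # both fields come out as string, nullable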
DataFrame to RDD

The .rdd attribute of a DataFrame returns an RDD of Row objects, which can then be mapped and saved like any other RDD:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("boye").getOrCreate()
sc = spark.sparkContext

df = spark.read.json("file:///usr/local/test/01.json")
rdd = df.select("name", "age").limit(10).rdd  # .rdd yields an RDD of Row objects
rdd = rdd.map(lambda d: "{}\t{}".format(d.name, d.age))  # format each Row as a tab-separated line
rdd.saveAsTextFile("file:///usr/local/test/rdd_json")
# rdd.repartition(1).saveAsTextFile("file:///usr/local/test/rdd1")  # repartition(1) writes a single part file
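A small companion sketch of how the Row objects behind df.rdd can be consumed directly; the JSON path and the name/age fields are assumptions carried over from the snippet above:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("boye").getOrCreate()
df = spark.read.json("file:///usr/local/test/01.json")
rows = df.select("name", "age").limit(3).rdd

# Row fields can be read as attributes, by key, or converted to plain dicts.
print(rows.map(lambda r: r.asDict()).collect())             # [{'name': ..., 'age': ...}, ...]
print(rows.map(lambda r: (r["name"], r["age"])).collect())  # list of (name, age) tuples

Note that collect() pulls the results to the driver, so it is only safe here because limit() has already capped the sample size.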