Winter Vacation Study Guide 02
Today I learned how to filter and deduplicate RDDs (filter and distinct):
from pyspark import SparkConf, SparkContext

# Create a SparkConf object
conf = SparkConf().setMaster("local[*]").setAppName("test_app")
# Create a SparkContext based on the SparkConf object
sc = SparkContext(conf=conf)

########## Basic operations

# map: apply a function to every element
# rdd = sc.parallelize([1, 2, 3, 4, 5])
# def func(data):
#     return data * 10
# rdds = rdd.map(func)

# flatMap: map and then flatten the nested results
# rdd = sc.parallelize(["dwad wad wdas", "dwadw dfgawdfw dwad", "dwadwad"])
# rdds = rdd.flatMap(lambda x: x.split(" "))

# reduceByKey: group by key and reduce the values pairwise
# rdd = sc.parallelize([('男', 99), ('女', 99), ('女', 99), ('男', 99), ('男', 99), ('男', 99)])
# rdds = rdd.reduceByKey(lambda a, b: a + b)
# print(rdds.collect())

# filter: keep only the elements that satisfy a condition
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdds = rdd.filter(lambda num: num % 2 == 0)
print(rdds.collect())

# distinct: remove duplicate elements
rdd = sc.parallelize([1, 2, 3, 4, 5, 1])
rdds = rdd.distinct()
print(rdds.collect())

# Stop Spark
sc.stop()
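As a quick follow-up, filter and distinct can also be chained in a single pipeline. This is a minimal sketch of my own (the data and variable names here are made up for illustration, not from the lesson), keeping only the even numbers and then dropping duplicates:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("filter_distinct_demo")
sc = SparkContext(conf=conf)

# Chain filter and distinct: keep even numbers, then remove duplicates
rdd = sc.parallelize([1, 2, 2, 3, 4, 4, 5, 6, 6])
result = rdd.filter(lambda num: num % 2 == 0).distinct()
print(result.collect())  # e.g. [2, 4, 6]; element order may vary after distinct

sc.stop()

Note that distinct involves a shuffle, so the order of the collected results is not guaranteed to match the input order.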