Winter Vacation Study Guide 02

Today I learned about RDD filtering (filter) and deduplication (distinct) in PySpark:

from pyspark import SparkConf, SparkContext
# create the SparkConf object
conf = SparkConf().setMaster("local[*]").setAppName("test_app")
# create the SparkContext object based on the SparkConf object
sc = SparkContext(conf=conf)
########## basic structure

# map: apply a function to every element
# rdd = sc.parallelize([1, 2, 3, 4, 5])
# def func(num):
#     return num * 10
# rdds = rdd.map(func)
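# expected result of rdds.collect(): [10, 20, 30, 40, 50]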

# flatMap: map each element and flatten the nested results
# rdd = sc.parallelize(["dwad wad wdas", "dwadw dfgawdfw dwad", "dwadwad"])
# rdds = rdd.flatMap(lambda x: x.split(" "))
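# expected result of rdds.collect(): ['dwad', 'wad', 'wdas', 'dwadw', 'dfgawdfw', 'dwad', 'dwadwad']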

# reduceByKey: group values by key, then combine them pairwise
# rdd = sc.parallelize([('男', 99), ('女', 99), ('女', 99), ('男', 99), ('男', 99), ('男', 99)])
# rdds = rdd.reduceByKey(lambda a, b: a + b)
# print(rdds.collect())
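# expected output: [('男', 396), ('女', 198)] (pair order may vary)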

# filter: keep only the elements for which the predicate returns True (even numbers here)
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdds = rdd.filter(lambda num: num % 2 == 0)
print(rdds.collect())
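# expected output: [2, 4]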

# distinct: remove duplicate elements
rdd = sc.parallelize([1, 2, 3, 4, 5, 1])
rdds = rdd.distinct()
print(rdds.collect())
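# expected output: [1, 2, 3, 4, 5] (element order may vary)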

# stop the SparkContext
sc.stop()
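
To tie today's filter and distinct together with the earlier operators, here is a minimal word-count sketch. The sample sentences and the app name "word_count_demo" are made up for illustration, and it assumes the same local PySpark setup as above:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("word_count_demo")
sc = SparkContext(conf=conf)

# hypothetical sample data, just for illustration
lines = sc.parallelize(["spark makes rdd", "rdd filter and distinct", "spark rdd"])
# flatMap splits each line into words, map pairs every word with a count of 1
pairs = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
# reduceByKey sums the counts for each word
counts = pairs.reduceByKey(lambda a, b: a + b)
# filter keeps only the words that appear more than once
frequent = counts.filter(lambda kv: kv[1] > 1)
print(frequent.collect())  # e.g. [('spark', 2), ('rdd', 3)], pair order may vary

# distinct lists every unique word exactly once
unique_words = lines.flatMap(lambda line: line.split(" ")).distinct()
print(unique_words.collect())  # element order may vary

sc.stop()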

 

 

posted @ 2024-01-10 11:38  一个小虎牙