代码改变世界

【大数据】RDD计算常见场景

2022-04-06 18:09  码上起舞  阅读(166)  评论(0)  编辑  收藏  举报
一、目的
本文主要用于记录大数据学习过程中一些沉淀

from pyspark import SparkContext,SparkConf

# Build a local Spark context that uses every available CPU core.
app_conf = SparkConf().setAppName("ji").setMaster("local[*]")
sc = SparkContext(conf=app_conf)

# 1. Mean of a list of numbers.
#    Python 3 "/" is true division, so the original "x + y + 0.0" hack to
#    force a float result is unnecessary; RDD.sum() also replaces the
#    hand-rolled reduce.
data = [1, 5, 7, 10, 23, 20, 6, 5, 10, 7, 10]
rdd_data = sc.parallelize(data)
total = rdd_data.sum()    # built-in Spark action, sums at executor side
count = rdd_data.count()
avg = total / count       # true division -> float even for int inputs
print(total, count, avg)

# 2. Mode: find the most frequent value(s); when several values tie for the
#    highest frequency, return the average of the tied values.
data = [1, 5, 7, 10, 23, 20, 7, 5, 10, 7, 10]
rdd_data = sc.parallelize(data)
# Count occurrences of each value -> (value, count) pairs.
rdd_count = rdd_data.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# Highest frequency via the built-in max action instead of a manual reduce.
max_count = rdd_count.values().max()
# Keep only the values that reach that frequency (here: 7 and 10).
rdd_modes = rdd_count.filter(lambda x: x[1] == max_count).keys()
# Average of the tied modes; true division yields a float without any hack.
mode = rdd_modes.sum() / rdd_modes.count()
print(mode)

# 3. Top-N: given (name, age, score) records, return the 3 highest-scoring
#    students; the task allows ties to be broken arbitrarily.
#    RDD.top(n, key=...) keeps only N candidates per partition, which is
#    cheaper than a full sortBy followed by take(n).
students = [("LiLei", 18, 87), ("HanMeiMei", 16, 77), ("DaChui", 16, 66),
            ("Jim", 18, 77), ("Ruanhua", 18, 50)]
n = 3
rdd_data = sc.parallelize(students)
top_n = rdd_data.top(n, key=lambda x: x[2])   # descending by score
print(top_n)

# 4. Sort ascending and attach a rank index.
#    Task note: elements with equal values may receive their indices in any
#    order. sortBy replaces the original map->(x,1)->sortByKey->map detour,
#    and the result is actually materialized/printed (it was computed but
#    never shown before).
import time
start_time = time.time()
data = [1, 7, 8, 5, 3, 18, 34, 9, 0, 12, 8]
rdd_data = sc.parallelize(data)
rdd_sorted = rdd_data.sortBy(lambda x: x)          # ascending sort
index_rdd = rdd_sorted.zipWithIndex()              # (value, 0-based rank)
print(index_rdd.collect())
elapsed = time.time() - start_time
print("sparktime", elapsed)  # fixed typo: was "spakrtime"

# 5. Secondary sort
# Task: given student records (name, age, score), sort primarily by score
# descending and break score ties by age descending.
students = [("LiLei",18,87),("HanMeiMei",16,77),("DaChui",16,66),("Jim",18,77),("Ruanhua",18,50)]
rdd_data = sc.parallelize(students)
# NOTE(review): STUDENT is a project class not visible in this file.
# sortBy(lambda x: x, ascending=False) relies on STUDENT implementing rich
# comparisons (__lt__/__gt__ etc.) that order by (score, age) — confirm
# against its definition before trusting the tie-break behavior.
from bigdata.example.exercise.student import STUDENT
# Wrap tuples in STUDENT objects so Spark sorts by the class's own ordering,
# then unwrap back to plain (name, age, score) tuples for output.
rdd1 = rdd_data.map(lambda x:STUDENT(x[0],x[1],x[2])).sortBy(lambda x:x,ascending=False).map(lambda x:(x.name,x.age,x.score))
print(rdd1.collect())

# 6. Grouped mode
# Task: given (class, age) records, find the modal age of each class.
# Only the input data is visible here; the computation continues past the
# end of this chunk.
students = [("class1",15),("class1",15),("class2",16),("class2",16),("class1",17),("class2",19)]