广播变量和累加器
累加器
from pyspark import SparkContext


def f(x):
    """Add one RDD element to the shared accumulator ``num``.

    Passed to ``rdd.foreach`` below, so it is called once per element.
    ``Accumulator.add`` is used instead of ``global num; num += x``: it
    performs the same in-place add without rebinding the module-level name.
    """
    num.add(x)


# Accumulator demo: start the counter at 10, add every element of the RDD
# to it from inside ``foreach``, then read the total back on the driver.
sc = SparkContext("local", "Accumulator app")
try:
    num = sc.accumulator(10)
    rdd = sc.parallelize([20, 30, 40, 50])
    rdd.foreach(f)
    # 10 + 20 + 30 + 40 + 50 -> 150
    final = num.value
    print(final)
finally:
    # Always release the context; only one SparkContext may be active per
    # process, so leaving it running breaks any later SparkContext(...) call.
    sc.stop()
广播变量
from pyspark import SparkContext

# Broadcast-variable demo: ship a small lookup list to the workers once and
# tag every element of rdd1 with 1 if it appears in that list, else 0.
# The app name was "Accumulator app" (copied from the accumulator example);
# it now names what this snippet actually demonstrates.
sc = SparkContext("local", "Broadcast app")
try:
    list_1 = [1, 2, 3, 4]
    rdd1 = sc.parallelize([1, 1, 2, 3, 5, 6, 7])
    # NOTE(review): rdd2 is not used anywhere in this snippet; kept in case
    # code outside this view refers to it.
    rdd2 = sc.parallelize([1, 1, 2, 8])
    bc = sc.broadcast(list_1)
    # bc.value is the broadcast list; the membership test decides the flag.
    res = rdd1.map(lambda x: (x, 1) if x in bc.value else (x, 0))
    # With the inputs above: [(1, 1), (1, 1), (2, 1), (3, 1), (5, 0), (6, 0), (7, 0)]
    print(res.collect())
finally:
    # Always release the context; only one SparkContext may be active per
    # process, so leaving it running breaks any later SparkContext(...) call.
    sc.stop()