# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import math

appName = "jhl_spark_1"  # name of your application
master = "local"         # run in local (single-machine) mode
conf = SparkConf().setAppName(appName).setMaster(master)  # configure the SparkContext
sc = SparkContext(conf=conf)

# parallelize: distribute a local collection into an RDD
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data, numSlices=10)  # numSlices is the number of partitions; pick it to match the cluster size

# textFile: read external data
# rdd = sc.textFile("./c2.txt")  # read an external file line by line into an RDD
# print(rdd.collect())

# map: apply a function to each element of the data set
def my_add(l):
    return (l, l)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)  # parallelize the data set
result = distData.map(my_add)
print(result.collect())  # returns a distributed data set
# [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]

# filter: keep only the elements that satisfy a predicate
def my_filter(l):
    return l > 2

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)  # parallelize the data set into partitions
result = distData.filter(my_filter)
print(result.collect())  # returns a distributed data set
# [3, 4, 5]

# zip: pair up corresponding elements of two RDDs as tuples
x = sc.parallelize(range(0, 5))
y = sc.parallelize(range(1000, 1005))
print(x.zip(y).collect())
# [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]

# union: concatenate two RDDs
print(x.union(x).collect())
# [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]

# Action operations

# collect: return the RDD's data to the driver
rdd = sc.parallelize(range(1, 10))
print(rdd)
print(rdd.collect())
# ParallelCollectionRDD[9] at parallelize at PythonRDD.scala:475
# [1, 2, 3, 4, 5, 6, 7, 8, 9]

# collectAsMap: treat each element as a (key, value) tuple and return the RDD as a dict keyed on the first field
m = sc.parallelize([('a', 2), (3, 4)]).collectAsMap()
print(m['a'])
print(m[3])
# 2
# 4

# groupBy: group the RDD by the key produced by the supplied function
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
def fun(i):
    return i % 2
result = rdd.groupBy(fun).collect()
print([(x, sorted(y)) for (x, y) in result])
# [(0, [2, 8]), (1, [1, 1, 3, 5])]

# reduce: aggregate the data set with a binary function
rdd = sc.parallelize(range(1, 10))
result = rdd.reduce(lambda a, b: a + b)
print(result)
# 45
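
# The operations above all work element by element. As an added sketch (not part of
# the original script), the lines below combine flatMap, map and reduceByKey into a
# small word count. The sentence list is a made-up sample, and the code reuses the
# SparkContext `sc` created above; it then stops the context when finished.
lines = sc.parallelize(["a b a", "b c"])          # hypothetical sample data
words = lines.flatMap(lambda line: line.split())  # flatMap: one line -> many words
pairs = words.map(lambda w: (w, 1))               # map each word to a (word, 1) pair
counts = pairs.reduceByKey(lambda a, b: a + b)    # reduceByKey: sum the counts per word
print(counts.collect())
# [('a', 2), ('b', 2), ('c', 1)]  (order may vary)

sc.stop()  # release the SparkContext when done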