6. RDD Operators in Practice
Example 1: a classic word count. The script takes a single argument (the input path), builds an RDD from the text file, and chains the flatMap, map, and reduceByKey operators before collecting the results to the driver.

from pyspark import SparkContext, SparkConf
import sys

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: wordcount <input>", file=sys.stderr)
        sys.exit(-1)

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    # split each line into words, map every word to (word, 1),
    # then sum the counts per word
    counts = sc.textFile(sys.argv[1]) \
        .flatMap(lambda line: line.split(" ")) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda a, b: a + b)

    # collect() brings the results back to the driver for printing
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    sc.stop()
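A small optional follow-up, not part of the original script: if the output should come back ordered by frequency, the same counts RDD can be sorted with the sortBy operator before bringing results to the driver; the top-10 cutoff below is just an illustrative choice.

    # Sketch: print the 10 most frequent words instead of the full, unordered list.
    # sortBy orders the (word, count) pairs by their count, descending.
    top10 = counts.sortBy(lambda wc: wc[1], ascending=False).take(10)
    for (word, count) in top10:
        print("%s: %i" % (word, count))

Both scripts in this section read their input path from sys.argv[1], so they are meant to be launched with spark-submit (for example, spark-submit wordcount.py <input>, where wordcount.py is whatever filename the script is saved under).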
Example 2: computing an average age. The second whitespace-separated field of each input line is treated as the age; the ages are summed with reduce and divided by the record count.

from pyspark import SparkContext, SparkConf
import sys

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: avg <input>", file=sys.stderr)
        sys.exit(-1)

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    # take the second whitespace-separated field (the age) from every line
    ageData = sc.textFile(sys.argv[1]).map(lambda line: line.split(" ")[1])
    # sum all ages, then count the records and compute the average
    totalAge = ageData.map(lambda x: int(x)).reduce(lambda a, b: a + b)
    count = ageData.count()
    avgAge = totalAge / count

    print("totalAge:%s" % totalAge)
    print("count:%s" % count)
    print("avgAge:%s" % avgAge)

    sc.stop()
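One point worth noting: reduce and count are two separate actions, so the input file is read and parsed twice unless ageData is cached. A minimal single-pass alternative (a sketch, assuming the same input layout) pairs every age with a 1 and reduces both sums together:

    # Sketch: compute (sum of ages, number of records) in one pass over the RDD.
    sumCount = ageData.map(lambda x: (int(x), 1)) \
                      .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    avgAge = sumCount[0] / sumCount[1]
    print("avgAge:%s" % avgAge)

Calling ageData.cache() before the two actions would achieve a similar effect while keeping the original two-action structure.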