使用Pyspark编写wordcount程序

# Word count on manuscript using PySpark

# import regex module
import re
# import add from operator module
from operator import add

# read input file
file_in = sc.textFile('/home/an/Documents/A00_Documents/Spark4Py 20150315')

# count lines
print('number of lines in file: %s' % file_in.count())

# add up lenths of each line
#
chars = file_in.map(lambda s: len(s)).reduce(add)
print('number of characters in file: %s' % chars)

# Get words from the input file
words =file_in.flatMap(lambda line: re.split('\W+', line.lower().strip()))

# words of more than 3 characters
words = words.filter(lambda x: len(x) > 3)

# set count 1 per word
words = words.map(lambda w: (w,1))

# reduce phase - sum count all the words
words = words.reduceByKey(add)

# create tuple (count, word) and sort in descending
words = words.map(lambda x: (x[1], x[0])).sortByKey(False)

# take top 20 words by frequency
words.take(20)

# create function for hitogram of most frequent words
#

% matplotlib inline
import matplotlib.pyplot as plt
#

def histogram(words):
count = map(lambda x: x[1], words)
word = map(lambda x: x[0], words)
plt.barh(range(len(count)), count,color = 'grey')
plt.yticks(range(len(count)), word)

# Change order of tuple (word, count) from (count, word)
words = words.map(lambda x:(x[1], x[0]))
words.take(25)

# display histogram
histogram(words.take(25))

# words in one summarised statement
words = sc.textFile('/home/an/Documents/A00_Documents/Spark4Py 20150315')
.flatMap(lambda line: re.split('\W+', line.lower().strip()))
.filter(lambda x: len(x) > 3)
.map(lambda w: (w,1))
.reduceByKey(add)
.map(lambda x: (x[1], x[0])).sortByKey(False)
words.take(20)

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

Spark Python API 文档

http://spark.apache.org/docs/latest/api/python/pyspark.html

官方示例

WordCount
bin/pyspark ./examples/src/main/python/wordcount.py /tmp/text
bin/spark-submit --master local --num-executors 10 ./examples/src/main/python/wordcount.py /tmp/text
bin/spark-submit --master yarn --num-executors 10 ./examples/src/main/python/wordcount.py /tmp/text

Pi
bin/spark-submit --master local --executor-memory 4G --num-executors 10 ./examples/src/main/python/pi.py

Python SparkConf

conf = SparkConf().setAppName("AppName").set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

Python SparkContext

sc.textFile("/hdfs/path")
// 读取文件：Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.

sc.union(rdd1, rdd2)
// RDDs 并集：Build the union of a list of RDDs.

sc.textFile("/hdfs/path").collect()
// 获取所有元素：Return a list that contains all of the elements in this RDD.

sc.parallelize([0, 2, 3, 4, 6], 5)
// 使用 Python 集合创建一个 RDD，一共 5 个分片：Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance.

Python Spark RDD 交并差（交集，并集，差集）

交集: intersection
>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
>>> rdd1.intersection(rdd2).collect()
[1, 2, 3]

差集: subtract
>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
>>> rdd1.subtract(rdd2).collect()
[4, 5, 10]

并集: subtract
>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
>>> rdd1.union(rdd2).collect()
[1, 10, 2, 3, 4, 5, 1, 6, 2, 3, 7, 8]
>>> rdd1.union(rdd2).distinct().collect()
[1, 2, 3, 4, 5, 6, 7, 8, 10]

posted on 2016-01-21 09:42 gypsysunny 阅读(1029) 评论(0) 编辑收藏举报