【spark】jieba + wordcount

import sys
# Python 2-only workaround: reloading ``sys`` makes ``setdefaultencoding``
# callable again so the process-wide default string encoding can be set to
# UTF-8 (the text handled below is Chinese).
# NOTE(review): ``reload`` is not a builtin on Python 3 -- this script
# presumably targets Python 2 only; confirm before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

from os import path
import jieba
from pyspark import SparkContext
from pyspark.sql import SQLContext
#from operator import add

# Directory containing this script; the input file is looked up next to it.
thisDir = path.dirname(__file__)

# Local Spark context with a single worker thread, application name "wordCount".
sc = SparkContext(master="local[1]", appName="wordCount")
# Silence everything below ERROR so the job's own output stays readable.
sc.setLogLevel("ERROR")

# SQL entry point, used to display the final word counts as a DataFrame.
sqc = SQLContext(sc)

def wordCut(strings):
    """Segment one line of text into a list of tokens using jieba.

    Leading and trailing whitespace is stripped before segmentation. The
    tokens are returned in the order ``jieba.cut`` yields them, with no
    further filtering applied here.
    """
    return list(jieba.cut(strings.strip()))

# ---- Driver: line/character statistics and word frequencies for one file ----
fileName = 'words.txt'
# Several actions below read this RDD; caching avoids re-reading the file
# from disk for each of them.
file_in = sc.textFile(path.join(thisDir, fileName)).cache()

# print(...) with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
linesNum = file_in.count()
print('[INFO]number of lines in file %s : %d' % (fileName, linesNum))

# fold() starts from 0, so an empty file reports 0 characters instead of
# raising the way reduce() does on an empty RDD.
charsNum = file_in.map(len).fold(0, lambda x, y: x + y)
print('[INFO]number of chars in file %s : %d' % (fileName, charsNum))

# The segmented tokens feed two independent jobs (the length filter and the
# frequency count); cache them so jieba runs only once per line.
words = file_in.flatMap(wordCut).cache()
termBigger3 = words.filter(lambda word: len(word) > 3)
print('[INFO]number of words bigger than 3 in file %s : %d' % (fileName, termBigger3.count()))

# Classic word count: emit (token, 1) pairs and sum the ones per token.
wordCount = words.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y)
if wordCount.isEmpty():
    # createDataFrame cannot infer a schema from an empty RDD, so report
    # the empty result explicitly instead of crashing.
    print('[INFO]no words found in file %s' % fileName)
else:
    # Show the 20 most frequent tokens, highest count first.
    sqc.createDataFrame(wordCount, ['word', 'count']).sort('count', ascending=False).show(20)

 

posted on 2017-05-11 15:34  colipso  阅读(427)  评论(0)  编辑  收藏  举报

导航