pyspark处理hadoop上资源

1.pyspark处理存在hadoop上的seq文件

# coding=utf-8
from py4j.compat import long
from pyspark import SparkConf, SparkContext 
 

if __name__ == "__main__":

    # Spark entry point for this batch job; the app name shows up in the
    # Spark UI / YARN resource manager.
    conf = SparkConf().setAppName('VerticalSentence')
    sc = SparkContext(conf=conf)

    # Hadoop Writable class used for both the key and the value side of
    # the sequence files being read.
    writable_cls = "org.apache.hadoop.io.Text"

    # Load the crawled Twitter sequence files for January 2019 from HDFS
    # and keep only the values; the keys are discarded.
    values = sc.sequenceFile(
        "/crawler/twitter/2019-01*/*/*", writable_cls, writable_cls
    ).values()

    # Trim surrounding whitespace from every record, then persist the
    # result back to HDFS as plain text files.
    trimmed = values.map(lambda record: record.strip())
    trimmed.saveAsTextFile("/tmp/wangling/commoncrawler/seq20200629output")

 

posted @ 2020-09-14 15:33  lingwang3  阅读(233)  评论(0编辑  收藏  举报