pyspark处理hadoop上资源
1. 用 pyspark 处理存储在 Hadoop 上的 SequenceFile（seq 文件）
# coding=utf-8
"""Dump the values of Hadoop SequenceFiles to plain text via PySpark.

Reads Text/Text SequenceFiles from the crawler directories on HDFS,
strips surrounding whitespace from each value (keys are discarded),
and writes the results back to HDFS as text files.
"""
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName('VerticalSentence'))
    try:
        # Hadoop writable class used for both the key and the value
        # of the SequenceFiles being read.
        Text = "org.apache.hadoop.io.Text"
        # .values() drops the keys; only the record payloads are kept.
        result = sc.sequenceFile("/crawler/twitter/2019-01*/*/*", Text, Text).values()
        lines = result.map(lambda x: x.strip())
        lines.saveAsTextFile("/tmp/wangling/commoncrawler/seq20200629output")
    finally:
        # Release the Spark application's resources even if the job fails.
        sc.stop()