from pyspark.mllib.linalg import SparseVector
from collections import Counter
from pyspark import SparkContext
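
# Build a small term-document matrix: each document in the corpus becomes a
# SparseVector of term counts over a shared vocabulary index.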
if __name__ == "__main__":
    sc = SparkContext('local', 'term_doc')

    # Toy corpus: one document per element.
    corpus = sc.parallelize([
        "It is the east, and Juliet is the sun.",
        "A dish fit for the gods.",
        "Brevity is the soul of wit."])

    # Tokenize on whitespace; cache because the tokens RDD is reused below.
    tokens = corpus.map(lambda raw_text: raw_text.split()).cache()

    # Give each distinct token a stable integer index, then broadcast the
    # vocabulary and its size so every worker gets a read-only copy.
    local_vocab_map = tokens.flatMap(lambda words: words).distinct().zipWithIndex().collectAsMap()
    vocab_map = sc.broadcast(local_vocab_map)
    vocab_size = sc.broadcast(len(local_vocab_map))

    # Count terms per document, map each token to its vocabulary index, and
    # pack the counts into a SparseVector of length vocab_size.
    term_document_matrix = tokens \
        .map(Counter) \
        .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \
        .map(lambda index_counts: SparseVector(vocab_size.value, index_counts))

    # Each document prints as a SparseVector: (vocab size, [indices], [counts]).
    for doc in term_document_matrix.collect():
        print(doc)