gensim自然语言处理(续)

上一篇,已经实现了如何将一条语句在一个语料库中比较相似度,

发现运行的时候每次都要编译语料库,通过查找资料,可以一次性编译成语料库,存入文件

编译语料库代码 11_k.py

# -- 11_k.py: tokenise the raw documents and build the in-memory corpus --

import sys
import json

import jieba
from gensim import corpora, models, similarities

# Python 2 hack: force UTF-8 as the default string encoding so implicit
# str <-> unicode conversions below do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

# Module-level accumulator: one space-joined, tokenised document per
# line of xaa.json.  MyCorpus (defined later in the file) iterates it.
alist = []


def fenci():
    """Tokenise the "content" field of every JSON line in xaa.json.

    Each line of xaa.json is a standalone JSON object; its "content"
    value is segmented with jieba in full mode (cut_all=True, i.e. all
    candidate words) and appended to the global ``alist`` as a single
    space-separated string.
    """
    for i_text in open("xaa.json"):
        f_json = json.loads(i_text)
        kk = f_json["content"]
        item_str = jieba.cut(kk.encode("utf-8"), cut_all=True)
        alist.append(" ".join(item_str))


fenci()

class MyCorpus(object):
    """Streaming corpus: yields each pre-tokenised document as a token list."""

    def __iter__(self):
        # alist holds space-joined token strings built by fenci().
        for item_str in alist:
            yield item_str.split(' ')


Corp = MyCorpus()

# Build the word -> id mapping from the streamed corpus and persist it
# so later runs can load it instead of re-tokenising everything.
dictionary = corpora.Dictionary(Corp)
dictionary.save("bbb.dict")

# Convert every document to its bag-of-words vector and serialize the
# result in Matrix Market format for reuse by 11_main.py.
corpus = [dictionary.doc2bow(text) for text in Corp]
corpora.MmCorpus.serialize('deerwester1.mm', corpus)

编译好了 bbb.dict 和 deerwester1.mm 文件,在下一个代码 11_main.py 中直接调用
# -- 11_main.py: query the persisted corpus for similar documents --

import jieba
from gensim import corpora, models, similarities

# Load the dictionary and bag-of-words corpus produced by 11_k.py.
dictionary = corpora.Dictionary.load('bbb.dict')
corpus = corpora.MmCorpus('deerwester1.mm')

# Fit a TF-IDF model on the corpus and transform it once up front;
# individual queries reuse corpus_tfidf instead of recomputing it.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]


def read_file(i):
with open("xaa.json","rt") as f:
y = f.readlines()[i]
print y

def test_kk(test):
test_data_1 = '请假洛阳牡丹'
test_cut_raw_1 = jieba.cut(test)
doc_new = " ".join(test_cut_raw_1)

test_corpus_1 = dictionary.doc2bow(doc_new.split())

vec_tfidf = tfidf[test_corpus_1]

index = similarities.MatrixSimilarity(corpus_tfidf)

sims = index[vec_tfidf]

similarit = list(sims)

#print(list(enumerate(sims)))
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for i in range(20): #只读取前20 个数据,
print sims[i] #相似度是与元组的形式存在
k = sims[i]
read_file(k[0]) #将相似文件中相似的语句打印出来


def buss_mian():
    """Interactive loop: read a query from stdin, print similar documents."""
    while True:
        test = raw_input("please input test:")
        test_kk(test)


if __name__ == "__main__":
    buss_mian()



posted @ 2017-05-18 11:08  路金甲  阅读(576)  评论(0编辑  收藏  举报