NLP with gensim

1.

Use jieba for word segmentation and keyword extraction.

Use gensim's corpora, models, and similarities modules to build a corpus, apply the TF-IDF model, and compute sparse-matrix similarity.
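The script below only performs segmentation; for the keyword-extraction part mentioned above, jieba ships a jieba.analyse module. A minimal sketch, reusing the t1.txt path from the script below (topK and withWeight are jieba.analyse.extract_tags parameters):

import jieba.analyse

# Top 10 keywords of t1.txt ranked by TF-IDF weight (illustrative)
text = open("D:/workspace/PythonSdy/data/t1.txt", encoding='utf-8').read()
for word, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True):
    print(word, weight)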

# -*- coding: utf-8 -*-

import jieba
from gensim import corpora, models, similarities
from collections import defaultdict

# Define the data directory
work_dir = "D:/workspace/PythonSdy/data"
f1 = work_dir + "/t1.txt"
f2 = work_dir + "/t2.txt"
# Read the file contents
c1 = open(f1, encoding='utf-8').read()
c2 = open(f2, encoding='utf-8').read()
# Segment with jieba
data1 = jieba.cut(c1)
data2 = jieba.cut(c2)

# Join the segmented words into space-separated strings
data11 = " ".join(data1)
data21 = " ".join(data2)

doc1 = [data11, data21]
# print(doc1)

t1 = [[word for word in doc.split()]
      for doc in doc1]
# print(t1)

# Count word frequencies
freq = defaultdict(int)
for i in t1:
    for j in i:
        freq[j] += 1
# print(freq)

# Keep only tokens that appear at least 3 times
t2 = [[token for token in k if freq[token] >= 3]
      for k in t1]
print(t2)

# Build a dictionary from the tokenised corpus with corpora.Dictionary
dic1 = corpora.Dictionary(t2)
dic1.save(work_dir + "/yuliaoku.txt")
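# Note (illustrative, not part of the original script): the saved dictionary can be
# reloaded later with corpora.Dictionary.load(), so it does not have to be rebuilt:
# dic1 = corpora.Dictionary.load(work_dir + "/yuliaoku.txt")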

# Document to compare against the corpus
f3 = work_dir + "/t3.txt"
c3 = open(f3, encoding='utf-8').read()
# Segment with jieba
data3 = jieba.cut(c3)
data31 = " ".join(data3)
new_doc = data31
print(new_doc)

# doc2bow turns the new document into a sparse bag-of-words vector
new_vec = dic1.doc2bow(new_doc.split())
# Apply doc2bow to every document in t2 to build the training corpus
new_corpor = [dic1.doc2bow(t3) for t3 in t2]
tfidf = models.TfidfModel(new_corpor)
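# Note (illustrative): applying the model to a bag-of-words vector yields its TF-IDF
# weighted form, e.g. print(tfidf[new_vec]) -> [(word_id, weight), ...]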

# Number of features (vocabulary size)
featurenum = len(dic1.token2id.keys())

# Build a sparse-matrix similarity index (SparseMatrixSimilarity) over the TF-IDF corpus
idx = similarities.SparseMatrixSimilarity(tfidf[new_corpor], num_features=featurenum)
sims = idx[tfidf[new_vec]]
print(sims)
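sims holds one similarity score per corpus document (here, t1 and t2). A small follow-up sketch, assuming the names from the script above, that ranks the corpus documents by similarity to t3:

# Rank the corpus documents by similarity to the new document (illustrative)
ranked = sorted(enumerate(sims), key=lambda item: -item[1])
print(ranked)  # e.g. [(1, 0.73), (0, 0.12)]: (document index, similarity score)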

2. Lightweight text-similarity processing

Training an LsiModel
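The script below pulls titles out of MongoDB through a project-specific settings module, so it is not runnable on its own. A minimal self-contained sketch of the same LSI pipeline with a hard-coded corpus (the example sentences are made up for illustration):

import jieba
from gensim import corpora, models, similarities

# Toy stand-in for the MongoDB titles (illustrative data)
titles = ["你叫什么名字", "你今年几岁了", "今天天气怎么样"]
all_doc_list = [list(jieba.cut_for_search(t)) for t in titles]

dictionary = corpora.Dictionary(all_doc_list)               # word -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]  # bag-of-words corpus
lsi = models.LsiModel(corpus)                               # LSI model over the corpus

query_vec = dictionary.doc2bow(list(jieba.cut_for_search("你的名字是什么")))
index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))
print(index[lsi[query_vec]])  # similarity of the query against each title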

import jieba
from gensim import corpora
from gensim import models
from gensim import similarities

from settings import MONGO_DB


content_list = []  # holds the titles read from the database
for i in MONGO_DB.content.find():  # query the database; find() returns a cursor (generator)
    content_list.append(i.get("title"))

# Build the corpus
l1 = content_list
all_doc_list = []  # list of jieba-segmented documents
for doc in l1:
    doc_list = [word for word in jieba.cut_for_search(doc)]
    all_doc_list.append(doc_list)
dictionary = corpora.Dictionary(all_doc_list)  # build the dictionary (word -> id), e.g. {'什么': 0, '你': 1, '名字': 2, '是': 3, '的': 4, '了': 5, '今年': 6}
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]  # bag-of-words corpus, e.g. [(1, 1), (5, 1), (6, 1), (7, 1)]
lsi = models.LsiModel(corpus)  # train an LSI model on the bag-of-words corpus
# (each document becomes a list of (topic, weight) pairs in LSI space)
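# Note (assumption, not in the original): LsiModel defaults to num_topics=200; an explicit
# topic count and id2word mapping are commonly passed, e.g.
# lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)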

# Segment the user's voice message (recognized via Baidu AI speech-to-text) with jieba and query the corpus
def my_gensim(ai_msg):
    doc_test_list = [word for word in jieba.cut_for_search(ai_msg)]  # segment the query
    doc_test_vec = dictionary.doc2bow(doc_test_list)  # bag-of-words vector for the query

    # Compute text similarity:
    # build a sparse-matrix similarity index from the LSI-transformed main corpus
    index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))
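    # Note (illustrative suggestion): building the index on every call is costly; it could
    # be built once at module level, next to `lsi` above, and reused here.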
    # project the query vector into LSI space and compare it against every corpus document
    sim = index[lsi[doc_test_vec]] 
    print(sim,enumerate(sim))
    cc = sorted(enumerate(sim), key=lambda item: -item[1])  # sort by similarity, descending
    print(cc)
    if cc[0][1] > 0.58:
        text = l1[cc[0][0]]
    else:
        text = None

    return text

print(my_gensim('xiaoxiao 小的'))

 
