doc2vec
预处理+模型训练:
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 6 15:35:06 2022
datetime:2019/9/24
coding:utf-8
project name:Pycharm_workstation
Program function: train a doc2vec paragraph-vector model on Chinese Wikipedia
@author: wex
"""
import jieba
import gensim.models as g
from gensim.corpora import WikiCorpus
import logging  # progress logging during training
from langconv import *  # traditional -> simplified Chinese conversion (project-local)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class TaggedWikiDocument(object):
    """Iterable adapter turning a gensim WikiCorpus into TaggedDocument objects.

    Each article's text is converted to simplified Chinese, segmented with
    jieba, and tagged with the article title, the format Doc2Vec expects.
    """

    def __init__(self, wiki):
        self.wiki = wiki
        # metadata=True makes get_texts() yield (content, (page_id, title))
        self.wiki.metadata = True

    def __iter__(self):
        # Hoisted out of the loop: the original built a new Converter('zh-hans')
        # for every text chunk of every article; one instance suffices.
        converter = Converter('zh-hans')
        for content, (page_id, title) in self.wiki.get_texts():
            words = [w for c in content
                     for w in jieba.cut(converter.convert(c))]
            yield g.doc2vec.TaggedDocument(words=words, tags=[title])


def my_function():
    """Train a Doc2Vec model on the zhwiki dump and save it to disk."""
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, dictionary={})
    documents = TaggedWikiDocument(wiki)
    # dm=0 selects the DBOW model (dm=1, the default, would select DM);
    # dbow_words=1 also trains word vectors alongside the document vectors.
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, vector_size=192,
                      window=8, min_count=19, epochs=5, workers=8)
    model.save('./data/doc/zhiwiki_news.doc2vec')  # persist the trained model


if __name__ == '__main__':
    my_function()
# NOTE: gensim 4.x renamed g.doc2vec.LabeledSentence -> gensim.models.doc2vec.TaggedDocument
相似度计算doc2vec_sim.py
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 24 08:59:35 2022
datetime:2019/9/25
coding:utf-8
project name:Pycharm_workstation
Program function: compute web-page similarity using paragraph vectors
@author: wex
"""
import jieba
import gensim.models as g
import codecs
import numpy as np

model_path = 'zhiwiki_news.doc2vec'
start_alpha = 0.01   # initial learning rate used during vector inference
infer_epoch = 1000   # number of inference passes over the document
docvec_size = 192    # dimensionality of the paragraph vector


def simlarityCalu(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Returns 0 when either vector has zero norm (avoids division by zero).
    """
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity


def doc2vec(file_name, model):
    """Segment the text file with jieba and infer its paragraph vector.

    :param file_name: path of a UTF-8 text file
    :param model: a loaded gensim Doc2Vec model
    :return: the inferred document vector (numpy array)
    """
    # 'with' guarantees the file handle is closed; the original left it open.
    with codecs.open(file_name, 'r', 'utf-8') as f:
        doc = [w for line in f for w in jieba.cut(line.strip())]
    # gensim 4.x renamed infer_vector's 'steps' parameter to 'epochs'
    # (the training script already uses the 4.x API: vector_size/epochs).
    doc_vec_all = model.infer_vector(doc, alpha=start_alpha, epochs=infer_epoch)
    return doc_vec_all


if __name__ == '__main__':
    model = g.Doc2Vec.load(model_path)
    p1 = 'P1.txt'
    p2 = 'P2.txt'
    # vectorize both documents, then compare
    P1_doc2vec = doc2vec(p1, model)
    P2_doc2vec = doc2vec(p2, model)
    print(simlarityCalu(P1_doc2vec, P2_doc2vec))