doc2vec

Preprocessing + model training:

# -*- coding: utf-8 -*-
"""
Created on Thu Oct  6 15:35:06 2022
Program function: train a doc2vec paragraph-vector model
@author: wex
"""
import jieba
import gensim.models as g
from gensim.corpora import WikiCorpus
import logging  # training progress logging
from langconv import *  # traditional-to-simplified Chinese conversion

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

class TaggedWikiDocument(object):
    """Stream TaggedDocument objects from a Wikipedia dump, one per article."""
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True  # make get_texts() also yield (page_id, title)

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            # Convert each token to simplified Chinese, segment it with jieba,
            # and tag the resulting document with the article title.
            yield g.doc2vec.TaggedDocument(
                words=[w for c in content for w in jieba.cut(Converter('zh-hans').convert(c))],
                tags=[title])
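# Each yielded item has the form (illustrative example; actual words and
# titles depend on the dump):
#   TaggedDocument(words=['数学', '是', '利用', ...], tags=['数学'])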

def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    # gensim < 4.0: wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    # Passing an empty dictionary skips building a vocabulary we don't need here.
    wiki = WikiCorpus(zhwiki_name, dictionary={})

    documents = TaggedWikiDocument(wiki)
    # dm selects the training algorithm: dm=1 (the default) is PV-DM; dm=0 is PV-DBOW.
    # dbow_words=1 additionally trains word vectors alongside the document vectors.
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, vector_size=192, window=8,
                      min_count=19, epochs=5, workers=8)
    model.save('./data/doc/zhiwiki_news.doc2vec')  # save the trained model

if __name__ == '__main__':
    my_function()
    # note: gensim.models.doc2vec.LabeledSentence was renamed to
    # gensim.models.doc2vec.TaggedDocument in newer gensim releases
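
Once training finishes, the saved model can be reloaded and queried: each document tag is a Wikipedia article title, so model.dv (gensim 4.x; model.docvecs in older releases) looks up article vectors by title. A minimal sanity-check sketch, assuming the save path above and gensim 4.x; the title '数学' is only an illustrative key and must exist in your dump:

# -*- coding: utf-8 -*-
# Sanity check of the trained model (sketch; path and title are assumptions).
import gensim.models as g

model = g.Doc2Vec.load('./data/doc/zhiwiki_news.doc2vec')
print(len(model.dv))  # number of tagged articles in the model

# Articles most similar to a given title:
for title, score in model.dv.most_similar('数学', topn=5):
    print(title, score)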
Similarity computation: doc2vec_sim.py

# -*- coding: utf-8 -*-
"""
Created on Sat Sep 24 08:59:35 2022
Program function: compute web-page similarity with paragraph vectors
@author: wex
"""

import jieba
import gensim.models as g
import codecs
import numpy as np

model_path = './data/doc/zhiwiki_news.doc2vec'  # path the training script saved to
start_alpha = 0.01
infer_epoch = 1000
docvec_size = 192  # dimensionality of the paragraph vectors


def simlarityCalu(vector1, vector2):
    # Cosine similarity between two document vectors; returns 0 for zero vectors.
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        similarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        similarity = 0
    return similarity
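
# Quick sanity check of simlarityCalu (doctest-style, illustrative values only):
#   >>> simlarityCalu(np.array([1.0, 0.0]), np.array([2.0, 0.0]))
#   1.0   (parallel vectors)
#   >>> simlarityCalu(np.array([1.0, 0.0]), np.array([0.0, 1.0]))
#   0.0   (orthogonal vectors)
#   >>> simlarityCalu(np.array([1.0, 0.0]), np.zeros(2))
#   0     (zero-vector guard)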

def doc2vec(file_name, model):
    # Tokenize the document with jieba
    with codecs.open(file_name, 'r', 'utf-8') as f:
        doc = [w for line in f.readlines() for w in jieba.cut(line.strip())]
    # Infer a paragraph vector for the token list.
    # (infer_vector's `steps` argument was renamed to `epochs` in gensim 4.0.)
    doc_vec_all = model.infer_vector(doc, alpha=start_alpha, epochs=infer_epoch)
    return doc_vec_all
    
if __name__ == '__main__':
    model = g.Doc2Vec.load(model_path)
    p1 = 'P1.txt'
    p2 = 'P2.txt'
    # Infer a vector for each document, then compare them
    P1_doc2vec = doc2vec(p1, model)
    P2_doc2vec = doc2vec(p2, model)
    print(simlarityCalu(P1_doc2vec, P2_doc2vec))
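
Note that infer_vector is stochastic, so repeated runs give slightly different scores. For documents that were part of the training corpus, you can also compare them directly by tag with the model's built-in cosine similarity; a minimal sketch assuming gensim 4.x, with placeholder titles that must exist in the dump:

import gensim.models as g

model = g.Doc2Vec.load('./data/doc/zhiwiki_news.doc2vec')  # assumed save path
# Cosine similarity between two trained articles, looked up by tag
# (gensim 4.x API; the titles below are placeholders).
print(model.dv.similarity('数学', '物理学'))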