python word2vector (三)
下载《三体》文本文件,将其重命名为 santi.txt,并放在与程序相同的目录下。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 1 10:13:28 2018
@author: luogan
"""
#!/bin/bash
# -*-coding=utf-8-*-
import jieba
import re
from gensim.models import word2vec
import multiprocessing
import gensim
def segment_text(source_corpus, train_corpus, coding, punctuation):
    """Segment a raw corpus with jieba and write it out as space-separated tokens.

    Every line of ``source_corpus`` is stripped of surrounding whitespace, has
    each character listed in ``punctuation`` removed, is cut into words with
    ``jieba.cut`` and is written to ``train_corpus`` as the space-joined tokens
    followed by a newline.

    :param source_corpus: path of the raw text corpus to read
    :param train_corpus: path of the segmented corpus to write (overwritten)
    :param coding: text encoding used for both files
    :param punctuation: string whose individual characters are removed from
        the text before segmentation; may be empty, in which case nothing is
        removed
    :return: None
    """
    # Escape the characters so that ']', '\\', '^' and '-' occurring inside
    # ``punctuation`` are matched literally instead of closing the character
    # class early or forming an accidental range.  Compiled once, outside the
    # per-line loop.
    strip_punctuation = (
        re.compile('[{0}]+'.format(re.escape(punctuation))) if punctuation else None
    )
    with open(source_corpus, 'r', encoding=coding) as f, \
            open(train_corpus, 'w', encoding=coding) as w:
        for line in f:
            line = line.strip()
            # Remove punctuation.
            if strip_punctuation is not None:
                line = strip_punctuation.sub('', line)
            # Word segmentation.
            words = jieba.cut(line)
            # Terminate every output line: without a separator the last token
            # of one line is glued to the first token of the next line.
            w.write(' '.join(words) + '\n')
#if __name__ == '__main__':
# Extended punctuation set (CJK and typographic symbols).
# The apostrophe and the backslash are escaped: left bare, the apostrophe ends
# the literal after three characters and the rest of the line is a SyntaxError.
# NOTE(review): several ASCII characters in this list look like they may have
# been full-width forms in the list this was copied from — confirm upstream.
strict_punctuation = '。,、\':∶;?‘’“”〝〞ˆˇ﹕︰﹔﹖﹑·¨….¸;!´?!~—ˉ|‖"〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪%*﹡﹢﹦﹤‐ ̄¯―﹨ˆ˜﹍﹎+=<__-\\ˇ~﹉﹊()〈〉‹›﹛﹜『』〖〗[]《》〔〕{}「」【】︵︷︿︹︽_﹁﹃︻︶︸﹀︺︾ˉ﹂﹄︼'
# Basic punctuation set (ASCII punctuation plus a right single quotation mark).
simple_punctuation = '’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# All characters that are removed from the corpus before segmentation.
punctuation = simple_punctuation + strict_punctuation
# Encoding of the corpus files (alternative that was tried: "gb18030").
coding = 'utf-8'
# Raw corpus.
source_corpus_text = 'santi.txt'
# Dimensionality of each word vector.
size = 10
# Context window scanned during training: a window of 5 considers the 5 words
# before and the 5 words after the current word.
window = 5
# Minimum word frequency; words that occur fewer times than this are dropped.
# 1 keeps every word.
min_count = 1
# Number of training workers; here one per CPU core of the current machine.
workers = multiprocessing.cpu_count()
# Segmented corpus.
train_corpus_text = 'words.txt'
# word2vec model file; the vector size is part of the file name.
model_text = 'w2v_size_{0}.model'.format(size)
# Segment the raw corpus.  TODO: comment out once the segmented file exists.
segment_text(source_corpus_text, train_corpus_text, coding, punctuation)
# Train the word2vec model.  TODO: comment out once the model has been saved.
sentences = word2vec.Text8Corpus(train_corpus_text)
# gensim 4.0 renamed the ``size`` keyword of Word2Vec to ``vector_size``; pick
# the name the installed release understands so the script runs on 3.x and 4.x.
_size_keyword = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = word2vec.Word2Vec(sentences=sentences, window=window, min_count=min_count,
                          workers=workers, **{_size_keyword: size})
# Persist the trained model so later runs can load it instead of retraining.
model.save(model_text)
# Load the trained model from disk.
model = gensim.models.Word2Vec.load(model_text)
# All queries below go through ``model.wv``: the same-named methods on the
# model object itself are deprecated in gensim 3.x and removed in 4.0.

# Words most similar to a given word, most similar first.
similar_words = model.wv.most_similar('文明')
for word, score in similar_words:
    print(word, score)
# Cosine similarity between two words.
sim1 = model.wv.similarity('飞船', '爱情')
print(sim1)
# Cosine similarity between two sets of words.
list1 = ['三体', '物理']
list2 = ['相对论', '量子']
list_sim1 = model.wv.n_similarity(list1, list2)
print(list_sim1)
# Pick the word that does not belong with the others.
# (Named so that it no longer shadows the builtin ``list``.)
odd_one_out_candidates = ['上帝', '葡萄', '基督', '爱']
print(model.wv.doesnt_match(odd_one_out_candidates))
人类 0.9959995150566101
信息化 0.9937461614608765
宇宙 0.9928457736968994
数字化 0.9915485978126526
第三方 0.9911766052246094
世界 0.9876834154129028
三体 0.9861425161361694
踪迹 0.985514760017395
达尔文 0.9851547479629517
退回 0.9846546649932861
0.8862147950518867
0.9169202194786192
葡萄
# Convert a word into its vector: look up the embedding stored for '三体'.
vector = model.wv['三体'] # numpy vector of a word
print(vector)
[-6.710487 1.0552257 -3.034105 -4.0415897 4.609099 2.3800325
-0.9013142 2.425477 0.3875136 2.4119503]