Training a Word Vector Model on Chinese Wikipedia Data
First, use the get_texts method of gensim's WikiCorpus to convert each Wikipedia article into a single line of text, with punctuation stripped.
Chinese Wikipedia dump download: https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
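If you are working in Colab, the dump can be fetched straight into your mounted Drive with wget (a sketch; the destination path is an assumption, adjust it to your own setup):
# !wget -P "/content/drive/My Drive/" https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2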
from gensim.corpora import WikiCorpus

path_to_wiki_dump = "/content/drive/My Drive/zhwiki-latest-pages-articles.xml.bz2"
space = " "
doc_count = 0
# Pass an empty dict so WikiCorpus skips building its own vocabulary;
# lemmatize=False is a gensim 3.x argument (it was removed in gensim 4).
wiki = WikiCorpus(path_to_wiki_dump, lemmatize=False, dictionary={})
with open('/content/drive/My Drive/wiki_zh.txt', 'w', encoding='utf-8') as output_pipeline:
    for text in wiki.get_texts():
        sentence_data = space.join(text) + "\n"
        output_pipeline.write(sentence_data)
        doc_count += 1
        if doc_count % 10000 == 0:
            print(sentence_data)
            print("saved", doc_count, "articles")
            print("----" * 10)
print("finished with", doc_count, "articles.")
Next, we use OpenCC to convert Traditional Chinese to Simplified Chinese:
# !pip install opencc-python-reimplemented
from opencc import OpenCC

cc = OpenCC('t2s')  # convert from Traditional Chinese to Simplified Chinese
input_path = "/content/drive/My Drive/wiki_zh.txt"
output_path = "/content/drive/My Drive/wiki_zh_simp.txt"
with open(input_path, 'r', encoding='utf-8') as input_pipeline, \
        open(output_path, 'w', encoding='utf-8') as output_pipeline:
    for index, line_text in enumerate(input_pipeline):
        converted_text = cc.convert(line_text)
        output_pipeline.write(converted_text)
        if index % 10000 == 0:
            print(index)
            print(repr(converted_text))
            print("----" * 10)
print("finished with", index + 1, "lines.")
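As a quick sanity check, the converter can be exercised on a single string first:

from opencc import OpenCC

cc = OpenCC('t2s')
print(cc.convert('漢語維基百科'))  # -> 汉语维基百科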
Next, use jieba to segment the simplified text into words:
# !pip install jieba
import jieba

input_path = "/content/drive/My Drive/wiki_zh_simp.txt"
output_path = "/content/drive/My Drive/wiki_zh_simp_seg.txt"
with open(input_path, 'r', encoding='utf-8') as input_pipeline, \
        open(output_path, 'w', encoding='utf-8') as output_pipeline:
    for index, line_text in enumerate(input_pipeline):
        # jieba passes the trailing newline through as a token,
        # so each article stays on its own line after joining.
        words = ' '.join(jieba.cut(line_text))
        output_pipeline.write(words)
        if index % 10000 == 0:
            print(index)
            print(repr(words))
            print("----" * 10)
print("finished with", index + 1, "lines.")
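jieba's default (accurate-mode) segmentation can be previewed on a short sentence, using the classic example from its README:

import jieba

print(' '.join(jieba.cut('我来到北京清华大学')))
# expected output: 我 来到 北京 清华大学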
Finally, use gensim to train the word vectors (this step takes roughly 75 minutes on Colab).
Related documentation: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
# !pip install gensim
import logging
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

file_dir = "/content/drive/My Drive/"
input_path = file_dir + "wiki_zh_simp_seg.txt"
output_path = file_dir + "wiki_zh_text_model"

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# size=400 is the gensim 3.x argument name; train with one worker per CPU core.
model = Word2Vec(LineSentence(input_path), size=400, workers=multiprocessing.cpu_count())
model.save(output_path)
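If you are on gensim 4.x instead, size was renamed to vector_size (and iter to epochs), so the equivalent call would be:

model = Word2Vec(LineSentence(input_path), vector_size=400,
                 workers=multiprocessing.cpu_count())
model.save(output_path)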
Test the trained word vectors:
from gensim.models import Word2Vec

file_dir = "/content/drive/My Drive/"
model = Word2Vec.load(file_dir + "wiki_zh_text_model")
print(model.wv[u"足球"])  # look up a word's vector through model.wv
words = model.wv.most_similar(u"足球")  # most_similar also lives on model.wv
for word, similarity in words:
    print(word, similarity)
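The KeyedVectors interface also supports pairwise similarity and analogy-style queries; for example (the query words here are illustrative):

# cosine similarity between two words
print(model.wv.similarity(u"足球", u"篮球"))
# analogy query: positive terms attract, negative terms repel
print(model.wv.most_similar(positive=[u"国王", u"女人"], negative=[u"男人"], topn=5))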