NLP (1): Using word2vec
I. Handling short sentences
from gensim.models import Word2Vec

sentences = [["Python", "深度学习", "机器学习"], ["NLP", "深度学习", "机器学习"]]
model = Word2Vec(sentences, min_count=1)
Note: passing a built-in Python list as input is convenient, but when the input is large it consumes a lot of memory.
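As a quick follow-up, the trained toy model can already be queried. This is a minimal sketch; on such a tiny corpus the numbers it prints are essentially random:

print(model.wv["Python"].shape)                   # a 100-dimensional vector by default
print(model.wv.most_similar("深度学习", topn=2))  # nearest neighbours by cosine similarity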
II. When the corpus is a file
1. Gensim only requires the input to be iterable (an iterator is fine), so there is no need to keep everything in memory: supply one sentence, load and process it, forget it, then load the next one.
2. Usually the corpus is stored in files. First, make sure that every line of the corpus file corresponds to one sentence (already tokenized, with tokens separated by spaces); a minimal streaming example is sketched below, and the following sections show the same idea for a whole directory and for a single file.
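A minimal sketch of such a streaming input for a single file (the class name CorpusFile and the file name corpus.txt are only illustrative; gensim's own LineSentence, shown later, does the same job):

from gensim.models import Word2Vec

class CorpusFile(object):
    """Yield one tokenized sentence per line without loading the whole file."""
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        # Word2Vec makes several passes over the corpus, so the input must be
        # re-iterable; a plain generator object is not enough.
        with open(self.path, encoding="utf8") as f:
            for line in f:
                yield line.split()

sentences = CorpusFile("corpus.txt")   # one segmented, space-separated sentence per line
model = Word2Vec(sentences, min_count=1)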
III. Working with all files in a directory
These files have already been tokenized. If the words in them need further preprocessing, such as removing digits or extracting named entities, all of that can be done inside the MySentences iterator, so that word2vec receives an iterator of fully processed sentences.
import os
import gensim

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # stream every file in the directory, one tokenized sentence per line
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

sentences = MySentences('/some/directory')  # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)
IV. For a single file
class: gensim.models.word2vec.LineSentence
Each line corresponds to one sentence (already tokenized, with tokens separated by spaces), so we can use LineSentence to convert a txt file directly into the required format.
What LineSentence does: iterate over a file that contains sentences, one line = one sentence; words must already be preprocessed and separated by whitespace.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import get_tmpfile

inp = 'wiki.zh.text.jian.seg.txt'       # input corpus, already segmented
sentences = LineSentence(inp)
path = get_tmpfile("word2vec.model")    # get a temporary file path
model = Word2Vec(sentences, size=100, window=5, min_count=1)  # size -> vector_size in gensim 4.x
model.save("word2vec.model")
gensim.models.word2vec.LineSentence(source, max_sentence_length=10000, limit=None)
A preprocessing class: max_sentence_length caps the length of each sentence (in tokens) and limit caps the number of lines read from the file.
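A quick sketch of those two parameters (the file name segment.txt is just an example):

from gensim.models.word2vec import LineSentence

# Read only the first 1000 lines; any line longer than max_sentence_length tokens
# is yielded in chunks of at most that many tokens.
sentences = LineSentence("segment.txt", max_sentence_length=10000, limit=1000)
for i, sent in enumerate(sentences):
    print(len(sent), sent[:5])
    if i == 2:
        break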
Once we have the segmented file, a typical NLP pipeline would also remove stop words. But the word2vec algorithm relies on context, and that context may itself consist of stop words, so for word2vec we can skip stop-word removal.
V. Getting a corpus
1. https://files-cdn.cnblogs.com/files/pinard/in_the_name_of_people.zip
or
class gensim.models.word2vec.Text8Corpus(fname, max_sentence_length=10000)
Bases: object
Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip; the max_sentence_length parameter caps the length of each yielded sentence.
2. Code
import gensim
import jieba
import jieba.analyse
from gensim.models import Word2Vec

# Segment the raw novel with jieba and write it out as space-separated text
with open("in_the_name_of_people.txt", encoding="utf8") as f:
    document = f.read()
document_cut = jieba.cut(document)
result = " ".join(document_cut)
with open("segment.txt", "w", encoding="utf8") as fout:
    fout.write(result)

sentences = gensim.models.word2vec.LineSentence("segment.txt")
model = Word2Vec(sentences, hs=0, min_count=5, window=5, size=100)  # size -> vector_size in gensim 4.x
# window=5          context window size
# min_count=5       ignore low-frequency terms
# sg=0              use CBOW (sg=1 would be skip-gram)
# hs=0              use negative sampling (hs=1 would be hierarchical softmax)
# negative=5        number of negative samples (usually 5-20)
# ns_exponent=0.75  smoothing exponent of the negative-sampling distribution
# sample=0.001      threshold for downsampling high-frequency words
model.save("word2vec.model")

model = Word2Vec.load("word2vec.model")
for key in model.wv.similar_by_word('检察院', topn=10):
    print(key)
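A few more common queries on the model trained above (a sketch; every word looked up must have survived the min_count=5 cutoff, otherwise a KeyError is raised):

print(model.wv.similarity('检察院', '人民'))               # cosine similarity between two words
print(model.wv.most_similar(positive=['检察院'], topn=5))  # nearest neighbours
print(model.wv['检察院'].shape)                            # the raw 100-dimensional vector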
Loading a model from a .bin file:
import jieba
import gensim
import torch
import torch.nn as nn

# Load pretrained word vectors from a binary word2vec file
model = gensim.models.KeyedVectors.load_word2vec_format('baike_26g_news_13g_novel_229g.bin', binary=True)

sentence1 = "北京是中华人民共和国的首都"
sentence2 = "人民民主"
cut1 = jieba.cut(sentence1)
cut2 = jieba.cut(sentence2)

def getNumPyVec(list_cut):
    # Look up the vector of every token and stack them into a tensor
    vecList = []
    for x in list_cut:
        vecList.append(model[x])
    torch_list = torch.tensor(vecList)
    print(torch_list.shape)
    return torch_list

l1 = getNumPyVec(cut1)
l2 = getNumPyVec(cut2)

# input_size: number of input features, hidden_size: number of hidden features, num_layers: stacked layers
lstm = nn.LSTM(128, 20, 4)        # (input_size, hidden_size, num_layers)
h0 = torch.randn(4, 3, 20)        # (num_layers * 1, batch_size, hidden_size)
c0 = torch.randn(4, 3, 20)        # (num_layers * 1, batch_size, hidden_size)
inputs = torch.randn(10, 3, 128)  # (seq_len, batch_size, input_size)
output, (hn, cn) = lstm(inputs, (h0, c0))
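The code above stops short of actually feeding the looked-up vectors into the LSTM. A minimal sketch of that missing step, assuming the pretrained vectors are 128-dimensional (matching input_size=128 above):

seq = l1.unsqueeze(1)                    # (seq_len, 128) -> (seq_len, batch_size=1, 128)
h0 = torch.randn(4, 1, 20)               # (num_layers, batch_size, hidden_size)
c0 = torch.randn(4, 1, 20)
output, (hn, cn) = lstm(seq, (h0, c0))   # output: (seq_len, 1, hidden_size)
print(output.shape)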
VI. Practical code: training word2vec and fastText
import os
import gensim
import jieba
import jieba.analyse
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import fasttext
from ltp import LTP
from tqdm import tqdm

ltp = LTP()
# segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
# [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]


class TrainWord2Vec(object):
    def __init__(self):
        parent_path = os.path.split(os.path.realpath(__file__))[0]
        self.root = parent_path[:parent_path.find("pre_process")]  # E:\personas\semantics\
        # 15,000 Bank of Communications (jiaohang) dialogues
        self.jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "all_text.csv")
        # 2,000 meaningless utterances
        self.meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless.txt")
        # 60,000 sentences from the original training set
        self.semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "all.csv")
        # character-level models
        self.char_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "char_word2vec.model")
        self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
        # meaningless utterances split into single characters
        # (note: this attribute shadows the char_meaningless() method below)
        self.char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")

    def char_meaningless(self):
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        with open(char_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in f.readlines():
                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)
                    fout.write(cn)

    def char_jiaohang(self):
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        with open(char_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def char_semantic(self):
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        with open(char_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def all_char_file(self):
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        r_lines = []
        with open(char_meaningless, "r", encoding="utf8") as f1:
            r_lines = r_lines + f1.readlines()
        with open(char_jiaohang, "r", encoding="utf8") as f2:
            r_lines = r_lines + f2.readlines()
        with open(char_semantic, "r", encoding="utf8") as f3:
            r_lines = r_lines + f3.readlines()
        out = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        with open(out, "w", encoding="utf8") as f4:
            for line in r_lines:
                f4.write(line)

    def train_char_meaningless_word2vec(self):
        all_text = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        sentences = gensim.models.word2vec.LineSentence(all_text)
        model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
        # window=5          context window size
        # min_count=5       ignore low-frequency terms
        # sg=0              use CBOW (sg=1 would be skip-gram)
        # hs=0              use negative sampling (hs=1 would be hierarchical softmax)
        # negative=5        number of negative samples (usually 5-20)
        # ns_exponent=0.75  smoothing exponent of the negative-sampling distribution
        # sample=0.001      threshold for downsampling high-frequency words
        model.save(self.char_word2vec)
        print("wv:", model.wv.most_similar("嗯"))
        print("wv:", model.wv["你"])
        model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
        model1.save(self.char_fasttext)
        print("ft:", model1.wv.most_similar("嗯"))
        print("ft:", model1.wv["你"])

    def word_meaningless(self):
        word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
        with open(word_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in tqdm(f.readlines(), mininterval=1, smoothing=0.1):
                    line_str = line.replace(" ", "")
                    segment, _ = ltp.seg([line_str])
                    segment = " ".join(segment[0])
                    fout.write(segment + "\n")

    def word_jiaohang(self):
        word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
        with open(word_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def word_semantic(self):
        word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
        with open(word_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def main(self):
        self.word_jiaohang()
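A minimal way to run the pipeline (a sketch; it assumes the data files exist in the directory layout set up in __init__):

if __name__ == "__main__":
    trainer = TrainWord2Vec()
    trainer.main()   # segments the jiaohang dialogues with LTP, as wired up above
    # The character-level chain would be:
    #   trainer.char_jiaohang(); trainer.char_semantic(); trainer.all_char_file()
    #   trainer.train_char_meaningless_word2vec()
    # Note: char_meaningless() is shadowed by the attribute of the same name set in
    # __init__ and would need to be renamed before it can be called.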