NLP (18): Training character-level word2vec/FastText and word-level word2vec
1. Data
Notes on the word2vec training data format: the input is a plain-text file with one sentence per line, and each line must already be tokenized, i.e. tokens separated by spaces.
(1) For character-level training, it is enough to put a space between every Chinese character. Splitting by character:
line_str = line.replace(" ", "")
cn = " ".join(line_str)
(2) For word-level training, the text must be segmented into words first.
There are many tools for this: jieba, Peking University's pkuseg, and HIT's LTP. A short sketch of both splitting styles follows.
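For reference, here is a minimal sketch of both splits. The sample sentence is taken from the examples below; jieba is used for the word-level split here, while the full pipeline in section 2 uses LTP instead:

import jieba

line = "不可以。我还没开始用呢,"
clean = line.replace(" ", "")          # drop any existing spaces first

# (1) character level: one space between every character
print(" ".join(clean))                 # -> 不 可 以 。 我 还 没 开 始 用 呢 ,

# (2) word level: segment first, then join the tokens with spaces
print(" ".join(jieba.lcut(clean)))     # one word-segmented training line

Either way, each sentence becomes one space-separated line of the training file.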
1) Character-based file example:
不 一 定 。
不 一 定 不 一 定 。
不 一 样 。
不 卖 钱 。
不 可 以 。 我 还 没 开 始 用 呢 ,
不 同 的 地 点 今 天 你 会 经 过 哪 里 可 以 买 ?
不 听 。
不 在 。
不 太 信 不 在 。
2) Word-based (segmented) file example:
不 我这个 , 那 我那个 。
不是 一万 多 了 ? 怎么 变成 两万 多 ?
不是 不是 你 去 可以 去 移动 去 查 一下路途中 他们 绝对 不 是 徐世东 好 吗 ?
不是 不是 我 现在 现在 这个 号码 注册 的 四五折熊 图片 ?
不是 从前 两 年 说过 了 吗 ?
不是 你 能 听 我 说话 吗 ? 你 别老自己 跟着 吧 , 说 行不行 啊 。
不是 原来 它会 自动 还款 怎么办 ? 最近 都 没 没有 跳出来 。
不是 可以 自动 还款 吗 ?
不是 啊 , 这样 没有 啊 。
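In the script in section 2, this word-level segmentation is done with LTP. A minimal sketch of the call it uses (assuming an LTP 4.x release where ltp.seg is available, as in the code below):

from ltp import LTP

ltp = LTP()  # loads the default model
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
print(segment)               # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
print(" ".join(segment[0]))  # 他 叫 汤姆 去 拿 外衣 。  -> one training line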
2. Code
The full training script below builds the character-level and word-level corpora described above, then trains Word2Vec and FastText models with gensim.
import os
import jieba
import jieba.analyse
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import fasttext
import gensim
import pandas as pd
from ltp import LTP
from tqdm import tqdm

ltp = LTP()
# Example of LTP segmentation:
#   segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
#   -> [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]


class TrainWord2Vec(object):
    def __init__(self):
        parent_path = os.path.split(os.path.realpath(__file__))[0]
        self.root = parent_path[:parent_path.find("pre_process")]  # e.g. E:\personas\semantics\
        # ~15,000 Bank of Communications (jiaohang) dialogue utterances
        self.jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "all_text.csv")
        # ~2,000 "meaningless" utterances
        self.meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless.txt")
        # ~60,000 sentences from the original training set
        self.semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "all.csv")
        # character-level model checkpoints
        self.char_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "char_word2vec.model")
        self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
        # char-segmented "meaningless" data (named *_path so it does not shadow the method below)
        self.char_meaningless_path = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        # word-level model checkpoints
        self.word_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "word_word2vec.model")
        self.word_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "word_fasttext.model")

    def char_meaningless(self):
        # split the "meaningless" corpus into space-separated characters
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        with open(char_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in f.readlines():
                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)  # the trailing "\n" of the line is kept
                    fout.write(cn)

    def char_jiaohang(self):
        # split the jiaohang dialogues into space-separated characters
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        with open(char_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def char_semantic(self):
        # split the semantic training set into space-separated characters
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        with open(char_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def all_char_file(self):
        # merge the three character-level files into char_all.txt
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        r_lines = []
        with open(char_meaningless, "r", encoding="utf8") as f1:
            r_lines = r_lines + f1.readlines()
        with open(char_jiaohang, "r", encoding="utf8") as f2:
            r_lines = r_lines + f2.readlines()
        with open(char_semantic, "r", encoding="utf8") as f3:
            r_lines = r_lines + f3.readlines()
        out = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        with open(out, "w", encoding="utf8") as f4:
            for line in r_lines:
                f4.write(line)

    def train_char_meaningless_word2vec(self):
        # train character-level Word2Vec and FastText on char_all.txt
        all_text = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        sentences = gensim.models.word2vec.LineSentence(all_text)
        model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
        # context window size: window=5
        # ignore low-frequency terms: min_count=5
        # CBOW or skip-gram: sg=0 means CBOW (the default)
        # hierarchical softmax or negative sampling: hs=0 means negative sampling
        # number of negative samples: negative=5 (typically 5-20)
        # smoothing exponent of the negative-sampling distribution: ns_exponent=0.75
        # downsampling threshold for high-frequency words: sample=0.001
        model.save(self.char_word2vec)
        print("wv:", model.wv.most_similar("嗯"))
        print("wv:", model.wv["你"])
        model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
        model1.save(self.char_fasttext)
        print("ft:", model1.wv.most_similar("嗯"))
        print("ft:", model1.wv["你"])

    def word_meaningless(self):
        # segment the "meaningless" corpus into words with LTP
        word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
        with open(word_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in tqdm(f.readlines(), mininterval=1, smoothing=0.1):
                    line_str = line.replace(" ", "")
                    segment, _ = ltp.seg([line_str])
                    segment = " ".join(segment[0])
                    fout.write(segment + "\n")

    def word_jiaohang(self):
        # segment the jiaohang dialogues into words with LTP
        word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
        with open(word_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def word_semantic(self):
        # segment the semantic training set into words with LTP
        word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
        with open(word_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def all_word_file(self):
        # merge the three word-level files into word_all.txt
        word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
        word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
        word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
        r_lines = []
        with open(word_meaningless, "r", encoding="utf8") as f1:
            r_lines = r_lines + f1.readlines()
        with open(word_jiaohang, "r", encoding="utf8") as f2:
            r_lines = r_lines + f2.readlines()
        with open(word_semantic, "r", encoding="utf8") as f3:
            r_lines = r_lines + f3.readlines()
        out = os.path.join(self.root, "datas", "word2vec_data", "word_all.txt")
        with open(out, "w", encoding="utf8") as f4:
            for line in r_lines:
                f4.write(line)

    def train_word_meaningless_word2vec(self):
        # train word-level Word2Vec and FastText on word_all.txt
        all_text = os.path.join(self.root, "datas", "word2vec_data", "word_all.txt")
        sentences = gensim.models.word2vec.LineSentence(all_text)
        # same hyperparameter choices as the character-level models above
        model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
        model.save(self.word_word2vec)
        print("wv:", model.wv.most_similar("了解"))
        print("wv:", model.wv["时候"])
        model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
        model1.save(self.word_fasttext)
        print("ft:", model1.wv.most_similar("了解"))
        print("ft:", model1.wv["时候"])

    def main(self):
        self.train_word_meaningless_word2vec()


if __name__ == '__main__':
    TrainWord2Vec().main()
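Once training has finished, the saved models can be loaded back and queried. A minimal sketch, assuming the checkpoint files were written to the relative paths used in the script above:

from gensim.models import Word2Vec, FastText

# character-level word2vec: nearest neighbours and raw vectors
w2v = Word2Vec.load("checkpoints/word2vec/char_word2vec.model")
print(w2v.wv.most_similar("嗯", topn=5))
print(w2v.wv["你"].shape)        # (128,), the vector_size chosen above

# character-level FastText: vectors are built from character n-grams, so it can
# usually return a vector even for a token that never appeared in training,
# whereas word2vec would raise a KeyError
ft = FastText.load("checkpoints/word2vec/char_fasttext.model")
print(ft.wv.most_similar("嗯", topn=5))

One practical difference worth noting: the FastText model is typically larger and slower to train, but the subword information makes it more robust to rare or unseen tokens.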