NLP (1): Using word2vec
I. Handling short sentences
from gensim.models import Word2Vec

sentences = [["Python", "深度学习", "机器学习"], ["NLP", "深度学习", "机器学习"]]
model = Word2Vec(sentences, min_count=1)
Note: passing a built-in Python list as input is convenient, but when the input is large it consumes a lot of memory.
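As a quick follow-up, the trained toy model can already be queried. This is a minimal sketch; on such a tiny corpus the numbers it prints are essentially random:

print(model.wv["Python"].shape)                   # a 100-dimensional vector by default
print(model.wv.most_similar("深度学习", topn=2))  # nearest neighbours by cosine similarity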
II. When the corpus is a file
1. Gensim only requires the input to be iterable (an iterator is fine), so there is no need to keep everything in memory: supply one sentence, load and process it, forget it, then load the next one.
2. Usually the corpus is stored in files. First, make sure that every line of the corpus file corresponds to one sentence (already tokenized, with tokens separated by spaces); a minimal streaming example is sketched below, and the following sections show the same idea for a whole directory and for a single file.
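A minimal sketch of such a streaming input for a single file (the class name CorpusFile and the file name corpus.txt are only illustrative; gensim's own LineSentence, shown later, does the same job):

from gensim.models import Word2Vec

class CorpusFile(object):
    """Yield one tokenized sentence per line without loading the whole file."""
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        # Word2Vec makes several passes over the corpus, so the input must be
        # re-iterable; a plain generator object is not enough.
        with open(self.path, encoding="utf8") as f:
            for line in f:
                yield line.split()

sentences = CorpusFile("corpus.txt")   # one segmented, space-separated sentence per line
model = Word2Vec(sentences, min_count=1)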
III. Working with all files in a directory
These files have already been tokenized. If the words in them need further preprocessing, such as removing digits or extracting named entities, all of that can be done inside the MySentences iterator, so that word2vec receives an iterator of fully processed sentences.
import os
import gensim

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # stream every file in the directory, one tokenized sentence per line
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

sentences = MySentences('/some/directory')  # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)
IV. For a single file
class: gensim.models.word2vec.LineSentence
Each line corresponds to one sentence (already tokenized, with tokens separated by spaces), so we can use LineSentence to convert a txt file directly into the required format.
What LineSentence does: iterate over a file that contains sentences, one line = one sentence; words must already be preprocessed and separated by whitespace.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import get_tmpfile

inp = 'wiki.zh.text.jian.seg.txt'       # input corpus, already segmented
sentences = LineSentence(inp)
path = get_tmpfile("word2vec.model")    # get a temporary file path
model = Word2Vec(sentences, size=100, window=5, min_count=1)  # size -> vector_size in gensim 4.x
model.save("word2vec.model")
gensim.models.word2vec.LineSentence(source, max_sentence_length=10000, limit=None)
A preprocessing class: max_sentence_length caps the length of each sentence (in tokens) and limit caps the number of lines read from the file.
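A quick sketch of those two parameters (the file name segment.txt is just an example):

from gensim.models.word2vec import LineSentence

# Read only the first 1000 lines; any line longer than max_sentence_length tokens
# is yielded in chunks of at most that many tokens.
sentences = LineSentence("segment.txt", max_sentence_length=10000, limit=1000)
for i, sent in enumerate(sentences):
    print(len(sent), sent[:5])
    if i == 2:
        break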
Once we have the segmented file, a typical NLP pipeline would also remove stop words. But the word2vec algorithm relies on context, and that context may itself consist of stop words, so for word2vec we can skip stop-word removal.
V. Getting a corpus
1. https://files-cdn.cnblogs.com/files/pinard/in_the_name_of_people.zip
or
class gensim.models.word2vec.Text8Corpus(fname, max_sentence_length=10000)
Bases: object
Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip; the max_sentence_length parameter caps the length of each yielded sentence.
2. Code
import gensim
import jieba
import jieba.analyse
from gensim.models import Word2Vec

# Segment the raw novel with jieba and write it out as space-separated text
with open("in_the_name_of_people.txt", encoding="utf8") as f:
    document = f.read()
document_cut = jieba.cut(document)
result = " ".join(document_cut)
with open("segment.txt", "w", encoding="utf8") as fout:
    fout.write(result)

sentences = gensim.models.word2vec.LineSentence("segment.txt")
model = Word2Vec(sentences, hs=0, min_count=5, window=5, size=100)  # size -> vector_size in gensim 4.x
# window=5          context window size
# min_count=5       ignore low-frequency terms
# sg=0              use CBOW (sg=1 would be skip-gram)
# hs=0              use negative sampling (hs=1 would be hierarchical softmax)
# negative=5        number of negative samples (usually 5-20)
# ns_exponent=0.75  smoothing exponent of the negative-sampling distribution
# sample=0.001      threshold for downsampling high-frequency words
model.save("word2vec.model")

model = Word2Vec.load("word2vec.model")
for key in model.wv.similar_by_word('检察院', topn=10):
    print(key)
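A few more common queries on the model trained above (a sketch; every word looked up must have survived the min_count=5 cutoff, otherwise a KeyError is raised):

print(model.wv.similarity('检察院', '人民'))               # cosine similarity between two words
print(model.wv.most_similar(positive=['检察院'], topn=5))  # nearest neighbours
print(model.wv['检察院'].shape)                            # the raw 100-dimensional vector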
Loading a model from a .bin file:
import jieba
import gensim
import torch
import torch.nn as nn

# Load pretrained word vectors from a binary word2vec file
model = gensim.models.KeyedVectors.load_word2vec_format('baike_26g_news_13g_novel_229g.bin', binary=True)

sentence1 = "北京是中华人民共和国的首都"
sentence2 = "人民民主"
cut1 = jieba.cut(sentence1)
cut2 = jieba.cut(sentence2)

def getNumPyVec(list_cut):
    # Look up the vector of every token and stack them into a tensor
    vecList = []
    for x in list_cut:
        vecList.append(model[x])
    torch_list = torch.tensor(vecList)
    print(torch_list.shape)
    return torch_list

l1 = getNumPyVec(cut1)
l2 = getNumPyVec(cut2)

# input_size: number of input features, hidden_size: number of hidden features, num_layers: stacked layers
lstm = nn.LSTM(128, 20, 4)        # (input_size, hidden_size, num_layers)
h0 = torch.randn(4, 3, 20)        # (num_layers * 1, batch_size, hidden_size)
c0 = torch.randn(4, 3, 20)        # (num_layers * 1, batch_size, hidden_size)
inputs = torch.randn(10, 3, 128)  # (seq_len, batch_size, input_size)
output, (hn, cn) = lstm(inputs, (h0, c0))
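The code above stops short of actually feeding the looked-up vectors into the LSTM. A minimal sketch of that missing step, assuming the pretrained vectors are 128-dimensional (matching input_size=128 above):

seq = l1.unsqueeze(1)                    # (seq_len, 128) -> (seq_len, batch_size=1, 128)
h0 = torch.randn(4, 1, 20)               # (num_layers, batch_size, hidden_size)
c0 = torch.randn(4, 1, 20)
output, (hn, cn) = lstm(seq, (h0, c0))   # output: (seq_len, 1, hidden_size)
print(output.shape)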
VI. Practical code: training word2vec and fastText
import os
import gensim
import jieba
import jieba.analyse
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import fasttext
from ltp import LTP
from tqdm import tqdm

ltp = LTP()
# segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
# [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]


class TrainWord2Vec(object):
    def __init__(self):
        parent_path = os.path.split(os.path.realpath(__file__))[0]
        self.root = parent_path[:parent_path.find("pre_process")]  # E:\personas\semantics\
        # 15,000 Bank of Communications (jiaohang) dialogues
        self.jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "all_text.csv")
        # 2,000 meaningless utterances
        self.meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless.txt")
        # 60,000 sentences from the original training set
        self.semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "all.csv")
        # character-level models
        self.char_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "char_word2vec.model")
        self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
        # meaningless utterances split into single characters
        # (note: this attribute shadows the char_meaningless() method below)
        self.char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")

    def char_meaningless(self):
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        with open(char_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in f.readlines():
                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)
                    fout.write(cn)

    def char_jiaohang(self):
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        with open(char_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def char_semantic(self):
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        with open(char_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in dataList:
                line_str = line.replace(" ", "")
                cn = " ".join(line_str)
                fout.write(cn + "\n")

    def all_char_file(self):
        char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
        char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
        char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
        r_lines = []
        with open(char_meaningless, "r", encoding="utf8") as f1:
            r_lines = r_lines + f1.readlines()
        with open(char_jiaohang, "r", encoding="utf8") as f2:
            r_lines = r_lines + f2.readlines()
        with open(char_semantic, "r", encoding="utf8") as f3:
            r_lines = r_lines + f3.readlines()
        out = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        with open(out, "w", encoding="utf8") as f4:
            for line in r_lines:
                f4.write(line)

    def train_char_meaningless_word2vec(self):
        all_text = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
        sentences = gensim.models.word2vec.LineSentence(all_text)
        model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
        # window=5          context window size
        # min_count=5       ignore low-frequency terms
        # sg=0              use CBOW (sg=1 would be skip-gram)
        # hs=0              use negative sampling (hs=1 would be hierarchical softmax)
        # negative=5        number of negative samples (usually 5-20)
        # ns_exponent=0.75  smoothing exponent of the negative-sampling distribution
        # sample=0.001      threshold for downsampling high-frequency words
        model.save(self.char_word2vec)
        print("wv:", model.wv.most_similar("嗯"))
        print("wv:", model.wv["你"])
        model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
        model1.save(self.char_fasttext)
        print("ft:", model1.wv.most_similar("嗯"))
        print("ft:", model1.wv["你"])

    def word_meaningless(self):
        word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
        with open(word_meaningless, "w", encoding="utf8") as fout:
            with open(self.meaningless, encoding="utf8") as f:
                for line in tqdm(f.readlines(), mininterval=1, smoothing=0.1):
                    line_str = line.replace(" ", "")
                    segment, _ = ltp.seg([line_str])
                    segment = " ".join(segment[0])
                    fout.write(segment + "\n")

    def word_jiaohang(self):
        word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
        with open(word_jiaohang, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def word_semantic(self):
        word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
        with open(word_semantic, "w", encoding="utf8") as fout:
            dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
            for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                line_str = line.replace(" ", "")
                segment, _ = ltp.seg([line_str])
                segment = " ".join(segment[0])
                fout.write(segment + "\n")

    def main(self):
        self.word_jiaohang()
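A minimal way to run the pipeline (a sketch; it assumes the data files exist in the directory layout set up in __init__):

if __name__ == "__main__":
    trainer = TrainWord2Vec()
    trainer.main()   # segments the jiaohang dialogues with LTP, as wired up above
    # The character-level chain would be:
    #   trainer.char_jiaohang(); trainer.char_semantic(); trainer.all_char_file()
    #   trainer.train_char_meaningless_word2vec()
    # Note: char_meaningless() is shadowed by the attribute of the same name set in
    # __init__ and would need to be renamed before it can be called.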