【502】Implementing Word2Vec with gensim
Reference: Word Embedding Tutorial: word2vec using Gensim [EXAMPLE]
Reference: NLP入门(三)词形还原(Lemmatization)
Reference: Implementing Word2Vec with Gensim Library in Python
Text preprocessing
- Tokenization
- Convert words to lowercase
- Remove punctuation from words
- Remove digits from words
- Remove empty strings
- Remove stop words
- Remove empty lists
- Lemmatization
First, import the necessary libraries:
```python
import gensim
import nltk
from gensim.models import Word2Vec

# stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

# punctuation characters
import string
# string.punctuation

# lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
```
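If the NLTK corpora used above are not already installed locally, loading the stop words or the lemmatizer will raise a `LookupError`. They can be fetched once with `nltk.download` (a setup step not shown in the original code):

```python
import nltk

# one-time downloads; they are skipped if the corpora are already present
nltk.download('stopwords')  # required by stopwords.words('english')
nltk.download('wordnet')    # required by WordNetLemmatizer
```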
Load the data and display it:
```python
data = [{"tag": "welcome",
         "patterns": ["Hi", "How are you", "Is any one to talk?", "Hello", "hi are you available"],
         "responses": ["Hello, thanks for contacting us", "Good to see you here", " Hi there, how may I assist you?"]},
        {"tag": "goodbye",
         "patterns": ["Bye", "See you later", "Goodbye", "I will come back soon"],
         "responses": ["See you later, thanks for visiting", "have a great day ahead", "Wish you Come back again soon."]},
        {"tag": "thankful",
         "patterns": ["Thanks for helping me", "Thank your guidance", "That's helpful and kind from you"],
         "responses": ["Happy to help!", "Any time!", "My pleasure", "It is my duty to help you"]},
        {"tag": "hoursopening",
         "patterns": ["What hours are you open?", "Tell your opening time?", "When are you open?", "Just your timing please"],
         "responses": ["We're open every day 8am-7pm", "Our office hours are 8am-7pm every day", "We open office at 8 am and close at 7 pm"]},
        {"tag": "payments",
         "patterns": ["Can I pay using credit card?", " Can I pay using Mastercard?", " Can I pay using cash only?"],
         "responses": ["We accept VISA, Mastercard and credit card", "We accept credit card, debit cards and cash. Please don’t worry"]}]

bigger_list = []
for i in range(len(data)):
    for s in data[i]['patterns']:
        li = s.split(" ")
        bigger_list.append(li)
bigger_list
```
The output is as follows:
```
[['Hi'],
 ['How', 'are', 'you'],
 ['Is', 'any', 'one', 'to', 'talk?'],
 ['Hello'],
 ['hi', 'are', 'you', 'available'],
 ['Bye'],
 ['See', 'you', 'later'],
 ['Goodbye'],
 ['I', 'will', 'come', 'back', 'soon'],
 ['Thanks', 'for', 'helping', 'me'],
 ['Thank', 'your', 'guidance'],
 ["That's", 'helpful', 'and', 'kind', 'from', 'you'],
 ['What', 'hours', 'are', 'you', 'open?'],
 ['Tell', 'your', 'opening', 'time?'],
 ['When', 'are', 'you', 'open?'],
 ['Just', 'your', 'timing', 'please'],
 ['Can', 'I', 'pay', 'using', 'credit', 'card?'],
 ['', 'Can', 'I', 'pay', 'using', 'Mastercard?'],
 ['', 'Can', 'I', 'pay', 'using', 'cash', 'only?']]
```
Convert all words to lowercase:
```python
# convert words to lowercase
bigger_list = [[w.lower() for w in s] for s in bigger_list]
bigger_list
```
The output is as follows:
```
[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk?'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ["that's", 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open?'],
 ['tell', 'your', 'opening', 'time?'],
 ['when', 'are', 'you', 'open?'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card?'],
 ['', 'can', 'i', 'pay', 'using', 'mastercard?'],
 ['', 'can', 'i', 'pay', 'using', 'cash', 'only?']]
```
Remove punctuation from the words:
```python
import string

# string.punctuation is a string holding all punctuation characters
# string.punctuation

# stripping punctuation from a single word:
# ''.join([x for x in 'alex?' if x not in string.punctuation])  # -> 'alex'

# strip punctuation from every word
bigger_list = [[''.join([x for x in w if x not in string.punctuation]) for w in s] for s in bigger_list]
bigger_list
```
The output is as follows:
```
[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open'],
 ['tell', 'your', 'opening', 'time'],
 ['when', 'are', 'you', 'open'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card'],
 ['', 'can', 'i', 'pay', 'using', 'mastercard'],
 ['', 'can', 'i', 'pay', 'using', 'cash', 'only']]
```
Remove empty strings:
```python
# remove empty strings
bigger_list = [[w for w in s if w != ''] for s in bigger_list]
bigger_list
```
The output is as follows:
```
[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open'],
 ['tell', 'your', 'opening', 'time'],
 ['when', 'are', 'you', 'open'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card'],
 ['can', 'i', 'pay', 'using', 'mastercard'],
 ['can', 'i', 'pay', 'using', 'cash', 'only']]
```
Remove stop words:
```python
from nltk.corpus import stopwords

# list of English stop words
stop = stopwords.words('english')

# remove stop words
bigger_list = [[w for w in s if w not in stop] for s in bigger_list]
bigger_list
```
The output is as follows:
```
[['hi'],
 [],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hours', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]
```
Remove empty lists:
```python
# remove empty lists
bigger_list = [s for s in bigger_list if len(s) > 0]
bigger_list
```
The output is as follows:
```
[['hi'],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hours', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]
```
Lemmatization:
```python
# lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
bigger_list = [[wnl.lemmatize(w) for w in s] for s in bigger_list]
bigger_list
```
The output is as follows:
```
[['hi'],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hour', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]
```
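Note that only 'hours' changed (to 'hour') while words such as 'helping' were left alone: WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is supplied. A small illustration:

```python
wnl.lemmatize('helping')           # 'helping'  (treated as a noun by default)
wnl.lemmatize('helping', pos='v')  # 'help'     (lemmatized as a verb)
```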
Train the model, save it, load it, and use it:
```python
# train the model
model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)

# save the model
model.save("word2vec.model")
model.save('word2vec.bin')

# load the model
model = Word2Vec.load('word2vec.bin')

# vocabulary
list(model.wv.vocab)

# vector for the word 'thanks'
model.wv.word_vec('thanks')
```
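As a quick sanity check, the trained model can also be queried for word similarity (a minimal sketch using the same gensim 3.x API as above; on a corpus this small the neighbours are essentially random):

```python
# words most similar to 'thanks' (not meaningful on such a tiny corpus)
print(model.wv.most_similar('thanks', topn=5))

# cosine similarity between two words in the vocabulary
print(model.wv.similarity('credit', 'card'))
```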
Explanation of the word2vec API
In gensim, the word2vec-related APIs live in the package gensim.models.word2vec, and the parameters that control the algorithm are set on the class gensim.models.word2vec.Word2Vec. The parameters worth paying attention to are listed below (a combined usage sketch follows the list):
- sentences: the corpus to analyse. It can be a plain list of tokenized sentences, or it can be streamed from a file (e.g. word2vec.LineSentence(filename)).
- size: dimensionality of the word vectors, default 100. A sensible value depends on the corpus size; for a modest corpus (say, under 100 MB of text) the default is usually fine, while a very large corpus justifies a higher dimensionality.
- window: the maximum distance between the current word and a context word; the larger the window, the more distant words are treated as context. The default is 5, and it can be tuned to the task: smaller values suit small corpora, and for typical corpora values in [5, 10] are recommended.
- sg: which of the two word2vec architectures to use. 0 selects CBOW, 1 selects Skip-Gram; the default is 0 (CBOW).
- hs: which of the two training strategies to use. If 0 (and negative > 0), Negative Sampling is used; if 1, Hierarchical Softmax is used. The default is 0, i.e. Negative Sampling.
- negative: the number of negative samples drawn when using Negative Sampling, default 5; values in [3, 10] are recommended. This is the quantity written as neg in the companion post on the algorithm's theory.
- cbow_mean: only used by CBOW when projecting the context. If 0, the projection x_w is the sum of the context word vectors; if 1, it is their average. The theory post describes the average, which is also the default (1); changing it is not recommended.
- min_count: the minimum frequency a word needs in order to get a vector; it filters out very rare words. The default is 5 and can be lowered for small corpora.
- iter: the maximum number of stochastic-gradient-descent iterations (epochs), default 5. It can be increased for large corpora.
- alpha: the initial learning rate (step size) of stochastic gradient descent, written as η in the theory post; the default is 0.025.
- min_alpha: because the learning rate decays during training, min_alpha sets its minimum value. The per-epoch learning rate is determined jointly by iter, alpha, and min_alpha. Since this is not core to the word2vec algorithm, it was not covered in the theory post.
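A minimal sketch tying these parameters together (gensim 3.x parameter names, as used throughout this post; in gensim 4.x `size` and `iter` were renamed to `vector_size` and `epochs`). The toy corpus is an assumption purely for illustration:

```python
from gensim.models import Word2Vec
# from gensim.models.word2vec import LineSentence  # alternative: stream sentences from a file

# toy corpus of tokenized sentences, for illustration only
sentences = [['pay', 'using', 'credit', 'card'],
             ['pay', 'using', 'mastercard'],
             ['hours', 'open']]

model = Word2Vec(sentences,
                 size=100,           # dimensionality of the word vectors
                 window=5,           # max distance between current and context words
                 sg=0,               # 0 = CBOW, 1 = Skip-Gram
                 hs=0,               # 0 (with negative > 0) = Negative Sampling, 1 = Hierarchical Softmax
                 negative=5,         # number of negative samples
                 cbow_mean=1,        # 1 = average the context vectors, 0 = sum them
                 min_count=1,        # keep every word in this tiny corpus
                 iter=5,             # number of training epochs
                 alpha=0.025,        # initial learning rate
                 min_alpha=0.0001,   # final learning rate
                 workers=4)          # number of worker threads

print(model.wv['pay'])               # the 100-dimensional vector learned for 'pay'
```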
Processing with json and pandas
```python
# list of libraries used by the code
import string
from gensim.models import Word2Vec
import logging
from nltk.corpus import stopwords
from textblob import Word
import json
import pandas as pd

# data in json format
json_file = 'intents.json'
with open('intents.json', 'r') as f:
    data = json.load(f)

# list of stop words
stop = stopwords.words('english')

# dataframe
df = pd.DataFrame(data)
df['patterns'] = df['patterns'].apply(', '.join)
# print(df['patterns'])

# cleaning the data using the NLP approach
print(df)
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in string.punctuation))
df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if not x.isdigit()))
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
df['patterns'] = df['patterns'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# taking the outer list
bigger_list = []
for i in df['patterns']:
    li = list(i.split(" "))
    bigger_list.append(li)

# structure of data to be taken by the model
print("Data format for the overall list:", bigger_list)

# custom data is fed to the model for training
model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)
# print(model)
```
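The code above expects an intents.json file in the working directory. The original post does not show that file, but it presumably contains the same intents list used at the top of this post, which could be written out like this (an assumption, for completeness):

```python
import json

# assumption: intents.json simply holds the same `data` list defined earlier in this post
with open('intents.json', 'w') as f:
    json.dump(data, f, indent=2)
```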