gensim Example

Installation

!pip install gensim
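
The code in this post follows the gensim 3.x API (size=, init_sims, vocab); gensim 4.x renamed or removed several of these. A quick version check (a minimal sketch) helps avoid surprises:

import gensim
print(gensim.__version__)  # the snippets below assume a 3.x release
# If needed, pin an older release: pip install "gensim<4.0"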

Training

from gensim.models import word2vec
import logging

# Main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus('text8')  # load the corpus; replace 'text8' with the path to your own corpus file
model = word2vec.Word2Vec(sentences, size=200)  # window defaults to 5; in gensim 4.x the parameter is vector_size=200

# Compute the similarity / relatedness of two words
y1 = model.wv.similarity(u"不错", u"好")
print(u"Similarity between 【不错】 and 【好】:", y1)
print("--------\n")

Loading the word vector file

from gensim.models import KeyedVectors

file = '/home/xuehp/data/Tencent_AILab_ChineseEmbedding.txt'
wv_from_text = KeyedVectors.load_word2vec_format(file, binary=False)
wv_from_text.init_sims(replace=True)  # L2-normalize the vectors in place to save memory (deprecated in gensim 4.x)
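
A quick sanity check after loading (a sketch; the exact numbers depend on the embedding file you load):

print(len(wv_from_text.vocab))      # vocabulary size (gensim 3.x; use len(wv_from_text.key_to_index) in 4.x)
print(wv_from_text[u'苹果'].shape)   # vector dimensionality, e.g. (200,) for the Tencent embedding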

Getting word vectors

import numpy as np

# Build vectors for out-of-vocabulary words / phrases from their n-grams
def compute_ngrams(word, min_n, max_n):
    # BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
    extended_word = word
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return list(set(ngrams))

def wordVec(word, wv_from_text, min_n=1, max_n=3):
    '''
    ngrams_single / ngrams_more: when the word is OOV, prefer multi-character
    n-grams and only fall back to single-character vectors as a last resort.
    '''
    # Dimensionality of the word vectors
    word_size = wv_from_text.vector_size
    # Compute the n-grams of the word
    ngrams = compute_ngrams(word, min_n=min_n, max_n=max_n)
    # If the word is in the vocabulary, return its vector directly
    if word in wv_from_text:
        return wv_from_text[word]
    else:
        # Out-of-vocabulary case
        word_vec = np.zeros(word_size, dtype=np.float32)
        ngrams_found = 0
        ngrams_single = [ng for ng in ngrams if len(ng) == 1]
        ngrams_more = [ng for ng in ngrams if len(ng) > 1]
        # First accept only n-grams of length 2 or more
        for ngram in ngrams_more:
            if ngram in wv_from_text:
                word_vec += wv_from_text[ngram]
                ngrams_found += 1
                # print(ngram)
        # If nothing matched, fall back to single-character vectors
        if ngrams_found == 0:
            for ngram in ngrams_single:
                if ngram in wv_from_text:
                    word_vec += wv_from_text[ngram]
                    ngrams_found += 1
        if word_vec.any():
            return word_vec / max(1, ngrams_found)
        else:
            raise KeyError('all ngrams for word %s absent from model' % word)
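
To make the fallback concrete, compute_ngrams on a three-character query produces every substring of length 1 to 3 (the order varies because of the set()):

print(compute_ngrams('友报账', min_n=1, max_n=3))
# e.g. ['友', '报', '账', '友报', '报账', '友报账'] (in some order)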

Example 1

vec = wordVec('苹果', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)

Output:

[('苹果', 1.0),
 ('苹果公司', 0.8514505624771118),
 ('以及苹果', 0.8457839488983154),
 ('比如苹果', 0.7890200018882751),
 ('苹果新', 0.7845828533172607),
 ('其他苹果', 0.7817449569702148),
 ('iphone', 0.7793817520141602),
 ('苹果iphone', 0.7790712714195251),
 ('苹果的iphone', 0.7720062136650085),
 ('apple', 0.7679361701011658),
 ('苹果产品', 0.7623019814491272),
 ('像苹果', 0.7533938884735107),
 ('小米', 0.7517136335372925),
 ('关于苹果', 0.7515844106674194),
 ('iphone产品', 0.7507627606391907),
 ('iphonex', 0.7488199472427368),
 ('新款iphone', 0.747662365436554),
 ('苹果10', 0.7474119067192078),
 ('iphone系列', 0.7470223307609558),
 ('新iphone', 0.7435163855552673)]

Example 2

vec = wordVec('iuap', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)

Output:

[('iuap', 1.0),
 ('用友云平台', 0.8234802484512329),
 ('paas平台', 0.8118030428886414),
 ('用友云', 0.7954781651496887),
 ('云操作系统', 0.7548810839653015),
 ('iaas平台', 0.7546966075897217),
 ('appcenter', 0.7538243532180786),
 ('u8cloud', 0.7484996914863586),
 ('paas', 0.7466067671775818),
 ('社会化商业', 0.7457333207130432),
 ('云erp', 0.7428735494613647),
 ('协同云', 0.7421062588691711),
 ('海云捷迅', 0.7403150200843811),
 ('采购云', 0.7385496497154236),
 ('paas+saas', 0.7368173599243164),
 ('云管理平台', 0.7367190718650818),
 ('escloud', 0.736686646938324),
 ('私有云平台', 0.7358618974685669),
 ('mopaas', 0.7325429916381836),
 ('云应用', 0.7322961688041687)]

Example 3

vec = wordVec('友云采', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)

Output:

[('友云采', 1.0000001192092896),
 ('供应商协同平台', 0.7404446601867676),
 ('伙伴门户', 0.7326363325119019),
 ('企业交易平台', 0.7278861999511719),
 ('供应商门户', 0.7263870239257812),
 ('移动云分销', 0.7180557250976562),
 ('电商管理系统', 0.7153645157814026),
 ('求购大厅', 0.7131102085113525),
 ('百卓优采', 0.7128005027770996),
 ('o2o方案', 0.7122943997383118),
 ('农鲜生', 0.7077293992042542),
 ('会员资料库', 0.7064912915229797),
 ('企业管理云平台', 0.7042117118835449),
 ('56linked', 0.7034884691238403),
 ('网上订单系统', 0.7033181190490723),
 ('协同门户', 0.7029898762702942),
 ('电商建站', 0.7025145292282104),
 ('管理商机', 0.7013753056526184),
 ('直销通', 0.7007359862327576),
 ('erpbuilder', 0.6993728876113892)]

Example 4

vec = wordVec('财务云', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)

Output:

[('财务云', 1.0),
 ('财务共享服务', 0.7762293815612793),
 ('金蝶云', 0.7745106220245361),
 ('浪潮云', 0.7651669383049011),
 ('财务共享中心', 0.7502492070198059),
 ('畅捷通', 0.7385521531105042),
 ('协同云', 0.7370111346244812),
 ('企业云服务', 0.7364829182624817),
 ('用友云', 0.7306167483329773),
 ('采购云', 0.729377031326294),
 ('云erp', 0.7251084446907043),
 ('共享服务中心', 0.7224213480949402),
 ('人力云', 0.721336305141449),
 ('金蝶', 0.7165836095809937),
 ('用友', 0.7122166752815247),
 ('企业云', 0.7093378305435181),
 ('erp云', 0.7075839638710022),
 ('致远协同', 0.706666886806488),
 ('企业金融', 0.7049797773361206),
 ('移动信息化', 0.7018118500709534)]

Example 5

vec = wordVec('友报账', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)

Output:

[('报账', 0.7958753705024719),
 ('友报', 0.7958752512931824),
 ('报帐', 0.7087380886077881),
 ('报销业务', 0.7015117406845093),
 ('财务报账', 0.6572694778442383),
 ('审核报销', 0.6517125964164734),
 ('报销单', 0.6511596441268921),
 ('费用报销', 0.6456758975982666),
 ('报销单据', 0.642286479473114),
 ('原始票据', 0.6387859582901001),
 ('报销审核', 0.6324885487556458),
 ('发票报销', 0.6296700835227966),
 ('做账', 0.6251322031021118),
 ('员工报销', 0.6216662526130676),
 ('财务报销', 0.6187087297439575),
 ('原始单据', 0.6172932386398315),
 ('对账', 0.6172742247581482),
 ('费用报销单', 0.6142060160636902),
 ('审批报销', 0.6136212348937988),
 ('核账', 0.6098783016204834)]

In this example, the word segmentation at training time and at query time do not match: '友报账' itself is not in the vocabulary, so its vector is assembled from the in-vocabulary n-grams '友报' and '报账', which is why those two entries top the similarity list.
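
This is easy to confirm directly (a small check, consistent with the output above):

print(u'友报账' in wv_from_text)                        # False: the whole phrase is out of vocabulary
print(u'友报' in wv_from_text, u'报账' in wv_from_text)  # True True: wordVec averaged these two 2-grams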

This article is for learning purposes only.
