Using LSH for Fast Semantic Retrieval, Combined with Word Vectors
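The idea behind the script below is random-hyperplane LSH: for each hash table we draw k random hyperplanes, and a vector's k-bit code records which side of each hyperplane it falls on (the sign of the dot product). For a single random hyperplane, two vectors with angle θ between them collide with probability 1 − θ/π, so near neighbours usually share a bucket in at least one table. Here is a minimal sketch of just that hashing step (the helper name `hash_code` and the toy dimensions are mine, not the script's):

```python
import numpy as np

def hash_code(vec, planes):
    # One bit per hyperplane: 1 if vec lies on its positive side.
    return ''.join('1' if np.dot(vec, p) > 0 else '0' for p in planes)

rng = np.random.default_rng(0)
planes = rng.uniform(-1, 1, (5, 300))   # 5 hyperplanes for 300-dim vectors
v1 = rng.normal(size=300)
v2 = v1 + 0.01 * rng.normal(size=300)   # a near-duplicate of v1
print(hash_code(v1, planes), hash_code(v2, planes))  # codes almost always match
```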
""" test """ import os import gensim import pickle import time import numpy as np DIR_PATH = os.path.dirname(os.path.abspath(__file__)) HASHTABLES = os.path.join(DIR_PATH, 'resource', 'hashtables.pkl') WORD2VEC = os.path.join(DIR_PATH, 'resource', 'sgns.weibo.word') RESOURCES = os.path.join(DIR_PATH, 'resource', 'resources.pkl') class MyClass(object): def __init__(self, Table_num=5, Hashcode_fun=5): self.hashtables = HASHTABLES self.word2vec = WORD2VEC self.resources = RESOURCES self.table_num = Table_num self.Hashcode_fun = Hashcode_fun def load_traindata(self): model = gensim.models.KeyedVectors.load_word2vec_format(self.word2vec, unicode_errors='ignore') data = [] features = [] for word, vector in zip(model.vocab, model.vectors): features.append(vector) data.append(word) print(word) self.features = np.array(features) self.data = data with open(self.resources, 'wb') as fw: pickle.dump((self.features, self.data), fw) print('词向量序列化完毕,当前词向量数量:{}'.format(len(self.data))) def create_hashtables(self): with open(self.resources, 'rb') as fr: features, _ = pickle.load(fr) print('特征加载完毕,当前词向量数量:{}'.format(len(features))) users_size, items_size = features.shape hashtables = [[[] for _ in range(int('1' * self.Hashcode_fun) + 1)] for _ in range(self.table_num)] random_matrixes = [np.empty((self.Hashcode_fun, items_size)) for _ in range(self.table_num)] for i in range(self.table_num): random_matrixes[i] = np.random.uniform(-1, 1, (self.Hashcode_fun, items_size)) for i, user_vec in enumerate(features): for j in range(self.table_num): v = random_matrixes[j] index = '' for k in range(self.Hashcode_fun): index += '1' if np.dot(user_vec, v[k]) > 0 else '0' t_index = int(index, 2) hashtables[j][t_index].append(i) with open(self.hashtables, 'wb') as fw: pickle.dump((hashtables,random_matrixes), fw) print('hash表存储完毕') def cal_similarity(self): with open(self.resources, 'rb') as fr: _, data = pickle.load(fr) with open(self.hashtables, 'rb') as fr: hashtables, random_matrixes = pickle.load(fr) model = gensim.models.KeyedVectors.load_word2vec_format(self.word2vec, unicode_errors='ignore') search_data = '中国' # word2vec 找出的相似词:[('Portugal#', 0.8183228373527527), ('University#', 0.8141831755638123), ('Montfort', 0.8129391074180603), search_feature_vec = np.array(model.get_vector(search_data)) sim = model.most_similar(search_data) print('word2vec 找出的相似词:{}'.format(sim)) print('{}-莱雅,相似度:{}'.format(search_data, model.similarity(search_data, '莱雅'))) print('{}-触网,相似度:{}'.format(search_data, model.similarity(search_data, '触网'))) # '莱雅', '真材实料', '触网', '@Sophia', '汕尾', similar_users = set() t1 = time.time() for i, hashtable in enumerate(hashtables): index = '' for j in range(self.Hashcode_fun): index += '1' if np.dot(search_feature_vec, random_matrixes[i][j]) > 0 else '0' target_index = int(index, 2) similar_users |= set(hashtable[target_index]) t2 = time.time() print('查找相似性用户耗时:{:.4f}'.format(t2 - t1)) t3 = time.time() res = {} for i in similar_users: res[data[i]] = cosine_similarity2(search_feature_vec, model.get_vector(data[i])) a = sorted(res.items(), key=lambda x: x[1], reverse=True) t4 = time.time() print('计算余弦相似度及排序耗时:{:.4f}ms'.format(t4-t3)) print(a[:20]) def cosine_similarity(x, y): res = np.array([[x[i] * y[i], x[i] * x[i], y[i] * y[i]] for i in range(len(x))]) cos = sum(res[:, 0]) / (np.sqrt(sum(res[:, 1])) * np.sqrt(sum(res[:, 2]))) return cos def cosine_similarity2(x,y): num = x.dot(y.T) denom = np.linalg.norm(x) * np.linalg.norm(y) return num / denom if __name__ == '__main__': ir = MyClass() # 
ir.load_traindata() # ir.create_hashtables() ir.cal_similarity()
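One version caveat: `model.vocab` exists only in gensim < 4.0; gensim 4.x removed it in favour of `index_to_key` / `key_to_index`. On a 4.x install the vocabulary loop in `load_traindata()` would, I believe, read:

```python
# gensim >= 4.0: iterate the vocabulary via index_to_key instead of vocab
for word, vector in zip(model.index_to_key, model.vectors):
    features.append(vector)
    data.append(word)
```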
With the hash tables in place, a set of similar candidates can be retrieved very quickly, and only that small candidate set needs exact cosine scoring.
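As a sanity check on that claim, here is a toy run on synthetic vectors (all sizes and helper names here are illustrative, not from the script): with 5 tables of 5-bit codes, each table has 32 buckets, so a lookup touches roughly 1/32 of the vocabulary per table and the union stays well below a full scan.

```python
import numpy as np

rng = np.random.default_rng(42)
vocab = rng.normal(size=(20000, 100))     # stand-in for 20k word vectors

def code(vec, planes):
    # Same signed-projection code that create_hashtables() builds.
    return ''.join('1' if vec @ p > 0 else '0' for p in planes)

tables = []                                # (planes, buckets) per table
for _ in range(5):
    planes = rng.uniform(-1, 1, (5, 100))
    buckets = {}
    for i, v in enumerate(vocab):
        buckets.setdefault(code(v, planes), []).append(i)
    tables.append((planes, buckets))

query = vocab[0]
candidates = set()
for planes, buckets in tables:
    candidates |= set(buckets.get(code(query, planes), []))
print('{} candidates out of {}'.format(len(candidates), len(vocab)))
```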