根据传统的TFIDF快速进行相似性匹配
一个比较规整的特征映射及相似数据查询模块,留着备用:
import gc
import time
from collections import defaultdict

import numpy as np
import tqdm
from gensim import corpora, models, similarities

from sentence import Sentence


class SentenceSimilarity():
    """Index a set of sentences and look up the ones most similar to a query.

    Typical use: call ``set_sentences``, then one of ``TfidfModel``,
    ``LsiModel`` or ``LdaModel`` to build ``self.model`` and ``self.index``,
    then ``similarity`` / ``similarity_k`` to query.
    """

    def __init__(self, seg):
        """Store the segmenter.

        Args:
            seg: Object handed unchanged to every ``Sentence`` this class
                creates (presumably a word segmenter; see the project-local
                ``sentence`` module).
        """
        self.seg = seg

    def set_sentences(self, sentences):
        """Wrap each raw sentence in a ``Sentence`` carrying its position.

        Args:
            sentences: Iterable of raw sentences; the position of each item
                is passed to ``Sentence`` as its third argument.
        """
        self.sentences = [
            Sentence(text, self.seg, position)
            for position, text in enumerate(sentences)
        ]
        self.sentences_num = len(self.sentences)

    def get_cuted_sentences(self):
        """Return the segmented form of every stored sentence, in order."""
        return [item.get_cuted_sentence() for item in self.sentences]

    def simple_model(self, min_frequency=1):
        """Build the dictionary and bag-of-words corpus the models start from.

        Args:
            min_frequency: A token is kept only if its total count over all
                sentences is strictly greater than this value, so the
                default of 1 drops tokens that occur exactly once.
        """
        self.texts = self.get_cuted_sentences()

        # Count occurrences of every token across the whole collection.
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1

        # Drop low-frequency tokens (strict comparison, see docstring).
        self.texts = [
            [token for token in text if frequency[token] > min_frequency]
            for text in self.texts
        ]

        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [
            self.dictionary.doc2bow(text) for text in self.texts
        ]

    def _build_index(self, model_factory):
        """Fit ``model_factory`` on the bag-of-words corpus and index the result.

        Sets ``self.model``, ``self.corpus`` (the corpus transformed by the
        model) and ``self.index`` (a ``MatrixSimilarity`` over that corpus).
        """
        self.simple_model()

        # Fit the transformation and apply it to the training corpus.
        self.model = model_factory(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity index queried by similarity()/similarity_k().
        self.index = similarities.MatrixSimilarity(self.corpus)

    def TfidfModel(self):
        """Build the TF-IDF model and its similarity index."""
        self._build_index(models.TfidfModel)

    def LsiModel(self):
        """Build the LSI model and its similarity index."""
        self._build_index(models.LsiModel)

    def LdaModel(self):
        """Build the LDA model and its similarity index."""
        self._build_index(models.LdaModel)

    def sentence2vec(self, sentence):
        """Convert a new (query) sentence into the current model's vector space.

        Args:
            sentence: Raw query sentence.

        Returns:
            ``self.model`` applied to the bag-of-words form of the sentence.
        """
        query = Sentence(sentence, self.seg)
        vec_bow = self.dictionary.doc2bow(query.get_cuted_sentence())
        return self.model[vec_bow]

    def bow2vec(self):
        """Expand every document of ``self.corpus`` into a dense vector.

        Returns:
            A list with one 1-D ``numpy`` array per document. Each array has
            length ``max(self.dictionary) + 1`` (zero when the dictionary is
            empty) and holds each pair's weight at the index given by the
            pair's first element; all other entries are 0.
        """
        # default=-1 keeps an empty dictionary from raising ValueError.
        length = max(self.dictionary, default=-1) + 1

        vec = []
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            # Scatter the sparse (index, weight) pairs into the dense vector.
            for feature_id, weight in content:
                sentence_vectors[feature_id] = weight
            vec.append(sentence_vectors)
        return vec

    def similarity(self, sentence):
        """Find the single stored sentence most similar to ``sentence``.

        Args:
            sentence: Raw query sentence.

        Returns:
            The matching ``Sentence`` object from ``self.sentences``, after
            its score has been set through ``set_score``.
        """
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]

        # max() keeps the first index among equal scores.
        index, score = max(enumerate(sims), key=lambda item: item[1])

        best = self.sentences[index]
        best.set_score(score)
        return best

    def similarity_k(self, sentence, k):
        """Find the ``k`` stored sentences most similar to ``sentence``.

        Prints the time spent on the index lookup together with the number
        of indexed sentences.

        Args:
            sentence: Raw query sentence.
            k: Number of results to keep.

        Returns:
            ``(indexs, scores)``: positions into ``self.sentences`` and their
            similarity scores, ordered from most to least similar.
        """
        sentence_vec = self.sentence2vec(sentence)

        started = time.perf_counter()
        sims = self.index[sentence_vec]
        elapsed_ms = (time.perf_counter() - started) * 1000

        # The message labels the value as milliseconds, so report it in ms.
        print('特征检索耗时:{:.4f}ms, 检索样本总数:{}'.format(elapsed_ms, self.sentences_num))

        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]

        indexs = [pair[0] for pair in sim_k]
        scores = [pair[1] for pair in sim_k]
        return indexs, scores
时刻记着自己要成为什么样的人!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)