NLP (23): Sentence Vectors with TF-IDF and Similarity Computation

I. Using gensim

1. Model class

import os
import jieba
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq

class TfIdf(object):
    """tf-idf模型计算相似度"""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()

        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")


    def del_stopwords(self, words):
        """删除一句话中的停用词"""
        word_list = []

        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """对多句话进行分词或分字"""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """训练模型"""
        #下面保存语料字典
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)

        # Build and save the TF-IDF model
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)

        # Build and save the similarity (retrieval) index
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Get the sentence's TF-IDF vector and query the similarity index built in train().
        dic = corpora.Dictionary.load(self.dic_path)
        word_bow = dic.doc2bow(self._seg_word([sentence])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")  # label and sentence are separated by two spaces
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")

        # Indices of the 30 most similar training sentences
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

        # The 30 highest similarity scores
        print(heapq.nlargest(30, score_list))



if __name__ == '__main__':
    TfIdf().main()
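The predict method above returns similarity scores directly; the underlying sentence vector gensim produces is a sparse list of (term_id, weight) pairs. If a dense vector is needed, it can be expanded with gensim.matutils.sparse2full. Below is a minimal sketch, assuming train has already been run; sentence_vector is a helper added here purely for illustration, not part of the original class.

from gensim import corpora, models, matutils

def sentence_vector(tfidf_obj, sentence):
    """Illustrative helper: return a dense TF-IDF vector for one sentence."""
    dic = corpora.Dictionary.load(tfidf_obj.dic_path)
    tfidf_model = models.TfidfModel.load(tfidf_obj.tfidf_model_path)
    bow = dic.doc2bow(tfidf_obj._seg_word([sentence])[0])
    sparse_vec = tfidf_model[bow]                      # list of (term_id, weight) pairs
    return matutils.sparse2full(sparse_vec, len(dic))  # dense vector of vocabulary length

# Usage, e.g.: sentence_vector(TfIdf(), "我有困难还不了")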

2. Utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection","data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """得到停用词列表"""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            if len(line) == 1:  # only single-character entries are kept as stop words
                stop_word_list.append(line)
    return stop_word_list

II. Using sklearn

import os
import jieba
import pickle
from root_path import root
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdf(object):
    """tf-idf模型计算相似度"""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")  # label and sentence are separated by two spaces
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        feature = tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        self.train()
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        print(self.predict(sentence))

if __name__ == '__main__':
    TfIdf().main()
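The sklearn version only returns the sentence vector; the similarity mentioned in the title can then be computed with cosine similarity on those vectors. A minimal sketch, assuming the TfIdf class above has been trained; the two example sentences are reused from the code in sections I and II.

from sklearn.metrics.pairwise import cosine_similarity

model = TfIdf()
model.train()  # fit and save the vectorizer first

vec_a = model.predict("我有困难还不了").reshape(1, -1)
vec_b = model.predict("是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。").reshape(1, -1)

# Cosine similarity between the two TF-IDF sentence vectors
print(cosine_similarity(vec_a, vec_b)[0][0])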

III. Note

vectorizer=CountVectorizer()
transformer=TfidfTransformer()
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
is equivalent to:
transformer=TfidfVectorizer()
tfidf2=transformer.fit_transform(corpus)

IV. Explanation

    In scikit-learn, there are two ways to perform TF-IDF preprocessing.

    The complete code for this section is available on GitHub: https://github.com/ljpzzz/machinelearning/blob/master/natural-language-processing/tf-idf.ipynb

    The first method is to vectorize the text with the CountVectorizer class and then call the TfidfTransformer class for TF-IDF preprocessing. The second method is to use TfidfVectorizer directly, completing vectorization and TF-IDF preprocessing in one step.

    Let us first look at the first method, the CountVectorizer + TfidfTransformer combination. The code is as follows:

from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  

corpus=["I come to China to travel", 
    "This is a car polupar in China",          
    "I love tea and Apple ",   
    "The work is to write some papers in science"] 

vectorizer=CountVectorizer()

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  
print(tfidf)

    The TF-IDF values of each word in each document are output as follows:

  (0, 4)	0.442462137895
  (0, 15)	0.697684463384
  (0, 3)	0.348842231692
  (0, 16)	0.442462137895
  (1, 3)	0.357455043342
  (1, 14)	0.453386397373
  (1, 6)	0.357455043342
  (1, 2)	0.453386397373
  (1, 9)	0.453386397373
  (1, 5)	0.357455043342
  (2, 7)	0.5
  (2, 12)	0.5
  (2, 0)	0.5
  (2, 1)	0.5
  (3, 15)	0.281131628441
  (3, 6)	0.281131628441
  (3, 5)	0.281131628441
  (3, 13)	0.356579823338
  (3, 17)	0.356579823338
  (3, 18)	0.356579823338
  (3, 11)	0.356579823338
  (3, 8)	0.356579823338
  (3, 10)	0.356579823338

    Now we use TfidfVectorizer to do everything in one step. The code is as follows:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)
print(re)

    The TF-IDF values output here are exactly the same as those from the first method; you can verify this yourself.
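    A minimal sketch of that check, assuming the tfidf and re matrices from the two snippets above are still in scope:

import numpy as np

# Both pipelines should yield the same TF-IDF matrix
print(np.allclose(tfidf.toarray(), re.toarray()))  # expected: True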

    Since the second method is more concise, it is recommended in practice: it completes vectorization, TF-IDF weighting and normalization in a single step.

    TF-IDF is a very common basic preprocessing step in text mining. However, if the preprocessing uses the hash trick, TF-IDF generally cannot be applied, because after hashing we can no longer obtain the IDF value of each hashed feature. Once TF-IDF weighting and normalization have been applied, the word feature vector of each text can be used as its feature for classification or clustering.
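    To illustrate that last point, here is a minimal sketch that clusters the TF-IDF vectors of the small English corpus above with KMeans; the choice of two clusters is an arbitrary assumption for the example.

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)  # the four-document corpus defined above

# Cluster the documents on their TF-IDF feature vectors
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10)
print(kmeans.fit_predict(X))  # cluster label for each document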

    Of course, TF-IDF is not only used in text mining; it is also applied in information retrieval and many other fields, so the idea behind this method is well worth understanding.
