NLP (23): Building Sentence Vectors with TF-IDF and Computing Similarity

I. Based on gensim

1. The model class

import os
import heapq
import jieba
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path

class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model (gensim)."""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.makedirs(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()

        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")


    def del_stopwords(self, words):
        """Remove stop words from a tokenized sentence."""
        word_list = []

        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Tokenize each sentence, either into words (jieba) or into single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """训练模型"""
        #下面保存语料字典
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)

        # Build the TF-IDF model
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)

        # Build the similarity (retrieval) index
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Turn the sentence into a TF-IDF vector and query the similarity index directly
        # (the index was built from word_list during training).
        dic = corpora.Dictionary.load(self.dic_path)
        words = sentence
        word_bow = dic.doc2bow(self._seg_word([words])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")

        # Indices of the 30 most similar training sentences, best match first
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

        # The corresponding similarity scores, best match first
        print(heapq.nlargest(30, score_list))



if __name__ == '__main__':
    TfIdf().main()

2. The utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection","data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """Load the stop-word list (only single-character entries are kept)."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Loading stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            if len(line) == 1:  # keep only single-character stop words
                stop_word_list.append(line)
    return stop_word_list

II. Based on sklearn

import os
import jieba
import pickle
from root_path import root
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdf(object):
    """tf-idf模型计算相似度"""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectorizer.fit(document)
        # Persist the fitted vectorizer
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        self.train()
        print(self.predict(sentence))

if __name__ == '__main__':
    TfIdf().main()
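
The class above only returns the sentence vector; unlike the gensim version, it never computes similarity scores. One possible follow-up is to score a query against the training corpus with cosine similarity. This is a minimal sketch, not part of the original class; the function name most_similar and the choice of cosine_similarity are my own assumptions:

import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(query, sentences, top_k=3):
    """Rank `sentences` by TF-IDF cosine similarity to `query` (sketch)."""
    docs = [" ".join(jieba.cut(s)) for s in sentences]
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(docs)                  # corpus vectors
    query_vec = vectorizer.transform([" ".join(jieba.cut(query))])
    scores = cosine_similarity(query_vec, train_matrix)[0]         # one score per sentence
    top = np.argsort(scores)[::-1][:top_k]
    return [(int(i), float(scores[i])) for i in top]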

III. Note

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

is equivalent to:

vectorizer2 = TfidfVectorizer()
tfidf2 = vectorizer2.fit_transform(corpus)

IV. Explanation

In scikit-learn there are two ways to carry out the TF-IDF preprocessing.

The full code is on my GitHub: https://github.com/ljpzzz/machinelearning/blob/master/natural-language-processing/tf-idf.ipynb

The first way is to vectorize the text with the CountVectorizer class and then run TfidfTransformer on the counts. The second way is to use TfidfVectorizer directly, which performs the vectorization and the TF-IDF preprocessing in a single step.

Let's look at the first way, the CountVectorizer + TfidfTransformer combination:

from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  

corpus=["I come to China to travel", 
    "This is a car polupar in China",          
    "I love tea and Apple ",   
    "The work is to write some papers in science"] 

vectorizer=CountVectorizer()

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  
print(tfidf)

The TF-IDF value of each word in each document is printed as follows:

  (0, 4)	0.442462137895
  (0, 15)	0.697684463384
  (0, 3)	0.348842231692
  (0, 16)	0.442462137895
  (1, 3)	0.357455043342
  (1, 14)	0.453386397373
  (1, 6)	0.357455043342
  (1, 2)	0.453386397373
  (1, 9)	0.453386397373
  (1, 5)	0.357455043342
  (2, 7)	0.5
  (2, 12)	0.5
  (2, 0)	0.5
  (2, 1)	0.5
  (3, 15)	0.281131628441
  (3, 6)	0.281131628441
  (3, 5)	0.281131628441
  (3, 13)	0.356579823338
  (3, 17)	0.356579823338
  (3, 18)	0.356579823338
  (3, 11)	0.356579823338
  (3, 8)	0.356579823338
  (3, 10)	0.356579823338
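
These numbers can be reproduced by hand. The following is a minimal sketch that assumes scikit-learn's default settings (smooth_idf=True, norm='l2'), under which idf(t) = ln((1 + n) / (1 + df(t))) + 1 and each document vector is L2-normalized afterwards; it checks document 0, "I come to China to travel" (the token "I" is dropped because the default tokenizer ignores single-character tokens):

import numpy as np

n = 4                                                # number of documents in the corpus
tf = {"come": 1, "to": 2, "china": 1, "travel": 1}   # term counts in document 0
df = {"come": 1, "to": 2, "china": 2, "travel": 1}   # document frequencies over the corpus

idf = {t: np.log((1 + n) / (1 + df[t])) + 1 for t in tf}   # smoothed idf
raw = {t: tf[t] * idf[t] for t in tf}                      # unnormalized tf-idf
norm = np.sqrt(sum(v ** 2 for v in raw.values()))          # L2 norm of the row
print({t: round(v / norm, 4) for t, v in raw.items()})
# {'come': 0.4425, 'to': 0.6977, 'china': 0.3488, 'travel': 0.4425}

These values match the four (0, x) entries printed above.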

Now let's do it in one step with TfidfVectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)
print(re)

The TF-IDF values printed for each word in each document are exactly the same as those from the first method. You can verify this yourself.
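
To check this programmatically, a quick sketch that reuses the tfidf and re matrices from the two snippets above (both vectorizers sort their vocabulary alphabetically, so the columns line up):

import numpy as np
print(np.allclose(tfidf.toarray(), re.toarray()))  # expected: True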

Since the second method is more concise, it is the one recommended in practice: it performs vectorization, TF-IDF weighting and normalization in a single step.

TF-IDF is a very common basic preprocessing step in text mining. Note, however, that if the preprocessing uses the hash trick, TF-IDF generally cannot be applied anymore, because after hashing we can no longer obtain the IDF value of each feature. Once TF-IDF and normalization have been applied, each document's term feature vector can be used as the document's features for classification or clustering analysis.
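
As a minimal illustration of the clustering case, here is a sketch that runs k-means on the TF-IDF matrix of the toy corpus above; the choice of two clusters is arbitrary and only meant to show the plumbing:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["I come to China to travel",
          "This is a car polupar in China",
          "I love tea and Apple ",
          "The work is to write some papers in science"]

X = TfidfVectorizer().fit_transform(corpus)            # TF-IDF feature matrix
km = KMeans(n_clusters=2, n_init=10, random_state=0)   # 2 clusters, arbitrary choice
print(km.fit_predict(X))                               # cluster id for each document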

Of course, TF-IDF is not limited to text mining; it is also used in information retrieval and many other areas, so the idea behind it is well worth understanding.
