Text Classification with LDA Topic Models and SVM
I extracted text features with an LDA topic model and classified them with a linear SVM; the results turned out quite poor, with a macro-averaged F1 of 0.654:
Precision:0.680,Recall:0.649,F1:0.654
RandomForestClassifier did not fare much better:
Precision:0.680,Recall:0.668,F1:0.670
By contrast, almost any off-the-shelf deep learning model (textCNN, LSTM+Attention) reaches an F1 of 0.95+ on this task, with no feature engineering and no word segmentation required.
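For reference, here is a minimal textCNN sketch of the kind I mean, assuming TensorFlow/Keras; the vocabulary size, sequence length, and other hyperparameters are illustrative placeholders, not the exact configuration behind the 0.95+ number.

from tensorflow.keras import layers, Model

def build_text_cnn(vocab_size=50000, seq_len=256, embed_dim=128, num_classes=5):
    # Placeholder hyperparameters: tune for the actual corpus
    inputs = layers.Input(shape=(seq_len,))
    x = layers.Embedding(vocab_size, embed_dim)(inputs)
    # Parallel convolutions with different kernel sizes act as n-gram detectors
    convs = []
    for k in (3, 4, 5):
        c = layers.Conv1D(128, k, activation='relu')(x)
        convs.append(layers.GlobalMaxPooling1D()(c))
    x = layers.Concatenate()(convs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

Because the convolutions learn their own n-gram features directly from embedded token IDs, there is no separate feature-extraction step like the CountVectorizer-plus-LDA pipeline below.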
Here is the pipeline in detail. To extract LDA features, the texts first have to be vectorized with CountVectorizer, which in turn requires word segmentation. Since the sample is fairly large (the Sohu news dataset: 5 categories × 3,000 articles each), I used multiprocessing (a ProcessPoolExecutor process pool) to speed up jieba segmentation.
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
from utils import log
from tqdm import tqdm
import time
import pickle as pk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


def transform_text(text, stopwords):
    # Segment one article with jieba, dropping stopwords and whitespace tokens
    words = [w for w in jieba.cut(text) if w.strip() and (w not in stopwords)]
    return ','.join(words)


def cut_texts(lock, texts, stopwords, processName, doc_list):
    # Process-plus-lock variant of multiprocess segmentation (unused here);
    # doc_list should be a shared list, e.g. from multiprocessing.Manager
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        docs.append(transform_text(text, stopwords))
    lock.acquire()
    doc_list.extend(docs)
    lock.release()


def cut_texts_pool(texts, stopwords, processName):
    # Segmentation worker used with the ProcessPoolExecutor pool
    log('Process {} is cutting texts...'.format(processName))
    docs = []
    for text in tqdm(texts):
        docs.append(transform_text(text, stopwords))
    log('Process {} finished cutting.'.format(processName))
    return docs


def hard_work(processName):
    # Test helper that simulates a slow task
    log('Process {} is running...'.format(processName))
    time.sleep(2)
    log('Process {} finished.'.format(processName))
    return processName


def mp_pool_test():
    # Smoke test for the process pool
    n_process = multiprocessing.cpu_count()
    pool = ProcessPoolExecutor()
    fs = [pool.submit(hard_work, i) for i in range(n_process)]
    names = []
    for f in as_completed(fs):
        names.append(f.result())
    log(names)


def partition(iterable_, n_partition):
    # Split the texts into n_partition roughly equal chunks
    assert isinstance(n_partition, int) and n_partition > 0, 'Invalid value for "n_partition"'
    temp = list(iterable_)
    total = len(temp)
    assert total > n_partition, 'Size of iterable is less than "n_partition"'
    partition_size = total // n_partition
    res = []
    for i in range(n_partition - 1):
        res.append(temp[partition_size * i:partition_size * (i + 1)])
    # The last chunk takes whatever remains
    res.append(temp[partition_size * (n_partition - 1):])
    return res


def mp_cut_pool(texts):
    # Create one worker per CPU core
    n_process = multiprocessing.cpu_count()
    text_chunks = partition(texts, n_process)
    # Multiprocess segmentation via a process pool
    with ProcessPoolExecutor(max_workers=n_process) as pool:
        # submit() schedules a task: the first argument is the target function,
        # the rest are its arguments; each call returns a Future
        fs = [pool.submit(cut_texts_pool, text_chunks[i], [], i)
              for i in range(n_process)]
        docs = []
        # Iterate in submission order (not as_completed) so the segmented
        # chunks come back aligned with the original text order
        for f in fs:
            # result() blocks until the worker finishes and returns its value
            docs.extend(f.result())
    return docs


class LDA_Transformer:
    def __init__(self, n_features):
        self.n_features = n_features

    def fit(self, texts):
        log('Building CountVectorizer with texts...')
        ct = CountVectorizer()
        self.count_vectorizer = ct
        if isinstance(texts, list):
            log('Len of texts:{}'.format(len(texts)))
        else:
            log('Shape of texts:{}'.format(texts.shape))
        ctv = ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer...')
        # n_components is the number of LDA topics; the topic distribution
        # plays a role similar to a word-embedding dimension
        lda = LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model = lda

    def transform(self, texts):
        count_vec = self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)


def build_data():
    df = pd.read_excel('data/souhu_news_400_500.xlsx')
    log(df.columns)
    texts = list(df['content'])  # the raw article text
    docs = mp_cut_pool(texts)
    lda_transformer = LDA_Transformer(64)
    lda_transformer.fit(docs)
    # Save the fitted LDA transformer to disk
    with open('output/lda_transformer.pkl', 'wb') as f:
        pk.dump(lda_transformer, f)
    # Keep the segmented docs aligned with their rows, then shuffle
    df['doc'] = docs
    indices = list(range(df.shape[0]))
    np.random.shuffle(indices)
    df = df.iloc[indices]
    dic = {topic: i for i, topic in enumerate(list(df['topic'].unique()))}
    y = [dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl', 'wb') as f:
        pk.dump(y, f)
    # Transform the segmented docs (not the raw text), so the tokenization
    # matches what CountVectorizer saw during fit
    X = lda_transformer.transform(list(df['doc']))
    with open('data/X_lda.pkl', 'wb') as f:
        pk.dump(X, f)
    log('Training data is saved.')


def load_train_data():
    with open('data/X_lda.pkl', 'rb') as f:
        X = pk.load(f)
    with open('data/y_lda.pkl', 'rb') as f:
        y = pk.load(f)
    return train_test_split(X, y, test_size=0.2)


def main():
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train, X_test, y_train, y_test = load_train_data()
    log('Training classifier...')
    # model = LinearSVC()
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    log('Evaluating model...')
    acc = model.score(X_test, y_test)
    log('Accuracy:{}'.format(acc))
    y_pred = model.predict(X_test)
    p = precision_score(y_test, y_pred, average='macro')
    r = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p, r, f1))


if __name__ == '__main__':
    main()
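As a usage note, here is a hedged sketch of loading the saved transformer at inference time; it assumes the LDA_Transformer class is importable when unpickling, and the new text must go through the same jieba preprocessing as training.

import jieba
import pickle as pk

# Load the pickled LDA transformer saved by build_data()
with open('output/lda_transformer.pkl', 'rb') as f:
    lda_transformer = pk.load(f)

text = '...'  # a raw news article (placeholder)
# Apply the same segmentation used at training time
doc = ','.join(w for w in jieba.cut(text) if w.strip())
features = lda_transformer.transform([doc])  # shape (1, 64): topic distribution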