朴素贝叶斯分类器基本代码 && n折交叉优化 2
这个代码基于上一个代码
不同的是:读取了txt文件,改变了min_df与max_df的参数
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.naive_bayes import MultinomialNB as MNB  # multinomial naive Bayes

# Silence every warning for the rest of the run.
warnings.filterwarnings("ignore")

# Anything that is not an ASCII letter is treated as a word separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def proces(col2):
    """Return the lower-cased alphabetic words found in ``col2``.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the
    result is lower-cased and then split on whitespace.

    Args:
        col2: Raw text of one sample.

    Returns:
        A list of lower-case word strings. Non-string input yields an
        empty list.
    """
    # pandas represents an empty text field as NaN (a float); treat such a
    # row as a document with no words instead of failing inside the regex.
    if not isinstance(col2, str):
        return []
    return _NON_LETTERS.sub(" ", col2).lower().split()


# Column 0 holds the textual label, column 1 the raw text.
class_mapping = {'Negative': 0, 'Positive': 1}

train = pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
print(train.head(5))
# Labels missing from class_mapping become NaN after map().
train_labers = train[0].map(class_mapping)
train_texts = train[1]

test = pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
test_labers = test[0].map(class_mapping)
test_texts = test[1]

# Normalise each document to a space-joined string of cleaned words.
train_data = [' '.join(proces(text)) for text in train_texts]
test_data = [' '.join(proces(text)) for text in test_texts]

# NOTE(review): the vectorizer is fitted on train and test text together, so
# the vocabulary and IDF weights also reflect the test documents (and every
# cross-validation fold). Kept as in the original; confirm this is intended.
data_all = train_data + test_data

count_vec = TfidfVectorizer(
    min_df=1,
    # An integer max_df is an absolute count: terms that occur in more than
    # 60 documents are dropped.
    max_df=60,
    analyzer='word',
    ngram_range=(1, 2),
    # These three options are booleans; recent scikit-learn releases reject
    # the integer 1 that was passed here before.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

length = len(train_data)
count_vec.fit(data_all)
data_all = count_vec.transform(data_all)

# Split the combined matrix back into its train and test rows.
train_data = data_all[:length]
test_data = data_all[length:]

model = MNB()
# model = BNB()
model.fit(train_data, train_labers)
# pred = model.predict(test_data)
# print("roc_auc", roc_auc_score(test_labers, pred))

# Disabled fold-count sweep, kept from the original for reference:
#
# MX = 0.7996632996632996
# MX_idx = 5
# for i in range(400, 500):
#     score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
#     if MX < score:
#         MX = score
#         MX_idx = i
# print("roc_auc", MX, MX_idx)

# Mean ROC-AUC over a 297-fold cross-validation on the training rows.
print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
化繁为简 大巧不工