"""SMS spam classification: TF-IDF features + multinomial naive Bayes.

Pipeline: load a tab-separated ``label<TAB>message`` file, split it 70/30
(stratified), vectorize with TF-IDF (unigrams and bigrams), fit
``MultinomialNB``, print evaluation metrics, then print the features with the
lowest and highest log-probabilities.

Outputs captured from an earlier run are kept as ``# recorded run`` comments;
they are not produced by the code that precedes them verbatim and may differ
across scikit-learn versions.
"""
import csv


def pre(text):
    """Return *text* unchanged.

    Placeholder used when NLTK is not available: real preprocessing can be
    plugged in here without touching the rest of the pipeline.
    """
    return text


def load_sms(file_path, encoding="utf-8"):
    """Read a tab-separated SMS file into parallel lists.

    Each usable row has the label in column 0 and the message in column 1;
    the message is passed through :func:`pre`.

    Args:
        file_path: Path of the tab-separated text file.
        encoding: Text encoding used to decode the file.

    Returns:
        ``(messages, labels)`` -- two lists of equal length.
    """
    messages = []
    labels = []
    # The csv module documentation asks for newline="" on files handed to
    # csv.reader; the context manager closes the file even if parsing fails.
    # NOTE(review): default csv quoting is kept as in the original; if some
    # messages start with an unbalanced double quote, consider
    # quoting=csv.QUOTE_NONE -- confirm against the data file.
    with open(file_path, "r", encoding=encoding, newline="") as sms:
        for line in csv.reader(sms, delimiter="\t"):
            if len(line) < 2:
                # Blank or malformed row: nothing to classify.
                continue
            labels.append(line[0])
            messages.append(pre(line[1]))
    return messages, labels


def feature_names_of(vectorizer):
    """Return the vectorizer's vocabulary as a list of strings.

    Uses ``get_feature_names_out()`` when the vectorizer provides it and
    falls back to the older ``get_feature_names()`` otherwise.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def legacy_coef_intercept(model):
    """Rebuild what ``MultinomialNB.coef_`` / ``intercept_`` used to expose.

    Those attributes mirrored ``feature_log_prob_`` (log P(x_i | y)) and
    ``class_log_prior_`` (log P(y)); for a two-class model only the row of
    the second class was returned.

    Returns:
        ``(coef, intercept)`` with the same slicing as the legacy attributes.
    """
    log_probs = model.feature_log_prob_
    log_prior = model.class_log_prior_
    if len(model.classes_) == 2:
        return log_probs[1:], log_prior[1:]
    return log_probs, log_prior


def rank_features(log_probs, feature_names, n=10):
    """Pair the *n* lowest-scoring features with the *n* highest-scoring ones.

    Args:
        log_probs: One score per feature.
        feature_names: Feature names aligned with *log_probs*.
        n: Number of pairs to return.

    Returns:
        A list of ``((low_score, low_name), (high_score, high_name))``;
        the low side ascends from the minimum, the high side descends from
        the maximum. Ties on score are ordered by feature name.
    """
    ranked = sorted(zip(log_probs, feature_names))
    return list(zip(ranked[:n], ranked[:-(n + 1):-1]))


if __name__ == "__main__":
    # scikit-learn is imported here so the helpers above stay importable
    # without it.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # Read the data.
    file_path = r'd:/SMSSpamCollectionjsn.txt'
    sms_data, sms_label = load_sms(file_path)

    # Training and test sets: 70/30 split, stratified on the label.
    x_train, x_test, y_train, y_test = train_test_split(
        sms_data, sms_label, test_size=0.3, random_state=0, stratify=sms_label)
    print(len(sms_data), len(x_train), len(x_test))

    # Vectorize: turn each message into a TF-IDF vector over unigrams and
    # bigrams. The vocabulary is learned from the training set only.
    vectorizer = TfidfVectorizer(
        min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode')
    x_train = vectorizer.fit_transform(x_train)
    # Read the shape from the sparse matrix directly instead of densifying it.
    print(x_train.shape)
    # recorded run: (3898, 6649)
    x_test = vectorizer.transform(x_test)

    # Naive Bayes classifier.
    result = MultinomialNB().fit(x_train, y_train)
    y_pred = result.predict(x_test)

    # Evaluate the predictions: confusion matrix plus precision, recall and
    # F-score per class.
    print(y_pred.shape, y_pred)
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print('nb_classification_report:')
    cr = classification_report(y_test, y_pred)
    print(cr)
    # recorded run:
    # (1671,) ['ham' 'ham' 'ham' ... 'ham' 'spam' 'ham']
    # nb_confusion_matrix:
    # [[1447    0]
    #  [  48  176]]
    # nb_classification_report:
    #              precision    recall  f1-score   support
    #         ham       0.97      1.00      0.98      1447
    #        spam       1.00      0.79      0.88       224
    # avg / total       0.97      0.97      0.97      1671

    # Feature ranking.
    feature_names = feature_names_of(vectorizer)  # vocabulary kept by the vectorizer
    # xgailv: log P(x_i | y) (a conditional log-probability, not a prior);
    # intercept: log P(y).
    xgailv, intercept = legacy_coef_intercept(result)
    n = 10
    # Lowest n and highest n log-probabilities, side by side.
    top = rank_features(xgailv[0], feature_names, n)
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))
    # recorded run:
    #   -9.1053  10 smth          -6.1149  free
    #   -9.1053  15               -6.3421  txt
    #   -9.1053  2go              -6.4948  mobile
    #   -9.1053  2gthr            -6.5769  text
    #   -9.1053  2gthr drinking   -6.5780  claim
    #   -9.1053  2marrow          -6.6015  stop
    #   -9.1053  2morrow          -6.6108  ur
    #   -9.1053  2mrw             -6.6352  reply
    #   -9.1053  2mrw luv         -6.7198  www
    #   -9.1053  2nd ur           -6.7481  prize