# 大作业 (major assignment)
# Load the news corpus: walk the data directory, tokenize every article with
# jieba, drop stop words, and collect labels and token strings in a DataFrame.
import codecs
import os

import jieba
import pandas

# Root directory of the corpus.
# NOTE(review): presumably one sub-directory per news category -- confirm.
path = r'G:\英雄时刻\258'

# Stop-word list, read as one entry per line of the file.
with open(r'G:\英雄时刻\stopsCN.txt', encoding='utf-8') as f:
    stopwords = f.read().split('\n')
# Set copy for O(1) membership tests in the token filter below; `stopwords`
# itself stays a list so any other code that indexes it keeps working.
stopword_set = set(stopwords)

filePaths = []      # full path of every file that was read
fileContents = []   # space-joined tokens of every file, stop words removed
fileClasses = []    # label of every file (name of its containing directory)

# Walk the tree, decode each file as UTF-8, and tokenize it.
for root, dirs, files in os.walk(path):
    for name in files:
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        # Label = the directory that directly contains the file. The previous
        # `filePath.split('\\')[2]` picked a fixed component of the corpus
        # root ('258' for the path above), giving every file the same label.
        fileClasses.append(os.path.basename(os.path.dirname(filePath)))
        # `with` guarantees the file is closed even if decoding fails.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            fileContent = f.read()
        fileContent = fileContent.replace('\n', '')
        tokens = " ".join(
            token for token in jieba.cut(fileContent)
            if token not in stopword_set
        )
        fileContents.append(tokens)

all_datas = pandas.DataFrame({
    'fileClass': fileClasses,
    'fileContent': fileContents
})
print(all_datas)
# Predict Boston house prices with a linear and a degree-2 polynomial
# regression model, and print error metrics for both.
#
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2; this script needs an older scikit-learn or a replacement data
# source. It is kept here so the data being modelled does not change.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# Public metric functions. The previous `from sklearn.metrics import
# regression` imported a private module that newer scikit-learn releases no
# longer expose.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

data = load_boston()

# Split into training and test sets at a 7:3 ratio.
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3
)
print(x_train.shape, y_train.shape)

# Multiple linear regression between the 13 variables and the house price.
# Model form: y = b * x + a
mlr = LinearRegression()
mlr.fit(x_train, y_train)
print('系数b',mlr.coef_,"\n截距a",mlr.intercept_)

# Evaluate the linear model on the held-out test set.
y_predict = mlr.predict(x_test)
print('线性回归模型判断指数')
print("预测的均方误差:",mean_squared_error(y_test,y_predict))
print("预测的平均绝对误差:",mean_absolute_error(y_test,y_predict))
# Model score as returned by LinearRegression.score.
print("模型的分数:",mlr.score(x_test,y_test))

# Polynomial regression between the 13 variables and the house price.
# Model form: y = a0 + a1 * x + a2 * (x**2) + ... + an * (x**n) + e
# The feature expansion is fitted on the training set only and then applied
# unchanged to the test set.
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

mlrp = LinearRegression()
mlrp.fit(x_poly_train, y_train)

# Evaluate the polynomial model on the held-out test set.
y_predict2 = mlrp.predict(x_poly_test)
print("多项式回归模型判断指数")
print("预测的均方误差:",mean_squared_error(y_test,y_predict2))
print("预测平均绝对误差:",mean_absolute_error(y_test,y_predict2))
# Model score as returned by LinearRegression.score.
print("模型的分数:",mlrp.score(x_poly_test,y_test))
# Classify the news corpus with TF-IDF features and a multinomial naive Bayes
# model, print evaluation figures, and draw a bar chart comparing the actual
# and predicted number of articles per class.
import collections
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB

# Read every file under the corpus root.
all_txt = []
all_target = []
path = r'G:\英雄时刻\258'
for root, dirs, files in os.walk(path):
    for file in files:
        filepath = os.path.join(root, file)
        # `with` closes the handle; the previous bare open(...).read() leaked it.
        with open(filepath, 'r', encoding='utf-8') as fh:
            tokens = fh.read()
        # NOTE(review): `processing` is not defined in this section; it is
        # presumably a text-cleaning helper defined elsewhere -- confirm.
        tokens = processing(tokens)
        all_txt.append(tokens)
        # The label is the name of the folder that contains the file.
        target = filepath.split('\\')[-2]
        all_target.append(target)

# Split 0.7 : 0.3 into training and test sets, stratified by label.
x_train, x_test, y_train, y_test = train_test_split(
    all_txt, all_target, test_size=0.3, stratify=all_target
)

# Vectorize: fit TF-IDF on the training texts only, then apply it to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

# Train the classifier and predict the test set.
mnb = MultinomialNB()
clf = mnb.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# Print the accuracy figure.
# NOTE(review): this cross-validates the estimator on the *test* split rather
# than scoring the classifier fitted above -- confirm this is intended.
scores = cross_val_score(mnb, X_test, y_test, cv=4)
print("Accuracy:%.3f"%scores.mean())
# Print the evaluation report. classification_report expects
# (y_true, y_pred); the previous call passed them swapped, which exchanges
# precision and recall in the report.
print("classification_report:\n",classification_report(y_test, y_predict))

# Compare predictions with the actual labels.
mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font for the CJK labels
mpl.rcParams['axes.unicode_minus'] = False  # avoid '-' rendering as a box in saved figures

# Count the articles per class in the test labels and in the predictions.
testCount = collections.Counter(y_test)
predCount = collections.Counter(y_predict)
print('实际:',testCount,'\n', '预测', predCount)

# Label list, actual counts, and predicted counts. The predicted counts are
# looked up by label so they line up with nameList; Counter returns 0 for a
# class that was never predicted. Taking predCount.values() directly (as
# before) follows predCount's own insertion order, which mislabels the bars
# and breaks plt.bar when the lengths differ.
nameList = list(testCount.keys())
testList = list(testCount.values())
predictList = [predCount[name] for name in nameList]
x = list(range(len(nameList)))
print("新闻类别:",nameList,'\n',"实际:",testList,'\n',"预测:",predictList)

# Draw the grouped bar chart.
plt.figure(figsize=(7,5))
total_width, n = 0.6, 2
width = total_width / n
plt.bar(x, testList, width=width, label='实际', fc='r')
# Shift the second series right by one bar width so the pairs sit side by side.
x = [pos + width for pos in x]
plt.bar(x, predictList, width=width, label='预测', tick_label=nameList, fc='y')
plt.grid()
plt.title('实际和预测对比图',fontsize=17)
plt.xlabel('新闻类别',fontsize=17)
plt.ylabel('频数',fontsize=17)
plt.legend(fontsize =17)
plt.tick_params(labelsize=15)
plt.show()