大作业
一、Boston房价预测
# Boston house-price prediction: fit a multiple linear regression and a
# degree-2 polynomial regression on the same train/test split and print the
# same error metrics for both so they can be compared.
#
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this script requires scikit-learn < 1.2 to run as written.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# Import the metric functions from the public sklearn.metrics namespace; the
# `sklearn.metrics.regression` submodule is private and is no longer
# importable in newer scikit-learn releases.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Boston housing data
data = load_boston()

# Split the data set: 70% for training, 30% held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3
)

# ---- Multiple linear regression model ----
mlr = LinearRegression()
mlr.fit(x_train, y_train)
w = mlr.coef_
b = mlr.intercept_
print("系数", w, "\n截距", b)

# Evaluate the model on the held-out split.
y_predict = mlr.predict(x_test)

# Prediction metrics
print("预测的均方误差:", mean_squared_error(y_test, y_predict))
print("预测的平均绝对误差:", mean_absolute_error(y_test, y_predict))
# Model score as returned by LinearRegression.score on the test split
print("模型的分数:", mlr.score(x_test, y_test))

# ---- Multiple polynomial regression model ----
# Expand the features to degree-2 polynomial terms. The transformer is fitted
# on the training split only and then applied unchanged to the test split.
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

# Build the model on the expanded features.
mlrp = LinearRegression()
mlrp.fit(x_poly_train, y_train)

# Predict
y_predict2 = mlrp.predict(x_poly_test)

# Prediction metrics (label made consistent with the linear model's output:
# the value printed is the mean squared error).
print("预测的均方误差:", mean_squared_error(y_test, y_predict2))
print("预测的平均绝对误差:", mean_absolute_error(y_test, y_predict2))
# Model score on the test split
print("模型的分数:", mlrp.score(x_poly_test, y_test))
多元线性回归模型结果:
多元多项式回归模型结果:
二、中文文本分类
# Chinese news text classification: read one text file per article from a
# directory tree (the containing directory name is the class label), segment
# the text with jieba, vectorise with TF-IDF and classify with multinomial
# naive Bayes.
import os

import jieba
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

path = 'C:\\Users\\Administrator\\Desktop\\中文文本分类\\0369data'

# Load the stop-word list, one word per line. A set gives O(1) membership
# tests inside processing(); entries are stripped so stray whitespace or
# carriage returns in the file cannot prevent a match.
with open(r'C:\Users\Administrator\Desktop\中文文本分类\stopsCN.txt', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
stopwords.discard('')


def processing(tokens):
    """Clean and segment one raw document.

    Args:
        tokens: The raw text of a document (a string).

    Returns:
        A single string of the kept words separated by one space.

    Steps: keep only characters for which ``str.isalpha()`` is true, segment
    the result with jieba in full mode (``cut_all=True``), keep words of
    length >= 2, and drop every word found in the module-level ``stopwords``.
    """
    # Drop every character that is not a letter / Chinese character.
    text = "".join(char for char in tokens if char.isalpha())
    # jieba segmentation, keeping only words of length >= 2.
    words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
    # Remove stop words and join into one whitespace-separated string.
    return " ".join(word for word in words if word not in stopwords)


contentList = []   # processed text of every document
classifyList = []  # news category label of every document

# Walk the data directory once, reading and processing each file. (The
# original walked the tree twice; the first pass read every file and threw
# the content away.)
for root, dirs, files in os.walk(path):
    # The category label is the name of the directory that holds the file.
    # os.path.basename is separator-agnostic, unlike splitting on "\\".
    classify = os.path.basename(root)
    for file_name in files:
        fp = os.path.join(root, file_name)
        with open(fp, encoding='utf-8') as f:
            content = f.read()
        classifyList.append(classify)
        contentList.append(processing(content))

datas = pandas.DataFrame({
    'classifyList': classifyList,
    'contentList': contentList
})
print(datas)

# Split into training and test sets, stratified by category.
x_train, x_test, y_train, y_test = train_test_split(
    contentList, classifyList, test_size=0.3, stratify=classifyList
)

# Convert the text to TF-IDF feature vectors. The vectoriser is fitted on the
# training split only and then applied unchanged to the test split.
tfv = TfidfVectorizer()
X_train = tfv.fit_transform(x_train)
X_test = tfv.transform(x_test)

# Multinomial naive Bayes model (original note: used for discrete data).
mnb = MultinomialNB()
module = mnb.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = module.predict(X_test)

# 5-fold cross-validation on the training split. The original cross-validated
# on the test split, which refits fresh models on test data and says nothing
# about the model fitted above.
# NOTE(review): the TF-IDF vocabulary/IDF was fitted on the whole training
# split, so folds are not fully independent; wrap vectoriser + classifier in a
# Pipeline if a stricter estimate is needed.
scores = cross_val_score(mnb, X_train, y_train, cv=5)
print("Accuracy:%.3f" % scores.mean())

# classification_report expects (y_true, y_pred); the original passed them in
# the opposite order, which swaps precision and recall in the report.
print("classification_report:\n", classification_report(y_test, y_pred))