大作业

一、Boston房价预测

 # Boston house-price prediction: fit a multiple linear regression and a
# degree-2 polynomial regression on the same split and print their metrics.
#
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this script needs an older scikit-learn or a different data source.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# The metric functions are imported from the public sklearn.metrics namespace;
# the private sklearn.metrics.regression module is not importable on current
# scikit-learn releases.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Boston house-price data.
data = load_boston()

# Split the data set, holding out 30% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3
)

# ---- Multiple linear regression ----
mlr = LinearRegression()
mlr.fit(x_train, y_train)
w = mlr.coef_
b = mlr.intercept_
print("系数", w, "\n截距", b)

# Evaluate on the held-out split.
y_predict = mlr.predict(x_test)
print("预测的均方误差:", mean_squared_error(y_test, y_predict))
print("预测的平均绝对误差:", mean_absolute_error(y_test, y_predict))
# score() of a LinearRegression is the R^2 coefficient of determination.
print("模型的分数:", mlr.score(x_test, y_test))

# ---- Multiple polynomial regression ----
# Expand the features to degree 2; the expansion is fitted on the training
# split and then applied unchanged to the test split.
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

mlrp = LinearRegression()
mlrp.fit(x_poly_train, y_train)
y_predict2 = mlrp.predict(x_poly_test)

# Same metrics as above; the MSE label now matches the one used for the
# linear model.
print("预测的均方误差:", mean_squared_error(y_test, y_predict2))
print("预测的平均绝对误差:", mean_absolute_error(y_test, y_predict2))
print("模型的分数:", mlrp.score(x_poly_test, y_test))

 

多元线性回归模型结果:

多元多项式回归模型结果:

 

二、中文文本分类

 # Root directory of the news corpus that is walked further down.
path ='C:\\Users\\Administrator\\Desktop\\中文文本分类\\0369data'

import os

# The corpus is not read here: an earlier version walked `path` and read every
# file only to discard the text, doubling the disk I/O of the loading loop
# below, which performs the same walk and keeps the processed content.

# Load the stop-word list (one word per line). It is stored as a set so that
# the per-token `not in stopwords` test in processing() is a hash lookup
# instead of a scan over the whole list.
with open(r'C:\Users\Administrator\Desktop\中文文本分类\stopsCN.txt', encoding='utf-8') as stop_file:
    stopwords = set(stop_file.read().split('\n'))
12 import jieba
def processing(tokens):
    """Clean one raw document and return its kept words joined by spaces.

    Steps:
      1. Drop every character for which ``str.isalpha()`` is false.
      2. Segment the remaining text with ``jieba.cut(..., cut_all=True)``.
      3. Keep only segments of length >= 2 that are not in ``stopwords``.

    Args:
        tokens: Raw text of a document.

    Returns:
        A single string with the surviving words separated by one space.
    """
    letters_only = "".join(filter(str.isalpha, tokens))
    kept = []
    for word in jieba.cut(letters_only, cut_all=True):
        # Short fragments and stop words are both discarded.
        if len(word) >= 2 and word not in stopwords:
            kept.append(word)
    return " ".join(kept)
18 
contentList = []   # processed text of every article
classifyList = []  # category label of every article, aligned with contentList

# Walk the corpus directory, open every file and store its processed text
# together with its category label.
for root, dirs, files in os.walk(path):
    for filename in files:
        fp = os.path.join(root, filename)
        # A separate name is used for the handle so it does not shadow the
        # filename loop variable.
        with open(fp, encoding='utf-8') as fh:
            content = fh.read()
        # The label is the name of the directory that directly contains the
        # file. Taking it through os.path works with any path separator,
        # whereas splitting on "\\" only worked on Windows-style paths.
        classify = os.path.basename(os.path.dirname(fp))
        classifyList.append(classify)
        contentList.append(processing(content))
31 
32 import pandas
# Tabulate the labels next to the processed texts (one row per article) and
# print the table for inspection.
datas = pandas.DataFrame(
    dict(classifyList=classifyList, contentList=contentList)
)
print(datas)
38 
39 
40 from sklearn.model_selection import train_test_split
41 from sklearn.feature_extraction.text import TfidfVectorizer
42 from sklearn.naive_bayes import GaussianNB,MultinomialNB
43 from sklearn.model_selection import cross_val_score
44 from sklearn.metrics import classification_report
45 
# Split into training and test sets (30% test), stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(
    contentList, classifyList, test_size=0.3, stratify=classifyList
)

# Convert the texts to TF-IDF feature vectors. The vectorizer is fitted on the
# training texts only and then applied unchanged to the test texts.
tfv = TfidfVectorizer()
X_train = tfv.fit_transform(x_train)
X_test = tfv.transform(x_test)

# Multinomial naive Bayes model.
mnb = MultinomialNB()
module = mnb.fit(X_train, y_train)

# Predict the held-out test set.
y_pred = module.predict(X_test)

# 5-fold cross-validated accuracy, computed on the training split. The earlier
# version cross-validated on the test split, which never used the fitted model
# and trained fresh models on test data.
scores = cross_val_score(mnb, X_train, y_train, cv=5)
print("Accuracy:%.3f" % scores.mean())

# classification_report expects (y_true, y_pred); with the arguments reversed
# the reported precision and recall are swapped.
print("classification_report:\n", classification_report(y_test, y_pred))

 

 

posted @ 2018-12-17 14:17  safufu  阅读(349)  评论(0编辑  收藏  举报