期末大作业，最后一周上课检查(更新)


一、boston房价预测
#1.读取数据
from sklearn.datasets import load_boston   #导入房价数据集
boston=load_boston() 
boston.data               #读取房价数据
boston.target
boston.data.shape


#2.训练集与测试集划分
from sklearn.model_selection import train_test_split   #导入训练集和测试集包
x_train,x_test,y_train,y_test=train_test_split(boston.data,boston.target,test_size=0.3)   #划分训练集与测试集


#3.线性回归模型：建立13个变量与房价之间的预测模型，并检测模型好坏。
from sklearn.linear_model import LinearRegression
LineR=LinearRegression()    #线性回归
LineR.fit(x_train,y_train)         #对数据进行训练
print(LineR.coef_,LineR.intercept_)     #通过数据训练得出回归方程的斜率和截距

from sklearn.metrics import regression    # 检测模型好坏
y_pred= LineR.predict(x_test)
print("预测的均方误差：", regression.mean_squared_error(y_test,y_pred))    # 计算模型的预测指标
print("预测的平均绝对误差：", regression.mean_absolute_error(y_test,y_pred))
print("模型的分数：",LineR.score(x_test, y_test))    # 输出模型的分数


#4.多项式回归模型：建立13个变量与房价之间的预测模型，并检测模型好坏。
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=2)
x_poly_train=poly.fit_transform(x_train)
LineR=LinearRegression()           #建立多项回归模型
LineR.fit(x_poly_train,y_train)   

x_poly_test=poly.transform(x_test)   #多项回归预测模型
y_pred1=LineR.predict(x_poly_test)

# 检测模型好坏
print("预测的均方误差：", regression.mean_squared_error(y_test,y_pred1))
print("预测的平均绝对误差：", regression.mean_absolute_error(y_test,y_pred1))  # 计算模型的预测指标
print("模型的分数：",LineR.score(x_poly_test, y_test))    # 输出模型的分数


5. 比较线性模型与非线性模型的性能，并说明原因。
线性回归模型和非线性回归模型的区别是：线性就是每个变量的指数都是1，而非线性就是至少有一个变量的指数不是1。

线性回归模型，是利用数理统计中回归分析，来确定两种或两种以上变量间相互依赖的定量关系的一种统计分析方法。
而非线性回归，是在掌握大量观察数据的基础上，利用数理统计方法建立因变量与自变量之间的回归关系函数表达式（称回归方程式）。回归分析中，当研究的因果关系只涉及因变量和一个自变量时，叫做一元回归分析；当研究的因果关系涉及因变量和两个或两个以上自变量时，叫做多元回归分析。

二、中文文本分类
import numpy as np #导入数组numpy库
import os
path='C:\\Users\\Administrator\\Desktop\\data1'

import jieba #导入结巴库
with open(r'C:\Users\Administrator\Desktop\data',encoding='utf-8') as f: #split()通过指定分隔符字符串进行切片
stopwords=f.read().split('\n') #停用词
#stopwords

def processing(tokens):
tokens="".join([char for char in tokens if char.isalpha()])#去掉非字母汉字字符（alpha()判断字符是否为字母）
tokens=[token for token in jieba.cut(tokens,cut_all=True) if len(token)>=2] #结巴分词裁剪
tokens="".join([token for token in tokens if token not in stopwords]) #去掉停用词
return tokens
tokenList=[]
targetList=[]

for root,dirs,files in os.walk(path): # 用os.walk获取需要的变量，并拼接文件路径再打开每一个文件 os.walk(top[,topdown=True[,oneror=None[,followlinks=False]]]) top根目录，root文件本身地址，dirs所有目录名字，files文件夹所有文件
for f in files:
filePath=os.path.join(root,f)
with open(filePath,encoding='utf-8') as f:
classify=f.read()
target=filePath.split('\\')[-2] # 获取类别标签 [-2]返回列表索引，选取倒数第二项
targetList.append(target)
tokenList.append(processing(classify))
classify

from sklearn.feature_extraction.text import TfidfVectorizer #Feature extraction文本特征提取
from sklearn.model_selection import train_test_split #训练数据（training samples）和测试数据（testing samples）
from sklearn.naive_bayes import GaussianNB,MultinomialNB #scikit-learn中，提供了3中朴素贝叶斯分类算法：GaussianNB(高斯朴素贝叶斯)、MultinomialNB(多项式朴素贝叶斯)、BernoulliNB(伯努利朴素贝叶斯)
from sklearn.model_selection import cross_val_score #对数据集进行指定次数的交叉验证并为每次验证效果评测
from sklearn.metrics import classification_report #classification_report函数构建了一个文本报告，用于展示主要的分类metrics
x_train,x_test,y_train,y_test=train_test_split(tokenList,targetList,test_size=0.2,stratify=targetList) #划分训练集和测试集

vectorizer=TfidfVectorizer() #转化为特征向量,并用TfidfVectorizer的方式建立特征向量
x1_train=vectorizer.fit_transform(x_train) #预处理中先训练数据再标准化
x1_test=vectorizer.transform(x_test)
MNB=MultinomialNB() #用多项式朴素贝叶斯MultinomialMB类建模
module=MNB.fit(x1_train,y_train)

y_predict=module.predict(x1_test) #进行预测
scores=cross_val_score(MNB,x1_test,y_test,cv=5) #cross_val_score(knn,x,y,cv,scoring='accuracy') cv指规定将原始数据分成多少份
print("Accuracy:%.3f"%scores.mean()) # 输出模型精确度
print("classification_report:\n",classification_report(y_predict,y_test)) # 输出模型评估报告

import collections #Collections则是集合类的一个工具类帮助类，其中提供了一系列静态方法，用于对集合中元素进行排序、搜索以及线程安全等各种操作。
testCount=collections.Counter(y_test) #counter工具用于统计字符出现的个数,便捷和快速地计数 ,主要的作用是用来统计散列对象。而shape主要是计算矩阵的行和列
predictCount=collections.Counter(y_predict)
print('实际：',testCount,'\n','预测:',predictCount)

nameList=list(testCount.keys()) # 建立标签列表，实际结果列表，预测结果列表，
testList=list(testCount.values())
predictList=list(predictCount.values())
x=list(range(len(nameList)))
print('类别：',nameList,'\n','实际：',testList,'\n','预测：',predictList)

os.walk(top[, topdown=True[, onerror=None[, followlinks=False]]])

posted @ 2018-12-17 18:40 何美玲阅读(253) 评论(0) 编辑收藏举报

何美玲

期末大作业，最后一周上课检查(更新)

公告