# 实验3-逻辑回归(对率) (Experiment 3: logistic / log-odds regression)
import numpy as np
import matplotlib.pyplot as plt
# Load the training data: comma-separated, first (header) row skipped,
# every field parsed as an integer.
train = np.loadtxt('data.csv', delimiter=',', dtype='int', skiprows=1)
# Column 0 is used as the input, column 1 as the target.
train_x, train_y = train[:, 0], train[:, 1]
# Mean and standard deviation of the inputs, used for standardization.
mu, sigma = train_x.mean(), train_x.std()
def standardize(x):
    """Return *x* shifted by the global ``mu`` and scaled by the global ``sigma``."""
    centered = x - mu
    return centered / sigma
# Standardized copy of the training inputs.
train_z = standardize(train_x)
# Parameter initialization: three coefficients drawn by np.random.rand,
# one per column produced by to_matrix (bias, x, x squared).
theta = np.random.rand(3)
# Build the design matrix for the training data
def to_matrix(x):
    """Return a matrix whose columns are ``1``, ``x`` and ``x ** 2``.

    One row is produced per element of the 1-D array *x*.
    """
    bias = np.ones(x.size)
    stacked_rows = np.vstack((bias, x, np.square(x)))
    return np.transpose(stacked_rows)
# Design matrix built from the standardized training inputs.
X = to_matrix(train_z)
# Prediction function
def f(x):
    """Return the dot product of *x* with the global parameter vector ``theta``."""
    weights = theta
    return np.dot(x, weights)
# Mean squared error
def MSE(x, y):
    """Return the mean of the squared differences between *y* and ``f(x)``.

    The mean is taken over the rows of *x* (``x.shape[0]``).
    """
    residuals = y - f(x)
    return (1 / x.shape[0]) * np.sum(residuals ** 2)
# Learning rate
ETA = 1e-3
# Decrease of the error between two consecutive epochs
diff = 1
# Number of completed epochs
count = 0
# Keep training while one epoch still lowers the error by more than 1e-2
error = MSE(X, train_y)
while diff > 1e-2:
    # One epoch of stochastic gradient descent over a fresh random ordering
    p = np.random.permutation(X.shape[0])
    # NOTE(review): prints the whole permutation every epoch; looks like
    # leftover debug output - confirm whether it is still wanted.
    print(p)
    for idx in p:
        x, y = X[idx], train_y[idx]
        step = ETA * (f(x) - y)
        theta = theta - step * x
    # Difference from the previous epoch's error
    current_error = MSE(X, train_y)
    diff = error - current_error
    error = current_error
    # Progress log
    count = count + 1
    log = '第 {} 次 : theta = {}, 差值 = {:.4f}'
    print(log.format(count, theta, diff))
# Plot the training points and the fitted curve to check the result
x = np.linspace(-3, 3, 100)
plt.plot(train_z, train_y, 'o')
plt.plot(x, f(to_matrix(x)))
plt.show()
# 信息增益下实现决策树 (Decision tree built with information gain)
from cProfile import label
from math import log
from re import A
import numpy as np
import operator
import csv
def loaddata():
    """Return the built-in sample data set and its feature names.

    Returns:
        A ``(dataSet, feature_name)`` pair. ``dataSet`` is a list of rows,
        each holding six integer feature values followed by the class
        label ``'yes'`` or ``'no'``; ``feature_name`` lists the six
        feature names in column order.
    """
    positive = [
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [2, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 1],
        [1, 1, 0, 1, 1, 1],
        [1, 1, 0, 0, 1, 0],
    ]
    negative = [
        [1, 1, 1, 1, 1, 0],
        [0, 2, 2, 0, 2, 1],
        [2, 2, 2, 2, 2, 0],
        [2, 0, 0, 2, 2, 1],
        [0, 1, 0, 1, 0, 0],
        [2, 1, 1, 1, 0, 0],
        [1, 1, 0, 0, 1, 1],
        [2, 0, 0, 2, 2, 0],
        [0, 0, 1, 1, 1, 0],
    ]
    samples = [row + ['yes'] for row in positive]
    samples += [row + ['no'] for row in negative]
    names = ['a%d' % k for k in range(1, 7)]
    return samples, names
def entropy(dataSet):
    """Return the base-2 entropy of the class labels in *dataSet*.

    The label of each row is its last element. An empty data set
    yields ``0.0``.
    """
    total = len(dataSet)
    # Number of rows seen for each class label
    tally = {}
    for row in dataSet:
        tag = row[-1]
        tally[tag] = tally.get(tag, 0) + 1
    result = 0.0
    for occurrences in tally.values():
        share = occurrences / total
        # Base-2 logarithm obtained by change of base from the natural log
        result -= share * (np.log(share) / np.log(2))
    return result
def splitDataSet(dataSet, axis, value):
    """Return the rows of *dataSet* whose element at index *axis* equals *value*.

    Rows are returned as-is (same objects, original order); the column at
    *axis* is not removed.
    """
    return [row for row in dataSet if row[axis] == value]
def chooseBestFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    Every column except the last (the class label) is treated as a
    feature. Returns ``-1`` when no feature has a gain strictly greater
    than zero.
    """
    feature_total = len(dataSet[0]) - 1
    sample_total = len(dataSet)
    # Entropy of the whole data set
    baseEntropy = entropy(dataSet)
    top_gain, top_index = 0.0, -1
    for col in range(feature_total):
        # Conditional entropy: entropy of each partition weighted by its share
        conditional = 0.0
        for value in set(row[col] for row in dataSet):
            part = splitDataSet(dataSet, col, value)
            conditional += len(part) / sample_total * entropy(part)
        gain = baseEntropy - conditional
        # Keep the first feature that reaches the largest gain seen so far
        if gain > top_gain:
            top_gain, top_index = gain, col
    return top_index
def classVote(classList):
    """Return the most frequent label in *classList*.

    On a tie, the label that appeared first in *classList* wins.
    """
    # Occurrences per label, in order of first appearance
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # Stable descending sort keeps first-seen order among equal counts
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    winner, _ = ranked[0]
    return winner
def trainTree(dataSet, feature_name):
    """Recursively build a decision tree from *dataSet*.

    Args:
        dataSet: list of rows; the last element of each row is the class
            label, the preceding elements are feature values.
        feature_name: feature names, indexed like the feature columns.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # All samples share one class: this node is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to the majority class.
    if len(dataSet[0]) == 1:
        return classVote(classList)
    # Pick the feature with the largest information gain.
    bestFeat = chooseBestFeature(dataSet)
    # chooseBestFeature returns -1 when no feature separates the classes.
    # Index -1 would address the label column itself, so stop here and
    # return the majority class instead of splitting on the label.
    if bestFeat < 0:
        return classVote(classList)
    bestFeatName = feature_name[bestFeat]
    myTree = {bestFeatName: {}}
    # One branch per distinct value of the chosen feature.
    for value in {example[bestFeat] for example in dataSet}:
        # Columns are not removed from the subset, so the name list is
        # passed on unchanged (as a copy).
        sub_dataset = [
            example for example in dataSet if example[bestFeat] == value
        ]
        myTree[bestFeatName][value] = trainTree(sub_dataset, feature_name[:])
    return myTree
# Build a decision tree from the built-in sample data and print it.
myDat,feature_name = loaddata()
myTree = trainTree(myDat,feature_name)
print(myTree)
# 实验五- BP神经网络 (Experiment 5: BP neural network)
from tokenize import Double
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sympy import E1
# Fixed seed applied to both random number generators below.
seed = 2020
import random
np.random.seed(seed) # Numpy module.
random.seed(seed) # Python random module.
plt.rcParams['font.sans-serif'] = ['SimHei'] # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly
plt.close('all')
# (1) Data preprocessing
def preprocess(data):
    """Encode and standardize *data* and split it into features and target.

    Columns of dtype ``object`` are label-encoded to integers, then every
    column except ``'好瓜'`` is standardized with ``StandardScaler``.

    Args:
        data: DataFrame that contains the target column ``'好瓜'``.
            It is not modified; the work is done on a copy.

    Returns:
        ``(x, y)``: ``x`` is the standardized feature array and ``y`` is
        the encoded target reshaped to a single column.
    """
    # Work on a copy so the caller's DataFrame keeps its original values.
    data = data.copy()
    # Map non-numeric (object) columns to integer codes.
    for title in data.columns:
        if data[title].dtype == 'object':
            encoder = LabelEncoder()
            data[title] = encoder.fit_transform(data[title])
    # Remove the mean and scale the features to unit variance.
    ss = StandardScaler()
    X = ss.fit_transform(data.drop('好瓜', axis=1))
    Y = data['好瓜']
    x, y = np.array(X), np.array(Y).reshape(Y.shape[0], 1)
    return x, y
# Sigmoid activation
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*, element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# Derivative of the sigmoid
def d_sigmoid(x):
    """Return ``x * (1 - x)``.

    This equals the sigmoid's derivative when *x* is already a sigmoid
    output, i.e. ``x = sigmoid(z)``.
    """
    complement = 1 - x
    return x * complement
# (2) Standard BP algorithm (parameters updated after every sample)
def standard_BP(x, y, dim=10, eta=0.8, max_iter=500):
    """Train a network with one sigmoid hidden layer, one sample at a time.

    The loss of every sample is printed, and after training the mean
    loss per iteration is plotted and shown.

    Args:
        x: 2-D array of inputs, one sample per row.
        y: 2-D array of targets, one row per sample. The output layer has
            a single unit, so one target column is assumed.
        dim: number of hidden units.
        eta: learning rate.
        max_iter: number of passes over all samples.

    Returns:
        ``(w1, w2, b1, b2)``: input-to-hidden weights of shape
        ``(x.shape[1], dim)``, hidden-to-output weights ``(dim, 1)``,
        hidden bias ``(1, dim)`` and output bias ``(1, 1)``. After the
        first update these are ``np.matrix`` objects.
    """
    n_samples = 1  # biases are stored as single-row arrays
    w1 = np.random.random((x.shape[1], dim))
    w2 = np.random.random((dim, 1))
    b1 = np.random.random((n_samples, dim))
    b2 = np.random.random((n_samples, 1))
    losslist = []
    for ite in range(max_iter):
        loss_per_ite = []
        for m in range(x.shape[0]):
            # Current sample and label as 1-row matrices. With np.matrix
            # the ``*`` operator below is a matrix product.
            xi, yi = x[m, :], y[m, :]
            xi = np.matrix(xi.reshape(1, xi.shape[0]))
            yi = np.matrix(yi.reshape(1, yi.shape[0]))
            # Forward pass
            out1 = np.dot(xi, w1) + b1
            bh = sigmoid(out1)  # hidden-layer output
            out2 = sigmoid(np.dot(bh, w2) + b2)
            loss = (np.square(yi - out2) / 2)[0, 0]
            loss_per_ite.append(loss)
            print("iter:%d loss:%.4f" % (ite, loss))
            # Backward pass: output-layer gradient term
            g = out2 * (1 - out2) * (yi - out2)
            # Hidden-layer gradient term, computed with w2 before its update
            e = np.multiply(bh, (1 - bh))
            e = np.multiply(e, w2.T).T * g
            # Parameter updates
            w2 = w2 + eta * bh.T * g
            b2 = b2 + eta * g
            w1 = w1 + eta * np.dot(e, xi).T
            b1 = b1 + eta * e.T
        losslist.append(np.mean(loss_per_ite))
    # Loss curve. The axis is sized from the recorded losses so that it
    # matches any max_iter, not only the default of 500.
    plt.figure()
    plt.plot(np.arange(len(losslist)), losslist)
    plt.show()
    return w1, w2, b1, b2
# (3) Test: train on the watermelon data and export the predictions
data = pd.read_table('watermelon30.txt', delimiter=',')
data.drop(columns='编号', inplace=True)
x, y = preprocess(data)
dim = 10
w1, w2, b1, b2 = standard_BP(x, y, dim)
# Predict the class of every sample in x with the trained parameters
u1 = np.dot(x, w1) + b1
out1 = sigmoid(u1)
u2 = np.dot(out1, w2) + b2
out2 = sigmoid(u2)
y_pred = np.round(out2)
# True labels and predictions side by side, written to an Excel file
paired = np.hstack((y, y_pred))
result = pd.DataFrame(paired, columns=['真值', '预测'])
result.to_excel('result.xlsx', index=False)