LDA and Logistic Regression Classification in Python 3

I. LDA linear discriminant classification

In short, the procedure is:

1. Compute the mean of each class; u0 and u1 are vectors.
2. Compute the within-class scatter matrix Sw: the sum of (x - u0)(x - u0)^T over the samples of one class plus the sum of (x - u1)(x - u1)^T over the samples of the other.
3. Compute w = (u0 - u1) * Sw^-1, which appears in the code as w = (u0 - u1) * (mat(sw).I).

The example uses watermelon dataset 3.0a (watermelon_3a.csv) from the book Machine Learning. A compact vectorized sketch of the three steps comes first, then the full script.
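Here is that sketch (a minimal version, assuming X0 and X1 are NumPy arrays holding the samples of the two classes, one row per sample; because Sw is symmetric, this column vector is simply the transpose of the row-vector form (u0 - u1) * Sw^-1 used in the script):

import numpy as np

def lda_direction(X0, X1):
    """LDA projection direction for two classes; rows of X0/X1 are samples."""
    u0 = X0.mean(axis=0)                                     # step 1: class means
    u1 = X1.mean(axis=0)
    Sw = (X0 - u0).T @ (X0 - u0) + (X1 - u1).T @ (X1 - u1)   # step 2: within-class scatter
    return np.linalg.solve(Sw, u0 - u1)                      # step 3: w = Sw^-1 (u0 - u1)

The full script: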
from numpy import *
import numpy as np
import matplotlib.pyplot as plt
# the pandas module lets us manipulate the rows and columns of tabular data
import pandas as pd

# a .txt file in the same format could also be read with read_csv
data = pd.read_csv("watermelon_3a.csv")

def calculate_w():
    df1 = data[data.label == 1]
    df2 = data[data.label == 0]
    # .values pulls out the feature columns (density, ratio_sugar) for each class
    X0 = df1.values[:, 1:3]
    X1 = df2.values[:, 1:3]
    # compute the mean of each class; u0 and u1 are vectors
    u0 = array([mean(X0[:, 0]), mean(X0[:, 1])])
    u1 = array([mean(X1[:, 0]), mean(X1[:, 1])])
    # shape[0] is the length of the first dimension, i.e. the number of rows (samples)
    m1 = shape(X1)[0]
    m0 = shape(X0)[0]
    # accumulate the within-class scatter matrix Sw;
    # mat() converts an array to a matrix so that * means matrix multiplication
    sw = zeros(shape=(2, 2))
    for i in range(m1):
        xsmean = mat(X1[i, :] - u1)
        sw += xsmean.transpose() * xsmean
    for i in range(m0):
        xsmean = mat(X0[i, :] - u0)
        sw += xsmean.transpose() * xsmean
    w = (u0 - u1) * (mat(sw).I)
    return w

def plot(w):
    dataMat = array(data[['density', 'ratio_sugar']].values[:, :])
    labelMat = mat(data['label'].values[:]).transpose()
    print(labelMat)
    m = shape(dataMat)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(m):
        if labelMat[i] == 1:
            xcord1.append(dataMat[i, 0])
            ycord1.append(dataMat[i, 1])
        else:
            xcord2.append(dataMat[i, 0])
            ycord2.append(dataMat[i, 1])
    plt.figure(1)
    # one row, one column, first subplot
    ax = plt.subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-0.2, 0.8, 0.1)
    # w = [w0, w1]; the line through the origin with slope -w0/w1, i.e. y = -(w0/w1) * x
    y = array((-w[0, 0] * x) / w[0, 1])
    print(shape(x))
    print(shape(y))
    plt.sca(ax)
    plt.plot(x, y)
    plt.xlabel('density')
    plt.ylabel('ratio_sugar')
    plt.title('LDA')
    plt.show()

w = calculate_w()
print(w)
plot(w)

Note: the solved w = [w0, w1] is perpendicular to the line we draw, so the line's slope is -w0/w1. The resulting plot is shown below.
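As a quick cross-check, the same direction can be obtained from scikit-learn's LDA (a hedged sketch, assuming scikit-learn is installed; only the direction of coef_ is comparable, its sign and scale may differ):

import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

data = pd.read_csv("watermelon_3a.csv")
X = data[['density', 'ratio_sugar']].values
y = data['label'].values

lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
print(lda.coef_)   # should be parallel (up to sign and scale) to the w computed above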

II. Logistic regression

1. Log-odds (logit) regression

Start from the linear model f(xi) = w * xi + b, fitted so that f(xi) ≈ yi.

When the input has only a single attribute, w and b are scalars and f(x) = w * x + b.

When a sample is described by d attributes, the model takes the vector form f(x) = w^T x + b.
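The logit model turns this linear score into a probability by passing it through the sigmoid, y = 1 / (1 + e^-(w^T x + b)), and predicts class 1 when that probability is at least 0.5. A minimal sketch (the values of w, b and x below are made up purely for illustration):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

w = np.array([1.2, 8.9])        # hypothetical weights for (density, ratio_sugar)
b = -2.5                        # hypothetical bias
x = np.array([0.60, 0.40])      # one hypothetical sample
p = sigmoid(np.dot(w, x) + b)   # P(label = 1 | x)
print(p, int(p >= 0.5))         # the probability and the 0/1 prediction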

  

This program uses both (batch) gradient ascent and stochastic gradient ascent.

The main differences are:

1. How the step size alpha behaves: the batch version keeps it fixed, while the stochastic version lets it decay as the updates proceed.

2. Batch gradient ascent computes over all the samples for each single iteration and update; stochastic gradient ascent iterates over the samples, computing one sample per update (see the sketch right after this list).
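The contrast between the two update rules, as a hedged sketch (X is the m x (d+1) design matrix, y the 0/1 label vector, w the weight vector; the full scripts below additionally let alpha decay in the stochastic case):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def batch_step(w, X, y, alpha):
    # one batch iteration: all m samples contribute to a single update of w
    return w + alpha * X.T @ (y - sigmoid(X @ w))

def stochastic_pass(w, X, y, alpha):
    # one pass over the data: m updates, each driven by one randomly chosen sample
    for i in np.random.permutation(len(y)):
        w = w + alpha * (y[i] - sigmoid(X[i] @ w)) * X[i]
    return w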

(1) Batch gradient ascent

step 1: Load the data; X is an m x (d+1) matrix and the weight vector w is initialized with shape (d+1) x 1.

step 2: Compute the predicted output h = sigmoid(X * w).

step 3: Compute the error: error = label - h.

step 4: Update the weights: w = w + alpha * X^T * error.

step 5: Draw the result with plot(w).

Note the plotted line uses w = [w0, w1, w2]: the first two components still give the slope -w0/w1. My feeling was that w2 is simply the earlier bias b, yet in practice the intercept is -w2/w1; at first I did not see why, so if you can explain it, please let me know, thanks. The small check below shows one way to look at it.
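One way to see why -w2/w1 appears: the plotted line is where the model is exactly undecided, i.e. sigmoid(w0*x + w1*y + w2) = 0.5, which means w0*x + w1*y + w2 = 0. Solving for y divides both the slope term and the constant term by w1, giving slope -w0/w1 and intercept -w2/w1; w2 still plays the role of b, it just gets rescaled when the boundary is written as y in terms of x. A small check with an illustrative w:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

w = np.array([1.2, 8.9, -2.5])              # hypothetical [w0, w1, w2]
x = 0.5                                     # any density value
y = (-w[2] - w[0] * x) / w[1]               # the matching point on the plotted line
print(sigmoid(np.array([x, y, 1.0]) @ w))   # prints 0.5: the line is the decision boundary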


from numpy import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Classify the watermelon data with logit regression, solving for w and b with
# gradient ascent and stochastic gradient ascent.
# Minimizing a loss goes with gradient descent; maximizing the log-likelihood
# goes with gradient ascent.

# read the csv data
data = pd.read_csv('watermelon_3a.csv')
m, n = shape(data)
data['norm'] = ones((m, 1))
print(data)
# step 1: build X as an m x (d+1) = 17 x 3 matrix; the third column is simply 1
# (no values[:, :, :] needed); dataLabel holds the true labels {0, 1}
dataMat = array(data[['density', 'ratio_sugar', 'norm']].values[:, :])
dataLabel = mat(data[['label']].values[:]).transpose()

def sigmoid(x):
    return 1 / (1 + exp(-x))

def GradAscend():
    itermax = 300
    n = shape(dataMat)[1]
    a = 0.1                      # fixed step size
    w = array(ones((n, 1)))      # initialize the weights to all ones
    for i in range(itermax):
        u = dot(dataMat, w)
        h = sigmoid(u)                        # predicted probabilities
        erro = dataLabel.transpose() - h      # error = label - h
        # dataMat.transpose() is (3, 17); erro is a matrix, so * is matrix multiplication
        w = w + a * (dataMat.transpose() * erro)
    return w

def plot(w):
    dataMat1 = array(data[['density', 'ratio_sugar']].values[:, :])
    dataLabel1 = mat(data[['label']].values[:])
    xcard1 = []
    ycard1 = []
    xcard2 = []
    ycard2 = []
    m = shape(dataMat1)[0]
    for i in range(m):
        if dataLabel1[i] == 1:
            xcard1.append(dataMat1[i, 0])
            ycard1.append(dataMat1[i, 1])
        else:
            xcard2.append(dataMat1[i, 0])
            ycard2.append(dataMat1[i, 1])
    plt.figure(1)
    ax = plt.subplot(111)
    ax.scatter(xcard1, ycard1, s=30, c='red', marker='s')
    ax.scatter(xcard2, ycard2, s=30, c='green')
    x = arange(0.2, 0.8, 0.1)
    W1 = mat(w).transpose()
    # decision boundary: w0*x + w1*y + w2 = 0  =>  y = (-w2 - w0*x) / w1
    y = array((-W1[0, 2] - W1[0, 0] * x) / W1[0, 1])
    print(shape(x))
    print(shape(y))
    plt.sca(ax)
    plt.plot(x, y)
    plt.xlabel('density')
    plt.ylabel('ratio_sugar')
    plt.title('gradAscent logistic regression')
    plt.show()

w = GradAscend()
print(w)
plot(w)
 

Result:

w =

[[ 1.20808298]
 [ 8.92638039]
 [-2.52407404]]

The resulting classification plot:
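For comparison, scikit-learn's logistic regression can be fit on the same two features (a hedged sketch; note that LogisticRegression applies L2 regularization by default, so its coefficients will not match the gradient-ascent w exactly, only the rough position of the boundary is comparable):

import pandas as pd
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("watermelon_3a.csv")
X = data[['density', 'ratio_sugar']].values
y = data['label'].values

clf = LogisticRegression()
clf.fit(X, y)
print(clf.coef_, clf.intercept_)   # compare with [w0, w1] and w2 above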

(2) Stochastic gradient ascent

from numpy import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Classify the watermelon data with logit regression, this time solving for w and b
# with stochastic gradient ascent (maximizing the log-likelihood).

# read the csv data
data = pd.read_csv('watermelon_3a.csv')
m, n = shape(data)
data['norm'] = ones((m, 1))
print(data)
# step 1: build X as an m x (d+1) = 17 x 3 matrix; the third column is simply 1
# dataLabel holds the true labels {0, 1}
dataMat = array(data[['density', 'ratio_sugar', 'norm']].values[:, :])
dataLabel = mat(data[['label']].values[:]).transpose()


def sigmoid(x):
    return 1 / (1 + exp(-x))


def randomgradAscent(dataMat, dataLabel):
    m, n = shape(dataMat)
    numIter = 50
    w = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # the step size decays as the updates proceed
            alpha = 4.0 / (1.0 + j + i) + 0.2
            # pick one remaining sample at random and update with it alone
            randIndex_Index = int(random.uniform(0, len(dataIndex)))
            randIndex = dataIndex[randIndex_Index]
            h = sigmoid(sum(dot(dataMat[randIndex], w)))
            error = dataLabel[0, randIndex] - h
            w = w + alpha * error * dataMat[randIndex].transpose()
            del dataIndex[randIndex_Index]
    return w


def plot(w):
    dataMat1 = array(data[['density', 'ratio_sugar']].values[:, :])
    dataLabel1 = mat(data[['label']].values[:])
    xcard1 = []
    ycard1 = []
    xcard2 = []
    ycard2 = []
    m = shape(dataMat1)[0]
    for i in range(m):
        if (dataLabel1[i] == 1):
            xcard1.append(dataMat1[i, 0])
            ycard1.append(dataMat1[i, 1])
        else:
            xcard2.append(dataMat1[i, 0])
            ycard2.append(dataMat1[i, 1])
    plt.figure(1)
    ax = plt.subplot(111)
    ax.scatter(xcard1, ycard1, s=30, c='red', marker='s')
    ax.scatter(xcard2, ycard2, s=30, c='green')
    x = arange(0.2, 0.8, 0.1)
    # decision boundary: w0*x + w1*y + w2 = 0  =>  y = (-w2 - w0*x) / w1
    y = array((-w[2] - w[0] * x) / w[1])
    print(shape(x))
    print(shape(y))
    plt.sca(ax)
    plt.plot(x, y)
    plt.xlabel('density')
    plt.ylabel('ratio_sugar')
    plt.title('randomgradAscent logistic regression')
    plt.show()
w = randomgradAscent(dataMat, dataLabel)
print(w)
plot(w)

Result:

w = [ 1.59035763  7.07872188 -2.18857215]
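With the learned w (from either variant), a quick training-accuracy check can be appended to the script, a minimal sketch that reuses dataMat, dataLabel and sigmoid as defined above:

# predicted probability for every training sample, thresholded at 0.5
h = sigmoid(dot(dataMat, array(w).reshape(-1, 1)))
pred = (h >= 0.5).astype(int)
print('training accuracy:', mean(pred.flatten() == array(dataLabel).flatten()))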

 

         

 



