Hand-rolling a linear regression model in Python

## Linear regression model

The script below generates synthetic data around y = 1.2x + 0.2, fits w and b with a simple gradient-descent loop, and visualizes the fitted line, the error surface, and the loss curve.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def getData():
    r"""
    使用numpy构造满足条件的随机数()
    :return:
    """

    dataX = np.random.randint(1, 100, size=(1, 100))
    dataY = np.random.randint(1, 100, size=(1, 100))
    data = np.concatenate((dataX, dataY), axis=0)   # shape (2, 100): row 0 = X, row 1 = Y

    # group 1: columns where both X and Y are greater than 50
    G1tfa = data > 50
    G1tfi = np.logical_and(G1tfa[0, :], G1tfa[1, :])
    G1index = np.nonzero(G1tfi)[0]
    G1 = data[:, G1index]

    # group 2: columns where both X and Y are less than 50 (points equal to 50 fall in neither group)
    G2tfa = data < 50
    G2tfi = np.logical_and(G2tfa[0, :], G2tfa[1, :])
    G2index = np.nonzero(G2tfi)[0]
    G2 = data[:, G2index]

    return (G1, G2)

def getData2():
    r"""
    使用numpy生成随机数;
    使用pandas构造满足条件的随机数;
    :return:
    """
    df = pd.DataFrame()
    df['X'] = np.random.randint(1,100,size=(100))
    df['Y'] = np.random.randint(1,100,size=(100))

    G1 = df[(df['X']>50) & (df['Y']>50)]
    G2 = df[(df['X']<50) & (df['Y']<50)]

    return (G1.values.T, G2.values.T)

def getData3():
    r"""
    生成正太分布的随机数;
    :return:
    """
    # data = np.random.uniform(0, 1, size = 1000)#随机均匀采样
    # data3 = np.random.rand(1000) #随机均匀分布
    #
    # data2 = np.random.normal(0, 1, size = 1000)#正太分布,均值为0,方差为1
    # data4 = np.random.randn(1000)#正太分布,均值为0,方差为1

    point1 = (0.5, 0.5)
    point2 = (2.5, 2.5)
    # two Gaussian clouds; the second argument of np.random.normal is the standard deviation
    a = np.random.normal(point1[0], 0.3, size = (1000,1))
    b = np.random.normal(point1[1], 0.2, size = (1000,1))
    c = np.random.normal(point2[0], 0.2, size = (1000,1))
    d = np.random.normal(point2[1], 0.3, size = (1000,1))

    G1 = pd.DataFrame(data = np.concatenate((a,b),axis=1), columns=['X','Y'])
    G2 = pd.DataFrame(data = np.concatenate((c,d),axis=1), columns=['X','Y'])

    return (G1.values.T, G2.values.T)

def getData4():
    r"""
    生成可以用来线性回归的数据
    :return:
    """
    w = 1.2
    b = 0.2
    x = np.linspace(1,100,100)
    noise = np.random.randn(100)*10
    y = w * x + b + noise
    return (x,y)
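
# A quick sanity check (an added helper, not used elsewhere in this script):
# np.polyfit returns the least-squares coefficients directly, so on data from
# getData4 the estimates should land close to the ground truth w = 1.2, b = 0.2.
def checkData4():
    x, y = getData4()
    w_hat, b_hat = np.polyfit(x, y, 1)   # degree-1 least-squares fit: slope, then intercept
    print("polyfit estimate: w = {:.3f}, b = {:.3f}".format(w_hat, b_hat))
    return w_hat, b_hat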

def linefit(x, y):
    r"""
    采用梯度下降的方法更新w和b
    :param x:
    :param y:
    :return:
    """
    # initialize the iteration parameters
    w = 0
    b = 0
    w_last = 0.001
    b_last = 0.001
    loss_last = 6330   # initial guess for the previous-step loss, used by the first finite difference

    stop = 10000
    i = 0
    eta = 0.00001   # learning rate; if it is too large the loop cannot converge and the loss oscillates heavily

    count = 0        # stop iterating after 100 steps without progress
    loss_list = []   # stores the loss curve
    w_list = [0.001, 0]   # stores w after every iteration
    b_list = [0.001, 0]   # stores b after every iteration

    while i < stop:
        loss = getLoss(w, b, x, y)
        # approximate the gradient with the finite difference between the last two iterates
        w_t = w - eta * (loss - loss_last) / (w - w_last)
        b_t = b - eta * (loss - loss_last) / (b - b_last)

        w_list.append(w_t)
        b_list.append(b_t)

        w_last = w
        w = w_t
        b_last = b
        b = b_t

        if abs(loss_last - loss)< 0.01:
            count += 1
        if count >=100:
            print("w is {:.2f}, b is {:.2f}, error is {}, i is {}".format(w,b,loss,i))
            break
        print("w is {:.2f}, b is {:.2f}, loss is {:.2f},i is {}".format(w,b,loss,i))
        loss_list.append(loss)
        loss_last = loss
        i += 1

    # xb = np.mean(x)
    # yb = np.mean(y)

    return w,b,loss_list,w_list,b_list
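
# For comparison, a sketch of the closed-form least-squares solution that the
# commented-out means above (xb, yb) hint at: w = cov(x, y) / var(x), b = mean(y) - w * mean(x).
# linefit_closed_form is an added helper and is not called by the rest of the script.
def linefit_closed_form(x, y):
    xb = np.mean(x)
    yb = np.mean(y)
    w = np.sum((x - xb) * (y - yb)) / np.sum((x - xb) ** 2)
    b = yb - w * xb
    return w, b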


def getLoss(w, b, x, y, MAE = True):

    if MAE:
        # L1: sum of absolute errors (MAE up to the 1/n factor)
        totalLoss = np.sum(abs((w * x + b) - y))

    else:
        # L2: sum of squared errors (MSE up to the 1/n factor)
        totalLoss = np.sum(((w * x + b) - y) ** 2)

    return totalLoss
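
# linefit approximates the gradient with the finite difference (loss - loss_last) / (w - w_last).
# For reference, a sketch of the analytic subgradient of the L1 loss above
# (getGrad is an added helper, not used by linefit):
def getGrad(w, b, x, y):
    residual = (w * x + b) - y
    dw = np.sum(np.sign(residual) * x)   # d/dw of sum |w*x + b - y|
    db = np.sum(np.sign(residual))       # d/db of sum |w*x + b - y|
    return dw, db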

def getErrorSurface(x,y):
    r"""
    获取error surface
    :param x:
    :param y:
    :return:
    """
    minError = 999999
    w_f = 0
    b_f = 0
    size = 200
    r = np.zeros((size,size))
    for i,b in enumerate(np.linspace(-2.1,2.2,size)):
        for j,w in enumerate(np.linspace(-2.1,2.2,size)):
            r[i,j] = getLoss(w,b,x,y)
            if r[i,j] < minError:
                minError = r[i,j]
                w_f = w
                b_f = b

    return r,w_f, b_f


def getErrorSurface2(x,y):
    r"""
    获取error surface (塞给contour函数)
    :param x:
    :param y:
    :return:
    """
    size = 200
    wa = np.linspace(-2.1, 2.2, size)
    ba = np.linspace(-2.1, 2.2, size)
    za = np.zeros((size, size))   # za[i, j] = loss at (w = wa[j], b = ba[i])

    for i, b in enumerate(ba):
        for j, w in enumerate(wa):
            za[i, j] = getLoss(w, b, x, y)
    return wa,ba,za


if __name__ == "__main__":
    print("Linear regression model")
    np.random.seed(10)

    # scatter plot of the data
    fig, ax = plt.subplots()
    # G1, G2 = getData3()
    # ax.scatter(G1[0,:], G1[1,:])
    # ax.scatter(G2[0,:], G2[1,:])
    x, y = getData4()
    w1, b1, lossList, w_list, b_list = linefit(x, y)
    r, w, b = getErrorSurface(x, y)
    w2, b2, z2 = getErrorSurface2(x, y)
    ax.scatter(x, y, color="C0")
    ax.plot(np.r_[0, 100], 1.2 * np.r_[0, 100] + 0.2, color="C1", linewidth=3.0, label='Ground Truth')
    ax.plot(np.r_[0, 100], w * np.r_[0, 100] + b, color="C2", linewidth=3.0, label='fit from error surface')
    ax.plot(np.r_[0, 100], w1 * np.r_[0, 100] + b1, color="C3", linewidth=3.0, label='Gradient descent fit')

    ax.legend()
    ax.set_title("GT and scatter")
    fig.show()

    # plot the error surface
    fig, ax = plt.subplots()
    ax.contourf(w2, b2, z2)
    ax.scatter(np.array(w_list), np.array(b_list))   # trajectory of (w, b) during the iterations
    ax.scatter(1.2, 0.2, color='r')                  # ground-truth parameters
    ax.annotate('', xy=(1.2, 0.2), xycoords='data',
                xytext=(1.2 + 0.4, 0.2 + 0.4), textcoords='data',
                arrowprops=dict(facecolor='white', shrink=0.05),
                horizontalalignment='right', verticalalignment='top',
                )

    # im = ax.imshow(r)
    # cb = plt.colorbar(im,)

    ax.set_xlabel('w')
    # ax.set_xlim(xmin = 0.2, xmax = 2.2)
    ax.set_ylabel('b')
    # ax.set_ylim(ymin = 0, ymax = 0.4)
    ax.set_title("error surface")
    fig.show()

    # plot the loss curve
    fig, ax = plt.subplots()

    ax.plot(lossList)
    ax.set_xlabel('step')
    # ax.set_xlim(xmin = 0.2, xmax = 2.2)

    ax.set_ylabel('loss')
    ax.set_ylim(ymin = 700, ymax = 1000)
    ax.set_title("loss")
    fig.show()

    # histogram examples for the distributions commented out in getData3
    # ax.hist(data, bins=20)    # uniform random sampling
    # ax.hist(data3, bins=20)   # uniform distribution
    # ax.hist(data2, bins=20)   # normal distribution, mean 0, variance 1
    # ax.hist(data4, bins=20)   # normal distribution, mean 0, variance 1
    # fix the X and Y axis ranges
    # ax.set_ylim(ymin = 0, ymax = 130)
    # ax.set_xlim(xmin = -5, xmax = 5)

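    # fig.show() does not block, so when this file is run as a plain script the
    # windows may close immediately; a final plt.show() (added here) keeps them open.
    plt.show()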