Machine Learning 8: Regression Study Notes

 

Derivation of ordinary linear regression and weighted regression

 

1. Ordinary Linear Regression (OLS)

Loss function:

$$J(w) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - w^T x_i\right)^2 = \frac{1}{n}\left\|Y - Xw\right\|^2$$


where $Y$ and $w$ are vectors, $x_i$ is the $i$-th sample, and $X$ is the design matrix. To minimize this loss, expand it and take the partial derivative of $J(w)$ with respect to $w$:

$$J(w) = \frac{1}{n}(Y - Xw)^T (Y - Xw) = \frac{1}{n}\left(Y^T Y - 2\,w^T X^T Y + w^T X^T X w\right)$$

The matrix derivative identities needed are:

$$\frac{\partial (BA)}{\partial A} = B^T$$

$$\frac{\partial (A^T B)}{\partial A} = B$$

$$\frac{\partial (A^T B A)}{\partial A} = 2BA \quad (\text{for symmetric } B)$$


Therefore, differentiating $J(w)$ with respect to $w$ and setting the derivative to zero:

$$\frac{\partial J(w)}{\partial w} = \frac{2}{n}\left(X^T X w - X^T Y\right) = 0$$

When $X^T X$ is invertible, the solution is:

$$w = (X^T X)^{-1} X^T Y$$


Since:

$$Xw = X(X^T X)^{-1} X^T Y = \hat{Y} = \hat{H} Y$$


the hat matrix is:

$$\hat{H} = X(X^T X)^{-1} X^T$$
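As a quick sanity check of the closed-form solution above, here is a minimal NumPy sketch; it is not part of the original code, and the synthetic data and variable names are purely illustrative:

import numpy as np

# Illustrative synthetic data: y = 1 + 2*x plus noise
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=100)
X = np.column_stack([np.ones_like(x), x])          # design matrix with a bias column
y = 1.0 + 2.0 * x + rng.normal(0, 0.5, size=100)

# Closed-form OLS: w = (X^T X)^{-1} X^T Y
w = np.linalg.inv(X.T @ X) @ X.T @ y
print(w)                                           # roughly [1, 2]

# Hat matrix H = X (X^T X)^{-1} X^T maps Y to the fitted values
H = X @ np.linalg.inv(X.T @ X) @ X.T
y_hat = H @ y                                      # identical to X @ w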

 

2. Weighted Regression

Loss function:

$$J(w) = \frac{1}{n}\sum_{i=1}^{n}\alpha_i\left(y_i - w^T x_i\right)^2 = \frac{1}{n}(Y - Xw)^T \alpha\,(Y - Xw)$$


The derivation proceeds in the same way. Expanding the loss:

$$J(w) = \frac{1}{n}\left(Y^T \alpha Y - 2\,w^T X^T \alpha Y + w^T X^T \alpha X w\right)$$

where $\alpha$ is the diagonal matrix of sample weights. Differentiating with respect to $w$ and setting the derivative to zero:

$$\frac{\partial J(w)}{\partial w} = \frac{2}{n}\left(X^T \alpha X w - X^T \alpha Y\right) = 0$$

Therefore:

$$w = (X^T \alpha X)^{-1} X^T \alpha Y$$


The weighted hat matrix is:

$$\hat{H} = X(X^T \alpha X)^{-1} X^T \alpha$$
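A minimal NumPy sketch of this closed form, using an arbitrary, purely illustrative weight vector for $\alpha$; the helper name weighted_ls is mine and does not come from regression.py (the lwlr function further down instead builds $\alpha$ from a Gaussian kernel around each query point):

import numpy as np

def weighted_ls(X, y, weights):
    """Closed-form weighted least squares: w = (X^T a X)^{-1} X^T a y."""
    A = np.diag(weights)                    # alpha: diagonal weight matrix
    XTaX = X.T @ A @ X
    if np.linalg.det(XTaX) == 0.0:          # singular: no unique solution
        raise np.linalg.LinAlgError("X^T alpha X is singular")
    return np.linalg.inv(XTaX) @ X.T @ A @ y

# Illustrative use: down-weight the second half of the samples
rng = np.random.default_rng(1)
x = rng.uniform(0, 10, size=50)
X = np.column_stack([np.ones_like(x), x])
y = 1.0 + 2.0 * x + rng.normal(0, 0.5, size=50)
alpha = np.where(np.arange(50) < 25, 1.0, 0.1)
print(weighted_ls(X, y, alpha))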

Regression from Machine Learning in Action

Python 3.6 code

test8.py

 

# -*- coding: utf-8 -*-

# regression.py sits in the same directory as this script, so a plain
# import works and no sys.path manipulation is needed.

import regression
from numpy import *
import matplotlib.pyplot as plt

# xArr,yArr = regression.loadDataSet('ex0.txt')
#
# print(xArr)
#
# ws = regression.standRegres(xArr, yArr)
# print(ws)
#
#
# xMat = mat(xArr)
# yMat = mat(yArr)
# yHat = xMat*ws
#
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
#
# xCopy = xMat.copy()
# xCopy.sort(0)
# yHat = xCopy*ws
# ax.plot(xCopy[:,1],yHat)
# plt.show()
#
# yHat = xMat*ws
# array = corrcoef(yHat.T, yMat)
# print(array)


# xArr,yArr = regression.loadDataSet('ex0.txt')
# print(yArr[0])
# reg1 = regression.lwlr(xArr[0], xArr, yArr, 1.0)
# print(reg1)
# reg001 = regression.lwlr(xArr[0], xArr, yArr, 0.001)
# print(reg001)
#
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# xMat = mat(xArr)
# srtInd = xMat[:,1].argsort(0)
# xSort = xMat[srtInd][:,0,:]
#
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(xSort[:,1],yHat[srtInd])
# ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
# plt.show()
#
#
# abX, abY = regression.loadDataSet('abalone.txt')
# yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
# yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
# yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
#
# error01 = regression.rssError(abY[0:99], yHat01.T)
# print(error01)
# error1 = regression.rssError(abY[0:99], yHat1.T)
# print(error1)
# error10 = regression.rssError(abY[0:99], yHat10.T)
# print(error10)
#
# yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
# error01 = regression.rssError(abY[100:199], yHat01.T)
# print(error01)
#
# yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
# error1 = regression.rssError(abY[100:199], yHat1.T)
# print(error1)
#
# yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
# error10 = regression.rssError(abY[100:199], yHat10.T)
# print(error10)
#
# ws = regression.standRegres(abX[0:99], abY[0:99])
# yHat = mat(abX[100:199])*ws
# error = regression.rssError(abY[100:199], yHat.T.A)
# print(error)

# abX, abY = regression.loadDataSet("abalone.txt")
# ridgeWeights = regression.ridgeTest(abX, abY)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(ridgeWeights)
# plt.show()
# print(ridgeWeights)
# print("ridgeWeights over")

xArr, yArr = regression.loadDataSet('abalone.txt')
weights = regression.stageWise(xArr, yArr, 0.005, 1000)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(weights)
plt.show()


# print("最小二乘法")
# xMat = mat(xArr)
# yMat = mat(yArr).T
# xMat = regression.regularize(xMat)
# yM = mean(yMat, 0)
# yMat = yMat - yM
# weights = regression.standRegres(xMat, yMat.T)
# print(weights.T)



# lgX = []; lgY = []
# regression.setDataCollect(lgX, lgY)
# print(lgX)
# print("__________")
# print(lgY)


print("over!")

regression.py

'''
Created on Jan 8, 2011

@author: Peter
'''
from numpy import *

def loadDataSet(fileName):      #general function to parse tab -delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields 
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr =[]
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat

def standRegres(xArr,yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T*xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws

def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):                      #next 2 lines create weights matrix
        diffMat = testPoint - xMat[j,:]     #
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr,xArr,yArr,k=1.0):  #loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    return yHat

def lwlrTestPlot(xArr,yArr,k=1.0):  #same thing as lwlrTest except it sorts X first
    yHat = zeros(shape(yArr))       #easier for plotting
    xCopy = mat(xArr)
    xCopy.sort(0)
    for i in range(shape(xArr)[0]):
        yHat[i] = lwlr(xCopy[i],xArr,yArr,k)
    return yHat,xCopy

def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
    return ((yArr-yHatArr)**2).sum()

def ridgeRegres(xMat,yMat,lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T*yMat)
    return ws
    
def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #to eliminate X0 take mean off of Y
    #regularize X's
    xMeans = mean(xMat,0)   #calc mean then subtract it off
    xVar = var(xMat,0)      #calc variance of Xi then divide by it
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat

def regularize(xMat):#regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat,0)   #calc mean then subtract it off
    inVar = var(inMat,0)      #calc variance of Xi then divide by it
    inMat = (inMat - inMeans)/inVar
    return inMat

def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n=shape(xMat)
    returnMat = zeros((numIt,n)) #testing code remove
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf; 
        for j in range(n):
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                rssE = rssError(yMat.A,yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat

#def scrapePage(inFile,outFile,yr,numPce,origPrc):
#    from BeautifulSoup import BeautifulSoup
#    fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
#    soup = BeautifulSoup(fr.read())
#    i=1
#    currentRow = soup.findAll('table', r="%d" % i)
#    while(len(currentRow)!=0):
#        title = currentRow[0].findAll('a')[1].text
#        lwrTitle = title.lower()
#        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
#            newFlag = 1.0
#        else:
#            newFlag = 0.0
#        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
#        if len(soldUnicde)==0:
#            print("item #%d did not sell" % i)
#        else:
#            soldPrice = currentRow[0].findAll('td')[4]
#            priceStr = soldPrice.text
#            priceStr = priceStr.replace('$','') #strips out $
#            priceStr = priceStr.replace(',','') #strips out ,
#            if len(soldPrice)>1:
#                priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
#            print("%s\t%d\t%s" % (priceStr,newFlag,title))
#            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
#        i += 1
#        currentRow = soup.findAll('table', r="%d" % i)
#    fw.close()
    
from time import sleep
import json
import urllib.request
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10)
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib.request.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if  sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print('problem with item %d' % i)
    
def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
    
def crossValidation(xArr,yArr,numVal=10):
    m = len(yArr)                           
    indexList = list(range(m))          # must be a list so random.shuffle works in Python 3
    errorMat = zeros((numVal,30))#create error mat 30columns numVal rows
    for i in range(numVal):
        trainX=[]; trainY=[]
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):#create training set based on first 90% of values in indexList
            if j < m*0.9: 
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)    #get 30 weight vectors from ridge
        for k in range(30):#loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX=mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)#test ridge results and store
            errorMat[i,k]=rssError(yEst.T.A,array(testY))
            #print errorMat[i,k]
    meanErrors = mean(errorMat,0)#calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]
    #can unregularize to get model
    #when we regularized we wrote Xreg = (x-meanX)/var(x)
    #we can now write in terms of x not Xreg:  x*w/var(x) - meanX/var(x) +meanY
    xMat = mat(xArr); yMat=mat(yArr).T
    meanX = mean(xMat,0); varX = var(xMat,0)
    unReg = bestWeights/varX
    print("the best model from Ridge Regression is:\n",unReg)
    print("with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat))

 

 

  

 

 
