Machine Learning 8 — Regression Study Notes
Derivations for linear regression and weighted regression
1. Ordinary Linear Regression (OLS)
Loss function:
$$J(w) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - w^T x_i\right)^2 = \frac{1}{n}\left\|Y - Xw\right\|^2$$
where $Y$, $w$, and the $x_i$ are vectors and $X$ is a matrix. The loss is minimized by taking the partial derivative of $J(w)$ with respect to $w$.
The matrix calculus identities needed are:
$$\frac{\partial (BA)}{\partial A} = B^T$$
$$\frac{\partial (A^T B)}{\partial A} = B$$
$$\frac{\partial (A^T B A)}{\partial A} = 2BA \quad (\text{for symmetric } B)$$
Expanding $J(w) = \frac{1}{n}\left(Y^T Y - 2Y^T Xw + w^T X^T X w\right)$ and applying these identities, differentiating $J(w)$ with respect to $w$ gives:
$$\frac{\partial J(w)}{\partial w} = \frac{2}{n}\left(X^T X w - X^T Y\right) = 0$$
When $X^T X$ is invertible, this has the solution:
$$w = (X^T X)^{-1} X^T Y$$
Because:
$$Xw = X(X^T X)^{-1} X^T Y = \hat{Y} = \hat{H} Y$$
the hat matrix is:
$$\hat{H} = X(X^T X)^{-1} X^T$$
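To make the closed form concrete, here is a minimal NumPy sketch of the solution and the hat matrix; the synthetic data and the true_w coefficients are made up purely for illustration:

import numpy as np

# Synthetic data: 100 samples, 3 features (illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
true_w = np.array([1.5, -2.0, 0.5])
Y = X @ true_w + rng.normal(scale=0.1, size=100)

# Closed-form OLS solution: w = (X^T X)^{-1} X^T Y
w = np.linalg.inv(X.T @ X) @ X.T @ Y

# Hat matrix maps Y to the fitted values Y_hat
H = X @ np.linalg.inv(X.T @ X) @ X.T
Y_hat = H @ Y

print(w)                          # close to true_w
print(np.allclose(Y_hat, X @ w))  # True: Xw equals HY

In practice np.linalg.solve(X.T @ X, X.T @ Y) or np.linalg.lstsq is preferred over an explicit inverse for numerical stability.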
2. Weighted Regression
Loss function:
$$J(w) = \frac{1}{n}\sum_{i=1}^{n}\alpha_i\left(y_i - w^T x_i\right)^2 = \frac{1}{n}(Y - Xw)^T \alpha (Y - Xw)$$
The derivation proceeds the same way.
Here $\alpha$ is the diagonal matrix of sample weights. Differentiating with respect to $w$ gives:
$$\frac{\partial J(w)}{\partial w} = \frac{2}{n}\left(X^T \alpha X w - X^T \alpha Y\right) = 0$$
which yields:
$$w = (X^T \alpha X)^{-1} X^T \alpha Y$$
The weighted hat matrix is:
$$\hat{H} = X(X^T \alpha X)^{-1} X^T \alpha$$
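A minimal sketch of the weighted closed form; the Gaussian kernel weights mirror what locally weighted linear regression (LWLR) builds around each query point, and the query point and bandwidth k here are made up for illustration:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
Y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

# Gaussian kernel weights around one query point (LWLR-style); k is the bandwidth
query = X[0]
k = 1.0
d2 = ((X - query) ** 2).sum(axis=1)          # squared distances to the query
alpha = np.diag(np.exp(-d2 / (2.0 * k**2)))  # diagonal weight matrix

# Weighted closed form: w = (X^T alpha X)^{-1} X^T alpha Y
w = np.linalg.inv(X.T @ alpha @ X) @ X.T @ alpha @ Y
print(query @ w)  # local prediction at the query point

Shrinking k concentrates the weights on the neighbors of the query point, which is exactly the knob the lwlr code below exposes.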
Regression, from Machine Learning in Action
Python 3.6 code
test8.py
# -*- coding: utf-8 -*-
import regression  # regression.py must sit in the same directory
from numpy import *
import matplotlib.pyplot as plt

# xArr, yArr = regression.loadDataSet('ex0.txt')
# print(xArr)
#
# ws = regression.standRegres(xArr, yArr)
# print(ws)
#
# xMat = mat(xArr)
# yMat = mat(yArr)
# yHat = xMat * ws
#
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
#
# xCopy = xMat.copy()
# xCopy.sort(0)
# yHat = xCopy * ws
# ax.plot(xCopy[:, 1], yHat)
# plt.show()
#
# yHat = xMat * ws
# array = corrcoef(yHat.T, yMat)
# print(array)

# xArr, yArr = regression.loadDataSet('ex0.txt')
# print(yArr[0])
# reg1 = regression.lwlr(xArr[0], xArr, yArr, 1.0)
# print(reg1)
# reg001 = regression.lwlr(xArr[0], xArr, yArr, 0.001)
# print(reg001)
#
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# xMat = mat(xArr)
# srtInd = xMat[:, 1].argsort(0)
# xSort = xMat[srtInd][:, 0, :]
#
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(xSort[:, 1], yHat[srtInd])
# ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
# plt.show()

# abX, abY = regression.loadDataSet('abalone.txt')
# yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
# yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
# yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
#
# error01 = regression.rssError(abY[0:99], yHat01.T)
# print(error01)
# error1 = regression.rssError(abY[0:99], yHat1.T)
# print(error1)
# error10 = regression.rssError(abY[0:99], yHat10.T)
# print(error10)
#
# yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
# error01 = regression.rssError(abY[100:199], yHat01.T)
# print(error01)
#
# yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
# error1 = regression.rssError(abY[100:199], yHat1.T)
# print(error1)
#
# yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
# error10 = regression.rssError(abY[100:199], yHat10.T)
# print(error10)
#
# ws = regression.standRegres(abX[0:99], abY[0:99])
# yHat = mat(abX[100:199]) * ws
# error = regression.rssError(abY[100:199], yHat.T.A)
# print(error)

# abX, abY = regression.loadDataSet("abalone.txt")
# ridgeWeights = regression.ridgeTest(abX, abY)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(ridgeWeights)
# plt.show()
# print(ridgeWeights)
# print("ridgeWeights over")

# Forward stagewise regression on the abalone data
xArr, yArr = regression.loadDataSet('abalone.txt')
weights = regression.stageWise(xArr, yArr, 0.005, 1000)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(weights)
plt.show()

# print("least squares")
# xMat = mat(xArr)
# yMat = mat(yArr).T
# xMat = regression.regularize(xMat)
# yM = mean(yMat, 0)
# yMat = yMat - yM
# weights = regression.standRegres(xMat, yMat.T)
# print(weights.T)

# lgX = []; lgY = []
# regression.setDataCollect(lgX, lgY)
# print(lgX)
# print("__________")
# print(lgY)

print("over!")
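The uncommented block above runs forward stagewise regression on the abalone data with step size eps=0.005 for 1000 iterations; each row of the returned matrix is the coefficient vector after one iteration, so the plot traces how every weight evolves. A small, hypothetical way to inspect the result (variable names as in test8.py):

# weights has shape (1000, n_features): one row per stagewise iteration
final_ws = weights[-1]  # coefficients after the last iteration
print("final coefficients:", final_ws)
print("nonzero features:", [i for i, w in enumerate(final_ws) if w != 0])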
regression.py
'''
Created on Jan 8, 2011
@author: Peter
'''
from numpy import *

def loadDataSet(fileName):      #general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1    #get number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

def standRegres(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws

def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):                  #next 2 lines create weights matrix
        diffMat = testPoint - xMat[j, :]
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):   #loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat

def lwlrTestPlot(xArr, yArr, k=1.0):    #same thing as lwlrTest except it sorts X first
    yHat = zeros(shape(yArr))           #easier for plotting
    xCopy = mat(xArr)
    xCopy.sort(0)
    for i in range(shape(xArr)[0]):
        yHat[i] = lwlr(xCopy[i], xArr, yArr, k)
    return yHat, xCopy

def rssError(yArr, yHatArr):            #yArr and yHatArr both need to be arrays
    return ((yArr - yHatArr)**2).sum()

def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws

def ridgeTest(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean         #to eliminate X0 take mean off of Y
    #regularize X's
    xMeans = mean(xMat, 0)      #calc mean then subtract it off
    xVar = var(xMat, 0)         #calc variance of Xi then divide by it
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat

def regularize(xMat):           #regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)    #calc mean then subtract it off
    inVar = var(inMat, 0)       #calc variance of Xi then divide by it
    inMat = (inMat - inMeans) / inVar
    return inMat

def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean         #can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))   #testing code remove
    ws = zeros((n, 1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat

#def scrapePage(inFile,outFile,yr,numPce,origPrc):
#    from BeautifulSoup import BeautifulSoup
#    fr = open(inFile); fw = open(outFile,'a') #a is append mode writing
#    soup = BeautifulSoup(fr.read())
#    i = 1
#    currentRow = soup.findAll('table', r="%d" % i)
#    while(len(currentRow) != 0):
#        title = currentRow[0].findAll('a')[1].text
#        lwrTitle = title.lower()
#        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
#            newFlag = 1.0
#        else:
#            newFlag = 0.0
#        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
#        if len(soldUnicde) == 0:
#            print("item #%d did not sell" % i)
#        else:
#            soldPrice = currentRow[0].findAll('td')[4]
#            priceStr = soldPrice.text
#            priceStr = priceStr.replace('$','')  #strips out $
#            priceStr = priceStr.replace(',','')  #strips out ,
#            if len(soldPrice) > 1:
#                priceStr = priceStr.replace('Free shipping', '')  #strips out Free Shipping
#            print("%s\t%d\t%s" % (priceStr,newFlag,title))
#            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
#        i += 1
#        currentRow = soup.findAll('table', r="%d" % i)
#    fw.close()

from time import sleep
import json
import urllib.request

def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    #NB: this Google Shopping API endpoint has since been retired
    sleep(10)
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib.request.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else:
                newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except:
            print('problem with item %d' % i)

def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)

def crossValidation(xArr, yArr, numVal=10):
    m = len(yArr)
    indexList = list(range(m))      #list() so random.shuffle works under Python 3
    errorMat = zeros((numVal, 30))  #create error mat 30 columns numVal rows
    for i in range(numVal):
        trainX = []; trainY = []
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):          #create training set based on first 90% of values in indexList
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)    #get 30 weight vectors from ridge
        for k in range(30):                 #loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain    #regularize test with training params
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)  #test ridge results and store
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)  #calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    #can unregularize to get model
    #when we regularized we wrote Xreg = (x-meanX)/var(x)
    #we can now write in terms of x not Xreg:  x*w/var(x) - meanX/var(x) + meanY
    xMat = mat(xArr); yMat = mat(yArr).T
    meanX = mean(xMat, 0); varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    print("with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat))
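As a quick sanity check on standRegres, this sketch compares it against numpy.linalg.lstsq on synthetic data; the data layout (a bias column plus one feature) imitates ex0.txt but is made up for illustration:

import numpy as np
import regression

# Synthetic two-column data mimicking the ex0.txt layout (illustrative only)
rng = np.random.default_rng(1)
x = np.column_stack([np.ones(200), rng.uniform(0, 1, 200)])
y = x @ np.array([3.0, 1.7]) + rng.normal(scale=0.1, size=200)

ws = regression.standRegres(x.tolist(), y.tolist())  # the book's closed-form OLS
ws_np, *_ = np.linalg.lstsq(x, y, rcond=None)        # NumPy least squares

print(ws.T, ws_np)
print(np.allclose(ws.A.ravel(), ws_np))              # True: both solve the same normal equations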