[Study Notes][Machine Learning in Python: Essential Techniques for Predictive Analysis][Multivariable Regression: Using Cross-Validation to Estimate the Out-of-Sample Error of the Lasso Model]
import numpy
from sklearn.linear_model import LassoCV
from math import sqrt
import matplotlib.pyplot as plot

#read data into iterable
#target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
#data = urllib2.urlopen(target_url)
target_url_file = open('winequality-red.csv', 'r')
data = target_url_file.readlines()
target_url_file.close()

xList = []
labels = []
names = []
firstLine = True
for line in data:
    if firstLine:
        names = line.strip().split(";")
        firstLine = False
    else:
        #split on semi-colon
        row = line.strip().split(";")
        #put labels in separate array
        labels.append(float(row[-1]))
        #remove label from row
        row.pop()
        #convert row to floats
        floatRow = [float(num) for num in row]
        xList.append(floatRow)

#Normalize columns in x and labels
#Note: be careful about normalization.
#Some penalized regression packages include it and some don't.

nrows = len(xList)
ncols = len(xList[0])

#calculate means and standard deviations of each column
xMeans = []
xSD = []
for i in range(ncols):
    col = [xList[j][i] for j in range(nrows)]
    mean = sum(col)/nrows
    xMeans.append(mean)
    colDiff = [(xList[j][i] - mean) for j in range(nrows)]
    sumSq = sum([diff * diff for diff in colDiff])
    stdDev = sqrt(sumSq/nrows)
    xSD.append(stdDev)

#use the calculated means and standard deviations to normalize xList
xNormalized = []
for i in range(nrows):
    rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)]
    xNormalized.append(rowNormalized)

#Normalize labels
meanLabel = sum(labels)/nrows
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows)

labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)]

#Convert lists of lists to numpy arrays for input to sklearn packages

#Unnormalized labels
Y = numpy.array(labels)

#normalized labels (overwrites the unnormalized version; this is what gets used)
Y = numpy.array(labelNormalized)

#Unnormalized X's
X = numpy.array(xList)

#Normalized X's (overwrites the unnormalized version; this is what gets used)
X = numpy.array(xNormalized)

#Call LassoCV from sklearn.linear_model
#10-fold cross-validation
wineModel = LassoCV(cv=10).fit(X, Y)

#Display results
plot.figure(figsize=(12, 8))
#MSE of each individual fold as alpha varies
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
#average MSE across the folds as alpha varies
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
#vertical line at the alpha value that cross-validation selects as best
plot.axvline(wineModel.alpha_, linestyle='dotted', label='CV Estimate of Best alpha')
#semilog scale: the x-axis is plotted on a log10 scale, which makes the
#exponentially spaced alpha values easy to read
plot.semilogx()
#add the legend
plot.legend()
#the current figure and axes can be obtained with plt.gcf() and plt.gca(),
#short for Get Current Figure and Get Current Axes
ax = plot.gca()
#reverse the x-axis so alpha decreases (model complexity increases) left to right
ax.invert_xaxis()

plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()

#print out the value of alpha that minimizes the CV error
print("alpha Value that Minimizes CV Error ", wineModel.alpha_)
print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1)))
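The column-wise standardization above is written out by hand so the mean and standard deviation bookkeeping is explicit. As a sanity check (my addition, not from the book's listing), the same result can be obtained with sklearn's StandardScaler, which also divides by the population standard deviation; the sketch below assumes xList and xNormalized are the variables built in the listing.

#Minimal sanity-check sketch (assumption: xList and xNormalized exist as built above)
import numpy
from sklearn.preprocessing import StandardScaler

Xraw = numpy.array(xList)                       #unnormalized attribute matrix
Xscaled = StandardScaler().fit_transform(Xraw)  #(x - mean) / population std dev

#prints True (up to floating-point error) if both normalizations agree
print(numpy.allclose(Xscaled, numpy.array(xNormalized)))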
Running the script prints:

alpha Value that Minimizes CV Error  0.013561387700964642
Minimum MSE  0.6655849206002853
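To see which wine attributes the lasso actually keeps at the CV-selected alpha, you can read off the fitted coefficients. A minimal sketch (my addition), assuming wineModel and names come from the listing above; the last entry of names is the label column ("quality"), so it is dropped:

#Minimal sketch: list the coefficients of the fitted LassoCV model so you can
#see which attributes are kept and which are shrunk to zero.
#Assumption: wineModel and names come from the listing above.
featureNames = [name.strip('"') for name in names[:-1]]  #drop the label column
for name, coef in zip(featureNames, wineModel.coef_):
    print(name, coef)
print("intercept", wineModel.intercept_)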