机器学习实战——第二章之改进约会网站的配对效果
三种类型(类标签):不喜欢的人——1,魅力一般的人——2,极具魅力的人——3。
样本特征:每年获得的飞行常客里程数,玩视频游戏所耗时间百分比,每周消费的冰淇淋公升数。
from numpy import *
import matplotlib
import matplotlib.pyplot as plt


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric feature fields
    followed by an integer class label in the last field, separated by tabs.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float array holding the first three fields of every
        record, ``classLabelVector`` is a list with the ``n`` integer labels
        taken from the last field of every record.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a feature or label field is not numeric, or a record
            has fewer than three feature fields.
    """
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        # Strip surrounding whitespace (including the line terminator) and
        # drop blank lines, so a trailing empty line does not break parsing.
        arrayOLines = [line.strip() for line in fr if line.strip()]

    numberOfLines = len(arrayOLines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # The first three fields are the features of this record.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


if __name__ == '__main__':
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Plot feature 0 against feature 1; marker size and colour are both
    # scaled by the class label so the classes can be told apart.
    ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 20.0*array(datingLabels), 15.0*array(datingLabels))
    plt.show()
>>> import numpy as np
>>> np.zeros((3,2)) #3行2列的零矩阵
array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])
ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 20.0*array(datingLabels), 15.0*array(datingLabels))  # scatter(x, y, marker sizes, marker colors)
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` with min-max normalisation.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.
    A column whose values are all equal (``max == min``) is mapped to 0
    rather than producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array
        (same shape as ``dataSet``), the per-column ``max - min`` values and
        the per-column minima. ``ranges`` is returned as computed, so it
        still contains 0 for constant columns.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0: their numerator is already
    # 0, so they normalise to 0.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # tiled copies of minVals / ranges are needed.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals


if __name__ == '__main__':
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Plot the first two normalised features; marker size and colour are
    # both scaled by the class label.
    ax.scatter(normMat[:,0],normMat[:,1], 30.0*array(datingLabels), 100.0*array(datingLabels))
    plt.xlabel(u'每年获得的飞行常客里程数')
    plt.ylabel(u'玩视频游戏所耗时间百分比')
    plt.show()
def datingClassTest(hoRatio=0.10, k=20):
    """Estimate the classifier's error rate on a hold-out slice of the data.

    Loads 'datingTestSet2.txt', normalises it, and uses the first
    ``hoRatio`` fraction of the rows as test samples while the remaining
    rows serve as the training set. Every prediction and the final error
    rate (wrong predictions / number of test samples) are printed.

    Args:
        hoRatio: Fraction of the samples held out for testing
            (default 0.10, i.e. 10%).
        k: Value passed as the fourth argument of ``classify0``
            (default 20). NOTE(review): ``classify0`` is defined elsewhere;
            this is assumed to be the number of neighbours -- confirm.

    Raises:
        ZeroDivisionError: If the hold-out slice is empty, i.e.
            ``int(m * hoRatio) == 0``.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of rows held out for testing
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples; rows [numTestVecs, m)
        # and their labels form the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        # Single-argument print(...) prints the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


if __name__ == '__main__':
    datingClassTest()
def classifyPerson():
    """Interactively predict how much the user will like a described person.

    Prompts for three feature values, normalises them with the minima and
    ranges computed from 'datingTestSet2.txt', classifies the resulting
    vector with ``classify0`` (fourth argument 3) and prints the matching
    entry of the result list.

    Raises:
        ValueError: If an entered value cannot be converted to ``float``.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    # Index = class label - 1 (labels in the data file are 1, 2, 3).
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(readLine("percentage of time spend playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    # NOTE(review): the prompt says "per year" while the accompanying text
    # describes this feature as weekly consumption -- confirm the wording.
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order used here: miles, game-time percentage, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same min / range as the training data.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # "%s %s" reproduces the single space that the Python 2 statement
    # `print a, b` inserts between its two items.
    print("%s %s" % ("You will probably like this person: ",
                     resultList[classifierResult - 1]))


if __name__ == '__main__':
    classifyPerson()
测试:
percentage of time spend playing video games?10
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.5
You will probably like this person: in small doses