K近邻算法

K近邻算法原理

K近邻算法(K-Nearest Neighbor, KNN)是最简单的监督学习分类算法之一。(有之一吗?)

对于一个待分类的样本点,K近邻算法在训练集中寻找与它距离最近的K个训练样本点,即K个最近邻(Nearest Neighbor)。

若这K个邻居中属于某一类别的样本最多,则认为该待分类的样本点也属于这一类别。

KNN算法Python实现

KNN算法无需显式的训练过程(只需保存训练样本),很容易实现。

from numpy import *
import operator

class KNNClassifier():
    """k-nearest-neighbour classifier with min-max feature scaling.

    Samples are stored unscaled in ``dataSet`` (one feature row per sample)
    with the matching class label in ``labels``.  Scaling is applied on the
    fly inside :meth:`classify`, so the stored data is never modified and
    the classifier can be queried or extended repeatedly.
    """

    def __init__(self):
        self.dataSet = []  # feature rows, one list of floats per sample
        self.labels = []   # class label of each row in dataSet

    def loadDataSet(self, filename):
        """Append the samples stored in a whitespace-separated text file.

        Every non-blank line holds numeric columns; the last column is the
        class label (stored as ``int``), the preceding columns are the
        features (stored as ``float``).

        Args:
            filename: path of the text file to read.
        """
        # The context manager closes the file even if a line fails to parse.
        with open(filename) as fr:
            for line in fr:
                fields = [float(i) for i in line.split()]
                if not fields:
                    continue  # skip blank lines instead of failing on pop()
                label = fields.pop()  # the last column is the label
                self.dataSet.append(fields)
                self.labels.append(int(label))

    def setDataSet(self, dataSet, labels):
        """Replace the stored samples and their labels."""
        self.dataSet = dataSet
        self.labels = labels

    def classify(self, data, k):
        """Return the majority label among the ``k`` samples nearest to ``data``.

        Both the stored samples and the query are min-max scaled with the
        same per-feature minimum and range, then compared by squared
        Euclidean distance.  Ties in the vote go to the label whose first
        supporter is nearest to the query.

        Args:
            data: feature vector of the point to classify (unscaled).
            k: number of neighbours that vote; values larger than the
                number of stored samples are clamped to that number.

        Returns:
            The winning label, as stored in ``labels``.

        Raises:
            ValueError: if there are no samples, if the number of labels
                differs from the number of samples, or if ``k < 1``.
        """
        labels = asarray(self.labels)
        normed, minVals, ranges = self._normDataSet()
        dataSetSize = normed.shape[0]
        if len(labels) != dataSetSize:
            raise ValueError("dataSet and labels must have the same length")
        if k < 1:
            raise ValueError("k must be at least 1")
        # NOTE: `from numpy import *` may shadow the built-in min()/max(),
        # so clamp with a plain comparison instead of calling min().
        if k > dataSetSize:
            k = dataSetSize

        # Scale the query exactly like the samples; comparing a raw query
        # against scaled samples would let large-valued features dominate.
        query = (asarray(data, dtype=float) - minVals) / ranges
        distances = ((normed - query) ** 2).sum(axis=1)

        # A stable sort keeps equidistant samples in their stored order.
        sortedDistIndicies = distances.argsort(kind="stable")
        classCount = {}
        for i in range(k):
            voteIlabel = labels[sortedDistIndicies[i]]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

        # sorted() is stable, so equal counts keep nearest-first order.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]

    def _normDataSet(self):
        """Return ``(scaled, minVals, ranges)`` for the stored samples.

        ``scaled`` is ``(dataSet - minVals) / ranges`` computed per feature
        column.  ``self.dataSet`` itself is left untouched.

        Raises:
            ValueError: if ``dataSet`` is not a non-empty 2-D table.
        """
        samples = asarray(self.dataSet, dtype=float)
        if samples.ndim != 2 or samples.shape[0] == 0:
            raise ValueError("dataSet must be a non-empty 2-D collection of samples")
        minVals = samples.min(0)
        ranges = samples.max(0) - minVals
        # A constant column has range 0; divide by 1 there so it scales to
        # 0 instead of producing NaN distances.
        ranges = where(ranges == 0, 1.0, ranges)
        return (samples - minVals) / ranges, minVals, ranges

    def test(self):
        """Load a six-point toy data set and print the label of [1.0, 1.1]."""
        self.dataSet = array([[1.0,1.1],[1.0,1.0],[0.9,0.9],[0,0],[0,0.1],[0,0.2]])
        self.labels = [1,1,1,2,2,2]
        print(self.classify([1.0,1.1], 2))

if __name__ == '__main__':
  # Demo run: read the samples from disk, then label a single query point
  # using the vote of its 5 nearest neighbours.
  KNN = KNNClassifier()
  KNN.loadDataSet('testData.txt')
  query = [72011, 4.932976, 0.632026]
  predicted = KNN.classify(query, 5)
  print(predicted)
  # Alternative: KNN.test() runs the built-in six-point toy example.
posted @ 2016-04-05 19:33  -Finley-  阅读(617)  评论(0)  编辑  收藏  举报