INFO213 in-class exercise - KNN (课堂练习笔记)


Let us implement the KNN algorithm using a set of example data about favorite programming languages for data scientists in different cities.

cities = [(-86.75,33.5666666666667,'Python'),(-88.25,30.6833333333333,'Python'),(-112.016666666667,33.4333333333333,'Java'),(-110.933333333333,32.1166666666667,'Java'),(-92.2333333333333,34.7333333333333,'R'),(-121.95,37.7,'R'),(-118.15,33.8166666666667,'Python'),(-118.233333333333,34.05,'Java'),(-122.316666666667,37.8166666666667,'R'),(-117.6,34.05,'Python'),(-116.533333333333,33.8166666666667,'Python'),(-121.5,38.5166666666667,'R'),(-117.166666666667,32.7333333333333,'R'),(-122.383333333333,37.6166666666667,'R'),(-121.933333333333,37.3666666666667,'R'),(-122.016666666667,36.9833333333333,'Python'),(-104.716666666667,38.8166666666667,'Python'),(-104.866666666667,39.75,'Python'),(-72.65,41.7333333333333,'R'),(-75.6,39.6666666666667,'Python'),(-77.0333333333333,38.85,'Python'),(-80.2666666666667,25.8,'Java'),(-81.3833333333333,28.55,'Java'),(-82.5333333333333,27.9666666666667,'Java'),(-84.4333333333333,33.65,'Python'),(-116.216666666667,43.5666666666667,'Python'),(-87.75,41.7833333333333,'Java'),(-86.2833333333333,39.7333333333333,'Java'),(-93.65,41.5333333333333,'Java'),(-97.4166666666667,37.65,'Java'),(-85.7333333333333,38.1833333333333,'Python'),(-90.25,29.9833333333333,'Java'),(-70.3166666666667,43.65,'R'),(-76.6666666666667,39.1833333333333,'R'),(-71.0333333333333,42.3666666666667,'R'),(-72.5333333333333,42.2,'R'),(-83.0166666666667,42.4166666666667,'Python'),(-84.6,42.7833333333333,'Python'),(-93.2166666666667,44.8833333333333,'Python'),(-90.0833333333333,32.3166666666667,'Java'),(-94.5833333333333,39.1166666666667,'Java'),(-90.3833333333333,38.75,'Python'),(-108.533333333333,45.8,'Python'),(-95.9,41.3,'Python'),(-115.166666666667,36.0833333333333,'Java'),(-71.4333333333333,42.9333333333333,'R'),(-74.1666666666667,40.7,'R'),(-106.616666666667,35.05,'Python'),(-78.7333333333333,42.9333333333333,'R'),(-73.9666666666667,40.7833333333333,'R'),(-80.9333333333333,35.2166666666667,'Python'),(-78.7833333333333,35.8666666666667,'Python'),(-100.75,46.7666666666667,'Java'
),(-84.5166666666667,39.15,'Java'),(-81.85,41.4,'Java'),(-82.8833333333333,40,'Java'),(-97.6,35.4,'Python'),(-122.666666666667,45.5333333333333,'Python'),(-75.25,39.8833333333333,'Python'),(-80.2166666666667,40.5,'Python'),(-71.4333333333333,41.7333333333333,'R'),(-81.1166666666667,33.95,'R'),(-96.7333333333333,43.5666666666667,'Python'),(-90,35.05,'R'),(-86.6833333333333,36.1166666666667,'R'),(-97.7,30.3,'Python'),(-96.85,32.85,'Java'),(-95.35,29.9666666666667,'Java'),(-98.4666666666667,29.5333333333333,'Java'),(-111.966666666667,40.7666666666667,'Python'),(-73.15,44.4666666666667,'R'),(-77.3333333333333,37.5,'Python'),(-122.3,47.5333333333333,'Python'),(-89.3333333333333,43.1333333333333,'R'),(-104.816666666667,41.15,'Java')]
# Repack each (longitude, latitude, language) triple as a
# ([longitude, latitude], language) pair: a point plus its label.
cities = [([lon, lat], lang) for lon, lat, lang in cities]
[([-86.75, 33.5666666666667], 'Python'),
 ([-88.25, 30.6833333333333], 'Python'),
 ([-112.016666666667, 33.4333333333333], 'Java')]

The following is a helper function for plotting US state borders.


# 导入库
import re
import matplotlib.pyplot as plt
%matplotlib inline

# Border line segments for all states, each stored as a pair of
# (longitude, latitude) points, plus the points of the state being read.
segments = []
points = []

# Captures the lat and lng attribute values of a <point .../> element.
lat_long_regex = r"<point lat=\"(.*)\" lng=\"(.*)\""

with open("US-states.txt", "r") as f:
    lines = list(f)

# Walk through the file line by line.
for line in lines:
    if line.startswith("</state>"):
        # End of a state: pair every point with its successor to form the
        # border segments, then start collecting points for the next state.
        segments.extend(zip(points, points[1:]))
        points = []
    # Restored call: this line had been truncated to "s =, line)".
    s = re.search(lat_long_regex, line)
    if s:
        lat, lon = s.groups()
        # Store as (x, y) = (longitude, latitude) for plotting.
        points.append((float(lon), float(lat)))

#def plot_state_borders(color='0.8'):
def plot_state_borders(color = 'black'):
    """Draw every border segment in ``segments`` with matplotlib.

    color: matplotlib color passed to each ``plt.plot`` call.
    """
    for seg_start, seg_end in segments:
        start_lon, start_lat = seg_start
        end_lon, end_lat = seg_end
        xs = [start_lon, end_lon]
        ys = [start_lat, end_lat]
        plt.plot(xs, ys, color=color)


def plot_cities():
    """Scatter-plot every entry of ``cities`` by favorite language.

    Reads the module-level ``cities`` list of ([longitude, latitude],
    language) pairs and draws one scatter series per language on top of
    the state borders.
    """
    # key is language, value is pair (longitudes, latitudes)
    plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) }

    # we want each language to have a different marker and color
    markers = { "Java" : "o", "Python" : "s", "R" : "^" }
    colors  = { "Java" : "r", "Python" : "b", "R" : "g" }

    # Iterate over every ([longitude, latitude], language) row in cities and
    # file its coordinates under its language. (This loop body was missing.)
    for (longitude, latitude), language in cities:
        plots[language][0].append(longitude)
        plots[language][1].append(latitude)

    plt.figure(figsize = (15, 9))
    # create a scatter series for each language
    for language, (x, y) in plots.items():
        plt.scatter(x, y, color=colors[language], marker=markers[language],
                          label=language, zorder=10)

    plot_state_borders()    # draw the state outlines

    plt.legend(loc=0)          # let matplotlib choose the location
    plt.axis([-130,-60,20,55]) # set the axes
    plt.title("Favorite Programming Languages") # set title


 Let’s say we’ve picked a number k like 3 or 5. Then when we want to classify some new data point, we find the k nearest labeled points and let them vote on the new output. To do this, we’ll need a function that counts votes. One possibility is:

from collections import Counter 
import math, random

def raw_majority_vote(labels):
    """Return the most frequent label; ties are not handled specially."""
    # most_common(1) yields a one-element list of (label, count)
    top_entry = Counter(labels).most_common(1)[0]
    return top_entry[0]

labels = [2, 3, 3, 4, 4, 5]
[(3, 2)]

What happens if there are several labels having equal majority votes? We need a strategy to break the tie. There are several options:

* Pick one of the winners at random.
* Weight the votes by distance and pick the weighted winner.
* Reduce k until we find a unique winner.

The following implements the third:

def majority_vote(labels):
    """Return the winning label, breaking ties by dropping far neighbors.

    Assumes that labels are ordered from nearest to farthest. While several
    labels share the highest count, the farthest label is discarded and the
    vote is repeated. A single remaining label always wins, so the
    recursion terminates for any non-empty input.
    """
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = sum(1 for count in vote_counts.values()
                      if count == winner_count)

    if num_winners == 1:
        return winner                     # unique winner, so return it
    # Tie: previously this line was unreachable (missing else), so ties
    # returned None instead of retrying.
    return majority_vote(labels[:-1])     # try again without the farthest

# Using labels = [2, 3, 3, 4, 4, 5] as an example


With this, we can create a KNN classifier:

from scipy.spatial import distance
def knn_classify(k, labeled_points, new_point):
    """Classify new_point by a vote among its k nearest labeled points.

    Each labeled point should be a pair (point, label).
    """
    def distance_to_new_point(point_label):
        # Euclidean distance from a labeled pair's point to the query
        return distance.euclidean(point_label[0], new_point)

    # nearest first
    ranked = sorted(labeled_points, key=distance_to_new_point)

    # keep only the labels of the k closest pairs
    nearest_labels = []
    for _point, tag in ranked[:k]:
        nearest_labels.append(tag)

    # the vote decides the prediction
    return majority_vote(nearest_labels)

To start with, let’s look at what happens if we try to predict each city’s preferred language using its neighbors other than itself:

# compare leave-one-out accuracy for several neighborhood sizes
for k in [1, 3, 5, 7]:
    num_correct = 0

    for location, actual_language in cities:
        # the city being predicted must not vote for itself
        held_out = (location, actual_language)
        other_cities = [other_city
                        for other_city in cities
                        if other_city != held_out]

        predicted_language = knn_classify(k, other_cities, location)

        # a correct prediction adds 1, a wrong one adds 0
        num_correct += int(predicted_language == actual_language)

    print(k, "neighbor[s]:", num_correct, "correct out of", len(cities))
1 neighbor[s]: 40 correct out of 75
3 neighbor[s]: 44 correct out of 75
5 neighbor[s]: 41 correct out of 75
7 neighbor[s]: 35 correct out of 75

Now we can look at what regions would get classified to which languages under each nearest neighbors scheme. We can do that by classifying an entire grid worth of points, and then plotting them as we did the cities:

def classify_and_plot_grid(k=1):
    """Classify an integer longitude/latitude grid with k-NN and plot it.

    k: number of neighbors passed to ``knn_classify``.

    Every grid point with longitude in [-130, -60) and latitude in [20, 55)
    is classified against the module-level ``cities`` and drawn in the
    color and marker of its predicted language, beneath the state borders.
    """
    plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) }
    markers = { "Java" : "o", "Python" : "s", "R" : "^" }
    colors  = { "Java" : "red", "Python" : "blue", "R" : "green" }

    for longitude in range(-130, -60):
        for latitude in range(20, 55):
            predicted_language = knn_classify(k, cities, [longitude, latitude])
            # Record the grid point under its predicted language. Without
            # these appends every scatter series stayed empty.
            plots[predicted_language][0].append(longitude)
            plots[predicted_language][1].append(latitude)

    plt.figure(figsize = (15, 9))
    # create a scatter series for each language
    for language, (x, y) in plots.items():
        plt.scatter(x, y, color=colors[language], marker=markers[language],
                          label=language, zorder=0)
    plot_state_borders()    # draw the state outlines

    plt.legend(loc=0)          # let matplotlib choose the location
    plt.axis([-130,-60,20,55]) # set the axes
    plt.title(str(k) + "-Nearest Neighbor Programming Languages")
#test k=3



The Curse of Dimensionality
k-nearest neighbors runs into trouble in higher dimensions thanks to the “curse of
dimensionality,” which boils down to the fact that high-dimensional spaces are vast.
Points in high-dimensional spaces tend not to be close to one another at all. One way
to see this is by randomly generating pairs of points in the d-dimensional “unit cube”
in a variety of dimensions, and calculating the distances between them.

Let us define a function that returns a list of num_pairs distances, each measured between two random points in dim dimensions:

from scipy.spatial import distance
import numpy as np
import random

def random_distances(dim, num_pairs):
    """Return num_pairs Euclidean distances between random point pairs.

    dim: number of coordinates of each point.
    num_pairs: how many pairs of points to draw.

    Each point has dim coordinates drawn by np.random.rand, i.e. uniformly
    from [0, 1).
    """
    # Draw 1-D vectors: distance.euclidean is documented for 1-D input, and
    # rand(dim) consumes the same random numbers as rand(1, dim).
    return [distance.euclidean(np.random.rand(dim), np.random.rand(dim))
            for _ in range(num_pairs)]

For a range of dimensions between 1 and 101 with stride 5, compute 10000 distances:

dimensions = range(1, 101, 5)
avg_distances, min_distances = [], []

for dim in dimensions:
    # distances between 10,000 random pairs of points in this dimension
    distances = random_distances(dim, 10000)
    avg_distances += [np.mean(distances)]   # keep the average
    min_distances += [min(distances)]       # keep the minimum

Plot the ratio between minimum distances and average distances as the dimension increases:

# closest-pair distance relative to the average distance, per dimension
min_avg_ratio = np.array(min_distances) / np.array(avg_distances)
plt.plot(list(dimensions), list(min_avg_ratio))



In low-dimensional data sets, the closest points tend to be much closer than average. But two points are close only if they’re close in every dimension, and every extra dimension—even if just noise—is another opportunity for each point to be further away from every other point. When you have a lot of dimensions, it’s likely that the closest points aren’t much closer than average, which means that two points being close doesn’t mean very much (unless there’s a lot of structure in your data that makes it behave as if it were much lower-dimensional).

A different way of thinking about the problem involves the sparsity of higher-dimensional spaces.

If you pick 50 random numbers between 0 and 1, you’ll probably get a pretty good sample of the unit interval:

If you pick 50 random points in the unit square, you’ll get less coverage:

And in three dimensions less still:


 So if you’re trying to use nearest neighbors in higher dimensions, it’s probably a good idea to do some kind of dimensionality reduction first (dimensionality reduction is very useful here).

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

y = []
for n in range(1000):
plt.plot(np.arange(1000), y)


The package `sklearn.neighbors` provides functionality for unsupervised and supervised neighbors-based learning methods. Despite its simplicity, nearest neighbors has been successful in a large number of classification and regression problems, including handwritten digits or satellite image scenes. Being a non-parametric and instance-based method, it is often successful in classification situations where the decision boundary is very irregular.

scikit-learn implements two different nearest neighbors classifiers: KNeighborsClassifier implements learning based on the k nearest neighbors of each query point, where k is an integer value specified by the user. RadiusNeighborsClassifier implements learning based on the number of neighbors within a fixed radius r of each training point, where r is a floating-point value specified by the user.

The k-neighbors classification in KNeighborsClassifier is the more commonly used of the two techniques. The optimal choice of the value k is highly data-dependent: in general a larger k suppresses the effects of noise, but makes the classification boundaries less distinct.

The basic nearest neighbors classification uses uniform weights: that is, the value assigned to a query point is computed from a simple majority vote of the nearest neighbors. Under some circumstances, it is better to weight the neighbors such that nearer neighbors contribute more to the fit. This can be accomplished through the weights keyword. The default value, weights = 'uniform', assigns uniform weights to each neighbor. weights = 'distance' assigns weights proportional to the inverse of the distance from the query point. Alternatively, a user-defined function of the distance can be supplied which is used to compute the weights.

The Iris Flower Data Set

The Iris dataset was used in R.A. Fisher's classic 1936 paper, The Use of Multiple Measurements in Taxonomic Problems, and can also be found on the UCI Machine Learning Repository.

It includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

The columns in this dataset are:

  • Id
  • SepalLengthCm
  • SepalWidthCm
  • PetalLengthCm
  • PetalWidthCm
  • Species

Load the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors

%matplotlib inline

df = pd.read_csv("Iris.csv")







(150, 6)


species = df.Species.unique()
label_maps = dict(zip(species, np.arange(len(species))))
{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
# Encode the species names as integers. Assign the result back explicitly:
# an inplace replace on df['Species'] is a chained call that may act on a
# temporary copy and leave df unchanged under pandas copy-on-write.
df['Species'] = df['Species'].map(label_maps)
#df.head(10) shows the first n rows of the data; n defaults to 5 when omitted

X_train = df.iloc[:, 1:-1]

Explore the Data


y_train = df.Species
plt.figure(figsize = (15, 9))
pd.plotting.scatter_matrix(X_train, figsize = (15, 9), c = y_train, marker = 'o')
<Figure size 1080x648 with 0 Axes>

Model and Predict

from sklearn.neighbors import KNeighborsClassifier

# Create scikit-learn's KNN model with k = 5.
knn = KNeighborsClassifier(n_neighbors = 5)
# Train the model. This call had been lost; score() and predict() below
# require a fitted estimator.
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
y_pred = knn.predict(X_train)
plt.figure(figsize = (15, 9))
pd.plotting.scatter_matrix(X_train, figsize = (15, 9), c = y_pred, marker = 'o')
<Figure size 1080x648 with 0 Axes>


Given a set of labeled data and a predictive model, every data point lies in one of four categories:

  • True positive (tp): “This is iris-setosa, we predicted as iris-setosa”
  • False positive (fp) (Type 1 Error): “this is not iris-setosa (v or c), we predicted as iris-setosa”
  • False negative (fn) (Type 2 Error): “this is iris-setosa, we predicted as not iris-setosa”
  • True negative (tn): “this is not iris-setosa, we predicted as not iris-setosa”

We often represent these as counts in a confusion matrix:

# Boolean masks: is each sample actually class 0 / predicted as class 0?
actual_is_0 = (y_train == 0)
predicted_is_0 = (y_pred == 0)

# The four cells of the confusion matrix for class 0.
tp = sum(actual_is_0 & predicted_is_0)
fn = sum(actual_is_0 & ~predicted_is_0)
fp = sum(~actual_is_0 & predicted_is_0)
tn = sum(~actual_is_0 & ~predicted_is_0)

recall_0 = tp / (tp + fn)
precision_0 = tp / (tp + fp)
f1_0 = 2 * tp /(2 * tp + fp + fn)

print("recall for class 0 = " + str(recall_0))
print("precision for class 0 = " + str(precision_0))
print("F1 Score for class 0 = " + str(f1_0))
recall for class 0 = 1.0
precision for class 0 = 1.0
F1 Score for class 0 = 1.0


Many techniques are sensitive to the scale of your data. For example, imagine that you have a data set consisting of the heights and weights of hundreds of data scientists, and that you are trying to identify clusters of body sizes.

Intuitively, we’d like clusters to represent points near each other, which means that we need some notion of distance between points. We already have a Euclidean distance function, so a natural approach might be to treat (height, weight) pairs as points in two-dimensional space. Consider the people listed in the following table:

df = pd.DataFrame({"Person":['A', 'B', 'C'], "height (cm)":[160, 170.2, 177.8], "weight (pounds)":[150, 160, 171], "height (inches)":[63, 67, 70]})

The distances between the pairs of persons in terms of height (inches) and weight (pounds) are:

from scipy.spatial import distance
# one row per person, restricted to the columns from position 2 onward
person_a, person_b, person_c = (df.iloc[row, 2:] for row in range(3))
print("A to B: " + str(distance.euclidean(person_a, person_b)))
print("A to C: " + str(distance.euclidean(person_a, person_c)))
print("B to C: " + str(distance.euclidean(person_b, person_c)))
A to B: 10.770329614269007
A to C: 22.135943621178654
B to C: 11.40175425099138


# Distances using height (cm) and weight (pounds): columns 1 and 2.
# The earlier slice 1::2 picked columns 1 and 3, i.e. height twice
# (cm and inches) with no weight at all.
# With the correct columns the distances are roughly 14.28, 27.53, 13.37.
print("A to B: " + str(distance.euclidean(df.iloc[0, 1:3], df.iloc[1, 1:3])))
print("A to C: " + str(distance.euclidean(df.iloc[0, 1:3], df.iloc[2, 1:3])))
print("B to C: " + str(distance.euclidean(df.iloc[1, 1:3], df.iloc[2, 1:3])))
A to B: 10.95627673984186
A to C: 19.126944345608383
B to C: 8.17067928632622

The nearest point to B is different using different units. Obviously it’s problematic if changing units can change results like this. For this reason, when dimensions aren’t comparable with one another, we will sometimes rescale our data so that each dimension has mean 0 and standard deviation 1. This effectively gets rid of the units, converting each dimension to “standard deviations from the mean.”

The equation for scaling is scaled = (data - mean) / standard_deviation

df_data = df.set_index('Person')
df_scaled = (df_data - df_data.mean(axis = 0)) / df_data.std(axis = 0)

df_scaled.mean(axis = 0)
height (cm)       -9.992007e-16
weight (pounds)   -8.881784e-16
height (inches)   -1.332268e-15
dtype: float64
df_scaled.std(axis = 0)
height (cm)        1.0
weight (pounds)    1.0
height (inches)    1.0
dtype: float64
# the distances using height in inches:
from scipy.spatial import distance
print("A to B: " + str(distance.euclidean(df_scaled.iloc[0, 1:], df_scaled.iloc[1, 1:])))
print("A to C: " + str(distance.euclidean(df_scaled.iloc[0, 1:], df_scaled.iloc[2, 1:])))
print("B to C: " + str(distance.euclidean(df_scaled.iloc[1, 1:], df_scaled.iloc[2, 1:])))
A to B: 1.4844668093876097
A to C: 2.823110310444265
B to C: 1.351446065105763

Using Scikit Learn to Scale

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn each column's mean and variance first; transform() on an unfitted
# scaler raises NotFittedError.
scaler.fit(df_data)
scaled = scaler.transform(df_data)
array([[-1.27983368, -1.20484922, -1.27872403],
       [ 0.1188417 , -0.0388661 ,  0.11624764],
       [ 1.16099199,  1.24371532,  1.16247639]])
scaled.mean(axis = 0)
array([-1.33226763e-15, -1.11022302e-15, -1.70234197e-15])
scaled.std(axis = 0)
array([1., 1., 1.])
df_data.mean(axis = 0)
height (cm)        169.333333
weight (pounds)    160.333333
height (inches)     66.666667
dtype: float64
array([169.33333333, 160.33333333,  66.66666667])
df_data.var(axis = 0)
height (cm)         79.773333
weight (pounds)    110.333333
height (inches)     12.333333
dtype: float64
array([53.18222222, 73.55555556,  8.22222222])

Use preprocessing.scale():

from sklearn import preprocessing

sdf_data = preprocessing.scale(df_data)

array([[-1.27983368, -1.20484922, -1.27872403],
       [ 0.1188417 , -0.0388661 ,  0.11624764],
       [ 1.16099199,  1.24371532,  1.16247639]])
