K-Nearest Neighbors (KNN) in Python: Code Notes

1. Code

  1. Basic usage example
from sklearn.neighbors import KNeighborsClassifier


x = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
# Instantiate the estimator
estimator = KNeighborsClassifier(n_neighbors=2)
# Train with the fit method
estimator.fit(x, y)
print(estimator.predict([[1]]))
  • If Chinese characters fail to display on the plot axes, the following code fixes it
from pylab import mpl
# Set a font that can display Chinese characters
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# Keep minus signs rendering correctly when a CJK font is active
mpl.rcParams["axes.unicode_minus"] = False
  • iris example
  2. Get the data
from sklearn.datasets import load_iris
# Load the iris dataset
iris = load_iris()
print("Iris dataset return value:\n", iris)
# The return value is a Bunch object, which inherits from dict
print("Iris feature values:\n", iris["data"])
print("Iris target values:\n", iris.target)
print("Iris feature names:\n", iris.feature_names)
print("Iris target names:\n", iris.target_names)
print("Iris dataset description:\n", iris.DESCR)
  3. Draw a scatter plot
from sklearn.datasets import load_iris
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pylab import mpl
# Set a font that can display Chinese characters
mpl.rcParams["font.sans-serif"] = ["SimHei"]


# Load the iris dataset
iris = load_iris()
# Convert the data into a DataFrame
iris_d = pd.DataFrame(iris['data'], columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'])
iris_d['Species'] = iris.target


def plot_iris(iris, col1, col2):
    """
    :param iris: 总数据
    :param col1: Petal_Width
    :param col2: Sepal_Length
    :return:
    """
    sns.lmplot(x = col1, y = col2, data = iris, hue = "Species", fit_reg = False)
    plt.xlabel(col1)
    plt.ylabel(col2)
    # col1,col2为索引,这是DataFrame数据特性
    plt.title('鸢尾花种类分布图')
    plt.show()


plot_iris(iris_d, 'Petal_Width', 'Sepal_Length')
  4. Split the dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# 1. Load the iris dataset
iris = load_iris()
# Split the iris dataset
# x_train: training features, x_test: test features, y_train: training targets, y_test: test targets
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=22)
print("x_train:\n", x_train.shape)
# Random seed
x_train1, x_test1, y_train1, y_test1 = train_test_split(iris.data, iris.target, random_state=6)
x_train2, x_test2, y_train2, y_test2 = train_test_split(iris.data, iris.target, random_state=6)
print("With different random seeds:\n", x_train == x_train1)
print("With the same random seed:\n", x_train1 == x_train2)
  5. Simple end-to-end example
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


# 1. Load the dataset
iris = load_iris()

# 2. Basic data processing
# x_train, x_test, y_train, y_test: training features, test features, training targets, test targets
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
# 3. Feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. Machine learning (model training)
estimator = KNeighborsClassifier(n_neighbors=9)
estimator.fit(x_train, y_train)
# 5. Model evaluation
# Method 1: compare predictions with the true values
y_predict = estimator.predict(x_test)
print("Predictions:\n", y_predict)
print("Predictions vs. true values:\n", y_predict == y_test)
# Method 2: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
  6. Example with grid search and cross-validation
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


# 1. Load the dataset
iris = load_iris()
# 2. Basic data processing -- split the dataset
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=22)
# 3. Feature engineering: standardization
# Instantiate a transformer
transfer = StandardScaler()
# Call fit_transform on the training set, transform on the test set
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. KNN estimator workflow
#  4.1 Instantiate the estimator
estimator = KNeighborsClassifier()

# 4.2 Model selection and tuning -- grid search and cross-validation
# Hyperparameters to tune
param_dict = {"n_neighbors": [1, 3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
# 4.3 Fit the training data
estimator.fit(x_train, y_train)
# 5. Model evaluation
# Method a: compare predictions with the true values
y_predict = estimator.predict(x_test)
print("Predictions vs. true values:\n", y_predict == y_test)
# Method b: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)

print("Best cross-validation score:\n", estimator.best_score_)
print("Best estimator:\n", estimator.best_estimator_)
print("Per-fold cross-validation results:\n", estimator.cv_results_)
  7. Facebook check-in location example
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd


# 1. Load the data
facebook = pd.read_csv("./data/FBlocation/train.csv")

# 2. Basic data processing
# 2.1 Narrow down the data range (copy so the column assignments below
#     do not trigger pandas' SettingWithCopyWarning)
facebook_data = facebook.query("x>2.0 & x<2.5 & y>2.0 & y<2.5").copy()
# 2.2 Extract time features
time = pd.to_datetime(facebook_data["time"], unit="s")
time = pd.DatetimeIndex(time)
facebook_data["day"] = time.day
facebook_data["hour"] = time.hour
facebook_data["weekday"] = time.weekday
# 2.3 Drop places with too few check-ins
place_count = facebook_data.groupby("place_id").count()
place_count = place_count[place_count["row_id"] > 3]
facebook_data = facebook_data[facebook_data["place_id"].isin(place_count.index)]
# 2.4 Select the features and the target
x = facebook_data[["x", "y", "accuracy", "day", "hour", "weekday"]]
y = facebook_data["place_id"]
# 2.5 Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering -- preprocessing (standardization)
# 3.1 Instantiate a transformer
transfer = StandardScaler()
# 3.2 Fit on the training set, then apply the same scaling to the test set
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning -- KNN + cross-validation
# 4.1 Instantiate an estimator
estimator = KNeighborsClassifier()
# 4.2 Wrap it in GridSearchCV
param_grid = {"n_neighbors": [1, 3, 5, 7, 9]}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=5)
# 4.3 Train the model
estimator.fit(x_train, y_train)

# 5. Model evaluation
# 5.1 Basic evaluation
score = estimator.score(x_test, y_test)
print("Final accuracy:\n", score)

y_predict = estimator.predict(x_test)
print("最后的预测值为:\n", y_predict)
print("预测值和真实值的对比情况:\n", y_predict == y_test)

# 5.2 Evaluation with cross-validation results
print("Best cross-validation score:\n", estimator.best_score_)
print("Best estimator:\n", estimator.best_estimator_)
print("Per-fold validation and training accuracy:\n", estimator.cv_results_)

2. Additional notes

  1. lmplot

x, y:strings, optional

Input variables; these should be column names in data.

data:DataFrame

Tidy (“long-form”) dataframe where each column is a variable and each row is an observation.

hue, col, row:strings

Variables that define subsets of the data, which will be drawn on separate facets in the grid. See the *_order parameters to control the order of levels of this variable.

palette:palette name, list, or dict, optional

Colors to use for the different levels of the hue variable. Should be something that can be interpreted by color_palette(), or a dictionary mapping hue levels to matplotlib colors.

col_wrap:int, optional

“Wrap” the column variable at this width, so that the column facets span multiple rows. Incompatible with a row facet.

height:scalar, optional

Height (in inches) of each facet. See also: aspect.

aspect:scalar, optional

Aspect ratio of each facet, so that aspect * height gives the width of each facet in inches.

markers:matplotlib marker code or list of marker codes, optional

Markers for the scatterplot. If a list, each marker in the list will be used for each level of the hue variable.

share{x,y}:bool, ‘col’, or ‘row’, optional

If true, the facets will share y axes across columns and/or x axes across rows.

{hue,col,row}_order:lists, optional

Order for the levels of the faceting variables. By default, this will be the order that the levels appear in data or, if the variables are pandas categoricals, the category order.

legend:bool, optional

If True and there is a hue variable, add a legend.

legend_out:bool, optional

If True, the figure size will be extended, and the legend will be drawn outside the plot on the center right.

x_estimator:callable that maps vector -> scalar, optional

Apply this function to each unique value of x and plot the resulting estimate. This is useful when x is a discrete variable. If x_ci is given, this estimate will be bootstrapped and a confidence interval will be drawn.

x_bins:int or vector, optional

Bin the x variable into discrete bins and then estimate the central tendency and a confidence interval. This binning only influences how the scatterplot is drawn; the regression is still fit to the original data. This parameter is interpreted either as the number of evenly-sized (not necessarily evenly spaced) bins or the positions of the bin centers. When this parameter is used, it implies that the default of x_estimator is numpy.mean.

x_ci:“ci”, “sd”, int in [0, 100] or None, optional

Size of the confidence interval used when plotting a central tendency for discrete values of x. If "ci", defer to the value of the ci parameter. If "sd", skip bootstrapping and show the standard deviation of the observations in each bin.

scatter:bool, optional

If True, draw a scatterplot with the underlying observations (or the x_estimator values).

fit_reg:bool, optional

If True, estimate and plot a regression model relating the x and y variables.

ci:int in [0, 100] or None, optional

Size of the confidence interval for the regression estimate. This will be drawn using translucent bands around the regression line. The confidence interval is estimated using a bootstrap; for large datasets, it may be advisable to avoid that computation by setting this parameter to None.

n_boot:int, optional

Number of bootstrap resamples used to estimate the ci. The default value attempts to balance time and stability; you may want to increase this value for “final” versions of plots.

units:variable name in data, optional

If the x and y observations are nested within sampling units, those can be specified here. This will be taken into account when computing the confidence intervals by performing a multilevel bootstrap that resamples both units and observations (within unit). This does not otherwise influence how the regression is estimated or drawn.

seed:int, numpy.random.Generator, or numpy.random.RandomState, optional

Seed or random number generator for reproducible bootstrapping.

order:int, optional

If order is greater than 1, use numpy.polyfit to estimate a polynomial regression.

logistic:bool, optional

If True, assume that y is a binary variable and use statsmodels to estimate a logistic regression model. Note that this is substantially more computationally intensive than linear regression, so you may wish to decrease the number of bootstrap resamples (n_boot) or set ci to None.

lowess:bool, optional

If True, use statsmodels to estimate a nonparametric lowess model (locally weighted linear regression). Note that confidence intervals cannot currently be drawn for this kind of model.

robust:bool, optional

If True, use statsmodels to estimate a robust regression. This will de-weight outliers. Note that this is substantially more computationally intensive than standard linear regression, so you may wish to decrease the number of bootstrap resamples (n_boot) or set ci to None.

logx:bool, optional

If True, estimate a linear regression of the form y ~ log(x), but plot the scatterplot and regression model in the input space. Note that x must be positive for this to work.

{x,y}_partial:strings in data or matrices

Confounding variables to regress out of the x or y variables before plotting.

truncate:bool, optional

By default, the regression line is drawn to fill the x axis limits after the scatterplot is drawn. If truncate is True, it will instead be bounded by the data limits.

{x,y}_jitter:floats, optional

Add uniform random noise of this size to either the x or y variables. The noise is added to a copy of the data after fitting the regression, and only influences the look of the scatterplot. This can be helpful when plotting variables that take discrete values.

{scatter,line}_kws:dictionaries

Additional keyword arguments to pass to plt.scatter and plt.plot.
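
To see several of these parameters working together (hue, markers, fit_reg, height, aspect, legend), here is a minimal sketch using seaborn's built-in tips example dataset; the column names total_bill, tip, and smoker come from that dataset:

import seaborn as sns
import matplotlib.pyplot as plt

# seaborn's built-in tidy ("long-form") example dataset
tips = sns.load_dataset("tips")

# hue splits the points by category, markers assigns one marker per hue level,
# fit_reg=False suppresses the regression line, height/aspect size each facet
sns.lmplot(x="total_bill", y="tip", data=tips,
           hue="smoker", markers=["o", "x"], fit_reg=False,
           height=4, aspect=1.2, legend=True)
plt.show()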

  2. Understanding the random_state parameter

When learning machine learning you constantly run into the random_state parameter; here is a brief description of what it does.
Purpose: it controls the random state.

Why is a parameter like random_state (a random state) needed?

Three common situations, listed briefly:

  1. When building a model:

from sklearn.ensemble import RandomForestClassifier  # import added for completeness
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

  2. When generating a dataset:

from sklearn.datasets import make_moons  # import added for completeness
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

  3. When splitting a dataset into training and test sets:

from sklearn.datasets import load_breast_cancer  # import added for completeness
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

What happens if random_state is not set?

  • In case 1, a different model is built each time.
  • In case 2, a different dataset is generated each time.
  • In case 3, the training and test sets come out different each time.

This happens because building a model, generating a dataset, and splitting a dataset are all random processes.

If you want reproducible results, fixing random_state is essential.

  • A random forest is inherently random: choosing a different random state (or leaving random_state unset) can completely change the model that is built.
  • Dataset generation is likewise inherently random: a different random state (or leaving random_state unset) can completely change the generated dataset.
  • Dataset splitting is likewise inherently random: a different random state (or leaving random_state unset) can completely change the resulting split.

With random_state fixed, the same model is built, the same dataset is generated, and the same split is produced every time.

Summary: for processes that are inherently random, we need to control the random state in order to reproduce the same results. If the random state is left uncontrolled, experimental results cannot be pinned down and will vary from run to run.
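
To make the point concrete, here is a minimal sketch (using the iris dataset, as in the examples above) showing that a fixed seed reproduces the same split while an unset seed does not:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
# Two splits with the same seed are identical
a_train, _, _, _ = train_test_split(iris.data, iris.target, random_state=42)
b_train, _, _, _ = train_test_split(iris.data, iris.target, random_state=42)
print(np.array_equal(a_train, b_train))  # True
# A split with no seed uses fresh randomness and will (almost surely) differ
c_train, _, _, _ = train_test_split(iris.data, iris.target)
print(np.array_equal(a_train, c_train))  # False in general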
