import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
# Load the tab-separated cancer dataset and report its (rows, columns) shape.
cancer = pd.read_csv(
    './data/cancer.csv',
    sep='\t',
)
print(cancer.shape)
cancer
# Column 1 is the label (per the original note: m = malignant, b = benign);
# every column from index 2 onwards is used as model input.
target, data = cancer.iloc[:, 1], cancer.iloc[:, 2:]
# Preview the first rows of both pieces (`display` is the notebook helper).
display(data.head(), target.head())
# k-nearest-neighbours classifier that votes over the 15 closest samples.
knn = KNeighborsClassifier(n_neighbors=15)
# Shuffle the samples and split them in two: training data and held-out
# test data. sklearn provides a helper for this.
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing. No random_state is passed, so the
# split (and therefore the score below) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)
knn.fit(X_train, y_train)
# Mean accuracy on the held-out rows, plus the per-sample predictions.
score = knn.score(X_test, y_test)
y_ = knn.predict(X_test)
# One recorded run printed 0.9473684210526315.
print(score)
# Cross-tabulate predicted vs. true labels to see where they disagree.
pd.crosstab(index=y_, columns=y_test, rownames=['Predict'], colnames=['True'])
# Goal: improve accuracy by cleaning the data first.
data
# Min-max normalization, per column: (value - min) / (max - min)
columns = data.columns
for col in columns:
    data_min = data[col].min()
    data_max = data[col].max()
    # The rescaling step is currently disabled; the loop only previews the
    # first column and then stops.
    # data[col] = (data[col] - data_min) / (data_max - data_min)
    print(data[col])
    break