python机器学习之KNN预测QSAR生物浓缩类别
(KNN classification of QSAR bioconcentration classes)
数据来源:http://archive.ics.uci.edu/ml/datasets/QSAR+Bioconcentration+classes+dataset
# Load the QSAR bioconcentration dataset and preview its shape and head.
import numpy
import pandas #NOTE(review): original comment said "Excel", but read_csv below loads a CSV file
from sklearn.neighbors import KNeighborsClassifier #classical ML algorithm library (no deep-learning models)
shen=pandas.read_csv(r"D:\Python\代码\Machine-Learn\1-KNN\data\shenwu.csv")
print("总数据条数:{};列数:{}".format(shen.shape[0],shen.shape[1]))
shen.head()
CAS | SMILES | Set | nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | Class | logBCF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100-02-7 | O=[N+](c1ccc(cc1)O)[O-] | Train | 0 | 0.0 | 1.49 | 0.14 | 1.35 | 0.72 | 0 | 1 | 5 | 1 | 0.74 |
1 | 100-17-4 | O=[N+](c1ccc(cc1)OC)[O-] | Train | 0 | 0.0 | 1.47 | 0.14 | 1.70 | 0.88 | 0 | 1 | 5 | 1 | 0.93 |
2 | 100-18-5 | c1cc(ccc1C(C)C)C(C)C | Train | 0 | 0.0 | 1.20 | 0.25 | 4.14 | 2.06 | 0 | 0 | 0 | 3 | 3.24 |
3 | 100-25-4 | O=[N+]([O-])c1ccc(cc1)[N+](=O)[O-] | Train | 0 | 0.0 | 1.69 | 0.13 | 1.89 | 0.79 | 0 | 1 | 8 | 3 | -0.40 |
4 | 100-40-3 | C=CC1CCC=CC1 | Train | 0 | 0.0 | 0.52 | 0.25 | 2.65 | 1.31 | 0 | 0 | 0 | 1 | 2.24 |
# Split rows by the Set column: "Train" rows for fitting, "Test" rows
# for evaluation (the split is pre-defined in the dataset itself).
shen_train=shen[shen.Set=="Train"]
shen_test=shen[shen.Set=="Test"]
print("训练数据:{}个\n测试数据:{}个".format(shen_train.shape[0],shen_test.shape[0]))
shen_test.head()
CAS | SMILES | Set | nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | Class | logBCF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 100-42-5 | C=Cc1ccccc1 | Test | 0 | 0.000 | 1.40 | 0.18 | 2.85 | 0.86 | 0 | 0 | 0 | 3 | 1.13 |
12 | 101-53-1 | Oc1ccc(cc1)Cc1ccccc1 | Test | 0 | 5.768 | 2.21 | 0.18 | 3.40 | 1.47 | 0 | 0 | 1 | 3 | 1.40 |
15 | 101-84-8 | O(c1ccccc1)c1ccccc1 | Test | 0 | 5.614 | 2.21 | 0.16 | 3.40 | 1.31 | 0 | 0 | 2 | 1 | 2.57 |
16 | 102-06-7 | N=C(Nc1ccccc1)Nc1ccccc1 | Test | 0 | 5.030 | 2.07 | 0.16 | 3.09 | 1.54 | 0 | 1 | 0 | 2 | 1.05 |
19 | 10315-98-7 | O1CCN(CC1)CC(C)C | Test | 0 | 0.000 | 0.00 | 0.28 | 1.00 | 1.80 | 0 | 1 | 1 | 1 | 0.23 |
从训练集与测试集中分别提取特征数据和结果(标签)数据
# The last two columns (Class, logBCF) are the prediction targets.
y_train=shen_train.iloc[:,-2:]
y_test=shen_test.iloc[:,-2:]
print("训练数据结果:\n{}\n测试数据结果:\n{}\n".format(y_train.head(),y_test.head()))
训练数据结果:
Class logBCF 0 1 0.74 1 1 0.93 2 3 3.24 3 3 -0.40 4 1 2.24 测试数据结果: Class logBCF 5 3 1.13 12 3 1.40 15 1 2.57 16 2 1.05 19 1 0.23
# Columns 3..11 hold the nine molecular-descriptor features
# (nHM, piPC09, PCD, X2Av, MLOGP, ON1V, N-072, B02[C-N], F04[C-O]).
x_train=shen_train.iloc[:,3:12]
x_test=shen_test.iloc[:,3:12]
print("训练数据:\n{}\n测试数据:\n{}\n".format(x_train.head(),x_test.head()))
将非int类型的数据量化为整数
def change_type(values):
    """Integer-encode every column of *values* in place.

    Each column's distinct values are numbered by order of first
    appearance (0, 1, 2, ...) and the column is overwritten with
    those codes — same result as the original numpy.argwhere lookup.

    NOTE(review): the script calls this separately on the train and
    test frames, which produces *independent* encodings — the same raw
    value can map to different codes in the two sets. Confirm intended.
    """
    for col in values.columns:
        # Build the value -> code table once per column: O(n) total,
        # instead of an O(n) argwhere scan per element (O(n^2) overall).
        mapping = {v: i for i, v in enumerate(values[col].unique())}
        values[col] = values[col].map(mapping)
# Integer-encode every frame in place with the same routine,
# then display the transformed training labels.
for frame in (x_train, x_test, y_train, y_test):
    change_type(frame)
y_train
584 rows × 2 columns
Class | logBCF | |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 1 | 2 |
3 | 1 | 3 |
4 | 0 | 4 |
... | ... | ... |
771 | 0 | 333 |
772 | 0 | 334 |
773 | 0 | 41 |
774 | 0 | 142 |
776 | 0 | 335 |
# Baseline model: 5 nearest neighbours, distance-weighted voting,
# all CPU cores (n_jobs=-1). Both target columns are predicted at once.
knn=KNeighborsClassifier(n_neighbors=5,weights="distance",n_jobs=-1)
knn.fit(x_train, y_train)
y_=knn.predict(x_test)
# Column-wise mean of the boolean hit matrix gives per-target accuracy.
hits=(y_==y_test)
acc=hits.mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc[1],acc[0]))
提高算法准确率
1,修改算法参数
# Retry with tuned hyper-parameters: 3 neighbours and p=1
# (Manhattan distance instead of the Euclidean default).
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)
knn.fit(x_train, y_train)
y_=knn.predict(x_test)
# Per-target accuracy, as before.
hits=(y_==y_test)
acc=hits.mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc[1],acc[0]))
2,修改训练数据
# Min-max normalisation (helps KNN when feature ranges differ widely,
# by removing scale differences between attributes).
# BUG FIX: the test set was previously rescaled with its OWN min/max,
# putting train and test features on inconsistent scales. Both sets
# must be scaled with the *training-set* statistics.
x_train_min=x_train.min()
x_train_max=x_train.max()
x2_train=(x_train-x_train_min)/(x_train_max-x_train_min)
x2_test=(x_test-x_train_min)/(x_train_max-x_train_min)
x2_test.head()
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)
knn.fit(x2_train, y_train)
y_=knn.predict(x2_test)
acc=(y_==y_test).mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc[1],acc[0]))
# Z-score normalisation: (x - mean) / std per column.
# BUG FIX: the test set was previously standardised with its OWN
# mean/std; it must reuse the training-set statistics so the two
# sets live on the same scale.
x_train_mean=x_train.mean()  # per-column mean of the training data
x_train_std=x_train.std()    # per-column standard deviation of the training data
x3_train=(x_train-x_train_mean)/x_train_std
x3_train.head()
x3_test=(x_test-x_train_mean)/x_train_std
x3_test.head()
knn=KNeighborsClassifier(n_neighbors=3,weights="distance",p=1,n_jobs=-1)
knn.fit(x3_train, y_train)
y_=knn.predict(x3_test)
acc=(y_==y_test).mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc[1],acc[0]))
# The same normalisations, as packaged by sklearn.
from sklearn.preprocessing import StandardScaler,MinMaxScaler
s=StandardScaler()
# Fit the scaler on the training data, then apply it to both sets.
x4_train=s.fit_transform(x_train)
# x4_train holds the z-score-normalised data.
# BUG FIX: use transform (not fit_transform) on the test set so it is
# scaled with the training-set statistics, not its own.
x4_test=s.transform(x_test)
m=MinMaxScaler()
x5_train=m.fit_transform(x_train)
x5_test=m.transform(x_test)
# x5_train holds the min-max-normalised data.
保存算法模型
# joblib is a standalone package; `sklearn.externals.joblib` was
# deprecated in scikit-learn 0.21 and removed in 0.23.
import joblib
# BUG FIX: compression is controlled by `compress` (0-9, 9 = smallest
# file); the old `cache_size` argument did not control compression.
joblib.dump(knn,'./model',compress=9)
# Reload the model from disk.
model=joblib.load('./model')
# BUG FIX: the last fitted model was trained on the z-score data
# (x3_train), so it must predict on x3_test, not the min-max x2_test.
x=model.predict(x3_test)
# BUG FIX: accuracy must come from the reloaded model's predictions
# (`x`), not the stale `y_` left over from the previous cell.
acc=(x==y_test).mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc[1],acc[0]))