scikit-learn杂记
1.数据预处理 二值化
# Example 1: threshold a small feature matrix to 0/1 values using
# Binarizer with its default settings.
import numpy as np
from sklearn import preprocessing

X = np.array(
    [
        [1., -1., 2.],
        [2., 0., 0.],
        [0., 1., -1.],
    ]
)

# fit() returns the estimator itself, so fitting in a separate statement
# leaves `binarized` bound to the same fitted Binarizer.
binarized = preprocessing.Binarizer()
binarized.fit(X)

print(binarized.transform(X))
2.数据预处理 One-Hot编码处理离散数据
# Example 2: one-hot encode integer-coded categorical features.
import numpy as np
from sklearn import preprocessing

# Four samples, three categorical features. The columns hold 4, 3 and 2
# distinct integer values respectively.
Y = np.array(
    [
        [0, 1, 0],
        [1, 0, 1],
        [2, 2, 1],
        [3, 1, 0],
    ]
)

enc = preprocessing.OneHotEncoder()
enc.fit(Y)

# Encode one new sample; the result is converted to a dense array so that
# it prints as a plain matrix.
sample = [[3, 0, 1]]
encoded = enc.transform(sample)
print(encoded.toarray())
3.综合处理文本离散数据 One-Hot编码处理离散文本数据
# Example 3: one-hot encode categorical *text* data by first mapping each
# text value to an integer code with LabelEncoder.
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Raw categorical data: column 0 (country) has four distinct values,
# column 1 (job) has three and column 2 (gender) has two, i.e. [4, 3, 2].
Y_label = np.array(
    [
        ['from China', 'Student', 'Male'],
        ['from USA', 'Teacher', 'Female'],
        ['from UK', 'Engineer', 'Female'],
        ['from AU', 'Student', 'Male'],
    ]
)

# One encoder per column converts the text values to integer codes.
le_from = LabelEncoder()
le_job = LabelEncoder()
le_gender = LabelEncoder()
le_from.fit(np.array(['from China', 'from USA', 'from UK', 'from AU']))
le_job.fit(np.array(['Student', 'Teacher', 'Engineer']))
le_gender.fit(np.array(['Male', 'Female']))

# Replace the text data with its integer codes. A new integer array is
# built on purpose: assigning the codes into slices of the original
# string array would cast them back to text ('0', '1', ...), and the
# encoder below would then be fitted on strings while being queried with
# integers.
Y_label = np.column_stack(
    [
        le_from.transform(Y_label[:, 0]),
        le_job.transform(Y_label[:, 1]),
        le_gender.transform(Y_label[:, 2]),
    ]
)

# One-hot encode the integer-coded data.
enc = preprocessing.OneHotEncoder()
enc.fit(Y_label)

# LabelEncoder numbers classes in sorted order, so [3, 0, 1] stands for
# ('from USA', 'Engineer', 'Male').
print(enc.transform([[3, 0, 1]]).toarray())
保持学习,否则迟早要被淘汰 (^。^)