one-hot编码、无量纲化（归一化 & 标准化）、方差过滤、PCA降维
# One-hot encoding with pandas.get_dummies: expands the "class label"
# column into one indicator column per distinct label value.
import pandas as pd

df = pd.DataFrame([
    ["green", "M", 20, "class1"],
    ["red", "L", 21, "class2"],
    ["blue", "XL", 30, "class3"],
])
df.columns = ["color", "size", "weight", "class label"]
df2 = pd.get_dummies(df["class label"])

# One-hot encoding with sklearn's DictVectorizer: categorical values
# ("city") become indicator features, numeric values ("temp") pass through.
from sklearn.feature_extraction import DictVectorizer

alist = [
    {"city": "beijing", "temp": 33},
    {"city": "GZ", "temp": 42},
    {"city": "SH", "temp": 40},
]
d = DictVectorizer(sparse=False)  # sparse=False -> dense ndarray instead of scipy sparse matrix
feature = d.fit_transform(alist)
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 — get_feature_names_out() is the supported replacement.
print(d.get_feature_names_out())
print(feature)
归一化(当所有数据权重一样时使用)
# Min-max normalization: rescales each feature column into [0, 1].
# Appropriate when all features should carry equal weight and there are
# no extreme outliers dominating the min/max.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
samples = [
    [90, 2, 10, 40],
    [60, 5, 15, 45],
    [73, 3, 13, 45],
]
data = scaler.fit_transform(samples)
标准化(当数据存在巨大异常值时使用)
# Standardization (z-score): per feature, subtract the mean and divide by
# the standard deviation. Preferred over min-max scaling when the data
# contains large outliers.
from sklearn.preprocessing import StandardScaler

standardizer = StandardScaler()
samples = [
    [90, 2, 10, 40],
    [60, 5, 15, 45],
    [73, 3, 13, 45],
]
data = standardizer.fit_transform(samples)
print(data)
# Filter-style feature selection: keep only features whose variance
# exceeds the given threshold (low-variance columns carry little signal).
from sklearn.feature_selection import VarianceThreshold

X = [[0, 2, 4, 3], [0, 3, 7, 3], [0, 9, 6, 3]]
selector = VarianceThreshold(threshold=2)
a = selector.fit_transform(X)

# PCA dimensionality reduction: project the samples onto the top-2
# principal components.
from sklearn.decomposition import PCA

reducer = PCA(n_components=2)
a = reducer.fit_transform(X)