Data Cleaning and Data Preprocessing
My GitHub source code: https://github.com/Tongzhenguo/Python-Project/blob/master/learntoscikit/preprocessing/demo.py
# coding=utf-8
__author__ = 'arachis'

import numpy as np
import pandas as pd
from sklearn import preprocessing

"""
Missing-value handling (fill with a negative sentinel, fill with the median,
fill with the mode, drop the rows, or keep "missing" as a separate feature)
"""

## Handle missing values directly with pandas
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])  # column E is all NaN
# df1.fillna(-1)  ## fill with a negative sentinel value
df1.dropna()      ## drop rows that contain NaN
# (median / mode / indicator fills are sketched after this script)

"""
Outlier handling (removal)
(left empty in the original; a sketch follows after this script)
"""

"""
z-score: zero mean, unit variance (standardization, column-wise)
"""
X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])
X_test = np.array([[-3., -1., 4.]])
X_scaled = preprocessing.scale(X_train)
print(X_scaled)

# Scaled data has zero mean and unit variance:
print(X_scaled.mean(axis=0))
print(X_scaled.std(axis=0))

# StandardScaler keeps the training statistics so they can be reused on new data
scaler = preprocessing.StandardScaler().fit(X_train)
print(scaler.transform(X_train))
print(scaler.transform(X_test))

print(scaler.mean_)
print(scaler.scale_)

"""
min-max scaling: map each column to the interval [0, 1] (column-wise)
"""
scaler = preprocessing.MinMaxScaler()
print(scaler.fit_transform(X_train))
print(scaler.transform(X_test))  # new data may fall outside [0, 1]

"""
Normalization: scale each sample to unit norm (row-wise)
"""
normalizer = preprocessing.Normalizer(norm='l2')
print(normalizer.fit_transform(X_train))
print(normalizer.fit_transform(X_test))

"""
Binarization
"""
# Given a threshold, convert each feature to 0/1 (fit is a no-op, Binarizer is stateless)
binarizer = preprocessing.Binarizer(threshold=1.1).fit(X_train)
print(binarizer.transform(X_train))
print(binarizer.transform(X_test))

"""
Encoding categorical features
"""
# If the number of values per column is known it can be given explicitly
# (n_values in old scikit-learn releases, categories in current ones);
# handle_unknown='ignore' lets transform() cope with values unseen during fit.
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
print(enc.fit([[1, 2, 3], [0, 2, 0]]))
print(enc.transform([[1, 0, 0]]).toarray())

"""
Label encoding
"""
# Convert non-numeric labels to integers
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
print(le.transform(["tokyo", "tokyo", "paris"]))

"""
Generating polynomial features
"""
# (x1, x2) => (1, x1, x2, x1^2, x1*x2, x2^2)
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(2)
print(poly.fit_transform(X))

"""
Removing features with low variance
"""
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
print(sel.fit_transform(X))
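
The missing-value header in the script also promises a median fill, a mode fill, and keeping "missingness" as its own feature, but only the sentinel fill and dropna() are shown. The sketch below is an illustration, not part of the original script (the E_missing column name and the values assigned to column E are made up); it rebuilds the same frame so it runs on its own.

# coding=utf-8
# Sketch: the remaining missing-value strategies named in the header above
# (median fill, mode fill, missing-indicator feature). Illustrative only.
import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0]:dates[1], "E"] = 1.0   # give column E some observed values

# Fill every NaN with its column median (sensible for numeric columns)
filled_median = df1.fillna(df1.median())

# Fill every NaN with its column mode (more natural for categorical columns;
# .mode() can return several values per column, so take the first row)
filled_mode = df1.fillna(df1.mode().iloc[0])

# Keep "missing" as a separate indicator feature, then fill the original column
df1["E_missing"] = df1["E"].isnull().astype(int)
df1["E"] = df1["E"].fillna(df1["E"].median())

print(filled_median)
print(filled_mode)
print(df1)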
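
The outlier-handling section of the script is left empty. As a hedged sketch of one common removal recipe (my choice, not necessarily what the author intended), the snippet below drops points whose z-score exceeds 3 and, alternatively, points outside the 1.5×IQR fences.

# coding=utf-8
# Sketch: outlier removal by z-score and by IQR fences. One common recipe,
# not taken from the original script.
import numpy as np
import pandas as pd

np.random.seed(0)
s = pd.Series(np.random.randn(1000))
s.iloc[::100] = 10.0  # inject a few obvious outliers

# (1) z-score rule: keep points within 3 standard deviations of the mean
z = (s - s.mean()) / s.std()
s_zscore = s[z.abs() <= 3]

# (2) IQR rule: keep points inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
s_iqr = s[(s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)]

print(len(s), len(s_zscore), len(s_iqr))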
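
The one-hot example in the script relies on the old integer-based OneHotEncoder, where n_values could fix how many values each column takes. That argument no longer exists in current scikit-learn; a rough port (the category lists below are an assumption chosen for illustration, not the author's) uses the categories parameter, with handle_unknown='ignore' covering values not listed.

# coding=utf-8
# Sketch: specifying the category set per column with the `categories`
# parameter (the modern replacement for the removed `n_values` argument).
# The category lists here are assumptions for illustration.
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(categories=[[0, 1], [0, 1, 2], [0, 1, 2, 3]],
                    handle_unknown='ignore')
enc.fit([[1, 2, 3], [0, 2, 0]])
print(enc.transform([[1, 0, 0]]).toarray())
# -> one row with 2 + 3 + 4 = 9 columns, one block per original column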