数据挖掘特征提取方法-汇集
1.基于树模型提取特征
def select_by_tree_model(X, target):
    """Select features using importances from a gradient-boosted tree model.

    Fits ``SelectFromModel(GradientBoostingClassifier())`` on the data and
    keeps the features that pass the selector's default threshold.

    Args:
        X: array-like feature matrix, one row per sample.
        target: array-like of class labels, one per sample.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the fitted model's importance for every input feature,
        ``indx`` is a boolean mask marking the selected features, and
        ``result`` is ``X`` reduced to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
    """
    import numpy as np
    from sklearn import feature_selection
    from sklearn.ensemble import GradientBoostingClassifier

    matrix = np.array(X)
    labels = np.array(target)

    selector = feature_selection.SelectFromModel(GradientBoostingClassifier())
    selector.fit(matrix, labels)

    # Importances are read straight from the fitted estimator; no external
    # helper is needed for tree ensembles.
    scores = selector.estimator_.feature_importances_.tolist()
    # get_support() is the public accessor for the selection mask.
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
2.基于L1,L2惩罚值提取特征
def select_by_penalty(X, target, penalty="l1", C=0.1):
    """Select features using a penalised logistic regression.

    Fits ``SelectFromModel(LogisticRegression(...))`` and keeps the features
    whose coefficient magnitude passes the selector's default threshold.

    Args:
        X: array-like feature matrix, one row per sample.
        target: array-like of class labels, one per sample.
        penalty: regularisation type, ``"l1"`` (default) or ``"l2"``.
        C: inverse regularisation strength; smaller values regularise more
            strongly. Defaults to 0.1.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the L1 norm of each feature's coefficients across
        classes, ``indx`` is a boolean mask marking the selected features,
        and ``result`` is ``X`` reduced to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
    """
    import numpy as np
    from sklearn import feature_selection
    from sklearn.linear_model import LogisticRegression

    matrix = np.array(X)
    labels = np.array(target)

    # The solver is set explicitly because the library's default solver does
    # not accept an "l1" penalty; liblinear handles both "l1" and "l2".
    # NOTE(review): for multiclass targets on recent scikit-learn releases,
    # "saga" may be the preferred L1-capable solver -- confirm before use.
    model = LogisticRegression(penalty=penalty, C=C, solver="liblinear")
    selector = feature_selection.SelectFromModel(model).fit(matrix, labels)

    # coef_ has one row per class (a single row for binary problems);
    # summing absolute values down each column gives one score per feature.
    coefficients = np.atleast_2d(selector.estimator_.coef_)
    scores = np.linalg.norm(coefficients, ord=1, axis=0).tolist()
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
3.递归特征消除法提取特征
def select_by_rfe(X, target, n_features):
    """Select features by recursive feature elimination (RFE).

    Wraps a ``LogisticRegression`` estimator in ``RFE`` and asks it to keep
    ``n_features`` features.

    Args:
        X: array-like feature matrix, one row per sample.
        target: array-like of class labels, one per sample.
        n_features: int, passed to ``RFE`` as ``n_features_to_select``.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the RFE ranking of every input feature (selected
        features are ranked 1), ``indx`` is a boolean mask marking the
        selected features, and ``result`` is ``X`` reduced to the selected
        columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
    """
    import numpy as np
    from sklearn import feature_selection
    from sklearn.linear_model import LogisticRegression

    matrix = np.array(X)
    labels = np.array(target)

    selector = feature_selection.RFE(
        estimator=LogisticRegression(),
        n_features_to_select=n_features,
    )
    selector.fit(matrix, labels)

    scores = selector.ranking_.tolist()
    indx = selector.support_.tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
4.互信息选择法提取特征
def select_by_mic(X, target, k):
    """Select the k features with the highest MIC against the target.

    Scores every feature column with minepy's maximal information
    coefficient (MIC) and lets ``SelectKBest`` keep the top ``k``.

    Args:
        X: array-like feature matrix, one row per sample.
        target: array-like of target values, one per sample.
        k: int, number of features to keep.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the MIC of every input feature, ``indx`` is a boolean
        mask marking the selected features, and ``result`` is ``X`` reduced
        to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    """
    from minepy import MINE
    import numpy as np
    from sklearn import feature_selection

    matrix = np.array(X)
    labels = np.array(target)

    def mic_scores(features, y):
        """Return one MIC value per column of ``features``."""
        values = []
        for column in features.T:
            mine = MINE()
            mine.compute_score(column, y)
            values.append(mine.mic())
        # SelectKBest accepts a score function that returns scores only,
        # so no placeholder p-values are produced.
        return np.array(values)

    selector = feature_selection.SelectKBest(mic_scores, k=k)
    selector.fit(matrix, labels)

    scores = selector.scores_.tolist()
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
5.利用相关系数选择特征
def select_by_correlation(X, target, k):
    """Select the k features most strongly correlated with the target.

    Scores every feature column by the absolute value of its Pearson
    correlation coefficient with ``target`` and lets ``SelectKBest`` keep
    the top ``k``.

    Args:
        X: array-like feature matrix, one row per sample.
        target: array-like of target values, one per sample.
        k: int, number of features to keep.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the absolute Pearson coefficient of every input
        feature, ``indx`` is a boolean mask marking the selected features,
        and ``result`` is ``X`` reduced to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    """
    import numpy as np
    from scipy.stats import pearsonr
    from sklearn import feature_selection

    matrix = np.array(X)
    labels = np.array(target)

    def abs_pearson_scores(features, y):
        """Return |r| between each column of ``features`` and ``y``."""
        # pearsonr returns (coefficient, p-value); only the coefficient is
        # used, and its sign is dropped so that strong negative correlations
        # rank as highly as strong positive ones.
        return np.array([abs(pearsonr(column, y)[0]) for column in features.T])

    selector = feature_selection.SelectKBest(abs_pearson_scores, k=k)
    selector.fit(matrix, labels)

    scores = selector.scores_.tolist()
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
6.卡方检验法提取特征
def select_by_chi2(X, target, k):
    """Select the k best features according to the chi-squared test.

    Uses ``SelectKBest`` with scikit-learn's ``chi2`` score function.

    Args:
        X: array-like feature matrix, one row per sample. ``chi2`` requires
            non-negative feature values (e.g. counts or booleans).
        target: array-like of class labels, one per sample.
        k: int, number of features to keep.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the chi-squared statistic of every input feature,
        ``indx`` is a boolean mask marking the selected features, and
        ``result`` is ``X`` reduced to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    """
    import numpy as np
    from sklearn import feature_selection
    from sklearn.feature_selection import chi2

    matrix = np.array(X)
    labels = np.array(target)

    selector = feature_selection.SelectKBest(chi2, k=k)
    selector.fit(matrix, labels)

    scores = selector.scores_.tolist()
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result
7.利用方差选择特征
def select_by_variance(X, t):
    """Select features whose variance passes a threshold.

    Uses ``VarianceThreshold(threshold=t)``; no target is required.

    Args:
        X: array-like feature matrix, one row per sample.
        t: float, variance threshold passed to ``VarianceThreshold``.

    Returns:
        A ``(scores, indx, result)`` tuple of plain Python lists:
        ``scores`` is the variance of every input feature column, ``indx``
        is a boolean mask marking the selected features, and ``result`` is
        ``X`` reduced to the selected columns.

    Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
    """
    import numpy as np
    from sklearn import feature_selection

    matrix = np.array(X)

    selector = feature_selection.VarianceThreshold(threshold=t)
    selector.fit(matrix)

    # Computed column-wise in one vectorised call; .tolist() yields plain
    # Python floats, matching the other selectors' return types.
    scores = np.var(matrix, axis=0).tolist()
    indx = selector.get_support().tolist()
    result = selector.transform(matrix).tolist()
    return scores, indx, result