使用对数变换来提升单变量的回归准确度
# Demo: a log transform of count-valued features improves a linear
# (Ridge) regression fit.
import matplotlib as mt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Fixed seed so the synthetic data (and the printed scores) are reproducible.
rnd = np.random.RandomState(0)

# Latent features: 1000 samples x 3 features drawn from a standard normal.
X_org = rnd.normal(size=(1000, 3))
# Random weights defining the linear relationship between X_org and the target.
w = rnd.normal(size=3)

# Observed features are Poisson counts whose rate is 10 * exp(X_org), so they
# are non-negative integers on a roughly exponential scale of the latent values.
X = rnd.poisson(10 * np.exp(X_org))
# The target is linear in the latent features, not in the observed counts.
y = np.dot(X_org, w)

# How many times each integer value occurs in the first feature.
print("Number of feature appearances:\n{}".format(np.bincount(X[:, 0])))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Baseline: Ridge regression on the raw counts, scored (R^2) on the test split.
score = Ridge().fit(X_train, y_train).score(X_test, y_test)
print("Ridge Test score: {:.3f}".format(score))

# Log-transform the counts; the +1 keeps zero counts finite (log(0) is -inf).
X_train_log = np.log(X_train + 1)
X_test_log = np.log(X_test + 1)

# Same model on the log-transformed features.
score = Ridge().fit(X_train_log, y_train).score(X_test_log, y_test)
print("Test score: {:.3f}".format(score))
Ridge Test score: 0.622
Test score: 0.875
对数(log)变换一般用于取值之间差距越来越大(即分布右偏、呈长尾)的特征,例如本例中的计数数据。
关于作者:
王昕(QQ:475660)
在广州工作生活30余年。十多年开发经验,在Java、即时通讯、NoSQL、BPM、大数据等领域较有经验。
目前维护的开源产品:https://gitee.com/475660
目前维护的开源产品:https://gitee.com/475660