LightGBM使用

https://bacterous.github.io/2018/09/13/LightGBM%E4%BD%BF%E7%94%A8/

深入理解LightGBM

https://zhuanlan.zhihu.com/p/99069186

 

https://github.com/Microstrong0305/WeChat-zhihu-csdnblog-code

https://github.com/microsoft/LightGBM

 

 

https://www.csdn.net/tags/MtTaEg5sMjE2MjE0LWJsb2cO0O0O.html

 

利用XGBoost实现对鸢尾花数据集(Iris.csv)的分类预测

https://blog.csdn.net/Cyril_KI/article/details/107660210

 

python将大csv文件划分成小csv文件做训练集和测试集

https://blog.csdn.net/Findingxu/article/details/86683743?utm_term=csv%E6%96%87%E4%BB%B6%E5%88%86%E8%AE%AD%E7%BB%83%E9%9B%86%E5%92%8C%E6%B5%8B%E8%AF%95%E9%9B%86&utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~all~sobaiduweb~default-0-86683743-null-null&spm=3001.4430

https://wenku.baidu.com/view/9597ad3bc6da50e2524de518964bcf84b9d52db6.html

 

 

程序1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import  make_classification
import csv
import numpy as np


def read_data(test_data='input/train.csv', n=0, label=1):
    """Load a CSV file and split every data row into features and a label.

    The first row is treated as a header and skipped. Completely blank rows
    are ignored. Feature cells are returned exactly as read, i.e. as strings;
    callers that need numbers must convert them.

    Args:
        test_data: Path of the CSV file to read.
        n: Index of the first column that belongs to the features.
        label: 1 when the last column of each row is an integer label
            (supervised data); any other value when the rows carry no label.

    Returns:
        A tuple ``(x_list, y_list)``. ``x_list`` holds one list of string
        cells per row. ``y_list`` holds the integer labels, and stays empty
        when ``label`` is not 1.

    Raises:
        ValueError: If ``label`` is 1 and a label cell is not an integer.
    """
    # Raise the per-field size cap so very long text cells do not abort parsing.
    csv.field_size_limit(500 * 1024 * 1024)
    x_list = []
    y_list = []
    # The context manager closes the file even when a row fails to parse;
    # newline="" lets the csv module handle line endings inside quoted cells.
    with open(test_data, encoding="utf8", errors="ignore", newline="") as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for one_line in csv_reader:
            if not one_line:
                # A blank line yields an empty row, which has no label cell.
                continue
            if label == 1:
                # Supervised data: the last cell is the label.
                y_list.append(int(one_line[-1]))
                x_list.append(one_line[n:-1])
            else:
                x_list.append(one_line[n:])
    return x_list, y_list
  
def split_data(data_list, y_list, ratio=0.30):
    """Split samples into train/test parts and write both parts to disk.

    The split uses a fixed ``random_state`` of 50, so it is reproducible.
    Three files are written under ``input/`` (the directory must already
    exist):

    * ``sub_train.csv`` - columns ``qid``, ``question_text``, ``target``
    * ``sub_test_x.csv`` - columns ``qid``, ``question_text``
    * ``sub_test_y`` - the test labels as a JSON array

    Args:
        data_list: Sample rows; element 0 of each row is written as ``qid``
            and element 1 as ``question_text``.
        y_list: Labels, one per row of ``data_list``.
        ratio: Fraction of the samples that goes to the test part
            (default 0.30, i.e. 70% train / 30% test).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.
    """
    # The original code called json.dump without importing json anywhere.
    import json

    X_train, X_test, y_train, y_test = train_test_split(
        data_list, y_list, test_size=ratio, random_state=50)

    # Training set. newline="" prevents an empty line after every record.
    with open('input/sub_train.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['qid', 'question_text', 'target'])
        writer.writeheader()
        for row, target in zip(X_train, y_train):
            writer.writerow({'qid': row[0], 'question_text': row[1], 'target': target})

    # Test set, part 1: label file.
    with open('input/sub_test_y', 'w') as fp:
        json.dump(y_test, fp)

    # Test set, part 2: feature CSV without the target column.
    with open('input/sub_test_x.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['qid', 'question_text'])
        writer.writeheader()
        for row in X_test:
            writer.writerow({'qid': row[0], 'question_text': row[1]})

    return X_train, X_test, y_train, y_test
  
# Load the Iris CSV. n=1 skips column 0 (the Id); label=1 takes the last
# column as the integer class label.
F_feature, F_label = read_data(test_data='D:\\20210706E\\2020-python\\light_GBM\\Iris.csv', n=1, label=1)

# read_data returns the feature cells as strings. Convert them to floats
# explicitly: a string-typed array is not valid numeric training input.
feature_array = np.array(F_feature, dtype=float)
label_array = np.array(F_label)
print(feature_array)
print(label_array)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(feature_array, label_array, test_size=0.2)
print(70*'*\n')
 
'''df_train = pd.read_csv('D:\\20210706E\\2020-python\\light_GBM\\Iris.csv',sep=",")
#print(df_train)
df_train_label=df_train['Species']
# df_train_feature=df_train['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']
df_train_feature=df_train[['SepalWidthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
#print(df_train_label.tolist())
#test = df_train_label.drop(, axis = 1)
#print(test)
print(df_train_feature)'''
 
 
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values
 
 
# Load data
print('Load data...')

# The bundled scikit-learn Iris data is loaded here but only used by the
# commented-out split below; training uses the X_train/X_test split that
# already exists at this point.
iris = load_iris()
data = iris.data
target = iris.target
#X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)

print('Start training...')
# Create and train the model.
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() no longer exists in LightGBM 4.x, while the callback form is
# accepted by older releases as well.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

print('Start predicting...')
# Predict on the test set using the best iteration found during training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate the model: root of the mean squared error.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# Feature importances
print('Feature importances:', list(gbm.feature_importances_))
 
# joblib is used to persist the model but was never imported; the standalone
# package replaces the removed sklearn.externals.joblib.
import joblib
from sklearn.metrics import roc_auc_score, accuracy_score

# Grid search for parameter tuning.
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

# Save the fitted search object ...
joblib.dump(gbm, 'loan_model.pkl')
# ... and load it back.
gbm = joblib.load('loan_model.pkl')

# Model prediction.
y_pred = gbm.predict(X_test)
# The estimator is a regressor, so predict() yields one continuous value per
# sample rather than a row of class probabilities. Round to the nearest
# integer and clip to the label range seen in training to get class labels.
y_pred = np.clip(np.rint(y_pred), np.min(y_train), np.max(y_train)).astype(int).tolist()
print(y_pred)

# Model evaluation.
print(accuracy_score(y_test, y_pred))

数据1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
6,5.4,3.9,1.7,0.4,0
7,4.6,3.4,1.4,0.3,0
8,5.0,3.4,1.5,0.2,0
9,4.4,2.9,1.4,0.2,0
10,4.9,3.1,1.5,0.1,0
11,5.4,3.7,1.5,0.2,0
12,4.8,3.4,1.6,0.2,0
13,4.8,3.0,1.4,0.1,0
14,4.3,3.0,1.1,0.1,0
15,5.8,4.0,1.2,0.2,0
16,5.7,4.4,1.5,0.4,0
17,5.4,3.9,1.3,0.4,0
18,5.1,3.5,1.4,0.3,0
19,5.7,3.8,1.7,0.3,0
20,5.1,3.8,1.5,0.3,0
21,5.4,3.4,1.7,0.2,0
22,5.1,3.7,1.5,0.4,0
23,4.6,3.6,1.0,0.2,0
24,5.1,3.3,1.7,0.5,0
25,4.8,3.4,1.9,0.2,0
26,5.0,3.0,1.6,0.2,0
27,5.0,3.4,1.6,0.4,0
28,5.2,3.5,1.5,0.2,0
29,5.2,3.4,1.4,0.2,0
30,4.7,3.2,1.6,0.2,0
31,4.8,3.1,1.6,0.2,0
32,5.4,3.4,1.5,0.4,0
33,5.2,4.1,1.5,0.1,0
34,5.5,4.2,1.4,0.2,0
35,4.9,3.1,1.5,0.1,0
36,5.0,3.2,1.2,0.2,0
37,5.5,3.5,1.3,0.2,0
38,4.9,3.1,1.5,0.1,0
39,4.4,3.0,1.3,0.2,0
40,5.1,3.4,1.5,0.2,0
41,5.0,3.5,1.3,0.3,0
42,4.5,2.3,1.3,0.3,0
43,4.4,3.2,1.3,0.2,0
44,5.0,3.5,1.6,0.6,0
45,5.1,3.8,1.9,0.4,0
46,4.8,3.0,1.4,0.3,0
47,5.1,3.8,1.6,0.2,0
48,4.6,3.2,1.4,0.2,0
49,5.3,3.7,1.5,0.2,0
50,5.0,3.3,1.4,0.2,0
51,7.0,3.2,4.7,1.4,1
52,6.4,3.2,4.5,1.5,1
53,6.9,3.1,4.9,1.5,1
54,5.5,2.3,4.0,1.3,1
55,6.5,2.8,4.6,1.5,1
56,5.7,2.8,4.5,1.3,1
57,6.3,3.3,4.7,1.6,1
58,4.9,2.4,3.3,1.0,1
59,6.6,2.9,4.6,1.3,1
60,5.2,2.7,3.9,1.4,1
61,5.0,2.0,3.5,1.0,1
62,5.9,3.0,4.2,1.5,1
63,6.0,2.2,4.0,1.0,1
64,6.1,2.9,4.7,1.4,1
65,5.6,2.9,3.6,1.3,1
66,6.7,3.1,4.4,1.4,1
67,5.6,3.0,4.5,1.5,1
68,5.8,2.7,4.1,1.0,1
69,6.2,2.2,4.5,1.5,1
70,5.6,2.5,3.9,1.1,1
71,5.9,3.2,4.8,1.8,1
72,6.1,2.8,4.0,1.3,1
73,6.3,2.5,4.9,1.5,1
74,6.1,2.8,4.7,1.2,1
75,6.4,2.9,4.3,1.3,1
76,6.6,3.0,4.4,1.4,1
77,6.8,2.8,4.8,1.4,1
78,6.7,3.0,5.0,1.7,1
79,6.0,2.9,4.5,1.5,1
80,5.7,2.6,3.5,1.0,1
81,5.5,2.4,3.8,1.1,1
82,5.5,2.4,3.7,1.0,1
83,5.8,2.7,3.9,1.2,1
84,6.0,2.7,5.1,1.6,1
85,5.4,3.0,4.5,1.5,1
86,6.0,3.4,4.5,1.6,1
87,6.7,3.1,4.7,1.5,1
88,6.3,2.3,4.4,1.3,1
89,5.6,3.0,4.1,1.3,1
90,5.5,2.5,4.0,1.3,1
91,5.5,2.6,4.4,1.2,1
92,6.1,3.0,4.6,1.4,1
93,5.8,2.6,4.0,1.2,1
94,5.0,2.3,3.3,1.0,1
95,5.6,2.7,4.2,1.3,1
96,5.7,3.0,4.2,1.2,1
97,5.7,2.9,4.2,1.3,1
98,6.2,2.9,4.3,1.3,1
99,5.1,2.5,3.0,1.1,1
100,5.7,2.8,4.1,1.3,1
101,6.3,3.3,6.0,2.5,2
102,5.8,2.7,5.1,1.9,2
103,7.1,3.0,5.9,2.1,2
104,6.3,2.9,5.6,1.8,2
105,6.5,3.0,5.8,2.2,2
106,7.6,3.0,6.6,2.1,2
107,4.9,2.5,4.5,1.7,2
108,7.3,2.9,6.3,1.8,2
109,6.7,2.5,5.8,1.8,2
110,7.2,3.6,6.1,2.5,2
111,6.5,3.2,5.1,2.0,2
112,6.4,2.7,5.3,1.9,2
113,6.8,3.0,5.5,2.1,2
114,5.7,2.5,5.0,2.0,2
115,5.8,2.8,5.1,2.4,2
116,6.4,3.2,5.3,2.3,2
117,6.5,3.0,5.5,1.8,2
118,7.7,3.8,6.7,2.2,2
119,7.7,2.6,6.9,2.3,2
120,6.0,2.2,5.0,1.5,2
121,6.9,3.2,5.7,2.3,2
122,5.6,2.8,4.9,2.0,2
123,7.7,2.8,6.7,2.0,2
124,6.3,2.7,4.9,1.8,2
125,6.7,3.3,5.7,2.1,2
126,7.2,3.2,6.0,1.8,2
127,6.2,2.8,4.8,1.8,2
128,6.1,3.0,4.9,1.8,2
129,6.4,2.8,5.6,2.1,2
130,7.2,3.0,5.8,1.6,2
131,7.4,2.8,6.1,1.9,2
132,7.9,3.8,6.4,2.0,2
133,6.4,2.8,5.6,2.2,2
134,6.3,2.8,5.1,1.5,2
135,6.1,2.6,5.6,1.4,2
136,7.7,3.0,6.1,2.3,2
137,6.3,3.4,5.6,2.4,2
138,6.4,3.1,5.5,1.8,2
139,6.0,3.0,4.8,1.8,2
140,6.9,3.1,5.4,2.1,2
141,6.7,3.1,5.6,2.4,2
142,6.9,3.1,5.1,2.3,2
143,5.8,2.7,5.1,1.9,2
144,6.8,3.2,5.9,2.3,2
145,6.7,3.3,5.7,2.5,2
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2
150,5.9,3.0,5.1,1.8,2

 

复制代码
from lightgbm import LGBMClassifier, early_stopping
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# sklearn.externals.joblib has been removed from scikit-learn; import the
# standalone joblib package instead.
import joblib

# Load data
iris = load_iris()
data = iris.data
target = iris.target

# Split into training data and test data (20% held out).
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Train the model. Early stopping is passed as a callback: the
# `early_stopping_rounds` keyword of fit() no longer exists in LightGBM 4.x.
gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=5)],
)

# Save the model
joblib.dump(gbm, 'loan_model.pkl')
# Load the model
gbm = joblib.load('loan_model.pkl')

# Predict with the best iteration found during training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# Evaluate the model
print('The accuracy of prediction is:', accuracy_score(y_test, y_pred))

# Feature importances
print('Feature importances:', list(gbm.feature_importances_))

# Grid search for parameter tuning
estimator = LGBMClassifier(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
复制代码

 

 

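ImportError: cannot import name 'joblib' from 'sklearn.externals'

解决方法:新版 scikit-learn 已移除 sklearn.externals.joblib,请单独安装 joblib(pip install joblib),并把 `from sklearn.externals import joblib` 改为 `import joblib`。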

https://blog.csdn.net/weixin_45031468/article/details/113825131