1. Import packages
import pandas as pd
import numpy as np
import re
2. Read data
train=pd.read_csv("训练数据.csv",encoding="gbk")
3. Set the maximum number of displayed columns
pd.set_option("display.max_columns",100)
q3_2['userid']=q3_2['userid'].astype(str)                                # convert userid to string
q3_5.sort_values(by='userid',ascending=False).reset_index(drop=True)     # view q3_5 sorted by userid, descending
q3_3=q3_2[q3_2['userid'].str.len()==12]                                  # keep only 12-character userids
q3_3['user_id7']=q3_3['userid'].str[:7]                                  # take the first 7 characters of userid
user_scenic_df=user_scenic_df.dropna(subset=['scenic_area_name'])
user_scenic_df.shape
Row and column selection
service_df.iloc[[0,2],[2,3]]
service_df.iloc[[0,2],[2,3,4]][service_df.volume_M>0.3]
4. Data type conversion
train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
num_cols=['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'Source', 'LAC', 'CI', 'LteCi', 'LtePci', 'LteTac', 'RX', 'L_SINR', 'LteRsrq', 'VideoAvgSpeed', 'VideoPeakSpeed', 'TCLASS', 'VideoSize', 'VideoTotleTraffic']
train[num_cols]=train[num_cols].apply(pd.to_numeric)
5. Missing value handling
train["LAC"].fillna(np.mean(train["LAC"]),inplace=True)
6. One-hot encoding
APN=pd.get_dummies(train["APN/SSID"])
train=pd.concat([train,APN],axis=1)
train=train.drop(["APN/SSID"],axis=1)
7. Process the Label column
train["label"]=train["BufferCounter"].apply(lambda x:dealLabel(x))
def dealLabel(x): if((x==0)or x=="0"): return 0 else: return 1
8. Time-related processing
train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
train["year"]=train["VideoTestTime"].apply(lambda x:x.year) train["month"]=train["VideoTestTime"].apply(lambda x:x.month) train["Day"]=train["VideoTestTime"].apply(lambda x:x.day) train["hour"]=train["VideoTestTime"].apply(lambda x:x.hour) train["minute"]=train["VideoTestTime"].apply(lambda x:x.minute) train=train.drop(["VideoTestTime"],axis=1)
Sorting
avg_traffic.sort_values('Downlink traffic at the PDCP Layer',ascending=False, inplace=True)
Remove duplicates
data=data.drop_duplicates(subset=None,keep='first',inplace=False)
9. Import sklearn (random forest)
from sklearn.ensemble import RandomForestClassifier
RF=RandomForestClassifier()
RF.fit(X_train,y_train)
y_pre=RF.predict(X_test)
10. Parameter tuning
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "n_estimators": [10, 15, 20],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
grid = GridSearchCV(RF, parameter_space, cv=5, scoring="f1")
grid.fit(X_train, y_train)
print(grid.best_params_)
grid.best_score_
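Since GridSearchCV refits the best parameter combination on the full training set (refit=True by default), the tuned model can be evaluated directly. A minimal follow-up sketch, assuming the same X_test/y_test split as in step 9:
from sklearn.metrics import f1_score
best_rf=grid.best_estimator_            # random forest refit with the best parameters
y_pre_tuned=best_rf.predict(X_test)     # predict on the held-out test set
print(f1_score(y_test,y_pre_tuned))     # compare with the untuned baseline from step 9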
11. Standardization
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
X_std=std.fit_transform(X_train)
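Note that the scaler should only be fit on the training data; the test data is then transformed with the same fitted mean and standard deviation. A minimal sketch, assuming X_test from the earlier split:
X_test_std=std.transform(X_test)   # reuse the training statistics; do not refit on the test set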
12. Handling class imbalance, by imbalance ratio: loss-function class weights (≈1:10), oversampling (≈1:100), undersampling (≈1:1000), anomaly detection (≈1:10000)
# Use the SMOTE oversampling interface from the imblearn library
from imblearn.over_sampling import SMOTE
# Build the SMOTE model; random_state acts as the random seed
smo = SMOTE(random_state=42)
X_smo, y_smo = smo.fit_resample(X, y)
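The SMOTE call above covers the oversampling case. For the class-weight and undersampling strategies listed in step 12, a minimal sketch (class_weight is a standard scikit-learn option, RandomUnderSampler is the imblearn counterpart to SMOTE; X and y are the same feature matrix and labels as above):
# Loss-function class weights: penalize minority-class errors more heavily
from sklearn.ensemble import RandomForestClassifier
RF_weighted=RandomForestClassifier(class_weight="balanced",random_state=42)
RF_weighted.fit(X_train,y_train)

# Undersampling: randomly drop majority-class samples until the classes are balanced
from imblearn.under_sampling import RandomUnderSampler
rus=RandomUnderSampler(random_state=42)
X_rus,y_rus=rus.fit_resample(X,y)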
13. Ensemble learning
# Ensemble several base models
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
SEED=666

def get_models():
    """Generate a library of base learners."""
    nb = GaussianNB()
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)
    models = {
        'knn': knn,
        'naive bayes': nb,
        'mlp-nn': nn,
        'random forest': rf,
        'gbm': gb,
        'logistic': lr,
    }
    return models
meta_learner = GradientBoostingClassifier(n_estimators=1000, loss="exponential",
                                          max_features=4, max_depth=3, subsample=0.5,
                                          learning_rate=0.005, random_state=SEED)

from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 2 folds
sl = SuperLearner(folds=2, random_state=SEED, verbose=2, backend="multiprocessing")
# Add the base learners and the meta learner
sl.add(list(get_models().values()), proba=True)
sl.add_meta(meta_learner, proba=True)
# Train the ensemble
sl.fit(X_train,y_train)
# Predict the test set
from sklearn.metrics import f1_score
p_sl = sl.predict_proba(X_test)
Y_prelast = np.argmax(p_sl, axis=1)
f1_score(y_test, Y_prelast)