1. Import packages

import pandas as pd
import numpy as np
import re

2. Read the data

train = pd.read_csv("训练数据.csv", encoding="gbk")
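A quick sanity check right after loading (a minimal sketch; nothing here depends on specific columns):

print(train.shape)
print(train.head())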

3. Set the maximum number of displayed columns

pd.set_option("display.max_columns", 100)
Miscellaneous userid processing (these snippets operate on intermediate frames q3_2, q3_3, q3_5):

q3_2['userid'] = q3_2['userid'].astype(str)
# sort_values returns a new frame; assign the result, or it is discarded
q3_5 = q3_5.sort_values(by='userid', ascending=False).reset_index(drop=True)

# keep only 12-character user ids; .copy() avoids a SettingWithCopyWarning below
q3_3 = q3_2[q3_2['userid'].str.len() == 12].copy()
q3_3['user_id7'] = q3_3['userid'].str[:7]

# drop rows whose scenic_area_name is missing, then check the remaining shape
user_scenic_df = user_scenic_df.dropna(subset=['scenic_area_name'])
user_scenic_df.shape

Row and column selection by position:

service_df.iloc[[0, 2], [2, 3]]

# chaining a full-length boolean mask after iloc triggers a reindexing warning;
# take the positional subset first, then filter it with an aligned mask
subset = service_df.iloc[[0, 2], [2, 3, 4]]
subset[service_df.volume_M.iloc[[0, 2]] > 0.3]
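For comparison, .loc selects by label and accepts a boolean mask plus column names directly; a sketch reusing the volume_M column from above:

service_df.loc[service_df.volume_M > 0.3, ['volume_M']]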

4. Data type conversion

train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
train[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'Source', 'LAC', 'CI',
       'LteCi', 'LtePci', 'LteTac', 'RX', 'L_SINR', 'LteRsrq', 'VideoAvgSpeed',
       'VideoPeakSpeed', 'TCLASS', 'VideoSize', 'VideoTotleTraffic']]=train[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'Source', 'LAC', 'CI',
       'LteCi', 'LtePci', 'LteTac', 'RX', 'L_SINR', 'LteRsrq', 'VideoAvgSpeed',
       'VideoPeakSpeed', 'TCLASS', 'VideoSize', 'VideoTotleTraffic']].apply(pd.to_numeric)
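If any of these columns contain values that cannot be parsed, pd.to_numeric raises by default; errors="coerce" turns such values into NaN instead (assuming NaN is acceptable here, to be filled in step 5):

train[num_cols] = train[num_cols].apply(pd.to_numeric, errors="coerce")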

5. Missing-value handling
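Before filling, it helps to count how much is actually missing; a minimal check:

print(train["LAC"].isnull().sum())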

train["LAC"].fillna(np.mean(train["LAC"]),inplace=True)

6. One-hot encoding

APN = pd.get_dummies(train["APN/SSID"])
train = pd.concat([train, APN], axis=1)
train = train.drop(["APN/SSID"], axis=1)

7. Processing the label column

train["label"]=train["BufferCounter"].apply(lambda x:dealLabel(x))
def dealLabel(x):
    if((x==0)or x=="0"):
        return 0
    else:
        return 1
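Since dealLabel only separates zero from non-zero, a vectorized equivalent (assuming BufferCounter holds numbers or numeric strings; unparseable values become NaN and map to 1, matching the original):

train["label"] = (pd.to_numeric(train["BufferCounter"], errors="coerce") != 0).astype(int)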

8. Time-related processing

train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
train["year"]=train["VideoTestTime"].apply(lambda x:x.year)
train["month"]=train["VideoTestTime"].apply(lambda x:x.month)
train["Day"]=train["VideoTestTime"].apply(lambda x:x.day)
train["hour"]=train["VideoTestTime"].apply(lambda x:x.hour)
train["minute"]=train["VideoTestTime"].apply(lambda x:x.minute)
train=train.drop(["VideoTestTime"],axis=1)

Sorting

avg_traffic.sort_values('Downlink traffic at the PDCP Layer', ascending=False, inplace=True)

Removing duplicates

data = data.drop_duplicates()  # subset=None and keep='first' are the defaults

9. Import sklearn (random forest)

from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
y_pre = RF.predict(X_test)
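Since the label built in step 7 is binary and likely imbalanced, F1 and the per-class report are more informative than raw accuracy; a minimal evaluation sketch:

from sklearn.metrics import f1_score, classification_report
print(f1_score(y_test, y_pre))
print(classification_report(y_test, y_pre))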

10. Hyperparameter tuning

from sklearn.model_selection import GridSearchCV
parameter_space = {
    "n_estimators": [10, 15, 20],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
grid = GridSearchCV(RF, parameter_space, cv=5, scoring="f1")
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
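Because GridSearchCV refits on the full training set with the best parameters by default (refit=True), the tuned model can be used directly:

best_rf = grid.best_estimator_
y_pre = best_rf.predict(X_test)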

11. Standardization

from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_std = std.fit_transform(X_train)
# the test set must reuse the statistics fitted on the training set
X_test_std = std.transform(X_test)

12. Class imbalance: loss-function weights (~1:10), oversampling (~1:100), undersampling (~1:1000), anomaly detection (~1:10000). The ratios are rough guides: the more skewed the classes, the more aggressive the technique.

# use the SMOTE oversampling interface from the imblearn library
from imblearn.over_sampling import SMOTE
# create the SMOTE model; random_state acts as the random seed
smo = SMOTE(random_state=42)
# fit_sample was renamed to fit_resample in recent imblearn versions
X_smo, y_smo = smo.fit_resample(X, y)
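SMOTE covers the oversampling case; for two of the other strategies named in the heading, a minimal sketch (class weights via scikit-learn's class_weight parameter, undersampling via imblearn's RandomUnderSampler; X and y as above):

# loss-function weighting: penalize minority-class errors more heavily
from sklearn.ensemble import RandomForestClassifier
rf_weighted = RandomForestClassifier(class_weight="balanced", random_state=42)

# random undersampling: shrink the majority class instead of growing the minority
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)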

13. Ensemble learning

# stack several base models and combine them with a meta learner
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
SEED=666
def get_models():
    """Generate a library of base learners."""
    nb = GaussianNB()
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

    models = {
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }

    return models
meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005,
    random_state=SEED,
)
from mlens.ensemble import SuperLearner
# Instantiate the ensemble with 2 folds
sl = SuperLearner(
    folds=2,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(get_models().values()), proba=True) 
sl.add_meta(meta_learner, proba=True)
# Train the ensemble
sl.fit(X_train, y_train)

# Predict the test set and score with F1
from sklearn.metrics import f1_score
p_sl = sl.predict_proba(X_test)
Y_prelast = np.argmax(p_sl, axis=1)
print(f1_score(y_test, Y_prelast))