电信用户流失率预测
1 | |
二、代码实现
#!/usr/bin/python
# -*- encoding:utf-8 -*-
#
# Telecom customer churn prediction.
#
# Pipeline: load the train/test CSVs, explore them, fill missing values in
# the discrete columns, impute da_Mean with a random forest regressor,
# integer-encode the categorical columns, then fit a decision tree on the
# `churn` label and predict it for the test rows.

# data analysis
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

train_df = pd.read_csv("E:\\pyworkpace\\CDA\\data\\tele_cust_train.csv")
test_df = pd.read_csv("E:\\pyworkpace\\CDA\\data\\tele_cust_test.csv")
# Both frames are cleaned in lockstep by iterating over this list.
combine = [train_df, test_df]

# ---------------------------------------------------------------------------
# Exploration
# ---------------------------------------------------------------------------
# Check data completeness (column dtypes and non-null counts).
train_df.info()
test_df.info()

# Distribution of the feature values. These bare expressions only display
# something in an interactive session / notebook.
train_df.describe()
train_df.describe(include=['O'])

# Mean churn per area, highest first.
train_df[['area', 'churn']].groupby(['area'], as_index=False).mean() \
    .sort_values(by='churn', ascending=False)

# Point plot of churn against dwllsize, coloured by dwlltype, one facet row
# per marital value. `height` replaces the `size` keyword that seaborn
# renamed in 0.9 and later removed.
grid = sns.FacetGrid(train_df, row='marital', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'dwllsize', 'churn', 'dwlltype', palette='deep')
grid.add_legend()
plt.show()

# ---------------------------------------------------------------------------
# Discrete variables: fill missing values
# ---------------------------------------------------------------------------
# (1) eqpdays: negative values are treated as errors (the original author
# notes 112 such rows), blanked out and then filled with the mode. The
# invalid values are removed BEFORE the mode is computed so that an invalid
# value can never be chosen as the fill value.
INVALID_EQPDAYS = [-5, -4, -3, -2, -1]
for dataset in combine:
    dataset['eqpdays'] = dataset['eqpdays'].replace(INVALID_EQPDAYS, np.nan)
freq_port = train_df.eqpdays.dropna().mode()[0]  # mode of the training set
for dataset in combine:
    dataset['eqpdays'] = dataset['eqpdays'].fillna(freq_port)

# (2) dualband: fill with the training-set mode.
freq_port = train_df.dualband.dropna().mode()[0]  # mode of the training set
for dataset in combine:
    dataset['dualband'] = dataset['dualband'].fillna(freq_port)

# (4) creditcd, truck, ethnic, marital and the five kid* columns are,
# according to the original author, missing together. They are filled with
# an "unknown" category and a flag column `new` records which rows were
# filled (1 = filled, 0 = original value present).
kid_cols = ['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
for dataset in combine:
    dataset['new'] = 0  # default: nothing was filled
    # Boolean mask instead of positional indices, so this stays correct even
    # if the frame does not carry a default RangeIndex.
    dataset.loc[dataset['truck'].isna(), 'new'] = 1
    dataset['truck'] = dataset['truck'].fillna(3)
    dataset['ethnic'] = dataset['ethnic'].fillna('Z')
    dataset['marital'] = dataset['marital'].fillna('Z')
    dataset['creditcd'] = dataset['creditcd'].fillna('Z')
    # Missing kid flags are filled with 'Y' (encoded as 1 below).
    # NOTE(review): this counts every missing flag as a child — confirm
    # that this is intended rather than 'U'.
    dataset[kid_cols] = dataset[kid_cols].fillna('Y')
    # Encode U -> 0, Y -> 1. Any other value maps to NaN and makes
    # astype(int) raise, which surfaces unexpected categories early.
    for col in kid_cols:
        dataset[col] = dataset[col].map({'U': 0, 'Y': 1}).astype(int)
    # kids = sum of the five 0/1 flags; it replaces the kid* columns, which
    # are dropped further down. Vectorised instead of a row-wise apply.
    dataset['kids'] = dataset[kid_cols].sum(axis=1)

# ---------------------------------------------------------------------------
# Continuous variables: fill missing values
# ---------------------------------------------------------------------------
# Stack train and test with a fresh 0..n-1 index. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
new_df = pd.concat([train_df, test_df], ignore_index=True, sort=True)

# Correlation matrix of the numeric columns only (object columns would make
# corr() raise on pandas >= 2.0), drawn as a heat map.
corrmat = new_df.select_dtypes(include=['number']).corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

# Impute da_Mean (the original author states the other continuous variables
# are handled the same way). Variables the author found strongly correlated
# with it: 'adjmou', 'adjrev', 'avgrev', 'avgmou', 'totcalls', 'avg3mou',
# 'avg3rev', 'totmou', peak_vce_Mean, mou_Mean, rev_Mean. Only the first
# eight are used as predictors here.
da_features = ['adjmou', 'adjrev', 'avgrev', 'avgmou',
               'totcalls', 'avg3mou', 'avg3rev', 'totmou']
da_missing = new_df['da_Mean'].isna()
train_df_da_Mean = new_df[~da_missing].copy()
# Explicit copy: the prediction is written into this frame below.
test_df_da_Mean = new_df[da_missing].copy()
X_train = train_df_da_Mean[da_features]
Y_train = train_df_da_Mean["da_Mean"]
X_test = test_df_da_Mean[da_features]
# No explicit `criterion`: the default is the squared-error criterion the
# original requested as 'mse', a spelling newer scikit-learn rejects.
model = RandomForestRegressor(n_estimators=100, oob_score=True)
model.fit(X_train, Y_train.values)
r2 = model.score(X_train, Y_train)  # R^2 on the rows the model was fit on
if len(X_test) > 0:
    # Predict only when something is missing; predict() rejects empty input.
    Y_test = model.predict(X_test)
    test_df_da_Mean['da_Mean'] = Y_test
    # Write the predictions back by mask; the index is left untouched.
    new_df.loc[da_missing, 'da_Mean'] = Y_test

# ---------------------------------------------------------------------------
# Encoding
# ---------------------------------------------------------------------------
# Integer codes per categorical column. A value absent from its table maps
# to NaN and makes astype(int) raise.
category_codes = {
    'dualband': {'Y': 0, 'N': 1, 'T': 2, 'U': 3},
    'creditcd': {'Y': 0, 'N': 1, 'Z': 3},
    'ethnic': {'U': 0, 'N': 1, 'H': 2, 'Z': 3, 'F': 4, 'S': 5, 'R': 6,
               'O': 7, 'G': 8, 'J': 9, 'P': 10, 'I': 11, 'B': 12, 'D': 13,
               'X': 14, 'C': 15, 'M': 16},
    'marital': {'B': 0, 'M': 1, 'U': 2, 'A': 3, 'S': 4, 'Z': 5},
}
for col, codes in category_codes.items():
    new_df[col] = new_df[col].map(codes).astype(int)

new_df = new_df.drop(['Customer_ID', 'HHstatin', 'area'] + kid_cols, axis=1)

# 'numbcars', 'income', 'dwlltype', 'dwllsize', 'adult', 'ownrent' and
# 'hnd_price' are not processed by this script, so they are left out of the
# modelling frame.
yy = new_df.drop(['numbcars', 'income', 'dwlltype', 'dwllsize',
                  'adult', 'ownrent', 'hnd_price'], axis=1)

# Rows with a churn label form the training set; rows without one are the
# rows to predict.
train_df = yy.dropna(subset=['churn']).copy()
test_df = yy[yy['churn'].isna()].copy()
X_train = train_df.drop(['churn'], axis=1)
Y_train = train_df['churn']
X_test = test_df.drop(['churn'], axis=1)

# ---------------------------------------------------------------------------
# Decision tree
# ---------------------------------------------------------------------------
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy (percent) on the TRAINING data, not a held-out estimate.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端
· 因为Apifox不支持离线,我果断选择了Apipost!