1、性别

def xb(x):
if pd.isnull(x):
return "空"
elif int(x[-2])%2 == 1:
return "男"
elif int(x[-2])%2 == 0:
return "女"
else:return "其他"
def f(x):
return x.nunique()

2、省份

def province(x):
if str(x)[0:2] =='11' :
return '北京'
elif str(x)[0:2] =='12' :
return '天津'
elif str(x)[0:2] =='13' :
return '河北'
elif str(x)[0:2] =='14' :
return '山西'
elif str(x)[0:2] =='15' :
return '内蒙古'
elif str(x)[0:2] =='21' :
return '辽宁'
elif str(x)[0:2] =='22' :
return '吉林'
elif str(x)[0:2] =='23' :
return '黑龙江'
elif str(x)[0:2] =='31' :
return '上海'
elif str(x)[0:2] =='32' :
return '江苏'
elif str(x)[0:2] =='33' :
return '浙江'
elif str(x)[0:2] =='34' :
return '安徽'
elif str(x)[0:2] =='35' :
return '福建'
elif str(x)[0:2] =='36' :
return '江西'
elif str(x)[0:2] =='37' :
return '山东'
elif str(x)[0:2] =='41' :
return '河南'
elif str(x)[0:2] =='42' :
return '湖北'
elif str(x)[0:2] =='43' :
return '湖南'
elif str(x)[0:2] =='44' :
return '广东'
elif str(x)[0:2] =='45' :
return '广西'
elif str(x)[0:2] =='46' :
return '海南'
elif str(x)[0:2] =='50' :
return '重庆'
elif str(x)[0:2] =='51' :
return '四川'
elif str(x)[0:2] =='52' :
return '贵州'
elif str(x)[0:2] =='53' :
return '云南'
elif str(x)[0:2] =='54' :
return '西藏'
elif str(x)[0:2] =='61' :
return '陕西'
elif str(x)[0:2] =='62' :
return '甘肃'
elif str(x)[0:2] =='63' :
return '青海'
elif str(x)[0:2] =='64' :
return '宁夏'
else : return '新疆'

3、年龄

import datetime
def ag1(x):
if pd.isnull(x):
return "其他"
else:
birth_year = int(x[6:10])
today_year = datetime.date.today().year
return (today_year - birth_year)
data['年龄']=data['id_card_no'].apply(ag1)

4、邮件自动化

import openpyxl
dst_xlsx = '/data/python_home/cqg/data_model.xlsx'#模板,设置格式。
file_path = '/data/python_home/cqg/youjian.xlsx'#最终写好的模板路径
df=openpyxl.load_workbook(dst_xlsx) # 打开模板
tar = pd.read_excel('/data/python_home/cqg/duibi1.xlsx')#目标数据
tar
dt=pd.read_excel('data_model.xlsx')#打开看一下模板长啥样
dt
fund_list = ['多平台']
for k in range(1):
sheet0 = df.worksheets[k]# sheet页
target_data = pd.read_excel('/data/python_home/cqg/duibi1.xlsx').reset_index().T.reset_index().T.iloc[1:,2:10]
for i in range(len(target_data)):
#print(target_data.iloc[i,:])
for j in range(8):
sheet0.cell(i+3,j+1).value = target_data.iloc[(i,j)] # 写入数据
#sheet0.cell(i+1,j+1).alignment = Alignment(horizontal='center', vertical='center') # 居中对齐
df.save(file_path) # 保存文件
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import smtplib
from email.mime.image import MIMEImage
import datetime
mail_host = "smtphz.qiye.163.com" #SMTP服务器地址
mail_sender = "jckxpy@ihanchen.com" #账号
mail_passwd = "Cloud@12@@" #密码

msg = MIMEMultipart('related')
msg["Subject"] = '多平台对比'
msg["From"] = mail_sender #发送人
receiver=['chenqianguang@ihanchen.com']

邮件正文

content = '''
各位好:
'''

import datetime

yesterday = datetime.date.today()-datetime.timedelta(days=1)

msg.attach(MIMEText(content,'plain','utf-8'))

file_path = '/data/python_home/cqg2/数据情况1.xlsx'

构造附件2,xlsx文件

att2 = MIMEText(open(file_path, 'rb').read(), 'base64', 'utf-8')
att2["Content-Type"] = 'application/octet-stream'
att2.add_header("Content-Disposition",'attachment', filename=file_path)
msg.attach(att2)

s = smtplib.SMTP() #实例化对象
s.connect(mail_host) #连接163邮箱服务器,端口号为465,注意,这里不需要写端口号
s.login(mail_sender, mail_passwd) #登录邮箱
s.sendmail(mail_sender, receiver, msg.as_string())
s.quit()

6、聚合新增一列

zt['借款总次数']=zt.groupby('idcard')['loan_date'].transform('count')

7、将数据拆分并计数

yxjj['reason_code']=yxjj['reason_code'].apply(lambda x:str(x).split(", "))
def hmd(x):
a=0
for i in x:
if i in ['']:
a=a+1
return a

8、日期类型转为字符串

import datetime
now = datetime.datetime.now()
(s,'%Y/%m/%d %H:%M:%S') #先将字符串变成时间,再将时间变为字符串且此时可选择日期格式
将其他类型变成时间类型,不能随意改格式
to_datetime(arg=data['apply_date'],format="%Y-%m-%d")
ZT[i]=ZT[i].apply(lambda x: x.days)
today_year = datetime.date.today().year
字符串变时间类型
data['apply_date']=data['apply_date'].apply(lambda x:dt.datetime.strptime(x,"%Y-%m-%d"))
时间类型变字符串
import datetime as dt
data['apply_date']=data['apply_date'].apply(lambda x:x.strftime('%Y-%m'))

9、画图

画图,柱状图
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
y=[10,30,14,15,5,5]
name=['10_20','20_30','30_40','40_50','50_60','60_70']
plt.bar(x,y,tick_label=name,width=3)
plt.ylim((0,35))
for x,y in zip(x,y):
plt.text(x,y,y,ha='center',va='bottom')

点图
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
x=[10,3,1,12,14,5,8,9,14,15,21]
y2=[15,31,2,14,15,6,31,20,25,12,12]
plt.scatter(x,y2)
plt.legend()
plt.show()

10循环写入

testxlsx=pd.ExcelWriter('test.xlsx')
for i in plat_360:
for j in range(0,len(needcolumns)):
if needcolumns[j]!='省份':
table=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',columns=needcolumns[j],values='phone',aggfunc=f,margins=True)
for z in table.columns:
table[str(z)+'占比']=table[z]/table['All']
table[str(z)+'占比']=table[str(z)+'占比'].apply(lambda x:format(x100,'.2f')+'%')
table.insert(0,'type',value=needcolumns[j])
title[title.index0].style.set_properties({'text-align':'center'}).to_excel(testxlsx,sheet_name=i,startrow=0,index=False)
dengjitable360=pd.pivot_table
(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plat==i],index='等级',values='phone',aggfunc=f).rename(columns={'phone':'用户数'})
dengjitable360['用户数占比']=dengjitable360['用户数']/dengjitable360['用户数'].sum()
dengjitable360['用户数占比']=dengjitable360['用户数占比'].apply(lambda x:format(x
100,'.2f')+'%')
dengjitable360.style.set_properties().to_excel(testxlsx,sheet_name=i,startrow=83)
table.drop(columns='All占比').style.set_properties({'text-align':'center'}).to_excel(testxlsx,sheet_name=i,startrow=j*13+2)
pjnl=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',values='年龄',aggfunc=np.mean).rename(columns={'年龄':'平均年龄'})
pjnl.to_excel(testxlsx,sheet_name=i,startrow=15,startcol=16)
else:
table=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',columns=needcolumns[j],values='phone',aggfunc=f,margins=True)
table.insert(0,'type',value=needcolumns[j])
table.style.set_properties().to_excel(testxlsx,sheet_name=i,startrow=69)
setsheetwidth=testxlsx.sheets[i]
setsheetwidth.set_column('A:Q',16)
textxlsx.save()

11、四种分箱

cut

sx['kh_loanterm']=pd.cut(sx['kh_loanterm'],bins=[3,6,9,12,15,np.inf],right=False,
labels=['A_3','B_6','C_9','D_12','E_15'])

占位符

def 朴道_海纳综合指数V2_申请命中网络贷款类机构数_trans(x):
# 朴道_海纳综合指数V2_申请命中网络贷款类机构数 连续型特征的分箱转换函数
inf = np.inf
bins = [-inf, 0.5, 1.5, 2.5, 13.5, inf]
for i in range(len(bins)-1):
start = bins[i]
end = bins[i+1]
if start < x <= end:
return "{0}({1}, {2}]".format(i+1, start, end) # 分箱字符串格式: i(start, end]
return '0_nan'
fksx1[i]=fksx1[i].cat.add_categories(['空值'])

占位符升级

def kh_jld_ApplyTime_hour_bin(x):
values=[4,8,12,18,23]
index=['A','B','C','D']
for i in range(len(values)-1):
if values[i]<x<=values[i+1]:
return '{0}_({1},{2}]'.format(index[i],values[i],values[i+1])
return 'G_未知'

qcut

for i in coll:
sx[i]=pd.qcut(sx[sx[i]>=0][i],q=5,duplicates='drop')
sx[i]=sx[i].astype('str')
sx[i].replace('nan',np.nan,inplace=True)
sx[i].fillna(sx2[i],inplace=True)

12、把时间变为周

mysql专用:

周日开始
concat(str_to_date(concat(yearweek(apply_date), ' Sunday'), '%X%V %W'),"~",date_add(str_to_date(concat(yearweek(apply_date), ' Sunday'), '%X%V %W'),interval 6 day)) AS APPLY_WEE
周一开始
concat(str_to_date(concat(yearweek(apply_date,1),'Monday'),'%x%v %W'),"~",date_add(str_to_date(concat(yearweek(apply_date,1),'Monday'),'%x%v %W'),interval 6 day)) AS APPLY_WEEK

Python函数 :

周一开始
import datetime as dt
def get_monday_to_sunday(today):
today = dt.datetime.strptime(str(today), "%Y-%m-%d")
monday = dt.datetime.strftime(today - dt.timedelta(today.weekday()), "%Y-%m-%d")
monday_ = dt.datetime.strptime(monday, "%Y-%m-%d")
sunday = dt.datetime.strftime(monday_ + dt.timedelta(monday_.weekday() + 6), "%Y-%m-%d")
monday1=str(monday)
sunday1=str(sunday)
return monday1+"~"+sunday1

数仓专用

周一开始

concat(DATE_ADD(NEXT_DAY(放款日期,'星期天'),-6),"~",NEXT_DAY(放款日期,'星期天')) week

13、KS和IV(pycard)

KS循环

testks=pd.ExcelWriter('test1.xlsx')
columns_df=pd.DataFrame(mx2.columns)
for i,j in enumerate(mx2.columns):
result=toad.metrics.KS_bucket(mx[j],mx['target'],bucket=10,method='quantile').sort_index()
result.to_excel(testks,sheet_name='kskn',startrow=14*i+1,index=False)
testks.save()
testks.close()

分箱加结果

data_sd = ZT3
import pycard as pc
num_iv_woedf = pd.DataFrame()
clf = pc.NumBin()
for i in coll:
if data_sd[i].isnull().sum()/len(data_sd)<1:
clf.fit(data_sd[i] ,data_sd['dpd30+'])
num_iv_woedf = num_iv_woedf.append(clf.woe_df_)

原始分箱

import toad
import scorecardpy as sc
str(x).strip()
toad.detector.detect()
toad.metrics.KS_bucket(x,y,bucket=10,method = 'quantile')
bins = sc.woebin(y2, y="target")#y2是数据,里面的目标列用target
sc.woebin_plot(bins)
breaks_adj = {'xy':[1,2,3,4]}
bins_adj = sc.woebin(y2, y="target",breaks_list=breaks_adj)
sc.woebin_plot(bins_adj)

bins_result= pd.DataFrame()
for value in bins.values():
bins_result=bins_result.append(value)
bins_result.to_excel("IV.xlsx")

14异常展示

展示异常

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

15、第一行设为列名

import numpy as np
array = np.array(zt1)
list = array.tolist()
list = list[0]
zt1.columns = list

16、加密

sha256加密

import hashlib
def sha256_jiami(x):
'''
Parameters
----------
x : string
需要加密的字符串.

Returns
-------
字符窜加密后的字符

'''
sha256 = hashlib.sha256()
sha256.update(x.encode('utf-8'))
return sha256.hexdigest()

target_01['phone_sha256'] = target_01['Phone'].apply(sha256_jiami)

MD5加密

import hashlib
def md5_jiami(x):
Jpwd = hashlib.md5()
Jpwd.update(x.encode('utf-8'))
return Jpwd.hexdigest()
zt1['手机号']=zt1['手机号'].apply(md5_jiami)

17、计算PSI

HX1['本月-训练集']=HX1.apply(lambda x:(x.本月-x.训练集)*math.log(x.本月/x.训练集),axis=1)
HX1.loc['Row_sum'] = HX1.iloc[:,1:].apply(lambda x: x.sum())

18、pycard分析

for i in ['dpd1+']:
for j in kncol:
tmp_data = pc.cross_woe(kn[j],kn[i]).reset_index()
tmp_data.insert(0,'type',value=i)
tmp_data.rename({j:'区间'},axis=1,inplace=True)
data_iv = data_iv.append(tmp_data)
data_iv

19、选取最近今天的时间

import datetime as dt
import time
import datetime
t=datetime.datetime.now()
d1 =t.strftime('%Y-%m-%d')
d2=(t-datetime.timedelta(days=14)).strftime("%Y-%m-%d")
zt1=zt[zt['注册时间']>d2]

20、删掉存在的表

import pymysql
conn = pymysql.connect(host="10.101.2.41",port=3306,user="select_fk",password="select_fk#2022",database="chenqianguang")
cursor = conn.cursor()
sqls1 = "drop table if exists jldyymx1 " #如果存在就删除
cursor.execute(sqls1)
conn.commit()
cursor.close()
conn.close()

21、批量快速写入sql

ZTcolumns=zt1.营销渠道1.unique()
import pandas as pd
import numpy as np
import pycard as pc
from datetime import datetime, date, timedelta
cgm_36 = pc.Mysql2Tools('41','chenqianguang') # 分别替换41,32,36库
for i in ZTcolumns:
cgm_36.to_sql('jldyymx',zt1[zt1.营销渠道1==i])

22、样本抽样

good_d = yb[(yb.target0)&(yb['放款时间']'2022-12')].sample(n=740,replace=False,random_state=2022)
good_d = good_d.append(yb[(yb.target0)&(yb['放款时间']'2023-07')].sample(n=3611,replace=False,random_state=2022))

bad_d = yb[(yb.target1)&(yb['放款时间']'2022-12')].sample(n=185,replace=False,random_state=2022)
bad_d = bad_d.append(yb[(yb.target1)&(yb['放款时间']'2023-01')].sample(n=1543,replace=False,random_state=2022))
yb=good_d.append(bad_d)

23、清除变量

globals().clear()
locals().clear()
del x 删掉单个变量

24、增加前缀或者后缀

前缀:TDtyf.add_prefix('talkingdata_TD腾云分_')
后缀:TDtyf.add_suffix('talkingdata_TD腾云分_')

25、其他

import math
p = 50 / math.log(2) #factor = 20 / np.log(2)
q = 600 + 50 * math.log(0.1) / math.log(2)
import toad
p1=model.predict_proba(yb.iloc[:,:-2])
p1=pd.DataFrame(p1)
p1[1]=p1[1].apply(lambda x:q-p*math.log(x/(1-x)))
p1

26、筛选空值

任意行
test[test.isnull().T.any()] #无论哪列,有空值的被选出来
某行
test[test[['生日']].isnull().T.any()] #只选择【生日】列有空值的所有行
不为空
test = test[test['性别'].notna()] #去掉【性别】为空值的行
为空
Total['申请准入分'].isna()

时间和时间戳转化

定义date

today_d = dt.date(2021,3,5)
today_d = dt.date.today()

转换成datetime

today_dt = dt.datetime.combine(today_d ,dt.time())
today_dt = dt.datetime.strptime(str(today_d ),'%Y-%m-%d')

转换date

today_copy = today_copy.date()

import joblib
from sklearn2pmml import sklearn2pmml,PMMLPipeline

xgb_clf = joblib.load('模型文件/乐花卡XGB模型26个变量20210729.pkl')

from pypmml import Model

model = Model.fromFile('模型文件/msxf_xgb模型20220119.pmml')#加载模型文件

pipeline = PMMLPipeline([('classifier',model)])

pipeline.fit(x_train,y_train)

sklearn2pmml(pipeline, '君航微金v1.pmml',with_repr = True,debug = True)
from pypmml import Model

import joblib

from sklearn2pmml import sklearn2pmml,PMMLPipeline

model = Model.fromFile('君航微金v1.pmml')#加载模型文件
all_pred = model.predict(yb.iloc[:,:-2])
all_pred

查看版本 pip show sklearn2pmml
安装指定版本 pip install sklearn2pmml0.90.4
删除 pip uninstall sklearn2pmml
0.90.4

计算ROC与AUC值

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
plt.figure(dpi=50)

真实标签和模型预测的概率

y_true = y_test
y_score = n

计算ROC曲线的各个点

fpr, tpr, thresholds = roc_curve(y_true, y_score)

计算AUC值

roc_auc = auc(fpr, tpr)

绘制ROC曲线

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Example')
plt.legend(loc='lower right')
plt.show()

决策树画图

y=y.fillna(-1000)
y=y[['婚姻状况','借款分期数','借款分期费率','性别','借款金额','target']]
for i in y.columns:
y[i]=y[i].astype('str')
X=y.iloc[:,:-1]
Y=y['target']
import pandas as pd # 数据处理
import numpy as np # 使用数组
import matplotlib.pyplot as plt # 可视化
from matplotlib import rcParams # 图大小
from sklearn.tree import DecisionTreeClassifier as dtc # 树算法
from sklearn.model_selection import train_test_split # 拆分数据
from sklearn.metrics import accuracy_score # 模型准确度
from sklearn.tree import plot_tree # 树图
plt.figure(dpi=150)
rcParams['figure.figsize'] = (20,10)
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.2, random_state = 0)
model = dtc(criterion = 'entropy', max_depth = 3)
model.fit(X_train, y_train)
pred_model = model.predict(X_test)
feature_names = y.columns[:5]
target_names = y['target'].unique().tolist()
plot_tree(model,
feature_names = feature_names,
#class_names = target_names,
filled = True,
rounded = True)
plt.savefig('tree_visualization.png')
1、bangkebang_policy_sxall.columns =bangkebang_policy_sxall.columns.map(str.lower)
2、cate_cols = df.select_dtypes(include='object').columns
data_iv=pd.DataFrame()
for i in ['fpd30+']:
for j in hxcol:
tmp_data = pc.cross_woe(HX[j],HX[i]).reset_index()
tmp_data.insert(0,'type',value=i)
tmp_data.rename({j:'区间'},axis=1,inplace=True)
data_iv = data_iv.append(tmp_data)
data_iv
import pycard as pc
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin(max_bins_num=5,min_impurity_decrease=4e-5)
for i in [“”“bianliang”]:
clf.fit(zhenxing[i] ,zhenxing.taget)
#zhenxing[i+'bin'] = clf.transform(zhenxing[i]) #这样可以省略掉后面转换成_bin的一步骤
clf.bin_code(i)
num_iv_woedf.append(clf.woe_df
)