psi计算
基础概念:https://zhuanlan.zhihu.com/p/344754828
import sys
import pandas as pd
import numpy as np
import math
# all_list = []
# df = pd.DataFrame(columns = ['date', 'data'])
# counter = 0
# for line in sys.stdin:
# line = line.strip('\r\n')
# df.loc[counter] = line.split(',')
# counter += 1
# df.to_excel('./output2.xlsx', sheet_name='Sheet1')
def calc_psi(dataframe, n_bins=10, epsilon=1e-6):
    """Compute the Population Stability Index (PSI) between consecutive months.

    The ``data`` column is split into quantile bins over the whole frame.
    For every month the share of that month's rows falling in each bin is
    computed, and each month is compared with the month before it using
    ``sum((actual - expected) * ln(actual / expected))``.

    Months are taken in order of first appearance in ``dataframe['date']``
    (they are not re-sorted chronologically).

    Side effects: adds the columns ``group`` (quantile bin of each row) and
    ``new_date`` (year followed by the un-padded month, e.g. ``'20219'``) to
    ``dataframe`` in place, and prints intermediate results to stdout.

    Args:
        dataframe: Frame with a ``date`` column whose values render as
            ``YYYYMMDD`` through ``str()`` and a numeric ``data`` column.
        n_bins: Number of quantile bins passed to ``pd.qcut``.
        epsilon: Value substituted for a bin share of exactly zero so the
            logarithm and the ratio stay finite.

    Returns:
        A list with one PSI value per pair of consecutive months (empty when
        the frame is empty or holds a single month).

    Raises:
        ValueError: Propagated from ``datetime.strptime`` for a malformed
            date, or from ``pd.qcut`` when quantile edges are not unique.
    """
    from datetime import datetime

    if len(dataframe) == 0:
        return []

    date_col = dataframe['date']

    # Quantile binning over the full data set.
    result = pd.qcut(dataframe['data'], n_bins)
    dataframe["group"] = result

    # Distinct bins that actually occur, in ascending order, labelled 1..k.
    group = result.dropna().drop_duplicates(inplace=False).sort_values(ascending=True)
    group.index = range(1, len(group) + 1)
    print(group)
    print('group 1:', group[1])

    # Month key: year + un-padded month, parsed once per row.
    def month_key(raw_date):
        parsed = datetime.strptime(str(raw_date), '%Y%m%d')
        return str(parsed.year) + str(parsed.month)

    months = date_col.apply(month_key)
    months_drop = months.drop_duplicates(inplace=False)
    print(months_drop.values)
    dataframe['new_date'] = months
    print(dataframe)

    # Integer bin codes give one boolean mask per observed bin; the masks do
    # not depend on the month, so build them once (-1 marks missing data).
    bin_codes = result.cat.codes
    observed_codes = np.sort(bin_codes[bin_codes >= 0].unique())
    bin_masks = [bin_codes == code for code in observed_codes]

    actual_i = []
    psi_indexs = []
    for month_idx, month in enumerate(months_drop.values):
        month_mask = months == month
        month_total = int(month_mask.sum())
        # Share of this month's rows that fall in each bin.
        group_i = [
            int((month_mask & bin_mask).sum()) / month_total
            for bin_mask in bin_masks
        ]
        print('group_i:', group_i)
        actual_i.append(group_i)

        if month_idx > 0:
            # index = (actual share - expected share) * ln(actual / expected),
            # where "expected" is the previous month's distribution.
            actual_ = np.array(actual_i[month_idx])
            expected_ = np.array(actual_i[month_idx - 1])
            # A bin that is empty in one month would give log(0) or 0/0.
            actual_ = np.where(actual_ == 0, epsilon, actual_)
            expected_ = np.where(expected_ == 0, epsilon, expected_)
            psi_index = (actual_ - expected_) * np.log(actual_ / expected_)
            psi_indexs.append(psi_index)
            print('psi_index:', psi_index)

    print(actual_i)
    final_index = [np.sum(psi_index) for psi_index in psi_indexs]
    print('final_index: ', final_index)
    return final_index
# data_202109 = dataframe[(months == months_drop.values[1]) & (dataframe['group'] == group[1])]
# print(data_202109.size, dataframe[(months == months_drop.values[1])].size)
# group2 = dataframe[(months == months_drop.values[1]) & (dataframe['group'] == group[2])]
# print(group2.size)
# date_drop = data.drop_duplicates(subset=['date', 'date'], keep='first', inplace=False)
# Load the headerless two-column CSV (date, value) and run the PSI report.
# NOTE(review): the commented-out generator in this file writes the CSV via
# to_csv with the default header row; if the real file has one, `names=`
# without `header=0` reads that row as data. Confirm against the file.
data = pd.read_csv(
    './lyx_data.txt',
    names=['date', 'data'],
    sep=',',
)
calc_psi(data)
# 生成数据
# import numpy as np
# date = ['20210812', '20210922', '20211009', '20211102', '20211202', '20220112', '20220202']
# new_date = []
# for i in range(100):
# new_date.extend(date)
# new_data = []
# for i in range(700):
# new_data.append(np.random.random(1))
# data = pd.DataFrame(columns=['date', 'data'])
# data['date'] = np.array(new_date)
# data['data'] = np.array(new_data)
# data.to_csv('./lyx_data.txt', sep=',', index=None)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
2020-02-23 RNN LSTM语言模型
2018-02-23 二分算法的应用——最大化最小值 POJ2456 Aggressive cows
2018-02-23 考研计划
2017-02-23 (三)系统调用
2017-02-23 (二) 中断、异常和系统调用比较
2017-02-23 (一)系统启动流程