基于python、jupyter-notebook 的金融领域用户交易行为分析
说明:本文重在说明交易数据统计、分析方法,所有数据均为生成的数据
时间原因代码未定义成函数
统计指标:1.用户单日交易行为数据
2.按小时为计算单位,统计用户行为数据(旨在求得一天24小时中每个小时的交易金额、交易量和后再做统计计算)
获取数据代码如下:
1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 __author__ = 'zqf' 4 5 6 import pymysql 7 import random 8 import time 9 from conf import test_conf 10 11 12 con = pymysql.connect(**test_conf.con_set) 13 print("连接成功") 14 cur = con.cursor() 15 time_start = time.time() 16 for i in range(1, 100000): 17 phone_random = random.randint(10000000000, 19999999999) 18 transaction_amount_random = random.uniform(0, 50000) 19 transaction_type_random = random.randint(1, 5) 20 # sql = "select * from my_database.transcation_info" 21 sql = "insert into my_database.transaction_info(user_id, name, phone, transaction_amount, transaction_type, " \ 22 "transaction_time) values('%d', 'name%d', '%d', '%d', '%d', '2019-05-%d %d:%d:%d')" % \ 23 (random.randint(1, 1000), random.randint(1, 1000), phone_random, transaction_amount_random, 24 transaction_type_random, random.randint(1, 31), random.randint(0, 23), random.randint(0, 59), 25 random.randint(0, 59)) 26 27 cur.execute(sql) 28 print("execute完毕即将提交") 29 try: 30 con.commit() 31 print("提交成功") 32 except Exception: 33 print("插入失败") 34 con.rollback() 35 time_stop = time.time() 36 take_time = time_stop - time_start 37 print("花费时间:", take_time) 38 # print(cur.fetchall())
在juoyter-notebook中
导入所需包
# 导入所需包
import pandas as pd import pymysql from datetime import datetime import time from matplotlib import pyplot as plt plt.rcParams['font.family'] = ['sans-serif'] plt.rcParams['font.sans-serif'] = ['SimHei'] # from conf import test_conf
连接sql数据库
1 # 连接数据库 2 con = pymysql.connect(**{ 3 'database': 'my_database', 4 'host': '192.168.**.**', 5 'port': 3306, 6 'user': 'root', 7 'password': '********', 8 'charset': 'utf8' 9 }) 10 print("连接成功") 11 cur = con.cursor() 12 sql = "select * from transaction_info" 13 time_start = time.time() 14 df = pd.read_sql(sql=sql, con=con) 15 time_stop = time.time() 16 take_time = time_stop - time_start 17 print("读取十万条数据花费时间:", take_time) 18 # print(df)
连接成功
读取十万条数据花费时间: 4.09512186050415
# 用户每天每小时最大交易金额 df_groupby_hour = df.groupby(['user_id', df['transaction_time'].apply(lambda item: datetime.strftime(item, '%Y-%m-%d %H'))])
# 计算每天每小时交易金额最大值, 最小值, 平均值, 交易次数, 交易金额总计 df_calculate_by_hour = df_groupby_hour['transaction_amount'].agg([["daily_max", "max"], ["daily_min", "min"], ["daily_mean", "mean"], ["daily_count", "count"], ["daily_sum", "sum"]])
# 注:每天每小时交易金额最大值为每个小时中累计金额的最大值,所以后续需对hour_sum计算相应参数,hour_max、hour_min、hour_mean的后续相应 # 计算均为以天为单位 df_calculate_by_hour
# 将user_id, transaction_time层索引设置成列索引 df_calculate_by_hour_reset_index = df_calculate_by_hour.reset_index() df_calculate_by_hour_reset_index
# 再根据user_id、transaction_time 分组,到天 df_calculate_by_hour_reset_index_regroup = df_calculate_by_hour_reset_index.groupby(['user_id', df_calculate_by_hour_reset_index['transaction_time'].apply(lambda item: datetime.strftime(pd.to_datetime(item), '%Y-%m-%d'))])
# 聚合计算 df_finally = df_calculate_by_hour_reset_index_regroup.agg({'daily_max':['max'], 'daily_min':['min'], 'daily_mean':['mean'], 'daily_count':['sum', 'max', 'min', 'median', 'std', 'mean'], 'daily_sum':['sum', 'max', 'min', 'median', 'std', 'mean']})
# 获取用户交易信息 search_user_id = 1 get_user_message = df_finally.loc[search_user_id] # 将user_id, transaction_time层索引设置成列索引 df_finally_reset = df_finally.reset_index() df_finally.loc[search_user_id]
# 绘制每天参数图 plt.figure(figsize=(10, 6)) plt.xticks(rotation=45) x = df_finally.loc[search_user_id].index li_daily = df_finally.columns.levels[0][:-2] colors = ['y', 'k', 'r'] for index, i in enumerate(li_daily): plt.plot(x, df_finally.loc[search_user_id][i][df_finally[i].columns[0]], label=f"{i}", color=colors[index]) plt.title("每天交易统计") plt.xlabel("日期") plt.ylabel("交易金额") plt.legend() plt.show()
# 绘制交易量交易图 plt.figure(figsize=(10, 6)) plt.xticks(rotation=45) daily_count_str = df_finally.columns.levels[0][3] li_counts = df_finally[daily_count_str].columns x = df_finally.loc[search_user_id].index df_finally.columns.levels[0][3] colors = ['b', 'g', 'c', 'y', 'k', 'r'] for index, i in enumerate(li_counts): plt.plot(x, df_finally.loc[search_user_id][daily_count_str][i], label=f"{i}", color=colors[index]) plt.title("每天交易统计") plt.xlabel("日期") plt.ylabel("交易次数") plt.legend() plt.show()
# 绘制每天每小时交易图 plt.rcParams['font.family'] = ['sans-serif'] plt.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(10, 6)) plt.xticks(rotation=45) x = df_finally.loc[search_user_id].index li_daily_hour = df_finally.columns.levels[1] daily_sum_str = df_finally.columns.levels[0][4] colors = ['b', 'g', 'c', 'y', 'k', 'r'] for index, i in enumerate(li_daily_hour): plt.plot(x, df_finally.loc[search_user_id][daily_sum_str][i], label=f"{i}", color=colors[index]) plt.title("每天每小时交易统计") plt.xlabel("日期") plt.ylabel("交易金额") plt.legend() plt.show()