数据分析之两种用户分群方法(RFM和聚类)
本文由于没有现成的数据,就自己生成了一些商品订单数据,基于该数据进行了RFM和聚类的构建
1.数据的生成
数据库表操作
1 use my_work; 2 3 -- 创建商品订单表 4 CREATE table goods_orders_ful( 5 user_id varchar(100), -- 用户id 6 order_id varchar(100), -- 订单id 7 is_paid bool, -- 用户是否实际支付,1支付;0未支付 8 amount double, -- 订单金额 9 created_date date, -- 订单生成日期 yyyy-mm-dd 10 created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss 11 business_type varchar(10), -- 业务类型 12 region_name varchar(10), -- 所属区域:如 东部地区 13 order_source_name varchar(10), -- 订单渠道:Web、H5、App 14 is_done bool -- 订单是否完成 15 ); 16 17 -- 创建用户订单行为中间表 18 drop table if exists user_info_frm_01; 19 CREATE table user_info_frm_01 20 as 21 select gof.user_id, 22 sum(gof.amount) all_of_money, 23 max(gof.created_date) latest_date, 24 count(gof.order_id) all_of_orders 25 FROM goods_orders_ful gof 26 where gof.is_paid = 1 27 and gof.is_done = 1 28 and gof.created_date >= '2020-01-01' 29 and gof.created_date < '2020-07-01' 30 group by gof.user_id; 31 32 SELECT count(*) from user_info_frm_01 uif; 33 SELECT * from user_info_frm_01 uif limit 10 34 35 -- 创建行为指标均值表 36 create table if not exists user_info_frm_02 37 as 38 select avg(uif.all_of_money) all_of_money_avg, 39 avg(datediff('2020-07-22', uif.latest_date)) latest_days_avg, 40 avg(uif.all_of_orders) orders_avg 41 from user_info_frm_01 uif; 42 43 SELECT * from user_info_frm_02; 44 -- 消费均值1107.10,最小天数均值86.9,订单数量均值2.1 45 46 -- 将用户进行rfm一级打标 47 create table if not exists user_info_frm_03 48 as 49 SELECT uif.user_id, 50 case when uif.all_of_money >= 1107.10 51 then '高' 52 else '低' 53 end money, 54 case when datediff('2020-07-22', uif.latest_date) >= 86.9 55 then '高' 56 else '低' 57 end recency, 58 case when uif.all_of_orders >= 2.1 59 then '高' 60 else '低' 61 end frequency 62 from user_info_frm_01 uif; 63 64 -- 将用户进行二级打标 65 create table if not exists user_info_frm_04 66 as 67 select uif.user_id, 68 uif.recency, 69 uif.frequency, 70 uif.money, 71 case when uif.recency = '高' and uif.frequency = '高' and uif.money = '高' 72 then '重要价值用户' 73 when uif.recency = '低' and uif.frequency = '高' and uif.money = '高' 74 then '重要保持用户' 75 when uif.recency = '高' and uif.frequency = '低' and uif.money = '高' 76 then '重要发展用户' 77 when uif.recency = '低' and uif.frequency = '低' and uif.money = '高' 78 then '重要挽留用户' 79 when uif.recency = '高' and uif.frequency = '高' and uif.money = '低' 80 then '一般价值用户' 81 when uif.recency = '低' and uif.frequency = '高' and uif.money = '低' 82 then '一般保持用户' 83 when uif.recency = '高' and uif.frequency = '低' and uif.money = '低' 84 then '一般发展用户' 85 when uif.recency = '低' and uif.frequency = '低' and uif.money = '低' 86 then '一般挽留用户' 87 else NULL 88 end type 89 90 from user_info_frm_03 uif;
python 程序生成数据
1 # _*_ coding: utf-8 _*_ # 2 # @Time :2020/7/25 7:30 下午 3 # @Author :Zhx 4 5 6 import pymysql 7 import uuid 8 import random 9 import time 10 11 12 class CreateData(object): 13 14 def __init__(self): 15 pass 16 17 @staticmethod 18 def create(): 19 user_id_ = random.randint(1, 5000) 20 order_id_ = uuid.uuid1() 21 is_paid_ = random.choice([1, 0, 1, 1, 1, 1, 1, 1, 1, 1]) 22 amount_ = random.uniform(10, 1000) 23 a1 = (2020, 1, 1, 0, 0, 0, 0, 0, 0) 24 a2 = (2020, 6, 31, 23, 59, 59, 0, 0, 0) 25 26 start = time.mktime(a1) # 生成开始时间戳 27 end = time.mktime(a2) # 生成结束时间戳 28 29 # 随机生成10个日期字符串 30 t = random.randint(start, end) # 在开始和结束时间戳中随机取出一个 31 date_tuple = time.localtime(t) # 将时间戳生成时间元组 32 created_date_ = time.strftime("%Y-%m-%d", date_tuple) # 将时间元组转成格式化字符串 33 created_time_ = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple) 34 business_type_ = random.randint(0, 20) 35 region_name_ = random.choice(['东', '西', '南', '北']) 36 order_source_name_ = random.choice(['Web', 'app', 'H5']) 37 is_done_ = is_paid_ 38 return user_id_, order_id_, is_paid_, amount_, created_date_, created_time_, \ 39 business_type_, region_name_, order_source_name_, is_done_ 40 41 42 if __name__ == '__main__': 43 database = 'my_work' 44 table = 'goods_orders_ful' 45 counts = 10000 46 create_data = CreateData() 47 con = pymysql.connect(database=database, host='localhost', 48 user='root', port=3306, password='199498zhx@') 49 cur = con.cursor() 50 for i in range(counts): 51 user_id, order_id, is_paid, amount, created_date, created_time, \ 52 business_type, region_name, order_source_name, is_done = create_data.create() 53 sql = """insert into %s.%s values('%s', '%s', %d, %f, '%s', '%s', '%s', '%s', '%s', %d)""" % \ 54 (database, table, user_id, order_id, is_paid, amount, created_date, created_time, business_type, 55 region_name, order_source_name, is_done) 56 try: 57 cur.execute(sql) 58 print(i, i % 1000) 59 con.commit() 60 except Exception as e: 61 print(e) 62 con.rollback() 63 con.close() 64 cur.close()
源数据字段有:
user_id varchar(100), -- 用户id
order_id varchar(100), -- 订单id
is_paid bool, -- 用户是否实际支付,1支付;0未支付
amount double, -- 订单金额
created_date date, -- 订单生成日期 yyyy-mm-dd
created_time timestamp, -- 订单生成时间 yyyy-mm-dd hh:mm:ss
business_type varchar(10), -- 业务类型
region_name varchar(10), -- 所属区域:如 东部地区
order_source_name varchar(10), -- 订单渠道:Web、H5、App
is_done bool -- 订单是否完成
RFM 模型最终表数据
最终的可视化分析使用jupyter完成