python生成数据仓库日期维度表
设计一张日期维度表
表名称:DIM_PUB_DATE
具体属性值:
字段名称
|
字段类型
|
字段描述
|
示例
|
ds
|
string
|
Id(主键)
|
20210624
|
d_date
|
string
|
日期
|
2021-06-24
|
d_datetime
|
string
|
日期(包含时分秒)
|
2021-06-24 00:00:00
|
y_ds
|
string
|
去年id
|
20200624(减12月)
|
y_date
|
string
|
去年日期
|
2020-06-24
|
y_datetime
|
string
|
去年日期(包含时分秒)
|
2020-06-24 00:00:00
|
d_year
|
string
|
年份
|
2021
|
d_yearstartday
|
string
|
年初日
|
2021-01-01
|
y_year
|
string
|
去年份
|
2020
|
y_yearstartday
|
string
|
去年初日
|
2020-01-01
|
d_month
|
string
|
月份
|
2021-06
|
d_monthstartday
|
string
|
月初日
|
2021-06-01
|
d_monthendday
|
string
|
月尾日
|
2021-06-30
|
y_month
|
string
|
去年月份
|
2020-06
|
y_monthstartday
|
string
|
去年月初日
|
2020-06-01
|
y_monthendday
|
string
|
去年月尾日
|
2020-06-30
|
d_quarter
|
string
|
季度
|
Q2
|
d_yearquarter
|
string
|
季度(带年份)
|
2021-2
|
week
|
string
|
星期(数字表示)
|
1,2,3,4,5,6,7:1代表星期一
|
d_week
|
string
|
星期几
|
星期一
|
d_dayofyear
|
string
|
当年第几天
|
D100
|
d_dayofmonth
|
string
|
当月第几天
|
D24
|
d_dayofweek
|
string
|
当周第几天
|
D4
|
d_weekofmonth
|
string
|
当月第几周
|
W4
|
d_weekofyear
|
string
|
当年第几周
|
W24
|
is_weekend
|
string
|
是否周末
|
1:是,2:否
|
day_type
|
string
|
日历天类型
|
0:节假日,1:工作日,2:休息日
(调休也属于工作日)
|
hol_name
|
string
|
节假日名称
|
元旦,调休,春节,清明节,劳动节,端午节,中秋节,国庆节
|
生成方法:
使用python生成一张维度表,考虑到通用性,可以先写到Excel或者CSV里面,然后自定义写入到hive或者odps里面去。
Py3语法:
# -*- coding:utf-8 -*- import csv import json from datetime import * import requests from dateutil.relativedelta import * import calendar import pandas as pd from odps import ODPS def getDate(ddate, holiday_list): # 日期 d_date = datetime.strptime(ddate, "%Y-%m-%d").date() # id(主键) ds = str(d_date).replace('-', '') # 日期时间(包含时分秒) d_datetime = datetime(d_date.year,d_date.month,d_date.day) # 去年今日 y_date = d_date + relativedelta(months=-12) # 去年ds y_ds = str(y_date).replace('-', '') # 去年日期 y_datetime = datetime(y_date.year, y_date.month, y_date.day) # 年份 d_year = str(d_date.year) # 年初日 d_yearstartday = d_year+'-01-01' # 年份 y_year = str(y_date.year) # 去年初日 y_yearstartday = y_year + '-01-01' # 月份 d_month = d_year+'-'+str(d_date).split('-')[1] # 月初日 d_monthstartday = d_month + '-01' # 月尾日 d_monthendday = d_month + '-' + str(calendar.monthrange(int(d_year), int(d_date.month))[1]) # 去年月份 y_month = y_year+'-'+str(y_date).split('-')[1] # 去年月初日 y_monthstartday = y_month + '-01' # 去年月尾日 y_monthendday = y_month + '-' + str(calendar.monthrange(int(y_year), int(y_date.month))[1]) # 季度 d_quarter = "Q"+str((int(d_date.month)-1) // 3 + 1) # 星期几 week = d_date.strftime('%u') # 星期 d_week = '' if week == '1': d_week = "星期一" elif week == '2': d_week = "星期二" elif week == '3': d_week = "星期三" elif week == '4': d_week = "星期四" elif week == '5': d_week = "星期五" elif week == '6': d_week = "星期六" else: d_week = "星期日" # 季度带年份 d_yearquarter = y_year+'-'+str((int(d_date.month)-1) // 3 + 1) # 当年第几天 d_dayofyear = d_date.strftime('%j') # 当月第几天 d_dayofmonth = d_date.strftime('%d') # 当周第几天:星期一为第一天 (值从1到7,星期一为1) d_dayofweek = d_date.strftime('%u') # 当月第几周:第一天只要不是星期一,都算一周 d_weekofmonth = (int(d_date.strftime('%W'))-int(date(d_date.year,d_date.month,1).strftime('%W')))+1 # 当年第几周:把星期一做为第一天(值从0到53) d_weekofyear = int(d_date.strftime('%W'))+1 # 是否周末 is_weekend = '' # 日期类型:0:节假日,1:工作日,2:休息日 day_type = '' # 具体节假日名称(包含调休) hol_name = '' if d_dayofweek in ('6', '7'): is_weekend = '1' day_type = '2' else: is_weekend = '0' day_type = '1' # 法定节假日:0,工作日:1,休息日:2 for hol_json in holiday_list: if str(d_date) == json.loads(hol_json)['date']: # 是否节假日 if json.loads(hol_json)['flag'] != "调休": day_type = '0' else: day_type = '1' # 具体节假日(包含调休) hol_name = json.loads(hol_json)['flag'] # 把数据添加到列表中 date_list = [ds, str(d_date), str(d_datetime), y_ds, str(y_date), str(y_datetime), d_year, d_yearstartday, y_year, y_yearstartday, d_month, d_monthstartday, d_monthendday, y_month, y_monthstartday, y_monthendday, d_quarter, d_yearquarter, week, d_week, 'D'+str(d_dayofyear), 'D'+str(d_dayofmonth), 'D'+str(d_dayofweek), 'W'+str(d_weekofmonth), 'W'+str(d_weekofyear), is_weekend, day_type, hol_name] return date_list # 获取节假日,转为需要的格式 def get_holiday_dict(start_year, y_cnt): y_list = [start_year] for i in range(1, y_cnt+1): y_list.append(start_year+i) hol_list = [] dict_res = {} for year in y_list: headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"} response = requests.get('https://timor.tech/api/holiday/year/'+str(year)+'/', headers=headers) res = response.json()['holiday'].items() for i in res: # 日期 dict_res['date'] = i[1]['date'] # 标识节假日(详细) dict_res['name'] = i[1]['name'] # 薪资倍数,3表示是3倍工资 dict_res['wage'] = i[1]['wage'] dict_res['holiday'] = i[1]['holiday'] # 标识节假日 if ("初" in str(i[1]['name'])) or ("除夕" in str(i[1]['name'])): dict_res['flag'] = '春节' elif "调休" in str(i[1]['name']): dict_res['flag'] = '调休' else: dict_res['flag'] = i[1]['name'] hol_list.append(str(json.dumps(dict_res, ensure_ascii=False))) print("节假日数据已经获取完毕!!!") return hol_list #获取生成指定日期范围的数据 def get_ste_day(time_start, time_end, holiday_list): date_l = [datetime.strftime(x, '%Y-%m-%d') for x in list(pd.date_range(start=time_start, end=time_end))] res_list = [] # 遍历日期,生成时间数据 for ddate in date_l: date_list = getDate(ddate, holiday_list) res_list.append(date_list) # 返回结果数据 return res_list # 把数据写入csv里面 def write_csv(res_list): # 1. 创建文件对象 f = open('E:\\dim_pub_date.txt', 'w', encoding='utf-8', newline="") # 2. 基于文件对象构建 csv写入对象 csv_writer = csv.writer(f) # 3. 构建列表头 csv_writer.writerow(["ds", "d_date", "d_datetime", "y_ds", "y_date", "y_datetime", "d_year", "d_yearstartday", "y_year", "y_yearstartday", "d_month", "d_monthstartday", "d_monthendday", "y_month", "y_monthstartday", "y_monthendday", "d_quarter", "d_yearquarter", "week", "d_week", "d_dayofyear", "d_dayofmonth", "d_dayofweek", "d_weekofmonth", "d_weekofyear", "is_weekend", "day_type", "hol_name"]) # 4. 写入csv文件内容 for ls in res_list: if len(ls) != 0: csv_writer.writerow(ls) # 5. 关闭文件 print("数据写入csv完成!!!") f.close() # 把数据写入到odps里面 def write_odps(res_list): # 获取odps环境 o = ODPS('xxx', 'xxx', 'DILI_FRESH', endpoint='http://service.cn-beijing.maxcompute.aliyun.com/api') # 写表 t = o.get_table("dim_pub_date") with t.open_writer(partition="year=2018", create_partition=True) as writer: writer.write(res_list) print("数据写入odps完成!!!") # 把数据写入到hive里面 def write_hive(res_list): pass if __name__ == '__main__': # 初始时间和结束时间 time_start = date(2018, 1, 1) time_end = date(2018, 12, 31) # 节假日年份 y_cnt = time_end.year-time_start.year # 获取节假日信息 holiday_list = get_holiday_dict(time_start.year, y_cnt) # 生成指定的时间范围数据 res_list = get_ste_day(time_start, time_end, holiday_list) # 把数据写入到csv文件中 write_csv(res_list) # 把数据写入到odps中 write_odps(res_list) # 把数据写入到hive中 write_hive(res_list)
节假日获取参考:http://timor.tech/api/holiday