Python 2.7.15

Install the dependencies:

       pip install pymysql
       pip install pyspark

The constant module holds the database connection-string information.
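For reference, here is a minimal sketch of what the constant module might contain. The alias name ("report") and the host/port/credential values are hypothetical placeholders; only the key names (host, port, username, passwd, db) match what the script reads from mysqlReportDict:

# constant.py (sketch, assumed layout)
# Maps a JDBC alias passed on the command line to MySQL connection settings.
mysqlReportDict = {
    'report': {
        'host': '127.0.0.1',        # placeholder host
        'port': 3306,               # kept as an int, formatted with %d below
        'username': 'report_user',  # placeholder user
        'passwd': '******',         # placeholder password
        'db': 'report_db',          # placeholder database name
    }
}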
# -*- coding: utf-8 -*-
import sys
import argparse
import constant
from datetime import datetime, timedelta

# Python 2 only: force the default string encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')
import pymysql
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


class Hive2Mysql:
    # Parse the command-line arguments and store the connection / job parameters
    def __init__(self):
        host, port, user, passwd, db, target_table, hql, dt, start_date, end_date = self.parseParameter()
        self.host = host
        self.port = port
        self.db = db
        self.user = user
        self.passwd = passwd
        self.target_table = target_table
        self.hql = hql
        self.start_date = start_date
        self.end_date = end_date
        self.charset = 'utf8'

    def validateWarehouseParameter(self):
        # Command-line arguments
        parser = argparse.ArgumentParser(description='relationalDatabase from hive ads table')
        parser.add_argument('-b', '--business', action="store", required=True, help='business name (matches the data warehouse layer)')
        parser.add_argument('-f', '--hsql', action="store", required=True, help='mysql export spec: jdbc|target_table|hql|dt|start_dt|end_dt')

        parameter = parser.parse_args()
        return parameter.business, parameter.hsql

    def parseParameter(self):
        business, tomysql = self.validateWarehouseParameter()
        # -f is a pipe-separated spec: jdbc_alias|target_table|hql|dt|start_dt|end_dt
        jdbc, target_table, hql, dt, start_dt, end_dt = tomysql.split("|")
        # Look up the MySQL connection settings for this alias
        jdbc = constant.mysqlReportDict[jdbc]
        host = jdbc['host']
        port = jdbc['port']
        user = jdbc['username']
        passwd = jdbc['passwd']
        db = jdbc['db']
        return host, port, user, passwd, db, target_table, hql, dt, start_dt, end_dt

    def tomysql(self, sqlContext):
        url = "jdbc:mysql://%(ip)s:%(port)d/%(db)s" % {'ip': self.host, 'port': self.port, 'db': self.db}
        print (url, self.target_table)

        # Decide how many days to export and where the date range starts.
        deltaDays = 1
        if self.end_date == '':
            self.end_date = datetime.today().strftime('%Y-%m-%d')
        if self.start_date:
            workBeginDate = datetime.strptime(self.start_date, '%Y-%m-%d')
            workEndDate = datetime.strptime(self.end_date, '%Y-%m-%d')
            deltaDays = (workEndDate - workBeginDate).days

        # Default to yesterday when no start date is given.
        ExecDate = datetime.strptime(self.start_date, '%Y-%m-%d') if self.start_date != '' else (datetime.today() - timedelta(days=1))

        for i in range(deltaDays):
            currExecDate = ExecDate + timedelta(days=i)

            workDate = currExecDate.strftime("%Y-%m-%d")
            createDate = currExecDate.strftime("%Y%m%d")

            # Delete the day's existing rows first so re-runs stay idempotent.
            conn = pymysql.connect(host=self.host, user=self.user, password=self.passwd, database=self.db, port=self.port, charset=self.charset)
            cur = conn.cursor()
            print "delete " + workDate + " data"
            cur.execute("delete from {0} where dt='{1}' ".format(self.target_table, workDate))
            conn.commit()
            conn.close()

            # Run the templated HQL for this date and append the result to MySQL.
            hql = self.hql.format(workDate)
            df = sqlContext.sql(hql)
            print hql
            df.show(20)
            df.write.jdbc(url, self.target_table, "append", dict(user=self.user, password=self.passwd))


if __name__ == '__main__':
    hive2mysql = Hive2Mysql()
    # local master for testing; in production the master would come from spark-submit
    conf = SparkConf().setMaster("local").setAppName("spark_json1")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    hive2mysql.tomysql(sqlContext)
    print ""
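
To run the export, the script is submitted to Spark with the two required arguments. Below is a sketch of an invocation; the file name hive2mysql.py, the report alias, the ads business name, and the ads_user_daily table/query are made up for illustration — only the -b/-f argument names and the pipe-separated -f layout (jdbc|target_table|hql|dt|start_dt|end_dt) come from the script. The {0} placeholder in the hql part is filled with each day's date by self.hql.format(workDate), the date range covers start_dt up to but not including end_dt, and df.write.jdbc needs the MySQL connector jar on the Spark classpath (the jar version here is illustrative):

spark-submit --master local[*] \
    --jars mysql-connector-java-5.1.47.jar \
    hive2mysql.py \
    -b ads \
    -f "report|ads_user_daily|select dt, uv, pv from ads.ads_user_daily where dt='{0}'|dt|2020-07-01|2020-07-06"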

 
