运行环境: Python 2.7.15
依赖安装: pip install pymysql
          pip install pyspark
constant 模块中保存各个 MySQL 数据库的连接串配置 (mysqlReportDict)
1 # coding: utf-8 2 # -*- coding:utf-8 -*- 3 import sys 4 import argparse 5 import constant 6 from datetime import datetime, timedelta 7 8 reload(sys) 9 sys.setdefaultencoding('utf-8') 10 import pymysql 11 from pyspark import SparkConf, SparkContext 12 from pyspark.sql import HiveContext 13 14 15 class Hive2Mysql: 16 # 初始化参数 17 def __init__(self): 18 host, port, user, passwd, db, target_table, hql, dt, start_date, end_date = self.parseParameter() 19 self.host = host 20 self.port = port 21 self.db = db 22 self.user = user 23 self.passwd = passwd 24 self.target_table = target_table 25 self.hql = hql 26 self.start_date = start_date 27 self.end_date = end_date 28 self.charset = 'utf8' 29 30 def validateWarehouseParameter(self): 31 # 参数设置 32 parser = argparse.ArgumentParser(description='relationalDatabase from hive ads table') 33 parser.add_argument('-b', '--business', action="store", required=True, help='业务名(和数据仓库分层对应)') 34 parser.add_argument('-f', '--hsql', action="store", required=True,help='mysql表: jdbc|target_table|hql|dt|start_dt|end_dt') 35 36 parameter = parser.parse_args() 37 return parameter.business, parameter.hsql 38 39 def parseParameter(self): 40 business, tomysql = self.validateWarehouseParameter() 41 jdbc, target_table, hql, dt, start_dt, end_dt = tomysql.split("|") 42 jdbc = constant.mysqlReportDict[jdbc] 43 host = jdbc['host'] 44 port = jdbc['port'] 45 user = jdbc['username'] 46 passwd = jdbc['passwd'] 47 db = jdbc['db'] 48 return host, port, user, passwd, db, target_table, hql, dt, start_dt, end_dt 49 50 def tomysql(self, sqlContext): 51 url = "jdbc:mysql://%(ip)s:%(port)d/%(db)s" % {'ip': self.host, 'port': self.port, 'db': self.db} 52 print (url, self.target_table) 53 54 if self.end_date == '': 55 endDate = datetime.today().strftime('%Y-%m-%d') 56 deltaDays = 1 57 if self.start_date: 58 workBeginDate = datetime.strptime(self.start_date, '%Y-%m-%d') 59 workEndDate = datetime.strptime(self.end_date, '%Y-%m-%d') 60 deltaDays = (workEndDate - 
workBeginDate).days 61 62 ExecDate = datetime.strptime(self.start_date, '%Y-%m-%d') if self.start_date != '' else (datetime.today() - timedelta(days = 1)) 63 64 for i in range(deltaDays): 65 currExecDate = ExecDate + timedelta(days = i) 66 67 workDate = currExecDate.strftime("%Y-%m-%d") 68 createDate = currExecDate.strftime("%Y%m%d") 69 70 conn = pymysql.connect(host=self.host, user=self.user, password=self.passwd, database=self.db, port=self.port, charset=self.charset) 71 cur = conn.cursor() 72 print "delete " + workDate + "data" 73 cur.execute("delete from {0} where dt='{1}' ".format(self.target_table, workDate)) 74 conn.commit() 75 conn.close() 76 77 hql = self.hql.format(workDate) 78 df = sqlContext.sql(hql) 79 print hql 80 df.show(20) 81 df.write.jdbc(url, self.target_table, "append", dict(user=self.user, password=self.passwd)) 82 83 84 if __name__ == '__main__': 85 hive2mysql = Hive2Mysql() 86 conf = SparkConf().setMaster("local").setAppName("spark_json1") 87 sc = SparkContext(conf=conf) 88 sqlContext = HiveContext(sc) 89 90 hive2mysql.tomysql(sqlContext) 91 print ""