MySQL2HIVE批量建表
1.mysql2hive 数据类型转换
点击查看代码
from re import fullmatch
def column_type_mysql2hive(mysql_column_type):
"""MySQL列数据类型转成HIVE的"""
# tinyint
if fullmatch('^tinyint.+unsigned', mysql_column_type):
return 'BIGINT'
elif fullmatch('^tinyint.*', mysql_column_type):
return 'BIGINT'
# smallint
elif fullmatch('^smallint.+unsigned', mysql_column_type):
return 'BIGINT'
elif fullmatch('^smallint.*', mysql_column_type):
return 'BIGINT'
# mediumint
elif fullmatch('^mediumint.*', mysql_column_type):
return 'BIGINT'
# int
elif fullmatch('^int.+unsigned', mysql_column_type):
return 'BIGINT'
elif fullmatch('^int.*', mysql_column_type):
return 'BIGINT'
# bigint
elif fullmatch('^bigint.+unsigned', mysql_column_type):
# return 'STRING'
return 'BIGINT' # 无符号BIGINT可能会越界
elif fullmatch('^bigint.*', mysql_column_type):
return 'BIGINT'
# double、float、decimal
elif fullmatch('^double.*', mysql_column_type):
return 'DOUBLE'
elif fullmatch('^float.*', mysql_column_type):
return 'DOUBLE'
elif fullmatch('^decimal.*', mysql_column_type):
return 'decimal'
# date
elif fullmatch('^date.*', mysql_column_type):
return 'date'
# json
elif fullmatch('^json.*', mysql_column_type):
return 'MAP<STRING,STRING>'
# 其它
else:
return 'STRING'
点击查看代码
2.get_all_dbs
import pymysql
class Mysql:
# mysql 端口号,注意:必须是int类型
def __init__(self, host, user, passwd, port, db_name):
self.host = host
self.user = user
self.passwd = passwd
self.port = port
self.db_name = db_name
def select(self, sql):
"""
执行sql命令
:param sql: sql语句
:return: 元祖
"""
try:
conn = pymysql.connect(
host=self.host,
user=self.user,
passwd=self.passwd,
port=self.port,
database=self.db_name,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor
)
cur = conn.cursor() # 创建游标
# conn.cursor()
cur.execute(sql) # 执行sql命令
res = cur.fetchall() # 获取执行的返回结果
cur.close()
conn.close()
return res
except Exception as e:
print(e)
return False
def get_all_db(self):
"""
获取所有数据库名
:return: list
"""
# 排除自带的数据库
exclude_list = ["sys", "information_schema", "mysql", "performance_schema"]
sql = "show databases" # 显示所有数据库
res = self.select(sql)
# print(res)
if not res: # 判断结果非空
return False
db_list = [] # 数据库列表
for i in res:
db_name = i['Database']
# 判断不在排除列表时
if db_name not in exclude_list:
db_list.append(db_name)
if not db_list:
return False
return db_list
def get_all_tbs(self):
sql_get_tbs = f"SELECT TABLE_NAME FROM information_schema.tables WHERE table_schema = '{self.db_name}' AND TABLE_TYPE = 'BASE TABLE';"
res=self.select(sql_get_tbs)
tb_list=[]
for i in res:
tb_name = i['TABLE_NAME']
tb_list.append(tb_name)
return tb_list
if __name__ == '__main__':
host = ""
user = ""
passwd = ""
port = 3306
点击查看代码
3.generate_hive_sql
`from re import fullmatch
from pymysql import Connection # conda install -y pymysql
import data_type_trans
# 查询MySQL表的列名、列类型和列注释
SQL_COLUMNS = '''
SELECT
`COLUMN_NAME` -- 列名
,`COLUMN_TYPE` -- 类型
,`COLUMN_COMMENT` -- 列注释
FROM `information_schema`.`COLUMNS`
WHERE `TABLE_SCHEMA`='{TABLE_SCHEMA}'
AND `TABLE_NAME`='{TABLE_NAME}'
ORDER BY `ORDINAL_POSITION`;
'''.strip().format
# 查询MySQL表的注释
SQL_COMMENT = '''SELECT `TABLE_COMMENT`
FROM `information_schema`.`TABLES`
WHERE `TABLE_SCHEMA`='{TABLE_SCHEMA}'
AND `TABLE_NAME`='{TABLE_NAME}';
'''.strip().format
# HIVE表前缀
HIVE_PREFIX = 'ods_'
# HIVE建表语句
HIVE_DDL = '''
CREATE EXTERNAL TABLE `{table}`(
{columns}
) COMMENT '{table_comment}'
PARTITIONED BY (`mysql2hive_upload_time` STRING COMMENT 'yyyy-mm-dd')
row format delimited fields terminated by '\001'
'''.strip().format
# MySQL原表的建表语句,用于参照
MYSQL_DDL = "SHOW CREATE TABLE `{TABLE_SCHEMA}`.`{TABLE_NAME}`".format
class Mysql:
def __init__(self,db_name):
self.db = Connection(
host="",
user = "",
passwd = "",
port = 3306,
database=db_name
)
self.cursor = self.db.cursor()
def __del__(self):
self.cursor.close()
self.db.close()
def commit(self, sql):
try:
self.cursor.execute(sql)
self.db.commit()
except Exception as e:
print(e)
def fetchall(self, query):
self.cursor.execute(query)
return self.cursor.fetchall() # 有数据:tuple of tuple;无数据:()
def get_columns(self, db, tb):
columns = []
for c_name, c_type, c_comment in self.fetchall(SQL_COLUMNS(TABLE_SCHEMA=db, TABLE_NAME=tb)):
hive_type = data_type_trans.column_type_mysql2hive(c_type)
c_comment=str(c_comment).replace("'", "")
columns.append(f" `{c_name}` {hive_type} COMMENT '{c_comment}',")
return '\n'.join(columns).rstrip(',')
def get_table_comment(self, db, tb):
return self.fetchall(SQL_COMMENT(TABLE_SCHEMA=db, TABLE_NAME=tb))[0][0]
def get_hive_ddl(self, db, tb, prefix='ods_mysql_', postfix='_full'):
columns = self.get_columns(db, tb)
comment = self.get_table_comment(db, tb)
table = tb
return HIVE_DDL(table=table, columns=columns, table_comment=comment)
def get_mysql_ddl(self, db, tb):
return self.fetchall(MYSQL_DDL(TABLE_SCHEMA=db, TABLE_NAME=tb))[0][1]
if __name__ == '__main__':
TABLE_SCHEMA = ''
TABLE_NAME = ''
m = Mysql(db_name='')
print('源MySQL建表语句'.center(99, '-'))
print(m.get_mysql_ddl(TABLE_SCHEMA, TABLE_NAME))
print('HIVE建表语句'.center(99, '-'))
print(m.get_hive_ddl(TABLE_SCHEMA, TABLE_NAME))
点击查看代码
4. execute hsql
from pyhive import hive
from TCLIService.ttypes import TOperationState
import data_type_trans
import generate_hive_tbs
import get_all_dbs
from pyhive import hive
import os
def hive_conn():
conn = hive.Connection(host='',
port=10000,
auth="CUSTOM",
database='default',
username='',
password='')
return conn
def generate_hive_table(db_list):
conn = hive_conn()
curr = conn.cursor()
mysql2hive = generate_hive_tbs.Mysql(db_name='information_schema') # 导入hive建表模块
#将运行中抛出的异常写入文件
file=open("error_logs.txt","w")
# 创建对应库下的所有表
for e in db_list: # 遍历当前库,获取当前库内所有表
obj = get_all_dbs.Mysql(host, user, passwd, port, db_name=e)
all_tb_list = obj.get_all_tbs()
curr.execute(f"use {e}")
for tb in all_tb_list:
try:
create_hive_tb_hql = mysql2hive.get_hive_ddl(e, tb) # 生成hive对应mysql的建表hql
curr.execute(create_hive_tb_hql)
except Exception as error:
print(error)
print(e + ':' +tb+'\n'+create_hive_tb_hql)
file.write(str(error)+'\n')
file.write(str(e + ':' +tb)+'\n'+str(create_hive_tb_hql)+'\n')
file.write("-----------------------------------"+'\n')
file.close()
curr.close()
conn.close() # 关闭连接
def generate_hive_dbs(list_dbs_name):#传进mysql数据库list
#list_dbs_name = ['0rigion_audit','0rigion_lims', '0rigion_usercenter', 'attendance', 'auditserver', 'bedu_audit', 'bedu_cms', 'bedu_config', 'bedu_datashare', 'bedu_dc', 'bedu_file_server', 'bedu_financial', 'bedu_instruments', 'bedu_lims', 'bedu_oauth2', 'bedu_practice', 'bedu_process', 'bedu_teach', 'bedu_teacher_info_center', 'bedu_transcript', 'bedu_usercenter', 'bedu_xmgl', 'bsdzh_bak', 'bsdzh_cms', 'bsdzh_cms_official_cms', 'bsdzh_cms_tsfront', 'bsdzh_config', 'bsdzh_datashare', 'bsdzh_dc', 'bsdzh_file_server', 'bsdzh_lims', 'bsdzh_middle', 'bsdzh_oauth2', 'bsdzh_process', 'bsdzh_transcript', 'bsdzh_usercenter', 'bsdzh_xmgl', 'bsdzhlims_cms_back', 'chb_audit', 'chb_cms', 'chb_datashare', 'chb_file', 'chb_lims', 'chb_oauth2', 'chb_teach', 'chb_transcript', 'chb_usercenter', 'chd_audit', 'chd_cms', 'chd_datashare', 'chd_examserver', 'chd_file_server', 'chd_instruments', 'chd_iot', 'chd_oauth2', 'chd_usercenter', 'dhu_attendance', 'dhu_audit', 'dhu_cms', 'dhu_datashare', 'dhu_file_server', 'dhu_info_center', 'dhu_instruments', 'dhu_iot', 'dhu_lims', 'dhu_message', 'dhu_oauth2', 'dhu_teach', 'dhu_tmp', 'dhu_transcript', 'dhu_usercenter', 'dhu_visual', 'djtu_audit', 'djtu_back_lab_cms', 'djtu_cms', 'djtu_config', 'djtu_datashare', 'djtu_examserver', 'djtu_file_server', 'djtu_ilab_cms', 'djtu_iniciacl_cms', 'djtu_iot', 'djtu_lab3_cms', 'djtu_lims', 'djtu_message', 'djtu_oauth2', 'djtu_process', 'djtu_teach', 'djtu_tmp', 'djtu_transcript', 'djtu_usercenter', 'dlu_cms', 'examserver', 'experiment', 'fbmu_audit', 'fbmu_usercenter', 'financial', 'gengshang_cms', 'gengshang_oauth2', 'gengshang_usercenter', 'gvsun_access', 'gvsun_analysis', 'gvsun_appointment', 'gvsun_cms', 'gvsun_config', 'gvsun_customer', 'gvsun_datashare', 'gvsun_es', 'gvsun_ilab_cms', 'gvsun_iotcloud', 'gvsun_lab_cms', 'gvsun_lims', 'gvsun_oauth2', 'gvsun_practice', 'gvsun_process', 'gvsun_studentinnovation', 'gvsun_usercenter', 'gvsun_visual', 'gvsuntms_new', 'hainanu_access', 'hainanu_appointment', 'hainanu_attendance', 'hainanu_audit', 'hainanu_cms', 'hainanu_config', 'hainanu_datashare', 'hainanu_es', 'hainanu_exam', 'hainanu_file_server', 'hainanu_financial', 'hainanu_info_center', 'hainanu_instruments', 'hainanu_iot', 'hainanu_lab_cms', 'hainanu_lims', 'hainanu_message', 'hainanu_middle_cms', 'hainanu_oauth2', 'hainanu_official_cms', 'hainanu_practice', 'hainanu_process', 'hainanu_school_cms', 'hainanu_teach', 'hainanu_transcript', 'hainanu_usercenter', 'hainanu_visual', 'hainanu_xmgl', 'haishi_attendance', 'haishi_audit', 'haishi_cms', 'haishi_config', 'haishi_datashare', 'haishi_file_server', 'haishi_financial', 'haishi_instruments', 'haishi_iot', 'haishi_lims', 'haishi_message', 'haishi_oauth2', 'haishi_process', 'haishi_studentinnovation', 'haishi_teach', 'haishi_tmp', 'haishi_transcript', 'haishi_usercenter', 'haishi_visual', 'haishi_xmgl', 'haishinew_appointment', 'haishinew_attendance', 'haishinew_audit', 'haishinew_cms', 'haishinew_config', 'haishinew_datashare', 'haishinew_file_server', 'haishinew_gengshang_cms', 'haishinew_gvsun_cms', 'haishinew_instruments', 'haishinew_iot', 'haishinew_jiaoshilou_cms', 'haishinew_lims', 'haishinew_lubanlou_ent_cms', 'haishinew_message', 'haishinew_oauth2', 'haishinew_process', 'haishinew_teach', 'haishinew_tmp', 'haishinew_transcript', 'haishinew_usercenter', 'haishinew_visual', 'haishinew_xmgl', 'haiyang_cms', 'haiyang_datashare', 'haiyang_file_server', 'haiyang_oauth2', 'haiyang_teach', 'haiyang_transcript', 'haiyang_usercenter', 'hdu_audit', 'hdu_cms', 'hdu_config', 'hdu_datashare', 'hdu_file_server', 'hdu_lims', 'hdu_lims_test', 'hdu_message', 'hdu_oauth2', 'hdu_oauth2_test', 'hdu_process', 'hdu_sync', 'hdu_usercenter', 'hiananu_config_bak', 'hust_attendacne', 'hust_audit', 'hust_cms', 'hust_datashare', 'hust_file_server', 'hust_iot', 'hust_lims', 'hust_oauth2', 'hust_teach', 'hust_tmp', 'hust_usercenter', 'hustleao_audit', 'hustleao_cms', 'hustleao_config', 'hustleao_datashare', 'hustleao_file_server', 'hustleao_ilab_cms', 'hustleao_lab_cms', 'hustleao_lims', 'hustleao_oauth2', 'hustleao_process', 'hustleao_usercenter', 'hx_access', 'hx_appointment', 'hx_attendance', 'hx_audit', 'hx_cms', 'hx_config', 'hx_datashare', 'hx_es', 'hx_exam', 'hx_file', 'hx_inst', 'hx_iot', 'hx_lab_cms', 'hx_lims', 'hx_lubanlou_cms', 'hx_message', 'hx_middle_cms', 'hx_oauth', 'hx_official_cms', 'hx_opsm', 'hx_practice', 'hx_process', 'hx_teach', 'hx_tmp', 'hx_transcript', 'hx_user', 'hx_user_info', 'hx_visual', 'hx_xmgl', 'inicial_access', 'inicial_appointment', 'inicial_attendance', 'inicial_audit', 'inicial_cms', 'inicial_config', 'inicial_datashare', 'inicial_es', 'inicial_exam', 'inicial_file', 'inicial_inst', 'inicial_iot', 'inicial_iotcloud', 'inicial_lims', 'inicial_message', 'inicial_oauth', 'inicial_opsm', 'inicial_practice', 'inicial_process', 'inicial_projm', 'inicial_teach', 'inicial_transcript', 'inicial_user', 'inicial_visual', 'insproduct', 'iot', 'iot_management', 'jjysta_audit', 'jjysta_cms', 'jjysta_file_server', 'jjysta_lims', 'jjysta_oauth2', 'jjysta_responsible', 'jjysta_teach', 'jjysta_usercenter', 'lida_attendance', 'lida_audit', 'lida_cms', 'lida_config', 'lida_datashare', 'lida_examserver', 'lida_file_server', 'lida_iot', 'lida_lims', 'lida_message', 'lida_oauth2', 'lida_process', 'lida_teach', 'lida_teacher_info_center', 'lida_transcript', 'lida_usercenter', 'limsproduct', 'lubanlou_cms', 'mask_tmp', 'message', 'new_file_server', 'nwulims_teach', 'official_cms', 'osgvsun_cms', 'osgvsun_file_server', 'osgvsun_oauth2', 'osgvsun_usercenter', 'process', 'report_center', 'sbsjsjxy_audit', 'sbsjsjxy_cms', 'sbsjsjxy_config', 'sbsjsjxy_lims', 'sbsjsjxy_oauth2', 'sbsjsjxy_process', 'sbsjsjxy_teach', 'sbsjsjxy_usercenter', 'sbsjsxy_file_server', 'shafc_audit', 'shafc_cms', 'shafc_datashare', 'shafc_datashare_new', 'shafc_file_server', 'shafc_lims', 'shafc_oauth2', 'shafc_tmp', 'shafc_usercenter', 'shangfei_cms', 'shangfei_datashare', 'shangfei_file_server', 'shangfei_lims', 'shangfei_oauth2', 'shangfei_teach', 'shangfei_transcript', 'shangfei_usercenter', 'shgzi_file_server', 'shhzi_cms', 'shhzi_datashare', 'shhzi_file_server', 'shhzi_oauth2', 'shhzi_teach', 'shhzi_transcript', 'shhzi_usercenter', 'shifan_access', 'shifan_appointment', 'shifan_attendance', 'shifan_audit', 'shifan_back_lab_cms', 'shifan_cms', 'shifan_cms1', 'shifan_config', 'shifan_datashare', 'shifan_examserver', 'shifan_file_server', 'shifan_financial', 'shifan_ilab_cms', 'shifan_instruments', 'shifan_iot', 'shifan_lab_cms', 'shifan_lims', 'shifan_message', 'shifan_middle_cms', 'shifan_oauth2', 'shifan_official', 'shifan_practice', 'shifan_process', 'shifan_studentinnovation', 'shifan_teach', 'shifan_transcript', 'shifan_usercenter', 'shifan_visual', 'shifan_xmgl', 'shifanlims_official', 'shjdhj_cms', 'shjdhj_lims', 'shjdhuangjing_datashare', 'shjdhuanjing_audit', 'shjdhuanjing_cms', 'shjdhuanjing_datashare', 'shjdhuanjing_file_server', 'shjdhuanjing_iot', 'shjdhuanjing_lims', 'shjdhuanjing_oauth2', 'shjdhuanjing_teach', 'shjdhuanjing_usercenter', 'shou_audit', 'shou_cms', 'shou_config', 'shou_datashare', 'shou_file_server', 'shou_lims', 'shou_oauth2', 'shou_usercenter', 'showdoc', 'shs_access', 'shs_appointment', 'shs_attendance', 'shs_auditserver', 'shs_cms', 'shs_cms_back', 'shs_cms_dss', 'shs_cms_front', 'shs_cms_middle', 'shs_config', 'shs_datashare', 'shs_es', 'shs_exam', 'shs_file_server', 'shs_financial', 'shs_info_center', 'shs_instruments', 'shs_iot', 'shs_lims', 'shs_message', 'shs_oauth2', 'shs_practice', 'shs_process', 'shs_teach', 'shs_transcript', 'shs_usercenter', 'shs_visual', 'shs_xmgl', 'shsxy_cms', 'sjtu_practice', 'spkj_audit', 'spkj_back_cms', 'spkj_cms', 'spkj_config', 'spkj_datashare', 'spkj_file_server', 'spkj_lab3_cms', 'spkj_lab_cms', 'spkj_lims', 'spkj_message', 'spkj_mid_cms', 'spkj_oauth2', 'spkj_usercenter', 'sta_cms', 'sudayx_cms', 'sudayx_config', 'sudayx_datashare', 'sudayx_file_server', 'sudayx_lims', 'sudayx_oauth2', 'sudayx_process', 'sudayx_teach', 'sudayx_transcript', 'sudayx_usercenter', 'teacher_info_center', 'test1', 'test_datashare', 'tju_audit', 'tju_cms', 'tju_datashare', 'tju_file_server', 'tju_gengshang_cms', 'tju_gvsun_cms', 'tju_iot', 'tju_jiaoshilou_cms', 'tju_lab_cms', 'tju_lims', 'tju_oauth2', 'tju_usercenter', 'tmp', 'tmp_config', 'transcript', 'usst_audit', 'usst_cms', 'usst_datashare', 'usst_file_server', 'usst_lims', 'usst_oauth2', 'usst_teach', 'usst_transcript', 'usst_usercenter', 'usstlims_cms_lab1', 'usstlims_cms_lab2', 'usstlims_cms_lab3', 'usstlims_cms_lab4', 'xaau_appointment', 'xaau_attendance', 'xaau_audit', 'xaau_back_cms', 'xaau_cms', 'xaau_config', 'xaau_datashare', 'xaau_exam', 'xaau_file_server', 'xaau_iot', 'xaau_jiaoshi_cms', 'xaau_lab_cms', 'xaau_lims', 'xaau_message', 'xaau_oauth2', 'xaau_official_cms', 'xaau_process', 'xaau_teach', 'xaau_tmp', 'xaau_transcript', 'xaau_usercenter', 'xaau_visual', 'xasyu_cms', 'xh_access', 'xh_appointment', 'xh_attendance', 'xh_audit', 'xh_cms', 'xh_config', 'xh_datashare', 'xh_es', 'xh_exam', 'xh_file', 'xh_inst', 'xh_iot', 'xh_lims', 'xh_message', 'xh_oauth', 'xh_opsm', 'xh_practice', 'xh_process', 'xh_projm', 'xh_teach', 'xh_transcript', 'xh_user', 'xh_user_info', 'xh_visual', 'xh_xmgl', 'xhu_access', 'xhu_appointment', 'xhu_attendance', 'xhu_audit', 'xhu_config', 'xhu_customer', 'xhu_datashare', 'xhu_es', 'xhu_examserver', 'xhu_file_server', 'xhu_gengshang_cms', 'xhu_gvsun_cms', 'xhu_info_center', 'xhu_instruments', 'xhu_iot', 'xhu_jiaoshilou_cms', 'xhu_lab_cms', 'xhu_lims', 'xhu_lubanlou_cms', 'xhu_message', 'xhu_oauth2', 'xhu_practice', 'xhu_process', 'xhu_studentinnovation', 'xhu_teach', 'xhu_transcript', 'xhu_usercenter', 'xhu_visual', 'xhu_xmgl', 'xiyi_audit', 'xiyi_cms', 'xiyi_datashare', 'xiyi_file_server', 'xiyi_lims', 'xiyi_oauth2', 'xiyi_teach', 'xiyi_tmp', 'xiyi_usercenter', 'xmgl', 'xmgl_tmp', 'xqsf_audit', 'xqsf_back_cms', 'xqsf_cms', 'xqsf_datashare', 'xqsf_datashare_zhongjianku', 'xqsf_file_server', 'xqsf_lab3_cms', 'xqsf_lab_cms', 'xqsf_lims', 'xqsf_mid_cms', 'xqsf_newlims', 'xqsf_oauth2', 'xqsf_teach', 'xqsf_tmp', 'xqsf_transcript', 'xqsf_usercenter', 'xuele_audit', 'xuele_cms', 'xuele_config', 'xuele_datashare', 'xuele_file_server', 'xuele_message', 'xuele_oauth2', 'xuele_process', 'xuele_teach', 'xuele_teach_info_center', 'xuele_transcript', 'xuele_usercenter', 'xuexihaojile_audit', 'xuexihaojile_cms', 'xuexihaojile_config', 'xuexihaojile_datashare', 'xuexihaojile_file_server', 'xuexihaojile_message', 'xuexihaojile_oauth2', 'xuexihaojile_process', 'xuexihaojile_teach', 'xuexihaojile_teach_info_center', 'xuexihaojile_transcript', 'xuexihaojile_usercenter', 'z_proc_temp', 'zuul']
# get_all_dbs=get_all_dbs.Mysql(host, user, passwd, port,db_name='')
# list_dbs_name=get_all_dbs.get_all_db()
test_db_name=['bedu_oauth2']
#hive连接
conn = hive_conn()
curr = conn.cursor()
#创建对应数据库
try:
for e in list_dbs_name:
create_hive_db=f"create database if not exists `{e}`"
curr.execute(create_hive_db)
except Exception as error:
print(error)
conn.close()
curr.close()
def get_hive_dbs():#获取hive内所有库
conn = hive_conn()
curr = conn.cursor()
curr.execute("show databases")
res = curr.fetchall()
curr.close()
conn.close()
new_list = []
for e in range(len(res)):
str1 = str(res[e]).split("\'")[1]
new_list.append(str1)
new_list.remove('default')
return new_list
def get_hive_db_tbs(db_name):#获取库内所有表 以list返回
conn = hive_conn()
curr = conn.cursor()
curr.execute(f"use `{db_name}`")
curr.execute("show tables")
res = curr.fetchall()
curr.close()
conn.close()
new_list = []
for e in range(len(res)):
str1 = str(res[e]).split("\'")[1]
new_list.append(str1)
return new_list
if __name__ == '__main__':
host = ""
user = ""
passwd = ""
port = 3306
print(get_hive_dbs())

浙公网安备 33010602011771号