pandas连接MySQL和impala
SQL连接
read_sql(sql,
con,
index_col=None,
coerce_float=True,
params=None,
parse_dates=None,
columns=None,
chunksize=None)
参数的意义:
- sql: 为可执行的sql语句
- con: 数据库的连接
- index_col: 选择某一列作为index
- coerce_float: 将数字形式的字符串直接以float型读入
- params: 传递给SQL查询语句的参数(列表或字典),用于参数化查询
- parse_dates: 将某一列日期型字符串转换为datetime型数据,与pd.to_datetime函数功能类似。可以直接提供需要转换的列名以默认的日期形式转换,也可以用字典的格式提供列名和转换的日期格式,比如{column_name: format string}(format string:"%Y:%m:%H:%M:%S")
- columns: 要选取的列。一般没啥用,因为在sql命令里面一般就指定要选择的列了
- chunksize: 如果提供了一个整数值,那么就会返回一个generator,每次输出的行数就是提供的值的大小。
# --- Option 1: plain DBAPI connection via pymysql ---
import pandas as pd
import pymysql

conn = pymysql.connect(host='localhost',
                       user='root',
                       password='',
                       database='database_name')
df = pd.read_sql("select * from table_name", con=conn)

# --- Option 2: SQLAlchemy engine (preferred by pandas) ---
import sqlalchemy
from sqlalchemy import create_engine

#connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT, DATABASE)
db_user, db_pass = 'root', ''
db_host, db_port, db_name = 'localhost', 3306, 'nowcoderdb'
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}'.format(db_user,
                                                       db_pass,
                                                       db_host,
                                                       db_port,
                                                       db_name)
engine = create_engine(connect_info)
df2 = pd.read_sql("select * from table_name", con=engine)
impala连接
# Impala 配置
import pandas as pd
from impala.dbapi import connect
import decimal
def impala_data_get(isql, db_name, table_name):
    """Run *isql* against Impala and return the result as a DataFrame.

    Refreshes `db_name`.`table_name` metadata first so newly loaded data
    is visible to the query.

    Parameters:
        isql: SQL statement to execute.
        db_name: database (schema) containing the table to refresh.
        table_name: table to refresh before querying.
    Returns:
        (data, cols): the result DataFrame and its column-name list.
    """
    # HOST/PORT/USER/AUTH_MECHANISM are module-level globals defined elsewhere.
    # NOTE(review): 'PASSWOED' looks like a typo of PASSWORD — confirm the
    # global's actual name before renaming.
    conn = connect(host=HOST,
                   port=PORT,
                   user=USER,
                   password=PASSWOED,
                   auth_mechanism=AUTH_MECHANISM)
    try:
        cur = conn.cursor()
        # Refresh table metadata (cheaper than INVALIDATE METADATA).
        refresh_sql = "refresh `" + db_name + "`.`" + table_name + "`"
        cur.execute(refresh_sql)
        cur.execute(isql)
        # Column names come from the cursor description.
        cols = [item[0] for item in cur.description]
        data_list = cur.fetchall()
        data = pd.DataFrame(data_list, columns=cols)
        return data, cols
    finally:
        # fixed: the connection previously leaked on any exception
        conn.close()
pandas 基础设置
# Common pandas display options (xxx = desired value).
pd.set_option('display.max_rows',xxx) # max rows displayed
pd.set_option('display.min_rows',xxx) # min rows shown when output is truncated
pd.set_option('display.max_columns',xxx) # max columns displayed
pd.set_option ('display.max_colwidth',xxx) # max characters shown per column
pd.set_option( 'display.precision',2) # floating-point display precision
pd.set_option('display.float_format','{:,}'.format) # thousands separator
pd.set_option('display.float_format', '{:,.2f}'.format) # thousands separator + 2 decimals
pd.set_option('display.float_format', '{:.2f}%'.format) # percent-style formatting
pd.set_option('plotting.backend', 'altair') # switch the plotting backend
pd.set_option('display.max_info_columns', 200) # max columns covered by df.info()
pd.set_option('display.max_info_rows', 5) # row threshold for null counting in df.info()
pd.describe_option() # list all options with their descriptions
pd.reset_option('all') # reset every option to its default
csv 以日期存储
import datetime
import os

# Build a dated output directory such as './data/2024_01_31/' for CSV dumps.
# (A commented-out variant used yesterday's date:
#  (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d'))
save_time = datetime.datetime.now().strftime('%Y_%m_%d')
file_path = './data/' + str(save_time) + '/'
outpath = ''
# fixed: os.mkdir fails when the parent './data' does not exist;
# makedirs with exist_ok=True creates parents and is race-free.
os.makedirs(file_path, exist_ok=True)
绘图
# 添加转入转出指标
def spec_advertiser_id_cbti_time_series(id_from_df, all_dfs, min_data=0, max_data=100000, height=5, width=5):
    """Plot per-advertiser daily time series on a grid of subplots.

    For each of the first height*width ids in *id_from_df*, plots 'cost'
    (left axis) plus 'balance', 'transfer_in', 'transfer_out' and 'income'
    (right axis) over a gap-filled daily date range.

    Parameters:
        id_from_df: DataFrame with an 'id' column, ordered by interest.
        all_dfs: DataFrame holding the series; needs columns id, etl_date,
                 cost, balance, transfer_in, transfer_out, income, create_time.
        min_data, max_data: shared y-axis limits for both axes.
        height, width: subplot grid shape; pass height=None to auto-size
                       a near-square grid from id_from_df's length.
    Returns:
        DataFrame: concatenation of every per-advertiser slice plotted.
    """
    shape = id_from_df.shape[0]
    # Near-square grid size: round sqrt up unless it already lands exactly.
    tmp = int(np.sqrt(shape))
    if np.floor(np.sqrt(shape) + 0.5) != np.sqrt(shape):
        tmp = tmp + 1
    if height is None:
        height, width = tmp, tmp
    top25_advertiser = list(id_from_df[:height * width].id)
    food_drinks_datas = all_dfs
    i = 0
    fig = plt.figure(figsize=(40 * width / 5, 30 * height / 5))
    # Echo the subplot-index -> advertiser-id mapping for the reader.
    print({str(index): value for index, value in enumerate(top25_advertiser)})
    frames = []  # per-advertiser slices, concatenated at the end
    for advertiser_id in top25_advertiser:
        spec_advertiser_datas = food_drinks_datas[food_drinks_datas.id == str(advertiser_id)].sort_values(
            by='etl_date')
        # Parse the date column and promote it to the index.
        spec_advertiser_datas['etl_date'] = pd.to_datetime(spec_advertiser_datas['etl_date'])
        spec_advertiser_datas.set_index("etl_date", inplace=True)
        # Reindex over the full daily range so gaps show up as missing points.
        dates = pd.date_range(spec_advertiser_datas.index.min(), spec_advertiser_datas.index.max())
        daily_new = spec_advertiser_datas.reindex(dates)
        ax = fig.add_subplot(height, width, i + 1)
        lns1 = ax.plot(daily_new.index,
                       daily_new['cost'],
                       color='green',
                       label='cost')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)
        ax2 = ax.twinx()
        lns2 = ax2.plot(daily_new.index,
                        daily_new['balance'],
                        color='red',
                        label='balance')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)
        lns3 = ax2.scatter(daily_new.index,
                           daily_new['transfer_in'],
                           color='orangered',
                           label='transfer_in',
                           alpha=0.3)
        lns4 = ax2.scatter(daily_new.index,
                           daily_new['transfer_out'],
                           color='blue',
                           label='transfer_out',
                           alpha=0.3)
        lns5 = ax2.scatter(daily_new.index,
                           daily_new['income'],
                           color='black',
                           label='income',
                           alpha=0.3)
        ax.set_xlabel("etl_date")
        ax.set_ylabel("cost")
        # fixed: Axes.scatter returns a single PathCollection, not a list,
        # so the old `lns1+lns2+lns3+...` raised TypeError — wrap scatters.
        lns = lns1 + lns2 + [lns3, lns4, lns5]
        labs = [l.get_label() for l in lns]
        ax.legend(lns, labs, loc=2)
        ax.set_ylim(min_data, max_data)
        ax2.set_ylim(min_data, max_data)
        ax.set_title(str(i) + ": " + str(advertiser_id) + "\ncreate_time: " + str(spec_advertiser_datas['create_time'].unique()))
        ax_ = plt.gca()
        # One major x tick every 15 days keeps the labels legible.
        x_major_locator = MultipleLocator(15)
        ax_.xaxis.set_major_locator(x_major_locator)
        i += 1
        frames.append(spec_advertiser_datas)
    plt.subplots_adjust(wspace=0.4, hspace=0.9)
    plt.show()
    # fixed: DataFrame.append was removed in pandas 2.0 — use pd.concat.
    required_datas = pd.concat(frames) if frames else pd.DataFrame()
    return required_datas
优化
# Impala 配置
import pandas as pd
from sqlalchemy import create_engine
from impala.dbapi import connect
import pymysql
'''
示例:
my_impala = ImpalaConnect(host='192.186.30.11', port=21050)
my_sql = 'select * from snap.crm_ec_cloud_mk_mk_sn_schedule_data'
my_df = my_impala.exe_query(my_sql)
'''
def get_query_table(sql):
replace_list = ['\n', '(', ')', '*', '=', '`']
for i in replace_list:
sql = sql.replace(i, ' ')
sql = sql.split()
res = set()
for i in range(1, len(sql)):
if sql[i - 1].lower() in ['from', 'join'] and sql[i].lower() != 'select':
res.add(sql[i])
return list(res)
def refresh_table(f):
    """Decorator for query methods taking (self, sql, ...).

    Before calling *f*, issues 'refresh <table>' via self.just_exe for
    every table referenced by the SQL, so Impala sees newly loaded data.
    """
    from functools import wraps

    @wraps(f)  # fixed: preserve the wrapped method's name/docstring
    def decorate(self, *args, **kwargs):
        for tb in get_query_table(*args, **kwargs):
            if tb:
                refresh_sql = 'refresh {}'.format(tb)
                self.just_exe(refresh_sql)
        return f(self, *args, **kwargs)
    return decorate
class ImpalaConnect(object):
    """Thin Impala client: opens a fresh connection per call and closes it.

    Errors are reported via print and swallowed (best-effort style kept
    from the original); failed calls return None.
    """

    def __init__(self, host, port, conn=None):
        # host/port of the impalad endpoint; conn caches the live connection.
        self.host = host
        self.port = port
        self.conn = conn

    def get_connect(self):
        """Open a new connection and return its cursor, or None on failure."""
        try:
            self.conn = connect(host=self.host, port=self.port)
            if self.conn:
                cur = self.conn.cursor()
                if not cur:
                    # fixed: `raise (NameError, msg)` raised a tuple,
                    # which is itself a TypeError in Python 3.
                    raise NameError('连接impala失败')
                return cur
        except Exception as e:
            print(e)
            print('连接impala失败,host或者port有误')

    def just_exe(self, sql):
        """Execute *sql* for its side effect only (e.g. refresh)."""
        cur = self.get_connect()
        if cur is None:  # fixed: avoid AttributeError after failed connect
            return
        try:
            cur.execute(sql)
        except Exception as e:
            print(e)
            print('查询sql有误,请检测schema、表或者字段名是否有误')
        finally:
            # fixed: connection previously leaked when execute raised
            self.conn.close()

    @refresh_table
    def exe_query(self, sql):
        """Execute *sql* and return (column names, row tuples)."""
        cur = self.get_connect()
        if cur is None:
            return
        try:
            cur.execute(sql)
            _cols = [col[0] for col in cur.description]
            res_list = cur.fetchall()
            return _cols, res_list
        except Exception as e:
            print(e)
            print('查询sql有误,请检测schema、表或者字段名是否有误')
        finally:
            self.conn.close()

    @refresh_table
    def exe_query_df(self, sql):
        """Execute *sql* and return the result as a pandas DataFrame."""
        try:
            self.conn = connect(host=self.host, port=self.port)
            if self.conn:
                try:
                    return pd.read_sql(sql, self.conn)
                finally:
                    # fixed: close even when read_sql raises
                    self.conn.close()
        except Exception as e:
            print(e)
            print('连接impala失败,host或者port有误')
# Shared Impala client for the snippets below; host/port are environment-specific.
my_impala = ImpalaConnect(host='192.186.30.11', port=21050)
# Example usage:
# my_sql = 'select * from snap.crm_ec_cloud_mk_mk_sn_schedule_data'
# my_df = my_impala.exe_query(my_sql)
class MysqlDB:
    """Small pymysql helper: one fresh connection per query, closed after use.

    Errors are printed and swallowed (best-effort style kept from the
    original); failed calls return None.
    """

    def __init__(self, host, port, user, password, database, charset='utf8', **kwargs):
        self.host = host or 'localhost'
        # fixed: `int(port) or 3306` raised TypeError when port was None;
        # the follow-up `type(self.port) is not int` check was dead code.
        self.port = int(port) if port else 3306
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset or 'utf8'
        self._conn = None
        self._cur = None

    def get_connect(self):
        """Open a fresh connection and return its cursor (None on failure)."""
        self._conn = pymysql.connect(
            host=self.host, port=self.port, user=self.user, password=self.password,
            database=self.database, charset=self.charset
        )
        if self._conn:
            try:
                self._cur = self._conn.cursor()
                if not self._cur:
                    # fixed: `raise (NameError, msg)` raised a tuple (TypeError in py3)
                    raise NameError('连接mysql失败')
                return self._cur
            except pymysql.Error as e:
                print('mysql连接失败')
                print(e)

    def query_one(self, query, args=None):
        """Run *query* and return (column names, first row) or None."""
        cur = self.get_connect()
        if cur:
            try:
                cur.execute(query, args)
                res_one = cur.fetchone()
                res_cols = [i[0] for i in cur.description]
                return res_cols, res_one
            except pymysql.Error as e:
                print('mysql连接失败')
            finally:
                # fixed: cursor/connection previously leaked on error
                cur.close()
                self._conn.close()

    def query_all(self, query, args=None, one=False, is_df=False):
        """Run *query*; return a DataFrame when is_df, else (cols, rows).

        one=True fetches a single row instead of all rows.
        """
        cur = self.get_connect()
        if cur is None:
            return
        try:
            cur.execute(query, args)
            res = cur.fetchone() if one else cur.fetchall()
            res_cols = [i[0] for i in cur.description]
            if is_df:
                return pd.DataFrame(list(res), columns=res_cols)
            return res_cols, res
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            self._conn.close()

    def delete(self, query, args=None):
        """Run a mutating statement (DELETE/UPDATE/...) and commit it."""
        cur = self.get_connect()
        if cur is None:
            return
        try:
            cur.execute(query, args)
            self._conn.commit()
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            self._conn.close()

    def df_to_sql(self):
        """Return a SQLAlchemy engine for DataFrame.to_sql-style writes."""
        # fixed: honor self.charset instead of the hard-coded utf8
        return create_engine(
            f'mysql+pymysql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}?charset={self.charset}')
# Connection profiles for the two MySQL environments.
sql_conf = {
    'test': {
        'name': 'rptuser',
        'pwd': '666666',
        'host': '192.168.15.1',
        'port': '3306',
        'db': 'ecreport',
    },
    'product': {
        'name': 'crmbdexport',
        'pwd': '666666',
        'host': '192.186.30.1',
        'port': '3306',
        'db': 'ecreport',
    },
}

# Select the production profile and build the shared client.
sql_env = sql_conf.get('product')
my_sql = MysqlDB(host=sql_env.get('host'),
                 port=sql_env.get('port'),
                 user=sql_env.get('name'),
                 password=sql_env.get('pwd'),
                 database=sql_env.get('db'))