The future of you, waiting for you in the future.

Geoffrey

Long, long the pathway to Cold Hill;
Drear, drear the waterside so chill.

返回顶部

pandas连接MySQL和impala

pandas连接MySQL和impala

SQL连接

read_sql(sql, 
		 con, 
		 index_col=None, 
		 coerce_float=True, 
		 params=None, 
		 parse_dates=None, 
		 columns=None, 
		 chunksize=None)

参数的意义:

  • sql: 为可执行的sql语句
  • con: 数据库的连接
  • index_col: 选择某一列作为index
  • coerce_float: 将数字形式的字符串直接以float型读入
  • params: 返回传递参数的查询字符串
  • parse_dates: 将某一列日期型字符串转换为datetime型数据,与pd.to_datetime函数功能类似。可以直接提供需要转换的列名以默认的日期形式转换,也可以用字典的格式提供列名和转换的日期格式,比如{column_name: format string}(format string:"%Y:%m:%H:%M:%S")
  • colunms: 要选取的列。一般没啥用,因为在sql命令里面一般就指定要选择的列了
  • chunksize: 如果提供了一个整数值,那么就会返回一个generator,每次输出的行数就是提供的值的大小。
import pandas as pd

# 方法一:用DBAPI构建数据库链接engine
import pymysql
conn = pymysql.connect(host='localhost',
                       user='root',
                       password='',
                       database='database_name')
df = pd.read_sql("select * from table_name",con=conn)


# 方法二:用sqlalchemy构建数据库链接engine
import sqlalchemy
from sqlalchemy import create_engine
#connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT, DATABASE)
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}'.format('root',
							'', 
							'localhost', 
							3306, 
							'nowcoderdb')
engine=create_engine(connect_info)
df2=pd.read_sql("select * from table_name",con=engine)

impala连接

# Impala 配置
import pandas as pd
from impala.dbapi import connect 
import decimal



def impala_data_get(isql, db_name, table_name):
    conn = connect(host=HOST, 
                    port=PORT,
                    user=USER,
                    password=PASSWOED,
                    auth_mechanism=AUTH_MECHANISM
                    )
    cur = conn.cursor()
    # cur.execute("use snap")
    # cur.execute("show tables")
    #     INVALIDATE METADATA 表名

    frech_isql = "refresh `" + db_name + "`.`" + table_name + "`"
    cur.execute(frech_isql)
    cur.execute(isql)
    # 获取字段名
    des = cur.description
    cols = [item[0] for item in des]
    # 获取数据
    data_list=cur.fetchall()
    # DataFrame    
    data = pd.DataFrame(data_list, columns=cols)
    return data, cols 

pandas 基础设置

详见

pd.set_option('display.max_rows',xxx) # 最大行数
pd.set_option('display.min_rows',xxx) # 最小显示行数
pd.set_option('display.max_columns',xxx) # 最大显示列数
pd.set_option ('display.max_colwidth',xxx) #最大列字符数
pd.set_option( 'display.precision',2) # 浮点型精度
pd.set_option('display.float_format','{:,}'.format) #逗号分隔数字
pd.set_option('display.float_format',  '{:,.2f}'.format) #设置浮点精度
pd.set_option('display.float_format', '{:.2f}%'.format) #百分号格式化
pd.set_option('plotting.backend', 'altair') # 更改后端绘图方式
pd.set_option('display.max_info_columns', 200) # info输出最大列数
pd.set_option('display.max_info_rows', 5) # info计数null时的阈值
pd.describe_option() #展示所有设置和描述
pd.reset_option('all') #重置所有设置选项

csv 以日期存储

import datetime 
import os 
# date_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S').split(' ')[0]
save_time = datetime.datetime.now().strftime('%Y_%m_%d').split(' ')[0]
# print(csv_name)
file_path = './data/' + str(save_time) +'/'
file_path 


outpath = ''
if not os.path.exists(file_path):
    os.mkdir(file_path)

绘图

# 添加转入转出指标
def spec_advertiser_id_cbti_time_series(id_from_df, all_dfs, min_data=0, max_data=100000, height=5, width=5):
    shape = id_from_df.shape[0]
    tmp = int(np.sqrt(shape))
    # print(shape)
    if np.floor(np.sqrt(shape)+0.5)!=np.sqrt(shape):
        tmp = tmp+1 
    else:
        tmp = tmp
    if height is None:
        height, width = tmp, tmp 
        
    top25_advertiser = list(id_from_df[:height*width].id)
    food_drinks_datas = all_dfs
    i = 0
    fig = plt.figure(figsize=(40*width/5,30*height/5))
    print({str(index): value for index, value in enumerate(top25_advertiser)})
    # print(top25_advertiser)
    required_datas = pd.DataFrame()
    for advertiser_id in top25_advertiser:
        spec_advertiser_datas = food_drinks_datas[food_drinks_datas.id==str(advertiser_id)].sort_values(
            by='etl_date')
        # print(spec_advertiser_datas.shape)
        # 取时间,时间化,并设置为列
        spec_advertiser_datas['etl_date'] = pd.to_datetime(spec_advertiser_datas['etl_date'])        
        spec_advertiser_datas.set_index("etl_date", inplace=True)
        
        # 补全所有时间
        dates=pd.date_range(spec_advertiser_datas.index.min(), spec_advertiser_datas.index.max())
        daily_new=spec_advertiser_datas.reindex(dates)
        # print(dates.shape)
        
        
        ax = fig.add_subplot(height, width, i+1)
        # lns1 = ax.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['cost'], color='green', label='cost')
        lns1 = ax.plot(daily_new.index, 
                       daily_new['cost'], 
                       color='green', 
                       label='cost')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)

        ax2 = ax.twinx()
        lns2 = ax2.plot(daily_new.index,
                        # spec_advertiser_datas['etl_date'], 
                        daily_new['balance'], 
                        color='red', 
                        label='balance')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)
        
        lns3 = ax2.scatter(daily_new.index,
                        # spec_advertiser_datas['etl_date'], 
                        daily_new['transfer_in'], 
                        color='orangered',
                        label='transfer_in',
                        alpha=0.3)

        lns4 = ax2.scatter(daily_new.index,
                        # spec_advertiser_datas['etl_date'],
                        daily_new['transfer_out'],
                        color='blue', 
                        label='transfer_out',
                        alpha=0.3)
        
        lns5 = ax2.scatter(daily_new.index,
                        # spec_advertiser_datas['etl_date'],
                        daily_new['income'],
                        color='black', 
                        label='income',
                        alpha=0.3)
        
        ax.set_xlabel("etl_date")
        ax.set_ylabel("cost")

        lns = lns1+lns2+lns3+lns4+lns5
        labs = [l.get_label() for l in lns]
        ax.legend(lns, labs, loc=2)
        
        ax.set_ylim(min_data, max_data)
        ax2.set_ylim(min_data, max_data)

        # ax.set_title('id' + str(i) + ": " + str(advertiser_id))
        ax.set_title(str(i) + ": " + str(advertiser_id) + "\ncreate_time: " + str(spec_advertiser_datas['create_time'].unique()))

        ax_ = plt.gca()
        x_major_locator=MultipleLocator(15)
        ax_.xaxis.set_major_locator(x_major_locator)

        i += 1 
        
        required_datas = required_datas.append(spec_advertiser_datas)

    plt.subplots_adjust(wspace=0.4, hspace=0.9)
    plt.show()
    return required_datas 

优化

# Impala 配置
import pandas as pd
from sqlalchemy import create_engine

from impala.dbapi import connect
import pymysql


'''
示例:
my_impala = ImpalaConnect(host='192.186.30.11', port=21050)
my_sql = 'select * from snap.crm_ec_cloud_mk_mk_sn_schedule_data'
my_df = my_impala.exe_query(my_sql)
'''


def get_query_table(sql):
    replace_list = ['\n', '(', ')', '*', '=', '`']
    for i in replace_list:
        sql = sql.replace(i, ' ')
    sql = sql.split()
    res = set()
    for i in range(1, len(sql)):
        if sql[i - 1].lower() in ['from', 'join'] and sql[i].lower() != 'select':
            res.add(sql[i])
    return list(res)


def refresh_table(f):
    def decorate(self, *args, **kwargs):
        tb_name = get_query_table(*args, **kwargs)
        for tb in tb_name:
            if tb:
                refresh_sql = 'refresh {}'.format(tb)
                self.just_exe(refresh_sql)
                # print('表:\t{}\t刷新完毕'.format(tb))
        res = f(self, *args, **kwargs)
        return res

    return decorate


class ImpalaConnect(object):
    def __init__(self, host, port, conn=None):
        self.host = host
        self.port = port
        self.conn = conn

    def get_connect(self):
        try:
            self.conn = connect(host=self.host, port=self.port)
            if self.conn:
                cur = self.conn.cursor()
                if not cur:
                    raise (NameError, '连接impala失败')
                else:
                    return cur
        except Exception as e:
            print(e)
            print('连接impala失败,host或者port有误')

    def just_exe(self, sql):
        cur = self.get_connect()
        try:
            cur.execute(sql)
            self.conn.close()
        except Exception as e:
            print(e)
            print('查询sql有误,请检测schema、表或者字段名是否有误')

    @refresh_table
    def exe_query(self, sql):
        cur = self.get_connect()
        try:
            cur.execute(sql)
            _des = cur.description
            _cols = [col[0] for col in _des]
            res_list = cur.fetchall()
            self.conn.close()
            return _cols, res_list
        except Exception as e:
            print(e)
            print('查询sql有误,请检测schema、表或者字段名是否有误')

    @refresh_table
    def exe_query_df(self, sql):
        try:
            self.conn = connect(host=self.host, port=self.port)
            if self.conn:
                df = pd.read_sql(sql, self.conn)
                self.conn.close()
                return df
        except Exception as e:
            print(e)
            print('连接impala失败,host或者port有误')


my_impala = ImpalaConnect(host='192.186.30.11', port=21050)


# my_sql = 'select * from snap.crm_ec_cloud_mk_mk_sn_schedule_data'
# my_df = my_impala.exe_query(my_sql)


class MysqlDB:
    def __init__(self, host, port, user, password, database, charset='utf8', **kwargs):
        self.host = host or 'localhost'
        self.port = int(port) or 3306
        if type(self.port) is not int:
            raise ValueError('端口应该是数值类型')
        self.user = user
        self.password = password
        self.database = database
        self.charset = charset or 'utf8'
        self._conn = None
        self._cur = None

    def get_connect(self):
        self._conn = pymysql.connect(
            host=self.host, port=self.port, user=self.user, password=self.password, database=self.database,
            charset=self.charset
        )
        if self._conn:
            try:
                self._cur = self._conn.cursor()
                if not self._cur:
                    raise (NameError, '连接mysql失败')
                return self._cur
            except pymysql.Error as e:
                print('mysql连接失败')
                print(e)

    def query_one(self, query, args=None):
        cur = self.get_connect()
        if cur:
            try:
                cur.execute(query, args)
                res_one = cur.fetchone()
                res_cols = [i[0] for i in cur.description]
                cur.close()
                self._conn.close()
                return res_cols, res_one
            except pymysql.Error as e:
                print('mysql连接失败')

    def query_all(self, query, args=None, one=False, is_df=False):
        cur = self.get_connect()
        try:
            cur.execute(query, args)
            if one:
                res = cur.fetchone()
            else:
                res = cur.fetchall()
            res_cols = [i[0] for i in cur.description]
            res_df = pd.DataFrame(list(res), columns=res_cols)
            cur.close()
            self._conn.close()
            if is_df:
                return res_df
            else:
                return res_cols, res
        except pymysql.Error as e:
            print(e)
    
    def delete(self, query, args=None):
        cur = self.get_connect()
        try:
            cur.execute(query, args)
            self._conn.commit()
            
            cur.close()
            self._conn.close()
        except pymysql.Error as e:
            print(e)

    def df_to_sql(self):
        return create_engine(
            f'mysql+pymysql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}?charset=utf8')


sql_conf = {'test': dict(name='rptuser',
                         pwd='666666',
                         host='192.168.15.1',
                         port='3306',
                         db='ecreport'),
            'product': dict(name='crmbdexport',
                            pwd='666666',
                            host='192.186.30.1',
                            port='3306',
                            db='ecreport')}




sql_env = sql_conf.get('product')
my_sql = MysqlDB(host=sql_env.get('host'),
                 port=sql_env.get('port'),
                 user=sql_env.get('name'),
                 password=sql_env.get('pwd'),
                 database=sql_env.get('db'))

posted @ 2021-07-06 14:45  Geoffreygau  阅读(599)  评论(0编辑  收藏  举报