Pandas VS SQL

环境说明

  • MySQL版本:5.7.28
  • Pandas版本:1.0.1
  • Python版本:3.8.2

数据准备

MySQL:

create table tab_name(
    idx int primary key,
    dt  datetime,
    msg varchar(50)
);

insert into tab_name(idx,dt,msg) values(1,str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s'),'A1一');
insert into tab_name(idx,dt,msg) values(2,str_to_date('2020-01-02 12:23:45','%Y-%m-%d %H:%i:%s'),'B2二');
insert into tab_name(idx,dt,msg) values(3,str_to_date('2020-01-03 13:23:45','%Y-%m-%d %H:%i:%s'),'C3三');
insert into tab_name(idx,dt,msg) values(4,str_to_date('2020-01-04 14:23:45','%Y-%m-%d %H:%i:%s'),'D4四');
insert into tab_name(idx,dt,msg) values(5,str_to_date('2020-01-05 15:23:45','%Y-%m-%d %H:%i:%s'),'E5五');
insert into tab_name(idx,dt,msg) values(6,str_to_date('2020-01-06 16:23:45','%Y-%m-%d %H:%i:%s'),'F6六');
insert into tab_name(idx,dt,msg) values(7,str_to_date('2020-01-07 17:23:45','%Y-%m-%d %H:%i:%s'),'G7七');
insert into tab_name(idx,dt,msg) values(8,str_to_date('2020-01-08 18:23:45','%Y-%m-%d %H:%i:%s'),'H8八');
insert into tab_name(idx,dt,msg) values(9,str_to_date('2020-01-09 19:23:45','%Y-%m-%d %H:%i:%s'),'I9九');

mysql> select * from tab_name;
+-----+---------------------+------+
| idx | dt                  | msg  |
+-----+---------------------+------+
|   1 | 2020-01-01 11:23:45 | A1一  |
|   2 | 2020-01-02 12:23:45 | B2二  |
|   3 | 2020-01-03 13:23:45 | C3三  |
|   4 | 2020-01-04 14:23:45 | D4四  |
|   5 | 2020-01-05 15:23:45 | E5五  |
|   6 | 2020-01-06 16:23:45 | F6六  |
|   7 | 2020-01-07 17:23:45 | G7七  |
|   8 | 2020-01-08 18:23:45 | H8八  |
|   9 | 2020-01-09 19:23:45 | I9九  |
+-----+---------------------+------+
9 rows in set (0.00 sec)

Pandas:

import pandas as pd
from datetime import datetime

df = pd.DataFrame({
    'idx': [1,2,3,4,5,6,7,8,9],
    'dt': [
        datetime(2020,1,1,11,23,45),
        datetime(2020,1,2,12,23,45),
        datetime(2020,1,3,13,23,45),
        datetime(2020,1,4,14,23,45),
        datetime(2020,1,5,15,23,45),
        datetime(2020,1,6,16,23,45),
        datetime(2020,1,7,17,23,45),
        datetime(2020,1,8,18,23,45),
        datetime(2020,1,9,19,23,45),
    ],
    'msg': ['A1一','B2二','C3三','D4四','E5五','F6六','G7七','H8八','I9九']
})

# df数据
   idx                  dt  msg
0    1 2020-01-01 11:23:45  A1一
1    2 2020-01-02 12:23:45  B2二
2    3 2020-01-03 13:23:45  C3三
3    4 2020-01-04 14:23:45  D4四
4    5 2020-01-05 15:23:45  E5五
5    6 2020-01-06 16:23:45  F6六
6    7 2020-01-07 17:23:45  G7七
7    8 2020-01-08 18:23:45  H8八
8    9 2020-01-09 19:23:45  I9九


数据过滤

单值过滤

mysql> select
    ->     idx,
    ->     dt,
    ->     msg
    -> from tab_name
    -> where idx>5;
+-----+---------------------+-------+
| idx | dt                  | msg   |
+-----+---------------------+-------+
|   6 | 2020-01-06 16:23:45 | F6六  |
|   7 | 2020-01-07 17:23:45 | G7七  |
|   8 | 2020-01-08 18:23:45 | H8八  |
|   9 | 2020-01-09 19:23:45 | I9九  |
+-----+---------------------+-------+
4 rows in set (0.00 sec)
print(df.loc[lambda x: x['idx'] > 5])

   idx                  dt  msg
5    6 2020-01-06 16:23:45  F6六
6    7 2020-01-07 17:23:45  G7七
7    8 2020-01-08 18:23:45  H8八
8    9 2020-01-09 19:23:45  I9九

多值过滤

mysql> select
    ->     idx,
    ->     dt,
    ->     msg
    -> from tab_name
    -> where idx in (2,4,6);
+-----+---------------------+-------+
| idx | dt                  | msg   |
+-----+---------------------+-------+
|   2 | 2020-01-02 12:23:45 | B2二  |
|   4 | 2020-01-04 14:23:45 | D4四  |
|   6 | 2020-01-06 16:23:45 | F6六  |
+-----+---------------------+-------+
3 rows in set (0.00 sec)
print(df.loc[lambda x: x['idx'].isin([2, 4, 6])])

   idx                  dt  msg
5    6 2020-01-06 16:23:45  F6六
6    7 2020-01-07 17:23:45  G7七
7    8 2020-01-08 18:23:45  H8八
8    9 2020-01-09 19:23:45  I9九


类型操作

数值与字符串

数值转字符串

mysql> select
    ->     idx,
    ->     cast(idx as char) as idx_char
    -> from tab_name;
+-----+----------+
| idx | idx_char |
+-----+----------+
|   1 | 1        |
|   2 | 2        |
|   3 | 3        |
|   4 | 4        |
|   5 | 5        |
|   6 | 6        |
|   7 | 7        |
|   8 | 8        |
|   9 | 9        |
+-----+----------+
9 rows in set (0.00 sec)
df['idx_char'] = df['idx'].apply(lambda x: str(x))
print(df[['idx', 'idx_char']])

   idx idx_char
0    1        1
1    2        2
2    3        3
3    4        4
4    5        5
5    6        6
6    7        7
7    8        8
8    9        9

字符串转数值

mysql> select
    ->     idx,
    ->     cast('1' as signed) as string_to_int
    -> from tab_name;
+-----+---------------+
| idx | string_to_int |
+-----+---------------+
|   1 |             1 |
|   2 |             1 |
|   3 |             1 |
|   4 |             1 |
|   5 |             1 |
|   6 |             1 |
|   7 |             1 |
|   8 |             1 |
|   9 |             1 |
+-----+---------------+
9 rows in set (0.00 sec)
df['string_to_int'] = int('1')
print(df[['string_to_int']])

   string_to_int
0              1
1              1
2              1
3              1
4              1
5              1
6              1
7              1
8              1

日期与字符串

日期转字符串

mysql> select
    ->     idx,
    ->     dt,
    ->     date_format(dt,'%Y-%m-%d %H:%i:%s') as datetime_to_string
    -> from tab_name;
+-----+---------------------+---------------------+
| idx | dt                  | datetime_to_string  |
+-----+---------------------+---------------------+
|   1 | 2020-01-01 11:23:45 | 2020-01-01 11:23:45 |
|   2 | 2020-01-02 12:23:45 | 2020-01-02 12:23:45 |
|   3 | 2020-01-03 13:23:45 | 2020-01-03 13:23:45 |
|   4 | 2020-01-04 14:23:45 | 2020-01-04 14:23:45 |
|   5 | 2020-01-05 15:23:45 | 2020-01-05 15:23:45 |
|   6 | 2020-01-06 16:23:45 | 2020-01-06 16:23:45 |
|   7 | 2020-01-07 17:23:45 | 2020-01-07 17:23:45 |
|   8 | 2020-01-08 18:23:45 | 2020-01-08 18:23:45 |
|   9 | 2020-01-09 19:23:45 | 2020-01-09 19:23:45 |
+-----+---------------------+---------------------+
9 rows in set (0.00 sec)
df['datetime_to_string'] = df['dt'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
print(df[['idx','dt','datetime_to_string']])

   idx                  dt   datetime_to_string
0    1 2020-01-01 11:23:45  2020-01-01 11:23:45
1    2 2020-01-02 12:23:45  2020-01-02 12:23:45
2    3 2020-01-03 13:23:45  2020-01-03 13:23:45
3    4 2020-01-04 14:23:45  2020-01-04 14:23:45
4    5 2020-01-05 15:23:45  2020-01-05 15:23:45
5    6 2020-01-06 16:23:45  2020-01-06 16:23:45
6    7 2020-01-07 17:23:45  2020-01-07 17:23:45
7    8 2020-01-08 18:23:45  2020-01-08 18:23:45
8    9 2020-01-09 19:23:45  2020-01-09 19:23:45

字符串转日期

mysql> select
    ->     idx,
    ->     str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s') as string_to_datetime
    -> from tab_name;
+-----+---------------------+
| idx | string_to_datetime  |
+-----+---------------------+
|   1 | 2020-01-01 11:23:45 |
|   2 | 2020-01-01 11:23:45 |
|   3 | 2020-01-01 11:23:45 |
|   4 | 2020-01-01 11:23:45 |
|   5 | 2020-01-01 11:23:45 |
|   6 | 2020-01-01 11:23:45 |
|   7 | 2020-01-01 11:23:45 |
|   8 | 2020-01-01 11:23:45 |
|   9 | 2020-01-01 11:23:45 |
+-----+---------------------+
9 rows in set (0.00 sec)
df['string_to_datetime'] = datetime.strptime('2020-01-01 11:23:45','%Y-%m-%d %H:%M:%S')
print(df[['idx','string_to_datetime']])

   idx  string_to_datetime
0    1 2020-01-01 11:23:45
1    2 2020-01-01 11:23:45
2    3 2020-01-01 11:23:45
3    4 2020-01-01 11:23:45
4    5 2020-01-01 11:23:45
5    6 2020-01-01 11:23:45
6    7 2020-01-01 11:23:45
7    8 2020-01-01 11:23:45
8    9 2020-01-01 11:23:45

字符串截取

mysql> select
    ->     idx,
    ->     msg,
    ->     substr(msg,3,1) as substring
    -> from tab_name;
+-----+-------+-----------+
| idx | msg   | substring |
+-----+-------+-----------+
|   1 | A1一  | 一        |
|   2 | B2二  | 二        |
|   3 | C3三  | 三        |
|   4 | D4四  | 四        |
|   5 | E5五  | 五        |
|   6 | F6六  | 六        |
|   7 | G7七  | 七        |
|   8 | H8八  | 八        |
|   9 | I9九  | 九        |
+-----+-------+-----------+
9 rows in set (0.00 sec)
df['substring'] = df['msg'].apply(lambda x: x[2:3])
print(df[['idx','msg','substring']])

   idx  msg substring
0    1  A1一         一
1    2  B2二         二
2    3  C3三         三
3    4  D4四         四
4    5  E5五         五
5    6  F6六         六
6    7  G7七         七
7    8  H8八         八
8    9  I9九         九

字符串拼接

mysql> select
    ->     idx,
    ->     msg,
    ->     concat(msg,' + AaC') as concat_string
    -> from tab_name;
+-----+-------+---------------+
| idx | msg   | concat_string |
+-----+-------+---------------+
|   1 | A1一  | A1一 + AaC    |
|   2 | B2二  | B2二 + AaC    |
|   3 | C3三  | C3三 + AaC    |
|   4 | D4四  | D4四 + AaC    |
|   5 | E5五  | E5五 + AaC    |
|   6 | F6六  | F6六 + AaC    |
|   7 | G7七  | G7七 + AaC    |
|   8 | H8八  | H8八 + AaC    |
|   9 | I9九  | I9九 + AaC    |
+-----+-------+---------------+
9 rows in set (0.00 sec)
df['concat_string'] = df['msg'].apply(lambda x: x + ' + AaC')
print(df[['idx','msg','concat_string']])

   idx  msg concat_string
0    1  A1一     A1一 + AaC
1    2  B2二     B2二 + AaC
2    3  C3三     C3三 + AaC
3    4  D4四     D4四 + AaC
4    5  E5五     E5五 + AaC
5    6  F6六     F6六 + AaC
6    7  G7七     G7七 + AaC
7    8  H8八     H8八 + AaC
8    9  I9九     I9九 + AaC

字符串替换

mysql> select
    ->     idx,
    ->     msg,
    ->     replace(msg,idx,'@') as replace_string
    -> from tab_name;
+-----+-------+----------------+
| idx | msg   | replace_string |
+-----+-------+----------------+
|   1 | A1一  | A@一           |
|   2 | B2二  | B@二           |
|   3 | C3三  | C@三           |
|   4 | D4四  | D@四           |
|   5 | E5五  | E@五           |
|   6 | F6六  | F@六           |
|   7 | G7七  | G@七           |
|   8 | H8八  | H@八           |
|   9 | I9九  | I@九           |
+-----+-------+----------------+
9 rows in set (0.00 sec)
# 方法一
def func(source, old, new):
    return source.replace(old, new)

df['replace_string'] = df.apply(lambda x: func(x['msg'], str(x['idx']), '@'), axis=1)
print(df[['idx', 'msg', 'replace_string']])

   idx  msg replace_string
0    1  A1一            A@一
1    2  B2二            B@二
2    3  C3三            C@三
3    4  D4四            D@四
4    5  E5五            E@五
5    6  F6六            F@六
6    7  G7七            G@七
7    8  H8八            H@八
8    9  I9九            I@九


# 方法二
import re

df['replace_string'] = df.apply(lambda x: re.sub(str(x['idx']), '@', x['msg']), axis=1)
print(df[['idx', 'msg', 'replace_string']])

   idx  msg replace_string
0    1  A1一            A@一
1    2  B2二            B@二
2    3  C3三            C@三
3    4  D4四            D@四
4    5  E5五            E@五
5    6  F6六            F@六
6    7  G7七            G@七
7    8  H8八            H@八
8    9  I9九            I@九


列操作

增加列

mysql> select
    ->     idx,
    ->     msg,
    ->     concat(msg, '_new') as new_msg
    -> from tab_name;
+-----+-------+-----------+
| idx | msg   | new_msg   |
+-----+-------+-----------+
|   1 | A1一  | A1一_new  |
|   2 | B2二  | B2二_new  |
|   3 | C3三  | C3三_new  |
|   4 | D4四  | D4四_new  |
|   5 | E5五  | E5五_new  |
|   6 | F6六  | F6六_new  |
|   7 | G7七  | G7七_new  |
|   8 | H8八  | H8八_new  |
|   9 | I9九  | I9九_new  |
+-----+-------+-----------+
9 rows in set (0.00 sec)
df['new_msg'] = df['msg'].apply(lambda x: x + '_new')
print(df[['idx','msg','new_msg']])

   idx  msg  new_msg
0    1  A1一  A1一_new
1    2  B2二  B2二_new
2    3  C3三  C3三_new
3    4  D4四  D4四_new
4    5  E5五  E5五_new
5    6  F6六  F6六_new
6    7  G7七  G7七_new
7    8  H8八  H8八_new
8    9  I9九  I9九_new

Case语句

mysql> select
    ->     idx,
    ->     msg,
    ->     case substr(msg,1,1)
    ->         when 'A' then 'AAA'
    ->         when 'B' then 'BBB'
    ->         else 'ZZZ'
    ->     end as case_msg
    -> from tab_name;
+-----+-------+----------+
| idx | msg   | case_msg |
+-----+-------+----------+
|   1 | A1一  | AAA      |
|   2 | B2二  | BBB      |
|   3 | C3三  | ZZZ      |
|   4 | D4四  | ZZZ      |
|   5 | E5五  | ZZZ      |
|   6 | F6六  | ZZZ      |
|   7 | G7七  | ZZZ      |
|   8 | H8八  | ZZZ      |
|   9 | I9九  | ZZZ      |
+-----+-------+----------+
9 rows in set (0.01 sec)
def func(msg):
    s = msg[0:1]
    if s == 'A':
        return 'AAA'
    elif s == 'B':
        return 'BBB'
    else:
        return 'ZZZ'

df['case_msg'] = df['msg'].apply(func)
print(df[['idx','msg','case_msg']])

   idx  msg case_msg
0    1  A1一      AAA
1    2  B2二      BBB
2    3  C3三      ZZZ
3    4  D4四      ZZZ
4    5  E5五      ZZZ
5    6  F6六      ZZZ
6    7  G7七      ZZZ
7    8  H8八      ZZZ
8    9  I9九      ZZZ


Group By

mysql> select
    ->     idx%3 as idx_mod,
    ->     count(idx) as idx_count,
    ->     sum(idx) as idx_sum
    -> from tab_name
    -> group by idx%3
    -> order by idx_mod;
+---------+-----------+---------+
| idx_mod | idx_count | idx_sum |
+---------+-----------+---------+
|       0 |         3 |      18 |
|       1 |         3 |      12 |
|       2 |         3 |      15 |
+---------+-----------+---------+
3 rows in set (0.00 sec)
import numpy as np

df['idx_mod'] = df['idx'].apply(lambda x: x%3)
grouped = df.groupby('idx_mod')
print(grouped.agg(idx_count=('idx','count'), idx_sum=('idx','sum')).sort_values(by='idx_mod', ascending=True))
# 上面的等价写法如下:
# print(grouped['idx'].agg(['count', 'sum']).rename(columns={'count': 'idx_count', 'sum': 'idx_sum'}).sort_values(by='idx_mod', ascending=True))

         idx_count  idx_sum
idx_mod
0                3       18
1                3       12
2                3       15
posted @ 2020-12-03 16:32  数据狐  阅读(100)  评论(0编辑  收藏  举报