pandas学习

#载入包
import pandas as pd 
import numpy as np

一、文件导入导出

读取文件

1、读取txt文件或csv文件

import pandas as pd
import numpy as np

df1=pd.read_csv('D:/personal_file/python/code/births1881.txt',
               sep=',', # field delimiter; a comma is already the default
               header=None, # the file has no header row, so the first line is data
               names=['Names','Births'], # supply the column names explicitly
               index_col='Names'# use the Names column as the row index (rows stay in file order; nothing is sorted)
        
               )

df1.head() #查看前几行数据

	Births
Names
Mel	67
Mel	603
Mary	455
John	588
Mary	406

2、读取excel文件

df2=pd.read_excel('D:/personal_file/python/code/births1882.xlsx'
                  # optional: pass index_col='StatusDate' to use that column as the row index
                 )

df2.head() # the leading 0-4 column is the automatically generated row index

	State	Status	CustomerCount	StatusDate
0	TX	2	547	2009-01-05
1	GA	1	903	2009-01-12
2	fl	3	152	2009-01-19
3	NY	1	414	2009-01-26
4	TX	3	436	2009-02-02

3、从Mysql读取文件

import os

import pymysql

# Connection settings come from the environment so that credentials are not
# stored in the notes. Host and user fall back to the values used previously;
# the password has no fallback and must be provided through MYSQL_PASSWORD.
conn=pymysql.connect(
    host=os.environ.get('MYSQL_HOST','rm-2vc58d2kt0b47p8zruo.mysql.cn-chengdu.rds.aliyuncs.com')
    ,user=os.environ.get('MYSQL_USER','user')
    ,password=os.environ['MYSQL_PASSWORD']
    )
sql_query="select id from test.biz_requirement limit 2"

try:
    # Run the query and load the result set into a DataFrame.
    mysql_page=pd.read_sql(sql_query,con=conn)
finally:
    # Release the server connection even when the query fails.
    conn.close()

mysql_page

	id
0	94616755141521896
1	94623970686575221

导出文件

保存为csv文件

# Export df to a CSV file
df.to_csv('D:/personal_file/python/code/births1880.csv',
          index=False,# do not write the row index
          header=False, # do not write the column names
          # optional: encoding='gbk' (author's note: set an encoding when the data came from a UTF-8 database, otherwise Chinese text is garbled)
         )

保存为excel文件

# Export df to an Excel workbook
df.to_excel('D:/personal_file/python/code/births1882.xlsx',
            index=False # do not write the row index
           )

二、 Pandas数据结构

pandas包含两种数据类型：series和dataframe。

series是一维数据，相当于一行或者一列。
dataframe是二维表，相当于多行多列

创建series的3种方法：列表、数组、字典

pd.Series(list('abc'),name="字母") # method 1: from a list; name labels the Series

0    a
1    b
2    c
Name: 字母, dtype: object

pd.Series([1,'a',2]) # method 2 (listed as "array" above); note the values here are passed as a plain list of mixed types

0    1
1    a
2    2
dtype: object

ser1=pd.Series({'a':1,'b':2,'c':3}) # method 3: from a dict; the keys become the index labels

ser1

a    1
b    2
c    3
dtype: int64

# Putting the three inputs together
mylist = list('abcedfghijklmnopqrstuvwxyz')   # list of labels (note that 'e' precedes 'd')
myarr = np.arange(26)                         # array of values 0..25
# Pair every label with the value at the same position
mydict = {label: value for label, value in zip(mylist, myarr)}
ser3 = pd.Series(mydict)                      # dict keys become the index

ser3.head()

a    0
b    1
c    2
e    3
d    4
dtype: int64

series操作

1、命名索引列名称
2、series转换为dataframe
3、索引列转换为dataframe的列
4、垂直和水平的拼接series
5、多个series组成dataframe
6、如何判断series对象A中是否包含series对象B的元素
7、如何获得seriesA和seriesB不相同的项（补集思想）
8、求数值series的四分位数
9、获取series中给定索引的元素（items）
10、获取series对象A中包含series对象B元素的位置
11、使series中每个元素的首字母为大写
12、计算series中每个元素长度
13、如何用出现次数最少的字符替换空格符

# Sample Series with labels a-c
ser=pd.Series({'a':1,'b':2,'c':3})
ser

a    1
b    2
c    3
dtype: int64

# 1. Give the Series a name (shown as "Name:" when it is displayed)
ser.name = 'alphabets' 
ser

a    1
b    2
c    3
Name: alphabets, dtype: int64

# 2. Convert the Series into a one-column DataFrame
df=ser.to_frame()
df

	alphabets
a	1
b	2
c	3

# 3. Turn the index into a regular column (df is modified in place)
df.reset_index(inplace=True)
df.head()

	index	alphabets
0	a	1
1	b	2
2	c	3

# 4. Stack two Series vertically and side by side
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical stacking (comparable to SQL UNION ALL): the result is a Series
df = pd.concat([ser1, ser2], axis='index')
print(df)
# Horizontal stacking: the result is a DataFrame with one column per Series
df = pd.concat([ser1, ser2], axis='columns')
print(df)

# 5. Build a DataFrame from several Series
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Option 1: concatenate column-wise (axis 1 = columns, axis 0 = rows)
df = pd.concat([ser1, ser2], axis='columns')
# Option 2: unlike option 1, the column names are set explicitly
df = pd.DataFrame(dict(col1=ser1, col2=ser2))
df.head()

# 6. Test which elements of one Series occur in another
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Boolean Series: True where the ser1 element is also present in ser2
ser3 = ser1.isin(ser2)
ser3

# Inverted mask: True where the ser1 element is absent from ser2
ser3 = ~ser3
ser1.loc[ser3] # compute the mask first, then select from ser1 with it

0    1
1    2
2    3
dtype: int64

# 7. Items that appear in only one of two Series
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Union of the two value sets
ser_u = pd.Series(np.union1d(ser1, ser2))
# Intersection of the two value sets
ser_i = pd.Series(np.intersect1d(ser1, ser2))
# Removing the intersection from the union leaves the items that are not shared
ser_u.loc[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

# 8. Minimum, quartiles and maximum of a numeric Series
ser1 = pd.Series(range(1,10))
np.percentile(ser1, [0, 25, 50, 75, 100])

array([1., 3., 5., 7., 9.])

# 9. Pick elements of a Series by integer position
ser = pd.Series([chr(code) for code in range(ord('a'), ord('z') + 1)])

ser.take([0])   # a single position
ser.take([-1])  # the last position

index = [0, 4, 8, 14, 20]
ser.take(index) # several positions at once

0     a
4     e
8     i
14    o
20    u
dtype: object

# 10. Find the position in ser1 of every element of ser2
# Every element of ser2 must occur in ser1, otherwise the lookup raises an error
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
# Option 1 (not run here): compare each element against ser1 with np.where
# and take the first matching position.
# Option 2: look each element up in an Index built from ser1. The Index is
# built once, outside the comprehension, because it does not change per element.
ser1_index = pd.Index(ser1)
[ser1_index.get_loc(i) for i in ser2]

[5, 4, 0, 8]

 #方法2等价写法
for i in ser2:
    print(pd.Index(ser1).get_loc(i))

# 11. Capitalise the first letter of every element (the elements are str)
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
# Option 1: apply str.title to each element
ser.map(str.title)
# Option 2: upper-case the first character and append the remainder
ser.map(lambda word: word[0].upper() + word[1:])
# Option 3: list comprehension wrapped back into a Series
pd.Series([word.title() for word in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

# 12. Length of every element
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
# Apply the built-in len to each element
ser.map(len)

0    3
1    2
2    4
3    4
dtype: int64

# 13. Replace spaces with the least frequent character
ser = pd.Series(list('dbc deb abed gade'))
# Frequency of every character; value_counts sorts from most to least frequent by default
freq = ser.value_counts()
print(freq)
# The last label of the sorted table is a least frequent character
least_freq = freq.dropna().index[-1]
print(least_freq)
# Substitute the spaces and join the characters back into one string
"".join(ser.replace({' ': least_freq}).tolist())

d    4
b    3
e    3
     3
a    2
c    1
g    1
dtype: int64
g





'dbcgdebgabedggade'

创建dataframe

import numpy as np
import pandas as pd

# Six consecutive days starting 2013-01-01, used as the row index
dates = pd.date_range('20130101', periods=6)
# 6x4 table of standard-normal random numbers
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

	A	B	C	D
2013-01-01	-1.192893	-0.359633	1.360742	0.208753
2013-01-02	0.855963	0.332799	-0.018560	-0.701360
2013-01-03	0.052533	1.941321	-0.431581	-0.244977
2013-01-04	0.044910	-1.490302	1.109744	0.263806
2013-01-05	-0.436184	-0.619776	0.676728	0.076502
2013-01-06	-0.981972	1.362772	-1.428714	0.298718

df.info() # print a summary of the DataFrame: index, columns, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 400.0 bytes

# 4x4 table of uniform random numbers with letter row labels and digit column labels
df1 = pd.DataFrame(
    data=np.random.rand(4, 4),
    index=['A', 'B', 'C', 'D'],
    columns=['1', '2', '3', '4'],
)
df1

	1	2	3	4
A	0.604314	0.522725	0.092801	0.458008
B	0.248015	0.835771	0.538854	0.004363
C	0.464301	0.000974	0.102777	0.958989
D	0.937618	0.145729	0.369612	0.957957

# DataFrame from a nested list: each inner list is one row
df2 = pd.DataFrame(
    [[first + offset for offset in range(4)] for first in range(1, 5)],
    columns=['A', 'B', 'C', 'D'],  # column names
)
df2

	A	B	C	D
0	1	2	3	4
1	2	3	4	5
2	3	4	5	6
3	4	5	6	7

# DataFrame from a dict: each key is a column, each list holds that column's values
dic1 = dict(
    name=['小明', '小红', '狗蛋', '铁柱'],
    age=[17, 20, 5, 40],
    sex=['男', '女', '女', '男'],
)
df3 = pd.DataFrame(dic1, index=['A', 'B', 'C', 'D'])  # row labels

df3

	name	age	sex
A	小明	17	男
B	小红	20	女
C	狗蛋	5	女
D	铁柱	40	男

# DataFrame from a list of (name, births) tuples
names = ['Bob','Jessica','Mary','John','Mel']
births = [968, 155, 77, 578, 973]
BabyDataSet = [(baby, count) for baby, count in zip(names, births)]
df4 = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])
df4

	Names	Births
0	Bob	968
1	Jessica	155
2	Mary	77
3	John	578
4	Mel	973

dataframe操作

查看数据

# Inspecting data (df, df1, df2 and df3 from the creation examples must exist first)
 # selecting one column returns a Series whose index is the row index;
 # selecting one row returns a Series whose index is the column names;
 # selecting several rows/columns returns a DataFrame.
print('查看dataframe的详细信息:', df.info() ) # info() prints its own report and returns None, so "None" follows the label
print('查看行数:', df1.shape[0] )
print('查看列数:', df1.shape[1] )
print('type查看单列数据类型:\n', type(df3['age']) ) # a Series
print('type查看多列数据类型:\n', type(df3[['sex','age']]) ) # a DataFrame
print('查看所有列数据类型:\n',df1.dtypes)
print('看前两行:\n', df1.head(2) ) # head() defaults to 5 rows
print('看后两行:\n', df1.tail(2) )
print('查看行索引:\n', df1.index )
print('columns看列名:\n', df1.columns )
print('查看age列数据:\n', df3['age'] )
print('查看age列数据:\n', df3.age )
print('查看多列数据:\n', df3[['sex','age']] )
print('查看1-3行:\n',df[0:3] ) # positional slice: rows at positions 0, 1, 2
print('查看指定行:\n',df['20130102':'20130104'] ) # slice by index labels; both end dates are included
# Using df.loc[] and df.iloc[]
  # df.loc selects by row/column labels; it can read and also assign
  # df.iloc selects by integer row/column positions
  # both take the row selector first and the column selector second
print('使用iloc进行行检索,查看行索引位置为0的行:\n', df3.iloc[0] ) # first row
print('查看多行的数据，查看行索引为1-3的行:\n', df3.iloc[1:3] ) # positions 1 and 2; the end position is excluded
print('查看datee索引行的第1行:\n',df.loc[dates[0]] )
print('查看dates行的第1行A列:\n',df.loc[dates[0],'A'] )
print('查看dates行的第1行A列:\n',df.at[dates[0],'A'] ) # same cell as the line above
print('查看A、B列的所有行:\n',df.loc[:,['A','B']] ) # ":" means all rows; the row selector cannot be left out when columns are given
print('查看A、B列的指定行:\n',df.loc['20130102',['A','B']] )
print('查看A、B列的指定行:\n',df.loc['20130102':'20130104',['A','B']] )
print('查看A列到C列之间的指定行:\n',df.loc['20130102':'20130104','A':'C'] )
print('筛选A列大于0的数据:\n',df.loc[df['A']>0,:] ) # df['A']>0 is itself a boolean Series
print('查询年龄大于15且性别为男的数据:\n',
      df3.loc[(df3['age']>=15)&(df3['sex']=='男'),:] # combine conditions with &; note the test is >= 15
     ) 
# Filtering directly with a boolean condition
 # build the condition first, then index the frame with it
print('筛选A列大于0的数据:\n',df[df['A']>0] ) # same result as the .loc form; the trailing ":" column selector can be omitted
print('筛选sex列含"男"和"非"的数据:\n',
      df3[df3['sex'].isin(['男','非'])]  # isin() takes a list and tests each element of the column for membership
     )
print('筛选性别为男的数据:\n',df3[df3['sex']=='男'] )
print('返回所有大于0的数据，若小于0返回缺失值NaN:\n',df[df>0] )

print('查看所有数据值:\n', df.values ) # the values as an array
print('查看age列的数据值:\n', df3['age'].values ) # the values as an array
print('设置name列作为行索引:\n',df3.set_index('name',inplace=True) ) # name must still be a column at this point; with inplace=True the call returns None, which is what gets printed
print('进行转置:\n', df3.T) # row labels become column names

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes
查看dataframe的详细信息: None
查看行数: 4
查看列数: 4
type查看单列数据类型:
 <class 'pandas.core.series.Series'>
type查看多列数据类型:
 <class 'pandas.core.frame.DataFrame'>
查看所有列数据类型:
 1    float64
2    float64
3    float64
4    float64
dtype: object
看前两行:
           1         2         3         4
A  0.604314  0.522725  0.092801  0.458008
B  0.248015  0.835771  0.538854  0.004363
看后两行:
           1         2         3         4
C  0.464301  0.000974  0.102777  0.958989
D  0.937618  0.145729  0.369612  0.957957
查看行索引:
 Index(['A', 'B', 'C', 'D'], dtype='object')
columns看列名:
 Index(['1', '2', '3', '4'], dtype='object')
查看age列数据:
 A    17
B    20
C     5
D    40
Name: age, dtype: int64
查看age列数据:
 A    17
B    20
C     5
D    40
Name: age, dtype: int64
查看多列数据:
   sex  age
A   男   17
B   女   20
C   女    5
D   男   40
查看1-3行:
                    A         B         C         D
2013-01-01 -1.192893 -0.359633  1.360742  0.208753
2013-01-02  0.855963  0.332799 -0.018560 -0.701360
2013-01-03  0.052533  1.941321 -0.431581 -0.244977
查看指定行:
                    A         B         C         D
2013-01-02  0.855963  0.332799 -0.018560 -0.701360
2013-01-03  0.052533  1.941321 -0.431581 -0.244977
2013-01-04  0.044910 -1.490302  1.109744  0.263806
使用iloc进行行检索,查看行索引位置为0的行:
 name    小明
age     17
sex      男
Name: A, dtype: object
查看多行的数据，查看行索引为1-3的行:
   name  age sex
B   小红   20   女
C   狗蛋    5   女
查看datee索引行的第1行:
 A   -1.192893
B   -0.359633
C    1.360742
D    0.208753
Name: 2013-01-01 00:00:00, dtype: float64
查看dates行的第1行A列:
 -1.1928925959361503
查看dates行的第1行A列:
 -1.1928925959361503
查看A、B列的所有行:
                    A         B
2013-01-01 -1.192893 -0.359633
2013-01-02  0.855963  0.332799
2013-01-03  0.052533  1.941321
2013-01-04  0.044910 -1.490302
2013-01-05 -0.436184 -0.619776
2013-01-06 -0.981972  1.362772
查看A、B列的指定行:
 A    0.855963
B    0.332799
Name: 2013-01-02 00:00:00, dtype: float64
查看A、B列的指定行:
                    A         B
2013-01-02  0.855963  0.332799
2013-01-03  0.052533  1.941321
2013-01-04  0.044910 -1.490302
查看A列到C列之间的指定行:
                    A         B         C
2013-01-02  0.855963  0.332799 -0.018560
2013-01-03  0.052533  1.941321 -0.431581
2013-01-04  0.044910 -1.490302  1.109744
筛选A列大于0的数据:
                    A         B         C         D
2013-01-02  0.855963  0.332799 -0.018560 -0.701360
2013-01-03  0.052533  1.941321 -0.431581 -0.244977
2013-01-04  0.044910 -1.490302  1.109744  0.263806
查询年龄大于15且性别为男的数据:
   name  age sex
A   小明   17   男
D   铁柱   40   男
筛选A列大于0的数据:
                    A         B         C         D
2013-01-02  0.855963  0.332799 -0.018560 -0.701360
2013-01-03  0.052533  1.941321 -0.431581 -0.244977
2013-01-04  0.044910 -1.490302  1.109744  0.263806
筛选sex列含"男"和"非"的数据:
   name  age sex
A   小明   17   男
D   铁柱   40   男
筛选性别为男的数据:
   name  age sex
A   小明   17   男
D   铁柱   40   男
返回所有大于0的数据，若小于0返回缺失值NaN:
                    A         B         C         D
2013-01-01       NaN       NaN  1.360742  0.208753
2013-01-02  0.855963  0.332799       NaN       NaN
2013-01-03  0.052533  1.941321       NaN       NaN
2013-01-04  0.044910       NaN  1.109744  0.263806
2013-01-05       NaN       NaN  0.676728  0.076502
2013-01-06       NaN  1.362772       NaN  0.298718
查看所有数据值:
 [[-1.1928926  -0.35963294  1.36074179  0.20875278]
 [ 0.85596322  0.33279946 -0.01856033 -0.70136001]
 [ 0.0525335   1.94132053 -0.43158128 -0.24497683]
 [ 0.04490992 -1.49030186  1.10974384  0.26380623]
 [-0.43618383 -0.61977617  0.67672776  0.07650155]
 [-0.9819717   1.36277173 -1.42871436  0.29871825]]
查看age列的数据值:
 [17 20  5 40]
设置name列作为行索引:
 None
进行转置:
 name  小明  小红 狗蛋  铁柱
age   17  20  5  40
sex    男   女  女   男

三、 Pandas数据查询

1、使用单个标签值查询数据; 2、使用值列表查询; 3、使用区间查询; 4、使用条件表达式查询; 5、调用函数查询

# Reload the births file, this time keeping the default integer row index
df1=pd.read_csv('D:/personal_file/python/code/births1881.txt',
                names=['Names','Births'] # supply the column names explicitly
               )

df1.head()

	Names	Births
0	Mel	67
1	Mel	603
2	Mary	455
3	John	588
4	Mary	406

df1.loc[2,'Births'] # 1: look up one cell by row label and column label

df1.loc[2:4,'Names'] # row-label slice 2:4 of one column (both ends included); this is a range, not a value list

2    Mary
3    John
4    Mary
Name: Names, dtype: object

df1.loc[[2,3,4],'Names'] # 2: look up rows by an explicit list of labels

2    Mary
3    John
4    Mary
Name: Names, dtype: object

df1.loc[2:4,'Names':'Births']# 3: look up by label ranges on both rows and columns
# NOTE(review): the output recorded below shows only the Names column, which does not match this expression — confirm

2    Mary
3    John
4    Mary
Name: Names, dtype: object

df1.loc[2:4,'Names':'Births'] # rows 2 through 4 and columns Names through Births, all bounds included

	Names	Births
2	Mary	455
3	John	588
4	Mary	406

# 4: look up rows with a boolean condition
df1.loc[df1["Births"]>=999,:] # rows whose Births is greater than or equal to 999 (single condition)

	Names	Births
579	John	999

df1.loc[(df1["Births"]>=995)&(df1["Names"]=='Mary'),:] # rows with Births >= 995 and Names == 'Mary'; conditions are combined with &

	Names	Births
153	Mary	996
857	Mary	997

df1.loc[lambda x :(x["Births"]>=995)&(x["Names"]=='Mary'),:] # 5: pass a function; it receives the frame and returns the boolean mask

	Names	Births
153	Mary	996
857	Mary	997

四、修改数据、新增列、删除列

1、直接赋值；2、df.apply方法；3、df.assign方法；4、按条件分组分别赋值;5、其他方法

import pandas as pd
# Load the tab-separated weather file
df2=pd.read_csv('D:/personal_file/python/code/天气.txt',sep='\t')

df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018年01月01日	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
1	2018年01月02日	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
2	2018年01月03日	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
4	2018年01月05日	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

1.直接赋值法

# Modify columns

# Strip the "度" suffix from bwendu and ywendu and convert both to integers.
# The result is assigned straight back to the column. The same right-hand
# side can instead be assigned through df2.loc[:, "bwendu"], but only one of
# the two forms may be run: once the column holds integers, the .str accessor
# used on the right-hand side is no longer available and a second pass fails.
df2["bwendu"]=df2["bwendu"].str.replace("度","").astype('int32')
df2["ywendu"]=df2["ywendu"].str.replace("度","").astype('int32')

df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1

df2.dtypes

ymd          object
bwendu        int32
ywendu        int32
tianqi       object
fengxiang    object
fengli       object
aqi           int64
aqlinfo      object
aqilevel      int64
dtype: object

# Add a column

# New column wencha = bwendu - ywendu
 # form 1: assign through .loc
df2.loc[:,"wencha"] = df2["bwendu"]-df2["ywendu"]
 # form 2: assign by column name (same values as form 1)
df2["wencha"] = df2["bwendu"]-df2["ywendu"]

df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel	wencha
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3	6
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2	4
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1	5
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1	2
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1	5

2 df.apply方法

Apply a function along an axis of the DataFrame. Objects passed to the function are Series objects whose index is either the DataFrame's index (axis=0) or the DataFrame's columns (axis=1).

#新增列
#条件判断作为新增列
def get_wendu_type(x):
    """Label one record by its temperature readings.

    Args:
        x: Object indexable by column name (for example a DataFrame row)
            that provides "bwendu" and, when needed, "ywendu".

    Returns:
        '高温' when bwendu is at least 2; otherwise '低温' when ywendu is
        at most -5; otherwise '常温'.
    """
    # The bwendu test takes priority; ywendu is only read when it fails.
    if x["bwendu"]>=2:
        return '高温'
    return '低温' if x["ywendu"]<=-5 else '常温'
df2.loc[:,"wendu_type"]=df2.apply(get_wendu_type,axis=1) # axis=1 passes each row to the function
df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel	wencha	wendu_type
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3	6	高温
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2	4	高温
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1	5	低温
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1	2	常温
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1	5	低温

3 df.assign方法

Assign new columns to a DataFrame.
Returns a new object with all original columns in addition to new ones.
Existing columns that are re-assigned will be overwritten.

# Add columns
# assign() can add several columns at once; the result is a new frame that is not stored here
# Both lambdas apply x * 9 / 5 + 32: temp_f to bwendu and temp_k to ywendu
# (despite its name, temp_k uses the same formula as temp_f)
df2.assign(temp_f=lambda x:x['bwendu'] * 9 / 5 + 32,
           temp_k=lambda x:x['ywendu'] * 9 / 5 + 32
         )

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel	wencha	wendu_type	temp_f	temp_k
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3	6	高温	37.4	26.6
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2	4	高温	35.6	28.4
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1	5	低温	32.0	23.0
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1	2	常温	33.8	30.2
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1	5	低温	30.2	21.2
5	2018年01月06日	2018-01-06	-2	-7	小雨	西南风	1-2级	50	良	1	5	低温	28.4	19.4

4 按条件选择分组判断并分别赋值

# Add a column
df2["wencha_cd"]="" # create the column first, filled with empty strings
df2.loc[df2["bwendu"]-df2["ywendu"]>5,"wencha_cd"]="温差大" # rows whose difference is above 5
df2.loc[df2["bwendu"]-df2["ywendu"]<=5,"wencha_cd"]="温差小" # rows whose difference is 5 or less
df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel	wencha	wendu_type	wencha_cd
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3	6	高温	温差大
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2	4	高温	温差小
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1	5	低温	温差小
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1	2	常温	温差小
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1	5	低温	温差小

5 其他方法

# Add a column from a Series
dates = pd.date_range('20130101', periods=6)  # six days starting 2013-01-01
df = pd.DataFrame(
    data=np.random.randn(6, 4),   # random numbers
    index=dates,                  # row labels
    columns=['A', 'B', 'C', 'D'], # column names
)
df

# The new column's own index starts one day later, on 2013-01-02
s1 = pd.Series(range(1, 7), index=pd.date_range('20130102', periods=6))
df['F'] = s1  # add column F from s1
df

	A	B	C	D	F
2013-01-01	2.046441	-1.328544	-0.623193	-0.001910	NaN
2013-01-02	0.691444	0.824626	-0.074844	-0.242671	1.0
2013-01-03	-2.648910	-0.461332	1.348693	-0.151167	2.0
2013-01-04	0.433959	-1.901356	-0.986606	0.948850	3.0
2013-01-05	1.114743	-0.470343	-1.076789	-0.112336	4.0
2013-01-06	-1.604482	-0.962977	-1.054972	0.455335	5.0

# Build the frame from a dict of columns
dic1 = dict(
    name=['小明', '小红', '狗蛋', '铁柱'],
    age=[17, 20, 5, 40],
    sex=['男', '女', '女', '男'],
)
df3 = pd.DataFrame(dic1, index=['A', 'B', 'C', 'D'])
# Add columns
df3['count'] = ['one', 'one', 'two', 'three']                 # appended as the last column
df3.insert(loc=3, column='skin', value=['b', 'w', 'w', 'y'])  # placed at position 3, i.e. after the third column
df3

	name	age	sex	skin	count
A	小明	17	男	b	one
B	小红	20	女	w	one
C	狗蛋	5	女	w	two
D	铁柱	40	男	y	three

# Insert a row
# The new row needs one value per column. df3 has five columns at this point
# (name, age, sex, skin, count); supplying only four values is what raised
# the error here before.
df3.loc['E'] = ['小花', 12, '男', 'y', 'four']
df3

# Delete a column
 # remove the sex column
del df3['sex'] # changes df3 itself; running it a second time raises an error
df3

	name	age	skin	count
A	小明	17	b	one
B	小红	20	w	one
C	狗蛋	5	w	two
D	铁柱	40	y	three

五、pandas统计函数

1、汇总类统计，用于统计数字类型列;
2、按列逐行累加
3、唯一去重和按值分类，用于分类或枚举列；
4、相关系数和协方差:
a.协方差：衡量同向反向程度；如果协方差为正，说明X,Y同向变化，协方差越大说明同向程度越高；如果协方差为负，说明X,Y反向变化，协方差越小说明反向程度越高
b.协方差矩阵：假如某个观测样本有p个维度，计算每个维度同所有维度之间的协方差，则会形成一个pXp的矩阵，矩阵的每个数是其相应维度之间的协方差，这个矩阵就称为协方差矩阵
c.相关系数：衡量相似度程度，分正相关和负相关

import pandas as pd
# Load the tab-separated weather file
df2=pd.read_csv('D:/personal_file/python/code/天气.txt',sep='\t')
df2.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018年01月01日	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
1	2018年01月02日	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
2	2018年01月03日	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
4	2018年01月05日	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

# 1. Summary statistics
df2.describe() # count, mean, standard deviation, min, quartiles and max of the numeric columns

	bwendu	ywendu	aqi	aqilevel
count	6.000000	6.000000	6.000000	6.00000
mean	0.500000	-4.000000	45.666667	1.50000
std	1.870829	2.366432	14.403703	0.83666
min	-2.000000	-7.000000	28.000000	1.00000
25%	-0.750000	-5.750000	33.250000	1.00000
50%	0.500000	-4.000000	49.500000	1.00000
75%	1.750000	-2.250000	56.750000	1.75000
max	3.000000	-1.000000	60.000000	3.00000

# Aggregations such as min / max / mean / sum
 # per column, using min as the example
df2['aqi'].min() # smallest value of one column
df2['aqi'].min(0) 
 # per row, using min as the example
df2.min(1) # axis 1: one minimum per row
# NOTE(review): the output recorded below has a date index, unlike df2's 0..5 index; it looks like it came from the random-number frame df — confirm

2013-01-01   -1.089183
2013-01-02   -0.383638
2013-01-03   -0.639936
2013-01-04    1.226136
2013-01-05   -1.942171
2013-01-06   -1.302461
Freq: D, dtype: float64

# Mean down each column
df2.mean() # mean of each column (the recorded output lists the numeric columns only)

bwendu       0.500000
ywendu      -4.000000
aqi         45.666667
aqilevel     1.500000
dtype: float64

# Mean of a single column
df2["bwendu"].mean()

0.5

# Range of each column: maximum minus minimum
print(df)
df.apply(lambda x: x.max()-x.min()) # the lambda receives one column at a time

                   A         B         C         D
2013-01-01 -1.189561  0.604377 -1.240541 -1.517069
2013-01-02  1.058069 -1.490177  0.106553  0.687971
2013-01-03  0.146353 -2.402185  0.319810 -1.694133
2013-01-04  1.030632 -1.562393  0.274369 -0.038280
2013-01-05  0.536428  0.145011 -0.645631  0.163246
2013-01-06 -0.594192  0.991625  0.517435  1.259324





A    2.247630
B    3.393810
C    1.757977
D    2.953457
dtype: float64

# 2. Running totals down each column
dic1 = {'name': ['小明', '小红', '狗蛋', '铁柱'],
        'age': [17, 20, 5, 40],
        'sex': ['男', '女', '女', '男']}  # build the frame from a dict
df3 = pd.DataFrame(dic1, 
                   index=list('ABCD')
                  )
print(df3)
df3.apply(np.cumsum) # cumulative sum per column; in the recorded output the string columns are concatenated as well

  name  age sex
A   小明   17   男
B   小红   20   女
C   狗蛋    5   女
D   铁柱   40   男

	name	age	sex
A	小明	17	男
B	小明小红	37	男女
C	小明小红狗蛋	42	男女女
D	小明小红狗蛋铁柱	82	男女女男

# 3. Distinct values of a column
df2["tianqi"].unique()

array(['晴~多云', '多云', '阴', '多云-晴', '小雨'], dtype=object)

   # Count by value
df2["fengxiang"].value_counts() # number of occurrences of each distinct value in the column

东北风    3
西南风    1
西北风    1
北风     1
Name: fengxiang, dtype: int64

# 4. Covariance matrix
# Select the numeric columns explicitly: recent pandas versions no longer
# drop text columns automatically and raise an error instead.
df2.select_dtypes(include='number').cov()

	bwendu	ywendu	aqi	aqilevel
bwendu	3.5	3.6	1.200000	1.3
ywendu	3.6	5.6	-11.400000	0.8
aqi	1.2	-11.4	207.466667	6.0
aqilevel	1.3	0.8	6.000000	0.7

   # Correlation matrix
# Select the numeric columns explicitly: recent pandas versions no longer
# drop text columns automatically and raise an error instead.
df2.select_dtypes(include='number').corr()

	bwendu	ywendu	aqi	aqilevel
bwendu	1.000000	0.813157	0.044532	0.830540
ywendu	0.813157	1.000000	-0.334454	0.404061
aqi	0.044532	-0.334454	1.000000	0.497884
aqilevel	0.830540	0.404061	0.497884	1.000000

# Heat map of the correlation matrix
import matplotlib.pyplot as plt
import seaborn as sns  # imported here because sns is used in this cell

figure, ax = plt.subplots(figsize=(12, 12))
# Correlate the numeric columns only; text columns cannot be correlated.
sns.heatmap(df2.select_dtypes(include='number').corr(), square=True, annot=True, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x280f97643d0>

png

# Correlation coefficient between two individual columns
df2["bwendu"].corr(df2["ywendu"])

0.8131571126147262

# Correlation coefficient between two individual numeric Series
df2["aqi"].corr(df2["bwendu"]-df2["ywendu"]) # aqi against the difference bwendu - ywendu

0.6346285123340253

# Pairwise relationship plots with seaborn
import seaborn as sns
sns.pairplot(df2) # author's note: one figure with 16 subplots, each showing one dimension against another; the main diagonal shows the distribution of each dimension
sns.pairplot(df2 , hue ='tianqi') # same figure, with the points coloured by the value of the tianqi column

C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\seaborn\distributions.py:283: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 26228 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 22810 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 20113 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 38452 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 23567 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:214: RuntimeWarning: Glyph 38632 missing from current font.
  font.set_text(s, 0.0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 26228 missing from current font.
  font.set_text(s, 0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 22810 missing from current font.
  font.set_text(s, 0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 20113 missing from current font.
  font.set_text(s, 0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 38452 missing from current font.
  font.set_text(s, 0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 23567 missing from current font.
  font.set_text(s, 0, flags=flags)
C:\Users\18308\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:183: RuntimeWarning: Glyph 38632 missing from current font.
  font.set_text(s, 0, flags=flags)





<seaborn.axisgrid.PairGrid at 0x280f862dd30>

png

六、缺失值处理

1、isnull和notnull：探索是否是空值，可用于df和series
2、dropna:丢弃、删除缺失值
3、fillna:填充空值："ffill" 表示用前一个非空值填充；"bfill" 表示用后一个非空值填充；

df2.head()

	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
1	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
2	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
4	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

pd.isnull(df2) # element-wise test for missing values over the whole frame; the result is boolean
df2.isnull()   # same as the line above
df2["ymd"].isnull() # the same test for a single column

df2.notnull()# element-wise test for non-missing values over the whole frame
df2["ymd"].notnull() # the same test for a single column

	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	False	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False
5	False	False	False	False	False	False	False	False	False

# Rows whose tianqi value is missing
df2.loc[df2["tianqi"].isnull(),:]
# Rows whose tianqi value is present
df2.loc[df2["tianqi"].notnull(),:]

# Drop the columns in which every value is missing
df2.dropna(axis="columns", # what to drop: 0 or "index" for rows (the default), 1 or "columns" for columns
           how="all", # "all": drop only when every value is missing; "any": drop when any value is missing
           inplace=True # True modifies df2 itself; otherwise a new DataFrame is returned
          )

# Replace the missing values of one column with 0
df2.fillna({"bwendu":0}) # form 1: returns a new frame, which is not stored here
df2.loc[:,"bwendu"]=df2["bwendu"].fillna(0)  # form 2: write the filled column back into df2

# Fill the missing values of one column from the neighbouring rows
# Series.ffill() carries the last non-missing value forward (bfill() carries
# the next one backward). It replaces fillna(method="ffill"), whose `method`
# argument is deprecated in recent pandas versions.
df2.loc[:,"bwendu"]=df2["bwendu"].ffill()

dates = pd.date_range('20130101', periods=6)  # six days starting 2013-01-01
df = pd.DataFrame(
    data=np.random.randn(6, 4),   # random numbers
    index=dates,                  # row labels
    columns=['A', 'B', 'C', 'D'], # column names
)

# reindex() can change, add or remove labels on the chosen axes
# Keep the first four rows and add a new, empty column E
df1 = df.reindex(
    index=dates[0:4],
    columns=[*df.columns, 'E'],
)
# Set E to 1 in the first two rows
df1.loc[dates[0:2], 'E'] = 1
df1.loc[dates[0]:dates[1], 'E'] = 1  # label slice; same result as the line above

# Drop the rows that contain a missing value
df1.dropna(how='any')  # 'any': a single missing value is enough to drop the row

# Fill the missing values
df1.fillna(value=5)  # with a constant
df1.fillna(5)  # same as the line above

	A	B	C	D	E
2013-01-01	-0.636970	-0.327551	-0.231448	-1.089183	1.0
2013-01-02	-0.077707	-0.383638	-0.104501	0.272184	1.0
2013-01-03	-0.501988	0.393708	-0.639936	-0.075200	5.0
2013-01-04	2.695484	1.472324	1.226136	1.627496	5.0

七、pandas数据排序

sort_values/sort_index可以对series和dataframe进行排序，以dataframe为例：
1、按照指定值大小顺序排列:
DataFrame.sort_values(by=str or list, #对单列或多列排序
ascending=bool or list, #是否升序
inplace=False) #是否修改原始数据；
2、按照索引大小顺序排列
DataFrame.sort_index(axis=1, #指定排序的轴
ascending=False, #是否升序
ignore_index=False) #是否重新生成索引

#载入包
import pandas as pd 
import numpy as np

# Load the tab-separated weather file
df3=pd.read_csv('D:/personal_file/python/code/天气.txt',sep='\t')
df3.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018年01月01日	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
1	2018年01月02日	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
2	2018年01月03日	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
4	2018年01月05日	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

按照指定值大小顺序排列 sort_values()

# Sort a single column
df3["aqi"].sort_values(ascending=True, # True sorts in ascending order
                       inplace=False # False returns a sorted copy and leaves the column unchanged
                      )

2    28
3    28
1    49
5    50
0    59
4    60
Name: aqi, dtype: int64

# Sort the DataFrame by aqi ascending, breaking ties by aqilevel descending.
df3.sort_values(by=["aqi","aqilevel"], # columns to sort by, in priority order
                ascending=[True,False],
                inplace=False
               )

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
2	2018年01月03日	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
1	2018年01月02日	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
5	2018年01月06日	2018-01-06	-2度	-7度	小雨	西南风	1-2级	50	良	1
0	2018年01月01日	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
4	2018年01月05日	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

按照索引大小顺序排列 sort_index()

# axis=1 sorts the column labels themselves; the row order is left untouched.
# With ascending=False the column names come out in descending order, which is
# why the result starts with "ywendu" and ends with "aqi".
df3.sort_index(axis=1,# axis to sort along: 0 = row index, 1 = column labels
               ascending=False, # True sorts in ascending order
               ignore_index=False # True would relabel the sorted axis 0..n-1
              )

	ywendu	ymd	tianqi	fengxiang	fengli	dates	bwendu	aqlinfo	aqilevel	aqi
0	-3度	2018-01-01	晴~多云	东北风	1-2级	2018年01月01日	3度	良	3	59
1	-2度	2018-01-02	晴~多云	东北风	1-2级	2018年01月02日	2度	优	2	49
2	-5度	2018-01-03	多云	北风	1-2级	2018年01月03日	0度	差	1	28
3	-1度	2018-01-04	阴	东北风	1-2级	2018年01月04日	1度	良	1	28
4	-6度	2018-01-05	多云-晴	西北风	1-2级	2018年01月05日	-1度	优	1	60
5	-7度	2018-01-06	小雨	西南风	1-2级	2018年01月06日	-2度	良	1	50

八、pandas字符串处理

1、使用方法：先获取series.str属性，然后在属性上调用函数。注意只能在字符串列上使用；
2、使用str的startswith、contains等bool类型的series可以做条件查询；
3、通常需要做多次str处理的链式操作；
4、可以使用正则表达式进行处理；注意从pandas 2.0起series.str.replace默认regex=False（按字面量匹配），需要显式传入regex=True才会按正则表达式匹配；
5、字符串处理英文参考文档：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

df3["bwendu"].str.replace("度","") # remove "度" from every value of the bwendu column

0     3
1     2
2     0
3     1
4    -1
5    -2
Name: bwendu, dtype: object

df3["ymd"].str.slice(0,7) # year and month: the first 7 characters (YYYY-MM)

0    2018-01
1    2018-01
2    2018-01
3    2018-01
4    2018-01
5    2018-01
Name: ymd, dtype: object

df3["ymd"].str.replace("-","").str.slice(0,6) # year and month as YYYYMM
# Note: df3["ymd"].str.replace("-","").slice(0,6) is wrong. str.replace returns a
# Series, so .str has to be accessed again before calling the next string method.

0    201801
1    201801
2    201801
3    201801
4    201801
5    201801
Name: ymd, dtype: object

df3["ymd"].str.startswith("2018-01-01") # returns a boolean Series

0     True
1    False
2    False
3    False
4    False
5    False
Name: ymd, dtype: bool

问题：怎样将“2018年01月01日”中的年月日去掉？

# Option 1: chain one replace() call per character to remove.
df3["dates"].str.replace("年","").str.replace("月","").str.replace("日","")

0    20180101
1    20180102
2    20180103
3    20180104
4    20180105
5    20180106
Name: dates, dtype: object

# Option 2: a regular-expression character class removes all three characters at once.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, so "[年月日]" would not match anything.
df3["dates"].str.replace("[年月日]","",regex=True)

0    20180101
1    20180102
2    20180103
3    20180104
4    20180105
5    20180106
Name: dates, dtype: object

九、pandas数据合并Merge、concat、append

9.1 pandas实现DataFrame的Merge

merge：可以根据一个或多个键将不同的DataFrame中的行连接起来。
语法：pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,left_index=False, right_index=False, sort=False,suffixes=('_x', '_y'), copy=True, indicator=False)

以影评数据为例

# Load the ratings file; it has no header row, so column names are given explicitly.
df_ratings=pd.read_csv('D:/personal_file/python/code/data/t_ratings_auto.txt',
               sep='::', # fields are separated by "::" (the default separator would be a comma)
               engine='python', # omitting this raises a ParserWarning
               names=['userid','movieid','rating','timestamp']
               #names="userid::movieid::rating::timestamp".split("::") # equivalent shorthand
               )

df_ratings.head()

	userid	movieid	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

# Load the users file; it has no header row, so column names are given explicitly.
df_users=pd.read_csv('D:/personal_file/python/code/data/t_users_auto.txt',
               sep='::', # fields are separated by "::" (the default separator would be a comma)
               engine='python', # omitting this raises a ParserWarning
               names=['userid','gender','age','job','zip-code']
               )

df_users.head()

	userid	gender	age	job	zip-code
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455

# Load the movies file; it has no header row, so column names are given explicitly.
df_movie=pd.read_csv('D:/personal_file/python/code/data/t_movies_auto.txt',
               sep='::', # fields are separated by "::" (the default separator would be a comma)
               engine='python', # omitting this raises a ParserWarning
               names=['movieid','title','genres']
               )

df_movie.head()

	movieid	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

# Inner-join ratings with users on the shared userid column.
#df_ratings_users=pd.merge(df_ratings, df_users, how='inner', left_on='userid', right_on='userid') # Option 1: name the key on each side
df_ratings_users=pd.merge(df_ratings, df_users, how='inner', on='userid') # Option 2: one key name shared by both sides
df_ratings_users.head()

	userid	movieid	rating	timestamp	gender	age	job	zip-code
0	1	1193	5	978300760	F	1	10	48067
1	1	661	3	978302109	F	1	10	48067
2	1	914	3	978301968	F	1	10	48067
3	1	3408	4	978300275	F	1	10	48067
4	1	2355	5	978824291	F	1	10	48067

# Inner-join ratings with movies on the shared movieid column.
df_ratings_movie=pd.merge(df_ratings, df_movie, how='inner', on='movieid')
df_ratings_movie.head()

	userid	movieid	rating	timestamp	title	genres
0	1	1193	5	978300760	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	One Flew Over the Cuckoo's Nest (1975)	Drama

9.2、pandas实现数据concat合并

指定按某个轴进行连接（axis=0/1），也可以指定join方法。
语法：pd.concat(objs, axis=0, join='outer',ignore_index=False)

s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])

s1

0    a
1    b
dtype: object

s2

0    c
1    d
dtype: object

pd.concat([s1,s2]) # default axis=0: stack row-wise, original index labels are kept

0    a
1    b
0    c
1    d
dtype: object

pd.concat([s1,s2],axis=1) # axis=1: place the Series side by side as columns

	0	1
0	a	c
1	b	d

pd.concat([s1, s2], ignore_index=True) # discard the original index and renumber from 0

0    a
1    b
2    c
3    d
dtype: object

十、pandas实现groupby分组统计

所有的聚合函数都是在dataframe和series上进行的；
pandas的groupby遵从split、apply、combine模式

#载入包
import pandas as pd 
import numpy as np
#加上这一句能在 jupyter展示matplot图表
%matplotlib inline

# Demo frame for the groupby examples: two label columns (A, B) and two
# columns of random normal values (C, D).
df4 = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df4

	A	B	C	D
0	foo	one	1.176942	-0.987104
1	bar	one	0.937907	-0.267920
2	foo	two	0.608983	-0.526361
3	bar	three	-0.032497	0.229008
4	foo	two	-0.994119	0.341918
5	bar	two	0.136394	-0.553871
6	foo	one	-0.009277	-0.507712
7	foo	three	-1.149535	1.778878

对单个列groupby,查询所有数据列的统计

# Group by column A and sum the numeric columns of each group; A becomes the index.
# numeric_only=True is needed on pandas >= 2.0, where sum() no longer drops the
# string column B silently (it would concatenate the strings instead).
df4.groupby(by='A').sum(numeric_only=True) # `by` names the grouping key
df4.groupby('A').sum(numeric_only=True)    # same call with `by` passed positionally
# df4.groupby['A'].sum() fails because groupby is a method: it has to be called
# with (), not subscripted with [].

	C	D
A
bar	3.724872	-2.875343
foo	3.183604	-1.814848

对多个列groupby,查询所有数据列的统计

df4.groupby(["A","B"]).sum() # A and B become a two-level index of the result

		C	D
A	B
bar	one	0.402025	-0.234494
	three	0.027734	-0.610918
	two	0.728252	0.715883
foo	one	1.389367	-1.327171
	three	-0.252610	-0.072518
	two	2.238247	2.877372

df4.groupby(["B","A"]).sum() # the index levels follow the order of the keys passed to groupby

		C	D
B	A
one	bar	0.058339	0.447656
one	foo	-0.481317	1.623534
three	bar	-0.757645	-2.468676
three	foo	0.284769	-0.375871
two	bar	0.466323	-1.722524
two	foo	1.492266	1.060856

df4.groupby(["A","B"],as_index=False).mean() # as_index=False keeps A and B as ordinary columns

	A	B	C	D
0	bar	one	0.402025	-0.234494
1	bar	three	0.027734	-0.610918
2	bar	two	0.728252	0.715883
3	foo	one	0.694684	-0.663586
4	foo	three	-0.252610	-0.072518
5	foo	two	1.119123	1.438686

同时查看多种数据统计agg

# Several statistics per group in one call.
# Only the numeric columns C and D are selected, because mean/std cannot be
# computed for the string column B on pandas >= 2.0. The aggregations are given
# by name because passing np.sum/np.mean/np.std is deprecated since pandas 2.1.
df4.groupby('A')[["C","D"]].agg(["sum","mean","std"])

	C			D
	sum	mean	std	sum	mean	std
A
bar	1.158010	0.386003	0.350534	-0.129530	-0.043177	0.683778
foo	3.375004	0.675001	0.934625	1.477683	0.295537	1.165673

# Same statistics, restricted to column C.
# Numeric columns are selected first and the aggregations are named by string,
# so the call works on pandas >= 2.0 without errors or deprecation warnings.
df4.groupby('A')[["C","D"]].agg(["sum","mean","std"])["C"] # aggregate, then keep column C
#df4.groupby('A')["C"].agg(["sum","mean","std"]) # equivalent: select C before aggregating

	sum	mean	std
A
bar	1.158010	0.386003	0.350534
foo	3.375004	0.675001	0.934625

不同列使用不同的聚合函数

# A different aggregation per column: mean of C, sum of D.
# String names are used because passing np.mean/np.sum is deprecated since pandas 2.1.
df4.groupby('A').agg({"C":"mean","D":"sum"})

	C	D
A
bar	0.386003	-0.129530
foo	0.675001	1.477683

过滤操作：根据某些条件筛选出符合条件的分组的过程

# Keep only the rows of the groups (grouped by A) whose D values sum to more than 0.
# filter() has to be called on the GroupBy object: DataFrame.filter selects
# row/column labels and does not accept a function, which is what caused the
# TypeError raised by the original df4.filter(lambda ...) call.
# The comparison is > 0 to match the stated goal (the original tested < 0).
df4.groupby("A").filter(lambda x: x["D"].sum() > 0)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-5-0919bf34bb13> in <module>
----> 1 df4.filter(lambda x: x["D"].sum() < 0 )


~\anaconda3\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
   4697         if items is not None:
   4698             name = self._get_axis_name(axis)
-> 4699             return self.reindex(**{name: [r for r in items if r in labels]})
   4700         elif like:
   4701 


TypeError: 'function' object is not iterable

实例：分组统计天气数据

# Reload the tab-separated weather file into df3 and preview the first rows.
df3=pd.read_csv('D:/personal_file/python/code/天气.txt',sep='\t')
df3.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel
0	2018年01月01日	2018-01-01	3度	-3度	晴~多云	东北风	1-2级	59	良	3
1	2018年01月02日	2018-01-02	2度	-2度	晴~多云	东北风	1-2级	49	优	2
2	2018年01月03日	2018-01-03	0度	-5度	多云	北风	1-2级	28	差	1
3	2018年01月04日	2018-01-04	1度	-1度	阴	东北风	1-2级	28	良	1
4	2018年01月05日	2018-01-05	-1度	-6度	多云-晴	西北风	1-2级	60	优	1

# Strip the "度" suffix from both temperature columns and convert them to int32.
# Plain column assignment is used so the column is replaced and takes the new
# dtype; on pandas >= 2.0, df3.loc[:, col] = ... writes into the existing object
# column and would leave its dtype as object.
df3["bwendu"]=df3["bwendu"].str.replace("度","").astype("int32")
df3["ywendu"]=df3["ywendu"].str.replace("度","").astype("int32")

# Add a month column holding the first 7 characters of ymd (YYYY-MM).
df3.loc[:,"month"] = df3["ymd"].str[0:7] # Option 1
#df3["month"] =df3["ymd"].str[0:7] # Option 2

df3.head()

	dates	ymd	bwendu	ywendu	tianqi	fengxiang	fengli	aqi	aqlinfo	aqilevel	month
0	2018年01月01日	2018-01-01	3	-3	晴~多云	东北风	1-2级	59	良	3	2018-01
1	2018年01月02日	2018-01-02	2	-2	晴~多云	东北风	1-2级	49	优	2	2018-01
2	2018年01月03日	2018-01-03	0	-5	多云	北风	1-2级	28	差	1	2018-01
3	2018年01月04日	2018-01-04	1	-1	阴	东北风	1-2级	28	良	1	2018-01
4	2018年01月05日	2018-01-05	-1	-6	多云-晴	西北风	1-2级	60	优	1	2018-01

# 1. Highest temperature (bwendu) of each month.
# The column is selected before aggregating so only bwendu is reduced,
# instead of computing max() over every column and discarding the rest.
data=df3.groupby("month")["bwendu"].max()
data

month
2018-01    3
Name: bwendu, dtype: int32

# Plot the per-month maximum computed above (index on x, values on y).
data.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a40001d400>

png

# Plot bwendu (y) against ymd (x).
df3.plot("ymd","bwendu")

<matplotlib.axes._subplots.AxesSubplot at 0x1a47f8d5790>

png

# 2. Per month: highest bwendu, lowest ywendu and mean air-quality index (aqi).
# String names are used because passing np.max/np.min/np.mean is deprecated since pandas 2.1.
df3.groupby("month").agg({"bwendu":"max","ywendu":"min","aqi":"mean"})

	bwendu	ywendu	aqi
month
2018-01	3	-7	45.666667

十一、pandas对每个分组应用apply函数

posted @ 2023-12-06 22:10 灯新阅读(53) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· Python自学整理

· python入门学习之《python编程快速上手》

· pandas(数据)

· Pandas学习之路【1】

· Pandas - 1

公告

昵称：灯新
园龄： 6年1个月
粉丝： 0
关注： 2

+加关注

2025年3月

日

一

二

三

四

五

六

lizixi