pandas的操作
pandas的拼接操作
级联 pd.concat(DataFrame.append 已在 pandas 2.0 中移除,统一使用 pd.concat)
合并 pd.merge , pd.join
一丶pd.concat()级联
objs
axis=0
keys
join='outer' / 'inner':表示的是级联的方式,outer会将所有的项进行级联(忽略匹配和不匹配),而inner只会将匹配的项级联到一起,不匹配的不级联
ignore_index=False
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
# Two 3x4 frames of random integers in [0, 100) to demonstrate pd.concat.
shape = (3, 4)
df1 = DataFrame(np.random.randint(0, 100, size=shape))
df2 = DataFrame(np.random.randint(0, 100, size=shape))

# Row-wise concatenation; join='outer' keeps all columns (the default),
# join='inner' keeps only the columns common to both frames.
pd.concat((df1, df2), axis=0)
pd.concat((df1, df2), axis=0, join='outer')
pd.concat((df1, df2), axis=0, join='inner')
二丶pd.merge()合并
how:'outer'取并集,'inner'取交集
on:当有多列相同的时候,可以使用on来指定使用哪一列进行合并,on的值为列标签或列标签的列表
# --- merge joins automatically on the shared 'employee' column --------------
df1 = DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa'],
    'group': ['Accounting', 'Engineering', 'Engineering'],
})
df2 = DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake'],
    'hire_date': [2004, 2008, 2012],
})
pd.merge(df1, df2)

# --- many-to-one: 'group' repeats on the right frame ------------------------
df3 = DataFrame({
    'employee': ['Lisa', 'Jake'],
    'group': ['Accounting', 'Engineering'],
    'hire_date': [2004, 2016],
})
df4 = DataFrame({
    'group': ['Accounting', 'Engineering', 'Engineering'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})
pd.merge(df3, df4)

# --- many-to-many join, keeping every right-hand row (how='right') ----------
df1 = DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa'],
    'group': ['Accounting', 'Engineering', 'Engineering'],
})
df5 = DataFrame({
    'group': ['Engineering', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})
pd.merge(df1, df5, how='right')

# --- two shared columns: choose the join key explicitly with on= ------------
df1 = DataFrame({
    'employee': ['Jack', 'Summer', 'Steve'],
    'group': ['Accounting', 'Finance', 'Marketing'],
})
df2 = DataFrame({
    'employee': ['Jack', 'Bob', 'Jake'],
    'hire_date': [2003, 2009, 2012],
    'group': ['Accounting', 'sell', 'ceo'],
})
pd.merge(df1, df2, on='employee')

# --- differently named key columns: left_on / right_on ----------------------
df5 = DataFrame({
    'name': ['Lisa', 'Bobs', 'Bill'],
    'hire_dates': [1998, 2016, 2007],
})
pd.merge(df1, df5, left_on='employee', right_on='name', how='outer')

# --- frames for the outer/inner demo (defined twice in the original) --------
df6 = DataFrame({
    'name': ['Peter', 'Paul', 'Mary'],
    'food': ['fish', 'beans', 'bread'],
})
df7 = DataFrame({
    'name': ['Mary', 'Joseph'],
    'drink': ['wine', 'beer'],
})
df6 = DataFrame({
    'name': ['Peter', 'Paul', 'Mary'],
    'food': ['fish', 'beans', 'bread'],
})
df7 = DataFrame({
    'name': ['Mary', 'Joseph'],
    'drink': ['wine', 'beer'],
})
案例:人口分析
import numpy as np
from pandas import DataFrame,Series
import pandas as pd
# Load the three source tables: state abbreviations, population, land area.
abb = pd.read_csv('./data/state-abbrevs.csv')
abb.head(2)
pop = pd.read_csv('./data/state-population.csv')
pop.head(2)
area = pd.read_csv('./data/state-areas.csv')
area.head(2)

# Outer-merge abbreviations onto population (abbreviation <-> state/region).
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
abb_pop.head(2)
# 'abbreviation' now duplicates 'state/region' -- drop the redundant column.
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
abb_pop.head(2)

# Inspect which columns contain nulls and which regions lack a state name.
abb_pop.isnull().any(axis=0)
abb_pop['state'].isnull()
abb_pop.loc[abb_pop['state'].isnull()]
abb_pop.loc[abb_pop['state'].isnull()]['state/region'].unique()

# The 'USA' aggregate rows have no state name -- fill them in.
abb_pop['state/region'] == 'USA'
abb_pop.loc[abb_pop['state/region'] == 'USA']
indexs = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
abb_pop.loc[indexs,'state'] = 'United States'  # BUG FIX: was the typo 'United Status'

# Same treatment for Puerto Rico ('PR').
abb_pop['state/region'] == 'PR'
abb_pop.loc[abb_pop['state/region'] == 'PR']
indexs = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[indexs,'state'] = 'PUERTO RICO'

# Merge in land area (joins on the shared 'state' column by default).
abb_pop_area = pd.merge(abb_pop,area,how='outer')
abb_pop_area.head(3)

# Drop rows whose state has no area value (regions absent from the area table).
abb_pop_area['area (sq. mi)'].isnull()
indexs = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index
abb_pop_area.drop(labels=indexs,axis=0,inplace=True)

# 2010 totals, then population density ('midu'), then the 5 densest rows.
abb_pop_area.query('year == 2010 & ages == "total"')
abb_pop_area['midu'] = abb_pop_area['population'] / abb_pop_area['area (sq. mi)']
abb_pop_area.head(2)
abb_pop_area.sort_values(by='midu',axis=0,ascending=False).head(5)
三丶数据处理
删除重复的元素
映射
单值替换
普通替换: 替换所有符合要求的元素:to_replace=15,value='e'
按列指定单值替换: to_replace={列标签:替换值} value='value'
多值替换
列表替换: to_replace=[] value=[]
字典替换(推荐) to_replace={to_replace:value,to_replace:value}
# NOTE(review): `df` here must be a numeric DataFrame defined earlier in the
# original notebook (containing the value 6 and a column labeled 5) -- that
# cell was lost from this copy; confirm against the source notebook.
# Single-value replace: every cell equal to 6 becomes 'six'.
df.replace(to_replace=6,value='six')
# Dict form: {old_value: new_value} pairs.
df.replace(to_replace={6:'six'})
# Column-scoped form: only in column 5, replace the value 6 with 'six'.
df.replace(to_replace={5:6},value='six')
# Demo data: duplicate employee names with salaries.
dic = {
    'name': ['张三', '周杰伦', '张三'],
    'salary': [20000, 10000, 20000],
}
df = DataFrame(data=dic)

# Map the Chinese names to English nicknames via a dict lookup.
dic = {
    '张三': 'tom',
    '周杰伦': 'jay',
}
df['e_name'] = df['name'].map(dic)

def after_sal(s):
    """Return the salary after a 50% tax on the portion above 3000.

    BUG FIX: the `def` header was lost from this copy, leaving a bare
    `return` statement; reconstructed from the `map(after_sal)` call below.
    """
    return s - (s - 3000) * 0.5

# map() and apply() both work element-wise on a Series here.
df['after_sal'] = df['salary'].map(after_sal)
df['after_sal'] = df['salary'].apply(after_sal)
四丶使用聚合操作对数据异常值检测和过滤
# 1000x3 frame of uniform [0, 1) floats for the outlier-filter demo.
df = DataFrame(np.random.random(size=(1000, 3)), columns=list('ABC'))
# Keep only rows whose column-C value does not exceed twice its std-dev.
twice_std = 2 * df['C'].std()
df.loc[df['C'] <= twice_std]
dropna
fillna
drop_duplicates(keep)
得到鉴定异常值的条件
将异常值对应的行删除
无论是iloc还是loc 均采用方括号[]而不是圆括号()
1. 如果只是取行 建议用iloc 因为比较简单
2. 如果列和行同时取 建议采用loc 因为可以直接定义到标签
3.iloc:位置索引,和列表索引类似,里面只能是数字
4.loc:标签索引,只能使用字符型标签来索引数据
五丶排序
# A random permutation of 0..999, as used for the take() indices below.
np.random.permutation(1000)
# Shuffle the rows, then the three columns, of the 1000x3 `df` defined above.
df.take(indices=np.random.permutation(1000),axis=0).take(indices=np.random.permutation(3),axis=1)
# Same shuffle, sliced to the first five rows -> a random sample of 5 rows.
df.take(indices=np.random.permutation(1000),axis=0).take(indices=np.random.permutation(3),axis=1)[0:5]
# 随机抽样
# 当DataFrame规模足够大时,直接使用np.random.permutation(x)函数,就配合take()函数实现随机抽样
六丶 数据分类处理【重点】
分组
# Demo frame: fruit items with price, color and weight.
df = DataFrame({
    'item': ['Apple', 'Banana', 'Orange', 'Banana', 'Orange', 'Apple'],
    'price': [4, 3, 3, 2.5, 4, 2],
    'color': ['red', 'yellow', 'yellow', 'green', 'green', 'green'],
    'weight': [12, 20, 50, 30, 20, 44],
})

# A GroupBy object is lazy; .groups shows the row indices per item.
# (deprecated `axis=0` kwarg dropped -- it was the default anyway)
df.groupby(by='item')
df.groupby(by='item').groups

# BUG FIX: select the numeric column BEFORE aggregating. The original
# `df.groupby(...).mean()['price']` raises TypeError on pandas >= 2.0
# because the non-numeric 'color' column cannot be averaged.
df.groupby(by='item')['price'].mean()
mean_price_series = df.groupby(by='item')['price'].mean()

# Broadcast the per-item mean price back onto each row via map().
dic = mean_price_series.to_dict()
df['mean_price'] = df['item'].map(dic)
# Same pattern for the per-color mean price, in a single expression.
df['color_mean_price'] = df['color'].map(df.groupby(by='color')['price'].mean().to_dict())
高级数据聚合
# Built-in per-item mean price -- baseline for the custom my_mean below
# (`df` is the fruit frame defined earlier in the file).
df.groupby(by='item')['price'].mean()
def my_mean(s):
    """Return the arithmetic mean of the values in *s*.

    Custom aggregation demo for GroupBy.apply/transform. Like the
    original, raises ZeroDivisionError on empty input.

    BUG FIX: the body lost its indentation in this copy (IndentationError);
    the manual accumulation also shadowed the builtin `sum` -- replaced
    with the builtin itself.
    """
    return sum(s) / len(s)
# apply: my_mean runs once per group -> one aggregated row per item.
df.groupby(by='item')['price'].apply(my_mean)
# transform: same aggregation, but the result is broadcast back to the
# original row index (one value per input row).
df.groupby(by='item')['price'].transform(my_mean)


apply() 方法+自定义函数

pandas:apply和transform方法的性能比较
案例:美国2012年总统候选人政治献金数据分析
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Three-letter month abbreviation -> month number (1-12).
_MONTH_ABBRS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
                'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
months = {abbr: num for num, abbr in enumerate(_MONTH_ABBRS, start=1)}

# The five candidates this analysis focuses on.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]

# Candidate name -> party affiliation.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Reform',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# Raw FEC contribution records (CSV content despite the .txt extension).
df = pd.read_csv('./data/usa_election.txt')
df.head()
# Attach each candidate's party; names absent from `parties` become NaN.
df['party'] = df['cand_nm'].map(parties)
df.head()
df['party'].unique()
df['party'].value_counts()
# Total contribution amount per party...
df.groupby(by='party')['contb_receipt_amt'].sum()
# ...and per (date, party) pair.
df.groupby(by=['contb_receipt_dt','party'])['contb_receipt_amt'].sum()
def transformDate(d):
    """Convert a raw 'DD-MON-YY' date string (e.g. '20-JUN-11') to '20YY-M-DD'.

    Uses the module-level `months` abbreviation table. The day keeps its
    original string form (any leading zero preserved); the month becomes a
    non-padded number; the century is assumed to be 20xx.

    BUG FIX: the function body lost its indentation in this copy, which
    made the block a syntax error; the logic itself is unchanged.
    """
    day, month, year = d.split('-')
    month = months[month]
    return '20' + year + '-' + str(month) + '-' + day
# Normalize the raw 'DD-MON-YY' dates to '20YY-M-DD' strings.
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transformDate)
df.head()
# Contributions from disabled veterans, totalled per candidate.
df['contbr_occupation'] == 'DISABLED VETERAN'
old_bing = df.loc[df['contbr_occupation'] == 'DISABLED VETERAN']
old_bing.groupby(by='cand_nm')['contb_receipt_amt'].sum()
# Locate the single largest contribution in the dataset.
df['contb_receipt_amt'].max()
df.query('contb_receipt_amt == 1944042.43')
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?