pandas

pandas 连接数据库

 1 import numpy as np
 2 import pandas as pd
 3 from sqlalchemy import create_engine
 4 
 5 engine = create_engine("mysql+pymysql://root:100100@localhost/skill_table?charset=UTF8MB4")
 6 data = pd.DataFrame(data=np.random.randint(1,100,size=(100,3)),columns=["python","Math","En"])
 7 # print(data)
 8 # data.to_sql(name="pandas",con=engine,index=False)
 9 r = pd.read_sql("select * from pandas limit 10",con=engine)
10 print(r)

其他技能

map 针对一维数据(Series 的每个元素),applymap 针对二维数据(DataFrame 的每个元素)

str.lower()

str.extract() 提取

 1 job["positionName"].str.contains("数据分析")
 2 deal_money = job["salary"].str.lower().str.extract(r'(.*?)k-(.*?)k').applymap(lambda x:int(x))
 3 job["Python"] = del_job_detail.map(lambda x:1 if "python" in x else 0)
 4 
 5 def get_field(obj):
 6     ls = obj.split(",")
 7     if ls[0]=="移动互联网" and len(ls)>1:
 8         return ls[1]
 9     else:
10         return ls[0]
11 
print(job["industryField"].map(get_field))

 str.startswith

# Rows whose InvoiceNo starts with 'C' (cancelled invoices, presumably -- TODO confirm)
df[df['InvoiceNo'].str.startswith('C')]

 

pandas sql 方法

from pandasql import sqldf

# pandasql: query a DataFrame named `df` (resolved from the caller's
# namespace) with plain SQL.
rr = sqldf(
    "select * from df limit 10"
)
print(rr)

时间处理法

import datetime  # needed by strptime below; missing from the original snippet

# Normalize 'InvoiceDate' strings like '12/1/2010 8:26' to 'YYYY-MM-DD'
df['date'] = df['InvoiceDate'].apply(
    lambda d: datetime.datetime.strptime(d, '%m/%d/%Y %H:%M').strftime('%Y-%m-%d')
)

query

start_date = '2011-01-01'
end_date = '2011-07-01'

# query() can reference local variables with @
df = df.query( """ date >=@start_date and date <@end_date """ )
# Equivalent boolean mask.  NOTE(review): the original used `<= end_date`,
# which does NOT match the `< @end_date` in the query above; fixed to `<`.
cond = (df['date'] >= start_date) & (df['date'] < end_date)

print(df.head(3).T)
print(df[cond].head(3).T)

 

 

一、Series

# Series: a one-dimensional labelled array
s = pd.Series(data=1)                                                # scalar
s = pd.Series(data=[1, 2, 3])                                        # default RangeIndex
s = pd.Series(data=[1, 2, 3], index=["a", "b", "c"], dtype='float32')  # explicit labels/dtype
s = pd.Series(data={"a": 1, "b": 2}, name="python Series")           # from dict; keys become index

 

二、DataFrame

# DataFrame: 2-D labelled table -- from a dict of columns, then from an ndarray
s1 = pd.DataFrame(data={"python": [22, 12, 33], "C++": [1, 2, 3]}, index=["tom", "JIm", "mir"])
s1 = pd.DataFrame(data=np.random.randint(2, 12, (2, 3)), index=["tom", "JIm"], columns=["列1", "列2", "列3"])

 

三、查看

s2 = s1.head(2)      # first 2 rows
s2 = s1.tail(2)      # last 2 rows
s2 = s1.shape        # (rows, columns)
s2 = s1.dtypes       # dtype of each column
s2 = s1.index        # row labels
s2 = s1.columns      # column labels
s2 = s1.values       # underlying 2-D ndarray
s2 = s1.describe()   # summary statistics of numeric columns
s2 = s1.info()       # prints the summary; note info() itself returns None

 

四、文件操作

 1 # csv
 2 df = pd.DataFrame(data=np.random.randint(1,5,(3,2)),columns=["语文","数学"])
 3 df.to_csv(
 4     'xx.csv',
 5     # sep=';',
 6     header=True,
 7     index=False
 8 )
 9 r = pd.read_csv(
10     'xx.csv',
11     # header=[0],
12     # index_col=1
13 )
# excel -- NOTE(review): pandas 2.x dropped xlwt, so .xls files can no
# longer be written; switched the snippet to .xlsx (openpyxl engine).
df.to_excel(
    'xx.xlsx',
    sheet_name="salary",
    header=True,
    index=False
)

rr = pd.read_excel(
    "xx.xlsx",
    sheet_name="salary",
    names=list("ab"),  # replace the header names
)

# Save multiple sheets into a single Excel file
with pd.ExcelWriter("xx.xlsx") as f:
    df.to_excel(f, sheet_name="one", index=False)
    df.to_excel(f, sheet_name="two", index=False)
print(rr)
# sql
from sqlalchemy import create_engine

df = pd.DataFrame(data={"python": 33, "Math": 60, "En": 89}, index=[0])

# Database connection (hard-coded credentials -- fine for notes, not production)
conn = create_engine(
    'mysql+pymysql://root:100100@localhost/skill_table?charset=UTF8MB4'
)

# Save to the database
df.to_sql(
    "pandas",            # table name in the database
    conn,
    if_exists="append",  # append when the table already exists
    index=False
)
# Load back from the database
r = pd.read_sql("select * from pandas limit 10",
                conn,
                )

 

五、获取数据

 1 # 标签选择
 2 rr = r[3:5] # ⾏切⽚
 3 rr = r.loc[[0,1]]  # ⾏切⽚
 4 rr = r.loc[0,'python'] # 0行python列
 5 rr = r.loc[[0,1],["python"]]
 6 rr = r.loc[0:3,["python"]]
 7 rr = r.loc[:,["python"]]
 8 rr = r.loc[2::2,["python"]]
 9 rr = r.loc[2:,["python"]]
10 rr = r.loc[2:,"Math":]
1 # 位置选择
2 rr2 = r.iloc[4] # ⾏切⽚
3 rr2 = r.iloc[4:5,1:2] #行、列索引
4 rr2 = r.iloc[[1,2],[1,2]] #行、列索引
5 rr2 = r.iloc[0,2] # 选取标量值
1 # boolean索引选择
2 cond = r["python"]>70
3 cond2 = (r["python"]>50) & (r["python"]<70)
4 cond2 = (r["python"]>50) | (r["python"]<70)
5 dd = r[cond2]

六、concat、append、insert、join

 1 # 连接 concat、append
 2 pd.concat([r,r2],axis=0)
 3 r.append(r2)
 4 
 5 # 插入列
 6 r2.insert(loc=0,column='dd',value=66)
 7 
 8 # join-left、right、outer、inner
 9 df1 = pd.DataFrame(data = {'name':
10 ['softpo','Daniel','Brandon','Ella'],'weight':[70,55,75,65]})
11 # 表⼆中记录的是name和身⾼信息
12 df2 = pd.DataFrame(data = {'name':
13 ['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})
14 df3 = pd.DataFrame(data = {'名字':
15 ['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})
16 vv = pd.merge(
17     df1,df2,
18     how='inner',
19     on="name"
20 )
21 vv = pd.merge(
22     df1,df3,
23     how='outer',
24     left_on="name",
25     right_on="名字"
26 )

 

案例:增加一列

 1 df4 = pd.DataFrame(data = np.random.randint(0,151,size = (10,3)),
 2 index = list('ABCDEFHIJK'),
 3 columns=['Python','Keras','Tensorflow'])
 4 score_mean = pd.DataFrame(df4.mean(axis = 1).round(1),columns=['平均分'])
 5 
 6 print(df4)
 7 print(score_mean)
 8 # 赋值法
 9 df4["平均分"]=score_mean
10 # 方法一 insert
11 df4.insert(loc=3,column='平均分',value=score_mean)
12 # 方法二 concat
13 result = pd.concat([df4,score_mean],axis=1)
14 # 方法三 merge
15 result = pd.merge(
16     left=df4,
17     right=score_mean,
18     left_index=True,
19     right_index=True
20 )

 七、数据清洗

 1 df = pd.DataFrame(data = {'color':
 2 ['red','blue','red','green','blue',None,'red'],
 3 'price':[10,20,10,15,20,0,np.NaN]})
 4 # 1、重复数据过滤
 5 df.duplicated() # 判断是否存在重复数据
 6 df.drop_duplicates() # 删除重复数据

# 2. Missing-data filtering
df.isnull()             # True where a value is missing
df.dropna(how='any')    # drop rows containing any missing value
df.fillna(value=1111)   # fill missing values with 1111

# 3. Dropping specific rows or columns
del df['color']                      # delete a column in place
df.drop(labels=['price'], axis=1)    # drop a column (returns a new frame)
df.drop(labels=[0, 1, 5], axis=0)    # drop rows by index label

# 4. DataFrame.filter
df = pd.DataFrame(np.array(([3, 7, 1], [2, 8, 256])),
                  index=['dog', 'cat'],
                  columns=['China', 'America', 'France'])
df.filter(items=['China', 'France'])   # keep the named columns
df.filter(regex='a$', axis=1)          # columns whose name ends with 'a'
df.filter(like='og', axis=0)           # rows whose label contains 'og'
# 5. Outlier filtering with the 3-sigma rule
df2 = pd.DataFrame(data=np.random.randn(10000, 3))  # standard-normal data
# |x| > 3*sigma marks an outlier.  NOTE(review): the original tested only
# x > 3*sigma, missing the negative tail -- abs() makes the rule two-sided.
cond = (df2.abs() > 3 * df2.std()).any(axis=1)
index = df2[cond].index          # row labels of the outlier rows
df2.drop(labels=index, axis=0)   # drop them (returns a new frame)

 

八、replace

df.replace(3, 1024)                # replace 3 with 1024
df.replace([0, 7], 2048)           # replace both 0 and 7 with 2048
df.replace({0: 512, np.nan: 998})  # dict: old value -> new value
df.replace({'Python': 2}, -1024)   # only in column 'Python': replace 2 with -1024

 九、计算

简单统计指标

 1 df = pd.DataFrame(data = np.random.randint(0,100,size = (20,3)),
 2 index = list('ABCDEFHIJKLMNOPQRSTU'),
 3 columns=['Python','Tensorflow','Keras'])
 4 # 1、简单统计指标
 5 df.count() # ⾮NA值的数量
 6 df.max(axis = 0) #轴0最⼤值,即每⼀列最⼤值
 7 df.min() #默认计算轴0最⼩值
 8 df.median() # 中位数
 9 df.sum() # 求和
10 df.mean(axis = 1) #轴1平均值,即每⼀⾏的平均值
11 df.quantile(q = [0.2,0.4,0.8]) # 分位数
12 df.describe() # 查看数值型列的汇总统计,计数、平均值、标准差、最⼩值、四分位数、最⼤值

 

 索引标签、位置获取

df['Python'].argmin()  # integer position of the minimum value
df['Keras'].argmax()   # integer position of the maximum value
df.idxmax()            # index label of the maximum, per column
df.idxmin()            # index label of the minimum, per column

更多统计指标

 1 df['Python'].value_counts() # 统计元素出现次数
 2 df['Keras'].unique() # 去重
 3 df.cumsum() # 累加
 4 df.cumprod() # 累乘
 5 df.std() # 标准差
 6 df.var() # ⽅差
 7 df.cummin() # 累计最⼩值
 8 df.cummax() # 累计最⼤值
 9 df.diff() # 计算差分
10 df.pct_change() # 计算百分⽐变化

高级统计指标

df.cov() # covariance matrix of the columns
df['Python'].cov(df['Keras']) # covariance between Python and Keras
df.corr() # correlation coefficients between all columns
df.corrwith(df['Tensorflow']) # correlation of each column with Tensorflow

十、排序

 1 df = pd.DataFrame(data = np.random.randint(0,30,size = (30,3)),
 2 index = list('qwertyuioijhgfcasdcvbnerfghjcf'),
 3 columns = ['Python','Keras','Pytorch'])
 4 # 1、索引列名排序
 5 df.sort_index(axis = 0,ascending=True) # 按索引排序,升序
 6 df.sort_index(axis = 1,ascending=False) #按列名排序,降序
 7 # 2、属性值排序
 8 df.sort_values(by = ['Python']) #按Python属性值排序
 9 df.sort_values(by = ['Python','Keras'])#先按Python,再按Keras排序
10 
11 # 3、返回属性前几或者后几
12 df.nlargest(10,columns='Keras') # 根据属性Keras排序,返回最大10个数据
13 df.nsmallest(5,columns='Python') # 根据属性Python排序,返回最小5个数据

 

十一、分箱操作

 1 df = pd.DataFrame(data = np.random.randint(0,150,size = (100,3)),
 2 columns=['Python','Tensorflow','Keras'])
 3 # 1、等宽分箱
 4 pd.cut(df.Python,bins = 3)
 5 # 指定宽度分箱
 6 pd.cut(df.Keras,#分箱数据
 7 bins = [0,60,90,120,150],#分箱断点
 8 right = False,# 左闭右开
 9 labels=['不及格','中等','良好','优秀'])# 分箱后分类
10 # 2、等频分箱
11 pd.qcut(df.Python,q = 4,# 4等分
12 labels=['','','','']) # 分箱后分类

 

十二、分组聚合

 1 df = pd.DataFrame(data =
 2 {'sex':np.random.randint(0,2,size = 300), # 0男,1⼥
 3 'class':np.random.randint(1,9,size = 300),#1~8⼋个班
 4 'Python':np.random.randint(0,151,size = 300),#Python成绩
 5 'Keras':np.random.randint(0,151,size =300),#Keras成绩
 6 'Tensorflow':np.random.randint(0,151,size=300),
 7 'Java':np.random.randint(0,151,size = 300),
 8 'C++':np.random.randint(0,151,size = 300)})
 9 def xx(obj):
10     if obj==0:
11         return ""
12     else:
13         return ""
# Method 1: apply with a named function
df['sex'].apply(xx)
# Method 2: map with a dict.
# NOTE(review): the label strings were lost in the scrape; both map to ""
# here -- restore the intended labels before real use.
df['sex'] = df['sex'].map({0: "", 1: ""})

# Grouping
s = df.groupby(by="sex")
for name, data in s:
    print('组名:', name)
    print('数据:', data)
df['Python'].groupby([df["class"], df['sex']])

# Aggregation on groups
# Group by sex, mean of the other columns, one decimal place
df.groupby(by='sex').mean().round(1)
# Group by class and sex, max of Python and Keras
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].max()
# Group by class and sex, row counts (head-count per class/sex)
df.groupby(by=['class', 'sex']).size()
# Descriptive statistics per group
df.groupby(by=['class', 'sex']).describe()

# apply on groups
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].apply(np.mean).round(1)

# agg: several aggregations at once.
# NOTE(review): in the original post this statement was broken across lines
# (leaving a bare list literal on its own line, a runtime error) --
# re-joined into a single expression here.
df.groupby(by=['class', 'sex'])[['Tensorflow', 'Keras']].agg(
    [np.max, np.min, pd.Series.count])
# Different aggregations per column; (name, func) tuples name the result
# columns.  NOTE(review): the Chinese label text was garbled in the scrape;
# '最大值' and '计数' are reconstructions of the lost labels.
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].agg(
    {'Python': [('最大值', np.max), ('最小值', np.min)],
     'Keras': [('计数', pd.Series.count), ('中位数', np.median)]})

 

posted @ 2021-08-30 18:42  东方不败--Never  阅读(66)  评论(0编辑  收藏  举报