pandas
Connecting pandas to a database
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://root:100100@localhost/skill_table?charset=UTF8MB4")
data = pd.DataFrame(data=np.random.randint(1, 100, size=(100, 3)), columns=["python", "Math", "En"])
# print(data)
# data.to_sql(name="pandas", con=engine, index=False)
r = pd.read_sql("select * from pandas limit 10", con=engine)
print(r)
Other skills
map works element-wise on one-dimensional data (a Series); applymap works element-wise on a two-dimensional DataFrame (see the sketch after the code block below).
str.lower()
str.extract() extracts substrings with a regular expression
1 job["positionName"].str.contains("数据分析") 2 deal_money = job["salary"].str.lower().str.extract(r'(.*?)k-(.*?)k').applymap(lambda x:int(x)) 3 job["Python"] = del_job_detail.map(lambda x:1 if "python" in x else 0) 4 5 def get_field(obj): 6 ls = obj.split(",") 7 if ls[0]=="移动互联网" and len(ls)>1: 8 return ls[1] 9 else: 10 return ls[0] 11 12 print(job["industryField"].map(get_field))
str.startswith
df[df['InvoiceNo'].str.startswith('C')]
pandasql: SQL queries on DataFrames
from pandasql import sqldf

# pandasql: run SQL directly against DataFrames
rr = sqldf(
    "select * from df limit 10"
)
print(rr)
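Note that sqldf resolves the table name in the query (here df) by looking up a DataFrame with that name in the surrounding Python scope, so the call above assumes a DataFrame called df already exists.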
Date and time handling
import datetime

df['date'] = df['InvoiceDate'].apply(
    lambda d: datetime.datetime.strptime(d, '%m/%d/%Y %H:%M').strftime('%Y-%m-%d')
)
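An alternative sketch, assuming the same month/day/year format as above: let pandas parse the whole column in one vectorized call instead of calling strptime row by row.

df['date'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M').dt.strftime('%Y-%m-%d')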
query
start_date = '2011-01-01'
end_date = '2011-07-01'

# @name inside query() refers to the Python variable of that name
df = df.query("date >= @start_date and date < @end_date")
# boolean-mask version (note: this one includes end_date itself)
cond = (df['date'] >= start_date) & (df['date'] <= end_date)

print(df.head(3).T)
print(df[cond].head(3).T)
1. Series
s = pd.Series(data=1)                                              # an array with an index
s = pd.Series(data=[1, 2, 3])
s = pd.Series(data=[1, 2, 3], index=["a", "b", "c"], dtype='float32')
s = pd.Series(data={"a": 1, "b": 2}, name="python Series")
2. DataFrame
s1 = pd.DataFrame(data={"python": [22, 12, 33], "C++": [1, 2, 3]}, index=["tom", "JIm", "mir"])
s1 = pd.DataFrame(data=np.random.randint(2, 12, (2, 3)), index=["tom", "JIm"], columns=["列1", "列2", "列3"])
3. Inspecting data
s2 = s1.head(2)
s2 = s1.tail(2)
s2 = s1.shape
s2 = s1.dtypes
s2 = s1.index
s2 = s1.columns
s2 = s1.values        # underlying values as a 2-D ndarray
s2 = s1.describe()
s2 = s1.info()
4. File I/O
# csv
df = pd.DataFrame(data=np.random.randint(1, 5, (3, 2)), columns=["语文", "数学"])
df.to_csv(
    'xx.csv',
    # sep=';',
    header=True,
    index=False
)
r = pd.read_csv(
    'xx.csv',
    # header=[0],
    # index_col=1
)

# excel (note: writing .xls needs the old xlwt engine; recent pandas versions only write .xlsx)
df.to_excel(
    'xx.xls',
    sheet_name="salary",
    header=True,
    index=False
)

rr = pd.read_excel(
    "xx.xls",
    sheet_name="salary",
    names=list("ab"),     # replace the header with new column names
)

# save several worksheets into one Excel file
with pd.ExcelWriter("xx.xlsx") as f:
    df.to_excel(f, sheet_name="one", index=False)
    df.to_excel(f, sheet_name="two", index=False)
print(rr)

# sql
from sqlalchemy import create_engine
df = pd.DataFrame(data={"python": 33, "Math": 60, "En": 89}, index=[0])

# database connection
conn = create_engine(
    'mysql+pymysql://root:100100@localhost/skill_table?charset=UTF8MB4'
)

# write to the database
df.to_sql(
    "pandas",               # table name in the database
    conn,
    if_exists="append",     # append if the table already exists
    index=False
)
# load from the database
r = pd.read_sql("select * from pandas limit 10",
    conn,
)
5. Selecting data
# label-based selection
rr = r[3:5]                      # row slice
rr = r.loc[[0, 1]]               # rows by label
rr = r.loc[0, 'python']          # row 0, column 'python'
rr = r.loc[[0, 1], ["python"]]
rr = r.loc[0:3, ["python"]]
rr = r.loc[:, ["python"]]
rr = r.loc[2::2, ["python"]]
rr = r.loc[2:, ["python"]]
rr = r.loc[2:, "Math":]
# position-based selection
rr2 = r.iloc[4]                  # one row by position
rr2 = r.iloc[4:5, 1:2]           # row and column slices
rr2 = r.iloc[[1, 2], [1, 2]]     # lists of row and column positions
rr2 = r.iloc[0, 2]               # a single scalar value
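The key difference between the two: loc selects by label and its slices include the end point (r.loc[0:3] returns four rows), while iloc selects by integer position and its slices exclude the end point (r.iloc[4:5] returns one row), matching ordinary Python slicing.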
# boolean indexing
cond = r["python"] > 70
cond2 = (r["python"] > 50) & (r["python"] < 70)
cond2 = (r["python"] > 50) | (r["python"] < 70)
dd = r[cond2]
6. concat, append, insert, join
# concatenation: concat, append
pd.concat([r, r2], axis=0)
r.append(r2)          # note: DataFrame.append was removed in pandas 2.0; use pd.concat instead

# insert a column
r2.insert(loc=0, column='dd', value=66)

# merge/join - left, right, outer, inner
df1 = pd.DataFrame(data={'name': ['softpo', 'Daniel', 'Brandon', 'Ella'], 'weight': [70, 55, 75, 65]})
# the second table holds name and height
df2 = pd.DataFrame(data={'name': ['softpo', 'Daniel', 'Brandon', 'Cindy'], 'height': [172, 170, 170, 166]})
df3 = pd.DataFrame(data={'名字': ['softpo', 'Daniel', 'Brandon', 'Cindy'], 'height': [172, 170, 170, 166]})
vv = pd.merge(
    df1, df2,
    how='inner',
    on="name"
)
vv = pd.merge(
    df1, df3,
    how='outer',
    left_on="name",
    right_on="名字"
)
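The heading also mentions join: DataFrame.join aligns on the index by default, whereas pd.merge (used above) joins on columns. A minimal sketch using the df1/df2 frames defined above:

# join on the index, so move 'name' into the index first
df1.set_index('name').join(df2.set_index('name'), how='inner')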
Example: adding a column
df4 = pd.DataFrame(data=np.random.randint(0, 151, size=(10, 3)),
                   index=list('ABCDEFHIJK'),
                   columns=['Python', 'Keras', 'Tensorflow'])
score_mean = pd.DataFrame(df4.mean(axis=1).round(1), columns=['平均分'])

print(df4)
print(score_mean)
# the approaches below are alternatives; run only one of them on a fresh df4
# assignment
df4["平均分"] = score_mean
# method 1: insert
df4.insert(loc=3, column='平均分', value=score_mean)
# method 2: concat
result = pd.concat([df4, score_mean], axis=1)
# method 3: merge
result = pd.merge(
    left=df4,
    right=score_mean,
    left_index=True,
    right_index=True
)
7. Data cleaning
df = pd.DataFrame(data={'color': ['red', 'blue', 'red', 'green', 'blue', None, 'red'],
                        'price': [10, 20, 10, 15, 20, 0, np.NaN]})
# 1. filter duplicates
df.duplicated()                 # flag duplicated rows
df.drop_duplicates()            # drop duplicated rows

# 2. filter missing values
df.isnull()                     # True where a value is missing, False otherwise
df.dropna(how='any')            # drop rows containing missing values
df.fillna(value=1111)           # fill missing values

# 3. drop specific rows or columns
del df['color']                          # delete a column in place
df.drop(labels=['price'], axis=1)        # drop a given column
df.drop(labels=[0, 1, 5], axis=0)        # drop given rows

# 4. the filter method
df = pd.DataFrame(np.array(([3, 7, 1], [2, 8, 256])),
                  index=['dog', 'cat'],
                  columns=['China', 'America', 'France'])
df.filter(items=['China', 'France'])
# select column labels matching a regular expression
df.filter(regex='a$', axis=1)
# select row labels containing 'og'
df.filter(like='og', axis=0)

# 5. filter outliers
df2 = pd.DataFrame(data=np.random.randn(10000, 3))      # normally distributed data
# 3-sigma rule, where sigma is the standard deviation; abs() catches both tails
cond = (df2.abs() > 3 * df2.std()).any(axis=1)
index = df2[cond].index                                  # row index of the outliers
df2.drop(labels=index, axis=0)                           # drop those rows
8. replace
df.replace(3, 1024)                   # replace 3 with 1024
df.replace([0, 7], 2048)              # replace 0 and 7 with 2048
df.replace({0: 512, np.nan: 998})     # replace according to a dict of {old: new}
df.replace({'Python': 2}, -1024)      # in the Python column only, replace 2 with -1024
9. Computation
Basic statistics
df = pd.DataFrame(data=np.random.randint(0, 100, size=(20, 3)),
                  index=list('ABCDEFHIJKLMNOPQRSTU'),
                  columns=['Python', 'Tensorflow', 'Keras'])
# 1. basic statistics
df.count()                      # number of non-NA values
df.max(axis=0)                  # maximum along axis 0, i.e. per column
df.min()                        # minimum, axis 0 by default
df.median()                     # median
df.sum()                        # sum
df.mean(axis=1)                 # mean along axis 1, i.e. per row
df.quantile(q=[0.2, 0.4, 0.8])  # quantiles
df.describe()                   # summary of numeric columns: count, mean, std, min, quartiles, max
Index labels and positions
df['Python'].argmin()           # position of the minimum value
df['Keras'].argmax()            # position of the maximum value
df.idxmax()                     # index label of the maximum value
df.idxmin()                     # index label of the minimum value
More statistics
df['Python'].value_counts()     # count occurrences of each value
df['Keras'].unique()            # unique values
df.cumsum()                     # cumulative sum
df.cumprod()                    # cumulative product
df.std()                        # standard deviation
df.var()                        # variance
df.cummin()                     # cumulative minimum
df.cummax()                     # cumulative maximum
df.diff()                       # difference between consecutive rows
df.pct_change()                 # percentage change
Advanced statistics
df.cov()                        # covariance matrix of the columns
df['Python'].cov(df['Keras'])   # covariance between Python and Keras
df.corr()                       # correlation coefficients between all columns
df.corrwith(df['Tensorflow'])   # correlation of each column with a single column
10. Sorting
df = pd.DataFrame(data=np.random.randint(0, 30, size=(30, 3)),
                  index=list('qwertyuioijhgfcasdcvbnerfghjcf'),
                  columns=['Python', 'Keras', 'Pytorch'])
# 1. sort by index or column labels
df.sort_index(axis=0, ascending=True)    # sort by row index, ascending
df.sort_index(axis=1, ascending=False)   # sort by column labels, descending
# 2. sort by values
df.sort_values(by=['Python'])            # sort by the Python column
df.sort_values(by=['Python', 'Keras'])   # sort by Python first, then Keras

# 3. largest / smallest n rows
df.nlargest(10, columns='Keras')         # 10 rows with the largest Keras values
df.nsmallest(5, columns='Python')        # 5 rows with the smallest Python values
11. Binning
df = pd.DataFrame(data=np.random.randint(0, 150, size=(100, 3)),
                  columns=['Python', 'Tensorflow', 'Keras'])
# 1. equal-width binning
pd.cut(df.Python, bins=3)
# binning with explicit bin edges
pd.cut(df.Keras,                                # data to bin
       bins=[0, 60, 90, 120, 150],              # bin edges
       right=False,                             # left-closed, right-open intervals
       labels=['不及格', '中等', '良好', '优秀'])  # category labels
# 2. equal-frequency binning
pd.qcut(df.Python, q=4,                         # 4 quantile bins
        labels=['差', '中', '良', '优'])          # category labels
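To see how many rows land in each bin, value_counts on the result works well. A usage sketch for the explicit-edges cut above:

bins = pd.cut(df.Keras, bins=[0, 60, 90, 120, 150], right=False,
              labels=['不及格', '中等', '良好', '优秀'])
bins.value_counts()       # number of rows in each bin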
12. Grouping and aggregation
df = pd.DataFrame(data={
    'sex': np.random.randint(0, 2, size=300),          # 0 = 女, 1 = 男 (see the mapping below)
    'class': np.random.randint(1, 9, size=300),        # classes 1 to 8
    'Python': np.random.randint(0, 151, size=300),     # Python score
    'Keras': np.random.randint(0, 151, size=300),      # Keras score
    'Tensorflow': np.random.randint(0, 151, size=300),
    'Java': np.random.randint(0, 151, size=300),
    'C++': np.random.randint(0, 151, size=300)})

def xx(obj):
    if obj == 0:
        return "女"
    else:
        return "男"

# method 1
df['sex'].apply(xx)
# method 2
df['sex'] = df['sex'].map({0: "女", 1: "男"})

# grouping
s = df.groupby(by="sex")
for name, data in s:
    print('group name:', name)
    print('data:', data)
df['Python'].groupby([df["class"], df['sex']])

# aggregation after grouping
# group by sex, aggregate the other columns with their means
df.groupby(by='sex').mean().round(1)                   # keep 1 decimal place
# group by class and sex, take the max of Python and Keras
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].max()
# group by class and sex and count: number of students per class and sex
df.groupby(by=['class', 'sex']).size()
# descriptive statistics per group
df.groupby(by=['class', 'sex']).describe()

# apply on groups
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].apply(np.mean).round(1)

# agg
# apply several aggregations after grouping
df.groupby(by=['class', 'sex'])[['Tensorflow', 'Keras']].agg([np.max, np.min, pd.Series.count])
# apply different aggregations to different columns
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].agg(
    {'Python': [('最大值', np.max), ('最小值', np.min)],
     'Keras': [('计数', pd.Series.count), ('中位数', np.median)]})
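Since pandas 0.25 the same renamed aggregations can also be written with named aggregation, which reads a bit more cleanly than the dict-of-tuples form. A sketch; the output column names here are my own:

df.groupby(by=['class', 'sex']).agg(
    python_max=('Python', 'max'),
    python_min=('Python', 'min'),
    keras_count=('Keras', 'count'),
    keras_median=('Keras', 'median'),
)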