Pandas Dataframe的一些操作
读取txt
1 2 3 4 5 6 7 8 9 | # ' 和 " 没区别,路径为 \ ,./ 表示为当前目录下路径 df = pd.read_csv( "./input/" + year + "ha.txt" ,skiprows = 13 ,skipfooter = 2 ,header = None ,sep = " " ) #skiprows=[0,2,5]跳过指定行 skiprows=[i for i in range(1,3)]跳过除首行外前n行 #skipfooter=2 跳过末尾n行 #skiprows= lambda x: logic(x) 跳过符合条件的行 其中logic为设定的筛选函数,可参照如下设置 def logic(index): if index % 3 = = 0 : return True return False |
读取dbf
1 2 3 4 5 | import pandas as pd from simpledbf import Dbf5 #未下载需下载pip install simpledbf dbf = Dbf5(r "D:\01DATA\dem.dbf" ) df = dbf.to_dataframe() df[ 'Id' ] = df.index |
1 2 3 4 5 | #删除列 del df[ 0 ] del df[ 5 ]<br> del df[ 'name' ] df = df.iloc[ 0 : 221 ] #选择前221行 df.columns = [ 'location_code' , 'Hub' , 'Autho' ] #给列重命名 |
1 2 3 4 | df3.columns = [ 'Eigenvec' ] df3 = df3[ 'Eigenvec' ]. str .split( ' ' , expand = True ) #在某列内部以空格进行分割 df3[ 'new' ] = df3.index #创建新列 和索引列相同 loca = df3[ 'new' ]. str .split( ' ' , expand = True ) |
1 2 | result = pd.merge(df1, df2, how = 'inner' , on = [ 'location_code' ]) #公共列为'location_code' 取交集 result = pd.merge(result, df3, how = 'inner' , on = [ 'location_code' ]) |
1 | result.to_excel( './output/worldtrade_' + year + '_all.xlsx' ,sheet_name = 'Sheet' ) #导出 |
编码错误
1 2 | df_w = pd.read_table(r 'C:\Users\lab\list.txt' , encoding = 'ISO-8859-1' ) #\加r #encoding = 'utf-8' |
将多个dataframe存到一个excel中不同sheet中:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | from pandas import ExcelWriter fields = [ 'degree' , 'weighted degree' , 'Eigenvec' , 'Autho' , 'Hub' ] years = [ '1992' , '2004' , '2009' , '2015' , '2018' ] for year in years: writer = ExcelWriter( './output/n_' + year + '.xlsx' ) #writer = pd.ExcelWriter('./output/n_1992.xlsx', engine='xlsxwriter') for field in fields: df = pd.read_excel( './output/' + year + '.xlsx' ) #field='degree' df_5 = df.sort_values(by = field, ascending = False ).head( 5 ) #对某列按照字段降序排列 df_n = df.loc[df[ 'Label' ].isin([ 'IND' , 'CHN' , 'RUS' , 'BEL' , 'ZAF' ])] #选出符合条件的rows df_ = pd.concat([df_5,df_n]) df_ = df_.sort_values(by = field, ascending = False ) df1 = df_[[ 'Label' ,field]] #提取出列 df1[ 'Index' ] = df1[field] / df1[field]. max () * 100 df2 = df1.drop_duplicates(subset = 'Label' ) #除去重复的行 根据Label属性列 df2.to_excel(writer,sheet_name = field) #sheet的名字和field一致 writer.save() #注意保存放置的位置 需要都存好后再保存 |
根据某一类进行排序,相等值序列相同,同时需要某些固定的值:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | from pandas import ExcelWriter fields = [ 'degree' , 'weighted degree' , 'Eigenvec' , 'Autho' , 'Hub' ] years = [ '1992' , '2004' , '2009' , '2015' , '2018' ] for year in years: writer = ExcelWriter( './output5/sort/n_' + year + '.xlsx' ) #writer = pd.ExcelWriter('./output5/sort/n_1992.xlsx', engine='xlsxwriter') for field in fields: df = pd.read_excel( './output5/' + year + '.xlsx' ) dfs = df.sort_values(by = field, ascending = False ).reset_index(drop = True ) #更新index dfs[ 'sort_index' ] = dfs.index + 1 dfs = dfs[[ 'Label' ,field, 'sort_index' ]] for ind in dfs.index: if ind > 0 : if dfs[field][ind] = = dfs[field][ind - 1 ]: dfs[ 'sort_index' ][ind] = dfs[ 'sort_index' ][ind - 1 ] dfs[ 'sort_index' ][ind + 1 :] = dfs[ 'sort_index' ][ind + 1 :] - 1 df_5 = df.sort_values(by = field, ascending = False ).head( 5 ) df_n = df.loc[df[ 'Label' ].isin([ 'IND' , 'CHN' , 'RUS' , 'BEL' , 'ZAF' ])] df_ = pd.concat([df_5,df_n]) df_ = df_.sort_values(by = field, ascending = False ) df1 = df_[[ 'Label' ,field]] df1[ 'Index/%' ] = df1[field] / df1[field]. max () * 100 df2 = df1.drop_duplicates(subset = 'Label' ) df2[ 'sort_index' ] = df2[ 'Label' ]. map (dfs.set_index( 'Label' )[ 'sort_index' ]) #匹配dfs(多)中的'sort_index',匹配字段为Label <a href="https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe" rel="noopener nofollow">https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe</a> df2 = df2[[field, 'sort_index' , 'Label' , 'Index/%' ]] #按照想的给列排序导出 df2[ 'Index/%' ] = df2[ 'Index/%' ]. round (decimals = 2 ) #对这一列保留小数点后两位小数 df2.to_excel(writer,sheet_name = field) writer.save() |
将一文件夹下所有的xlsx文件转为csv以方便在ArcGIS中使用:
1 2 3 4 5 6 7 8 9 | import pandas as pd import glob # folder C:\Users\admin\ for filepath in glob.iglob(r 'C:\Users\admin\*.xlsx' ): data_xlsx = pd.read_excel(filepath, 'Sheet1' , index_col = None ) data_xlsx.to_csv(filepath[: - 5 ] + '.csv' , encoding = 'utf-8' ) #filepath[:-5]表示文件路径去除后五位,即'.xlsx'后的文件名 print (filepath) |
其他遍历文件下所有文件的代码可移步参考:https://www.newbedev.com/python/howto/how-to-iterate-over-files-in-a-given-directory/
按规则选择行
1 2 3 4 5 6 7 | df.loc[df[ 'column_name' ] = = some_value] df.loc[df[ 'column_name' ].isin(some_values)] df.loc[~df[ 'column_name' ].isin(some_values)] df.loc[(df[ 'column_name' ] > = A) & (df[ 'column_name' ] < = B)] |
创建dataframe 并增加新的行
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | import pandas as pd # create an Empty DataFrame # object With column names only df = pd.DataFrame(columns = [ 'Name' , 'Articles' , 'Improved' ]) print (df) # append rows to an empty DataFrame df = df.append({ 'Name' : 'Ankit' , 'Articles' : 97 , 'Improved' : 2200 }, ignore_index = True ) df = df.append({ 'Name' : 'Aishwary' , 'Articles' : 30 , 'Improved' : 50 }, ignore_index = True ) df = df.append({ 'Name' : 'yash' , 'Articles' : 17 , 'Improved' : 220 }, ignore_index = True ) df |
统计某列数据各个值出现的频次:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | import pandas as pd import glob from pandas import ExcelWriter dft = pd.DataFrame(columns = [ 'USA' , 'CHN' , 'JPN' , 'IND' , 'ZAF' ]) fields = [ 'maxtrade_partner_code' , 'maxex_partner_code' , 'maxim_partner_code' ] # folder C:\Users\admin\ writer = ExcelWriter( 'G:/out.xlsx' ) for field in fields: print (field) for filepath in glob.iglob(r 'D:\1Money\ylh工作安排9-1\*.xlsx' ): #遍历文件夹所有xlsx文件 print (filepath) df = pd.read_excel(filepath, 'Sheet 1' , index_col = None ) idx = pd.Index(df[field]) #索引 co = pd.DataFrame(idx.value_counts()) #统计频次转为 dataframe dft = dft.append({ 'USA' :co.at[ 'USA' ,field], 'CHN' :co.at[ 'CHN' ,field], 'JPN' :co.at[ 'JPN' ,field] , 'IND' :co.at[ 'IND' ,field], 'ZAF' :co.at[ 'ZAF' ,field]},ignore_index = True ) dft.to_excel(writer,sheet_name = field) dft.drop(dft.index, inplace = True ) writer.save() |
将dataframe中的负值替换为其他
1 | df[df < 0 ] = np.nan |
简单数据探索
1 2 3 4 5 6 7 | import seaborn as sns import matplotlib.pyplot as plt f, ax = plt.subplots(figsize = ( 20 , 7 )) #设置图片尺寸 sns.displot(df[ 'RASTERVALU' ]); #对这一列数据频率直方图显示 sns.lineplot(x = df[ 'Id' ],y = df[ 'RASTERVALU' ]); #折线图 |
分类:
Python
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 记一次.NET内存居高不下排查解决与启示