Pandas Dataframe的一些操作
读取txt
# ' 和 " 没区别,路径为 \ ,./ 表示为当前目录下路径 df = pd.read_csv("./input/"+year+"ha.txt",skiprows =13,skipfooter=2,header=None,sep=" ") #skiprows=[0,2,5]跳过指定行 skiprows=[i for i in range(1,3)]跳过除首行外前n行 #skipfooter=2 跳过末尾n行 #skiprows= lambda x: logic(x) 跳过符合条件的行 其中logic为设定的筛选函数,可参照如下设置 def logic(index): if index % 3 == 0: return True return False
读取dbf
import pandas as pd from simpledbf import Dbf5 #未下载需下载pip install simpledbf dbf = Dbf5(r"D:\01DATA\dem.dbf") df = dbf.to_dataframe() df['Id']=df.index
#删除列 del df[0] del df[5]
del df['name'] df=df.iloc[0:221]#选择前221行 df.columns = ['location_code','Hub','Autho']#给列重命名
df3.columns=['Eigenvec'] df3=df3['Eigenvec'].str.split(' ', expand=True)#在某列内部以空格进行分割 df3['new'] = df3.index#创建新列 和索引列相同 loca = df3['new'].str.split(' ', expand=True)
result = pd.merge(df1, df2, how='inner', on=['location_code'])#公共列为'location_code' 取交集 result = pd.merge(result, df3, how='inner', on=['location_code'])
result.to_excel('./output/worldtrade_'+year+'_all.xlsx',sheet_name='Sheet')#导出
编码错误
df_w = pd.read_table(r'C:\Users\lab\list.txt', encoding='ISO-8859-1')#\加r #encoding = 'utf-8'
将多个dataframe存到一个excel中不同sheet中:
from pandas import ExcelWriter fields=['degree','weighted degree','Eigenvec','Autho','Hub'] years=['1992','2004','2009','2015','2018'] for year in years: writer = ExcelWriter('./output/n_'+year+'.xlsx') #writer = pd.ExcelWriter('./output/n_1992.xlsx', engine='xlsxwriter') for field in fields: df = pd.read_excel('./output/'+year+'.xlsx') #field='degree' df_5=df.sort_values(by=field, ascending=False).head(5)#对某列按照字段降序排列 df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]#选出符合条件的rows df_=pd.concat([df_5,df_n]) df_=df_.sort_values(by=field, ascending=False) df1=df_[['Label',field]]#提取出列 df1['Index']=df1[field]/df1[field].max()*100 df2 = df1.drop_duplicates(subset='Label')#除去重复的行 根据Label属性列 df2.to_excel(writer,sheet_name=field)#sheet的名字和field一致 writer.save()#注意保存放置的位置 需要都存好后再保存
根据某一类进行排序,相等值序列相同,同时需要某些固定的值:
from pandas import ExcelWriter fields=['degree','weighted degree','Eigenvec','Autho','Hub'] years=['1992','2004','2009','2015','2018'] for year in years: writer = ExcelWriter('./output5/sort/n_'+year+'.xlsx') #writer = pd.ExcelWriter('./output5/sort/n_1992.xlsx', engine='xlsxwriter') for field in fields: df = pd.read_excel('./output5/'+year+'.xlsx') dfs=df.sort_values(by=field, ascending=False).reset_index(drop=True)#更新index dfs['sort_index']=dfs.index+1 dfs=dfs[['Label',field,'sort_index']] for ind in dfs.index: if ind > 0: if dfs[field][ind]==dfs[field][ind-1]: dfs['sort_index'][ind]=dfs['sort_index'][ind-1] dfs['sort_index'][ind+1:]=dfs['sort_index'][ind+1:]-1 df_5=df.sort_values(by=field, ascending=False).head(5) df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])] df_=pd.concat([df_5,df_n]) df_=df_.sort_values(by=field, ascending=False) df1=df_[['Label',field]] df1['Index/%']=df1[field]/df1[field].max()*100 df2 = df1.drop_duplicates(subset='Label') df2['sort_index'] = df2['Label'].map(dfs.set_index('Label')['sort_index'])#匹配dfs(多)中的'sort_index',匹配字段为Label https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe df2 = df2[[field, 'sort_index', 'Label','Index/%']]#按照想的给列排序导出 df2['Index/%']=df2['Index/%'].round(decimals=2)#对这一列保留小数点后两位小数 df2.to_excel(writer,sheet_name=field) writer.save()
将一文件夹下所有的xlsx文件转为csv以方便在ArcGIS中使用:
import pandas as pd import glob # folder C:\Users\admin\ for filepath in glob.iglob(r'C:\Users\admin\*.xlsx'): data_xlsx = pd.read_excel(filepath, 'Sheet1', index_col=None) data_xlsx.to_csv(filepath[:-5]+'.csv', encoding='utf-8') #filepath[:-5]表示文件路径去除后五位,即'.xlsx'后的文件名 print(filepath)
其他遍历文件下所有文件的代码可移步参考:https://www.newbedev.com/python/howto/how-to-iterate-over-files-in-a-given-directory/
按规则选择行
df.loc[df['column_name'] == some_value] df.loc[df['column_name'].isin(some_values)] df.loc[~df['column_name'].isin(some_values)] df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]
创建dataframe 并增加新的行
import pandas as pd # create an Empty DataFrame # object With column names only df = pd.DataFrame(columns = ['Name', 'Articles', 'Improved']) print(df) # append rows to an empty DataFrame df = df.append({'Name' : 'Ankit', 'Articles' : 97, 'Improved' : 2200}, ignore_index = True) df = df.append({'Name' : 'Aishwary', 'Articles' : 30, 'Improved' : 50}, ignore_index = True) df = df.append({'Name' : 'yash', 'Articles' : 17, 'Improved' : 220}, ignore_index = True) df
统计某列数据各个值出现的频次:
import pandas as pd import glob from pandas import ExcelWriter dft = pd.DataFrame(columns = ['USA', 'CHN', 'JPN','IND','ZAF']) fields=['maxtrade_partner_code','maxex_partner_code','maxim_partner_code'] # folder C:\Users\admin\ writer = ExcelWriter('G:/out.xlsx') for field in fields: print(field) for filepath in glob.iglob(r'D:\1Money\ylh工作安排9-1\*.xlsx'):#遍历文件夹所有xlsx文件 print(filepath) df = pd.read_excel(filepath, 'Sheet 1', index_col=None) idx = pd.Index(df[field]) #索引 co=pd.DataFrame(idx.value_counts()) #统计频次转为 dataframe dft = dft.append({'USA':co.at['USA',field],'CHN' :co.at['CHN',field],'JPN':co.at['JPN',field] ,'IND' :co.at['IND',field],'ZAF':co.at['ZAF',field]},ignore_index = True) dft.to_excel(writer,sheet_name=field) dft.drop(dft.index, inplace=True) writer.save()
将dataframe中的负值替换为其他
df[df < 0] = np.nan
简单数据探索
import seaborn as sns import matplotlib.pyplot as plt f, ax = plt.subplots(figsize=(20, 7))#设置图片尺寸 sns.displot(df['RASTERVALU']);#对这一列数据频率直方图显示 sns.lineplot(x=df['Id'],y=df['RASTERVALU']);#折线图