Pandas Dataframe的一些操作

读取txt

# ' 和 " 没区别,路径为 \ ,./ 表示为当前目录下路径
df = pd.read_csv("./input/"+year+"ha.txt",skiprows =13,skipfooter=2,header=None,sep=" ")
#skiprows=[0,2,5]跳过指定行 skiprows=[i for i in range(1,3)]跳过除首行外前n行
#skipfooter=2 跳过末尾n行
#skiprows= lambda x: logic(x) 跳过符合条件的行 其中logic为设定的筛选函数,可参照如下设置
def logic(index):
    if index % 3 == 0:
       return True
    return False

 读取dbf

import pandas as pd
from simpledbf import Dbf5 #未下载需下载pip install simpledbf
dbf = Dbf5(r"D:\01DATA\dem.dbf")
df = dbf.to_dataframe()
df['Id']=df.index

 

 

#删除列
del df[0]
del df[5]
del df['name'] df=df.iloc[0:221]#选择前221行 df.columns = ['location_code','Hub','Autho']#给列重命名

 

df3.columns=['Eigenvec']
df3=df3['Eigenvec'].str.split('  ', expand=True)#在某列内部以空格进行分割
df3['new'] = df3.index#创建新列 和索引列相同
loca = df3['new'].str.split(' ', expand=True)

 

result = pd.merge(df1, df2, how='inner', on=['location_code'])#公共列为'location_code' 取交集
result = pd.merge(result, df3, how='inner', on=['location_code'])

 

result.to_excel('./output/worldtrade_'+year+'_all.xlsx',sheet_name='Sheet')#导出

 

编码错误

df_w = pd.read_table(r'C:\Users\lab\list.txt', encoding='ISO-8859-1')#\加r
#encoding = 'utf-8'

 

将多个dataframe存到一个excel中不同sheet中:

from pandas import ExcelWriter
fields=['degree','weighted degree','Eigenvec','Autho','Hub']
years=['1992','2004','2009','2015','2018']
for year in years:
    writer = ExcelWriter('./output/n_'+year+'.xlsx')
    #writer = pd.ExcelWriter('./output/n_1992.xlsx', engine='xlsxwriter')
    for field in fields:
        df = pd.read_excel('./output/'+year+'.xlsx')
        #field='degree'
        df_5=df.sort_values(by=field, ascending=False).head(5)#对某列按照字段降序排列
        df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]#选出符合条件的rows
        df_=pd.concat([df_5,df_n])  
        df_=df_.sort_values(by=field, ascending=False)
        df1=df_[['Label',field]]#提取出列
        df1['Index']=df1[field]/df1[field].max()*100
        df2 = df1.drop_duplicates(subset='Label')#除去重复的行 根据Label属性列
        df2.to_excel(writer,sheet_name=field)#sheet的名字和field一致
    writer.save()#注意保存放置的位置 需要都存好后再保存

 

根据某一类进行排序,相等值序列相同,同时需要某些固定的值:

from pandas import ExcelWriter
fields=['degree','weighted degree','Eigenvec','Autho','Hub']
years=['1992','2004','2009','2015','2018']
for year in years:
    writer = ExcelWriter('./output5/sort/n_'+year+'.xlsx')
    #writer = pd.ExcelWriter('./output5/sort/n_1992.xlsx', engine='xlsxwriter')
    for field in fields:
        df = pd.read_excel('./output5/'+year+'.xlsx')
        dfs=df.sort_values(by=field, ascending=False).reset_index(drop=True)#更新index
        dfs['sort_index']=dfs.index+1
        dfs=dfs[['Label',field,'sort_index']]
        for ind in dfs.index:
            if ind > 0:
                if dfs[field][ind]==dfs[field][ind-1]:
                    dfs['sort_index'][ind]=dfs['sort_index'][ind-1]
                    dfs['sort_index'][ind+1:]=dfs['sort_index'][ind+1:]-1
        df_5=df.sort_values(by=field, ascending=False).head(5)
        df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]
        df_=pd.concat([df_5,df_n])  
        df_=df_.sort_values(by=field, ascending=False)
        df1=df_[['Label',field]]
        df1['Index/%']=df1[field]/df1[field].max()*100
        df2 = df1.drop_duplicates(subset='Label')
        df2['sort_index'] = df2['Label'].map(dfs.set_index('Label')['sort_index'])#匹配dfs(多)中的'sort_index',匹配字段为Label https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe
        df2 = df2[[field, 'sort_index', 'Label','Index/%']]#按照想的给列排序导出
        df2['Index/%']=df2['Index/%'].round(decimals=2)#对这一列保留小数点后两位小数
        df2.to_excel(writer,sheet_name=field)
    writer.save()   

  

 

将一文件夹下所有的xlsx文件转为csv以方便在ArcGIS中使用:

import pandas as pd
import glob

#  folder C:\Users\admin\
for filepath in glob.iglob(r'C:\Users\admin\*.xlsx'):
    data_xlsx = pd.read_excel(filepath, 'Sheet1', index_col=None)
    data_xlsx.to_csv(filepath[:-5]+'.csv', encoding='utf-8')
    #filepath[:-5]表示文件路径去除后五位,即'.xlsx'后的文件名
    print(filepath)

其他遍历文件下所有文件的代码可移步参考:https://www.newbedev.com/python/howto/how-to-iterate-over-files-in-a-given-directory/

 

按规则选择行

df.loc[df['column_name'] == some_value]

df.loc[df['column_name'].isin(some_values)]
df.loc[~df['column_name'].isin(some_values)]


df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

  

 创建dataframe 并增加新的行

import pandas as pd 
  
# create an Empty DataFrame 
# object With column names only 
df = pd.DataFrame(columns = ['Name', 'Articles', 'Improved']) 
print(df) 
  
# append rows to an empty DataFrame 
df = df.append({'Name' : 'Ankit', 'Articles' : 97, 'Improved' : 2200},  
                ignore_index = True) 
df = df.append({'Name' : 'Aishwary', 'Articles' : 30, 'Improved' : 50}, 
                ignore_index = True) 
df = df.append({'Name' : 'yash', 'Articles' : 17, 'Improved' : 220}, 
               ignore_index = True) 
  
df 

  

统计某列数据各个值出现的频次:

import pandas as pd
import glob
from pandas import ExcelWriter

dft = pd.DataFrame(columns = ['USA', 'CHN', 'JPN','IND','ZAF'])
fields=['maxtrade_partner_code','maxex_partner_code','maxim_partner_code']
#  folder C:\Users\admin\
writer = ExcelWriter('G:/out.xlsx')

for field in fields:
    print(field)
    for filepath in glob.iglob(r'D:\1Money\ylh工作安排9-1\*.xlsx'):#遍历文件夹所有xlsx文件
        print(filepath)
        df = pd.read_excel(filepath, 'Sheet 1', index_col=None)
        idx = pd.Index(df[field]) #索引
        co=pd.DataFrame(idx.value_counts()) #统计频次转为 dataframe
        dft = dft.append({'USA':co.at['USA',field],'CHN' :co.at['CHN',field],'JPN':co.at['JPN',field] ,'IND' :co.at['IND',field],'ZAF':co.at['ZAF',field]},ignore_index = True)        
    dft.to_excel(writer,sheet_name=field)
    dft.drop(dft.index, inplace=True)
writer.save()

 

将dataframe中的负值替换为其他

df[df < 0] = np.nan 

  

 简单数据探索

import seaborn as sns
import matplotlib.pyplot as plt

f, ax = plt.subplots(figsize=(20, 7))#设置图片尺寸

sns.displot(df['RASTERVALU']);#对这一列数据频率直方图显示
sns.lineplot(x=df['Id'],y=df['RASTERVALU']);#折线图

  

 

posted @ 2020-08-29 16:44  icydengyw  阅读(585)  评论(0编辑  收藏  举报