Python DataFrame 数据格式常用操作(备份)

# Before reading
import numpy as np
import pandas as pd
# While reading
a = pd.read_csv("d:/data/111.csv", encoding="GBK")  # read a GBK-encoded file
b = pd.read_csv("d:/data/222.csv")  # read a UTF-8 CSV file
# Force the 'id' column to be parsed with str so its values stay text.
# `converters` takes a dict mapping column -> callable, hence 'id': str.
# NOTE: `path` is not defined in this snippet; set it before running this line.
c = pd.read_csv(path + 'dat/import.csv', converters={'id': str})
# After reading
dfname.columns = ['kind', 'd1-1', 'd1-2', 'd1-3']  # rename the columns
dat = dat.drop('colname', axis=1)  # delete a single column
# Select by position (data filtering): row positions first, then column positions.
x1 = x.iloc[[12, 13, 19, 21],
            [1, 2, 4, 5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 20, 21, 22]]
# Filter duplicate rows, keeping the first occurrence; data1 itself is not modified.
result = data1.drop_duplicates(keep='first')
# Row-wise concatenation (similar to R's rbind). DataFrame.append was removed in
# pandas 2.0, so pd.concat is used; ignore_index=False keeps each frame's own index.
result = pd.concat([a, b, c, d, e, f, g, h, i, j, k, l, m], ignore_index=False)
df1_nona = df1.dropna()  # drop rows that contain missing values
# Work on an independent copy; the original snippet used df1_nona_cp without
# ever defining it. NOTE(review): assumed to be a copy of df1_nona -- confirm.
df1_nona_cp = df1_nona.copy()
df1_nona_cp.reset_index(drop=True, inplace=True)  # reset the index, discarding the old one
# Remove rows that contain negative values in certain columns
# Flag negative cells in every column from the third one onward, blank them to
# NaN, then let dropna() discard each row that now holds a missing value.
# Requires numpy to be imported as np.
negative_cells = df1_nona_cp.iloc[:, 2:] < 0
df1_nona_cp[negative_cells] = np.nan
df1_nona_cp = df1_nona_cp.dropna()
# Grouping
df1_gp = df1.groupby('nyr')  # group df1 by the 'nyr' column
# Per-group count for every column, stored in df1_gp_count. In the original this
# statement sat inside the trailing comment of the line above and never ran.
df1_gp_count = df1_gp.count()
# 0/1 (dummy-variable) encoding
# Keep the first four characters of 'colm' in a new column 'colmn', then drop 'colm'.
df1_nona_cp['colmn'] = df1_nona_cp['colm'].apply(lambda x: x[:4])
df1_nona_cp = df1_nona_cp.drop(['colm'], axis=1)
# Take 'y' and 'colmn' from the working frame. The original indexed `colm`,
# which is a column name, not a defined DataFrame, and raised NameError.
af = df1_nona_cp[['y', 'colmn']]
# One indicator column per distinct 'colmn' value, named 'colmn_<value>'.
dumm = pd.get_dummies(af['colmn'], prefix='colmn')
# Keep only the chosen indicators; test beforehand which values are useful.
dumm = dumm[['colmn_1', 'colmn_3', 'colmn_8']]
df1_nona_cp = df1_nona_cp.join(dumm)  # attach the indicators by index
del dumm, af  # remove the temporaries

 

posted @ 2021-05-09 16:50  Shilo  阅读(387)  评论(0)  编辑  收藏  举报