# python数据分析-05 DataFrame深入

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

#df1 = DataFrame({'城市':["北京","上海","广州"],'人口':[1000,2000,1500]})
# print(df1)
# 城市 人口
# 0 北京 1000
# 1 上海 2000
# 2 广州 1500

#方法1:
# df1["GDP"] = Series([1000,2000,1500])
# print(df1)
# 城市 人口 GDP
# 0 北京 1000 1000
# 1 上海 2000 2000
# 2 广州 1500 1500

#方法2:
# df2 = DataFrame({'城市':["北京","上海","广州"],'人口':[1000,2000,1500]},index=["A","B","C"])
# gdp_map = {"北京":1000,"上海":2000,"广州":1500}
# df2["GDP"] = df2["城市"].map(gdp_map)
# print(df2)
# 城市 人口 GDP
# A 北京 1000 1000
# B 上海 2000 2000
# C 广州 1500 1500

# df3 = DataFrame({'城市':["北京","上海","广州"],'人口':[1000,2000,1500]},index=["A","B","C"])#不再是默认index时,新增列的Series需要指定相同的index才能添加
# df3["GDP"] = Series([1000,2000,1500])
# print(df3)#无法填充进去
# 城市 人口 GDP
# A 北京 1000 NaN
# B 上海 2000 NaN
# C 广州 1500 NaN

# df3["GDP"] = Series([1000,2000,1500],index=["A","B","C"])
# print(df3)
# 城市 人口 GDP
# A 北京 1000 1000
# B 上海 2000 2000
# C 广州 1500 1500


#-----------------------------
#replace in Series
#s1 = Series(np.arange(10))
#print(s1)
# 0 0
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
# dtype: int32

#print(s1.replace(1,np.nan))
# 0 0.0
# 1 NaN
# 2 2.0
# 3 3.0
# 4 4.0
# 5 5.0
# 6 6.0
# 7 7.0
# 8 8.0
# 9 9.0
# dtype: float64

#print(s1.replace([1,2,3],[10,20,30]))
# 0 0
# 1 10
# 2 20
# 3 30
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
# dtype: int64


#-----------------------
#Series 和 DataFrame的简单数学运算
# s1 = Series([1,2,3],index=["A","B","C"])
# s2 = Series([4,5,6,7],index=["B","C","D","E"])
# print(s1)
# A 1
# B 2
# C 3
# dtype: int64
# print(s2)
# B 4
# C 5
# D 6
# E 7
# dtype: int64
# print(s1+s2)
# A NaN
# B 6.0
# C 8.0
# D NaN
# E NaN
# dtype: float64

#DataFrame的运算
# df1 = DataFrame(np.arange(4).reshape(2,2),index=["A","B"],columns=["BJ","SH"])
# print(df1)
# BJ SH
# A 0 1
# B 2 3
# df2 = DataFrame(np.arange(9).reshape(3,3),index=["A","B","C"],columns=["BJ","SH","GZ"])
# print(df2)
# BJ SH GZ
# A 0 1 2
# B 3 4 5
# C 6 7 8
# print(df1+df2)
# BJ GZ SH
# A 0.0 NaN 2.0
# B 5.0 NaN 7.0
# C NaN NaN NaN

# df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]],index=["A","B","C"],columns=["c1","c2","c3"])
# print(df3)
# c1 c2 c3
# A 1 2 3.0
# B 4 5 NaN
# C 7 8 9.0
# print(df3.sum())
# c1 12.0
# c2 15.0
# c3 12.0
# dtype: float64
# print(df3.sum(axis=1))
# A 6.0
# B 9.0
# C 24.0
# dtype: float64
# print(df3.min())
# c1 1.0
# c2 2.0
# c3 3.0
# dtype: float64
# print(df3.max())
# c1 7.0
# c2 8.0
# c3 9.0
# dtype: float64
# print(df3.describe())
# c1 c2 c3
# count 3.0 3.0 2.000000
# mean 4.0 5.0 6.000000
# std 3.0 3.0 4.242641
# min 1.0 2.0 3.000000
# 25% 2.5 3.5 4.500000
# 50% 4.0 5.0 6.000000
# 75% 5.5 6.5 7.500000
# max 7.0 8.0 9.000000


#-----------------------------
#Series和DataFrame的排序
# s1 = Series(np.random.randn(10))
# print(s1)
# 0 -1.745069
# 1 -3.339463
# 2 2.245615
# 3 0.201136
# 4 -0.115314
# 5 -0.425709
# 6 -1.037263
# 7 0.015670
# 8 -0.514211
# 9 -0.122862
# dtype: float64
# print(s1.values)
# [-0.46066427 -0.01673619 -0.79758999 -0.99447067 -1.2554336 0.95775716
# -0.98716949 0.81775325 -0.95819146 -0.38062781]
#print(s1.index)#RangeIndex(start=0, stop=10, step=1)
# s2 = s1.sort_values()
# print(s2)
# 3 -1.533961
# 1 -0.777431
# 5 -0.587565
# 2 -0.463069
# 7 -0.257701
# 0 -0.037266
# 6 0.062657
# 9 0.149767
# 8 0.245388
# 4 2.024740
# dtype: float64
# s2 = s1.sort_values(ascending=False)
# print(s2)
# 1 1.905997
# 6 0.369854
# 0 0.346478
# 2 0.283084
# 3 0.152866
# 4 0.145149
# 5 -0.362064
# 8 -0.627749
# 7 -0.738645
# 9 -0.905832
# dtype: float64
# print(s2.sort_index())
# 0 0.250688
# 1 -0.005753
# 2 0.818747
# 3 1.074309
# 4 0.057101
# 5 -1.576862
# 6 -1.358057
# 7 -0.774541
# 8 1.260600
# 9 0.028084
# dtype: float64


# ---------------------------------------------------------------
# Sorting a DataFrame
#
# NOTE: no random seed is set, so every run prints different numbers.
# The sample outputs quoted in the comments below were captured from
# separate runs, which is why their values do not match one another.
# ---------------------------------------------------------------

# 8 rows x 5 columns of standard-normal samples, columns labelled A..E,
# rows carrying the default integer index 0..7.
df1 = DataFrame(np.random.randn(8, 5), columns=list("ABCDE"))
print(df1)
# Sample output:
#           A         B         C         D         E
# 0  1.301407  0.079596 -0.324598 -0.489004 -0.319954
# 1  1.627349 -1.848241 -1.535149  0.616749 -0.581343
# 2 -1.599599  0.177486  0.413103 -0.121707 -0.771692
# 3 -0.346563  2.376872 -0.299881 -0.038205 -1.101628
# 4  2.000585 -0.087473  1.679934 -1.520698 -0.037990
# 5 -0.622608  0.178647  0.511137  0.001924  1.104219
# 6  0.680216  0.616194  0.492893 -1.495716 -2.129312
# 7  0.769310 -0.425242  0.270568 -1.340633 -0.507089

# Sorting a single column yields a Series in ascending value order; each
# value keeps the row label it had in df1.
print(df1["A"].sort_values())
# Sample output:
# 0   -0.781176
# 5   -0.699767
# 4   -0.257146
# 6   -0.168928
# 2   -0.160794
# 1    0.348743
# 3    1.015523
# 7    1.750817
# Name: A, dtype: float64

# Sorting the frame by column A reorders whole rows (ascending in A).
# The sorted frame is kept as df2 so it can be re-sorted by index below.
df2 = df1.sort_values(by="A")
print(df2)
# Sample output:
#           A         B         C         D         E
# 7 -1.667484  1.052349 -0.786262  1.515977 -1.663600
# 1 -0.755957 -0.748133 -0.078783  1.221847  1.087867
# 0 -0.624164 -0.225844  0.146987  0.209596 -1.327463
# 5 -0.362764  0.958340  0.580041 -1.062712  0.233652
# 4 -0.184361  0.924434  0.304635  1.863528  0.775122
# 6  0.406105  0.030612 -1.115804  2.543703 -0.234756
# 3  0.657304  1.464882  0.091570 -1.226326 -1.272059
# 2  2.092520 -0.210072 -0.693642  0.152570  0.659520

# Sorting df2 by its index restores the original row order, so this
# prints the same table as df1.
print(df2.sort_index())
# Sample output:
#           A         B         C         D         E
# 0 -0.612644 -0.795620  1.621510 -1.316650  1.504513
# 1 -1.770057 -1.740721  2.078625 -1.738596  0.345799
# 2  0.697535  1.126456  0.591017  0.272984  1.004823
# 3  1.323213  0.630537  1.063169 -0.682980  0.630861
# 4  0.292257 -0.683437 -2.204945 -0.997271  0.535046
# 5  1.441142  0.637664  0.801728 -0.249832  2.079914
# 6 -0.647377  0.078151 -0.649099 -0.360512  0.692393
# 7  0.333072  1.713874  0.672938  0.130204 -1.050239
# posted @ 2019-07-04 10:38  nikecode  阅读(345)  评论(0)  编辑  收藏  举报