pandas 1 基本介绍

import numpy as np
import pandas as pd

pd.Series() 构造数据

# Build a one-dimensional Series from a plain Python list. No index is
# given, so the rows are labelled 0..5; the integers are displayed as floats
# next to the NaN entry (dtype float64 in the sample output).
s = pd.Series(data=[1, 3, 5, np.nan, 44, 1])

print(s)

# Sample output:
# 0     1.0
# 1     3.0
# 2     5.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

pd.date_range() 生成日期索引

# Two consecutive dates starting at 2019-02-25; the sample output shows the
# resulting DatetimeIndex with daily frequency (freq='D').
dates = pd.date_range(start='20190225', periods=2)

print(dates)

# Sample output:
# DatetimeIndex(['2019-02-25', '2019-02-26'], dtype='datetime64[ns]', freq='D')

pd.DataFrame() 构造数据（指定 index 和 columns）

# 2x4 frame of random draws from np.random.randn: rows are labelled by the
# `dates` index built earlier, columns are named a-d.
df = pd.DataFrame(
    data=np.random.randn(2, 4),
    index=dates,
    columns=list('abcd'),
)

print(df)

# Sample output (the numbers are random and differ on every run):
#                    a         b         c         d
# 2019-02-25  1.236639 -0.918432 -0.211460  1.834082
# 2019-02-26  1.191895 -1.680464  0.863866  0.171246

pd.DataFrame() 构造数据（不指定 index 和 columns，使用默认的整数标签）

# 3x4 frame holding 0..11 in row-major order. No index/columns are passed,
# so rows and columns get the default integer labels shown in the sample
# output. (The closing parenthesis of pd.DataFrame(...) was missing, which
# made this statement a SyntaxError.)
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))

print(df1)

# Sample output:
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

pd.DataFrame() 构造数据（通过字典逐列指定）

# Build a frame column by column: every keyword below becomes one column.
# The scalar entries (A, B, F) appear repeated on all five rows in the
# sample output, while C, D and E each supply five values.
df2 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(5)), dtype='float32'),
    D=np.full(5, 3, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train', 'yzn']),
    F='foo',
))

print(df2)

# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
# 4  1.0 2013-01-02  1.0  3    yzn  foo

常用属性与方法：df2.dtypes、df2.index、df2.columns、

df2.values、df2.describe()、df2.T、

df.sort_index(axis=1, ascending=False)、df2.sort_values(by='E')

# Show the dtype of every column of df2, one line per column.
print(df2.dtypes)

# Sample output:
# NOTE(review): the exact dtype strings can vary with the pandas version;
# confirm against the version in use.
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Row labels of df2.
print(df2.index)

# Sample output from an older pandas release. Int64Index was removed in
# pandas 2.0, where the same labels are displayed as
# Index([0, 1, 2, 3, 4], dtype='int64').
# Int64Index([0, 1, 2, 3, 4], dtype='int64')
# Column labels of df2.
print(df2.columns)

# Sample output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
# The frame's contents as an array, one inner row per frame row.
print(df2.values)

# Sample output (each row mixes floats, a Timestamp, an int and strings):
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'yzn' 'foo']]
# Summary statistics (count, mean, std, min, quartiles, max) per column.
print(df2.describe())

# Sample output: only columns A, C and D are summarised here.
# NOTE(review): newer pandas versions may also include the datetime column B;
# confirm against the version in use.
#          A    C    D
# count  5.0  5.0  5.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# Transpose of df2: the six columns A-F become rows and the five row labels
# 0-4 become columns.
print(df2.T)

# Sample output (the middle columns are elided by the display as "..."):
#                      0  ...                    4
# A                    1  ...                    1
# B  2013-01-02 00:00:00  ...  2013-01-02 00:00:00
# C                    1  ...                    1
# D                    3  ...                    3
# E                 test  ...                  yzn
# F                  foo  ...                  foo
# [6 rows x 5 columns]

# Sort df by its column labels in descending order (d, c, b, a); the data in
# each column is unchanged, only the column order differs.
print(df.sort_index(axis=1, ascending=False))

# Sample output. df holds random draws, so the numbers differ on every run;
# the values shown reuse the print(df) sample earlier in this file so that
# the reordering is visible.
#                    d         c         b         a
# 2019-02-25  1.834082 -0.211460 -0.918432  1.236639
# 2019-02-26  0.171246  0.863866 -1.680464  1.191895

# Sort df by its row labels (the dates) in descending order, latest first.
print(df.sort_index(axis=0, ascending=False))

# Sample output (same illustrative values as above, rows reversed):
#                    a         b         c         d
# 2019-02-26  1.191895 -1.680464  0.863866  0.171246
# 2019-02-25  1.236639 -0.918432 -0.211460  1.834082

# Sort the rows of df2 by the values in column E. Each row keeps its original
# label, which is why the index in the sample output reads 0, 2, 1, 3, 4.
print(df2.sort_values(by='E'))

# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
# 4  1.0 2013-01-02  1.0  3    yzn  foo

END

posted @ 2019-02-26 14:55  YangZhaonan  阅读(214)  评论(0)  编辑  收藏  举报