pandas入门:pandas的数据结构介绍
Series
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# A Series is a one-dimensional, array-like object: a sequence of values (of
# any NumPy dtype) paired with a sequence of data labels, called its index.
obj = pd.Series(data=[4, 7, -5, 3])
print(obj)
'''
0 4
1 7
2 -5
3 3
dtype: int64
'''
# Labels are printed on the left and values on the right. The ``values`` and
# ``index`` attributes expose the underlying array and the index object.
obj_values = obj.values
obj_labels = obj.index
print(obj_values) # [ 4 7 -5 3]
print(obj_labels) # RangeIndex(start=0, stop=4, step=1)
# Build a Series whose index labels each data point explicitly.
obj2 = pd.Series(data=[4, 7, -5, 1], index=['d', 'b', 'a', 'c'])
print(obj2)
'''
d 4
b 7
a -5
c 1
dtype: int64
'''
print(obj2.index) # Index(['d', 'b', 'a', 'c'], dtype='object')
# Unlike a plain NumPy array, a Series lets you select (or assign) a single
# value or a group of values by label.
print(obj2.loc['a']) # -5
obj2.loc['d'] = 6
wanted_labels = ['c', 'a', 'd']
print(obj2.loc[wanted_labels])
'''
c 1
a -5
d 6
dtype: int64
'''
# Boolean filtering, scalar multiplication and NumPy ufuncs all keep the
# label-to-value link intact.
positive_mask = obj2 > 0
print(obj2[positive_mask])
'''
d 6
b 7
c 1
dtype: int64
'''
doubled = obj2 * 2
print(doubled)
'''
d 12
b 14
a -10
c 2
dtype: int64
'''
exponentiated = np.exp(obj2)
print(exponentiated)
'''
d 403.428793
b 1096.633158
a 0.006738
c 2.718282
dtype: float64
'''
# Membership tests with ``in`` check the index labels, like keys of a dict.
print('b' in obj2) # True
print('e' in obj2) # False
# Build a Series from a dict: the keys become the index labels.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
print(obj3)
# NOTE(review): the sample below shows the keys sorted, as printed by an older
# pandas; newer versions appear to keep the dict's insertion order
# (Ohio, Texas, Oregon, Utah) -- confirm against the installed version.
'''
Ohio 35000
Oregon 16000
Texas 71000
Utah 5000
dtype: int64
'''
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print(obj4)
'''
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
'''
# The three entries of sdata whose keys match a label in ``states`` are placed
# at the matching positions. 'California' has no entry in sdata, so its value
# is NaN ("not a number"), which pandas uses to mark missing / NA values.
# The pandas functions isnull and notnull detect missing values.
missing_mask = pd.isnull(obj4)
print(missing_mask)
'''
California True
Ohio False
Oregon False
Texas False
dtype: bool
'''
present_mask = pd.notnull(obj4)
print(present_mask)
'''
California False
Ohio True
Oregon True
Texas True
dtype: bool
'''
# The same check is also available as a Series method.
print(obj4.isnull())
'''
California True
Ohio False
Oregon False
Texas False
dtype: bool
'''
# A key feature of Series: arithmetic automatically aligns the operands by
# index label; labels present on only one side yield NaN.
aligned_sum = obj3 + obj4
print(aligned_sum)
'''
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
'''
# Both the Series itself and its index carry a ``name`` attribute.
obj4.index.name = 'state'
obj4.name = 'population'
print(obj4)
'''
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
'''
# The index of a Series can be replaced in place by assigning a new sequence
# of labels to its ``index`` attribute.
new_labels = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj.index = new_labels
print(obj)
'''
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
'''
DataFrame
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# A DataFrame has both a row index and a column index; it can be thought of as
# a dict of Series that share one row index.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
print(frame)
# NOTE(review): the sample below shows the columns sorted alphabetically, as
# printed by an older pandas; newer versions appear to keep the dict's
# insertion order (state, year, pop) -- confirm against the installed version.
'''
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002
'''
# When a column sequence is given, the DataFrame's columns follow that order.
frame1 = DataFrame(data, columns=['year', 'state', 'pop'])
print(frame1)
'''
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
'''
# A requested column that is not present in the data is filled with NA values.
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
print(frame2)
'''
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
'''
print(frame2.columns) # Index(['year', 'state', 'pop', 'debt'], dtype='object')
# A column can be retrieved as a Series either with dict-style indexing or as
# an attribute.
print(frame2['state'])
'''
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
Name: state, dtype: object
'''
print(frame2.year)
'''
one 2000
two 2001
three 2002
four 2001
five 2002
Name: year, dtype: int64
'''
# Retrieve a row by its label with ``.loc``. The original used ``.ix``, which
# was deprecated in pandas 0.20 and removed in pandas 1.0 (AttributeError).
print(frame2.loc['three'])
'''
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
'''
# Columns can be modified by assignment: here the 'debt' column is set first
# to a scalar (broadcast to every row) and then to an array of values.
frame2['debt'] = 16.5
print(frame2)
'''
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
'''
debt_values = np.arange(5.)
frame2['debt'] = debt_values
print(frame2)
'''
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
'''
# A list or array assigned to a column must match the DataFrame's length.
# A Series is instead aligned exactly on the DataFrame's index, and every row
# without a matching label is filled with a missing value.
val = Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)
frame2['debt'] = val
print(frame2)
'''
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
'''
# Assigning to a column that does not exist creates it; the ``del`` keyword
# removes a column.
is_ohio = frame2['state'] == 'Ohio'
frame2['eastern'] = is_ohio
print(frame2)
'''
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
'''
del frame2['eastern']
print(frame2)
'''
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
'''
# Nested dict: a dict of dicts can also be passed to DataFrame.
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
print(frame3)
# The outer dict keys become the columns and the inner keys become the row
# index.
# NOTE(review): the samples in this block show the row labels sorted; the row
# order (and therefore the positional slices further down) may differ on
# newer pandas versions -- confirm against the installed version.
'''
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
'''
# Transpose: swap rows and columns.
transposed = frame3.transpose()
print(transposed)
'''
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
'''
# Specify the row index explicitly; labels with no data become all-NaN rows.
explicit_years = [2001, 2002, 2003]
print(DataFrame(pop, index=explicit_years))
'''
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN
'''
# A dict of Series is handled the same way: each Series becomes a column.
# Here: the first two rows of 'Nevada' and all but the last row of 'Ohio'.
nevada_head = frame3['Nevada'].iloc[:2]
ohio_head = frame3['Ohio'].iloc[:-1]
pdata = {"Nevada": nevada_head, 'Ohio': ohio_head}
print(DataFrame(pdata))
'''
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
'''
# If the ``name`` attributes of a DataFrame's index and columns are set, they
# are displayed as well.
frame3.columns.name = 'state'
frame3.index.name = 'year'
print(frame3)
'''
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
'''
# As with Series, ``values`` returns the data, here as a 2-D ndarray.
print(frame3.values)
'''
[[nan 1.5]
[2.4 1.7]
[2.9 3.6]]
'''
索引对象
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# Every Series and DataFrame axis is backed by an Index object holding the
# axis labels.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index) # Index(['a', 'b', 'c'], dtype='object')
tail_labels = index[1:]
print(tail_labels) # Index(['b', 'c'], dtype='object')
# Index objects are immutable, so one Index can be passed to a new Series
# directly; the identity check below reports whether that Series holds the
# very same Index object.
index = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=index)
print(obj2.index is index) # True
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
# ``in`` works on an Index much like on a set of labels.
has_ohio_column = 'Ohio' in frame3.columns
has_2003_row = 2003 in frame3.index
print(has_ohio_column) # True
print(has_2003_row) # False
Index的方法和属性
import pandas as pd
from pandas import Series

# Demonstrates append/diff/intersection/union/isin/delete/drop and the
# monotonic/unique checks. Some calls act on the Series (its values), others
# on the Series' Index (its labels); each comment says which.
obj1 = Series([1, 6, 5, 9], index=['a', 'b', 'c', 'd'])
obj2 = Series([2, 10, 6], index=['a', 'a', 'b'])
obj3 = Series([2, 10, 10])
# Concatenate two Series end to end, producing a new Series. Series.append was
# removed in pandas 2.0; pd.concat is the supported replacement.
# (To join only the labels, use obj1.index.append(obj2.index).)
obj_append = pd.concat([obj1, obj2])
print(obj_append)
'''
a 1
b 6
c 5
d 9
a 2
a 10
b 6
dtype: int64
'''
# Series.diff is NOT a set difference: each element becomes its value minus
# the previous value, so the first element is NaN.
# (The set difference of labels is obj1.index.difference(obj2.index).)
obj_diff = obj1.diff()
print(obj_diff)
'''
a NaN
b 5.0
c -1.0
d 4.0
dtype: float64
'''
# Index.intersection: labels present in both indexes.
# NOTE(review): the sample output keeps the duplicate 'a' from obj2.index;
# newer pandas versions may return only unique labels -- confirm.
obj_intersection = obj1.index.intersection(obj2.index)
print(obj_intersection) # Index(['a', 'a', 'b'], dtype='object')
# Index.union: labels present in either index.
obj_union = obj1.index.union(obj2.index)
print(obj_union) # Index(['a', 'a', 'b', 'c', 'd'], dtype='object')
# Index.isin: boolean array telling, for each label of obj1.index, whether it
# is contained in the passed collection.
obj_isin = obj1.index.isin(obj2.index)
print(obj_isin) # [ True True False False]
# Index.delete: new Index with the element at the given position removed.
obj_delete = obj1.index.delete(2)
print(obj_delete) # Index(['a', 'b', 'd'], dtype='object')
# Series.drop: new Series without the entries for the given labels (here the
# row labelled 'a'). Index.drop does the same for an Index alone.
obj_drop = obj1.drop(['a'])
print(obj_drop)
'''
b 6
c 5
d 9
dtype: int64
'''
# is_monotonic_increasing is True when every element is >= the previous one;
# on a Series it checks the values. The old ``is_monotonic`` alias was removed
# in pandas 2.0.
print(obj3.is_monotonic_increasing) # True
print(obj2.is_monotonic_increasing) # False
# is_unique is True when there are no duplicates; on a Series it checks the
# values (obj2's values are unique even though its index labels are not).
print(obj3.is_unique) # False
print(obj2.is_unique) # True
# unique: array of the distinct values, in order of first appearance.
print(obj3.unique()) # [ 2 10]
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/12955094.html