pandas-基础数据结构

pandas-基础数据结构

数据结构

Pandas 的主要数据结构是 Series（一维数据）与DataFrame（二维数据）

无论是 numpy 中的 NaN 还是 Python 中的 None，在 pandas 中都以缺失数据 NaN 对待

Series

Series是一种类似于一维NumPy数组的对象，它由一组数据（各种NumPy数据类型）和与之相关的一组数据标签（即索引）组成。可以用index和values属性分别获取索引和值

pd.Series( data, index, dtype, name, copy)

data：一组数据(list,array,dict)。
index：数据索引标签，如果不指定，默认从 0 开始。
dtype：数据类型，默认会自己判断。
name：设置名称。

创建Series

第一种 list

obj = pd.Series([4, 7, -5, 3, 7, np.nan])
print(obj)

#0    4.0
#1    7.0
...
#5    NaN
#dtype: float64

第二种 array

# array
arr = np.arange(6)
s = pd.Series(arr)
print(s)

#0    0
#1    1
...
#5    5
#dtype: int32

第三种 dict

# dict
d = {'a':10,'b':20,'c':30,'d':40,'e':50}
s2 = pd.Series(d)
print(s2)

#a    10
#b    20
#c    30
#d    40
#e    50
#dtype: int64

# 指定行索引
s2 = pd.Series(0,index = list('abcdefhi'),dtype='int64') 
#a    0
#b    0
...
#i    0
#dtype: int64

常用操作

因为 pandas 是基于 numpy 构建的，所以具有 numpy 的很多特性，Series 的很多用法与 numpy 数组类似

	函数属性	描述
	`.name` `.rename(new_name)`	名称（属性，没有括号），重命名
	`.head(n)` `.tail(n)`	头几个数据，尾几个数据，默认5条`n`可选
	`.index`	查看索引，没有括号
	`.loc[]`	以标签为索引
	`.iloc[]`	以下标为索引
	`.reindex()`	按新的索引标签重新排列，原来没有的标签填充NaN
	`.sort_values()`	按值排序
	`.sort_index()`	按索引排序
	`.drop()`	删除

lst=[1,2,3,4,5,6,7]
s = pd.Series(lst,index=list("abcdefg"))
print(s)
print(s.head(1)) # 查看第1个数据
print(s.tail()) # 默认查看5条

索引

import pandas as pd
lst=[1,2,3]
s = pd.Series(lst,index=list("abc"))
print(s)
print(s.index) # 查看索引 
# Index(['a', 'b', 'c'], dtype='object')

# 标签索引和下标索引
# **标签索引**

import pandas as pd
lst=[1,2,3]
s = pd.Series(lst,index=list("abc"))
print(s)
print(s["a"]) 		# 索引index标签为"a"的元素,得到单个元素
print(s[["a","b"]]) # 索引index标签为"a"和"b"的元素,得到Series


# 按属性,索引元素
import pandas as pd
lst=[1,2,3]
s = pd.Series(lst,index=list("abc"))
print(s)
print(s.a) # 索引属性为"a"的元素,得到单个元素

# **下标索引**

import pandas as pd
lst=[1,2,3]
s = pd.Series(lst,index=list("abc"))
print(s)
print(s[-2],type(s[1])) 	# 索引倒数第2个元素,并查看数据类型
print(s[[1,2]]) 			# 索引下标索引为1和2的元素

sr = pd.Series(np.arange(10))
sr1 = sr[3:]
print(sr1)  
# 这是没问题的， 使用整数索引取值是优先以标签解释
# 就是index的值，不是下标

# 3    3
# 4    4
# 5    5
# 6    6
# ...
# 9    9
# dtype: int32

print(sr1[0])  # 报错 KeyError(key)
# 这种方式是取不出的，因为默认优先的是以标签解释的

解决方法：

loc 属性以标签解释
iloc 属性以下标（整数位置）解释

print(sr1.loc[0])   # 报错  KeyError(key)
print(sr1.iloc[0])  # 3

# 重新索引

import pandas as pd
s = pd.Series([1,2],index=["a","b"])
print(s)
s2=s.reindex(["b","d"])
print(s2)
s3=s.reindex(["a","e"],fill_value=0) # fill_value参数,填充缺失值
print(s3)

# 执行结果
a    1
b    2
dtype: int64
b    2.0
d    NaN
dtype: float64
a    1
e    0
dtype: int64

缺失数据

dropna() # 过滤掉值为NaN的行
fillna() # 填充缺失数据
isnull() # 返回布尔数组，缺失值对应为True
notnull() # 返回布尔数组，缺失值对应为False

import numpy as np
import pandas as pd
lst=[1,2,3,None,np.nan] # None 空值, NaN 有问题的值
s = pd.Series(lst,index=list("abcde"))
bool_arr=s.isnull()
print(s[bool_arr])

d   NaN
e   NaN
dtype: float64

添加和修改

import pandas as pd
s=pd.Series([1,2],index=[0,1])
s["a"]="2" #添加
print(s)

import pandas as pd
s1=pd.Series([1,2],index=[0,1])
s2=pd.Series([1,2],index=["index0","index1"])
s=pd.concat([s1, s2], ignore_index=False)   # 将s2追加到s1后面.不会改变s1的值
print(s)

# 执行结果
0    1
1    2
a    2
dtype: object
0         1
1         2
index0    1
index1    2
dtype: int64

删除

import pandas as pd
s = pd.Series([1,2,3,4],index=list("abcd"))
s2=s.drop("a")  #删除index标签为"a"的值
print(s2)
s3=s.drop(["b","c"]) # s的值没有更改
print(s3)

DataFrame

DataFrame是一种表格型数据结构，有多种创建方式，它含有一组有序的列，每列可以是不同的数据类型。DataFrame既有行索引，也有列索引，它可以看作是由Series组成的字典。

创建DataFrame

pandas.DataFrame( data, index, columns, dtype, copy)

参数说明：

data：一组数据(ndarray、series, map, lists, dict 等类型)。
index：索引值，或者可以称为行标签。
columns：列标签，默认为 RangeIndex (0, 1, 2, …, n) 。
dtype：数据类型。
copy：拷贝数据，默认为 False。

根据字典创建

data = {
    'state':['python','python','python','go','java'],
    'year':[2000,2001,2002,2001,2002],
    'pop':[1.5,1.7,3.6,2.4,2.9]
}
frame = pd.DataFrame(data)
print(frame)

#输出
    state  year  pop
0  python  2000  1.5
1  python  2001  1.7
2  python  2002  3.6
3      go  2001  2.4
4    java  2002  2.9

根据List[Dict]创建

import pandas as pd

data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
df = pd.DataFrame.from_dict(data)
print(df)

data = [{'a':1,'b':4},{'a':2,'b':5},{'a':3,'b':4}]
df = pd.DataFrame.from_dict(data)
print(df)

   a  b
0  1  4
1  2  5
2  3  6
   a  b
0  1  4
1  2  5
2  3  4

由列表,元组,数组

import numpy as np
import pandas as pd
lst=[np.array([1,2,3]),
     np.array([4,5,6]),
     np.array([7,8,9])]
df=pd.DataFrame(lst)
print(df) 

#输出
   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9

{(元组)字典} 或 {[列表]字典}

import pandas as pd
dic={"clm0":[1,2,3],
     "clm1":[4,5,6],
     "clm2":[7,8,9]}
df=pd.DataFrame(dic)
print(df) 
#输出
   clm0  clm1  clm2
0     1     4     7
1     2     5     8
2     3     6     9

由Series创建

import pandas as pd
clm=["clm0","clm1","clm2"]
lst=[pd.Series([1,2,3],name="idx0",index=clm),
     pd.Series([4,5,6],name="idx1",index=clm),
     pd.Series([7,8,9],name="idx2",index=clm)]
df=pd.DataFrame(lst)
print(df) 

# 输出
      clm0  clm1  clm2
idx0     1     2     3
idx1     4     5     6
idx2     7     8     9

定义行列

import pandas as pd
dic=lst=[[1,2,3],
        [4,5,6],
        [7,8,9]]

idx=["idx2","idx0","idx2"] # 标签可以重复, 但请尽量避免
clm=["clm1","clm3","clm0"] # 数据为列表时按位置对应列名，这里只是给各列命名，不会引入NaN值
df=pd.DataFrame(dic,index=idx,columns=clm)
print(df) 


      clm1  clm3  clm0
idx2     1     2     3
idx0     4     5     6
idx2     7     8     9

常用操作

DataFrame常用属性如下：

values：数据值
index：行标签
columns：列标签
shape：形状

因为 pandas 是基于 numpy 构建的，所以具有 numpy 的很多特性，DataFrame 的很多用法与 Series、numpy 数组类似

	函数属性	描述
	`.rename(columns=..., index=...)`	重命名列标签或行标签（DataFrame 本身没有 `.name` 属性）
	`.head(n)` `.tail(n)`	头几个数据，尾几个数据，默认5条`n`可选
	`.set_index([],inplace=True)`	设置某列为索引
	`.reset_index()`	重置表索引
索引排序	`.loc[]`	以标签为索引 [行标签,]
	`.iloc[]`	以下标为索引, [行下标]
	`.reindex()`	按新的索引标签重新排列，原来没有的标签填充NaN
	`.sort_values()`	按值排序
	`.sort_index()`	按索引排序
	`.drop()`	删除
计算描述统计	`.describe()`	数值型数据的快速统计汇总
	`.round(2)`	显示数字保留两位小数
	`df.apply(func[,axis,...])`

查看行名与列名

# 查看行名
df1.index

# 查看列名。
df3.columns

# 查看数据值
df3.values

# 查看数据维度
df3.shape

# 查看数据长度
df3.shape[0]

索引和切片

操作	句法	结果
选择列	`df[col]`	Series
用标签选择行	`df.loc[label]`	Series
用整数位置选择行	`df.iloc[loc]`	Series
行切片(连续)	`df[5:10]`	DataFrame
用布尔向量选择行	`df[bool_vec]`	DataFrame

import pandas as pd

df = pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]),
                            orient='index', columns=['one', 'two', 'three'])

print(df)
   one  two  three
A    1    2      3
B    4    5      6


print(df['one'])           #  查看列
print(df.loc[["A","B"]])   #  根据行标签，查看行
df.iloc[0]                 #  根据行索引，查看行

A    1
B    4
Name: one, dtype: int64
   one  two  three
A    1    2      3
B    4    5      6

添加和修改

添加行、添加列

import pandas as pd
lst=[[1,2],
     [3,4]]
idx=["idx0","idx1"] 
clm=["clm0","clm1"] 
df=pd.DataFrame(lst,index=idx,columns=clm)
print(df)
df["add"]="0"
print(df)
df.loc["app0"]="0"
print(df)

      clm0  clm1
idx0     1     2
idx1     3     4
      clm0  clm1 add
idx0     1     2   0
idx1     3     4   0
     clm0 clm1 add
idx0    1    2   0
idx1    3    4   0
app0    0    0   0

import pandas as pd

df = pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]),
                            orient='index', columns=['one', 'two', 'three'])


df['flag'] = df['one'] > 2
print(df)
# 执行结果
   one  two  three
A    1    2      3
B    4    5      6
   one  two  three   flag
A    1    2      3  False
B    4    5      6   True

索引后修改

import pandas as pd
lst=[[1,2],
     [3,4]]
idx=["idx0","idx1"] 
clm=["clm0","clm1"] 
df=pd.DataFrame(lst,index=idx,columns=clm)
print(df)
df.loc["idx1","clm1"]=0
print(df)
# 执行结果
      clm0  clm1
idx0     1     2
idx1     3     4
      clm0  clm1
idx0     1     2
idx1     3     0

删除

import pandas as pd
lst=[[1,2,3],
     [4,5,6],
     [7,8,9]]
idx=["idx0","idx1","idx2"] 
clm=["clm0","clm1","clm2"] 
df=pd.DataFrame(lst,index=idx,columns=clm)
print(df)
print(df.drop("idx0"))
print(df.drop(["clm2"],axis=1))

# 执行结果
      clm0  clm1  clm2
idx0     1     2     3
idx1     4     5     6
idx2     7     8     9
      clm0  clm1  clm2
idx1     4     5     6
idx2     7     8     9
      clm0  clm1
idx0     1     2
idx1     4     5
idx2     7     8

参考资料

https://blog.csdn.net/abc13526222160/article/details/121022952

posted @ 2023-08-12 13:17 贝壳里的星海阅读(59) 评论(0) 编辑收藏举报

刷新页面返回顶部

贝壳里的星海

pandas-基础数据结构