【Python】Pandas 笔记
pandas-data
In [1]:
import numpy as np
import pandas as pd
In [7]:
# Three ways to build a pandas Series:
#   1. from a one-dimensional array
#   2. from a dict
#   3. from a row or column of a DataFrame

# 1. From a 1-D NumPy array.
values = np.array([1, 2, 3, 4, 5])
print(values)
s1 = pd.Series(values)
print(s1, type(s1), sep='\n')

# 2. From a dict: the keys become the index labels.
person = dict(name='Bob', age=32, gender='man')
s2 = pd.Series(person)
print(s2)
1.2 DataFrame 二维数组¶
In [23]:
# Three ways to build a DataFrame:
#   1. from a two-dimensional array
#   2. from a dict
#   3. from another DataFrame

# 1. From a 2-D array: the integers 0..19 laid out as 4 rows x 5 columns.
grid = np.arange(20).reshape(4, 5)
df1 = pd.DataFrame(grid)                          # default column labels
df2 = pd.DataFrame(grid, columns=list('abcde'))   # explicit column labels
print(df1, df2, sep='\n\n')
In [15]:
# 2. From a dict of equal-length lists: each key becomes a column.
column_data = {
    'a': [1, 2, 3, 4],
    'b': [5, 6, 7, 8],
    'c': [9, 10, 11, 12],
}
df3 = pd.DataFrame(column_data)
print(df3)
In [16]:
# From a dict of dicts: outer keys become columns, inner keys become row labels.
nested = {
    'one': {'a': 1, 'b': 2, 'c': 3, 'd': 4},
    'two': {'a': 5, 'b': 6, 'c': 7, 'd': 8},
    'three': {'a': 9, 'b': 10, 'c': 11, 'd': 12},
}
df4 = pd.DataFrame(nested)
print(df4)
In [18]:
# 3. From an existing DataFrame: keep only the selected columns.
df5 = df4.loc[:, ['one', 'three']]
print(df5)
In [24]:
# Indexing a Series
letters = np.array(list('abcdef'))
s1 = pd.Series(letters)

person = dict(name='Bob', age=32, gender='man')
s2 = pd.Series(person)

# A single label returns one value; a list of labels returns a sub-Series.
print(s1[1], s2['age'], s2[['name', 'gender']], sep='\n\n')
In [22]:
# Indexing a DataFrame
nested = {
    'one': {'a': 1, 'b': 2, 'c': 3, 'd': 4},
    'two': {'a': 5, 'b': 6, 'c': 7, 'd': 8},
    'three': {'a': 9, 'b': 10, 'c': 11, 'd': 12},
}
df4 = pd.DataFrame(nested)

single_column = df4['one']             # one label -> Series
two_columns = df4[['one', 'three']]    # list of labels -> DataFrame
print(df4, single_column, two_columns, sep='\n\n')
3. 利用 Pandas 查询数据¶
In [26]:
# Prepare the sample data: one entry per student, 19 students in total.
# The keys are listed in the desired column order, so no `columns=` is needed.
stu_dic = {
    'Name': ['Alfred', 'Alice', 'Barbara', 'Carol', 'Henry', 'James', 'Jane',
             'Janet', 'Jeffrey', 'John', 'Joyce', 'Judy', 'Louise', 'Marry',
             'Philip', 'Robert', 'Ronald', 'Thomas', 'Willam'],
    'Age': [14, 13, 13, 14, 14, 12, 12, 15, 13, 12, 11, 14, 12, 15, 16, 12,
            15, 11, 15],
    'Gender': ['M', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F',
               'F', 'F', 'M', 'M', 'M', 'M', 'M'],
    'Height': [69, 56.5, 65.3, 62.8, 63.5, 57.3, 59.8, 62.5, 62.5, 59, 51.3,
               64.3, 56.3, 66.5, 72, 64.8, 67, 57.5, 66.5],
    'Weight': [112.5, 84, 98, 102.5, 102.5, 83, 84.5, 112.5, 84, 99.5, 50.5,
               90, 77, 112, 150, 128, 133, 85, 112],
}
students = pd.DataFrame(stu_dic)
print(students)
In [27]:
# Show the first five rows, then the last five rows.
first_rows = students.head(5)
last_rows = students.tail(5)
print(first_rows, last_rows, sep='\n\n')
In [31]:
# Select specific rows by index label, then specific columns by name.
row_labels = [0, 4, 5, 8]
column_names = ['Name', 'Height', 'Weight']
print(students.loc[row_labels], students[column_names], sep='\n\n')
In [34]:
# Keep only the girls who are older than twelve.
is_girl = students['Gender'].eq('F')
older_than_12 = students['Age'].gt(12)
print(students[is_girl & older_than_12])
In [35]:
# List name, height and weight of every girl older than twelve.
girls_over_12 = (students['Gender'] == 'F') & (students['Age'] > 12)
print(students.loc[girls_over_12, ['Name', 'Height', 'Weight']])
多条件查询时,&(且)或 |(或)两端的每个条件都必须用括号括起来,因为 & 和 | 的运算优先级高于 ==、> 等比较运算符。¶
4. 利用 Pandas 进行数据统计分析¶
In [36]:
# Prepare the sample data (the same 19 student records used in the query section).
stu_dic = {
    'Age': [14, 13, 13, 14, 14, 12, 12, 15, 13, 12, 11, 14, 12, 15, 16, 12,
            15, 11, 15],
    'Height': [69, 56.5, 65.3, 62.8, 63.5, 57.3, 59.8, 62.5, 62.5, 59, 51.3,
               64.3, 56.3, 66.5, 72, 64.8, 67, 57.5, 66.5],
    'Name': ['Alfred', 'Alice', 'Barbara', 'Carol', 'Henry', 'James', 'Jane',
             'Janet', 'Jeffrey', 'John', 'Joyce', 'Judy', 'Louise', 'Marry',
             'Philip', 'Robert', 'Ronald', 'Thomas', 'Willam'],
    'Gender': ['M', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F',
               'F', 'F', 'M', 'M', 'M', 'M', 'M'],
    'Weight': [112.5, 84, 98, 102.5, 102.5, 83, 84.5, 112.5, 84, 99.5, 50.5,
               90, 77, 112, 150, 128, 133, 85, 112],
}
# `columns=` fixes the column order regardless of the dict's key order.
column_order = ['Name', 'Age', 'Gender', 'Height', 'Weight']
students = pd.DataFrame(stu_dic, columns=column_order)
print(students)
4.1 数据量、求和、均值、最值¶
In [42]:
# Row count, sum, mean, extremes and the index labels of the extremes.
# Each entry is (title, frame to summarise, name of the statistic method).
numeric = students[['Age', 'Height', 'Weight']]
summaries = (
    ("数据量: ", students, 'count'),
    ("求和: ", students, 'sum'),  # summing a string column concatenates its values
    ("均值: ", numeric, 'mean'),
    ("最大值:", numeric, 'max'),
    ("最小值:", numeric, 'min'),
    ("最大值的索引:", numeric, 'idxmax'),
    ("最小值的索引:", numeric, 'idxmin'),
)
for title, frame, stat_name in summaries:
    print(title)
    print(getattr(frame, stat_name)(), '\n')
4.2 中位数、 众数、 方差、 分位数¶
In [43]:
# Median, mode, variance and the 10% quantile of the numeric columns.
numeric = students[['Age', 'Height', 'Weight']]
print("中位数: ", numeric.median(), sep='\n', end=' \n\n')
print("众数: ", numeric.mode(), sep='\n', end=' \n\n')
print("方差: ", numeric.var(), sep='\n', end=' \n\n')
print("10% 分位数", numeric.quantile(0.1), sep='\n', end=' \n\n')
4.3 标准差、平均绝对偏差、偏度、峰度¶
In [44]:
# Standard deviation, mean absolute deviation, skewness and kurtosis of the
# numeric columns.
numeric = students[['Age', 'Height', 'Weight']]
print("标准差: ")
print(numeric.std(), '\n')
print("平均绝对偏差: ")
# DataFrame.mad() was removed in pandas 2.0, so the mean absolute deviation
# around the column mean is computed explicitly.
print((numeric - numeric.mean()).abs().mean(), '\n')
print("偏度: ")
print(numeric.skew(), '\n')
print("峰度: ")
print(numeric.kurt(), '\n')
In [45]:
# Overview: summary statistics of the numeric columns in one table.
overview = students.describe()
print(overview)
4.4 自定义数值统计 apply()¶
In [47]:
def state(x):
    """Summarise one column of values with five statistics.

    Parameters
    ----------
    x : pandas.Series
        The values to summarise. When this function is passed to
        ``DataFrame.apply`` it is called once per column.

    Returns
    -------
    pandas.Series
        The standard deviation, mean, non-null count, maximum and minimum
        of ``x``, labelled 'std', 'mean', 'count', 'max' and 'min'.
    """
    return pd.Series([x.std(), x.mean(), x.count(), x.max(), x.min()],
                     index=['std', 'mean', 'count', 'max', 'min'])
# Run the custom summary on each numeric column; the resulting table is the cell output.
students.loc[:, ['Age', 'Height', 'Weight']].apply(state)
Out[47]:
5. 使用 Pandas 进行类 SQL 操作¶
5.1 新增¶
In [50]:
# Add rows or columns
new_rows = [
    dict(Name='Alice', Age=15, Gender='F', Height=130, Weight=88),
    dict(Name='James', Age=13, Gender='M', Height=100, Weight=120),
]
df = pd.DataFrame(new_rows)
print(df, end=' \n\n')
# concat keeps each frame's own index labels, so labels 0 and 1 appear twice.
appended = pd.concat([students, df])
print(appended)
In [51]:
# Rebuild the index: ignore_index=True renumbers the combined rows from 0.
renumbered = pd.concat([students, df], ignore_index=True)
print(renumbered)
In [53]:
# Add a column: 'score' is not a column of `students`, so it is created empty (NaN).
columns_with_score = ['Name', 'Age', 'Gender', 'Height', 'Weight', 'score']
with_score = pd.DataFrame(students, columns=columns_with_score)
print(with_score)
5.2 删除¶
In [56]:
# Delete rows by index label (returns a new frame; `students` is unchanged).
rows_to_drop = [2, 3, 5]
print(students.drop(index=rows_to_drop))
In [57]:
# Delete a column (returns a new frame; `students` is unchanged).
without_age = students.drop(['Age'], axis=1)
print(without_age)
In [58]:
# Delete the girls younger than 14, i.e. keep everyone aged 14 or older
# plus the boys younger than 14.
is_14_or_older = students['Age'].ge(14)
is_younger_boy = students['Age'].lt(14) & students['Gender'].eq('M')
print(students[is_14_or_older | is_younger_boy])
5.3 修改¶
In [65]:
# Set James's height to 1000.
# NOTE: this modifies `students` in place, so every later cell sees the new value.
is_james = students['Name'].eq('James')
students.loc[is_james, 'Height'] = 1000
print(students[is_james])
In [68]:
# Number of non-null values in each column, per gender.
by_gender = students.groupby('Gender')
print(by_gender.count())
In [69]:
# Mean of the remaining numeric columns per gender, with 'Age' dropped first.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# column 'Name' raises TypeError instead of being silently skipped.
print(students.drop('Age', axis=1).groupby('Gender').mean(numeric_only=True))
In [71]:
# Mean of the numeric columns for every (gender, age) combination.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# column 'Name' raises TypeError instead of being silently skipped.
print(students.groupby(['Gender', 'Age']).mean(numeric_only=True))
5.6 排序 sort_index 和 sort_values¶
In [72]:
# Sort by age first, then by height within the same age (ascending).
sort_keys = ['Age', 'Height']
print(students.sort_values(by=sort_keys))
In [73]:
# Same sort keys, descending order.
sort_keys = ['Age', 'Height']
oldest_first = students.sort_values(by=sort_keys, ascending=False)
print(oldest_first)
5.7 多表连接¶
In [74]:
# Scores exist for only ten of the students.
score_columns = {
    'Name': ['Alfred', 'Alice', 'Barbara', 'Carol', 'Henry', 'Jeffrey',
             'Judy', 'Philip', 'Robert', 'Willam'],
    'Score': [88, 76, 89, 67, 79, 90, 92, 86, 73, 77],
}
score = pd.DataFrame(score_columns)
# Inner join (the default): only names present in both tables are kept.
students2 = students.merge(score, on='Name')
print(students2)
In [79]:
# `how` selects the join type; a left join keeps every row of `students`.
students3 = students.merge(score, how='left', on='Name')
print(students3)
In [78]:
# A right join keeps every row of `score`.
students4 = students.merge(score, how='right', on='Name')
print(students4)
6. Pandas 缺失值处理¶
6.1 删除法 dropna()¶
In [80]:
# Deletion approach: start from the 'Score' column of the left-joined table.
stu_score = students3.loc[:, 'Score']
print(stu_score)
In [81]:
# Count the missing scores.
missing_count = stu_score.isna().sum()
print('缺失值记录数:', missing_count)
# Drop the entries whose value is missing.
scores_present = stu_score.dropna()
print(scores_present)
In [85]:
# Drop every column that contains a NaN value.
print(students3.dropna(axis='columns'))
In [86]:
# Drop every row that contains a NaN value.
print(students3.dropna(axis='index'))
6.2 补值法 fillna()¶
In [87]:
# Replace every NaN with 0.
zero_filled = students3.fillna(value=0)
print(zero_filled)
In [88]:
# Forward fill: replace each NaN with the last valid value above it.
# .ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
print(students3.ffill())
In [89]:
# Backward fill: replace each NaN with the next valid value below it.
# .bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
print(students3.bfill())
7. Pandas 数据透视¶
欢迎访问我的个人博客站点:
https://yeyeck.com