Pandas数据分析
Pandas数据分析
1.基本统计函数
函数 | 说明 |
---|---|
sum() | 求和 |
count() | 统计个数 |
max() | 求最大值 |
min() | 求最小值 |
median() | 求中位数 |
mean() | 求平均值 |
mode() | 求众数 |
var() | 求方差 |
std() | 求标准差 |
quantile() | 求分位数 |
(1)sum()
df.sum(axis=0或1,numeric_only=布尔值)
numeric_only的默认值为False,表示对所有的行或列进行求和
为True时则只对数值类型的数据进行求和
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# axis=0 adds up each column; numeric_only=False keeps the text column in
# the calculation as well.
res = df.sum(axis=0, numeric_only=False)
print(res)
(2)count()
df.count(axis=0或1)
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# With axis=0, count() reports the number of non-missing cells per column.
res = df.count(axis=0)
print(res)
(3)max() & min()
df.max(axis=0或1,numeric_only=布尔值)
df.min(axis=0或1,numeric_only=布尔值)
numeric_only的默认值为False,表示对所有的行或列进行求最值
为True时则只对数值类型的数据进行求最值
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# numeric_only=True restricts the column-wise maximum to the numeric columns,
# so the fruit-name column is left out of the result.
res = df.max(axis=0, numeric_only=True)
print(res)
(4)median()
df.median(axis=0或1,numeric_only=布尔值)
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# Column-wise median of the numeric columns only (a median of fruit names
# would be meaningless).
res = df.median(axis=0, numeric_only=True)
print(res)
(5)mode()
df.mode(axis=0或1, numeric_only=布尔值)
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# mode() returns a DataFrame rather than a single row: a column may have
# several most-frequent values, and shorter columns are padded with NaN.
res = df.mode(axis=0, numeric_only=False)
print(res)
(6)quantile()
df.quantile(axis=0或1,q=值,numeric_only=布尔值)
q是一个0到1之间的浮点数,表示取多少分位数,例如q=0.5表示中位数;数据中含有非数值列时建议设置numeric_only=True
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# q=0.5 asks for the 50% quantile of every numeric column, i.e. the median.
res = df.quantile(axis=0, numeric_only=True, q=0.5)
print(res)
2.其他统计函数
函数 | 说明 |
---|---|
unique() | 统计取值的种类 |
value_counts() | 统计取值个数 |
pct_change() | 求变化百分比 |
idxmax() | 求最大值的行名 |
idxmin() | 求最小值的行名 |
(1)unique()
df[列名].unique()
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# unique() works on a single column (a Series) and yields its distinct
# values in order of first appearance.
price_column = df["平均价格(元/斤)"]
res = price_column.unique()
print(res)
返回的结果是个NumPy数组(ndarray),里面是这一列所有不重复的取值,按首次出现的顺序排列
(2)value_counts()
df[列名].value_counts()
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# value_counts() maps each distinct price to how many times it occurs,
# most frequent first.
price_column = df["平均价格(元/斤)"]
res = price_column.value_counts()
print(res)
返回结果为Series,Series的index是值的种类,value是值对应的个数
(3)pct_change()
df.pct_change(axis=0或1)
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
full_table = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# Keep only the three numeric columns (positions 1-3); a percentage change
# cannot be computed for the fruit names.
df = full_table.iloc[:, [1, 2, 3]]

# With axis=0 every value is compared with the one in the previous row, so
# the first row has nothing to compare against and comes out as NaN.
res = df.pct_change(axis=0)
print(res)
这个函数将每一个元素和前面的值进行比较,计算变化百分比
(4)idxmax() & idxmin()
df[列名].idxmax()
df[列名].idxmin()
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# idxmax() gives the row label (not the value) where the column peaks.
sales_column = df["月销量(斤)"]
res = sales_column.idxmax()
print(res)
3.整体情况
(1)describe()
describe()函数会一次性获得一个数据集的整体统计情况,包括个数,平均值,标准差,最小值,分位数和最大值
df.describe()
import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
names = ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"]
prices = [5, 3, 6, 2, 10, 8, 7, 4, 4, 5]
sales = [500, 350, 400, 600, 200, 300, 250, 180, 220, 150]
vitamin_c = [4, 8, 30, 6, 60, 5, 28, 18, 4, 7]
df = pd.DataFrame(
    {
        "水果名称": names,
        "平均价格(元/斤)": prices,
        "月销量(斤)": sales,
        "维生素 C 含量(mg/100g)": vitamin_c,
    }
)

# describe() summarises the numeric columns in one table: count, mean, std,
# min, the 25%/50%/75% quantiles and max.
res = df.describe()
print(res)
(2)info()
info()函数会输出一个数据集的概要信息,主要为类型,列名,非空值个数等;该函数直接打印结果,返回值为None
df.info()
import io

import pandas as pd

# Sample data set: ten fruits with price, monthly sales and vitamin C content.
data = {
    "水果名称": ["苹果", "香蕉", "橙子", "西瓜", "草莓", "葡萄", "芒果", "菠萝", "梨", "桃子"],
    "平均价格(元/斤)": [5, 3, 6, 2, 10, 8, 7, 4, 4, 5],
    "月销量(斤)": [500, 350, 400, 600, 200, 300, 250, 180, 220, 150],
    "维生素 C 含量(mg/100g)": [4, 8, 30, 6, 60, 5, 28, 18, 4, 7],
}
df = pd.DataFrame(data)

# info() writes its report to a stream and returns None, so printing its
# return value would only show "None". Route the report into a text buffer
# to obtain it as a string instead. (Calling df.info() on its own line
# prints the same report straight to the console.)
report = io.StringIO()
df.info(buf=report)
res = report.getvalue()
print(res)
4.聚合函数
聚合函数可以一次对数据使用多个统计函数进行统计
df.agg(函数名列表),例如df.agg(["max", "min"])
5.数据分类
(1)创建分组
df.groupby(列名或列表)
import pandas as pd

# Sample data set: five students with class, gender and age.
data = {
    "学生姓名": ["张三", "李四", "王五", "赵六", "孙七"],
    "班级": ["一班", "二班", "三班", "一班", "二班"],
    "性别": ["男", "女", "男", "女", "男"],
    "年龄": [18, 19, 20, 17, 18],
}
df = pd.DataFrame(data)

# Split the rows into one group per distinct value of the class column.
group = df.groupby("班级")

# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs; the
# loop body must be indented, otherwise Python raises IndentationError.
for class_name, members in group:
    print(class_name)
    print(members)
(2)统计分析
可以使用一些统计函数进行分析
import pandas as pd

# Sample data set: five students with class, gender and age.
student_names = ["张三", "李四", "王五", "赵六", "孙七"]
classes = ["一班", "二班", "三班", "一班", "二班"]
genders = ["男", "女", "男", "女", "男"]
ages = [18, 19, 20, 17, 18]
df = pd.DataFrame(
    {
        "学生姓名": student_names,
        "班级": classes,
        "性别": genders,
        "年龄": ages,
    }
)

# Group the rows by class, then count the non-missing entries of every other
# column within each class.
group = df.groupby("班级")
res = group.count()
print(res)