数据挖掘基础

目录

一 、Matplotlib

Matplotlib 三层结构

  • 1.容器层 : 画板层 画布层plt.figure()

    绘图区/坐标系axes: plt.subplots() x、y轴张成的区域

    • 2.辅助显示层 横纵坐标,网格等 辅助显示
    • 3.图像层 指的是不同的图,如折线图,柱状图等
      注:2 , 3 置于 1 之上

1.常见图表的使用

  • 折线图 (plot)

    • 适用于连续数据 变量之间随时间变化趋势
  • 散点图 (scatter)

    • 适用于离散数据 变量之间是否存在数量关联趋势 关系/规律
  • 柱状图 (bar)

    • 适用于离散数据 统计和比较数据之间的差别 统计/对比
  • 直方图 (histogram)

    • 适用于连续数据 展示一组或者多组数据的分布 分布状况
  • 饼图(pie)

    • 适用于分类数据 展示不同分类的占比情况 占比
import matplotlib.pyplot as plt
import random
# # 显示中文绘图
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False

2.画法一

# 需求:画出城市11点到12点1小时内每分钟的温度变化折线图,温度范围在15度-18度
import matplotlib.pyplot as plt
import random


# 1.准备数据 x  y
x = range(60)
y_shanghai =  [random.uniform(15,18) for i in x]
y_beijing = [random.uniform(1,3) for i in x]


# 2.创建画布
plt.figure(figsize=(15,5),dpi=80)


# 3.绘制图像
# 颜色字符 r g b w c m y k  风格字符 -  --  -.   :  ''
plt.plot(x,y_shanghai,color="r",linestyle = "--",label="上海")
plt.plot(x,y_beijing,color="b",linestyle="-",label="北京")

#显示图例 ---》label ="上海" ,label = "北京"
#位置 0  1  2  3   4   5   6   7   8   9 
#位置 upper   low   center   +    left   right   /  best  
# plt.legend(loc = 0)
plt.legend(loc="upper right")

#修改x,y刻度值
#准备x的刻度说明
x_lable = ["11点{}分".format(i) for i in x]
plt.xticks(x[::5],x_lable[::5])
plt.yticks(range(0,41,5))

#添加网格显示
plt.grid(linestyle="--",alpha=0.5)

#添加描述信息
plt.xlabel("时间变化")
plt.ylabel("温度变化")
plt.title("上海、北京11点到12点的温度变化状况")


# 4.显示图像
plt.show()

png

3.画法二

# Requirement: show Beijing and Shanghai on the same canvas, each in its own plot.

# 1. Prepare the data: x holds 60 positions; each city gets one random
#    temperature per position.
x = range(60)
y_shanghai = [random.uniform(15, 18) for _ in x]
y_beijing = [random.uniform(1, 3) for _ in x]

# 2. Create the canvas: one row, two columns of axes.
#    plt.subplots() creates its own figure, so there is no separate
#    plt.figure() call here; that extra call only left behind an empty
#    "<Figure size 500x250 with 0 Axes>" in the output.
figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), dpi=80)

# 3. Draw one line per city, each on its own axes.
axes[0].plot(x, y_shanghai, color="r", linestyle="--", label="上海")
axes[1].plot(x, y_beijing, color="b", linestyle="-", label="北京")

# Show the legends (their text comes from the label= arguments above).
# loc accepts either an integer location code or a string such as
# "upper right"; code 0 is "best".
axes[0].legend(loc="upper right")
axes[1].legend(loc=0)

# Add a grid to each plot.
axes[0].grid(linestyle="--", alpha=0.3)
axes[1].grid(linestyle="-", alpha=0.4)

# Ticks: label every 5th x position, and use the same y range on both plots.
x_lable = ["12:{}".format(i) for i in x]
for ax in axes:
    # Positions and labels are set separately so this also works on
    # Matplotlib versions whose set_xticks() has no labels parameter.
    ax.set_xticks(x[::5])
    ax.set_xticklabels(x_lable[::5])
    ax.set_yticks(range(0, 41, 5))

# Axis descriptions and titles.
axes[0].set_xlabel("时间变化")
axes[0].set_ylabel("温度变化")
axes[0].set_title("上海12点到13点的温度变化状况")
axes[1].set_xlabel("时间变化")
axes[1].set_ylabel("温度变化")
axes[1].set_title("北京12点到13点的温度变化状况")

# 4. Display the figure.
plt.show()

png

import numpy as np

4.绘制数学函数图像

# 1.准备 x,y数据
x = np.linspace(-1,1,1000)
y = 2 * x * x

# 2.创建画布
plt.figure(figsize=(20, 8), dpi=80)

# 3、绘制图像
plt.plot(x,y,color = "g",linestyle="-",label="抛物线")
plt.legend(loc=0)
# 添加网格信息
plt.grid(linestyle="--",alpha=0.5)
# 添加刻度
x_label = ["x={}".format(i) for i in x]
plt.xticks(x[::100],x_label[::100])
plt.yticks(y[::100])
# 添加描述信息
plt.xlabel("x自变量")
plt.ylabel("y变量")
plt.title("抛物线变化情况")

# 4显示图像
plt.show()

png

5.散点图绘制

# 需求:探究房屋面积和房屋价格的关系

# 1、准备数据
x = [225.98, 247.07, 253.14, 457.85, 241.58, 301.01,  20.67, 288.64,
       163.56, 120.06, 207.83, 342.75, 147.9 ,  53.06, 224.72,  29.51,
        21.61, 483.21, 245.25, 399.25, 343.35]

y = [196.63, 203.88, 210.75, 372.74, 202.41, 247.61,  24.9 , 239.34,
       140.32, 104.15, 176.84, 288.23, 128.79,  49.64, 191.74,  33.1 ,
        30.74, 400.02, 205.35, 330.64, 283.45]
# 2、创建画布
plt.figure(figsize=(10,5),dpi=50)

# 3、绘制图像
plt.scatter(x,y,color="r")

# 4、显示图像
plt.show()

png

5.综合案例

需求1-对比每部电影的票房收入----单柱状图

# 1、准备数据
movie_names = ['雷神3:诸神黄昏','正义联盟','东方快车谋杀案','寻梦环游记','全球风暴', '降魔传','追捕','七十七天','密战','狂兽','其它']
tickets = [73853,57767,22354,15969,14839,8725,8716,8318,7916,6764,52222]

# 2、创建画布
plt.figure(figsize=(20,8),dpi=80)

# 3、绘制柱状图
x_ticks = range(len(movie_names))
plt.bar(x_ticks,tickets,color=['b','r','g','y','c','m','y','k','c','g','b'])

#修改刻度
plt.xticks(x_ticks,movie_names)

# 添加标题
plt.title("电影票房收入对比")

# 添加网格
plt.grid(linestyle="--",alpha=0.8)
# 4、显示图像
plt.show()

png

需求2-如何对比电影票房收入才更加有说服力?---双柱状图

# 1、准备数据
movie_name = ['雷神3:诸神黄昏','正义联盟','寻梦环游记']

first_day = [10587.6,10062.5,1275.7]
first_weekend=[36224.9,34479.6,11830]

# 2、创建画布
plt.figure(figsize=(10,5),dpi=80)

# 3、绘制柱状图
plt.bar(range(3),first_day,color="r",label="首日票房",width=0.2)
plt.bar([0.2,1.2,2.2],first_weekend,width=0.2,label="首周票房")

# 显示图例
plt.legend()

# 修改刻度
plt.xticks([0.1,1.1,2.1],movie_name)

plt.show()

png

6.直方图绘制

# 需求:电影时长分布状况
# 1、准备数据
time = [131,  98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115,  99, 136, 126, 134,  95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117,  86,  95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123,  86, 101,  99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140,  83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144,  83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137,  92,121, 112, 146,  97, 137, 105,  98, 117, 112,  81,  97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112,  83,  94, 146, 133, 101,131, 116, 111,  84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]

# 2、创建画布
plt.figure(figsize=(20, 8), dpi=80)

# 3、绘制直方图
distance = 2
group_num = int((max(time) - min(time)) / distance)

plt.hist(time, bins=group_num, density=True)

# 修改x轴刻度
plt.xticks(range(min(time), max(time) + 2, distance))

# 添加网格
plt.grid(linestyle="--", alpha=0.5)

# 4、显示图像
plt.show()

png

# 1、准备数据
movie_name = ['雷神3:诸神黄昏','正义联盟','东方快车谋杀案','寻梦环游记','全球风暴','降魔传','追捕','七十七天','密战','狂兽','其它']

place_count = [60605,54546,45819,28243,13270,9945,7679,6799,6101,4621,20105]

# 2、创建画布
plt.figure(figsize=(20, 8), dpi=80)

# 3、绘制饼图
plt.pie(place_count, labels=movie_name, colors=['b','r','g','y','c','m','y','k','c','g','y'], autopct="%1.3f%%")

# 显示图例
plt.legend()

plt.axis('equal')

# 4、显示图像
plt.show()

png

二.Numpy

  • 科学计算库
  • ndarray ---- n维数组
import numpy as np
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])

score 
----result----------
array([[80, 89, 86, 67, 79],
       [78, 97, 89, 67, 81],
       [90, 94, 78, 67, 74],
       [91, 91, 90, 67, 69],
       [76, 87, 75, 67, 86],
       [70, 79, 84, 67, 84],
       [94, 92, 93, 67, 64],
       [86, 85, 83, 67, 80]])
type(score)
----result----------
numpy.ndarray

1.ndarray与Python原生list运算效率对比

  • 具有相同数据类型的多维数组
  • 底层是c封装,运行速度快
  • 存储是紧凑型,占有空间少
import random
import time
# 生成一个大数组
python_list = []
for i in range(100000000):
    python_list.append(random.random())
python_list = np.array(python_list)
python_list
----result----------
array([0.23401045, 0.59441637, 0.58932156, ..., 0.94788828, 0.27280735,
       0.44378139])
# 使用 Python 内置 sum 求和(注意:python_list 在上面已被 np.array 转换为 ndarray,这里并不是对原生 list 求和)
t1 = time.time()
a = sum(python_list)
t2 = time.time()
d1 = t2 -t1

# ndarray 求和
t3 = time.time()
b = np.sum(python_list)
t4 = time.time()
d2 = t4 -t3 
d1
----result----------
6.524392366409302
d2
----result----------
0.11269235610961914

2. ndarray属性

# shape(数组维度的元组)   
# ndim(数组维数)   
# size(数组中的元素数量)
# dtype(数组元素的类型)
# itemsize(一个数组元素的长度(字节))
score
----result----------
array([[80, 89, 86, 67, 79],
       [78, 97, 89, 67, 81],
       [90, 94, 78, 67, 74],
       [91, 91, 90, 67, 69],
       [76, 87, 75, 67, 86],
       [70, 79, 84, 67, 84],
       [94, 92, 93, 67, 64],
       [86, 85, 83, 67, 80]])
# shape(数组维度的元组)   
score.shape    # 8行5列
----result----------
(8, 5)
# ndim(数组维数)  
score.ndim   # 二维数组
----result----------
2
# size(数组中的元素数量)
score.size    # 数组中有40个元素
----result----------
40
# dtype(数组元素的类型)
score.dtype    # 数据类型是 int32
----result----------
dtype('int32')
# itemsize(一个数组元素的长度(字节))
score.itemsize
----result----------
4

2.1 ndarray的形状

a = np.array([[1,2,3],[4,5,6]])
b = np.array([1,2,3,4])
c = np.array([[[1,2,3],[4,5,6]],[[1,2,3],[4,5,6]]])
# 二维数组
a
----result----------
array([[1, 2, 3],
       [4, 5, 6]])
a.shape
----result----------
(2, 3)
# 一维数组
b  
----result----------
array([1, 2, 3, 4])
b.shape
----result----------
(4,)
# 三维数组
c  
----result----------
array([[[1, 2, 3],
        [4, 5, 6]],

[[1, 2, 3],
        [4, 5, 6]]])
c.shape
----result----------
(2, 2, 3)

2.2 ndarray的类型

data = np.array([1.1,2.2,3.3])
data.dtype
----result----------
dtype('float64')
# 创建数组的时候指定类型
data1 = np.array([1.1,2.2,3.3],dtype=np.float32)
data1.dtype
----result----------
dtype('float32')

3.ndarray的基本操作

3.1生成0和1的数组

# 生成0和1的数组
np.zeros(shape=(3,4),dtype="float32")
----result----------
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]], dtype=float32)
np.ones(shape=(4,3),dtype="int32")
----result----------
array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

3.2从现有数组中生成

score
----result----------
array([[80, 89, 86, 67, 79],
       [78, 97, 89, 67, 81],
       [90, 94, 78, 67, 74],
       [91, 91, 90, 67, 69],
       [76, 87, 75, 67, 86],
       [70, 79, 84, 67, 84],
       [94, 92, 93, 67, 64],
       [86, 85, 83, 67, 80]])
# 深拷贝(常用)   直接开辟一块新空间,元素的数值改变不受影响
data_deepcopy = np.array(score,dtype="float64")  
data_deepcopy
----result----------
array([[80., 89., 86., 67., 79.],
       [78., 97., 89., 67., 81.],
       [90., 94., 78., 67., 74.],
       [91., 91., 90., 67., 69.],
       [76., 87., 75., 67., 86.],
       [70., 79., 84., 67., 84.],
       [94., 92., 93., 67., 64.],
       [86., 85., 83., 67., 80.]])
# 浅拷贝  只是复制索引,当原数据改变的时候,索引的相应位置也会改变
data_copy = np.asarray(score)
data_copy
----result----------
array([[80, 89, 86, 67, 79],
       [78, 97, 89, 67, 81],
       [90, 94, 78, 67, 74],
       [91, 91, 90, 67, 69],
       [76, 87, 75, 67, 86],
       [70, 79, 84, 67, 84],
       [94, 92, 93, 67, 64],
       [86, 85, 83, 67, 80]])
score[3,1]  = 10000
score
----result----------
array([[   80,    89,    86,    67,    79],
       [   78,    97,    89,    67,    81],
       [   90,    94,    78,    67,    74],
       [   91, 10000,    90,    67,    69],
       [   76,    87,    75,    67,    86],
       [   70,    79,    84,    67,    84],
       [   94,    92,    93,    67,    64],
       [   86,    85,    83,    67,    80]])
data_deepcopy
----result----------
array([[80., 89., 86., 67., 79.],
       [78., 97., 89., 67., 81.],
       [90., 94., 78., 67., 74.],
       [91., 91., 90., 67., 69.],
       [76., 87., 75., 67., 86.],
       [70., 79., 84., 67., 84.],
       [94., 92., 93., 67., 64.],
       [86., 85., 83., 67., 80.]])
data_copy
----result----------
array([[   80,    89,    86,    67,    79],
       [   78,    97,    89,    67,    81],
       [   90,    94,    78,    67,    74],
       [   91, 10000,    90,    67,    69],
       [   76,    87,    75,    67,    86],
       [   70,    79,    84,    67,    84],
       [   94,    92,    93,    67,    64],
       [   86,    85,    83,    67,    80]])

3.3生成固定范围的数组

np.linspace(0,10,4)   # 范围[0,10]   4个等距离数
----result----------
array([ 0.        ,  3.33333333,  6.66666667, 10.        ])
np.arange(0,10,4)      #范围[0,10)   步长为4
----result----------
array([0, 4, 8])

3.4生成随机数组

import matplotlib.pyplot as plt
# 均匀分布  最低为 -1  最高为 1   数量为1000000
data1 =  np.random.uniform(-1,1,1000000)
# 1、创建画布
plt.figure(figsize=(20, 8), dpi=80)

# 2、绘制直方图
plt.hist(data1, 1000)

# 3、显示图像
plt.show()

png

# 正态分布  loc -> 均值 0  scale -> 标准差  1 size 数据个数  1000000
data2 = np.random.normal(0,1,1000000)
# 1、创建画布
plt.figure(figsize=(20, 8), dpi=80)

# 2、绘制直方图
plt.hist(data2, 1000)

# 3、显示图像
plt.show()

png

案例:随机生成8只股票两周的交易日涨幅数据

# Generate the data: draw from a normal distribution (mean 0, std 1)
# into an array of 8 rows (stocks) x 10 columns (trading days).
stock_change = np.random.normal(0, 1, (8, 10))
# Echo the array. The name must be spelled `stock_change`; the misspelling
# `stock_chang` raises NameError.
stock_change
----result----------
array([[ 0.78322177,  0.507861  ,  1.85003005,  1.19323072,  0.81847297,
        -0.63094271, -0.57060276,  0.30855245,  1.94698162, -0.88928573],
       [ 0.06893518, -0.40478084, -1.65714139, -0.81447523,  0.64866496,
         0.39132671,  0.04355432,  0.0446165 , -0.51806464,  0.43098731],
       [ 0.16193423, -0.59461836,  0.09053752, -0.45878007, -0.09133374,
        -0.10657109, -0.41976839, -0.76897636, -0.89682933,  0.44727093],
       [-0.21787426, -0.70705768,  0.14948141,  1.88738647, -2.41193373,
        -0.86842017,  1.51809276,  0.39750412,  1.13165355, -0.18717137],
       [-1.9212302 , -0.85526221, -0.68535196, -1.8369455 ,  2.30827726,
         0.58769356, -1.29491169,  0.70073049,  1.36375486,  0.31760057],
       [ 0.88214854,  0.81825296,  1.70235699, -0.85743748,  1.67908554,
         0.02161739,  1.49909361, -1.43709353, -0.0857922 ,  1.09128374],
       [-1.48763567, -0.71125736,  0.20389856, -0.10618373, -0.09510078,
        -1.16947122, -1.09477064,  1.45625879, -0.93677276,  1.41664427],
       [-1.36546057,  0.27390585, -2.00117963,  0.48013552, -1.37752954,
         1.1236394 , -0.82104285,  0.63678403,  0.23191035,  0.44840676]])

4.数组的索引、切片

# 获取第一个股票的前三个交易日的涨跌幅数据
# TODO  对二维数据如何切片? 第一个参数是第几行,第二参数是第几列
# [  :  ,  :  ]    0 代表二维数组第一个数组   :3 这个时候是一维数组 中前三个  

stock_change [0,:3] 
----result----------
array([0.78322177, 0.507861  , 1.85003005])
stock_change [0,:3] 
----result----------
array([0.78322177, 0.507861  , 1.85003005])
a1 = np.array([[[1,2,3],[4,5,6]],[[12,3,34],[5,6,7]]])
# 使用索引定位到34
a1[1,0,2]
----result----------
34
# TODO 需求:让刚才的股票行,日期列反过来,变成日期行,股票列

5.数据形状的修改

5.1 reshape

# reshape 并没有把行列进行转换,只是将数组的形状进行重新划分 ,
stock_change.shape
----result----------
(8, 10)
stock_change.reshape((10,8)).shape
----result----------
(10, 8)

5.2 resize

# resize 没有返回值,只是对原始数据进行修改
stock_change.resize((10,8))
stock_change.shape
----result----------
(10, 8)

5.3 T 转置

# 使用转置,将原本数据的行列  进行列行的转换
stock_change.T
----result----------
array([[ 0.78322177,  1.94698162,  0.04355432, -0.09133374,  0.14948141,
        -1.9212302 ,  1.36375486,  1.49909361, -0.09510078, -2.00117963],
       [ 0.507861  , -0.88928573,  0.0446165 , -0.10657109,  1.88738647,
        -0.85526221,  0.31760057, -1.43709353, -1.16947122,  0.48013552],
       [ 1.85003005,  0.06893518, -0.51806464, -0.41976839, -2.41193373,
        -0.68535196,  0.88214854, -0.0857922 , -1.09477064, -1.37752954],
       [ 1.19323072, -0.40478084,  0.43098731, -0.76897636, -0.86842017,
        -1.8369455 ,  0.81825296,  1.09128374,  1.45625879,  1.1236394 ],
       [ 0.81847297, -1.65714139,  0.16193423, -0.89682933,  1.51809276,
         2.30827726,  1.70235699, -1.48763567, -0.93677276, -0.82104285],
       [-0.63094271, -0.81447523, -0.59461836,  0.44727093,  0.39750412,
         0.58769356, -0.85743748, -0.71125736,  1.41664427,  0.63678403],
       [-0.57060276,  0.64866496,  0.09053752, -0.21787426,  1.13165355,
        -1.29491169,  1.67908554,  0.20389856, -1.36546057,  0.23191035],
       [ 0.30855245,  0.39132671, -0.45878007, -0.70705768, -0.18717137,
         0.70073049,  0.02161739, -0.10618373,  0.27390585,  0.44840676]])

6.类型修改 astype 、tostring

6.1 astype 转换数据类型

stock_change.astype("int32").dtype 
----result----------
dtype('int32')

6.2 tostring(已弃用,新版 NumPy 请使用 tobytes) 序列化为字节串

# ndarray.tostring() is deprecated (and removed in NumPy 2.0);
# tobytes() returns the same raw bytes without the DeprecationWarning.
stock_change.tobytes()

b'\'hJ\x17\'\x10\xe9?\xce9\xde\xb3e@\xe0?!\xf6i\x1b\xb9\x99\xfd?\x97\xa6\xb5\x18y\x17\xf3?\x97b7;\xee0\xea?{5\x9b\xc1\xae0\xe4\xbfqx\xf3\xb9`B\xe2\xbf\xdb\x0f\x11\xc8R\xbf\xd3??\x96\x184\xd6&\xff?\xb5\xdc\x96V\x07u\xec\xbf3\x01X{\xbc\xa5\xb1?   ......................

7.数组的去重 set 、 unique

7.1 set

set([1,2,3,4,3,2])
----result----------
{1, 2, 3, 4}
temp = np.array([[1,2,3,4],[3,4,5,6]])

# set(temp)
# unhashable type: 'numpy.ndarray'

set(temp.flatten())
----result----------
{1, 2, 3, 4, 5, 6}

7.2 unique

np.unique(temp)
----result----------
array([1, 2, 3, 4, 5, 6])

8.ndarray运算

8.1 逻辑运算

8.1.1 逻辑判断 ---布尔索引 data[ 判断条件 ]

stock_change = np.random.normal(loc=0,scale=1,size=(8,10))
stock_change
----result----------
array([[-0.0245709 ,  0.68795842, -0.95063999, -0.53223192,  1.303397  ,
        -0.06299074, -0.95125381, -0.82551705,  0.42587723, -0.11330232],
       [-0.23024816, -0.41551718, -0.01701004,  0.25067482, -0.63199918,
         0.76156926, -1.41413847,  1.71607016,  1.40861841,  0.56333108],
       [-0.33762242, -0.9108106 , -0.57667507, -1.71265711, -0.14329169,
        -0.42219945, -1.30530528, -1.66632015,  0.66312763, -0.15927883],
       [ 0.95633189, -0.76190147,  3.62513955,  0.79395741, -1.05363437,
        -1.19484002,  1.05985014, -0.34990448,  0.5154208 , -0.61577867],
       [ 1.62141953, -0.45131205,  1.22414137, -0.12214553, -0.94923934,
         0.07767484,  0.30012161, -0.78144416, -0.5917796 ,  0.42159082],
       [ 0.77102353, -0.66785462, -0.18199085,  0.51546877,  0.89751432,
        -1.20205727, -1.22355634,  0.9701325 ,  0.66979969, -1.31552754],
       [ 1.48046995,  0.76282364, -1.00391509,  2.32600983,  0.37431788,
        -0.54506338,  0.44654218, -0.26642661, -1.0385316 , -0.29532037],
       [ 0.32563363,  0.73159911, -0.6522114 ,  0.0418548 , -0.02176352,
        -0.34409397, -0.59366906,  0.25135939,  1.4580353 ,  0.29256462]])
# 布尔索引   逻辑判断,如果过涨跌幅大于0.5就标记为True 否则为False
stock_change > 0.5
----result----------
array([[False,  True, False, False,  True, False, False, False, False,
        False],
       [False, False, False, False, False,  True, False,  True,  True,
         True],
       [False, False, False, False, False, False, False, False,  True,
        False],
       [ True, False,  True,  True, False, False,  True, False,  True,
        False],
       [ True, False,  True, False, False, False, False, False, False,
        False],
       [ True, False, False,  True,  True, False, False,  True,  True,
        False],
       [ True,  True, False,  True, False, False, False, False, False,
        False],
       [False,  True, False, False, False, False, False, False,  True,
        False]])
# 布尔索引       把逻辑判断节点,放在索引中,可以获取满足条件的数据
stock_change[stock_change>0.5]  = 1.1
stock_change[stock_change>0.5]
----result----------
array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1,
       1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1])

8.1.2 通用判断函数 all 、 any

# 通用判断函数    判断stock_change[0:2,0:5] 是否全是上涨的
stock_change[0:2,0:5] > 0
----result----------
array([[False,  True, False, False,  True],
       [False, False, False,  True, False]])
# np.all() 判断条件是   是否索引判断中都满足条件    即都返回true
#            若满足则为 true   若不满足则为false
np.all(stock_change[0:2,0:5] > 0)
----result----------
False
# np.any() 判断条件是   是否至少有一个元素满足条件  只要有一个满足即返回 True
np.any(stock_change[0:2,0:5] > 0 )
----result----------
True

8.2 三元运算符 where

# 判断前四个股票前四天的涨跌幅 大于0的置为1 ,否则为0
temp = stock_change[:4,:4]
temp
----result----------
array([[-0.0245709 ,  1.1       , -0.95063999, -0.53223192],
       [-0.23024816, -0.41551718, -0.01701004,  0.25067482],
       [-0.33762242, -0.9108106 , -0.57667507, -1.71265711],
       [ 1.1       , -0.76190147,  1.1       ,  1.1       ]])
temp>0
----result----------
array([[False,  True, False, False],
       [False, False, False,  True],
       [False, False, False, False],
       [ True, False,  True,  True]])
# np.where 进行三元运算,并将满足条件的值置为 1   不满足条件的置为 0
np.where(temp > 0 , 1 ,0 )
----result----------
array([[0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0],
       [1, 0, 1, 1]])

8.3 复合逻辑需要结合np.logical_and 和 np.logical_or 使用

# 判断 前四个股票前四天的涨跌幅 大于0.5并且小于1的,换为1,否则为0
# 判断 前四个股票前四天的涨跌幅 大于0.5或者小于-0.5 , 换为1 ,否则为0
# temp > 0.5 and temp < 1
# ---------------------------------------------------------------------------
# ValueError                                Traceback (most recent call last)
# Input In [134], in <cell line: 1>()
# ----> 1 temp > 0.5 and temp < 1

# ValueError: The truth value of an array with more 
#     than one element is ambiguous. Use a.any() or a.all()
# 判断 前四个股票前四天的涨跌幅 大于0.5并且小于1的,换为1,否则为0
log= np.logical_and(temp > 0.5 ,temp < 1)
log
----result----------
array([[False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False]])
np.where(log,1,0)
----result----------
array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])
log2= np.logical_or(temp > 0.5 ,temp < -0.5)
log2
----result----------
array([[False,  True,  True,  True],
       [False, False, False, False],
       [False,  True,  True,  True],
       [ True,  True,  True,  True]])
np.where(log2,1,0)
----result----------
array([[0, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 1, 1, 1],
       [1, 1, 1, 1]])

9.统计计算

9.1.统计指标

# 统计指标
# 统计指标函数  min \ max \ mean  \ median  \ var \ std
# np.函数名(数组)   或者     数组.方法名()
temp
----result----------
array([[-0.0245709 ,  1.1       , -0.95063999, -0.53223192],
       [-0.23024816, -0.41551718, -0.01701004,  0.25067482],
       [-0.33762242, -0.9108106 , -0.57667507, -1.71265711],
       [ 1.1       , -0.76190147,  1.1       ,  1.1       ]])
# 按列求得最大值 ,axis = 0
temp.max(axis = 0)
----result----------
array([1.1, 1.1, 1.1, 1.1])
# 按行求得最小值  axis = -1 或者   1  
np.min(temp,axis = 1)
np.min(temp,axis = -1)
----result----------
array([-0.95063999, -0.41551718, -1.71265711, -0.76190147])
# 返回最大值的索引  -- 按行 求最大值
np.argmax(temp,axis= 1)
----result----------
array([1, 3, 0, 0], dtype=int64)
# 返回最小值的索引  -- 按列 求最小值
temp.argmin(axis = 0)
----result----------
array([2, 2, 0, 2], dtype=int64)

10.数组间的运算

10.1 数组与数的运算

# 数组可以与数直接计算
arr = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr = arr  /  10
arr
----result----------
array([[0.1, 0.2, 0.3, 0.2, 0.1, 0.4],
       [0.5, 0.6, 0.1, 0.2, 0.3, 0.1]])
# 列表不能直接进行数值运算:a * 3 只是把列表重复3次
a = [1,2,3]
a * 3
----result----------
[1, 2, 3, 1, 2, 3, 1, 2, 3]

10.2 数组与数组的运算

arr1 = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr2 = np.array([[1, 2, 3, 4], [3, 4, 5, 6]])
arr1.shape
----result----------
(2, 6)
arr1.ndim
----result----------
2
arr2.shape
----result----------
(2, 4)
arr2.ndim
----result----------
2
# arr1  + arr2
# ---------------------------------------------------------------------------
# ValueError                                Traceback (most recent call last)
# Input In [182], in <cell line: 1>()
# ----> 1 arr1  + arr2

# ValueError: operands could not be broadcast together with shapes (2,6) (2,4) 
# 广播机制:从最后一维开始逐维比较,每一维都满足下列条件之一,才可以进行数组间的运算
# 1.该维度的长度相等
# 2.其中一个数组在该维度的长度为1
arr1 = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr2 = np.array([[1], [3]])
arr1.shape
----result----------
(2, 6)
arr2.shape
----result----------
(2, 1)
arr3 = arr1 + arr2 
arr3.shape
----result----------
(2, 6)

10.3 矩阵的运算

# 矩阵matrix  ----  二维数组
# 矩阵 与 二维数组区别
# 矩阵可以直接进行运算符运算 , array 的二维数组需要调用方法
# ndarray存储矩阵
data = np.array([[80, 86],
[82, 80],
[85, 78],
[90, 90],
[86, 82],
[82, 90],
[78, 80],
[92, 94]])
data
----result----------
array([[80, 86],
       [82, 80],
       [85, 78],
       [90, 90],
       [86, 82],
       [82, 90],
       [78, 80],
       [92, 94]])
# matrix存储矩阵
data_mat = np.mat([[80, 86],
[82, 80],
[85, 78],
[90, 90],
[86, 82],
[82, 90],
[78, 80],
[92, 94]])
data_mat
----result----------
matrix([[80, 86],
        [82, 80],
        [85, 78],
        [90, 90],
        [86, 82],
        [82, 90],
        [78, 80],
        [92, 94]])

10.4 矩阵乘法运算

# 矩阵乘法的两个关键
# 1.形状改变
# 2.运算规则
# (M行,N列) *   (N行 ,L列 )   注:N列与N行 要对应,否则不能进行矩阵运算
weights = np.array([[0.3],[0.7]])
weights
----result----------
array([[0.3],
       [0.7]])
weights_mat = np.mat([[0.3],[0.7]])
# 两种方法 
# 1. np.matmul  矩阵相乘
# 2. np.dot  点乘
# 3. 拓展(了解)
np.matmul(data,weights)
----result----------
array([[84.2],
       [80.6],
       [80.1],
       [90. ],
       [83.2],
       [87.6],
       [79.4],
       [93.4]])
np.dot(data,weights)
----result----------
array([[84.2],
       [80.6],
       [80.1],
       [90. ],
       [83.2],
       [87.6],
       [79.4],
       [93.4]])
data_mat  *  weights_mat
----result----------
matrix([[84.2],
        [80.6],
        [80.1],
        [90. ],
        [83.2],
        [87.6],
        [79.4],
        [93.4]])

11. 合并与分割

11.1 合并

11.1.1 水平拼接 hstack

a = np.array((1,2,3))
b = np.array((2,3,4))
np.hstack((a,b))
----result----------
array([1, 2, 3, 2, 3, 4])
a = np.array([[1],[2],[3]])
b = np.array([[2],[3],[4]])
np.hstack((a,b))
----result----------
array([[1, 2],
       [2, 3],
       [3, 4]])

11.1.2 竖直拼接 vstack

a = np.array((1,2,3))
b = np.array((2,3,4))
np.vstack((a,b))
----result----------
array([[1, 2, 3],
       [2, 3, 4]])
a = np.array([[1],[2],[3]])
b = np.array([[2],[3],[4]])
np.vstack((a,b))
----result----------
array([[1],
       [2],
       [3],
       [2],
       [3],
       [4]])

11.1.3 合并拼接 concatenate axis = 0 竖直 1或-1 水平

a = np.array([[1,2],[3,4]])
b = np.array([[5,6]])
np.concatenate((a,b),axis = 0)
----result----------
array([[1, 2],
       [3, 4],
       [5, 6]])
np.concatenate((a,b.T),axis = 1)
----result----------
array([[1, 2, 5],
       [3, 4, 6]])

11.2 分割

x = np.arange(9.0)
x
----result----------
array([0., 1., 2., 3., 4., 5., 6., 7., 8.])
# 按照份数进行等分(平均分成3份)
np.split(x,3)
----result----------
[array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])]
# 按照索引进行分割
np.split(x,[3,5,6,10])
----result----------
[array([0., 1., 2.]),
 array([3., 4.]),
 array([5.]),
 array([6., 7., 8.]),
 array([], dtype=float64)]

12. IO操作与数据处理

12.1 Numpy读取

data = np.genfromtxt("./day3/day3资料/02-代码/test.csv",delimiter=",")

12.2 如何处理缺失值

# 两种处理缺失值数据方法
# 1.直接删除缺失值数据
# 2.将缺失值替换补全数据
# nan  NAN  类型为  float64
# t = data[2,2]
# type(t)
def fill_nan_by_column_mean(t):
    """Fill the NaN entries of a 2-D array, in place, with their column mean.

    Each column is handled independently: its NaN entries are replaced by the
    mean of the non-NaN entries of that same column. Columns without NaN are
    left untouched. A column consisting only of NaN has no defined mean, so
    it is left unchanged as well (instead of dividing zero by zero).

    Args:
        t: 2-D numeric NumPy array (it is indexed as ``t[:, i]``). It is
            modified in place.

    Returns:
        The same array object ``t`` that was passed in.
    """
    for i in range(t.shape[1]):
        # Boolean mask of the NaN positions in the current column.
        nan_mask = np.isnan(t[:, i])
        nan_num = np.count_nonzero(nan_mask)
        # Nothing to fill, or nothing to average from: skip this column.
        if nan_num == 0 or nan_num == t.shape[0]:
            continue
        # Mean of the valid entries, written straight into the NaN slots.
        t[nan_mask, i] = t[~nan_mask, i].mean()
    return t
# fill_nan_by_column_mean(data)

三、Pandas高级处理 1

  • 数据处理工具

    • panel + data + analysis
    • panel面板数据 - 计量经济学 三维数据
  • 为什么使用pandas

    • 便捷的数据处理能力
    • 读取文件方便
    • 封装了Matplotlib、Numpy的画图和计算
  • DataFrame

    • 既有行索引又有列索引
    • 二维数组(固定不变的)
    • 属性:
      • shape
      • index
      • columns
      • values
      • T
    • 方法:
      • head()
      • tail()
    • DataFrame索引设置
      • 修改行列索引值
      • 重设索引
      • 设置新索引
  • Panel

    • DataFrame的容器
  • Series

    • 带索引的一维数组
    • 属性
      • index
      • values
  • 总结

    • DataFrame是Series的容器
    • Panel 是DataFrame的容器
import numpy as np
# 创建一个符合正态分布的10个股票5天的涨跌幅数据
# 均值为 0   标准差为  1    生成的数据为  10行5列  
stock_change = np.random.normal(0,1,(10,5))
stock_change
----result----------
array([[ 0.62392828,  1.49780091, -2.24893185,  1.42890353,  1.0371102 ],
       [ 1.26229577, -0.01889369, -1.03854223, -0.84674139, -0.58421933],
       [ 0.16216979, -0.99435009,  0.95895474, -0.98746556, -0.98492345],
       [ 0.16194925,  0.2109128 , -0.45130401,  1.32244156,  1.0676456 ],
       [ 0.26151688,  0.25919852, -0.6945317 ,  1.35207356, -0.03483866],
       [-0.03017989, -0.35822256, -0.83436719,  0.55108206,  0.10087387],
       [-1.29781219,  0.93603652, -0.54495118, -0.60376686, -2.24740919],
       [-1.05103675, -0.15612166, -0.3558128 , -0.30679821, -1.38390409],
       [ 0.26297504, -0.57099557,  0.40567368, -0.87836647,  0.4373559 ],
       [ 1.95955588,  1.36344196, -1.0547673 ,  0.08483486,  0.72349534]])

1.DataFrame

import pandas as pd
pd.DataFrame(stock_change)  # 将其转换为二维数组表形式

1.1 添加行 、列索引

# 构造一个行索引列表
stock_name =  [ "股票{}".format(i) for i in range(10)]
# 构造一个列索引时间列表  ---  使用pd 时间工具
date =  pd.date_range(start ="20180101", periods=5,freq="B")
# index  为行索引     columns 为列索引 
data = pd.DataFrame(stock_change,
             index= stock_name, 
             columns = date)

1.2 数据属性之行索引

data.index
--------result-------------
Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')

1.3 数据属性之列索引

data.columns
--------result-------------
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='B')

1.4 数据属性之数据结构类型

# shape  数据结构类型 10行5列
data.shape
--------result-------------
(10, 5)

1.5 数据属性之数据值

# values  直接获取其中array的值
data.values
--------result-------------
array([[ 0.62392828,  1.49780091, -2.24893185,  1.42890353,  1.0371102 ],
       [ 1.26229577, -0.01889369, -1.03854223, -0.84674139, -0.58421933],
       [ 0.16216979, -0.99435009,  0.95895474, -0.98746556, -0.98492345],
       [ 0.16194925,  0.2109128 , -0.45130401,  1.32244156,  1.0676456 ],
       [ 0.26151688,  0.25919852, -0.6945317 ,  1.35207356, -0.03483866],
       [-0.03017989, -0.35822256, -0.83436719,  0.55108206,  0.10087387],
       [-1.29781219,  0.93603652, -0.54495118, -0.60376686, -2.24740919],
       [-1.05103675, -0.15612166, -0.3558128 , -0.30679821, -1.38390409],
       [ 0.26297504, -0.57099557,  0.40567368, -0.87836647,  0.4373559 ],
       [ 1.95955588,  1.36344196, -1.0547673 ,  0.08483486,  0.72349534]])

1.6 数据属性之转置

# T  转置
data.T

1.7 属性常用方法

1.7.1 head: 显示前几行

data.head(3)

1.7.2 tail:显示后几行

data.tail(3)

2.DataFrame之索引设置

2.1 修改行、列索引值 (只能整体修改)

# data.index[2] = "股票88"   #不能单独修改索引
# TypeError: Index does not support mutable operations
# 修改行、列索引操作
stock_ = ["股票{}".format(i)  for i in range(10,20)]
col    = pd.date_range(start="20190101",periods=5,freq="B")
data.index = stock_
data.columns = col
stock_
-------------result------------------
['股票10',
 '股票11',
 '股票12',
 '股票13',
 '股票14',
 '股票15',
 '股票16',
 '股票17',
 '股票18',
 '股票19']
data.index
-------------result------------------
Index(['股票10', '股票11', '股票12', '股票13', '股票14', '股票15', '股票16', '股票17', '股票18',
       '股票19'],
      dtype='object')
data.columns
-------------result------------------
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-07'],
              dtype='datetime64[ns]', freq='B')

2.2 重设行索引

data.reset_index().shape
-------------result------------------
(10, 6)
data.reset_index(drop=True).shape
-------------result------------------
(10, 5)

2.3 设置新索引

df = pd.DataFrame({
    'month': [1,4,7,10],
    'year' : [2012,2014,2013,2014],
    'sale' : [55,40,84,31]
})
df

# 以月份设置新的行索引
df.set_index("month",drop=False)

# 设置多个索引,以年和月份
new_df = df.set_index(['year','month'],drop=False)
new_df.index
-------------result------------------
MultiIndex([(2012,  1),
            (2014,  4),
            (2013,  7),
            (2014, 10)],
           names=['year', 'month'])

2.4 MultiIndex

  • 多级或分层索引对象
  • index属性
  • names: levels的名称
  • levels:每个level的元组值
new_df.index.names
-------------result------------------
FrozenList(['year', 'month'])
new_df.index.levels
-------------result------------------
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

3.Panel

  • 存储3维数组的Panel结构
# #最新版本已经移除了panel
# p = pd.Panel(np.arange(24).reshape(4,3,2),
#                  items=list('ABCD'),
#                  major_axis=pd.date_range('20130101', periods=3),
#                  minor_axis=['first', 'second'])


# module 'pandas' has no attribute 'Panel'

4.Series

4.1 series属性

# series结构只有行索引
sr = data.iloc[1,:]
sr
-------------result------------------
2019-01-01    1.262296
2019-01-02   -0.018894
2019-01-03   -1.038542
2019-01-04   -0.846741
2019-01-07   -0.584219
Freq: B, Name: 股票11, dtype: float64
sr.index
-------------result------------------
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-07'],
              dtype='datetime64[ns]', freq='B')
sr.values
-------------result------------------
array([ 1.26229577, -0.01889369, -1.03854223, -0.84674139, -0.58421933])
type(sr.values)
-------------result------------------
numpy.ndarray

4.2 指定索引

# 指定内容,默认索引
pd.Series(np.arange(3,9,2))
-------------result------------------
0    3
1    5
2    7
dtype: int32
#  指定索引
pd.Series(np.arange(3,9,2),index=["a","b","c"])
-------------result------------------
a    3
b    5
c    7
dtype: int32

4.3 通过字典数据创建

# 通过字典数据创建
pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
-------------result------------------
red        100
blue       200
green      500
yellow    1000
dtype: int64

5.基本数据操作

5.1 数据获取

data = pd.read_csv("./day3/day3资料/02-代码/stock_day/stock_day.csv")
data

5.2 删除字段

# 可以将不想看到的字段进行删除 按照列进行删除
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data

6.索引操作

6.1 直接索引

# numpy下索引
stock_change[1,1]

# pandas中DataFrame不能够直接索引
# data[1,2]
-------------result-----------
-0.01889369395385527
# 直接定位------必须先列后行
data["open"]["2018-02-26"]
-------------result-----------
22.8

6.2 名字索引(常用)

# 通过列表的形式 可以先行后列索引
data.loc["2018-02-26","open"]
-------------result-----------
22.8

6.3 数字索引(常用)

data.iloc[1,0]
-------------result-----------
22.8

6.4 组合索引

# 现在这个方法不能使用了
# data.ix[:4, ['open', 'close', 'high', 'low']]
#  'DataFrame' object has no attribute 'ix'
# 推荐使用  ------     loc  + index 方式
data.loc[data.index[0:4],['open','close','high','low'] ]

# 推荐使用  -----   iloc  + columns
col = data.columns.get_indexer(['open','close','high','low'])
col
-------------result-----------
array([0, 2, 1, 3], dtype=int64)
data.iloc[0:4,col]

7.赋值操作

7.1 对整列数据进行赋值

# 对整列数据进行赋值
data.open = 100
data

7.2 对某一个值进行赋值

# 对某一个值进行赋值
data.iloc[1,0] = 222
data

8.排序

8.1 内容排序

# 对单键或多键进行排序,ascending = True : 升序   False :降序
data.sort_values(by=["high"],ascending=True)

data.sort_values(by=["high","low"],ascending=False)

8.2 索引排序

data.sort_index().head()

9.算术运算

9.1 作用于一列

# 直接使用算术加法
data["open"].head()
--------result------------------
2018-02-27    100
2018-02-26    222
2018-02-23    100
2018-02-22    100
2018-02-14    100
Name: open, dtype: int64
data["open"] + 3
--------result------------------
2018-02-27    103
2018-02-26    225
2018-02-23    103
2018-02-22    103
2018-02-14    103
             ...
2015-03-06    103
2015-03-05    103
2015-03-04    103
2015-03-03    103
2015-03-02    103
Name: open, Length: 643, dtype: int64
# 使用函数进行加法运算
data["open"].add(3).head()
--------result------------------
2018-02-27    103
2018-02-26    225
2018-02-23    103
2018-02-22    103
2018-02-14    103
Name: open, dtype: int64
# 减法运算
data.sub(100).head()

# 查看股票的涨跌情况
data["close"].sub(data["open"]).head()
--------result------------------
2018-02-27    -75.84
2018-02-26   -198.47
2018-02-23    -77.18
2018-02-22    -77.72
2018-02-14    -78.08
dtype: float64

9.2 直接作用于整个数据集合

data / 10 

10.逻辑运算 < 、> 、| 、 &

10.1 布尔索引

# 例如筛选p_change > 2 的日期数据
data[data["p_change"] > 2 ].head()

# 完成一个多个逻辑判断,筛选p_change > 2 并且 low > 15
data[(data["p_change"] > 2 ) & (data["low"] > 15)]

data[(data["p_change"] > 2 ) | (data["low"] < 15)]

11. 统计计算

11.1 describe()

# 数据描述
data.describe()

# count 数量    mean 平均值    std 标准差
# min   最小值  max  最大值

#最大值
data.max(axis=0)
--------result------------------
open               222.00
high                36.35
close               35.21
low                 34.01
volume          501915.41
price_change         3.03
p_change            10.03
turnover            12.56
dtype: float64
# 最小值
data.min(axis=1)
--------result------------------
2018-02-27    0.63
2018-02-26    0.69
2018-02-23    0.54
2018-02-22    0.36
2018-02-14    0.44
              ...
2015-03-06    1.12
2015-03-05    0.26
2015-03-04    0.20
2015-03-03    0.18
2015-03-02    0.32
Length: 643, dtype: float64
# 标准差
data.std()
--------result------------------
open                4.811210
high                4.077578
close               3.942806
low                 3.791968
volume          73879.119354
price_change        0.898476
p_change            4.079698
turnover            2.079375
dtype: float64
# 最大值所在索引
data.idxmax()
--------result------------------
open            2018-02-26
high            2015-06-10
close           2015-06-12
low             2015-06-12
volume          2017-10-26
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object

12.累计统计函数

# 函数                      作用

# cumsum                    1 + 2 + 3 + 4 + ...  =  

# cumprod                   1 * 2 * 3 * 4 * ...  =  

# cummax                    计算1/2/3/.../n 个数的最大值

# cummin                    计算1/2/3/.../n 个数的最小值
data["p_change"]
--------result------------------
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
              ...
2015-03-06    8.51
2015-03-05    2.02
2015-03-04    1.57
2015-03-03    1.44
2015-03-02    2.62
Name: p_change, Length: 643, dtype: float64
data["p_change"].cumsum()
--------result------------------
2018-02-27      2.68
2018-02-26      5.70
2018-02-23      8.12
2018-02-22      9.76
2018-02-14     11.81
               ...
2015-03-06    114.70
2015-03-05    116.72
2015-03-04    118.29
2015-03-03    119.73
2015-03-02    122.35
Name: p_change, Length: 643, dtype: float64
data["p_change"].cumprod()
--------result------------------
2018-02-27     2.680000
2018-02-26     8.093600
2018-02-23    19.586512
2018-02-22    32.121880
2018-02-14    65.849853
                ...
2015-03-06    -0.000000
2015-03-05    -0.000000
2015-03-04    -0.000000
2015-03-03    -0.000000
2015-03-02    -0.000000
Name: p_change, Length: 643, dtype: float64
data["p_change"].cummax()
--------result------------------
2018-02-27     2.68
2018-02-26     3.02
2018-02-23     3.02
2018-02-22     3.02
2018-02-14     3.02
              ...
2015-03-06    10.03
2015-03-05    10.03
2015-03-04    10.03
2015-03-03    10.03
2015-03-02    10.03
Name: p_change, Length: 643, dtype: float64
data["p_change"].cummin()
--------result------------------
2018-02-27     2.68
2018-02-26     2.68
2018-02-23     2.42
2018-02-22     1.64
2018-02-14     1.64
              ...
2015-03-06   -10.03
2015-03-05   -10.03
2015-03-04   -10.03
2015-03-03   -10.03
2015-03-02   -10.03
Name: p_change, Length: 643, dtype: float64
data["p_change"].sort_index().cumsum().plot()
--------result------------------
<AxesSubplot:>

png

13.自定义运算

#  apply(func,axis=0)
#     func:自定义函数
#     axis = 0 默认按列运算 axis = 1 按行进行运算

# 定义一个计算每列 最大值 - 最小值(极差)的函数
data.apply(lambda x : x.max() - x.min() )
--------result------------------
open               122.00
high                23.68
close               22.85
low                 21.81
volume          500757.29
price_change         6.55
p_change            20.06
turnover            12.52
dtype: float64

14.pandas画图

# data.plot( x = , y =  , kind = )
#kind =  line (默认)  ,bar  barh  hist ,scatter  ,pie
data.plot(x="p_change",y="turnover",kind="scatter")
--------result------------------
<AxesSubplot:xlabel='p_change', ylabel='turnover'>

png

data.plot(x="volume",y="turnover",kind="scatter")
<AxesSubplot:xlabel='volume', ylabel='turnover'>

png

data.plot(x="high",y="low",kind="scatter")
--------result------------------
<AxesSubplot:xlabel='high', ylabel='low'>![png](https://img2023.cnblogs.com/blog/3186882/202306/3186882-20230630141351565-1382016683.png)

15.文件的读取与存储(重点)

15.1 CSV文件的读取和存储

# pandas.read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer',names=None, index_col=None, usecols=None)
# usecols     使用哪些字段
# names       为各个字段取名
# index_col   将某一字段设为索引
# sep         用sep参数设置分隔符
# nrows       需要获取的行数
pd.read_csv("./day3/day3资料/02-代码/stock_day/stock_day.csv",usecols = ["high","low","open","close"])

# CSV文件的读取
data = pd.read_csv("./day3/day3资料/02-代码/stock_day2.csv", names=["open", "high", "close", "low", "volume", "price_change", "p_change", "ma5", "ma10", "ma20", "v_ma5", "v_ma10", "v_ma20", "turnover"])
data

# CSV文件的存储
data[:10].to_csv("./test.csv", 
                 columns=["open","close"],
                 index=False,
                 mode = "a",
                 header =False)
pd.read_csv("test.csv")

15.2 SQL的读取与存储

# import pymysql
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# # 1.获取数据
# conn = pymysql.connect(host='localhost',
#                        user='root',
#                        password='123456',
#                        database='pms')
# data = pd.read_sql("select * from test_lcc limit 100 ;",con=conn)
# df = pd.DataFrame(data)

15.3 JSON文件的读取与存储

# JSON文件读取
sa = pd.read_json("./day3/day3资料/02-代码/Sarcasm_Headlines_Dataset.json",orient="records",lines = True)
sa

sa.to_json("test.json",orient = "records",lines = True)
pd.read_json("test.json",orient = "records",lines = True)

15.4 HDF5文件的读取与存储

day_close = pd.read_hdf("./day3/day3资料/02-代码/stock_data/day/day_close.h5")
day_close

day_close.to_hdf("test.h5",key="close")
pd.read_hdf("test.h5")

Pandas 高级处理2

1.如何进行缺失值处理

两种思路

  • 删除含有缺失值的样本
  • 替换/插补
import pandas as pd
import numpy  as np 
movie = pd.read_csv("./day4/day4资料/02-代码/IMDB/IMDB-Movie-Data.csv")
movie.head()

1.1 查看是否有缺失值

1.1.1 方法一 isnull + any

# any 有一个True  就返回true  所以当movie中有空值则返回 true
np.any(pd.isnull(movie))   # 返回true,说明数据中存在缺失值
--------result------------------
True
# 统计各个字段是否存在缺失值
# Revenue (Millions)     True
# Metascore              True

pd.isnull(movie).any()  # 返回true 有空值,返回false 无空值
--------result------------------
Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)     True
Metascore              True
dtype: bool

1.1.2 方法二 notnull + all

# all 全部是True 就返回true 否则返回false  所以当movie中有空值则返回false
np.all(pd.notnull(movie))  # 返回false,说明数据中存在缺失值
--------result------------------
False
pd.notnull(movie).all()  # 返回false 有空值 , 返回true 无空值
--------result------------------
Rank                   True
Title                  True
Genre                  True
Description            True
Director               True
Actors                 True
Year                   True
Runtime (Minutes)      True
Rating                 True
Votes                  True
Revenue (Millions)    False
Metascore             False
dtype: bool

1.1.3 方法三 isnull + sum

# 统计各个字段的缺失值个数
pd.isnull(movie).sum()
--------result------------------
Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64
movie.isnull().sum()
--------result------------------
Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

2.缺失值处理

2.1 方法一 : 删除含有缺失值的样本

# 方法一 : 删除含有缺失值的样本
# dropna(axis=0, how="any", thresh=None, subset=None, inplace=False)
data1 = movie.dropna()
data1.head()

# 处理过的数据不存在缺失值
data1.isnull().any()
--------result------------------
Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
dtype: bool
data1.isnull().sum()
--------result------------------
Rank                  0
Title                 0
Genre                 0
Description           0
Director              0
Actors                0
Year                  0
Runtime (Minutes)     0
Rating                0
Votes                 0
Revenue (Millions)    0
Metascore             0
dtype: int64

2.2 方法二 : 替换

# 方法二 替换
# fillna(value=None, method=None, axis=None, inplace=False, limit=None)
# 1.使用平均值替换   inplace  = true   对原有数据集进行修改
# 使用 method{‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None

# 使用下一个有效值填充(向前回填)
# df.fillna(method='backfill')

# # 同 backfill
# df.fillna(method='bfill')

# # 把当前值广播到后边的缺失值
# df.fillna(method='pad')

# # 同 pad
# df.fillna(method='ffill')
# Fill the missing values of both columns with the column mean.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and leaves `movie` unchanged under copy-on-write.
movie["Revenue (Millions)"] = movie["Revenue (Millions)"].fillna(
    movie["Revenue (Millions)"].mean())
movie["Metascore"] = movie["Metascore"].fillna(movie["Metascore"].mean())
# 查看是否有缺失值 返回true 则不存在缺失值
movie.notnull().all()
--------result------------------
Rank                  True
Title                 True
Genre                 True
Description           True
Director              True
Actors                True
Year                  True
Runtime (Minutes)     True
Rating                True
Votes                 True
Revenue (Millions)    True
Metascore             True
dtype: bool

2.3 不是缺失值NaN,有默认标记的

# 读取数据
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
name = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"]

data = pd.read_csv(path, names=name)
data

# 1.替换
# 将 ? 替换成  nan
data_new = data.replace(to_replace= "?",value= np.nan)
# 2.查看是否存在缺失值
# Bare Nuclei                     True
data_new.isnull().any()
--------result------------------
Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                     True
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool
# 3.删除缺失值
data_new.dropna(inplace=True)
# 4.查看是否存在缺失值
data_new.isnull().any()
--------result------------------
Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool
# 5.补充:
# 5.1 查看数据集合中字段的数据类型
data_new.dtypes
--------result------------------
Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object
# 5.2 观察数据
data_new.info()
--------result------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           683 non-null    int64
 1   Clump Thickness              683 non-null    int64
 2   Uniformity of Cell Size      683 non-null    int64
 3   Uniformity of Cell Shape     683 non-null    int64
 4   Marginal Adhesion            683 non-null    int64
 5   Single Epithelial Cell Size  683 non-null    int64
 6   Bare Nuclei                  683 non-null    object
 7   Bland Chromatin              683 non-null    int64
 8   Normal Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(10), object(1)
memory usage: 64.0+ KB
# 5.3 统计每个字段缺失值个数
data_new.isnull().sum()
--------result------------------
Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64
data_new["Bare Nuclei"] = data_new["Bare Nuclei"].astype("int64")
data_new.info()
--------result------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           683 non-null    int64
 1   Clump Thickness              683 non-null    int64
 2   Uniformity of Cell Size      683 non-null    int64
 3   Uniformity of Cell Shape     683 non-null    int64
 4   Marginal Adhesion            683 non-null    int64
 5   Single Epithelial Cell Size  683 non-null    int64
 6   Bare Nuclei                  683 non-null    int64
 7   Bland Chromatin              683 non-null    int64
 8   Normal Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(11)
memory usage: 64.0 KB
# 5.4 pandas内置的处理函数  to_numeric     to_datetime

# df['Jan Units'] = pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)
# errors参数有: raise, errors默认为raise      ignore 忽略错误    coerce 将错误数据标注为NaN

#df['Start_Date'] = pd.to_datetime(df[['Month', 'Day', 'Year']])

3.数据离散化

# 为什么要离散化  --------简化数据结构

# one_hot编码  哑变量

# 如何实现数据的离散化
# 1) 分组
#         自动分组     pd.qcut( data , q)      q 为分组数(按分位数等频分组)
#         自定义分组   pd.cut( data ,[] )

# 2)  将分组好的结果转换成one_hot编码 
#         str 是Series 类型
#             pd.get_dummies(str,prefix=)
# 1)准备数据
data = pd.Series([165,174,160,180,159,163,192,184], 
                 index=['No1:165', 'No2:174','No3:160', 'No4:180', 'No5:159', 'No6:163', 'No7:192', 'No8:184']) 
data
--------result------------------
No1:165    165
No2:174    174
No3:160    160
No4:180    180
No5:159    159
No6:163    163
No7:192    192
No8:184    184
dtype: int64

3.1 自动分组 + one-hot编码

# 2)分组
# 自动分组  --- 将数据 分为  3 组
sr = pd.qcut(data, 3)
sr
--------result------------------
No1:165      (163.667, 178.0]
No2:174      (163.667, 178.0]
No3:160    (158.999, 163.667]
No4:180        (178.0, 192.0]
No5:159    (158.999, 163.667]
No6:163    (158.999, 163.667]
No7:192        (178.0, 192.0]
No8:184        (178.0, 192.0]
dtype: category
Categories (3, interval[float64, right]): [(158.999, 163.667] < (163.667, 178.0] < (178.0, 192.0]]
#### one-hot编码
pd.get_dummies(sr,prefix="height")

3.2 自定义分组 + one-hot编码

# 自定义分组
bins = [150,165,180,195]
sr = pd.cut(data,bins)
sr
--------result------------------
No1:165    (150, 165]
No2:174    (165, 180]
No3:160    (150, 165]
No4:180    (165, 180]
No5:159    (150, 165]
No6:163    (150, 165]
No7:192    (180, 195]
No8:184    (180, 195]
dtype: category
Categories (3, interval[int64, right]): [(150, 165] < (165, 180] < (180, 195]]
# 查看分组各个区间的个数
sr.value_counts()
--------result------------------
(150, 165]    4
(165, 180]    2
(180, 195]    2
dtype: int64
# 转换成one-hot编码
pd.get_dummies(sr,prefix="身高")

4.案例:股票的涨跌幅离散化

# 我们对p_change 进行离散化
# 1.读取数据
stock = pd.read_csv("./day4/day4资料/02-代码/stock_day/stock_day.csv")
p_change = stock["p_change"]
p_change.head()
--------result------------------
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
Name: p_change, dtype: float64
# 2.观察数据信息
p_change.info()
--------result------------------
<class 'pandas.core.series.Series'>
Index: 643 entries, 2018-02-27 to 2015-03-02
Series name: p_change
Non-Null Count  Dtype
--------------  -----
643 non-null    float64
dtypes: float64(1)
memory usage: 10.0+ KB
# 3.观察是否有缺失值
p_change.isnull().sum()
p_change.isnull().any()
p_change.notnull().all()
--------result------------------
True
# 4.处理缺失值   -----    这里没有空值,所以没有处理
#     ①  Series.dropna()          删除缺失值
#     ②  Series.fillna(value)     填充缺失值
# 5.数据离散化
# ① 分组
sr = pd.qcut(p_change,10)
# ② 查看分组
sr.value_counts()
--------result------------------
(-10.030999999999999, -4.836]    65
(-0.462, 0.26]                   65
(0.26, 0.94]                     65
(5.27, 10.03]                    65
(-4.836, -2.444]                 64
(-2.444, -1.352]                 64
(-1.352, -0.462]                 64
(1.738, 2.938]                   64
(2.938, 5.27]                    64
(0.94, 1.738]                    63
Name: p_change, dtype: int64
# ③ 离散化
pd.get_dummies(sr,prefix="涨跌幅")

# ④自定义分组
bins = [-100,-7,-5,-3,0,3,5,7,100]
sr = pd.cut(p_change,bins)
sr
--------result------------------
2018-02-27      (0, 3]
2018-02-26      (3, 5]
2018-02-23      (0, 3]
2018-02-22      (0, 3]
2018-02-14      (0, 3]
                ...
2015-03-06    (7, 100]
2015-03-05      (0, 3]
2015-03-04      (0, 3]
2015-03-03      (0, 3]
2015-03-02      (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64, right]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
# ⑤查看分组
sr.value_counts()
--------result------------------
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(5, 7]         35
(7, 100]       35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
# ⑥数据离散化
stock_change = pd.get_dummies(sr,"涨跌幅")

5.数据合并

# numpy 数据合并
# 水平拼接       np.hstack()
# 竖直拼接       np.vstack()
# 拼接           np.concatenate((a,b),axis=)

5.1 按方向拼接 pd.concat()

# 处理好的one-hot编码与原数据合并
stock.head()

stock_change.head()

5.1.1 水平拼接

# 水平拼接
pd.concat([stock,stock_change],axis = 1)

5.1.2 竖直拼接

# 竖直拼接
pd.concat([stock,stock_change],axis = 0)

5.2 按索引合并 pd.merge()

# pd.merge(left,right,how="连接方式",on=[索引])
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                        'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                        'key2': ['K0', 'K0', 'K0', 'K0'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
left

right

# 内连接    --  左索引 等于 右索引 ,则返回   
pd.merge(left,right,how="inner",on=["key1","key2"])

# 左连接    -- 以左边索引为主,符合左边索引的就返回
pd.merge(left,right,how="left",on=["key1","key2"])

# 右连接   -- 以右边索引为主,符合右边索引的就返回
pd.merge(left,right,how="right",on=["key1","key2"])

# 外连接    -- 保留两边所有的键(并集),匹配不到的位置填充 NaN(并非笛卡尔积)
pd.merge(left,right,how="outer",on=["key1","key2"])

6. 交叉表与透视表什么作用

6.1 交叉表

# 交叉表 pd.crosstab(): 交叉表用于计算一列数据对于另外一列数的分组个数
#           即:寻找两个列之间的关系
# 星期数据以及涨跌幅好坏的 数据
# pd.crosstab(星期数据列,涨跌幅数据列)
stock.index
--------result------------------
Index(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22', '2018-02-14',
       '2018-02-13', '2018-02-12', '2018-02-09', '2018-02-08', '2018-02-07',
       ...
       '2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10', '2015-03-09',
       '2015-03-06', '2015-03-05', '2015-03-04', '2015-03-03', '2015-03-02'],
      dtype='object', length=643)
# pandas 处理日期类型  API
date = pd.to_datetime(stock.index)
date.weekday
--------result------------------
Int64Index([1, 0, 4, 3, 2, 1, 0, 4, 3, 2,
            ...
            4, 3, 2, 1, 0, 4, 3, 2, 1, 0],
           dtype='int64', length=643)
# 在原始数据集中添加一列数据
stock["weekday"] = date.weekday
stock["weekday"]
--------result------------------
2018-02-27    1
2018-02-26    0
2018-02-23    4
2018-02-22    3
2018-02-14    2
             ..
2015-03-06    4
2015-03-05    3
2015-03-04    2
2015-03-03    1
2015-03-02    0
Name: weekday, Length: 643, dtype: int64
# 准备涨跌幅数据列   大于 0 返回 1   否则返回  0
stock["pona"] = np.where(stock["p_change"] > 0  ,  1  , 0)
stock["pona"] 
--------result------------------
2018-02-27    1
2018-02-26    1
2018-02-23    1
2018-02-22    1
2018-02-14    1
             ..
2015-03-06    1
2015-03-05    1
2015-03-04    1
2015-03-03    1
2015-03-02    1
Name: pona, Length: 643, dtype: int32
stock.head()

# 交叉表
data = pd.crosstab(stock["weekday"],stock["pona"])
data

#  按行进行求和
data.sum(axis = 1)
--------result------------------
weekday
0    125
1    131
2    132
3    128
4    127
dtype: int64
# 按列进行求和
data.sum(axis = 0)
--------result------------------
pona
0    301
1    342
dtype: int64
# DataFrame.div 除法方法    第一个参数是除数(这里为每行的求和)  第二个参数 axis=0 表示按行索引对齐,即每行的数据值除以该行的和

plot_data = data.div(data.sum(axis=1),axis=0)
plot_data

# pd.plot()绘图     bar  为柱状图    stacked  =  True  为图像是否堆叠
plot_data.plot(kind="bar",stacked=True)
--------result------------------
<AxesSubplot:xlabel='weekday'>

png

pd.crosstab(stock["weekday"],stock["pona"])
data.div(data.sum(axis=1),axis=0)

6.2 透视表

# 透视表
stock.pivot_table(["pona"],index=["weekday"])

7.分组与聚合

col =pd.DataFrame({'color': ['white','red','green','red','green'],
                   'object': ['pen','pencil','pencil','ashtray','pen'],
                   'price1':[5.56,4.20,1.30,0.56,2.75],
                   'price2':[4.75,4.12,1.60,0.75,3.15]},index=["a","b","c","d","e"])
col

# 进行分组,对颜色进行分组,price进行聚合
col.groupby(by="color")["price1"].max()
--------result------------------
color
green    2.75
red      4.20
white    5.56
Name: price1, dtype: float64
col["price1"].groupby(col["color"]).count()
--------result------------------
color
green    2
red      2
white    1
Name: price1, dtype: int64

8.案例:星巴克零售店铺数据案例

# 1.读取数据
data = pd.read_csv("./day4/day4资料/02-代码/directory.csv")
data.head()

# 2.按照国家进行分组,并求出数量
data.groupby(by="Country").count()

# 按照国家进行分组,取出 Brand字段数据
count_data = data.groupby(by="Country").count()["Brand"]
count_data
--------result------------------
Country
AD        1
AE      144
AR      108
AT       18
AU       22
      ...
TT        3
TW      394
US    13608
VN       25
ZA        3
Name: Brand, Length: 73, dtype: int64
# 按照国家进行分组,取出 Brand字段数据 并排序,最后绘制出图表
count_data.sort_values(ascending=False)[:10].plot(kind = "bar")
--------result------------------
<AxesSubplot:xlabel='Country'>

png

9.综合案例

# 1.准备数据
movie = pd.read_csv("./day4/day4资料/02-代码/IMDB/IMDB-Movie-Data.csv")
movie.head()

# 问题1:我们想知道这些电影数据中评分的平均分,
#        导演的人数等信息,我们怎么获取?
movie["Rating"].mean()
--------result------------------
6.723200000000003
np.unique(movie["Director"]).size
--------result------------------
644
# 问题2: 对于这一组电影数据,
#         如果我们想了解rating,runtime的分布情况,应该如何呈现数据
movie["Rating"].plot(kind="hist",figsize=(20,8))
--------result------------------
<AxesSubplot:ylabel='Frequency'>

png

import matplotlib.pyplot as plt 

# Histogram of the movie runtimes.
runtime = movie["Runtime (Minutes)"]
bins = 20

# 1. Create the figure
plt.figure(figsize=(10, 5), dpi=100)

# 2. Draw the histogram. The label gives plt.legend() an artist to list;
#    without any labelled artist, legend() only emits a
#    "No artists with labels found" warning and draws nothing.
plt.hist(runtime, bins, label="Runtime (Minutes)")
plt.legend()

# Put one tick on every bin edge: `bins` equal-width bins have bins + 1 edges.
plt.xticks(np.linspace(runtime.min(), runtime.max(), bins + 1))

# Add a dashed, semi-transparent grid
plt.grid(linestyle="--", alpha=0.5)

# 3. Show the figure
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

png

# Question 3: how do we summarise the movies by genre?
# First step: split every movie's comma-separated "Genre" string
# into a list of genre names (one list per movie).
movie_genre = [genre_field.split(",") for genre_field in movie["Genre"]]
movie_genre[:20]
--------result------------------
[['Action', 'Adventure', 'Sci-Fi'],
 ['Adventure', 'Mystery', 'Sci-Fi'],
 ['Horror', 'Thriller'],
 ['Animation', 'Comedy', 'Family'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Comedy', 'Drama', 'Music'],
 ['Comedy'],
 ['Action', 'Adventure', 'Biography'],
 ['Adventure', 'Drama', 'Romance'],
 ['Adventure', 'Family', 'Fantasy'],
 ['Biography', 'Drama', 'History'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Animation', 'Adventure', 'Comedy'],
 ['Action', 'Comedy', 'Drama'],
 ['Animation', 'Adventure', 'Comedy'],
 ['Biography', 'Drama', 'History'],
 ['Action', 'Thriller'],
 ['Biography', 'Drama'],
 ['Drama', 'Mystery', 'Sci-Fi']]
# Flatten the per-movie genre lists into one sequence
#   outer loop: for genres in movie_genre
#   inner loop: for genre in genres
# and de-duplicate it with np.unique, which also returns the names sorted.
# The flattened sequence is deliberately not named `list`: that would shadow
# the builtin and break every later list(...) call in the session.
all_genres = [genre for genres in movie_genre for genre in genres]
movie_class = np.unique(all_genres)
movie_class
--------result------------------
array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
       'War', 'Western'], dtype='<U9')
# Indicator table for counting movies per genre: one row per movie,
# one column per genre, every cell initialised to 0.
# The shape follows the data instead of hard-coding 1000 movies x 20 genres,
# and the zeros are created as int32 directly rather than cast from float.
count = pd.DataFrame(
    np.zeros(shape=(len(movie_genre), len(movie_class)), dtype="int32"),
    columns=movie_class,
)
count

count.head()

# Fill the indicator table: count.iloc[row, k] becomes 1 when movie `row`
# carries the genre movie_class[k].
# The genre -> column-position map is built once, replacing the linear scan of
# movie_class for every genre, and the loop follows movie_genre's length
# instead of assuming exactly 1000 movies and 20 genres.
genre_position = {name: k for k, name in enumerate(movie_class)}
for row, genres in enumerate(movie_genre):
    # Genres that are not in movie_class are skipped, as in a plain equality scan.
    columns = [genre_position[g] for g in genres if g in genre_position]
    count.iloc[row, columns] = 1

count

# Movies per genre: column sums of the indicator table, largest first,
# drawn as a bar chart.
genre_totals = count.sum(axis=0).sort_values(ascending=False)
genre_totals.plot.bar(figsize=(20, 9), fontsize=40, colormap="cool")
--------result------------------
<AxesSubplot:>

png

# Second way to fill the indicator table: label-based assignment of a movie's
# whole genre list in one statement.
# Start from a fresh all-zero table with the same rows and columns as `count`.
# A plain `count1 = count` would only alias `count`: every write would land in
# `count` as well, and the table would already be filled before this loop runs.
count1 = pd.DataFrame(0, index=count.index, columns=count.columns, dtype="int32")
for row_label, genres in zip(count1.index, movie_genre):
    count1.loc[row_label, genres] = 1
count1

# Movies per genre from the second table: column sums, largest first,
# drawn as a blue bar chart.
genre_totals1 = count1.sum(axis=0).sort_values(ascending=False)
genre_totals1.plot.bar(figsize=(20, 9), fontsize=40, color="b")
<AxesSubplot:>

png

posted @ 2023-06-28 12:52  派森的猫  阅读(32)  评论(0编辑  收藏  举报