numpy & pandas学习

numpy & pandas

介绍 & 安装

numpy 和 pandas 用于数据分析/处理

numpy基于C语言实现,pandas基于numpy;相比于python的字典或列表,可以更快地实现矩阵计算

numpy

numpy的属性

  • ndim

    矩阵的维度

  • shape

    矩阵的形状(行-列)

  • size

    矩阵中元素的个数

import numpy as np

array = np.array([[1, 2, 3], [4, 5, 6]])
print(array)
print('number of dim:', array.ndim)
print('shape :', array.shape)
print('size:', array.size)

# output
# [[1 2 3]
#  [4 5 6]]
# number of dim: 2
# shape : (2, 3)
# size: 6

创建array

调用np.array()传入参数创建矩阵

参数dtype可指定矩阵中元素的数据类型

生成特殊矩阵

# Build commonly used arrays with NumPy's constructor helpers.
a = np.zeros((3, 4))  # 3x4 matrix filled with zeros (floats, as the output shows)
print(a)
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]

# `ones` lives in the numpy namespace, so it must be called as np.ones
a = np.ones((3, 4), dtype=np.int32)  # 3x4 matrix of ones stored as 32-bit integers
print(a)
# [[1 1 1 1]
#  [1 1 1 1]
#  [1 1 1 1]]

a = np.arange(0, 10, 2)  # start at 0, stop before 10, step 2
print(a)
# [0 2 4 6 8]

a = np.arange(12).reshape((3, 4))  # values 0..11 reshaped into 3 rows x 4 columns
print(a)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

a = np.random.random((2, 4))  # random 2x4 matrix, values in the half-open interval [0, 1)

基础运算

矩阵中逐个元素加减乘

矩阵乘法:np.dot(a, b) 或 a.dot(b)

矩阵转置:np.transpose(a) 或 a.T

最大/小值,求和,指定维度

# Reduction helpers (each one needs the array as its first argument;
# calling them with no argument raises TypeError):
#   np.sum(a)  -> sum of all elements
#   np.min(a)  -> smallest element
#   np.max(a)  -> largest element
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a)
print(np.sum(a))
print(np.min(a))
print(np.max(a))
# [[1 2 3]
#  [4 5 6]]
# 21
# 1
# 6

print('axis = 0')  # reduce down each column
print(np.sum(a, axis=0))
print(np.min(a, axis=0))
print(np.max(a, axis=0))
# axis = 0
# [5 7 9]
# [1 2 3]
# [4 5 6]

print('axis = 1')  # reduce along each row
print(np.sum(a, axis=1))
print(np.min(a, axis=1))
print(np.max(a, axis=1))
# axis = 1
# [ 6 15]
# [1 4]
# [3 6]

最大/小值索引

a = np.arange(12).reshape((3, 4))
print(a)
print(np.argmin(a))
print(np.argmax(a))

# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
# 0
# 11

平均值np.mean(),中位数np.median()

numpy索引

对于矩阵中特定某一个元素的索引同C语言一样,比如a[2][3];也可以写作a[2, 3]

可以使用:对多个元素进行索引

a = np.arange(12).reshape((3, 4))
print(a)
print()
print(a[1, :])      # 1号行的所有元素
print(a[0, 1:])     # 0号行从下标为1开始的所有元素
print(a[2, 1:3])    # 2号行下标为[1,3)的所有元素,注意左闭右开

# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
# 
# [4 5 6 7]
# [1 2 3]
# [ 9 10]

将数组元素展开使用a.flatten()

array合并

垂直合并np.vstack(),水平合并np.hstack()

a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
c = np.vstack((a, b))       # vertical stack    垂直合并
d = np.hstack((a, b))       # horizontal stack  水平合并

print(c)
# [[1 1 1]
#  [2 2 2]]

print(d)
# [1 1 1 2 2 2]

新增维度

a = np.array([1, 1, 1])
a_new1 = a[np.newaxis, :]   # 对于这个向量,在第一维增加一个维度,变为矩阵
a_new2 = a[:, np.newaxis]   # 对于这个向量,在第二维增加一个维度,变为矩阵

print(a)        # 向量
# [1 1 1]

print(a_new1)   # 1x3矩阵
# [[1 1 1]]

print(a_new2)   # 3x1矩阵
# [[1]
#  [1]
#  [1]]

array分割

使用np.split()进行分割,参数可指定分割的块数和维度

a = np.arange(12).reshape((3, 4))
print(a)

a_s1 = np.split(a, 2, 1)    # 将a纵向分割为2块
print(a_s1)
# [array([[0, 1],
#        [4, 5],
#        [8, 9]]),
#  array([[ 2,  3],
#        [ 6,  7],
#        [10, 11]])]

a_s2 = np.split(a, 3, 0)    # 将a横向分割为3块
print(a_s2)
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

垂直分割np.vsplit(),水平分割np.hsplit()

a = np.arange(12).reshape((3, 4))
print(a)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

a_s1 = np.vsplit(a, 3)  # 垂直分割:将垂直的线分成3份,也就是分成了3行
print(a_s1)
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

a_s2 = np.hsplit(a, 2)  # 水平分割:将水平的线分成2分,也就是分成了2块2列的矩阵
print(a_s2)
# [array([[0, 1],
#        [4, 5],
#        [8, 9]]), 
#  array([[ 2,  3],
#        [ 6,  7],
#        [10, 11]])]

numpy的赋值

numpy中的赋值,可以理解为C语言中的指针,指向的是同一个东西

a = np.arange(4)
print(a)
b = a
c = a
a[0] = 10
print(a)
print(b)
print(c)
print(b is a)
print(c is a)

# [0 1 2 3]
# [10  1  2  3]
# [10  1  2  3]
# [10  1  2  3]
# True
# True

如果不想让他们指向同一个东西,需要使用copy()函数

a = np.arange(4)
print(a)
b = a.copy()
a[0] = 10
print(a)
print(b)
print(b is a)

# [0 1 2 3]
# [10  1  2  3]
# [0 1 2 3]
# False

pandas

相当于一个表格/字典,结合numpy,使用numpy的数据,加上行名和列名

DataFrame

创建DataFrame

DataFrame是pandas中基本的数据对象

import pandas as pd  # the pandas examples refer to the library as `pd`

# Wrap a 3x4 NumPy matrix in a DataFrame without giving any labels.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# For the 3x4 matrix, row labels 0~2 and column labels 0~3 are filled in automatically
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

可以通过参数index指定行号,通过参数columns指定列号

# Build a 6x4 DataFrame of random values with explicit row and column labels.
dates = pd.date_range('20230101', periods=6)  # six consecutive days starting 2023-01-01
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)

# `index` sets the row labels, `columns` sets the column labels
# (the values are random, so the numbers differ on every run)
#                    a         b         c         d
# 2023-01-01  0.373805 -0.098212 -0.557294 -2.307917
# 2023-01-02  0.371519  1.555928 -1.064751 -2.035834
# 2023-01-03 -2.297619  1.130451 -0.137015 -0.062393
# 2023-01-04 -0.064325  0.090884  1.356461 -1.610893
# 2023-01-05 -2.232988 -2.331771  1.138061  0.050736
# 2023-01-06  0.639382  1.620253  0.087044  1.820290

可以直接定义完整的表格

df = pd.DataFrame({'A': 1.0,
                   'B': pd.Timestamp('20230418'),
                   'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                   'D': np.array([3] * 4, dtype='int32'),
                   'E': pd.Categorical(["test", "train", "test", "train"]),
                   'F': 'foo'})
print(df)

#      A          B    C  D      E    F
# 0  1.0 2023-04-18  1.0  3   test  foo
# 1  1.0 2023-04-18  1.0  3  train  foo
# 2  1.0 2023-04-18  1.0  3   test  foo
# 3  1.0 2023-04-18  1.0  3  train  foo

DataFrame属性

使用.dtypes查看每一列数据的类型

print(df.dtypes)

# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

使用.index查看行号

print(df.index)

# Int64Index([0, 1, 2, 3], dtype='int64')

使用.columns查看列名

print(df.columns)

# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

使用.values查看所有数据值

print(df.values)

# [[1.0 Timestamp('2023-04-18 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2023-04-18 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2023-04-18 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2023-04-18 00:00:00') 1.0 3 'train' 'foo']]

使用.T查看转置后的数据

print(df.T)

#                      0  ...                    3
# A                  1.0  ...                  1.0
# B  2023-04-18 00:00:00  ...  2023-04-18 00:00:00
# C                  1.0  ...                  1.0
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo
# 
# [6 rows x 4 columns]

DataFrame方法

使用.describe()生成对于表格中所有数据列的统计数据

print(df.describe())

# 注意.describe()只针对数据列,对于B E F是非数据列不在考虑范围内
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

使用.sort_index()按照行号或者列号排序

print(df.sort_index(axis=1, ascending=False))	# 按列号降序

#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2023-04-18  1.0
# 1  foo  train  3  1.0 2023-04-18  1.0
# 2  foo   test  3  1.0 2023-04-18  1.0
# 3  foo  train  3  1.0 2023-04-18  1.0
print(df.sort_index(axis=0, ascending=False))	# 按行号降序

#      A          B    C  D      E    F
# 3  1.0 2023-04-18  1.0  3  train  foo
# 2  1.0 2023-04-18  1.0  3   test  foo
# 1  1.0 2023-04-18  1.0  3  train  foo
# 0  1.0 2023-04-18  1.0  3   test  foo

使用.sort_values()按照指定列中的数据排序

print(df.sort_values(by='E'))	# 指定对E列中的数据排序

#      A          B    C  D      E    F
# 0  1.0 2023-04-18  1.0  3   test  foo
# 2  1.0 2023-04-18  1.0  3   test  foo
# 1  1.0 2023-04-18  1.0  3  train  foo
# 3  1.0 2023-04-18  1.0  3  train  foo

选择数据

设定测试数据

dates = pd.date_range('20230101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

#              A   B   C   D
# 2023-01-01   0   1   2   3
# 2023-01-02   4   5   6   7
# 2023-01-03   8   9  10  11
# 2023-01-04  12  13  14  15
# 2023-01-05  16  17  18  19
# 2023-01-06  20  21  22  23

选定某一列的数据

print(df['A'])      # 或使用print(df.A)

# 2023-01-01     0
# 2023-01-02     4
# 2023-01-03     8
# 2023-01-04    12
# 2023-01-05    16
# 2023-01-06    20
# Freq: D, Name: A, dtype: int32

选定若干行数据,切片选择,区间左闭右开

print(df[0:3])

#             A  B   C   D
# 2023-01-01  0  1   2   3
# 2023-01-02  4  5   6   7
# 2023-01-03  8  9  10  11

使用.loc[]根据字典中的标签(也就是行列名)进行选择

print(df.loc['2023-01-05'])		# 选择指定行
# A    16
# B    17
# C    18
# D    19
# Name: 2023-01-05 00:00:00, dtype: int32

print(df.loc[:, ['A', 'B']])	# 选择指定列
#              A   B
# 2023-01-01   0   1
# 2023-01-02   4   5
# 2023-01-03   8   9
# 2023-01-04  12  13
# 2023-01-05  16  17
# 2023-01-06  20  21

print(df.loc['2023-01-05', ['A', 'B']])	# 选择指定行列
# A    16
# B    17
# Name: 2023-01-05 00:00:00, dtype: int32

使用.iloc[]根据下标对数据进行选择

print(df.iloc[3])			# 选择下标为3的一行
# A    12
# B    13
# C    14
# D    15
# Name: 2023-01-04 00:00:00, dtype: int32

print(df.iloc[3, 1])		# 选择下标为3的一行中下标为1的列
# 13

print(df.iloc[0:3, 1:3])	# 切片选择下标为0,1,2行中下标为1,2的列
#             B   C
# 2023-01-01  1   2
# 2023-01-02  5   6
# 2023-01-03  9  10

print(df.iloc[[1, 3, 5], 1:3])	# 不连续选择若干行
#              B   C
# 2023-01-02   5   6
# 2023-01-04  13  14
# 2023-01-06  21  22

注意:python中的切片选择范围左闭右开

根据某一列中值的范围进行选择

print(df)
print(df[df.A > 8])

#              A   B   C   D
# 2023-01-01   0   1   2   3
# 2023-01-02   4   5   6   7
# 2023-01-03   8   9  10  11
# 2023-01-04  12  13  14  15
# 2023-01-05  16  17  18  19
# 2023-01-06  20  21  22  23

#              A   B   C   D
# 2023-01-04  12  13  14  15
# 2023-01-05  16  17  18  19
# 2023-01-06  20  21  22  23

赋值/更新值

先选择然后更新

# Update values by first selecting the target cells, then assigning to them.
# NOTE(review): each output below appears to have been captured on a freshly
# built df - an earlier update is not reflected in the later outputs.

df.iloc[2, 2] = 111  # by position: row 2, column 2
print(df)
#              A   B    C   D
# 2023-01-01   0   1    2   3
# 2023-01-02   4   5    6   7
# 2023-01-03   8   9  111  11
# 2023-01-04  12  13   14  15
# 2023-01-05  16  17   18  19
# 2023-01-06  20  21   22  23

df.loc['2023-01-05', 'D'] = 222  # by label: row '2023-01-05', column 'D'
print(df)
#              A   B   C    D
# 2023-01-01   0   1   2    3
# 2023-01-02   4   5   6    7
# 2023-01-03   8   9  10   11
# 2023-01-04  12  13  14   15
# 2023-01-05  16  17  18  222
# 2023-01-06  20  21  22   23

df[df.A > 8] = 0  # boolean mask: every column of the rows where A > 8
print(df)
#             A  B   C   D
# 2023-01-01  0  1   2   3
# 2023-01-02  4  5   6   7
# 2023-01-03  8  9  10  11
# 2023-01-04  0  0   0   0
# 2023-01-05  0  0   0   0
# 2023-01-06  0  0   0   0

# Update column A only. A single .loc[row_mask, column] assignment writes
# into df itself; the chained form df.A[mask] = 0 assigns to an intermediate
# object and may warn or leave df unchanged.
df.loc[df.A > 8, 'A'] = 0
print(df)
#             A   B   C   D
# 2023-01-01  0   1   2   3
# 2023-01-02  4   5   6   7
# 2023-01-03  8   9  10  11
# 2023-01-04  0  13  14  15
# 2023-01-05  0  17  18  19
# 2023-01-06  0  21  22  23

新增一列

dates = pd.date_range('20230101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
print(df)

#              A   B   C   D
# 2023-01-01   0   1   2   3
# 2023-01-02   4   5   6   7
# 2023-01-03   8   9  10  11
# 2023-01-04  12  13  14  15
# 2023-01-05  16  17  18  19
# 2023-01-06  20  21  22  23

# 新增一列E
#              A   B   C   D  E
# 2023-01-01   0   1   2   3  1
# 2023-01-02   4   5   6   7  2
# 2023-01-03   8   9  10  11  3
# 2023-01-04  12  13  14  15  4
# 2023-01-05  16  17  18  19  5
# 2023-01-06  20  21  22  23  6

处理丢失数据

在机器学习的实际应用中,收集到的数据很可能存在部分数据丢失的情况,numpy中使用NaN表示丢失的数据

一般的处理思路有两种

  1. 直接删除丢失数据所在的整行/列
  2. 将丢失的数据填充为指定值

构造样例数据如下

dates = pd.date_range('20230101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

#              A     B     C   D
# 2023-01-01   0   NaN   2.0   3
# 2023-01-02   4   5.0   NaN   7
# 2023-01-03   8   9.0  10.0  11
# 2023-01-04  12  13.0  14.0  15
# 2023-01-05  16  17.0  18.0  19
# 2023-01-06  20  21.0  22.0  23

使用.isnull()判断数据中是否存在丢失数据

# Element-wise mask: True wherever a value is missing (NaN)
print(df.isnull())

#                 A      B      C      D
# 2023-01-01  False   True  False  False
# 2023-01-02  False  False   True  False
# 2023-01-03  False  False  False  False
# 2023-01-04  False  False  False  False
# 2023-01-05  False  False  False  False
# 2023-01-06  False  False  False  False

# Collapse the mask to one boolean: is there at least one missing value?
# .values exposes the underlying array, so .any() checks every cell at once.
print(df.isnull().values.any())
# True

删除丢失数据

使用.dropna()直接丢弃掉存在丢失数据所在的行/列

  • axis

    axis=0:按行丢弃

    axis=1:按列丢弃

  • how

    how='any':当此行/列存在NaN时丢弃

    how='all':当此行/列全为NaN时丢弃

print(df.dropna(axis=0, how='any'))		# 丢弃行
#              A     B     C   D
# 2023-01-03   8   9.0  10.0  11
# 2023-01-04  12  13.0  14.0  15
# 2023-01-05  16  17.0  18.0  19
# 2023-01-06  20  21.0  22.0  23

print(df.dropna(axis=1, how='any'))		# 丢弃列
#              A   D
# 2023-01-01   0   3
# 2023-01-02   4   7
# 2023-01-03   8  11
# 2023-01-04  12  15
# 2023-01-05  16  19
# 2023-01-06  20  23

填充丢失数据

使用.fillna()填充丢失数据,可以指定使用此列均值填充

print(df.fillna(value=0))	# df.fillna(df.mean())
#              A     B     C   D
# 2023-01-01   0   0.0   2.0   3
# 2023-01-02   4   5.0   0.0   7
# 2023-01-03   8   9.0  10.0  11
# 2023-01-04  12  13.0  14.0  15
# 2023-01-05  16  17.0  18.0  19
# 2023-01-06  20  21.0  22.0  23

one-hot编码

参考教程:https://blog.csdn.net/qq_43404784/article/details/89486442

对于离散数据可以使用pd.get_dummies()函数自动进行one-hot编码;参数dummy_na=True表示把NaN也单独编码为一列(默认为False,即忽略NaN)

image-20230517183900966

pandas导入/导出

使用.read_csv()读取.csv文件

data = pd.read_csv('student.csv')
print(data)

#     Student ID  name   age  gender
# 0         1100  Kelly   22  Female
# 1         1101    Clo   21  Female
# 2         1102  Tilly   22  Female
# 3         1103   Tony   24    Male
# 4         1104  David   20    Male
# 5         1105  Catty   22  Female
# 6         1106      M    3  Female
# 7         1107      N   43    Male
# 8         1108      A   13    Male
# 9         1109      S   12    Male
# 10        1110  David   33    Male
# 11        1111     Dw    3  Female
# 12        1112      Q   23    Male
# 13        1113      W   21  Female

使用.to_pickle()保存为.pickle文件

data.to_pickle('student.pickle')

参考:

b站:莫烦python

https://www.bilibili.com/video/BV1Ex411L7oT/

posted @ 2023-04-18 15:42  dctwan  阅读(17)  评论(0编辑  收藏  举报