numpy 和 pandas基础学习

1. numpy创建数组：array()

import numpy as np
import pandas as pd


# Build a one-dimensional array from a flat list.
flat_values = [1, 2, 3]
res = np.array(flat_values)
print(res)  # [1 2 3]

# Build a two-dimensional array from nested lists; deeper nesting produces
# higher-dimensional arrays in the same way.
nested_values = [[1, 2, 3], [4, 5, 6]]
res = np.array(nested_values)
print(res)
# [[1 2 3]
#  [4 5 6]]

数组特点：

# Every element of an array shares a single dtype. Mixed inputs are promoted
# to a common type, with precedence: str > float > int.
mixed_rows = [[1, 'strvar', 3], [4, 4.9, 6]]
res = np.array(mixed_rows)
print(res)
# [['1' 'strvar' '3']
#  ['4' '4.9' '6']]

numpy相关方法：ones() zeros() linspace() arange() random.randint() random.random() astype()

# ones() creates an array filled with 1. `shape` gives the length of each
# dimension, so a 3-tuple would produce a three-dimensional array.
# zeros() works the same way but fills the array with 0.
ar = np.ones(shape=(3,4))
print(ar)
"""
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
"""

# linspace() returns a 1-D array of `num` evenly spaced values running from
# the start value to the stop value, both endpoints included.
ar = np.linspace(0,30,num=6)
print(ar) # [ 0.  6. 12. 18. 24. 30.]

# arange() also returns a 1-D array; the third argument is the step and the
# stop value itself is excluded.
ar = np.arange(0,30,8)
print(ar) # [ 0  8 16 24]

# random.randint() returns random integers drawn from [10, 20) (the upper
# bound is exclusive); `size` is the shape of the returned array.
ar = np.random.randint(10,20,size=(3,4))
print(ar)
"""
[[17 10 13 10]
 [11 15 15 12]
 [13 18 16 19]]
"""

# random.random() returns random floats in the half-open interval [0, 1).
ar = np.random.random(size=(2,3))
print(ar)
"""
[[0.45168459 0.64069979 0.68093153]
 [0.68517369 0.67128161 0.21001053]]
"""

# astype() returns a copy of the array converted to the requested dtype;
# the original array keeps its own dtype.
ar = np.array([1,2,3],dtype='float32')
print(ar)

res = ar.astype('int8')
print(res)
"""
[1. 2. 3.]
[1 2 3]
"""

numpy相关属性：shape, ndim, size, dtype

# Use a fresh 2x3 float array so the attribute values shown below are the
# ones this snippet really prints (the array left over from the astype()
# example is one-dimensional float32).
ar = np.random.random(size=(2, 3))

# shape: tuple holding the length of every dimension.
res = ar.shape
print(res)  # (2, 3)

# ndim: number of dimensions.
res = ar.ndim
print(res)  # 2

# size: total number of elements.
res = ar.size
print(res)  # 6

# dtype: the data type shared by all elements.
res = ar.dtype
print(res)  # float64

# type() is the Python builtin, not an array attribute; it reports the class
# of the container itself.
res = type(ar)
print(res)  # <class 'numpy.ndarray'>

numpy的索引和切片：

ar = np.random.randint(10, 20, size=(4, 5))
print(ar)

# First two rows: a single slice applies to the row axis.
res = ar[:2]
print(res)

# First two columns: the slice before the comma selects rows, the slice
# after it selects columns.
res = ar[:, :2]
print(res)

# Top-left 2x2 corner (first two rows, first two columns).
res = ar[:2, :2]
print(res)

# Rows in reverse order.
res = ar[::-1]
print(res)

# Columns in reverse order.
res = ar[:, ::-1]
print(res)

# Application: flip and crop an image.
# pyplot is imported here because this snippet appears before the file's
# matplotlib section.
import matplotlib.pyplot as plt

# Keep the pixel array in `ar`; imshow() returns an image artist, not an
# array, so its result must not be assigned back to `ar`.
ar = plt.imread('./666.bmp')

# Mirror left-to-right by reversing the column axis.
plt.imshow(ar[:, ::-1, :])

# Crop rows 50-99 and columns 100-149 and show the result in a new figure.
# NOTE(review): assumes the image is at least 100x150 pixels with a channel
# axis - confirm against the actual file.
plt.figure()
plt.imshow(ar[50:100, 100:150, :])

变形：reshape()

ar = np.random.randint(10, 20, size=(2, 4))
print(ar)

# Flatten to one dimension. The new shape must hold exactly as many elements
# as the old one.
ar = ar.reshape(8)
print(ar)

# Rearrange the same eight elements into three dimensions.
ar = ar.reshape(2, 2, 2)
print(ar)

# -1 asks NumPy to work out that dimension: with 4 rows and 8 elements the
# column count becomes 2.
ar = ar.reshape(4, -1)
print(ar)

级联拼接：concatenate()

ar = np.random.randint(10,20,size=(2,4))
print(ar)

# axis=0 stacks the arrays vertically (rows are appended); axis=1 places them
# side by side (columns are appended).
# Every dimension other than the concatenation axis must match: the same
# number of columns for axis=0, the same number of rows for axis=1.
# Otherwise NumPy raises an error.
ar = np.concatenate((ar,ar),axis=0)
print(ar)

# 应用：图片九宫格拼接

常用的聚合函数：sum() max() min() mean()

ar = np.random.randint(10,20,size=(2,4))
print(ar)

# sum() returns the total of all elements.
res = ar.sum()
print(res) # a single number, e.g. 103 (the data is random)

# max() returns the largest element. With axis=0 it is computed per column,
# so for this (2, 4) array the result is an array of four values.
res = ar.max(axis=0)
print(res) # e.g. [19 15 17 18]

# min() returns the smallest element of the whole array.
res = ar.min()
print(res) # e.g. 10

# mean() returns the arithmetic mean of all elements.
res = ar.mean()
print(res) # e.g. 12.875

数学函数：

ar = np.array([33.72, 12.39, 3.88])
# around() rounds to `decimals` places after the decimal point; the default
# is 0.
res = np.around(ar, decimals=1)
print(res)  # [33.7 12.4  3.9]

# A negative `decimals` rounds to the left of the decimal point: -1 rounds
# to the nearest ten.
res = np.around(ar, decimals=-1)
print(res)  # [30. 10.  0.]

# The statistics below use a fixed 2-D array so that axis=1 is valid and the
# printed values are reproducible.
ar = np.array([[12, 15, 13, 19],
               [10, 13, 17, 11],
               [11, 16, 13, 18]])

# amin() with axis=1 returns the minimum of each row as an array; amax()
# works the same way for maxima.
res = np.amin(ar, axis=1)
print(res)  # [12 10 11]

# ptp() ("peak to peak") is the maximum minus the minimum.
res = np.ptp(ar)
print(res)  # 9

# median() returns the median of the elements; `axis` may be given.
res = np.median(ar)
print(res)  # 13.0

# std() returns the standard deviation; `axis` may be given.
res = np.std(ar)
print(res)  # 2.8284271247461903

# var() returns the variance; `axis` may be given.
res = np.var(ar)
print(res)  # 8.0

数组的转置和矩阵的乘法：

ar = np.random.randint(10, 20, size=(3, 4))
print(ar)

# .T is the transpose: rows become columns, so a (3, 4) array becomes (4, 3).
res = ar.T
print(res)

# dot() applied to two 2-D inputs performs matrix multiplication.
left = [[1, 2], [2, 3]]
right = [[2, 0], [1, 4]]
res = np.dot(left, right)
print(res)

2. pandas：

Series相当于一个一维数组

DataFrame: 相当于一个二维数组

# `columns` sets the column labels and `index` sets the row labels.
# A DataFrame also exposes shape, index, columns (the column labels) and
# values (the underlying data).
# The class is reached through the `pd` alias because only `pandas as pd`
# is imported in this file.
df = pd.DataFrame(
    data=np.random.randint(10, 20, size=(3, 2)),
    columns=['第一列', '第二列'],
    index=['第一行', '第二行', '第三行'],
)
print(df)
# Example output (the values are random):
#      第一列  第二列
# 第一行   10   16
# 第二行   19   11
# 第三行   16   12

# header=None: the file has no header row, so the first line is kept as data
# and the columns are labelled 0, 1, 2, ...
# sep='-': fields are split on the '-' character.
# NOTE(review): './tu.txt' is written elsewhere in this file with to_csv(),
# whose default separator is ',' - confirm that '-' is really intended here.
res = pd.read_csv('./tu.txt',header=None, sep='-')

DataFrame的索引和切片：

df = pd.DataFrame(
    data=np.random.randint(10, 20, size=(3, 2)),
    columns=['第一列', '第二列'],
    index=['第一行', '第二行', '第三行'],
)
print(df)

# Plain brackets with a label select a column. Once explicit labels are set,
# the label (not a position) has to be used here.
res = df['第一列']
print(res)

# loc selects by label: row '第一行', column '第二列'.
res = df.loc['第一行', '第二列']
print(res)

# iloc selects by position (rows before the comma, columns after it).
# A list of positions returns whole rows: here the first and second rows.
res = df.iloc[[0, 1], :]
print(res)

# Two scalars return a single element: first row, second column.
res = df.iloc[0, 1]
print(res)

# Row slice: plain brackets with a slice operate on rows (first two rows).
res = df[0:2]
print(res)

# Column slice by position: only the second column, kept as a DataFrame.
res = df.iloc[:, 1:2]
print(res)

3. matplotlib创建图片数组和展示图片数组：

import matplotlib.pyplot as plt

# imread() loads the image file into an array; the sample output below shows
# a three-dimensional array.
res = plt.imread('./666.bmp')
print(res)
"""
[[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
"""
# imshow() displays an image array. Note that `res` is rebound to the value
# returned by imshow(), so it no longer holds the pixel data afterwards.
res = plt.imshow(res)
print(res)

# imshow() can also render a plain two-dimensional array as an image.
a = np.array([[223,123,234],[255,143,25]])
res = plt.imshow(a)
print(res)

绘制简单图像：

# Line chart.
x = np.array([1, 2, 3, 4])
y = x * 2

# Create the figure once and keep the handle for saving. Calling
# plt.figure() a second time would open another figure, and the plot would
# then land on that one while `fig` stayed empty.
# figsize=(width, height) scales the canvas; the tick values do not change.
fig = plt.figure(figsize=(3, 6))

res = plt.plot(x, y, label='deadline')
# legend() displays the label passed to plot().
plt.legend()
print(res)

# Axis labels and title.
plt.xlabel('time')
plt.ylabel('money')
plt.title('happy')

# Save the figure; replace the placeholder string with a real output path.
fig.savefig('保存图像的路径')

# Each chart is drawn on its own figure so that the examples do not paint
# over one another.

# Bar charts.
x = [1, 2, 3, 4]  # bar positions
y = [2, 4, 5, 1]  # bar lengths
plt.figure()
plt.bar(x, y)  # vertical bars
plt.figure()
plt.barh(x, y)  # horizontal bars

# Histogram: shows how often values fall into each bin.
x = [1, 2, 3, 2, 1, 2, 3, 4, 2, 4, 5, 6, 3, 2, 2, 4, 7]
plt.figure()
plt.hist(x, bins=10)  # bins is the number of bins

# Pie chart.
x = [0.1, 0.4, 0.3]  # relative sizes of the wedges
# labels names the wedges; labeldistance places the labels at that fraction
# of the radius; autopct formats the percentage (two decimals here);
# explode offsets each wedge from the centre.
plt.figure()
plt.pie(x, labels=['a', 'b', 'c'], labeldistance=0.5, autopct='%.2f%%', explode=[0.2, 0.2, 0.2])

# Scatter plot: x and y must contain the same number of points.
x = [1, 2, 3, 4.9, 5, 4, 2.8]
y = [0.1, 0.4, 0.2, 0.8, 0.9, 0.6, 0.3]
plt.figure()
plt.scatter(x, y)

4. tushare简单使用

import tushare

t1 = tushare.get_k_data(code='603660', start='1993-11-23')
print(t1)

# Store the data locally.
t1.to_csv('./tu.txt')

# Load the stored file back into a DataFrame.
df = pd.read_csv('./tu.txt')
print(df)

# to_csv() also wrote the row index, which is read back as 'Unnamed: 0'.
# labels names the column to remove, axis=1 means "columns", and
# inplace=True modifies df directly and returns None, so nothing is assigned.
df.drop(labels='Unnamed: 0', axis=1, inplace=True)
print(df)

# A column can be moved down by one row with column.shift(1).

# resample() needs a datetime-like index, so parse the 'date' column and
# make it the index first. Then take the first record of every month.
# NOTE: recent pandas releases spell the month-end alias 'ME' and warn on 'M'.
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
res = df.resample(rule='M').first()
print(res)

案例：计算某一股票金叉买入死叉卖出的收益

df = tushare.get_k_data(code='000001', start='2010-01', end='2021-1')
df.to_csv('./pingan.csv')
df = pd.read_csv('./pingan.csv')

# Drop the index column that to_csv() wrote out.
df.drop(labels='Unnamed: 0', axis=1, inplace=True)
# Parse the date strings and use them as the row index.
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# 5-day and 30-day moving averages of the closing price.
ma5 = df['close'].rolling(5).mean()
ma30 = df['close'].rolling(30).mean()

# Sign of (MA5 - MA30) on the days where both averages exist. Dropping the
# NaN warm-up rows here removes the need to slice off a fixed number of
# bogus signals afterwards; days on which the averages are exactly equal are
# skipped so they cannot produce a false cross.
spread = (ma5 - ma30).dropna()
trend = np.sign(spread[spread != 0])
previous = trend.shift(1)  # NaN on the first day, which compares as False

# Death cross: MA5 was above MA30 and is now below it.
sale_time = trend.index[(trend < 0) & (previous > 0)]
# Golden cross: MA5 was below MA30 and is now above it.
buy_time = trend.index[(trend > 0) & (previous < 0)]

# Tag golden crosses with 1 and death crosses with 0, then merge the two
# series in date order (Series.append no longer exists in pandas 2.x).
series_1 = pd.Series(data=1, index=buy_time)
series_2 = pd.Series(data=0, index=sale_time)
series_all = pd.concat([series_1, series_2]).sort_index()

# Simulate the trades.
first_money = 100000  # starting capital
cost_money = 100000  # cash currently in hand
hold = 0  # number of shares currently held

for day, signal in series_all.items():
    # Trades are executed at that day's opening price.
    price = df.loc[day, 'open']
    if signal == 1:
        # Golden cross: buy as many whole lots (100 shares) as cash allows.
        hand = cost_money // (price * 100)
        bought = hand * 100
        hold += bought
        cost_money -= bought * price
    else:
        # Death cross: sell everything that is held.
        cost_money += price * hold
        hold = 0

# Value any shares still held at the last available opening price; iloc is
# used because [-1] on a date-indexed Series is a label lookup.
last_money = hold * df['open'].iloc[-1]
print(cost_money + last_money - first_money)

基于pandas的简单数据清洗：

空值处理：

# Simple data cleaning: missing values.
# The frame is created as float so that NaN can be stored without an
# implicit dtype change.
df = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4))).astype(float)
df.iloc[0, 1] = None  # None is stored as NaN in a numeric column
df.iloc[2, 2] = np.nan  # np.nan is the portable spelling (np.NAN is gone in NumPy 2)
print(df)

# dropna() removes entries that contain missing values; axis=0 drops rows,
# axis=1 would drop columns.
res = df.dropna(axis=0)
print(res)

# fillna() replaces missing values, here with the constant 666.
res = df.fillna(value=666)
print(res)

# Fill from neighbouring values along each column (axis=0): ffill() copies
# the previous row's value down, bfill() copies the next row's value up.
# Chaining both covers gaps at either end of a column.
res = df.ffill(axis=0).bfill(axis=0)
print(res)

# Fill each column's missing values with that column's mean.
df = df.fillna(df.mean())

重复值处理：

# Simple data cleaning: duplicate rows.
df = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4)))
df.iloc[1] = 2
df.iloc[2] = 2
print(df)

# duplicated() does not remove anything: it returns a boolean Series that
# marks rows repeating an earlier row. keep='first' leaves the first
# occurrence unmarked.
res = df.duplicated(keep='first')
print(res)

# drop_duplicates() returns a new DataFrame without the repeated rows;
# keep defaults to 'first'.
res = df.drop_duplicates()
print(res)

级联操作：对表格进行拼接

# When row and column labels all match, the concatenation lines up exactly.
df1 = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4)))
df2 = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4)))
# concat() joins frames: axis=1 places them side by side (rows are aligned
# by index), axis=0 would stack them vertically.
df3 = pd.concat((df1, df2), axis=1)
print(df3)

# Row labels that only partly match.
df1 = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4)), index=['a', 'b', 'c'])
df2 = pd.DataFrame(data=np.random.randint(10, 20, size=(3, 4)), index=['a', 'd', 'c'])
# With axis=1 the rows are aligned by label, so rows 'b' and 'd', which
# exist in only one frame, receive NaN for the other frame's columns.
# join='outer' keeps every label and pads with NaN; join='inner' keeps only
# the labels present in both frames.
df3 = pd.concat((df1, df2), axis=1, join='outer')
print(df3)

合并操作：对数据进行合并

# Both frames need the key column named in `on`; here that is 'class'.
df1 = pd.DataFrame({'class': ['a', 'b', 'c'], 'math': [90, 80, 70]})
df2 = pd.DataFrame({'class': ['a', 'd', 'c'], 'english': [85, 75, 65]})
# merge() joins the two frames on the 'class' column; when `on` is omitted,
# the columns common to both frames are used.
# how='inner' keeps only keys found in both frames, 'outer' keeps all keys,
# 'left' keeps every key of the left frame and 'right' every key of the
# right frame.
# left_on / right_on name the key column of each side separately, which
# allows merging frames whose key columns have different names.
df3 = pd.merge(df1, df2, on='class', how='inner')
print(df3)

pandas替换操作：replace

# Fixed data, so that every example below has something to replace.
df1 = pd.DataFrame(
    data=[[11, 12, 13, 11], [18, 11, 11, 14], [15, 16, 11, 18]],
    index=['a', 'b', 'c'],
)
print(df1)

# Each call returns a new DataFrame and leaves df1 untouched, so every
# example starts from the same data. (With inplace=True the call returns
# None and the result would have to be read from df1 instead.)

# Replace one value with another everywhere.
res = df1.replace(to_replace=11, value='一一')
print(res)

# Replace several values at once with an {old: new} mapping.
res = df1.replace(to_replace={11: "一一", 18: "一二"})
print(res)

# Restrict the replacement to one column: {column label: old value}
# together with `value`. Here only the 11s in column 2 are replaced.
res = df1.replace(to_replace={2: 11}, value='第2列的11')
print(res)

映射：

df = pd.DataFrame(
    data=[['bob', 12], ['jack', 14], ['bob', 12]],
    columns=['name', 'salary'],
)
print(df)

# Lookup table from English name to Chinese name.
dic = {
    'bob': '张三',
    'jack': '李四',
}
# map() is called on a single column (a Series); given a dict it looks up
# every element and returns the mapped values.
df['cname'] = df['name'].map(dic)
print(df)

# map() also accepts a function: add 100 to every salary.
df['after_salary'] = df['salary'].map(lambda s: s + 100)
print(df)

抽样和分组：

df = pd.DataFrame(np.random.randint(10, 20, size=(4, 3)), columns=['A', 'B', 'C'])
print(df)

# Shuffle the columns. take() selects by position, and permutation(3)
# returns the integers 0, 1, 2 in random order.
df = df.take(indices=np.random.permutation(3), axis=1)
print(df)

# Random sample: shuffle the columns and the rows, then keep the first two
# rows.
df = df.take(indices=np.random.permutation(3), axis=1).take(indices=np.random.permutation(4), axis=0)[:2]
print(df)

# groupby() groups rows by the values of column 'B'; .groups maps each
# distinct value to the row labels that belong to it.
res = df.groupby('B').groups
print(res)

数据透视图：

# Pivot table: pivot_table()
# index   - column whose values become the row groups
# values  - column that is aggregated
# columns - column whose values become the column groups
# aggfunc - aggregation applied to each group (the default is the mean)
# fill_value - value used where a row/column combination has no data
# A small fixed frame is used so the call can actually run; empty strings
# are not valid column names.
df = pd.DataFrame({
    'team': ['a', 'a', 'b', 'b'],
    'season': ['x', 'y', 'x', 'x'],
    'points': [10, 20, 30, 50],
})
res = df.pivot_table(index='team', values='points', columns='season', aggfunc='mean', fill_value=0)
print(res)
# season     x     y
# team
# a       10.0  20.0
# b       40.0   0.0

posted on 2023-08-28 07:49 fdsimin 阅读(55) 评论(0) 编辑收藏举报