python 数据分析
基本环境安装
安装Anaconda
Matplotlib绘图架构
Scripting(脚本) -> Artist(美工) -> Backend(后端)
折线图
点击查看代码
import random

import matplotlib.pyplot as plt

# Line chart: simulated per-minute temperatures for two cities over one hour.

# One x position per minute, plus a Chinese label for each position.
x = range(60)
x_ch = ['11点{}分'.format(minute) for minute in x]
# Candidate y tick values; every fifth one is used below.
y_ticks = range(40)

# Random temperatures, Shanghai drawn first and Beijing second.
y_shanghai = [random.uniform(15, 18) for _ in x]
y_beijing = [random.uniform(1, 2) for _ in x]

# SimHei is selected as the sans-serif font (presumably so the Chinese text renders).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Drawing area: 20 x 8 inches at 80 dpi.
plt.figure(figsize=(20, 8), dpi=80)

# Ticks are set before plotting, as in the original: a tick every 5 minutes
# with its text label, and every fifth value of y_ticks on the y axis.
plt.xticks(x[::5], x_ch[::5])
plt.yticks(y_ticks[::5])

# Axis descriptions and title.
plt.xlabel('时间')
plt.ylabel('温度')
plt.title('某些城市11点到12点之间的温度变化')

# Shanghai uses the default line style; Beijing is a red dashed line.
plt.plot(x, y_shanghai, label='上海')
plt.plot(x, y_beijing, color='r', linestyle='--', label='北京')

# The legend picks up the label= values given to plot().
plt.legend(loc='best')

# To write the figure to disk, call plt.savefig('test.png') before plt.show().
plt.show()
颜色字符 | 风格字符 | 位置信息 |
---|---|---|
r 红色 | - 实线 | 'best' 0 |
g 绿色 | -- 虚线 | 'upper right' 1 |
b 蓝色 | -. 点画线 | 'upper left' 2 |
w 白色 | : 点虚线 | 'lower left' 3 |
c 青色 | '' 留空、空格 | 'lower right' 4 |
m 洋红 | | 'right' 5 |
y 黄色 | | 'center left' 6 |
k 黑色 | | 'center right' 7 |
 | | 'lower center' 8 |
 | | 'upper center' 9 |
 | | 'center' 10 |
多个坐标系绘制
点击查看代码
import random

import matplotlib.pyplot as plt

# Two side-by-side axes: simulated per-minute temperatures between 11:00 and
# 12:00, Shanghai on the left axes and Beijing on the right.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Data: one x position per minute, random temperatures per city (Shanghai
# drawn first), a Chinese label per minute and candidate y tick values.
x = range(60)
y_shanghai = [random.uniform(15, 18) for _ in x]
y_beijing = [random.uniform(1, 3) for _ in x]
x_ch = ['11点{}分'.format(minute) for minute in x]
y_ticks = range(40)

# One figure holding a 1 x 2 grid of axes; ax[0] is the left one.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

# Each city is drawn on its own axes.
ax[0].plot(x, y_shanghai, label='上海')
ax[1].plot(x, y_beijing, color='r', linestyle='--', label='北京')

# plt.* functions act on the figure as a whole; per-axes settings go through
# the Axes methods (set_xticks, set_xlabel, ...). Both axes get the same
# ticks, labels, title and legend position.
for axis in ax:
    axis.set_xticks(x[::5], x_ch[::5])
    axis.set_yticks(y_ticks[::5])
    axis.set_xlabel('时间')
    axis.set_ylabel("温度")
    axis.set_title("中午11点到12点之间的温度变化显示")
    axis.legend(loc='upper left')

plt.show()
柱状图
点击查看代码
import matplotlib.pyplot as plt

# Bar chart: one bar per movie, drawn with plt.bar(x, height, width=...).

# Movie titles and the value plotted for each of them.
movie_name = [
    '雷神3:诸神黄昏', '正义联盟', '东方快车谋杀案', '寻梦环游记', '全球风暴', '降魔传',
    '追捕', '七十七天', '密战', '狂兽', '其它',
]
y = [73853, 57767, 22354, 15969, 14839, 8725, 8716, 8318, 7916, 6764, 52222]
# One colour code per bar, in bar order.
bar_colors = ['b', 'r', 'g', 'y', 'c', 'm', 'y', 'k', 'c', 'g', 'g']

# Bars are placed at integer positions 0..len-1.
x = range(len(movie_name))

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(20, 8))

plt.bar(x, y, width=0.5, color=bar_colors)
# Replace the numeric tick text with the movie titles.
plt.xticks(x, movie_name)

plt.show()
点击查看代码
# Grouped bar chart: two bars per movie (labelled 首日票房 and 首周票房 in the legend).
movie_name = ['雷神3:诸神黄昏', '正义联盟', '寻梦环游记']
first_day = [10587.6, 10062.5, 1275.7]
first_weekend = [36224.9, 34479.6, 11830]

# Integer base position per movie; the second series is shifted right by one
# bar width (0.2) and the tick sits half a bar width (0.1) to the right.
x = range(len(movie_name))
weekend_positions = [pos + 0.2 for pos in x]
tick_positions = [pos + 0.1 for pos in x]

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(20, 8))

plt.bar(x, first_day, width=0.2, label='首日票房')
plt.bar(weekend_positions, first_weekend, width=0.2, label='首周票房')

# Put the movie titles on the ticks.
plt.xticks(tick_positions, movie_name)
plt.legend(loc='best')
plt.show()
直方图
点击查看代码
import matplotlib.pyplot as plt

# Histogram of the values in `time` (labelled as movie durations on the x axis).
# Number of groups: the data is split into ranges; count = range / group width,
# i.e. (max - min) / bins.
# Group width: the difference between the two end points of one group.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(20, 8))
time =[131, 98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115, 99, 136, 126, 134, 95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117, 86, 95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123, 86, 101, 99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140, 83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144, 83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137, 92,121, 112, 146, 97, 137, 105, 98, 117, 112, 81, 97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112, 83, 94, 146, 133, 101,131, 116, 111, 84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]
# `bins` is the group WIDTH (2 units); `groups` is the resulting group COUNT,
# which is what plt.hist receives as its second argument.
bins = 2
groups = int((max(time)-min(time)) / bins)
# Draw the histogram.
# NOTE(review): the original note mentions `normed` for showing frequencies on
# the y axis; recent Matplotlib spells that option `density` -- confirm.
plt.hist(time, groups)
# One tick per group edge. The stop value is max + 1 so the right edge of the
# last group is included (range() excludes its stop, which previously dropped
# the final tick).
plt.xticks(list(range(min(time), max(time) + 1, bins)))
plt.xlabel('电影时长大小')
plt.ylabel('电影的数据量')
# Dashed, fully opaque grid lines.
plt.grid(True, linestyle='--', alpha=1)
plt.show()
饼图
点击查看代码
import matplotlib.pyplot as plt
import pandas as pd
from mplfinance.original_flavor import candlestick_ochl

# Pie chart of the share of each movie in place_count.
# (pd and candlestick_ochl are not used here; the candlestick snippet further
# down in these notes relies on them.)

movie_name = [
    '雷神3:诸神黄昏', '正义联盟', '东方快车谋杀案', '寻梦环游记', '全球风暴', '降魔传',
    '追捕', '七十七天', '密战', '狂兽', '其它',
]
place_count = [60605, 54546, 45819, 28243, 13270, 9945, 7679, 6799, 6101, 4621, 20105]
# One colour code per slice, in slice order.
slice_colors = ['b', 'r', 'g', 'y', 'c', 'm', 'y', 'r', 'c', 'g', 'g']

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(20, 8))

# autopct prints each slice's percentage with two decimals.
plt.pie(place_count, labels=movie_name, autopct='%1.2f%%', colors=slice_colors)

# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
plt.legend(loc='best')
plt.title('排片占比示意图')
plt.show()
点击查看代码
# Pie chart with one slice pulled away from the centre.
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]
# One offset per entry of `labels`: only the second slice ('Hogs') gets a
# non-zero offset (0.1), i.e. the gap between it and the rest of the pie.
explode = (0, 0.1, 0, 0)

pie_options = dict(
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',  # percentage text with one decimal
    shadow=True,
    startangle=90,
)
# (pctdistance, not set here, is the distance of the percentage text from the centre.)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, **pie_options)
# Equal axis scaling so the pie is drawn as a circle.
ax1.axis('equal')
plt.show()
K线图
点击查看代码
# For reference only: candlestick (K-line) chart for the column "000001.SZ".

# The four HDF5 files are read in open / close / high / low order and only the
# first 100 rows of each are kept.
price_files = [
    "./stock_plot/day_open.h5",
    "./stock_plot/day_close.h5",
    "./stock_plot/day_high.h5",
    "./stock_plot/day_low(1).h5",
]
price_frames = [pd.read_hdf(path)[:100] for path in price_files]

# Put the "000001.SZ" column of every file side by side and name the columns.
day = pd.concat([frame["000001.SZ"] for frame in price_frames], axis=1)
day.columns = ["open", "close", "high", "low"]
# reset_index() moves the index into the first column; .values yields a plain array.
day = day.reset_index().values

# Draw on a single axes; candlestick_ochl takes the axes as its first argument.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 8), dpi=80)
candlestick_ochl(axes, day, width=0.2, colorup='r', colordown='g')
plt.show()
Numpy
了解Numpy
import random
import time
import numpy as np

# Compare summing 100,000,000 random floats with the built-in sum() on a
# Python list against np.sum() on an ndarray holding the same values.
# (The loop that filled the list had lost its indentation; a comprehension
# builds the same list.)
a = [random.random() for _ in range(100000000)]

# perf_counter() is used because it is the clock intended for measuring
# elapsed time over short intervals.
t1 = time.perf_counter()
sum1 = sum(a)
t2 = time.perf_counter()

# The list -> ndarray conversion is deliberately outside both timed sections.
b = np.array(a)

t4 = time.perf_counter()
sum3 = np.sum(b)
t5 = time.perf_counter()

# Elapsed seconds: built-in sum first, np.sum second.
print(t2-t1, t5-t4)
1.6841034889221191 0.5198299884796143
ndarray n维数组
# ndarray: NumPy's n-dimensional array
# It stores a collection of elements that all share one data type
# Creating arrays
# Create a two-dimensional array
a = np.array([[1,2,3], [4,5,6]])
a.shape # shape of the array
# (2,3)
a.ndim # number of dimensions
# 2
a.size # total number of elements
# 6
a.itemsize # size of one element in bytes
# 4
# NOTE(review): 4 matches the int32 dtype recorded further down; the default
# integer dtype is platform dependent, so other machines may report 8 -- confirm
a.nbytes # 6 * 4
# 24
a.flags # memory-layout flags of the array
# C_CONTIGUOUS : True
# F_CONTIGUOUS : False
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True
# WRITEBACKIFCOPY : False
# UPDATEIFCOPY : False
# The same idea with one-, two- and three-dimensional arrays
a = np.array([[1,2,3], [4,5,6]])
b = np.array([7,8,9,10])
c = np.array([[[1,2,3], [4,5,6]], [[11,22,33], [44,55,66]]])
a.shape
b.shape
c.shape
# (2, 2, 3)
# N-dimensional arrays
# 0-d: 1, 2, 3 (plain scalars)
# 1-d: [7,8,9,10]
# 2-d: [[1,2,3], [4,5,6]]
# 3-d: [[[1,2,3], [4,5,6]], [[11,22,33], [44,55,66]]]
a
a.dtype # element type of the array
# dtype('int32')
# The element type can be chosen explicitly when the array is created
a = np.array([[1,2,3],[4,5,6]],dtype=np.float32)
a.dtype
# dtype('float32')
数组之间的运算
import numpy as np
# Array-with-scalar arithmetic: the scalar is applied to every element
arr = np.array([1,2,3,4])
arr+1
# array([2, 3, 4, 5])
# Arithmetic between two arrays
# Broadcasting
a = np.array([[4,5,6],[7,8,9]])
b = np.array([[2,10], [2, 15]])
# a * b element-wise
# NOTE: a has shape (2, 3) and b has shape (2, 2); these shapes cannot be
# broadcast together, so a * b would raise a ValueError -- presumably why the
# expression is left as a comment.
# Broadcasting: an (8, 2) table of scores multiplied by a (1, 2) row of
# weights; the weight row is applied to every row of `score`.
score = np.array([[80,86],
[82,80],
[85,78],
[90,90],
[86,82],
[82,90],
[78,80],
[92,94]])
percent = np.array([[0.3, 0.7]])
score * percent
#array([[24. , 60.2],
# [24.6, 56. ],
# [25.5, 54.6],
# [27. , 63. ],
# [25.8, 57.4],
# [24.6, 63. ],
# [23.4, 56. ],
# [27.6, 65.8]])
# Matrices: what is special about them is how arithmetic works.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.0.
np.asmatrix(score)
#matrix([[80, 86],
# [82, 80],
# [85, 78],
# [90, 90],
# [86, 82],
# [82, 90],
# [78, 80],
# [92, 94]])
# c is a (2, 1) column of weights, so its matrix form is a column as well.
c = np.array([[0.3], [0.7]])
np.asmatrix(c)
#matrix([[0.3],
# [0.7]])
# Matrix product
# (8, 2) x (2, 1) -> (8, 1): one weighted total per row of `score`
np.matmul(score, c)
#array([[84.2],
# [80.6],
# [80.1],
# [90. ],
# [83.2],
# [87.6],
# [79.4],
# [93.4]])
# 500 x 504 samples drawn from a normal distribution with mean 0 and std 1.
stock_day_rise = np.random.normal(loc=0, scale=1, size=(500, 504))
stock_day_rise.shape
# (500, 504)

# Two 10-row slices, each limited to the first 100 columns.
stock1, stock2 = stock_day_rise[0:10, 0:100], stock_day_rise[10:20, 0:100]
stock2

# Joining
# axis=0 stacks the arrays along the row direction (one below the other)
# axis=1 stacks the arrays along the column direction (side by side)
all_ = np.concatenate((stock1, stock2), axis=0)
# hstack joins column-wise, like axis=1
# vstack joins row-wise, like axis=0

# Splitting: cut the 20 rows into 20 equal pieces along axis 0.
np.split(all_, 20, axis=0)
# Load a comma-separated file; the recorded result below contains nan entries
# (including a first row that is entirely nan)
np.genfromtxt('test.csv', delimiter=',')
#array([[ nan, nan, nan, nan],
# [ 1. , 123. , 1.4, 23. ],
# [ 2. , 110. , nan, 18. ],
# [ 3. , nan, 2.1, 19. ]])
# np.nan is an ordinary Python float
type(np.nan)
#float
# NOTE(review): 2.73 looks like a rough stand-in for Euler's number -- confirm
e = 2.73
1/e
#0.3663003663003663
# np.exp(x) computes e ** x
np.exp(2)
# 7.38905609893065
1 / np.exp(2)
#0.1353352832366127
# 1 / (1 + e ** -x), the logistic (sigmoid) form, evaluated at x = 2
1 / (1 + 1/np.exp(2))
# 0.8807970779778823
np.exp(2)
# 7.38905609893065
# The same expression applied element-wise to an array
m = np.array([1,2,3])
1 / (1 + 1/np.exp(m))
# array([0.73105858, 0.88079708, 0.95257413])
Pandas
pandas数据结构
import numpy as np
import pandas as pd
# 500 x 504 samples drawn from a normal distribution with mean 0 and std 1.
stock_day_rise = np.random.normal(loc=0, scale=1, size=(500, 504))
stock_day_rise
# Recorded output of one run (the values are random and differ on every run):
#array([[-0.51275272, 0.94026123, -0.28734351, ..., -1.80535228,
# 1.12647759, -0.34482647],
# [-0.11082195, -0.61753087, 0.51247014, ..., -0.71336186,
# -0.75038013, 1.23107248],
# [ 1.30920002, -0.86247187, -0.18046507, ..., 0.41082344,
# 0.36615753, -1.15248877],
# ...,
# [-0.64597353, 0.98051196, 0.21157511, ..., 0.3901954 ,
# 0.44220279, 0.7628329 ],
# [-0.45372471, 0.74978987, 1.14269309, ..., -0.9227356 ,
# -0.64413556, -0.36949079],
# [-0.7002719 , 0.57790589, -1.65279998, ..., -1.57232142,
# -0.51782955, 0.13426912]])

# Wrap the array in a DataFrame; no labels are given, so rows and columns are
# numbered from 0 (as the recorded table shows).
stock_df = pd.DataFrame(data=stock_day_rise)
stock_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 |
| ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 0 | -0.512753 | 0.940261 | -0.287344 | 0.531760 | 0.012567 | 0.709473 | 0.239689 | -1.779217 | -0.501474 | -0.507617 | ... | 1.377147 | 1.783230 | 0.196377 | 1.594897 | 0.619660 | -1.876187 | 1.279120 | -1.805352 | 1.126478 |
| 1 | -0.110822 | -0.617531 | 0.512470 | 0.581689 | 0.711916 | 0.813071 | 1.521003 | -0.290721 | -0.156604 | -1.124984 | ... | 0.948753 | 1.402447 | 0.294993 | -0.802038 | -1.067637 | -0.223470 | 0.445096 | -0.713362 | -0.750380 |
| 2 | 1.309200 | -0.862472 | -0.180465 | 0.028584 | 0.037257 | 0.051052 | 1.629817 | -1.133528 | -0.987510 | -1.585423 | ... | 0.245225 | 1.909723 | -1.403577 | -0.406025 | -1.327163 | -0.608240 | 0.755096 | 0.410823 | 0.366158 |
| 3 | 0.920909 | -0.473799 | -1.925638 | -0.989393 | 0.837138 | 0.948183 | 0.011733 | 0.466019 | 0.258141 | 0.270631 | ... | 1.028244 | 0.550098 | -0.168381 | 0.029352 | 0.652068 | -1.366157 | 2.141130 | -0.391050 | -0.524698 |
| 4 | -0.319762 | 0.599024 | -0.154454 | -1.954056 | -1.672396 | -0.158403 | 1.369486 | 1.294337 | 0.920220 | 0.784408 | ... | -0.694639 | -0.250066 | 0.229763 | -1.020350 | 0.725860 | -0.062765 | -0.071443 | -0.708495 | -1.298314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 495 | -0.189318 | 0.680488 | 0.696482 | -0.480230 | -1.868852 | -0.012383 | -1.519626 | -1.279518 | 0.804979 | 1.390888 | ... | 1.013731 | -1.506497 | -0.326615 | -1.552188 | 0.427825 | -0.533029 | 0.143934 | 0.192034 | 1.304076 |
| 496 | 0.838544 | -0.455677 | -0.874880 | 0.494403 | -0.196655 | -0.738068 | -2.619937 | -0.151928 | -1.533008 | -2.134869 | ... | -0.575703 | -0.237983 | -1.551520 | 0.825470 | 0.186887 | -0.449823 | 1.406305 | 1.347674 | 0.058468 |
| 497 | -0.645974 | 0.980512 | 0.211575 | -0.397760 | -0.926155 | -0.628815 | 0.407839 | -0.002652 | 0.106013 | 0.377582 | ... | -0.984033 | 0.882435 | 0.741889 | 1.084276 | -0.514312 | 1.374642 | 0.186176 | 0.390195 | 0.442203 |
| 498 | -0.453725 | 0.749790 | 1.142693 | -0.058502 | 0.327256 | 1.752110 | 0.535332 | 1.743112 | -0.459879 | -2.108713 | ... | 0.119614 | -0.412215 | 0.209263 | 0.313788 | 0.216358 | -1.119070 | 1.067892 | -0.922736 | -0.644136 |
| 499 | -0.700272 | 0.577906 | -1.652800 | -0.523849 | -0.342849 | -0.937188 | 0.835102 | 0.269253 | -0.754492 | -0.169862 | ... | -0.792549 | -0.159701 | 0.900721 | -0.909817 | -1.044447 | -1.155437 | 0.309660 | -1.572321 | -0.517830 |
type(stock_df)
# pandas.core.frame.DataFrame

# Add row labels: one '股票<n>' string per row of stock_day_rise.
stock_code = ['股票' + str(row) for row in range(len(stock_day_rise))]
# stock_code
stock_df = pd.DataFrame(data=stock_day_rise, index=stock_code)
stock_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 |
| ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 股票0 | -0.512753 | 0.940261 | -0.287344 | 0.531760 | 0.012567 | 0.709473 | 0.239689 | -1.779217 | -0.501474 | -0.507617 | ... | 1.377147 | 1.783230 | 0.196377 | 1.594897 | 0.619660 | -1.876187 | 1.279120 | -1.805352 | 1.126478 |
| 股票1 | -0.110822 | -0.617531 | 0.512470 | 0.581689 | 0.711916 | 0.813071 | 1.521003 | -0.290721 | -0.156604 | -1.124984 | ... | 0.948753 | 1.402447 | 0.294993 | -0.802038 | -1.067637 | -0.223470 | 0.445096 | -0.713362 | -0.750380 |
| 股票2 | 1.309200 | -0.862472 | -0.180465 | 0.028584 | 0.037257 | 0.051052 | 1.629817 | -1.133528 | -0.987510 | -1.585423 | ... | 0.245225 | 1.909723 | -1.403577 | -0.406025 | -1.327163 | -0.608240 | 0.755096 | 0.410823 | 0.366158 |
| 股票3 | 0.920909 | -0.473799 | -1.925638 | -0.989393 | 0.837138 | 0.948183 | 0.011733 | 0.466019 | 0.258141 | 0.270631 | ... | 1.028244 | 0.550098 | -0.168381 | 0.029352 | 0.652068 | -1.366157 | 2.141130 | -0.391050 | -0.524698 |
| 股票4 | -0.319762 | 0.599024 | -0.154454 | -1.954056 | -1.672396 | -0.158403 | 1.369486 | 1.294337 | 0.920220 | 0.784408 | ... | -0.694639 | -0.250066 | 0.229763 | -1.020350 | 0.725860 | -0.062765 | -0.071443 | -0.708495 | -1.298314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 股票495 | -0.189318 | 0.680488 | 0.696482 | -0.480230 | -1.868852 | -0.012383 | -1.519626 | -1.279518 | 0.804979 | 1.390888 | ... | 1.013731 | -1.506497 | -0.326615 | -1.552188 | 0.427825 | -0.533029 | 0.143934 | 0.192034 | 1.304076 |
| 股票496 | 0.838544 | -0.455677 | -0.874880 | 0.494403 | -0.196655 | -0.738068 | -2.619937 | -0.151928 | -1.533008 | -2.134869 | ... | -0.575703 | -0.237983 | -1.551520 | 0.825470 | 0.186887 | -0.449823 | 1.406305 | 1.347674 | 0.058468 |
| 股票497 | -0.645974 | 0.980512 | 0.211575 | -0.397760 | -0.926155 | -0.628815 | 0.407839 | -0.002652 | 0.106013 | 0.377582 | ... | -0.984033 | 0.882435 | 0.741889 | 1.084276 | -0.514312 | 1.374642 | 0.186176 | 0.390195 | 0.442203 |
| 股票498 | -0.453725 | 0.749790 | 1.142693 | -0.058502 | 0.327256 | 1.752110 | 0.535332 | 1.743112 | -0.459879 | -2.108713 | ... | 0.119614 | -0.412215 | 0.209263 | 0.313788 | 0.216358 | -1.119070 | 1.067892 | -0.922736 | -0.644136 |
# freq='B' generates business days only, so Saturdays and Sundays are skipped.
# 504 such dates are produced, one per column of stock_day_rise.
date = pd.date_range(start='2017-01-01', periods=504, freq='B')
stock_df = pd.DataFrame(data=stock_day_rise, index=stock_code, columns=date)
pandas的索引与修改
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 500 x 504 samples drawn from a normal distribution with mean 0 and std 1.
stock_day_rise = np.random.normal(loc=0, scale=1, size=(500, 504))
# stock_day_rise

# Row labels '股票0' .. '股票499', one per row of the array.
stock_code = ['股票' + str(row) for row in range(len(stock_day_rise))]
# Column labels: 504 consecutive business days starting from 2017-01-01.
data = pd.date_range(start='2017-01-01', periods=504, freq='B')

stock_dataframe = pd.DataFrame(data=stock_day_rise, index=stock_code, columns=data)
stock_dataframe
| 2017-01-02 | 2017-01-03 | 2017-01-04 | 2017-01-05 | 2017-01-06 | 2017-01-09 | 2017-01-10 | 2017-01-11 | 2017-01-12 | 2017-01-13 | ... | 2018-11-23 | 2018-11-26 | 2018-11-27 | 2018-11-28 | 2018-11-29 | 2018-11-30 | 2018-12-03 | 2018-12-04 | 2018-12-05 | 2018-12-06 |
| ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- |
| 股票0 | -0.336502 | -0.283818 | -1.833312 | -0.034063 | -0.273923 | -0.013894 | 0.571314 | -0.685192 | -0.844952 | 0.697230 | ... | -0.897525 | 2.295753 | 0.726545 | -0.332880 | -0.707125 | 0.301560 | -1.315805 | 1.038277 | 0.232298 |
| 股票1 | 0.431983 | -0.128563 | 0.430541 | 0.260152 | 0.885598 | 1.659742 | 0.407230 | 0.011112 | 0.624398 | -1.356692 | ... | 0.433011 | -0.468825 | 0.536704 | -0.796652 | 0.972271 | 1.537066 | -0.146411 | 1.468827 | 1.733275 |
| 股票2 | 1.068510 | 0.637716 | -1.626844 | -0.985523 | 0.745854 | -0.359343 | 0.889808 | 1.364657 | -1.017752 | -0.772868 | ... | -0.310762 | 0.420062 | 0.903381 | -0.804816 | -0.444837 | 1.373565 | -1.688836 | -0.853804 | 1.056135 |
| 股票3 | 1.650343 | -0.921815 | -0.068494 | 1.043372 | -1.766311 | -1.018881 | -1.031309 | 1.024690 | -0.533850 | 0.350309 | ... | -1.010353 | 0.614537 | -0.511354 | -0.752013 | -1.017201 | -0.886048 | 0.680733 | 1.063538 | -0.383206 |
| 股票4 | -1.128249 | -1.282252 | -0.928848 | 0.075446 | -1.358604 | 1.602723 | -0.966502 | 2.256386 | 0.925430 | -1.027316 | ... | | | | | | | | | |
stock_dataframe.values
array([[-0.33650197, -0.28381791, -1.83331156, ..., 1.03827662,
0.23229771, 0.50349308],
[ 0.43198327, -0.12856302, 0.4305411 , ..., 1.46882666,
1.73327538, 0.44540417],
[ 1.06851021, 0.63771568, -1.6268439 , ..., -0.8538035 ,
1.05613455, 1.13792046],
stock_dataframe.T
| 股票0 | 股票1 | 股票2 | 股票3 | 股票4 | 股票5 | 股票6 | 股票7 | 股票8 | 股票9 | ... | 股票490 | 股票491 | 股票492 | 股票493 | 股票494 | 股票495 | 股票496 | 股票497 | 股票498 | 股票499 |
| ---------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 2017-01-02 | -0.336502 | 0.431983 | 1.068510 | 1.650343 | -1.128249 | -0.605703 | -0.551460 | 0.019854 | -2.092409 | -0.495476 | ... | 0.451665 | -0.328315 | -0.289311 | 0.204668 | 2.794759 | 0.877930 | 1.944935 | -0.657216 | 1.261522 |
| 2017-01-03 | -0.283818 | -0.128563 | 0.637716 | -0.921815 | -1.282252 | 0.427100 | -1.296923 | 0.767681 | -0.621305 | 0.122074 | ... | 2.337562 | -0.350175 | -0.424671 | -1.011431 | 0.184091 | 0.242851 | 0.164125 | 0.910831 | -2.520630 |
| 2017-01-04 | -1.833312 | 0.430541 | -1.626844 | -0.068494 | -0.928848 | 0.030197 | -0.171296 | -0.267061 | -0.285124 | -0.212287 | ... | -0.301684 | 0.015821 | 0.582552 | -0.349317 | 2.052757 | 0.056201 | 1.028949 | -0.730406 | -0.275469 |
| 2017-01-05 | -0.034063 | 0.260152 | -0.985523 | 1.043372 | 0.075446 | -0.282063 | 0.939964 | -1.005864 | -0.536240 | -0.521829 | ... | 0.487618 | 0.211755 | 1.134300 | -1.530601 | -1.129824 | -0.106915 | -0.757018 | -0.306077 | 0.088784 |
| 2017-01-06 | -0.273923 | 0.885598 | 0.745854 | -1.766311 | -1.358604 | -1.407985 | -1.195100 | -0.552709 | -1.014346 | -0.442240 | | | | | | | | | | |
stock_dataframe.head(10)
2017-01-02 2017-01-03 2017-01-04 2017-01-05 2017-01-06 2017-01-09 2017-01-10 2017-01-11 2017-01-12 2017-01-13 ... 2018-11-23 2018-11-26 2018-11-27 2018-11-28 2018-11-29 2018-11-30 2018-12-03 2018-12-04 2018-12-05 2018-12-06
股票0 -0.336502 -0.283818 -1.833312 -0.034063 -0.273923 -0.013894 0.571314 -0.685192 -0.844952 0.697230 ... -0.897525 2.295753 0.726545 -0.332880 -0.707125 0.301560 -1.315805 1.038277 0.232298 0.503493
股票1 0.431983 -0.128563 0.430541 0.260152 0.885598 1.659742 0.407230 0.011112 0.624398 -1.356692 ... 0.433011 -0.468825 0.536704 -0.796652 0.972271 1.537066 -0.146411 1.468827 1.733275 0.445404
股票2 1.068510 0.637716 -1.626844 -0.985523 0.745854 -0.359343 0.889808 1.364657 -1.017752 -0.772868 ... -0.310762 0.420062 0.903381 -0.804816 -0.444837 1.373565 -1.688836 -0.853804 1.056135 1.137920
股票3 1.650343 -0.921815 -0.068494 1.043372 -1.766311 -1.018881 -1.031309 1.024690 -0.533850 0.350309
# stock_dataframe.tail(10)
# DataFrame索引操作
# 重设索引
# stock_dataframe.reset_index(drop=True)
# Build a small sales table and use its 'year' column as the row index.
df = pd.DataFrame({'month':[1,4,7,10], 'year':[1, 1, 2, 2], 'sale':[55, 40, 84, 31]})
# set_index takes its key(s) as ONE argument. Passing 'month' as a second
# positional argument does not add a second key: that slot is the `drop` flag,
# and recent pandas rejects it as a positional argument altogether.
# Indexing by 'year' alone gives the recorded result (index named 'year',
# columns month and sale).
# To index by both columns, pass a list: df.set_index(['year', 'month']).
df = df.set_index('year')
# df
df
month sale
year
1 1 55
1 4 40
2 7 84
2 10 31
df.index
Int64Index([1, 1, 2, 2], dtype='int64', name='year')
# MultiIndex
# Replace the frame with its transpose: in the recorded table the dates are now
# the row labels and the stock names are the columns
stock_dataframe = stock_dataframe.T
stock_dataframe
| 股票0 | 股票1 | 股票2 | 股票3 | 股票4 | 股票5 | 股票6 | 股票7 | 股票8 | 股票9 | ... | 股票490 | 股票491 | 股票492 | 股票493 | 股票494 | 股票495 | 股票496 | 股票497 | 股票498 | 股票499 |
| ---------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 2017-01-02 | -0.336502 | 0.431983 | 1.068510 | 1.650343 | -1.128249 | -0.605703 | -0.551460 | 0.019854 | -2.092409 | -0.495476 | ... | 0.451665 | -0.328315 | -0.289311 | 0.204668 | 2.794759 | 0.877930 | 1.944935 | -0.657216 | 1.261522 |
| 2017-01-03 | -0.283818 | -0.128563 | 0.637716 | -0.921815 | -1.282252 | 0.427100 | -1.296923 | 0.767681 | -0.621305 | 0.122074 | ... | 2.337562 | -0.350175 | -0.424671 | -1.011431 | 0.184091 | 0.242851 | 0.164125 | 0.910831 | -2.520630 |
| 2017-01-04 | -1.833312 | 0.430541 | -1.626844 | -0.068494 | -0.928848 | 0.030197 | -0.171296 | -0.267061 | -0.285124 | -0.212287 | ... | -0.301684 | 0.015821 | 0.582552 | -0.349317 | 2.052757 | 0.056201 | 1.028949 | -0.730406 | -0.275469 |
| 2017-01-05 | -0.034063 | 0.260152 | -0.985523 | 1.043372 | 0.075446 | -0.282063 | 0.939964 | -1.005864 | -0.536240 | -0.521829 | ... | 0.487618 | 0.211755 | 1.134300 | -1.530601 | -1.129824 | -0.106915 | -0.757018 | -0.306077 | 0.088784 |
| 2017-01-06 | -0.273923 | 0.885598 | 0.745854 | -1.766311 | | | | | | | | | | | | | | | | |
stock_dataframe['股票0']['2017-01-02']
# -0.33650197255654596
pd.Series(np.arange(10))
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
red 100
blue 200
green 500
yellow 1000
dtype: int64
# Load the daily stock table from a CSV file in the working directory
# NOTE(review): the recorded output shows dates as row labels, so the first CSV
# column is presumably picked up as the index -- verify against the file
data = pd.read_csv('./stock_day.csv')
# Show the first five rows
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39
2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53
2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32
2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90
2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
data[['open', 'high', 'close']]
open high close
2018-02-27 23.53 25.88 24.16
2018-02-26 22.80 23.78 23.53
2018-02-23 22.88 23.37 22.82
2018-02-22 22.25 22.76 22.28
2018-02-14 21.49 21.99 21.92
... ... ... ...
2015-03-06 13.17 14.48 14.28
2015-03-05 12.88 13.45 13.16
2015-03-04 12.80 12.92 12.90
2015-03-03 12.52 13.06 12.70
2015-03-02 12.25 12.67 12.52
643 rows × 3 columns
# Plain bracket indexing on a DataFrame must go column first, then row
data['open']['2018-02-27']
23.53
# data[:1, :2]
# (NumPy-style [rows, columns] slicing as in the commented line above is
# presumably left disabled because DataFrame brackets do not accept it)
# loc: select by row/column labels (here the dates and the indicator names)
# iloc: select by integer position
# The recorded result includes both end labels of the slice
data.loc['2018-02-27': '2018-02-23', 'open']
2018-02-27 23.53
2018-02-26 22.80
2018-02-23 22.88
Name: open, dtype: float64
data.iloc[0:3, 0:4] # 相当于取到一个DataFrame
open high close low
2018-02-27 23.53 25.88 24.16 23.53
2018-02-26 22.80 23.78 23.53 22.80
2018-02-23 22.88 23.37 22.82 22.71
# .ix: this accessor was removed in pandas 1.0.0
# Sorting
# Sort by the row index; the result is not assigned here, so `data` is unchanged
data.sort_index()
# ascending selects ascending or descending order; the default is ascending
# (smallest first), so ascending=False sorts from largest to smallest
data.sort_values(by='p_change', ascending=False)
# Sort by several columns and show the first 10 rows of the result
data.sort_values(by=['open', 'close'], ascending=False).head(10)
# Statistics
# For every column, the row label at which the minimum occurs
# (idxmin returns the label, not the minimum value itself)
data.idxmin(axis=0)
# cumsum
# Re-sort by the index first and keep the result, then plot the running total
# of p_change
data = data.sort_index()
data
data.p_change.cumsum().plot()
plt.show()
# Boolean logic
# With comparison operators
# A comparison yields a boolean Series ...
data['p_change'] > 2
# ... which can be used to filter the rows
data[data['p_change'] > 2]
# Combine conditions with &; each comparison needs its own parentheses
data[(data['p_change']>2) & (data['turnover']>5)]
# The same filter written as a query string
data.query('p_change>2&turnover>5')
# isin
# Keep the rows whose turnover equals one of the listed values
data[data['turnover'].isin([4.19, 2.39])]
# Arithmetic
data
open_ = data['open']
close_ = data['close']
# add() performs addition, sub() performs subtraction
# Store close - open as a new column
data['my_price_change'] = close_.sub(open_)
# Custom function
# With axis=0 the lambda receives one column at a time and returns max - min
data[['open', 'close']].apply(lambda x: x.max() - x.min(), axis=0)