python 数据分析

基本环境安装

安装Anaconda

Matplot绘图架构

Scripting(脚本) -> Artist(美工) -> Backend(后端)

折线图

点击查看代码

import matplotlib.pyplot as plt
import random 

# Use the SimHei font so the Chinese tick labels, axis labels and title render.
plt.rcParams['font.sans-serif'] = ['SimHei']

# One sample per minute of the hour; only every fifth tick is displayed.
x = range(60)
every_fifth = slice(None, None, 5)

# Random temperature series; Shanghai is generated first, then Beijing.
y_shanghai = [random.uniform(15, 18) for _ in x]
y_beijing = [random.uniform(1, 2) for _ in x]

# Chinese captions for the x ticks (one per minute) and candidate y tick values.
x_ch = list(map('11点{}分'.format, x))
y_ticks = range(40)

# Drawing area: 20x8 inches at 80 dpi.
plt.figure(figsize=(20, 8), dpi=80)

# Replace the default ticks on both axes.
plt.xticks(x[every_fifth], x_ch[every_fifth])
plt.yticks(y_ticks[every_fifth])

# Axis descriptions and title.
plt.xlabel('时间')
plt.ylabel('温度')
plt.title('某些城市11点到12点之间的温度变化')

# Two lines: default style for Shanghai, red dashed for Beijing.
plt.plot(x, y_shanghai, label='上海')
plt.plot(x, y_beijing, label='北京', color='r', linestyle='--')

# Legend built from the `label` values above.
plt.legend(loc='best')
# To also write the figure to disk: plt.savefig('test.png')
plt.show()

颜色字符	风格字符	位置信息
r 红色	- 实线	'best' 0
g 绿色	-- 虚线	'upper right' 1
b 蓝色	-. 点画线	'upper left' 2
w 白色	: 点虚线	'lower left' 3
c 青色	''留空空格	'lower right' 4
m 洋红		'right' 5
y 黄色		'center left' 6
k 黑色		'center right' 7
		'lower center' 8
		'upper center' 9
		'center' 10

多个坐标系绘制

点击查看代码

import matplotlib.pyplot as plt
import random 
# Use the SimHei font so the Chinese text in the figure renders.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Show the per-minute temperature between 11:00 and 12:00, one city per
# coordinate system. `plt.*` calls act on the figure as a whole, while the
# methods of each entry of `ax` act on that single coordinate system.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

# One sample per minute.
x = range(60)

# Random temperature series; Shanghai is generated first, then Beijing.
y_shanghai = [random.uniform(15, 18) for _ in x]
y_beijing = [random.uniform(1, 3) for _ in x]

# Chinese captions for the x ticks and candidate y tick values.
x_ch = list(map('11点{}分'.format, x))
y_ticks = range(40)

# (data, line options) for the left and the right coordinate system.
series = (
    (y_shanghai, {'label': '上海'}),
    (y_beijing, {'color': 'r', 'linestyle': '--', 'label': '北京'}),
)

# Apply the same sequence of steps to each coordinate system in turn:
# draw the line, set the ticks, then add labels, title and legend.
for axis, (temps, line_kwargs) in zip(ax, series):
    axis.plot(x, temps, **line_kwargs)
    axis.set_xticks(x[::5], x_ch[::5])
    axis.set_yticks(y_ticks[::5])
    axis.set_xlabel('时间')
    axis.set_ylabel("温度")
    axis.set_title("中午11点到12点之间的温度变化显示")
    axis.legend(loc='upper left')

plt.show()

柱状图

点击查看代码

import matplotlib.pyplot as plt

# Bar chart drawn with plt.bar(x, height, width=...)
# Use the SimHei font so the Chinese movie names render.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Create the figure.
plt.figure(figsize=(20, 8))

# Movie names and the value plotted for each of them.
movie_name = [
    '雷神3：诸神黄昏', '正义联盟', '东方快车谋杀案', '寻梦环游记', '全球风暴',
    '降魔传', '追捕', '七十七天', '密战', '狂兽', '其它',
]
y = [73853, 57767, 22354, 15969, 14839, 8725, 8716, 8318, 7916, 6764, 52222]

# Bars are placed at integer positions 0..len-1; one colour per bar.
x = range(len(movie_name))
bar_colors = ['b', 'r', 'g', 'y', 'c', 'm', 'y', 'k', 'c', 'g', 'g']

# Draw the bars.
plt.bar(x, y, width=0.5, color=bar_colors)

# Show the movie names instead of the integer positions.
plt.xticks(x, movie_name)

plt.show()

点击查看代码

# Use the SimHei font so the Chinese movie names and legend render.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Create the figure.
plt.figure(figsize=(20, 8))

# Two values per movie: first-day and first-weekend figures.
movie_name = ['雷神3：诸神黄昏', '正义联盟', '寻梦环游记']
first_day = [10587.6, 10062.5, 1275.7]
first_weekend = [36224.9, 34479.6, 11830]

# Each group starts at an integer position; the second bar of a group is
# shifted right by one bar width so the two bars sit side by side.
x = range(len(movie_name))
bar_width = 0.2
weekend_x = [left + bar_width for left in x]
group_centres = [left + bar_width / 2 for left in x]

plt.bar(x, first_day, width=bar_width, label='首日票房')
plt.bar(weekend_x, first_weekend, width=bar_width, label='首周票房')

# Put each movie name midway between its two bars.
plt.xticks(group_centres, movie_name)
plt.legend(loc='best')

plt.show()

直方图

点击查看代码

import matplotlib.pyplot as plt

# Number of groups: the data is split into consecutive value ranges, and the
# count of those ranges is
#     number of groups = range of the data / group width = (max - min) / bins
# Group width: the difference between the two end points of one group.

# Use the SimHei font so the Chinese axis labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(20, 8))

time =[131,  98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115,  99, 136, 126, 134,  95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117,  86,  95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123,  86, 101,  99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140,  83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144,  83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137,  92,121, 112, 146,  97, 137, 105,  98, 117, 112,  81,  97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112,  83,  94, 146, 133, 101,131, 116, 111,  84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]

# Group width: 2 minutes. (Despite its name, `bins` holds the WIDTH of one
# group; the number of groups is derived from it below.)
bins = 2

# Number of groups = range of the data / group width.
groups = (max(time) - min(time)) // bins

# Draw the histogram with `groups` equal-width groups.
# To plot frequencies instead of counts, pass density=True (the older
# `normed` argument no longer exists in current matplotlib).
plt.hist(time, groups)

# One tick per group edge. The upper bound is max(time) + 1 so that the last
# edge, max(time) itself, is included; range() excludes its stop value.
plt.xticks(list(range(min(time), max(time) + 1, bins)))

plt.xlabel('电影时长大小')
plt.ylabel('电影的数据量')
# Show a dashed, fully opaque grid.
plt.grid(True, linestyle='--', alpha=1)

plt.show()

饼图

点击查看代码

import matplotlib.pyplot as plt
import pandas as pd
from mplfinance.original_flavor import candlestick_ochl

# Use the SimHei font so the Chinese labels, legend and title render.
plt.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(20, 8))

# One slice per movie, sized by its screening count.
movie_name = [
    '雷神3：诸神黄昏', '正义联盟', '东方快车谋杀案', '寻梦环游记', '全球风暴',
    '降魔传', '追捕', '七十七天', '密战', '狂兽', '其它',
]
place_count = [60605, 54546, 45819, 28243, 13270, 9945, 7679, 6799, 6101, 4621, 20105]

# One colour per slice; percentages are printed with two decimals.
slice_colors = ['b', 'r', 'g', 'y', 'c', 'm', 'y', 'r', 'c', 'g', 'g']
percent_format = '%1.2f%%'

# Draw the pie chart.
plt.pie(
    place_count,
    labels=movie_name,
    autopct=percent_format,
    colors=slice_colors,
)

# Equal axis scaling so the pie is drawn as a circle rather than an ellipse.
plt.axis('equal')

plt.legend(loc='best')
plt.title('排片占比示意图')

plt.show()

点击查看代码

labels = ('Frogs', 'Hogs', 'Dogs', 'Logs')
sizes = [15, 30, 45, 10]
# "Explode" pulls a slice away from the centre. There is one entry per label,
# and the value is the size of the gap; only the second slice ('Hogs') is
# pulled out here.
explode = (0, 0.1, 0, 0)

# Options for the pie; pctdistance (not set here) would control how far the
# percentage text sits from the centre of the circle.
pie_options = dict(
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, **pie_options)

# Equal axis scaling so the pie is drawn as a circle.
ax1.axis('equal')

plt.show()

K线图

点击查看代码

# Optional material (for reference only).
# Load the first 100 rows of the open / close / high / low price tables.
data = pd.read_hdf("./stock_plot/day_open.h5")[:100]
data1 = pd.read_hdf("./stock_plot/day_close.h5")[:100]
data2 = pd.read_hdf("./stock_plot/day_high.h5")[:100]
data3 = pd.read_hdf("./stock_plot/day_low(1).h5")[:100]

# Put the four price series of stock "000001.SZ" side by side as columns.
day = pd.concat([data["000001.SZ"], data1["000001.SZ"], data2["000001.SZ"],
data3["000001.SZ"]], axis=1)

day.columns = ["open", "close", "high", "low"]
# reset_index() turns the index into the first column, so every row of the
# resulting array is [index, open, close, high, low].
# NOTE(review): presumably the index must be numeric for candlestick_ochl -
# confirm against the data files.
day = day.reset_index().values

# Draw the chart.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 8), dpi=80)
# The first argument is the axes to draw on; rising candles are red and
# falling candles are green.
candlestick_ochl(axes, day, width=0.2, colorup='r', colordown='g')
plt.show()

Numpy

了解Numpy

import random
import time
import numpy as np

# Compare the built-in sum() over a Python list with np.sum() over an ndarray
# that holds the same values.
# NOTE: 100 million floats need several GB of memory; lower the count if the
# machine cannot hold them.
a = [random.random() for _ in range(100000000)]

# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which is what interval timing needs.
t1 = time.perf_counter()
sum1 = sum(a)
t2 = time.perf_counter()

# The list -> ndarray conversion happens before t4, so it is not timed.
b = np.array(a)
t4 = time.perf_counter()
sum3 = np.sum(b)
t5 = time.perf_counter()

# Elapsed seconds: pure-Python sum first, NumPy sum second.
print(t2-t1, t5-t4)

1.6841034889221191 0.5198299884796143

ndarray n维数组

# ndarray: the n-dimensional array type
# It stores a collection of items that all have the same type.

# Creating arrays
# Create a two-dimensional array
a = np.array([[1,2,3], [4,5,6]])

a.shape  # shape of the array
# (2, 3)

a.ndim  # number of dimensions
# 2

a.size  # total number of elements
# 6

a.itemsize  # size in bytes of one element
# 4   (the dtype shown further down is int32; it would be 8 for int64)

a.nbytes  # total bytes = size * itemsize = 6 * 4
# 24

a.flags  # memory-layout information
# C_CONTIGUOUS : True
# F_CONTIGUOUS : False
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True
# WRITEBACKIFCOPY : False
# UPDATEIFCOPY : False

# Arrays with two, one and three dimensions
a = np.array([[1,2,3], [4,5,6]])
b = np.array([7,8,9,10])
c = np.array([[[1,2,3], [4,5,6]], [[11,22,33], [44,55,66]]])

a.shape
# (2, 3)
b.shape
# (4,)
c.shape
# (2, 2, 3)

# N-dimensional arrays
# 0-d: a single scalar such as 1, 2 or 3
# 1-d: [7,8,9,10]
# 2-d: [[1,2,3], [4,5,6]]
# 3-d: [[[1,2,3], [4,5,6]], [[11,22,33], [44,55,66]]]

a
a.dtype  # element type of the array
# dtype('int32')

# The element type can be chosen explicitly when the array is created.
a = np.array([[1,2,3],[4,5,6]],dtype=np.float32)
a.dtype
# dtype('float32')

数组之间的运算

import numpy as np

# Array with scalar: the scalar is applied to every element.
arr = np.array([1,2,3,4])
arr+1
# array([2, 3, 4, 5])

# Array with array
# Broadcasting
a = np.array([[4,5,6],[7,8,9]])
b = np.array([[2,10], [2, 15]])
# a * b is element-wise and needs broadcast-compatible shapes; (2, 3) and
# (2, 2) are not compatible, so the expression is left commented out.

# Eight rows of two scores each.
score = np.array([[80,86],
[82,80],
[85,78],
[90,90],
[86,82],
[82,90],
[78,80],
[92,94]])

# Shape (1, 2): broadcast against every row of `score`, so the first column
# is scaled by 0.3 and the second column by 0.7.
percent = np.array([[0.3, 0.7]])
score * percent
#array([[24. , 60.2],
#       [24.6, 56. ],
#       [25.5, 54.6],
#       [27. , 63. ],
#       [25.8, 57.4],
#       [24.6, 63. ],
#       [23.4, 56. ],
#       [27.6, 65.8]])

# Matrices: what makes them special is how they are multiplied.
np.mat(score)
#matrix([[80, 86],
#        [82, 80],
#        [85, 78],
#        [90, 90],
#        [86, 82],
#        [82, 90],
#        [78, 80],
#        [92, 94]])

# Weights as a (2, 1) column vector.
c = np.array([[0.3], [0.7]])
np.mat(c)
#matrix([[0.3],
#        [0.7]])

# Matrix multiplication
# (8, 2) * (2, 1) = (8, 1)
np.matmul(score, c)

#array([[84.2],
#       [80.6],
#       [80.1],
#       [90. ],
#       [83.2],
#       [87.6],
#       [79.4],
#       [93.4]])

# 500 x 504 samples drawn from a normal distribution (mean 0, std 1).
stock_day_rise = np.random.normal(0, 1, [500, 504])
stock_day_rise.shape
# (500, 504)

# Two (10, 100) slices: rows 0-9 and rows 10-19, first 100 columns of each.
stock1 = stock_day_rise[:10, :100]
stock2 = stock_day_rise[10: 20, :100]

stock2

# Joining arrays
# axis=0: the arrays are stacked along the row direction (more rows)
# axis=1: the arrays are joined along the column direction (more columns)

# Result has shape (20, 100).
all_ = np.concatenate([stock1, stock2], axis=0)

# hstack  joins column-wise, like axis=1
# vstack  joins row-wise, like axis=0

# Splitting
# Split the 20 rows into 20 equal parts along axis 0 (one row per part).
np.split(all_, 20, axis=0)
# Read a comma-separated file; in the output below the first row and the
# empty cells come back as nan.
np.genfromtxt('test.csv', delimiter=',')
#array([[  nan,   nan,   nan,   nan],
#       [  1. , 123. ,   1.4,  23. ],
#       [  2. , 110. ,   nan,  18. ],
#       [  3. ,   nan,   2.1,  19. ]])

# nan is itself a float value.
type(np.nan)
#float

# NOTE(review): presumably a hand-typed approximation of Euler's number
# (about 2.71828) - confirm; the output below corresponds to 2.73.
e = 2.73
1/e
#0.3663003663003663

# np.exp(x) computes e ** x.
np.exp(2)
# 7.38905609893065

1 / np.exp(2)
#0.1353352832366127

# 1 / (1 + e ** -x), evaluated at x = 2.
1 / (1 + 1/np.exp(2))
# 0.8807970779778823

np.exp(2)
# 7.38905609893065

# The same expression applied element-wise to a whole array.
m = np.array([1,2,3])
1 / (1 + 1/np.exp(m))
# array([0.73105858, 0.88079708, 0.95257413])

Pandas

pandas数据结构

import numpy as np
import pandas as pd

stock_day_rise = np.random.normal(0, 1, [500, 504])

stock_day_rise
#array([[-0.51275272,  0.94026123, -0.28734351, ..., -1.80535228,
#         1.12647759, -0.34482647],
#       [-0.11082195, -0.61753087,  0.51247014, ..., -0.71336186,
#        -0.75038013,  1.23107248],
#       [ 1.30920002, -0.86247187, -0.18046507, ...,  0.41082344,
#         0.36615753, -1.15248877],
#       ...,
#       [-0.64597353,  0.98051196,  0.21157511, ...,  0.3901954 ,
#         0.44220279,  0.7628329 ],
#       [-0.45372471,  0.74978987,  1.14269309, ..., -0.9227356 ,
#        -0.64413556, -0.36949079],
#       [-0.7002719 ,  0.57790589, -1.65279998, ..., -1.57232142,
#        -0.51782955,  0.13426912]])

# Wrap the ndarray in a DataFrame; with no labels given, rows and columns
# are numbered from 0 (see the table below).
stock_df = pd.DataFrame(stock_day_rise)
stock_df
| 0    | 1         | 2         | 3         | 4         | 5         | 6         | 7         | 8         | 9         | ...       | 494  | 495       | 496       | 497       | 498       | 499       | 500       | 501       | 502       | 503       |
| ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 0    | -0.512753 | 0.940261  | -0.287344 | 0.531760  | 0.012567  | 0.709473  | 0.239689  | -1.779217 | -0.501474 | -0.507617 | ...  | 1.377147  | 1.783230  | 0.196377  | 1.594897  | 0.619660  | -1.876187 | 1.279120  | -1.805352 | 1.126478  |
| 1    | -0.110822 | -0.617531 | 0.512470  | 0.581689  | 0.711916  | 0.813071  | 1.521003  | -0.290721 | -0.156604 | -1.124984 | ...  | 0.948753  | 1.402447  | 0.294993  | -0.802038 | -1.067637 | -0.223470 | 0.445096  | -0.713362 | -0.750380 |
| 2    | 1.309200  | -0.862472 | -0.180465 | 0.028584  | 0.037257  | 0.051052  | 1.629817  | -1.133528 | -0.987510 | -1.585423 | ...  | 0.245225  | 1.909723  | -1.403577 | -0.406025 | -1.327163 | -0.608240 | 0.755096  | 0.410823  | 0.366158  |
| 3    | 0.920909  | -0.473799 | -1.925638 | -0.989393 | 0.837138  | 0.948183  | 0.011733  | 0.466019  | 0.258141  | 0.270631  | ...  | 1.028244  | 0.550098  | -0.168381 | 0.029352  | 0.652068  | -1.366157 | 2.141130  | -0.391050 | -0.524698 |
| 4    | -0.319762 | 0.599024  | -0.154454 | -1.954056 | -1.672396 | -0.158403 | 1.369486  | 1.294337  | 0.920220  | 0.784408  | ...  | -0.694639 | -0.250066 | 0.229763  | -1.020350 | 0.725860  | -0.062765 | -0.071443 | -0.708495 | -1.298314 |
| ...  | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...  | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       |
| 495  | -0.189318 | 0.680488  | 0.696482  | -0.480230 | -1.868852 | -0.012383 | -1.519626 | -1.279518 | 0.804979  | 1.390888  | ...  | 1.013731  | -1.506497 | -0.326615 | -1.552188 | 0.427825  | -0.533029 | 0.143934  | 0.192034  | 1.304076  |
| 496  | 0.838544  | -0.455677 | -0.874880 | 0.494403  | -0.196655 | -0.738068 | -2.619937 | -0.151928 | -1.533008 | -2.134869 | ...  | -0.575703 | -0.237983 | -1.551520 | 0.825470  | 0.186887  | -0.449823 | 1.406305  | 1.347674  | 0.058468  |
| 497  | -0.645974 | 0.980512  | 0.211575  | -0.397760 | -0.926155 | -0.628815 | 0.407839  | -0.002652 | 0.106013  | 0.377582  | ...  | -0.984033 | 0.882435  | 0.741889  | 1.084276  | -0.514312 | 1.374642  | 0.186176  | 0.390195  | 0.442203  |
| 498  | -0.453725 | 0.749790  | 1.142693  | -0.058502 | 0.327256  | 1.752110  | 0.535332  | 1.743112  | -0.459879 | -2.108713 | ...  | 0.119614  | -0.412215 | 0.209263  | 0.313788  | 0.216358  | -1.119070 | 1.067892  | -0.922736 | -0.644136 |
| 499  | -0.700272 | 0.577906  | -1.652800 | -0.523849 | -0.342849 | -0.937188 | 0.835102  | 0.269253  | -0.754492 | -0.169862 | ...  | -0.792549 | -0.159701 | 0.900721  | -0.909817 | -1.044447 | -1.155437 | 0.309660  | -1.572321 | -0.517830 |

type(stock_df)
#pandas.core.frame.DataFrame

# Add row labels: one label per row of the array.
stock_code = ['股票' + str(i) for i in range(stock_day_rise.shape[0])]

# stock_code
# Rebuild the DataFrame with those labels as its row index.
stock_df = pd.DataFrame(stock_day_rise,index=stock_code)

stock_df
| 0     | 1         | 2         | 3         | 4         | 5         | 6         | 7         | 8         | 9         | ...       | 494  | 495       | 496       | 497       | 498       | 499       | 500       | 501       | 502       | 503       |
| ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ---- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 股票0   | -0.512753 | 0.940261  | -0.287344 | 0.531760  | 0.012567  | 0.709473  | 0.239689  | -1.779217 | -0.501474 | -0.507617 | ...  | 1.377147  | 1.783230  | 0.196377  | 1.594897  | 0.619660  | -1.876187 | 1.279120  | -1.805352 | 1.126478  |
| 股票1   | -0.110822 | -0.617531 | 0.512470  | 0.581689  | 0.711916  | 0.813071  | 1.521003  | -0.290721 | -0.156604 | -1.124984 | ...  | 0.948753  | 1.402447  | 0.294993  | -0.802038 | -1.067637 | -0.223470 | 0.445096  | -0.713362 | -0.750380 |
| 股票2   | 1.309200  | -0.862472 | -0.180465 | 0.028584  | 0.037257  | 0.051052  | 1.629817  | -1.133528 | -0.987510 | -1.585423 | ...  | 0.245225  | 1.909723  | -1.403577 | -0.406025 | -1.327163 | -0.608240 | 0.755096  | 0.410823  | 0.366158  |
| 股票3   | 0.920909  | -0.473799 | -1.925638 | -0.989393 | 0.837138  | 0.948183  | 0.011733  | 0.466019  | 0.258141  | 0.270631  | ...  | 1.028244  | 0.550098  | -0.168381 | 0.029352  | 0.652068  | -1.366157 | 2.141130  | -0.391050 | -0.524698 |
| 股票4   | -0.319762 | 0.599024  | -0.154454 | -1.954056 | -1.672396 | -0.158403 | 1.369486  | 1.294337  | 0.920220  | 0.784408  | ...  | -0.694639 | -0.250066 | 0.229763  | -1.020350 | 0.725860  | -0.062765 | -0.071443 | -0.708495 | -1.298314 |
| ...   | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...  | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       |
| 股票495 | -0.189318 | 0.680488  | 0.696482  | -0.480230 | -1.868852 | -0.012383 | -1.519626 | -1.279518 | 0.804979  | 1.390888  | ...  | 1.013731  | -1.506497 | -0.326615 | -1.552188 | 0.427825  | -0.533029 | 0.143934  | 0.192034  | 1.304076  |
| 股票496 | 0.838544  | -0.455677 | -0.874880 | 0.494403  | -0.196655 | -0.738068 | -2.619937 | -0.151928 | -1.533008 | -2.134869 | ...  | -0.575703 | -0.237983 | -1.551520 | 0.825470  | 0.186887  | -0.449823 | 1.406305  | 1.347674  | 0.058468  |
| 股票497 | -0.645974 | 0.980512  | 0.211575  | -0.397760 | -0.926155 | -0.628815 | 0.407839  | -0.002652 | 0.106013  | 0.377582  | ...  | -0.984033 | 0.882435  | 0.741889  | 1.084276  | -0.514312 | 1.374642  | 0.186176  | 0.390195  | 0.442203  |
| 股票498 | -0.453725 | 0.749790  | 1.142693  | -0.058502 | 0.327256  | 1.752110  | 0.535332  | 1.743112  | -0.459879 | -2.108713 | ...  | 0.119614  | -0.412215 | 0.209263  | 0.313788  | 0.216358  | -1.119070 | 1.067892  | -0.922736 | -0.644136 |

# freq='B' means business days, so Saturdays and Sundays are skipped.
# 504 dates are generated, one per column of the array.
date = pd.date_range('2017-01-01', periods=504, freq='B')
stock_df = pd.DataFrame(stock_day_rise, index=stock_code, columns=date)

pandas的索引与修改

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 500 x 504 random values drawn from a normal distribution (mean 0, std 1).
stock_day_rise = np.random.normal(0, 1, [500, 504])
# stock_day_rise
# Row labels: one per row of the array.
stock_code = ['股票' + str(i) for i in range(stock_day_rise.shape[0])]
# Column labels: 504 business days; the first one shown below is 2017-01-02.
data = pd.date_range('2017-01-01', periods=504, freq='B')
stock_dataframe = pd.DataFrame(stock_day_rise, index=stock_code, columns=data)
stock_dataframe
| 2017-01-02 | 2017-01-03 | 2017-01-04 | 2017-01-05 | 2017-01-06 | 2017-01-09 | 2017-01-10 | 2017-01-11 | 2017-01-12 | 2017-01-13 | ...       | 2018-11-23 | 2018-11-26 | 2018-11-27 | 2018-11-28 | 2018-11-29 | 2018-11-30 | 2018-12-03 | 2018-12-04 | 2018-12-05 | 2018-12-06 |
| ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- |
| 股票0        | -0.336502  | -0.283818  | -1.833312  | -0.034063  | -0.273923  | -0.013894  | 0.571314   | -0.685192  | -0.844952  | 0.697230  | ...        | -0.897525  | 2.295753   | 0.726545   | -0.332880  | -0.707125  | 0.301560   | -1.315805  | 1.038277   | 0.232298   |
| 股票1        | 0.431983   | -0.128563  | 0.430541   | 0.260152   | 0.885598   | 1.659742   | 0.407230   | 0.011112   | 0.624398   | -1.356692 | ...        | 0.433011   | -0.468825  | 0.536704   | -0.796652  | 0.972271   | 1.537066   | -0.146411  | 1.468827   | 1.733275   |
| 股票2        | 1.068510   | 0.637716   | -1.626844  | -0.985523  | 0.745854   | -0.359343  | 0.889808   | 1.364657   | -1.017752  | -0.772868 | ...        | -0.310762  | 0.420062   | 0.903381   | -0.804816  | -0.444837  | 1.373565   | -1.688836  | -0.853804  | 1.056135   |
| 股票3        | 1.650343   | -0.921815  | -0.068494  | 1.043372   | -1.766311  | -1.018881  | -1.031309  | 1.024690   | -0.533850  | 0.350309  | ...        | -1.010353  | 0.614537   | -0.511354  | -0.752013  | -1.017201  | -0.886048  | 0.680733   | 1.063538   | -0.383206  |
| 股票4        | -1.128249  | -1.282252  | -0.928848  | 0.075446   | -1.358604  | 1.602723   | -0.966502  | 2.256386   | 0.925430   | -1.027316 | ...        |            |            |            |            |            |            |            |            |            |

stock_dataframe.values
array([[-0.33650197, -0.28381791, -1.83331156, ...,  1.03827662,
         0.23229771,  0.50349308],
       [ 0.43198327, -0.12856302,  0.4305411 , ...,  1.46882666,
         1.73327538,  0.44540417],
       [ 1.06851021,  0.63771568, -1.6268439 , ..., -0.8538035 ,
         1.05613455,  1.13792046],
stock_dataframe.T
| 股票0        | 股票1       | 股票2       | 股票3       | 股票4       | 股票5       | 股票6       | 股票7       | 股票8       | 股票9       | ...       | 股票490 | 股票491     | 股票492     | 股票493     | 股票494     | 股票495     | 股票496     | 股票497     | 股票498     | 股票499     |
| ---------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 2017-01-02 | -0.336502 | 0.431983  | 1.068510  | 1.650343  | -1.128249 | -0.605703 | -0.551460 | 0.019854  | -2.092409 | -0.495476 | ...   | 0.451665  | -0.328315 | -0.289311 | 0.204668  | 2.794759  | 0.877930  | 1.944935  | -0.657216 | 1.261522  |
| 2017-01-03 | -0.283818 | -0.128563 | 0.637716  | -0.921815 | -1.282252 | 0.427100  | -1.296923 | 0.767681  | -0.621305 | 0.122074  | ...   | 2.337562  | -0.350175 | -0.424671 | -1.011431 | 0.184091  | 0.242851  | 0.164125  | 0.910831  | -2.520630 |
| 2017-01-04 | -1.833312 | 0.430541  | -1.626844 | -0.068494 | -0.928848 | 0.030197  | -0.171296 | -0.267061 | -0.285124 | -0.212287 | ...   | -0.301684 | 0.015821  | 0.582552  | -0.349317 | 2.052757  | 0.056201  | 1.028949  | -0.730406 | -0.275469 |
| 2017-01-05 | -0.034063 | 0.260152  | -0.985523 | 1.043372  | 0.075446  | -0.282063 | 0.939964  | -1.005864 | -0.536240 | -0.521829 | ...   | 0.487618  | 0.211755  | 1.134300  | -1.530601 | -1.129824 | -0.106915 | -0.757018 | -0.306077 | 0.088784  |
| 2017-01-06 | -0.273923 | 0.885598  | 0.745854  | -1.766311 | -1.358604 | -1.407985 | -1.195100 | -0.552709 | -1.014346 | -0.442240 |       |           |           |           |           |           |           |           |           |           |

stock_dataframe.head(10)
2017-01-02	2017-01-03	2017-01-04	2017-01-05	2017-01-06	2017-01-09	2017-01-10	2017-01-11	2017-01-12	2017-01-13	...	2018-11-23	2018-11-26	2018-11-27	2018-11-28	2018-11-29	2018-11-30	2018-12-03	2018-12-04	2018-12-05	2018-12-06
股票0	-0.336502	-0.283818	-1.833312	-0.034063	-0.273923	-0.013894	0.571314	-0.685192	-0.844952	0.697230	...	-0.897525	2.295753	0.726545	-0.332880	-0.707125	0.301560	-1.315805	1.038277	0.232298	0.503493
股票1	0.431983	-0.128563	0.430541	0.260152	0.885598	1.659742	0.407230	0.011112	0.624398	-1.356692	...	0.433011	-0.468825	0.536704	-0.796652	0.972271	1.537066	-0.146411	1.468827	1.733275	0.445404
股票2	1.068510	0.637716	-1.626844	-0.985523	0.745854	-0.359343	0.889808	1.364657	-1.017752	-0.772868	...	-0.310762	0.420062	0.903381	-0.804816	-0.444837	1.373565	-1.688836	-0.853804	1.056135	1.137920
股票3	1.650343	-0.921815	-0.068494	1.043372	-1.766311	-1.018881	-1.031309	1.024690	-0.533850	0.350309	
# stock_dataframe.tail(10)

# DataFrame index operations
# Reset to the default integer index, discarding the old one:
# stock_dataframe.reset_index(drop=True)

df = pd.DataFrame({'month':[1,4,7,10], 'year':[1, 1, 2, 2], 'sale':[55, 40, 84, 31]})
# Use the 'year' column as the row index. The column(s) to index by must be
# passed as ONE argument: a second positional argument is not another key (it
# used to be taken as `drop`, and is a TypeError on pandas >= 2.0).
# For a two-level MultiIndex, pass a list: df.set_index(['year', 'month']).
df = df.set_index('year')
# df

df
month	sale
year		
1	1	55
1	4	40
2	7	84
2	10	31

df.index
Int64Index([1, 1, 2, 2], dtype='int64', name='year')

# MultiIndex
# Transpose the frame: dates become the row index and stocks the columns.
stock_dataframe = stock_dataframe.T
stock_dataframe
| 股票0        | 股票1       | 股票2       | 股票3       | 股票4       | 股票5       | 股票6       | 股票7       | 股票8       | 股票9       | ...       | 股票490 | 股票491     | 股票492     | 股票493     | 股票494     | 股票495     | 股票496     | 股票497     | 股票498     | 股票499     |
| ---------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | ----- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| 2017-01-02 | -0.336502 | 0.431983  | 1.068510  | 1.650343  | -1.128249 | -0.605703 | -0.551460 | 0.019854  | -2.092409 | -0.495476 | ...   | 0.451665  | -0.328315 | -0.289311 | 0.204668  | 2.794759  | 0.877930  | 1.944935  | -0.657216 | 1.261522  |
| 2017-01-03 | -0.283818 | -0.128563 | 0.637716  | -0.921815 | -1.282252 | 0.427100  | -1.296923 | 0.767681  | -0.621305 | 0.122074  | ...   | 2.337562  | -0.350175 | -0.424671 | -1.011431 | 0.184091  | 0.242851  | 0.164125  | 0.910831  | -2.520630 |
| 2017-01-04 | -1.833312 | 0.430541  | -1.626844 | -0.068494 | -0.928848 | 0.030197  | -0.171296 | -0.267061 | -0.285124 | -0.212287 | ...   | -0.301684 | 0.015821  | 0.582552  | -0.349317 | 2.052757  | 0.056201  | 1.028949  | -0.730406 | -0.275469 |
| 2017-01-05 | -0.034063 | 0.260152  | -0.985523 | 1.043372  | 0.075446  | -0.282063 | 0.939964  | -1.005864 | -0.536240 | -0.521829 | ...   | 0.487618  | 0.211755  | 1.134300  | -1.530601 | -1.129824 | -0.106915 | -0.757018 | -0.306077 | 0.088784  |
| 2017-01-06 | -0.273923 | 0.885598  | 0.745854  | -1.766311 |           |           |           |           |           |           |       |           |           |           |           |           |           |           |           |           |

# Column label first, then row label (after the transpose the rows are dates).
stock_dataframe['股票0']['2017-01-02']
# -0.33650197255654596

pd.Series(np.arange(10))
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
red        100
blue       200
green      500
yellow    1000
dtype: int64

# Load the daily stock data; in the output below the dates act as row labels.
data = pd.read_csv('./stock_day.csv')
# Show the first five rows.
data.head()
open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58

data[['open', 'high', 'close']]
open	high	close
2018-02-27	23.53	25.88	24.16
2018-02-26	22.80	23.78	23.53
2018-02-23	22.88	23.37	22.82
2018-02-22	22.25	22.76	22.28
2018-02-14	21.49	21.99	21.92
...	...	...	...
2015-03-06	13.17	14.48	14.28
2015-03-05	12.88	13.45	13.16
2015-03-04	12.80	12.92	12.90
2015-03-03	12.52	13.06	12.70
2015-03-02	12.25	12.67	12.52
643 rows × 3 columns

# With plain [] indexing by labels, the column must be selected first and
# the row second.
data['open']['2018-02-27']
23.53

# data[:1, :2]

# loc: selects by row / column LABELS only
# iloc: selects by integer POSITION (in this frame the labels themselves are
#       dates for rows and indicator names for columns)
data.loc['2018-02-27': '2018-02-23', 'open']

2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
Name: open, dtype: float64

data.iloc[0:3, 0:4]  # 相当于取到一个DataFrame

	open	high	close	low
2018-02-27	23.53	25.88	24.16	23.53
2018-02-26	22.80	23.78	23.53	22.80
2018-02-23	22.88	23.37	22.82	22.71


# .ix was removed in pandas 1.0.0; use .loc / .iloc instead.
# Sorting
# Sort the rows by their index labels (the result is not assigned here).
data.sort_index()

# ascending selects the direction: the default is smallest to largest,
# ascending=False sorts from largest to smallest.
data.sort_values(by='p_change', ascending=False)

# Sort by 'open' first and break ties with 'close'; show the top ten rows.
data.sort_values(by=['open', 'close'], ascending=False).head(10)

# Statistics
# For every column, the index label of the row holding its minimum
# (the position of the minimum, not the minimum value itself).
data.idxmin(axis=0)

# cumsum
# Keep the frame sorted by index label so the cumulative sum below follows
# that order.
data = data.sort_index()
data

# Running total of the daily percentage change, drawn as a line.
data.p_change.cumsum().plot()
plt.show()

# Logical operations
# Using comparison operators: the result is a boolean Series
data['p_change'] > 2

# A boolean Series used as a mask keeps only the rows where it is True.
data[data['p_change'] > 2]

# Combine conditions with &; each condition needs its own parentheses.
data[(data['p_change']>2) & (data['turnover']>5)]

# The same filter written as a query string.
data.query('p_change>2&turnover>5')

# isin
# Rows whose 'turnover' equals one of the listed values.
data[data['turnover'].isin([4.19, 2.39])]

# Arithmetic
data


open_ = data['open']
close_ = data['close']

# add is addition, sub is subtraction
# New column: close minus open for each row.
data['my_price_change'] = close_.sub(open_)

# Custom function
# axis=0 hands each column to the lambda, giving max - min per column.
data[['open', 'close']].apply(lambda x: x.max() - x.min(), axis=0)

posted on 2022-06-19 15:20 康二栋阅读(57) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· git生成ssh密钥详细步骤

· 数据挖掘基础

· Pandas 2.2 中文官方教程和指南（十九）

· Python数据处理

· 数据分析笔记

阅读排行：
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布：重大改进与新特性概览！
· AI与.NET技术实操系列（二）：开始使用ML.NET
· 单线程的Redis速度为什么快？

kangerdong

导航

公告

统计

搜索

常用链接

随笔分类

随笔档案

文章分类

阅读排行榜

推荐排行榜

python 数据分析

基本环境安装

Matplot绘图架构

折线图

多个坐标系绘制

柱状图

直方图

饼图

K线图

Numpy

了解Numpy

ndarray n维数组

数组之间的运算

Pandas

pandas数据结构

pandas的索引与修改