数据分析
数据分析
numpy+scipy+matplotlib+pandas
scikit-learn + tensorflow
一、Numpy的特点
1、擅长数值计算
2、足够高的运算性能
3、支持矢量化运算
4、免费、开源
import numpy as np import datetime as dt n = 100000 start = dt.datetime.now() A,B, = [],[] for i in range(n): A.append(i**2) B.append(i**3) C = [] for a,b in zip(A,B): C.append(a+b) print((dt.datetime.now() - start).microseconds)#143709(微秒) # 基于Numpy的实现 start = dt.datetime.now() A, B = np.arange(n) ** 2, np.arange(n) ** 3#此处的计算相当于调用了C语言进行计算,所以更快 C = A+B print((dt.datetime.now() - start).microseconds)#34831(微秒)
二、Numpy的数组
1、Numpy中的数组是ndarry类型的对象,将实际数据和元数据分开存放,独立操作,以此提升性能
2、Numpy数组的元素类型必须相同(同质性)
3、Numpy数组的元素可以通过基0的下标单独访问,size个元素,合理下标范围[0,size-1],[-size,-1]
数组[i][j][k]等价于数组[i,j,k]
4、Numpy数组通过dtype和shape属性表示元素的类型和维度,其中维度的类型是元组,按照从高到低的顺序排列每一维的大小(页,行,列)
5、创建数组的方法:
1、numpy.arange(起始值(0),终止值,步长(1)),只能生成一维数组
2、numpy.array(任意可以被解释为数据的序列)
6、稀疏矩阵:矩阵中(二维数组)零元素比非零元素多并且分布没有规律称为稀疏矩阵,用scipy中的sparse.csr_matrix()表示稀疏矩阵更好
import numpy as np from scipy import sparse matrix = np.array([[0,2,0],[0,0,0],[1,0,0]]) csr = sparse.csr_matrix(matrix) print(matrix)
"""
[[0 2 0]
[0 0 0]
[1 0 0]]
""" print(csr)
"""
只显示非零的数据和位置:1行2列的位置为2,3行1列的位置为1
(0, 1) 2
(2, 0) 1
"""
from __future__ import unicode_literals import numpy as np a = np.arange(1,3) print(a, a.shape, sep=' ')#[1 2] (2,) b = np.array([{1,2,3}, [4,5,6]]) print(b, b.shape, sep=' ')# [{1, 2, 3} list([4, 5, 6])] (2,) print(b.dtype) # object,是python对象,因为nump会将不同类型的元素转为相同的对象 c = np.array(['A','B',"CD"]) print(c.dtype)# <U2, 表示小端字节序Unicode2字节,‘A’其实是<U1,因为数组的同质性,所以变为了<U2
三、Numpy的内置数据类型
1、布尔
bool_
2、整数
1、有符号整数
int8/int16/int32/int64
2、无符号整数
uint8/uint16/uint32/uint64
3、浮点型
float16/float32/float64
4、复数
complex64/complex128,其中实部和虚部都是float型的
5、字符串
str_
四、显式使用数据类型
1、默认数据类型不满足需要,人为指定
2、复合类型,在一个元素中含有多个字段
3、用不同类型访问同一个元素
numpy.array(...., dtype = 类型)
import numpy as np a = np.array([1,2,3,4,5],dtype=np.int8) print('a.dtype:',a.dtype) b = a.astype(float) print('b.dtype:',b.dtype) c = a.astype(np.float32) print('c.dtype:',c.dtype) d = a.astype(np.str_) print('d.dtype:',d.dtype,d) # 一下三种写法相同 #e = np.array([1234],dtype = np.int32) #e = np.array([1234],dtype = 'int32') e = np.array([1234],dtype = 'i4') print('e:',e.dtype,e.shape) # dtype = (变长类型,长度) f = np.array(['1234'],dtype=(np.str_, 2))# 相当于dtype='U2', 会把‘1234’截取为‘12’ print('f:',f.dtype,f[0]) # (定长类型,(维度)) g = np.array([(1,2,3,4)], dtype=(np.int32,4))# 每个元素的类型是4个int32 g = np.array([((1,2),(3,4))],dtype=(np.int32,(2,2))) print('g:',g.dtype,g.shape) # '类型字符串1,类型字符串2,类型字符串3,...' h = np.array([('1234',(1,2,3,4))], dtype='U4 ,4i4') print('h:',h.dtype,h[0]['f0'],h[0]['f1'])# f0,f1是系统分别为‘1234’,(1,2,3,4)起的别名 # dtype = {'names':[字段名称],‘formats’:[字段类型表]} i = np.array([('1234',(1,2,3,4)),('5678',(5,6,7,8))],dtype={'names':['fa','fb'],'formats':['U4','4i4']}) print('i:', i.dtype, i.shape, i[0]['fa'],i[0]['fb']) # dtype = [(字段名称,字段类型,字段维度),...] j = np.array([('1234',(1,2,3,4))], dtype=[('fa',np.str_,4),('fb',np.int32,4)]) print('j:',j.dtype,j.shape, j[0]['fa'],j[0]['fb']) k = np.array([('1234',(1,2,3,4))],dtype=[('fa','U4'),('fb','4i4')]) print('k:',k.dtype,k[0]['fa'],k[0]['fb']) # (基本类型,解释类型) # 0x表示16进制,‘u1’整型1个字节, l = np.array([0x1234],dtype=('<u2',{'names':['lo','hi'],'formats':['u1','u1']})) # :x表示16进制 print('{:x} {:x} {:x}'.format(l[0],l['lo'][0],l['hi'][0])) m = np.array([('ABC',(1,2,3))],dtype ='U3, 3u1') print(m)# [('ABC', [1, 2, 3])] print(m.dtype)# [('f0', '<U3'), ('f1', 'u1', (3,))] m1 = np.array([('ABC',(1,2,3))],dtype =[('fa', np.str_,3),('fb','3u1')]) print(m1)# [('ABC', [1, 2, 3])] print(m1.dtype)# [('fa', '<U3'), ('fb', 'u1', (3,))]
五、切片
数组[起始:终止:步长,...],
缺省起始:首(+步长)/尾(-步长)
缺省终止:尾后(+步长)/首前(-步长)
缺省步长:1
六、改变维度
1、视图变维:根据指定的新维度,构造新的元数据,实际数据共享
源数组.reshape(新维度) ->目标数组
\--------------/
共享实际数据
/--------------\
源数组.ravel()->一维目标数组
2、复制变维
源数组.flatten()->一维目标数组
| |
实际数据<- 非共享 ->实际数据副本
3、就地变维
源数组.shap = 新维度
原数组.resize(新维度)
4、转置
原数组.transpose()->转置视图
七、合并与拆分
1、垂直合分
vstack((上,下))->垂直合并
concatenate((上,下),axis=0)
shape:(3,4)
0 1
vsplit(数组,等分份数)->垂直拆分
split(数组,等分份数,axis=0)
2、水平合分
hstack((左,右))->水平合并
concatenate((左,右),axis=1)
hspilt(数组,等分份数)->水平拆分
3、深度合分
dstack((前,后))->深度合并
dsplit(数组,等分份数)->深度拆分
4、行列合并
row_stack((上,下))->行合并
column_stack((左,右))->列合并
八、ndarry对象的属性
dtype:元素的数据类型
shape:数组维度
ndim:数组维数
size:元素数,只有一维数组才等价于len()
itemsize:每个元素的字节数
nbytes:所有元素的总字节数,size × itemsize
T :转置视图
real/imag:复数数组的实部和虚部
flat:扁平迭代器
import numpy as np a = np.array([[1+1j, 2+4j, 3+7j], [4+2j, 5+5j, 6+8j], [7+3j, 8+6j, 9+9j],]) print(a.dtype)# complex128 print(a.shape)# (3, 3) print(a.ndim) # 2 print(a.size) # 9 print(a.itemsize) # 16 (16*8=128) print(a.nbytes) # 144 (9*16=144) print(a) """ [[1.+1.j 2.+4.j 3.+7.j] [4.+2.j 5.+5.j 6.+8.j] [7.+3.j 8.+6.j 9.+9.j]] """ print(a.T) """ [[1.+1.j 4.+2.j 7.+3.j] [2.+4.j 5.+5.j 8.+6.j] [3.+7.j 6.+8.j 9.+9.j]] """ print(a.real) """ [[1. 2. 3.] [4. 5. 6.] [7. 8. 9.]] """ print(a.imag) """ [[1. 4. 7.] [2. 5. 8.] [3. 6. 9.]] """ print(a.flat)# <numpy.flatiter object at 0x11259c0> for elem in a.flat: print(elem) """(1+1j)(2+4j)(3+7j)(4+2j)(5+5j)(6+8j)(7+3j)(8+6j)(9+9j)""" print(list(a)) # [array([1.+1.j, 2.+4.j, 3.+7.j]), array([4.+2.j, 5.+5.j, 6.+8.j]), array([7.+3.j, 8.+6.j, 9.+9.j])]
九、数据可视化(matplotlib,数学绘图库)
1、缺省样式
曲线图:plot(水平坐标数组,垂直坐标数组)
import matplotlib.pyplot as mp import numpy as np # 获取-π到π之间的数字组成的数组 x = np.linspace(-np.pi,np.pi,1000) sin_y = np.sin(x) cos_y = np.cos(x)/2 mp.plot(x,sin_y) mp.plot(x,cos_y) mp.show()
2、线性、线宽和颜色
plot(...,linestyle=线型,linewidth=线宽,color=颜色)
3、设置坐标范围
mp.xlim(最小水平坐标,最大水平坐标)
mp.ylim(最小垂直坐标,最大垂直坐标)
4、设置刻度标签
mp.xticks(刻度位置数组[,刻度文本数组])
mp.yticks(刻度位置数组[,刻度文本数组])
5、十字坐标轴
ax = mp.gca()#获取当前坐标图
ax.spinex['left'].set_position((坐标系,位置坐标))
ax.spinex['right'].set_color(颜色)
6、图例
plot(..,label=图例文本)
legend(loc=位置)
loc的取值:
lower right
center
center left
upper center
right
center right
lower center
lower left
upper right
upper left
best
7、添加特殊点
scatter(水平坐标数组,垂直坐标数组)
8、备注
annotate(备注文本,xy=目标位置,xycoords=目标坐标系,
xytext=备注位置,textcoords=备注坐标系,fontsize=字体大小,
arrowprops=箭头属性props)
import matplotlib.pyplot as mp import numpy as np # 获取-π到π之间的数字组成的数组 x = np.linspace(-np.pi,np.pi,1000) sin_y = np.sin(x) cos_y = np.cos(x)/2 # 添加特殊点 xo = np.pi*3/4 yo_sin = np.sin(xo) yo_cos = np.cos(xo)/2 # 设置边框位置 mp.xlim(x.min()*1.1,x.max()*1.1) mp.ylim(sin_y.min()*1.1,sin_y.max()*1.1) # 设置坐标位置 ax = mp.gca() ax.spines['left'].set_position(('data',0)) ax.spines['bottom'].set_position(('data',0)) ax.spines['top'].set_color('none') ax.spines['right'].set_color('none') # 设置刻度 mp.yticks([-1,-0.5,0,0.5,1]) mp.xticks([-np.pi,-np.pi/2,0,np.pi/2, np.pi*3/4, np.pi], [r'$-\pi$',r'$-\frac{\pi}{2}$',r'$0$',r'$\frac{\pi}{2}$',r'$\frac{3\pi}{4}$',r'$\pi$']) mp.plot(x,sin_y,linestyle='-',linewidth=2,color='orangered',label=r'$x=sin(x)$') mp.plot(x,cos_y,linestyle='-',linewidth=2,color='dodgerblue',label=r'$y=\frac{1}{2}cos(x)$') # 将特殊的点连线 mp.plot([xo,xo],[yo_cos,yo_sin],linestyle='--',linewidth='1',color='limegreen') # 表示点的大小,edgecolor表示边界颜色,facecolor表示内部颜色,zorder表示画的顺序,大值表示后画 mp.scatter([xo,xo],[yo_sin,yo_cos],s=60,edgecolor='limegreen',facecolor='white',zorder=3) # 添加备注 mp.annotate(r'$\frac{1}{2}cos(\frac{3\pi}{4})=-\frac{\sqrt{2}}{2}$', xy=(xo,yo_cos),xycoords='data', xytext=(-90,-40),textcoords='offset points', fontsize=14,arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=0.2')) mp.annotate(r'$sin(\frac{3\pi}{4})=-\frac{\sqrt{2}}{4}$', xy=(xo,yo_sin),xycoords='data', xytext=(20,20),textcoords='offset points', fontsize=14,arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=0.2')) # 图例的显示位置 mp.legend(loc='upper left') mp.show()
9、图形对象
figure(对象名(窗口标题),figsize=大小,dpi=分辨率,facecolor=颜色,..)
该方法会返回图像对象(创建/设置为当前)
10、子图
subplot(总行数,总列数,图号)
subplot(总行数×100+总列数×10+图号)
import matplotlib.pyplot as mp mp.figure('Subplot',facecolor='lightgray') mp.subplot(221) # 去掉刻度 mp.xticks(()) mp.yticks(()) # 写文本text(水平位置,处置位置,文本,水平对齐方式,垂直其方式,文本你大小,文本不透明度) mp.text(0.5,0.5,'1',ha='center',va='center',size=36,alpha=0.5) mp.subplot(222) mp.xticks(()) mp.yticks(()) mp.text(0.5,0.5,'2',ha='left',va='top',size=36,alpha=0.5) mp.subplot(223) mp.xticks(()) mp.yticks(()) mp.text(0.5,0.5,'3',ha='left',va='bottom',size=36,alpha=0.5) mp.subplot(224) mp.xticks(()) mp.yticks(()) mp.text(0.5,0.5,'4',ha='right',va='bottom',size=36,alpha=0.5) # 减小每个子图的之间的空隙 mp.tight_layout() mp.show()
11、栅格布局
import matplotlib.gridspec as mg
gs = mg.GridSpec(行数,列数)
subplot(gs[切片])
import matplotlib.pyplot as mp import matplotlib.gridspec as mg mp.figure('Gridspec',facecolor='lightgray') # 三行两列 gs = mg.GridSpec(3,2) # 第一行 mp.subplot(gs[0,:]) # 取消刻度 mp.xticks(()) mp.yticks(()) # 第2,3行,第一列 mp.subplot(gs[1:,0]) mp.xticks(()) mp.yticks(()) # 第2行2列 mp.subplot(gs[1,1]) mp.xticks(()) mp.yticks(()) # 第3行2列 mp.subplot(gs[2,1]) mp.xticks(()) mp.yticks(()) # 减小空隙 mp.tight_layout() mp.show()
12、自由布局
axes([左,底,宽,高])
说明:这些参数都是相对值
import matplotlib.pyplot as mp mp.figure('Axes',facecolor='lightgray') # 参数都是相对值 mp.axes([0.03,0.038,0.94,0.924]) mp.xticks(()) mp.yticks(()) mp.text(0.5,0.5,'1',ha='center',va='center',size=36,alpha=0.5) mp.axes([0.63,0.076,0.31,0.308]) mp.xticks(()) mp.yticks(()) mp.text(0.5,0.5,'2',ha='center',va='center',size=36,alpha=0.5) mp.show()
13、坐标刻度
xxxLocator(...)->坐标定位器对象
ax = mp.gca()#获取坐标对象
设置水平轴主刻度定位器
ax.xaxis.set_major_locator(坐标定位器)
设置水平次刻度定位器
ax.xaxis.set_minor_locatir(坐标定位器)
设置垂直主刻度定位器
ax.yaxis.set_magor_locator(坐标定位器)
设置处置次刻度定位器
ax.yaxis.set_minor_locator(坐标定位器)
import matplotlib.pyplot as mp import numpy as np mp.figure('Locator') locators = [ 'mp.NullLocator()', 'mp.MaxNLocator(nbins=3,steps=[1,3,5,7,9])',# 一共3个刻度, 'mp.FixedLocator(locs=[0,2.5,5,7.5,10])', 'mp.AutoLocator()', 'mp.IndexLocator(offset=0.5,base=1.5)',#offset表示起始刻度,base表示间隔 'mp.MultipleLocator()', 'mp.LinearLocator(numticks=21)',#numticks主刻度的个数 'mp.LogLocator(base=2,subs=[1.0])',#base表示底数,subs表示指数的步长 ] n_locators = len(locators) for i ,locator in enumerate(locators): mp.subplot(n_locators,1,i+1) # 设置x,y轴的取值范围 mp.xlim(0,10) mp.ylim(-1,1) # 取消y轴刻度 mp.yticks(()) ax = mp.gca() ax.spines['left'].set_color('none') ax.spines['right'].set_color('none') ax.spines['top'].set_color('none') ax.spines['bottom'].set_position(('data',0)) ax.xaxis.set_major_locator(eval(locator)) # MultipleLocator(等差值)多点定位器 ax.xaxis.set_minor_locator(mp.MultipleLocator(0.1)) mp.plot(np.arange(0,11),np.zeros(11),c='none') mp.text(5,0.3,locator[3:],ha='center',va='center',size=12) mp.tight_layout() mp.show()
14、散点图
scatter(水平坐标,垂直坐标,s=大小,c=颜色,cmap=颜色映射,
marker=点型,alpha=透明度,label=标签)
x y z
10 40 1
20 30 2
30 20 3
40 10 4
#jet深蓝到深红的渐变
scatter([10,20,30,40],[40,30,20,10],c=[1,2,3,4],cmap='jet')
import matplotlib.pyplot as mp import numpy as np n = 1000 # normal表示正态分布0表示平局值,1表示标准差 x = np.random.normal(0,1,n) y = np.random.normal(0,1,n) z = np.sqrt(x ** 2 + y ** 2) mp.figure('Scatter',facecolor='lightgray') mp.title('Scatter',fontsize=20) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(linestyle=':') mp.scatter(x,y,c=z,s=60,cmap='jet_r',alpha=0.5) mp.axis('equal')# x y 轴等轴 mp.show()
15、区域填充
fill_between(水平坐标,起始垂直坐标,终止垂直坐标,填充条件
color=颜色,alpha=透明度)
import matplotlib.pyplot as mp import numpy as np n = 1000 # normal表示正态分布0表示平局值,1表示标准差 x = np.linspace(0,8*np.pi,n) sin_y = np.sin(x) cos_y = np.cos(x/2)/2 mp.figure('Fill',facecolor='lightgray') mp.title('Fill',fontsize=20) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(linestyle=':') mp.plot(x,sin_y,color='dodgerblue',label=r'$y=sin(x)$') mp.plot(x,cos_y,color='orangered',label=r'$y=\frac{1}{2}cos(\frac{x}{2})$') mp.fill_between(x,sin_y,cos_y,cos_y<sin_y,color='dodgerblue',alpha=0.5) mp.fill_between(x,sin_y,cos_y,cos_y>sin_y,color='orangered',alpha=0.5) mp.legend() mp.show()
16、条形图
bar(水平坐标,高度,ec=边缘色,fc=填充色,label=标签)
说明:高度为正值,矩形条画在零轴上方,高度为负值,矩形条画在零轴下方
import matplotlib.pyplot as mp import numpy as np n = 12 # normal表示正态分布0表示平局值,1表示标准差 x = np.arange(n) y1 = (1-x/n) * np.random.uniform(0.5,1.0,n) y2 = (1-x/n) * np.random.uniform(0.5,1.0,n) mp.figure('Bar',facecolor='lightgray') mp.title('Bar',fontsize=20) mp.ylim(-1.25,1.25) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.xticks(x,x+1) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(axis='y',linestyle=':') # 只在y轴上画刻度 mp.bar(x,y1,ec='white',fc='dodgerblue',label='sample 1') # 显示数值 for _x,_y in zip(x,y1): mp.text(_x,_y,'%.2f'%_y, ha='left', va='bottom',size=8) mp.bar(x,-y2,ec='white',fc='dodgerblue',label='sample 2',alpha=0.5) for _x,_y in zip(x,y2): mp.text(_x,-_y-0.015,'%.2f'%_y, ha='left', va='top',size=8) mp.legend() mp.show()
17、等高线图
contourf(点阵水平坐标,点阵垂直坐标,点阵直立坐标,高差份数,cmap=颜色映射),填充
contour(点阵水平坐标,点阵垂直坐标,点阵直立坐标,
高差份数,colors=颜色,linewidths=线宽)
clabel(等高线图对象,inline_spacing=线内空白,fmt=格式化串,fontsize=字体大小)
import matplotlib.pyplot as mp import numpy as np n = 1000 # 生成1000*1000的交叉点 x, y = np.meshgrid(np.linspace(-3,3,n),np.linspace(-3,3,n)) # exp表示e的多少次方 z = (1-x/2 + x**5 + y**3)*np.exp(-x**2-y**2) y1 = (1-x/n) * np.random.uniform(0.5,1.0,n) y2 = (1-x/n) * np.random.uniform(0.5,1.0,n) mp.figure('Contour',facecolor='lightgray') mp.title('Contour',fontsize=20) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(linestyle=':') # 只在y轴上画刻度 # 画等高线图 mp.contourf(x,y,z,8,camp='jet') # 填充 cntr = mp.contour(x,y,z,8,colors='black',linewidth=0.5) # 不填充 mp.clabel(cntr,inline_spacing=1,fmt='%.1f',fontsize=10) mp.show()
填充图:
轮廓图:
填充加轮廓图
18、热成像图
inshow(矩阵,cmap=颜色映射,origin=纵轴方向)
说明:orgin默认值为heigh(向下增大),还有一个相反的取值low
0 1 2
-------
0 |1 2 3
1 |4 5 6
2 |7 8 9
import matplotlib.pyplot as mp import numpy as np n = 1000 # 生成1000*1000的交叉点 x, y = np.meshgrid(np.linspace(-3,3,n),np.linspace(-3,3,n)) # exp表示e的多少次方 z = (1-x/2 + x**5 + y**3)*np.exp(-x**2-y**2) y1 = (1-x/n) * np.random.uniform(0.5,1.0,n) y2 = (1-x/n) * np.random.uniform(0.5,1.0,n) mp.figure('Hot',facecolor='lightgray') mp.title('Hot',fontsize=20) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.xticks(np.linspace(1,1000,7),np.linspace(-3,3,7).astype(int)) mp.yticks(np.linspace(1,1000,7),np.linspace(-3,3,7).astype(int)) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(linestyle=':') # 只在y轴上画刻度 # 热成像图 mp.imshow(z,cmap='jet',origin='low')
# 设置图例 mp.colorbar().set_label('z',fontsize=14) mp.show()
19、饼图
pie(值,间隙,标签,颜色,格式,shadow=阴影,startangle=起始角度)
说明:画图的顺序是逆时针,startangle的值默认是0单位是度不是弧度
import matplotlib.pyplot as mp values = [26,17,21,29,11] spaces = [0.05,0.01,0.01,0.01,0.01]#间隙占半径的百分比 labels = ['Python','Javascrip','C++','C','PHP'] colors = ['dodgerblue','orangered','limegreen','violet','gold'] mp.figure('Pie',facecolor='lightgray') mp.title('Pie',fontsize=20) mp.pie(values,spaces,labels,colors,'%d%%',shadow=True,startangle=90) mp.axis('equal')#等轴 mp.show()
20、坐标线
grid(which='major/minor',axis='x/y/both',
linewidth=线宽,linestyle=线型,color=)
import matplotlib.pyplot as mp mp.figure('Grid',facecolor='lightgray') mp.title('Grid',fontsize=20) mp.xlabel('x',fontsize=14) mp.ylabel('y',fontsize=14) mp.xlim(0,10) mp.ylim(0,10) ax = mp.gca() ax.xaxis.set_major_locator(mp.MultipleLocator(1.0)) ax.xaxis.set_minor_locator(mp.MultipleLocator(0.1)) ax.yaxis.set_major_locator(mp.MultipleLocator(1.0)) ax.yaxis.set_minor_locator(mp.MultipleLocator(0.1)) mp.tick_params(labelsize=10)#刻度线的字体 #which默认值是major,axis默认值是both mp.grid(which='major',axis='x',linewidth=0.75,linestyle='-',color='lightgray') mp.grid(which='minor',axis='y',linewidth=0.25,linestyle='-',color='red') mp.show()
21、极坐标(ρ,θ)
mp.gca(projection='polar')
plot/scatter(θ,ρ,...)
import matplotlib.pyplot as mp import numpy as np t = np.linspace(0,2*np.pi,1000) r_spiral = 0.8*t r_rose = 5* np.sin(6*t) mp.figure('Polar',facecolor='lightgray') # 创建极坐标系 mp.gca(projection='polar') mp.title('Polar',fontsize=20) mp.xlabel(r'$\theta$',fontsize=14) mp.ylabel(r'$\rho$',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 mp.grid(linestyle=':') mp.plot(t,r_spiral, c='dodgerblue',label=r'$\rho=0.8\theta$') mp.plot(t,r_rose, c='orangered',label=r'$\rho=5sin(6\theta)$') mp.legend(loc='upper left') mp.show()
22、三维曲面
from mpl_toolkits.mplot3d import axes3d
ax = gca(projection='3d')
ax.set_xlabel(...)
ax.set_ylabel(...)
ax.set_zlabel(...)
ax.plot_surface(水平坐标,垂直坐标,直立坐标,
rstride=行距,cstride=列距,cmap=颜色映射),填充
ax.plot_wireframe(水平坐标,垂直坐标,直立坐标,
rstride=行距,cstride=列距,linewidth=线宽,color=颜色) ,轮廓
说明:rstride、cstride的值越小图像越润滑
import matplotlib.pyplot as mp import numpy as np from mpl_toolkits.mplot3d import axes3d n = 1000 # 生成1000*1000的交叉点 x, y = np.meshgrid(np.linspace(-3,3,n),np.linspace(-3,3,n)) # exp表示e的多少次方 z = (1 - x/2 + x**5 + y**3)*np.exp(-x**2-y**2) mp.figure('3D Surface') mp.title('3D Surface',fontsize=20) ax = mp.gca(projection='3d') ax.set_xlabel('x',fontsize=14) ax.set_ylabel('y',fontsize=14) ax.set_zlabel('z',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 ax.plot_surface(x,y,z,rstride=10,cstride=10,cmap='jet',linewidth=0) # 轮廓图 mp.figure('3D Wireframe') mp.title('3D Wireframe',fontsize=20) ax = mp.gca(projection='3d') ax.set_xlabel('x',fontsize=14) ax.set_ylabel('y',fontsize=14) ax.set_zlabel('z',fontsize=14) mp.tick_params(labelsize=10)#刻度线的字体 ax.plot_wireframe(x,y,z,rstride=20,cstride=20,linewidth=0.5,color='orangered') mp.show()
填充图:
轮廓图:
23、半对数坐标(y轴取对数,x轴是正常坐标)
semilogy(参数同plot)
import matplotlib.pyplot as mp import numpy as np y = np.array([1,10,100,1000,100,10,1]) mp.figure('Normal & Log',facecolor='lightgray') # 正常坐标 mp.subplot(211) mp.title('Nomal',fontsize=20) mp.ylabel('y',fontsize=14) ax =mp.gca() ax.xaxis.set_major_locator(mp.MultipleLocator(1)) ax.xaxis.set_minor_locator(mp.MultipleLocator(0.1)) ax.yaxis.set_major_locator(mp.MultipleLocator(250)) ax.yaxis.set_minor_locator(mp.MultipleLocator(50)) mp.tick_params(labelsize=10) ax.grid(which='major',axis='both',linewidth=0.75,linestyle='-',color='lightgray') ax.grid(which='minor',axis='both',linewidth=0.2,linestyle='-',color='lightgray') mp.plot(y,'o-',c='dodgerblue',label='plot') mp.ylim(min(y)-10,max(y)*1.1) mp.legend() # 半对数坐标 mp.subplot(212) mp.title('Log',fontsize=20) mp.ylabel('y',fontsize=14) ax =mp.gca() ax.xaxis.set_major_locator(mp.MultipleLocator(1)) ax.xaxis.set_minor_locator(mp.MultipleLocator(0.1)) ax.yaxis.set_major_locator(mp.MultipleLocator(250)) ax.yaxis.set_minor_locator(mp.MultipleLocator(50)) mp.tick_params(labelsize=10) ax.grid(which='major',axis='both',linewidth=0.75,linestyle='-',color='lightgray') ax.grid(which='minor',axis='both',linewidth=0.2,linestyle='-',color='lightgray') mp.semilogy(y,'o-',c='dodgerblue',label='semilogy') mp.legend() mp.ylim(min(y)-50,max(y)*1.1) mp.tight_layout() mp.show()
24、简单动画
import matplotlib.animations as ma
ma.FuncAnimation(图像对象,更新函数,生成器函数,interval=间隔毫秒)
说明:每间隔interval毫秒,调用一次更新函数
不带生成器函数
import numpy as np import matplotlib.pyplot as mp import matplotlib.animation as ma n_bubbles = 100 """ 使用numpy生成一个以为数组并且每个元素含有四个字段: position size growth color x y r g b alpha float float float float float float float float """ bubbles = np.zeros(n_bubbles,dtype=[ ('position',float,2),('size',float),('growth',float),('color',float,4)]) bubbles['position'] = np.random.uniform(0,1,(n_bubbles,2)) bubbles['size'] = np.random.uniform(50,750,n_bubbles) bubbles['growth'] = np.random.uniform(30,150,n_bubbles) bubbles['color'] = np.random.uniform(0,1,(n_bubbles,4)) mp.figure('Bubbles',facecolor='lightgray') mp.title('Bubbles',fontsize=20) mp.xticks(()) mp.yticks(()) sc = mp.scatter(bubbles['position'][:,0],bubbles['position'][:,1],s=bubbles['size'],c=bubbles['color']) def update(number): bubbles['size'] += bubbles['growth'] burst = number % n_bubbles bubbles['position'][burst] = np.random.uniform(0,1,2)#重新赋值一个位置 bubbles['size'][burst] = 0 bubbles['growth'][burst]=np.random.uniform(30,150) bubbles['color'][burst] = np.random.uniform(0,1,4) sc.set_offsets(bubbles['position']) sc.set_facecolors(bubbles['color']) # 将改变后的大小告诉sc sc.set_sizes(bubbles['size']) # 如果将这个对象赋值给一个变量则程序执行结束就结束,不会持续调用update anim = ma.FuncAnimation(mp.gcf(),update,interval=10) mp.show()
动态图
带生成器函数
import numpy as np import matplotlib.pyplot as mp import matplotlib.animation as ma mp.figure('Signal',facecolor='lightgray') mp.title('Signal',fontsize=20) mp.xlabel('Time',fontsize=14) mp.ylabel('Signal',fontsize=14) mp.xticks(()) ax = mp.gca() ax.set_ylim(-3,3) ax.set_xlim(0,10) mp.tick_params(labelsize=10) mp.grid(linestyle=':') # 获取曲线的对象,此时是空的,pl是:Line2D(_line0) # mp.plot([],[],c='orangered')得到的是一个对象集合[<matplotlib.lines.Line2D object at 0x7f458345df28>] pl = mp.plot([],[],c='orangered')[0] # 将我们的缓冲区域pl的缓冲区关联起来 pl.set_data([],[]) # data 是生成器生成的数据 def update(data): t, v = data x,y = pl.get_data() x.append(t) y.append(v) # 动态改变坐标 x_min, x_max = ax.get_xlim() if t >= x_max: ax.set_xlim(t-(x_max-x_min),t) ax.figure.canvas.draw() pl.set_data(x,y) def generator(): t =0 while True: # v 可以是从外界采集到的数据 v = np.sin(2*np.pi *t)*np.exp(np.sin(0.2*np.pi*t)) yield t,v t += 0.05 # 如果将这个对象赋值给一个变量则程序执行结束就结束,不会持续调用update anim = ma.FuncAnimation(mp.gcf(),update,generator,interval=5) mp.show()
动态图
十、numpy的通用函数
1、读文件
loadtxt(文件名,delimiter=分隔符,usecols=选择列,unpack=True/False,
dtype=目标类型,converters={列号:转换器})->二维数组/一维数组的元组
说明:unpack=True时得到的是二维数组,False得到的是一维数组的元组
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 #将字节序列转换为字符序列 dmy = str(dmy,encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, opening_prices, heighest_prices, lowset_prices, closing_prices \ = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(1,3,4,5,6),unpack=True, dtype=np.dtype('M8[D], f8, f8, f8, f8'),converters={1:dmy2ymd}) mp.figure('Candlestick', facecolor='lightgray') mp.title('Candlestick', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Price', fontsize=14) ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) # 得到一个布尔类型的数组 # 阳线,得到一个bool数组:收盘价大于开盘价为Ture,否则为False,掩码 rise = closing_prices - opening_prices >= 0.01 # 阴线,得到一个bool数组:收盘价小于开盘价为Ture,否则为False fall = opening_prices - closing_prices >= 0.01 # 设置颜色的rgb fc = np.zeros(dates.size, dtype='3f4') ec = np.zeros(dates.size, dtype='3f4') fc[rise],fc[fall] = (1,1,1),(0,0.5,0) # (1,1,1)表示白色 (0,0.5,0)表示绿色 ec[rise],ec[fall] = (1,0,0),(0,0.5,0) # (1,0,0)表示红色 # 画影线 mp.bar(dates,heighest_prices-lowset_prices,0,lowset_prices,color=fc,edgecolor=ec) #lowset_prices 表示底的位置 # 画实体 mp.bar(dates,closing_prices-opening_prices,0.8,opening_prices,color=fc,edgecolor=ec)# 表示底的位置 # 下标数据自适应 mp.gcf().autofmt_xdate() mp.show()
2、算数平均值
样本:S= [s1,s2,s3,...sn]
算数平均数:m = (s1+s2+s3+...+sn)/n
numpy.mean(S)-> m
s1 = s + d1, s表示实际值(真值),d1表示误差,s1表示实际值
s2 = s + d2
.......
sn = s + dn
m= s + (d1+d2...+dn)/n ,当n趋于无限大,(d1+d2...+dn)/n趋于0
算数平均值就是对真值的无偏估计
import numpy as np closing_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(6),unpack=True) mean = 0 for closing_price in closing_prices: mean += closing_price mean /= closing_prices.size print(mean) # 使用Numpy求平均数 mean = np.mean(closing_prices) print(mean)
3、加权平均值
样本:S= [s1,s2,s3,...sn]
权重:W=[w1,w2,w3,.....,wn]
加权平均值:a = (s1w1+s2w2+s3w3+...+snwn)/(w1+w2+w3+...+wn)
numpy.average(S,weights=W) ->a
算数平均值就是各个样本权重相等时的加权平局值
成交量加权平均价格
import numpy as np closing_prices, volumes = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(6,7),unpack=True) vwap = 0 for closing_price,volume in zip(closing_prices,volumes): vwap += closing_price*volume vwap /= volumes.sum() print(vwap) # 使用Numpy求平均数 vwap = np.average(closing_prices,weights=volumes) print(vwap)
时间加权平均价格(时间越近的影响较大)
import datetime as dt import numpy as np def dmy2days(dmy): dmy = str(dmy,encoding='utf-8') date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() days = (date - dt.date.min).days return days days,closing_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(1,6),unpack=True, converters={1: dmy2days}) twap = 0 for day,closing_price in zip(days,closing_prices): twap += closing_price*day twap /= days.sum() print(twap) # 使用Numpy求平均数 twap = np.average(closing_prices,weights=days) print(twap)
4、最大值和最小值
max/min:在一个数组中寻找最大值/最小值
argmax/argmin:在一个数组中寻找最大值/最小值的下标,将多维的转换为一维求下标
maximum/minimum:在两个数组的对应位置元素中寻找最大值/最小值
ptp:一个数组中寻找最大值和最小值之差
import numpy as np # 产生位于[10,100)区间的随机整数 a = np.random.randint(10,100,9).reshape(3,3) print(a) b, c = np.max(a),np.min(a) print(b,c) d ,e = np.argmax(a),np.argmin(a) print(d,e) """ 输出结果 [[44 32 78] [28 72 75] [74 14 81]] 81 14 8 7 """ f = np.random.randint(10,100,9).reshape(3,3) g, h = np.maximum(a,f),np.minimum(a,f) print('a:',a) print('f:',f) print("g:",g) print('h:',h) """ a: [[67 22 73] [87 93 94] [82 74 16]] f: [[67 33 50] [56 65 24] [52 21 54]] g: [[67 33 73] [87 93 94] [82 74 54]] h: [[67 22 50] [56 65 24] [52 21 16]] """
import numpy as np heightest_prices, lowest_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(4,5),unpack=True) # 常规方法实现 max_heightest_price, min_lowest_price = heightest_prices[0],lowest_prices[0] for heightest_price, lowest_price in zip(heightest_prices,lowest_prices): if max_heightest_price < heightest_price: max_heightest_price = heightest_price if min_lowest_price > lowest_price: min_lowest_price = lowest_prices print(max_heightest_price-min_lowest_price) # 使用Numpy方法实现 max_heightest_price, min_lowest_price = np.max(heightest_prices),np.min(lowest_prices) print(max_heightest_price-min_lowest_price)
import numpy as np heightest_prices, lowest_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(4,5),unpack=True) # 常规方法实现 max_heightest_price,min_heightest_price ,max_lowest_price,min_lowest_price \ = heightest_prices[0],heightest_prices[0],lowest_prices[0],lowest_prices[0] for heightest_price, lowest_price in zip(heightest_prices,lowest_prices): if max_heightest_price < heightest_price: max_heightest_price = heightest_price if min_heightest_price > heightest_price: min_heightest_price = heightest_price if max_lowest_price < lowest_price: max_lowest_price = lowest_price if min_lowest_price > lowest_price: min_lowest_price = lowest_price print("最大值幅度:",max_heightest_price - min_heightest_price) print("最小值幅度",max_lowest_price - min_lowest_price) # 使用Numpy方法实现 print("最大值幅度:",np.ptp(heightest_prices)) print("最小值幅度",np.ptp(lowest_prices))
5、中位数
将多个样本按照大小顺序排列,居于中间位置的元素即为中位数
13 22 27 31 43 :27是中位数
13 22 27 31 43 51 :(27+31)/2 是中位数
L:序列长度
M: (A[(L-1)/2]+A[L/2])/2
验证:
L=5
M=(A[4/2]+A[5/2])/2 =(A[2]+A[2])/2=27,下标下取整
L=6
M=(A[5/2]+A[6/2])/2 =(A[2]+A[3])/3=(27+31)/2
numpy.median(A)->M
import numpy as np closing_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(6),unpack=True) sorted_closing_prices = np.sort(closing_prices) #closing_prices.sort()#修改了原来的值,等价于closing_prices = np.sort(closing_prices) l = closing_prices.size median = (sorted_closing_prices[int((l-1)/2)]+sorted_closing_prices[int(l/2)])/2 print(median) median = np.median(closing_prices) print(median)
6、标准差
样本:S=[s1,s2,...sn]
均值:m=(s1+s2+..+sn)/n
离差:D=[s1-m,s2-m,...,sn-m]
离差方:Q=[(s1-m)^2,(s2-m)^2,...,(sn-m)^2]
方差:v = (q1+q2+...+qn)/n ,q1表示(s1-m)^2
标准差:std = sqrt(v),方均根误差
numpy.std(S,ddof=非自由因子) -> std
说明:
总体标准差:根号内除以n,ddof=0
样本标准差:根号内除以(n-1),doof=1
ddof默认为0,得到的结果是总体标准差
import numpy as np closing_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(6),unpack=True) mean = closing_prices.mean()# 算数平均值 devs = closing_prices - mean# 离差 dev2 = devs ** 2 # 离差方 pvar = dev2 .mean() # 总体方差 svar = dev2.sum()/(dev2.size-1) # 样本方差 pstd = np.sqrt(pvar) # 总体标准差 svtd = np.sqrt(svar) # 样本标准差 print("总体标准差:",pstd) print("样本标准差",svtd) # 使用Numpy方法 pstd = np.std(closing_prices) vstd = np.std(closing_prices,ddof=1) print("总体标准差:",pstd) print("样本标准差",svtd)
7、时间数据
1、通过布尔型掩码数组过滤数组中满足特定条件的元素:
数组[掩码数组],只有与掩码数组中值为True的元素相对应的元素可被访问
2、numpy.where(关系/逻辑表达)->满足关系表达/逻辑表达式的元素的下标数组
3、numpy.take(数组,下标数组)->提取数组中由下标数组所指示的元素
import numpy as np import datetime as dt def dmy2wday(dmy): """将日期转换为星期几的格式""" dmy = str(dmy,encoding='utf-8') date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() wday = date.weekday()# 0表示星期1 return wday wdays, closing_prices = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(1,6),unpack=True, converters={1:dmy2wday}) # 0 1 2 3 4 5 6 #MON TUE WED THU FRI SAT SUN ave_closing_price = np.zeros(5) for wday in range(ave_closing_price.size): # wdays== wday 会得到一个掩码(bool类型的数组) # ave_closing_price[wday] = closing_prices[wdays== wday].mean() # np.where(wdays== wday)得到的是掩码数组为真的下标数组 #ave_closing_price[wday] = closing_prices[np.where(wdays== wday)].mean() ave_closing_price[wday] = np.take(closing_prices, np.where(wdays == wday)).mean() for wday, ave_closing_price in zip(['MON','TUE','WED', 'THU', 'FRI', 'SAT', 'SUN'],ave_closing_price): # np.round(ave_closing_price,2)保留两位小数 print(wday,np.round(ave_closing_price,2))
numpy.apply_along_axis(N-1维函数,轴向,N维数组):
将N维数组按照轴向拆分成若干N-1维数组,作为参数传递给N-1维函数,并将该函数的返回值按照相同的轴向重新组合成数组返回给条用着
import numpy as np def foo(x): print("foo:",x) return x.sum(),x.mean(),x.std() a =np.array([1,2,3,4,5]) print(foo(a)) b = np.array([[1,2,3], [4,5,6], [7,8,9]]) print("-"*40) print(np.apply_along_axis(foo,0,b)) print("-"*40) print(np.apply_along_axis(foo,1,b))
import numpy as np import datetime as dt def dmy2wday(dmy): """将日期转换为星期几的格式""" dmy = str(dmy,encoding='utf-8') date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() wday = date.weekday()# 0表示星期1 return wday wdays, opening_prices,highest_prices,lowest_prices,closing_prices\ = np.loadtxt('data/aapl.csv',delimiter=',',usecols=(1,3,4,5,6),unpack=True, converters={1:dmy2wday}) # 0 1 2 3 4 5 6 #MON TUE WED THU FRI SAT SUN wdays = wdays[:16] opening_prices = opening_prices[:16] highest_prices = highest_prices[:16] lowest_prices = lowest_prices[:16] closing_prices = closing_prices[:16] first_monday = np.where(wdays ==0)[0][0] last_friday = np.where((wdays == 4))[0][-1] # 获取完成星期的的下标 indices = np.arange(first_monday,last_friday+1) indices = np.split(indices,3) # [array([1, 2, 3, 4, 5], dtype=int64), array([ 6, 7, 8, 9, 10], dtype=int64), array([11, 12, 13, 14, 15], dtype=int64)] def week_summary(indices): opening_price = opening_prices[indices[0]] highest_price = np.max(np.take(highest_prices,indices)) lowest_price = np.min(np.take(lowest_prices,indices)) closing_price = closing_prices[indices[-1]] return opening_price, highest_price,lowest_price ,closing_price summaries = np.apply_along_axis(week_summary,1,indices) print(summaries) # %g 紧凑浮点格式,360. 转换为360 np.savetxt('data/summary.csv',summaries,delimiter=',',fmt='%g')
8、卷积
激励函数:f(t)
单位响应函数:g(t)
则该激励函数作用下的响应为上二者的卷积:
/
|f(t)g(t)dt
/
在已知瞬间激励下的响应的条件下,求持续激励下的响应:
a = [1,2,3,4,5]
b = [6,7,8]
numpy.convolve(a,b,'valid/same/full'); a:表示被卷积数组,激励强度。b:表示核数组,单位激励的响应因数,第三个参数默认为full
0 0 1 2 3 4 5 0 0
8 7 6(结果:0*8+0*7+1*6=6)
8 7 6(结果:19)
8 7 6(结果:40)
8 7 6(结果:61)
8 7 6(结果:82)
8 7 6(结果:67)
8 7 6(结果:40)
结果集合6, 19,40,61,82,67,40就是完全卷积(full)
19,40,61,82,67是同维卷积(same)(维度与a相同)
40,61,82是有效卷积(valid)(计算时不需要补0)
import numpy as np a = np.arange(1,6) print(a)# [1 2 3 4 5] b = np.arange(6,9) print(b) # [6 7 8] c = np.convolve(a,b,'full') # 第三个参数默认值为full print(c) # [ 6 19 40 61 82 67 40] d= np.convolve(a, b, 'same') print(d) # [19 40 61 82 67] e= np.convolve(a, b, 'valid') print(e) # [40 61 82]
简单移动平均线和指数移动平均线
import numpy as np import datetime as dt import matplotlib.pyplot as mp import matplotlib.dates as md def dmy2ymd(dmy): dmy = str(dmy, encoding='utf-8') date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() ymd = date.strftime("%Y-%m-%d") return ymd dates, closing_prices = np.loadtxt("../data/aapl.csv", delimiter=',',usecols=(1,6), unpack=True, dtype=np.dtype('M8[D], f8'), converters= {1:dmy2ymd} ) # 简单移动平均线,取5天的平均值 sma51 = np.zeros(closing_prices.size-4) for i in range(sma51.size): sma51[i] = closing_prices[i:i+5].mean() # sma51 相当于sma52 sma52 = np.convolve(closing_prices, np.ones(5)/5, 'valid') # 简单移动平均线,取10天的平均值 sma10 = np.convolve(closing_prices, np.ones(10)/10, 'valid') # 权重数组 weights = np.exp(np.linspace(-1,0,5)) weights /= weights.sum() ema5 = np.convolve(closing_prices, weights[::-1], 'valid') mp.figure('Moving Average', facecolor='lightgray') mp.title('Moving Average', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Price', fontsize=14) ax = mp.gca() ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) ax.xaxis.set_minor_locator(md.DayLocator()) ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) mp.tick_params(labelsize=10) mp.grid(linestyle=":") dates = dates.astype(md.datetime.datetime) mp.plot(dates,closing_prices, c='lightgray', label='Closing Price') mp.plot(dates[4:],sma51, c='orangered', label='SMA-51') mp.plot(dates[4:],sma52, c='orangered', label='SMA-52', alpha=0.25, linewidth=6) mp.plot(dates[9:],sma10, c='dodgerblue', label='SMA-10') mp.plot(dates[4:],ema5, c='limegreen', label='EMA-5') mp.legend() mp.gcf().autofmt_xdate() mp.show()
布林带
中轨:移动平均线,反应趋势 - 策略
上轨:中轨+标准差*2,反应压力 - 买点
下轨:中轨-标准差*2,反应支撑 - 卖点
import numpy as np import datetime as dt import matplotlib.pyplot as mp import matplotlib.dates as md def dmy2ymd(dmy): dmy = str(dmy, encoding='utf-8') date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() ymd = date.strftime("%Y-%m-%d") return ymd dates, closing_prices = np.loadtxt("../data/aapl.csv", delimiter=',',usecols=(1,6), unpack=True, dtype=np.dtype('M8[D], f8'), converters= {1:dmy2ymd} ) N = 5 # 中轨线 medios = np.convolve(closing_prices, np.ones(N)/N, 'valid') stds = np.zeros(medios.size) for i in range(medios.size): stds[i] = closing_prices[i:i+N].std() stds *= 2 # 下轨 lowers = medios -stds # 上轨 uppers = medios + stds mp.figure('Bollinger Bands', facecolor='lightgray') mp.title('Bollinger Bands', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Price', fontsize=14) ax = mp.gca() ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) ax.xaxis.set_minor_locator(md.DayLocator()) ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) mp.tick_params(labelsize=10) mp.grid(linestyle=":") dates = dates.astype(md.datetime.datetime) mp.plot(dates,closing_prices, c='lightgray', label='Closing Price') mp.plot(dates[N-1:],medios, c='dodgerblue', label='Medio') mp.plot(dates[N-1:],lowers, c='limegreen', label='Lower') mp.plot(dates[N-1:],uppers, c='orangered', label='Upper') mp.legend() mp.gcf().autofmt_xdate() mp.show()
9、线性模型
1、线性预测
a b c d e f ?
N=3
A B C
aA+bB+cC = d
bA+cB+dC = e
cA+dB+eC = f
通过上面三个三元一次方程求出,A B C的值,从而求出g=dA+eB+fC
/a b c\ /A\ /d\
|b c d| * |B| = |e|
\c d e/ \C/ \f/
------- ---- ----
a x b
x = numpy.linalg.lstsq(a,b)
g = bx
import numpy as np import datetime as dt import matplotlib.pyplot as mp import matplotlib.dates as md import pandas as pd def dmy2ymd(dmy): dmy = str(dmy, encoding='utf-8') date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() ymd = date.strftime("%Y-%m-%d") return ymd dates, closing_prices = np.loadtxt("../data/aapl.csv", delimiter=',',usecols=(1,6), unpack=True, dtype=np.dtype('M8[D], f8'), converters= {1:dmy2ymd}) # 用5天预测第六天 N = 5 pred_prices = np.zeros(closing_prices.size - N*2 + 1) for i in range(pred_prices.size): a = np.zeros((N, N)) # a[0,] = closing_prices[0:N] # a[1,] = closing_prices[1:N+1] # a[2,] = closing_prices[2:N+2] for j in range(N): # a[j,] = closing_prices[j: N + j] a[j, ] = closing_prices[i+j: i+N+j] b = closing_prices[i+N: i+ N*2] x = np.linalg.lstsq(a, b)[0] pred_prices[i] = b.dot(x) # 两个数组相乘并将得到的数组中的元素相加 mp.figure('Linear Prediction', facecolor='lightgray') mp.title('Linear Prediction', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Price', fontsize=14) ax = mp.gca() ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) ax.xaxis.set_minor_locator(md.DayLocator()) ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) mp.tick_params(labelsize=10) mp.grid(linestyle=":") dates = dates.astype(md.datetime.datetime) mp.plot(dates,closing_prices, 'o-', c='lightgray', label='Closing Price') last_next_day = dates[-1] + pd.tseries.offsets.BDay() # 最后一个交易日的下一个交易日 dates = np.append(dates, last_next_day) mp.plot(dates[N*2:],pred_prices,'o-', c='orangered',label='Predicted Price' ) for date, price in zip(dates[N*2:], pred_prices): print(date, "->", price) mp.legend() mp.gcf().autofmt_xdate() mp.show()
2、线性拟合
x1 y1
x2 y2
...
xn yn
y = kx+b
y1 = kx1 +b
y2 = kx2 +b
....
yn = kxn +b
/x1 1\ /k\ /y1\
|x2 1| * |b| = |y2|
|....| \ / |..|
\xn b/ \yn/, 1表示b前面的系数
------ --- ----
a x b
x = numpy.linalg.lstsq(a,b)
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 #将字节序列转换为字符序列 dmy = str(dmy,encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, opening_prices, heighest_prices, lowset_prices, closing_prices \ = np.loadtxt('../data/aapl.csv',delimiter=',',usecols=(1,3,4,5,6),unpack=True, dtype=np.dtype('M8[D], f8, f8, f8, f8'),converters={1:dmy2ymd}) # 趋势点 trend_points = (heighest_prices+lowset_prices+closing_prices)/3 spreads = heighest_prices - lowset_prices # 压力点 resistance_points = trend_points + spreads # 支撑点 support_points = trend_points - spreads # 将日期类型的数据转换为整数 days = dates.astype(int) # 合并矩阵ones_like(days)生成一个与days维度相同的全1的数组 a = np.column_stack((days,np.ones_like(days))) x = np.linalg.lstsq(a, trend_points)[0] trend_line = days * x[0]+ x[1] # 压力线 x = np.linalg.lstsq(a, resistance_points)[0] resistance_line = days * x[0]+ x[1] # 支撑线 x = np.linalg.lstsq(a, support_points)[0] support_line = days * x[0]+ x[1] mp.figure('Trend', facecolor='lightgray') mp.title('Trend', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Price', fontsize=14) ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) # 得到一个布尔类型的数组 # 阳线,得到一个bool数组:收盘价大于开盘价为Ture,否则为False,掩码 rise = closing_prices - opening_prices >= 0.01 # 阴线,得到一个bool数组:收盘价小于开盘价为Ture,否则为False fall = opening_prices - closing_prices >= 0.01 # 设置颜色的rgb fc = np.zeros(dates.size, dtype='3f4') ec = np.zeros(dates.size, dtype='3f4') fc[rise],fc[fall] = (1,1,1),(0.85,0.85,0.85) ec[rise],ec[fall] = (0.85,0.85,0.85),(0.85,0.85,0.85) # 画影线 mp.bar(dates,heighest_prices-lowset_prices,0,lowset_prices,color=fc,edgecolor=ec) #lowset_prices 表示底的位置 # 画实体 mp.bar(dates,closing_prices-opening_prices,0.8,opening_prices,color=fc,edgecolor=ec)# 表示底的位置 mp.scatter(dates, trend_points, c='dodgerblue',alpha=0.5, s=60, zorder=2) mp.scatter(dates, resistance_points, c='orangered',alpha=0.5, s=60, zorder=2) mp.scatter(dates, support_points, c='limegreen',alpha=0.5, s=60, zorder=2) mp.plot(dates, trend_line,c='dodgerblue', linewidth=3, label='Trend') mp.plot(dates, resistance_line,c='orangered', linewidth=3, label='Resistance') mp.plot(dates, support_line,c='limegreen', linewidth=3, label='Support') # 下标数据自适应 mp.gcf().autofmt_xdate() mp.show()
10、裁剪、压缩和累乘
1、裁剪,clip
数组对象.clip(min=最小阈值,max=最大阈值),将数组中小于min时都设置为min,大于max设置为max,返回裁剪后的数组对象
2、压缩,compress
数组对象.compress(条件),返回满足条件的元素所组成的数组
3、累乘,prod
数组对象.prod(),返回元素的累乘之积
数组对象.cumprod,返回元素累乘的过程
import numpy as mp a = mp.arange(1,6) print(a) # [1 2 3 4 5] b = a.clip(min=2, max=4) print(b) # [2 2 3 4 4] c = a.compress((a>=2) & (a<=4)) print(c) # [2 3 4] d = a.prod() print(d) # 120 e = a.cumprod() print(e) # [ 1 2 6 24 120]
11、协方差、相关系数和相关矩阵
样本:
a:[a1,a2,...,an]
b:[b1,b2,...,bn]
均值:
ave(a) = (a1+a2+...+an)/n
ave(b) = (b1+b2+...+bn)/n
离差:
dev(a) = [a1,a2,...,an] - ave(a)
dev(b) = [b1,b2,...,bn] - ave(b)
方差:
var(a) = ave(dev(a)*dev(a))
var(b) = ave(dev(b)*dev(b))
标准差:
std(a) = sqrt(var(a))
std(b) = sqrt(var(b))
--------------------------------
a和b的协方差:cov(a,b) = ave(dev(a)*dev(b))
相关系数:cov(a,b)/std(a)*std(b) ,取值在[-1,1]正负号表示相关性为正相关或负相关,绝对值表示相关性的程度强弱
相关矩阵:
var(a) cov(a,b)
_____________ ______________
std(a)*std(a) std(a)std(b)
cov(b,a) var(b)
_____________ ______________
std(b)*std(a) std(b)std(b)
numpy.corrcoef(a,b) ==> / 1 相关系数\
\相关系数 1 /
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 #将字节序列转换为字符序列 dmy = str(dmy,encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, bhp_closing_prices \ = np.loadtxt('../data/bhp.csv',delimiter=',',usecols=(1,6),unpack=True, dtype=np.dtype('M8[D],f8'),converters={1:dmy2ymd}) vale_closing_price = np.loadtxt("../data/vale.csv",delimiter=',', usecols=(6,), unpack=True) # diff(),相邻的元素求差 bhp_returns = np.diff(bhp_closing_prices)/bhp_closing_prices[:-1] vale_returns = np.diff(vale_closing_price)/vale_closing_price[:-1] ave_a = bhp_returns.mean() dev_a = bhp_returns - ave_a var_a = (dev_a*dev_a).mean() std_a = np.sqrt(var_a) ave_b = vale_returns.mean() dev_b = vale_returns - ave_b var_b = (dev_b*dev_b).mean() std_b = np.sqrt(var_b) cov_ab = (dev_a*dev_b).mean() cov_ba = (dev_b*dev_a).mean() corr = np.array([ [var_a/(std_a*std_a),cov_ab/(std_a*std_b)], [cov_ba/(std_b*std_a),var_b/(std_b*std_b)], ]) print(corr) """ [[ 1. 0.67841747] [ 0.67841747 1. ]] """ # 使用numpy corr = np.corrcoef(bhp_returns,vale_returns) print(corr) """ [[ 1. 0.67841747] [ 0.67841747 1. ]] """ mp.figure('Correlation', facecolor='lightgray') mp.title('Correlation', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Returns', fontsize=14) # 收益 ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) # 得到一个布尔类型的数组 # 阳线,得到一个bool数组:收盘价大于开盘价为Ture,否则为False,掩码 # 设置颜色的rgb fc = np.zeros(dates.size, dtype='3f4') ec = np.zeros(dates.size, dtype='3f4') mp.plot(dates[:-1],bhp_returns, c='orangered', label="BHP") mp.plot(dates[:-1],vale_returns, c='dodgerblue', label="VALE") # 下标数据自适应 mp.gcf().autofmt_xdate() mp.legend() mp.show()
12、多项式拟合
y = p0x^n + p1x^n-1 + p2x^n-2+...+pn
numpy.polyfit(X,Y,n) -->[p0,p1,p2,....,pn]
numpy.polyval([p0,p1,p2,....,pn],X) -> Y
numpy.polyder([p0,p1,p2,....,pn]) - > [p0,p1,p2,....,pn-1],求导
numpy.root([p0,p1,p2,....,pn]) ->p0x^n + p1x^n-1 + p2x^n-2+...+pn=0方程的根
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 # 将字节序列转换为字符序列 dmy = str(dmy, encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, bhp_closing_prices \ = np.loadtxt('../data/bhp.csv', delimiter=',', usecols=(1, 6), unpack=True, dtype=np.dtype('M8[D],f8'), converters={1: dmy2ymd}) vale_closing_price = np.loadtxt("../data/vale.csv", delimiter=',', usecols=(6,), unpack=True) diff_closing_prices = bhp_closing_prices - vale_closing_price days = dates.astype(int) p = np.polyfit(days,diff_closing_prices,4) print(p) poly_closing_prices = np.polyval(p,days) q = np.polyder(p) roots = np.roots(q) # 取出实根 reals = roots[np.isreal(roots)].real peeks = [[days[0],np.polyval(p,days[0])]] for real in reals: if days[0] < real and real < days[-1]: peeks.append([real,np.polyval(p,real)]) peeks.append([days[-1],np.polyval(p,days[-1])]) peeks.sort() peeks = np.array(peeks) mp.figure('Polynomial Fitting', facecolor='lightgray') mp.title('Polynomial Fitting', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('difference price', fontsize=14) # ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) mp.plot(dates, poly_closing_prices, c='dodgerblue',linewidth=3,label='Polynomial Fitting') mp.scatter(dates,diff_closing_prices, c='limegreen', alpha=0.5, s=60, label='Difference Price') # 拆分 dates, prices = np.hsplit(peeks,2) dates = dates.astype(int).astype('M8[D]').astype(md.datetime.datetime) for i in range(1,dates.size): mp.annotate('', xytext=(dates[i-1],prices[i-1]),xy=(dates[i],prices[i]),size=40, arrowprops = dict(arrowstyle='fancy',color='orangered',alpha=0.25)) mp.scatter(dates, prices, marker='^',c='orangered',s=100,label='Peek',zorder=4) # 下标数据自适应 mp.gcf().autofmt_xdate() mp.legend() mp.show()
13、符号数组
1、numpy.sign([12 -8 -9 11 -7 -0 25]) -->[1 -1 -1 1 -1 0 1]
2、numpy.piecewise([12 -8 -9 11 -7 -0 25],[条件1,条件2,...],[取值1,取值2,...]),数组中的元素满足哪个条件就取哪个值
净额成交量(OBV)
[10 -5 -1]
[1000 2000 3000]
[1000 -2000 -3000]
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 # 将字节序列转换为字符序列 dmy = str(dmy, encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型,volumes是成交量 dates, closing_prices, volumes \ = np.loadtxt('../data/bhp.csv', delimiter=',', usecols=(1, 6, 7), unpack=True, dtype=np.dtype('M8[D],f8, f8'), converters={1: dmy2ymd}) # 差分 diff_closing_prices = np.diff(closing_prices) # sign_closing_prices = np.sign(diff_closing_prices) sign_closing_prices = np.piecewise(diff_closing_prices, [diff_closing_prices<0,diff_closing_prices==0,diff_closing_prices>0], [-1,0,1]) print(sign_closing_prices) obvs = volumes[1:] * sign_closing_prices mp.figure('On-Banlance Volumes', facecolor='lightgray') mp.title('On-Banlance Volumes', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('OBV', fontsize=14) # ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(axis='y',linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates[1:].astype(md.datetime.datetime) rise = obvs>0 fall = obvs<0 fc = np.zeros(dates.size, '3f4') ec = np.zeros(dates.size, '3f4') fc[rise], fc[fall] = (1,0,0),(0,0.5,0) ec[rise], ec[fall] = (1,1,1),(1,1,1) mp.bar(dates,obvs, color=fc, edgecolor=ec, label="OBV") # 下标数据自适应 mp.gcf().autofmt_xdate() mp.legend() mp.show()
14、矢量化
def 标量函数(标量):
...
return 标量返回值
矢量返回值 = numpy.vectorize(标量函数)(矢量参数)
import numpy as np def foo(x, y): return x + y, x - y, x * y a, b = 3, 4 c, d, e = foo(a,b) print(c, d, e) f, g = np.array([5, 6, 7]), np.array([8, 9, 10]) h, i, j = [], [], [] for x, y in zip(f, g): add, sub, mul = foo(x, y) h.append(add) i.append(sub) j.append(mul) h = np.array(h) i = np.array(i) j = np.array(j) print(h, i, j) # 下面的作用等效于上面的 h, i, j = np.vectorize(foo)(f, g) print(h, i, j)
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 #将字节序列转换为字符序列 dmy = str(dmy,encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy,'%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, opening_prices, highest_prices, lowset_prices, closing_prices \ = np.loadtxt('../data/bhp.csv',delimiter=',',usecols=(1,3,4,5,6),unpack=True, dtype=np.dtype('M8[D], f8, f8, f8, f8'),converters={1:dmy2ymd}) def profit(opening_price, highest_price, lowest_price, closing_price): buying_price = opening_price * 0.99 # 开盘价降低一个百分点 if lowest_price <= buying_price <= highest_price: return (closing_price - buying_price) * 100 / buying_price return np.nan profits = np.vectorize(profit)(opening_prices, highest_prices, lowset_prices, closing_prices) print(profits) nan = np.isnan(profits) dates, profits = dates[~nan], profits[~nan] # 获取有效的数据 gain_dates, gain_profits = dates[profits > 0], profits[profits > 0] # 盈利 loss_dates, loss_profits = dates[profits < 0], profits[profits < 0] # 亏损 mp.figure('Trading Simulation', facecolor='lightgray') mp.title('Trading Simulation', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Profit', fontsize=14) ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') if dates.size > 0: # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) mp.plot(dates, profits, c='gray', label='Profit') mp.axhline(y = profits.mean(), linestyle='--', color = 'gray') # 画水平线 if gain_dates.size > 0: gain_dates = gain_dates.astype(md.datetime.datetime) mp.plot(gain_dates, gain_profits, 'o', c='orangered', label='Gain Profit') mp.axhline(y=gain_profits.mean(), linestyle='--', color='orangered') # 画水平线 if loss_dates.size > 0: loss_dates = loss_dates.astype(md.datetime.datetime) mp.plot(loss_dates, loss_profits, 'o', c='limegreen', label='Loss Profit') mp.axhline(y=loss_profits.mean(), linestyle='--', color='limegreen') # 画水平线 mp.legend() mp.gcf().autofmt_xdate() mp.show()
15、数据平滑
步骤:
1、卷积降噪
2、曲线拟合(为了寻找交点)
3、计算特征
4、指导业务
import datetime as dt import numpy as np import matplotlib.pyplot as mp import matplotlib.dates as md # 用于画图时的日期 # unicode有几种编码:包括UCS-4,UTF-8等,默认读进来的是UTF-8,UTF-8的一个字符是1至4个字节 # USC-4每个字符是固定的4字节,符合数组同质性的要求 def dmy2ymd(dmy): """日月年转换为年月日""" # 文件读出来的日期是utf-8格式的字符串(每个字符所占的字节数在1-4之间),字节序列,非字符序列 # 将字节序列转换为字符序列 dmy = str(dmy, encoding='utf-8') # 将字符串解析为日期格式strptime中的p就是parse的缩写 date = dt.datetime.strptime(dmy, '%d-%m-%Y').date() # 将日期转换为日期字符串 # ymd = date.strftime('%Y-%m-%d') return date # M8[D]表示8字节的日期类型 dates, bhp_closing_prices = np.loadtxt('../data/bhp.csv', delimiter=',', usecols=(1, 6), unpack=True, dtype=np.dtype('M8[D],f8'), converters={1: dmy2ymd}) vale_closing_price = np.loadtxt("../data/vale.csv", delimiter=',', usecols=(6,), unpack=True) # diff(),相邻的元素求差 bhp_returns = np.diff(bhp_closing_prices) / bhp_closing_prices[:-1] vale_returns = np.diff(vale_closing_price) / vale_closing_price[:-1] N = 8 weights = np.hanning(N) print(weights) # [ 0. 0.1882551 0.61126047 0.95048443 0.95048443 0.61126047 0.1882551 0. ] weights /= weights.sum() # 求卷积 bhp_smooth_returns = np.convolve(bhp_returns, weights, 'valid') vale_smooth_returns = np.convolve(vale_returns, weights, 'valid') # 多项式拟合 days = dates[N-1:-1].astype(int) degree = 3 bhp_p = np.polyfit(days, bhp_smooth_returns, degree) bhp_fitted_returns = np.polyval(bhp_p, days) vale_p = np.polyfit(days, vale_smooth_returns, degree) vale_fitted_returns = np.polyval(vale_p, days) # 差函数的系数 sub_p = np.polysub(bhp_p, vale_p) roots = np.roots(sub_p) # 提取实根 reals = roots[np.isreal(roots)].real inters = [] for real in reals: if days[0] <= real and real <= days[-1]: inters.append([real, np.polyval(bhp_p, real)]) inters.sort() inters = np.array(inters) mp.figure('Smoothing Returns', facecolor='lightgray') mp.title('Smoothing Returns', fontsize=20) mp.xlabel('Date', fontsize=14) mp.ylabel('Returns', fontsize=14) # 收益 ax = mp.gca() # 主刻度以星期一为标志 ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO)) # 次刻度以天为单位,没有参数表示一天一个点, ax.xaxis.set_minor_locator(md.DayLocator()) # 格式化 31 Jan 2011 ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y')) # 设置标签大小 mp.tick_params(labelsize=10) # 设置网格线 mp.grid(linestyle=':') # 将numpy的日期类型转换为matiplotlib可以识别的格式 dates = dates.astype(md.datetime.datetime) mp.plot(dates[:-1], bhp_returns, alpha=0.25 , c='orangered', label="BHP") mp.plot(dates[:-1], vale_returns, alpha=0.25, c='dodgerblue', label="VALE") mp.plot(dates[N-1:-1], bhp_smooth_returns, alpha=0.75, c='orangered', label="Smooth BHP") mp.plot(dates[N-1:-1], vale_smooth_returns, alpha=0.75, c='dodgerblue', label="Smooth VALE") mp.plot(dates[N-1:-1], bhp_fitted_returns,linewidth=3, c='orangered', label="Fitted BHP") mp.plot(dates[N-1:-1], vale_fitted_returns,linewidth=3, c='dodgerblue', label="Fitted VALE") dates, returns = np.hsplit(inters,2) dates = dates.astype(int).astype('M8[D]').astype(md.datetime.datetime) mp.scatter(dates, returns, marker='x', c='firebrick', s=120, lw=3, zorder=3) # 下标数据自适应 mp.gcf().autofmt_xdate() mp.legend() mp.show()
二、矩阵和ufunc
1、创建矩阵
numpy中的矩阵是matrix类类型的对象,matrix类是ndarray类的子类,对某些专门针对矩阵的运算做了重载,增加部分新的属性和方法
numpy.matrix(可被解释为矩阵的二维容器,copy=是否复制数据[True])->矩阵对象
说明:copy = False时表示矩阵与二维容器共享数据等价于numpy.mat()
numpy.mat(可被解释为矩阵的二维容器) ->矩阵对象,数据共享
可被解释为矩阵的二维容器也可以用字符串表示,如:‘1 2 3;4 5 6;7 8 9’
numpy.bmat(子矩阵的块组合字符串)->组合矩阵
‘A B; C D’:A B C D都代表一个矩阵或可被解释为矩阵的二维容器
import numpy as np a = np.matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) print(a, type(a)) b = np.arange(1,10).reshape(3,3) print(b, type(b)) c = np.matrix(b) print(c, type(c)) d = np.matrix('1 2 3; 4 5 6;7 8 9 ') e = d # e 和 e 是一个对象 print(id(e), id(d)) f = np.mat(e) print(id(f)) g = np.matrix(f, copy=False) print(id(g)) h = np.matrix(g) print(id(h)) d += 10 print(d, e, f, g, h, sep='\n') print(b**2) ''' 1 2 3 1 2 3 1 4 9 4 5 6 * 4 5 6 = 16 25 36 7 8 9 7 8 9 49 64 81 ''' print(h**2) # 相当于数组中的点乘 b.dot(b) """ 1 2 3 * 4 5 6 7 8 9 1 2 3 30 36 42 4 5 6 66 81 96 6 7 8 102 126 150 每行与每列对应相乘再相加 """ print(b.T, h.T, sep='\n') i = np.mat('1 2 6; 3 5 7; 4 8 9') j = i.I # 逆矩阵,数组中没有这个方法 print(j) print(i*j) # 矩阵与逆矩阵的乘积等于单位矩阵(对角线为1,其余都为0) k = np.ones((2,2)) l = k * 2 m = k * 3 n = k * 4 o = np.bmat('k l; m n') print(type(o))
2、ufunc,通过函数,统一函数
numpy.ufunc是一个类,由该类所创建的对象可被当做函数调用,在执行过程中会根据所接收的矢量化参数中的元素依次调用其内部封装
的标量化函数,并将其返回的标量结果组织成矢量返回调用者
标量函数(标量) -> 标量
numpy.frompyfunc(标量函数,参数个数,返回值的个数) ->ufunc类型的对象
ufunc类型的对象(矢量)->矢量
import numpy as np def foo(x, y): return x + y, x - y, x * y a, b = 3, 4 c, d, e = foo(a,b) print(c, d, e) f, g = np.array([5, 6, 7]), np.array([8, 9, 10]) h, i, j = [], [], [] for x, y in zip(f, g): add, sub, mul = foo(x, y) h.append(add) i.append(sub) j.append(mul) h = np.array(h) i = np.array(i) j = np.array(j) print(h, i, j) # 下面的作用等效于上面的 uf = np.frompyfunc(foo, 2, 3) print(type(uf)) # <class 'numpy.ufunc'> h, i, j = uf(f, g) print(h, i, j)
3、加法ufunc预定义对象(省去了numpy.frompyfunc()这个步骤)
numpy.add.reduce(数组) ->元素累加和
numpy.add.accumulate(数组)->元素累加的过程
numpy.add.reduceat(数组,位置)->分段累加
numpy.add.outer(数组1,数组2)->外和,数组1放在行的位置上,数组2放在列的位置上求和
a:[1 2 3]
b:[4 5 6 7 ]
numpy.add.outer(a,b)->得到的二维数组就是外和
4 5 6 7
1 5 6 7 8
2 6 7 8 9
3 7 8 9 10
import numpy as np print(type(np.add)) # <class 'numpy.ufunc'> a = np.array([10, 20, 30]) b = np.array([1, 2, 3]) c= np.add(a, b) # 等价于 c = a+b print(c) # [11 22 33] e = np.add.reduce(a) print(e) # 60 f = np.add.accumulate(a) print(f) # [10 30 60] g = np.arange(1,7) ''' 1 2 3 4 5 6 0 1 2 3 4 5 ^ ^ ^ ''' h = np.add.reduceat(g,[0, 2, 4]) # 分段求和 print(h) # [3 7 11] i = np.add.outer(a, b) print(i) ''' 1 2 3 10 11 12 13 20 21 22 23 30 31 32 33 ''' j = np.outer(a,b) # 外积 print(j) ''' 1 2 3 10 10 20 30 20 20 40 60 30 30 60 90 '''
4、除法ufunc预定对象
1、真除:[5 5 -5 -5]<真除>[2 -2 2 -2]=[2.5 -2.5 -2.5 2.5]
numpy.true_divide()
numpy.divide()
/
2、地板除:[5 5 -5 -5]<地板除>[2 -2 2 -2]=[2 -3 -3 2]
numpy.floor_divide()
//
3、天花板除:[5 5 -5 -5]<天花板除>[2 -2 2 -2]=[3 -2 -2 3]
numpy.ceil(a/b).astype(int)
4、截断除:[5 5 -5 -5]<天截断除>[2 -2 2 -2]=[2 -2 -2 2]
numpy.trunc(a/b).astype(int)
(a/b).astype(int)
5、取余ufunc预定义对象
被除数<除以>除数=商...余数
5<除以>2 = 2...1
除数<乘以>商+余数 = 被除数
1、地板模
[5 5 -5 -5]<地板除>[2 -2 2 -2]=[2 -3 -3 2]...[1 -1 1 -1]
numpy.remainder()
numpy.mod()
%
2、截断模
[5 5 -5 -5]<天截断除>[2 -2 2 -2]=[2 -2 -2 2]...[1 1 -1 -1]
numpy.fmod()
import numpy as np a = np.array([5, 5, -5, -5]) b = np.array([2, -2, 2, -2]) c = np.true_divide(a, b) d = np.divide(a, b) e = a/b print(c, d, e, sep='\n') f = np.floor_divide(a, b) g = a // b print(f, g, sep='\n') h = np.ceil(a/b).astype(int) print(h) # [ 3 -2 -2 3] i = np.trunc(a/b).astype(int) print(i) # [ 2 -2 -2 2] j = (a/b).astype(int) print(j) # [ 2 -2 -2 2] # 模运算 k = np.remainder(a, b) l = np.mod(a, b) n = a % b print(k, l, n) # [ 1 -1 1 -1] [ 1 -1 1 -1] [ 1 -1 1 -1] m = np.fmod(a,b) print(m) # [ 1 1 -1 -1]
6、python语言中绝大部分的运算符都被numpy通过ufunc进行了重载定义,使之能够支持数组间的运算
1 1 2 3 5 8 13 21 ....?
f(n) = f(n-1)+f(n-2),n >= 3
f(1) = f(2) = 1
import numpy as np n = 35 # 递归实现 def fibo(n): return 1 if n <3 else fibo(n-1)+fibo(n-2) print(fibo(n)) # 循环实现 fn_1, fn_2 = 0, 1 # f(n-1), f(n-2) = 0, 1 for i in range(n): fn = fn_1 + fn_2 fn_1, fn_2 = fn, fn_1 print(fn) # numpy实现 res = (np.mat('1 1; 1 0') ** (n-1))[0, 0] print(res) ''' 使用斐波那契通项公式实现 __ n __ n /1 + V 5 \ /1 - V 5 \ |--------| - |--------| \ 2 / \ 2 / ------------------------------- __ V 5 ''' r = np.sqrt(5) res = int((((1+r)/2)**n - ((1-r)/2)**n) / r) print(res)
7、实现三角函数的ufanc预定义对象
利萨如曲线
lissa.py
方波发生器
4sin(1x) 4sin(3x) 4sin(5x)
------- + ---------- + ---------- +......
1pi 3pi 5pi 2k-1
1 2 3 k
squr.py
8、位运算
1、异或:^/__xor__/bitwise_xor,相同为0,不同为1
0 ^ 0 = 0
0 ^ 1 = 1
1 ^ 0 = 1
1 ^ 1 = 0
判断a、b是否同号(符号位0表示正数,1表示负数)
if a^b <0 then a和b 异号
2、与:&/__and__/bitwise_and
0 & 0 = 0
0 & 1 = 0
1 & 0 = 0
1 & 1 = 1
1 00000001 - 1 = 00000000
2 00000010 - 1 = 00000010
4 00000100 - 1 = 00000011
8 00001000 - 1 = 00000111
16 00010000 - 1 = 00001111
\------&-----/
|
0
if a & (a-1) == 0 then a 是2的幂
3、移位:
右移:>>/__rshift__/right_shift, >>1相当于/2
左移:<</__lshift__/left_shift, <<1相当于*2
import numpy as np a = np.array([10, 20, -30, 40, -50]) b = np.array([60, -70, -80, -90, -100]) # c = a ^ b # c = a.__xor__(b) c = np.bitwise_xor(a, b) print(c) print(a[c < 0], b[c < 0]) # [20 40] [-70 -90] # 判断一个数是否是2的幂 d = np.arange(1,21) print(d) # [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20] print(d[d & (d-1) == 0]) # [ 1 2 4 8 16] print(d[d.__and__(d-1) == 0]) print(d[np.bitwise_and(d, (d-1)) == 0])
三、Numpy的子模块
1、线性代数子模块(linalg)
1、矩阵的逆矩阵:inv
若矩阵A与矩阵B的乘积是一个单位矩阵,则称A和B互为可逆
矩阵:A * A^-1 = E , A必须是方阵(行列数相等)
import numpy as np A = np.mat('1 2 3; 8 9 4; 7 6 5') print(A) B = np.linalg.inv(A) print(B) C = np.linalg.inv(B) print(C) D = A * B print(D) E = np.mat('0 1 2 3;0 8 9 4;0 7 6 5') #np.linalg.inv(E) # numpy.linalg.linalg.LinAlgError: Last 2 dimensions of the array must be square, E不是方阵 # 可以通过I获取非方阵的逆矩阵(广义逆矩阵) print(E.I * E)
2、解线性方程组
/x-2y+z = 0
|2y-8z-8= 0
\-4x+5y+9z+9=0
/1x-2y+1z = 0
|0x+2y-8z = 8
\-4x+5y+9z= -9
/1 2 1 \ /x\ / 0\
|0 2 8 | * |y| = | 8|
\-4 5 9 / \z/ \-9/
---------- ---- ------
a x b
x = numpy.linalg.lstsq(a,b)[0] ,近似解,如果未知数与方程个数相同得到的是精确解
x = numpy.linalg.solve(a,b) ,精确解
import numpy as np a = np.mat('1 -2 1; 0 2 -8; -4 5 9') b = np.mat('0;8;-9') x = np.linalg.solve(a,b) x1 = np.linalg.lstsq(a,b)[0] print(x) print(x1)
3、特征值和特征向量
对于一个n阶的方阵A,如果存在一个数a和一个非零n维向量(包含n个元素)x,使得Ax=ax成立,则成a是矩阵A的特征值,
x是矩阵A属于特征值a的特征向量,一个方阵可以有任意个特征值和特征向量
x z
* x * a z
x z
A A A y
A A A y
A A A y
矩阵Y和矩阵Z相等
numpy.linalg.eig(A) ->[特征值1 特征值2...]
[[特征向量1 特征向量2...]
... ... ...]
import numpy as np A = np.mat('3 -2;1 0') print(A) eigvals, eigvecs = np.linalg.eig(A) print(eigvals, eigvecs, sep='\n') a = eigvals[0] x = eigvecs[:,0] print(A*x, a*x, sep='\n') a = eigvals[1] x = eigvecs[:,1] print(A*x, a*x, sep='\n')
4、奇异值分解
M = U * Sigma * V
| | |
正交矩阵 | 正交矩阵(矩阵与矩阵的转置相乘为单位矩阵)(单位矩阵:主对角线全为1其余为零)
UU^T=E | VV^T=E
只有主对角线上的元素非零,其他元素全部为零,主对角线上的非零元素称为矩阵M的奇异值
numpy.linalg.svd(M, full_matrices=False) ->U,奇异值(Sigma的主对角线元素组成的矩阵),V
import numpy as np M = np.mat('4 11 14; 8 7 -2') print(M) U, sv, V = np.linalg.svd(M,full_matrices=False) ''' full_matrices默认为True 为True时V的值为: [[-0.33333333 -0.66666667 -0.66666667] [ 0.66666667 0.33333333 -0.66666667] [-0.66666667 0.66666667 -0.33333333]] 为False时V的值为: [[-0.33333333 -0.66666667 -0.66666667] [ 0.66666667 0.33333333 -0.66666667]] ''' print(U, sv, V, sep='\n') print(U * U.T) print(V * V.T) Sigma = np.diag(sv) #将Sigma补全 print(Sigma, type(Sigma)) print(U * Sigma * V)
5、广义逆矩阵
广义逆矩阵是将矩阵求逆的运算法则由方阵推广到非方阵,只要A*B=E,即使A并非方阵,仍然可以称B为其广义逆矩阵。
非方阵的I属性即其广义逆矩阵
numpy.linalg.pinv(A) ->A的广义逆矩阵
import numpy as np A = np.mat('11 12 13 14; 20 21 22 15; 19 18 17 16') print(A) # B = np.linalg.inv(A) # 会报错numpy.linalg.linalg.LinAlgError: Last 2 dimensions of the array must be square B = np.linalg.pinv(A) print(B) C = A.I # C == B print(C) print(A * C) # 单位阵
6、行列式
二阶:
a b
c d
二阶行列式:ad - bc
三阶:
a b c
d e f
g h i
三阶行列式:
a e f - b d f + c d e ==> a(ei-fh) - b(di-fg) + c(dh-eg) ==>aei-afh-bdi+bfg+cdh-ceg
h i g i g h
numpy.linalg.det(方阵)->行列式的值
import numpy as np A = np.mat('2 1; 3 4') print(A, np.linalg.det(A)) B = np.mat('3 2 1; 4 9 8; 5 6 7') print(B, np.linalg.det(B))
2、快速傅里叶变换子模块(fft)
傅里叶定理:
任何周期函数,总可以被分解为有限个不同幅值、频率和初相位的正弦函数。
f(t) - 时间域
(A幅值,fai初相位)(w频率) - 频率域
numpy.fft.fftfreq(采样数,采样周期)->频率数组(Hz为单位)
numpy.fft.fft(时域信号)->频域复数数组
numpy.fft.ifft(频域复数数组)->时域信号
import numpy as np import numpy.fft as nf import matplotlib.pyplot as mp times = np.linspace(0, 2*np.pi,201) sigs1 = 4/(1 * np.pi) * np.sin(1 * times) sigs2 = 4/(3 * np.pi) * np.sin(3 * times) sigs3 = 4/(5 * np.pi) * np.sin(5 * times) sigs4 = 4/(7 * np.pi) * np.sin(7 * times) sigs5 = 4/(9 * np.pi) * np.sin(9 * times) sigs6 = sigs1+sigs2+sigs3+sigs4+sigs5 # 快速傅里叶变换 freqs = nf.fftfreq(times.size, times[1]-times[0]) ffts = nf.fft(sigs6) # 求复数的长度:abs对于实数来说数是求绝对值,对复数就是就模,就是求点到原点的距离 pows = np.abs(ffts) sigs7 = nf.ifft(ffts).real mp.figure('FFT', facecolor='lightgray') mp.subplot(121) mp.title("Time Domain", fontsize=16) mp.xlabel('Time', fontsize=12) mp.ylabel('Signal', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(times, sigs1, label='{:.4f}'.format(1/(2*np.pi))) mp.plot(times, sigs2, label='{:.4f}'.format(3/(2*np.pi))) mp.plot(times, sigs3, label='{:.4f}'.format(5/(2*np.pi))) mp.plot(times, sigs4, label='{:.4f}'.format(7/(2*np.pi))) mp.plot(times, sigs5, label='{:.4f}'.format(9/(2*np.pi))) mp.plot(times, sigs6, label='{:.4f}'.format(1/(2*np.pi))) mp.plot(times, sigs7, label='{:.4f}'.format(1/(2*np.pi)), alpha=0.5, linewidth=6) mp.legend() mp.tight_layout() mp.subplot(122) mp.title("Frequency Domain", fontsize=16) mp.xlabel('Frequency', fontsize=12) mp.ylabel('Power', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(freqs[freqs>=0], pows[freqs>=0], c='orangered', label='Frequency Spectrum') mp.legend() mp.tight_layout() mp.show()
import numpy as np import numpy.fft as nf import matplotlib.pyplot as mp import scipy.io.wavfile as wf # 读取WAV文件 # 样本的采样频率(每秒有多少个采样点),样本的信号值(采样点个数) sample_rate, noised_sigs = wf.read('../data/noised.wav') print(sample_rate) # 44100 print(noised_sigs.shape) # (220500,) # 时间数组 times = np.arange(len(noised_sigs))/sample_rate # numpy.fft.fftfreq(采样数,采样周期)->频率数组(Hz为单位) freqs = nf.fftfreq(times.size, d = 1/ sample_rate) # numpy.fft.fft(时域信号)->频域复数数组 noised_ffts = nf.fft(noised_sigs) # 振幅(求模) noised_pows = np.abs(noised_ffts) # 找到能量最大值对应的横坐标 print(noised_pows.argmax()) # 5000 fund_freq = np.abs(freqs[noised_pows.argmax()]) noised_indices = np.where(np.abs(freqs) != fund_freq) filter_ffts = noised_ffts.copy() # 把噪声置零 filter_ffts[noised_indices] = 0 filter_pows = np.abs(filter_ffts) print(fund_freq) # 1000.0 # 将去噪之后的数去还原为时间域 filter_sigs = nf.ifft(filter_ffts).real # 将去噪之后的声音保存为文件 wf.write('../data/filter.wav',sample_rate, filter_sigs.astype(np.int16)) mp.subplot(221) mp.title("Time Domain", fontsize=16) mp.ylabel('Signal', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(times[:178], noised_sigs[:178], label='Noised', c='orangered') mp.legend() mp.subplot(222) mp.title("Frequency Domain", fontsize=16) mp.ylabel('Signal', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') # 半对数坐标 mp.semilogy(freqs[freqs>=0], noised_pows[freqs>=0], label='Noised', c='limegreen') mp.legend() mp.subplot(223) mp.xlabel('Time', fontsize=12) mp.ylabel('Signal', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(times[:178], filter_sigs[:178], label='Filter', c='hotpink') mp.legend() mp.subplot(224) mp.xlabel('Frequency', fontsize=12) mp.ylabel('Power', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(freqs[freqs>=0], filter_pows[freqs>=0], label='Filter', c='dodgerblue') mp.legend() mp.tight_layout() mp.show()
3、随机数子模块(random)
1、二项分布
numpy.random.binomial(n,p,size)->size个随机数,每个随机数来自n次尝试中成功的次数,其中每次尝试成功的概率为p
猜硬币游戏:
初始筹码1000,每轮猜9次,每次猜对的概率0.5,猜对5次以及以上为赢,筹码加1,否则为输,筹码减1,问1w轮,筹码的变化轨迹
numpy.random.binomial(9,0.5,10000)
import numpy as np import matplotlib.pyplot as mp outcomes = np.random.binomial(9, 0.5, 10000) chips = [1000] # 获得每轮的成功次数 for outcome in outcomes: if outcome >= 5: chips.append(chips[-1]+1) else: chips.append((chips[-1]-1)) chips = np.array(chips) mp.figure('Binomial Distribution', facecolor='lightgray') mp.title('Binomial Distribution', fontsize=20) mp.xlabel('Round', fontsize=14) mp.ylabel('Chip', fontsize=14) mp.tick_params(labelsize=10) mp.grid(linestyle=":") o, h, l, c = 0, chips.argmax(), chips.argmin(), chips.size - 1 if chips[o] < chips[c]: color = 'orangered' elif chips[o] > chips[c]: color = 'limegreen' else: color = 'dodgerblue' mp.plot(chips, c=color, label='Chip') # 画平行于x轴的平行线 mp.axhline(y=chips[o], linestyle='--', color='deepskyblue', linewidth=1) mp.axhline(y=chips[h], linestyle='--', color='crimson', linewidth=1) mp.axhline(y=chips[l], linestyle='--', color='seagreen', linewidth=1) mp.axhline(y=chips[c], linestyle='--', color='orange', linewidth=1) # 画出四个点的位置 mp.scatter(o, chips[o], edgecolors='deepskyblue',s=60,label='Opening:%d'% chips[o],zorder=3) mp.scatter(h, chips[h], edgecolors='crimson',s=60,label='Highest:%d'% chips[h],zorder=3) mp.scatter(l, chips[l], edgecolors='seagreen',s=60,label='Lowest:%d'% chips[l],zorder=3) mp.scatter(c, chips[c], edgecolors='orange',s=60,label='Closing:%d'% chips[c],zorder=3) mp.legend() mp.show()
2、超几何分布
numpy.random.hypergeometric(ngood,ndad,nsample,size)->size个随机数,每个随机数来自随机抽取的nsample个样本中的好样本数,
总样本由ngood个好样本和nbad个坏样本组成
摸球游戏:
将25个好球和一个坏球放在一起,每轮随机摸3个球,全为好球加一分,否则减6份,问100轮,分值变化轨迹
numpy.random.hypergeometric(25,1,3,100)
import numpy as np import matplotlib.pyplot as mp outcomes = np.random.hypergeometric(25, 1, 3, 100) scores = [0] for outcome in outcomes: if outcome == 3: scores.append(scores[-1]+1) else: scores.append(scores[-1]-6) scores = np.array(scores) mp.figure('Hypergeometric Distribution', facecolor='lightgray') mp.title('Hypergeometric Distribution', fontsize=20) mp.xlabel('Round', fontsize=14) mp.ylabel('Score', fontsize=14) mp.tick_params(labelsize=10) mp.grid(linestyle=":") o, h, l, c = 0, scores.argmax(), scores.argmin(), scores.size - 1 if scores[o] < scores[c]: color = 'orangered' elif scores[o] > scores[c]: color = 'limegreen' else: color = 'dodgerblue' mp.plot(scores, c=color, label='Chip') # 画平行于x轴的平行线 mp.axhline(y=scores[o], linestyle='--', color='deepskyblue', linewidth=1) mp.axhline(y=scores[h], linestyle='--', color='crimson', linewidth=1) mp.axhline(y=scores[l], linestyle='--', color='seagreen', linewidth=1) mp.axhline(y=scores[c], linestyle='--', color='orange', linewidth=1) # 画出四个点的位置 mp.scatter(o, scores[o], edgecolors='deepskyblue',s=60,label='Opening:%d'% scores[o],zorder=3) mp.scatter(h, scores[h], edgecolors='crimson',s=60,label='Highest:%d'% scores[h],zorder=3) mp.scatter(l, scores[l], edgecolors='seagreen',s=60,label='Lowest:%d'% scores[l],zorder=3) mp.scatter(c, scores[c], edgecolors='orange',s=60,label='Closing:%d'% scores[c],zorder=3) mp.legend() mp.show()
3、标准正态分布(平均值为0,标准差为1)
numpy.randmom,normal(size) -> size个服从标准正态分布的随机数
import numpy as np import matplotlib.pyplot as mp samples = np.random.normal(size=10000) mp.figure('Normal Distribution', facecolor='lightgray') mp.title('Normal Distribution', fontsize=20) mp.xlabel('Sample', fontsize=14) mp.ylabel('Occurence', fontsize=14) mp.tick_params(labelsize=10) mp.grid(axis='y', linestyle=":") # 画直方图hist(samples, 画多少个条数,)normed=True,表示已比例的方式显示 bins = mp.hist(samples, 100, normed=True,edgecolor='steelblue', facecolor='deepskyblue',label='Normal')[1] probs = np.exp(-bins ** 2 / 2) / np.sqrt(2*np.pi) # 标准正态分布公式 mp.plot(bins, probs, 'o-', c='orangered', label='Probability') mp.legend() mp.show()
四、numpy的专用函数
1、联合间接排序
a数组:b数组: c:
张三 27 0 170
李四 22 1 165
王五 25 2 175
赵刘 22 3 158
... ...
直接排序:22 22 25 27
间接排序:3 1 2 0 ——>有序下标
\ /
-----
联合参考序列c的升序
numpy.lexsort((参考序列c,待排序列b))->待排序列中的有序下标
numpy.sort_complex(复数数组)->有序复数数组,按照实部升序,实部相同参考虚部升序
max/min/argmin/argmin将数组中的无效值nan视作正负无穷大,即是最大值也是最小值
nanmax/nanmin/nanargmin/nanargmin在排除数组中的无效值之后,计算其最大值和最小值
numpy.searchsorted(有序数组,被插入数组)——>将被插入数组中的元素插入到有序数组中,不改变其有序性的位置数组。
numpy.insert(原数组,位置数组,被插入数组)->将被插入数组中元素插入到原数组由位置数组所标记的位置处,返回插入后的结果
import numpy as np a = np.array(['Z3', 'L4', 'W5', 'Z6']) b = np.array([27, 22, 25, 22]) c = np.array([170, 165, 175, 158]) print(np.lexsort((c,b))) print(a[np.lexsort((c,b))]) d = b + c*1j print(d) e = np.sort_complex(d) print(e) f = np.array([13, 11,np.nan, 19, 17]) print(np.nanmax(f),np.nanmin(f)) # 将空值忽略19.0 11.0 print(np.nanargmax(f), np.nanargmin(f)) # 将空值忽略3 1 # 0 1 2 3 4 5 6 g = np.array([1, 2, 4, 5, 6, 8, 9]) h = np.array([7, 3]) i = np.searchsorted(g, h) print(i) # [5 2] j = np.insert(g, i, h) print(j)
2、插值
import scipy.interpolate as si
一维插值器 = si.interp1d(离散样本水平坐标,离散样本垂直坐标,kind=‘插值算法’)
插值样本垂直坐标 = 一维插值器(插值样本水平坐标)
插值算法:
1、linear,缺省,线性插值
2、cubic,三次样条插值
import numpy as np import scipy.interpolate as si import matplotlib.pyplot as mp min_x, max_x = -2.5, 2.5 con_x = np.linspace(min_x, max_x, 1001) # 幅值逐渐衰减的函数 con_y = np.sinc(con_x) dis_x = np.linspace(min_x, max_x, 11) dis_y = np.sinc(dis_x) # 线性插值 linear = si.interp1d(dis_x, dis_y) lin_x = np.linspace(min_x,max_x, 51) lin_y = linear(lin_x) # 样条插值器 cubic = si.interp1d(dis_x, dis_y, kind='cubic') cub_x = np.linspace(min_x,max_x, 51) cub_y = cubic(cub_x) mp.figure('Interpolation', facecolor='lightgray') mp.subplot(221) mp.title('Interpolation', fontsize=16) mp.ylabel('y', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(con_x, con_y, c='hotpink', label='Continuous') mp.legend() mp.subplot(222) mp.title('Discrete', fontsize=16) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.scatter(dis_x, dis_y, c='orangered', label='Discrete') mp.legend() mp.subplot(223) mp.title('Linear', fontsize=16) mp.xlabel('x', fontsize=12) mp.ylabel('y', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(lin_x, lin_y,'o-', c='limegreen', label='Continuous') mp.scatter(dis_x, dis_y, c='orangered', label='Discrete', zorder=3,s=60) mp.legend() mp.subplot(224) mp.title('Cubic', fontsize=16) mp.xlabel('x', fontsize=12) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(cub_x, cub_y,'o-', c='dodgerblue', label='Cubic') mp.scatter(dis_x, dis_y, c='orangered', label='Discrete', zorder=3,s=60) mp.legend() mp.show()
3、定积分
import scipy.integrate as si
si.quad(积分函数,积分下限,积分上限)[0]->积分结果
/b
| f(x)dx
/a
import numpy as np import matplotlib.pyplot as mp import matplotlib.patches as mc import scipy.integrate as si def f(x): return 2*x**2 + 3*x + 4 a, b =-5, 5 x1 = np.linspace(a, b, 1001) y1 = f(x1) n = 50 x2 = np.linspace(a, b, n+1) y2 = f(x2) area = 0 for i in range(n): area += (y2[i] + y2[i+1]) * (x2[i+1] - x2[i]) / 2 print(area) # 206.8 # 使用定积分计算面积 area = si.quad(f ,a, b)[0] print(area) # 206.66666666666669 mp.figure('Intergral', facecolor='lightgray') mp.title('Intergral', fontsize=16) mp.xlabel('x', fontsize=14) mp.ylabel('y', fontsize=14) mp.tick_params(labelsize=10) mp.grid(linestyle=':') mp.plot(x1, y1, c='orangered',linewidth=8, label=r'$y=2x^2+3x+4$', zorder=0) # 画梯形,给出梯形的四个顶点 for i in range(n): mp.gca().add_patch( mc.Polygon([ [x2[i],0], [x2[i],y2[i]], [x2[i+1],y2[i+1]], [x2[i+1],0],], fc='deepskyblue',ec='dodgerblue',alpha=0.5) ) mp.legend() mp.show()
4、金融计算
import numpy as np # 终值 = fv(利率,期数,每期支付,现值) # 将1000元以1%的利率存入银行5年,每年加存100元,到期后本息合计多少钱 fv = np.fv(0.01, 5, -100, -1000) print(round(fv,2)) # 1561.11 # 现值 = pv(利率,期数,每期支付,终值) # 将多少钱以1%的利率存入银行5年,每年加存100元,到期后本息合计fv钱 pv = np.pv(0.01, 5, -100, fv) print(pv) # -1000.0 # 净现值 = npv(利率,现金流) # 将1000元以1%的利率存入银行5年,每年加存100元,相当于一次存入多少钱 npv = np.npv(0.01,[-1000, -100,-100,-100,-100,-100]) print(round(npv,2)) # -1485.34 fv = np.fv(0.01, 5, 0, npv) print(round(fv,2)) # 1561.11 # 内部收益率 = irr(现金流) # 将1000元存入银行5年,以后逐年提现100,200,300,400,500。银行年利率达到多少,可在最后一次提现后偿还全部本息 # 相当于净现值为0的利率 irr = np.irr([-1000,100,200,300,400,500]) print(round(irr,2)) # 0.12 npv = np.npv(irr,[-1000,100,200,300,400,500]) print(round(npv,2)) # 0.0 # 每期支付 = pmt(利率,期数,现值),终值为0 # 以1%的年利率从银行贷款1000元,份5年还清,平均每年还多少钱? pmt = np.pmt(0.01, 5, 1000) print(round(pmt,2)) # -206.04 # 期数 = nper(利率,每期支付,现值) # 以1%的年利率从银行贷款1000元,平均每年还pmt元,多少年还清? nper = np.nper(0.01,pmt,1000) print(nper) # 利率 = rate(期数,每期支付, 现值,终值) # 以1%的年利率从银行贷款1000元,平均每年还pmt元,nper年还清,年利率是多少 rate = np.rate(nper, pmt, 1000, 0) print(rate) # 0.01