利用Python进行数据分析_Pandas_数据规整
数据规整
1.时间序列以及截面对齐
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import warnings
warnings.filterwarnings("ignore")
# Ten consecutive calendar days that index the price table.
date_range = pd.date_range(start="2023-01-01", end="2023-01-10", freq="D")
# One column per ticker.
stock_symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN']
n_days, n_stocks = len(date_range), len(stock_symbols)
# np.random.rand draws uniformly from [0.0, 1.0); scaling by 100 gives
# random prices in [0, 100).
prices_data = 100 * np.random.rand(n_days, n_stocks)
prices = pd.DataFrame(data=prices_data, index=date_range, columns=stock_symbols)
# Volumes cover only a subset of the price dates: every second day
# (01-01, 01-03, ...), so the two tables do not share all index labels.
subset_dates = date_range[::2]
volumes_data = np.random.randint(
    low=100000, high=500000, size=(len(subset_dates), n_stocks)
)
volumes = pd.DataFrame(data=volumes_data, index=subset_dates, columns=stock_symbols)
# Display the generated price table.
prices
AAPL | GOOGL | MSFT | AMZN | |
---|---|---|---|---|
2023-01-01 | 14.314858 | 3.770164 | 47.853384 | 52.129960 |
2023-01-02 | 49.168337 | 16.809880 | 50.745822 | 4.065592 |
2023-01-03 | 51.006419 | 87.196374 | 56.078768 | 9.049886 |
2023-01-04 | 21.995947 | 8.197457 | 41.555084 | 57.651605 |
2023-01-05 | 46.431166 | 83.819638 | 78.740614 | 80.649507 |
2023-01-06 | 55.849528 | 89.490260 | 22.954482 | 62.232844 |
2023-01-07 | 93.226985 | 56.326575 | 26.826220 | 55.494495 |
2023-01-08 | 78.876867 | 88.315382 | 58.793917 | 13.255849 |
2023-01-09 | 69.001249 | 48.880222 | 49.288958 | 56.896331 |
2023-01-10 | 8.710692 | 96.346660 | 40.227193 | 18.281541 |
volumes
AAPL | GOOGL | MSFT | AMZN | |
---|---|---|---|---|
2023-01-01 | 300532 | 364114 | 475873 | 490740 |
2023-01-03 | 464820 | 447201 | 323960 | 427487 |
2023-01-05 | 470647 | 452899 | 205389 | 168949 |
2023-01-07 | 113168 | 482721 | 184416 | 387871 |
2023-01-09 | 271816 | 232564 | 285368 | 121470 |
# Element-wise product of the two frames. pandas aligns them on index and
# columns first, so the dates that exist only in `prices` come out as NaN.
prices*volumes
AAPL | GOOGL | MSFT | AMZN | |
---|---|---|---|---|
2023-01-01 | 4.302073e+06 | 1.372770e+06 | 2.277213e+07 | 2.558226e+07 |
2023-01-02 | NaN | NaN | NaN | NaN |
2023-01-03 | 2.370880e+07 | 3.899431e+07 | 1.816728e+07 | 3.868709e+06 |
2023-01-04 | NaN | NaN | NaN | NaN |
2023-01-05 | 2.185269e+07 | 3.796183e+07 | 1.617246e+07 | 1.362565e+07 |
2023-01-06 | NaN | NaN | NaN | NaN |
2023-01-07 | 1.055031e+07 | 2.719002e+07 | 4.947184e+06 | 2.152471e+07 |
2023-01-08 | NaN | NaN | NaN | NaN |
2023-01-09 | 1.875564e+07 | 1.136778e+07 | 1.406549e+07 | 6.911197e+06 |
2023-01-10 | NaN | NaN | NaN | NaN |
通过一组索引不同的Series构建一个DataFrame
import pandas as pd
import numpy as np
# Four samples of different length and dtype. The random draws are made in a
# fixed order: normal floats, then integers, then uniform floats.
data1 = np.random.randn(5)  # 5 draws from the standard normal distribution
data2 = np.random.randint(1, 10, 6)  # 6 integers from [1, 10)
data3 = np.random.random(4)  # 4 floats from [0.0, 1.0)
data4 = ["apple", "banana", "orange", "grape"]  # plain strings

# Index labels of four unrelated kinds: integers, letters, dates and words.
index1 = list(range(10, 60, 10))
index2 = list('ABCDEF')
index3 = pd.date_range('2023-01-01', periods=4, freq='D')
index4 = ['one', 'two', 'three', 'four']

# Pair every sample with its own index.
series1, series2, series3, series4 = (
    pd.Series(values, index=labels)
    for values, labels in zip(
        (data1, data2, data3, data4),
        (index1, index2, index3, index4),
    )
)
# The resulting frame has the union of all labels as rows; each column is NaN
# wherever its own Series has no such label.
DataFrame({'one':series1,'two':series2, 'three':series3, 'four':series4})
C:\Users\hspcadmin\AppData\Local\Temp\ipykernel_13588\3484969428.py:21: RuntimeWarning: '<' not supported between instances of 'Timestamp' and 'int', sort order is undefined for incomparable objects.
DataFrame({'one':series1,'two':series2, 'three':series3, 'four':series4})
one | two | three | four | |
---|---|---|---|---|
10 | -0.967830 | NaN | NaN | NaN |
20 | -2.051181 | NaN | NaN | NaN |
30 | 0.816328 | NaN | NaN | NaN |
40 | 1.028584 | NaN | NaN | NaN |
50 | -0.017745 | NaN | NaN | NaN |
A | NaN | 7.0 | NaN | NaN |
B | NaN | 2.0 | NaN | NaN |
C | NaN | 6.0 | NaN | NaN |
D | NaN | 1.0 | NaN | NaN |
E | NaN | 5.0 | NaN | NaN |
F | NaN | 3.0 | NaN | NaN |
2023-01-01 00:00:00 | NaN | NaN | 0.851957 | NaN |
2023-01-02 00:00:00 | NaN | NaN | 0.241342 | NaN |
2023-01-03 00:00:00 | NaN | NaN | 0.514155 | NaN |
2023-01-04 00:00:00 | NaN | NaN | 0.093532 | NaN |
one | NaN | NaN | NaN | apple |
two | NaN | NaN | NaN | banana |
three | NaN | NaN | NaN | orange |
four | NaN | NaN | NaN | grape |
2.频率不同的时间序列的计算
# Three random values indexed by weekly dates anchored on Wednesday ('W-WED').
t = Series(np.random.randn(3),index=pd.date_range('2023-01-01',periods=3,freq='W-WED'))
t
2023-01-04 0.143276
2023-01-11 -0.917840
2023-01-18 -1.320858
Freq: W-WED, dtype: float64
# resample('B') targets business-day frequency. Without an aggregation or fill
# method it only returns a Resampler object, not data.
tt_resample = t.resample('B')
tt_resample
<pandas.core.resample.DatetimeIndexResampler object at 0x000001775C717C50>
# Upsample to business days and forward-fill each weekly value into the gaps.
tt_resample_ffill = t.resample('B').ffill()
tt_resample_ffill
2023-01-04 0.143276
2023-01-05 0.143276
2023-01-06 0.143276
2023-01-09 0.143276
2023-01-10 0.143276
2023-01-11 -0.917840
2023-01-12 -0.917840
2023-01-13 -0.917840
2023-01-16 -0.917840
2023-01-17 -0.917840
2023-01-18 -1.320858
Freq: B, dtype: float64
# Alternative route: reindex onto the business-day index, then forward-fill.
# The displayed result is the same as for resample('B').ffill().
t.reindex(tt_resample_ffill.index).ffill()
2023-01-04 0.143276
2023-01-05 0.143276
2023-01-06 0.143276
2023-01-09 0.143276
2023-01-10 0.143276
2023-01-11 -0.917840
2023-01-12 -0.917840
2023-01-13 -0.917840
2023-01-16 -0.917840
2023-01-17 -0.917840
2023-01-18 -1.320858
Freq: B, dtype: float64
# 'Q-MAR': quarter-end frequency for a fiscal year that ends in March.
q_mar = pd.date_range('2023-01-01', periods=4, freq='Q-MAR')
# 'Q-SEP': quarter-end frequency for a fiscal year that ends in September.
# Both anchors put quarter ends on the last day of Mar/Jun/Sep/Dec, so the two
# ranges hold the same dates and differ only in their `freq` attribute.
q_sep = pd.date_range('2023-01-01', periods=4, freq='Q-SEP')
q_sep
DatetimeIndex(['2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31'], dtype='datetime64[ns]', freq='Q-SEP')
q_mar
DatetimeIndex(['2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31'], dtype='datetime64[ns]', freq='Q-MAR')
3.时间和“最当前”数据的选取
import akshare as ak
# Download tick data for symbol "sh600570" through akshare and display it.
stock_zh_a_tick_tx_js_df = ak.stock_zh_a_tick_tx_js(symbol="sh600570")
stock_zh_a_tick_tx_js_df
D:\Program Files\Python\Python311\Lib\site-packages\akshare\stock\stock_zh_a_tick_tx.py:27: UserWarning: 正在下载数据,请稍等
warnings.warn("正在下载数据,请稍等")
成交时间 | 成交价格 | 价格变动 | 成交量 | 成交金额 | 性质 | |
---|---|---|---|---|---|---|
0 | 09:25:02 | 28.55 | 0.00 | 485 | 1384675 | 卖盘 |
1 | 09:30:02 | 28.55 | 0.00 | 31 | 88492 | 买盘 |
2 | 09:30:05 | 28.56 | 0.01 | 172 | 491086 | 买盘 |
3 | 09:30:08 | 28.59 | 0.03 | 55 | 157070 | 买盘 |
4 | 09:30:11 | 28.55 | -0.04 | 142 | 405509 | 卖盘 |
... | ... | ... | ... | ... | ... | ... |
688 | 10:04:50 | 28.07 | 0.00 | 33 | 92641 | 卖盘 |
689 | 10:04:53 | 28.08 | 0.01 | 5 | 14038 | 买盘 |
690 | 10:04:56 | 28.08 | 0.00 | 6 | 16847 | 买盘 |
691 | 10:04:59 | 28.07 | -0.01 | 77 | 216164 | 卖盘 |
692 | 10:05:02 | 28.07 | 0.00 | 129 | 362237 | 卖盘 |
693 rows × 6 columns
from datetime import time
# NOTE(review): `to_timestamp` is referenced without being called, so this line
# only displays the bound method and converts nothing.
stock_zh_a_tick_tx_js_df.to_timestamp
<bound method DataFrame.to_timestamp of 成交时间 成交价格 价格变动 成交量 成交金额 性质
成交时间
1900-01-01 09:25:02 09:25:02 28.55 0.00 485 1384675 卖盘
1900-01-01 09:30:02 09:30:02 28.55 0.00 31 88492 买盘
1900-01-01 09:30:05 09:30:05 28.56 0.01 172 491086 买盘
1900-01-01 09:30:08 09:30:08 28.59 0.03 55 157070 买盘
1900-01-01 09:30:11 09:30:11 28.55 -0.04 142 405509 卖盘
... ... ... ... ... ... ..
1900-01-01 10:04:50 10:04:50 28.07 0.00 33 92641 卖盘
1900-01-01 10:04:53 10:04:53 28.08 0.01 5 14038 买盘
1900-01-01 10:04:56 10:04:56 28.08 0.00 6 16847 买盘
1900-01-01 10:04:59 10:04:59 28.07 -0.01 77 216164 卖盘
1900-01-01 10:05:02 10:05:02 28.07 0.00 129 362237 卖盘
[693 rows x 6 columns]>
# Use the parsed '成交时间' column as a DatetimeIndex. The format string carries
# no date component, so every timestamp lands on 1900-01-01.
stock_zh_a_tick_tx_js_df.index = pd.to_datetime(stock_zh_a_tick_tx_js_df['成交时间'], format='%H:%M:%S')
stock_zh_a_tick_tx_js_df
成交时间 | 成交价格 | 价格变动 | 成交量 | 成交金额 | 性质 | |
---|---|---|---|---|---|---|
成交时间 | ||||||
1900-01-01 09:25:02 | 09:25:02 | 28.55 | 0.00 | 485 | 1384675 | 卖盘 |
1900-01-01 09:30:02 | 09:30:02 | 28.55 | 0.00 | 31 | 88492 | 买盘 |
1900-01-01 09:30:05 | 09:30:05 | 28.56 | 0.01 | 172 | 491086 | 买盘 |
1900-01-01 09:30:08 | 09:30:08 | 28.59 | 0.03 | 55 | 157070 | 买盘 |
1900-01-01 09:30:11 | 09:30:11 | 28.55 | -0.04 | 142 | 405509 | 卖盘 |
... | ... | ... | ... | ... | ... | ... |
1900-01-01 10:04:50 | 10:04:50 | 28.07 | 0.00 | 33 | 92641 | 卖盘 |
1900-01-01 10:04:53 | 10:04:53 | 28.08 | 0.01 | 5 | 14038 | 买盘 |
1900-01-01 10:04:56 | 10:04:56 | 28.08 | 0.00 | 6 | 16847 | 买盘 |
1900-01-01 10:04:59 | 10:04:59 | 28.07 | -0.01 | 77 | 216164 | 卖盘 |
1900-01-01 10:05:02 | 10:05:02 | 28.07 | 0.00 | 129 | 362237 | 卖盘 |
693 rows × 6 columns
# Keep the rows whose time of day lies between 09:50 and 10:01.
selected_rows = stock_zh_a_tick_tx_js_df.between_time(time(9, 50), time(10, 1))
selected_rows
成交时间 | 成交价格 | 价格变动 | 成交量 | 成交金额 | 性质 | |
---|---|---|---|---|---|---|
成交时间 | ||||||
1900-01-01 09:50:02 | 09:50:02 | 27.91 | -0.02 | 304 | 848440 | 卖盘 |
1900-01-01 09:50:05 | 09:50:05 | 27.90 | -0.01 | 879 | 2451488 | 卖盘 |
1900-01-01 09:50:08 | 09:50:08 | 27.89 | -0.01 | 149 | 415565 | 买盘 |
1900-01-01 09:50:11 | 09:50:11 | 27.88 | -0.01 | 114 | 317867 | 卖盘 |
1900-01-01 09:50:14 | 09:50:14 | 27.89 | 0.01 | 47 | 131033 | 买盘 |
... | ... | ... | ... | ... | ... | ... |
1900-01-01 10:00:47 | 10:00:47 | 28.05 | 0.01 | 15 | 42061 | 买盘 |
1900-01-01 10:00:50 | 10:00:50 | 28.04 | -0.01 | 37 | 103770 | 卖盘 |
1900-01-01 10:00:53 | 10:00:53 | 28.06 | 0.02 | 39 | 109408 | 买盘 |
1900-01-01 10:00:56 | 10:00:56 | 28.11 | 0.05 | 34 | 95503 | 买盘 |
1900-01-01 10:00:59 | 10:00:59 | 28.10 | -0.01 | 115 | 323101 | 卖盘 |
216 rows × 6 columns
# at_time keeps only rows stamped exactly 09:50:00. The result is empty here
# because no tick carries that exact time (the nearest one shown is 09:50:02).
selected_rows_at_time = stock_zh_a_tick_tx_js_df.at_time(time(9, 50))
selected_rows_at_time
成交时间 | 成交价格 | 价格变动 | 成交量 | 成交金额 | 性质 | |
---|---|---|---|---|---|---|
成交时间 |
4.拼接多个数据源
import akshare as ak
import pandas as pd
# Fetch daily history for each symbol and place the per-symbol frames side by
# side, with the symbol as the outer column level.
symbols = ['000001', '600570']
historical_data = {}
for symbol in symbols:
    # One request per symbol; the result is stored under its symbol.
    data = ak.stock_zh_a_hist(
        symbol=symbol,
        period="daily",
        start_date="20230101",
        end_date='20231219',
        adjust="qfq"
    )
    historical_data[symbol] = data
# Frames in insertion order, i.e. the same order as the dictionary keys.
df_list = list(historical_data.values())
# axis=1 concatenates column-wise; `keys` labels each frame's columns.
result_df = pd.concat(df_list, keys=historical_data.keys(), axis=1)
result_df
000001 | ... | 600570 | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
日期 | 开盘 | 收盘 | 最高 | 最低 | 成交量 | 成交额 | 振幅 | 涨跌幅 | 涨跌额 | ... | 开盘 | 收盘 | 最高 | 最低 | 成交量 | 成交额 | 振幅 | 涨跌幅 | 涨跌额 | 换手率 | |
0 | 2023-01-03 | 12.92 | 13.49 | 13.57 | 12.77 | 2194128 | 2.971547e+09 | 6.21 | 4.74 | 0.61 | ... | 40.22 | 41.77 | 42.04 | 39.98 | 188785 | 7.876238e+08 | 5.11 | 3.57 | 1.44 | 0.99 |
1 | 2023-01-04 | 13.43 | 14.04 | 14.14 | 13.35 | 2189683 | 3.110729e+09 | 5.86 | 4.08 | 0.55 | ... | 41.94 | 42.63 | 42.92 | 41.24 | 257842 | 1.088131e+09 | 4.02 | 2.06 | 0.86 | 1.36 |
2 | 2023-01-05 | 14.12 | 14.20 | 14.46 | 14.09 | 1665425 | 2.417272e+09 | 2.64 | 1.14 | 0.16 | ... | 42.19 | 41.96 | 42.61 | 41.57 | 99206 | 4.184562e+08 | 2.44 | -1.57 | -0.67 | 0.52 |
3 | 2023-01-06 | 14.22 | 14.34 | 14.44 | 14.20 | 1195745 | 1.747915e+09 | 1.69 | 0.99 | 0.14 | ... | 42.30 | 42.08 | 43.87 | 41.57 | 218465 | 9.297619e+08 | 5.48 | 0.29 | 0.12 | 1.15 |
4 | 2023-01-09 | 14.47 | 14.52 | 14.60 | 14.24 | 1057659 | 1.561368e+09 | 2.51 | 1.26 | 0.18 | ... | 41.45 | 42.40 | 42.98 | 40.77 | 191544 | 8.106186e+08 | 5.25 | 0.76 | 0.32 | 1.01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
229 | 2023-12-13 | 9.38 | 9.16 | 9.39 | 9.15 | 1061302 | 9.810012e+08 | 2.55 | -2.76 | -0.26 | ... | 29.78 | 29.34 | 30.16 | 29.33 | 235863 | 7.014522e+08 | 2.79 | -1.28 | -0.38 | 1.24 |
230 | 2023-12-14 | 9.21 | 9.15 | 9.28 | 9.15 | 742901 | 6.832115e+08 | 1.42 | -0.11 | -0.01 | ... | 29.60 | 29.17 | 29.87 | 29.15 | 155482 | 4.586276e+08 | 2.45 | -0.58 | -0.17 | 0.82 |
231 | 2023-12-15 | 9.20 | 9.21 | 9.35 | 9.19 | 988939 | 9.151261e+08 | 1.75 | 0.66 | 0.06 | ... | 29.32 | 28.77 | 29.36 | 28.71 | 185556 | 5.368904e+08 | 2.23 | -1.37 | -0.40 | 0.98 |
232 | 2023-12-18 | 9.18 | 9.13 | 9.24 | 9.09 | 654426 | 5.993790e+08 | 1.63 | -0.87 | -0.08 | ... | 28.55 | 28.49 | 29.18 | 28.30 | 160454 | 4.597516e+08 | 3.06 | -0.97 | -0.28 | 0.84 |
233 | 2023-12-19 | 9.12 | 9.10 | 9.17 | 9.07 | 644071 | 5.867167e+08 | 1.10 | -0.33 | -0.03 | ... | 28.48 | 28.66 | 28.81 | 28.27 | 150101 | 4.282943e+08 | 1.90 | 0.60 | 0.17 | 0.79 |
234 rows × 22 columns
5.收益指数和累计收益
import akshare as ak
# Daily history for symbol 600570 from 2023-01-01 to 2023-12-19 (adjust="qfq").
stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="600570", period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
stock_zh_a_hist_df
日期 | 开盘 | 收盘 | 最高 | 最低 | 成交量 | 成交额 | 振幅 | 涨跌幅 | 涨跌额 | 换手率 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-03 | 40.22 | 41.77 | 42.04 | 39.98 | 188785 | 7.876238e+08 | 5.11 | 3.57 | 1.44 | 0.99 |
1 | 2023-01-04 | 41.94 | 42.63 | 42.92 | 41.24 | 257842 | 1.088131e+09 | 4.02 | 2.06 | 0.86 | 1.36 |
2 | 2023-01-05 | 42.19 | 41.96 | 42.61 | 41.57 | 99206 | 4.184562e+08 | 2.44 | -1.57 | -0.67 | 0.52 |
3 | 2023-01-06 | 42.30 | 42.08 | 43.87 | 41.57 | 218465 | 9.297619e+08 | 5.48 | 0.29 | 0.12 | 1.15 |
4 | 2023-01-09 | 41.45 | 42.40 | 42.98 | 40.77 | 191544 | 8.106186e+08 | 5.25 | 0.76 | 0.32 | 1.01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
229 | 2023-12-13 | 29.78 | 29.34 | 30.16 | 29.33 | 235863 | 7.014522e+08 | 2.79 | -1.28 | -0.38 | 1.24 |
230 | 2023-12-14 | 29.60 | 29.17 | 29.87 | 29.15 | 155482 | 4.586276e+08 | 2.45 | -0.58 | -0.17 | 0.82 |
231 | 2023-12-15 | 29.32 | 28.77 | 29.36 | 28.71 | 185556 | 5.368904e+08 | 2.23 | -1.37 | -0.40 | 0.98 |
232 | 2023-12-18 | 28.55 | 28.49 | 29.18 | 28.30 | 160454 | 4.597516e+08 | 3.06 | -0.97 | -0.28 | 0.84 |
233 | 2023-12-19 | 28.48 | 28.66 | 28.81 | 28.27 | 150101 | 4.282943e+08 | 1.90 | 0.60 | 0.17 | 0.79 |
234 rows × 11 columns
# Index the frame by the '日期' column parsed as datetimes.
stock_zh_a_hist_df.index = pd.to_datetime(stock_zh_a_hist_df['日期'])
# Pick single trading days by their date label, e.g. "2023-01-05".
selected_date_1 = stock_zh_a_hist_df.loc['2023-01-05']
selected_date_2 = stock_zh_a_hist_df.loc['2023-12-19']
# Closing-price series, now indexed by date.
closed = stock_zh_a_hist_df['收盘']
closed
日期
2023-01-03 41.77
2023-01-04 42.63
2023-01-05 41.96
2023-01-06 42.08
2023-01-09 42.40
...
2023-12-13 29.34
2023-12-14 29.17
2023-12-15 28.77
2023-12-18 28.49
2023-12-19 28.66
Name: 收盘, Length: 234, dtype: float64
# Closing price on each of the two selected dates.
closing_price_1 = selected_date_1['收盘']
closing_price_2 = selected_date_2['收盘']
closing_price_1
41.96
closing_price_2
28.66
# Percentage change in closing price from the first date to the second.
change_rate = (closing_price_2/closing_price_1-1)*100
change_rate
-31.696854146806487
使用 pct_change() 方法来计算相邻两个元素之间的百分比变化。
# Daily simple returns; the first entry is NaN because it has no predecessor.
returns = closed.pct_change()
# Return index: running product of (1 + return), i.e. the growth of one unit
# invested at the first date.
ret_index = (1+returns).cumprod()
# Set the first entry to 1 by position. `ret_index[0]` on a date-indexed Series
# relies on the deprecated integer-as-position fallback; `.iloc` is explicit.
ret_index.iloc[0] = 1
ret_index
日期
2023-01-03 1.000000
2023-01-04 1.020589
2023-01-05 1.004549
2023-01-06 1.007422
2023-01-09 1.015083
...
2023-12-13 0.702418
2023-12-14 0.698348
2023-12-15 0.688772
2023-12-18 0.682068
2023-12-19 0.686138
Name: 收盘, Length: 234, dtype: float64
# Month-end value of the return index: 'BM' buckets by business month end and
# last() keeps the final observation in each bucket.
# NOTE(review): newer pandas releases deprecate the 'BM' alias in favour of
# 'BME' — confirm against the installed version.
m_returns = ret_index.resample('BM').last()
m_returns
日期
2023-01-31 1.127364
2023-02-28 1.059852
2023-03-31 1.271008
2023-04-28 1.179555
2023-05-31 1.013886
2023-06-30 1.060330
2023-07-31 0.984439
2023-08-31 0.861863
2023-09-29 0.776873
2023-10-31 0.745032
2023-11-30 0.718937
2023-12-29 0.686138
Freq: BM, Name: 收盘, dtype: float64
m_returns.pct_change()
日期
2023-01-31 NaN
2023-02-28 -0.059885
2023-03-31 0.199232
2023-04-28 -0.071953
2023-05-31 -0.140451
2023-06-30 0.045809
2023-07-31 -0.071574
2023-08-31 -0.124514
2023-09-29 -0.098611
2023-10-31 -0.040986
2023-11-30 -0.035026
2023-12-29 -0.045621
Freq: BM, Name: 收盘, dtype: float64
m_returns['2023-08-31']
0.8618625807996175
6.分组变换和分析
略
7.分组因子暴露
在进行因子分析之前,需要明确你想要分析的因子是什么。因子可以是与股票价格变动相关的任何特征,比如财务指标、技术指标、市值等。通常,因子分析的目标是找到与股票收益变化相关的因子。
以下是一个简单的例子,假设你想分析的因子是每个股票的收益率(以百分比形式)。你可以使用pct_change方法计算每个股票的收益率,并将其作为因子进行分析:
import akshare as ak
import pandas as pd
import statsmodels.api as sm
# Stock codes included in the regression sample.
stock_symbols = ['600570', '600313']
# Collect one frame per symbol and concatenate once after the loop, instead of
# growing a frame that starts out empty.
frames = []
for symbol in stock_symbols:
    # Daily history for this symbol.
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    # Tag every row with the symbol it belongs to.
    stock_data['证券代码'] = symbol
    frames.append(stock_data)
# ignore_index gives the stacked frame a unique 0..n-1 index.
combined_df = pd.concat(frames, ignore_index=True)
# Percentage return of the close, computed within each symbol so that the first
# row of one stock is never compared with the last row of the previous stock.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100
# Drop rows with missing values (at least the first row of every symbol).
combined_df = combined_df.dropna()
# Regress '涨跌幅' on '收益率' with an intercept term.
X = sm.add_constant(combined_df['收益率'])
y = combined_df['涨跌幅']
model = sm.OLS(y, X).fit()
# Display the regression summary.
model.summary()
Dep. Variable: | 涨跌幅 | R-squared: | 0.326 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.325 |
Method: | Least Squares | F-statistic: | 225.3 |
Date: | Wed, 20 Dec 2023 | Prob (F-statistic): | 8.11e-42 |
Time: | 11:28:06 | Log-Likelihood: | -921.04 |
No. Observations: | 467 | AIC: | 1846. |
Df Residuals: | 465 | BIC: | 1854. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -0.0438 | 0.081 | -0.542 | 0.588 | -0.203 | 0.115 |
收益率 | 0.3328 | 0.022 | 15.011 | 0.000 | 0.289 | 0.376 |
Omnibus: | 512.023 | Durbin-Watson: | 1.958 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 58498.105 |
Skew: | 4.665 | Prob(JB): | 0.00 |
Kurtosis: | 57.030 | Cond. No. | 3.66 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
因子暴露(Factor Exposure)是指股票对每个因子的敏感程度,即每个因子对于股票收益的贡献。可以通过回归分析来计算因子暴露。
import akshare as ak
import pandas as pd
import statsmodels.api as sm
# Stock codes included in the regression sample.
stock_symbols = ['600570', '600313']
# Collect one frame per symbol and concatenate once after the loop, instead of
# growing a frame that starts out empty.
frames = []
for symbol in stock_symbols:
    # Daily history for this symbol.
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    # Tag every row with the symbol it belongs to.
    stock_data['证券代码'] = symbol
    frames.append(stock_data)
# ignore_index gives the stacked frame a unique 0..n-1 index.
combined_df = pd.concat(frames, ignore_index=True)
# Percentage return of the close, computed within each symbol so that the first
# row of one stock is never compared with the last row of the previous stock.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100
combined_df['涨跌幅'] = combined_df['涨跌幅'].astype(float)
# Drop rows with missing values (at least the first row of every symbol).
combined_df = combined_df.dropna()
# Dependent variable: '涨跌幅'. Independent variable: '收益率' plus an intercept.
X = sm.add_constant(combined_df['收益率'])
y = combined_df['涨跌幅']
# Fit the ordinary-least-squares model.
model = sm.OLS(y, X).fit()
# The factor exposure is the fitted slope on '收益率'.
factor_exposure = model.params['收益率']
print(f"因子暴露: {factor_exposure}")
因子暴露: 0.33282243711466647
import akshare as ak
import pandas as pd
import statsmodels.api as sm
# Stock codes included in the regression sample.
stock_symbols = ['600570', '600313']
# Collect one frame per symbol and concatenate once after the loop, instead of
# growing a frame that starts out empty.
frames = []
for symbol in stock_symbols:
    # Daily history for this symbol.
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    # Tag every row with the symbol it belongs to.
    stock_data['证券代码'] = symbol
    frames.append(stock_data)
# ignore_index gives the stacked frame a unique 0..n-1 index.
combined_df = pd.concat(frames, ignore_index=True)
# Percentage return of the close, computed within each symbol so that the first
# row of one stock is never compared with the last row of the previous stock.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100
combined_df['涨跌幅'] = combined_df['涨跌幅'].astype(float)
combined_df['成交量'] = combined_df['成交量'].astype(float)
# Drop rows with missing values (at least the first row of every symbol).
combined_df = combined_df.dropna()
# Multiple regression. Dependent variable: '涨跌幅'. Independent variables:
# '收益率' and '成交量', plus an intercept.
X = sm.add_constant(combined_df[['收益率', '成交量']])
y = combined_df['涨跌幅']
# Fit the ordinary-least-squares model.
model = sm.OLS(y, X).fit()
# One exposure (fitted coefficient) per factor.
factor_exposure_returns = model.params['收益率']
factor_exposure_volume = model.params['成交量']
print(f"收益率因子暴露: {factor_exposure_returns}")
print(f"成交量因子暴露: {factor_exposure_volume}")
收益率因子暴露: 0.3172937471715912
成交量因子暴露: 2.146227651605541e-06
# Display the full summary of the multiple regression.
model.summary()
Dep. Variable: | 涨跌幅 | R-squared: | 0.361 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.358 |
Method: | Least Squares | F-statistic: | 130.8 |
Date: | Wed, 20 Dec 2023 | Prob (F-statistic): | 8.71e-46 |
Time: | 13:09:59 | Log-Likelihood: | -908.88 |
No. Observations: | 467 | AIC: | 1824. |
Df Residuals: | 464 | BIC: | 1836. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -0.5781 | 0.133 | -4.341 | 0.000 | -0.840 | -0.316 |
收益率 | 0.3173 | 0.022 | 14.523 | 0.000 | 0.274 | 0.360 |
成交量 | 2.146e-06 | 4.31e-07 | 4.980 | 0.000 | 1.3e-06 | 2.99e-06 |
Omnibus: | 454.093 | Durbin-Watson: | 1.926 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 47379.024 |
Skew: | 3.792 | Prob(JB): | 0.00 |
Kurtosis: | 51.758 | Cond. No. | 5.22e+05 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.22e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
import akshare as ak
import pandas as pd
# Stock codes to query.
stock_symbols = ['600004']
# Reporting year to keep (e.g. 2020).
desired_year = 2020
# Collect one filtered frame per symbol and concatenate once after the loop.
frames = []
for symbol in stock_symbols:
    # Financial analysis indicators for this symbol.
    stock_financial_analysis_indicator_data = ak.stock_financial_analysis_indicator(symbol=symbol)
    # Comparing '日期' directly with the integer year never matches (the
    # original filter returned 0 rows), so compare the year component instead.
    # Presumably '日期' holds report dates or date strings — verify against
    # akshare; unparseable values become NaT and are simply excluded.
    report_year = pd.to_datetime(
        stock_financial_analysis_indicator_data['日期'], errors='coerce'
    ).dt.year
    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of the downloaded data.
    filtered_df = stock_financial_analysis_indicator_data[report_year == desired_year].copy()
    # Tag every row with the symbol it belongs to.
    filtered_df['证券代码'] = symbol
    frames.append(filtered_df)
# Fall back to an empty frame when no symbol was requested.
stock_financial_analysis_indicator_df = pd.concat(frames) if frames else pd.DataFrame()
# Display the combined frame.
stock_financial_analysis_indicator_df
日期 | 摊薄每股收益(元) | 加权每股收益(元) | 每股收益_调整后(元) | 扣除非经常性损益后的每股收益(元) | 每股净资产_调整前(元) | 每股净资产_调整后(元) | 每股经营性现金流(元) | 每股资本公积金(元) | 每股未分配利润(元) | ... | 3年以内应收帐款(元) | 1年以内预付货款(元) | 1-2年以内预付货款(元) | 2-3年以内预付货款(元) | 3年以内预付货款(元) | 1年以内其它应收款(元) | 1-2年以内其它应收款(元) | 2-3年以内其它应收款(元) | 3年以内其它应收款(元) | 证券代码 |
---|
0 rows × 87 columns
8.十分位和四分位分析
# 通过pandas.qcut和groupby进行分位数分析
8.1 接入行情
import akshare as ak
# Daily history for symbol 600570. The result is bound to `stock_zh_a_hist_df`,
# the name displayed on the next line (the original bound a misspelled name).
stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="600570", period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
stock_zh_a_hist_df
日期 | 开盘 | 收盘 | 最高 | 最低 | 成交量 | 成交额 | 振幅 | 涨跌幅 | 涨跌额 | 换手率 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-03 | 40.22 | 41.77 | 42.04 | 39.98 | 188785 | 7.876238e+08 | 5.11 | 3.57 | 1.44 | 0.99 |
1 | 2023-01-04 | 41.94 | 42.63 | 42.92 | 41.24 | 257842 | 1.088131e+09 | 4.02 | 2.06 | 0.86 | 1.36 |
2 | 2023-01-05 | 42.19 | 41.96 | 42.61 | 41.57 | 99206 | 4.184562e+08 | 2.44 | -1.57 | -0.67 | 0.52 |
3 | 2023-01-06 | 42.30 | 42.08 | 43.87 | 41.57 | 218465 | 9.297619e+08 | 5.48 | 0.29 | 0.12 | 1.15 |
4 | 2023-01-09 | 41.45 | 42.40 | 42.98 | 40.77 | 191544 | 8.106186e+08 | 5.25 | 0.76 | 0.32 | 1.01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
229 | 2023-12-13 | 29.78 | 29.34 | 30.16 | 29.33 | 235863 | 7.014522e+08 | 2.79 | -1.28 | -0.38 | 1.24 |
230 | 2023-12-14 | 29.60 | 29.17 | 29.87 | 29.15 | 155482 | 4.586276e+08 | 2.45 | -0.58 | -0.17 | 0.82 |
231 | 2023-12-15 | 29.32 | 28.77 | 29.36 | 28.71 | 185556 | 5.368904e+08 | 2.23 | -1.37 | -0.40 | 0.98 |
232 | 2023-12-18 | 28.55 | 28.49 | 29.18 | 28.30 | 160454 | 4.597516e+08 | 3.06 | -0.97 | -0.28 | 0.84 |
233 | 2023-12-19 | 28.48 | 28.66 | 28.81 | 28.27 | 150101 | 4.282943e+08 | 1.90 | 0.60 | 0.17 | 0.79 |
234 rows × 11 columns
import akshare as ak
# Fetch the daily history of a single stock.
stock_zh_a_hist_df = ak.stock_zh_a_hist(
symbol="600570",
period="daily",
start_date="20230101",
end_date="20231219",
adjust="qfq"
)
# Display the history frame.
stock_zh_a_hist_df
日期 | 开盘 | 收盘 | 最高 | 最低 | 成交量 | 成交额 | 振幅 | 涨跌幅 | 涨跌额 | 换手率 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-03 | 40.22 | 41.77 | 42.04 | 39.98 | 188785 | 7.876238e+08 | 5.11 | 3.57 | 1.44 | 0.99 |
1 | 2023-01-04 | 41.94 | 42.63 | 42.92 | 41.24 | 257842 | 1.088131e+09 | 4.02 | 2.06 | 0.86 | 1.36 |
2 | 2023-01-05 | 42.19 | 41.96 | 42.61 | 41.57 | 99206 | 4.184562e+08 | 2.44 | -1.57 | -0.67 | 0.52 |
3 | 2023-01-06 | 42.30 | 42.08 | 43.87 | 41.57 | 218465 | 9.297619e+08 | 5.48 | 0.29 | 0.12 | 1.15 |
4 | 2023-01-09 | 41.45 | 42.40 | 42.98 | 40.77 | 191544 | 8.106186e+08 | 5.25 | 0.76 | 0.32 | 1.01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
229 | 2023-12-13 | 29.78 | 29.34 | 30.16 | 29.33 | 235863 | 7.014522e+08 | 2.79 | -1.28 | -0.38 | 1.24 |
230 | 2023-12-14 | 29.60 | 29.17 | 29.87 | 29.15 | 155482 | 4.586276e+08 | 2.45 | -0.58 | -0.17 | 0.82 |
231 | 2023-12-15 | 29.32 | 28.77 | 29.36 | 28.71 | 185556 | 5.368904e+08 | 2.23 | -1.37 | -0.40 | 0.98 |
232 | 2023-12-18 | 28.55 | 28.49 | 29.18 | 28.30 | 160454 | 4.597516e+08 | 3.06 | -0.97 | -0.28 | 0.84 |
233 | 2023-12-19 | 28.48 | 28.66 | 28.81 | 28.27 | 150101 | 4.282943e+08 | 1.90 | 0.60 | 0.17 | 0.79 |
234 rows × 11 columns
8.2 计算日收益率
# Daily return: fractional change of the close relative to the previous row.
stock_zh_a_hist_df['日收益率'] = stock_zh_a_hist_df['收盘'].pct_change()
# Show the date, close and daily-return columns.
stock_zh_a_hist_df[['日期', '收盘', '日收益率']]
日期 | 收盘 | 日收益率 | |
---|---|---|---|
0 | 2023-01-03 | 41.77 | NaN |
1 | 2023-01-04 | 42.63 | 0.020589 |
2 | 2023-01-05 | 41.96 | -0.015717 |
3 | 2023-01-06 | 42.08 | 0.002860 |
4 | 2023-01-09 | 42.40 | 0.007605 |
... | ... | ... | ... |
229 | 2023-12-13 | 29.34 | -0.012786 |
230 | 2023-12-14 | 29.17 | -0.005794 |
231 | 2023-12-15 | 28.77 | -0.013713 |
232 | 2023-12-18 | 28.49 | -0.009732 |
233 | 2023-12-19 | 28.66 | 0.005967 |
234 rows × 3 columns
8.3 将收益率变换为趋势信号
# Turn a return series into a lagged rolling-sum trend signal.
def generate_trend_signal(returns, lookback, lag):
    """Return the rolling sum of *returns*, shifted by *lag* rows.

    Args:
        returns: pandas Series of periodic returns.
        lookback: Rolling window length, in rows.
        lag: Number of rows by which the summed signal is shifted, so the
            value on a given row is the sum that ended *lag* rows earlier.

    Returns:
        A Series on the same index as *returns*. Rows whose window holds
        fewer than ``lookback - 5`` valid observations are NaN, as are the
        first *lag* rows introduced by the shift.
    """
    # Tolerate up to five missing observations per window.
    min_obs = lookback - 5
    signal = returns.rolling(window=lookback, min_periods=min_obs).sum()
    return signal.shift(lag)
# Build the trend signal from a 100-row window, lagged by 3 rows.
lookback_period = 100
lag_period = 3
stock_zh_a_hist_df['趋势信号'] = generate_trend_signal(stock_zh_a_hist_df['日收益率'], lookback_period, lag_period)
# Show the frame including the trend-signal column.
stock_zh_a_hist_df[['日期', '收盘', '日收益率', '趋势信号']]
日期 | 收盘 | 日收益率 | 趋势信号 | |
---|---|---|---|---|
0 | 2023-01-03 | 41.77 | NaN | NaN |
1 | 2023-01-04 | 42.63 | 0.020589 | NaN |
2 | 2023-01-05 | 41.96 | -0.015717 | NaN |
3 | 2023-01-06 | 42.08 | 0.002860 | NaN |
4 | 2023-01-09 | 42.40 | 0.007605 | NaN |
... | ... | ... | ... | ... |
229 | 2023-12-13 | 29.34 | -0.012786 | -0.200829 |
230 | 2023-12-14 | 29.17 | -0.005794 | -0.203740 |
231 | 2023-12-15 | 28.77 | -0.013713 | -0.236405 |
232 | 2023-12-18 | 28.49 | -0.009732 | -0.234316 |
233 | 2023-12-19 | 28.66 | 0.005967 | -0.239846 |
234 rows × 4 columns
8.4 编写交易策略
根据每周五动量信号进行交易
import pandas as pd
import matplotlib.pyplot as plt
# Use the SimHei font for sans-serif text — presumably so the Chinese plot
# labels render; verify the font is installed on the machine.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Long/flat momentum strategy driven by the sign of the trend signal.
def weekly_momentum_strategy(df, signal_column='趋势信号'):
    """Add position, return and cumulative-return columns to *df*.

    The rule is applied to every row of *df* as given; this function does
    no weekly (Friday) resampling itself.

    Args:
        df: DataFrame with a '收盘' (close) column and the signal column.
            It is modified in place.
        signal_column: Name of the column holding the trend signal.

    Returns:
        The same *df* object, with these columns added or overwritten:
        '信号' (1 when the signal is > 0, else 0 — NaN signals give 0),
        '日收益率' (row-over-row return of the close),
        '策略收益率' (previous row's position times the current return),
        '累计收益率' (running product of 1 + strategy return).
    """
    # Position: hold (1) when the trend signal is positive, otherwise flat (0).
    df['信号'] = np.where(df[signal_column] > 0, 1, 0)
    # Row-over-row return of the close.
    df['日收益率'] = df['收盘'].pct_change()
    # The previous row's position earns the current row's return. The position
    # lives in '信号'; the original read a 'Position' column that is never
    # created, which raises KeyError.
    df['策略收益率'] = df['信号'].shift(1) * df['日收益率']
    # Compound the strategy returns into a cumulative return index.
    df['累计收益率'] = (1 + df['策略收益率']).cumprod()
    return df
# Run the strategy on the history frame to obtain the return index.
strategy_df = weekly_momentum_strategy(stock_zh_a_hist_df)
# Show the columns produced by the strategy.
strategy_df[['日期', '收盘', '趋势信号', '信号', '日收益率', '策略收益率', '累计收益率']]
日期 | 收盘 | 趋势信号 | 信号 | 日收益率 | 策略收益率 | 累计收益率 | |
---|---|---|---|---|---|---|---|
0 | 2023-01-03 | 41.77 | NaN | 0 | NaN | NaN | NaN |
1 | 2023-01-04 | 42.63 | NaN | 0 | 0.020589 | 0.0 | 1.000000 |
2 | 2023-01-05 | 41.96 | NaN | 0 | -0.015717 | -0.0 | 1.000000 |
3 | 2023-01-06 | 42.08 | NaN | 0 | 0.002860 | 0.0 | 1.000000 |
4 | 2023-01-09 | 42.40 | NaN | 0 | 0.007605 | 0.0 | 1.000000 |
... | ... | ... | ... | ... | ... | ... | ... |
229 | 2023-12-13 | 29.34 | -0.200829 | 0 | -0.012786 | -0.0 | 0.994311 |
230 | 2023-12-14 | 29.17 | -0.203740 | 0 | -0.005794 | -0.0 | 0.994311 |
231 | 2023-12-15 | 28.77 | -0.236405 | 0 | -0.013713 | -0.0 | 0.994311 |
232 | 2023-12-18 | 28.49 | -0.234316 | 0 | -0.009732 | -0.0 | 0.994311 |
233 | 2023-12-19 | 28.66 | -0.239846 | 0 | 0.005967 | 0.0 | 0.994311 |
234 rows × 7 columns
# Plot the strategy's cumulative return against the date.
plt.figure(figsize=(10, 6))
plt.plot(strategy_df['日期'], strategy_df['累计收益率'], label='累计收益率')
plt.title('每周五动量信号交易策略累计收益率')
plt.xlabel('日期')
plt.ylabel('累计收益率')
plt.legend()
plt.show()
本文来自博客园,作者:江雪独钓翁,转载请注明原文链接:https://www.cnblogs.com/zhouwp/p/17916499.html