其他
赠书 | Python 预测股票价格,竟然这么简单
var hq_str_sh601006="大秦铁路,6.980,6.960,7.010,7.070,6.950,7.010,7.020,121033256,847861533.000,18900, 7.010,214867,7.000,66500,6.990,386166,6.980,336728,6.970,273750,7.020,836066,7.030,630800,7.040,936306,7.050,579400,7.060,2016-03-18,15:00:00,00";
各字段含义:0:股票名字(大秦铁路);1:今日开盘价(6.980);2:昨日收盘价(6.960);3:当前价格(7.010);4:今日最高价(7.070);5:今日最低价(6.950);6:竞买价,即"买一"报价(7.010);7:竞卖价,即"卖一"报价(7.020);8:成交的股票数(121033256),由于股票交易以一百股为基本单位,使用时通常把该值除以一百;9:成交金额(847861533.000),单位为"元",为了一目了然,通常以"万元"为单位,即把该值除以一万;10~19:依次为"买一"至"买五"的申报股数与报价(如 18900 表示"买一"申报18900股,7.010 为"买一"报价);20~29:依次为"卖一"至"卖五"的申报股数与报价;30:日期(2016-03-18);31:时间(15:00:00)。
# Download one month of daily OHLCV data for Kweichow Moutai (600519.SS)
# from Yahoo Finance and render a daily candlestick (K-line) chart.
import pandas_datareader.data as web
import datetime as dt

data = web.DataReader('600519.ss', 'yahoo', dt.datetime(2019, 8, 1), dt.datetime(2019, 8, 31))
data.head()
#                   High         Low        Open       Close   Volume   Adj Close
# Date
# 2019-08-01  977.000000  953.020020  976.51001  959.299988  3508952  959.299988
# 2019-08-02  957.979980  943.000000  944.00000  954.450012  3971940  954.450012
# 2019-08-05  954.000000  940.000000  945.00000  942.429993  3677431  942.429993
# 2019-08-06  948.000000  923.799988  931.00000  946.299988  4399116  946.299988
# 2019-08-07  955.530029  945.000000  949.50000  945.000000  2686998  945.000000

# Columns arrive as [High, Low, Open, Close, Volume, Adj Close]; pyecharts'
# Kline expects each record as [open, close, low, high], hence the reordering.
kldata = data.values[:, [2, 3, 1, 0]]

from pyecharts import options as opts
from pyecharts.charts import Kline

kobj = Kline().add_xaxis(data.index.strftime("%Y-%m-%d").tolist()).add_yaxis(
    "贵州茅台-日K线图", kldata.tolist()).set_global_opts(
        yaxis_opts=opts.AxisOpts(is_scale=True),   # rescale axes to the data range
        xaxis_opts=opts.AxisOpts(is_scale=True),
        title_opts=opts.TitleOpts(title=""))
kobj.render()  # writes render.html in the working directory
# Download ~5.75 years of daily data for 600519.SS, hold out the last 30 rows
# as a test window, and run an ADF unit-root test on each OHLC column.
# Large p-values (≈0.99 below) mean the raw series are non-stationary.
import pandas_datareader.data as web
import datetime as dt
import pandas as pd
import numpy as np
# NOTE(review): the original snippet calls `stat.adfuller` without showing the
# import; statsmodels' stattools provides adfuller — confirm against the book.
import statsmodels.tsa.stattools as stat

data = web.DataReader('600519.ss', 'yahoo', dt.datetime(2014, 1, 1), dt.datetime(2019, 9, 30))
subdata = data.iloc[:-30, :4]  # drop last 30 days (test set); keep High/Low/Open/Close
for i in range(4):
    # adfuller(...)[1] is the p-value; maxlag=1 as in the original.
    pvalue = stat.adfuller(subdata.values[:, i], 1)[1]
    print("指标 ", data.columns[i], " 单位根检验的p值为:", pvalue)
# 指标  High  单位根检验的p值为:0.9955202280850401
# 指标  Low  单位根检验的p值为:0.9942509439755689
# 指标  Open  单位根检验的p值为:0.9938548193990323
# 指标  Close  单位根检验的p值为:0.9950049124079876
# First-difference the OHLC series and re-run the ADF test: p-values of 0.0
# indicate the differenced series are stationary, so a VAR model can be fit.
# NOTE(review): the scrape dropped this snippet's first line; the inverse
# differencing performed later implies a first-order difference of the
# training data — confirm against the book.
subdata_diff1 = np.diff(subdata.values, axis=0)
for i in range(4):
    pvalue = stat.adfuller(subdata_diff1[:, i], 1)[1]
    print("指标 ", data.columns[i], " 单位根检验的p值为:", pvalue)
# 指标  High  单位根检验的p值为:0.0
# 指标  Low  单位根检验的p值为:0.0
# 指标  Open  单位根检验的p值为:0.0
# 指标  Close  单位根检验的p值为:0.0
# Fit a VAR(p) model on the differenced series for p = 1..10 by ordinary
# least squares and compute an AIC-style criterion for each lag order.
rows, cols = subdata_diff1.shape
aicList = []   # criterion value per lag order p
lmList = []    # fitted coefficient matrix per lag order p

for p in range(1, 11):
    # Each sample row = current differenced values followed by the p previous
    # rows (flattened) used as regressors. Build as a list first: appending
    # then converting once avoids the O(n^2) repeated np.r_ concatenation.
    samples = [list(subdata_diff1[i, :]) + list(subdata_diff1[i - p:i].flatten())
               for i in range(p, rows)]
    baseData = np.array(samples)
    # Design matrix: intercept column + lagged regressors; targets: current diffs.
    X = np.c_[[1] * baseData.shape[0], baseData[:, cols:]]
    Y = baseData[:, 0:cols]
    # OLS normal equations: coef = (X'X)^{-1} X'Y.
    coefMatrix = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), Y)
    # log-det of the residual covariance plus a parameter-count penalty.
    aic = np.log(np.linalg.det(np.cov(Y - np.matmul(X, coefMatrix), rowvar=False))) \
        + 2 * (coefMatrix.shape[0] - 1) ** 2 * p / baseData.shape[0]
    aicList.append(aic)
    lmList.append(coefMatrix)

# Compare lag orders against their AIC values; p = 2 minimizes the criterion.
pd.DataFrame({"P": range(1, 11), "AIC": aicList})
#    P        AIC
# 0  1  13.580156
# 1  2  13.312225
# 2  3  13.543633
# 3  4  14.266087
# 4  5  15.512437
# 5  6  17.539047
# 6  7  20.457337
# 7  8  24.385459
# 8  9  29.438091
# 9  10 35.785909
# One-step-ahead rolling forecast over the 30 held-out days with the chosen
# VAR(p), then undo the differencing to recover price levels.
# NOTE(review): the scrape dropped this snippet's first line; p should be the
# AIC-minimizing order from the previous step (p = 2 per the table) — confirm.
p = int(np.argmin(aicList)) + 1
n = rows
preddf = None
for i in range(30):
    # Regressors: the last p differenced rows, flattened, plus an intercept.
    predData = list(subdata_diff1[n + i - p:n + i].flatten())
    predVals = np.matmul([1] + predData, lmList[p - 1])
    # Invert the differencing: add the predicted diff to the last known price.
    predVals = data.iloc[n + i, :].values[:4] + predVals
    if preddf is None:
        preddf = [predVals]
    else:
        preddf = np.r_[preddf, [predVals]]
    # Append the realized diff so the next step conditions on actual
    # (not predicted) history — a rolling one-step-ahead scheme.
    subdata_diff1 = np.r_[subdata_diff1, [data.iloc[n + i + 1, :].values[:4] - data.iloc[n + i, :].values[:4]]]

# Summarize relative prediction errors over the 30-day test window.
(np.abs(preddf - data.iloc[-30:data.shape[0], :4]) / data.iloc[-30:data.shape[0], :4]).describe()
#             High        Low       Open      Close
# count  30.000000  30.000000  30.000000  30.000000
# mean    0.010060   0.009380   0.005661   0.013739
# std     0.008562   0.009968   0.006515   0.013674
# min     0.001458   0.000115   0.000114   0.000130
# 25%     0.004146   0.001950   0.001653   0.002785
# 50%     0.007166   0.007118   0.002913   0.010414
# 75%     0.014652   0.012999   0.006933   0.022305
# max     0.039191   0.045802   0.024576   0.052800
# Plot predicted vs. actual prices for each OHLC column and print an overall
# accuracy figure: 100 * (1 - total absolute error / total actual value).
import matplotlib.pyplot as plt  # missing from the scraped snippet

plt.figure(figsize=(10, 7))
for i in range(4):
    plt.subplot(2, 2, i + 1)
    plt.plot(range(30), data.iloc[-30:data.shape[0], i].values, 'o-', c='black')  # actual
    plt.plot(range(30), preddf[:, i], 'o--', c='gray')  # predicted
    plt.ylim(1000, 1200)
    plt.ylabel("$" + data.columns[i] + "$")
plt.show()
v = 100 * (1 - np.sum(np.abs(preddf - data.iloc[-30:data.shape[0], :4]).values) / np.sum(data.iloc[-30:data.shape[0], :4].values))
print("Evaluation on test data: accuracy = %0.2f%% \n" % v)
# Evaluation on test data: accuracy = 99.03%
本节主要基于LSTM算法对贵州茅台股票数据进行预测,该算法非常擅长序列数据的建模,由于引入了遗忘门等更为复杂的内部处理单元来处理上下文信息的存储与更新,这样既可以消除梯度问题的困扰,也可以对存在短期或长期依赖的数据建模,该算法在文本、语音等序列数据模型中广泛使用。本节从LSTM建模的数据要求及网络结构设计讲起,通过设置合理的参数,通过训练得到模型,并基于该模型进行预测,最后将结果与真实数据进行比较,评估预测效果。
# Build supervised training/test tensors for the LSTM: each sample is a window
# of SEQLEN standardized OHLC rows; the target is the row right after it.
# NOTE(review): SEQLEN (look-back window length) is defined on a line the
# scrape dropped — confirm its value against the book before running.
dim_in = 4      # input features per time step (High, Low, Open, Close)
dim_out = 4     # predicted features
pred_len = 30   # length of the held-out test window (days)

# Column-wise standardization; vmean/vstd are kept to invert it later.
vmean = data.iloc[:, :4].apply(lambda x: np.mean(x))
vstd = data.iloc[:, :4].apply(lambda x: np.std(x))
t0 = data.iloc[:, :4].apply(lambda x: (x - np.mean(x)) / np.std(x)).values

X_train = np.zeros((t0.shape[0] - SEQLEN - pred_len, SEQLEN, dim_in))
Y_train = np.zeros((t0.shape[0] - SEQLEN - pred_len, dim_out),)
X_test = np.zeros((pred_len, SEQLEN, dim_in))
Y_test = np.zeros((pred_len, dim_out),)
# Sliding windows over the training range...
for i in range(SEQLEN, t0.shape[0] - pred_len):
    Y_train[i - SEQLEN] = t0[i]
    X_train[i - SEQLEN] = t0[(i - SEQLEN):i]
# ...and over the final pred_len rows for the test set.
for i in range(t0.shape[0] - pred_len, t0.shape[0]):
    Y_test[i - t0.shape[0] + pred_len] = t0[i]
    X_test[i - t0.shape[0] + pred_len] = t0[(i - SEQLEN):i]
# Define and train a single-layer LSTM regressor mapping a SEQLEN-step window
# to the next day's four standardized prices.
from keras.models import Sequential
from keras.layers import LSTM, Dense  # missing from the scraped snippet

model = Sequential()
model.add(LSTM(64, input_shape=(SEQLEN, dim_in), activation='relu', recurrent_dropout=0.01))
model.add(Dense(dim_out, activation='linear'))  # linear head for regression
model.compile(loss='mean_squared_error', optimizer='rmsprop')
history = model.fit(X_train, Y_train, epochs=200, batch_size=10, validation_split=0)
# Epoch 1/200
# 1350/1350 [==============================] - 1s 1ms/step - loss: 0.0447
# Epoch 2/200
# 1350/1350 [==============================] - 1s 737us/step - loss: 0.0059
# Epoch 3/200
# 1350/1350 [==============================] - 1s 743us/step - loss: 0.0043
# ......
# Epoch 200/200
# 1350/1350 [==============================] - 1s 821us/step - loss: 9.2794e-04
# Predict the 30 test windows and de-standardize back to price levels.
# NOTE(review): the scrape dropped this snippet's first line; de-standardizing
# model.predict(X_test) with vstd/vmean matches the (30, 4) shape and the
# price magnitudes shown below — confirm against the book.
preddf = model.predict(X_test) * vstd.values + vmean.values
# array([[1069.35781887, 1038.57915742, 1056.77147186, 1053.83827734],
#        [1070.65142282, 1039.58533719, 1057.34561875, 1054.85567074],
#        [1083.58529328, 1052.70457308, 1070.78824637, 1067.49741882],
#        ...
#        [1186.19297789, 1161.52758381, 1172.33666591, 1170.44623263],
#        [1181.42680223, 1155.14778501, 1166.5726204 , 1165.00336968],
#        [1186.75600881, 1160.84733425, 1172.37636963, 1170.09819923]])

preddf.shape
# (30, 4)
# Plot LSTM predictions vs. actual prices for each OHLC column and print the
# same overall accuracy figure used for the VAR model.
import matplotlib.pyplot as plt  # missing from the scraped snippet

plt.figure(figsize=(10, 7))
for i in range(4):
    plt.subplot(2, 2, i + 1)
    plt.plot(range(30), data.iloc[-30:data.shape[0], i].values, 'o-', c='black')  # actual
    plt.plot(range(30), preddf[:, i], 'o--', c='gray')  # predicted
    plt.ylim(1000, 1200)
    plt.ylabel("$" + data.columns[i] + "$")
plt.show()
v = 100 * (1 - np.sum(np.abs(preddf - data.iloc[-30:data.shape[0], :4]).values) / np.sum(data.iloc[-30:data.shape[0], :4].values))
print("Evaluation on test data: accuracy = %0.2f%% \n" % v)
# Evaluation on test data: accuracy = 99.01%
本文节选自《Python预测之美:数据分析与算法实战》一书。《Python预测之美:数据分析与算法实战(双色)》,作者游皓麟,以Python语言为基础,体系化介绍预测技术工程实施的必备技能。基于Python 来做预测,不仅能够在业务上快速落地,还让代码维护起来更加方便。对预测原理的深度剖析和算法的细致解读,是本书的一大亮点。本书共分为预测基础、预测算法、预测案例三部分。希望读者在看完本书后,能够将本书的精要融会贯通,进一步在工作和学习实践中提炼价值。
更多精彩推荐
☞厉害!从电影花瓶到 Wi-Fi 之母,这才是乘风破浪的姐姐!
☞用 Python 实现抖音上的“人像动漫化”特效,原来这么简单!
☞市场占比 44%,IDC 最新报告:阿里云智能语音市场排名第一
☞解读领跑全国的区块链发展“北京方案”:设专项基金,构建开源生态