PonderingDNCore L2正则化-checkpoint



In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 1000)
import datetime
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import time
import os
import copy

定义获取数据函数


In [2]:
def fix_data(path):
    tmp = pd.read_csv(path, encoding="gbk", engine='python')
    tmp.rename(columns={'Unnamed: 0':'trading_time'}, inplace=True)
    tmp['trading_point'] = pd.to_datetime(tmp.trading_time)
    del tmp['trading_time']
    tmp.set_index(tmp.trading_point, inplace=True)
    return tmp

def High_2_Low(tmp, freq):
    """处理从RiceQuant下载的分钟线数据,
    从分钟线数据合成低频数据
    2017-08-11    
    """
    # 分别处理bar数据
    tmp_open = tmp['open'].resample(freq).ohlc()
    tmp_open = tmp_open['open'].dropna()

    tmp_high = tmp['high'].resample(freq).ohlc()
    tmp_high = tmp_high['high'].dropna()

    tmp_low = tmp['low'].resample(freq).ohlc()
    tmp_low = tmp_low['low'].dropna()

    tmp_close = tmp['close'].resample(freq).ohlc()
    tmp_close = tmp_close['close'].dropna()

    tmp_price = pd.concat([tmp_open, tmp_high, tmp_low, tmp_close], axis=1)
    
    # 处理成交量
    tmp_volume = tmp['volume'].resample(freq).sum()
    tmp_volume.dropna(inplace=True)
    
    return pd.concat([tmp_price, tmp_volume], axis=1)

计算技术分析指标


In [3]:
import talib 

def get_factors(index, 
                Open, 
                Close, 
                High, 
                Low, 
                Volume,
                rolling = 26,
                drop=False, 
                normalization=True):
    
    tmp = pd.DataFrame()
    tmp['tradeTime'] = index
    
    #累积/派发线(Accumulation / Distribution Line,该指标将每日的成交量通过价格加权累计,
    #用以计算成交量的动量。属于趋势型因子
    tmp['AD'] = talib.AD(High, Low, Close, Volume)

    # 佳庆指标(Chaikin Oscillator),该指标基于AD曲线的指数移动均线而计算得到。属于趋势型因子
    tmp['ADOSC'] = talib.ADOSC(High, Low, Close, Volume, fastperiod=3, slowperiod=10)

    # 平均动向指数,DMI因子的构成部分。属于趋势型因子
    tmp['ADX'] = talib.ADX(High, Low, Close,timeperiod=14)

    # 相对平均动向指数,DMI因子的构成部分。属于趋势型因子
    tmp['ADXR'] = talib.ADXR(High, Low, Close,timeperiod=14)

    # 绝对价格振荡指数
    tmp['APO'] = talib.APO(Close, fastperiod=12, slowperiod=26)

    # Aroon通过计算自价格达到近期最高值和最低值以来所经过的期间数,帮助投资者预测证券价格从趋势到区域区域或反转的变化,
    #Aroon指标分为Aroon、AroonUp和AroonDown3个具体指标。属于趋势型因子
    tmp['AROONDown'], tmp['AROONUp'] = talib.AROON(High, Low,timeperiod=14)
    tmp['AROONOSC'] = talib.AROONOSC(High, Low,timeperiod=14)

    # 均幅指标(Average TRUE Ranger),取一定时间周期内的股价波动幅度的移动平均值,
    #是显示市场变化率的指标,主要用于研判买卖时机。属于超买超卖型因子。
    tmp['ATR14']= talib.ATR(High, Low, Close, timeperiod=14)
    tmp['ATR6']= talib.ATR(High, Low, Close, timeperiod=6)

    # 布林带
    tmp['Boll_Up'],tmp['Boll_Mid'],tmp['Boll_Down']= talib.BBANDS(Close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)

    # 均势指标
    tmp['BOP'] = talib.BOP(Open, High, Low, Close)

    #5日顺势指标(Commodity Channel Index),专门测量股价是否已超出常态分布范围。属于超买超卖型因子。
    tmp['CCI5'] = talib.CCI(High, Low, Close, timeperiod=5)
    tmp['CCI10'] = talib.CCI(High, Low, Close, timeperiod=10)
    tmp['CCI20'] = talib.CCI(High, Low, Close, timeperiod=20)
    tmp['CCI88'] = talib.CCI(High, Low, Close, timeperiod=88)

    # 钱德动量摆动指标(Chande Momentum Osciliator),与其他动量指标摆动指标如相对强弱指标(RSI)和随机指标(KDJ)不同,
    # 钱德动量指标在计算公式的分子中采用上涨日和下跌日的数据。属于超买超卖型因子
    tmp['CMO_Close'] = talib.CMO(Close,timeperiod=14)
    tmp['CMO_Open'] = talib.CMO(Close,timeperiod=14)

    # DEMA双指数移动平均线
    tmp['DEMA6'] = talib.DEMA(Close, timeperiod=6)
    tmp['DEMA12'] = talib.DEMA(Close, timeperiod=12)
    tmp['DEMA26'] = talib.DEMA(Close, timeperiod=26)

    # DX 动向指数
    tmp['DX'] = talib.DX(High, Low, Close,timeperiod=14)

    # EMA 指数移动平均线
    tmp['EMA6'] = talib.EMA(Close, timeperiod=6)
    tmp['EMA12'] = talib.EMA(Close, timeperiod=12)
    tmp['EMA26'] = talib.EMA(Close, timeperiod=26)

    # KAMA 适应性移动平均线
    tmp['KAMA'] = talib.KAMA(Close, timeperiod=30)

    # MACD
    tmp['MACD_DIF'],tmp['MACD_DEA'],tmp['MACD_bar'] = talib.MACD(Close, fastperiod=12, slowperiod=24, signalperiod=9)

    # 中位数价格 不知道是什么意思
    tmp['MEDPRICE'] = talib.MEDPRICE(High, Low)

    # 负向指标 负向运动
    tmp['MiNUS_DI'] = talib.MINUS_DI(High, Low, Close,timeperiod=14)
    tmp['MiNUS_DM'] = talib.MINUS_DM(High, Low,timeperiod=14)

    # 动量指标(Momentom Index),动量指数以分析股价波动的速度为目的,研究股价在波动过程中各种加速,
    #减速,惯性作用以及股价由静到动或由动转静的现象。属于趋势型因子
    tmp['MOM'] = talib.MOM(Close, timeperiod=10)

    # 归一化平均值范围
    tmp['NATR'] = talib.NATR(High, Low, Close,timeperiod=14)

    # OBV 	能量潮指标(On Balance Volume,OBV),以股市的成交量变化来衡量股市的推动力,
    #从而研判股价的走势。属于成交量型因子
    tmp['OBV'] = talib.OBV(Close, Volume)

    # PLUS_DI 更向指示器
    tmp['PLUS_DI'] = talib.PLUS_DI(High, Low, Close,timeperiod=14)
    tmp['PLUS_DM'] = talib.PLUS_DM(High, Low, timeperiod=14)

    # PPO 价格振荡百分比
    tmp['PPO'] = talib.PPO(Close, fastperiod=6, slowperiod= 26, matype=0)

    # ROC 6日变动速率(Price Rate of Change),以当日的收盘价和N天前的收盘价比较,
    #通过计算股价某一段时间内收盘价变动的比例,应用价格的移动比较来测量价位动量。属于超买超卖型因子。
    tmp['ROC6'] = talib.ROC(Close, timeperiod=6)
    tmp['ROC20'] = talib.ROC(Close, timeperiod=20)
    #12日量变动速率指标(Volume Rate of Change),以今天的成交量和N天前的成交量比较,
    #通过计算某一段时间内成交量变动的幅度,应用成交量的移动比较来测量成交量运动趋向,
    #达到事先探测成交量供需的强弱,进而分析成交量的发展趋势及其将来是否有转势的意愿,
    #属于成交量的反趋向指标。属于成交量型因子
    tmp['VROC6'] = talib.ROC(Volume, timeperiod=6)
    tmp['VROC20'] = talib.ROC(Volume, timeperiod=20)

    # ROC 6日变动速率(Price Rate of Change),以当日的收盘价和N天前的收盘价比较,
    #通过计算股价某一段时间内收盘价变动的比例,应用价格的移动比较来测量价位动量。属于超买超卖型因子。
    tmp['ROCP6'] = talib.ROCP(Close, timeperiod=6)
    tmp['ROCP20'] = talib.ROCP(Close, timeperiod=20)
    #12日量变动速率指标(Volume Rate of Change),以今天的成交量和N天前的成交量比较,
    #通过计算某一段时间内成交量变动的幅度,应用成交量的移动比较来测量成交量运动趋向,
    #达到事先探测成交量供需的强弱,进而分析成交量的发展趋势及其将来是否有转势的意愿,
    #属于成交量的反趋向指标。属于成交量型因子
    tmp['VROCP6'] = talib.ROCP(Volume, timeperiod=6)
    tmp['VROCP20'] = talib.ROCP(Volume, timeperiod=20)

    # RSI
    tmp['RSI'] = talib.RSI(Close, timeperiod=14)

    # SAR 抛物线转向
    tmp['SAR'] = talib.SAR(High, Low, acceleration=0.02, maximum=0.2)

    # TEMA
    tmp['TEMA6'] = talib.TEMA(Close, timeperiod=6)
    tmp['TEMA12'] = talib.TEMA(Close, timeperiod=12)
    tmp['TEMA26'] = talib.TEMA(Close, timeperiod=26)

    # TRANGE 真实范围
    tmp['TRANGE'] = talib.TRANGE(High, Low, Close)

    # TYPPRICE 典型价格
    tmp['TYPPRICE'] = talib.TYPPRICE(High, Low, Close)

    # TSF 时间序列预测
    tmp['TSF'] = talib.TSF(Close, timeperiod=14)

    # ULTOSC 极限振子
    tmp['ULTOSC'] = talib.ULTOSC(High, Low, Close, timeperiod1=7, timeperiod2=14, timeperiod3=28)

    # 威廉指标
    tmp['WILLR'] = talib.WILLR(High, Low, Close, timeperiod=14)
    
    # 标准化
    if normalization:
        factors_list = tmp.columns.tolist()[1:]

        if rolling >= 26:
            for i in factors_list:
                tmp[i] = (tmp[i] - tmp[i].rolling(window=rolling, center=False).mean())\
                /tmp[i].rolling(window=rolling, center=False).std()
        elif rolling < 26 & rolling > 0:
            print ('Recommended rolling range greater than 26')
        elif rolling <=0:
            for i in factors_list:
                tmp[i] = (tmp[i] - tmp[i].mean())/tmp[i].std()
            
    if drop:
        tmp.dropna(inplace=True)
        
    tmp.set_index('tradeTime', inplace=True)
    
    return tmp

处理数据


In [4]:
tmp = fix_data('期货测试数据/白银88.csv')

# targets 1d 数据合成
tmp_1d = High_2_Low(tmp, '1d')
rolling = 88
targets = tmp_1d
targets['returns'] =  targets['close'].shift(-2) / targets['close'] - 1.0
targets['upper_boundary']= targets.returns.rolling(rolling).mean() + 0.5 * targets.returns.rolling(rolling).std()
targets['lower_boundary']= targets.returns.rolling(rolling).mean() - 0.5 * targets.returns.rolling(rolling).std()
targets.dropna(inplace=True)
targets['labels'] = 1
targets.loc[targets['returns']>=targets['upper_boundary'], 'labels'] = 2
targets.loc[targets['returns']<=targets['lower_boundary'], 'labels'] = 0

# factors 1d 数据合成
tmp_1d = High_2_Low(tmp, '1d')
Index = tmp_1d.index
High = tmp_1d.high.values
Low = tmp_1d.low.values
Close = tmp_1d.close.values
Open = tmp_1d.open.values
Volume = tmp_1d.volume.values
factors = get_factors(Index, Open, Close, High, Low, Volume, rolling = 26, drop=True)

factors = factors.loc[:targets.index[-1]]

tmp_factors_1 = factors.iloc[:12]
targets = targets.loc[tmp_factors_1.index[-1]:]

gather_list = np.arange(factors.shape[0])[11:]

转换数据


In [5]:
inputs = np.array(factors).reshape(-1, 1, factors.shape[1])

def dense_to_one_hot(labels_dense):
    """标签 转换one hot 编码
    输入labels_dense 必须为非负数
    2016-11-21
    """
    num_classes = len(np.unique(labels_dense)) # np.unique 去掉重复函数
    raws_labels = labels_dense.shape[0]
    index_offset = np.arange(raws_labels) * num_classes
    labels_one_hot = np.zeros((raws_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot  

targets = dense_to_one_hot(targets['labels'])
targets = np.expand_dims(targets, axis=1)

构建RNN模型


In [6]:
import tensorflow as tf
from DNCore import DNCoreDeepLSTM

In [7]:
class Classifier_DNCoreDeepLSTM(object):
    
    def __init__(self, 
                 inputs, 
                 targets,
                 gather_list=None,
                 batch_size=1, 
                 hidden_size=20, 
                 memory_size=20, 
                 num_reads=3,
                 num_writes=1,  
                 learning_rate = 1e-3,
                 optimizer_epsilon = 1e-8,
                 l2_coefficient = 1e-3,
                 max_gard_norm = 50,
                 reset_graph = True):
        
        if reset_graph:
            tf.reset_default_graph()
        # 控制参数
        self._tmp_inputs = inputs
        self._tmp_targets = targets
        self._in_length = None
        self._in_width = inputs.shape[2]
        self._out_length = None
        self._out_width = targets.shape[2]
        self._batch_size = batch_size

        # 声明会话
        self._sess = tf.InteractiveSession()
        
        self._inputs = tf.placeholder(
            dtype=tf.float32,
            shape=[self._in_length, self._batch_size, self._in_width],
            name='inputs')
        self._targets = tf.placeholder(
            dtype=tf.float32,
            shape=[self._out_length, self._batch_size, self._out_width],
            name='targets')
        
        self._RNNCoreCell = DNCoreDeepLSTM(
            dnc_output_size=self._out_width, 
            hidden_size=hidden_size, 
            memory_size=memory_size, 
            word_size=self._in_width, 
            num_read_heads=num_reads,
            num_write_heads=num_writes)
        
        self._initial_state = \
        self._RNNCoreCell.initial_state(batch_size)
        
        output_sequences, _ = \
        tf.nn.dynamic_rnn(cell= self._RNNCoreCell, 
                          inputs=self._inputs, 
                          initial_state=self._initial_state, 
                          time_major=True)
        
        self._original_output_sequences = output_sequences
        if gather_list is not None:
            output_sequences = tf.gather(output_sequences, gather_list)
        
        # L2 正则化测试 2017-09-03 
        self._trainable_variables = tf.trainable_variables()
        _l2_regularizer = tf.add_n([tf.nn.l2_loss(v) for v in self._trainable_variables])        
        self._l2_regularizer = _l2_regularizer * l2_coefficient / len(self._trainable_variables)
        
        rnn_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=self._targets, logits=output_sequences)
        self._rnn_cost = tf.reduce_mean(rnn_cost)
        self._cost = self._rnn_cost + self._l2_regularizer
        
        
        train_pred = tf.nn.softmax(output_sequences, dim=2)
        correct_pred = tf.equal(tf.argmax(train_pred,2), tf.argmax(self._targets,2))
        self._accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        # Set up optimizer with global norm clipping.
        trainable_variables = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self._cost, trainable_variables), max_gard_norm)
        global_step = tf.get_variable(
            name="global_step",
            shape=[],
            dtype=tf.int64,
            initializer=tf.zeros_initializer(),
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, tf.GraphKeys.GLOBAL_STEP])
        
        optimizer = tf.contrib.opt.NadamOptimizer(
            learning_rate=learning_rate, epsilon=optimizer_epsilon)
        self._train_step = optimizer.apply_gradients(
            zip(grads, trainable_variables), global_step=global_step)  
        
        self._sess.run(tf.global_variables_initializer())
        self._variables_saver = tf.train.Saver()
        
        
    def fit(self, 
            training_iters =1e2,             
            display_step = 5, 
            save_path = None,
            restore_path = None):
        
        if restore_path is not None:
            self._variables_saver.restore(self._sess, restore_path)
              
        for scope in range(np.int(training_iters)):
            self._sess.run([self._train_step],
                           feed_dict = {self._inputs:self._tmp_inputs, self._targets:self._tmp_targets})
            
            if scope % display_step == 0:
                loss, acc, l2_loss, rnn_loss = self._sess.run(
                    [self._cost, self._accuracy, self._l2_regularizer, self._rnn_cost], 
                    feed_dict = {self._inputs:self._tmp_inputs, self._targets:self._tmp_targets}) 
                print (scope, '  loss--', loss, '  acc--', acc, '  l2_loss', l2_loss, '  rnn_cost', rnn_loss)                       
                    
        print ("Optimization Finished!")         
        loss, acc, l2_loss, rnn_loss = self._sess.run(
            [self._cost, self._accuracy, self._l2_regularizer, self._rnn_cost], 
            feed_dict = {self._inputs:self._tmp_inputs, self._targets:self._tmp_targets}) 
        print ('Model assessment  loss--', loss, '  acc--', acc, '  l2_loss', l2_loss, '  rnn_cost', rnn_loss)      
        # 保存模型可训练变量
        if save_path is not None:
            self._variables_saver.save(self._sess, save_path) 
            
    def close(self):
        self._sess.close()
        print ('结束进程,清理tensorflow内存/显存占用')
        
    def pred(self, inputs, gather_list=None, restore_path=None):
        output_sequences = self._original_output_sequences
        if gather_list is not None:
            output_sequences = tf.gather(output_sequences, gather_list)
        probability = tf.nn.softmax(output_sequences)
        classification = tf.argmax(probability, axis=-1)
        return self._sess.run([probability, classification],feed_dict = {self._inputs:inputs})
    
    def restore_trainable_variables(self, restore_path):
        self._variables_saver.restore(self._sess, restore_path)
    
    def score(self, inputs, targets, gather_list=None):
        acc = self._sess.run(
            self._accuracy,
            feed_dict = {self._inputs:self._tmp_inputs, 
                         self._targets:self._tmp_targets})
        return acc

训练模型


In [8]:
a = Classifier_DNCoreDeepLSTM(
    inputs, 
    targets, 
    gather_list,
    hidden_size=50, 
    memory_size=50, 
    learning_rate = 1e-3, 
    l2_coefficient=5e-3)

a.fit(training_iters = 200,
      display_step=5, 
      save_path='models/DNCoreDeepLSTM_NADM_saver_1.ckpt')

print (a.score(inputs, targets, gather_list))
a.close()


WARNING:tensorflow:The `skip_connections` argument will be deprecated. Please use snt.SkipConnectionCore instead.
G:\PythonDevelopment\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py:95: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
0   loss-- 1.09564   acc-- 0.520918   l2_loss 0.0654176   rnn_cost 1.03022
5   loss-- 1.02197   acc-- 0.535762   l2_loss 0.0611229   rnn_cost 0.960843
10   loss-- 0.972609   acc-- 0.569501   l2_loss 0.0578166   rnn_cost 0.914792
15   loss-- 0.987256   acc-- 0.554656   l2_loss 0.0553882   rnn_cost 0.931868
20   loss-- 0.896166   acc-- 0.618084   l2_loss 0.0538287   rnn_cost 0.842338
25   loss-- 0.85353   acc-- 0.654521   l2_loss 0.0525933   rnn_cost 0.800937
30   loss-- 0.80255   acc-- 0.678812   l2_loss 0.051608   rnn_cost 0.750942
35   loss-- 0.816485   acc-- 0.665317   l2_loss 0.050841   rnn_cost 0.765644
40   loss-- 0.743006   acc-- 0.696356   l2_loss 0.050242   rnn_cost 0.692764
45   loss-- 0.693751   acc-- 0.736842   l2_loss 0.0498432   rnn_cost 0.643907
50   loss-- 0.66089   acc-- 0.740891   l2_loss 0.0495279   rnn_cost 0.611363
55   loss-- 0.61985   acc-- 0.766532   l2_loss 0.0493084   rnn_cost 0.570541
60   loss-- 0.586818   acc-- 0.780027   l2_loss 0.0491726   rnn_cost 0.537645
65   loss-- 0.545643   acc-- 0.801619   l2_loss 0.0490661   rnn_cost 0.496577
70   loss-- 0.508457   acc-- 0.82861   l2_loss 0.0490282   rnn_cost 0.459428
75   loss-- 0.564138   acc-- 0.778677   l2_loss 0.0490356   rnn_cost 0.515102
80   loss-- 0.459515   acc-- 0.847503   l2_loss 0.0490646   rnn_cost 0.41045
85   loss-- 0.427858   acc-- 0.869096   l2_loss 0.0491549   rnn_cost 0.378703
90   loss-- 0.404269   acc-- 0.874494   l2_loss 0.04922   rnn_cost 0.355049
95   loss-- 0.428364   acc-- 0.848853   l2_loss 0.0493303   rnn_cost 0.379033
100   loss-- 0.35319   acc-- 0.897436   l2_loss 0.0494398   rnn_cost 0.30375
105   loss-- 0.325448   acc-- 0.916329   l2_loss 0.0495598   rnn_cost 0.275888
110   loss-- 0.335813   acc-- 0.904184   l2_loss 0.0496586   rnn_cost 0.286155
115   loss-- 0.295673   acc-- 0.917679   l2_loss 0.0497647   rnn_cost 0.245909
120   loss-- 0.270637   acc-- 0.933873   l2_loss 0.0498683   rnn_cost 0.220769
125   loss-- 0.41744   acc-- 0.854251   l2_loss 0.0499807   rnn_cost 0.367459
130   loss-- 0.260258   acc-- 0.944669   l2_loss 0.0501238   rnn_cost 0.210134
135   loss-- 0.230941   acc-- 0.955466   l2_loss 0.0502792   rnn_cost 0.180662
140   loss-- 0.206087   acc-- 0.962213   l2_loss 0.0504352   rnn_cost 0.155652
145   loss-- 0.249695   acc-- 0.939271   l2_loss 0.0505658   rnn_cost 0.199129
150   loss-- 0.188839   acc-- 0.967611   l2_loss 0.0506697   rnn_cost 0.138169
155   loss-- 0.164535   acc-- 0.973009   l2_loss 0.050785   rnn_cost 0.11375
160   loss-- 0.259585   acc-- 0.924426   l2_loss 0.0508611   rnn_cost 0.208724
165   loss-- 0.157975   acc-- 0.977058   l2_loss 0.0509473   rnn_cost 0.107027
170   loss-- 0.137439   acc-- 0.982456   l2_loss 0.0510269   rnn_cost 0.0864121
175   loss-- 0.200951   acc-- 0.951417   l2_loss 0.0510403   rnn_cost 0.149911
180   loss-- 0.133147   acc-- 0.989204   l2_loss 0.0510718   rnn_cost 0.0820754
185   loss-- 0.114043   acc-- 0.993252   l2_loss 0.0511048   rnn_cost 0.0629386
190   loss-- 0.101665   acc-- 0.987854   l2_loss 0.0511078   rnn_cost 0.050557
195   loss-- 0.0907295   acc-- 0.994602   l2_loss 0.0510639   rnn_cost 0.0396656
Optimization Finished!
Model assessment  loss-- 0.114459   acc-- 0.979757   l2_loss 0.0510009   rnn_cost 0.0634583
0.979757
结束进程,清理tensorflow内存/显存占用

In [ ]: