In [1]:
%matplotlib inline

In [2]:
from collections import namedtuple
import os, sys
print( os.getcwd() )
print( os.listdir( os.getcwd() ) )
import time


/home/topolo/PropD/MLgrabbag/kaggle
['stage1.7z.part', 'LSTM_model201702271930.save', 'cleaning_dueSigmaFin.pyc', 'LSTM_model201702280608.save', '.ipynb_checkpoints', 'dueSigmaFinancial_kaggle.py', 'LSTM_model.save', 'LSTM_model201703012346.save', 'glass-classification.zip', 'DatSciBow2017_FullPreprocessTutorial.ipynb', 'LSTM_model201702282350.save', 'GRU_model201703022010.save', 'DueSigmaFin_runs.ipynb', 'dueSigmaFinancial_local.ipynb', 'GRU_model201703012348.save', 'GRU_model201703021741.save', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', '__init__.py', 'train.h5', 'stage1.torrent', 'dueSigmaFinancial_local_GRUs.ipynb', 'cleaning_dueSigmaFin.py']

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from six.moves import cPickle

In [4]:
import theano


WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 40.0% of memory, cuDNN 5105)

In [5]:
from theano import function, config, sandbox, shared 
import theano.tensor as T

In [6]:
print( theano.config.device )
print( theano.config.lib.cnmem)  # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device)  # print the active device when the GPU device is initialized


gpu0
0.4
True

In [7]:
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)


False


In [8]:
sys.path.append( '../ML' )

In [9]:
from GRUs_Right import *

In [10]:
timeseries_pd = pd.read_hdf( 'train.h5')

In [28]:
import cleaning_dueSigmaFin
from cleaning_dueSigmaFin import clean_tseries, clean_test, id_only, split_tseries,y_only, calc_R, calc_Pearson_R

In [12]:
train_to_all = 0.85 
T_0 = 906 # timestamp to stop at
num_valid = 100 # how many time steps to validate on # 10
obs_train = timeseries_pd[timeseries_pd["timestamp"]<int(T_0*train_to_all)]  # Kaggle calls these "observations"

In [13]:
# clean training data
cleaned_obs_train = clean_tseries( obs_train)
obs_valid = timeseries_pd[
    (timeseries_pd['timestamp']>=int(T_0*train_to_all)) & (timeseries_pd['timestamp']<int(T_0*(train_to_all)+num_valid))]
cleaned_obs_valid = clean_tseries( obs_valid)

In [14]:
# this is for a second try, with smaller number of layers
L = Gates(z=2,r=2,h=2,y=3)
n_in = 110 # s1
H = 110
s_2 = 50

s_l = Gates(z=[n_in,H],r=[n_in,H],h=[H,H],y=[H,s_2,1])

activations=Psis(z=(T.nnet.sigmoid,T.nnet.sigmoid),r=(T.nnet.sigmoid,T.nnet.sigmoid),h=(T.tanh,T.tanh),
                 y=(T.nnet.sigmoid,None))
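
For reference, Gates and Psis are imported from GRUs_Right; judging from the call signatures above they are presumably plain namedtuples keyed by the four gates z, r, h, y. A minimal sketch of that assumption (not the module's actual source):

# assumed definitions, mirroring the calls above; the real ones live in GRUs_Right
from collections import namedtuple
Gates = namedtuple("Gates", ["z", "r", "h", "y"])  # per-gate layer counts or per-gate layer sizes
Psis  = namedtuple("Psis",  ["z", "r", "h", "y"])  # per-gate activation functions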

In [15]:
lambda_learn = 0.0000002  # regularization for cost function # 0.01 # 0.0000001
alpha_reg = 0.000001  # learning rate # 0.00001
beta_mom = 0.000000001  # momentum constant, old "Mo" (it's an inside joke) # 0.001 # 0.000000001
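
These three constants are consumed by build_J_L2norm and build_update below. As a rough, generic sketch of how such constants conventionally enter an L2-regularized cost and a momentum update (an assumption about the spirit of those methods, not their actual Theano graph):

# generic numpy sketch (an assumption, not the GRUs_Right implementation)
import numpy as np

def J_L2norm_sketch(yhat, y, params, lambda_val):
    """squared-error cost plus an L2 penalty: 0.5*sum (yhat-y)^2 + lambda_val * sum_theta ||theta||^2"""
    return 0.5*np.sum((yhat - y)**2) + lambda_val*sum(np.sum(th**2) for th in params)

def momentum_step_sketch(param, velocity, grad, alpha, beta):
    """velocity <- beta*velocity - alpha*grad ; param <- param + velocity"""
    velocity = beta*velocity - alpha*grad
    return param + velocity, velocity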

In [16]:
# for the L2 norm
GRU_model=GRU_MUT001_right(L,s_l,H,1,activations )
Memblck = MemoryBlock_right(H, GRU_model)
Memblck.build_scan_over_t()
Memblck.build_J_L2norm(lambda_learn)  # the argument is lambda_learn, the regularization constant (not the learning rate)


Total number of parameters: 11 
Total number of parameters: 11 
Out[16]:
GpuFromHost.0

In [17]:
# be aware that this step may take more than a few seconds
Memblck.build_update(alpha=alpha_reg,beta=beta_mom)


Total number of parameters: 11 

In [25]:
MAX_ITERS = 10  # previously tried: 2, 50
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )  # theano.config.allow_gc = False
# timings from previous runs of this call:
#                          CPU times: user 1h 16min 38s, sys: 23min 7s,  total: 1h 39min 46s; Wall time: 1h 39min 44s
# MAX_ITERS=150:           CPU times: user 4h 31min 58s, sys: 29min 30s, total: 5h 1min 29s;  Wall time: 5h 1min 21s
# MAX_ITERS=50:            CPU times: user 1h 31min 57s, sys: 11min 6s,  total: 1h 43min 3s;  Wall time: 1h 43min 1s
# cnmem=0.4, MAX_ITERS=10: CPU times: user 18min 11s, sys: 2min 45s,     total: 20min 56s;    Wall time: 20min 56s


theano.config.allow_gc =:  False
CPU times: user 18min 11s, sys: 2min 45s, total: 20min 56s
Wall time: 20min 56s

In [42]:
MAX_ITERS = 100
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )
# timings from previous runs of this call:
# MAX_ITERS=2:   CPU times: user 4min 40s, sys: 41.2 s, total: 5min 21s
# MAX_ITERS=100: CPU times: user 3h 59min 33s, sys: 35min 34s, total: 4h 35min 7s
# MAX_ITERS=100: CPU times: user 4h 2min 1s, sys: 38min 31s, total: 4h 40min 32s


theano.config.allow_gc =:  False
CPU times: user 4h 2min 1s, sys: 38min 31s, total: 4h 40min 32s
Wall time: 4h 40min 26s

In [31]:
#print(len(results_cost));print(len(results_cost[0]));print(type(results_cost[0][0]));
#print(dir(results_cost[0][0])); np.array( results_cost[0][0] ) + 0. 
# cast a CudaNdarray as a numpy array as usual to get data back from the GPU
#2
#1047
#<type 'CudaNdarray'>
#['__add__', '__array__', '__class__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__iadd__', '__idiv__', '__init__', '__itruediv__', '__len__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '_dev_data', '_set_shape_i', '_set_stride', '_strides', 'base', 'copy', 'dtype', 'exp', 'gpudata', 'is_c_contiguous', 'mem_size', 'ndim', 'reduce_sum', 'reshape', 'shape', 'size', 'strides', 'take', 'view', 'zeros']
#7.6817378997802734
results_cost
# earlier runs returned:
# array([ 2392.01805741,  2026.30881399,  1718.79077414,  1460.21671918,
#         1242.77239746,  1059.89497049,   906.07414965,   776.67492714,
#          667.8074246 ,   576.19623596])
# array([ 499.09311096,  434.19197716,  379.55148638,  333.53831519,
#         294.77457814,  262.10549985,  234.56227435,  211.331933  ,
#         191.72780603,  175.17298945,  161.18276756,  149.35165075,
#         139.33629566,  130.84899801,  123.64665813,  117.52760176,
#         112.32031585,  107.88055044,  104.08750449,  100.83924748,
#          98.05002079,   95.64768349,   93.57117918,   91.76981368,
#          90.20068056,   88.82773009,   87.62072169,   86.55424258,
#          85.60652637,   84.7592262 ,   83.9971908 ,   83.30755471,
#          82.67956948,   82.104293  ,   81.5739191 ,   81.08202097,
#          80.62313188,   80.19265432,   79.78657391,   79.40169733,
#          79.03528562,   78.68498599,   78.34873173,   78.02491135,
#          77.71211131,   77.40903791,   77.11481158,   76.82847728,
#          76.5492742 ,   76.27653115])


Out[31]:
array([ 76.00975371,  75.74841108,  75.49209556,  75.24054598,
        74.9934125 ,  74.75049671,  74.51153713,  74.27631294,
        74.04469443,  73.81647502,  73.59152924,  73.36974094,
        73.1510009 ,  72.93522567,  72.72235127,  72.51226596,
        72.30488643,  72.10011468,  71.89789904,  71.69810825,
        71.50072718,  71.30574825,  71.11314285,  70.92284609,
        70.73476145,  70.54886158,  70.36506623,  70.18336344,
        70.00373211,  69.82611257,  69.65048223,  69.47679553,
        69.3050498 ,  69.13522202,  68.967239  ,  68.8010506 ,
        68.63662915,  68.47392935,  68.31295128,  68.15366334,
        67.99603153,  67.84004894,  67.68571973,  67.53296597,
        67.38176911,  67.23208419,  67.08387476,  66.93718501,
        66.79197773,  66.64821152,  66.50584632,  66.36484665,
        66.22519162,  66.08689091,  65.94993202,  65.81429283,
        65.67994089,  65.5468432 ,  65.41496488,  65.28433098,
        65.15490491,  65.02669865,  64.89968294,  64.77380289,
        64.64906341,  64.52544371,  64.40293073,  64.28152864,
        64.16124458,  64.04201883,  63.92382911,  63.80665092,
        63.69049266,  63.57533455,  63.46117029,  63.34794972,
        63.23567585,  63.124378  ,  63.01404423,  62.90462898,
        62.79611763,  62.68849002,  62.58173567,  62.47584523,
        62.37078568,  62.26658024,  62.16320748,  62.06067426,
        61.95896875,  61.85808929,  61.75801156,  61.65870387,
        61.56016703,  61.46239121,  61.36536782,  61.26907724,
        61.17350798,  61.07866918,  60.98456428,  60.89116011])

In [21]:
results_cost


Out[21]:
array([ 208.68999271,  196.17384224,  184.51532601,  173.65364589,
        163.535571  ,  154.10933666,  145.3276792 ,  137.14510178,
        129.52184793,  122.41859483,  115.80043555,  109.63317656,
        103.88645048,   98.53125428,   93.5405834 ,   88.88907005,
         84.55365502,   80.51235897,   76.74450211,   73.23163303,
         69.95670609,   66.90267833,   64.05488925,   61.40001843,
         58.92394299,   56.6144753 ,   54.46057633,   52.45162899,
         50.57754368,   48.82979661,   47.19936025,   45.67855332,
         44.25959137,   42.93566738,   41.70026356,   40.54712033,
         39.47109725,   38.46691792,   37.52977408,   36.65496045,
         35.83851088,   35.07592645,   34.3639231 ,   33.69898058,
         33.07800723,   32.49809263,   31.95625958,   31.45031453,
         30.97766785,   30.53601499,   30.12338186,   29.73778768,
         29.37733613,   29.04032971,   28.72522689,   28.43077564,
         28.1553002 ,   27.89753963,   27.6565584 ,   27.43106991,
         27.21996023,   27.02237734,   26.83731713,   26.66399114,
         26.50159786,   26.34939434,   26.2068533 ,   26.07326921,
         25.94798387,   25.83032984,   25.71989743,   25.6162427 ,
         25.51897941,   25.42764717,   25.34183084,   25.26115134,
         25.18532261,   25.11392905,   25.04664122,   24.9832045 ,
         24.92338414,   24.8669154 ,   24.81364254,   24.76332766,
         24.715821  ,   24.67092254,   24.62845275,   24.58823119,
         24.55009473,   24.51388736,   24.4795161 ,   24.44683354,
         24.415719  ,   24.38608173,   24.35781712,   24.33085875,
         24.30510391,   24.28048126,   24.25691543,   24.23433368])

In [85]:
plt.plot(results_cost)


Out[85]:
[<matplotlib.lines.Line2D at 0x7f1d31423390>]

In [19]:
dir( theano.shared( np.array(range(6)).reshape((3,2)).astype(theano.config.floatX)) )
theano.shared( np.array(range(6)).reshape((3,2)).astype(theano.config.floatX)).view()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-19-e23933aae0ef> in <module>()
      1 dir( theano.shared( np.array(range(6)).reshape((3,2)).astype(theano.config.floatX)) )
----> 2 theano.shared( np.array(range(6)).reshape((3,2)).astype(theano.config.floatX)).view()

AttributeError: 'CudaNdarraySharedVariable' object has no attribute 'view'
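
The .view() method exists on raw CudaNdarray objects but not on the shared-variable wrapper. To copy a shared variable's contents back to the host, Theano's standard API is get_value(); a small sketch using the same toy array:

s = theano.shared( np.arange(6).reshape((3,2)).astype(theano.config.floatX) )
s.get_value()                                        # host numpy.ndarray copy
s.get_value(borrow=True, return_internal_type=True)  # the underlying CudaNdarray on the GPU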

In [26]:
A=T.matrix(dtype=theano.config.floatX)
B=T.matrix(dtype=theano.config.floatX)
feg = theano.function([A,B],sandbox.cuda.basic_ops.gpu_from_host(T.dot(A,B)) )
C=feg(np.array(range(6)).reshape((3,2)).astype(theano.config.floatX),
      np.array(range(4,10)).reshape((2,3)).astype(theano.config.floatX) )

In [30]:
print(type(C));dir(C);print(type(C.view));print(type(C.view()))


<type 'CudaNdarray'>
<type 'builtin_function_or_method'>
<type 'CudaNdarray'>
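
Since CudaNdarray implements __array__ (it shows up in the dir() listing above), the usual numpy cast is enough to copy the result back from the GPU:

C_host = np.asarray(C)  # host numpy.ndarray copy of the 3x3 GPU product
print( C_host )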

In [20]:
cleaned_obs_valid[0][0].shape


Out[20]:
(10, 110)

In [54]:
# valid_predictions = Memblck.predict_on_lst( cleaned_obs_valid,verbose=False)
%time valid_predictions = Memblck.predict_on_lst_givens( cleaned_obs_valid,verbose=False)


CPU times: user 4min 51s, sys: 7.86 s, total: 4min 58s
Wall time: 4min 58s

In [24]:
X_cleaned_obs_valid,y_cleaned_obs_valid=split_tseries(cleaned_obs_valid)

In [55]:
valid_predictions_np = [np.array(arr) for arr in valid_predictions]

In [86]:
#R_valid = calc_R(valid_predictions,y_cleaned_obs_valid)
R_valid = calc_R(valid_predictions_np,y_cleaned_obs_valid)
print(R_valid)  # previous values: -37.9900914576, -35.089626, -19.7438594245, -19.3266766405, -19.01511991, -18.8868142933


-18.1848033605

In [87]:
PearsonR_valid = calc_Pearson_R(valid_predictions_np,y_cleaned_obs_valid)

In [88]:
PearsonR_valid


Out[88]:
136.25999

In [35]:
print(calc_R_each(valid_predictions_np,y_cleaned_obs_valid) )


150.027

In [89]:
time.strftime( "%Y%m%d%H%M%S",time.gmtime())


Out[89]:
'20170305070903'

In [90]:
f = open('GRU_model'+time.strftime('%Y%m%d%H%M',time.gmtime())+'.save' ,'wb')
for param in Memblck.__get_state__():
    cPickle.dump( param, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()


Total number of parameters: 11 

Load a saved model and restore its parameters


In [18]:
print( os.listdir( os.getcwd() ) )


['stage1.7z.part', 'LSTM_model201702271930.save', 'cleaning_dueSigmaFin.pyc', 'LSTM_model201702280608.save', '.ipynb_checkpoints', 'dueSigmaFinancial_kaggle.py', 'LSTM_model.save', 'LSTM_model201703012346.save', 'glass-classification.zip', 'DatSciBow2017_FullPreprocessTutorial.ipynb', 'LSTM_model201702282350.save', 'GRU_model201703022010.save', 'DueSigmaFin_runs.ipynb', 'dueSigmaFinancial_local.ipynb', 'GRU_model201703012348.save', 'GRU_model201703021741.save', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', '__init__.py', 'train.h5', 'stage1.torrent', 'dueSigmaFinancial_local_GRUs.ipynb', 'cleaning_dueSigmaFin.py']

In [20]:
filename_in = "GRU_model201703021741.save" # change this MANUALLY 
f = open(filename_in,'rb')
number_of_params = len( Memblck.__get_state__() )
loaded_param = []
for _ in range(number_of_params):
    loaded_param.append(cPickle.load(f))
f.close()


Total number of parameters: 11 
---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
<ipython-input-20-62e2e50c08f8> in <module>()
      4 loaded_param = []
      5 for _ in range(number_of_params):
----> 6     loaded_param.append(cPickle.load(f))
      7 f.close()

EOFError: 
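
The EOFError means the pickle file ran out of objects before number_of_params loads finished, i.e. GRU_model201703021741.save was presumably written by a model with a different (smaller) parameter count than the current Memblck. A more forgiving load loop, sketched here as a guard against that mismatch, simply reads until end-of-file:

# hedged sketch: load however many pickled parameters the file actually contains
loaded_param = []
with open(filename_in, 'rb') as f:
    while True:
        try:
            loaded_param.append( cPickle.load(f) )
        except EOFError:
            break
print( len(loaded_param), 'parameters loaded' )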

In [23]:
Memblck.__set_state__(*loaded_param)

In [35]:
# Memblck.__get_state__()
# valid_predictions_np,y_cleaned_obs_valid 
print(type(valid_predictions_np));print(type(y_cleaned_obs_valid));print(len(valid_predictions_np));
print(len(y_cleaned_obs_valid));print(type(valid_predictions_np[0]));print(type(y_cleaned_obs_valid[0]));
print(valid_predictions_np[0].shape);print(y_cleaned_obs_valid[0].shape)
print(valid_predictions_np[1].shape);print(y_cleaned_obs_valid[1].shape)


<type 'list'>
<type 'list'>
951
951
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(10, 1)
(10, 1)
(10, 1)
(10, 1)

In [40]:
Rts_test  =calc_R_each(valid_predictions_np,y_cleaned_obs_valid)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-40-88a3240fd5ba> in <module>()
----> 1 Rts_test  =calc_R_each(valid_predictions_np,y_cleaned_obs_valid)

<ipython-input-39-3f307ab62df6> in calc_R_each(yhat, y)
     16     Rts = [] # R values as a function time t
     17     for t in range(T):
---> 18         Rt_training_example = [np.power(y[idx_training_example][t]-yhat[idx_training_example][t],2) for idx_training_example in range(m)]
     19         Rts.append( Rt_training_example)
     20     return Rts

IndexError: index 1 is out of bounds for axis 0 with size 1

In [ ]:

Extraneous


In [49]:
m = len(valid_predictions_np)
T = y_cleaned_obs_valid[0].shape[0]  # caution: this rebinds T, shadowing `theano.tensor as T` imported above
#[np.power(valid_predictions_np[idx][T-1]-y_cleaned_obs_valid[idx][T-1],2) for idx in range(m)]
mu = np.mean( [yvals.mean() for yvals in y_cleaned_obs_valid])

In [34]:
def calc_R_each( yhat,y):
    """
    @type yhat : Python list (of length m, the number of examples) of numpy arrays, each of dims. (T,1), where T is the number of predicted time steps
    @param yhat : predicted y values

    @type y : Python list (of length m) of numpy arrays, each of dims. (T,1)
    @param y : observed (target) y values
    """
    m = len(y)  # number of training examples at each time t
    assert m == len(yhat)

    # average each example over its T time steps
    yhat_Tavg = [yhat_val.mean() for yhat_val in yhat]
    y_Tavg    = [y_val.mean() for y_val in y]

    yhat_Tavg = np.array( yhat_Tavg)
    y_Tavg    = np.array( y_Tavg)

    mu = np.mean( y_Tavg )  # mean of the time-averaged targets

    # R = sqrt( sum (yhat_Tavg - y_Tavg)^2 / sum (y_Tavg - mu)^2 ), a normalized RMS error
    num = np.sum( np.power( yhat_Tavg - y_Tavg, 2) )
    dem = np.sum( np.power( y_Tavg - mu, 2) )
    R = num/dem
    R = np.sqrt( R )

    return R

In [28]:
R_test = calc_R_each( valid_predictions_np, y_cleaned_obs_valid)

In [30]:
np.sqrt( R_test )


Out[30]:
149.686

In [53]:
R[0]


Out[53]:
array([[ 1.67123961],
       [ 3.16267943],
       [ 0.01629504],
       [ 1.03388846],
       [ 1.34194529],
       [ 1.05554676],
       [ 1.11785388],
       [ 1.03495133],
       [ 1.0439266 ],
       [ 1.21566224]], dtype=float32)

In [55]:
y[0].mean()


Out[55]:
0.046060514

In [56]:
y_cleaned_obs_valid[0].mean()


Out[56]:
-0.0030814814

In [52]:
#T_0*train_to_all
def make_valid_set(timeseries_pd, T_f,T_valid):
    obs_valid = []
    for Deltat in range(T_valid):
        t = int( T_f + Deltat + 1 )
        single_t = timeseries_pd[ timeseries_pd['timestamp']== t ]
        obs_valid.append( single_t )
    return obs_valid

In [61]:
obs_valid_singles = make_valid_set(timeseries_pd,T_0*train_to_all,100 )

In [73]:
len( split_tseries( clean_tseries(obs_valid_singles[0]) )[1] )


Out[73]:
943

Subsequent training


In [51]:
MAX_ITERS = 50  
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )


theano.config.allow_gc =:  False
CPU times: user 2h 7min 8s, sys: 25min 2s, total: 2h 32min 11s
Wall time: 2h 32min 7s

In [57]:
MAX_ITERS = 100  
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )


theano.config.allow_gc =:  False
CPU times: user 4h 19min, sys: 59min 44s, total: 5h 18min 44s
Wall time: 5h 18min 36s

In [77]:
MAX_ITERS = 100  
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )


theano.config.allow_gc =:  False
CPU times: user 4h 11min 27s, sys: 44min 52s, total: 4h 56min 20s
Wall time: 4h 56min 12s

In [78]:
%time valid_predictions = Memblck.predict_on_lst_givens( cleaned_obs_valid,verbose=False)
# valid_predictions_np = [np.array(arr) for arr in valid_predictions]


CPU times: user 4min 48s, sys: 7.14 s, total: 4min 55s
Wall time: 4min 55s

In [83]:
MAX_ITERS = 100  
%time results_cost = Memblck.train_model_full( cleaned_obs_train, MAX_ITERS )


theano.config.allow_gc =:  False
CPU times: user 4h 7min 32s, sys: 31min 53s, total: 4h 39min 25s
Wall time: 4h 39min 18s

In [84]:
%time valid_predictions = Memblck.predict_on_lst_givens( cleaned_obs_valid,verbose=False)
valid_predictions_np = [np.array(arr) for arr in valid_predictions]


CPU times: user 4min 50s, sys: 8.09 s, total: 4min 58s
Wall time: 4min 58s

In [ ]: