In [1]:
%matplotlib inline

In [2]:
import theano


WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5105)

In [3]:
from theano import function, config, sandbox, shared 
import theano.tensor as T

In [4]:
print( theano.config.device )
print( theano.config.lib.cnmem)  # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device)# Print active device at when the GPU device is initialized.


gpu
0.8
True

In [5]:
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)


False


In [6]:
import sys
sys.path.append( '../ML' )

In [7]:
from DNN import DNN, Feedforward

In [8]:
import numpy as np
import pandas as pd

In [9]:
import os
print(os.getcwd())
os.listdir( os.getcwd() )


/home/topolo/PropD/MLgrabbag/kaggle
Out[9]:
['2017datascibowl',
 'HOG_SVM32.ipynb',
 'HOG_process_322.ipynb',
 'data_password.txt',
 'HOG_DNN.ipynb',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 'DatSciBow2017_DNN.ipynb',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'HOG_process.ipynb',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'LSTM_model201702282350.save',
 'HOG_process_32.ipynb',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'DatSciBow2017_SVM.ipynb',
 '__init__.py',
 'train.h5',
 'HOG_process2.ipynb',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'HOG_DNN_32.ipynb',
 'cleaning_dueSigmaFin.py']

In [10]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    f=file("./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec","rb")
    arr = np.load(f)
    f.close()
    return arr

In [11]:
def prepare_inputX(sub_name="stage1_HOG", ratio_of_train_to_total = 0.45,
                                                    ratio_valid_to_rest = 0.2):
    patients_stage1_feat = os.listdir('./2017datascibowl/'+sub_name)

    patients_stage1_feat = [id.replace("feat_vec","") for id in patients_stage1_feat]  # remove the suffix "feat_vec"
    
    # get y labels
    y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
    
    y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
    
    m = len(patients_stage1_feat)
    found_indices =[]
    for i in range(m):
        if patients_stage1_feat[i] in y_ids_found['id'].as_matrix():
            found_indices.append(i)

    patients_stage1_feat_found = [patients_stage1_feat[i] for i in found_indices]
    
    y_found=[]
    for i in range(len(patients_stage1_feat_found)):
        if (patients_stage1_feat_found[i] in y_ids_found['id'].as_matrix()):
            cancer_val = y_ids_found.loc[y_ids_found['id']==patients_stage1_feat_found[i]]['cancer'].as_matrix()
            y_found.append( cancer_val )
    y_found=np.array(y_found).flatten()
    
    assert (len(y_found)==len(patients_stage1_feat_found))


    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    numbertotest= numberofexamples - numberoftrainingexamples - numbertovalidate
    
    shuffledindices = np.random.permutation( numberofexamples)
    
    patients_train = [patients_stage1_feat_found[id] for id in shuffledindices[:numberoftrainingexamples]]
    patients_valid = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
    patients_test  = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples+numbertovalidate:]]

    y_train = y_found[shuffledindices[:numberoftrainingexamples]]
    y_valid = y_found[shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
    y_test  = y_found[shuffledindices[numberoftrainingexamples+numbertovalidate:]]
    
    patients_train_vecs = [load_feat_vec(id,sub_name) for id in patients_train]
    patients_train_vecs = np.array(patients_train_vecs)
    
    patients_valid_vecs = [load_feat_vec(id,sub_name) for id in patients_valid]
    patients_valid_vecs = np.array(patients_valid_vecs)
    
    patients_test_vecs = [load_feat_vec(id,sub_name) for id in patients_test]
    patients_test_vecs = np.array(patients_test_vecs)

    patient_ids = {"train":patients_train,"valid":patients_valid,"test":patients_test}
    ys = {"train":y_train,"valid":y_valid,"test":y_test}
    Xs = {"train":patients_train_vecs,"valid":patients_valid_vecs,"test":patients_test_vecs}
    
    return patient_ids, ys, Xs

In [14]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.20,0.125)

In [15]:
y_train_rep2 = np.copy(ys["train"])  # 2nd representation
y_train_rep2[y_train_rep2<=0]=-1

y_valid_rep2 = np.copy(ys["valid"])  # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1

y_test_rep2 = np.copy(ys["test"])  # 2nd representation
y_test_rep2[y_test_rep2<=0]=-1

In [16]:
d = Xs["train"][0].shape[0]
print(d)


1310728

In [15]:
Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [15]:
d/4096


Out[15]:
320

In [18]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)


Out[18]:
sigmoid.0

In [19]:
DNN_HOG = DNN(Ff,ys["train"] , X.get_value())

In [20]:
DNN_HOG.build_J_xent()


Out[20]:
GpuFromHost.0

In [21]:
DNN_HOG.build_update(alpha=0.0001)

In [23]:
%time DNN_HOG.train_model_full(max_iters=25000) # max_iters=3, CPU times: user 397 ms, sys: 729 ms, total: 1.13 s


theano.config.allow_gc =:  False
CPU times: user 45min 4s, sys: 1h 47min 38s, total: 2h 32min 42s
Wall time: 2h 32min 40s
Out[23]:
array([ 0.52877861,  0.52518553,  0.52337295, ...,  0.00820196,
        0.00820161,  0.00820128])

In [24]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-24-8001f7729bfc> in <module>()
----> 1 DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 728764768 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [31]:
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


Out[31]:
sigmoid.0

In [32]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [35]:
ys["valid"][:20]


Out[35]:
array([1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [38]:
( (yhat_valid>0.3).astype(theano.config.floatX)==ys["valid"]).mean()


Out[38]:
0.74090367993375084
Breakdown (step-by-step) of long-term persistence (saving work, saving files, I/O); I'll

In [29]:
import cPickle

In [26]:
params_val32 = [weight.get_value() for weight in DNN32.DNN_model.__get_state__()['params'] ]

In [35]:
print(len(params_val32))


4

In [31]:
f = open("./2017datascibowl/DNN32_L3_128.pkl",'wb')
for param in params_val32:
    cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL)  # Python problem cf.
    # https://github.com/numpy/numpy/issues/2396
f.close()


---------------------------------------------------------------------------
SystemError                               Traceback (most recent call last)
<ipython-input-31-486676351de4> in <module>()
      1 f = open("./2017datascibowl/DNN32_L3_128.pkl",'wb')
      2 for param in params_val32:
----> 3     cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL)
      4 f.close()

SystemError: error return without exception set

In [34]:
# above is a problem with Python, for large arrays
for param_idx in range(len(params_val32)):
    f = open("./2017datascibowl/DNN32_L3_128"+str(param_idx)+".pkl",'wb')
    np.save(f,params_val32[param_idx])
    f.close()

In [25]:
# In summary
params_val = [weight.get_value() for weight in DNN_HOG.DNN_model.__get_state__()['params'] ]

In [28]:
for param_idx in range(len(params_val)):
    f = open("./2017datascibowl/DNN_L3_4096"+str(param_idx)+".pkl",'wb')
    np.save(f,params_val[param_idx])
    f.close()

In [ ]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))

load back saved parameters


In [12]:
params_reloaded = []
no_params=6
filename_reload = "./2017datascibowl/DNN_L3_4096"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)+".pkl"
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [18]:
Ff_reloaded = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [19]:
Ff_reloaded.__set_state__(*params_reloaded)

In [20]:
Ff_reloaded.__get_state__()['params'][0].get_value()


Out[20]:
array([[-0.00528204,  0.00208968, -0.00106569, ...,  0.0046794 ,
         0.00650883,  0.00235038],
       [ 0.00603575,  0.00167849,  0.00331697, ..., -0.00813977,
        -0.0041525 , -0.00153932],
       [-0.00841335, -0.00048763,  0.00603139, ..., -0.0053605 ,
         0.00165126,  0.00742338],
       ..., 
       [ 0.00601871,  0.00825573, -0.00272351, ..., -0.0055783 ,
        -0.00625956,  0.00047734],
       [-0.00655436, -0.00158524, -0.00667119, ...,  0.00792592,
         0.00048231,  0.00058239],
       [-0.00739082, -0.00413787, -0.00332426, ...,  0.0024629 ,
        -0.0012366 , -0.00588966]], dtype=float32)

In [21]:
params_reloaded[0]


Out[21]:
array([[-0.00528204,  0.00208968, -0.00106569, ...,  0.0046794 ,
         0.00650883,  0.00235038],
       [ 0.00603575,  0.00167849,  0.00331697, ..., -0.00813977,
        -0.0041525 , -0.00153932],
       [-0.00841335, -0.00048763,  0.00603139, ..., -0.0053605 ,
         0.00165126,  0.00742338],
       ..., 
       [ 0.00601871,  0.00825573, -0.00272351, ..., -0.0055783 ,
        -0.00625956,  0.00047734],
       [-0.00655436, -0.00158524, -0.00667119, ...,  0.00792592,
         0.00048231,  0.00058239],
       [-0.00739082, -0.00413787, -0.00332426, ...,  0.0024629 ,
        -0.0012366 , -0.00588966]], dtype=float32)

Submissions out


In [12]:
stage2_sample_submission_csv = pd.read_csv("./2017datascibowl/stage2_sample_submission.csv")

In [13]:
sub_name="stage2_HOG"
patients_sample2_vecs = np.array( [load_feat_vec(id,sub_name) for id in stage2_sample_submission_csv['id'].as_matrix()] )

In [24]:
patients_sample2_vecs.shape


Out[24]:
(506, 1310728)

In [25]:
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))


Out[25]:
sigmoid.0

In [26]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [28]:
yhat_sample2.shape


Out[28]:
(506, 1)

In [27]:
sample2_out = pd.DataFrame(zip(stage2_sample_submission_csv['id'].as_matrix(),yhat_sample2.flatten()))
sample2_out.columns=["id","cancer"]

In [29]:
sample2_out.to_csv("./2017datascibowl/sample2submit02.csv",index=False)

Other models; $L=4$


In [39]:
d/8192


Out[39]:
160

In [13]:
#Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff = Feedforward(4,[d,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-0df326ac6104> in <module>()
      1 #Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
----> 2 Ff = Feedforward(4,[d,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

NameError: name 'd' is not defined

In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)


Out[16]:
sigmoid.0

In [17]:
DNN32 = DNN(Ff,ys["train"] , X.get_value())

In [18]:
DNN32.build_J_xent()


Out[18]:
GpuFromHost.0

In [19]:
DNN32.build_update(alpha=0.0001)

In [20]:
%time DNN32.train_model_full(max_iters=3) # max_iters=3, CPU times: user 465 ms, sys: 748 ms, total: 1.21 s


theano.config.allow_gc =:  False
CPU times: user 465 ms, sys: 748 ms, total: 1.21 s
Wall time: 1.21 s
Out[20]:
array([ 4.68310547,  4.00545692,  3.38988042])

In [21]:
%time DNN32.train_model_full(max_iters=25000) # CPU times: user 43min 55s, sys: 1h 48min 59s, total: 2h 32min 54s


theano.config.allow_gc =:  False
CPU times: user 43min 55s, sys: 1h 48min 59s, total: 2h 32min 54s
Wall time: 2h 32min 52s
Out[21]:
array([ 2.86399698,  2.44442129,  2.12585139, ...,  0.02420862,
        0.02420756,  0.0242065 ])

In [23]:
DNN32.save_parameters("./2017datascibowl/DNNHOG_L4_4096_")

In [22]:
# predictions on validation set
DNN32.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-22-9ecfa9107347> in <module>()
      1 # predictions on validation set
----> 2 DNN32.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 728764768 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [12]:
# reload the parameters after clearing the GPU RAM
params_reloaded = []
no_params=8
filename_reload = "./2017datascibowl/DNNHOG_L4_4096_"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [17]:
#d=1310728
Ff_reloaded = Feedforward(4,[d,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [18]:
Ff_reloaded.__set_state__(*params_reloaded)

In [20]:
# predictions on validation set after reload
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


Out[20]:
sigmoid.0

In [21]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [27]:
( (yhat_valid>0.70).astype(theano.config.floatX)==ys["valid"]).mean()


Out[27]:
0.69727239790901097

In [30]:
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))


Out[30]:
sigmoid.0

In [31]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [32]:
sample2_out = pd.DataFrame(zip(stage2_sample_submission_csv['id'].as_matrix(),yhat_sample2.flatten()))
sample2_out.columns=["id","cancer"]

In [33]:
sample2_out.to_csv("./2017datascibowl/sample2submit04_L4_4096_.csv",index=False)

$L=3$, train ratio$=0.225$


In [13]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.225,0.15)

In [14]:
y_train_rep2 = np.copy(ys["train"])  # 2nd representation
y_train_rep2[y_train_rep2<=0]=-1

y_valid_rep2 = np.copy(ys["valid"])  # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1

y_test_rep2 = np.copy(ys["test"])  # 2nd representation
y_test_rep2[y_test_rep2<=0]=-1

In [15]:
d = Xs["train"][0].shape[0]
print(d)


1310728

In [18]:
# Ff = Feedforward(3,[d,d/2048,d/4096,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid) # doesn't work at DNN class
Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)


Out[16]:
sigmoid.0

In [17]:
# I tried this, it didn't work, but then again, I had loaded and used the GPU RAM for something else
#Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [18]:
# I tried this, it didn't work, but then again, I had loaded and used the GPU RAM for something else
#X=theano.shared( Xs["train"].astype(theano.config.floatX))
#Ff.connect_through(X)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-18-9dac6754143d> in <module>()
----> 1 X=theano.shared( Xs["train"].astype(theano.config.floatX))
      2 Ff.connect_through(X)

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 1829776288 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [17]:
DNN_HOG = DNN(Ff,ys["train"] , X.get_value())

In [18]:
DNN_HOG.build_J_xent()


Out[18]:
GpuFromHost.0

In [19]:
DNN_HOG.build_update(alpha=0.0001)

In [20]:
%time DNN_HOG.train_model_full(max_iters=50000)


theano.config.allow_gc =:  False
CPU times: user 1h 44min 37s, sys: 4h 13min 2s, total: 5h 57min 40s
Wall time: 5h 57min 36s
Out[20]:
array([ 1.16972339,  0.67778563,  0.62330377, ...,  0.00545839,
        0.00545829,  0.00545818])

In [21]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG_L3_4096_")

In [22]:
# predictions on validation set
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-22-80ee6b244e21> in <module>()
      1 # predictions on validation set
----> 2 DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 849351744 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [15]:
d=1310728
# reload the parameters after clearing the GPU RAM
params_reloaded = []
no_params=6
filename_reload = "./2017datascibowl/DNNHOG_L3_4096_"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [16]:
Ff_reloaded = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)

In [17]:
Ff_reloaded.__set_state__(*params_reloaded)

In [16]:
# for submission
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))


Out[16]:
sigmoid.0

In [17]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [18]:
sample2_out = pd.DataFrame(zip(stage2_sample_submission_csv['id'].as_matrix(),yhat_sample2.flatten()))
sample2_out.columns=["id","cancer"]
sample2_out.to_csv("./2017datascibowl/sample2submit03_L3_4096_.csv",index=False)

In [18]:
# on validation set
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


Out[18]:
sigmoid.0

In [19]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [31]:
( (yhat_valid>0.80).astype(theano.config.floatX)==ys["valid"]).mean()


Out[31]:
0.75148605395518975

$L=5$, train ratio$=0.265$


In [12]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.265,0.15)

In [13]:
y_train_rep2 = np.copy(ys["train"])  # 2nd representation
y_train_rep2[y_train_rep2<=0]=-1

y_valid_rep2 = np.copy(ys["valid"])  # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1

y_test_rep2 = np.copy(ys["test"])  # 2nd representation
y_test_rep2[y_test_rep2<=0]=-1

d = Xs["train"][0].shape[0]
print(d)


1310728

In [14]:
print(d/8192)


160

In [15]:
# Ff = Feedforward(5,[d,d/8192,d/8192,d/16384,d/32768,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff = Feedforward(5,[d,d/4096,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
#Ff = Feedforward(5,[d,d/2048,d/2048,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid) # MemoryError: ('Error allocating 1939877440 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)


Out[16]:
sigmoid.0

In [17]:
DNN_HOG = DNN(Ff,ys["train"] )

In [18]:
DNN_HOG.build_J_xent()


Out[18]:
GpuFromHost.0

In [19]:
DNN_HOG.build_update(alpha=0.0001)

In [20]:
%time DNN_HOG.train_model_full(max_iters=2) # max_iters=2 CPU times: user 374 ms, sys: 679 ms, total: 1.05 s


theano.config.allow_gc =:  False
CPU times: user 374 ms, sys: 679 ms, total: 1.05 s
Wall time: 1.05 s
Out[20]:
array([ 1.13762689,  1.0588975 ])

In [21]:
%time DNN_HOG.train_model_full(max_iters=20000)


theano.config.allow_gc =:  False
CPU times: user 53min 36s, sys: 2h 5min 47s, total: 2h 59min 24s
Wall time: 2h 59min 22s
Out[21]:
array([ 0.9907918 ,  0.9327687 ,  0.88388103, ...,  0.06769657,
        0.06769135,  0.06768612])

In [22]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG_L5_4096_")

In [23]:
# predictions on validation set
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


Out[23]:
sigmoid.0

In [24]:
yhat_valid = theano.function(inputs=[],outputs=DNN_HOG.DNN_model._get_outer_layer_() )()

In [33]:
( (yhat_valid>0.80).astype(theano.config.floatX)==ys["valid"]).mean()


Out[33]:
0.75636700961376291

In [34]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["test"].astype(theano.config.floatX)))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-34-e8ab855b9ba5> in <module>()
----> 1 DNN_HOG.DNN_model.connect_through(theano.shared(Xs["test"].astype(theano.config.floatX)))

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 4577062176 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [42]:
# DNN_HOG.DNN_model.__get_state__()['params'][1].get_value();

In [43]:
# params_reloaded;

I will try to do a pseudo-"batch" gradient descent where I take another randomized training set out of the given input (this new, randomly shuffled training, validation, and test sets could include previous cases, but that's ok, because it's all chosen at random).


In [14]:
# reload the parameters after clearing the GPU RAM
params_reloaded = []
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096_"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [15]:
Ff_reloaded = Feedforward(5,[d,d/4096,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)

In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)

DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )

In [18]:
DNN_HOG_reloaded.build_J_xent()

DNN_HOG_reloaded.build_update(alpha=0.0001)

In [19]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) # max_iters=2 CPU times: user 374 ms, sys: 679 ms, total: 1.05 s


theano.config.allow_gc =:  False
CPU times: user 373 ms, sys: 732 ms, total: 1.11 s
Wall time: 1.11 s
Out[19]:
array([ 0.5220021 ,  0.51816344])

In [20]:
%time DNN_HOG_reloaded.train_model_full(max_iters=60000)


theano.config.allow_gc =:  False
CPU times: user 2h 33min 19s, sys: 6h 21min 59s, total: 8h 55min 18s
Wall time: 8h 55min 11s
Out[20]:
array([ 0.51710677,  0.51663357,  0.51626259, ...,  0.01149656,
        0.01149639,  0.01149621])

In [22]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG_L5_4096b_")

In [23]:
# predictions on validation set
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))


Out[23]:
sigmoid.0

In [24]:
yhat_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )()

In [29]:
( (yhat_valid>0.70).astype(theano.config.floatX)==ys["valid"]).mean()


Out[29]:
0.70365997638724909

In [14]:
# reload the parameters after clearing the GPU RAM
params_reloaded = []
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096b_"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [15]:
Ff_reloaded = Feedforward(5,[d,d/4096,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)

In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)

DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )

DNN_HOG_reloaded.build_J_xent()

DNN_HOG_reloaded.build_update(alpha=0.0001)

In [17]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) # CPU times: user 358 ms, sys: 713 ms, total: 1.07 s


theano.config.allow_gc =:  False
CPU times: user 358 ms, sys: 713 ms, total: 1.07 s
Wall time: 1.07 s
Out[17]:
array([ 0.3888883 ,  0.36364251])

In [18]:
%time DNN_HOG_reloaded.train_model_full(max_iters=20000)


theano.config.allow_gc =:  False
CPU times: user 52min 50s, sys: 2h 5min 2s, total: 2h 57min 53s
Wall time: 2h 57min 51s
Out[18]:
array([ 0.35451606,  0.35202163,  0.35122684, ...,  0.015284  ,
        0.01528342,  0.01528284])

In [19]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG_L5_4096c_")

In [20]:
# predictions on validation set
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))

yhat_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )()

In [23]:
( (yhat_valid>0.50).astype(theano.config.floatX)==ys["valid"]).mean()


Out[23]:
0.68215550683083148

In [27]:
# submission
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-27-45999b481a25> in <module>()
      1 # submission
----> 2 DNN_HOG_reloaded.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))

/home/topolo/PropD/Theano/theano/compile/sharedvalue.pyc in shared(value, name, strict, allow_downcast, **kwargs)
    266             try:
    267                 var = ctor(value, name=name, strict=strict,
--> 268                            allow_downcast=allow_downcast, **kwargs)
    269                 utils.add_tag_trace(var)
    270                 return var

/home/topolo/PropD/Theano/theano/sandbox/cuda/var.pyc in float32_shared_constructor(value, name, strict, allow_downcast, borrow, broadcastable, target)
    186         # type.broadcastable is guaranteed to be a tuple, which this next
    187         # function requires
--> 188         deviceval = type_support_filter(value, type.broadcastable, False, None)
    189 
    190     try:

MemoryError: ('Error allocating 2652913472 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")

In [14]:
# reload the parameters after clearing the GPU RAM
params_reloaded = []
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096c_"
for param_idx in range(no_params):
    name_reload = filename_reload+str(param_idx)
    param_val = np.load(name_reload)
    params_reloaded.append(param_val)

In [16]:
d = patients_sample2_vecs[0].shape[0]
print(d)


1310728

In [17]:
Ff_reloaded = Feedforward(5,[d,d/4096,d/4096,d/8192,d/16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)

In [18]:
# submission
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))


Out[18]:
sigmoid.0

In [19]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()

In [20]:
sample2_out = pd.DataFrame(zip(stage2_sample_submission_csv['id'].as_matrix(),yhat_sample2.flatten()))
sample2_out.columns=["id","cancer"]
sample2_out.to_csv("./2017datascibowl/sample2submit04_L5_4096_.csv",index=False)

In [23]:
Ff_reloaded.__get_state__()["params"][0].get_value()


Out[23]:
array([[-0.00527595,  0.0020921 , -0.00106458, ...,  0.00465846,
         0.00650882,  0.00234706],
       [ 0.00603453,  0.00167604,  0.00331816, ..., -0.0081442 ,
        -0.0041525 , -0.00154524],
       [-0.00841332, -0.00048159,  0.00603145, ..., -0.00529411,
         0.00164849,  0.0074848 ],
       ..., 
       [ 0.00601871,  0.00825573, -0.00272351, ..., -0.0055783 ,
        -0.00625956,  0.00047734],
       [-0.00657746, -0.00156745, -0.00672882, ...,  0.00791172,
         0.00049852,  0.00062179],
       [-0.00739082, -0.00413787, -0.00332426, ...,  0.0024629 ,
        -0.0012366 , -0.00588966]], dtype=float32)

In [24]:
params_reloaded[0]


Out[24]:
array([[-0.00527595,  0.0020921 , -0.00106458, ...,  0.00465846,
         0.00650882,  0.00234706],
       [ 0.00603453,  0.00167604,  0.00331816, ..., -0.0081442 ,
        -0.0041525 , -0.00154524],
       [-0.00841332, -0.00048159,  0.00603145, ..., -0.00529411,
         0.00164849,  0.0074848 ],
       ..., 
       [ 0.00601871,  0.00825573, -0.00272351, ..., -0.0055783 ,
        -0.00625956,  0.00047734],
       [-0.00657746, -0.00156745, -0.00672882, ...,  0.00791172,
         0.00049852,  0.00062179],
       [-0.00739082, -0.00413787, -0.00332426, ...,  0.0024629 ,
        -0.0012366 , -0.00588966]], dtype=float32)

In [ ]: