In [1]:
%matplotlib inline
In [2]:
import theano
In [3]:
from theano import function, config, sandbox, shared
import theano.tensor as T
In [4]:
# Show the Theano settings this session is running with.
print( theano.config.device )
print( theano.config.lib.cnmem) # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device)# Flag: print the active device when the GPU device is initialized.
In [5]:
# Show the current values of two further Theano config flags.
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)
In [6]:
import sys
sys.path.append( '../ML' )
In [7]:
from DNN import DNN, Feedforward
In [8]:
import numpy as np
import pandas as pd
In [9]:
import os
# Report the working directory, then show its contents as the cell result.
# All data paths below are relative to this directory.
working_dir = os.getcwd()
print(working_dir)
os.listdir(working_dir)
Out[9]:
In [10]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load one patient's saved feature vector from disk.

    The file read is ``./2017datascibowl/<sub_name>/<patientid>feat_vec``,
    relative to the current working directory, and is parsed with ``np.load``.

    Parameters
    ----------
    patientid : str
        Patient identifier; the literal suffix ``feat_vec`` is appended to it
        to form the file name.
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` that holds the feature files.

    Returns
    -------
    The object ``np.load`` returns for that file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    path = "./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec"
    # `open` replaces the Python-2-only `file` builtin, and the `with` block
    # closes the handle even when np.load raises.
    with open(path, "rb") as f:
        return np.load(f)
In [11]:
def prepare_inputX(sub_name="stage1_HOG32", ratio_of_train_to_total = 0.4,
                   ratio_valid_to_rest = 0.2, seed=None):
    """Build a random train/valid/test split of the labelled feature vectors.

    Feature files are listed from ``./2017datascibowl/<sub_name>`` and labels
    are read from ``./2017datascibowl/stage1_labels.csv`` (columns ``id`` and
    ``cancer``). Only patients that have both a feature file and a label row
    are kept.

    Parameters
    ----------
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding ``<id>feat_vec`` files.
    ratio_of_train_to_total : float, optional
        Fraction of the kept patients that goes to the training split
        (truncated to an integer count).
    ratio_valid_to_rest : float, optional
        Fraction of the non-training patients that goes to the validation
        split (truncated); everything left over is the test split.
    seed : int or None, optional
        When given, the shuffle uses ``np.random.RandomState(seed)`` so the
        split is reproducible. ``None`` (default) keeps the original
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    (patient_ids, ys, Xs) : tuple of dict
        Each dict has the keys ``"train"``, ``"valid"`` and ``"test"``:
        lists of patient ids, label arrays, and arrays of the stacked
        feature vectors, all in the same shuffled order.

    Raises
    ------
    AssertionError
        If the number of label values differs from the number of kept
        patients (e.g. an id appears more than once in the labels file).
    """
    suffix = "feat_vec"
    # Strip only a trailing "feat_vec" and ignore directory entries that do
    # not carry it: load_feat_vec re-appends the suffix, so any other name
    # could never be loaded back.
    patients_stage1_feat = [name[:-len(suffix)]
                            for name in os.listdir('./2017datascibowl/'+sub_name)
                            if name.endswith(suffix)]

    # get y labels
    y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
    # One pass over the labels instead of rebuilding the id array for every
    # patient; `.values` replaces `as_matrix()`, which pandas 1.0 removed.
    cancer_by_id = {}
    for patient_id, cancer_val in zip(y_ids['id'].values, y_ids['cancer'].values):
        cancer_by_id.setdefault(patient_id, []).append(cancer_val)

    patients_stage1_feat_found = [patient_id for patient_id in patients_stage1_feat
                                  if patient_id in cancer_by_id]
    y_found = np.array([cancer_val
                        for patient_id in patients_stage1_feat_found
                        for cancer_val in cancer_by_id[patient_id]])
    assert (len(y_found)==len(patients_stage1_feat_found))

    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    valid_end = numberoftrainingexamples + numbertovalidate

    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffledindices = rng.permutation(numberofexamples)
    split_indices = {
        "train": shuffledindices[:numberoftrainingexamples],
        "valid": shuffledindices[numberoftrainingexamples:valid_end],
        "test": shuffledindices[valid_end:],
    }

    patient_ids, ys, Xs = {}, {}, {}
    # Fixed order so the files are read train -> valid -> test.
    for split in ("train", "valid", "test"):
        indices = split_indices[split]
        patient_ids[split] = [patients_stage1_feat_found[i] for i in indices]
        ys[split] = y_found[indices]
        Xs[split] = np.array([load_feat_vec(patient_id, sub_name)
                              for patient_id in patient_ids[split]])
    return patient_ids, ys, Xs
In [12]:
patient_ids32, ys32,Xs32=prepare_inputX("stage1_HOG32",0.375,0.2)
In [13]:
# 2nd label representation: copies of the labels in which every value <= 0 is replaced by -1.
def _as_rep2(labels):
    """Return a copy of `labels` with all entries <= 0 set to -1."""
    recoded = np.copy(labels)
    recoded[recoded<=0]=-1
    return recoded

y_train_rep2 = _as_rep2(ys32["train"])
y_valid_rep2 = _as_rep2(ys32["valid"])
y_test_rep2 = _as_rep2(ys32["test"])
In [14]:
# d: length of the first training feature vector; it is passed to Feedforward
# below as the first entry of the size list (presumably the input-layer width).
d = Xs32["train"][0].shape[0]
print(d)
In [15]:
# Feedforward comes from ../ML/DNN (not shown); the list is presumably the per-layer sizes.
# `//` gives the same value as `/` did under Python 2 and keeps the size an integer
# under Python 3, where plain `/` would produce a float.
Ff32 = Feedforward(2,[d,d//128,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [16]:
# Display the second entry of the size list used above; floor division so the
# value shown is the integer actually used, on Python 2 and 3 alike.
d//128
Out[16]:
In [17]:
# Hold the training features in a Theano shared variable (cast to floatX) and pass it
# to connect_through (from ../ML/DNN, not shown; presumably wires it in as the network input).
X32=theano.shared( Xs32["train"].astype(theano.config.floatX))
Ff32.connect_through(X32)
Out[17]:
In [16]:
DNN32 = DNN(Ff32,ys32["train"] , X32.get_value())
In [20]:
DNN32.build_J_xent()
Out[20]:
In [21]:
DNN32.build_update(alpha=0.0001)
In [23]:
%time DNN32.train_model_full(max_iters=10000) # max_iters=3, CPU times: user 320 ms, sys: 713 ms, total: 1.03 s
Out[23]:
In [24]:
DNN32.DNN_model.connect_through(theano.shared(Xs32["valid"].astype(theano.config.floatX)))
In [29]:
import cPickle
In [26]:
params_val32 = [weight.get_value() for weight in DNN32.DNN_model.__get_state__()['params'] ]
In [35]:
print(len(params_val32))
In [31]:
# Dump every parameter array, one pickle after another, into a single file.
# Python problem with large arrays, cf.
# https://github.com/numpy/numpy/issues/2396
# The `with` block closes the file even if one of the dumps raises.
with open("./2017datascibowl/DNN32_L3_128.pkl",'wb') as f:
    for param in params_val32:
        cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL)
In [34]:
# above is a problem with Python, for large arrays
# Workaround: write each parameter array to its own file with np.save.
# The files keep a ".pkl" name but are in NumPy's format, so read them with np.load.
# `with` closes each file even if np.save raises.
for param_idx, param in enumerate(params_val32):
    with open("./2017datascibowl/DNN32_L3_128"+str(param_idx)+".pkl",'wb') as f:
        np.save(f,param)
In [15]:
# Read the per-parameter files back with np.load, in index order.
no_params=4
filename_reload = "./2017datascibowl/DNN32_L3_128"
params_reloaded = [np.load(filename_reload+str(param_idx)+".pkl")
                   for param_idx in range(no_params)]
In [16]:
# Fresh network with the same size list as the one whose parameters were saved.
# `//` keeps the middle size an integer under Python 3 (same value as `/` under Python 2).
Ff32_reloaded = Feedforward(2,[d,d//128,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [17]:
Ff32_reloaded.__set_state__(*params_reloaded)
In [19]:
Ff32_reloaded.__get_state__()['params'][0].get_value()
Out[19]:
In [20]:
params_reloaded[0]
Out[20]:
In [27]:
stage2_sample_submission_csv = pd.read_csv("./2017datascibowl/stage2_sample_submission.csv")
In [28]:
# Load the stage-2 feature vector of every id in the sample submission, in the CSV's
# row order, and stack them into one array.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `patient_id` avoids
# shadowing the builtin `id`.
sub_name="stage2_HOG32"
patients_sample2_vecs = np.array( [load_feat_vec(patient_id,sub_name) for patient_id in stage2_sample_submission_csv['id'].values] )
In [17]:
# Inspect the stacked stage-2 features and take their second dimension as d.
sample2_shape = patients_sample2_vecs.shape
print(sample2_shape)
d=sample2_shape[1]
print(d)
In [24]:
Ff32_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[24]:
In [25]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff32_reloaded._get_outer_layer_() )()
In [28]:
yhat_sample2.shape
Out[28]:
In [29]:
# Two-column submission table: each id from the sample submission next to the
# corresponding flattened network output.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `list(...)` materialises the
# pairs because `zip` is a lazy iterator under Python 3.
sample2_out = pd.DataFrame(
    list(zip(stage2_sample_submission_csv['id'].values,yhat_sample2.flatten())),
    columns=["id","cancer"])
In [30]:
sample2_out.to_csv("./2017datascibowl/sample2submit01.csv",index=False)
In [16]:
# Display the third entry of the size list used for the next network; floor division
# so the value shown is an integer on Python 2 and 3 alike.
d//1024
Out[16]:
In [15]:
# Second architecture: size list [d, d//256, d//1024, 1], sigmoid everywhere.
# `//` keeps the sizes integers under Python 3 (same values as `/` under Python 2).
Ff32 = Feedforward(3,[d,d//256,d//1024,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [16]:
X32=theano.shared( Xs32["train"].astype(theano.config.floatX))
Ff32.connect_through(X32)
Out[16]:
In [17]:
DNN32 = DNN(Ff32,ys32["train"] , X32.get_value())
In [18]:
DNN32.build_J_xent()
Out[18]:
In [19]:
DNN32.build_update(alpha=0.0001)
In [20]:
%time DNN32.train_model_full(max_iters=3) # max_iters=3, CPU times: user 320 ms, sys: 713 ms, total: 1.03 s
Out[20]:
In [21]:
%time DNN32.train_model_full(max_iters=15000) # max_iters=3, CPU times: user 309 ms, sys: 559 ms, total: 868 ms
Out[21]:
In [24]:
DNN32.save_parameters("./2017datascibowl/DNNHOG32_L3_256_")
In [25]:
DNN32.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[25]:
In [26]:
yhat_sample2 = theano.function(inputs=[],outputs=DNN32.DNN_model._get_outer_layer_() )()
In [27]:
# Submission table for the second model: sample-submission ids next to the flattened
# network outputs.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `list(...)` materialises the
# pairs because `zip` is a lazy iterator under Python 3.
sample2_out = pd.DataFrame(
    list(zip(stage2_sample_submission_csv['id'].values,yhat_sample2.flatten())),
    columns=["id","cancer"])
In [28]:
sample2_out.to_csv("./2017datascibowl/sample2submit02.csv",index=False)
In [12]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG32",0.575,0.2)
In [14]:
# NOTE(review): this reads Xs32 (the split made earlier with ratio 0.375), not the Xs
# returned by the prepare_inputX(..., 0.575, 0.2) call in the cell just above, whose
# result is not used by the following cells. Confirm which split is intended.
d = Xs32["train"][0].shape[0]
print(d)
In [15]:
# Third architecture: size list [d, d//256, d//256, 1], sigmoid everywhere.
# `//` keeps the sizes integers under Python 3 (same values as `/` under Python 2).
Ff = Feedforward(3,[d,d//256,d//256,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [16]:
# Training features as a Theano shared variable (cast to floatX), passed to connect_through.
# NOTE(review): the data comes from Xs32, not from the Xs split created a few cells up — confirm.
X=theano.shared( Xs32["train"].astype(theano.config.floatX))
Ff.connect_through(X)
Out[16]:
In [19]:
# Wrap the network together with the ys32 training labels.
# NOTE(review): earlier cells call DNN(model, y, X.get_value()) with a third argument;
# here it is omitted. DNN is defined in ../ML/DNN (not shown) — confirm the two forms are equivalent.
DNN_HOG = DNN(Ff,ys32["train"] )
In [20]:
DNN_HOG.build_J_xent()
Out[20]:
In [21]:
DNN_HOG.build_update(alpha=0.0001)
In [22]:
%time DNN_HOG.train_model_full(max_iters=2) # max_iters=2 CPU times: user 374 ms, sys: 679 ms, total: 1.05 s
Out[22]:
In [23]:
%time DNN_HOG.train_model_full(max_iters=10000)
Out[23]:
In [25]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG32_L3_256_")
In [27]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs32["valid"].astype(theano.config.floatX))
DNN_HOG.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [36]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise: the network output is flattened before
# use elsewhere in this notebook, which suggests it is a column (m, 1); compared
# directly with the 1-D labels it would broadcast to (m, m) and the mean would not be
# the accuracy.
# Value recorded before this change: threshold 0.8 0.72493536340132148
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys32["valid"]).mean()
Out[36]:
In [37]:
%time DNN_HOG.train_model_full(max_iters=5000)
Out[37]:
In [38]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG32_L3_256b_")
In [39]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs32["valid"].astype(theano.config.floatX))
DNN_HOG.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [42]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.72493536340132148
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys32["valid"]).mean()
Out[42]:
In [12]:
# New random split with 59.5% of the labelled patients for training.
# NOTE(review): prepare_inputX shuffles with an unseeded np.random.permutation, so this
# split is unrelated to the one used when the parameters reloaded below were trained;
# examples held out earlier can end up in this training set.
patient_ids, ys,Xs=prepare_inputX("stage1_HOG32",0.595,0.2)
In [14]:
d = Xs["train"][0].shape[0]
print(d)
In [17]:
# reload the parameters after clearing the GPU RAM
# One np.load per saved parameter file, collected in index order.
no_params=6
filename_reload = "./2017datascibowl/DNNHOG32_L3_256b_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [19]:
# Rebuild the architecture and hand the reloaded values to __set_state__.
# `//` keeps the size entries integers under Python 3 (same values as `/` under Python 2).
Ff_reloaded = Feedforward(3,[d,d//256,d//256,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [22]:
# Set up training on the current split: training features as a shared variable passed
# to connect_through, a DNN wrapper around the network and training labels, then the
# cost and update builders (alpha=0.0001).
# build_J_xent / build_update are defined in ../ML/DNN (not shown); presumably a
# cross-entropy cost and a gradient step with learning rate alpha — TODO confirm.
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)
DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [23]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) # CPU times: user 342 ms, sys: 580 ms, total: 922 ms
Out[23]:
In [24]:
%time DNN_HOG_reloaded.train_model_full(max_iters=15000)
Out[24]:
In [25]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG32_L3_256c_")
In [26]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs["valid"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [29]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.76991150442477874
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[29]:
In [33]:
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
In [15]:
# reload the parameters after clearing the GPU RAM
# One np.load per saved parameter file, collected in index order.
no_params=6
filename_reload = "./2017datascibowl/DNNHOG32_L3_256c_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [18]:
# Rebuild the architecture and hand the reloaded values to __set_state__.
# `//` keeps the size entries integers under Python 3 (same values as `/` under Python 2).
Ff_reloaded = Feedforward(3,[d,d//256,d//256,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [19]:
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[19]:
In [22]:
# Evaluate the reloaded network's outer layer on the connected stage-2 features and
# pair each sample-submission id with its flattened output.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `list(...)` materialises the
# pairs because `zip` is a lazy iterator under Python 3.
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
sample2_out = pd.DataFrame(
    list(zip(stage2_sample_submission_csv['id'].values,yhat_sample2.flatten())),
    columns=["id","cancer"])
In [23]:
sample2_out.to_csv("./2017datascibowl/sample2submit05_L3_256.csv",index=False)
In [12]:
# reload the parameters after clearing the GPU RAM
# One np.load per saved parameter file, collected in index order.
no_params=6
filename_reload = "./2017datascibowl/DNNHOG32_L3_256b_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [13]:
# New random split with 63.5% of the labelled patients for training; d is the length of
# the first training feature vector.
# NOTE(review): prepare_inputX shuffles with an unseeded np.random.permutation, so this
# split is unrelated to the one the reloaded parameters were trained on; examples held
# out earlier can end up in this training set, which would inflate later valid/test scores.
patient_ids, ys,Xs=prepare_inputX("stage1_HOG32",0.635,0.2) # 0.615
d = Xs["train"][0].shape[0]
print(d)
In [15]:
# Rebuild the architecture and hand the reloaded values to __set_state__.
# `//` keeps the size entries integers under Python 3 (same values as `/` under Python 2).
Ff_reloaded = Feedforward(3,[d,d//256,d//256,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [16]:
# Set up training on the current split: training features as a shared variable passed
# to connect_through, a DNN wrapper around the network and training labels, then the
# cost and update builders (alpha=0.0001).
# build_J_xent / build_update are defined in ../ML/DNN (not shown); presumably a
# cross-entropy cost and a gradient step with learning rate alpha — TODO confirm.
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)
DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [17]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) #
Out[17]:
In [18]:
%time DNN_HOG_reloaded.train_model_full(max_iters=10000)
Out[18]:
In [19]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG32_L3_256d_")
In [20]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs["valid"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [23]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.7289719626168224
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[23]:
In [24]:
# predictions on test set
# Swap the test features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
test_inputs = theano.shared(Xs["test"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(test_inputs)
predict_test = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_test = predict_test()
In [27]:
# Fraction of test examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.72764466168894437
( (yhat_test.ravel()>0.50).astype(theano.config.floatX)==ys["test"]).mean()
Out[27]:
In [21]:
# reload the parameters after clearing the GPU RAM
# One np.load per saved parameter file, collected in index order.
# NOTE(review): the next cell trains DNN_HOG_reloaded directly; no __set_state__ call
# applies these reloaded values first — confirm whether they are meant to be used.
no_params=6
filename_reload = "./2017datascibowl/DNNHOG32_L3_256d_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [24]:
%time DNN_HOG_reloaded.train_model_full(max_iters=5000)
Out[24]:
In [25]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG32_L3_256e_")
In [26]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs["valid"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [28]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# The value recorded before this change (threshold 0.8 0.74990388312187617) equals
# 7802/102**2, which looks like a mean over an m-by-m array rather than m examples.
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[28]:
In [33]:
# Sanity check: compare the shared variable X with the current training split,
# shapes first and then the first three rows of each.
shared_values = X.get_value()
train_values = Xs["train"]
print(shared_values.shape)
print(train_values.shape)
print(shared_values[:3])
print(train_values[:3])
In [34]:
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["train"].astype(theano.config.floatX)))
Out[34]:
In [35]:
# Rebuild the cost and the update step after the model input was switched back to the
# training features in the previous cell (the validation features were connected before that).
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [36]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2000)
Out[36]:
In [37]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG32_L3_256f_")
In [38]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs["valid"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [41]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.74990388312187617
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[41]:
In [43]:
# Switch the model input back to the training features (the validation features were
# connected for the predictions above), then rebuild the cost and the update step.
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["train"].astype(theano.config.floatX)))
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [12]:
# reload the parameters after clearing the GPU RAM
# One np.load per saved parameter file, collected in index order.
no_params=6
filename_reload = "./2017datascibowl/DNNHOG32_L3_256f_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [13]:
# New random split with 65.5% of the labelled patients for training; d is the length of
# the first training feature vector.
# NOTE(review): prepare_inputX shuffles with an unseeded np.random.permutation, so this
# split is unrelated to the one the reloaded parameters were trained on; examples held
# out earlier can end up in this training set, which would inflate later valid/test scores.
patient_ids, ys,Xs=prepare_inputX("stage1_HOG32",0.655,0.2) # 0.615, 0.635
d = Xs["train"][0].shape[0]
print(d)
In [14]:
# Rebuild the architecture and hand the reloaded values to __set_state__.
# `//` keeps the size entries integers under Python 3 (same values as `/` under Python 2).
Ff_reloaded = Feedforward(3,[d,d//256,d//256,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [15]:
# Set up training on the current split: training features as a shared variable passed
# to connect_through, a DNN wrapper around the network and training labels, then the
# cost and update builders (alpha=0.0001).
# build_J_xent / build_update are defined in ../ML/DNN (not shown); presumably a
# cross-entropy cost and a gradient step with learning rate alpha — TODO confirm.
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)
DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [16]:
%time DNN_HOG_reloaded.train_model_full(max_iters=1500)
Out[16]:
In [17]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG32_L3_256g_")
In [18]:
# predictions on validation set
# Swap the validation features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
valid_inputs = theano.shared(Xs["valid"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(valid_inputs)
predict_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_valid = predict_valid()
In [21]:
# Fraction of validation examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.72916666666666663
( (yhat_valid.ravel()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[21]:
In [22]:
# predictions on test set
# Swap the test features in as the model input, then compile and evaluate a
# no-argument Theano function on the model's outer layer.
test_inputs = theano.shared(Xs["test"].astype(theano.config.floatX))
DNN_HOG_reloaded.DNN_model.connect_through(test_inputs)
predict_test = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )
yhat_test = predict_test()
In [25]:
# Fraction of test examples whose thresholded output (> 0.50) equals the label.
# ravel() keeps the comparison element-wise; a column-shaped (m, 1) output compared with
# the 1-D labels would broadcast to (m, m) and the mean would not be the accuracy.
# Value recorded before this change: threshold 0.8 0.74741603801444334
( (yhat_test.ravel()>0.50).astype(theano.config.floatX)==ys["test"]).mean()
Out[25]:
In [29]:
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[29]:
In [30]:
# Evaluate the model's outer layer on the connected stage-2 features and pair each
# sample-submission id with its flattened output.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `list(...)` materialises the
# pairs because `zip` is a lazy iterator under Python 3.
yhat_sample2 = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )()
sample2_out = pd.DataFrame(
    list(zip(stage2_sample_submission_csv['id'].values,yhat_sample2.flatten())),
    columns=["id","cancer"])
In [31]:
sample2_out.to_csv("./2017datascibowl/sample2submit06_L3_256_066.csv",index=False)
In [42]:
# Sanity check: first ten values held by the trainer's `y` shared variable next to the
# first ten training labels of the current split.
print(DNN_HOG_reloaded.y.get_value()[:10])
print(ys["train"][:10])
In [22]:
# Show the reloaded parameter arrays at indices 0 and 2.
for param_idx in (0, 2):
    print(params_reloaded[param_idx])
In [23]:
# Show the model's current parameter values at indices 0 and 2, for comparison with
# the reloaded arrays.
model_params = DNN_HOG_reloaded.DNN_model.__get_state__()['params']
print( model_params[0].get_value() )
print( model_params[2].get_value() )
In [ ]: