In [1]:
%matplotlib inline
In [2]:
import theano
In [3]:
from theano import function, config, sandbox, shared
import theano.tensor as T
In [4]:
# Show the Theano configuration values this session is running with.
print( theano.config.device )
print( theano.config.lib.cnmem) # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device)# Print the active device when the GPU device is initialized.
In [5]:
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)
In [6]:
import sys
sys.path.append( '../ML' )
In [7]:
from DNN import DNN, Feedforward
In [8]:
import numpy as np
import pandas as pd
In [9]:
import os
print(os.getcwd())
os.listdir( os.getcwd() )
Out[9]:
In [10]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load one patient's saved feature vector.

    Reads the file ./2017datascibowl/<sub_name>/<patientid>feat_vec with np.load.

    Parameters
    ----------
    patientid : str
        patient id; the suffix "feat_vec" is appended to form the file name
    sub_name : str
        sub-directory of ./2017datascibowl that holds the feature files

    Returns
    -------
    whatever np.load returns for that file (an array for files written with np.save)
    """
    path = "./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec"
    # open() works on Python 2 and 3 (file() is Python 2 only); the with-block closes
    # the handle even when np.load raises.
    with open(path,"rb") as f:
        arr = np.load(f)
    return arr
In [11]:
def prepare_inputX(sub_name="stage1_HOG", ratio_of_train_to_total = 0.45,
                   ratio_valid_to_rest = 0.2, seed=None):
    """Load the labelled feature vectors and split them at random into train/valid/test.

    Every entry of ./2017datascibowl/<sub_name> is treated as a patient file; the patient
    id is the file name with "feat_vec" removed.  Only patients that have a row in
    ./2017datascibowl/stage1_labels.csv (columns 'id' and 'cancer') are kept.

    Parameters
    ----------
    sub_name : str
        sub-directory of ./2017datascibowl holding the feature files
    ratio_of_train_to_total : float
        fraction of the labelled patients used for training (rounded down)
    ratio_valid_to_rest : float
        fraction of the remaining patients used for validation (rounded down);
        everything left over is the test set
    seed : int or None
        None (default) shuffles with NumPy's global random state; an int gives a
        reproducible split

    Returns
    -------
    patient_ids, ys, Xs : three dicts keyed by "train", "valid", "test"
        patient ids (lists), 'cancer' labels (1-d arrays) and the stacked feature
        vectors returned by load_feat_vec (arrays), all in the same shuffled order

    Raises
    ------
    ValueError
        if either ratio lies outside [0, 1]
    """
    for ratio_name, ratio in (("ratio_of_train_to_total", ratio_of_train_to_total),
                              ("ratio_valid_to_rest", ratio_valid_to_rest)):
        if not 0.0 <= ratio <= 1.0:
            raise ValueError("%s must lie in [0, 1], got %r" % (ratio_name, ratio))

    # os.listdir order is arbitrary; sorting makes a seeded split reproducible.
    patients_stage1_feat = sorted(fname.replace("feat_vec","")  # remove the suffix "feat_vec"
                                  for fname in os.listdir('./2017datascibowl/'+sub_name))

    # get y labels; one dict lookup per patient instead of rescanning the id column
    y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
    label_by_id = dict(zip(y_ids['id'].values, y_ids['cancer'].values))

    patients_stage1_feat_found = [pid for pid in patients_stage1_feat if pid in label_by_id]
    y_found = np.array([label_by_id[pid] for pid in patients_stage1_feat_found])

    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    end_valid = numberoftrainingexamples + numbertovalidate

    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffledindices = rng.permutation(numberofexamples)
    split_indices = {"train": shuffledindices[:numberoftrainingexamples],
                     "valid": shuffledindices[numberoftrainingexamples:end_valid],
                     "test": shuffledindices[end_valid:]}

    split_names = ("train", "valid", "test")
    patient_ids = dict((name, [patients_stage1_feat_found[i] for i in split_indices[name]])
                       for name in split_names)
    ys = dict((name, y_found[split_indices[name]]) for name in split_names)
    Xs = dict((name, np.array([load_feat_vec(pid,sub_name) for pid in patient_ids[name]]))
              for name in split_names)
    return patient_ids, ys, Xs
In [14]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.20,0.125)
In [15]:
# 2nd representation of the labels: every non-positive label becomes -1,
# positive labels are kept unchanged (new arrays; ys itself is not modified).
y_train_rep2, y_valid_rep2, y_test_rep2 = (
    np.where(ys[split] <= 0, -1, ys[split]) for split in ("train", "valid", "test")
)
In [16]:
d = Xs["train"][0].shape[0]
print(d)
In [15]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff = Feedforward(3,[d,d//4096,d//8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [15]:
d//4096  # integer width of the first hidden layer (floor division on Python 2 and 3)
Out[15]:
In [18]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)
Out[18]:
In [19]:
DNN_HOG = DNN(Ff,ys["train"] , X.get_value())
In [20]:
DNN_HOG.build_J_xent()
Out[20]:
In [21]:
DNN_HOG.build_update(alpha=0.0001)
In [23]:
%time DNN_HOG.train_model_full(max_iters=25000) # max_iters=3, CPU times: user 397 ms, sys: 729 ms, total: 1.13 s
Out[23]:
In [24]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
In [31]:
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
Out[31]:
In [32]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [35]:
ys["valid"][:20]
Out[35]:
In [38]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.3).astype(theano.config.floatX)==ys["valid"]).mean()
Out[38]:
In [29]:
import cPickle
In [26]:
params_val32 = [weight.get_value() for weight in DNN32.DNN_model.__get_state__()['params'] ]
In [35]:
print(len(params_val32))
In [31]:
# Pickle every parameter array, one after another, into a single file.
# The with-block closes the file even if a dump fails.
with open("./2017datascibowl/DNN32_L3_128.pkl",'wb') as f:
    for param in params_val32:
        cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL) # Python problem cf.
        # https://github.com/numpy/numpy/issues/2396
In [34]:
# above is a problem with Python, for large arrays
# so write each parameter to its own file with np.save instead
# (the files keep the ".pkl" name but hold NumPy's .npy format).
for param_idx in range(len(params_val32)):
    # with-block: the handle is closed even if np.save raises
    with open("./2017datascibowl/DNN32_L3_128"+str(param_idx)+".pkl",'wb') as f:
        np.save(f,params_val32[param_idx])
In [25]:
# In summary
params_val = [weight.get_value() for weight in DNN_HOG.DNN_model.__get_state__()['params'] ]
In [28]:
# Write each parameter array to its own file with np.save
# (".pkl" in the name, NumPy .npy format inside).
for param_idx in range(len(params_val)):
    # with-block: the handle is closed even if np.save raises
    with open("./2017datascibowl/DNN_L3_4096"+str(param_idx)+".pkl",'wb') as f:
        np.save(f,params_val[param_idx])
In [ ]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
In [12]:
# Read the saved parameter arrays back, in index order 0 .. no_params-1.
no_params=6
filename_reload = "./2017datascibowl/DNN_L3_4096"
params_reloaded = [np.load(filename_reload+str(param_idx)+".pkl")
                   for param_idx in range(no_params)]
In [18]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(3,[d,d//4096,d//8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [19]:
Ff_reloaded.__set_state__(*params_reloaded)
In [20]:
Ff_reloaded.__get_state__()['params'][0].get_value()
Out[20]:
In [21]:
params_reloaded[0]
Out[21]:
In [12]:
stage2_sample_submission_csv = pd.read_csv("./2017datascibowl/stage2_sample_submission.csv")
In [13]:
# Load the feature vector of every patient listed in the stage-2 sample submission,
# in the order of the csv's 'id' column.
sub_name="stage2_HOG"
# .values replaces the removed Series.as_matrix(); the loop variable is no longer
# called `id`, which shadowed the builtin (and leaked out of the comprehension on Python 2).
patients_sample2_vecs = np.array( [load_feat_vec(patient_id,sub_name)
                                   for patient_id in stage2_sample_submission_csv['id'].values] )
In [24]:
patients_sample2_vecs.shape
Out[24]:
In [25]:
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[25]:
In [26]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [28]:
yhat_sample2.shape
Out[28]:
In [27]:
# Submission table: one row per stage-2 patient id with its predicted value.
# Built from named columns, so a length mismatch between ids and predictions raises
# instead of being silently truncated by zip(); .values replaces the removed as_matrix().
sample2_out = pd.DataFrame({"id": stage2_sample_submission_csv['id'].values,
                            "cancer": yhat_sample2.flatten()},
                           columns=["id","cancer"])
In [29]:
sample2_out.to_csv("./2017datascibowl/sample2submit02.csv",index=False)
In [39]:
d//8192  # integer layer width (floor division on Python 2 and 3)
Out[39]:
In [13]:
#Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff = Feedforward(4,[d,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)
Out[16]:
In [17]:
DNN32 = DNN(Ff,ys["train"] , X.get_value())
In [18]:
DNN32.build_J_xent()
Out[18]:
In [19]:
DNN32.build_update(alpha=0.0001)
In [20]:
%time DNN32.train_model_full(max_iters=3) # max_iters=3, CPU times: user 465 ms, sys: 748 ms, total: 1.21 s
Out[20]:
In [21]:
%time DNN32.train_model_full(max_iters=25000) # CPU times: user 43min 55s, sys: 1h 48min 59s, total: 2h 32min 54s
Out[21]:
In [23]:
DNN32.save_parameters("./2017datascibowl/DNNHOG_L4_4096_")
In [22]:
# predictions on validation set
DNN32.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
In [12]:
# reload the parameters after clearing the GPU RAM
# (files <filename_reload>0 .. <filename_reload>7, read in index order)
no_params=8
filename_reload = "./2017datascibowl/DNNHOG_L4_4096_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [17]:
#d=1310728
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(4,[d,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [18]:
Ff_reloaded.__set_state__(*params_reloaded)
In [20]:
# predictions on validation set after reload
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
Out[20]:
In [21]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [27]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.70).astype(theano.config.floatX)==ys["valid"]).mean()
Out[27]:
In [30]:
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[30]:
In [31]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [32]:
# Submission table: one row per stage-2 patient id with its predicted value.
# Built from named columns, so a length mismatch between ids and predictions raises
# instead of being silently truncated by zip(); .values replaces the removed as_matrix().
sample2_out = pd.DataFrame({"id": stage2_sample_submission_csv['id'].values,
                            "cancer": yhat_sample2.flatten()},
                           columns=["id","cancer"])
In [33]:
sample2_out.to_csv("./2017datascibowl/sample2submit04_L4_4096_.csv",index=False)
In [13]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.225,0.15)
In [14]:
# 2nd representation of the labels: every non-positive label becomes -1,
# positive labels are kept unchanged (new arrays; ys itself is not modified).
y_train_rep2, y_valid_rep2, y_test_rep2 = (
    np.where(ys[split] <= 0, -1, ys[split]) for split in ("train", "valid", "test")
)
In [15]:
d = Xs["train"][0].shape[0]
print(d)
In [18]:
# Ff = Feedforward(3,[d,d/2048,d/4096,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid) # doesn't work at DNN class
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff = Feedforward(3,[d,d//4096,d//8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)
Out[16]:
In [17]:
# I tried this, it didn't work, but then again, I had loaded and used the GPU RAM for something else
#Ff = Feedforward(3,[d,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [18]:
# I tried this, it didn't work, but then again, I had loaded and used the GPU RAM for something else
#X=theano.shared( Xs["train"].astype(theano.config.floatX))
#Ff.connect_through(X)
In [17]:
DNN_HOG = DNN(Ff,ys["train"] , X.get_value())
In [18]:
DNN_HOG.build_J_xent()
Out[18]:
In [19]:
DNN_HOG.build_update(alpha=0.0001)
In [20]:
%time DNN_HOG.train_model_full(max_iters=50000)
Out[20]:
In [21]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG_L3_4096_")
In [22]:
# predictions on validation set
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
In [15]:
d=1310728
# reload the parameters after clearing the GPU RAM
# (files <filename_reload>0 .. <filename_reload>5, read in index order)
no_params=6
filename_reload = "./2017datascibowl/DNNHOG_L3_4096_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [16]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(3,[d,d//4096,d//8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
In [17]:
Ff_reloaded.__set_state__(*params_reloaded)
In [16]:
# for submission
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[16]:
In [17]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [18]:
# Submission table: one row per stage-2 patient id with its predicted value, written to csv.
# Built from named columns, so a length mismatch between ids and predictions raises
# instead of being silently truncated by zip(); .values replaces the removed as_matrix().
sample2_out = pd.DataFrame({"id": stage2_sample_submission_csv['id'].values,
                            "cancer": yhat_sample2.flatten()},
                           columns=["id","cancer"])
sample2_out.to_csv("./2017datascibowl/sample2submit03_L3_4096_.csv",index=False)
In [18]:
# on validation set
Ff_reloaded.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
Out[18]:
In [19]:
yhat_valid = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [31]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.80).astype(theano.config.floatX)==ys["valid"]).mean()
Out[31]:
In [12]:
patient_ids, ys,Xs=prepare_inputX("stage1_HOG",0.265,0.15)
In [13]:
# 2nd representation of the labels: every non-positive label becomes -1,
# positive labels are kept unchanged (new arrays; ys itself is not modified).
y_train_rep2, y_valid_rep2, y_test_rep2 = (
    np.where(ys[split] <= 0, -1, ys[split]) for split in ("train", "valid", "test")
)
d = Xs["train"][0].shape[0]
print(d)
In [14]:
print(d//8192)  # integer layer width (floor division on Python 2 and 3)
In [15]:
# Ff = Feedforward(5,[d,d/8192,d/8192,d/16384,d/32768,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff = Feedforward(5,[d,d//4096,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
#Ff = Feedforward(5,[d,d/2048,d/2048,d/4096,d/8192,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid) # MemoryError: ('Error allocating 1939877440 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).', "you might consider using 'theano.shared(..., borrow=True)'")
In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff.connect_through(X)
Out[16]:
In [17]:
DNN_HOG = DNN(Ff,ys["train"] )
In [18]:
DNN_HOG.build_J_xent()
Out[18]:
In [19]:
DNN_HOG.build_update(alpha=0.0001)
In [20]:
%time DNN_HOG.train_model_full(max_iters=2) # max_iters=2 CPU times: user 374 ms, sys: 679 ms, total: 1.05 s
Out[20]:
In [21]:
%time DNN_HOG.train_model_full(max_iters=20000)
Out[21]:
In [22]:
DNN_HOG.save_parameters("./2017datascibowl/DNNHOG_L5_4096_")
In [23]:
# predictions on validation set
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
Out[23]:
In [24]:
yhat_valid = theano.function(inputs=[],outputs=DNN_HOG.DNN_model._get_outer_layer_() )()
In [33]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.80).astype(theano.config.floatX)==ys["valid"]).mean()
Out[33]:
In [34]:
DNN_HOG.DNN_model.connect_through(theano.shared(Xs["test"].astype(theano.config.floatX)))
In [42]:
# DNN_HOG.DNN_model.__get_state__()['params'][1].get_value();
In [43]:
# params_reloaded;
I will try a pseudo-"batch" gradient descent: I draw another randomized training set from the given input and continue training on it. (These new, randomly shuffled training, validation, and test sets may include cases from the previous split, but that's OK, because they are all chosen at random.)
In [14]:
# reload the parameters after clearing the GPU RAM
# (files <filename_reload>0 .. <filename_reload>9, read in index order)
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [15]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(5,[d,d//4096,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)
DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )
In [18]:
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [19]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) # max_iters=2 CPU times: user 374 ms, sys: 679 ms, total: 1.05 s
Out[19]:
In [20]:
%time DNN_HOG_reloaded.train_model_full(max_iters=60000)
Out[20]:
In [22]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG_L5_4096b_")
In [23]:
# predictions on validation set
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
Out[23]:
In [24]:
yhat_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )()
In [29]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.70).astype(theano.config.floatX)==ys["valid"]).mean()
Out[29]:
In [14]:
# reload the parameters after clearing the GPU RAM
# (files <filename_reload>0 .. <filename_reload>9, read in index order)
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096b_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [15]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(5,[d,d//4096,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [16]:
X=theano.shared( Xs["train"].astype(theano.config.floatX))
Ff_reloaded.connect_through(X)
DNN_HOG_reloaded = DNN(Ff_reloaded,ys["train"] )
DNN_HOG_reloaded.build_J_xent()
DNN_HOG_reloaded.build_update(alpha=0.0001)
In [17]:
%time DNN_HOG_reloaded.train_model_full(max_iters=2) # CPU times: user 358 ms, sys: 713 ms, total: 1.07 s
Out[17]:
In [18]:
%time DNN_HOG_reloaded.train_model_full(max_iters=20000)
Out[18]:
In [19]:
DNN_HOG_reloaded.save_parameters("./2017datascibowl/DNNHOG_L5_4096c_")
In [20]:
# predictions on validation set
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared(Xs["valid"].astype(theano.config.floatX)))
yhat_valid = theano.function(inputs=[],outputs=DNN_HOG_reloaded.DNN_model._get_outer_layer_() )()
In [23]:
# Flatten the predictions before comparing: if the outer layer returns shape (m, 1),
# `== ys["valid"]` (shape (m,)) would broadcast to (m, m) and the mean would not be an accuracy.
( (yhat_valid.flatten()>0.50).astype(theano.config.floatX)==ys["valid"]).mean()
Out[23]:
In [27]:
# submission
DNN_HOG_reloaded.DNN_model.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
In [14]:
# reload the parameters after clearing the GPU RAM
# (files <filename_reload>0 .. <filename_reload>9, read in index order)
no_params=10
filename_reload = "./2017datascibowl/DNNHOG_L5_4096c_"
params_reloaded = [np.load(filename_reload+str(param_idx))
                   for param_idx in range(no_params)]
In [16]:
d = patients_sample2_vecs[0].shape[0]
print(d)
In [17]:
# `//` keeps the layer widths integers on Python 3 too (same result as int `/` on Python 2).
Ff_reloaded = Feedforward(5,[d,d//4096,d//4096,d//8192,d//16384,1],activation_fxn=T.nnet.sigmoid,psi_Lm1=T.nnet.sigmoid)
Ff_reloaded.__set_state__(*params_reloaded)
In [18]:
# submission
Ff_reloaded.connect_through(theano.shared( patients_sample2_vecs.astype(theano.config.floatX)))
Out[18]:
In [19]:
yhat_sample2 = theano.function(inputs=[],outputs=Ff_reloaded._get_outer_layer_() )()
In [20]:
# Submission table: one row per stage-2 patient id with its predicted value, written to csv.
# Built from named columns, so a length mismatch between ids and predictions raises
# instead of being silently truncated by zip(); .values replaces the removed as_matrix().
sample2_out = pd.DataFrame({"id": stage2_sample_submission_csv['id'].values,
                            "cancer": yhat_sample2.flatten()},
                           columns=["id","cancer"])
sample2_out.to_csv("./2017datascibowl/sample2submit04_L5_4096_.csv",index=False)
In [23]:
Ff_reloaded.__get_state__()["params"][0].get_value()
Out[23]:
In [24]:
params_reloaded[0]
Out[24]:
In [ ]: