In [1]:
%matplotlib inline
In [2]:
import theano
In [3]:
from theano import function, config, sandbox, shared
import theano.tensor as T
In [4]:
print( theano.config.device )
print( theano.config.lib.cnmem) # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device) # Print the active device when the GPU device is initialized.
In [5]:
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)
In [6]:
import sys
sys.path.append( '../ML' )
In [7]:
from SVM import SVM, SVM_serial, SVM_parallel
In [8]:
import numpy as np
import pandas as pd
In [9]:
import os
os.getcwd()
os.listdir( os.getcwd() )
Out[9]:
In [9]:
patients_stage1_feat = os.listdir('./2017datascibowl/stage1_feat')
print(len(patients_stage1_feat))
In [10]:
patients_stage1_feat = [patientname.replace("feat_vec","") for patientname in patients_stage1_feat]
In [10]:
patients_stage1_feat_lowres = os.listdir('./2017datascibowl/stage1_feat_lowres')
print(len(patients_stage1_feat_lowres))
In [11]:
patients_stage1_feat_lowres = [id.replace("feat_vec","") for id in patients_stage1_feat_lowres]
In [12]:
y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
print(len(y_ids))
In [13]:
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
print(len(y_ids_found))
In [14]:
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat_lowres)]
print(len(y_ids_found))
In [15]:
m = len(patients_stage1_feat)
found_indices =[]
for i in range(m):
    if patients_stage1_feat[i] in y_ids_found['id'].as_matrix():
        found_indices.append(i)
patients_stage1_feat_found = [patients_stage1_feat[i] for i in found_indices]
print(len(patients_stage1_feat_found))
In [15]:
m = len(patients_stage1_feat_lowres)
found_indices =[]
for i in range(m):
    if patients_stage1_feat_lowres[i] in y_ids_found['id'].as_matrix():
        found_indices.append(i)
patients_stage1_lowres_found = [patients_stage1_feat_lowres[i] for i in found_indices]
print(len(patients_stage1_lowres_found))
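Note: the membership tests above scan y_ids_found['id'].as_matrix() once per patient; an equivalent, faster set-based filter (a sketch using the same variables defined above) would be:
In [ ]:
# equivalent set-based filtering; should reproduce patients_stage1_lowres_found
ids_with_labels = set(y_ids_found['id'])
patients_stage1_lowres_found_alt = [p for p in patients_stage1_feat_lowres if p in ids_with_labels]
assert patients_stage1_lowres_found_alt == patients_stage1_lowres_found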
In [16]:
y_found=[]
for i in range(len(patients_stage1_feat_found)):
    if (patients_stage1_feat_found[i] in y_ids_found['id'].as_matrix()):
        cancer_val = y_ids_found.loc[y_ids_found['id']==patients_stage1_feat_found[i]]['cancer'].as_matrix()
        y_found.append( cancer_val )
y_found=np.array(y_found).flatten()
In [16]:
y_found=[]
for i in range(len(patients_stage1_lowres_found)):
    if (patients_stage1_lowres_found[i] in y_ids_found['id'].as_matrix()):
        cancer_val = y_ids_found.loc[y_ids_found['id']==patients_stage1_lowres_found[i]]['cancer'].as_matrix()
        y_found.append( cancer_val )
y_found=np.array(y_found).flatten()
In [17]:
# this should hold, since the label array and the found-patient list now index the same patients
len(y_found)==len(patients_stage1_feat_found)
Out[17]:
In [17]:
len(y_found)==len(patients_stage1_lowres_found)
Out[17]:
In [38]:
patients_stage1_feat_found;
In [18]:
patients_stage1_feat_found = patients_stage1_lowres_found
In [43]:
ratio_of_train_to_total = 0.2
ratio_valid_to_rest = 0.2
numberofexamples = len(patients_stage1_feat_found)
numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
numbertotest= numberofexamples - numberoftrainingexamples - numbertovalidate
print(numberofexamples);print(numbertotest);print(numberoftrainingexamples);print(numbertovalidate)
In [44]:
shuffledindices = np.random.permutation( numberofexamples)
In [45]:
#patients_train = patients_stage1_feat[shuffledindices[:numberoftrainingexamples]]
#patients_valid = patients_stage1_feat[shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
#patients_test = patients_stage1_feat[shuffledindices[numberoftrainingexamples+numbertovalidate:]]
#patients_train = [patients_stage1_feat[id] for id in shuffledindices[:numberoftrainingexamples]]
#patients_valid = [patients_stage1_feat[id] for id in shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
#patients_test = [patients_stage1_feat[id] for id in shuffledindices[numberoftrainingexamples+numbertovalidate:]]
patients_train = [patients_stage1_feat_found[id] for id in shuffledindices[:numberoftrainingexamples]]
patients_valid = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
patients_test = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples+numbertovalidate:]]
y_train = y_found[shuffledindices[:numberoftrainingexamples]]
y_valid = y_found[shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
y_test = y_found[shuffledindices[numberoftrainingexamples+numbertovalidate:]]
Indeed, the label array and the patient id list now index the same patients, so the same shuffled indices can be used to split both into train, validation, and test sets.
In [46]:
# sanity check
y_ids.loc[y_ids['id']== patients_train[2]]
Out[46]:
In [47]:
# sanity check
y_train[2]
Out[47]:
In [48]:
#sanity check
for i in range(10,20):
    print(y_ids.loc[y_ids['id']== patients_train[i]]['cancer'].as_matrix().flatten() == y_train[i])
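One more sanity check in the same spirit: since shuffledindices is a permutation, the three splits should be pairwise disjoint and together cover every found patient; a short sketch:
In [ ]:
# the splits should partition patients_stage1_feat_found: pairwise disjoint and exhaustive
train_set, valid_set, test_set = set(patients_train), set(patients_valid), set(patients_test)
print( len(train_set & valid_set), len(train_set & test_set), len(valid_set & test_set) ) # expect 0 0 0
print( len(train_set | valid_set | test_set) == len(patients_stage1_feat_found) )         # expect True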
In [25]:
def load_feat_vec(patientid):
    # load the saved numpy feature vector for a given patient id
    with open("./2017datascibowl/stage1_feat/"+patientid+"feat_vec","rb") as f:
        arr = np.load(f)
    return arr
In [25]:
%time patients_train_vecs = [load_feat_vec(id) for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)
In [52]:
%time patients_valid_vecs = [load_feat_vec(id) for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)
In [26]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    # load the saved numpy feature vector for a given patient id from the given subdirectory
    with open("./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec","rb") as f:
        arr = np.load(f)
    return arr
In [49]:
%time patients_train_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)
In [50]:
%time patients_valid_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)
In [54]:
y_train;
In [52]:
y_train_rep2 = np.copy(y_train) # 2nd representation: classes as {-1,1} instead of {0,1}
y_train_rep2[y_train_rep2<=0]=-1
In [55]:
y_train_rep2;
In [56]:
y_train;
In [57]:
y_valid_rep2 = np.copy(y_valid) # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1
In [58]:
SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),1.0,1.0,0.001) # C=1.0,sigma=1.0, alpha=0.001
In [59]:
SVM_stage1.build_W();
SVM_stage1.build_update();
In [60]:
%time SVM_stage1.train_model_full(100) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s
Out[60]:
In [62]:
SVM_stage1.build_b()
Out[62]:
In [63]:
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)
In [64]:
print(np.sign(yhat_valid[0]).shape)
np.sign(yhat_valid[0])
Out[64]:
In [41]:
print(y_valid_rep2.shape)
y_valid_rep2
Out[41]:
In [65]:
(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))
Out[65]:
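For context, the stage 1 labels are imbalanced (cancer == 1 is the minority class), so the validation accuracy should be compared against the trivial always-negative baseline; a minimal sketch:
In [ ]:
# accuracy of always predicting the majority class (-1, i.e. no cancer) on the validation labels
majority_baseline = (y_valid_rep2 == -1).sum()/float(len(y_valid_rep2))
print(majority_baseline)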
In [59]:
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs[0:2])
In [66]:
C_trial = np.array([0.01,0.05,0.1,0.5,1.,5.,10.,50.,100,500])
sigma_trial=np.array([0.1,0.5,1.,5.,10.])
In [67]:
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)
In [73]:
C_trial = np.array([0.01,0.1,1.,10.,100,500])
sigma_trial=np.array([0.1,1.,10.])
In [74]:
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)
In [75]:
accuracy_score = np.zeros(Csigma_mesh[0].shape)
In [78]:
for i in range(len(sigma_trial)):
    for j in range(len(C_trial)):
        C_temp = Csigma_mesh[0][i][j]
        sigma_temp = Csigma_mesh[1][i][j]
        SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),C_temp,sigma_temp,0.001)
        SVM_stage1.build_W();
        SVM_stage1.build_update();
        SVM_stage1.train_model_full(50)
        SVM_stage1.build_b()
        yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)
        accuracy_score_temp=(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))
        print(C_temp,sigma_temp,accuracy_score_temp)
        accuracy_score[i][j] = accuracy_score_temp
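Once the grid has been filled, the best (C, sigma) pair can be read off accuracy_score; a short sketch (rows of the meshgrid are indexed by sigma and columns by C, as in the loop above):
In [ ]:
# index of the best validation accuracy on the (sigma, C) grid
best_i, best_j = np.unravel_index(np.argmax(accuracy_score), accuracy_score.shape)
print( Csigma_mesh[0][best_i][best_j], Csigma_mesh[1][best_i][best_j], accuracy_score[best_i][best_j] ) # best C, best sigma, accuracy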
In [11]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    # load the saved numpy feature vector for a given patient id from the given subdirectory
    with open("./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec","rb") as f:
        arr = np.load(f)
    return arr
In [12]:
def prepare_inputX(sub_name="stage1_feat_lowres64", ratio_of_train_to_total = 0.4,
                   ratio_valid_to_rest = 0.2):
    patients_stage1_feat = os.listdir('./2017datascibowl/'+sub_name)
    patients_stage1_feat = [id.replace("feat_vec","") for id in patients_stage1_feat] # remove the suffix "feat_vec"
    # get y labels
    y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
    y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
    m = len(patients_stage1_feat)
    found_indices =[]
    for i in range(m):
        if patients_stage1_feat[i] in y_ids_found['id'].as_matrix():
            found_indices.append(i)
    patients_stage1_feat_found = [patients_stage1_feat[i] for i in found_indices]
    y_found=[]
    for i in range(len(patients_stage1_feat_found)):
        if (patients_stage1_feat_found[i] in y_ids_found['id'].as_matrix()):
            cancer_val = y_ids_found.loc[y_ids_found['id']==patients_stage1_feat_found[i]]['cancer'].as_matrix()
            y_found.append( cancer_val )
    y_found=np.array(y_found).flatten()
    assert (len(y_found)==len(patients_stage1_feat_found))
    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    numbertotest= numberofexamples - numberoftrainingexamples - numbertovalidate
    shuffledindices = np.random.permutation( numberofexamples)
    patients_train = [patients_stage1_feat_found[id] for id in shuffledindices[:numberoftrainingexamples]]
    patients_valid = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
    patients_test = [patients_stage1_feat_found[id] for id in shuffledindices[numberoftrainingexamples+numbertovalidate:]]
    y_train = y_found[shuffledindices[:numberoftrainingexamples]]
    y_valid = y_found[shuffledindices[numberoftrainingexamples:numberoftrainingexamples+numbertovalidate]]
    y_test = y_found[shuffledindices[numberoftrainingexamples+numbertovalidate:]]
    patients_train_vecs = [load_feat_vec(id,sub_name) for id in patients_train]
    patients_train_vecs = np.array(patients_train_vecs)
    patients_valid_vecs = [load_feat_vec(id,sub_name) for id in patients_valid]
    patients_valid_vecs = np.array(patients_valid_vecs)
    patients_test_vecs = [load_feat_vec(id,sub_name) for id in patients_test]
    patients_test_vecs = np.array(patients_test_vecs)
    patient_ids = {"train":patients_train,"valid":patients_valid,"test":patients_test}
    ys = {"train":y_train,"valid":y_valid,"test":y_test}
    Xs = {"train":patients_train_vecs,"valid":patients_valid_vecs,"test":patients_test_vecs}
    return patient_ids, ys, Xs
In [98]:
patient_ids64, ys64,Xs64=prepare_inputX("stage1_feat_lowres64",0.2,0.2)
In [99]:
y_train_rep2 = np.copy(ys64["train"]) # 2nd representation
y_train_rep2[y_train_rep2<=0]=-1
y_valid_rep2 = np.copy(ys64["valid"]) # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1
y_test_rep2 = np.copy(ys64["test"]) # 2nd representation
y_test_rep2[y_test_rep2<=0]=-1
In [100]:
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]
In [101]:
SVM_stage1 = SVM_parallel(Xs64["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005) # C=1.0, sigma=1.0, alpha=0.0005
SVM_stage1.build_W();
SVM_stage1.build_update();
In [102]:
%time SVM_stage1.train_model_full(3) # only 3 training iterations here (earlier, 100 iterations took user 10min 1s, sys: 16min 54s, total: 26min 56s)
In [361]:
patient_ids16, ys16,Xs16=prepare_inputX("stage1_feat_lowres",0.55,0.3)
In [104]:
y_train_rep2 = np.copy(ys16["train"]) # 2nd representation
y_train_rep2[y_train_rep2<=0]=-1
y_valid_rep2 = np.copy(ys16["valid"]) # 2nd representation
y_valid_rep2[y_valid_rep2<=0]=-1
y_test_rep2 = np.copy(ys16["test"]) # 2nd representation
y_test_rep2[y_test_rep2<=0]=-1
In [105]:
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]
In [116]:
accuracy_scores=np.zeros((len(C_trial),len(sigma_trial)) )
In [106]:
SVM_stage1 = SVM_parallel(Xs16["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005) # C=1.0, sigma=1.0, alpha=0.0005
SVM_stage1.build_W();
SVM_stage1.build_update();
In [112]:
%time SVM_stage1.train_model_full(200) # earlier, training iterations=3 gave CPU times: user 3min 33s, sys: 2min 34s, total: 6min 8s
Out[112]:
In [113]:
SVM_stage1.build_b()
Out[113]:
In [114]:
yhat_valid = SVM_stage1.make_predictions_parallel( Xs16["valid"] )
In [115]:
accuracy_score_temp=(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))
print(accuracy_score_temp)
In [118]:
accuracy_scores[1][1]=accuracy_score_temp
In [119]:
%time yhat_test = SVM_stage1.make_predictions_parallel( Xs16["test"] )
In [120]:
accuracy_score_temp_test=(np.sign(yhat_test[0]) == y_test_rep2).sum()/float(len(y_test_rep2))
print(len(y_test_rep2))
print(accuracy_score_temp_test)
In [307]:
Xs32["train"][0].shape
Out[307]:
In [13]:
patient_ids32, ys32,Xs32=prepare_inputX("stage1_feat_lowres32",0.275,0.25) #0.275,0.25 works,0.30,0.25 works
In [14]:
y32_train_rep2 = np.copy(ys32["train"]) # 2nd representation
y32_train_rep2[y32_train_rep2<=0]=-1
y32_valid_rep2 = np.copy(ys32["valid"]) # 2nd representation
y32_valid_rep2[y32_valid_rep2<=0]=-1
y32_test_rep2 = np.copy(ys32["test"]) # 2nd representation
y32_test_rep2[y32_test_rep2<=0]=-1
In [15]:
C32_trial=[0.1,1.0,10.,200.]
sigma32_trial=[0.1,1.0,10.]
In [16]:
C32_trial[3]
Out[16]:
In [17]:
accuracy_scores32=np.zeros((len(C32_trial),len(sigma32_trial)) )
In [18]:
SVM_stage1_32 = SVM_parallel(Xs32["train"],y32_train_rep2,len(y32_train_rep2),
                             C32_trial[3],sigma32_trial[1],0.0005)
SVM_stage1_32.build_W();
SVM_stage1_32.build_update();
In [ ]:
%time SVM_stage1_32.train_model_full(20) # training_iterations=2,CPU times: user 1min 54s, sys: 2min 34s, total: 4min 29s
In [126]:
patients_stage1_ids = os.listdir('./2017datascibowl/stage1_feat_lowres')
patients_stage1_ids = [id.replace("feat_vec","") for id in patients_stage1_ids] # remove the suffix "feat_vec"
print(len(patients_stage1_ids))
In [128]:
patients_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_stage1_ids]
patients_vecs = np.array(patients_vecs)
In [154]:
%time yhat = SVM_stage1.make_predictions_parallel( patients_vecs )
In [155]:
yhat_rep2 = np.copy(yhat[0])        # raw SVM decision values
yhat_rep2 = np.sign( yhat_rep2);    # representation 2: binary classes as {-1,1}
yhat_rep1 = np.copy(yhat_rep2)
np.place(yhat_rep1,yhat_rep1<0.,0.) # representation 1: binary classes as {0,1}
In [158]:
pd.DataFrame(yhat_rep1).describe();
Out[158]:
In [206]:
Prattscaling_results = SVM_stage1.make_prob_Pratt(yhat_rep1)
In [207]:
pd.DataFrame(Prattscaling_results[0]).describe()
Out[207]:
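make_prob_Pratt presumably refers to Platt scaling, which maps SVM outputs f to probabilities through the sigmoid P(y=1|f) = 1/(1 + exp(A*f + B)), with A and B fit by maximum likelihood on held-out data. A generic illustration of that fit is sketched below, using scipy on the validation decision values yhat_valid[0] and labels y_valid_rep2; the helper platt_negloglik is hypothetical and this is not the SVM module's implementation.
In [ ]:
# illustrative Platt scaling fit: p = 1/(1+exp(A*f+B)) fit on held-out decision values
from scipy.optimize import minimize

def platt_negloglik(params, f, y01):
    A, B = params
    p = 1.0/(1.0 + np.exp(A*f + B))
    eps = 1e-12
    return -np.sum(y01*np.log(p+eps) + (1.-y01)*np.log(1.-p+eps))

f_valid = np.asarray(yhat_valid[0]).flatten().astype("float64")  # SVM decision values on the validation set
y01_valid = (y_valid_rep2 > 0).astype("float64")                 # labels back in {0,1}
res = minimize(platt_negloglik, x0=[-1.,0.], args=(f_valid, y01_valid), method="Nelder-Mead")
A_fit, B_fit = res.x
prob_valid = 1.0/(1.0 + np.exp(A_fit*f_valid + B_fit))
print(A_fit, B_fit); print(prob_valid[:10])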
In [214]:
((Prattscaling_results[0]>0.1).astype("float32")==yhat_rep1).sum()
Out[214]:
In [213]:
print(len(Prattscaling_results[0]))
print(len(yhat_rep1));print(yhat_rep1.dtype)
In [143]:
%time patients_found_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_stage1_lowres_found]
patients_found_vecs = np.array(patients_found_vecs)
In [144]:
patients_found_vecs.shape
Out[144]:
In [148]:
y_found_rep2 = np.copy(y_found)
y_found_rep2[y_found_rep2<=0]=-1
In [151]:
%time yhat_found = SVM_stage1.make_predictions_parallel( patients_found_vecs )
In [152]:
accuracy_score_temp_found=(np.sign(yhat_found[0]) == y_found_rep2).sum()/float(len(y_found_rep2))
print(accuracy_score_temp_found)
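Accuracy alone hides how the errors split between the classes; the confusion counts on the labeled patients can be tabulated directly from np.sign(yhat_found[0]) and y_found_rep2, as sketched here:
In [ ]:
# confusion counts on the labeled patients, in the {-1,1} representation
pred_found = np.sign(np.asarray(yhat_found[0]).flatten())
for true_cls in (-1, 1):
    for pred_cls in (-1, 1):
        count = ((y_found_rep2 == true_cls) & (pred_found == pred_cls)).sum()
        print("true %2d predicted %2d : %d" % (true_cls, pred_cls, count))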
In [216]:
np.unique(yhat_rep1)
Out[216]:
In [217]:
np.unique(y_found)
Out[217]:
In [218]:
np.count_nonzero(y_found)
Out[218]:
In [219]:
np.count_nonzero(yhat_rep1)
Out[219]:
In [223]:
Prattscaling_results[0][:100]
Out[223]:
In [224]:
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")
In [226]:
stage1_sample_submission_csv.describe()
Out[226]:
In [228]:
stage1_sample_submission_csv.head()
Out[228]:
We need to match up these ids with what we have.
In [236]:
m = len(patients_stage1_ids)
m_sample = len(stage1_sample_submission_csv['id'].as_matrix())
sample_indices =[]
for j in range(m_sample):
    for i in range(m):
        #if patients_stage1_ids[i] in stage1_sample_submission_csv['id'].as_matrix():
        condition = (stage1_sample_submission_csv['id'].as_matrix()[j] == patients_stage1_ids[i])
        if condition:
            sample_indices.append(i)
patients_sample_ids = [patients_stage1_ids[i] for i in sample_indices]
print(len(patients_sample_ids))
In [237]:
set(stage1_sample_submission_csv['id'].as_matrix()) == set(np.array(patients_sample_ids))
Out[237]:
In [240]:
sample_yhat_prob = np.array( [Prattscaling_results[0][idx] for idx in sample_indices] )
In [246]:
pd.DataFrame(Prattscaling_results[0]).describe()
Out[246]:
In [247]:
Prattscaling_results[0]
Out[247]:
In [232]:
stage1_sample_submission_csv['id'].as_matrix()[0]
Out[232]:
In [250]:
sample_out = pd.DataFrame(list(zip(patients_sample_ids,sample_yhat_prob)))
sample_out.columns=["id","cancer"]
In [278]:
sample_out.head()
Out[278]:
In [279]:
sample_out.to_csv("./2017datascibowl/samplesubmit00.csv",index=False)
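Stage 1 submissions are scored with log loss, which penalizes confident mistakes heavily, so it can help to clip the predicted probabilities away from 0 and 1 before writing the file; a minimal sketch (the bounds 0.05 and 0.95 and the filename are arbitrary choices):
In [ ]:
# clip predicted probabilities away from 0 and 1 to cap the per-example log loss
sample_out_clipped = sample_out.copy()
sample_out_clipped["cancer"] = np.clip(sample_out_clipped["cancer"].values, 0.05, 0.95)
sample_out_clipped.to_csv("./2017datascibowl/samplesubmit00_clipped.csv",index=False)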
In [265]:
import time
In [276]:
#time.gmtime().__str__()
Out[276]:
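The commented-out time.gmtime() call suggests tagging submission files with a timestamp; one way to do that (the filename pattern is just an example):
In [ ]:
# write the submission with a UTC timestamp in the filename
timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
sample_out.to_csv("./2017datascibowl/samplesubmit_"+timestamp+".csv",index=False)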
In [281]:
yhat_rep1
sample_yhat_cls = np.array( [yhat_rep1[idx] for idx in sample_indices] )
In [282]:
sample_yhat_cls
Out[282]:
In [284]:
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")
In [287]:
sub_name="stage1_feat_lowres"
patients_sample_vecs = np.array( [load_feat_vec(id,sub_name) for id in stage1_sample_submission_csv['id'].as_matrix()] )
In [289]:
%time yhat_sample = SVM_stage1.make_predictions_parallel( patients_sample_vecs )
In [290]:
yhat_sample_rep2 = np.copy(yhat_sample[0])        # raw SVM decision values
yhat_sample_rep2 = np.sign( yhat_sample_rep2);    # representation 2: binary classes as {-1,1}
yhat_sample_rep1 = np.copy(yhat_sample_rep2)
np.place(yhat_sample_rep1,yhat_sample_rep1<0.,0.) # representation 1: binary classes as {0,1}
In [293]:
yhat_sample[0]
Out[293]:
In [ ]: