notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import theano









    



WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5105)



In [3]:

    
from theano import function, config, sandbox, shared 
import theano.tensor as T



In [4]:

    
# Report the Theano settings this session is running with.
print( theano.config.device )  # compute device Theano is configured to use
print( theano.config.lib.cnmem)  # cf. http://deeplearning.net/software/theano/library/config.html
print( theano.config.print_active_device)# Whether the active device is printed when the GPU device is initialized.









    



gpu
0.8
True



In [5]:

    
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)









    



False



In [6]:

    
import sys
sys.path.append( '../ML' )



In [7]:

    
from SVM import SVM, SVM_serial, SVM_parallel



In [8]:

    
import numpy as np
import pandas as pd



In [9]:

    
import os
os.getcwd()
os.listdir( os.getcwd() )









    Out[9]:





['2017datascibowl',
 'data_password.txt',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'LSTM_model201702282350.save',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'DatSciBow2017_SVM.ipynb',
 '__init__.py',
 'train.h5',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'cleaning_dueSigmaFin.py']



In [9]:

    
patients_stage1_feat = os.listdir('./2017datascibowl/stage1_feat')
print(len(patients_stage1_feat))



In [10]:

    
patients_stage1_feat = [patientname.replace("feat_vec","") for patientname in patients_stage1_feat]

low-resolution case



In [10]:

    
patients_stage1_feat_lowres = os.listdir('./2017datascibowl/stage1_feat_lowres')
print(len(patients_stage1_feat_lowres))



In [11]:

    
# Strip the "feat_vec" marker from each file name so that only the patient ID remains.
# The loop variable is deliberately not called `id`: in Python 2 a list-comprehension
# variable leaks into the notebook namespace and would shadow the builtin `id`.
patients_stage1_feat_lowres = [fname.replace("feat_vec","") for fname in patients_stage1_feat_lowres]

Get the $y$ value (outcomes), the label for the class that each example belongs to, by matching patient IDs



In [12]:

    
y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
print(len(y_ids))



In [13]:

    
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
print(len(y_ids_found))









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-de1b06632ffa> in <module>()
----> 1 y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
      2 print(len(y_ids_found))

NameError: name 'patients_stage1_feat' is not defined



In [14]:

    
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat_lowres)]
print(len(y_ids_found))



In [15]:

    
# Keep only the patients that also appear in the label table.
# The labelled IDs are collected into a set once, so each membership test is O(1)
# instead of rebuilding and scanning the whole ID array on every iteration.
labelled_ids = set(y_ids_found['id'])

m = len(patients_stage1_feat)
found_indices = [i for i in range(m) if patients_stage1_feat[i] in labelled_ids]

patients_stage1_feat_found = [patients_stage1_feat[i] for i in found_indices]
print(len(patients_stage1_feat_found))



In [15]:

    
# Keep only the low-resolution patients that also appear in the label table.
# The labelled IDs are collected into a set once, so each membership test is O(1)
# instead of rebuilding and scanning the whole ID array on every iteration.
labelled_ids = set(y_ids_found['id'])

m = len(patients_stage1_feat_lowres)
found_indices = [i for i in range(m) if patients_stage1_feat_lowres[i] in labelled_ids]

patients_stage1_lowres_found = [patients_stage1_feat_lowres[i] for i in found_indices]
print(len(patients_stage1_lowres_found))



In [16]:

    
# Build the label vector in the same order as patients_stage1_feat_found.
# A single id -> cancer lookup table replaces filtering the DataFrame once per patient.
# NOTE(review): assumes each patient ID occurs at most once in the label table — confirm.
cancer_by_id = dict(zip(y_ids_found['id'], y_ids_found['cancer']))

# Patients without a label are skipped, as before; the length check in the next cell
# is what detects a resulting misalignment.
y_found = np.array([cancer_by_id[patient]
                    for patient in patients_stage1_feat_found
                    if patient in cancer_by_id]).flatten()



In [16]:

    
# Build the label vector in the same order as patients_stage1_lowres_found.
# A single id -> cancer lookup table replaces filtering the DataFrame once per patient.
# NOTE(review): assumes each patient ID occurs at most once in the label table — confirm.
cancer_by_id = dict(zip(y_ids_found['id'], y_ids_found['cancer']))

# Patients without a label are skipped, as before; the length check in the next cell
# is what detects a resulting misalignment.
y_found = np.array([cancer_by_id[patient]
                    for patient in patients_stage1_lowres_found
                    if patient in cancer_by_id]).flatten()



In [17]:

    
# it should be this condition, as the indices for each now correspond to each other 
len(y_found)==len(patients_stage1_feat_found)









    Out[17]:





True



In [17]:

    
len(y_found)==len(patients_stage1_lowres_found)









    Out[17]:





True



In [38]:

    
patients_stage1_feat_found;

low-resolution case



In [18]:

    
patients_stage1_feat_found = patients_stage1_lowres_found

Training, (Cross-)Validation, Test Set randomization and processing



In [43]:

    
# Fractions controlling the split: training share of all examples, then
# validation share of whatever is left after the training examples are removed.
ratio_of_train_to_total = 0.2
ratio_valid_to_rest = 0.2

numberofexamples = len(patients_stage1_feat_found)
numberoftrainingexamples = int(ratio_of_train_to_total * numberofexamples)

# Examples not used for training are divided between validation and test.
remaining = numberofexamples - numberoftrainingexamples
numbertovalidate = int(ratio_valid_to_rest * remaining)
numbertotest = remaining - numbertovalidate

# Printed in the order: total, test, training, validation.
for count in (numberofexamples, numbertotest, numberoftrainingexamples, numbertovalidate):
    print(count)



In [44]:

    
# Fixed seed so the train/validation/test split is the same after every kernel restart.
# A private RandomState is used so the global NumPy random state is left untouched.
SPLIT_SEED = 0
shuffledindices = np.random.RandomState(SPLIT_SEED).permutation(numberofexamples)



In [45]:

    
# Cut the shuffled index array into three consecutive pieces:
# [0, train_end) -> training, [train_end, valid_end) -> validation, the rest -> test.
train_end = numberoftrainingexamples
valid_end = numberoftrainingexamples + numbertovalidate

train_idx = shuffledindices[:train_end]
valid_idx = shuffledindices[train_end:valid_end]
test_idx  = shuffledindices[valid_end:]

# patients_stage1_feat_found is a plain list, so it is indexed element by element.
# The loop variable is not called `id`, to avoid shadowing the builtin.
patients_train = [patients_stage1_feat_found[k] for k in train_idx]
patients_valid = [patients_stage1_feat_found[k] for k in valid_idx]
patients_test  = [patients_stage1_feat_found[k] for k in test_idx]

# The labels are a NumPy array, so the same index arrays select them directly;
# this keeps patients_* and y_* aligned position by position.
y_train = y_found[train_idx]
y_valid = y_found[valid_idx]
y_test  = y_found[test_idx]

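Indeed, the shuffled labels still line up with the shuffled patient IDs, as the sanity checks below confirm: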



In [46]:

    
# sanity check
y_ids.loc[y_ids['id']== patients_train[2]]









    Out[46]:






  
    
      
      id
      cancer
    
  
  
    
      1272
      e8be143b9f5e352f71043b24f79f5a17
      0



In [47]:

    
# sanity check
y_train[2]









    Out[47]:





0



In [48]:

    
#sanity check
for i in range(10,20):
    print(y_ids.loc[y_ids['id']== patients_train[i]]['cancer'].as_matrix().flatten() == y_train[i])









    



[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]



In [25]:

    
def load_feat_vec(patientid):
    """Load the saved feature array for one patient.

    Reads the file ``./2017datascibowl/stage1_feat/<patientid>feat_vec`` with
    ``np.load`` and returns whatever that call returns.

    Parameters
    ----------
    patientid : str
        Patient identifier, i.e. the file name without the "feat_vec" suffix.

    Raises
    ------
    IOError / OSError
        If the file does not exist or cannot be opened.
    """
    path = "./2017datascibowl/stage1_feat/"+patientid+"feat_vec"
    # `open` works on Python 2 and 3 (the `file` builtin is Python 2 only), and the
    # `with` block closes the handle even when np.load raises.
    with open(path,"rb") as f:
        arr = np.load(f)
    return arr



In [25]:

    
%time patients_train_vecs = [load_feat_vec(id) for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)









    



CPU times: user 136 ms, sys: 1.57 s, total: 1.7 s
Wall time: 9.23 s
(69, 2621448)



In [52]:

    
%time patients_valid_vecs = [load_feat_vec(id) for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)









    



CPU times: user 110 ms, sys: 1.65 s, total: 1.76 s
Wall time: 9.78 s
(66, 2621448)



In [26]:

    
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load the saved feature array for one patient from a chosen feature directory.

    Reads the file ``./2017datascibowl/<sub_name>/<patientid>feat_vec`` with
    ``np.load`` and returns whatever that call returns.

    Parameters
    ----------
    patientid : str
        Patient identifier, i.e. the file name without the "feat_vec" suffix.
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding the feature files
        (default ``"stage1_feat"``).

    Raises
    ------
    IOError / OSError
        If the file does not exist or cannot be opened.
    """
    path = "./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec"
    # `open` works on Python 2 and 3 (the `file` builtin is Python 2 only), and the
    # `with` block closes the handle even when np.load raises.
    with open(path,"rb") as f:
        arr = np.load(f)
    return arr



In [49]:

    
%time patients_train_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)









    



CPU times: user 144 ms, sys: 117 ms, total: 261 ms
Wall time: 2.4 s
(279, 40968)



In [50]:

    
%time patients_valid_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)









    



CPU times: user 112 ms, sys: 71.2 ms, total: 183 ms
Wall time: 2.08 s
(223, 40968)

Preprocess the $y$ labels into the $\{-1, +1\}$ representation of the two classes, NOT the $\{0, 1\}$ representation used in the labels file: every label $\le 0$ is replaced by $-1$.



In [54]:

    
y_train;



In [52]:

    
# Second label representation for training: a fresh array in which every
# non-positive label becomes -1 and every other label is kept; y_train is untouched.
y_train_rep2 = np.where(y_train <= 0, -1, y_train)



In [55]:

    
y_train_rep2;



In [56]:

    
y_train;



In [57]:

    
# Second label representation for validation: a fresh array in which every
# non-positive label becomes -1 and every other label is kept; y_valid is untouched.
y_valid_rep2 = np.where(y_valid <= 0, -1, y_valid)



In [58]:

    
# Construct the SVM from the training feature vectors, the -1/+1 training labels and
# the number of training examples, followed by three positional hyperparameters.
# NOTE(review): per the trailing comment these are C, sigma and alpha, in that order;
# alpha is presumably the learning rate — confirm against SVM_parallel in the SVM module.
SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),1.0,1.0,0.001)  # C=1.0,sigma=1.0, alpha=0.001



In [59]:

    
SVM_stage1.build_W();
SVM_stage1.build_update();



In [60]:

    
%time SVM_stage1.train_model_full(100) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s









    



CPU times: user 10min 1s, sys: 16min 54s, total: 26min 56s
Wall time: 26min 54s






    Out[60]:





array([ -91.00325012,  -78.68531036,  -78.06800079,  -78.55661774,
        -79.26457214,  -79.97239685,  -80.6671524 ,  -81.34812927,
        -82.01557922,  -82.66971588,  -83.30926514,  -83.93421173,
        -84.54647827,  -85.14653778,  -85.73464966,  -86.31061554,
        -86.87378693,  -87.42567444,  -87.96500397,  -88.49226379,
        -89.00816345,  -89.51264191,  -90.0063324 ,  -90.4901123 ,
        -90.96418762,  -91.42666626,  -91.87518311,  -92.31246948,
        -92.73873901,  -93.15627289,  -93.56546021,  -93.96653748,
        -94.35960388,  -94.74481964,  -95.12241364,  -95.49246216,
        -95.8536911 ,  -96.20722961,  -96.55370331,  -96.89065552,
        -97.21958923,  -97.54145813,  -97.85430908,  -98.15711212,
        -98.45239258,  -98.74053955,  -99.0219574 ,  -99.29771423,
        -99.56790161,  -99.8325119 , -100.09016418, -100.34063721,
       -100.58366394, -100.81987   , -101.05105591, -101.27754211,
       -101.49958801, -101.71712494, -101.92977905, -102.13681793,
       -102.3394928 , -102.5381546 , -102.7322998 , -102.92115784,
       -103.10598755, -103.28557587, -103.45835876, -103.62210083,
       -103.7815094 , -103.93566895, -104.08392334, -104.22496033,
       -104.36073303, -104.49319458, -104.62014008, -104.73869324,
       -104.85159302, -104.95965576, -105.06283569, -105.15847778,
       -105.24778748, -105.33259583, -105.41456604, -105.49463654,
       -105.57310486, -105.64996338, -105.72370148, -105.79509735,
       -105.86325836, -105.92457581, -105.97979736, -106.02992249,
       -106.07594299, -106.11862183, -106.15917969, -106.19702148,
       -106.23287964, -106.26145172, -106.28283691, -106.3019104 ])



In [62]:

    
SVM_stage1.build_b()









    Out[62]:





(Elemwise{mul,no_inplace}.0, OrderedUpdates())



In [63]:

    
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)









    



CPU times: user 4.58 s, sys: 5.98 s, total: 10.6 s
Wall time: 10.5 s



In [64]:

    
print(np.sign(yhat_valid[0]).shape)
np.sign(yhat_valid[0])









    



(223,)






    Out[64]:





array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1.], dtype=float32)



In [41]:

    
print(y_valid_rep2.shape)
y_valid_rep2









    Out[41]:





array([-1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1,  1, -1,
       -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,
        1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,
        1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1,  1])



In [65]:

    
# Validation accuracy: fraction of examples whose predicted sign equals the -1/+1 label.
# NOTE(review): if every prediction is -1, this value is simply the share of negative
# labels in the validation set and says nothing about what the model learned —
# compare against the class balance before trusting it.
(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))









    Out[65]:





0.73542600896860988



In [59]:

    
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs[0:2])









    



---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-59-75ee5a149093> in <module>()
----> 1 get_ipython().magic(u'time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs[0:2])')

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164 
   2165     #-------------------------------------------------------------------------

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086 

<decorator-gen-60> in time(self, line, cell, local_ns)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1175         else:
   1176             st = clock2()
-> 1177             exec(code, glob, local_ns)
   1178             end = clock2()
   1179             out = None

<timed exec> in <module>()

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in make_predictions_parallel(self, X_pred_vals)
    708 
    709                 predictions_function = theano.function(inputs=[],outputs=output)
--> 710                 predictions_vals = predictions_function()
    711                 self._yhat = theano.shared( predictions_vals ) # added this line later
    712 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    896                     node=self.fn.nodes[self.fn.position_of_error],
    897                     thunk=thunk,
--> 898                     storage_map=getattr(self.fn, 'storage_map', None))
    899             else:
    900                 # old-style linkers raise their own exceptions

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4606)()
    403                 if hasattr(fn, 'thunks'):
    404                     # For the CVM
--> 405                     gof.link.raise_with_op(fn.nodes[fn.position_of_error],
    406                                            fn.thunks[fn.position_of_error])
    407                 else:

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4490)()
    395 
    396         try:
--> 397             fn()
    398         except Exception:
    399             if hasattr(fn, 'position_of_error'):

MemoryError: Error allocating 723519648 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).
Apply node that caused the error: GpuElemwise{Composite{sqr((i0 - i1))},no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuDimShuffle{x,0}.0)
Toposort index: 4
Inputs types: [CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, row)]
Inputs shapes: [(69, 2621448), (1, 2621448)]
Inputs strides: [(2621448, 1), (0, 1)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[for{gpu,scan_fn}(Elemwise{maximum,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Composite{sqr((i0 - i1))},no_inplace}.0, GpuSubtensor{:int64:}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,gpu,scan_fn}(Elemwise{maximum,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{Set;:int64:}.0)
Toposort index: 58
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, vector), TensorType(int64, scalar), CudaNdarrayType(float32, vector), CudaNdarrayType(float32, vector), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, vector)]
Inputs shapes: [(), (69, 2621448), (2,), (), (69,), (69,), (69, 2621448), (70,)]
Inputs strides: [(), (2621448, 1), (1,), (), (1,), (1,), (2621448, 1), (1,)]
Inputs values: [array(69), 'not shown', CudaNdarray([ 0.   -0.25]), array(69), 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

Model selection; varying $C$ and $\sigma$



In [66]:

    
C_trial = np.array([0.01,0.05,0.1,0.5,1.,5.,10.,50.,100,500])
sigma_trial=np.array([0.1,0.5,1.,5.,10.])



In [67]:

    
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)



In [73]:

    
C_trial = np.array([0.01,0.1,1.,10.,100,500])
sigma_trial=np.array([0.1,1.,10.])



In [74]:

    
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)



In [75]:

    
accuracy_score = np.zeros(Csigma_mesh[0].shape)



In [78]:

    
# Grid search over (C, sigma): train a fresh SVM for each pair and record its
# validation accuracy in accuracy_score[i][j] (row i = sigma index, column j = C index).
C_grid, sigma_grid = Csigma_mesh[0], Csigma_mesh[1]
n_valid = float(len(y_valid_rep2))

# np.ndindex walks the (sigma, C) index pairs in row-major order, i.e. the same
# order as two nested loops with sigma outermost.
for i, j in np.ndindex(len(sigma_trial), len(C_trial)):
    C_temp = C_grid[i][j]
    sigma_temp = sigma_grid[i][j]

    # Same build / train / predict sequence as for the single model above,
    # with 50 training iterations per grid point.
    SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),C_temp,sigma_temp,0.001)
    SVM_stage1.build_W()
    SVM_stage1.build_update()
    SVM_stage1.train_model_full(50)
    SVM_stage1.build_b()
    yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)

    # Accuracy = number of sign matches divided by the number of validation labels.
    matches = np.sign(yhat_valid[0]) == y_valid_rep2
    accuracy_score_temp = matches.sum()/n_valid
    print(C_temp,sigma_temp,accuracy_score_temp)
    accuracy_score[i][j] = accuracy_score_temp









    



(0.01, 0.10000000000000001, 0.73542600896860988)
(0.10000000000000001, 0.10000000000000001, 0.73542600896860988)
(1.0, 0.10000000000000001, 0.73542600896860988)
(10.0, 0.10000000000000001, 0.73542600896860988)
(100.0, 0.10000000000000001, 0.73542600896860988)
(500.0, 0.10000000000000001, 0.73542600896860988)
(0.01, 1.0, 0.73542600896860988)
(0.10000000000000001, 1.0, 0.73542600896860988)
(1.0, 1.0, 0.73542600896860988)
(10.0, 1.0, 0.73542600896860988)
(100.0, 1.0, 0.73542600896860988)






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-78-8001980da83f> in <module>()
      6         SVM_stage1.build_W();
      7         SVM_stage1.build_update();
----> 8         SVM_stage1.train_model_full(50)
      9         SVM_stage1.build_b()
     10         yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in train_model_full(self, max_iters)
    541                         error = 0.
    542 
--> 543                         W_train = update_function()
    544 
    545                         if np.isnan( W_train ) or np.isinf( W_train):

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4490)()
    395 
    396         try:
--> 397             fn()
    398         except Exception:
    399             if hasattr(fn, 'position_of_error'):

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

KeyboardInterrupt:

low-resolution 64x64 case



In [11]:

    
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load the saved feature array for one patient from a chosen feature directory.

    Reads the file ``./2017datascibowl/<sub_name>/<patientid>feat_vec`` with
    ``np.load`` and returns whatever that call returns.

    Parameters
    ----------
    patientid : str
        Patient identifier, i.e. the file name without the "feat_vec" suffix.
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding the feature files
        (default ``"stage1_feat"``).

    Raises
    ------
    IOError / OSError
        If the file does not exist or cannot be opened.
    """
    path = "./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec"
    # `open` works on Python 2 and 3 (the `file` builtin is Python 2 only), and the
    # `with` block closes the handle even when np.load raises.
    with open(path,"rb") as f:
        arr = np.load(f)
    return arr



In [12]:

    
def prepare_inputX(sub_name="stage1_feat_lowres64", ratio_of_train_to_total = 0.4,
                                                    ratio_valid_to_rest = 0.2,
                                                    seed=None):
    """Build shuffled train / validation / test splits of the labelled patients.

    Patient ids are taken from the file names in ``./2017datascibowl/<sub_name>``
    (with the ``feat_vec`` marker removed) and matched against the ``id`` column
    of ``./2017datascibowl/stage1_labels.csv``.  Only patients that have a label
    are kept.  They are shuffled, split, and their feature vectors are loaded
    with ``load_feat_vec``.

    Parameters
    ----------
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding the feature files.
    ratio_of_train_to_total : float, optional
        Fraction of the labelled patients used for training (truncated to int).
    ratio_valid_to_rest : float, optional
        Fraction of the non-training patients used for validation (truncated
        to int); everything left over is the test split.
    seed : int or None, optional
        When given, the shuffle uses ``np.random.RandomState(seed)`` so the
        split is reproducible.  ``None`` (default) keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    patient_ids, ys, Xs : tuple of dict
        Three dicts keyed by ``"train"``, ``"valid"`` and ``"test"`` holding,
        respectively, the list of patient ids, the array of ``cancer`` labels,
        and the array of loaded feature vectors, all in the same order.
    """
    feat_names = os.listdir('./2017datascibowl/'+sub_name)
    # File names are "<patient id>feat_vec"; remove the marker to recover the id.
    patients_stage1_feat = [name.replace("feat_vec","") for name in feat_names]

    # get y labels
    y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
    y_ids_found = y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]

    # One dict gives both the membership test and the label lookup in O(1),
    # instead of rebuilding an array for every patient.
    cancer_by_id = dict(zip(y_ids_found['id'].values, y_ids_found['cancer'].values))

    # Keep directory-listing order, dropping patients that have no label.
    patients_stage1_feat_found = [pid for pid in patients_stage1_feat if pid in cancer_by_id]
    y_found = np.array([cancer_by_id[pid] for pid in patients_stage1_feat_found])

    assert (len(y_found)==len(patients_stage1_feat_found))

    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    valid_end = numberoftrainingexamples + numbertovalidate

    if seed is None:
        shuffledindices = np.random.permutation(numberofexamples)
    else:
        shuffledindices = np.random.RandomState(seed).permutation(numberofexamples)

    split_indices = [("train", shuffledindices[:numberoftrainingexamples]),
                     ("valid", shuffledindices[numberoftrainingexamples:valid_end]),
                     ("test",  shuffledindices[valid_end:])]

    patient_ids, ys, Xs = {}, {}, {}
    # Splits are processed in train, valid, test order, so files are read in
    # the same sequence as before.
    for split_name, indices in split_indices:
        split_patients = [patients_stage1_feat_found[i] for i in indices]
        patient_ids[split_name] = split_patients
        ys[split_name] = y_found[indices]
        Xs[split_name] = np.array([load_feat_vec(pid,sub_name) for pid in split_patients])

    return patient_ids, ys, Xs



In [98]:

    
# 20% of the labelled patients go to training; 20% of the remainder go to
# validation and everything left over is the test split.
patient_ids64, ys64,Xs64=prepare_inputX("stage1_feat_lowres64",0.2,0.2)



In [99]:

    
# 2nd representation of the labels: every value <= 0 becomes -1, all other
# values are kept, giving {-1,+1} targets for each split of ys64.
y_train_rep2, y_valid_rep2, y_test_rep2 = (
    np.where(ys64[split] <= 0, -1, ys64[split])
    for split in ("train", "valid", "test")
)



In [100]:

    
# Candidate values for the C and sigma arguments of SVM_parallel; later cells
# pick one of each by index (e.g. C_trial[1], sigma_trial[1]).
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]



In [101]:

    
# Build the SVM on the Xs64 training split; the third argument is the number
# of training labels.
# NOTE(review): the last argument passed is 0.0005, whereas the earlier inline
# note said alpha=0.001 -- confirm the intended value and what this parameter
# means in SVM.py.
SVM_stage1 = SVM_parallel(Xs64["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005)  # C=C_trial[1]=1.0, sigma=sigma_trial[1]=1.0

SVM_stage1.build_W();
SVM_stage1.build_update();



In [102]:

    
%time SVM_stage1.train_model_full(3) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s









    



---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-102-e9bda6d66b45> in <module>()
----> 1 get_ipython().magic(u'time SVM_stage1.train_model_full(3) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s')

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164 
   2165     #-------------------------------------------------------------------------

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086 

<decorator-gen-60> in time(self, line, cell, local_ns)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1171         if mode=='eval':
   1172             st = clock2()
-> 1173             out = eval(code, glob, local_ns)
   1174             end = clock2()
   1175         else:

<timed eval> in <module>()

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in train_model_full(self, max_iters)
    541                         error = 0.
    542 
--> 543                         W_train = update_function()
    544 
    545                         if np.isnan( W_train ) or np.isinf( W_train):

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    896                     node=self.fn.nodes[self.fn.position_of_error],
    897                     thunk=thunk,
--> 898                     storage_map=getattr(self.fn, 'storage_map', None))
    899             else:
    900                 # old-style linkers raise their own exceptions

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

RuntimeError: Cuda error: GpuElemwise node_f36d4d896685d79bc0957e19896d6464_0 Mul: out of memory.
    n_blocks=9 threads_per_block=32
   Call: kernel_Mul_node_f36d4d896685d79bc0957e19896d6464_0_Ccontiguous<<<n_blocks, threads_per_block>>>(numEls, i0_data, i1_data, o0_data)

Apply node that caused the error: GpuElemwise{mul,no_inplace}(GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0)
Toposort index: 176
Inputs types: [CudaNdarrayType(float32, vector), CudaNdarrayType(float32, vector)]
Inputs shapes: [(279,), (279,)]
Inputs strides: [(1,), (1,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[forall_inplace,gpu,scan_fn&grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuElemwise{mul,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuElemwise{mul,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuFromHost.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuAlloc{memset_0=True}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, ScalarFromTensor.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

low-resolution case (16x16)



In [361]:

    
# 55% of the labelled patients go to training; 30% of the remainder go to
# validation and everything left over is the test split.
patient_ids16, ys16,Xs16=prepare_inputX("stage1_feat_lowres",0.55,0.3)



In [104]:

    
# 2nd representation of the labels: every value <= 0 becomes -1, all other
# values are kept, giving {-1,+1} targets for each split of ys16.
y_train_rep2, y_valid_rep2, y_test_rep2 = (
    np.where(ys16[split] <= 0, -1, ys16[split])
    for split in ("train", "valid", "test")
)



In [105]:

    
# Candidate values for the C and sigma arguments of SVM_parallel (same values
# as in the 64x64 section); later cells pick one of each by index.
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]



In [116]:

    
# One slot per (C_trial index, sigma_trial index) pair, initialised to zero;
# presumably filled with the validation accuracy of each combination tried.
accuracy_scores=np.zeros((len(C_trial),len(sigma_trial)) )



In [106]:

    
# Build the SVM on the Xs16 training split; the third argument is the number
# of training labels.
# NOTE(review): the last argument passed is 0.0005, whereas the earlier inline
# note said alpha=0.001 -- confirm the intended value and what this parameter
# means in SVM.py.
SVM_stage1 = SVM_parallel(Xs16["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005)  # C=C_trial[1]=1.0, sigma=sigma_trial[1]=1.0

SVM_stage1.build_W();
SVM_stage1.build_update();



In [112]:

    
%time SVM_stage1.train_model_full(200)  # training iterations=3 PU times: user 3min 33s, sys: 2min 34s, total: 6min 8s









    



CPU times: user 3h 59min 7s, sys: 2h 46min 6s, total: 6h 45min 14s
Wall time: 6h 44min 57s






    Out[112]:





array([-200.83131409, -202.55712891, -204.33587646, -206.08123779,
       -207.7901001 , -209.45532227, -211.08381653, -212.67712402,
       -214.23626709, -215.76069641, -217.24943542, -218.70394897,
       -220.12663269, -221.52130127, -222.88653564, -224.22303772,
       -225.5309906 , -226.809021  , -228.05993652, -229.28213501,
       -230.47427368, -231.64016724, -232.77420044, -233.8817749 ,
       -234.96562195, -236.02755737, -237.06628418, -238.08067322,
       -239.07070923, -240.03775024, -240.98564148, -241.91238403,
       -242.81768799, -243.69985962, -244.56072998, -245.39962769,
       -246.21714783, -247.0166626 , -247.79553223, -248.55636597,
       -249.2930603 , -250.00802612, -250.70266724, -251.37922668,
       -252.04145813, -252.6869812 , -253.31645203, -253.93121338,
       -254.53184509, -255.11587524, -255.68267822, -256.22912598,
       -256.75299072, -257.25146484, -257.72674561, -258.18280029,
       -258.62084961, -259.04241943, -259.44540405, -259.83544922,
       -260.21270752, -260.56716919, -260.90518188, -261.22686768,
       -261.53460693, -261.83227539, -262.11776733, -262.38827515,
       -262.64398193, -262.88275146, -263.09793091, -263.29821777,
       -263.4864502 , -263.65771484, -263.81768799, -263.9666748 ,
       -264.09890747, -264.21740723, -264.32885742, -264.42810059,
       -264.51879883, -264.5965271 , -264.66400146, -264.72744751,
       -264.78866577, -264.84820557, -264.90667725, -264.96395874,
       -265.02001953, -265.07522583, -265.12902832, -265.1819458 ,
       -265.23364258, -265.28430176, -265.33435059, -265.38296509,
       -265.43078613, -265.47763062, -265.5234375 , -265.568573  ,
       -265.6126709 , -265.65612793, -265.69830322, -265.73983765,
       -265.78070068, -265.82034302, -265.85934448, -265.89813232,
       -265.93530273, -265.97241211, -266.00817871, -266.04336548,
       -266.07836914, -266.11212158, -266.14562988, -266.17828369,
       -266.21017456, -266.24133301, -266.2723999 , -266.30255127,
       -266.33163452, -266.36083984, -266.38909912, -266.41702271,
       -266.44393921, -266.47064209, -266.49713135, -266.52258301,
       -266.54785156, -266.57214355, -266.59625244, -266.62023926,
       -266.64318848, -266.66607666, -266.68841553, -266.71020508,
       -266.73144531, -266.75241089, -266.77307129, -266.79315186,
       -266.81295776, -266.83227539, -266.85095215, -266.86972046,
       -266.88806152, -266.90594482, -266.92321777, -266.9402771 ,
       -266.95733643, -266.97387695, -266.99002075, -267.0057373 ,
       -267.02105713, -267.03643799, -267.05114746, -267.0657959 ,
       -267.08013916, -267.09448242, -267.10797119, -267.12173462,
       -267.13458252, -267.14758301, -267.16009521, -267.17285156,
       -267.18481445, -267.19696045, -267.2086792 , -267.22021484,
       -267.23132324, -267.24230957, -267.25332642, -267.2635498 ,
       -267.2741394 , -267.28424072, -267.29418945, -267.30377197,
       -267.3137207 , -267.32315063, -267.33215332, -267.34103394,
       -267.34991455, -267.3588562 , -267.36724854, -267.37567139,
       -267.38348389, -267.39151001, -267.39941406, -267.40686035,
       -267.41485596, -267.42202759, -267.42932129, -267.43618774,
       -267.44311523, -267.44989014, -267.45648193, -267.46334839,
       -267.46966553, -267.47601318, -267.48187256, -267.48797607])



In [113]:

    
SVM_stage1.build_b()









    Out[113]:





(Elemwise{mul,no_inplace}.0, OrderedUpdates())



In [114]:

    
yhat_valid = SVM_stage1.make_predictions_parallel( Xs16["valid"] )



In [115]:

    
# Validation accuracy: share of examples whose predicted sign equals the
# {-1,+1} label.
valid_hits = np.sign(yhat_valid[0]) == y_valid_rep2
accuracy_score_temp = valid_hits.sum() / float(len(y_valid_rep2))
print(accuracy_score_temp)









    



0.712765957447



In [118]:

    
# Slot [1][1] matches the C_trial[1] / sigma_trial[1] pair used to build SVM_stage1.
accuracy_scores[1][1]=accuracy_score_temp



In [119]:

    
%time yhat_test = SVM_stage1.make_predictions_parallel( Xs16["test"] )









    



CPU times: user 30.6 s, sys: 33.6 s, total: 1min 4s
Wall time: 1min 4s



In [120]:

    
# Test accuracy: share of examples whose predicted sign equals the {-1,+1}
# label; the test-set size is printed first for reference.
n_test = len(y_test_rep2)
test_hits = np.sign(yhat_test[0]) == y_test_rep2
accuracy_score_temp_test = test_hits.sum() / float(n_test)
print(n_test)
print(accuracy_score_temp_test)









    



441
0.759637188209

low-resolution 32x32 case



In [307]:

    
# NOTE(review): this cell (In [307]) reads Xs32, which is only assigned by the
# prepare_inputX call in the following cell (In [13]); on a fresh top-to-bottom
# run it raises NameError.
Xs32["train"][0].shape









    Out[307]:





(163848,)



In [13]:

    
# 27.5% of the labelled patients go to training; 25% of the remainder go to
# validation and the rest to test.
patient_ids32, ys32,Xs32=prepare_inputX("stage1_feat_lowres32",0.275,0.25)  # ratio pairs (0.275, 0.25) and (0.30, 0.25) were both noted as working -- presumably meaning training fits in GPU memory; TODO confirm



In [14]:

    
# 2nd representation of the labels: every value <= 0 becomes -1, all other
# values are kept, giving {-1,+1} targets for each split of ys32.
y32_train_rep2, y32_valid_rep2, y32_test_rep2 = (
    np.where(ys32[split] <= 0, -1, ys32[split])
    for split in ("train", "valid", "test")
)



In [15]:

    
# Candidate values for the C and sigma arguments of SVM_parallel in the 32x32
# case; C32_trial has one extra, larger value (200.).
C32_trial=[0.1,1.0,10.,200.]
sigma32_trial=[0.1,1.0,10.]



In [16]:

    
C32_trial[3]









    Out[16]:





200.0



In [17]:

    
# One slot per (C32_trial index, sigma32_trial index) pair, initialised to
# zero; presumably meant to hold the accuracy of each combination tried.
accuracy_scores32=np.zeros((len(C32_trial),len(sigma32_trial)) )



In [18]:

    
# Build the SVM on the Xs32 training split with C = C32_trial[3] (200.0) and
# sigma = sigma32_trial[1] (1.0); the third argument is the number of training
# labels.  The final 0.0005 is the same value passed in the other sections --
# its meaning is defined in SVM.py (TODO confirm).
SVM_stage1_32 = SVM_parallel(Xs32["train"],y32_train_rep2,len(y32_train_rep2),
                          C32_trial[3],sigma32_trial[1],0.0005)  

SVM_stage1_32.build_W();
SVM_stage1_32.build_update();



In [ ]:

    
%time SVM_stage1_32.train_model_full(20) # training_iterations=2,CPU times: user 1min 54s, sys: 2min 34s, total: 4min 29s

Predictions to submit to the competition, computed over the full data set



In [126]:

    
# Ids of every patient with a feature file in stage1_feat_lowres.  File names
# are "<patient id>feat_vec", so the marker is removed to recover the id.
# The loop variable is deliberately not called `id`: on Python 2 a list
# comprehension's variable leaks into the notebook namespace and would
# overwrite the builtin id().
patients_stage1_ids = [fname.replace("feat_vec","")
                       for fname in os.listdir('./2017datascibowl/stage1_feat_lowres')]

print(len(patients_stage1_ids))



In [128]:

    
# Feature vectors for every patient, in the same order as patients_stage1_ids.
# The loop variable avoids the name `id` so the builtin is not overwritten by
# the Python 2 comprehension-variable leak.
patients_vecs = np.array([load_feat_vec(patient_id,"stage1_feat_lowres")
                          for patient_id in patients_stage1_ids])



In [154]:

    
%time yhat = SVM_stage1.make_predictions_parallel( patients_vecs )









    



CPU times: user 57.1 s, sys: 1min 5s, total: 2min 2s
Wall time: 2min 2s



In [155]:

    
yhat_rep2 = np.copy(yhat[0])  # copy of the first element of the prediction result (the raw scores)
yhat_rep2 = np.sign( yhat_rep2);  # representation 2: classes as {-1,1} (np.sign also yields 0 for a score of exactly 0)
yhat_rep1 = np.copy(yhat_rep2)
np.place(yhat_rep1,yhat_rep1<0.,0.)  # representation 1: replace -1 by 0 so the classes are {0,1}



In [158]:

    
pd.DataFrame(yhat_rep1).describe();









    Out[158]:






  
    
      
      0
    
  
  
    
      count
      1595.000000
    
    
      mean
      0.126646
    
    
      std
      0.332678
    
    
      min
      0.000000
    
    
      25%
      0.000000
    
    
      50%
      0.000000
    
    
      75%
      0.000000
    
    
      max
      1.000000



In [206]:

    
# NOTE(review): the argument is yhat_rep1 (the thresholded {0,1} class labels
# built above), not the raw scores in yhat[0] -- confirm that this is what
# make_prob_Pratt expects.  "Pratt" is the method's spelling in the SVM module;
# presumably this is Platt scaling.
Prattscaling_results = SVM_stage1.make_prob_Pratt(yhat_rep1)



In [207]:

    
pd.DataFrame(Prattscaling_results[0]).describe()









    Out[207]:






  
    
      
      0
    
  
  
    
      count
      1595.000000
    
    
      mean
      0.125520
    
    
      std
      0.271362
    
    
      min
      0.004191
    
    
      25%
      0.006749
    
    
      50%
      0.033630
    
    
      75%
      0.033630
    
    
      max
      0.837107



In [214]:

    
# Sanity check: number of patients for which "scaled probability > 0.1" agrees
# with the {0,1} class prediction in yhat_rep1.
((Prattscaling_results[0]>0.1).astype("float32")==yhat_rep1).sum()









    Out[214]:





1595



In [213]:

    
print(len(Prattscaling_results[0]))
print(len(yhat_rep1));print(yhat_rep1.dtype)



In [143]:

    
# Load the feature vectors of the labelled ("found") patients and stack them
# into one array.
# NOTE(review): patients_stage1_lowres_found is not assigned in any cell
# visible here -- it must come from a cell that was removed or run earlier.
%time patients_found_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_stage1_lowres_found]
patients_found_vecs = np.array(patients_found_vecs)









    



CPU times: user 315 ms, sys: 203 ms, total: 518 ms
Wall time: 516 ms



In [144]:

    
patients_found_vecs.shape









    Out[144]:





(1397, 40968)



In [148]:

    
# {-1,+1} version of the labels: copy y_found and set every entry <= 0 to -1.
# NOTE(review): in the cells visible here y_found only exists as a local
# variable inside prepare_inputX; the global used below must have been created
# by a cell that is not shown.
y_found_rep2 = np.copy(y_found)
y_found_rep2[y_found_rep2<=0]=-1



In [151]:

    
%time yhat_found = SVM_stage1.make_predictions_parallel( patients_found_vecs )









    



CPU times: user 52.9 s, sys: 1min, total: 1min 53s
Wall time: 1min 53s



In [152]:

    
# Accuracy over all labelled patients: share of examples whose predicted sign
# equals the {-1,+1} label.
found_hits = np.sign(yhat_found[0]) == y_found_rep2
accuracy_score_temp_found = found_hits.sum() / float(len(y_found_rep2))
print(accuracy_score_temp_found)









    



0.885468861847



In [216]:

    
np.unique(yhat_rep1)









    Out[216]:





array([ 0.,  1.], dtype=float32)



In [217]:

    
np.unique(y_found)









    Out[217]:





array([0, 1])



In [218]:

    
np.count_nonzero(y_found)









    Out[218]:





362



In [219]:

    
np.count_nonzero(yhat_rep1)









    Out[219]:





202



In [223]:

    
Prattscaling_results[0][:100]









    Out[223]:





array([ 0.00676017,  0.03362985,  0.0066766 ,  0.00684445,  0.00445424,
        0.03362985,  0.03362985,  0.00494832,  0.00491834,  0.03362985,
        0.00515345,  0.00684445,  0.03362985,  0.03362985,  0.03362985,
        0.03362985,  0.03362985,  0.00475647,  0.03362985,  0.03362985,
        0.00570897,  0.83710682,  0.00640817,  0.00506418,  0.03362985,
        0.00559623,  0.03362985,  0.03362985,  0.83710682,  0.03362985,
        0.83710682,  0.83710682,  0.83710682,  0.03362985,  0.00684445,
        0.03362985,  0.03362985,  0.83710682,  0.00573395,  0.00571569,
        0.03362985,  0.004754  ,  0.03362985,  0.00468638,  0.03362985,
        0.03362985,  0.00556879,  0.83710682,  0.00557399,  0.03362985,
        0.00553154,  0.00537084,  0.83710682,  0.03362985,  0.03362985,
        0.00582898,  0.03362985,  0.03362985,  0.03362985,  0.03362985,
        0.005304  ,  0.03362985,  0.03362985,  0.83710682,  0.83710682,
        0.00684445,  0.00636829,  0.03362985,  0.00427081,  0.00435718,
        0.03362985,  0.83710682,  0.83710682,  0.00684445,  0.03362985,
        0.00424011,  0.03362985,  0.00684445,  0.83710682,  0.00473127,
        0.03362985,  0.03362985,  0.03362985,  0.00505903,  0.03362985,
        0.03362985,  0.00623662,  0.03362985,  0.00512414,  0.03362985,
        0.00596082,  0.03362985,  0.006568  ,  0.03362985,  0.03362985,
        0.83710682,  0.00523753,  0.03362985,  0.03362985,  0.03362985], dtype=float32)

Submissions



In [224]:

    
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")



In [226]:

    
stage1_sample_submission_csv.describe()



In [228]:

    
stage1_sample_submission_csv.head()









    Out[228]:






  
    
      
      id
      cancer
    
  
  
    
      0
      026470d51482c93efc18b9803159c960
      0.5
    
    
      1
      031b7ec4fe96a3b035a8196264a8c8c3
      0.5
    
    
      2
      03bd22ed5858039af223c04993e9eb22
      0.5
    
    
      3
      06a90409e4fcea3e634748b967993531
      0.5
    
    
      4
      07b1defcfae5873ee1f03c90255eb170
      0.5

We need to match up these ids with what we have.



In [236]:

    
# For every id in the sample submission (in CSV order), find its position(s)
# in patients_stage1_ids.  A dictionary built once replaces the nested loop
# that re-extracted the id column on every inner iteration; the resulting
# sample_indices list is the same.
sample_ids = stage1_sample_submission_csv['id'].values
m = len(patients_stage1_ids)
m_sample = len(sample_ids)

# id -> every position it occupies in patients_stage1_ids, in ascending order.
positions_by_id = {}
for position, patient_id in enumerate(patients_stage1_ids):
    positions_by_id.setdefault(patient_id, []).append(position)

sample_indices = []
for sample_id in sample_ids:
    # Ids that have no feature file contribute nothing, as before.
    sample_indices.extend(positions_by_id.get(sample_id, []))

patients_sample_ids = [patients_stage1_ids[i] for i in sample_indices]
print(len(patients_sample_ids))



In [237]:

    
set(stage1_sample_submission_csv['id'].as_matrix()) == set(np.array(patients_sample_ids))









    Out[237]:





True



In [240]:

    
sample_yhat_prob = np.array( [Prattscaling_results[0][idx] for idx in sample_indices] )



In [246]:

    
pd.DataFrame(Prattscaling_results[0]).describe()









    Out[246]:






  
    
      
      0
    
  
  
    
      count
      1595.000000
    
    
      mean
      0.125520
    
    
      std
      0.271362
    
    
      min
      0.004191
    
    
      25%
      0.006749
    
    
      50%
      0.033630
    
    
      75%
      0.033630
    
    
      max
      0.837107



In [247]:

    
Prattscaling_results[0]









    Out[247]:





array([ 0.00676017,  0.03362985,  0.0066766 , ...,  0.00669918,
        0.0060775 ,  0.03362985], dtype=float32)



In [232]:

    
stage1_sample_submission_csv['id'].as_matrix()[0]









    Out[232]:





'026470d51482c93efc18b9803159c960'



In [250]:

    
# Two-column submission table: patient id and its predicted cancer probability.
# zip() is materialised with list() because on Python 3 it is an iterator,
# which older pandas releases refuse as DataFrame data; on Python 2 the list is
# identical to what zip() already returned.
sample_rows = list(zip(patients_sample_ids,sample_yhat_prob))
sample_out = pd.DataFrame(sample_rows)
sample_out.columns=["id","cancer"]



In [278]:

    
sample_out.head()









    Out[278]:






  
    
      
      id
      cancer
    
  
  
    
      0
      026470d51482c93efc18b9803159c960
      0.03363
    
    
      1
      031b7ec4fe96a3b035a8196264a8c8c3
      0.03363
    
    
      2
      03bd22ed5858039af223c04993e9eb22
      0.03363
    
    
      3
      06a90409e4fcea3e634748b967993531
      0.03363
    
    
      4
      07b1defcfae5873ee1f03c90255eb170
      0.03363



In [279]:

    
sample_out.to_csv("./2017datascibowl/samplesubmit00.csv",index=False)



In [265]:

    
import time



In [276]:

    
#time.gmtime().__str__()









    Out[276]:





'time.struct_time(tm_year=2017, tm_mon=4, tm_mday=7, tm_hour=11, tm_min=50, tm_sec=35, tm_wday=4, tm_yday=97, tm_isdst=0)'



In [281]:

    
# {0,1} class predictions for the sample-submission patients, picked out of
# yhat_rep1 in sample_indices order.
sample_yhat_cls = np.array([yhat_rep1[position] for position in sample_indices])



In [282]:

    
sample_yhat_cls









    Out[282]:





array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.], dtype=float32)

Predictions to submit to the competition, computed only for the sample-submission patients



In [284]:

    
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")



In [287]:

    
# Feature vectors for the patients listed in the sample submission, in CSV order.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0, and the
# loop variable avoids the name `id` so the builtin is not overwritten by the
# Python 2 comprehension-variable leak.
sub_name="stage1_feat_lowres"
patients_sample_vecs = np.array([load_feat_vec(patient_id,sub_name)
                                 for patient_id in stage1_sample_submission_csv['id'].values])



In [289]:

    
%time yhat_sample = SVM_stage1.make_predictions_parallel( patients_sample_vecs )









    



CPU times: user 24.7 s, sys: 27.5 s, total: 52.2 s
Wall time: 52.1 s



In [290]:

    
yhat_sample_rep2 = np.copy(yhat_sample[0])  # copy of the first element of the prediction result (the raw scores)
yhat_sample_rep2 = np.sign( yhat_sample_rep2);  # representation 2: classes as {-1,1} (np.sign also yields 0 for a score of exactly 0)
yhat_sample_rep1 = np.copy(yhat_sample_rep2)
np.place(yhat_sample_rep1,yhat_sample_rep1<0.,0.)  # representation 1: replace -1 by 0 so the classes are {0,1}



In [293]:

    
yhat_sample[0]









    Out[293]:





CudaNdarray([-0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284])



In [ ]:

	0
count	1595.000000
mean	0.126646
std	0.332678
min	0.000000
25%	0.000000
50%	0.000000
75%	0.000000
max	1.000000

	id	cancer
0	026470d51482c93efc18b9803159c960	0.5
1	031b7ec4fe96a3b035a8196264a8c8c3	0.5
2	03bd22ed5858039af223c04993e9eb22	0.5
3	06a90409e4fcea3e634748b967993531	0.5
4	07b1defcfae5873ee1f03c90255eb170	0.5

	id	cancer
0	026470d51482c93efc18b9803159c960	0.03363
1	031b7ec4fe96a3b035a8196264a8c8c3	0.03363
2	03bd22ed5858039af223c04993e9eb22	0.03363
3	06a90409e4fcea3e634748b967993531	0.03363
4	07b1defcfae5873ee1f03c90255eb170	0.03363