In [1]:
%matplotlib inline

In [2]:
import theano


WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5105)

In [3]:
from theano import function, config, sandbox, shared 
import theano.tensor as T

In [4]:
# Report the Theano device / memory settings this kernel is running with.
# cf. http://deeplearning.net/software/theano/library/config.html
for setting in (
        theano.config.device,
        theano.config.lib.cnmem,
        theano.config.print_active_device,  # whether the active device is printed when the GPU device is initialized
):
    print( setting )


gpu
0.8
True

In [5]:
print(theano.config.allow_gc)
print(theano.config.optimizer_excluding)


False


In [6]:
import sys
sys.path.append( '../ML' )

In [7]:
from SVM import SVM, SVM_serial, SVM_parallel

In [8]:
import numpy as np
import pandas as pd

In [9]:
import os
os.getcwd()
os.listdir( os.getcwd() )


Out[9]:
['2017datascibowl',
 'data_password.txt',
 'LSTM_model201702271930.save',
 'cleaning_dueSigmaFin.pyc',
 'LSTM_model201702280608.save',
 '.ipynb_checkpoints',
 'dueSigmaFinancial_kaggle.py',
 'LSTM_model.save',
 'LSTM_model201703012346.save',
 'DatSciBow2017_FullPreprocessTutorial.ipynb',
 'LSTM_model201702282350.save',
 'GRU_model201703022010.save',
 'DueSigmaFin_runs.ipynb',
 'ImagePreprocessing.ipynb',
 'dueSigmaFinancial_local.ipynb',
 'GRU_model201703012348.save',
 'GRU_model201703050709.save',
 'GRU_model201703021741.save',
 'kaggle.ipynb',
 'glass.csv',
 'DatSciBow2017_SVM.ipynb',
 '__init__.py',
 'train.h5',
 'dueSigmaFinancial_local_GRUs.ipynb',
 'cleaning_dueSigmaFin.py']

In [9]:
patients_stage1_feat = os.listdir('./2017datascibowl/stage1_feat')
print(len(patients_stage1_feat))


1595

In [10]:
patients_stage1_feat = [patientname.replace("feat_vec","") for patientname in patients_stage1_feat]

low-resolution case


In [10]:
patients_stage1_feat_lowres = os.listdir('./2017datascibowl/stage1_feat_lowres')
print(len(patients_stage1_feat_lowres))


1595

In [11]:
# Strip the "feat_vec" marker from every file name so only the patient ID is left.
# The loop variable is deliberately not called `id`: under Python 2 a
# list-comprehension variable leaks into the enclosing namespace and would
# replace the builtin id() for the rest of the session.
patients_stage1_feat_lowres = [fname.replace("feat_vec","") for fname in patients_stage1_feat_lowres]

Get the outcome $y$ for each example (its class label, i.e. the `cancer` column of `stage1_labels.csv`) by matching patient IDs


In [12]:
y_ids = pd.read_csv('./2017datascibowl/stage1_labels.csv')
print(len(y_ids))


1397

In [13]:
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
print(len(y_ids_found))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-de1b06632ffa> in <module>()
----> 1 y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]
      2 print(len(y_ids_found))

NameError: name 'patients_stage1_feat' is not defined

In [14]:
y_ids_found=y_ids.loc[y_ids['id'].isin(patients_stage1_feat_lowres)]
print(len(y_ids_found))


1397

In [15]:
# Keep only the patients (in directory-listing order) that have a row in the labels.
# The labelled IDs are materialised once as a set, so each membership test is a
# hash lookup instead of rebuilding and linearly scanning an ndarray per patient.
m = len(patients_stage1_feat)
labelled_ids = set(y_ids_found['id'].values)  # .values replaces the deprecated .as_matrix()
found_indices = [i for i in range(m) if patients_stage1_feat[i] in labelled_ids]

patients_stage1_feat_found = [patients_stage1_feat[i] for i in found_indices]
print(len(patients_stage1_feat_found))


1397

In [15]:
# Keep only the low-resolution patients (in directory-listing order) that have a
# row in the labels. The labelled IDs are materialised once as a set, so each
# membership test is a hash lookup instead of an ndarray rebuild + linear scan.
m = len(patients_stage1_feat_lowres)
labelled_ids = set(y_ids_found['id'].values)  # .values replaces the deprecated .as_matrix()
found_indices = [i for i in range(m) if patients_stage1_feat_lowres[i] in labelled_ids]

patients_stage1_lowres_found = [patients_stage1_feat_lowres[i] for i in found_indices]
print(len(patients_stage1_lowres_found))


1397

In [16]:
# Build the label vector in the same order as patients_stage1_feat_found.
# A single id -> cancer lookup table replaces the per-patient DataFrame scan
# (and the deprecated .as_matrix() call) of the original loop.
# NOTE(review): assumes each id appears once in the labels; if an id were
# duplicated, the last row wins, which keeps y_found aligned one-to-one with
# the patient list.
cancer_by_id = dict(zip(y_ids_found['id'].values, y_ids_found['cancer'].values))
y_found = np.array([cancer_by_id[patient] for patient in patients_stage1_feat_found
                    if patient in cancer_by_id])

In [16]:
# Build the label vector in the same order as patients_stage1_lowres_found.
# A single id -> cancer lookup table replaces the per-patient DataFrame scan
# (and the deprecated .as_matrix() call) of the original loop.
# NOTE(review): assumes each id appears once in the labels; if an id were
# duplicated, the last row wins, which keeps y_found aligned one-to-one with
# the patient list.
cancer_by_id = dict(zip(y_ids_found['id'].values, y_ids_found['cancer'].values))
y_found = np.array([cancer_by_id[patient] for patient in patients_stage1_lowres_found
                    if patient in cancer_by_id])

In [17]:
# it should be this condition, as the indices for each now correspond to each other 
len(y_found)==len(patients_stage1_feat_found)


Out[17]:
True

In [17]:
len(y_found)==len(patients_stage1_lowres_found)


Out[17]:
True

In [38]:
patients_stage1_feat_found;

low-resolution case


In [18]:
patients_stage1_feat_found = patients_stage1_lowres_found

Training, (Cross-)Validation, Test Set randomization and processing


In [43]:
# Split sizes: a fixed fraction of all examples goes to training, a fraction of
# the remainder goes to validation, and whatever is left becomes the test set.
ratio_of_train_to_total = 0.2
ratio_valid_to_rest = 0.2

numberofexamples = len(patients_stage1_feat_found)
numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)

examples_after_training = numberofexamples - numberoftrainingexamples
numbertovalidate = int(examples_after_training*ratio_valid_to_rest)
numbertotest = examples_after_training - numbertovalidate

# Printed one per line, in the order: total, test, train, validation.
for count in (numberofexamples, numbertotest, numberoftrainingexamples, numbertovalidate):
    print(count)


1397
895
279
223

In [44]:
# Draw the shuffle from a dedicated, seeded generator so the train/valid/test
# split is the same on every Restart & Run All (the unseeded global RNG gave a
# different split on each run). A private RandomState also leaves the global
# NumPy random state untouched.
SPLIT_SEED = 0  # any fixed integer works; change it to draw a different split
shuffledindices = np.random.RandomState(SPLIT_SEED).permutation( numberofexamples)

In [45]:
# Cut the shuffled index array into three consecutive pieces once, then apply
# the very same index sets to the patient IDs and to the labels so that the two
# stay aligned element-for-element.
train_end = numberoftrainingexamples
valid_end = numberoftrainingexamples + numbertovalidate
train_indices = shuffledindices[:train_end]
valid_indices = shuffledindices[train_end:valid_end]
test_indices  = shuffledindices[valid_end:]

# The loop variable is deliberately not called `id`: under Python 2 a
# list-comprehension variable leaks into the enclosing namespace and would
# replace the builtin id() for the rest of the session.
patients_train = [patients_stage1_feat_found[k] for k in train_indices]
patients_valid = [patients_stage1_feat_found[k] for k in valid_indices]
patients_test  = [patients_stage1_feat_found[k] for k in test_indices]

y_train = y_found[train_indices]
y_valid = y_found[valid_indices]
y_test  = y_found[test_indices]

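Indeed, the shuffled labels still line up with the shuffled patient IDs, as the sanity checks below confirm: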


In [46]:
# sanity check
y_ids.loc[y_ids['id']== patients_train[2]]


Out[46]:
id cancer
1272 e8be143b9f5e352f71043b24f79f5a17 0

In [47]:
# sanity check
y_train[2]


Out[47]:
0

In [48]:
#sanity check
for i in range(10,20):
    print(y_ids.loc[y_ids['id']== patients_train[i]]['cancer'].as_matrix().flatten() == y_train[i])


[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]

In [25]:
def load_feat_vec(patientid):
    """Load the saved feature vector of one patient.

    Reads the file ``./2017datascibowl/stage1_feat/<patientid>feat_vec``
    (relative to the current working directory) with ``np.load``.

    Parameters
    ----------
    patientid : str
        Patient ID, i.e. the file name without its ``feat_vec`` suffix.

    Returns
    -------
    Whatever ``np.load`` returns for that file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even when np.load raises.
    with open("./2017datascibowl/stage1_feat/"+patientid+"feat_vec","rb") as f:
        return np.load(f)

In [25]:
%time patients_train_vecs = [load_feat_vec(id) for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)


CPU times: user 136 ms, sys: 1.57 s, total: 1.7 s
Wall time: 9.23 s
(69, 2621448)

In [52]:
%time patients_valid_vecs = [load_feat_vec(id) for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)


CPU times: user 110 ms, sys: 1.65 s, total: 1.76 s
Wall time: 9.78 s
(66, 2621448)

In [26]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load the saved feature vector of one patient from a chosen feature set.

    Reads the file ``./2017datascibowl/<sub_name>/<patientid>feat_vec``
    (relative to the current working directory) with ``np.load``.

    Parameters
    ----------
    patientid : str
        Patient ID, i.e. the file name without its ``feat_vec`` suffix.
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding the feature files;
        defaults to ``"stage1_feat"``.

    Returns
    -------
    Whatever ``np.load`` returns for that file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even when np.load raises.
    with open("./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec","rb") as f:
        return np.load(f)

In [49]:
%time patients_train_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_train]
patients_train_vecs = np.array(patients_train_vecs)
print(patients_train_vecs.shape)


CPU times: user 144 ms, sys: 117 ms, total: 261 ms
Wall time: 2.4 s
(279, 40968)

In [50]:
%time patients_valid_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_valid]
patients_valid_vecs = np.array(patients_valid_vecs)
print(patients_valid_vecs.shape)


CPU times: user 112 ms, sys: 71.2 ms, total: 183 ms
Wall time: 2.08 s
(223, 40968)

Preprocess the $y$ labels into the $\{-1,+1\}$ representation for the binary classes (this is what gets passed to the SVM below), NOT the $\{0,1\}$ representation


In [54]:
y_train;

In [52]:
# 2nd representation of the training labels: work on a copy so y_train itself
# is left untouched, then overwrite every non-positive label with -1.
y_train_rep2 = np.array(y_train, copy=True)
np.putmask(y_train_rep2, y_train_rep2 <= 0, -1)

In [55]:
y_train_rep2;

In [56]:
y_train;

In [57]:
# 2nd representation of the validation labels: work on a copy so y_valid itself
# is left untouched, then overwrite every non-positive label with -1.
y_valid_rep2 = np.array(y_valid, copy=True)
np.putmask(y_valid_rep2, y_valid_rep2 <= 0, -1)

In [58]:
SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),1.0,1.0,0.001)  # C=1.0,sigma=1.0, alpha=0.001

In [59]:
SVM_stage1.build_W();
SVM_stage1.build_update();

In [60]:
%time SVM_stage1.train_model_full(100) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s


CPU times: user 10min 1s, sys: 16min 54s, total: 26min 56s
Wall time: 26min 54s
Out[60]:
array([ -91.00325012,  -78.68531036,  -78.06800079,  -78.55661774,
        -79.26457214,  -79.97239685,  -80.6671524 ,  -81.34812927,
        -82.01557922,  -82.66971588,  -83.30926514,  -83.93421173,
        -84.54647827,  -85.14653778,  -85.73464966,  -86.31061554,
        -86.87378693,  -87.42567444,  -87.96500397,  -88.49226379,
        -89.00816345,  -89.51264191,  -90.0063324 ,  -90.4901123 ,
        -90.96418762,  -91.42666626,  -91.87518311,  -92.31246948,
        -92.73873901,  -93.15627289,  -93.56546021,  -93.96653748,
        -94.35960388,  -94.74481964,  -95.12241364,  -95.49246216,
        -95.8536911 ,  -96.20722961,  -96.55370331,  -96.89065552,
        -97.21958923,  -97.54145813,  -97.85430908,  -98.15711212,
        -98.45239258,  -98.74053955,  -99.0219574 ,  -99.29771423,
        -99.56790161,  -99.8325119 , -100.09016418, -100.34063721,
       -100.58366394, -100.81987   , -101.05105591, -101.27754211,
       -101.49958801, -101.71712494, -101.92977905, -102.13681793,
       -102.3394928 , -102.5381546 , -102.7322998 , -102.92115784,
       -103.10598755, -103.28557587, -103.45835876, -103.62210083,
       -103.7815094 , -103.93566895, -104.08392334, -104.22496033,
       -104.36073303, -104.49319458, -104.62014008, -104.73869324,
       -104.85159302, -104.95965576, -105.06283569, -105.15847778,
       -105.24778748, -105.33259583, -105.41456604, -105.49463654,
       -105.57310486, -105.64996338, -105.72370148, -105.79509735,
       -105.86325836, -105.92457581, -105.97979736, -106.02992249,
       -106.07594299, -106.11862183, -106.15917969, -106.19702148,
       -106.23287964, -106.26145172, -106.28283691, -106.3019104 ])

In [62]:
SVM_stage1.build_b()


Out[62]:
(Elemwise{mul,no_inplace}.0, OrderedUpdates())

In [63]:
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)


CPU times: user 4.58 s, sys: 5.98 s, total: 10.6 s
Wall time: 10.5 s

In [64]:
print(np.sign(yhat_valid[0]).shape)
np.sign(yhat_valid[0])


(223,)
Out[64]:
array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1.], dtype=float32)

In [41]:
print(y_valid_rep2.shape)
y_valid_rep2


Out[41]:
array([-1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1,  1, -1,
       -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,
        1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,
        1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1,  1])

In [65]:
(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))


Out[65]:
0.73542600896860988

In [59]:
%time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs[0:2])


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-59-75ee5a149093> in <module>()
----> 1 get_ipython().magic(u'time yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs[0:2])')

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164 
   2165     #-------------------------------------------------------------------------

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086 

<decorator-gen-60> in time(self, line, cell, local_ns)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1175         else:
   1176             st = clock2()
-> 1177             exec(code, glob, local_ns)
   1178             end = clock2()
   1179             out = None

<timed exec> in <module>()

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in make_predictions_parallel(self, X_pred_vals)
    708 
    709                 predictions_function = theano.function(inputs=[],outputs=output)
--> 710                 predictions_vals = predictions_function()
    711                 self._yhat = theano.shared( predictions_vals ) # added this line later
    712 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    896                     node=self.fn.nodes[self.fn.position_of_error],
    897                     thunk=thunk,
--> 898                     storage_map=getattr(self.fn, 'storage_map', None))
    899             else:
    900                 # old-style linkers raise their own exceptions

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4606)()
    403                 if hasattr(fn, 'thunks'):
    404                     # For the CVM
--> 405                     gof.link.raise_with_op(fn.nodes[fn.position_of_error],
    406                                            fn.thunks[fn.position_of_error])
    407                 else:

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4490)()
    395 
    396         try:
--> 397             fn()
    398         except Exception:
    399             if hasattr(fn, 'position_of_error'):

MemoryError: Error allocating 723519648 bytes of device memory (CNMEM_STATUS_OUT_OF_MEMORY).
Apply node that caused the error: GpuElemwise{Composite{sqr((i0 - i1))},no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuDimShuffle{x,0}.0)
Toposort index: 4
Inputs types: [CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, row)]
Inputs shapes: [(69, 2621448), (1, 2621448)]
Inputs strides: [(2621448, 1), (0, 1)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[for{gpu,scan_fn}(Elemwise{maximum,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Composite{sqr((i0 - i1))},no_inplace}.0, GpuSubtensor{:int64:}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,gpu,scan_fn}(Elemwise{maximum,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{Set;:int64:}.0)
Toposort index: 58
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, vector), TensorType(int64, scalar), CudaNdarrayType(float32, vector), CudaNdarrayType(float32, vector), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, vector)]
Inputs shapes: [(), (69, 2621448), (2,), (), (69,), (69,), (69, 2621448), (70,)]
Inputs strides: [(), (2621448, 1), (1,), (), (1,), (1,), (2621448, 1), (1,)]
Inputs values: [array(69), 'not shown', CudaNdarray([ 0.   -0.25]), array(69), 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

Model selection; varying $C$ and $\sigma$


In [66]:
C_trial = np.array([0.01,0.05,0.1,0.5,1.,5.,10.,50.,100,500])
sigma_trial=np.array([0.1,0.5,1.,5.,10.])

In [67]:
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)

In [73]:
C_trial = np.array([0.01,0.1,1.,10.,100,500])
sigma_trial=np.array([0.1,1.,10.])

In [74]:
Csigma_mesh = np.meshgrid(C_trial,sigma_trial)

In [75]:
accuracy_score = np.zeros(Csigma_mesh[0].shape)

In [78]:
# Grid search: for every cell (i, j) of the C/sigma mesh, train a fresh SVM on
# the training set and store its accuracy on the validation set.
C_grid, sigma_grid = Csigma_mesh  # element [i, j] of each grid is that cell's C / sigma
n_sigma, n_C = len(sigma_trial), len(C_trial)
for i in range(n_sigma):
    for j in range(n_C):
        C_temp, sigma_temp = C_grid[i, j], sigma_grid[i, j]

        # Same build / train / predict sequence as the single run above, with 50 iterations.
        SVM_stage1 = SVM_parallel(patients_train_vecs,y_train_rep2,len(y_train_rep2),C_temp,sigma_temp,0.001)
        SVM_stage1.build_W()
        SVM_stage1.build_update()
        SVM_stage1.train_model_full(50)
        SVM_stage1.build_b()
        yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)

        # Accuracy = fraction of validation examples whose predicted sign equals the label.
        n_correct = (np.sign(yhat_valid[0]) == y_valid_rep2).sum()
        accuracy_score_temp = n_correct/float(len(y_valid_rep2))
        print(C_temp,sigma_temp,accuracy_score_temp)
        accuracy_score[i, j] = accuracy_score_temp


(0.01, 0.10000000000000001, 0.73542600896860988)
(0.10000000000000001, 0.10000000000000001, 0.73542600896860988)
(1.0, 0.10000000000000001, 0.73542600896860988)
(10.0, 0.10000000000000001, 0.73542600896860988)
(100.0, 0.10000000000000001, 0.73542600896860988)
(500.0, 0.10000000000000001, 0.73542600896860988)
(0.01, 1.0, 0.73542600896860988)
(0.10000000000000001, 1.0, 0.73542600896860988)
(1.0, 1.0, 0.73542600896860988)
(10.0, 1.0, 0.73542600896860988)
(100.0, 1.0, 0.73542600896860988)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-78-8001980da83f> in <module>()
      6         SVM_stage1.build_W();
      7         SVM_stage1.build_update();
----> 8         SVM_stage1.train_model_full(50)
      9         SVM_stage1.build_b()
     10         yhat_valid = SVM_stage1.make_predictions_parallel( patients_valid_vecs)

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in train_model_full(self, max_iters)
    541                         error = 0.
    542 
--> 543                         W_train = update_function()
    544 
    545                         if np.isnan( W_train ) or np.isinf( W_train):

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

/home/topolo/PropD/Theano/theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (/home/topolo/.theano/compiledir_Linux-4.2-fc23.x86_64-x86_64-with-fedora-23-Twenty_Three-x86_64-2.7.11-64/scan_perform/mod.cpp:4490)()
    395 
    396         try:
--> 397             fn()
    398         except Exception:
    399             if hasattr(fn, 'position_of_error'):

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

/home/topolo/PropD/Theano/theano/scan_module/scan_op.pyc in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

KeyboardInterrupt: 

low-resolution 64x64 case


In [11]:
def load_feat_vec(patientid,sub_name="stage1_feat"):
    """Load the saved feature vector of one patient from a chosen feature set.

    Reads the file ``./2017datascibowl/<sub_name>/<patientid>feat_vec``
    (relative to the current working directory) with ``np.load``.

    Parameters
    ----------
    patientid : str
        Patient ID, i.e. the file name without its ``feat_vec`` suffix.
    sub_name : str, optional
        Sub-directory of ``./2017datascibowl`` holding the feature files;
        defaults to ``"stage1_feat"``.

    Returns
    -------
    Whatever ``np.load`` returns for that file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even when np.load raises.
    with open("./2017datascibowl/"+sub_name+"/"+patientid+"feat_vec","rb") as f:
        return np.load(f)

In [12]:
def prepare_inputX(sub_name="stage1_feat_lowres64", ratio_of_train_to_total = 0.4,
                                                    ratio_valid_to_rest = 0.2,
                                                    seed=None):
    """Build shuffled train/validation/test splits of the labelled patients.

    Parameters
    ----------
    sub_name : str
        Sub-directory of ./2017datascibowl/ whose files are named
        "<patient id>feat_vec".
    ratio_of_train_to_total : float
        Fraction of the labelled patients assigned to the training split.
    ratio_valid_to_rest : float
        Fraction of the patients left after training that go to validation;
        everything remaining becomes the test split.
    seed : int or None
        When given, the shuffle uses its own np.random.RandomState(seed) so the
        split is reproducible.  When None (default) the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    (patient_ids, ys, Xs) : tuple of dict
        Each dict has the keys "train", "valid" and "test" and holds,
        respectively, the list of patient ids, the array of 'cancer' labels
        and the array of loaded feature vectors for that split.
    """
    data_dir = './2017datascibowl/'

    # remove the suffix "feat_vec" to recover the patient ids
    patients_stage1_feat = [fname.replace("feat_vec","") for fname in os.listdir(data_dir+sub_name)]

    # get y labels, restricted to patients that have a feature file
    y_ids = pd.read_csv(data_dir+'stage1_labels.csv')
    y_ids_found = y_ids.loc[y_ids['id'].isin(patients_stage1_feat)]

    # One dict lookup per patient instead of rescanning the label column for
    # every patient (.values also replaces the removed .as_matrix()).
    label_by_id = dict(zip(y_ids_found['id'].values, y_ids_found['cancer'].values))

    # Every labelled id must occur exactly once, otherwise labels and patients
    # could not be paired one-to-one.
    assert (len(label_by_id)==len(y_ids_found))

    # Keep directory-listing order for the patients that have a label.
    patients_stage1_feat_found = [pid for pid in patients_stage1_feat if pid in label_by_id]
    y_found = np.array([label_by_id[pid] for pid in patients_stage1_feat_found])

    numberofexamples = len(patients_stage1_feat_found)
    numberoftrainingexamples = int(numberofexamples*ratio_of_train_to_total)
    numbertovalidate = int((numberofexamples - numberoftrainingexamples)*ratio_valid_to_rest)
    train_end = numberoftrainingexamples
    valid_end = numberoftrainingexamples + numbertovalidate

    if seed is None:
        shuffledindices = np.random.permutation( numberofexamples)
    else:
        shuffledindices = np.random.RandomState(seed).permutation( numberofexamples)

    split_indices = {"train": shuffledindices[:train_end],
                     "valid": shuffledindices[train_end:valid_end],
                     "test":  shuffledindices[valid_end:]}

    patient_ids = {}
    ys = {}
    Xs = {}
    for split_name in ("train", "valid", "test"):
        indices = split_indices[split_name]
        patient_ids[split_name] = [patients_stage1_feat_found[i] for i in indices]
        ys[split_name] = y_found[indices]
        Xs[split_name] = np.array([load_feat_vec(pid,sub_name) for pid in patient_ids[split_name]])

    return patient_ids, ys, Xs

In [98]:
# 64x64 features: 20% of the labelled patients train, 20% of the remainder validate.
patient_ids64, ys64, Xs64 = prepare_inputX(sub_name="stage1_feat_lowres64",
                                           ratio_of_train_to_total=0.2,
                                           ratio_valid_to_rest=0.2)

In [99]:
# 2nd label representation: every label <= 0 becomes -1, other labels are kept.
y_train_rep2 = ys64["train"].copy()
np.place(y_train_rep2, y_train_rep2 <= 0, -1)

y_valid_rep2 = ys64["valid"].copy()
np.place(y_valid_rep2, y_valid_rep2 <= 0, -1)

y_test_rep2 = ys64["test"].copy()
np.place(y_test_rep2, y_test_rep2 <= 0, -1)

In [100]:
# Candidate values for the C and sigma arguments passed to SVM_parallel below.
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]

In [101]:
# Construct the parallel SVM on the 64x64 training split, then build its W and update expressions.
SVM_stage1 = SVM_parallel(Xs64["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005)  # C=1.0, sigma=1.0; the last argument (called alpha in the original note) is 0.0005

SVM_stage1.build_W();
SVM_stage1.build_update();

In [102]:
# The recorded run of this cell ended in a GPU "out of memory" RuntimeError (see the traceback below).
%time SVM_stage1.train_model_full(3) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-102-e9bda6d66b45> in <module>()
----> 1 get_ipython().magic(u'time SVM_stage1.train_model_full(3) # training iterations 100->user 10min 1s, sys: 16min 54s, total: 26min 56s')

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164 
   2165     #-------------------------------------------------------------------------

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086 

<decorator-gen-60> in time(self, line, cell, local_ns)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1171         if mode=='eval':
   1172             st = clock2()
-> 1173             out = eval(code, glob, local_ns)
   1174             end = clock2()
   1175         else:

<timed eval> in <module>()

/home/topolo/PropD/MLgrabbag/ML/SVM.pyc in train_model_full(self, max_iters)
    541                         error = 0.
    542 
--> 543                         W_train = update_function()
    544 
    545                         if np.isnan( W_train ) or np.isinf( W_train):

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    896                     node=self.fn.nodes[self.fn.position_of_error],
    897                     thunk=thunk,
--> 898                     storage_map=getattr(self.fn, 'storage_map', None))
    899             else:
    900                 # old-style linkers raise their own exceptions

/home/topolo/PropD/Theano/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

/home/topolo/PropD/Theano/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

RuntimeError: Cuda error: GpuElemwise node_f36d4d896685d79bc0957e19896d6464_0 Mul: out of memory.
    n_blocks=9 threads_per_block=32
   Call: kernel_Mul_node_f36d4d896685d79bc0957e19896d6464_0_Ccontiguous<<<n_blocks, threads_per_block>>>(numEls, i0_data, i1_data, o0_data)

Apply node that caused the error: GpuElemwise{mul,no_inplace}(GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0)
Toposort index: 176
Inputs types: [CudaNdarrayType(float32, vector), CudaNdarrayType(float32, vector)]
Inputs shapes: [(279,), (279,)]
Inputs strides: [(1,), (1,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[forall_inplace,gpu,scan_fn&grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuElemwise{mul,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuElemwise{mul,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuFromHost.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuAlloc{memset_0=True}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, Elemwise{Composite{minimum(minimum(i0, i1), i2)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, ScalarFromTensor.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

low-resolution 16x16 case


In [361]:
# 16x16 features: 55% of the labelled patients train, 30% of the remainder validate.
patient_ids16, ys16, Xs16 = prepare_inputX(sub_name="stage1_feat_lowres",
                                           ratio_of_train_to_total=0.55,
                                           ratio_valid_to_rest=0.3)

In [104]:
# 2nd label representation for the 16x16 splits: every label <= 0 becomes -1.
# Note: these names overwrite the 64x64 versions defined earlier in the notebook.
y_train_rep2 = ys16["train"].copy()
np.place(y_train_rep2, y_train_rep2 <= 0, -1)

y_valid_rep2 = ys16["valid"].copy()
np.place(y_valid_rep2, y_valid_rep2 <= 0, -1)

y_test_rep2 = ys16["test"].copy()
np.place(y_test_rep2, y_test_rep2 <= 0, -1)

In [105]:
# Candidate values for the C and sigma arguments passed to SVM_parallel below.
C_trial=[0.1,1.0,10.]
sigma_trial=[0.1,1.0,10.]

In [116]:
# Grid of accuracy scores, one row per C_trial entry and one column per sigma_trial entry.
accuracy_scores=np.zeros((len(C_trial),len(sigma_trial)) )

In [106]:
# Construct the parallel SVM on the 16x16 training split, then build its W and update expressions.
SVM_stage1 = SVM_parallel(Xs16["train"],y_train_rep2,len(y_train_rep2),
                          C_trial[1],sigma_trial[1],0.0005)  # C=1.0, sigma=1.0; the last argument (called alpha in the original note) is 0.0005

SVM_stage1.build_W();
SVM_stage1.build_update();

In [112]:
# Train for 200 iterations; the recorded run took about 6h 45min of wall time.
%time SVM_stage1.train_model_full(200)  # with 3 training iterations, CPU times were: user 3min 33s, sys: 2min 34s, total: 6min 8s


CPU times: user 3h 59min 7s, sys: 2h 46min 6s, total: 6h 45min 14s
Wall time: 6h 44min 57s
Out[112]:
array([-200.83131409, -202.55712891, -204.33587646, -206.08123779,
       -207.7901001 , -209.45532227, -211.08381653, -212.67712402,
       -214.23626709, -215.76069641, -217.24943542, -218.70394897,
       -220.12663269, -221.52130127, -222.88653564, -224.22303772,
       -225.5309906 , -226.809021  , -228.05993652, -229.28213501,
       -230.47427368, -231.64016724, -232.77420044, -233.8817749 ,
       -234.96562195, -236.02755737, -237.06628418, -238.08067322,
       -239.07070923, -240.03775024, -240.98564148, -241.91238403,
       -242.81768799, -243.69985962, -244.56072998, -245.39962769,
       -246.21714783, -247.0166626 , -247.79553223, -248.55636597,
       -249.2930603 , -250.00802612, -250.70266724, -251.37922668,
       -252.04145813, -252.6869812 , -253.31645203, -253.93121338,
       -254.53184509, -255.11587524, -255.68267822, -256.22912598,
       -256.75299072, -257.25146484, -257.72674561, -258.18280029,
       -258.62084961, -259.04241943, -259.44540405, -259.83544922,
       -260.21270752, -260.56716919, -260.90518188, -261.22686768,
       -261.53460693, -261.83227539, -262.11776733, -262.38827515,
       -262.64398193, -262.88275146, -263.09793091, -263.29821777,
       -263.4864502 , -263.65771484, -263.81768799, -263.9666748 ,
       -264.09890747, -264.21740723, -264.32885742, -264.42810059,
       -264.51879883, -264.5965271 , -264.66400146, -264.72744751,
       -264.78866577, -264.84820557, -264.90667725, -264.96395874,
       -265.02001953, -265.07522583, -265.12902832, -265.1819458 ,
       -265.23364258, -265.28430176, -265.33435059, -265.38296509,
       -265.43078613, -265.47763062, -265.5234375 , -265.568573  ,
       -265.6126709 , -265.65612793, -265.69830322, -265.73983765,
       -265.78070068, -265.82034302, -265.85934448, -265.89813232,
       -265.93530273, -265.97241211, -266.00817871, -266.04336548,
       -266.07836914, -266.11212158, -266.14562988, -266.17828369,
       -266.21017456, -266.24133301, -266.2723999 , -266.30255127,
       -266.33163452, -266.36083984, -266.38909912, -266.41702271,
       -266.44393921, -266.47064209, -266.49713135, -266.52258301,
       -266.54785156, -266.57214355, -266.59625244, -266.62023926,
       -266.64318848, -266.66607666, -266.68841553, -266.71020508,
       -266.73144531, -266.75241089, -266.77307129, -266.79315186,
       -266.81295776, -266.83227539, -266.85095215, -266.86972046,
       -266.88806152, -266.90594482, -266.92321777, -266.9402771 ,
       -266.95733643, -266.97387695, -266.99002075, -267.0057373 ,
       -267.02105713, -267.03643799, -267.05114746, -267.0657959 ,
       -267.08013916, -267.09448242, -267.10797119, -267.12173462,
       -267.13458252, -267.14758301, -267.16009521, -267.17285156,
       -267.18481445, -267.19696045, -267.2086792 , -267.22021484,
       -267.23132324, -267.24230957, -267.25332642, -267.2635498 ,
       -267.2741394 , -267.28424072, -267.29418945, -267.30377197,
       -267.3137207 , -267.32315063, -267.33215332, -267.34103394,
       -267.34991455, -267.3588562 , -267.36724854, -267.37567139,
       -267.38348389, -267.39151001, -267.39941406, -267.40686035,
       -267.41485596, -267.42202759, -267.42932129, -267.43618774,
       -267.44311523, -267.44989014, -267.45648193, -267.46334839,
       -267.46966553, -267.47601318, -267.48187256, -267.48797607])

In [113]:
# Build the model's b term after training (presumably the SVM intercept — confirm in SVM.py).
SVM_stage1.build_b()


Out[113]:
(Elemwise{mul,no_inplace}.0, OrderedUpdates())

In [114]:
# Predict on the 16x16 validation split; element 0 of the result is used as the prediction vector below.
yhat_valid = SVM_stage1.make_predictions_parallel( Xs16["valid"] )

In [115]:
# Validation accuracy: fraction of predictions whose sign equals the -1/+1 label.
accuracy_score_temp=(np.sign(yhat_valid[0]) == y_valid_rep2).sum()/float(len(y_valid_rep2))
print(accuracy_score_temp)


0.712765957447

In [118]:
# Store the validation accuracy at grid position [1][1], i.e. C_trial[1] and sigma_trial[1].
accuracy_scores[1][1]=accuracy_score_temp

In [119]:
# Predict on the 16x16 test split (timed).
%time yhat_test = SVM_stage1.make_predictions_parallel( Xs16["test"] )


CPU times: user 30.6 s, sys: 33.6 s, total: 1min 4s
Wall time: 1min 4s

In [120]:
# Test accuracy: fraction of predictions whose sign equals the -1/+1 label; also print the test-set size.
accuracy_score_temp_test=(np.sign(yhat_test[0]) == y_test_rep2).sum()/float(len(y_test_rep2))
print(len(y_test_rep2))
print(accuracy_score_temp_test)


441
0.759637188209

low-resolution 32x32 case


In [307]:
# Length of one 32x32 feature vector.
# NOTE(review): Xs32 is only assigned in a later cell, so this cell fails on Restart & Run All — move it below the prepare_inputX call.
Xs32["train"][0].shape


Out[307]:
(163848,)

In [13]:
# 32x32 features: 27.5% of the labelled patients train, 25% of the remainder validate.
# Original note on ratios: 0.275,0.25 works; 0.30,0.25 works.
patient_ids32, ys32, Xs32 = prepare_inputX(sub_name="stage1_feat_lowres32",
                                           ratio_of_train_to_total=0.275,
                                           ratio_valid_to_rest=0.25)

In [14]:
# 2nd label representation for the 32x32 splits: every label <= 0 becomes -1.
y32_train_rep2 = ys32["train"].copy()
np.place(y32_train_rep2, y32_train_rep2 <= 0, -1)

y32_valid_rep2 = ys32["valid"].copy()
np.place(y32_valid_rep2, y32_valid_rep2 <= 0, -1)

y32_test_rep2 = ys32["test"].copy()
np.place(y32_test_rep2, y32_test_rep2 <= 0, -1)

In [15]:
# Candidate values for the C and sigma arguments of the 32x32 model.
C32_trial=[0.1,1.0,10.,200.]
sigma32_trial=[0.1,1.0,10.]

In [16]:
# Show the C value used for the 32x32 model below.
C32_trial[3]


Out[16]:
200.0

In [17]:
# Grid for accuracy scores, one row per C32_trial entry and one column per sigma32_trial entry.
accuracy_scores32=np.zeros((len(C32_trial),len(sigma32_trial)) )

In [18]:
# Construct the parallel SVM on the 32x32 training split with C=200 and sigma=1.0,
# then build its W and update expressions.
SVM_stage1_32 = SVM_parallel(Xs32["train"],y32_train_rep2,len(y32_train_rep2),
                          C32_trial[3],sigma32_trial[1],0.0005)  

SVM_stage1_32.build_W();
SVM_stage1_32.build_update();

In [ ]:
# Train the 32x32 model for 20 iterations (no output was recorded for this cell).
%time SVM_stage1_32.train_model_full(20) # with 2 training iterations, CPU times were: user 1min 54s, sys: 2min 34s, total: 4min 29s

Predictions for the competition, over the full data set


In [126]:
# List every 16x16 feature file and strip the "feat_vec" suffix to get the patient ids.
# The loop variable is not called `id`: in Python 2 a list-comprehension variable
# leaks into the notebook namespace and would shadow the builtin id().
patients_stage1_ids = os.listdir('./2017datascibowl/stage1_feat_lowres')
patients_stage1_ids = [fname.replace("feat_vec","") for fname in patients_stage1_ids]  # remove the suffix "feat_vec"

print(len(patients_stage1_ids))


1595

In [128]:
# Load the 16x16 feature vector of every patient, in the order of patients_stage1_ids.
# The loop variable is not called `id`, so the builtin id() is not shadowed (Python 2 leaks it).
patients_vecs = [load_feat_vec(pid,"stage1_feat_lowres") for pid in patients_stage1_ids]
patients_vecs = np.array(patients_vecs)

In [154]:
# Predict for all patients with a 16x16 feature file (timed).
%time yhat = SVM_stage1.make_predictions_parallel( patients_vecs )


CPU times: user 57.1 s, sys: 1min 5s, total: 2min 2s
Wall time: 2min 2s

In [155]:
yhat_rep2 = np.copy(yhat[0])  # copy of the first element of the prediction output
yhat_rep2 = np.sign( yhat_rep2);  # representation 2: binary classes as {-1,1} (np.sign gives 0 for an exact 0)
yhat_rep1 = np.copy(yhat_rep2)  # representation 1 starts from the signs ...
np.place(yhat_rep1,yhat_rep1<0.,0.)  # ... with negatives replaced by 0: binary classes as {0,1}

In [158]:
# Summary statistics of the 0/1 class predictions; no trailing ';' so the table is displayed.
pd.DataFrame(yhat_rep1).describe()


Out[158]:
0
count 1595.000000
mean 0.126646
std 0.332678
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000

In [206]:
# Turn predictions into probabilities with the SVM object's make_prob_Pratt method.
# NOTE(review): the argument is the 0/1 class vector yhat_rep1, not the raw outputs in yhat[0] — confirm this is the intended input.
Prattscaling_results = SVM_stage1.make_prob_Pratt(yhat_rep1)

In [207]:
# Summary statistics of the predicted probabilities.
pd.DataFrame(Prattscaling_results[0]).describe()


Out[207]:
0
count 1595.000000
mean 0.125520
std 0.271362
min 0.004191
25% 0.006749
50% 0.033630
75% 0.033630
max 0.837107

In [214]:
# Count the patients for which thresholding the probability at 0.1 reproduces the 0/1 class prediction.
((Prattscaling_results[0]>0.1).astype("float32")==yhat_rep1).sum()


Out[214]:
1595

In [213]:
# Sanity check: the probability and class vectors have the same length; also show the class dtype.
print(len(Prattscaling_results[0]))
print(len(yhat_rep1));print(yhat_rep1.dtype)


1595
1595
float32

In [143]:
%time patients_found_vecs = [load_feat_vec(id,"stage1_feat_lowres") for id in patients_stage1_lowres_found]
patients_found_vecs = np.array(patients_found_vecs)


CPU times: user 315 ms, sys: 203 ms, total: 518 ms
Wall time: 516 ms

In [144]:
# (number of patients, feature-vector length)
patients_found_vecs.shape


Out[144]:
(1397, 40968)

In [148]:
# 2nd label representation of y_found: every label <= 0 becomes -1.
# NOTE(review): y_found is not assigned at notebook level in any visible cell — confirm where it comes from.
y_found_rep2 = np.copy(y_found)
y_found_rep2[y_found_rep2<=0]=-1

In [151]:
# Predict for the patients in patients_found_vecs (timed).
%time yhat_found = SVM_stage1.make_predictions_parallel( patients_found_vecs )


CPU times: user 52.9 s, sys: 1min, total: 1min 53s
Wall time: 1min 53s

In [152]:
# Accuracy over these patients: fraction of predictions whose sign equals the -1/+1 label.
# NOTE(review): if this set includes the training split, the score is optimistic — confirm.
accuracy_score_temp_found=(np.sign(yhat_found[0]) == y_found_rep2).sum()/float(len(y_found_rep2))
print(accuracy_score_temp_found)


0.885468861847

In [216]:
# Distinct predicted class values.
np.unique(yhat_rep1)


Out[216]:
array([ 0.,  1.], dtype=float32)

In [217]:
# Distinct label values.
np.unique(y_found)


Out[217]:
array([0, 1])

In [218]:
# Number of non-zero labels in y_found.
np.count_nonzero(y_found)


Out[218]:
362

In [219]:
# Number of patients predicted as class 1 (non-zero entries of yhat_rep1).
np.count_nonzero(yhat_rep1)


Out[219]:
202

In [223]:
# First 100 predicted probabilities.
Prattscaling_results[0][:100]


Out[223]:
array([ 0.00676017,  0.03362985,  0.0066766 ,  0.00684445,  0.00445424,
        0.03362985,  0.03362985,  0.00494832,  0.00491834,  0.03362985,
        0.00515345,  0.00684445,  0.03362985,  0.03362985,  0.03362985,
        0.03362985,  0.03362985,  0.00475647,  0.03362985,  0.03362985,
        0.00570897,  0.83710682,  0.00640817,  0.00506418,  0.03362985,
        0.00559623,  0.03362985,  0.03362985,  0.83710682,  0.03362985,
        0.83710682,  0.83710682,  0.83710682,  0.03362985,  0.00684445,
        0.03362985,  0.03362985,  0.83710682,  0.00573395,  0.00571569,
        0.03362985,  0.004754  ,  0.03362985,  0.00468638,  0.03362985,
        0.03362985,  0.00556879,  0.83710682,  0.00557399,  0.03362985,
        0.00553154,  0.00537084,  0.83710682,  0.03362985,  0.03362985,
        0.00582898,  0.03362985,  0.03362985,  0.03362985,  0.03362985,
        0.005304  ,  0.03362985,  0.03362985,  0.83710682,  0.83710682,
        0.00684445,  0.00636829,  0.03362985,  0.00427081,  0.00435718,
        0.03362985,  0.83710682,  0.83710682,  0.00684445,  0.03362985,
        0.00424011,  0.03362985,  0.00684445,  0.83710682,  0.00473127,
        0.03362985,  0.03362985,  0.03362985,  0.00505903,  0.03362985,
        0.03362985,  0.00623662,  0.03362985,  0.00512414,  0.03362985,
        0.00596082,  0.03362985,  0.006568  ,  0.03362985,  0.03362985,
        0.83710682,  0.00523753,  0.03362985,  0.03362985,  0.03362985], dtype=float32)

Submissions


In [224]:
# Load the competition's sample submission file (columns "id" and "cancer").
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")

In [226]:
# Summary statistics of the sample submission's numeric column.
stage1_sample_submission_csv.describe()


Out[226]:
cancer
count 198.0
mean 0.5
std 0.0
min 0.5
25% 0.5
50% 0.5
75% 0.5
max 0.5

In [228]:
# First rows of the sample submission.
stage1_sample_submission_csv.head()


Out[228]:
id cancer
0 026470d51482c93efc18b9803159c960 0.5
1 031b7ec4fe96a3b035a8196264a8c8c3 0.5
2 03bd22ed5858039af223c04993e9eb22 0.5
3 06a90409e4fcea3e634748b967993531 0.5
4 07b1defcfae5873ee1f03c90255eb170 0.5

We need to match up these ids with what we have.


In [236]:
# For each id in the sample submission (kept in submission order), find its position
# in patients_stage1_ids; that position also indexes the prediction arrays.
# A dict lookup replaces the nested loop, which rebuilt the id array on every
# inner iteration.  Patient ids come from file names and are assumed unique.
sample_ids = stage1_sample_submission_csv['id'].values
m = len(patients_stage1_ids)
m_sample = len(sample_ids)
index_by_patient_id = dict((pid, i) for i, pid in enumerate(patients_stage1_ids))
sample_indices = [index_by_patient_id[sid] for sid in sample_ids if sid in index_by_patient_id]

patients_sample_ids = [patients_stage1_ids[i] for i in sample_indices]
print(len(patients_sample_ids))


198

In [237]:
# Check that the matched ids are exactly the ids in the sample submission.
set(stage1_sample_submission_csv['id'].as_matrix()) == set(np.array(patients_sample_ids))


Out[237]:
True

In [240]:
# Predicted probabilities for the sample-submission patients, in submission order.
sample_yhat_prob = np.array( [Prattscaling_results[0][idx] for idx in sample_indices] )

In [246]:
# Summary statistics of the predicted probabilities over all patients.
pd.DataFrame(Prattscaling_results[0]).describe()


Out[246]:
0
count 1595.000000
mean 0.125520
std 0.271362
min 0.004191
25% 0.006749
50% 0.033630
75% 0.033630
max 0.837107

In [247]:
# Show the predicted probability array.
Prattscaling_results[0]


Out[247]:
array([ 0.00676017,  0.03362985,  0.0066766 , ...,  0.00669918,
        0.0060775 ,  0.03362985], dtype=float32)

In [232]:
# First patient id in the sample submission.
stage1_sample_submission_csv['id'].as_matrix()[0]


Out[232]:
'026470d51482c93efc18b9803159c960'

In [250]:
# Assemble the submission table: one row per sample patient with its predicted probability.
sample_out = pd.DataFrame(zip(patients_sample_ids,sample_yhat_prob))
sample_out.columns=["id","cancer"]

In [278]:
# First rows of the submission table.
sample_out.head()


Out[278]:
id cancer
0 026470d51482c93efc18b9803159c960 0.03363
1 031b7ec4fe96a3b035a8196264a8c8c3 0.03363
2 03bd22ed5858039af223c04993e9eb22 0.03363
3 06a90409e4fcea3e634748b967993531 0.03363
4 07b1defcfae5873ee1f03c90255eb170 0.03363

In [279]:
# Write the submission file without the DataFrame index column.
sample_out.to_csv("./2017datascibowl/samplesubmit00.csv",index=False)

In [265]:
import time

In [276]:
#time.gmtime().__str__()


Out[276]:
'time.struct_time(tm_year=2017, tm_mon=4, tm_mday=7, tm_hour=11, tm_min=50, tm_sec=35, tm_wday=4, tm_yday=97, tm_isdst=0)'

In [281]:
# 0/1 class predictions for the sample-submission patients, in submission order.
# (The stray bare `yhat_rep1` expression was removed: a non-final expression in a cell has no effect.)
sample_yhat_cls = np.array( [yhat_rep1[idx] for idx in sample_indices] )

In [282]:
# Show the class predictions for the sample patients (all 0 in the recorded output).
sample_yhat_cls


Out[282]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.], dtype=float32)

Predictions for the competition, for only the sample-submission patients


In [284]:
# Load the sample submission again (same file as read earlier in the notebook).
stage1_sample_submission_csv = pd.read_csv("./2017datascibowl/stage1_sample_submission.csv")

In [287]:
# Load the 16x16 feature vectors of the sample-submission patients, in submission order.
# The loop variable is not called `id` (Python 2 leaks it and would shadow the builtin),
# and .values replaces .as_matrix(), which newer pandas no longer provides.
sub_name="stage1_feat_lowres"
patients_sample_vecs = np.array( [load_feat_vec(pid,sub_name) for pid in stage1_sample_submission_csv['id'].values] )

In [289]:
# Predict for the sample-submission patients only (timed).
%time yhat_sample = SVM_stage1.make_predictions_parallel( patients_sample_vecs )


CPU times: user 24.7 s, sys: 27.5 s, total: 52.2 s
Wall time: 52.1 s

In [290]:
yhat_sample_rep2 = np.copy(yhat_sample[0])  # copy of the first element of the prediction output
yhat_sample_rep2 = np.sign( yhat_sample_rep2);  # representation 2: binary classes as {-1,1} (np.sign gives 0 for an exact 0)
yhat_sample_rep1 = np.copy(yhat_sample_rep2)  # representation 1 starts from the signs ...
np.place(yhat_sample_rep1,yhat_sample_rep1<0.,0.)  # ... with negatives replaced by 0: binary classes as {0,1}

In [293]:
# Raw predictions for the sample patients; in the recorded output every entry is the same value.
yhat_sample[0]


Out[293]:
CudaNdarray([-0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284
 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284 -0.47227284])

In [ ]: