In [16]:

    
import numpy as np
import pandas as pd
# Report the pandas release in use. pd.__version__ is the public version
# attribute (the pd.version module is gone from later releases), and a
# single-argument print(...) emits the same text on Python 2 and Python 3.
print('Pandas version: {}'.format(pd.__version__))









    



Pandas version: 0.13.1



In [2]:

    
# Read the SBM features, the FNC features and the class labels of the
# training set; each table uses its 'Id' column as the row index.
df_SBM_train, df_FNC_train, df_labels = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./Train/train_SBM.csv',
                     './Train/train_FNC.csv',
                     './Train/train_labels.csv')
)



In [3]:

    
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the SBM training features.
df_SBM_train.describe()









    Out[3]:






  
    
      
      SBM_map1
      SBM_map2
      SBM_map3
      SBM_map4
      SBM_map5
      SBM_map6
      SBM_map7
      SBM_map8
      SBM_map10
      SBM_map13
      SBM_map17
      SBM_map22
      SBM_map26
      SBM_map28
      SBM_map32
      SBM_map36
      SBM_map40
      SBM_map43
      SBM_map45
      SBM_map48
      
    
  
  
    
      count
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
      ...
    
    
      mean
        0.134995
       -0.094195
       -0.076866
        0.012471
        0.073704
       -0.004953
        0.118802
        0.106247
        0.064476
       -0.007778
        0.038051
       -0.000217
        0.049556
       -0.050626
        0.112465
        0.052462
        0.111135
       -0.066720
        0.088153
        0.068664
      ...
    
    
      std
        0.883114
        0.937444
        0.882526
        0.937842
        0.941510
        0.994260
        0.926693
        0.964946
        1.017846
        1.081989
        1.095996
        0.924269
        1.016228
        1.029606
        1.033777
        0.941974
        0.946056
        1.020597
        0.956373
        1.041473
      ...
    
    
      min
       -1.945161
       -2.183338
       -2.241210
       -2.005569
       -2.503497
       -3.099610
       -1.964108
       -2.305639
       -2.880430
       -2.542064
       -2.266664
       -2.498967
       -2.031587
       -2.830852
       -2.018822
       -2.298281
       -2.481110
       -2.392854
       -2.327743
       -3.060313
      ...
    
    
      25%
       -0.400377
       -0.632557
       -0.641487
       -0.726047
       -0.459417
       -0.600037
       -0.547326
       -0.471092
       -0.567420
       -0.800934
       -0.839474
       -0.407716
       -0.648872
       -0.733467
       -0.534045
       -0.516783
       -0.480631
       -0.824895
       -0.522789
       -0.605645
      ...
    
    
      50%
        0.087173
       -0.118148
       -0.141126
       -0.034350
        0.128777
        0.028739
        0.107167
        0.172475
        0.178142
        0.009001
        0.069828
        0.006513
       -0.091811
       -0.126834
        0.125509
        0.035049
        0.055866
       -0.068575
        0.106404
        0.061215
      ...
    
    
      75%
        0.730467
        0.575077
        0.653878
        0.587966
        0.573641
        0.613419
        0.752733
        0.739824
        0.620836
        0.778447
        0.768192
        0.608311
        0.590319
        0.532110
        0.775017
        0.699353
        0.822284
        0.670277
        0.755848
        0.594908
      ...
    
    
      max
        2.419532
        1.778067
        1.784392
        2.633859
        2.296887
        2.895577
        2.707580
        2.930673
        2.897110
        2.673868
        2.959008
        1.735059
        2.724727
        3.047440
        2.756722
        2.298306
        2.473325
        2.663749
        2.632448
        2.483554
      ...
    
  

8 rows × 32 columns



In [4]:

    
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the FNC training features.
df_FNC_train.describe()









    Out[4]:






  
    
      
      FNC1
      FNC2
      FNC3
      FNC4
      FNC5
      FNC6
      FNC7
      FNC8
      FNC9
      FNC10
      FNC11
      FNC12
      FNC13
      FNC14
      FNC15
      FNC16
      FNC17
      FNC18
      FNC19
      FNC20
      
    
  
  
    
      count
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
      ...
    
    
      mean
        0.211394
        0.049030
       -0.091031
       -0.102662
        0.158572
        0.042185
       -0.103091
        0.024820
        0.129220
        0.120118
       -0.026984
        0.118425
       -0.057568
       -0.018594
        0.012992
        0.198765
        0.134094
        0.116469
       -0.048463
        0.074565
      ...
    
    
      std
        0.258132
        0.270017
        0.324109
        0.320256
        0.279851
        0.289825
        0.279123
        0.249710
        0.256124
        0.230091
        0.320044
        0.257747
        0.293518
        0.222936
        0.243644
        0.252242
        0.292570
        0.261642
        0.270248
        0.240503
      ...
    
    
      min
       -0.391350
       -0.673890
       -0.734010
       -0.775180
       -0.510520
       -0.582640
       -0.637690
       -0.527620
       -0.460770
       -0.434500
       -0.722000
       -0.349420
       -0.597630
       -0.521770
       -0.479970
       -0.433540
       -0.568170
       -0.455010
       -0.611690
       -0.584940
      ...
    
    
      25%
        0.061147
       -0.129993
       -0.344335
       -0.351830
       -0.049274
       -0.162628
       -0.312198
       -0.115870
       -0.047829
       -0.034778
       -0.237752
       -0.078697
       -0.302097
       -0.201118
       -0.138442
        0.041785
       -0.095365
       -0.034860
       -0.241715
       -0.081967
      ...
    
    
      50%
        0.238885
        0.100597
       -0.088119
       -0.082341
        0.194575
        0.052100
       -0.096169
        0.024141
        0.155995
        0.107495
       -0.006029
        0.110625
       -0.061210
       -0.022768
        0.003046
        0.190660
        0.137335
        0.115265
       -0.009641
        0.060237
      ...
    
    
      75%
        0.401767
        0.256295
        0.107702
        0.135475
        0.377210
        0.231305
        0.083581
        0.216883
        0.300102
        0.265210
        0.156185
        0.335000
        0.126425
        0.157753
        0.141903
        0.388057
        0.375603
        0.309670
        0.126193
        0.240020
      ...
    
    
      max
        0.820240
        0.766600
        0.601020
        0.731570
        0.763950
        0.607150
        0.612370
        0.610530
        0.729200
        0.629010
        0.655700
        0.853550
        0.584890
        0.393810
        0.602690
        0.735400
        0.707900
        0.730410
        0.531700
        0.649770
      ...
    
  

8 rows × 378 columns



In [5]:

    
# Put the SBM and FNC feature columns side by side (SBM columns first),
# matching rows on the shared 'Id' index. 'left' is DataFrame.join's default
# and is spelled out here only for the reader.
df_FULL = df_SBM_train.join(df_FNC_train, how='left')



In [6]:

    
# Summary statistics of the combined SBM + FNC feature table.
df_FULL.describe()









    Out[6]:






  
    
      
      SBM_map1
      SBM_map2
      SBM_map3
      SBM_map4
      SBM_map5
      SBM_map6
      SBM_map7
      SBM_map8
      SBM_map10
      SBM_map13
      SBM_map17
      SBM_map22
      SBM_map26
      SBM_map28
      SBM_map32
      SBM_map36
      SBM_map40
      SBM_map43
      SBM_map45
      SBM_map48
      
    
  
  
    
      count
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
       86.000000
      ...
    
    
      mean
        0.134995
       -0.094195
       -0.076866
        0.012471
        0.073704
       -0.004953
        0.118802
        0.106247
        0.064476
       -0.007778
        0.038051
       -0.000217
        0.049556
       -0.050626
        0.112465
        0.052462
        0.111135
       -0.066720
        0.088153
        0.068664
      ...
    
    
      std
        0.883114
        0.937444
        0.882526
        0.937842
        0.941510
        0.994260
        0.926693
        0.964946
        1.017846
        1.081989
        1.095996
        0.924269
        1.016228
        1.029606
        1.033777
        0.941974
        0.946056
        1.020597
        0.956373
        1.041473
      ...
    
    
      min
       -1.945161
       -2.183338
       -2.241210
       -2.005569
       -2.503497
       -3.099610
       -1.964108
       -2.305639
       -2.880430
       -2.542064
       -2.266664
       -2.498967
       -2.031587
       -2.830852
       -2.018822
       -2.298281
       -2.481110
       -2.392854
       -2.327743
       -3.060313
      ...
    
    
      25%
       -0.400377
       -0.632557
       -0.641487
       -0.726047
       -0.459417
       -0.600037
       -0.547326
       -0.471092
       -0.567420
       -0.800934
       -0.839474
       -0.407716
       -0.648872
       -0.733467
       -0.534045
       -0.516783
       -0.480631
       -0.824895
       -0.522789
       -0.605645
      ...
    
    
      50%
        0.087173
       -0.118148
       -0.141126
       -0.034350
        0.128777
        0.028739
        0.107167
        0.172475
        0.178142
        0.009001
        0.069828
        0.006513
       -0.091811
       -0.126834
        0.125509
        0.035049
        0.055866
       -0.068575
        0.106404
        0.061215
      ...
    
    
      75%
        0.730467
        0.575077
        0.653878
        0.587966
        0.573641
        0.613419
        0.752733
        0.739824
        0.620836
        0.778447
        0.768192
        0.608311
        0.590319
        0.532110
        0.775017
        0.699353
        0.822284
        0.670277
        0.755848
        0.594908
      ...
    
    
      max
        2.419532
        1.778067
        1.784392
        2.633859
        2.296887
        2.895577
        2.707580
        2.930673
        2.897110
        2.673868
        2.959008
        1.735059
        2.724727
        3.047440
        2.756722
        2.298306
        2.473325
        2.663749
        2.632448
        2.483554
      ...
    
  

8 rows × 410 columns



In [7]:

    
# Summary statistics of the training labels (single column `Class`).
df_labels.describe()









    Out[7]:






  
    
      
      Class
    
  
  
    
      count
       86.000000
    
    
      mean
        0.465116
    
    
      std
        0.501707
    
    
      min
        0.000000
    
    
      25%
        0.000000
    
    
      50%
        0.000000
    
    
      75%
        1.000000
    
    
      max
        1.000000
    
  

8 rows × 1 columns

There are only 86 cases but 410 features, so dimensionality reduction may be needed to fit the data properly. As a first attempt, try a Naive Bayes classifier with a Gaussian distribution.



In [8]:

    
from sklearn.naive_bayes import GaussianNB



In [41]:

    
# Baseline model: fit Gaussian Naive Bayes on every training feature and
# count how many training labels it fails to reproduce.
# NOTE(review): X and y are paired by row position, which assumes df_FULL and
# df_labels list the same Ids in the same order -- confirm.
gnb = GaussianNB()
X = df_FULL.values            # .values replaces as_matrix(), which later pandas releases removed
y = df_labels.Class.values
gnb.fit(X,y)
y_pred = gnb.predict(X)
total_pred = len(y_pred)
# Compare against the labels that were just fitted; int() yields a plain
# Python integer for the '{:d}' format field.
error_score = int((y_pred != y).sum())
error_rate = error_score*1./total_pred
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(total_pred,error_score,error_rate))









    



Total predictions on training set: 86
Errors: 5
Error rate: 5.81%



In [11]:

    
# Note: reading the whole test set into memory at once raises a memory error,
# so the test files have to be read sequentially, predicting as we go, to get any answers.

# df_SBM_test = pd.read_csv('./Test/test_SBM.csv',index_col='Id')
# df_FNC_test = pd.read_csv('./Test/test_FNC.csv',index_col='Id')



In [ ]:

    
# Stream the two test files in lockstep and classify one row per line with
# the fitted Gaussian Naive Bayes model, collecting [Id, predicted label].
pred_NB = []
with open('./Test/test_SBM.csv','r') as SBM_test, open('./Test/test_FNC.csv','r') as FNC_test:
    # Pull one line from each file per iteration instead of zip(): on Python 2
    # zip() builds the full list of line pairs first, i.e. it loads both files
    # completely, which is the memory problem this loop exists to avoid.
    for sbm_line in SBM_test:
        fnc_line = FNC_test.readline()
        if not fnc_line:
            # The FNC file ended first: stop at the shorter file, as zip() would.
            break
        # The loop names deliberately avoid `x`/`y` so the training labels `y`
        # are not overwritten by CSV fields.
        sbm_fields = sbm_line.strip().split(',')
        fnc_fields = fnc_line.strip().split(',')
        if sbm_fields[0] != 'Id':  # skip the header row
            # Field 0 is the Id; the rest are features, SBM first then FNC.
            # NOTE(review): assumes the test files share the training column order -- confirm.
            # A list comprehension (not map) so the array is numeric on Python 3 too.
            features = np.array([float(value) for value in sbm_fields[1:]+fnc_fields[1:]])
            # predict() expects a 2-D (n_samples, n_features) array: one row here.
            pred = gnb.predict(features.reshape(1, -1))
            pred_NB.append([int(sbm_fields[0]),pred[0]])



In [ ]:

    
# Tabulate the Naive Bayes predictions indexed by Id, write them to CSV and
# display summary statistics of the predicted values.
# NOTE(review): the 'Probability' column holds the labels returned by
# predict(), not class probabilities.
df_pred = pd.DataFrame(data=pred_NB,columns=['Id','Probability']).set_index('Id')
df_pred.to_csv('GaussianNB_pred.csv')
df_pred.describe()



In [17]:

    
import sklearn.preprocessing as skpp



In [42]:

    
# Standardise the training matrix: learn each column's mean and variance,
# then rescale every feature to zero mean and unit variance. The fitted
# scaler is kept so the same transformation can be applied to other data.
scaler = skpp.StandardScaler()
scaler.fit(X)
Xt = scaler.transform(X)



In [43]:

    
from sklearn import svm



In [44]:

    
# Fit a support vector classifier (default settings) on the standardised
# training features and count how many training labels it fails to reproduce.
# The labels are taken straight from df_labels so this cell does not depend on
# whatever `y` currently holds in the kernel (.values replaces as_matrix(),
# which later pandas releases removed).
y = df_labels.Class.values
clf = svm.SVC()
clf.fit(Xt,y)
y_pred = clf.predict(Xt)
total_pred = len(y_pred)
# int() yields a plain Python integer for the '{:d}' format field.
error_score = int((y_pred != y).sum())
error_rate = error_score*1./total_pred
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(total_pred,error_score,error_rate))









    



Total predictions on training set: 86
Errors: 1
Error rate: 1.16%



In [47]:

    
# Stream the two test files in lockstep, standardise each row with the scaler
# fitted on the training data and classify it with the fitted SVM, collecting
# [Id, predicted label].
pred_SVM = []
with open('./Test/test_SBM.csv','r') as SBM_test, open('./Test/test_FNC.csv','r') as FNC_test:
    # Pull one line from each file per iteration instead of zip(): on Python 2
    # zip() builds the full list of line pairs first, i.e. it loads both files
    # completely, which is the memory problem this loop exists to avoid.
    for sbm_line in SBM_test:
        fnc_line = FNC_test.readline()
        if not fnc_line:
            # The FNC file ended first: stop at the shorter file, as zip() would.
            break
        # The loop names deliberately avoid `x`/`y` so the training labels `y`
        # are not overwritten by CSV fields.
        sbm_fields = sbm_line.strip().split(',')
        fnc_fields = fnc_line.strip().split(',')
        if sbm_fields[0] != 'Id':  # skip the header row
            # Field 0 is the Id; the rest are features, SBM first then FNC.
            # NOTE(review): assumes the test files share the training column order -- confirm.
            # A list comprehension (not map) so the array is numeric on Python 3 too.
            features = np.array([float(value) for value in sbm_fields[1:]+fnc_fields[1:]])
            # transform()/predict() expect a 2-D (n_samples, n_features) array: one row here.
            featuresT = scaler.transform(features.reshape(1, -1))
            pred = clf.predict(featuresT)
            pred_SVM.append([int(sbm_fields[0]),pred[0]])



In [48]:

    
# Tabulate the SVM predictions indexed by Id, write them to CSV and display
# summary statistics of the predicted values.
# NOTE(review): the 'Probability' column holds the labels returned by
# predict(), not class probabilities.
df_pred = pd.DataFrame(data=pred_SVM,columns=['Id','Probability']).set_index('Id')
df_pred.to_csv('SVM_pred.csv')
df_pred.describe()









    Out[48]:






  
    
      
      Probability
    
  
  
    
      count
       119748.000000
    
    
      mean
            0.372257
    
    
      std
            0.483408
    
    
      min
            0.000000
    
    
      25%
            0.000000
    
    
      50%
            0.000000
    
    
      75%
            1.000000
    
    
      max
            1.000000
    
  

8 rows × 1 columns

Note that the fits are overfitting the training set. Cross-validation will need to be implemented to ensure the training set is not overfit. See below for an example from the scikit-learn documentation using the iris data set.



In [ ]:

    
# Cross-validation example from the scikit-learn documentation (iris data).
# The interpreter prompts and the pasted output line are removed so the cell
# shows the shapes it actually computes; the documented output is kept as a comment.
import numpy as np
try:
    # Later scikit-learn releases provide these helpers in model_selection.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

iris = datasets.load_iris()
iris.data.shape, iris.target.shape   # documented output: ((150, 4), (150,))



In [ ]:

    
# Hold-out evaluation from the scikit-learn documentation: keep 40% of the
# iris samples for testing, fit a linear SVM on the rest and score it on the
# held-out part. Prompts are removed and the documented outputs are kept as
# comments, because lines such as `0.96...` are not valid Python.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape   # documented output: ((90, 4), (90,))
X_test.shape, y_test.shape     # documented output: ((60, 4), (60,))

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)      # documented output: 0.96...



In [ ]:

    
# 5-fold cross-validation of a linear SVM on the iris data (scikit-learn
# documentation example). Prompts are removed and the documented output is
# kept as a comment, because the abbreviated array is not valid Python.
clf = svm.SVC(kernel='linear', C=1)
scores = cross_validation.cross_val_score(
    clf, iris.data, iris.target, cv=5)
scores   # documented output: array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])



In [ ]:

    
# Same 5-fold cross-validation, scored with F1 instead of the estimator's
# default score (scikit-learn documentation example). Prompts are removed and
# the documented output is kept as a comment, because the abbreviated array is
# not valid Python.
# NOTE(review): iris has three classes; newer scikit-learn releases may need
# an averaged scorer such as 'f1_macro' or 'f1_weighted' here -- confirm
# against the installed version.
from sklearn import metrics
cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
    scoring='f1')
# documented output: array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.        ])

	SBM_map1	SBM_map2	SBM_map3	SBM_map4	SBM_map5	SBM_map6	SBM_map7	SBM_map8	SBM_map10	SBM_map13	SBM_map17	SBM_map22	SBM_map26	SBM_map28	SBM_map32	SBM_map36	SBM_map40	SBM_map43	SBM_map45	SBM_map48
count	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	86.000000	...
mean	0.134995	-0.094195	-0.076866	0.012471	0.073704	-0.004953	0.118802	0.106247	0.064476	-0.007778	0.038051	-0.000217	0.049556	-0.050626	0.112465	0.052462	0.111135	-0.066720	0.088153	0.068664	...
std	0.883114	0.937444	0.882526	0.937842	0.941510	0.994260	0.926693	0.964946	1.017846	1.081989	1.095996	0.924269	1.016228	1.029606	1.033777	0.941974	0.946056	1.020597	0.956373	1.041473	...
min	-1.945161	-2.183338	-2.241210	-2.005569	-2.503497	-3.099610	-1.964108	-2.305639	-2.880430	-2.542064	-2.266664	-2.498967	-2.031587	-2.830852	-2.018822	-2.298281	-2.481110	-2.392854	-2.327743	-3.060313	...
25%	-0.400377	-0.632557	-0.641487	-0.726047	-0.459417	-0.600037	-0.547326	-0.471092	-0.567420	-0.800934	-0.839474	-0.407716	-0.648872	-0.733467	-0.534045	-0.516783	-0.480631	-0.824895	-0.522789	-0.605645	...
50%	0.087173	-0.118148	-0.141126	-0.034350	0.128777	0.028739	0.107167	0.172475	0.178142	0.009001	0.069828	0.006513	-0.091811	-0.126834	0.125509	0.035049	0.055866	-0.068575	0.106404	0.061215	...
75%	0.730467	0.575077	0.653878	0.587966	0.573641	0.613419	0.752733	0.739824	0.620836	0.778447	0.768192	0.608311	0.590319	0.532110	0.775017	0.699353	0.822284	0.670277	0.755848	0.594908	...
max	2.419532	1.778067	1.784392	2.633859	2.296887	2.895577	2.707580	2.930673	2.897110	2.673868	2.959008	1.735059	2.724727	3.047440	2.756722	2.298306	2.473325	2.663749	2.632448	2.483554	...

	Probability
count	119748.000000
mean	0.372257
std	0.483408
min	0.000000
25%	0.000000
50%	0.000000
75%	1.000000
max	1.000000