In [16]:
import numpy as np
import pandas as pd
print 'Pandas version:', pd.__version__


Pandas version: 0.13.1

In [2]:
df_SBM_train = pd.read_csv('./Train/train_SBM.csv',index_col='Id')
df_FNC_train = pd.read_csv('./Train/train_FNC.csv',index_col='Id')
df_labels = pd.read_csv('./Train/train_labels.csv',index_col='Id')

In [3]:
df_SBM_train.describe()


Out[3]:
SBM_map1 SBM_map2 SBM_map3 SBM_map4 SBM_map5 SBM_map6 SBM_map7 SBM_map8 SBM_map10 SBM_map13 SBM_map17 SBM_map22 SBM_map26 SBM_map28 SBM_map32 SBM_map36 SBM_map40 SBM_map43 SBM_map45 SBM_map48
count 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 ...
mean 0.134995 -0.094195 -0.076866 0.012471 0.073704 -0.004953 0.118802 0.106247 0.064476 -0.007778 0.038051 -0.000217 0.049556 -0.050626 0.112465 0.052462 0.111135 -0.066720 0.088153 0.068664 ...
std 0.883114 0.937444 0.882526 0.937842 0.941510 0.994260 0.926693 0.964946 1.017846 1.081989 1.095996 0.924269 1.016228 1.029606 1.033777 0.941974 0.946056 1.020597 0.956373 1.041473 ...
min -1.945161 -2.183338 -2.241210 -2.005569 -2.503497 -3.099610 -1.964108 -2.305639 -2.880430 -2.542064 -2.266664 -2.498967 -2.031587 -2.830852 -2.018822 -2.298281 -2.481110 -2.392854 -2.327743 -3.060313 ...
25% -0.400377 -0.632557 -0.641487 -0.726047 -0.459417 -0.600037 -0.547326 -0.471092 -0.567420 -0.800934 -0.839474 -0.407716 -0.648872 -0.733467 -0.534045 -0.516783 -0.480631 -0.824895 -0.522789 -0.605645 ...
50% 0.087173 -0.118148 -0.141126 -0.034350 0.128777 0.028739 0.107167 0.172475 0.178142 0.009001 0.069828 0.006513 -0.091811 -0.126834 0.125509 0.035049 0.055866 -0.068575 0.106404 0.061215 ...
75% 0.730467 0.575077 0.653878 0.587966 0.573641 0.613419 0.752733 0.739824 0.620836 0.778447 0.768192 0.608311 0.590319 0.532110 0.775017 0.699353 0.822284 0.670277 0.755848 0.594908 ...
max 2.419532 1.778067 1.784392 2.633859 2.296887 2.895577 2.707580 2.930673 2.897110 2.673868 2.959008 1.735059 2.724727 3.047440 2.756722 2.298306 2.473325 2.663749 2.632448 2.483554 ...

8 rows × 32 columns


In [4]:
df_FNC_train.describe()


Out[4]:
FNC1 FNC2 FNC3 FNC4 FNC5 FNC6 FNC7 FNC8 FNC9 FNC10 FNC11 FNC12 FNC13 FNC14 FNC15 FNC16 FNC17 FNC18 FNC19 FNC20
count 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 ...
mean 0.211394 0.049030 -0.091031 -0.102662 0.158572 0.042185 -0.103091 0.024820 0.129220 0.120118 -0.026984 0.118425 -0.057568 -0.018594 0.012992 0.198765 0.134094 0.116469 -0.048463 0.074565 ...
std 0.258132 0.270017 0.324109 0.320256 0.279851 0.289825 0.279123 0.249710 0.256124 0.230091 0.320044 0.257747 0.293518 0.222936 0.243644 0.252242 0.292570 0.261642 0.270248 0.240503 ...
min -0.391350 -0.673890 -0.734010 -0.775180 -0.510520 -0.582640 -0.637690 -0.527620 -0.460770 -0.434500 -0.722000 -0.349420 -0.597630 -0.521770 -0.479970 -0.433540 -0.568170 -0.455010 -0.611690 -0.584940 ...
25% 0.061147 -0.129993 -0.344335 -0.351830 -0.049274 -0.162628 -0.312198 -0.115870 -0.047829 -0.034778 -0.237752 -0.078697 -0.302097 -0.201118 -0.138442 0.041785 -0.095365 -0.034860 -0.241715 -0.081967 ...
50% 0.238885 0.100597 -0.088119 -0.082341 0.194575 0.052100 -0.096169 0.024141 0.155995 0.107495 -0.006029 0.110625 -0.061210 -0.022768 0.003046 0.190660 0.137335 0.115265 -0.009641 0.060237 ...
75% 0.401767 0.256295 0.107702 0.135475 0.377210 0.231305 0.083581 0.216883 0.300102 0.265210 0.156185 0.335000 0.126425 0.157753 0.141903 0.388057 0.375603 0.309670 0.126193 0.240020 ...
max 0.820240 0.766600 0.601020 0.731570 0.763950 0.607150 0.612370 0.610530 0.729200 0.629010 0.655700 0.853550 0.584890 0.393810 0.602690 0.735400 0.707900 0.730410 0.531700 0.649770 ...

8 rows × 378 columns


In [5]:
df_FULL = df_SBM_train.join(df_FNC_train)

In [6]:
df_FULL.describe()


Out[6]:
SBM_map1 SBM_map2 SBM_map3 SBM_map4 SBM_map5 SBM_map6 SBM_map7 SBM_map8 SBM_map10 SBM_map13 SBM_map17 SBM_map22 SBM_map26 SBM_map28 SBM_map32 SBM_map36 SBM_map40 SBM_map43 SBM_map45 SBM_map48
count 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 86.000000 ...
mean 0.134995 -0.094195 -0.076866 0.012471 0.073704 -0.004953 0.118802 0.106247 0.064476 -0.007778 0.038051 -0.000217 0.049556 -0.050626 0.112465 0.052462 0.111135 -0.066720 0.088153 0.068664 ...
std 0.883114 0.937444 0.882526 0.937842 0.941510 0.994260 0.926693 0.964946 1.017846 1.081989 1.095996 0.924269 1.016228 1.029606 1.033777 0.941974 0.946056 1.020597 0.956373 1.041473 ...
min -1.945161 -2.183338 -2.241210 -2.005569 -2.503497 -3.099610 -1.964108 -2.305639 -2.880430 -2.542064 -2.266664 -2.498967 -2.031587 -2.830852 -2.018822 -2.298281 -2.481110 -2.392854 -2.327743 -3.060313 ...
25% -0.400377 -0.632557 -0.641487 -0.726047 -0.459417 -0.600037 -0.547326 -0.471092 -0.567420 -0.800934 -0.839474 -0.407716 -0.648872 -0.733467 -0.534045 -0.516783 -0.480631 -0.824895 -0.522789 -0.605645 ...
50% 0.087173 -0.118148 -0.141126 -0.034350 0.128777 0.028739 0.107167 0.172475 0.178142 0.009001 0.069828 0.006513 -0.091811 -0.126834 0.125509 0.035049 0.055866 -0.068575 0.106404 0.061215 ...
75% 0.730467 0.575077 0.653878 0.587966 0.573641 0.613419 0.752733 0.739824 0.620836 0.778447 0.768192 0.608311 0.590319 0.532110 0.775017 0.699353 0.822284 0.670277 0.755848 0.594908 ...
max 2.419532 1.778067 1.784392 2.633859 2.296887 2.895577 2.707580 2.930673 2.897110 2.673868 2.959008 1.735059 2.724727 3.047440 2.756722 2.298306 2.473325 2.663749 2.632448 2.483554 ...

8 rows × 410 columns


In [7]:
df_labels.describe()


Out[7]:
Class
count 86.000000
mean 0.465116
std 0.501707
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000

8 rows × 1 columns

Only 86 cases but 410 features, so dimensionality reduction may be needed to fit the data properly (the classes are roughly balanced: 46.5% positive). As a first attempt, fit a naive Bayes classifier with Gaussian likelihoods.
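If dimensionality reduction does turn out to be necessary, PCA is a natural first choice. A minimal sketch using scikit-learn's decomposition module; the 20-component count is an arbitrary choice for illustration, not something tuned here:

# Sketch: project the 410 features onto the top principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=20)
X_reduced = pca.fit_transform(df_FULL.values)
print 'Explained variance retained: {:.2%}'.format(pca.explained_variance_ratio_.sum())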


In [8]:
from sklearn.naive_bayes import GaussianNB

In [41]:
gnb = GaussianNB()
X = df_FULL.values
y = df_labels.Class.values
gnb.fit(X, y)
y_pred = gnb.predict(X)
total_pred = len(y_pred)
error_score = (y_pred != y).sum()
error_rate = error_score * 1. / total_pred
print 'Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(total_pred,error_score,error_rate)


Total predictions on training set: 86
Errors: 5
Error rate: 5.81%

In [11]:
# Note: reading the full test data into memory at once raises a MemoryError,
# so the files are read line by line and predicted sequentially instead.

# df_SBM_test = pd.read_csv('./Test/test_SBM.csv',index_col='Id')
# df_FNC_test = pd.read_csv('./Test/test_FNC.csv',index_col='Id')

In [ ]:
pred_NB = []
with open('./Test/test_SBM.csv','r') as SBM_test, open('./Test/test_FNC.csv','r') as FNC_test:
    # Read both feature files in lockstep, one line (case) at a time.
    # The loop variables avoid reusing y, which holds the training labels above.
    for sbm_line, fnc_line in zip(SBM_test, FNC_test):
        sbm = sbm_line.strip().split(',')
        fnc = fnc_line.strip().split(',')
        if sbm[0] != 'Id':  # skip the header row
            features = np.array(map(float, sbm[1:] + fnc[1:])).reshape(1, -1)
            pred = gnb.predict(features)
            pred_NB.append([int(sbm[0]), pred[0]])
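An alternative to the hand-rolled line parser is pandas' own chunked reader. A sketch assuming the same file layout; the 1000-row chunk size is arbitrary:

# Sketch: stream the test files in fixed-size chunks via read_csv(chunksize=...)
# and predict per chunk, keeping memory use bounded.
sbm_iter = pd.read_csv('./Test/test_SBM.csv', index_col='Id', chunksize=1000)
fnc_iter = pd.read_csv('./Test/test_FNC.csv', index_col='Id', chunksize=1000)
chunks = []
for sbm_chunk, fnc_chunk in zip(sbm_iter, fnc_iter):
    chunk = sbm_chunk.join(fnc_chunk)  # align on Id, as with the training data
    preds = gnb.predict(chunk.values)  # hard 0/1 labels; predict_proba would give probabilities
    chunks.append(pd.DataFrame({'Probability': preds}, index=chunk.index))
df_pred_alt = pd.concat(chunks)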

In [ ]:
df_pred = pd.DataFrame(data=pred_NB,columns=['Id','Probability'])
df_pred = df_pred.set_index('Id')
df_pred.to_csv('GaussianNB_pred.csv')
df_pred.describe()

In [17]:
import sklearn.preprocessing as skpp

In [42]:
# Scale the data to zero mean and unit variance
scaler = skpp.StandardScaler().fit(X)
Xt = scaler.transform(X)
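A quick sanity check that the transform behaved as expected; exact zeros are not expected due to floating-point rounding:

# Column-wise means should be ~0 and standard deviations ~1 after scaling.
print Xt.mean(axis=0).max(), Xt.std(axis=0).min()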

In [43]:
from sklearn import svm

In [44]:
clf = svm.SVC()
clf.fit(Xt,y)
y_pred = clf.predict(Xt)
total_pred = len(y_pred)
error_score = (y_pred != y).sum()
error_rate = error_score*1./total_pred
print 'Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(total_pred,error_score,error_rate)


Total predictions on training set: 86
Errors: 1
Error rate: 1.16%

In [47]:
pred_SVM = []
with open('./Test/test_SBM.csv','r') as SBM_test, open('./Test/test_FNC.csv','r') as FNC_test:
    # Same streaming approach as for the naive Bayes predictions above.
    for sbm_line, fnc_line in zip(SBM_test, FNC_test):
        sbm = sbm_line.strip().split(',')
        fnc = fnc_line.strip().split(',')
        if sbm[0] != 'Id':  # skip the header row
            features = np.array(map(float, sbm[1:] + fnc[1:])).reshape(1, -1)
            featuresT = scaler.transform(features)  # apply the training-set scaling
            pred = clf.predict(featuresT)
            pred_SVM.append([int(sbm[0]), pred[0]])

In [48]:
df_pred = pd.DataFrame(data=pred_SVM,columns=['Id','Probability'])
df_pred = df_pred.set_index('Id')
df_pred.to_csv('SVM_pred.csv')
df_pred.describe()


Out[48]:
Probability
count 119748.000000
mean 0.372257
std 0.483408
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000

8 rows × 1 columns
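The submission column is named Probability, but the values written above are hard 0/1 labels. If probability estimates are wanted instead, SVC supports them via Platt scaling. A minimal sketch; probability=True adds an internal cross-validated calibration fit, so training is slower:

# Sketch: refit with probability estimates enabled and read off P(Class == 1).
clf_prob = svm.SVC(probability=True)
clf_prob.fit(Xt, y)
proba = clf_prob.predict_proba(Xt)[:, 1]  # column order follows clf_prob.classes_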

Note that these fits are overfitting the training set; training error is not a reliable estimate of generalization. Cross-validation will need to be implemented to get an honest performance estimate. See below for an example from the scikit-learn documentation on the iris data set.


In [ ]:
>>> import numpy as np
>>> from sklearn import cross_validation
>>> from sklearn import datasets
>>> from sklearn import svm

>>> iris = datasets.load_iris()
>>> iris.data.shape, iris.target.shape
((150, 4), (150,))

In [ ]:
>>> X_train, X_test, y_train, y_test = cross_validation.train_test_split(
...     iris.data, iris.target, test_size=0.4, random_state=0)

>>> X_train.shape, y_train.shape
((90, 4), (90,))
>>> X_test.shape, y_test.shape
((60, 4), (60,))

>>> clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
>>> clf.score(X_test, y_test)                           
0.96...

In [ ]:
>>> clf = svm.SVC(kernel='linear', C=1)
>>> scores = cross_validation.cross_val_score(
...    clf, iris.data, iris.target, cv=5)
...
>>> scores                                            
array([ 1.  ...,  0.96...,  0.9 ...,  0.96...,  1.        ])

In [ ]:
>>> from sklearn import metrics
>>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
...     scoring='f1')
...                                                     
array([ 1.  ...,  0.96...,  0.89...,  0.96...,  1.        ])
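Applying the same pattern to this data set is straightforward. A sketch using the era-appropriate cross_validation module; note that the scaler above was fit on all of Xt, so a small amount of information leaks across folds, which a Pipeline would avoid:

# Sketch: 5-fold cross-validated accuracy for the SVM on the scaled features.
# With an integer cv and a classifier, cross_val_score uses stratified folds.
from sklearn import cross_validation

scores = cross_validation.cross_val_score(svm.SVC(), Xt, y, cv=5)
print 'CV accuracy: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std())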