In [16]:
import numpy as np
import pandas as pd
# Report the pandas release in use. `pd.__version__` is the supported attribute
# (the `pd.version` submodule is not present in current pandas releases), and
# the single-argument print(...) form emits the same text on Python 2 and 3.
print('Pandas version: {}'.format(pd.__version__))
In [2]:
# Load the three training tables (SBM features, FNC features, class labels),
# each indexed by its 'Id' column so they can later be aligned on Id.
df_SBM_train, df_FNC_train, df_labels = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        './Train/train_SBM.csv',
        './Train/train_FNC.csv',
        './Train/train_labels.csv',
    )
)
In [3]:
# Display per-column summary statistics for the table loaded from train_SBM.csv.
df_SBM_train.describe()
Out[3]:
In [4]:
# Display per-column summary statistics for the table loaded from train_FNC.csv.
df_FNC_train.describe()
Out[4]:
In [5]:
# Build one feature table: the SBM columns followed by the FNC columns.
# DataFrame.join matches rows on the index, which is 'Id' for both frames.
df_FULL = df_SBM_train.join(df_FNC_train)
In [6]:
# Display per-column summary statistics for the combined SBM + FNC feature table.
df_FULL.describe()
Out[6]:
In [7]:
# Display summary statistics for the training labels loaded from train_labels.csv.
df_labels.describe()
Out[7]:
There are only 86 cases but 410 features, so dimensionality reduction may be needed to fit the data properly. As a first attempt, try a Naive Bayes classifier that assumes Gaussian-distributed features.
In [8]:
from sklearn.naive_bayes import GaussianNB
In [41]:
# Baseline model: Gaussian Naive Bayes fitted on the complete training set.
gnb = GaussianNB()

# `.values` returns the underlying NumPy array on every pandas release;
# `as_matrix()` is deprecated and no longer exists in pandas >= 1.0.
X = df_FULL.values
y = df_labels.Class.values
gnb.fit(X, y)

# NOTE: this is the training (resubstitution) error -- the model is scored on
# the very rows it was fitted on, so it says nothing about generalisation.
y_pred = gnb.predict(X)
total_pred = len(y_pred)
error_score = int((y_pred != y).sum())
error_rate = float(error_score) / total_pred
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(
    total_pred, error_score, error_rate))
In [11]:
# Note: reading the whole test data set directly into memory causes a memory error,
# so the test files need to be read sequentially, predicting row by row, to get results.
# df_SBM_test = pd.read_csv('./Test/test_SBM.csv',index_col='Id')
# df_FNC_test = pd.read_csv('./Test/test_FNC.csv',index_col='Id')
In [ ]:
# Stream the two test files in lockstep and classify one row at a time with the
# fitted Gaussian NB model, collecting [Id, predicted class] pairs in pred_NB.
#
# The loop variables are deliberately not named x / y, so the module-level
# training arrays `X` / `y` are never shadowed by this cell.
pred_NB = []
with open('./Test/test_SBM.csv', 'r') as SBM_test, open('./Test/test_FNC.csv', 'r') as FNC_test:
    for sbm_line in SBM_test:
        # Fetch the matching FNC row explicitly instead of zip(): on Python 2
        # zip() would first read every line of both files into memory.
        fnc_line = FNC_test.readline()
        if not fnc_line:
            break  # FNC file exhausted -- stop at the shorter file, as zip() would
        sbm_fields = sbm_line.strip().split(',')
        fnc_fields = fnc_line.strip().split(',')
        if sbm_fields[0] == 'Id':
            continue  # header row
        # Drop the leading Id field of each file; SBM columns come first, then
        # FNC columns (the same order produced by df_SBM_train.join(df_FNC_train)).
        features = np.array(list(map(float, sbm_fields[1:] + fnc_fields[1:])))
        # predict() expects a 2-D (n_samples, n_features) array, so present the
        # single row as a 1 x n_features matrix.
        pred = gnb.predict(features.reshape(1, -1))
        pred_NB.append([int(sbm_fields[0]), pred[0]])
In [ ]:
# Turn the collected [Id, prediction] pairs into an Id-indexed frame, write it
# to GaussianNB_pred.csv and display summary statistics.
# The 'Probability' column holds the value returned by gnb.predict().
df_pred = pd.DataFrame(pred_NB, columns=['Id', 'Probability']).set_index('Id')
df_pred.to_csv('GaussianNB_pred.csv')
df_pred.describe()
In [17]:
import sklearn.preprocessing as skpp
In [42]:
# Standardise every feature column to zero mean and unit variance. The fitted
# `scaler` object is kept so the same transform can be applied to other rows.
scaler = skpp.StandardScaler()
Xt = scaler.fit_transform(X)
In [43]:
from sklearn import svm
In [44]:
# Support-vector classifier with scikit-learn's default settings, fitted on the
# standardised training features Xt.
#
# The labels are taken directly from df_labels so that this cell does not
# depend on the global `y`, a name that is easy to clobber elsewhere in the
# notebook. `.values` replaces `as_matrix()`, which is deprecated and no longer
# exists in pandas >= 1.0.
y_train = df_labels.Class.values
clf = svm.SVC()
clf.fit(Xt, y_train)

# NOTE: this is the training (resubstitution) error -- the model is scored on
# the very rows it was fitted on, so it says nothing about generalisation.
y_pred = clf.predict(Xt)
total_pred = len(y_pred)
error_score = int((y_pred != y_train).sum())
error_rate = float(error_score) / total_pred
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Total predictions on training set: {:d}\nErrors: {:d}\nError rate: {:.2%}'.format(
    total_pred, error_score, error_rate))
In [47]:
# Stream the two test files in lockstep, standardise each row with the fitted
# `scaler` and classify it with the fitted SVC, collecting [Id, predicted class]
# pairs in pred_SVM.
#
# The loop variables are deliberately not named x / y, so the module-level
# training arrays `X` / `y` are never shadowed by this cell.
pred_SVM = []
with open('./Test/test_SBM.csv', 'r') as SBM_test, open('./Test/test_FNC.csv', 'r') as FNC_test:
    for sbm_line in SBM_test:
        # Fetch the matching FNC row explicitly instead of zip(): on Python 2
        # zip() would first read every line of both files into memory.
        fnc_line = FNC_test.readline()
        if not fnc_line:
            break  # FNC file exhausted -- stop at the shorter file, as zip() would
        sbm_fields = sbm_line.strip().split(',')
        fnc_fields = fnc_line.strip().split(',')
        if sbm_fields[0] == 'Id':
            continue  # header row
        # Drop the leading Id field of each file; SBM columns come first, then
        # FNC columns (the same order produced by df_SBM_train.join(df_FNC_train)).
        features = np.array(list(map(float, sbm_fields[1:] + fnc_fields[1:])))
        # transform() and predict() expect 2-D (n_samples, n_features) input,
        # so present the single row as a 1 x n_features matrix.
        featuresT = scaler.transform(features.reshape(1, -1))
        pred = clf.predict(featuresT)
        pred_SVM.append([int(sbm_fields[0]), pred[0]])
In [48]:
# Turn the collected [Id, prediction] pairs into an Id-indexed frame, write it
# to SVM_pred.csv and display summary statistics.
# The 'Probability' column holds the value returned by clf.predict().
df_pred = pd.DataFrame(pred_SVM, columns=['Id', 'Probability']).set_index('Id')
df_pred.to_csv('SVM_pred.csv')
df_pred.describe()
Out[48]:
Note that the fits are overfitting the training set. Cross-validation will need to be implemented to make sure the training set is not overfit. See below for an example from the scikit-learn documentation using the iris data set.
In [ ]:
# Doctest-style transcript quoted from the scikit-learn documentation: lines
# starting with `>>>` / `...` are interpreter input and the remaining lines are
# the expected output. This part imports the modules used by the example, loads
# the iris data set and shows the shapes of its data and target arrays.
# NOTE(review): `sklearn.cross_validation` may not exist in newer scikit-learn
# releases (see `sklearn.model_selection`) -- confirm against the installed version.
>>> import numpy as np
>>> from sklearn import cross_validation
>>> from sklearn import datasets
>>> from sklearn import svm
>>> iris = datasets.load_iris()
>>> iris.data.shape, iris.target.shape
((150, 4), (150,))
In [ ]:
# Doctest-style transcript (`>>>` / `...` lines are input, other lines are the
# expected output, elided with `...`). Holds out 40% of the iris samples
# (test_size=0.4) with a fixed random_state, fits a linear-kernel SVC on the
# remaining rows and scores it on the held-out rows.
>>> X_train, X_test, y_train, y_test = cross_validation.train_test_split(
... iris.data, iris.target, test_size=0.4, random_state=0)
>>> X_train.shape, y_train.shape
((90, 4), (90,))
>>> X_test.shape, y_test.shape
((60, 4), (60,))
>>> clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
>>> clf.score(X_test, y_test)
0.96...
In [ ]:
# Doctest-style transcript (`>>>` / `...` lines are input, other lines are the
# expected output, elided with `...`). Calls cross_val_score with cv=5 on an
# unfitted linear-kernel SVC over the whole iris data set; the displayed array
# has one score per fold.
>>> clf = svm.SVC(kernel='linear', C=1)
>>> scores = cross_validation.cross_val_score(
... clf, iris.data, iris.target, cv=5)
...
>>> scores
array([ 1. ..., 0.96..., 0.9 ..., 0.96..., 1. ])
In [ ]:
# Doctest-style transcript (`>>>` / `...` lines are input, other lines are the
# expected output, elided with `...`). Repeats the cv=5 cross_val_score call
# with scoring='f1' instead of the estimator's default score. `metrics` is
# imported here but not referenced in the lines shown.
>>> from sklearn import metrics
>>> cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5,
... scoring='f1')
...
array([ 1. ..., 0.96..., 0.89..., 0.96..., 1. ])