Run Random Forest after combining two feature sets
In [1]:
%matplotlib inline
from matplotlib import pylab as pl
import cPickle as pickle
import pandas as pd
import numpy as np
import os
import random
In [2]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably this is what lets `common.data` (imported in a later
# cell) resolve when the notebook is run from its own sub-directory - confirm.
sys.path.append('..')
Uncomment the relevant pipeline in ../seizure_detection.py and run:
cd ..
./doall data
or
./doall td
./doall tt
In [3]:
# Name of the feature set used throughout this notebook; read_data() embeds it
# in the name of every cache entry it loads.
# NOTE(review): presumably this has to match the pipeline enabled in
# ../seizure_detection.py when the cache was generated - confirm.
FEATURES = 'gen-8_medianwindow-bands2-usf-w60-b0.2-b4-b8-b12-b30-b70-0.1-0.5-0.9'
In [4]:
from common.data import CachedDataLoader
# Loader rooted at ../data-cache; read_data() calls its `load` method to fetch
# the stored feature data for each target.
cached_data_loader = CachedDataLoader('../data-cache')
In [5]:
def read_data(target, data_type, features):
    """Load one cached feature set through ``cached_data_loader``.

    The cache entry name is ``data_<data_type>_<target>_<features>``; it is
    printed before loading so the notebook output shows what was read.

    target    -- subject name, e.g. 'Dog_1'
    data_type -- kind of clips to load ('preictal', 'interictal' or 'test'
                 in this notebook)
    features  -- feature-set name (see FEATURES)

    Returns whatever ``cached_data_loader.load`` returns for that entry. The
    second argument passed to ``load`` is None.
    NOTE(review): presumably None means "do not compute on a cache miss" -
    confirm against CachedDataLoader.
    """
    cache_key = 'data_%s_%s_%s' % (data_type, target, features)
    print(cache_key)
    return cached_data_loader.load(cache_key, None)
In [6]:
# Load an earlier prediction file, indexed by its `clip` column. With
# squeeze=True pandas returns a Series when a single data column remains, which
# is what the per-clip lookups `best[<clip name>]` in the main loop rely on.
best = pd.read_csv('../submissions/141103-predict.1.csv', index_col='clip', squeeze=True)
In [7]:
# Histogram (50 bins) of the values loaded into `best`.
best.hist(bins=50)
Out[7]:
In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression as LR
# Random forest used for every target in the main loop (3000 trees, depth <= 10,
# no bootstrap resampling, all cores).
#
# min_samples_split is 2, the smallest valid integer: scikit-learn >= 0.18
# raises ValueError for 1. NOTE(review): older releases are understood to have
# treated 1 the same as 2, so results should be unchanged there - confirm on
# the pinned scikit-learn version.
#
# NOTE(review): no random_state is passed, so the fitted forests differ from
# run to run.
clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, max_depth=10,
                             n_jobs=-1)  # , max_features=15
In [11]:
# Open the output CSV for writing. The handle stays open across cells: the
# per-target loop appends its rows and the final cell closes it.
fpout = open('../submissions/141103-predict.12.csv','w')
# Header row (Python 2 print-to-file syntax).
print >>fpout,'clip,preictal'
In [13]:
import random

# Resampling schedule. The values are the ones previously hard-coded in the loop.
N_ROUNDS = 50                      # refit / predict rounds per target
TEST_FRACTION = 0.2                # share of test clips drawn into each round's training set
PROBA_MIN, PROBA_MAX = 0.05, 0.95  # bounds applied to the prior predictions before sampling labels

# NOTE(review): neither `random` nor `np.random` is seeded in the visible cells,
# so the written predictions differ from run to run.
for target in ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']:
    # Labelled data for this target: positives first, then negatives.
    pdata0 = read_data(target, 'preictal', FEATURES)  # positive examples
    ndata0 = read_data(target, 'interictal', FEATURES)  # negative examples
    X0 = np.concatenate((pdata0.X, ndata0.X))
    y0 = np.zeros(X0.shape[0])
    y0[:pdata0.X.shape[0]] = 1

    # Test clips plus the prior prediction for each one, looked up in `best`
    # by clip file name.
    # NOTE(review): assumes row i of tdata.X is test segment i+1 - confirm.
    tdata = read_data(target, 'test', FEATURES)  # test examples
    Xt = tdata.X
    Nt = Xt.shape[0]
    yt = np.array([best['%s_test_segment_%04d.mat' % (target, i+1)] for i in range(Nt)])
    yt = np.clip(yt, PROBA_MIN, PROBA_MAX)

    y_proba = np.zeros(Nt)        # running sum of predicted probabilities per test clip
    y_proba_count = np.zeros(Nt)  # number of rounds in which each clip was predicted
    Nts = int(TEST_FRACTION*Nt)
    for j in range(N_ROUNDS):
        # Draw Nts test indices independently, so duplicates are possible.
        s = [random.randrange(Nt) for i in range(Nts)]
        # Membership is tested against a set so building `snot` is linear in
        # Nt; the resulting list is the same as with a list lookup.
        s_set = set(s)
        snot = [i for i in range(Nt) if i not in s_set]

        # Sample a 0/1 label for every drawn clip, with P(label = 1) equal to
        # its clipped prior prediction.
        y1 = np.array(np.random.random(Nts) < yt[s], dtype=int).ravel()

        # Fit on the labelled data plus the sampled test clips, then score only
        # the test clips that were left out of this round.
        X = np.concatenate((X0, Xt[s, :]))
        y = np.concatenate((y0, y1))
        clf.fit(X, y)
        y_proba[snot] += clf.predict_proba(Xt[snot, :])[:, 1]
        y_proba_count[snot] += 1

    # Every test clip must have been left out at least once, otherwise the
    # average below would divide by zero for that clip.
    assert not np.any(y_proba_count == 0), \
        'some test clips of %s were never held out' % target
    y_proba /= y_proba_count

    # write results
    for i, p in enumerate(y_proba):
        print >>fpout, '%s_test_segment_%04d.mat,%.15f' % (target, i+1, p)
In [14]:
# Close the output CSV opened earlier so every written row reaches disk.
fpout.close()
In [ ]: