In [1]:
import numpy as np
import h5py
import pickle
# Absolute paths to precomputed CNN feature stores (HDF5).
# NOTE(review): hardcoded machine-local paths — consider a configurable DATA_DIR.
FEAT_TRAIN = '/media/raid_arr/data/ndsb/features/features_train.hdf5'
# FEAT_TRAIN = '~/data/ndsb/features/features_train.hdf5'
FEAT_TEST = '/media/raid_arr/data/ndsb/features/features_test.hdf5'
# Network layer whose activations are used as the feature representation.
LAYER = 'fc2'
# Pipeline stages to run in this notebook session ('train' and/or 'predict').
MODE = ['train', 'predict']
In [2]:
# f_train.close()
# Open both feature stores read-only; handles stay open for the whole session.
f_train = h5py.File(FEAT_TRAIN, 'r')
f_test = h5py.File(FEAT_TEST, 'r')
In [15]:
f_test.keys()
Out[15]:
In [3]:
# Lazy h5py dataset handles — nothing is read into memory yet.
train_feats_db = f_train[LAYER]
train_lbls_db = f_train['lbls']
test_feats_db = f_test[LAYER]
test_lbls_db = f_test['lbls']
# Image paths for the test set; used later to build the submission CSV.
test_paths_db = f_test['im_paths']
In [4]:
# NOTE(review): these imports target an old scikit-learn (<0.18).
# sklearn.cross_validation, sklearn.lda and sklearn.qda were removed in later
# releases (replaced by model_selection and discriminant_analysis).
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation
import time
In [5]:
# Materialize the full datasets into numpy arrays ([()] reads an entire h5py
# dataset at once); sklearn is much faster on in-memory arrays than on handles.
train_feats = train_feats_db[()]
test_feats = test_feats_db[()]
train_lbls = train_lbls_db[()]
test_lbls = test_lbls_db[()]
In [7]:
test_feats[:30000,:].shape
Out[7]:
In [ ]:
if 'train' in MODE:
# clf = svm.LinearSVC()
# clf = svm.SVC(probability=True) #0.82
# clf = GaussianNB()
# clf = RandomForestClassifier(n_jobs=-1)
# clf = GradientBoostingClassifier()
# clf = ExtraTreesClassifier(n_jobs=-1) #0.78
# clf = KNeighborsClassifier() #0.78
# clf = QDA() # 0.79
clf = LabelSpreading()
clf_name = str(clf).split('(')[0]
tic = time.time()
# scores = cross_validation.cross_val_score(
# clf, train_feats, train_lbls_db,
# cv=5, n_jobs=-1, scoring='log_loss')
scores = cross_validation.cross_val_score(clf,
np.r_[train_feats, test_feats[:10000,:]],
np.r_[train_lbls, test_lbls[:10000]],
scoring='log_loss', cv=5, n_jobs=-1)
# clf.fit(train_feats_db, train_lbls_db)
# clf.fit(train_feats, train_lbls_db)
# clf.fit(np.r_[train_feats, test_feats[:30000,:]],
# np.r_[train_lbls, test_lbls[:30000]])
# pickle.dump(clf, open('/media/raid_arr/data/ndsb/classifiers/' + clf_name + '_fc2.p', 'wb'))
print 'Done:', time.time() - tic, 'seconds'
print scores
# scores
In [ ]:
print scores
In [8]:
if 'predict'in MODE:
tic = time.time()
# pred = clf.predict_proba(test_feats_db)
pred = clf.predict_proba(test_feats)
print 'Prediction done in:', time.time() - tic
pickle.dump(pred, open('/media/raid_arr/data/ndsb/svc_fc2_pred.p', 'wb'))
In [ ]:
if 'predict'in MODE:
# SUBMISSION CREATION
# test_files_all, images, labels = zip(*data)
import tools.submission as sub
f_name='SUBMISSION_PL_deep_56000_' + clf_name + '_fc2.csv'
sub.make_submission(test_paths_db, pred, f_name=f_name)
print 'Submission created:', f_name