In [1]:
import numpy as np
import h5py
import pickle

FEAT_TRAIN = '/media/raid_arr/data/ndsb/features/features_train.hdf5'
# FEAT_TRAIN = '~/data/ndsb/features/features_train.hdf5'
FEAT_TEST = '/media/raid_arr/data/ndsb/features/features_test.hdf5'

LAYER = 'fc2'
MODE = ['train', 'predict']

In [2]:
# f_train.close()
f_train = h5py.File(FEAT_TRAIN, 'r')
f_test = h5py.File(FEAT_TEST, 'r')
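
The commented-out f_train.close() above suggests this cell gets re-run; purely as a convenience sketch (nothing beyond the variable names comes from the original notebook), stale handles can be closed before reopening:

In [ ]:
# Sketch: close any handles left over from a previous run of this cell.
for _name in ('f_train', 'f_test'):
    _h = globals().get(_name)
    if isinstance(_h, h5py.File):
        try:
            _h.close()
        except Exception:
            pass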

In [15]:
f_test.keys()


Out[15]:
[u'fc1', u'fc2', u'im_paths', u'lbls']

In [3]:
train_feats_db = f_train[LAYER]
train_lbls_db = f_train['lbls']

test_feats_db = f_test[LAYER]
test_lbls_db = f_test['lbls']
test_paths_db = f_test['im_paths']
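
Before materialising anything, a quick sanity check can be run straight from the HDF5 handles; the sketch below assumes only the files and keys opened above.

In [ ]:
# Sketch: report dataset shapes/dtypes without reading the data into memory.
for name, f in [('train', f_train), ('test', f_test)]:
    print name, LAYER, 'shape:', f[LAYER].shape, 'dtype:', f[LAYER].dtype
    print name, 'lbls shape:', f['lbls'].shape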

In [4]:
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation

import time

In [5]:
# dataset[()] reads each full HDF5 dataset into an in-memory NumPy array
train_feats = train_feats_db[()]
test_feats = test_feats_db[()]
train_lbls = train_lbls_db[()]
test_lbls = test_lbls_db[()]

In [7]:
test_feats[:30000,:].shape


Out[7]:
(30000, 2048)

In [ ]:
if 'train' in MODE:
    # Candidate classifiers (rough CV scores noted where measured):
#     clf = svm.LinearSVC()
#     clf = svm.SVC(probability=True)  # 0.82
#     clf = GaussianNB()
#     clf = RandomForestClassifier(n_jobs=-1)
#     clf = GradientBoostingClassifier()
#     clf = ExtraTreesClassifier(n_jobs=-1)  # 0.78
#     clf = KNeighborsClassifier()  # 0.78
#     clf = QDA()  # 0.79
    clf = LabelSpreading()
    clf_name = str(clf).split('(')[0]

    tic = time.time()
#     scores = cross_validation.cross_val_score(
#         clf, train_feats, train_lbls,
#         cv=5, n_jobs=-1, scoring='log_loss')

    # 5-fold CV on the train features plus the first 10000 pseudo-labelled
    # test samples. Note: cross_val_score fits clones of clf, so clf itself
    # is still unfitted after this call.
    scores = cross_validation.cross_val_score(clf,
            np.r_[train_feats, test_feats[:10000, :]],
            np.r_[train_lbls, test_lbls[:10000]],
            scoring='log_loss', cv=5, n_jobs=-1)

#     clf.fit(train_feats, train_lbls)
#     clf.fit(np.r_[train_feats, test_feats[:30000, :]],
#             np.r_[train_lbls, test_lbls[:30000]])
#     pickle.dump(clf, open('/media/raid_arr/data/ndsb/classifiers/' + clf_name + '_fc2.p', 'wb'))

    print 'Done:', time.time() - tic, 'seconds'
    print scores
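
Because the 'log_loss' scorer is sign-flipped so that greater is better, the printed scores come out negative; a small summary sketch (assuming only the scores array from the cell above):

In [ ]:
# Sketch: summarise CV results; negate to report the usual log-loss.
print 'CV log-loss: %.4f +/- %.4f' % (-scores.mean(), scores.std())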

In [ ]:
print scores

In [8]:
if 'predict' in MODE:
    tic = time.time()
    # clf must already be fitted; cross_val_score alone only fits clones,
    # which is what triggers the unfitted-estimator traceback below.
#     pred = clf.predict_proba(test_feats_db)
    pred = clf.predict_proba(test_feats)
    print 'Prediction done in:', time.time() - tic
    pickle.dump(pred, open('/media/raid_arr/data/ndsb/svc_fc2_pred.p', 'wb'))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-8-78715e57bc0b> in <module>()
      2     tic = time.time()
      3 #     pred = clf.predict_proba(test_feats_db)
----> 4     pred = clf.predict_proba(test_feats)
      5     print 'Prediction done in:', time.time() - tic
      6     pickle.dump(pred, open('/media/raid_arr/data/ndsb/svc_fc2_pred.p', 'wb'))

/usr/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict_proba(self, X)
    466                 self.n_classes_,
    467                 self.n_outputs_)
--> 468             for i in range(n_jobs))
    469 
    470         # Reduce

/usr/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    650                 os.environ[JOBLIB_SPAWNED_PROCESS] = '1'
    651             self._iterating = True
--> 652             for function, args, kwargs in iterable:
    653                 self.dispatch(function, args, kwargs)
    654 

/usr/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in <genexpr>((i,))
    466                 self.n_classes_,
    467                 self.n_outputs_)
--> 468             for i in range(n_jobs))
    469 
    470         # Reduce

AttributeError: 'ExtraTreesClassifier' object has no attribute 'n_classes_'
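
The AttributeError comes from calling predict_proba on an unfitted estimator (here an ExtraTreesClassifier left over from an earlier run): cross_val_score fits clones, and the explicit clf.fit(...) lines in the training cell are commented out. A sketch of the missing step, reusing the 30000-sample pseudo-label cut-off from those commented lines:

In [ ]:
# Sketch: fit clf before predicting (mirrors the commented-out fit above).
clf.fit(np.r_[train_feats, test_feats[:30000, :]],
        np.r_[train_lbls, test_lbls[:30000]])
pred = clf.predict_proba(test_feats)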

In [ ]:
if 'predict' in MODE:
    # SUBMISSION CREATION
    # test_files_all, images, labels = zip(*data)
    import tools.submission as sub
    f_name = 'SUBMISSION_PL_deep_56000_' + clf_name + '_fc2.csv'
    sub.make_submission(test_paths_db, pred, f_name=f_name)

    print 'Submission created:', f_name
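
tools.submission is project code that is not shown in this notebook. Purely for illustration, a minimal stand-in along the lines of what make_submission presumably does is sketched below; the column layout (image filename plus one probability column per class) and the class_names argument are assumptions, not the project's actual format.

In [ ]:
# Hypothetical stand-in for tools.submission.make_submission (assumed format:
# header of class names, then one row per test image with its probabilities).
import csv, os

def make_submission_sketch(im_paths, probs, class_names, f_name):
    with open(f_name, 'wb') as f:  # 'wb' for the Python 2 csv module
        w = csv.writer(f)
        w.writerow(['image'] + list(class_names))
        for path, row in zip(im_paths, probs):
            w.writerow([os.path.basename(path)] + ['%.6f' % p for p in row])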