In [39]:
import numpy as np
import logging
import matplotlib.pyplot as plt

import sys
sys.path.append('..')

try:
  import user_project_config as conf
except ImportError:
  import project_config as conf

from IO import data_loading as dl
from utils import logg 
from utils import data_processing as dp
from models_utils import models_utils as mu


from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier

import xgboost as xgb #!!!

%matplotlib inline

In [40]:
import sklearn
sklearn.__version__


Out[40]:
'0.17'

In [41]:
#!!!
USED_EXAMPLES_NUMBER = None # None means all examples are used; otherwise this many examples are drawn at random

#!!!
OBJECTIVE_NAME = 'cl_sleep_interval' # target variable, e.g. 'BMIgr', 'Sex', 'cl_sleep_interval'
sample_name = OBJECTIVE_NAME + '_3' # name of the train/test sample file to load
SEED = 0


classifiers = [
  ("XGBoost", xgb.XGBClassifier()),
  ("Dummy", DummyClassifier(strategy='stratified')), # baseline; see http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
  # ("Linear SVM", SVC(kernel="linear", C=0.025)),
  # ("RBF SVM", SVC(gamma=2, C=1)),
  # ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
  ("Random Forest", RandomForestClassifier(n_estimators=100)),
  ("Nearest Neighbors", KNeighborsClassifier(3)),
  # ("AdaBoost", AdaBoostClassifier()),
  ("Naive Bayes", GaussianNB())
  ]

###############################################################
# Initial configuration
np.random.seed(SEED)

In [42]:
trainX, trainY, testX, testY, sample_info = dl.load_hdf5_sample(sample_name)

print trainX.shape
print len(sample_info['Features names'])
trainX[0] # note: not displayed, since only a cell's last expression is echoed
print sample_info['Features names']


(327834, 38)
38
['maxNN', 'minNN', 'meanNN', 'medianNN', 'SDNN', 'SDA60000NN', 'SD60000NNind', 'NN20', 'pNN20', 'NN50', 'pNN50', 'RMSSD', 'MeanHR', 'sd600000HR', 'HRVti', 'TINN', 'outlier', 'meanSD', 'stdSD', 'aVLF', 'aLF', 'aHF', 'peakVLF', 'peakLF', 'peakHF', 'aTotal', 'pVLF', 'pLF', 'pHF', 'nLF', 'nHF', 'LF/HF', 'poincSD1', 'poincSD2', 'sampen', 'alpha0', 'alpha1', 'alpha2']

In [43]:
trainX.shape[0] + testX.shape[0] # total number of examples (not displayed: only a cell's last expression is echoed)

#print np.any(np.isnan(trainX[:, 13])==False)
sample_info['Features names'][13] # 'sd600000HR', the feature with the most NaNs

# count NaNs per feature across train and test, and build a mask of NaN-free columns
nans_counter = np.sum(np.isnan(trainX[:, :]), axis=0) + np.sum(np.isnan(testX[:, :]), axis=0)
ind = nans_counter == 0 # True for columns that contain no NaN

#print np.sum(np.isnan(testX[:, :]), axis=0)
np.array(sample_info['Features names'])[~ind] # names of the columns that do contain NaNs

nans_counter


Out[43]:
array([     0,      0,      0,      0,      0,    664,    664,      0,
            0,      0,      0,      0,      0, 467395,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0, 110516,      0,      0,      0])
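
The four NaN-containing columns are simply dropped in the following cells via the `ind` mask. An alternative, sketched below but not used in this notebook, would be to impute the missing values instead (the median strategy is an assumption, not something the original analysis did):

from sklearn.preprocessing import Imputer

# Fit a median imputer on the training split only and apply it to both splits,
# so that no statistics leak from the test set.
imputer = Imputer(strategy='median')
trainX_imp = imputer.fit_transform(trainX)
testX_imp = imputer.transform(testX)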

In [ ]:


In [44]:
clf = xgb.XGBClassifier()
clf.fit(trainX, trainY)

from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score

def show_results(clf, testX, testY):
    """Print the raw and row-normalized confusion matrices and the accuracy of a fitted classifier."""
    predictions = clf.predict(testX)
    actuals = testY
    cm = confusion_matrix(actuals, predictions)
    print(cm)

    # normalize each row, so entries are per-class fractions of the predictions
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)
    print 'accuracy', accuracy_score(actuals, predictions)


show_results(clf, testX, testY)


/home/iv/anaconda2/lib/python2.7/site-packages/sklearn/preprocessing/label.py:108: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-44-e72151373848> in <module>()
      1 clf = xgb.XGBClassifier()
----> 2 clf.fit(trainX, trainY)
      3 
      4 from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
      5 

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/sklearn.pyc in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)
    341                               early_stopping_rounds=early_stopping_rounds,
    342                               evals_result=evals_result, feval=feval,
--> 343                               verbose_eval=verbose)
    344 
    345         if evals_result:

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model)
    119     if not early_stopping_rounds:
    120         for i in range(num_boost_round):
--> 121             bst.update(dtrain, i, obj)
    122             nboost += 1
    123             if len(evals) != 0:

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    692 
    693         if fobj is None:
--> 694             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    695         else:
    696             pred = self.predict(dtrain)

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/core.pyc in _check_call(ret)
     95     """
     96     if ret != 0:
---> 97         raise XGBoostError(_LIB.XGBGetLastError())
     98 
     99 

XGBoostError: base_score must be in (0,1) for logistic loss
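
The root cause of this base_score error is not tracked down here. A hedged guess is that the column-vector labels and/or the NaN-containing columns trip up this xgboost version; a minimal retry under those assumptions (flattened labels, NaN-free columns, and base_score set explicitly to its documented default of 0.5) would be:

from sklearn.metrics import accuracy_score

clf_xgb = xgb.XGBClassifier(base_score=0.5)  # 0.5 is the documented default, set explicitly here
clf_xgb.fit(trainX[:, ind], trainY.ravel())  # flattened labels, NaN-free columns only
print(accuracy_score(testY.ravel(), clf_xgb.predict(testX[:, ind])))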

In [8]:
clf = RandomForestClassifier(n_estimators=100) # xgb.XGBClassifier()
clf.fit(trainX[:, ind], trainY) # NaN-free columns only; trainY is a column vector, hence the DataConversionWarning below
show_results(clf, testX[:, ind], testY)


[[79372 13096]
 [14712 32381]]
[[ 0.85837263  0.14162737]
 [ 0.31240312  0.68759688]]
accuracy 0.800746626923
/home/iv/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  from ipykernel import kernelapp as app
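
The `classifiers` list defined at the top is never actually iterated in this notebook. A minimal benchmarking loop over it, reusing `show_results` and the NaN-free columns, might look like the sketch below (the try/except is there only because the XGBoost entry fails on this data, as seen above):

for name, model in classifiers:
    try:
        model.fit(trainX[:, ind], trainY.ravel())
        print(name)
        show_results(model, testX[:, ind], testY)
    except Exception as e:
        # e.g. the XGBoost base_score error observed above
        print(name + ' failed: ' + str(e))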

In [ ]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE


clf = RandomForestClassifier(n_estimators=100) # xgb.XGBClassifier()

# keep the 5 most important features, eliminating the rest recursively
rfe = RFE(clf, 5)
rfe = rfe.fit(trainX[:, ind], trainY.ravel())
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
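
If the RFE cell above is run, the selected features can be mapped back to their names through the same `ind` mask (a small sketch, assuming `rfe` has been fitted):

feature_names = np.array(sample_info['Features names'])[ind]  # names of the NaN-free columns
print(feature_names[rfe.support_])  # the 5 features kept by RFE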

In [ ]:


In [9]:
np.any(np.isnan(trainX[:, ~ind])) # not displayed: only the last expression is echoed
ind
trainX[:, ind].shape, trainY[ind].shape # note: indexing trainY with the 38-long feature mask is a mistake, hence the warning below and the (34, 1) shape


/home/iv/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 327834 but corresponding boolean dimension is 38
  app.launch_new_instance()
Out[9]:
((327834, 34), (34, 1))
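
For reference, the shape check the cell above presumably intended (`ind` masks features, not rows, so it should not be applied to `trainY`) is:

print(trainX[:, ind].shape)  # expected: (327834, 34)
print(trainY.ravel().shape)  # expected: (327834,)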

In [15]:
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_selection, cross_validation
from sklearn.pipeline import Pipeline

###############################################################################
X = trainX[:, ind]
y = trainY.ravel()

###############################################################################
# Create a feature-selection transform and a classifier that we
# combine together to have a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

# the step is still named 'svc' (this cell is adapted from the scikit-learn
# SVM-Anova example), but the classifier used here is a random forest
clf = Pipeline([('anova', transform), ('svc', RandomForestClassifier(n_estimators=100))])

###############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Compute the cross-validation score (n_jobs=1: a single CPU is used)
    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

plt.errorbar(percentiles, score_means, np.array(score_stds))

plt.title(
    'Performance of ANOVA + random forest varying the percentile of features selected')
plt.xlabel('Percentile')
plt.ylabel('Prediction rate')

plt.axis('tight')
plt.show()


Automatically created module for IPython interactive environment
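
The next cell fixes the percentile at 40, presumably read off the error-bar plot. A small sketch for picking it programmatically from the cross-validation scores computed above:

best_percentile = percentiles[int(np.argmax(score_means))]
print(best_percentile)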

In [16]:
clf.set_params(anova__percentile=40)
clf.fit(X, y)


Out[16]:
Pipeline(steps=[('anova', SelectPercentile(percentile=40,
         score_func=<function f_classif at 0x7f5d60508f50>)), ('svc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [18]:
show_results(clf, testX[:, ind], testY)


[[79144 13324]
 [14967 32126]]
[[ 0.85590691  0.14409309]
 [ 0.31781793  0.68218207]]
accuracy 0.797285774679

In [29]:
from sklearn.feature_selection import SelectFromModel
# keep features whose random-forest importance is at least 0.75 times the mean importance,
# then fit a second random forest on the selected features
clf = Pipeline([
  ('feature_selection', SelectFromModel(RandomForestClassifier(), threshold="0.75*mean")),
  ('classification', RandomForestClassifier())
])
clf.fit(X, y)


Out[29]:
Pipeline(steps=[('feature_selection', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_es...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
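
To see which features the importance threshold actually kept, the selector step can be queried after fitting (a sketch using the standard selector API):

mask = clf.named_steps['feature_selection'].get_support()  # boolean mask over the 34 NaN-free columns
print(np.array(sample_info['Features names'])[ind][mask])  # names of the selected features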

In [30]:
show_results(clf, testX[:, ind], testY)


[[80478 11990]
 [17860 29233]]
[[ 0.87033352  0.12966648]
 [ 0.37924957  0.62075043]]
accuracy 0.786115032136

In [35]:
# retry XGBoost on the full feature matrix -- it fails again with the same base_score error
clf = xgb.XGBClassifier()
clf.fit(trainX, trainY)
y


/home/iv/anaconda2/lib/python2.7/site-packages/sklearn/preprocessing/label.py:108: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-35-485124996f27> in <module>()
      1 clf = xgb.XGBClassifier()
----> 2 clf.fit(trainX, trainY)
      3 y

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/sklearn.pyc in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)
    341                               early_stopping_rounds=early_stopping_rounds,
    342                               evals_result=evals_result, feval=feval,
--> 343                               verbose_eval=verbose)
    344 
    345         if evals_result:

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model)
    119     if not early_stopping_rounds:
    120         for i in range(num_boost_round):
--> 121             bst.update(dtrain, i, obj)
    122             nboost += 1
    123             if len(evals) != 0:

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    692 
    693         if fobj is None:
--> 694             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    695         else:
    696             pred = self.predict(dtrain)

/home/iv/anaconda2/lib/python2.7/site-packages/xgboost/core.pyc in _check_call(ret)
     95     """
     96     if ret != 0:
---> 97         raise XGBoostError(_LIB.XGBGetLastError())
     98 
     99 

XGBoostError: base_score must be in (0,1) for logistic loss

In [37]:
np.mean(y) # not displayed: only the last expression is echoed
trainX.shape


Out[37]:
(327834, 38)
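
Since the XGBoost error persists, a natural next diagnostic (sketched here, not run in this notebook) is to inspect the label values and the class balance before retrying:

print(np.unique(trainY))        # distinct label values, to rule out a label-encoding issue
print(np.mean(trainY.ravel()))  # class balance: fraction of positive examples if labels are 0/1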

In [ ]: