In [39]:
import numpy as np
import logging
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
try:
    import user_project_config as conf
except ImportError:
    import project_config as conf
from IO import data_loading as dl
from utils import logg
from utils import data_processing as dp
from models_utils import models_utils as mu
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
import xgboost as xgb #!!!
%matplotlib inline
In [40]:
import sklearn
sklearn.__version__
Out[40]:
In [41]:
#!!!
USED_EXAMPLES_NUMBER = None # None means all examples are used; otherwise that many examples are randomly selected
#!!!
OBJECTIVE_NAME = 'cl_sleep_interval' # e.g. 'BMIgr', 'Sex', 'cl_sleep_interval' #!!!!
sample_name = OBJECTIVE_NAME + '_3' # train-test filename
SEED = 0
classifiers = [
    ("XGBoost", xgb.XGBClassifier()),
    ("Dummy", DummyClassifier(strategy='stratified')),  # baseline, see http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
    # ("Linear SVM", SVC(kernel="linear", C=0.025)),
    # ("RBF SVM", SVC(gamma=2, C=1)),
    # ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    # ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB())
]
###############################################################
# Initial configuration
np.random.seed(SEED)
In [42]:
trainX, trainY, testX, testY, sample_info = dl.load_hdf5_sample(sample_name)
print(trainX.shape)
print(len(sample_info['Features names']))
print(sample_info['Features names'])
trainX[0]  # peek at the first training example
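In [ ]:
# Sketch (not part of the original pipeline, just an illustration): one way the
# USED_EXAMPLES_NUMBER option defined above could be applied, by randomly
# subsampling the training set after loading.
if USED_EXAMPLES_NUMBER is not None:
    idx = np.random.choice(trainX.shape[0], USED_EXAMPLES_NUMBER, replace=False)
    trainX, trainY = trainX[idx], trainY[idx]
print(trainX.shape)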
In [43]:
print(trainX.shape[0] + testX.shape[0])  # total number of examples
print(sample_info['Features names'][13])
# Count NaNs per feature across train and test; ind marks columns with no missing values
nans_counter = np.sum(np.isnan(trainX), axis=0) + np.sum(np.isnan(testX), axis=0)
ind = nans_counter == 0
print(np.array(sample_info['Features names'])[~ind])  # features that contain NaNs
nans_counter
Out[43]:
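In [ ]:
# Sketch: instead of dropping the NaN-containing columns via `ind`, the missing
# values could be mean-imputed. Assumes the older sklearn API this notebook
# already targets (newer versions use sklearn.impute.SimpleImputer instead).
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
trainX_imputed = imputer.fit_transform(trainX)
testX_imputed = imputer.transform(testX)
print(np.isnan(trainX_imputed).sum(), np.isnan(testX_imputed).sum())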
In [44]:
clf = xgb.XGBClassifier()
clf.fit(trainX, trainY)
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
def show_results(clf, testX, testY):
    predictions = clf.predict(testX)
    actuals = testY
    cm = confusion_matrix(actuals, predictions)
    print(cm)
    # Row-normalized confusion matrix: per-class recall
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)
    print('accuracy', accuracy_score(actuals, predictions))
show_results(clf, testX, testY)
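In [ ]:
# Sketch: compare all models from the `classifiers` list defined above on the
# NaN-free feature subset (most sklearn estimators, unlike XGBoost, cannot handle
# NaNs). A rough, untuned comparison reusing show_results.
for name, model in classifiers:
    print(name)
    model.fit(trainX[:, ind], trainY.ravel())
    show_results(model, testX[:, ind], testY)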
In [8]:
clf = RandomForestClassifier(n_estimators=100)  # alternative: xgb.XGBClassifier()
clf.fit(trainX[:, ind], trainY)
show_results(clf, testX[:, ind], testY)
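In [ ]:
# Sketch: inspect which of the NaN-free features drive the random forest above.
# Assumes clf is the RandomForestClassifier fitted in the previous cell.
names = np.array(sample_info['Features names'])[ind]
order = np.argsort(clf.feature_importances_)[::-1]
for i in order[:10]:
    print(names[i], clf.feature_importances_[i])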
In [ ]:
# Recursive Feature Elimination on the NaN-free feature subset
from sklearn.feature_selection import RFE
clf = RandomForestClassifier(n_estimators=100)  # alternative: xgb.XGBClassifier()
rfe = RFE(clf, n_features_to_select=5)
rfe = rfe.fit(trainX[:, ind], trainY)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
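In [ ]:
# Sketch: translate the boolean RFE mask back into human-readable feature names.
# Assumes rfe and ind come from the previous cells.
selected = np.array(sample_info['Features names'])[ind][rfe.support_]
print(selected)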
In [9]:
print(np.any(np.isnan(trainX[:, ~ind])))  # sanity check: the excluded columns do contain NaNs
trainX[:, ind].shape, trainY.shape
Out[9]:
In [15]:
from sklearn import feature_selection, cross_validation
from sklearn.pipeline import Pipeline
###############################################################################
X = trainX[:, ind]
y = trainY.ravel()
###############################################################################
# Combine a univariate (ANOVA F-test) feature-selection transform with a
# random forest into a single pipeline estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([('anova', transform), ('rf', RandomForestClassifier(n_estimators=100))])
###############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)
for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Cross-validation score at this feature percentile (n_jobs=1: single process)
    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())
plt.errorbar(percentiles, score_means, np.array(score_stds))
plt.title(
    'Performance of the ANOVA + random forest pipeline varying the percentile of features selected')
plt.xlabel('Percentile')
plt.ylabel('Prediction rate')
plt.axis('tight')
plt.show()
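In [ ]:
# Sketch: pick the percentile with the best mean cross-validation score from the
# curve above, rather than eyeballing the plot (the next cell hard-codes 40).
best_percentile = percentiles[int(np.argmax(score_means))]
print(best_percentile, max(score_means))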
In [16]:
clf.set_params(anova__percentile=40)
clf.fit(X, y)
Out[16]:
In [18]:
show_results(clf, testX[:, ind], testY)
In [29]:
from sklearn.feature_selection import SelectFromModel
clf = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestClassifier(), threshold="0.75*mean")),
    ('classification', RandomForestClassifier())
])
clf.fit(X, y)
Out[29]:
In [30]:
show_results(clf, testX[:, ind], testY)
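In [ ]:
# Sketch: list which features survived the "0.75*mean" importance threshold in the
# SelectFromModel step of the pipeline fitted above.
mask = clf.named_steps['feature_selection'].get_support()
print(np.array(sample_info['Features names'])[ind][mask])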
In [35]:
clf = xgb.XGBClassifier()
clf.fit(trainX, trainY)
y
In [37]:
print(np.mean(y))  # label mean, a rough class-balance check
trainX.shape
Out[37]:
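In [ ]:
# Sketch: full class distribution of the objective, as a complement to np.mean(y)
# above (useful context for the stratified Dummy baseline).
print(np.unique(y, return_counts=True))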
In [ ]: