In [5]:
import numpy as np
import logging
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
# Prefer a user-local configuration override; fall back to the repo default.
try:
    import user_project_config as conf
# Was a bare `except:`, which silently masked *any* error raised while
# importing user_project_config (syntax errors, bad paths, ...). Catch only
# the "module not present" case so real failures surface.
except ImportError:
    import project_config as conf
from IO import data_loading as dl
from utils import logg
from utils import data_processing as dp
from models_utils import models_utils as mu
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
import xgboost as xgb #!!!
In [48]:
# Record the scikit-learn version for reproducibility — the API changed
# between releases (e.g. `cross_validation`, used below, was later renamed).
import sklearn
sklearn.__version__
Out[48]:
In [7]:
# --- Experiment configuration -------------------------------------------
# Subsample size: None means all examples are used; otherwise a random subset.
USED_EXAMPLES_NUMBER = None # 'None' means that all examples are used; otherwise randomly selected
# Target variable to classify; also determines the train/test sample file name.
OBJECTIVE_NAME = 'cl_sleep_interval' # e.g. 'BMIgr', 'Sex', 'cl_sleep_interval'
sample_name = OBJECTIVE_NAME + '_3' # train-test filename
SEED = 0  # global RNG seed for reproducibility
# (name, estimator) pairs to evaluate; commented entries were tried and disabled.
classifiers = [
    ("XGBoost", xgb.XGBClassifier()),
    # Stratified dummy = chance-level baseline; see
    # http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
    ("Dummy", DummyClassifier(strategy='stratified')),
    # ("Linear SVM", SVC(kernel="linear", C=0.025)),
    # ("RBF SVM", SVC(gamma=2, C=1)),
    # ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("Nearest Neighbors", KNeighborsClassifier(3)),  # k = 3 neighbors
    # ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB())
] # TODO: xgboost
###############################################################
# Initial configuration
# Seed the global numpy RNG (sklearn estimators with random_state=None draw from it).
np.random.seed(SEED)
In [99]:
trainX, trainY, testX, testY, sample_info = dl.load_hdf5_sample(sample_name)
print trainX.shape
print len(sample_info['Features names'])
trainX[0]
print sample_info#['Features names']
In [100]:
trainX.shape[0] + testX.shape[0]
#print np.any(np.isnan(trainX[:, 13])==False)
sample_info['Features names'][13]
nans_counter = np.sum(np.isnan(trainX[:, :]), axis=0)
#np.array(sample_info['Features names'])[ind]
ind = nans_counter == 0
#print np.sum(np.isnan(testX[:, :]), axis=0)
nans_counter
Out[100]:
In [8]:
# Baseline: fit an XGBoost classifier with default hyper-parameters on the
# full feature matrix (including the NaN-containing columns).
clf = xgb.XGBClassifier()
clf.fit(trainX, trainY)
Out[8]:
In [15]:
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
# Evaluate the fitted classifier on the held-out test set.
# NOTE(review): this cell duplicates the `show_results` helper defined in a
# later cell — consider reusing it once the cells are reordered.
predictions = clf.predict(testX)
actuals = testY
cm = confusion_matrix(actuals, predictions)
print(cm)
# Row-normalize so each row (true class) shows per-class prediction fractions.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm_normalized)
print 'accuracy', accuracy_score(actuals, predictions)
In [67]:
# Recursive Feature Elimination: repeatedly fit the estimator and drop the
# weakest features until only N_FEATURES_TO_SELECT remain.
from sklearn import datasets  # NOTE(review): unused in this cell
from sklearn.feature_selection import RFE
N_FEATURES_TO_SELECT = 5  # was a positional magic number in the RFE(...) call
clf = RandomForestClassifier(n_estimators=100)  # alternatively: xgb.XGBClassifier()
rfe = RFE(clf, n_features_to_select=N_FEATURES_TO_SELECT, verbose=1)
# Fit only on the NaN-free columns (`ind` masks out columns with missing values).
rfe = rfe.fit(trainX[:, ind], trainY)
# summarize the selection of the attributes
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # 1 = selected; larger rank = eliminated earlier
In [74]:
# Reuse the fitted RFE selector as `clf` for the evaluation cells below
# (its predict() runs on the selected feature subset).
clf = rfe
clf
Out[74]:
In [89]:
print(__doc__)
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn
ranking = rfe.ranking_
ranking = ranking[np.newaxis, :]
# Plot pixel ranking
plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of features with RFE")
plt.show()
In [83]:
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
plt.show()
In [81]:
# RFE feature ranking hard-coded from an earlier run, parsed into an int array.
raw_ranking = '9 6 1 1 14 23 12 26 20 17 1 4 15 27 2 16 1 5 8 30 29 28 1 11 24 22 18 25 19 3 21 10 13 7'
importance = np.array([int(tok) for tok in raw_ranking.split(' ') if tok != ''])
importance
Out[81]:
In [73]:
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
def show_results(clf, testX, testY):
    """Print the confusion matrix (raw and row-normalized) and accuracy of
    a fitted classifier `clf` on the held-out set (testX, testY)."""
    predictions = clf.predict(testX)
    actuals = testY
    cm = confusion_matrix(actuals, predictions)
    print(cm)
    # Row-normalize so each true-class row shows prediction fractions.
    # NOTE(review): a row summing to zero would produce NaNs here.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)
    print 'accuracy', accuracy_score(actuals, predictions)
# Evaluate the RFE-based classifier on the NaN-free feature columns only.
show_results(clf, testX[:, ind], testY)
In [ ]:
# Scratch sanity checks (cell never executed — no In[] number).
np.any(np.isnan(trainX[:, ~ind]))  # do the excluded columns really contain NaNs?
ind
# NOTE(review): `trainY[ind]` indexes the *sample* labels with a *feature*
# mask — the lengths differ, so this looks like a bug; trainY.shape was
# probably intended.
trainX[:, ind].shape, trainY[ind].shape
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [90]:
from sklearn import metrics  # NOTE(review): unused in this cell
from sklearn.ensemble import ExtraTreesClassifier
# Rank the NaN-free features by impurity-based importance.
# FIX: seed the estimator explicitly — without random_state the importances
# depended on how many earlier cells had consumed the global numpy RNG.
model = ExtraTreesClassifier(random_state=SEED)
model.fit(trainX[:, ind], trainY)
# display the relative importance of each attribute
print(model.feature_importances_)
ETC_imp = model.feature_importances_  # reused by the selection cell below
In [108]:
sel_ind = ETC_imp > 3 * 10**-2
print sum(sel_ind), 'features'
In [111]:
# Retrain a random forest on the importance-selected feature subset and evaluate.
# NOTE(review): n_estimators=150 here vs. 100 in the other cells — confirm intentional.
clf = RandomForestClassifier(n_estimators=150)#xgb.XGBClassifier()
clf.fit(trainX[:, ind][:, sel_ind], trainY)
show_results(clf, testX[:, ind][:, sel_ind], testY)
In [113]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
%matplotlib inline
X = trainX[:, ind]
y = trainY
# Create the RFE object and compute a cross-validated score.
svc = RandomForestClassifier(n_estimators=100)
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1,scoring='accuracy')
rfecv.fit(X, y)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
In [ ]: