In [1]:
from __future__ import division
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.ensemble import VotingClassifier
from composition.analysis.load_sim import load_sim
from composition.analysis.preprocessing import get_train_test_sets, LabelEncoder
from composition.analysis.pipelines import get_pipeline
from composition.analysis.features import get_training_features
from composition.analysis.plotting_functions import plot_decision_regions
import composition.analysis.data_functions as data_functions
from composition.support_functions.checkdir import checkdir
%matplotlib inline
In [2]:
# Plot styling for the whole notebook: use seaborn's 'muted' palette as the
# default color cycle.
sns.set_palette(palette='muted')
# Same call as the argument-less form; 'deep' is seaborn's documented default
# palette for set_color_codes and is spelled out here for readability.
sns.set_color_codes(palette='deep')
In [3]:
# Load the simulation DataFrame together with its dictionary of cuts
# (cut_dict is indexed below by cut name).
df, cut_dict = load_sim(return_cut_dict=True)

# Start from an all-pass boolean mask. np.ones(..., dtype=bool) keeps the mask
# boolean even when df is empty, whereas np.array([True] * 0) would silently
# produce a float64 array.
selection_mask = np.ones(len(df), dtype=bool)
standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits', 'IT_signal',
                     'StationDensity', 'max_charge_frac', 'reco_containment',
                     'energy_range']
# An event is kept only if it passes every standard cut, so the masks are
# combined with a logical AND (instead of arithmetic multiplication).
for key in standard_cut_keys:
    selection_mask &= cut_dict[key]
df = df[selection_mask]

# Build the train/test split from the selected events and training features.
# `le` is presumably the fitted label encoder -- TODO confirm against
# get_train_test_sets.
feature_list = get_training_features()
X_train, X_test, y_train, y_test, le = get_train_test_sets(df, feature_list)
print('events = ' + str(y_train.shape[0]))
In [10]:
# One pipeline per base classifier, looked up by its short name.
# ('RF', 'KN', 'GBC' presumably stand for random forest, k-nearest neighbors
# and gradient boosting -- confirm in composition.analysis.pipelines.)
pipeline_RF = get_pipeline('RF')
pipeline_KN = get_pipeline('KN')
pipeline_GBC = get_pipeline('GBC')

# Combine the three pipelines into a single ensemble using soft voting.
eclf = VotingClassifier(
    estimators=[('RF', pipeline_RF),
                ('KN', pipeline_KN),
                ('GBC', pipeline_GBC)],
    voting='soft')

# Fit the ensemble on the training split. The previous version called
# `eclf1.fit(...)`, but `eclf1` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel (Restart & Run All).
eclf = eclf.fit(X_train, y_train)
In [11]:
# Predict with the fitted ensemble on both splits first, then score them.
test_predictions = eclf.predict(X_test)
train_predictions = eclf.predict(X_train)

test_acc = accuracy_score(y_test, test_predictions)
train_acc = accuracy_score(y_train, train_predictions)

# Report hold-out accuracy before training accuracy; a large gap between the
# two would point at over-fitting.
print('Test accuracy: {:.4%}'.format(test_acc))
print('Train accuracy: {:.4%}'.format(train_acc))

# 10-fold cross-validation of the ensemble.
# NOTE(review): the folds are drawn from the *test* split (X_test / y_test),
# not the training split -- confirm this is intentional.
scores = cross_val_score(eclf, X_test, y_test, cv=10)
cv_mean = scores.mean()
cv_spread = scores.std()
print('CV score: {:.2%} (+/- {:.2%})'.format(cv_mean, cv_spread))
In [ ]: