Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
In [12]:
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import itertools
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score, ShuffleSplit, KFold, StratifiedKFold
from sklearn.cluster import KMeans
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import composition as comp
import composition.analysis.plotting as plotting
color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
              'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe': 'C4'}
Choose whether to train on the grouped 'light' and 'heavy' composition classes or on the individual compositions ('P', 'He', 'O', 'Fe')
In [4]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
Get composition classifier pipeline
In [5]:
pipeline_str = 'xgboost'
pipeline = comp.get_pipeline(pipeline_str)
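For reference, a minimal sketch of what an equivalent pipeline could look like (an assumption; the actual comp.get_pipeline implementation isn't shown here). The final step must be named 'classifier' so that pipeline.named_steps['classifier'] below works:
# Hypothetical sketch, not the actual comp.get_pipeline implementation
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

pipeline_sketch = Pipeline([
    ('classifier', XGBClassifier(n_estimators=100, max_depth=3)),
])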
Define energy binning for this analysis
In [6]:
energybins = comp.analysis.get_energybins()
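As a rough sketch (the actual get_energybins return value isn't shown here), the binning could be built from log-spaced edges in log10(E/GeV); the edge values below are placeholders, not the analysis binning:
# Hypothetical sketch of log-spaced energy bins; values are placeholders
log_energy_bins = np.arange(6.0, 8.1, 0.1)   # edges in log10(E/GeV)
energy_bins = 10**log_energy_bins            # edges in GeV
energy_midpoints = 10**((log_energy_bins[:-1] + log_energy_bins[1:]) / 2)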
In [7]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)
In [8]:
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)
Run the classifier on the training and testing sets to gauge the degree of overfitting
In [9]:
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
# scores = cross_val_score(
# estimator=pipeline, X=sim_train.X, y=sim_train.y, cv=3, n_jobs=10)
# print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)
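A learning curve is another way to gauge overfitting: it shows how the gap between training and validation accuracy evolves with training set size. A minimal sketch (a supplement, not part of the original analysis):
# Supplementary sketch: learning curve to visualize the train/validation gap
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    estimator=pipeline, X=sim_train.X, y=sim_train.y,
    train_sizes=np.linspace(0.1, 1.0, 5), cv=3, n_jobs=1)
plt.plot(train_sizes, train_scores.mean(axis=1), label='training accuracy')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='validation accuracy')
plt.xlabel('Training set size')
plt.ylabel('Accuracy')
plt.legend()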
In [13]:
splitter = ShuffleSplit(n_splits=1, test_size=.5, random_state=2)
for set1_index, set2_index in splitter.split(sim_train.X):
    sim_train1 = sim_train[set1_index]
    sim_train2 = sim_train[set2_index]
In [35]:
# Cluster the training features into 4 groups with K-means
kmeans = KMeans(n_clusters=4)
In [36]:
pred = kmeans.fit_predict(sim_train.X)
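K-means cluster labels are arbitrary integers, so it helps to check the cluster populations before interpreting them:
# Check how many training events land in each cluster
# (labels 0-3 carry no physical meaning on their own)
clusters, counts = np.unique(pred, return_counts=True)
for cluster, count in zip(clusters, counts):
    print('cluster {}: {} events'.format(cluster, count))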
In [37]:
MC_comp_mask = {}
for composition in comp_list:
    MC_comp_mask[composition] = sim_train.le.inverse_transform(sim_train.y) == composition
MC_comp_mask
In [38]:
light_0 = np.sum(pred[MC_comp_mask['light']] == 0) / np.sum(MC_comp_mask['light'])
light_1 = np.sum(pred[MC_comp_mask['light']] == 1) / np.sum(MC_comp_mask['light'])
print('Percentage of light events in cluster 0 = {:.2%}'.format(light_0))
print('Percentage of light events in cluster 1 = {:.2%}'.format(light_1))
In [39]:
heavy_0 = np.sum(pred[MC_comp_mask['heavy']] == 0) / np.sum(MC_comp_mask['heavy'])
heavy_1 = np.sum(pred[MC_comp_mask['heavy']] == 1) / np.sum(MC_comp_mask['heavy'])
print('Percentage of heavy events in cluster 0 = {:.2%}'.format(heavy_0))
print('Percentage of heavy events in cluster 1 = {:.2%}'.format(heavy_1))
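The per-composition, per-cluster fractions above generalize to a single table. As a sketch, pandas.crosstab (assuming a pandas version that supports the normalize keyword) gives the full composition-vs-cluster breakdown in one call:
# Sketch: composition-vs-cluster breakdown, normalized per composition
true_comp = sim_train.le.inverse_transform(sim_train.y)
cluster_table = pd.crosstab(pd.Series(true_comp, name='composition'),
                            pd.Series(pred, name='cluster'),
                            normalize='index')
print(cluster_table)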