In [13]:
import base.data_handler as dh
import pandas as pd
import numpy as np
import base.models as models
import experiments
import base.grouped_classifier as group
import base.grouped_clusterer as clusters
from base.grouped_classifier import GroupedClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.neural_network import BernoulliRBM, rbm
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN, MeanShift
%matplotlib inline
%config InlineBackend.figure_format = 'png' 
import warnings
warnings.filterwarnings('ignore')

Loading of spectra from fits files and their preprocessing


In [14]:
unprocessed = dh.load_spectra_from_fits('./data')
spectra_unprocessed_df = dh.to_dataframe(dh.process_set(unprocessed, False, False, ','))

In [15]:
spectra_processed = dh.process_set(unprocessed, False, True, ',')
spectra_processed_df= dh.to_dataframe(spectra_processed)

In [16]:
top_features, scores = dh.select_features(spectra_processed_df, 10)

In [17]:
spectra_processed_df['class'].value_counts()


Out[17]:
3    124
2     57
1     31
4     12
dtype: int64

In [18]:
res_unprocessed_without_pca = experiments.run_without_pca(spectra_unprocessed_df)
res_unprocessed_with_pca = experiments.run_pca_tests(spectra_processed_df, n_components=50)


{'min_samples_split': 2, 'min_samples_leaf': 1}
{'gamma': 0.03125, 'C': 32}
{'C': 0.25}
{'n_neighbors': 1}
{'min_samples_split': 3, 'min_samples_leaf': 1}
{'gamma': 0, 'C': 512}
{'C': 64}
{'n_neighbors': 1}
{'min_samples_split': 2, 'min_samples_leaf': 2}
{'gamma': 0.0625, 'C': 32}
{'C': 1}
{'n_neighbors': 1}
{'min_samples_split': 3, 'min_samples_leaf': 3}
{'gamma': 0.03125, 'C': 32}
{'C': 4}
{'n_neighbors': 1}

In [19]:
res_processed_without_pca = experiments.run_without_pca(spectra_processed_df)
res_processed_with_pca = experiments.run_pca_tests(spectra_processed_df, n_components=50)


{'min_samples_split': 3, 'min_samples_leaf': 1}
{'gamma': 0.03125, 'C': 32}
{'C': 1}
{'n_neighbors': 1}
{'min_samples_split': 4, 'min_samples_leaf': 1}
{'gamma': 2, 'C': 512}
{'C': 64}
{'n_neighbors': 1}
{'min_samples_split': 4, 'min_samples_leaf': 1}
{'gamma': 0.0625, 'C': 32}
{'C': 16}
{'n_neighbors': 1}
{'min_samples_split': 5, 'min_samples_leaf': 4}
{'gamma': 0.03125, 'C': 32}
{'C': 1}
{'n_neighbors': 1}

In [20]:
res_top_without_pca = experiments.run_without_pca(top_features)
res_top_with_pca = experiments.run_pca_tests(top_features, n_components=50)


{'min_samples_split': 3, 'min_samples_leaf': 1}
{'gamma': 0, 'C': 32}
{'C': 1}
{'n_neighbors': 1}
{'min_samples_split': 2, 'min_samples_leaf': 1}
{'gamma': 1, 'C': 512}
{'C': 16}
{'n_neighbors': 1}
{'min_samples_split': 4, 'min_samples_leaf': 1}
{'gamma': 0.03125, 'C': 128}
{'C': 16}
{'n_neighbors': 1}
{'min_samples_split': 2, 'min_samples_leaf': 1}
{'gamma': 0, 'C': 128}
{'C': 4}
{'n_neighbors': 1}

In [21]:
experiments.plot_results(res_unprocessed_without_pca, save_file='unprocessed.pdf')



In [22]:
res_unprocessed_without_pca


Out[22]:
{'classifiers': GroupedClassifier(classifiers=None,
          labels=['forest', 'SVC', 'LinearSVC', 'KNN']),
 'crossvalidation': [('forest',
   array([ 0.70128335,  0.81065919,  0.73400563,  0.79243061,  0.86140714])),
  ('LinearSVC',
   array([ 0.81984603,  0.7800459 ,  0.86735537,  0.84360226,  0.91884196])),
  ('KNN',
   array([ 0.86460348,  0.883509  ,  0.95161846,  0.90792541,  0.90578414])),
  ('SVC',
   array([ 0.89222942,  0.83229814,  0.92951252,  0.90757576,  0.89134768]))],
 'score': [('forest', 0.62705128205128202),
  ('LinearSVC', 0.6469780219780219),
  ('KNN', 0.96886446886446886),
  ('SVC', 0.79318488529014841)],
 'matrix': [('forest', array([[ 3,  3,  0,  0],
          [ 0, 11,  0,  0],
          [ 0,  0, 25,  0],
          [ 0,  0,  2,  0]])), ('LinearSVC', array([[ 5,  1,  0,  0],
          [ 2,  9,  0,  0],
          [ 0,  0, 25,  0],
          [ 0,  0,  2,  0]])), ('KNN', array([[ 6,  0,  0,  0],
          [ 1, 10,  0,  0],
          [ 0,  0, 25,  0],
          [ 0,  0,  0,  2]])), ('SVC', array([[ 5,  1,  0,  0],
          [ 2,  7,  0,  2],
          [ 0,  0, 25,  0],
          [ 0,  0,  0,  2]]))]}

In [23]:
for result in res_unprocessed_with_pca:
    print(result)
    experiments.plot_results(res_unprocessed_with_pca[result], save_file='unprocessed_' + result + '.pdf')


PCA
ICA
Kernel

In [24]:
experiments.plot_results(res_processed_without_pca, save_file='processed.pdf')



In [25]:
for result in res_processed_with_pca:
    print(result)
    experiments.plot_results(res_processed_with_pca[result], save_file='processed_' + result + '.pdf')


PCA
ICA
Kernel

In [26]:
experiments.plot_results(res_top_without_pca, save_file='top.pdf')



In [27]:
for result in res_top_with_pca:
    print(result)
    experiments.plot_results(res_top_with_pca[result], save_file='top_' + result + '.pdf')


PCA
ICA
Kernel

In [28]:
#combined_crossvalidation = res_unprocessed_without_pca['crossvalidation'] + res_unprocessed_with_pca['crossvalidation'] + res_processed_without_pca['crossvalidation'] + res_processed_with_pca['crossvalidation'] + res_top_without_pca['crossvalidation']
#for result in res_top_with_pca:
#    combined_crossvalidation += result['crossvalidation']
#group.plot_crossvalidation(combined_crossvalidation, labels=['RF-unprocessed', 'SVC-unprocessed', 'LinearSVC-unprocessed', 'KNN-unprocessed',
#                                                             'RF-un-decomposed', 'SVC-un-decomposed', 'LinearSVC-un-decomposed', 'KNN-un-decomposed'
#                                                             'RF-processed', 'SVC-processed', 'LinearSVC-processed', 'KNN-processed',
#                                                            'RF-decomposed', 'SVC-decomposed', 'LinearSVC-decomposed', 'KNN-decomposed',
#                                                            'RF-top', 'SVC-top', 'LinearSVC-top', 'KNN-top',
#                                                            'RF-top-decomposed', 'SVC-top-decomposed', 'LinearSVC-top-decomposed', 'KNN-top-decomposed'])

In [29]:
importances = experiments.get_feature_importances_using_rdf(spectra_processed_df)
importances_with_index = [(importance, idx) for idx, importance in enumerate(importances)]
sorted_imp = sorted(importances_with_index, key=lambda tup: tup[0])
indices = [tup[1] for tup in sorted_imp[0:50]]

In [29]:


In [30]:
train, test = dh.split_train_set(spectra_processed_df)
test['class'].value_counts()


Out[30]:
3    25
2    11
1     6
4     2
dtype: int64

In [30]:


In [30]: