In [13]:
    
import base.data_handler as dh
import pandas as pd
import numpy as np
import base.models as models
import experiments
import base.grouped_classifier as group
import base.grouped_clusterer as clusters
from base.grouped_classifier import GroupedClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.neural_network import BernoulliRBM, rbm
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN, MeanShift
%matplotlib inline
%config InlineBackend.figure_format = 'png' 
import warnings
warnings.filterwarnings('ignore')
    
Loading spectra from FITS files and preprocessing them
In [14]:
    
# Read the raw spectra from the local ./data directory.
unprocessed = dh.load_spectra_from_fits('./data')

# Run the set through process_set with both boolean switches off, then convert
# the result to a DataFrame.
# NOTE(review): the meaning of the two flags and of ',' is defined in
# base.data_handler and is not visible here — confirm before changing them.
spectra_unprocessed = dh.process_set(unprocessed, False, False, ',')
spectra_unprocessed_df = dh.to_dataframe(spectra_unprocessed)
    
In [15]:
    
# Same raw spectra as before, but with the second boolean switch of
# process_set turned on (presumably this enables the preprocessing step —
# TODO confirm against base.data_handler).
spectra_processed = dh.process_set(unprocessed, False, True, ',')

# DataFrame view of the processed set; used by the experiments below.
spectra_processed_df = dh.to_dataframe(spectra_processed)
    
In [16]:
    
# Feature selection on the processed spectra via the project helper.
# NOTE(review): 10 is presumably the number of features to keep (it could also
# be a percentage) — confirm against dh.select_features.
top_features, scores = dh.select_features(spectra_processed_df, 10)
    
In [17]:
    
# Display how many rows carry each distinct value of the 'class' column.
spectra_processed_df['class'].value_counts()
    
    Out[17]:
In [18]:
    
# Experiments on the unprocessed spectra, first without PCA ...
res_unprocessed_without_pca = experiments.run_without_pca(spectra_unprocessed_df)

# ... then with PCA (50 components). This run must also be fed the
# *unprocessed* frame: it previously received spectra_processed_df, which made
# it a duplicate of res_processed_with_pca and mislabelled the
# 'unprocessed_*.pdf' plots produced from it.
res_unprocessed_with_pca = experiments.run_pca_tests(spectra_unprocessed_df, n_components=50)
    
    
In [19]:
    
# Experiments on the processed spectra: once on the frame as-is, once through
# the PCA test runner with 50 components.
res_processed_without_pca = experiments.run_without_pca(spectra_processed_df)
res_processed_with_pca = experiments.run_pca_tests(spectra_processed_df, n_components=50)
    
    
In [20]:
    
# Same pair of experiments, this time on the output of dh.select_features.
res_top_without_pca = experiments.run_without_pca(top_features)
# NOTE(review): top_features was built with select_features(..., 10); if that
# means 10 columns, n_components=50 exceeds the feature count — confirm how
# run_pca_tests handles this.
res_top_with_pca = experiments.run_pca_tests(top_features, n_components=50)
    
    
In [21]:
    
# Plot the no-PCA results for the unprocessed spectra; save_file presumably
# names the file the figure is written to (confirm in experiments.plot_results).
experiments.plot_results(res_unprocessed_without_pca, save_file='unprocessed.pdf')
    
    
    
    
    
    
    
In [22]:
    
# Display the raw result object of the no-PCA run on unprocessed spectra.
res_unprocessed_without_pca
    
    Out[22]:
In [23]:
    
# One plot per entry of the PCA results for the unprocessed spectra. Each key
# is printed as a heading and reused in the output file name.
for pca_variant in res_unprocessed_with_pca:
    print(pca_variant)
    variant_results = res_unprocessed_with_pca[pca_variant]
    pdf_path = 'unprocessed_' + pca_variant + '.pdf'
    experiments.plot_results(variant_results, save_file=pdf_path)
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [24]:
    
# Plot the no-PCA results for the processed spectra; save_file presumably
# names the file the figure is written to (confirm in experiments.plot_results).
experiments.plot_results(res_processed_without_pca, save_file='processed.pdf')
    
    
    
    
    
    
    
In [25]:
    
# One plot per entry of the PCA results for the processed spectra. Each key
# is printed as a heading and reused in the output file name.
for pca_variant in res_processed_with_pca:
    print(pca_variant)
    variant_results = res_processed_with_pca[pca_variant]
    pdf_path = 'processed_' + pca_variant + '.pdf'
    experiments.plot_results(variant_results, save_file=pdf_path)
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [26]:
    
# Plot the no-PCA results for the selected top features; save_file presumably
# names the file the figure is written to (confirm in experiments.plot_results).
experiments.plot_results(res_top_without_pca, save_file='top.pdf')
    
    
    
    
    
    
    
In [27]:
    
# One plot per entry of the PCA results for the selected top features. Each
# key is printed as a heading and reused in the output file name.
for pca_variant in res_top_with_pca:
    print(pca_variant)
    variant_results = res_top_with_pca[pca_variant]
    pdf_path = 'top_' + pca_variant + '.pdf'
    experiments.plot_results(variant_results, save_file=pdf_path)
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [28]:
    
#combined_crossvalidation = res_unprocessed_without_pca['crossvalidation'] + res_unprocessed_with_pca['crossvalidation'] + res_processed_without_pca['crossvalidation'] + res_processed_with_pca['crossvalidation'] + res_top_without_pca['crossvalidation']
#for result in res_top_with_pca:
#    combined_crossvalidation += result['crossvalidation']
#group.plot_crossvalidation(combined_crossvalidation, labels=['RF-unprocessed', 'SVC-unprocessed', 'LinearSVC-unprocessed', 'KNN-unprocessed',
#                                                             'RF-un-decomposed', 'SVC-un-decomposed', 'LinearSVC-un-decomposed', 'KNN-un-decomposed',
#                                                             'RF-processed', 'SVC-processed', 'LinearSVC-processed', 'KNN-processed',
#                                                            'RF-decomposed', 'SVC-decomposed', 'LinearSVC-decomposed', 'KNN-decomposed',
#                                                            'RF-top', 'SVC-top', 'LinearSVC-top', 'KNN-top',
#                                                            'RF-top-decomposed', 'SVC-top-decomposed', 'LinearSVC-top-decomposed', 'KNN-top-decomposed'])
    
In [29]:
    
# Rank the features of the processed spectra by the importance scores returned
# by the project helper and keep the positions of the 50 highest-scoring ones.
importances = experiments.get_feature_importances_using_rdf(spectra_processed_df)

# Pair each score with its position so the position survives the sort.
importances_with_index = [(importance, idx) for idx, importance in enumerate(importances)]

# Sort descending: with the previous ascending sort the [0:50] slice selected
# the 50 *least* important features. sorted() is stable, so features with equal
# scores keep their original relative order.
sorted_imp = sorted(importances_with_index, key=lambda tup: tup[0], reverse=True)
indices = [tup[1] for tup in sorted_imp[0:50]]
    
In [29]:
    
    
In [30]:
    
# Split the processed spectra into a train and a test part with the project
# helper (split ratio and any shuffling live in dh.split_train_set and are not
# visible here).
train, test = dh.split_train_set(spectra_processed_df)
# Display how many rows of the test part carry each 'class' value.
test['class'].value_counts()
    
    Out[30]:
In [30]:
    
    
In [30]: