In [13]:
import base.data_handler as dh
import pandas as pd
import numpy as np
import base.models as models
import experiments
import base.grouped_classifier as group
import base.grouped_clusterer as clusters
from base.grouped_classifier import GroupedClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.neural_network import BernoulliRBM, rbm
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN, MeanShift
%matplotlib inline
%config InlineBackend.figure_format = 'png'
import warnings
warnings.filterwarnings('ignore')
Loading spectra from FITS files and preprocessing them
In [14]:
# Read the raw spectra from the FITS files under ./data.
unprocessed = dh.load_spectra_from_fits('./data')

# Build the baseline ("unprocessed") frame. The call differs from the one that
# builds the processed set only in its third positional argument (False here).
# NOTE(review): the meaning of the positional flags is defined in
# dh.process_set; presumably the third one toggles preprocessing -- confirm.
spectra_unprocessed = dh.process_set(unprocessed, False, False, ',')
spectra_unprocessed_df = dh.to_dataframe(spectra_unprocessed)
In [15]:
# Build the processed variant of the same raw spectra: identical call to the
# unprocessed one except that the third positional argument is True.
# NOTE(review): presumably this flag enables the preprocessing step -- confirm
# against dh.process_set.
spectra_processed = dh.process_set(unprocessed, False, True, ',')

# Tabular form used by every experiment below.
spectra_processed_df = dh.to_dataframe(spectra_processed)
In [16]:
# Run feature selection on the processed spectra. The call returns a pair that
# is unpacked into `top_features` (later passed to the experiment runners just
# like the full frames) and `scores`.
# NOTE(review): the literal 10 is presumably the number of features to keep --
# confirm against dh.select_features.
top_features, scores = dh.select_features(spectra_processed_df, 10)
In [17]:
# Show how many rows fall into each distinct value of the 'class' column, to
# check how balanced the labels are before running the experiments.
spectra_processed_df['class'].value_counts()
Out[17]:
In [18]:
# Experiments on the UNPROCESSED spectra: once on the data as is, and once
# through run_pca_tests with n_components=50.
res_unprocessed_without_pca = experiments.run_without_pca(spectra_unprocessed_df)
# Fixed: this call previously received spectra_processed_df, which made the
# "unprocessed with PCA" results a duplicate of the processed run and the
# 'unprocessed_*.pdf' plots mislabeled.
res_unprocessed_with_pca = experiments.run_pca_tests(spectra_unprocessed_df, n_components=50)
In [19]:
# Experiments on the PROCESSED spectra: once on the data as is, and once
# through run_pca_tests with n_components=50.
res_processed_without_pca = experiments.run_without_pca(spectra_processed_df)
res_processed_with_pca = experiments.run_pca_tests(spectra_processed_df, n_components=50)
In [20]:
# Experiments on the output of dh.select_features (`top_features`): once on the
# data as is, and once through run_pca_tests with n_components=50.
# NOTE(review): select_features was called with 10; if that is the number of
# retained features, requesting 50 components here looks inconsistent --
# confirm how run_pca_tests treats n_components larger than the feature count.
res_top_without_pca = experiments.run_without_pca(top_features)
res_top_with_pca = experiments.run_pca_tests(top_features, n_components=50)
In [21]:
# Plot the no-PCA results for the unprocessed spectra.
# NOTE(review): save_file presumably makes plot_results write the figure to
# 'unprocessed.pdf' in the working directory -- confirm.
experiments.plot_results(res_unprocessed_without_pca, save_file='unprocessed.pdf')
In [22]:
# Display the raw result object behind the plot above for inspection.
res_unprocessed_without_pca
Out[22]:
In [23]:
# One plot per entry of the PCA run on the unprocessed spectra. Each key is
# printed and also embedded in the output file name.
for variant, variant_results in res_unprocessed_with_pca.items():
    print(variant)
    experiments.plot_results(
        variant_results,
        save_file='unprocessed_' + variant + '.pdf',
    )
In [24]:
# Plot the no-PCA results for the processed spectra.
# NOTE(review): save_file presumably makes plot_results write the figure to
# 'processed.pdf' in the working directory -- confirm.
experiments.plot_results(res_processed_without_pca, save_file='processed.pdf')
In [25]:
# One plot per entry of the PCA run on the processed spectra. Each key is
# printed and also embedded in the output file name.
for variant, variant_results in res_processed_with_pca.items():
    print(variant)
    experiments.plot_results(
        variant_results,
        save_file='processed_' + variant + '.pdf',
    )
In [26]:
# Plot the no-PCA results for the selected top features.
# NOTE(review): save_file presumably makes plot_results write the figure to
# 'top.pdf' in the working directory -- confirm.
experiments.plot_results(res_top_without_pca, save_file='top.pdf')
In [27]:
# One plot per entry of the PCA run on the selected top features. Each key is
# printed and also embedded in the output file name.
for variant, variant_results in res_top_with_pca.items():
    print(variant)
    experiments.plot_results(
        variant_results,
        save_file='top_' + variant + '.pdf',
    )
In [28]:
# DISABLED: combined cross-validation plot across all experiment variants.
# NOTE(review) before re-enabling:
#  - The plotting loops in this notebook index the *_with_pca results by the
#    keys they iterate over (res[key]), so `res_..._with_pca['crossvalidation']`
#    and `result['crossvalidation']` (where `result` is such a key) probably do
#    not address the intended data -- confirm the result structure.
#  - Only res_top_with_pca is folded in by the loop; verify that the label list
#    matches the number and order of the combined entries.
#  - A comma was missing after 'KNN-un-decomposed', which would have merged it
#    with 'RF-processed' into a single label; it is added below.
#combined_crossvalidation = res_unprocessed_without_pca['crossvalidation'] + res_unprocessed_with_pca['crossvalidation'] + res_processed_without_pca['crossvalidation'] + res_processed_with_pca['crossvalidation'] + res_top_without_pca['crossvalidation']
#for result in res_top_with_pca:
# combined_crossvalidation += result['crossvalidation']
#group.plot_crossvalidation(combined_crossvalidation, labels=['RF-unprocessed', 'SVC-unprocessed', 'LinearSVC-unprocessed', 'KNN-unprocessed',
# 'RF-un-decomposed', 'SVC-un-decomposed', 'LinearSVC-un-decomposed', 'KNN-un-decomposed',
# 'RF-processed', 'SVC-processed', 'LinearSVC-processed', 'KNN-processed',
# 'RF-decomposed', 'SVC-decomposed', 'LinearSVC-decomposed', 'KNN-decomposed',
# 'RF-top', 'SVC-top', 'LinearSVC-top', 'KNN-top',
# 'RF-top-decomposed', 'SVC-top-decomposed', 'LinearSVC-top-decomposed', 'KNN-top-decomposed'])
In [29]:
# Rank the features of the processed spectra by the importance values returned
# from experiments.get_feature_importances_using_rdf and keep the positions of
# the 50 highest-ranked ones.
importances = experiments.get_feature_importances_using_rdf(spectra_processed_df)

# Pair every importance with its position so the position survives sorting.
importances_with_index = [(importance, idx) for idx, importance in enumerate(importances)]

# Fixed: sort in DESCENDING order. The previous ascending sort followed by
# [0:50] selected the 50 *least* important features. sorted() is stable, so
# ties keep their original feature order.
sorted_imp = sorted(importances_with_index, key=lambda tup: tup[0], reverse=True)

# Positions of the 50 most important features (fewer if there are fewer than 50).
indices = [tup[1] for tup in sorted_imp[:50]]
In [29]:
In [30]:
# Split the processed spectra into a training and a test frame.
# NOTE(review): no seed or split ratio is passed here; whether the split is
# reproducible depends on dh.split_train_set -- confirm.
train, test = dh.split_train_set(spectra_processed_df)
# Show the class distribution of the test part to check it is representative.
test['class'].value_counts()
Out[30]:
In [30]:
In [30]: