This project analyses the data observed by the NASA Kepler space telescope searching for exoplanets using the transit technique.
Planets themselves do not emit light, but the stars they orbit do. If such a star is observed over several months or years, a regular 'dimming' of the flux (the light intensity) may appear. This is evidence that there may be a body orbiting the star; such a star can be considered a 'candidate' system.
NASA itself uses Python to interpret the data and has created PyKE, a library for data reduction that helps with the extraction and preprocessing of the light-curve images; however, this project analyses only FLUX data, not images.
Some of the machine learning techniques that have already been applied by developers include 1-D CNNs, XGBoost, and PCA.
In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from time import time
import scipy
import matplotlib.patches as mpatches
from sklearn.metrics import fbeta_score, accuracy_score,make_scorer ,confusion_matrix, precision_score
from sklearn.decomposition import PCA, NMF
from sklearn.preprocessing import normalize, MinMaxScaler
from pandas.tools.plotting import autocorrelation_plot
from sklearn import svm
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from imblearn.metrics import classification_report_imbalanced, sensitivity_specificity_support
from sklearn.model_selection import GridSearchCV, ShuffleSplit,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
import helper as hp
# Pretty display for notebooks
%matplotlib inline
print "Libraries imported successfully"
In [2]:
# Load the Kepler training dataset; index_col=0 uses the first CSV column as the row index.
try:
    data = pd.read_csv(os.path.join("project_dataset", 'exoTrain.csv'), index_col=0)
    print("kepler dataset has {} samples with {} features each.".format(*data.shape))
except (IOError, OSError):
    # Catch only file-access errors: the original bare `except:` would also
    # hide real bugs (e.g. a bad format string or a missing import).
    print("Dataset could not be loaded. Is the dataset missing?")
In [3]:
#create label array and drop it from features
labels = data.LABEL
#labels = [1 if x == 2 else 0 for x in labels]
labels.replace(1,0, inplace=True)
labels.replace(2,1, inplace=True)
data.drop('LABEL',axis=1, inplace =True)
In [21]:
# Peek at the confirmed-exoplanet rows, then summary statistics of every flux column.
display(data[labels == 1].head())
display(data.describe())
In [15]:
# L2-normalise each light curve, then rescale every feature into [-10000, 10000].
# `scaler` is kept around: the same fitted scaler is applied to the test set later.
scaler = MinMaxScaler(feature_range=(-10000, 10000))
scaled = scaler.fit_transform(normalize(data))
norm_data = pd.DataFrame(scaled, columns=data.columns, index=data.index)
# Preview the confirmed-exoplanet rows after scaling.
display(norm_data[labels == 1].head())
In [34]:
# Autocorrelation of the light curve of the star indexed 37 —
# periodic transits should show up as regularly spaced correlation peaks.
autocorrelation_plot(data.loc[37])
Out[34]:
In [28]:
# Scatter the raw flux of the first 37 light curves in an 8x5 grid of subplots.
fig = plt.figure(figsize=(30, 40))
x = np.arange(3197)
for star in range(37):
    panel = fig.add_subplot(8, 5, star + 1)
    panel.scatter(x, data.iloc[star, :])
In [36]:
# Look for outliers on class 0 only: an anomaly in class 1 is exactly the
# transit signal we want to keep, not noise.
cnt = hp.find_outliers(norm_data[labels == 0])
# most_common returns (row_index, count) pairs; keep just the indices.
# (The original mutated `outliers` inside the plotting loop, mixing data
# preparation with rendering.)
outliers = [idx for idx, _ in cnt.most_common(37)]
fig = plt.figure(figsize=(30, 40))
x = np.array(range(3197))
for i in range(37):
    ax = fig.add_subplot(8, 5, i + 1)
    ax.scatter(x, data.iloc[outliers[i], :])
In [37]:
#special outlier
# Zoom in on the 22nd most frequent outlier curve for closer inspection
# (reuses `x` and `outliers` from the previous cell).
fig , ax = plt.subplots(figsize=(7,7))
ax.scatter(x,data.iloc[outliers[21],:])
Out[37]:
In [38]:
# Remove the top 1% most frequent outlier rows (indices come from `cnt`,
# which was computed on class-0 curves only).
OnePer = int(len(data) / 100)  # len(data) is the row count; no need to go through a specific column
print("removed 1% of the outliers which is: {} ".format(OnePer))
outliers = [idx for idx, _ in cnt.most_common(OnePer)]
# Drop the outlier rows from both features and labels, keeping them aligned.
good_data = data.drop(data.index[outliers]).reset_index(drop=True)
good_labels = labels.drop(labels.index[outliers]).reset_index(drop=True)
In [12]:
# Save the cleaned data; `with` guarantees the file handle is closed
# (the original one-liner leaked it).
with open('OneClass-Train.p', 'wb') as f:
    pickle.dump((good_data, good_labels), f)
In [13]:
# Oversample the minority (exoplanet) class with the SVM variant of SMOTE.
# NOTE(review): `kind`/`m_neighbors` and `fit_sample` belong to older
# imbalanced-learn APIs — confirm against the installed version.
smote = SMOTE(n_jobs =-1,kind = 'svm',k_neighbors = 10,m_neighbors =15, random_state=42)
over_data, over_labels = smote.fit_sample(good_data,good_labels)
In [14]:
# Clean the oversampled set by removing Tomek links (borderline
# majority/minority nearest-neighbour pairs), undersampling the majority class.
TL = TomekLinks(n_jobs =-1,random_state=42)
under_data, under_labels = TL.fit_sample(over_data,over_labels)
In [15]:
# Visual comparison of class balance before vs after resampling (project helper).
hp.vs_class_imbalance(good_data, good_labels,under_data, under_labels)
In [16]:
# Save the resampled training data; `with` closes the handle that the
# original one-liner leaked.
with open('Classifier-Train.p', 'wb') as f:
    pickle.dump((under_data, under_labels), f)
In [29]:
# Load the preprocessed one-class training data; `with` ensures the file
# handle is closed (the original inline open() leaked it).
with open('OneClass-Train.p', 'rb') as f:
    good_data, good_labels = pickle.load(f)
In [17]:
def mod_clas(y_pred):
    """Map one-class predictions (-1 = outlier, +1 = inlier) onto 0/1 labels."""
    return [0 if pred == -1 else 1 for pred in y_pred]
In [44]:
def vs_frontiers(good_data,good_lables):
    """Visualise an IsolationForest decision frontier in 2-D PCA space.

    Fits a 2-component PCA and an IsolationForest on a 4500-row sample of
    class-0 (non-exoplanet) curves, then overlays 1000 held-out class-0
    samples and all class-1 curves on the learned frontier.
    NOTE(review): the parameter name `good_lables` is a typo for
    `good_labels`; kept as-is since callers pass it positionally.
    NOTE(review): `.sample(...)` has no random_state, so the plot and the
    reported error counts are not reproducible across runs — confirm intent.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.font_manager
    pca = PCA(n_components=2, random_state = 42)
    # Dense grid over the plotting window, used to evaluate the decision surface.
    xx, yy = np.meshgrid(np.linspace(-100000,100000, 1000), np.linspace(-100000,100000, 1000))
    X_train = pca.fit_transform(good_data[good_lables == 0].sample(4500))
    X_test = pca.transform(good_data[good_lables == 0].sample(1000))
    X_outliers = pca.transform(good_data[good_lables == 1])
    # fit the model
    clf = IsolationForest(n_jobs = -1,random_state =42).fit(X_train)
    #predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    y_pred_outliers =clf.predict(X_outliers)
    # IsolationForest.predict returns -1 for outliers and +1 for inliers:
    # a -1 on class-0 data is an error...
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size
    # ...while class-1 curves *should* be flagged -1, so +1 here is an error.
    n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
    # plot the line, the points, and the nearest vectors to the plane
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(20,20))
    plt.title("Novelty Detection")
    # Filled contours below the frontier, a dark-red line at the frontier
    # itself (level 0), and a flat fill for the inlier region above it.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
    s = 30
    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s)
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s)
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s)
    #d = plt.scatter(X_TrainOUTliers[:, 0], X_TrainOUTliers[:, 1], c='red', s=s)
    plt.axis('tight')
    plt.xlim((-100000, 100000))
    plt.ylim((-100000, 100000))
    plt.legend([a.collections[0], b1, b2, c],
               ["learned frontier", "training observations",
                "new regular observations", "new abnormal observations"],
               loc="upper left",
               prop=matplotlib.font_manager.FontProperties(size=11))
    # NOTE(review): the 4500/1000/37 denominators are hard-coded to match the
    # sample sizes above — keep them in sync if the samples change.
    plt.xlabel(
        "error train: %d/4500 ; errors novel regular: %d/1000 ; "
        "errors novel abnormal: %d/37"
        % (n_error_train, n_error_test, n_error_outliers))
    plt.show()
vs_frontiers(good_data,good_labels)
In [90]:
# Benchmark without PCA: a one-class SVM trained only on the non-exoplanet
# curves, then evaluated on the whole (cleaned) training set.
onesvm = svm.OneClassSVM(random_state =42)
onesvm.fit(good_data[good_labels == 0])
# Reuse mod_clas to map the -1/+1 one-class output onto 0/1 labels
# (the original duplicated the comprehension inline).
y_pred = mod_clas(onesvm.predict(good_data))
st, sp, su = sensitivity_specificity_support(good_labels, y_pred)
# "specificity" was misspelled "specitivity" in the original output string.
print('sensitivity: {}, specificity: {}, support : {}'.format(st, sp, su))
In [92]:
# Project onto 46 principal components, fitting the PCA on the majority
# (class-0) curves only, then transform the whole cleaned set.
pca = PCA(n_components=46, random_state=42)
PCA_train = pca.fit_transform(good_data[good_labels == 0])
PCA_data = pca.transform(good_data)
# --- benchmark: one-class SVM on the PCA-reduced data ---
onesvm.fit(PCA_train)
# BUG FIX: the original called `clf.predict`, but `clf` only existed as a
# local inside vs_frontiers; the model fitted here is `onesvm`.
y_pred = mod_clas(onesvm.predict(PCA_data))
st, sp, su = sensitivity_specificity_support(good_labels, y_pred)
print('benchmark : sensitivity: {}, specificity: {}, support : {}'.format(st, sp, su))
# --- candidate: IsolationForest on the same projection ---
# BUG FIX: the original did `ifo = ifo.fit(...)` before `ifo` existed
# (the estimator was bound to `Ifo`), raising a NameError.
ifo = IsolationForest(n_jobs=-1, random_state=42).fit(PCA_train)
# Map the -1/+1 predictions onto 0/1 before scoring, matching how the
# test-set evaluation does it; the original scored the raw -1/+1 output.
y_pred = mod_clas(ifo.predict(PCA_data))
st, sp, su = sensitivity_specificity_support(good_labels, y_pred)
print('sensitivity: {}, specificity: {}, support : {}'.format(st, sp, su))
print(classification_report_imbalanced(good_labels, y_pred))
In [9]:
# Load the resampled classifier training data; `with` ensures the file
# handle is closed (the original inline open() leaked it).
with open('Classifier-Train.p', 'rb') as f:
    under_data, under_labels = pickle.load(f)
In [10]:
# Stratified 60/40 train/validation split of the resampled data; the
# stratify argument keeps the class ratio identical in both halves.
X_train, X_val, y_train, y_val = train_test_split(under_data, under_labels,stratify =under_labels,
                                                  test_size=0.4, random_state=42)
In [74]:
def fit_model(X, y):
    """Grid-search a PCA + GradientBoostingClassifier pipeline.

    Only the number of PCA components is searched here; learning-rate,
    n_estimators and max_depth values were explored in earlier runs
    (see the '##tests made' note below).

    Args:
        X: feature matrix.
        y: binary target labels (0 = non-exoplanet, 1 = exoplanet).

    Returns:
        The best fitted sklearn Pipeline found by GridSearchCV.
    """
    # (The original also created an unused `pca` and an unused `score`
    # scorer here; both were dead code and are removed.)
    gbc = GradientBoostingClassifier(n_estimators=100, max_depth=7, min_impurity_split=100000.0, random_state=42)
    pipe = Pipeline([
        # Seed the pipeline's PCA: the original seeded a throwaway PCA but
        # put an unseeded PCA() into the pipeline, hurting reproducibility.
        ('reduce_dim', PCA(random_state=42)),
        ('classify', gbc)])
    # Create cross-validation sets from the training data.
    cv_sets = ShuffleSplit(n_splits=5, test_size=0.20, random_state=42)
    # Params dictionary for the grid search.
    ##tests made:
    ##learning_rate: (0.1,0.08,0.01,0.5)n_estimators:(50,100,180,200,250)max_depth(3,4,5,6,7,8,10)
    params = {'reduce_dim__n_components': (40, 45, 46)}
    # Optimise for precision: keep false exoplanet claims to a minimum.
    scoring_fnc = make_scorer(precision_score)
    grid = GridSearchCV(pipe, params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model.
    grid = grid.fit(X, y)
    return grid.best_estimator_
In [75]:
# Run the grid search on the full resampled training set (slow).
model = fit_model(under_data, under_labels)
In [76]:
# Number of PCA components selected by the grid search.
print model.named_steps['reduce_dim'].n_components_
In [11]:
# Reduce to 46 components (the value the grid search converged on).
pca = PCA(n_components=46, random_state=42)
X_trainPCA = pca.fit_transform(X_train)
X_valPCA = pca.transform(X_val)
# Initialize the three candidate models.
clf_A = GaussianNB()
clf_B = SVC()
clf_C = GradientBoostingClassifier(n_estimators=100, max_depth=7, min_impurity_split=100000.0, random_state=42)
tot_training = X_trainPCA.shape[0]
# Sample sizes for 20%, 40% and 100% of the training data.
# (The original comment and the `samples_1`/`samples_10` names claimed
# 1%/10%, which did not match the 0.2/0.4 fractions actually used.)
samples_20 = int(tot_training * 0.2)
samples_40 = int(tot_training * 0.4)
samples_100 = int(tot_training * 1)
# Collect results on the learners.
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_20, samples_40, samples_100]):
        results[clf_name][i] = \
            hp.train_predict(clf, samples, X_trainPCA, y_train, X_valPCA, y_val)
# Run metrics visualization for the three supervised learning models chosen.
hp.evaluate(results, 0.6, 0.6)
In [79]:
#standard gradient boosting
# NOTE(review): clf_C is presumably fitted inside hp.train_predict above —
# confirm, since nothing in this file calls clf_C.fit directly.
y_pred = clf_C.predict(X_valPCA)
print classification_report_imbalanced(y_val, y_pred)
In [81]:
#optimised gradient boosting
# The tuned pipeline includes its own PCA step, so it takes the raw X_val.
y_pred = model.predict(X_val)
print classification_report_imbalanced(y_val, y_pred)
In [12]:
# Load the held-out Kepler test dataset (same layout as exoTrain.csv).
Testdata = pd.read_csv(os.path.join("project_dataset", 'exoTest.csv'),index_col=0)
In [13]:
# Create the test label array and drop it from the features, using the same
# 1->0 / 2->1 recoding as the training set. A single dict replace avoids the
# original's inplace mutation of a column view (chained-assignment hazard).
testlabels = Testdata.LABEL.replace({1: 0, 2: 1})
Testdata.drop('LABEL', axis=1, inplace=True)
In [18]:
# Apply the training-time preprocessing (L2-normalise, then the MinMax
# scaler fitted earlier) before predicting with the tuned pipeline.
# NOTE(review): the pipeline was trained on `under_data`, which was not
# normalised/scaled this way — confirm the preprocessing actually matches.
X_testdata = scaler.transform(normalize(Testdata))
y_pred = model.predict(X_testdata)
print classification_report_imbalanced(testlabels, y_pred)
In [94]:
# Evaluate the IsolationForest one-class model on the held-out test set.
# NOTE(review): relies on `pca` still being the class-0-fitted PCA from the
# benchmark cell (In [92]) — depends on notebook execution order; confirm.
x_testdata = pca.transform(Testdata)
# Reuse mod_clas to map -1/+1 onto 0/1 (the original duplicated the mapping).
y_pred = mod_clas(ifo.predict(x_testdata))
st, sp, su = sensitivity_specificity_support(testlabels, y_pred)
# "specificity" was misspelled "specitivity" in the original output string.
print('sensitivity: {}, specificity: {}, support : {}'.format(st, sp, su))
print('')
print(classification_report_imbalanced(testlabels, y_pred))
print(confusion_matrix(testlabels, y_pred))
In [ ]: