In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from astropy.io import ascii
from astropy.table import Table, Column

%matplotlib inline

In [2]:
cat = ascii.read('cat_lau.csv')

In [3]:
cat['J-Ks'] = cat['J$_{2\\prime\\prime}$'] - cat['Ks$_{2\\prime\\prime}$']
cat['H-Ks'] = cat['H$_{2\\prime\\prime}$'] - cat['Ks$_{2\\prime\\prime}$']
cat['J-H'] = cat['J$_{2\\prime\\prime}$'] - cat['H$_{2\\prime\\prime}$']
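
A quick look at the derived color columns (an added sanity check, not part of the original run; astropy selects multiple columns by name):

In [ ]:
# Peek at the first rows of the new color columns.
print(cat['J-Ks', 'H-Ks', 'J-H'][:5])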

In [4]:
vis = cat[cat['Visual'] == 'True']  # keep only sources flagged as visually classified

In [5]:
d010 = ascii.read('d010_resto.dat', format='ipac')
d115 = ascii.read('d115_resto.dat', format='ipac')

In [25]:
gxs = vis[['J', 'H', 'Ks', 'J-Ks', 'J-H', 'H-Ks', 'R$_{1/2}$', 'C', '$\\epsilon$', 'n']].to_pandas()

In [26]:
d010['J-Ks'] = d010['MAG_APER_J_C'] - d010['MAG_APER_Ks_C']
d010['H-Ks'] = d010['MAG_APER_H_C'] - d010['MAG_APER_Ks_C']
d010['J-H'] = d010['MAG_APER_J_C'] - d010['MAG_APER_H_C']
d115['J-Ks'] = d115['MAG_APER_J_C'] - d115['MAG_APER_Ks_C']
d115['H-Ks'] = d115['MAG_APER_H_C'] - d115['MAG_APER_Ks_C']
d115['J-H'] = d115['MAG_APER_J_C'] - d115['MAG_APER_H_C']

In [27]:
strs_d010 = d010[['MAG_PSF_J_C','MAG_PSF_H_C','MAG_PSF_Ks_C', 'J-Ks', 'J-H', 'H-Ks',
                  'FLUX_RADIUS_051','C','ELLIPTICITY','SPHEROID_SERSICN']].to_pandas()

In [28]:
strs_d115 = d115[['MAG_PSF_J_C','MAG_PSF_H_C','MAG_PSF_Ks_C', 'J-Ks', 'J-H', 'H-Ks',
                  'FLUX_RADIUS_051','C','ELLIPTICITY','SPHEROID_SERSICN']].to_pandas()

In [303]:
strs = pd.concat([strs_d010, strs_d115])
strs.to_csv('stars_corrected.csv')

In [30]:
strs = strs[strs.MAG_PSF_J_C < 30]  # drop sentinel magnitudes (>= 30) flagging non-detections
gxs = gxs[gxs.J < 30]
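
A quick check of the sample sizes that survive the magnitude cut (added here; the counts depend on the input catalogues):

In [ ]:
# Number of stars and galaxies left after the J < 30 cut.
print(len(strs), len(gxs))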

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import neighbors
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [158]:
def experiment(clf, x, y, nfolds=10):
    """Run a stratified k-fold experiment and collect out-of-fold predictions."""
    skf = StratifiedKFold(n_splits=nfolds)
    probabilities = np.array([])
    predictions = np.array([])
    y_testing = np.array([])

    for train, test in skf.split(x, y):
        x_train = x[train]
        y_train = y[train]
        clf.fit(x_train, y_train)

        x_test = x[test]
        y_test = y[test]
        pr = clf.predict(x_test)
        # predict_proba columns follow clf.classes_, so column 0 is P(star);
        # 1 - probabilities below gives P(galaxy), the positive class.
        probs = clf.predict_proba(x_test)[:, 0]

        probabilities = np.hstack([probabilities, probs])
        predictions = np.hstack([predictions, pr])
        y_testing = np.hstack([y_testing, y_test])

    print(metrics.classification_report(y_testing, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_testing, 1. - probabilities)
    pr_rec_curve = metrics.precision_recall_curve(y_testing, 1. - probabilities)
    roc_auc = metrics.auc(fpr, tpr)
    return {'fpr': fpr,
            'tpr': tpr,
            'thresh': thresholds,
            'roc_auc': roc_auc,
            'brier_loss': metrics.brier_score_loss(y_testing, 1. - probabilities),
            'pr_rec_curve': pr_rec_curve,
            'y_test': y_testing,
            'predictions': predictions,
            'probabilities': probabilities,
            'confusion_matrix': metrics.confusion_matrix(y_testing, predictions),
            }

In [304]:
strs2 = strs.sample(1000)  # draw a random subsample of 1000 stars

In [305]:
# feature matrix and labels: 0 = star, 1 = galaxy
X = np.vstack((strs2.values, gxs.values))
Y = np.hstack((np.repeat(0, len(strs2)), np.repeat(1, len(gxs))))
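
As an added sanity check (not part of the original run), the class balance and the per-fold stratification can be inspected before running the experiments:

In [ ]:
# 0 = star, 1 = galaxy; StratifiedKFold should keep this ratio in every fold.
print(X.shape, np.bincount(Y))
for train, test in StratifiedKFold(n_splits=10).split(X, Y):
    print(np.bincount(Y[test]))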

Decision Tree (gini)


In [306]:
res_Dtree = experiment(DecisionTreeClassifier(max_leaf_nodes=12), X, Y, 10)


             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      1000
        1.0       0.96      0.94      0.95        90

avg / total       0.99      0.99      0.99      1090

K-Nearest Neighbors


In [307]:
res_knn = experiment(neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform'), X, Y, 10)


             precision    recall  f1-score   support

        0.0       0.99      0.99      0.99      1000
        1.0       0.89      0.88      0.88        90

avg / total       0.98      0.98      0.98      1090

Support Vector Machines


In [308]:
res_svc = experiment(svm.SVC(kernel='linear', probability=True), X, Y, 10)


             precision    recall  f1-score   support

        0.0       0.99      0.98      0.98      1000
        1.0       0.83      0.84      0.84        90

avg / total       0.97      0.97      0.97      1090

Random Forest


In [309]:
res_Rforest = experiment(RandomForestClassifier(n_estimators=300, 
                                                max_features=8, 
                                                min_samples_leaf=20,
                                                n_jobs=8), X, Y, 10)


             precision    recall  f1-score   support

        0.0       0.98      0.99      0.99      1000
        1.0       0.91      0.77      0.83        90

avg / total       0.97      0.97      0.97      1090

C4.5 (entropy criterion)


In [310]:
res_C45 = experiment(DecisionTreeClassifier(criterion='entropy', min_samples_leaf=15), X, Y, 10)


             precision    recall  f1-score   support

        0.0       0.98      0.99      0.99      1000
        1.0       0.89      0.78      0.83        90

avg / total       0.97      0.97      0.97      1090

ROC Curve


In [315]:
plt.figure(figsize=(6, 6))
#plt.figaspect(.8)

fpr = res_Dtree['fpr']
tpr = res_Dtree['tpr']
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1.5, color='blue', label='Dtree AUC={:06.4f}'.format(roc_auc))

fpr = res_C45['fpr']
tpr = res_C45['tpr']
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1.5, color='red', label='C45 AUC={:06.4f}'.format(roc_auc))

fpr = res_Rforest['fpr']
tpr = res_Rforest['tpr']
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1.5, color='green', label='Rforest AUC={:06.4f}'.format(roc_auc))

fpr = res_svc['fpr']
tpr = res_svc['tpr']
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1.5, color='black', label='SVM AUC={:06.4f}'.format(roc_auc))

fpr = res_knn['fpr']
tpr = res_knn['tpr']
roc_auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1.5, color='magenta', label='KNN AUC={:06.4f}'.format(roc_auc))

plt.xlim(0, 0.1)
plt.ylim(0.8, 1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.title('Receiver Operating Characteristic Curve')


Out[315]:
<matplotlib.text.Text at 0x7f6230b8e690>

In [314]:
plt.figure(figsize=(6, 6))
#plt.figaspect(.8)

# label each curve with the area under its own precision-recall curve
pr, rec, thr = res_Dtree['pr_rec_curve']
plt.plot(rec, pr, lw=1.5, color='blue',
         label='Dtree AUC={:06.4f}'.format(metrics.auc(rec, pr)))

pr, rec, thr = res_C45['pr_rec_curve']
plt.plot(rec, pr, lw=1.5, color='red',
         label='C45 AUC={:06.4f}'.format(metrics.auc(rec, pr)))

pr, rec, thr = res_Rforest['pr_rec_curve']
plt.plot(rec, pr, lw=1.5, color='green',
         label='Rforest AUC={:06.4f}'.format(metrics.auc(rec, pr)))

pr, rec, thr = res_svc['pr_rec_curve']
plt.plot(rec, pr, lw=1.5, color='black',
         label='SVM AUC={:06.4f}'.format(metrics.auc(rec, pr)))

pr, rec, thr = res_knn['pr_rec_curve']
plt.plot(rec, pr, lw=1.5, color='magenta',
         label='KNN AUC={:06.4f}'.format(metrics.auc(rec, pr)))

#plt.xlim(0, 0.2)
plt.ylim(0.5, 1)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.title('Precision-Recall Curve')


Out[314]:
<matplotlib.text.Text at 0x7f623100b810>

In [313]:
# pairwise Cohen kappa agreement between the classifiers and the true labels
true_labels = {'predictions': res_svc['y_test']}

methods = [res_C45, res_Dtree, res_knn, res_Rforest, res_svc, true_labels]

kappa_agreement = np.empty((len(methods), len(methods)))

for i in range(len(methods)):
    for j in range(len(methods)):
        kappa_agreement[i, j] = metrics.cohen_kappa_score(methods[i]['predictions'],
                                                          methods[j]['predictions'])

# plot_confusion_matrix is defined in a later cell (In [326])
plot_confusion_matrix(np.round(kappa_agreement, 3),
                      classes=['C45', 'DTree', 'KNN', 'RForest', 'SVC', 'TrueLabels'],
                      title=r'Cohen $\kappa$ agreement')


Confusion matrix, without normalization
[[ 1.     0.858  0.794  0.826  0.715  0.814]
 [ 0.858  1.     0.865  0.823  0.777  0.945]
 [ 0.794  0.865  1.     0.797  0.825  0.872]
 [ 0.826  0.823  0.797  1.     0.781  0.818]
 [ 0.715  0.777  0.825  0.781  1.     0.82 ]
 [ 0.814  0.945  0.872  0.818  0.82   1.   ]]
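
Cohen's kappa corrects the raw agreement rate for chance: with observed agreement p_o and expected chance agreement p_e, kappa = (p_o - p_e) / (1 - p_e). A minimal hand computation for one pair (an added verification sketch) should match cohen_kappa_score:

In [ ]:
# Verify cohen_kappa_score by hand for DTree against the true labels.
a = res_Dtree['predictions']
b = res_Dtree['y_test']
p_o = np.mean(a == b)                       # observed agreement
p_e = (np.mean(a == 0) * np.mean(b == 0) +  # chance agreement per class
       np.mean(a == 1) * np.mean(b == 1))
print((p_o - p_e) / (1. - p_e))
print(metrics.cohen_kappa_score(a, b))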

Testing against galaxies without visual classification


In [316]:
no_vis = cat[cat['Visual'] == 'False']

In [317]:
gxs_no_vis = no_vis[['J', 'H', 'Ks', 'J-Ks', 'J-H', 'H-Ks', 'R$_{1/2}$', 'C', '$\\epsilon$', 'n']].to_pandas()

gxs_no_vis.dropna(inplace=True)

In [318]:
strs2 = strs  # use the full star sample here (no subsampling)

In [319]:
X_test = np.vstack((strs2.values, gxs_no_vis.values))

Y_test = np.hstack((np.repeat(0, len(strs2)), np.repeat(1, len(gxs_no_vis))))

In [320]:
rforest = RandomForestClassifier(n_estimators=300, max_features=8, min_samples_leaf=20, n_jobs=8)
rforest.fit(X, Y)


Out[320]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=8, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=8, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
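
As a quick follow-up (added here, not in the original run), the fitted forest exposes per-feature importances; the column names below are inferred from the order in which the training matrix was assembled above:

In [ ]:
# Mean-decrease-in-impurity importance, one value per input column.
cols = ['J', 'H', 'Ks', 'J-Ks', 'J-H', 'H-Ks', 'R1/2', 'C', 'ellipticity', 'n']
for name, imp in sorted(zip(cols, rforest.feature_importances_),
                        key=lambda t: -t[1]):
    print('{:12s} {:.3f}'.format(name, imp))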

In [321]:
y_proba = rforest.predict_proba(X_test)[:, 0]  # column 0 = P(star)
y_predict = rforest.predict(X_test)

print(metrics.classification_report(Y_test, y_predict))


             precision    recall  f1-score   support

          0       0.99      0.99      0.99     40000
          1       0.38      0.38      0.38       482

avg / total       0.99      0.99      0.99     40482


In [322]:
pr, rec, thr = metrics.precision_recall_curve(Y_test, 1.-y_proba)
fpr, tpr, thr = metrics.roc_curve(Y_test, 1.-y_proba)

In [323]:
print "Brier-loss score = {}".format(metrics.brier_score_loss(Y_test, 1.-y_proba))


Brier-loss score = 0.0108851263182
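
The Brier score is the mean squared difference between the predicted galaxy probabilities and the true labels, BS = (1/N) sum_i (p_i - y_i)^2, so a direct computation (added check) should reproduce the value above:

In [ ]:
# Brier score by hand: mean squared error of P(galaxy) against Y_test.
print(np.mean((1. - y_proba - Y_test)**2))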

In [324]:
plt.figure(figsize=(6, 3))
plt.subplot(121)  # ROC curve, zoomed in on the low-FPR corner
plt.xlim(0, 0.2)
plt.ylim(0.8, 1.)
plt.plot(fpr, tpr)
plt.subplot(122)  # precision-recall curve
plt.plot(rec, pr)


Out[324]:
[<matplotlib.lines.Line2D at 0x7f62304a8090>]

In [325]:
print(metrics.confusion_matrix(Y_test, y_predict))


[[39708   292]
 [  300   182]]
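
The galaxy-class scores in the report above follow directly from this matrix: precision = 182/(182 + 292) ≈ 0.38 and recall = 182/(182 + 300) ≈ 0.38. An added cross-check:

In [ ]:
# Recover the galaxy precision/recall from the confusion-matrix entries.
tn, fp, fn, tp = metrics.confusion_matrix(Y_test, y_predict).ravel()
print('precision = {:.3f}'.format(tp / float(tp + fp)))
print('recall    = {:.3f}'.format(tp / float(tp + fn)))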

In [326]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Normalize before plotting so the image and the cell annotations agree.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [327]:
plot_confusion_matrix(metrics.confusion_matrix(Y_test, y_predict), ['Star', 'Galaxy'], normalize=False)


Confusion matrix, without normalization
[[39708   292]
 [  300   182]]

In [ ]: