The data set we are using today is created from SPyFFI, an image simulator created by Zack Berta and his undergrad student Jacobi Kosiarok.
The ingrediants included by SPyFFI are:
SPyFFI out puts image time series like this:
We process the images from 10 days of TESS observations with standard photometry pipeline, and create light curves for all the stars with TESS magnituded brighter than 14.
For the region of sky we simulated (6 by 6 square degree), this results in 16279 stars. To make our tasks today simpler, we are going to work with only ~4000 stars.
In [1]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
In [30]:
def make_ROC_curve(testY, predY, name):
fig2 = plt.figure()
ax= fig2.add_subplot(1,1,1)
fpr, tpr, _ = roc_curve(testY, predY)
ax.plot(fpr, tpr, label = name)
ax.set_title(('ROC Curve for %s') % name)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
def collect_lc_feature(idlist):
LCfeature=np.zeros([len(idlist),481])
count=0
for i in idlist:
#print i
infile="LTFsmall/"+str(i)+".ltf"
lc=np.loadtxt(infile)[:,1]
LCfeature[count,0]=i
LCfeature[count,1:]=lc
count+=1
return LCfeature
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, ['false positives', 'transits'], rotation=45)
plt.yticks(tick_marks, ['false positives', 'transits'])
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
def fit(model,name,data,cv=True):
trainX,trainY,testX,testY,X,Y=data
model.fit(trainX, trainY)
predY = model.predict(testX)
f1score = metrics.f1_score(testY, predY)
cm = metrics.confusion_matrix(testY, predY)
plot_confusion_matrix(cm)
predY=model.predict_proba(testX)[:,1]
rocscore = metrics.roc_auc_score(testY, predY)
precision, recall, thresholds = metrics.precision_recall_curve(testY, predY)
aucscore=metrics.auc(precision,recall,reorder=True)
print "#####################################"
print "Result using",model
print "f1 score from train test split %f" % f1score
print "roc score from train test split %f" % rocscore
print "auc score from train test split %f" % aucscore
if cv:
#cvscore= cross_val_score(model, X, Y, cv = 5, scoring = 'f1')
cvscore= cross_val_score(model, X, Y, cv = 5, scoring = 'roc_auc')
print "f1 score from CV5 %f" % np.mean(cvscore)
print cm
make_ROC_curve(testY,predY,name)
return
Let's first look at what the TESS light curves:
TESS_simulated_10day_small.csv is the combined feature file.
TESS_simulated_lc_small.csv is the light curve file.
In [37]:
#df=pd.read_csv("TESS_simulated_10day_small.csv",index_col=0)
#df=pd.read_csv("TESS_simulateddata_combinedfeatures.csv",index_col=0)
df=pd.read_csv("TESSfield_19h_44d_combinedfeatures.csv")
In [38]:
#LCfeature=pd.DataFrame(collect_lc_feature(df['Ids']),columns=['Ids']+list(np.arange(480)))
#LCfeature.to_csv
#LCfeature=pd.read_csv("TESS_simulated_lc_small.csv",index_col=0)
In [39]:
#plt.plot(LCfeature.iloc[-1,1:],'.')
In [40]:
#plt.plot(LCfeature.iloc[0,:],'.')
The combined feature files contain features from Box Least Squal measurements, and 20 PCA components from the light curve. Later on hopefully we can explore how to create new features from the light curves.
Let's first examine the columns in the combined feature files:
In [41]:
df.columns
Out[41]:
Columns Ids, Catalog_Period, Depth, Catalog_Epoch records the information we have regarding the injected transits. Anything with period smaller than 0 is not a transit.
SNR is the signal to noise calculated for the transits using the catalog value.
SNR=\sqrt{Ntransit}*Depth/200mmag
There are three type of Y values included in this feature file:
-CatalogY marks True for all the transit planets with SNR larger than 8.5 and BLS_SignaltoPinkNoise_1_0 larger than 7.
-ManuleY marks True all the transits identified by eye.
-CombinedY marks True if either CatalogY and ManuleY is True.
The signals identified by manule effort but not by Catalog is due to blending. The signal missed by manule effort is either because of low signal to noise or due to cuts in Ntransit and Q value (standard practice before manuel inspection).
Let's drop the irrelevent columns before training:
In [42]:
X=df.drop(['Ids','CatalogY','ManuleY','CombinedY','Catalog_Period','Depth','Catalog_Epoch','SNR'],axis=1)
#print X.isnull().any()
In [43]:
Y=df['CombinedY']
In [49]:
trainX, testX, trainY, testY= train_test_split(X, Y,test_size = 0.2)
data=[trainX,trainY,testX,testY,X,Y]
print X.shape, Y[Y==1].shape
We show the results from some standard algorithms here:
In [45]:
model=RandomForestClassifier(n_estimators=1000,class_weight={0:10,1:1},n_jobs=-1)
name="RFC"
fit(model,name,data,cv=False)
In [46]:
model=GradientBoostingClassifier(n_estimators=1000)
name="GBC"
fit(model,name,data,cv=False)
In [47]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=1000)
#model=XGBClassifier(learning_rate=0.1,
# n_estimators=1000,
# max_depth=5,
# min_child_weight=1,
# gamma=0,
# subsample=0.8,
# colsample_bytree=0.8,
# objective='binary:logistic')
model.fit(trainX,trainY)
#model.plot_importance(bst)
#name="XGBoost"
fit(model,name,data,cv=False)
We can compare the prediction with the Manuel selection and the Catalog selection as the following:
In [48]:
from sklearn.cross_validation import StratifiedKFold
model=RandomForestClassifier(n_estimators=3000,n_jobs=-1,class_weight='balanced_subsample',oob_score=True)
skf=StratifiedKFold(Y,n_folds=4)
i=1
for train_index,test_index in skf:
trainX=X.iloc[train_index];testX=X.iloc[test_index]
trainY=np.array(Y)[train_index];testY=np.array(Y)[test_index]
#print train_index
traincatY=np.array(df['CatalogY'])[train_index];testcatY=np.array(df['CatalogY'])[test_index]
trainmanY=np.array(df['ManuleY'])[train_index];testmanY=np.array(df['ManuleY'])[test_index]
model.fit(trainX,trainY)
predY=model.predict_proba(testX)[:,1]
rocscore = metrics.roc_auc_score(testY, predY)
precision, recall, thresholds = metrics.precision_recall_curve(testY, predY)
aucscore=metrics.auc(precision,recall,reorder=True)
predY=model.predict(testX)
f1score = metrics.f1_score(testY, predY)
print "#####################################"
print "fold %d:" % i
print "f1 score from train test split %f" % f1score
print "roc score from train test split %f" % rocscore
print "auc score from train test split %f" % aucscore
print "oob score from RF %f" % model.oob_score_
flag1=(predY==1)*(predY==np.array(testY))
flag2=(predY==1)*(predY==np.array(testmanY))
print "predict Transit %d" % len(predY[predY==1])
print "real Transit %d" % len(testY[testY==1])
print "real Transit selected by eye %d" % len(testmanY[testmanY==1])
print "predicted Transit that's real %d" % len(predY[flag1])
print "predicted Transits selected by eye %d" % len(predY[flag2])
i+=1
In [22]:
model=GradientBoostingClassifier(n_estimators=3000)
skf=StratifiedKFold(Y,n_folds=4)
i=1
for train_index,test_index in skf:
trainX=X.iloc[train_index];testX=X.iloc[test_index]
trainY=np.array(Y)[train_index];testY=np.array(Y)[test_index]
#print train_index
traincatY=np.array(df['CatalogY'])[train_index];testcatY=np.array(df['CatalogY'])[test_index]
trainmanY=np.array(df['ManuleY'])[train_index];testmanY=np.array(df['ManuleY'])[test_index]
model.fit(trainX,trainY)
predY=model.predict(testX)
f1score = metrics.f1_score(testY, predY)
predY=model.predict_proba(testX)[:,1]
rocscore = metrics.roc_auc_score(testY, predY)
print "#####################################"
print "fold %d:" % i
print "f1 score from train test split %f" % f1score
print "roc score from train test split %f" % rocscore
flag1=(predY==1)*(predY==np.array(testY))
flag2=(predY==1)*(predY==np.array(testmanY))
print "predict Transit %d" % len(predY[predY==1])
print "real Transit %d" % len(testY[testY==1])
print "real Transit selected by eye %d" % len(testmanY[testmanY==1])
print "predicted Transit that's real %d" % len(predY[flag1])
print "predicted Transits selected by eye %d" % len(predY[flag2])
i+=1
In [23]:
model=XGBClassifier(n_estimators=3000)
#GradientBoostingClassifier(n_estimators=3000)
skf=StratifiedKFold(Y,n_folds=4)
i=1
for train_index,test_index in skf:
trainX=X.iloc[train_index];testX=X.iloc[test_index]
trainY=np.array(Y)[train_index];testY=np.array(Y)[test_index]
#print train_index
traincatY=np.array(df['CatalogY'])[train_index];testcatY=np.array(df['CatalogY'])[test_index]
trainmanY=np.array(df['ManuleY'])[train_index];testmanY=np.array(df['ManuleY'])[test_index]
model.fit(trainX,trainY)
predY=model.predict(testX)
f1score = metrics.f1_score(testY, predY)
predY=model.predict_proba(testX)[:,1]
rocscore = metrics.roc_auc_score(testY, predY)
print "#####################################"
print "fold %d:" % i
print "f1 score from train test split %f" % f1score
print "roc score from train test split %f" % rocscore
flag1=(predY==1)*(predY==np.array(testY))
flag2=(predY==1)*(predY==np.array(testmanY))
print "predict Transit %d" % len(predY[predY==1])
print "real Transit %d" % len(testY[testY==1])
print "real Transit selected by eye %d" % len(testmanY[testmanY==1])
print "predicted Transit that's real %d" % len(predY[flag1])
print "predicted Transits selected by eye %d" % len(predY[flag2])
i+=1
In [17]:
featurelist=X.columns
rfc= RandomForestClassifier(n_estimators=1000)
rfc.fit(trainX, trainY)
Out[17]:
In [18]:
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
threshold=0.02
droplist=[]
for f in range(X.shape[1]):
if importances[indices[f]]<threshold:
droplist.append(featurelist[indices[f]])
print("%d. feature %d (%s %f)" % (f + 1, indices[f], featurelist[indices[f]],importances[indices[f]]))
In [19]:
X_selected=X.drop(droplist,axis=1)
X_selected.head()
Out[19]:
In [20]:
model=RandomForestClassifier(n_estimators=1000,n_jobs=-1,class_weight='balanced_subsample')
name="RFC"
fit(model,name,data)
In [ ]: