Today our job is to find out blending stars and distinguish them with the true signals.
When two stars are close to each other in the CCD, part of their lights may fall into the same pixel. If one of the star has a eclipsing signal, the others will see a similar signal in their light curves. Usually the mirrored signal will be more diluted than the main signal. But there could be exceptions.
In [1]:
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import scipy as sp
%matplotlib inline
Let's create two stars as if their point spread functions (PSFs) are one-d gaussians with full width half maximum (FWHM) of 1.2 pixels, which means sigma ~0.5 pixel. Star one is twice the brightness as star two, and the two stars are one pixel apart. Note I haven't pixelize the flux in the left figure below.
In [2]:
from scipy.stats import norm
FWHM=1.2
x=np.linspace(-3,3,50)
rv = norm(scale=FWHM/2.35)
starone=rv.pdf(x)*1000.
rv = norm(scale=FWHM/2.35)
startwo=(rv.pdf(x-1.0))*500.
fig=plt.figure()
ax=fig.add_subplot(121)
ax.plot(x,starone,label="star1")
ax.plot(x,startwo,label="star2")
ax.plot(x,starone+startwo,label="Total")
ax.set_xlabel("Pixel")
ax.set_ylabel("Flux")
ax.legend()
x_pixel=np.arange(7)-3
def pixelize(x,flux,x_pixel):
flux_pixel=np.zeros(len(x_pixel))
for i in xrange(len(x_pixel)):
index=(x>=x_pixel[i]-0.5)*(x<x_pixel[i]+0.5)
flux_pixel[i]=np.sum(flux[index])
return flux_pixel
starone_pixel=pixelize(x,starone,x_pixel)
startwo_pixel=pixelize(x,startwo,x_pixel)
ax2=fig.add_subplot(122)
ax2.plot(x_pixel,starone_pixel,label="star1")
ax2.plot(x_pixel,startwo_pixel,label="star2")
total=starone_pixel+startwo_pixel
ax2.plot(x_pixel,total,label="Total")
ax2.set_xlabel("Pixel")
#ax2.set_ylabel("Flux")
#ax2.legend()
plt.show()
Let's first create a Eclipse signal that makes star one dimmer by 10%.
In [3]:
fig=plt.figure()
ax=fig.add_subplot(121)
ax.plot(x,starone*0.9,label="star1")
ax.plot(x,startwo,label="star2")
ax.plot(x,starone*0.9+startwo,label="Total")
ax.set_xlabel("Pixel")
ax.set_ylabel("Flux")
ax.legend()
starone_pixel_ec=pixelize(x,starone*0.9,x_pixel)
startwo_pixel_ec=pixelize(x,startwo,x_pixel)
ax2=fig.add_subplot(122)
ax2.plot(x_pixel,starone_pixel_ec,label="star1")
ax2.plot(x_pixel,startwo_pixel_ec,label="star2")
total_ec=starone_pixel_ec+startwo_pixel_ec
ax2.plot(x_pixel,total_ec,label="Total")
ax2.set_xlabel("Pixel")
#ax2.set_ylabel("Flux")
plt.show()
Let's look at the actual dimming of two stars assuming we take a radius equals to one apperture around both stars.
In [4]:
plt.scatter(np.arange(2),[np.sum(total[2:5]),np.sum(total_ec[2:5])],marker='o',s=50,label='star1_obs')
plt.scatter(np.arange(2),[np.sum(total[3:6]),np.sum(total_ec[3:6])],marker='d',s=50,label='star2_obs')
plt.legend()
Out[4]:
Both star dimmed, however, star 1 dimed by about 7%, while star 2 dimed by slightly less than 6%.
In [5]:
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
In [47]:
def make_ROC_curve(testY, predY, name):
fig2 = plt.figure()
ax= fig2.add_subplot(1,1,1)
fpr, tpr, _ = roc_curve(testY, predY)
ax.plot(fpr, tpr, label = name)
ax.set_title(('ROC Curve for %s') % name)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
def collect_lc_feature(idlist):
LCfeature=np.zeros([len(idlist),481])
count=0
for i in idlist:
#print i
infile="LTFsmall/"+str(i)+".ltf"
lc=np.loadtxt(infile)[:,1]
LCfeature[count,0]=i
LCfeature[count,1:]=lc
count+=1
return LCfeature
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, ['false positives', 'transits'], rotation=45)
plt.yticks(tick_marks, ['false positives', 'transits'])
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
def fit(model,name,data,cv=True):
trainX,trainY,testX,testY,X,Y=data
model.fit(trainX, trainY)
predY = model.predict(testX)
f1score = metrics.f1_score(testY, predY)
cm = metrics.confusion_matrix(testY, predY)
plot_confusion_matrix(cm)
predY=model.predict_proba(testX)[:,1]
rocscore = metrics.roc_auc_score(testY, predY)
precision, recall, thresholds = metrics.precision_recall_curve(testY, predY)
aucscore=metrics.auc(precision,recall,reorder=True)
print "#####################################"
print "Result using",model
print "f1 score from train test split %f" % f1score
print "roc score from train test split %f" % rocscore
print "auc score from train test split %f" % aucscore
if cv:
f1cvscore= cross_val_score(model, X, Y, cv = 5, scoring = 'f1')
cvscore= cross_val_score(model, X, Y, cv = 5, scoring = 'roc_auc')
print "roc_auc score from CV5 %f +- %f" % (np.mean(cvscore),np.std(cvscore))
print "f1 score from CV5 %f +- %f" % (np.mean(f1cvscore),np.std(f1cvscore))
print cm
make_ROC_curve(testY,predY,name)
return
In [7]:
df=pd.read_csv("TESSfield_19h_44d_combinedfeatures.csv")
Let's first have compare the Manuel selected signals with the catalog selected signals.
As a reminder:
In [8]:
#print df.shape
plt.plot(df['ManuleY']+np.random.randn(df.shape[0])*0.1,df['CatalogY']+np.random.randn(df.shape[0])*0.1,'.')
plt.xlim([-0.4,1.3])
plt.ylim([-0.4,1.3])
plt.xlabel('ManuleY')
plt.ylabel('CatalogY')
Out[8]:
There are four groups of cluster above:
The answer to the question is blending. And our task today is to further distinuish the stars affected by blending and those with true astrophysical signals.
In [9]:
Manule_selected=df.loc[df['ManuleY']==1,:]
Manule_selected.shape
Out[9]:
We drop all the irrelevant columns and all the informations from catalog to creat our innitial feature set.
In [10]:
X=Manule_selected.drop(['Ids','CatalogY','ManuleY','CombinedY','Catalog_Period','Depth','Catalog_Epoch','SNR','Catmag','CatX','CatY'],axis=1)
In [11]:
Y=Manule_selected['CatalogY']
In [12]:
trainX, testX, trainY, testY= train_test_split(X, Y,test_size = 0.2)
data=[trainX,trainY,testX,testY,X,Y]
print X.shape, Y[Y==1].shape
This is a slightly inbalanced sample, with the false sample twice as large as the true samples.
In [26]:
model=RandomForestClassifier(n_estimators=3000,n_jobs=-1)
name="RFC"
fit(model,name,data)
Note that we added the new auc score which computes the area under the precision recall curve.
In [27]:
model=GradientBoostingClassifier(n_estimators=3000)
name="GBC"
fit(model,name,data)
In [28]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=3000)
model.fit(trainX,trainY)
name="XGBoost"
fit(model,name,data)
Let's make some new features: One feature that makes sense is the to compare the detected period of planet with any other detected signals and compute an dimentionless distance.
In [84]:
import scipy as sp
from scipy import special
def find_distance(Parr,Earr,threshold=0.005):
#print Parr
dist_P=np.zeros(len(Parr))+0.5
dist_E=np.zeros(len(Parr))+0.5
for i in xrange(len(Parr)):
for j in xrange(len(Parr)):
if i==j:
continue
if Parr[j]>Parr[i]:
delt_P= np.abs(np.abs(Parr[j]-Parr[i])/Parr[i]-round(np.abs(Parr[j]-Parr[i])/Parr[i]))
else:
delt_P= np.abs(np.abs(Parr[j]-Parr[i])/Parr[j]-round(np.abs(Parr[j]-Parr[i])/Parr[j]))
if dist_P[i]>delt_P:
dist_P[i]=delt_P
delt_E=np.abs(np.abs(Earr[j]-Earr[i])/Parr[i]-round(np.abs(Earr[j]-Earr[i])/Parr[i]))
if dist_E[i]>delt_E:
dist_E[i]=delt_E
if dist_P[i]==0:
dist_P[i]=10
else:
dist_P[i]=np.sqrt(2)*sp.special.erfcinv(dist_P[i])
#print i, dist_P[i]
if dist_E[i]==0:
dist_E[i]=10
else:
dist_E[i]=np.sqrt(2)*sp.special.erfcinv(dist_E[i])
return [dist_P,dist_E]
#add back the magnitude of stars from the catalog
X=Manule_selected.drop(['Ids','CatalogY','ManuleY','CombinedY','Catalog_Period','Depth','Catalog_Epoch','SNR','CatX','CatY'],axis=1)
Pdist,Edist=find_distance(np.array(X['BLS_Period_1_0']),np.array(X['BLS_Tc_1_0']))
X['dist_E']=Edist
X['dist_P']=Pdist
X['diff_mag']=X['Catmag']-X['BLS_OOTmag_1_0']
plt.plot(Pdist,Edist,'.')
Out[84]:
In [85]:
trainX, testX, trainY, testY= train_test_split(X, Y,test_size = 0.2)
data=[trainX,trainY,testX,testY,X,Y]
print X.shape, Y[Y==1].shape
In [50]:
model=RandomForestClassifier(n_estimators=5000,class_weight={0:1,1:3},n_jobs=-1)
name="RFC"
fit(model,name,data)
In [51]:
model=GradientBoostingClassifier(n_estimators=3000)
name="GBC"
fit(model,name,data)
In [86]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=3000,max_depth=18,colsample_bytree=1.0,min_child_weight=1.0,learning_rate=0.001)
model.fit(trainX,trainY)
name="XGBoost"
fit(model,name,data)
In [21]:
featurelist=X.columns
rfc= RandomForestClassifier(n_estimators=3000)
rfc.fit(trainX, trainY)
Out[21]:
In [22]:
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
print indices.shape,trainX.shape
# Print the feature ranking
print("Feature ranking:")
threshold=0.03
droplist=[]
for f in range(X.shape[1]):
try:
if importances[indices[f]]<threshold:
droplist.append(featurelist[indices[f]])
print("%d. feature %d (%s %f)" % (f + 1, indices[f], featurelist[indices[f]],importances[indices[f]]))
except:
pass
In [23]:
X_selected=X.drop(droplist,axis=1)
X_selected.head()
Out[23]:
In [24]:
model=RandomForestClassifier(n_estimators=3000,n_jobs=-1,class_weight='balanced_subsample')
name="RFC"
fit(model,name,data)
In [ ]: