20 September 2017
Notebook for testing sklearn algorithms for the selection of $z > 3.5$ quasars from SDSS+SpIES.
In [1]:
# Read in data file
%matplotlib inline
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt
data = Table.read('/Users/johntimlin/Catalogs/QSO_candidates/Training_set/GTR-ADM-QSO-Trainingset-with-McGreer-VVDS-DR12Q_splitlabel_VCVcut_best.fits')
labname = 'labels'
# X is in the format needed for all of the sklearn tools; it contains just the colors
X = np.vstack([np.asarray(data[name]) for name in ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']]).T
y = np.array(data[labname])
# If we want to make a scaled version of X (i.e., set the mean of each attribute to zero and the variance to 1)
from sklearn.preprocessing import StandardScaler
XS = StandardScaler().fit_transform(X)
# List the attributes that are available to train on
print(data.keys())
In [2]:
# Make training and test sets
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report
# Split the training data into training and test sets for cross-validation
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)
# Do the same for the scaled data sets
XStrain, XStest, yStrain, yStest = train_test_split(XS, y, test_size=0.25, random_state=42)
In [5]:
import matplotlib.gridspec as gridspec
# Masks for the quasars (label 0) in the scaled training and test sets
gdx = yStrain == 0
bdx = yStest == 0
fig = plt.figure(1, figsize=(12, 12))
gs = gridspec.GridSpec(3, 2)
axes = [plt.subplot(gs[j]) for j in range(6)]
lab = ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']
# Adjacent color-color diagrams: training set in blue (quasars in cyan),
# test set in red (quasars in yellow)
for i in range(5):
    plt.axes(axes[i])
    plt.scatter(XStrain[:, i], XStrain[:, i+1], s=1, alpha=0.5, color='b')
    plt.scatter(XStrain[:, i][gdx], XStrain[:, i+1][gdx], s=1, alpha=1, color='c')
    plt.scatter(XStest[:, i], XStest[:, i+1], s=1, alpha=0.5, color='r')
    plt.scatter(XStest[:, i][bdx], XStest[:, i+1][bdx], s=1, alpha=1, color='y')
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])
plt.show()
In [6]:
import matplotlib.gridspec as gridspec
# Mask for the quasars (label 0) in the full sample
gdx = y == 0
fig = plt.figure(1, figsize=(12, 12))
gs = gridspec.GridSpec(3, 2)
axes = [plt.subplot(gs[j]) for j in range(6)]
lab = ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']
# Compare the scaled colors (blue; quasars in cyan) with the
# unscaled colors (red; quasars in yellow)
for i in range(5):
    plt.axes(axes[i])
    plt.scatter(XS[:, i], XS[:, i+1], s=1, alpha=0.5, color='b')
    plt.scatter(XS[:, i][gdx], XS[:, i+1][gdx], s=1, alpha=0.5, color='c')
    plt.scatter(X[:, i], X[:, i+1], s=1, alpha=0.5, color='r')
    plt.scatter(X[:, i][gdx], X[:, i+1][gdx], s=1, alpha=0.5, color='y')
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])
plt.show()
In [18]:
# Neural Network
from sklearn.neural_network import MLPClassifier
# Instantiate the classifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# Train the classifier with the training subset
nn.fit(Xtrain, ytrain)
# Now predict the labels for the test data
# (timing comparisons with the other methods are done with %timeit further below)
ypred = nn.predict(Xtest)
In [19]:
# The classification report gives:
# efficiency = precision
# completeness = recall
# while the f1-score is a combination of the two
print(classification_report(ytest, ypred, target_names=['QSOs', 'stars']))
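For reference, in terms of true/false positives and negatives: precision (efficiency) is $TP/(TP+FP)$, recall (completeness) is $TP/(TP+FN)$, and the f1-score is their harmonic mean, $2PR/(P+R)$.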
So that is 70% complete with 12% contamination (where some of that contamination will be from $z<3.5$ quasars rather than from stars).
The above only classifies the 25% of the data that was held out as a test set. To get self-classifications for all of the training objects, we can instead do a cross-validation, which tests on each of N different subsamples in turn and combines the results.
In [20]:
N = 5
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
ypredCV = cross_val_predict(clf, X, y, cv=N) # Note that we use X and y and not Xtrain and ytrain
print(classification_report(y, ypredCV, target_names=['QSOs', 'stars']))
In [3]:
from sklearn.svm import SVC
svm = SVC(random_state=42)
svm.fit(Xtrain, ytrain)
ypredSVM = svm.predict(Xtest)
print(classification_report(ytest, ypredSVM, target_names=['QSOs', 'stars']))
Pretty good: 81% completeness and 89% efficiency.
Do it again with the scaled data to see whether that makes any difference. It doesn't seem to for colors alone, but it might for other attributes.
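As an aside, a cleaner way to combine scaling with cross-validation is a Pipeline, which fits the scaler on each training fold only. A minimal sketch, not run here:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Scale and classify in one estimator, so the scaler never sees the test fold
pipe = make_pipeline(StandardScaler(), SVC(random_state=42))
ypredCVpipe = cross_val_predict(pipe, X, y, cv=5)
print(classification_report(y, ypredCVpipe, target_names=['QSOs', 'stars']))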
In [4]:
from sklearn.svm import SVC
svm = SVC(random_state=42)
svm.fit(XStrain, yStrain)
ySpredSVM = svm.predict(XStest)
print(classification_report(yStest, ySpredSVM, target_names=['QSOs', 'stars']))
In [10]:
print(len(ySpredSVM), len(yStest))
good = ySpredSVM == 0
qq = ((yStest==0) & (ySpredSVM==0))
ss = ((yStest==1) & (ySpredSVM==1))
qs = ((yStest==0) & (ySpredSVM==1))
sq = ((yStest==1) & (ySpredSVM==0))
print(len(ySpredSVM[qq]), "quasars selected as quasars")
print(len(ySpredSVM[qs]), "quasars selected as stars")
print(len(ySpredSVM[sq]), "stars selected as quasars")
In [12]:
ypredCVSVM = cross_val_predict(svm, XS, y)
data['ypred'] = ypredCVSVM
qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]
print(classification_report(y, ypredCVSVM, target_names=['QSOs', 'stars']))
print(len(dataqq), "quasars selected as quasars")
print(len(dataqs), "quasars selected as stars")
print(len(datasq), "stars selected as quasars")
fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')
Plot the colors of the classified objects from above
In [13]:
import matplotlib.gridspec as gridspec
# Color-color diagrams of the objects classified as quasars:
# true quasars (qq) in blue, stars selected as quasars (sq) in green
fig = plt.figure(5, figsize=(12, 12))
gs = gridspec.GridSpec(3, 2)
axes = [plt.subplot(gs[j]) for j in range(6)]
lab = ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']
for i in range(5):
    plt.axes(axes[i])
    plt.scatter(data[lab[i]][qq], data[lab[i+1]][qq], s=1, alpha=0.5, color='b')
    plt.scatter(data[lab[i]][sq], data[lab[i+1]][sq], s=1, alpha=0.5, color='g')
    #plt.scatter(data[lab[i]][qs], data[lab[i+1]][qs], s=1, alpha=0.5, color='r')
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])
# Each color vs. spectroscopic redshift: full sample in yellow,
# qq in blue, sq in green
fig = plt.figure(6, figsize=(12, 12))
gs = gridspec.GridSpec(6, 1)
axes = [plt.subplot(gs[0])]
for j in range(1, 6):
    axes.append(plt.subplot(gs[j], sharex=axes[0]))
for i in range(6):
    plt.axes(axes[i])
    plt.scatter(data['zspec'], data[lab[i]], s=1, alpha=0.5, color='y')
    plt.scatter(data['zspec'][qq], data[lab[i]][qq], s=1, alpha=0.5, color='b')
    plt.scatter(data['zspec'][sq], data[lab[i]][sq], s=1, alpha=0.5, color='g')
    plt.xlabel('zspec')
    plt.ylabel(lab[i])
plt.show()
Now we'll try a DecisionTree, a RandomForest, and an ExtraTrees classifier.
Note that n_jobs=-1 is supposed to let it use multiple processors if it can, but I'm honestly not sure how that works (and I'm also not convinced that it isn't causing problems, as sometimes when I use it I get a warning that it is setting n_jobs=1 instead).
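A quick sanity check on the number of cores that are actually visible (standard library only; this doesn't explain the warning, but it rules out a one-core machine):
import multiprocessing
# n_jobs=-1 asks sklearn to use all of these
print(multiprocessing.cpu_count())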
In [7]:
# Random Forests, etc.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
clfDTC = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=42)
clfRFC = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
clfETC = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42)
clfDTC.fit(Xtrain, ytrain)
clfRFC.fit(Xtrain, ytrain)
clfETC.fit(Xtrain, ytrain)
ypredDTC = clfDTC.predict(Xtest)
ypredRFC = clfRFC.predict(Xtest)
ypredETC = clfETC.predict(Xtest)
In [8]:
print(clfDTC.feature_importances_)
print(clfRFC.feature_importances_)
print(clfETC.feature_importances_)
print(classification_report(ytest, ypredDTC, target_names=['QSOs', 'stars']))
print(classification_report(ytest, ypredRFC, target_names=['QSOs', 'stars']))
print(classification_report(ytest, ypredETC, target_names=['QSOs', 'stars']))
In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
# Decision tree, random forest, and extra trees on the scaled data
clfDTC = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=42)
clfRFC = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
clfETC = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42)
clfDTC.fit(XStrain, yStrain)
clfRFC.fit(XStrain, yStrain)
clfETC.fit(XStrain, yStrain)
ypredDTC = clfDTC.predict(XStest)
ypredRFC = clfRFC.predict(XStest)
ypredETC = clfETC.predict(XStest)
print(clfDTC.feature_importances_)
print(clfRFC.feature_importances_)
print(clfETC.feature_importances_)
print(classification_report(yStest, ypredDTC, target_names=['QSOs', 'stars']))
print(classification_report(yStest, ypredRFC, target_names=['QSOs', 'stars']))
print(classification_report(yStest, ypredETC, target_names=['QSOs', 'stars']))
In [15]:
ypredCVRFC = cross_val_predict(clfRFC, XS, y)
data['ypred'] = ypredCVRFC
qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]
print(len(dataqq), "quasars selected as quasars")
print(len(dataqs), "quasars selected as stars")
print(len(datasq), "stars selected as quasars")
fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')
Now we'll try a bagging classifier based on K Nearest Neighbors. I did some playing around with max_samples and max_features (both of which run from 0 to 1) and found 0.5 and 1.0 to work best. Note that you have to give 1.0 as a decimal; otherwise it is taken to be 1 feature instead of 100% of them.
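To make that convention concrete (an illustrative snippet, not part of the analysis):
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Integers are interpreted as counts, floats as fractions of the total
bag_one = BaggingClassifier(KNeighborsClassifier(), max_features=1)    # exactly 1 of the 6 colors
bag_all = BaggingClassifier(KNeighborsClassifier(), max_features=1.0)  # all 6 colors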
In [9]:
# Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(Xtrain, ytrain)
ypredBag = bagging.predict(Xtest)
print(classification_report(ytest, ypredBag, target_names=['QSOs', 'stars']))
# Again with n_neighbors=7 (the default is 5)
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(Xtrain, ytrain)
ypredBag = bagging.predict(Xtest)
print "7 neighbors"
print(classification_report(ytest, ypredBag, target_names=['QSOs', 'stars']))
This seems to work better than the RandomForest, so it might be worth trying to optimize the parameters. The first tweak, above, was n_neighbors=7 (the default is 5).
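A more systematic way to tune these would be a grid search; a minimal sketch (the parameter grid here is an illustrative assumption, not a tested range, and newer sklearn versions name the prefix estimator__ rather than base_estimator__):
from sklearn.model_selection import GridSearchCV
param_grid = {'base_estimator__n_neighbors': [5, 7, 9, 11],
              'max_samples': [0.25, 0.5, 0.75]}
grid = GridSearchCV(BaggingClassifier(KNeighborsClassifier(), max_features=1.0, random_state=42),
                    param_grid, scoring='f1', cv=5, n_jobs=-1)
grid.fit(Xtrain, ytrain)
print(grid.best_params_, grid.best_score_)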
Now do the same with the scaled data.
In [28]:
# Bagging Scaled data; 7 neighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(XStrain, yStrain)
ypredBag = bagging.predict(XStest)
print(classification_report(yStest, ypredBag, target_names=['QSOs', 'stars']))
Basically the same with 7 neighbors instead of the default of 5. The scaled data seem to do somewhat better than the unscaled. Try cross-validation.
In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
N = 5
clf = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
ypredCVBAG = cross_val_predict(clf, XS, y, cv=N)
print(classification_report(y, ypredCVBAG, target_names=['QSOs', 'stars']))
In [17]:
#Plot the cross-validation
data['ypred'] = ypredCVBAG
qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]
print(len(dataqq), "quasars selected as quasars")
print(len(dataqs), "quasars selected as stars")
print(len(datasq), "stars selected as quasars")
fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')
In [18]:
# Plot the selection fraction (fraction of true quasars recovered) vs. redshift
bins = np.linspace(0, 5.5, 100)
hz, bins = np.histogram(data['zspec'][data[labname]==0], bins)
recovered, bins = np.histogram(dataqq['zspec'], bins)
frac = np.asarray(recovered)*1.0/np.asarray(hz)
mdpt = (bins[:-1]+bins[1:])/2.0
print(mdpt)
print(hz)
print(recovered)
print(frac)
plt.plot(mdpt, frac)
plt.ylim(0, 1)
In [23]:
# Boosting
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
boost = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
boost.fit(Xtrain, ytrain)
ypredBoost = boost.predict(Xtest)
print(f1_score(ytest, ypredBoost))
print(classification_report(ytest, ypredBoost, target_names=['QSOs', 'stars']))
print(boost.feature_importances_)
Also pretty good, but not the best so far. This algorithm outputs the importances of the features, which show that $u-g$ is the most important, with $r-i$ next.
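To see the ranking explicitly (a small convenience snippet; lab is the same color list used above):
lab = ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']
# Sort the colors by their boosted-tree importance, highest first
for name, imp in sorted(zip(lab, boost.feature_importances_), key=lambda t: -t[1]):
    print(name, imp)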
The main parameter for boosting is the depth of the underlying trees. Let's make that a parameter and do a grid search over a range of depths. The training-set score should always improve with greater depth, but the test-set score will plateau; that plateau marks the depth we want.
In [24]:
# Depth-optimized Boosting
depth = np.arange(1, 10)
score_test = np.zeros(len(depth))
score_train = np.zeros(len(depth))
i_best = 0
# Loop over the depths
for i, d in enumerate(depth):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=d, random_state=42)
    clf.fit(Xtrain, ytrain)
    ypredtrain = clf.predict(Xtrain)
    # We need some way to decide which depth is best;
    # try using the f1_score and comparing the values for the different depths
    score_train[i] = f1_score(ytrain, ypredtrain)
    ypredtest = clf.predict(Xtest)
    score_test[i] = f1_score(ytest, ypredtest)
    if score_test[i] >= score_test[i_best]:
        i_best = i
        ybest = ypredtest
        best_depth = depth[i_best]
# Give the best depth and the corresponding classification report
print(best_depth)
print(classification_report(ytest, ybest, target_names=['QSOs', 'stars']))
In [25]:
# Plot the f1 score vs. depth
fig = plt.figure(figsize=(6, 6))
plt.plot(depth,score_test,label='test')
plt.plot(depth,score_train,label='train')
plt.xlabel("depth")
plt.ylabel("score")
plt.legend()
plt.show()
This suggests that a depth of 3 is optimal.
RandomForests were better than Boosting, so let's try optimizing the depth for RandomForests.
In [26]:
# Depth-optimized Random Forest
depth = np.arange(10, 21)
score_test = np.zeros(len(depth))
score_train = np.zeros(len(depth))
i_best = 0
for i, d in enumerate(depth):
    clf = RandomForestClassifier(n_estimators=10, max_depth=d, min_samples_split=2, random_state=42, n_jobs=-1)
    clf.fit(Xtrain, ytrain)
    ypredtrain = clf.predict(Xtrain)
    score_train[i] = f1_score(ytrain, ypredtrain)
    ypredtest = clf.predict(Xtest)
    score_test[i] = f1_score(ytest, ypredtest)
    if score_test[i] >= score_test[i_best]:
        i_best = i
        ybest = ypredtest
        best_depth = depth[i_best]
print(best_depth)
print(classification_report(ytest, ybest, target_names=['QSOs', 'stars']))
In [27]:
fig = plt.figure(figsize=(6, 6))
plt.plot(depth,score_test,label='test')
plt.plot(depth,score_train,label='train')
plt.xlabel("depth")
plt.ylabel("score")
plt.legend()
plt.show()
So far a depth=17 RandomForest is giving the best results: 78% completeness and 7% contamination.
Though we might be able to optimize more by tweaking the other parameters.
In [28]:
# Try increasing the number of estimators
clf = RandomForestClassifier(n_estimators=20, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
print(classification_report(ytest, ypred, target_names=['QSOs', 'stars']))
Somehow this isn't quite as good, so stick with 10 estimators.
But first we need to produce classifications for all of the objects, using cross-validation and the parameters above.
In [29]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
clf = RandomForestClassifier(n_estimators=10, max_depth=15, min_samples_split=2, n_jobs=-1, random_state=42)
N=5
ypredCV = cross_val_predict(clf, X, y, cv=N)
print(classification_report(y, ypredCV, target_names=['QSOs', 'stars']))
Final result is 79% completeness and 8% contamination.
We should output the classification vector and do some exploring in TOPCAT with it.
Create a new Table column in data filled with the predicted $y$ values, then write it out as FITS.
In [30]:
data['ypred'] = ypredCV
In [57]:
# Write the input file back out with the RF self-classification results added
data.write('GTR-ADM-QSO-ir-testhighz_findbw_lup_2016_starclean_class.fits', format='fits')
Make the same histograms as above.
In [31]:
qq = ((data['labels']==0) & (data['ypred']==0))
ss = ((data['labels']==1) & (data['ypred']==1))
qs = ((data['labels']==0) & (data['ypred']==1))
sq = ((data['labels']==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]
print(len(dataqq), "quasars selected as quasars")
print(len(dataqs), "quasars selected as stars")
print(len(datasq), "stars selected as quasars")
In [32]:
fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')
This is actually worse than for both SVM and Bagging, but it is what I decided to go with for the first comparison with KDE. We'll come back and look at this more in the next notebook. However, it is clear that we might want to use something else if contamination is the main thing that we are worried about.
What is interesting is that about half of the "stars" are quasars with $z\lesssim3.5$. So the contamination is more like 4% than 8%.
Similarly, most of the missed quasars are at $z\gtrsim3$, which isn't all that surprising.
In [38]:
print(len(Xtest))
One of the reasons that I settled on RF is shown by the timing analysis below. While the bagging analysis works well, it takes forever: for just 173,390 sources, bagging takes 32.8 s, compared to 129 ms for RF, while SVM takes 7.45 s.
So the predicted times for classifying 50 million sources are:
RF: 37 seconds
SVM: 35.8 minutes
BAG: 2.62 hours
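Those extrapolations are just linear scalings of the measured times (a quick check):
# Scale the measured prediction times from the 173,390-source test set to 50 million
scale = 50e6 / 173390              # ~288x
print(0.129 * scale, 's')          # RF:  ~37 s
print(7.45 * scale / 60., 'min')   # SVM: ~35.8 min
print(32.8 * scale / 3600., 'hr')  # BAG: ~2.6 hr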
In [34]:
%timeit -n1 ypred = nn.predict(Xtest)
In [35]:
%timeit -n1 ypredRFC = clfRFC.predict(Xtest)
In [37]:
%timeit -n1 ypredBag = bagging.predict(XStest)
In [39]:
%timeit -n1 ySpredSVM = svm.predict(XStest)
Assuming class1 is stars and class2 is quasars:
689365 true stars. 4193 true quasars.
690189 points were labeled as stars.
3369 points were labeled as quasars.
689273 out of 689365 stars were labeled as stars (99.99%).
92 out of 689365 stars were labeled as quasars (0.01335%).
916 out of 4193 quasars were labeled as stars (21.85%).
3277 out of 4193 quasars were labeled as quasars (78.15%).
Overall Accuracy: 99.85%
Weighted Accuracy: 89.07%
Completeness: 78.15%
Efficiency: 97.27%
Rating: 0.7602
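These summary numbers follow from the confusion counts above; a quick recomputation (interpreting "Rating" as completeness times efficiency, which is an assumption on my part):
# Confusion-matrix counts from the summary above
TP, FN, FP, TN = 3277., 916., 92., 689273.
completeness = TP / (TP + FN)                       # 0.7815
efficiency = TP / (TP + FP)                         # 0.9727
accuracy = (TP + TN) / (TP + TN + FP + FN)          # 0.9985
weighted = 0.5 * (TP / (TP + FN) + TN / (TN + FP))  # 0.8907
print(completeness, efficiency, accuracy, weighted, completeness * efficiency)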
This is about as good as the KDE results (at least for self-classification), which got 78% completeness with 97% efficiency (where some of those stars classified as quasars are also likely $z\sim3.5$ quasars). The combination of these suggests that the candidate list that John is using should have 3% or less contamination. We can check that by also testing the photo-z's.
Make a few plots illustrating that not all of the contamination is from stars (but is instead from $z\lesssim3.5$ quasars).
In [68]:
%matplotlib inline
fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(hspace=0.05)
ax1 = fig.add_subplot(211)
ax1.xaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['zspec'],datass['imag'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zspec'],dataqq['imag'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zspec'],dataqs['imag'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zspec'],datasq['imag'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-0.1,6.5])
plt.xlabel('zspec')
plt.ylabel('imag')
ax2 = fig.add_subplot(212, sharex=ax1)
plt.scatter(datass['zspec'],datass['gr'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zspec'],dataqq['gr'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zspec'],dataqs['gr'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zspec'],datasq['gr'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-0.1,6.5])
plt.xlabel('zspec')
plt.ylabel('gr')
plt.show()
In [71]:
fig = plt.figure(figsize=(12, 4))
#fig.subplots_adjust(wspace=0.05)
ax1 = fig.add_subplot(131)
plt.scatter(datass['ug'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['ug'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['ug'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['ug'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-4,8])
plt.ylim([-1.5,1.5])
plt.xlabel('u-g')
plt.ylabel('ch1-ch2')
ax2 = fig.add_subplot(132)
ax2.yaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['ri'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['ri'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['ri'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['ri'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-2,4.5])
plt.ylim([-1.5,1.5])
plt.xlabel('r-i')
plt.ylabel('ch1-ch2')
ax3 = fig.add_subplot(133)
ax3.yaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['zs1'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zs1'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zs1'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zs1'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-3,6])
plt.ylim([-1.5,1.5])
plt.xlabel('z-ch1')
plt.ylabel('ch1-ch2')
plt.show()
These are more revealing as contours or density plots (which I examined in TOPCAT), but I'm going to leave them as they are for now.
See more in further notebooks.