20 September 2017

Notebook for testing sklearn algorithms for selection of $z>3.5$ quasars from SDSS+SpIES.

Training Data


In [1]:
# Read in data file
%matplotlib inline
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt
data = Table.read('/Users/johntimlin/Catalogs/QSO_candidates/Training_set/GTR-ADM-QSO-Trainingset-with-McGreer-VVDS-DR12Q_splitlabel_VCVcut_best.fits')
labname = 'labels'
# X is in the format needed for all of the sklearn tools; it contains just the colors
X = np.vstack([np.asarray(data[name]) for name in ['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2']]).T
y = np.array(data[labname])

# If we want to make a scaled version of X (i.e., set the mean of each attribute to zero and the variance to 1)
from sklearn.preprocessing import StandardScaler 
XS = StandardScaler().fit_transform(X)

# List the attributes that are available to train on
print data.keys()


['ug', 'gr', 'ri', 'iz', 'zs1', 's1s2', 'ra', 'dec', 'iflux', 'zspec', 'extinctu', 'imag', 'morph', 'labels', 'shenlabel', 'hzlabel', 'lzlabel', 'Catalog']

In [2]:
# Make training and test sets
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report

# Split the training data into training and test sets for cross-validation
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)

# Do the same for the scaled data sets
XStrain, XStest, yStrain, yStest = train_test_split(XS, y, test_size=0.25, random_state=42)

In [5]:
import matplotlib.gridspec as gridspec

gdx = yStrain == 0  # quasars (label 0) in the training split
bdx = yStest == 0   # quasars in the test split
fig = plt.figure(1,figsize = (12,12))

gs = gridspec.GridSpec(3,2)#,width_ratios = [1,1,1,1,1,1], height_ratios = [1,1,1,1,1,1])

ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax2 = plt.subplot(gs[2])
ax3 = plt.subplot(gs[3])
ax4 = plt.subplot(gs[4])
ax5 = plt.subplot(gs[5])

name = [ax0,ax1,ax2,ax3,ax4,ax5]
lab = ['ug','gr','ri','iz','zs1','s1s2']

# Plot the five adjacent color-color pairs (the sixth panel is left empty)
for i in range(5):
    plt.axes(name[i])
    plt.scatter(XStrain[:,i],XStrain[:,i+1],s=1,alpha=0.5,color='b')          # training set, all objects
    plt.scatter(XStrain[:,i][gdx],XStrain[:,i+1][gdx],s=1,alpha=1,color='c')  # training quasars
    plt.scatter(XStest[:,i],XStest[:,i+1],s=1,alpha=0.5,color='r')            # test set, all objects
    plt.scatter(XStest[:,i][bdx],XStest[:,i+1][bdx],s=1,alpha=1,color='y')    # test quasars
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])


plt.show()



In [6]:
import matplotlib.gridspec as gridspec

gdx = y == 0  # quasars (label 0) in the full data set

fig = plt.figure(1,figsize = (12,12))

gs = gridspec.GridSpec(3,2)#,width_ratios = [1,1,1,1,1,1], height_ratios = [1,1,1,1,1,1])

ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax2 = plt.subplot(gs[2])
ax3 = plt.subplot(gs[3])
ax4 = plt.subplot(gs[4])
ax5 = plt.subplot(gs[5])

name = [ax0,ax1,ax2,ax3,ax4,ax5]
lab = ['ug','gr','ri','iz','zs1','s1s2']

# Compare scaled (blue/cyan) and unscaled (red/yellow) colors, with quasars highlighted
for i in range(5):
    plt.axes(name[i])
    plt.scatter(XS[:,i],XS[:,i+1],s=1,alpha=0.5,color='b')            # scaled, all objects
    plt.scatter(XS[:,i][gdx],XS[:,i+1][gdx],s=1,alpha=0.5,color='c')  # scaled, quasars
    plt.scatter(X[:,i],X[:,i+1],s=1,alpha=0.5,color='r')              # unscaled, all objects
    plt.scatter(X[:,i][gdx],X[:,i+1][gdx],s=1,alpha=0.5,color='y')    # unscaled, quasars
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])


plt.show()



Neural Network Classification

Let's start by trying a neural network. (More because that's the last thing that I taught than anything else.) Set random_state so that this is repeatable.


In [18]:
# Neural Network
from sklearn.neural_network import MLPClassifier
# Instantiate the classifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# Train the classifier with the training subset
nn.fit(Xtrain, ytrain)

# Now predict the labels for the test data
# (timing comparisons with the other classifiers are done at the end of the notebook)
ypred = nn.predict(Xtest)

In [19]:
# The classification report gives:
# efficiency = precision
# completeness = recall
# while the f1-score is a combination of the two
print(classification_report(ytest, ypred, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.87      0.80      0.83      5670
      stars       0.99      1.00      0.99    169554

avg / total       0.99      0.99      0.99    175224

So that is 80% complete with 13% contamination (where some of that contamination will be from $z<3.5$ quasars rather than from stars).
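
In confusion-matrix terms, with quasars as the positive class:

$$\mathrm{completeness} = \mathrm{recall} = \frac{TP}{TP+FN}, \qquad \mathrm{efficiency} = \mathrm{precision} = \frac{TP}{TP+FP}, \qquad \mathrm{contamination} = 1 - \mathrm{precision}.$$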

The above only classifies the 25% of the data held out as the test set. To get self-classification labels for all of the training objects we can instead use cross-validation, which rotates through $N$ different subsamples for testing and combines the results.


In [20]:
N = 5
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
ypredCV = cross_val_predict(clf, X, y, cv=N) # Note that we use X and y and not Xtrain and ytrain
print(classification_report(y, ypredCV, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.83      0.80      0.82     22752
      stars       0.99      0.99      0.99    678142

avg / total       0.99      0.99      0.99    700894


SVM Classification

Now try classification with SVM.


In [3]:
from sklearn.svm import SVC
svm = SVC(random_state=42)
svm.fit(Xtrain, ytrain)

ypredSVM = svm.predict(Xtest)
print(classification_report(ytest, ypredSVM, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.93      0.79      0.86      1304
      stars       1.00      1.00      1.00    173889

avg / total       1.00      1.00      1.00    175193

Pretty good: 79% completeness and 93% efficiency.

Do it again with scaled data to see if that makes any difference. For colors alone it only helps modestly (recall of 0.83 versus 0.79 here), but it might matter more for other attributes.
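
One caveat: StandardScaler was fit to all of X above, before the train/test split, which leaks a little information about the test set into the scaling. A minimal leak-free sketch using sklearn's Pipeline (same Xtrain/ytrain as above; ypredPipe is just an illustrative name):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# The scaler is fit on the training data only, then applied to the test data
pipe = make_pipeline(StandardScaler(), SVC(random_state=42))
pipe.fit(Xtrain, ytrain)
ypredPipe = pipe.predict(Xtest)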


In [4]:
from sklearn.svm import SVC

svm = SVC(random_state=42)
svm.fit(XStrain, yStrain)

ySpredSVM = svm.predict(XStest)
print(classification_report(yStest, ySpredSVM, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.93      0.83      0.88      1304
      stars       1.00      1.00      1.00    173889

avg / total       1.00      1.00      1.00    175193


In [10]:
print len(ySpredSVM), len(yStest)
good = ySpredSVM == 0

qq = ((yStest==0) & (ySpredSVM==0))
ss = ((yStest==1) & (ySpredSVM==1))
qs = ((yStest==0) & (ySpredSVM==1))
sq = ((yStest==1) & (ySpredSVM==0))


print len(ySpredSVM[qq]), "quasars selected as quasars"
print len(ySpredSVM[qs]), "quasars selected as stars"
print len(ySpredSVM[sq]), "stars selected as quasars"


175193 175193
1081 quasars selected as quasars
223 quasars selected as stars
76 stars selected as quasars

In [12]:
ypredCVSVM = cross_val_predict(svm, XS, y)
data['ypred'] = ypredCVSVM

qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]

print(classification_report(y, ypredCVSVM, target_names=['QSOs', 'stars']))

print len(dataqq), "quasars selected as quasars"
print len(dataqs), "quasars selected as stars"
print len(datasq), "stars selected as quasars"

fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')


             precision    recall  f1-score   support

       QSOs       0.91      0.82      0.86      5353
      stars       1.00      1.00      1.00    695419

avg / total       1.00      1.00      1.00    700772

4375 quasars selected as quasars
978 quasars selected as stars
418 stars selected as quasars
Out[12]:
<matplotlib.text.Text at 0x129b0be10>

Plot the colors of the classified objects from above


In [13]:
import matplotlib.gridspec as gridspec


fig = plt.figure(5,figsize = (12,12))

gs = gridspec.GridSpec(3,2)#,width_ratios = [1,1,1,1,1,1], height_ratios = [1,1,1,1,1,1])

ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax2 = plt.subplot(gs[2])
ax3 = plt.subplot(gs[3])
ax4 = plt.subplot(gs[4])
ax5 = plt.subplot(gs[5])

name = [ax0,ax1,ax2,ax3,ax4,ax5]
lab = ['ug','gr','ri','iz','zs1','s1s2']

for i in range(5):
    plt.axes(name[i])
    plt.scatter(data[lab[i]][qq],data[lab[i+1]][qq],s=1,alpha=0.5,color='b')  # quasars selected as quasars
    plt.scatter(data[lab[i]][sq],data[lab[i+1]][sq],s=1,alpha=0.5,color='g')  # stars selected as quasars
    #plt.scatter(data[lab[i]][qs],data[lab[i+1]][qs],s=1,alpha=0.5,color='r')
    #plt.scatter(XStest[:,i][bdx],XStest[:,i+1][bdx],s=1,alpha=1,color='y')
    plt.xlabel(lab[i])
    plt.ylabel(lab[i+1])

fig = plt.figure(6,figsize = (12,12))

gs = gridspec.GridSpec(6,1)#,width_ratios = [1,1,1,1,1,1], height_ratios = [1,1,1,1,1,1])

ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1],sharex = ax0)
ax2 = plt.subplot(gs[2],sharex = ax0)
ax3 = plt.subplot(gs[3],sharex = ax0)
ax4 = plt.subplot(gs[4],sharex = ax0)
ax5 = plt.subplot(gs[5],sharex = ax0)

name = [ax0,ax1,ax2,ax3,ax4,ax5]
lab = ['ug','gr','ri','iz','zs1','s1s2']

for i in range(6):
    plt.axes(name[i])
    plt.scatter(data['zspec'],data[lab[i]],s=1,alpha=0.5,color='y')          # all objects
    plt.scatter(data['zspec'][qq],data[lab[i]][qq],s=1,alpha=0.5,color='b')  # quasars selected as quasars
    plt.scatter(data['zspec'][sq],data[lab[i]][sq],s=1,alpha=0.5,color='g')  # stars selected as quasars
    #plt.scatter(data[lab[i]][qs],data[lab[i+1]][qs],s=1,alpha=0.5,color='r')
    plt.xlabel('zspec')
    plt.ylabel(lab[i])
    
plt.show()



Random Forest Classification

Now we'll try a DecisionTree, a RandomForest, and an ExtraTrees classifier.

Note that n_jobs=-1 is supposed to let the classifier use multiple processors when available, but I'm honestly not sure how that works (and I'm also not convinced that it isn't causing problems, as sometimes when I use it I get a warning that it is setting n_jobs=1 instead).
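
A quick way to check how many processes n_jobs=-1 can request (standard library only, nothing sklearn-specific):

from multiprocessing import cpu_count
print cpu_count()  # n_jobs=-1 asks for one job per core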


In [7]:
# Random Forests, etc.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

clfDTC = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=42)
clfRFC = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
clfETC = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42)

clfDTC.fit(Xtrain, ytrain)
clfRFC.fit(Xtrain, ytrain)
clfETC.fit(Xtrain, ytrain)

ypredDTC = clfDTC.predict(Xtest)
ypredRFC = clfRFC.predict(Xtest)
ypredETC = clfETC.predict(Xtest)

In [8]:
print clfDTC.feature_importances_
print clfRFC.feature_importances_
print clfETC.feature_importances_

print(classification_report(ytest, ypredDTC, target_names=['QSOs', 'stars']))
print(classification_report(ytest, ypredRFC, target_names=['QSOs', 'stars']))
print(classification_report(ytest, ypredETC, target_names=['QSOs', 'stars']))


[ 0.19203536  0.30239037  0.275911    0.10797368  0.08415375  0.03753584]
[ 0.22725437  0.32311411  0.16790707  0.12476938  0.10498243  0.05197265]
[ 0.17785105  0.22404359  0.26680527  0.15027406  0.10742968  0.07359635]
             precision    recall  f1-score   support

       QSOs       0.79      0.78      0.78      1331
      stars       1.00      1.00      1.00    173893

avg / total       1.00      1.00      1.00    175224

             precision    recall  f1-score   support

       QSOs       0.91      0.80      0.85      1331
      stars       1.00      1.00      1.00    173893

avg / total       1.00      1.00      1.00    175224

             precision    recall  f1-score   support

       QSOs       0.92      0.80      0.85      1331
      stars       1.00      1.00      1.00    173893

avg / total       1.00      1.00      1.00    175224


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

#Decision tree on scaled data
clfDTC = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=42)
clfRFC = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42, n_jobs=-1)
clfETC = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42)

clfDTC.fit(XStrain, yStrain)
clfRFC.fit(XStrain, yStrain)
clfETC.fit(XStrain, yStrain)

ypredDTC = clfDTC.predict(XStest)
ypredRFC = clfRFC.predict(XStest)
ypredETC = clfETC.predict(XStest)

print clfDTC.feature_importances_
print clfRFC.feature_importances_
print clfETC.feature_importances_

print(classification_report(yStest, ypredDTC, target_names=['QSOs', 'stars']))
print(classification_report(yStest, ypredRFC, target_names=['QSOs', 'stars']))
print(classification_report(yStest, ypredETC, target_names=['QSOs', 'stars']))


[ 0.1025114   0.43704589  0.24878018  0.10074349  0.07734252  0.03357652]
[ 0.21104035  0.29221346  0.20197148  0.11379188  0.11847108  0.06251175]
[ 0.17931039  0.26450605  0.23664625  0.15432442  0.0908319   0.07438099]
             precision    recall  f1-score   support

       QSOs       0.80      0.79      0.79      1304
      stars       1.00      1.00      1.00    173889

avg / total       1.00      1.00      1.00    175193

             precision    recall  f1-score   support

       QSOs       0.91      0.82      0.87      1304
      stars       1.00      1.00      1.00    173889

avg / total       1.00      1.00      1.00    175193

             precision    recall  f1-score   support

       QSOs       0.93      0.81      0.87      1304
      stars       1.00      1.00      1.00    173889

avg / total       1.00      1.00      1.00    175193


In [15]:
ypredCVRFC = cross_val_predict(clfRFC, XS, y)
data['ypred'] = ypredCVRFC

qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]

print len(dataqq), "quasars selected as quasars"
print len(dataqs), "quasars selected as stars"
print len(datasq), "stars selected as quasars"

fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')


4328 quasars selected as quasars
1025 quasars selected as stars
525 stars selected as quasars
Out[15]:
<matplotlib.text.Text at 0x125446890>

Bagging

Now we'll try a bagging classifier, based on K Nearest Neighbors. I did some playing around with max_samples and max_features (both of which run from 0 to 1 as fractions) and found 0.5 and 1.0 to work best. Note that you have to give 1.0 as a decimal; an integer 1 is taken to mean 1 feature instead of 100% of them.
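
To make the integer-versus-float distinction concrete (a sketch, nothing is fit here):

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

bag_one = BaggingClassifier(KNeighborsClassifier(), max_features=1)    # int: exactly one feature per estimator
bag_all = BaggingClassifier(KNeighborsClassifier(), max_features=1.0)  # float: 100% of the features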


In [9]:
# Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(Xtrain, ytrain)
ypredBag = bagging.predict(Xtest)
                           
print(classification_report(ytest, ypredBag, target_names=['QSOs', 'stars']))

# Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(Xtrain, ytrain)
ypredBag = bagging.predict(Xtest)
print "7 neighbors"
print(classification_report(ytest, ypredBag, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.93      0.76      0.84      1498
      stars       1.00      1.00      1.00    173726

avg / total       1.00      1.00      1.00    175224

7 neighbors
             precision    recall  f1-score   support

       QSOs       0.94      0.75      0.83      1498
      stars       1.00      1.00      1.00    173726

avg / total       1.00      1.00      1.00    175224

This seems to work better than the RandomForest, so it might be worth optimizing the parameters. The second run above uses n_neighbors=7 rather than the default of 5.


Now do the same with the scaled data.


In [28]:
# Bagging Scaled data; 7 neighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
bagging.fit(XStrain, yStrain)
ypredBag = bagging.predict(XStest)
                           
print(classification_report(yStest, ypredBag, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.94      0.79      0.86      1325
      stars       1.00      1.00      1.00    173787

avg / total       1.00      1.00      1.00    175112

Basically the same with 7 neighbors instead of the default of 5. Scaled seems to be somewhat better than not. Try cross-validation.


In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
N = 5
clf = BaggingClassifier(KNeighborsClassifier(n_neighbors=7), max_samples=0.5, max_features=1.0, random_state=42, n_jobs=-1)
ypredCVBAG = cross_val_predict(clf, XS, y, cv=N)
print(classification_report(y, ypredCVBAG, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.92      0.81      0.86      5353
      stars       1.00      1.00      1.00    695419

avg / total       1.00      1.00      1.00    700772


In [17]:
#Plot the cross-validation
data['ypred'] = ypredCVBAG
qq = ((data[labname]==0) & (data['ypred']==0))
ss = ((data[labname]==1) & (data['ypred']==1))
qs = ((data[labname]==0) & (data['ypred']==1))
sq = ((data[labname]==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]

print len(dataqq), "quasars selected as quasars"
print len(dataqs), "quasars selected as stars"
print len(datasq), "stars selected as quasars"

fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')


4313 quasars selected as quasars
1040 quasars selected as stars
399 stars selected as quasars
Out[17]:
<matplotlib.text.Text at 0x12b387090>

In [18]:
# Plot the selection fraction (completeness) as a function of redshift
bins=np.linspace(0,5.5,100)
hz, bins = np.histogram(data['zspec'][data[labname]==0],bins)
recovered,bins = np.histogram(dataqq['zspec'],bins)

frac = np.asarray(recovered)*1.0/np.asarray(hz)  # empty bins give 0/0 = NaN, hence the RuntimeWarning below
mdpt = (bins[:-1]+bins[1:])/2.0

print mdpt
print hz
print recovered

print frac

plt.plot(mdpt, frac)
plt.ylim(0,1)


[ 0.02777778  0.08333333  0.13888889  0.19444444  0.25        0.30555556
  0.36111111  0.41666667  0.47222222  0.52777778  0.58333333  0.63888889
  0.69444444  0.75        0.80555556  0.86111111  0.91666667  0.97222222
  1.02777778  1.08333333  1.13888889  1.19444444  1.25        1.30555556
  1.36111111  1.41666667  1.47222222  1.52777778  1.58333333  1.63888889
  1.69444444  1.75        1.80555556  1.86111111  1.91666667  1.97222222
  2.02777778  2.08333333  2.13888889  2.19444444  2.25        2.30555556
  2.36111111  2.41666667  2.47222222  2.52777778  2.58333333  2.63888889
  2.69444444  2.75        2.80555556  2.86111111  2.91666667  2.97222222
  3.02777778  3.08333333  3.13888889  3.19444444  3.25        3.30555556
  3.36111111  3.41666667  3.47222222  3.52777778  3.58333333  3.63888889
  3.69444444  3.75        3.80555556  3.86111111  3.91666667  3.97222222
  4.02777778  4.08333333  4.13888889  4.19444444  4.25        4.30555556
  4.36111111  4.41666667  4.47222222  4.52777778  4.58333333  4.63888889
  4.69444444  4.75        4.80555556  4.86111111  4.91666667  4.97222222
  5.02777778  5.08333333  5.13888889  5.19444444  5.25        5.30555556
  5.36111111  5.41666667  5.47222222]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 539 632 602 576 476 476 375 300 252
 203 194 157 110 101  95  67  59  51  53  35   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 307 499 528 510 425 420 326 265 217
 176 163 122  81  66  69  49  37  28  21   4   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0]
[        nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan  0.56957328  0.78955696  0.87707641
  0.88541667  0.89285714  0.88235294  0.86933333  0.88333333  0.86111111
  0.86699507  0.84020619  0.77707006  0.73636364  0.65346535  0.72631579
  0.73134328  0.62711864  0.54901961  0.39622642  0.11428571         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan]
/Users/johntimlin/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: RuntimeWarning: invalid value encountered in divide
Out[18]:
(0, 1)
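
The RuntimeWarning above just reflects the empty redshift bins (0/0). A sketch that restricts the division to occupied bins (same hz, recovered, and mdpt as in the cell above):

nz = hz > 0
frac = np.full(len(hz), np.nan)
frac[nz] = recovered[nz]*1.0/hz[nz]
plt.plot(mdpt[nz], frac[nz])
plt.ylim(0,1)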

In [55]:
#print selection fraction
bins=np.linspace(0,5.5,100)
hz, bins = np.histogram(data['zspec'][data[labname]==0],bins)
recovered,bins = np.histogram(dataqq['zspec'],bins)

frac = np.asarray(recovered)*1.0/np.asarray(hz)
mdpt = (bins[:-1]+bins[1:])/2.0

print mdpt
print hz
print recovered

print frac

plt.plot(mdpt, frac)
plt.ylim(0,1)


[ 0.02777778  0.08333333  0.13888889  0.19444444  0.25        0.30555556
  0.36111111  0.41666667  0.47222222  0.52777778  0.58333333  0.63888889
  0.69444444  0.75        0.80555556  0.86111111  0.91666667  0.97222222
  1.02777778  1.08333333  1.13888889  1.19444444  1.25        1.30555556
  1.36111111  1.41666667  1.47222222  1.52777778  1.58333333  1.63888889
  1.69444444  1.75        1.80555556  1.86111111  1.91666667  1.97222222
  2.02777778  2.08333333  2.13888889  2.19444444  2.25        2.30555556
  2.36111111  2.41666667  2.47222222  2.52777778  2.58333333  2.63888889
  2.69444444  2.75        2.80555556  2.86111111  2.91666667  2.97222222
  3.02777778  3.08333333  3.13888889  3.19444444  3.25        3.30555556
  3.36111111  3.41666667  3.47222222  3.52777778  3.58333333  3.63888889
  3.69444444  3.75        3.80555556  3.86111111  3.91666667  3.97222222
  4.02777778  4.08333333  4.13888889  4.19444444  4.25        4.30555556
  4.36111111  4.41666667  4.47222222  4.52777778  4.58333333  4.63888889
  4.69444444  4.75        4.80555556  4.86111111  4.91666667  4.97222222
  5.02777778  5.08333333  5.13888889  5.19444444  5.25        5.30555556
  5.36111111  5.41666667  5.47222222]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 538 631 602 575 475 476 373 299 251
 201 194 156 109 103  95  68  59  51  53  32   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 305 500 529 503 424 413 324 266 212
 171 163 116  83  72  68  47  38  28  20   5   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0]
[        nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan  0.5669145   0.79239303  0.87873754
  0.87478261  0.89263158  0.86764706  0.86863271  0.88963211  0.84462151
  0.85074627  0.84020619  0.74358974  0.76146789  0.69902913  0.71578947
  0.69117647  0.6440678   0.54901961  0.37735849  0.15625            nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan]
/Users/johntimlin/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: RuntimeWarning: invalid value encountered in divide
Out[55]:
(0, 1)

Boosting

Now try a boosting-based classifier.


In [23]:
# Boosting
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
boost = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
boost.fit(Xtrain, ytrain)
ypredBoost = boost.predict(Xtest)

print f1_score(ytest, ypredBoost)  # by default this is the f1 of the positive class (label 1 = stars)
print(classification_report(ytest, ypredBoost, target_names=['QSOs', 'stars']))
print boost.feature_importances_


0.999098150774
             precision    recall  f1-score   support

       QSOs       0.91      0.78      0.84      1043
      stars       1.00      1.00      1.00    172347

avg / total       1.00      1.00      1.00    173390

[ 0.27386614  0.21087245  0.23887571  0.12073587  0.09678462  0.05886521]

Also pretty good, but not the best so far. This algorithm outputs the importances of the features, which shows that $u-g$ is the most important with $r-i$ being next.

The main parameter for boosting is the depth of the underlying trees. Let's make that a parameter and do a grid search over a range of depths. The training set should always improve with greater depth, but the test set will plateau. That's the value that we want for the depth.
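
For reference, sklearn's GridSearchCV automates this kind of search (a minimal sketch using the same Xtrain/ytrain; note that it cross-validates within the training set rather than scoring against our fixed test split):

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

grid = GridSearchCV(GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
                    param_grid={'max_depth': np.arange(1, 10)}, scoring='f1', cv=3)
grid.fit(Xtrain, ytrain)
print grid.best_params_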


In [24]:
# Depth-optimized Boosting
depth = np.arange(1, 10)
score_test = np.zeros(len(depth))
score_train = np.zeros(len(depth))
i_best = 0

# Loop over the depths
for i, d in enumerate(depth):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=d, random_state=42) 
    clf.fit(Xtrain, ytrain)

    ypredtrain = clf.predict(Xtrain)
    # We need some way to decide what is best
    # Try using the f1_score and comparing the values for the different depths
    score_train[i] = f1_score(ytrain, ypredtrain)
   
    ypredtest = clf.predict(Xtest)
    score_test[i] = f1_score(ytest, ypredtest)

    if score_test[i] >= score_test[i_best]:
        i_best = i
        ybest = ypredtest

best_depth = depth[i_best]

# Give the best depth and the corresponding classification report.
print best_depth
print(classification_report(ytest, ybest, target_names=['QSOs', 'stars']))


3
             precision    recall  f1-score   support

       QSOs       0.91      0.73      0.81      1043
      stars       1.00      1.00      1.00    172347

avg / total       1.00      1.00      1.00    173390


In [25]:
# Plot the f1 score vs. depth
fig = plt.figure(figsize=(6, 6))
plt.plot(depth,score_test,label='test')
plt.plot(depth,score_train,label='train')
plt.xlabel("depth")
plt.ylabel("score")
plt.legend()
plt.show()


This suggests that a depth of 3 is optimal.


RandomForests were better than Boosting, so let's try optimizing the depth for RandomForests.


In [26]:
# Depth-optimized Random Forest
depth = np.arange(10, 21)
score_test = np.zeros(len(depth))
score_train = np.zeros(len(depth))
i_best = 0

for i, d in enumerate(depth):
    clf = RandomForestClassifier(n_estimators=10, max_depth=d, min_samples_split=2, random_state=42, n_jobs=-1)
    clf.fit(Xtrain, ytrain)

    ypredtrain = clf.predict(Xtrain)
    score_train[i] = f1_score(ytrain, ypredtrain)
   
    ypredtest = clf.predict(Xtest)
    score_test[i] = f1_score(ytest, ypredtest)

    if score_test[i] >= score_test[i_best]:
        i_best = i
        ybest = ypredtest

best_depth = depth[i_best]
print best_depth
print(classification_report(ytest, ybest, target_names=['QSOs', 'stars']))


17
             precision    recall  f1-score   support

       QSOs       0.93      0.78      0.85      1043
      stars       1.00      1.00      1.00    172347

avg / total       1.00      1.00      1.00    173390


In [27]:
fig = plt.figure(figsize=(6, 6))
plt.plot(depth,score_test,label='test')
plt.plot(depth,score_train,label='train')
plt.xlabel("depth")
plt.ylabel("score")
plt.legend()
plt.show()


So far a depth=17 RandomForest is giving the best results:

78% completeness, 7% contamination.

Though we might be able to optimize more by tweaking the other parameters.
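
One cheap way to explore the other parameters jointly is a randomized search (a sketch; the ranges below are illustrative guesses, not tuned values):

from sklearn.model_selection import RandomizedSearchCV

params = {'n_estimators': [10, 20, 50],
          'max_depth': [13, 15, 17, 19],
          'min_samples_split': [2, 4, 8]}
search = RandomizedSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1),
                            params, n_iter=10, scoring='f1', random_state=42)
search.fit(Xtrain, ytrain)
print search.best_params_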


In [28]:
# Try increasing the number of estimators
clf = RandomForestClassifier(n_estimators=20, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

print(classification_report(ytest, ypred, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.95      0.76      0.84      1043
      stars       1.00      1.00      1.00    172347

avg / total       1.00      1.00      1.00    173390

Somehow this isn't quite as good (note that this run also used max_depth=15 rather than the optimal 17), so stick with 10 estimators.

But first we need to produce classifications for all of the objects using cross-validation and the parameters above.


In [29]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
clf = RandomForestClassifier(n_estimators=10, max_depth=15, min_samples_split=2, n_jobs=-1, random_state=42)
N=5
ypredCV = cross_val_predict(clf, X, y, cv=N)

print(classification_report(y, ypredCV, target_names=['QSOs', 'stars']))


             precision    recall  f1-score   support

       QSOs       0.92      0.79      0.85      4193
      stars       1.00      1.00      1.00    689365

avg / total       1.00      1.00      1.00    693558

Final result is 79% completeness and 8% contamination.

Should output the classification vector and do some exploring in TOPCAT with it.

Create a new Table column for data filled with the predicted $y$ values. Then write it out as FITS.


In [30]:
data['ypred'] = ypredCV

In [57]:
# Write the input catalog back out with the RF self-classification results
data.write('GTR-ADM-QSO-ir-testhighz_findbw_lup_2016_starclean_class.fits', format='fits')

Make the same histograms as above.


In [31]:
qq = ((data['labels']==0) & (data['ypred']==0))
ss = ((data['labels']==1) & (data['ypred']==1))
qs = ((data['labels']==0) & (data['ypred']==1))
sq = ((data['labels']==1) & (data['ypred']==0))
dataqq = data[qq]
datass = data[ss]
dataqs = data[qs]
datasq = data[sq]

print len(dataqq), "quasars selected as quasars"
print len(dataqs), "quasars selected as stars"
print len(datasq), "stars selected as quasars"


3300 quasars selected as quasars
893 quasars selected as stars
288 stars selected as quasars

In [32]:
fig = plt.figure(figsize=(6, 6))
bins=np.linspace(0,5.5,100)
plt.hist(dataqq['zspec'],bins=bins,label='qq',color='b',alpha=0.5)
plt.hist(dataqs['zspec'],bins=bins,label='qs',color='r',alpha=0.5)
plt.hist(datasq['zspec'],bins=bins,label='sq',color='g',alpha=0.5)
#plt.xlim([-4,8])
#plt.ylim([-1.5,1.5])
plt.legend()
plt.xlabel('zspec')
plt.ylabel('N')


Out[32]:
<matplotlib.text.Text at 0x117233390>

This is actually worse than both SVM and Bagging. But it is what I decided to go with for the first comparison with KDE. We'll come back and look at this more in the next notebook. However, it is clear that we might want to use something else if contamination is the main thing that we are worried about.

What is interesting is that about half of the "stars" selected as quasars are actually quasars with $z\lesssim3.5$. Of the 3588 objects selected as quasars, 288 carry star labels (8%); if roughly half of those are lower-redshift quasars, the true stellar contamination is closer to 4% than 8%.

Similarly, most of the failed quasars are at $z\gtrsim3$, which isn't all that surprising.


In [38]:
print len(Xtest)


173390

Timing test

One of the reasons that I settled on RF is shown by the timing analysis below. While the bagging analysis works well, it takes forever: for just 173,390 sources, bagging takes 32.8 s, compared to 7.45 s for SVM and 129 ms for RF.

So the predicted times for classifying 50 million sources are:
RF: 37 seconds
SVM: 35.8 minutes
BAG: 2.62 hours
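
That is just linear scaling of the per-object prediction times, e.g.:

scale = 50e6/173390.  # ~288x more sources than the test set
print "RF:  %.0f s" % (0.129*scale)
print "SVM: %.1f min" % (7.45*scale/60.)
print "BAG: %.2f hr" % (32.8*scale/3600.)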


In [34]:
%timeit -n1 ypred = nn.predict(Xtest)


1 loop, best of 3: 15.1 ms per loop

In [35]:
%timeit -n1 ypredRFC = clfRFC.predict(Xtest)


1 loop, best of 3: 129 ms per loop

In [37]:
%timeit -n1 ypredBag = bagging.predict(XStest)


1 loop, best of 3: 32.8 s per loop

In [39]:
%timeit -n1 ySpredSVM = svm.predict(XStest)


1 loop, best of 3: 7.45 s per loop

Comparison with KDE Results

Assuming class1 is stars and class2 is quasars:
689365 true stars. 4193 true quasars.
690189 points were labeled as stars.
3369 points were labeled as quasars.

689273 out of 689365 stars were labeled as stars (99.99%).
92 out of 689365 stars were labeled as quasars (0.01335%).
916 out of 4193 quasars were labeled as stars (21.85%).
3277 out of 4193 quasars were labeled as quasars (78.15%).

Overall Accuracy: 99.85%
Weighted Accuracy: 89.07%
Completeness: 78.15%
Efficiency: 97.27%
Rating: 0.7602

This is about as good as the KDE results (at least for self-classification), which got 78% completeness with 97% efficiency (where some of those stars classified as quasars are also likely $z\sim3.5$ quasars). The combination of these suggests that the candidate list that John is using should have 3% or less contamination. We can check that by also testing the photo-z's.
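
For reference, summary numbers like these can be computed directly with sklearn's confusion_matrix (a sketch; ypredKDE stands in for a hypothetical array of the KDE labels, with 0 = quasar and 1 = star as above):

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted; label order [1, 0] puts stars first
(n_ss, n_sq), (n_qs, n_qq) = confusion_matrix(y, ypredKDE, labels=[1, 0])
print "Completeness: %.2f%%" % (100.*n_qq/(n_qq + n_qs))
print "Efficiency:   %.2f%%" % (100.*n_qq/(n_qq + n_sq))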

Make a few plots illustrating that not all of the contamination is from stars (but is instead from $z\lesssim3.5$).


In [68]:
%matplotlib inline
fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(hspace=0.05)
ax1 = fig.add_subplot(211)
ax1.xaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['zspec'],datass['imag'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zspec'],dataqq['imag'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zspec'],dataqs['imag'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zspec'],datasq['imag'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-0.1,6.5])
plt.xlabel('zspec')
plt.ylabel('imag')

ax2 = fig.add_subplot(212, sharex=ax1)
plt.scatter(datass['zspec'],datass['gr'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zspec'],dataqq['gr'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zspec'],dataqs['gr'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zspec'],datasq['gr'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-0.1,6.5])
plt.xlabel('zspec')
plt.ylabel('gr')

plt.show()



In [71]:
fig = plt.figure(figsize=(12, 4))
#fig.subplots_adjust(vspace=0.05)
ax1 = fig.add_subplot(131)
plt.scatter(datass['ug'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['ug'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['ug'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['ug'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-4,8])
plt.ylim([-1.5,1.5])
plt.xlabel('u-g')
plt.ylabel('ch1-ch2')

ax2 = fig.add_subplot(132)
ax2.yaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['ri'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['ri'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['ri'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['ri'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-2,4.5])
plt.ylim([-1.5,1.5])
plt.xlabel('r-i')
plt.ylabel('ch1-ch2')

ax3 = fig.add_subplot(133)
ax3.yaxis.set_major_formatter(plt.NullFormatter())
plt.scatter(datass['zs1'],datass['s1s2'],label='ss',edgecolor='None',color='m')
plt.scatter(dataqq['zs1'],dataqq['s1s2'],label='qq',edgecolor='None',color='b',alpha=0.5)
plt.scatter(dataqs['zs1'],dataqs['s1s2'],label='qs',edgecolor='None',color='k',alpha=0.5)
plt.scatter(datasq['zs1'],datasq['s1s2'],label='sq',edgecolor='None',color='g',alpha=0.5)
plt.legend(scatterpoints=1)
plt.xlim([-3,6])
plt.ylim([-1.5,1.5])
plt.xlabel('z-ch1')
plt.ylabel('ch1-ch2')

plt.show()


These are more revealing as contours or density plots (which I examined in TOPCAT), but I'm going to leave them as they are for now.


See more in further notebooks.