In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.html.widgets import interact, interactive, fixed
from IPython.html import widgets
dataLoc = '../../'

labels = ['ID','mag_model_i','g-r', 'r-i', 'i-z', 'WISE1', 'WISE2' ]
pQSO = np.loadtxt(dataLoc+'pQSO/pSDSScolmag.txt')
lQSO = np.loadtxt(dataLoc+'lQSO/SDSScolmag.txt')
sinQSO = np.loadtxt(dataLoc+'sinQSO/sSDSScolmag.txt')
unlQSO = np.loadtxt(dataLoc+'unlQSO/nlSDSScolmag.txt')
unlQSO[:,5:7] = -unlQSO[:,5:7] # flip sign to correct a bug in the WISE magnitudes in this file

# stack all classes, with lensed QSOs (lQSO) first so the truth labels line up:
# 1 = lensed QSO, 0 = everything else
data = np.concatenate((lQSO, pQSO, unlQSO, sinQSO), axis=0)
truth = np.concatenate((np.ones(lQSO.shape[0]), np.zeros(data.shape[0] - lQSO.shape[0])))
numPts = data.shape[0]
data = data[:, 1:]  # drop the ID column; it carries no physical information
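
A quick sanity check on the assembled sample is worthwhile, since the class balance drives how we should read the classification scores below. A minimal sketch using only the arrays defined above:

In [ ]:
# how many lensed QSOs vs. everything else?
print('total objects: %d' % numPts)
print('lensed QSOs:   %d' % truth.sum())
print('duds:          %d' % (numPts - truth.sum()))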

In [7]:
from sklearn.cross_validation import train_test_split
# hold out a third of the sample for testing; fix the seed for reproducibility
dataTrain, dataTest, truthTrain, truthTest = train_test_split(data, truth, test_size=0.33, random_state=42)
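
Since lensed systems are the minority class, it is worth confirming that the random split leaves a similar lQSO fraction in both halves. A minimal check using the arrays just created:

In [ ]:
# fraction of lensed QSOs in each half of the split
print('train lQSO fraction: %.3f' % truthTrain.mean())
print('test lQSO fraction:  %.3f' % truthTest.mean())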

Random Forests: Out-of-the-box


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics

clf = RandomForestClassifier(n_estimators=200, oob_score=True)
myRF = clf.fit(dataTrain, truthTrain)
cm = confusion_matrix(truthTest, myRF.predict(dataTest))  # rows: truth, columns: prediction
print cm
print metrics.classification_report(truthTest, myRF.predict(dataTest), target_names=['Dud', 'lQSO'])


[[1451   41]
 [  76  284]]
             precision    recall  f1-score   support

        Dud       0.95      0.97      0.96      1492
       lQSO       0.87      0.79      0.83       360

avg / total       0.94      0.94      0.94      1852
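
We set oob_score=True above but never inspected it; the out-of-bag estimate is a handy cross-check on the test-set numbers, since it uses only the training data. A minimal sketch, assuming the fitted myRF from the cell above:

In [ ]:
# out-of-bag accuracy estimate, computed from the training set alone
print('OOB score:     %.4f' % myRF.oob_score_)
# compare with accuracy on the held-out test set
print('test accuracy: %.4f' % metrics.accuracy_score(truthTest, myRF.predict(dataTest)))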


In [38]:
from sklearn import grid_search
trialRF = RandomForestClassifier()

# parameter values over which we will search
parameters = {'n_estimators': (10, 50, 200), 'max_features': ['auto', 2, 4],
              'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2]}

# do a grid search to find the parameters with the highest 3-fold CV accuracy
tunedRF = grid_search.GridSearchCV(trialRF, parameters, scoring='accuracy',
                                   n_jobs=-1, cv=3, verbose=1)
optRF = tunedRF.fit(dataTrain, truthTrain)

# print the best score and estimator
print(optRF.best_score_)
print(optRF.best_estimator_)

print metrics.classification_report(truthTest, optRF.predict(dataTest), target_names=['Dud', 'lQSO'])
cm = confusion_matrix(truthTest, optRF.predict(dataTest))  # rows: truth, columns: prediction
print cm


Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   11.2s finished
0.944400106411
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features=4,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)
             precision    recall  f1-score   support

        Dud       0.96      0.97      0.96      1492
       lQSO       0.87      0.82      0.85       360

avg / total       0.94      0.94      0.94      1852

[[1448   44]
 [  64  296]]
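
One advantage of random forests is that the tuned model can tell us which inputs matter most. A short sketch using the feature_importances_ attribute of the best estimator and the labels list from the first cell (skipping the ID entry, which we dropped):

In [ ]:
# rank the photometric features by importance in the tuned forest
importances = optRF.best_estimator_.feature_importances_
for name, imp in sorted(zip(labels[1:], importances), key=lambda pair: -pair[1]):
    print('%12s  %.3f' % (name, imp))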

SVMs (linear and Gaussian kernels)


In [36]:
from sklearn import svm
trialSVM = svm.SVC()

# parameter values over which we will search
parameters = {'kernel': ('linear', 'rbf'), 'class_weight': ('auto', None),
              'gamma': [0.7, 0.5, 0.3, 0.1, 0.01],
              'C': [0.1, 2, 4, 5, 10, 20, 30]}

# do a grid search to find the parameters with the highest 3-fold CV accuracy
tunedSVM = grid_search.GridSearchCV(trialSVM, parameters, scoring='accuracy',
                                    n_jobs=-1, cv=3, verbose=1)
optSVM = tunedSVM.fit(dataTrain, truthTrain)

# print the best score and estimator
print(optSVM.best_score_)
print(optSVM.best_estimator_)
print metrics.classification_report(truthTest, optSVM.predict(dataTest),target_names=['Dud', 'lQSO'])


Fitting 3 folds for each of 140 candidates, totalling 420 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:    7.4s finished
0.937483373238
SVC(C=20, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.5,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

        Dud       0.95      0.97      0.96      1492
       lQSO       0.84      0.77      0.81       360

avg / total       0.93      0.93      0.93      1852
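
The SVM scores trail the tuned forest slightly; one likely culprit is that SVMs are sensitive to the relative scale of the inputs (magnitudes and colors span very different ranges), whereas tree-based methods are not. A hedged sketch of retrying the RBF kernel on standardized features using sklearn's StandardScaler; the C and gamma values are simply the tuned ones from above, not re-searched on the scaled data:

In [ ]:
from sklearn.preprocessing import StandardScaler

# standardize each feature to zero mean and unit variance, fit on the training set only
scaler = StandardScaler().fit(dataTrain)
scaledSVM = svm.SVC(kernel='rbf', C=20, gamma=0.5).fit(scaler.transform(dataTrain), truthTrain)
print metrics.classification_report(truthTest, scaledSVM.predict(scaler.transform(dataTest)),
                                    target_names=['Dud', 'lQSO'])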


In [ ]: