In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.html.widgets import interact, interactive, fixed
from IPython.html import widgets
# Directory (relative to this notebook) that holds the per-class catalogue folders.
dataLoc = '../../'

# Names for the seven columns of each catalogue file; column 0 is the object ID
# and columns 5-6 are the two WISE magnitudes.
labels = ['ID','mag_model_i','g-r', 'r-i', 'i-z', 'WISE1', 'WISE2' ]

# One whitespace-delimited text table per object class.
pQSO = np.loadtxt(dataLoc+'pQSO/pSDSScolmag.txt')
lQSO = np.loadtxt(dataLoc+'lQSO/SDSScolmag.txt')
sinQSO = np.loadtxt(dataLoc+'sinQSO/sSDSScolmag.txt')
unlQSO = np.loadtxt(dataLoc+'unlQSO/nlSDSScolmag.txt')

# Work around a bug in this particular file: its two WISE magnitude columns
# have the wrong sign, so flip them in place.
unlQSO[:, 5:7] *= -1

# Stack every class into a single table, with the lQSO rows first so that the
# label vector below lines up with it.
data = np.concatenate([lQSO, pQSO, unlQSO, sinQSO], axis=0)
numPts = data.shape[0]

# Binary target: 1 for the leading lQSO rows, 0 for every other row.
truth = np.zeros(numPts)
truth[:lQSO.shape[0]] = 1

# Drop the ID column; it must not be used as a feature.
data = data[:, 1:]
In [7]:
import numpy as np
from sklearn.cross_validation import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
dataTrain, dataTest, truthTrain, truthTest = train_test_split(
    data,
    truth,
    test_size=0.33,
    random_state=42,
)
In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Baseline model: a 200-tree random forest fitted on the training split.
# oob_score=True additionally stores an out-of-bag accuracy estimate on the
# fitted model (myRF.oob_score_); random_state makes the fit reproducible.
clf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
myRF = clf.fit(dataTrain, truthTrain)

# Predict the held-out set once and reuse the result for both summaries.
predTest = myRF.predict(dataTest)

# confusion_matrix expects (y_true, y_pred): rows are the true class and
# columns the predicted class, matching the argument order used for the
# classification report below.
cm = confusion_matrix(truthTest, predTest)
print(cm)

# Per-class precision / recall / F1; class 0 is reported as 'Dud', class 1 as 'lQSO'.
print(metrics.classification_report(truthTest, predTest, target_names=['Dud', 'lQSO']))
In [38]:
from sklearn import grid_search

# Untuned forest used as the template estimator for the search; the fixed
# random_state keeps the cross-validation scores reproducible.
trialRF = RandomForestClassifier(random_state=42)

# Parameter values over which we will search (3 x 3 x 2 x 2 = 36 combinations).
parameters = {
    'n_estimators': (10, 50, 200),
    'max_features': ["auto", 2, 4],
    'criterion': ["gini", "entropy"],
    'min_samples_leaf': [1, 2],
}

# Do a grid search to find the highest 3-fold CV accuracy (zero-one score).
# `scoring='accuracy'` replaces the deprecated `score_func=` argument.
# n_jobs=-1 runs the fits in parallel on all available cores.
tunedRF = grid_search.GridSearchCV(trialRF, parameters, scoring='accuracy',
                                   n_jobs=-1, cv=3, verbose=1)
optRF = tunedRF.fit(dataTrain, truthTrain)

# Print the best cross-validated score and the estimator that achieved it.
print(optRF.best_score_)
print(optRF.best_estimator_)

# Evaluate the refitted best estimator on the held-out set (predict once, reuse).
optPredTest = optRF.predict(dataTest)
print(metrics.classification_report(truthTest, optPredTest, target_names=['Dud', 'lQSO']))

# confusion_matrix expects (y_true, y_pred): rows are the true class and
# columns the predicted class.
cm = confusion_matrix(truthTest, optPredTest)
print(cm)
In [36]:
from sklearn import svm

# Untuned support-vector classifier used as the template for the search.
trialSVM = svm.SVC()

# Parameter values over which we will search.  `gamma` is only used by the
# RBF kernel, so it is searched in the RBF sub-grid alone; crossing it with the
# linear kernel would just refit identical linear models once per gamma value.
svmClassWeights = ['auto', None]
svmCValues = [0.1, 2, 4, 5, 10, 20, 30]
parameters = [
    {'kernel': ['linear'],
     'class_weight': svmClassWeights,
     'C': svmCValues},
    {'kernel': ['rbf'],
     'class_weight': svmClassWeights,
     'gamma': [0.7, 0.5, 0.3, 0.1, 0.01],
     'C': svmCValues},
]

# Do a grid search to find the highest 3-fold CV accuracy (zero-one score).
# `scoring='accuracy'` replaces the deprecated `score_func=` argument.
# n_jobs=-1 runs the fits in parallel on all available cores.
tunedSVM = grid_search.GridSearchCV(trialSVM, parameters, scoring='accuracy',
                                    n_jobs=-1, cv=3, verbose=1)
optSVM = tunedSVM.fit(dataTrain, truthTrain)

# Print the best cross-validated score and the estimator that achieved it.
print(optSVM.best_score_)
print(optSVM.best_estimator_)

# Per-class precision / recall / F1 on the held-out set;
# class 0 is reported as 'Dud', class 1 as 'lQSO'.
print(metrics.classification_report(truthTest, optSVM.predict(dataTest), target_names=['Dud', 'lQSO']))
In [ ]: