In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# newer versions provide train_test_split in sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from astroML.density_estimation import XDGMM
from astropy.io import fits
# NOTE(review): the `triangle` package was later renamed `corner`; it is imported
# but never used in this notebook as shown.
import triangle
dataLoc = '../../'
# Column labels for the simulated catalogs; the first two columns are dropped below.
labels = ['ID','mag_model_i','g-r', 'r-i', 'i-z', 'WISE1', 'WISE2' ]
# Load each simulated population, keeping only the feature columns.
# [:,2:] drops the first two columns -- per `labels`, presumably ID and mag_model_i,
# leaving 5 features: g-r, r-i, i-z, WISE1, WISE2.
pQSO = np.loadtxt(dataLoc+'pQSO/pSDSScolmag.txt')[:,2:]
lQSO = np.loadtxt(dataLoc+'lQSO/SDSScolmag.txt')[:,2:]
sinQSO = np.loadtxt(dataLoc+'sinQSO/sSDSScolmag.txt')[:,2:]
unlQSO = np.loadtxt(dataLoc+'unlQSO/nlSDSScolmag.txt')[:,2:]
# Columns 3:5 of the sliced array are WISE1/WISE2; their sign is flipped in this
# particular input file, so undo it here.
unlQSO[:,3:5] = -unlQSO[:,3:5] #bug in WISE magnitudes for this file
# "duds" = everything that is not a lensed QSO (point QSOs, unlensed, single QSOs).
duds = np.concatenate((pQSO,unlQSO,sinQSO),axis=0)
data = np.concatenate((lQSO,duds),axis=0) #all sims
# Labels aligned with `data`: 1 = lensed QSO (signal), 0 = dud.
truth = np.concatenate((np.ones(lQSO.shape[0]),np.zeros(duds.shape[0])),axis=0)
numPts = data.shape[0]
# Split each class separately (75/25) so both splits keep the class structure.
# NOTE(review): no random_state is set, so the split -- and every downstream
# number in this notebook -- changes on each run.
(lQSO_train, lQSO_test) = train_test_split(lQSO, test_size=0.25)
(dud_train, dud_test) = train_test_split(duds, test_size=0.25)
In [2]:
perturbingSigma = .1
nFeatures = data.shape[1]
# Measurement covariance assumed for every object: variance perturbingSigma on the
# diagonal and weak (0.1 * perturbingSigma) cross-correlations between the 5 features.
# (Removed dead code: a covariance estimated from two random draws via np.cov was
# assigned to `covmat` and then immediately overwritten by this hard-coded matrix.)
covmat = perturbingSigma * np.array([[1 , .1, .1, .1, .1],
                                     [.1, 1 , .1, .1, .1],
                                     [.1, .1, 1 , .1, .1],
                                     [.1, .1, .1, 1 , .1],
                                     [.1, .1, .1, .1, 1 ]])
# Sanity check: covmat must be invertible, and inv(inv(C)) should round-trip to C.
# (print(x) with a single argument is valid in both Python 2 and 3.)
print(covmat)
print(np.linalg.inv(covmat))
print(np.linalg.inv(np.linalg.inv(covmat)))
def noisify(data, covmat):
    """Perturb `data` with correlated Gaussian noise and build the matching
    covariance stack for extreme deconvolution.

    Parameters
    ----------
    data : ndarray, shape (nPoints, nFeatures)
        Feature matrix to perturb.
    covmat : ndarray, shape (nFeatures, nFeatures)
        Measurement covariance assumed identical for every point.

    Returns
    -------
    noisyData : ndarray, shape (nPoints, nFeatures)
        data + one multivariate-normal draw per row.
    covmatlist : ndarray, shape (nFeatures, nFeatures, nPoints)
        `covmat` repeated once per data point (the XD fitter wants one
        covariance per point; callers .transpose() this to (nPoints, f, f)).
    """
    nFeatures = data.shape[1]
    # One zero-mean draw from covmat per row of data.
    perturbations = np.random.multivariate_normal(
        np.zeros(nFeatures), covmat, size=data.shape[0])
    noisyData = data + perturbations
    # Repeat covmat exactly nPoints times in one call.
    # (The original while-loop dstack was O(n) array copies and returned TWO
    # stacked covariances when data had a single row.)
    covmatlist = np.dstack([covmat] * data.shape[0])
    return noisyData, covmatlist
# Noisify train/test sets for both classes (same assumed covariance everywhere).
noisy_lQSO_train, covmatlist_lQSO_train = noisify(lQSO_train,covmat)
noisy_dud_train, covmatlist_dud_train = noisify(dud_train,covmat)
noisy_lQSO_test, covmatlist_lQSO_test = noisify(lQSO_test,covmat)
noisy_dud_test, covmatlist_dud_test = noisify(dud_test,covmat)
#fit indep XD models to both signal and duds
lQSO_model = XDGMM(n_components=30,verbose=True)
dud_model = XDGMM(n_components=30,verbose=True)
# .transpose() converts the (nFeat, nFeat, nPts) covariance stack returned by
# noisify into the (nPts, nFeat, nFeat) Xerr layout that XDGMM.fit expects.
lQSO_model.fit(noisy_lQSO_train, covmatlist_lQSO_train.transpose())
dud_model.fit(noisy_dud_train, covmatlist_dud_train.transpose())
Out[2]:
In [14]:
#compute probabilities
def _density(model, X, Xerr):
    """Total XD density of each sample under `model`: exponentiate the
    per-component log-likelihoods from logprob_a and sum over mixture
    components.  Returns an array of shape (n_samples,)."""
    return np.sum(np.exp(model.logprob_a(X, Xerr)), axis=1)

# .transpose() reshapes each (nFeat, nFeat, nPts) covariance stack to the
# (nPts, nFeat, nFeat) layout logprob_a expects.
lQSO_sample_lQSO_model = _density(lQSO_model, noisy_lQSO_train, covmatlist_lQSO_train.transpose())
lQSO_sample_dud_model = _density(dud_model, noisy_lQSO_train, covmatlist_lQSO_train.transpose())
dud_sample_dud_model = _density(dud_model, noisy_dud_train, covmatlist_dud_train.transpose())
dud_sample_lQSO_model = _density(lQSO_model, noisy_dud_train, covmatlist_dud_train.transpose())
lQSO_test_lQSO_model = _density(lQSO_model, noisy_lQSO_test, covmatlist_lQSO_test.transpose())
lQSO_test_dud_model = _density(dud_model, noisy_lQSO_test, covmatlist_lQSO_test.transpose())
dud_test_dud_model = _density(dud_model, noisy_dud_test, covmatlist_dud_test.transpose())
dud_test_lQSO_model = _density(lQSO_model, noisy_dud_test, covmatlist_dud_test.transpose())
In [15]:
#training data probabilities
lQSO_prior = .06007 #~1800 in a 30,000 sq deg PS1-like survey according to OM10
#TODO: what is the actual magnitude limit of Eric's sample data?
# NOTE(review): the comment below says "reciprocal number of objects" but the value
# is 12300, not 1/12300 -- confirm which was intended.  Only the prior RATIO matters
# for the posteriors below, but as written duds are weighted ~200,000x over lQSOs.
dud_prior = 12300 # reciprocal number of objects in PS1 data sample from eric

def _posteriors(lQSO_density, dud_density):
    """Bayes posteriors (P(lQSO|x), P(dud|x)) from the two class-conditional
    densities, using the module-level lQSO_prior / dud_prior weights."""
    evidence = lQSO_density * lQSO_prior + dud_density * dud_prior
    return (lQSO_density * lQSO_prior) / evidence, (dud_density * dud_prior) / evidence

# Training-set posteriors for the lQSO sample and the dud sample...
plQSO_lQSO, pdud_lQSO = _posteriors(lQSO_sample_lQSO_model, lQSO_sample_dud_model)
plQSO_dud, pdud_dud = _posteriors(dud_sample_lQSO_model, dud_sample_dud_model)
# ...and the same for the held-out test sets.
plQSO_lQSO_test, pdud_lQSO_test = _posteriors(lQSO_test_lQSO_model, lQSO_test_dud_model)
plQSO_dud_test, pdud_dud_test = _posteriors(dud_test_lQSO_model, dud_test_dud_model)
In [6]:
# Posterior-probability histograms, one figure per (sample, class) combination.
def _prob_hist(probs, title):
    """Histogram a vector of posterior probabilities in its own figure."""
    plt.figure()
    _ = plt.hist(probs, bins=20)
    plt.title(title)

_prob_hist(plQSO_lQSO, 'lQSO probability of lQSO')
_prob_hist(pdud_lQSO, 'dud probability of lQSO')
_prob_hist(plQSO_dud, 'lQSO probability of dud')
_prob_hist(pdud_dud, 'dud probability of dud')
Out[6]:
In [16]:
#quick comparison to random forest on noisified data
# NOTE(review): imports mid-notebook; these belong in the top import cell.
# NOTE(review): sklearn.grid_search and the score_func argument were deprecated and
# removed in later scikit-learn; the modern spelling is
# sklearn.model_selection.GridSearchCV(..., scoring='accuracy').
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
from sklearn import metrics
trialRF = RandomForestClassifier()
# Hyperparameter grid: forest size, feature subsampling, split criterion, leaf size.
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",2,4],
'criterion':["gini","entropy"],"min_samples_leaf": [1,2]}
# do a grid search to find the highest 3-fold CV zero-one score
tunedRF = grid_search.GridSearchCV(trialRF, parameters, score_func=metrics.accuracy_score,\
n_jobs = -1, cv = 3,verbose=1)
# Stack signal on top of duds; labels: 1 = lQSO, 0 = dud (same ordering for test).
noisy_train = np.concatenate((noisy_lQSO_train,noisy_dud_train),axis=0)
noisy_truth = np.concatenate((np.ones(noisy_lQSO_train.shape[0]),np.zeros(noisy_dud_train.shape[0])),axis=0)
noisy_test = np.concatenate((noisy_lQSO_test,noisy_dud_test),axis=0)
noisy_truth_test = np.concatenate((np.ones(noisy_lQSO_test.shape[0]),np.zeros(noisy_dud_test.shape[0])),axis=0)
optRF = tunedRF.fit(noisy_train, noisy_truth)
# print the best score and estimator
print(optRF.best_score_)
print(optRF.best_estimator_)
# Class-membership probabilities; column 1 is P(lQSO) since classes sort as [0, 1].
RF_train_probs = optRF.predict_proba(noisy_train)
RF_test_probs = optRF.predict_proba(noisy_test)
In [20]:
from sklearn.metrics import roc_curve
# NOTE(review): trainTruth duplicates noisy_truth from the RF cell (same lengths,
# same lQSO-rows-first ordering) -- that variable could be reused instead.
trainTruth = np.concatenate([np.ones(lQSO_train.shape[0]),np.zeros(dud_train.shape[0])])
# XD posteriors P(lQSO|x), concatenated in the same lQSO-then-dud order as the labels.
trainProbs = np.concatenate([plQSO_lQSO,plQSO_dud])
testProbs = np.concatenate([plQSO_lQSO_test,plQSO_dud_test])
fpr, tpr, _ = roc_curve(trainTruth,trainProbs,pos_label=1)
fpr_rf, tpr_rf, _ = roc_curve(trainTruth,RF_train_probs[:,1],pos_label=1)
test_fpr_rf, test_tpr_rf, _ = roc_curve(noisy_truth_test,RF_test_probs[:,1],pos_label=1)
test_fpr, test_tpr, _ = roc_curve(noisy_truth_test,testProbs,pos_label=1)
# Blue = XD classifier, red = random forest; dashed = training set, solid = test set.
# NOTE(review): consider adding a legend so the figure stands alone.
plt.title('ROC Curve')
plt.plot(fpr,tpr,'b--')
plt.plot(fpr_rf,tpr_rf,'r--')
plt.plot(test_fpr,test_tpr,'b')
plt.plot(test_fpr_rf,test_tpr_rf,'r')
plt.xlabel('FPR')
plt.ylabel('TPR')
Out[20]:
In [3]:
# Load one tile of Bovy's XDQSO catalog for comparison.
# NOTE(review): hardcoded absolute local path -- breaks for anyone else; make it
# relative to a configurable data directory.
test = fits.getdata('/Users/mbaumer/Downloads/xdcore_000094.fits')
In [8]:
# Inspect the catalog's star-probability column to compare against our posteriors.
print test['PSTAR']
Hmm — from looking at Bovy's XDQSO catalog, it's clear that the XD "density factor" can be greater than one. In my analysis of Adri's sims, I see values as high as 54!
In [ ]: