In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split #moved to sklearn.model_selection in sklearn >= 0.18
from astroML.density_estimation import XDGMM
from astropy.io import fits
import triangle #corner-plot package (later renamed 'corner')
dataLoc = '../../'

labels = ['ID','mag_model_i','g-r', 'r-i', 'i-z', 'WISE1', 'WISE2' ]
pQSO = np.loadtxt(dataLoc+'pQSO/pSDSScolmag.txt')[:,2:]
lQSO = np.loadtxt(dataLoc+'lQSO/SDSScolmag.txt')[:,2:]
sinQSO = np.loadtxt(dataLoc+'sinQSO/sSDSScolmag.txt')[:,2:]
unlQSO = np.loadtxt(dataLoc+'unlQSO/nlSDSScolmag.txt')[:,2:]
unlQSO[:,3:5] = -unlQSO[:,3:5] #bug in WISE magnitudes for this file

duds = np.concatenate((pQSO,unlQSO,sinQSO),axis=0)
data = np.concatenate((lQSO,duds),axis=0) #all sims
truth = np.concatenate((np.ones(lQSO.shape[0]),np.zeros(duds.shape[0])),axis=0)

numPts = data.shape[0]

(lQSO_train, lQSO_test) = train_test_split(lQSO, test_size=0.25)
(dud_train, dud_test) = train_test_split(duds, test_size=0.25)

In [2]:
perturbingSigma = .1
nFeatures = data.shape[1] #5 features: g-r, r-i, i-z, WISE1, WISE2
#noise covariance matrix: variance perturbingSigma on the diagonal,
#perturbingSigma/10 in the off-diagonal (cross-feature) terms
covmat = perturbingSigma*np.array([[1., .1, .1, .1, .1],
                                   [.1, 1., .1, .1, .1],
                                   [.1, .1, 1., .1, .1],
                                   [.1, .1, .1, 1., .1],
                                   [.1, .1, .1, .1, 1.]])


#sanity check: covmat should be well-conditioned, so inverting twice recovers it
print covmat
print np.linalg.inv(covmat)
print np.linalg.inv(np.linalg.inv(covmat))

def noisify(data, covmat):
    """Perturb data with correlated Gaussian noise drawn from covmat, and
    return the stack of per-object covariance matrices that XD expects."""
    nFeatures = data.shape[1]

    #perturb data
    perturbations = np.random.multivariate_normal(np.zeros(nFeatures), covmat, size=data.shape[0])
    noisyData = data + perturbations

    #XD needs one covariance matrix per object: stack N identical copies
    #into shape (nFeatures, nFeatures, N)
    covmatlist = np.dstack([covmat] * data.shape[0])
    return noisyData, covmatlist
    
noisy_lQSO_train, covmatlist_lQSO_train = noisify(lQSO_train,covmat)
noisy_dud_train, covmatlist_dud_train = noisify(dud_train,covmat)
noisy_lQSO_test, covmatlist_lQSO_test = noisify(lQSO_test,covmat)
noisy_dud_test, covmatlist_dud_test = noisify(dud_test,covmat)

#fit independent XD models to the signal (lQSO) and the duds
#.transpose() reorders the (nFeatures, nFeatures, N) stack into the
#(N, nFeatures, nFeatures) shape XDGMM.fit expects; covmat is symmetric,
#so the reversal of the two feature axes is harmless
lQSO_model = XDGMM(n_components=30, verbose=True)
dud_model = XDGMM(n_components=30, verbose=True)
lQSO_model.fit(noisy_lQSO_train, covmatlist_lQSO_train.transpose())
dud_model.fit(noisy_dud_train, covmatlist_dud_train.transpose())


[[ 0.1   0.01  0.01  0.01  0.01]
 [ 0.01  0.1   0.01  0.01  0.01]
 [ 0.01  0.01  0.1   0.01  0.01]
 [ 0.01  0.01  0.01  0.1   0.01]
 [ 0.01  0.01  0.01  0.01  0.1 ]]
[[ 10.31746032  -0.79365079  -0.79365079  -0.79365079  -0.79365079]
 [ -0.79365079  10.31746032  -0.79365079  -0.79365079  -0.79365079]
 [ -0.79365079  -0.79365079  10.31746032  -0.79365079  -0.79365079]
 [ -0.79365079  -0.79365079  -0.79365079  10.31746032  -0.79365079]
 [ -0.79365079  -0.79365079  -0.79365079  -0.79365079  10.31746032]]
[[ 0.1   0.01  0.01  0.01  0.01]
 [ 0.01  0.1   0.01  0.01  0.01]
 [ 0.01  0.01  0.1   0.01  0.01]
 [ 0.01  0.01  0.01  0.1   0.01]
 [ 0.01  0.01  0.01  0.01  0.1 ]]
1: log(L) = 124.64
    (3.4 sec)
2: log(L) = 159.87
    (3.4 sec)
3: log(L) = 179.89
    (3.5 sec)
4: log(L) = 192.79
    (3.4 sec)
5: log(L) = 201.77
    (3.5 sec)
6: log(L) = 208.39
    (3.5 sec)
7: log(L) = 213.46
    (3.5 sec)
8: log(L) = 217.48
    (3.6 sec)
9: log(L) = 220.74
    (3.5 sec)
10: log(L) = 223.43
    (3.4 sec)
11: log(L) = 225.69
    (3.4 sec)
12: log(L) = 227.62
    (3.5 sec)
13: log(L) = 229.27
    (3.5 sec)
14: log(L) = 230.7
    (3.5 sec)
15: log(L) = 231.95
    (3.4 sec)
16: log(L) = 233.05
    (3.4 sec)
17: log(L) = 234.02
    (3.5 sec)
18: log(L) = 234.88
    (3.4 sec)
19: log(L) = 235.66
    (3.4 sec)
20: log(L) = 236.35
    (3.4 sec)
21: log(L) = 236.96
    (3.4 sec)
22: log(L) = 237.52
    (3.6 sec)
23: log(L) = 238.02
    (3.4 sec)
24: log(L) = 238.47
    (3.5 sec)
25: log(L) = 238.87
    (3.4 sec)
26: log(L) = 239.23
    (3.5 sec)
27: log(L) = 239.55
    (3.5 sec)
28: log(L) = 239.83
    (3.5 sec)
29: log(L) = 240.07
    (3.4 sec)
30: log(L) = 240.28
    (3.4 sec)
31: log(L) = 240.45
    (3.5 sec)
32: log(L) = 240.59
    (3.5 sec)
33: log(L) = 240.71
    (3.4 sec)
34: log(L) = 240.79
    (3.4 sec)
35: log(L) = 240.85
    (3.4 sec)
36: log(L) = 240.87
    (3.4 sec)
37: log(L) = 240.88
    (3.4 sec)
38: log(L) = 240.85
    (3.4 sec)
1: log(L) = -450.6
    (16 sec)
2: log(L) = -299.61
    (16 sec)
3: log(L) = -221.41
    (16 sec)
4: log(L) = -175.58
    (16 sec)
5: log(L) = -146.35
    (16 sec)
6: log(L) = -126.54
    (16 sec)
7: log(L) = -112.5
    (16 sec)
8: log(L) = -102.18
    (16 sec)
9: log(L) = -94.334
    (16 sec)
10: log(L) = -88.178
    (16 sec)
11: log(L) = -83.193
    (16 sec)
12: log(L) = -79.03
    (16 sec)
13: log(L) = -75.454
    (16 sec)
14: log(L) = -72.305
    (16 sec)
15: log(L) = -69.476
    (16 sec)
16: log(L) = -66.891
    (16 sec)
17: log(L) = -64.499
    (16 sec)
18: log(L) = -62.263
    (16 sec)
19: log(L) = -60.158
    (17 sec)
20: log(L) = -58.166
    (16 sec)
21: log(L) = -56.273
    (16 sec)
22: log(L) = -54.469
    (16 sec)
23: log(L) = -52.747
    (16 sec)
24: log(L) = -51.102
    (16 sec)
25: log(L) = -49.528
    (16 sec)
26: log(L) = -48.023
    (16 sec)
27: log(L) = -46.582
    (16 sec)
28: log(L) = -45.205
    (16 sec)
29: log(L) = -43.887
    (16 sec)
30: log(L) = -42.628
    (16 sec)
31: log(L) = -41.425
    (16 sec)
32: log(L) = -40.275
    (16 sec)
33: log(L) = -39.178
    (16 sec)
34: log(L) = -38.13
    (16 sec)
35: log(L) = -37.13
    (16 sec)
36: log(L) = -36.177
    (16 sec)
37: log(L) = -35.266
    (16 sec)
38: log(L) = -34.398
    (16 sec)
39: log(L) = -33.569
    (16 sec)
40: log(L) = -32.777
    (16 sec)
41: log(L) = -32.021
    (16 sec)
42: log(L) = -31.297
    (16 sec)
43: log(L) = -30.605
    (16 sec)
44: log(L) = -29.942
    (16 sec)
45: log(L) = -29.306
    (16 sec)
46: log(L) = -28.696
    (16 sec)
47: log(L) = -28.109
    (16 sec)
48: log(L) = -27.545
    (16 sec)
49: log(L) = -27.001
    (16 sec)
50: log(L) = -26.477
    (16 sec)
51: log(L) = -25.972
    (16 sec)
52: log(L) = -25.483
    (16 sec)
53: log(L) = -25.012
    (16 sec)
54: log(L) = -24.556
    (16 sec)
55: log(L) = -24.115
    (16 sec)
56: log(L) = -23.688
    (16 sec)
57: log(L) = -23.275
    (16 sec)
58: log(L) = -22.875
    (16 sec)
59: log(L) = -22.488
    (16 sec)
60: log(L) = -22.114
    (16 sec)
61: log(L) = -21.751
    (16 sec)
62: log(L) = -21.4
    (16 sec)
63: log(L) = -21.06
    (16 sec)
64: log(L) = -20.732
    (16 sec)
65: log(L) = -20.413
    (16 sec)
66: log(L) = -20.106
    (16 sec)
67: log(L) = -19.808
    (16 sec)
68: log(L) = -19.521
    (16 sec)
69: log(L) = -19.242
    (16 sec)
70: log(L) = -18.974
    (16 sec)
71: log(L) = -18.715
    (16 sec)
72: log(L) = -18.464
    (16 sec)
73: log(L) = -18.223
    (16 sec)
74: log(L) = -17.99
    (16 sec)
75: log(L) = -17.766
    (16 sec)
76: log(L) = -17.55
    (17 sec)
77: log(L) = -17.343
    (17 sec)
78: log(L) = -17.143
    (16 sec)
79: log(L) = -16.952
    (16 sec)
80: log(L) = -16.768
    (16 sec)
81: log(L) = -16.592
    (16 sec)
82: log(L) = -16.424
    (16 sec)
83: log(L) = -16.264
    (17 sec)
84: log(L) = -16.111
    (16 sec)
85: log(L) = -15.965
    (16 sec)
86: log(L) = -15.828
    (16 sec)
87: log(L) = -15.697
    (16 sec)
88: log(L) = -15.574
    (16 sec)
89: log(L) = -15.459
    (16 sec)
90: log(L) = -15.35
    (16 sec)
91: log(L) = -15.25
    (16 sec)
92: log(L) = -15.156
    (16 sec)
93: log(L) = -15.071
    (16 sec)
94: log(L) = -14.992
    (16 sec)
95: log(L) = -14.922
    (16 sec)
96: log(L) = -14.859
    (16 sec)
97: log(L) = -14.803
    (16 sec)
98: log(L) = -14.755
    (16 sec)
99: log(L) = -14.715
    (16 sec)
100: log(L) = -14.682
    (16 sec)
Out[2]:
<astroML.density_estimation.xdeconv.XDGMM at 0x10a4d65d0>

In [14]:
#compute per-object likelihoods under each model: logprob_a returns the
#per-component log probabilities, so summing exp(...) over the components
#(axis=1) gives each object's total likelihood under the GMM
lQSO_sample_lQSO_model = np.sum(np.exp(lQSO_model.logprob_a(noisy_lQSO_train, covmatlist_lQSO_train.transpose())),axis=1)
lQSO_sample_dud_model = np.sum(np.exp(dud_model.logprob_a(noisy_lQSO_train, covmatlist_lQSO_train.transpose())),axis=1)
dud_sample_dud_model = np.sum(np.exp(dud_model.logprob_a(noisy_dud_train, covmatlist_dud_train.transpose())),axis=1)
dud_sample_lQSO_model = np.sum(np.exp(lQSO_model.logprob_a(noisy_dud_train, covmatlist_dud_train.transpose())),axis=1)

lQSO_test_lQSO_model = np.sum(np.exp(lQSO_model.logprob_a(noisy_lQSO_test, covmatlist_lQSO_test.transpose())),axis=1)
lQSO_test_dud_model = np.sum(np.exp(dud_model.logprob_a(noisy_lQSO_test, covmatlist_lQSO_test.transpose())),axis=1)
dud_test_dud_model = np.sum(np.exp(dud_model.logprob_a(noisy_dud_test, covmatlist_dud_test.transpose())),axis=1)
dud_test_lQSO_model = np.sum(np.exp(lQSO_model.logprob_a(noisy_dud_test, covmatlist_dud_test.transpose())),axis=1)
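
Summing exponentiated per-component log probabilities can underflow for objects far from every component; a numerically safer variant (a sketch, assuming scipy's logsumexp is available in this environment) stays in log space until the end:

In [ ]:
from scipy.special import logsumexp #scipy.misc.logsumexp in older scipy releases

def gmm_loglike(model, X, Xerr):
    #total log-likelihood of each object under a fitted XDGMM, without
    #exponentiating the per-component terms individually
    return logsumexp(model.logprob_a(X, Xerr), axis=1)

#e.g.: lQSO_sample_lQSO_model = np.exp(gmm_loglike(lQSO_model, noisy_lQSO_train, covmatlist_lQSO_train.transpose()))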

In [15]:
#training data posterior probabilities via Bayes' rule; the priors are relative
#number densities, so they only need to be correct up to a common factor
lQSO_prior = .06007 #~1800 lensed QSOs over a 30,000 sq. deg. PS1-like survey, per OM10
#TODO: what is the actual magnitude limit of Eric's sample data?
dud_prior = 12300 #objects in the PS1 data sample from Eric; for the ratio to be meaningful this must share units with lQSO_prior
plQSO_lQSO = (lQSO_sample_lQSO_model * lQSO_prior)/(lQSO_sample_lQSO_model * lQSO_prior + lQSO_sample_dud_model * dud_prior)
pdud_lQSO = (lQSO_sample_dud_model * dud_prior) / (lQSO_sample_lQSO_model * lQSO_prior + lQSO_sample_dud_model * dud_prior)
plQSO_dud = (dud_sample_lQSO_model * lQSO_prior) / (dud_sample_lQSO_model * lQSO_prior + dud_sample_dud_model * dud_prior)
pdud_dud =  (dud_sample_dud_model * dud_prior) /  (dud_sample_lQSO_model * lQSO_prior + dud_sample_dud_model * dud_prior)

plQSO_lQSO_test = (lQSO_test_lQSO_model * lQSO_prior)/(lQSO_test_lQSO_model * lQSO_prior + lQSO_test_dud_model * dud_prior)
pdud_lQSO_test = (lQSO_test_dud_model * dud_prior) / (lQSO_test_lQSO_model * lQSO_prior + lQSO_test_dud_model * dud_prior)
plQSO_dud_test = (dud_test_lQSO_model * lQSO_prior) / (dud_test_lQSO_model * lQSO_prior + dud_test_dud_model * dud_prior)
pdud_dud_test =  (dud_test_dud_model * dud_prior) /  (dud_test_lQSO_model * lQSO_prior + dud_test_dud_model * dud_prior)
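
The eight expressions above are all the same two-class Bayes' rule, P(lQSO|x) = L_lQSO(x)*pi_lQSO / (L_lQSO(x)*pi_lQSO + L_dud(x)*pi_dud); a small helper (a sketch, not used elsewhere in this notebook) would remove the repetition:

In [ ]:
def two_class_posterior(like_a, like_b, prior_a, prior_b):
    #Bayes' rule for two exhaustive classes; returns P(a|x), P(b|x)
    evidence = like_a * prior_a + like_b * prior_b
    return like_a * prior_a / evidence, like_b * prior_b / evidence

#e.g.: plQSO_lQSO, pdud_lQSO = two_class_posterior(lQSO_sample_lQSO_model, lQSO_sample_dud_model, lQSO_prior, dud_prior)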

In [6]:
_ = plt.hist(plQSO_lQSO,bins=20)
plt.title('P(lQSO) for true lQSOs')
plt.figure()
_ = plt.hist(pdud_lQSO,bins=20)
plt.title('P(dud) for true lQSOs')
plt.figure()
_ = plt.hist(plQSO_dud,bins=20)
plt.title('P(lQSO) for true duds')
plt.figure()
_ = plt.hist(pdud_dud,bins=20)
plt.title('P(dud) for true duds')


Out[6]:
<matplotlib.text.Text at 0x104afd190>
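
The histograms pile up near 0 and 1; a quick numerical summary (a sketch, with an arbitrary 0.5 threshold chosen purely for illustration):

In [ ]:
threshold = 0.5
print 'fraction of true lQSOs with P(lQSO) > 0.5:', np.mean(plQSO_lQSO > threshold)
print 'fraction of true duds with P(lQSO) > 0.5:', np.mean(plQSO_dud > threshold)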

In [16]:
#quick comparison to random forest on noisified data
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
from sklearn import metrics

trialRF = RandomForestClassifier()
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",2,4],
              'criterion':["gini","entropy"],"min_samples_leaf": [1,2]}

# grid search for the hyperparameters with the best 3-fold CV accuracy
tunedRF = grid_search.GridSearchCV(trialRF, parameters, score_func=metrics.accuracy_score,
                                   n_jobs=-1, cv=3, verbose=1)
noisy_train = np.concatenate((noisy_lQSO_train,noisy_dud_train),axis=0)
noisy_truth = np.concatenate((np.ones(noisy_lQSO_train.shape[0]),np.zeros(noisy_dud_train.shape[0])),axis=0)
noisy_test = np.concatenate((noisy_lQSO_test,noisy_dud_test),axis=0)
noisy_truth_test = np.concatenate((np.ones(noisy_lQSO_test.shape[0]),np.zeros(noisy_dud_test.shape[0])),axis=0)
optRF = tunedRF.fit(noisy_train, noisy_truth)

# print the best score and estimator
print(optRF.best_score_)
print(optRF.best_estimator_)

RF_train_probs = optRF.predict_proba(noisy_train)
RF_test_probs = optRF.predict_proba(noisy_test)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
/Users/mbaumer/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py:466: DeprecationWarning: Passing function as ``score_func`` is deprecated and will be removed in 0.15. Either use strings or score objects.The relevant new parameter is called ''scoring''.
  self.loss_func, self.score_func, self.scoring)
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   17.3s finished
0.881653992395
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion=gini, max_depth=None, max_features=auto,
            min_density=None, min_samples_leaf=2, min_samples_split=2,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)
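
The deprecation warning above comes from this old scikit-learn; in releases from 0.18 on (an assumption about the reader's environment, not what was run here) the equivalent call lives in model_selection and takes a scoring string:

In [ ]:
from sklearn.model_selection import GridSearchCV
tunedRF = GridSearchCV(trialRF, parameters, scoring='accuracy', n_jobs=-1, cv=3, verbose=1)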

In [20]:
from sklearn.metrics import roc_curve
trainTruth = np.concatenate([np.ones(lQSO_train.shape[0]),np.zeros(dud_train.shape[0])])
trainProbs = np.concatenate([plQSO_lQSO,plQSO_dud])
testProbs = np.concatenate([plQSO_lQSO_test,plQSO_dud_test])
fpr, tpr, _ = roc_curve(trainTruth,trainProbs,pos_label=1)
fpr_rf, tpr_rf, _ = roc_curve(trainTruth,RF_train_probs[:,1],pos_label=1)
test_fpr_rf, test_tpr_rf, _ = roc_curve(noisy_truth_test,RF_test_probs[:,1],pos_label=1)
test_fpr, test_tpr, _ = roc_curve(noisy_truth_test,testProbs,pos_label=1)
plt.title('ROC Curve')
plt.plot(fpr,tpr,'b--')
plt.plot(fpr_rf,tpr_rf,'r--')
plt.plot(test_fpr,test_tpr,'b')
plt.plot(test_fpr_rf,test_tpr_rf,'r')
plt.xlabel('FPR')
plt.ylabel('TPR')


Out[20]:
<matplotlib.text.Text at 0x10bb09dd0>
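
With four unlabeled curves the comparison is hard to read; an AUC summary plus a legend (a sketch reusing the arrays computed above) makes it explicit:

In [ ]:
from sklearn.metrics import auc
print 'XD test AUC:', auc(test_fpr, test_tpr)
print 'RF test AUC:', auc(test_fpr_rf, test_tpr_rf)
plt.plot(test_fpr, test_tpr, 'b', label='XD (test)')
plt.plot(test_fpr_rf, test_tpr_rf, 'r', label='RF (test)')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')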

In [3]:
test = fits.getdata('/Users/mbaumer/Downloads/xdcore_000094.fits')

In [8]:
print test['PSTAR']


[  9.99997541e-01   8.95314494e-01   9.99934148e-01   9.99997841e-01
   9.99999673e-01   9.96007379e-01   9.99357509e-01   9.58773115e-01
   9.78032992e-01   9.99999935e-01   9.99992040e-01   9.84752323e-01
   9.99999811e-01   9.82736030e-01   9.97955201e-01   9.68154739e-01
   9.69744004e-01   9.62718035e-01   9.99999638e-01   9.54606012e-01
   9.95664419e-01   9.99735642e-01   9.93665839e-01   9.06727273e-01
   9.96384841e-01   9.92775939e-01   9.98635999e-01   9.74086846e-01
   9.55341178e-01   9.96779529e-01   9.95995741e-01   9.99999520e-01
   9.95541248e-01   9.69512259e-01   9.85131411e-01   9.99144071e-01
   9.98621358e-01   9.63278701e-01   9.95366692e-01   9.96590614e-01
   9.99997675e-01   9.99966208e-01   9.99999029e-01   9.99998551e-01
   9.98420483e-01   9.98211327e-01   7.59943861e-01   9.99642012e-01
   9.99967553e-01   9.99042967e-01   9.99926608e-01   9.97905125e-01
   9.99867681e-01   9.99999283e-01   9.99998483e-01   9.99916176e-01
   9.99982659e-01   9.99982704e-01   9.99938674e-01   9.99448803e-01
   9.85194597e-01   9.99597690e-01   9.99999788e-01   9.85866426e-01
   9.99999593e-01   9.68365597e-01   9.99669780e-01   9.98661863e-01
   9.30375026e-01   9.99993674e-01   9.99844595e-01   9.99998758e-01
   9.99854056e-01   9.99998013e-01   9.99999726e-01   9.85785875e-01
   9.99997676e-01   9.39065233e-01   9.97968333e-01   8.76457046e-01
   9.99420177e-01   9.58320256e-01   9.87131633e-01   9.97924741e-01
   9.41722131e-01   9.89521962e-01   9.42695233e-01   9.98201097e-01
   9.96642257e-01   9.71332477e-01   9.98406131e-01   9.29445004e-01
   9.99368828e-01   9.99998855e-01   9.99999936e-01   9.80593199e-01
   9.91830268e-01   9.99148204e-01   9.34737533e-01   9.99893143e-01
   9.97666168e-01   9.99584944e-01   9.92825999e-01   9.57939532e-01
   9.93771675e-01   9.99999861e-01   5.70500940e-01   9.99953714e-01
   9.04241452e-01   9.97961419e-01   9.50483049e-01   9.99997897e-01
   9.98356604e-01   9.24108100e-01   9.99994345e-01   9.99985468e-01
   9.95713391e-01   9.46698142e-01   9.96787771e-01   9.99552557e-01
   9.99388376e-01   9.99999952e-01   9.97716498e-01   9.99584984e-01
   9.99732994e-01   9.99999350e-01   9.91412855e-01   9.88148854e-01
   9.99976519e-01   9.99996731e-01   9.77218988e-01   9.99996748e-01
   9.83202264e-01   9.85361669e-01   9.99956840e-01   9.99357131e-01
   9.99146813e-01   9.99686271e-01   9.99816246e-01   9.99998200e-01
   9.99989389e-01   9.99996551e-01   9.48961668e-01   9.76336522e-01
   9.99922847e-01   9.69786908e-01   9.99999158e-01   9.99999971e-01
   5.51536499e-02   9.10727088e-01   9.99723265e-01   9.99901978e-01
   9.99361450e-01   6.92338276e-01   9.77033618e-01   9.99674249e-01
   9.99994296e-01   9.83436034e-01   9.79944250e-01   9.99681925e-01
   9.68644192e-01   9.89285007e-01   9.95323837e-01   9.79201141e-01
   9.96231319e-01   9.99998524e-01   9.68127951e-01   9.94286020e-01
   9.99042082e-01   9.90707638e-01   1.10488201e-05   9.99177465e-01
   9.85825618e-01   9.94181713e-01   9.98678199e-01   9.59321831e-01
   9.99949045e-01   9.99977521e-01   9.99973662e-01   9.99999958e-01
   9.99999525e-01   9.70630856e-01   9.82679393e-01   9.99999846e-01
   9.98757394e-01   9.99997405e-01   9.99040703e-01   9.99933065e-01
   9.92785695e-01   9.97929237e-01   9.97481966e-01]

Hmm: from looking at Bovy's XDQSO catalog, it's clear that the XD "density factor" can be greater than one, since it is a probability density rather than a probability. In my analysis of Adri's sims I see values as high as 54!
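
This is expected behavior for a density: a normalized pdf exceeds 1 wherever it is sufficiently narrow. A one-line check (a sketch using scipy.stats):

In [ ]:
from scipy.stats import norm
print norm(loc=0, scale=0.005).pdf(0) #~79.8; a narrow Gaussian's density is far above 1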


In [ ]: