Mine domain #1. Extract subgroups with high concentration of PHAs



In [1]:

    
import pickle
from copy import copy, deepcopy
import numpy as np
import pandas as pd
from sklearn import neighbors, svm
import matplotlib as mpl

# Import Asterion modules
import read_database as rdb
import learn_data as ld
import asterion_learn as al
import visualize_data as vd

# Plotting settings for the current notebook
%matplotlib inline
# font = {'size': 25}
font = {'size': 14}
mpl.rc('font', **font)
plotgrid = np.array([[0.0, 0.0], [1.0, 1.0]])

Load generated and real NEAs from the domain #1



In [2]:

    
# Directory and file-name pieces of the dumped datasets.
# A dump path is assembled as dirpath + dataset name + suffix.
dirpath = './asteroid_data/'
name_suffixes = ['_dom1.p', '_dom1_rest.p']

real_datasets = ['haz_real', 'nohaz_real']
gen_datasets = ['haz_gen', 'nohaz_gen']
# NOTE(review): this list is identical to gen_datasets, so the "genu" variables
# are loaded from the same files as the "gen" ones - confirm this is intended.
genu_datasets = ['haz_gen', 'nohaz_gen']



In [3]:

    
def _dump_paths(datasets):
    """Build dump paths for `datasets`, with the suffix loop outermost.

    Every dataset is listed with the first suffix, then every dataset with
    the second suffix; each path is dirpath + dataset + suffix.
    """
    return [dirpath + ds + ns for ns in name_suffixes for ds in datasets]


dumps_real = _dump_paths(real_datasets)
dumps_gen = _dump_paths(gen_datasets)
dumps_genu = _dump_paths(genu_datasets)

# Each list unpacks as: haz, nohaz ('_dom1.p'), then haz, nohaz ('_dom1_rest.p').
(haz_real, nohaz_real,
 haz_real_rest, nohaz_real_rest) = map(rdb.loadObject, dumps_real)
(haz_gen, nohaz_gen,
 haz_gen_rest, nohaz_gen_rest) = map(rdb.loadObject, dumps_gen)
(haz_genu, nohaz_genu,
 haz_genu_rest, nohaz_genu_rest) = map(rdb.loadObject, dumps_genu)



In [4]:

    
gen_num = sum(map(len, [haz_gen, nohaz_gen]))
real_num = sum(map(len, [haz_real, nohaz_real]))

print "Number of virtual asteroids in the domain:", gen_num
print "Number of real asteroids in the domain:", real_num









    



Number of virtual asteroids in the domain: 76944
Number of real asteroids in the domain: 4336

Investigate distributions of orbital parameters for the domain #1



In [5]:

    
# vd.plot_alldistcombs(haz_gen, nohaz_gen, labels=True)

Cut off non-hazardous asteroids with high values of q



In [6]:

    
# Threshold on q used below to split the domain.
q_split = 1.066
cutcol = ['w', 'q']

# End points of the cut line drawn over the w-q distribution
# (presumably given as [w, q] pairs matching cutcol - confirm in vd).
p1 = [0.0, q_split]
p2 = [360.0, q_split]
vd.plot_distributions2d(
    cutcol, haz_gen, nohaz_gen,
    line=[p1, p2],
    invertaxes=[0, 1],
    labels=True,
)









    



/usr/lib/pymodules/python2.7/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':



In [7]:

    
def _split_at_q(dataset):
    """Split `dataset` on column 'q' at q_split with ld.split_by_colval.

    Returns the pair produced by ld.split_by_colval (presumably the part below
    the threshold first, then the part above it - confirm in learn_data).
    """
    return ld.split_by_colval(dataset, 'q', q_split)


# The "__" suffix marks the second part of every split.
haz_gen_dom1, haz_gen_dom1__ = _split_at_q(haz_gen)
nohaz_gen_dom1, nohaz_gen_dom1__ = _split_at_q(nohaz_gen)

haz_genu_dom1, haz_genu_dom1__ = _split_at_q(haz_genu)
nohaz_genu_dom1, nohaz_genu_dom1__ = _split_at_q(nohaz_genu)

haz_real_dom1, haz_real_dom1__ = _split_at_q(haz_real)
nohaz_real_dom1, nohaz_real_dom1__ = _split_at_q(nohaz_real)



In [8]:

    
gen_dom1_num = sum(map(len, [haz_gen_dom1, nohaz_gen_dom1]))
real_dom1_num = sum(map(len, [haz_real_dom1, nohaz_real_dom1]))
print gen_dom1_num
print real_dom1_num



In [ ]:

Atiras & Atens



In [9]:

    
# Accumulators for the Atiras & Atens stage: PHAs captured in the outlined
# PHA regions and NHAs that ended up inside those regions.
haz_gen_extracted_aa, nohaz_gen_trapped_aa = [], []
haz_real_extracted_aa, nohaz_real_trapped_aa = [], []

Atiras



In [10]:

    
haz_gen_atiras, haz_gen_atiras_num = rdb.get_atiras(haz_gen_dom1)
nohaz_gen_atiras, nohaz_gen_atiras_num = rdb.get_atiras(nohaz_gen_dom1)
atiras_gen_num = haz_gen_atiras_num + nohaz_gen_atiras_num

haz_real_atiras, haz_real_atiras_num = rdb.get_atiras(haz_real_dom1)
nohaz_real_atiras, nohaz_real_atiras_num = rdb.get_atiras(nohaz_real_dom1)
atiras_real_num = haz_real_atiras_num + nohaz_real_atiras_num

print "Number of virtual Atiras:", atiras_gen_num
print "Number of real Atiras:", atiras_real_num









    



Number of virtual Atiras: 554
Number of real Atiras: 0

Atens



In [11]:

    
haz_gen_atens, haz_gen_atens_num = rdb.get_atens(haz_gen_dom1)
nohaz_gen_atens, nohaz_gen_atens_num = rdb.get_atens(nohaz_gen_dom1)
atens_gen_num = haz_gen_atens_num + nohaz_gen_atens_num

haz_real_atens, haz_real_atens_num = rdb.get_atens(haz_real_dom1)
nohaz_real_atens, nohaz_real_atens_num = rdb.get_atens(nohaz_real_dom1)
atens_real_num = haz_real_atens_num + nohaz_real_atens_num

print "Number of virtual Atens:", atens_gen_num
print "Number of real Atens:", atens_real_num









    



Number of virtual Atens: 1053
Number of real Atens: 39

Atiras + Atens



In [12]:

    
# Merge the Atira and Aten subsets into one group per class (Atiras first).
haz_gen_atiras_atens = pd.concat([haz_gen_atiras, haz_gen_atens])
nohaz_gen_atiras_atens = pd.concat([nohaz_gen_atiras, nohaz_gen_atens])
haz_real_atiras_atens = pd.concat([haz_real_atiras, haz_real_atens])
nohaz_real_atiras_atens = pd.concat([nohaz_real_atiras, nohaz_real_atens])

# Row counts of the merged virtual groups and their total.
haz_gen_atiras_atens_num = len(haz_gen_atiras_atens)
nohaz_gen_atiras_atens_num = len(nohaz_gen_atiras_atens)
atiras_atens_gen_num = haz_gen_atiras_atens_num + nohaz_gen_atiras_atens_num

# Row counts of the merged real groups and their total.
haz_real_atiras_atens_num = len(haz_real_atiras_atens)
nohaz_real_atiras_atens_num = len(nohaz_real_atiras_atens)
atiras_atens_real_num = haz_real_atiras_atens_num + nohaz_real_atiras_atens_num



In [13]:

    
print "Number of virtual PHAs in the group:", haz_gen_atiras_atens_num
print "Number of virtual NHAs in the group:", nohaz_gen_atiras_atens_num
print "Number of virtual Atiras and Atens:", atiras_atens_gen_num
print "Virtual Atiras and Atens group weight:", float(atiras_atens_gen_num)/gen_dom1_num









    



Number of virtual PHAs in the group: 1053
Number of virtual NHAs in the group: 554
Number of virtual Atiras and Atens: 1607
Virtual Atiras and Atens group weight: 0.0225645202056



In [14]:

    
print "Number of real PHAs in the group:", haz_real_atiras_atens_num
print "Number of real NHAs in the group:", nohaz_real_atiras_atens_num
print "Number of real Atiras and Atens:", atiras_atens_real_num
print "Real Atiras and Atens group weight:", float(atiras_atens_real_num)/real_dom1_num









    



Number of real PHAs in the group: 37
Number of real NHAs in the group: 2
Number of real Atiras and Atens: 39
Real Atiras and Atens group weight: 0.00986592461422

Plot distributions of 'a' and 'i' parameters



In [15]:

    
# 2D distribution of the virtual Atiras & Atens over the 'a' and 'i' columns.
cutcol = ['a', 'i']
vd.plot_distributions2d(
    cutcol,
    haz_gen_atiras_atens,
    nohaz_gen_atiras_atens,
    labels=True,
)

Separate PHAs from NHAs by the LinearSVC

Cut a and i columns and normalize datasets



In [16]:

    
# Cut the 'a' and 'i' columns from the virtual and real groups.
# ld.cut_normalize returns the processed (haz, nohaz) pairs in argument order
# together with a second value kept as atiras_atens_ai_sc and later passed to
# the plotting routine as `scales`.
cutcol = ['a', 'i']
virtual_group = [haz_gen_atiras_atens, nohaz_gen_atiras_atens]
real_group = [haz_real_atiras_atens, nohaz_real_atiras_atens]
pairs, atiras_atens_ai_sc = ld.cut_normalize(cutcol, virtual_group, real_group)

(haz_gen_cut, nohaz_gen_cut), (haz_real_cut, nohaz_real_cut) = pairs[0], pairs[1]

Find decision surface with SVM



In [17]:

    
# Build the training set from the cut virtual PHAs and NHAs, then fit a
# linear SVM on it (fit returns the fitted estimator itself).
xtrain, ytrain = ld.mix_up(haz_gen_cut, nohaz_gen_cut)
clf_atiras_atens_ai = svm.LinearSVC().fit(xtrain, ytrain)



In [18]:

    
# reload(al)
# clf_aa_ai = svm.LinearSVC()
# splitres = al.split_by_clf(clf_aa_ai, cutcol, haz_gen_atiras_atens, 
#                                               nohaz_gen_atiras_atens)
# haz_gen_aa_ai, nohaz_gen_aa_ai = splitres[0]
# haz_gen_aa_ai__, nohaz_gen_aa_ai__ = splitres[1]
# aa_ai_sc = splitres[2]

Estimate split quality for virtual Atiras & Atens



In [19]:

    
# Evaluate the linear classifier on the virtual Atiras & Atens; the call also
# prints the purity statistics of the split. As in the Apollo and Amor stages
# of this notebook, entries 0 and 2 of the result index the hazardous frame
# and entries 1 and 3 index the non-hazardous frame.
predicted_gen = al.clf_split_quality(clf_atiras_atens_ai, haz_gen_cut, nohaz_gen_cut)

# Objects inside the predicted PHA region.
haz_gen_atiras_atens_ai = haz_gen_atiras_atens.iloc[predicted_gen[0]]
# Fix: NHA rows must be selected from the non-hazardous frame. The original
# sliced haz_gen_atiras_atens here, which kept the counts right but stored
# hazardous rows under the nohaz_* names.
nohaz_gen_atiras_atens_ai = nohaz_gen_atiras_atens.iloc[predicted_gen[1]]

# Objects left in the predicted NHA region (names with the trailing "__").
haz_gen_atiras_atens_ai__ = haz_gen_atiras_atens.iloc[predicted_gen[2]]
nohaz_gen_atiras_atens_ai__ = nohaz_gen_atiras_atens.iloc[predicted_gen[3]]









    



purity of PHA region: 0.923357664234
number of PHAs in the PHA region: 1012
number of NHAs in the PHA region: 84

purity of NHA region: 0.919765166341
number of PHAs in the NHA region: 41
number of NHAs in the NHA region: 470

fraction of correctly classified PHAs: 0.96106362773

Estimate split quality for real Atiras & Atens



In [20]:

    
# Evaluate the linear classifier on the real Atiras & Atens; the call also
# prints the purity statistics of the split. As in the Apollo and Amor stages
# of this notebook, entries 0 and 2 of the result index the hazardous frame
# and entries 1 and 3 index the non-hazardous frame.
predicted_real = al.clf_split_quality(clf_atiras_atens_ai, haz_real_cut, nohaz_real_cut)

# Objects inside the predicted PHA region.
haz_real_atiras_atens_ai = haz_real_atiras_atens.iloc[predicted_real[0]]
# Fix: NHA rows must be selected from the non-hazardous frame. The original
# sliced haz_real_atiras_atens here, which kept the counts right but stored
# hazardous rows under the nohaz_* names.
nohaz_real_atiras_atens_ai = nohaz_real_atiras_atens.iloc[predicted_real[1]]

# Objects left in the predicted NHA region (names with the trailing "__").
haz_real_atiras_atens_ai__ = haz_real_atiras_atens.iloc[predicted_real[2]]
nohaz_real_atiras_atens_ai__ = nohaz_real_atiras_atens.iloc[predicted_real[3]]









    



purity of PHA region: 1.0
number of PHAs in the PHA region: 36
number of NHAs in the PHA region: 0

purity of NHA region: 0.666666666667
number of PHAs in the NHA region: 1
number of NHAs in the NHA region: 2

fraction of correctly classified PHAs: 0.972972972973

Plot decision boundary



In [21]:

    
# Draw the decision boundary of the a-i classifier over the cut virtual data.
vd.plot_clf2d(
    clf_atiras_atens_ai, cutcol,
    haz_cut=haz_gen_cut,
    nohaz_cut=nohaz_gen_cut,
    num=400,
    scales=atiras_atens_ai_sc,
    cmap='winter',
    figsize=(8, 8),
)



In [22]:

    
# Store the PHA-region subsets of this stage in the Atiras & Atens accumulators.
# NOTE: appending is not idempotent - re-running this cell adds the same
# subsets a second time.
for collected, subset in ((haz_gen_extracted_aa, haz_gen_atiras_atens_ai),
                          (nohaz_gen_trapped_aa, nohaz_gen_atiras_atens_ai),
                          (haz_real_extracted_aa, haz_real_atiras_atens_ai),
                          (nohaz_real_trapped_aa, nohaz_real_atiras_atens_ai)):
    collected.append(subset)

Atiras & Atens divisions quality

Divisions quality for virtual Atiras & Atens



In [23]:

    
# Print the classification summary (counts, mass fractions, purity) of the
# virtual Atiras & Atens stage relative to the whole virtual group.
vd.print_summary(
    haz_gen_extracted_aa,
    nohaz_gen_trapped_aa,
    haz_gen_atiras_atens,
    nohaz_gen_atiras_atens,
    'virtual',
)









    



Number of correctly classified virtual PHAs 1012
Number of trapped virtual NHAs: 84

Mass fraction of correctly classified virtual PHAs: 0.96106362773
Mass fraction of trapped virtual NHAs: 0.151624548736

Cummulative purity of the outlined PHA regions: 0.923357664234

Divisions quality for real Atiras & Atens



In [24]:

    
# Print the classification summary (counts, mass fractions, purity) of the
# real Atiras & Atens stage relative to the whole real group.
vd.print_summary(
    haz_real_extracted_aa,
    nohaz_real_trapped_aa,
    haz_real_atiras_atens,
    nohaz_real_atiras_atens,
    'real',
)









    



Number of correctly classified real PHAs 36
Number of trapped real NHAs: 0

Mass fraction of correctly classified real PHAs: 0.972972972973
Mass fraction of trapped real NHAs: 0.0

Cummulative purity of the outlined PHA regions: 1.0

Apollos



In [25]:

    
# Accumulators for the Apollo stages: PHAs captured in the outlined PHA
# regions and NHAs that ended up inside those regions.
haz_gen_extracted_ap, nohaz_gen_trapped_ap = [], []
haz_real_extracted_ap, nohaz_real_trapped_ap = [], []



In [26]:

    
# rdb.get_apollos yields a (subset, count) pair for each in-domain population.
haz_gen_apollo, haz_gen_apollo_num = rdb.get_apollos(haz_gen_dom1)
nohaz_gen_apollo, nohaz_gen_apollo_num = rdb.get_apollos(nohaz_gen_dom1)
haz_real_apollo, haz_real_apollo_num = rdb.get_apollos(haz_real_dom1)
nohaz_real_apollo, nohaz_real_apollo_num = rdb.get_apollos(nohaz_real_dom1)

# Group totals: hazardous plus non-hazardous members.
apollo_gen_num = haz_gen_apollo_num + nohaz_gen_apollo_num
apollo_real_num = haz_real_apollo_num + nohaz_real_apollo_num



In [27]:

    
print "Number of virtual PHAs in the group:", haz_gen_apollo_num
print "Number of virtual NHAs in the group:", nohaz_gen_apollo_num
print "Number of virtual Apollo:", apollo_gen_num
print "Apollo group weight:", float(apollo_gen_num)/gen_dom1_num









    



Number of virtual PHAs in the group: 21977
Number of virtual NHAs in the group: 22929
Number of virtual Apollo: 44906
Apollo group weight: 0.630542840293



In [28]:

    
print "Number of real PHAs in the group:", haz_real_apollo_num
print "Number of real NHAs in the group:", nohaz_real_apollo_num
print "Number of real Apollo:", apollo_real_num
print "Apollo group weight:", float(apollo_real_num)/real_dom1_num









    



Number of real PHAs in the group: 1708
Number of real NHAs in the group: 1014
Number of real Apollo: 2722
Apollo group weight: 0.688590943587



In [29]:

    
# vd.display_allparams([apollos_haz, apollos_nohaz], vd.combs, vd.colnames)

Split Apollos by a w-q-i surface

Amplify datasets by their symmetric copies over the 'w' parameter



In [30]:

    
# Single-mirror variant kept for reference:
# haz_gen_apollo_se = ld.add_mirror_column(haz_gen_apollo, 'w', 180.0)
# nohaz_gen_apollo_se = ld.add_mirror_column(nohaz_gen_apollo, 'w', 180.0)

# Augment both virtual Apollo sets with their double-mirrored copies over 'w'
# (third argument 180.0 is passed through to ld.add_doublemirror_column).
haz_gen_apollo_se, nohaz_gen_apollo_se = (
    ld.add_doublemirror_column(haz_gen_apollo, 'w', 180.0),
    ld.add_doublemirror_column(nohaz_gen_apollo, 'w', 180.0),
)



In [31]:

    
# 2D distribution of the augmented virtual Apollos over the 'w' and 'q' columns.
cutcol = ['w', 'q']
vd.plot_distributions2d(
    cutcol,
    haz_gen_apollo_se,
    nohaz_gen_apollo_se,
    invertaxes=[0, 1],
    labels=True,
)

Cut off annoying tips



In [32]:

    
# q threshold separating the "big q" (_bq) rows from the "small q" (_sq) rows.
q_tip = 0.7

# Rows with q above the threshold.
haz_gen_apollo_bq = haz_gen_apollo[haz_gen_apollo['q'] > q_tip]
nohaz_gen_apollo_bq = nohaz_gen_apollo[nohaz_gen_apollo['q'] > q_tip]

# Rows with q at or below the threshold.
haz_gen_apollo_sq = haz_gen_apollo[haz_gen_apollo['q'] <= q_tip]
nohaz_gen_apollo_sq = nohaz_gen_apollo[nohaz_gen_apollo['q'] <= q_tip]

# The same "big q" selection applied to the mirror-augmented sets.
haz_gen_apollo_se_bq = haz_gen_apollo_se[haz_gen_apollo_se['q'] > q_tip]
nohaz_gen_apollo_se_bq = nohaz_gen_apollo_se[nohaz_gen_apollo_se['q'] > q_tip]

Cut w, q and i columns and normalize datasets



In [33]:

    
# Cut the 'w', 'q' and 'i' columns from three groups: the virtual Apollos, the
# real Apollos and the augmented "big q" virtual set used for training.
# ld.cut_normalize returns the processed (haz, nohaz) pairs in argument order
# plus a second value kept as apollo_wqi_sc and later passed as `scales`.
cutcol = ['w', 'q', 'i']
virtual_group = [haz_gen_apollo, nohaz_gen_apollo]
real_group = [haz_real_apollo, nohaz_real_apollo]
training_group = [haz_gen_apollo_se_bq, nohaz_gen_apollo_se_bq]
pairs, apollo_wqi_sc = ld.cut_normalize(cutcol, virtual_group, real_group, training_group)

(haz_gen_cut, nohaz_gen_cut), (haz_real_cut, nohaz_real_cut) = pairs[0], pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM



In [34]:

    
# Build the training set from the augmented cut data, then fit the SVM
# (fit returns the fitted estimator itself).
# Alternative setting tried earlier:
# clf_apollo_wqi = svm.SVC(gamma=35.0, C=0.05, class_weight={0: 1.2})
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)
clf_apollo_wqi = svm.SVC(gamma=40.0, C=0.05, class_weight={0: 1.2}).fit(xtrain, ytrain)



In [35]:

    
# # reload(al)
# cutcol = ['w', 'q', 'i']

# # clf_apollo_wqi3 = neighbors.KNeighborsClassifier()
# # splitres = al.split_by_clf(clf_apollo_wqi3, cutcol,  haz_gen_apollo_se_bq,
# #                                                      nohaz_gen_apollo_se_bq,
# #                                                      haz_gen_apollo,
# #                                                      nohaz_gen_apollo)



# clf_apollo_wqi3 = svm.SVC(gamma=40.0, C=0.05, class_weight={0: 1.2}) #class_weight={0: 1.5} 
# # clf_apollo_wqi3 = svm.NuSVC(gamma=10.0, class_weight={0: 1.5}) #class_weight={0: 1.5}
# #(20 0.5), (30 0.1) (5 0.01)
# splitres = al.split_by_clf(clf_apollo_wqi3, cutcol,  haz_gen_apollo_se_bq,
#                                                      nohaz_gen_apollo_se_bq,
#                                                      haz_gen_apollo,
#                                                      nohaz_gen_apollo)

# haz_gen_apollo_wqi3, nohaz_gen_apollo_wqi3 = splitres[0]
# haz_gen_apollo_wqi3__, nohaz_gen_apollo_wqi3__ = splitres[1]
# apollo_wqi3_sc = splitres[2]
# clf_apollo_wqi = clf_apollo_wqi3

Estimate split quality for virtual Apollos



In [36]:

    
# Evaluate the w-q-i classifier on the virtual Apollos; the call also prints
# the purity statistics. Entries 0/2 of the result index the hazardous frame,
# entries 1/3 the non-hazardous frame.
predicted_gen = al.clf_split_quality(clf_apollo_wqi, haz_gen_cut, nohaz_gen_cut)

# Objects inside the predicted PHA region.
haz_gen_apollo_wqi, nohaz_gen_apollo_wqi = (
    haz_gen_apollo.iloc[predicted_gen[0]],
    nohaz_gen_apollo.iloc[predicted_gen[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_gen_apollo_wqi__, nohaz_gen_apollo_wqi__ = (
    haz_gen_apollo.iloc[predicted_gen[2]],
    nohaz_gen_apollo.iloc[predicted_gen[3]],
)









    



purity of PHA region: 0.91442363188
number of PHAs in the PHA region: 17428
number of NHAs in the PHA region: 1631

purity of NHA region: 0.824002785623
number of PHAs in the NHA region: 4549
number of NHAs in the NHA region: 21298

fraction of correctly classified PHAs: 0.793010875006

Estimate split quality for real Apollos



In [37]:

    
# Evaluate the w-q-i classifier on the real Apollos; the call also prints the
# purity statistics. Entries 0/2 of the result index the hazardous frame,
# entries 1/3 the non-hazardous frame.
predicted_real = al.clf_split_quality(clf_apollo_wqi, haz_real_cut, nohaz_real_cut)

# Objects inside the predicted PHA region.
haz_real_apollo_wqi, nohaz_real_apollo_wqi = (
    haz_real_apollo.iloc[predicted_real[0]],
    nohaz_real_apollo.iloc[predicted_real[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_real_apollo_wqi__, nohaz_real_apollo_wqi__ = (
    haz_real_apollo.iloc[predicted_real[2]],
    nohaz_real_apollo.iloc[predicted_real[3]],
)









    



purity of PHA region: 0.938098747237
number of PHAs in the PHA region: 1273
number of NHAs in the PHA region: 84

purity of NHA region: 0.681318681319
number of PHAs in the NHA region: 435
number of NHAs in the NHA region: 930

fraction of correctly classified PHAs: 0.745316159251



In [ ]:

Prepare w-q domain mask to exclude out-of-domain points from the plot



In [38]:

    
# In-domain and out-of-domain samples used to fit the w-q domain mask.
# NOTE(review): `genu` is built from haz_gen/nohaz_gen while `genu_rest` uses
# the haz_genu_rest/nohaz_genu_rest frames - confirm the mix is intended.
genu = pd.concat([haz_gen, nohaz_gen])
genu_rest = pd.concat([haz_genu_rest, nohaz_genu_rest])

# First two entries of the w-q-i scales, i.e. those of 'w' and 'q'.
apollo_wq_sc = apollo_wqi_sc[:2]



In [39]:

    
# Fit the domain-mask classifier on the 'w' and 'q' columns; a fresh SVC is
# handed to al.sgmask_clf2d_fit and the returned classifier is kept.
cutcol_ = ['w', 'q']
clfmask = al.sgmask_clf2d_fit(
    svm.SVC(gamma=10.0, C=1e3),  # class_weight={1: 2}
    cutcol_, genu, genu_rest, apollo_wq_sc,
)



In [40]:

    
# Show the region selected by the w-q domain mask.
vd.plot_clf2d(
    clfmask, cutcol_,
    num=250,
    figsize=(6, 6),
    scales=apollo_wq_sc,
    labels=True,
    cmap='Blues',
    invertaxes=[0, 1],
)



In [41]:

    
# cutcol_ = ['w', 'q']
# # labels = [vd.colnames[nm] for nm in cutcol]

# clfmask = svm.SVC(gamma=10.0, C=1e3) # class_weight={1: 2}
# clfmask = al.sgmask_clf(haz_gen_apollo_se, nohaz_gen_apollo_se, 
#                         haz_genu_rest, nohaz_genu_rest, clfmask, cutcol_)

# # clfmask = al.sgmask_clf(haz_gen_apollo, nohaz_gen_apollo, 
# #                         haz_genu_rest, nohaz_genu_rest, clfmask, cutcol)



In [42]:

    
# plotgrid = np.array([[0.0, 0.0], [1.0, 1.0]])
# scales = ld.dfcommon_bounds([haz_gen_apollo_se, nohaz_gen_apollo_se], cutcol_)
# # vd.plot_classifier(plotgrid, clfmask, num=200, figsize=(6,6), scales=scales, 
# #                    labels = labels, cmap='Blues', invertaxes=[0, 1])

# vd.plot_clf2d(clfmask, cutcol_, num=200, figsize=(6,6), scales=scales, 
#                    labels=True, cmap='Blues', invertaxes=[0, 1])



In [ ]:

Plot decision surface



In [43]:

    
# Set the column list explicitly: clf_apollo_wqi and apollo_wqi_sc were built
# for the 'w', 'q', 'i' columns, and relying on whatever value `cutcol` was
# left with by an earlier cell breaks when cells are run out of order.
cutcol = ['w', 'q', 'i']
# Plot the w-q-i decision surface, masking points outside the w-q domain.
vd.plot_clf3d(clf_apollo_wqi, cutcol, num=250, labels=True, figsize=(10,9), mode='2d', 
              scales=apollo_wqi_sc, clf_masks=[(clfmask, 0)], invertaxes=[0, 1])



In [44]:

    
# Store the PHA-region subsets of the w-q-i stage in the Apollo accumulators.
# NOTE: appending is not idempotent - re-running this cell adds the same
# subsets a second time.
for collected, subset in ((haz_gen_extracted_ap, haz_gen_apollo_wqi),
                          (nohaz_gen_trapped_ap, nohaz_gen_apollo_wqi),
                          (haz_real_extracted_ap, haz_real_apollo_wqi),
                          (nohaz_real_trapped_ap, nohaz_real_apollo_wqi)):
    collected.append(subset)

Split rest of Apollos by a w-q-a surface

Amplify datasets by their symmetric copies over the 'w' parameter



In [45]:

    
# Augment the Apollos left in the NHA region of the w-q-i split with their
# double-mirrored copies over 'w'.
haz_gen_apollo_wqi__se, nohaz_gen_apollo_wqi__se = (
    ld.add_doublemirror_column(haz_gen_apollo_wqi__, 'w', 180.0),
    ld.add_doublemirror_column(nohaz_gen_apollo_wqi__, 'w', 180.0),
)

Cut w, q and a columns and normalize datasets



In [46]:

    
# reload(ld)
# Cut the 'w', 'q' and 'a' columns from the remaining virtual Apollos, the
# remaining real Apollos and the augmented virtual set used for training.
# ld.cut_normalize returns the processed (haz, nohaz) pairs in argument order
# plus a second value kept as apollo_wqa_sc and later passed as `scales`.
cutcol = ['w', 'q', 'a']
virtual_group = [haz_gen_apollo_wqi__, nohaz_gen_apollo_wqi__]
real_group = [haz_real_apollo_wqi__, nohaz_real_apollo_wqi__]
training_group = [haz_gen_apollo_wqi__se, nohaz_gen_apollo_wqi__se]
pairs, apollo_wqa_sc = ld.cut_normalize(cutcol, virtual_group, real_group, training_group)

(haz_gen_cut, nohaz_gen_cut), (haz_real_cut, nohaz_real_cut) = pairs[0], pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM



In [47]:

    
# Build the training set from the augmented cut data, then fit the SVM
# (fit returns the fitted estimator itself).
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)
clf_apollo_wqa = svm.SVC(gamma=40.0, C=0.1, class_weight={0: 1.5}).fit(xtrain, ytrain)



In [48]:

    
# # reload(al)
# cutcol = ['w', 'q', 'a']
# clf_apollo_wqa3 = svm.SVC(gamma=40.0, C=0.1, class_weight={0: 1.5}) #class_weight={0: 1.5}
# # (30 0.5)
# splitres = al.split_by_clf(clf_apollo_wqa3, cutcol,  haz_gen_apollo_wqi3__se,
#                                                      nohaz_gen_apollo_wqi3__se,
#                                                      haz_gen_apollo_wqi3__,
#                                                      nohaz_gen_apollo_wqi3__)

# haz_gen_apollo_wqa3, nohaz_gen_apollo_wqa3 = splitres[0]
# haz_gen_apollo_wqa3__, nohaz_gen_apollo_wqa3__ = splitres[1]
# apollo_wqa3_sc = splitres[2]

Estimate split quality for virtual Apollos



In [49]:

    
# Evaluate the w-q-a classifier on the virtual Apollos left after the w-q-i
# split; the call also prints the purity statistics. Entries 0/2 of the result
# index the hazardous frame, entries 1/3 the non-hazardous frame.
predicted_gen = al.clf_split_quality(clf_apollo_wqa, haz_gen_cut, nohaz_gen_cut)

# Objects inside the predicted PHA region.
haz_gen_apollo_wqa, nohaz_gen_apollo_wqa = (
    haz_gen_apollo_wqi__.iloc[predicted_gen[0]],
    nohaz_gen_apollo_wqi__.iloc[predicted_gen[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_gen_apollo_wqa__, nohaz_gen_apollo_wqa__ = (
    haz_gen_apollo_wqi__.iloc[predicted_gen[2]],
    nohaz_gen_apollo_wqi__.iloc[predicted_gen[3]],
)









    



purity of PHA region: 0.909785483397
number of PHAs in the PHA region: 3096
number of NHAs in the PHA region: 307

purity of NHA region: 0.935261094279
number of PHAs in the NHA region: 1453
number of NHAs in the NHA region: 20991

fraction of correctly classified PHAs: 0.68058914047

Estimate split quality for real Apollos



In [50]:

    
# Evaluate the w-q-a classifier on the real Apollos left after the w-q-i
# split; the call also prints the purity statistics. Entries 0/2 of the result
# index the hazardous frame, entries 1/3 the non-hazardous frame.
predicted_real = al.clf_split_quality(clf_apollo_wqa, haz_real_cut, nohaz_real_cut)

# Objects inside the predicted PHA region.
haz_real_apollo_wqa, nohaz_real_apollo_wqa = (
    haz_real_apollo_wqi__.iloc[predicted_real[0]],
    nohaz_real_apollo_wqi__.iloc[predicted_real[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_real_apollo_wqa__, nohaz_real_apollo_wqa__ = (
    haz_real_apollo_wqi__.iloc[predicted_real[2]],
    nohaz_real_apollo_wqi__.iloc[predicted_real[3]],
)









    



purity of PHA region: 0.909090909091
number of PHAs in the PHA region: 360
number of NHAs in the PHA region: 36

purity of NHA region: 0.922600619195
number of PHAs in the NHA region: 75
number of NHAs in the NHA region: 894

fraction of correctly classified PHAs: 0.827586206897



In [51]:

    
# clf_masks = [(clfmask, 0)]
# Set the column list explicitly: clf_apollo_wqa and apollo_wqa_sc were built
# for the 'w', 'q', 'a' columns, and relying on whatever value `cutcol` was
# left with by an earlier cell breaks when cells are run out of order.
cutcol = ['w', 'q', 'a']
# labels = [vd.colnames[nm] for nm in cutcol]
# Plot the w-q-a decision surface, masking points outside the w-q domain.
vd.plot_clf3d(clf_apollo_wqa, cutcol, num=250, labels=True, figsize=(10,9), mode='2d', 
              scales=apollo_wqa_sc, clf_masks=[(clfmask, 0)], invertaxes=[0, 1])



In [52]:

    
# Store the PHA-region subsets of the w-q-a stage in the Apollo accumulators.
# NOTE: appending is not idempotent - re-running this cell adds the same
# subsets a second time.
for collected, subset in ((haz_gen_extracted_ap, haz_gen_apollo_wqa),
                          (nohaz_gen_trapped_ap, nohaz_gen_apollo_wqa),
                          (haz_real_extracted_ap, haz_real_apollo_wqa),
                          (nohaz_real_trapped_ap, nohaz_real_apollo_wqa)):
    collected.append(subset)

Apollo divisions quality

Divisions quality for virtual Apollos



In [53]:

    
# Print the combined summary (counts, mass fractions, purity) of both Apollo
# stages relative to the whole virtual Apollo group.
vd.print_summary(
    haz_gen_extracted_ap,
    nohaz_gen_trapped_ap,
    haz_gen_apollo,
    nohaz_gen_apollo,
    'virtual',
)









    



Number of correctly classified virtual PHAs 20524
Number of trapped virtual NHAs: 1938

Mass fraction of correctly classified virtual PHAs: 0.933885425672
Mass fraction of trapped virtual NHAs: 0.0845217846395

Cummulative purity of the outlined PHA regions: 0.913720950939

Divisions quality for real Apollos



In [54]:

    
# Print the combined summary (counts, mass fractions, purity) of both Apollo
# stages relative to the whole real Apollo group.
vd.print_summary(
    haz_real_extracted_ap,
    nohaz_real_trapped_ap,
    haz_real_apollo,
    nohaz_real_apollo,
    'real',
)









    



Number of correctly classified real PHAs 1633
Number of trapped real NHAs: 120

Mass fraction of correctly classified real PHAs: 0.956088992974
Mass fraction of trapped real NHAs: 0.118343195266

Cummulative purity of the outlined PHA regions: 0.931545921278



In [ ]:

Amors



In [55]:

    
# Accumulators for the Amor stages: PHAs captured in the outlined PHA regions
# and NHAs that ended up inside those regions.
haz_gen_extracted_am, nohaz_gen_trapped_am = [], []
haz_real_extracted_am, nohaz_real_trapped_am = [], []



In [56]:

    
# rdb.get_amors yields a (subset, count) pair for each in-domain population.
haz_gen_amor, haz_gen_amors_num = rdb.get_amors(haz_gen_dom1)
nohaz_gen_amor, nohaz_gen_amors_num = rdb.get_amors(nohaz_gen_dom1)
haz_real_amor, haz_real_amors_num = rdb.get_amors(haz_real_dom1)
nohaz_real_amor, nohaz_real_amors_num = rdb.get_amors(nohaz_real_dom1)

# Group totals: hazardous plus non-hazardous members.
amors_gen_num = haz_gen_amors_num + nohaz_gen_amors_num
amors_real_num = haz_real_amors_num + nohaz_real_amors_num



In [57]:

    
print "Number of virtual PHAs in the group:", haz_gen_amors_num
print "Number of virtual NHAs in the group:", nohaz_gen_amors_num
print "Number of virtual Amor:", amors_gen_num
print "Amor group weight:", float(amors_gen_num)/gen_dom1_num









    



Number of virtual PHAs in the group: 4092
Number of virtual NHAs in the group: 20613
Number of virtual Amor: 24705
Amor group weight: 0.346892639501



In [58]:

    
print "Number of real PHAs in the group:", haz_real_amors_num
print "Number of real NHAs in the group:", nohaz_real_amors_num
print "Number of real Amor:", amors_real_num
print "Amor group weight:", float(amors_real_num)/real_dom1_num









    



Number of real PHAs in the group: 247
Number of real NHAs in the group: 945
Number of real Amor: 1192
Amor group weight: 0.301543131799

Cut off non-hazardous areas by a w-i surface



In [59]:

    
# Augment both virtual Amor sets with their double-mirrored copies over 'w'.
haz_gen_amor_se, nohaz_gen_amor_se = (
    ld.add_doublemirror_column(haz_gen_amor, 'w', 180.0),
    ld.add_doublemirror_column(nohaz_gen_amor, 'w', 180.0),
)



In [60]:

    
# Extension factor handed to ld.extend_by_copies for the 'w' column; the same
# value is reused later as the first entry of `extend_factors` when plotting.
ef = 0.25
haz_gen_amor_see, nohaz_gen_amor_see = (
    ld.extend_by_copies(haz_gen_amor_se, 'w', extend_factor=ef),
    ld.extend_by_copies(nohaz_gen_amor_se, 'w', extend_factor=ef),
)



In [61]:

    
# 2D distribution of the extended virtual Amors over the 'w' and 'i' columns.
cutcol = ['w', 'i']
# labels = [vd.colnames[nm] for nm in cutcol]
# vd.display_param2d(cutcol, labels, [haz_gen_amor_see, nohaz_gen_amor_see])
vd.plot_distributions2d(
    cutcol,
    haz_gen_amor_see,
    nohaz_gen_amor_see,
    labels=True,
)

Cut w and i columns and normalize datasets



In [62]:

    
# Cut the columns named in `cutcol` (set in the preceding cell) from the
# virtual Amors, the real Amors and the extended virtual set used for training.
# ld.cut_normalize returns the processed (haz, nohaz) pairs in argument order
# plus a second value kept as amor_wi_sc and later passed as `scales`.
virtual_group = [haz_gen_amor, nohaz_gen_amor]
real_group = [haz_real_amor, nohaz_real_amor]
training_group = [haz_gen_amor_see, nohaz_gen_amor_see]
pairs, amor_wi_sc = ld.cut_normalize(cutcol, virtual_group, real_group, training_group)

(haz_gen_cut, nohaz_gen_cut), (haz_real_cut, nohaz_real_cut) = pairs[0], pairs[1]
haz_gen_see_cut, nohaz_gen_see_cut = pairs[2]

Train SVM



In [63]:

    
# Build the training set from the extended cut data, then fit the SVM
# (fit returns the fitted estimator itself).
xtrain, ytrain = ld.mix_up(haz_gen_see_cut, nohaz_gen_see_cut)
clf_amor_wi = svm.SVC(gamma=30., C=0.1, class_weight={1: 5.5}).fit(xtrain, ytrain)



In [64]:

    
# reload(al)
# clf_amor_wi = svm.SVC(gamma=30., C=0.1, class_weight={1: 5.5})
# # clf = svm.SVC(gamma=12., C=0.02, class_weight={1: 5.5})
# # clf = svm.SVC(gamma=10., C=0.08, class_weight={1: 5.5})
# splitres = al.split_by_clf(clf_amor_wi, cutcol, haz_gen_amor_see, 
#                                                 nohaz_gen_amor_see, 
#                                                 haz_gen_amor, 
#                                                 nohaz_gen_amor)

# haz_gen_amor_wi, nohaz_gen_amor_wi = splitres[0]
# haz_gen_amor_wi__, nohaz_gen_amor_wi__ = splitres[1]
# amor_wi_sc = splitres[2]

Estimate split quality for virtual Amors



In [65]:

    
# Evaluate the w-i classifier on the virtual Amors; the call also prints the
# purity statistics. Entries 0/2 of the result index the hazardous frame,
# entries 1/3 the non-hazardous frame.
predicted_gen = al.clf_split_quality(clf_amor_wi, haz_gen_cut, nohaz_gen_cut)

# Objects inside the predicted PHA region.
haz_gen_amor_wi, nohaz_gen_amor_wi = (
    haz_gen_amor.iloc[predicted_gen[0]],
    nohaz_gen_amor.iloc[predicted_gen[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_gen_amor_wi__, nohaz_gen_amor_wi__ = (
    haz_gen_amor.iloc[predicted_gen[2]],
    nohaz_gen_amor.iloc[predicted_gen[3]],
)









    



purity of PHA region: 0.360488798371
number of PHAs in the PHA region: 3540
number of NHAs in the PHA region: 6280

purity of NHA region: 0.962915686933
number of PHAs in the NHA region: 552
number of NHAs in the NHA region: 14333

fraction of correctly classified PHAs: 0.865102639296

Estimate split quality for real Amors



In [66]:

    
# Evaluate the w-i classifier on the real Amors; the call also prints the
# purity statistics. Entries 0/2 of the result index the hazardous frame,
# entries 1/3 the non-hazardous frame.
predicted_real = al.clf_split_quality(clf_amor_wi, haz_real_cut, nohaz_real_cut)

# Objects inside the predicted PHA region.
haz_real_amor_wi, nohaz_real_amor_wi = (
    haz_real_amor.iloc[predicted_real[0]],
    nohaz_real_amor.iloc[predicted_real[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_real_amor_wi__, nohaz_real_amor_wi__ = (
    haz_real_amor.iloc[predicted_real[2]],
    nohaz_real_amor.iloc[predicted_real[3]],
)









    



purity of PHA region: 0.3648
number of PHAs in the PHA region: 228
number of NHAs in the PHA region: 397

purity of NHA region: 0.966490299824
number of PHAs in the NHA region: 19
number of NHAs in the NHA region: 548

fraction of correctly classified PHAs: 0.923076923077

Plot decision surface



In [67]:

    
# Draw the decision boundary of the w-i classifier over the cut virtual Amors.
vd.plot_clf2d(
    clf_amor_wi, cutcol,
    num=400,
    haz_cut=haz_gen_cut,
    nohaz_cut=nohaz_gen_cut,
    s=2,
    figsize=(8, 8),
    scales=amor_wi_sc,
    labels=True,
    cmap='winter',
    extend_factors=[ef, 0],
)
# plotbounds=plt_bounds



In [68]:

    
# vd.display_param2d(cutcol, labels, [haz_gen_amor_wi, nohaz_gen_amor_wi])

Extract PHA region by a w-om-q surface



In [69]:

    
# Augment the Amors kept by the w-i split with their mirrored copies over 'w'.
# NOTE(review): the other stages use ld.add_doublemirror_column; confirm that
# the single-mirror variant is intended here.
haz_gen_amor_wi_se, nohaz_gen_amor_wi_se = (
    ld.add_mirror_column(haz_gen_amor_wi, 'w', 180.0),
    ld.add_mirror_column(nohaz_gen_amor_wi, 'w', 180.0),
)



In [70]:

    
# ef = 0.25
# haz_gen_amor_wi_see = ld.extend_by_copies(haz_gen_amor_wi_se , 'w', extend_factor=ef)
# nohaz_gen_amor_wi_see = ld.extend_by_copies(nohaz_gen_amor_wi_se , 'w', extend_factor=ef)



In [71]:

    
# 2D distribution of the mirrored Amor subsets over the 'w' and 'om' columns.
cutcol = ['w', 'om']
vd.plot_distributions2d(
    cutcol,
    haz_gen_amor_wi_se,
    nohaz_gen_amor_wi_se,
    labels=True,
)

Cut w, om and q columns and normalize datasets



In [72]:

    
# Cut the 'w', 'om' and 'q' columns from the virtual and real Amors kept by
# the w-i split and from the mirrored virtual set used for training.
# ld.cut_normalize returns the processed (haz, nohaz) pairs in argument order
# plus a second value kept as amor_womq_sc and later passed as `scales`.
cutcol = ['w', 'om', 'q']
virtual_group = [haz_gen_amor_wi, nohaz_gen_amor_wi]
real_group = [haz_real_amor_wi, nohaz_real_amor_wi]
training_group = [haz_gen_amor_wi_se, nohaz_gen_amor_wi_se]
pairs, amor_womq_sc = ld.cut_normalize(cutcol, virtual_group, real_group, training_group)

(haz_gen_cut, nohaz_gen_cut), (haz_real_cut, nohaz_real_cut) = pairs[0], pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM



In [73]:

    
# Build the training set from the mirrored cut data, then fit the SVM
# (fit returns the fitted estimator itself).
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)
clf_amor_womq = svm.SVC(gamma=20.0, C=8.0, class_weight={0: 2.4}).fit(xtrain, ytrain)



In [74]:

    
# cutcol = ['w', 'om', 'q']
# # clf = svm.SVC(gamma=10.0, C=1000, class_weight={0: 2.0})
# clf_amor_womq = svm.SVC(gamma=20.0, C=8.0, class_weight={0: 2.4})
# splitres = al.split_by_clf(clf_amor_womq, cutcol, haz_gen_amor_wi_se, 
#                                                   nohaz_gen_amor_wi_se,
#                                                   haz_gen_amor_wi, 
#                                                   nohaz_gen_amor_wi)

# haz_gen_amor_wqom, nohaz_gen_amor_wqom = splitres[0]
# haz_gen_amor_wqom__, nohaz_gen_amor_wqom__ = splitres[1]
# amor_wqom_sc = splitres[2]

Estimate split quality for virtual Amors



In [75]:

    
# Evaluate the w-om-q classifier on the virtual Amors kept by the w-i split;
# the call also prints the purity statistics. Entries 0/2 of the result index
# the hazardous frame, entries 1/3 the non-hazardous frame.
predicted_gen = al.clf_split_quality(clf_amor_womq, haz_gen_cut, nohaz_gen_cut)

# Objects inside the predicted PHA region.
haz_gen_amor_womq, nohaz_gen_amor_womq = (
    haz_gen_amor_wi.iloc[predicted_gen[0]],
    nohaz_gen_amor_wi.iloc[predicted_gen[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_gen_amor_womq__, nohaz_gen_amor_womq__ = (
    haz_gen_amor_wi.iloc[predicted_gen[2]],
    nohaz_gen_amor_wi.iloc[predicted_gen[3]],
)









    



purity of PHA region: 0.881790744467
number of PHAs in the PHA region: 1753
number of NHAs in the PHA region: 235

purity of NHA region: 0.771833503575
number of PHAs in the NHA region: 1787
number of NHAs in the NHA region: 6045

fraction of correctly classified PHAs: 0.495197740113

Estimate split quality for real Amors



In [76]:

    
# Evaluate the w-om-q classifier on the real Amors kept by the w-i split; the
# call also prints the purity statistics. Entries 0/2 of the result index the
# hazardous frame, entries 1/3 the non-hazardous frame.
predicted_real = al.clf_split_quality(clf_amor_womq, haz_real_cut, nohaz_real_cut)

# Objects inside the predicted PHA region.
haz_real_amor_womq, nohaz_real_amor_womq = (
    haz_real_amor_wi.iloc[predicted_real[0]],
    nohaz_real_amor_wi.iloc[predicted_real[1]],
)
# Objects left in the predicted NHA region (names with the trailing "__").
haz_real_amor_womq__, nohaz_real_amor_womq__ = (
    haz_real_amor_wi.iloc[predicted_real[2]],
    nohaz_real_amor_wi.iloc[predicted_real[3]],
)









    



purity of PHA region: 0.914728682171
number of PHAs in the PHA region: 118
number of NHAs in the PHA region: 11

purity of NHA region: 0.778225806452
number of PHAs in the NHA region: 110
number of NHAs in the NHA region: 386

fraction of correctly classified PHAs: 0.517543859649



In [ ]:



In [77]:

    
# Plot the w-om-q decision surface of the second Amor classifier.
cutcol = ['w', 'om', 'q']
vd.plot_clf3d(
    clf_amor_womq, cutcol,
    num=250,
    labels=True,
    figsize=(10, 9),
    mode='2d',
    scales=amor_womq_sc,
)



In [78]:

    
# Store the PHA-region subsets of the w-om-q stage in the Amor accumulators.
# NOTE: appending is not idempotent - re-running this cell adds the same
# subsets a second time.
for collected, subset in ((haz_gen_extracted_am, haz_gen_amor_womq),
                          (nohaz_gen_trapped_am, nohaz_gen_amor_womq),
                          (haz_real_extracted_am, haz_real_amor_womq),
                          (nohaz_real_trapped_am, nohaz_real_amor_womq)):
    collected.append(subset)

Amor divisions quality

Divisions quality for virtual Amors



In [79]:

    
# Print the summary (counts, mass fractions, purity) of the Amor extraction
# relative to the whole virtual Amor group.
vd.print_summary(
    haz_gen_extracted_am,
    nohaz_gen_trapped_am,
    haz_gen_amor,
    nohaz_gen_amor,
    'virtual',
)









    



Number of correctly classified virtual PHAs 1753
Number of trapped virtual NHAs: 235

Mass fraction of correctly classified virtual PHAs: 0.428396871945
Mass fraction of trapped virtual NHAs: 0.0114005724543

Cummulative purity of the outlined PHA regions: 0.881790744467

Divisions quality for real Amors



In [80]:

    
# Print the summary (counts, mass fractions, purity) of the Amor extraction
# relative to the whole real Amor group.
vd.print_summary(
    haz_real_extracted_am,
    nohaz_real_trapped_am,
    haz_real_amor,
    nohaz_real_amor,
    'real',
)









    



Number of correctly classified real PHAs 118
Number of trapped real NHAs: 11

Mass fraction of correctly classified real PHAs: 0.477732793522
Mass fraction of trapped real NHAs: 0.0116402116402

Cummulative purity of the outlined PHA regions: 0.914728682171



In [ ]:

Compute cumulative split quality

Virtual asteroids



In [81]:

    
# Concatenate the per-group accumulators (Atiras & Atens, Apollos, Amors)
# into one list per class for the virtual population.
haz_gen_extracted = (haz_gen_extracted_aa
                     + haz_gen_extracted_ap
                     + haz_gen_extracted_am)
nohaz_gen_trapped = (nohaz_gen_trapped_aa
                     + nohaz_gen_trapped_ap
                     + nohaz_gen_trapped_am)



In [82]:

    
# Print the overall summary for all virtual groups relative to the complete
# virtual domain populations (haz_gen, nohaz_gen).
vd.print_summary(
    haz_gen_extracted,
    nohaz_gen_trapped,
    haz_gen,
    nohaz_gen,
    'virtual',
)









    



Number of correctly classified virtual PHAs 23289
Number of trapped virtual NHAs: 2257

Mass fraction of correctly classified virtual PHAs: 0.858643955315
Mass fraction of trapped virtual NHAs: 0.0453021818109

Cummulative purity of the outlined PHA regions: 0.911649573319

Real asteroids



In [83]:

    
# Concatenate the per-group accumulators (Atiras & Atens, Apollos, Amors)
# into one list per class for the real population.
haz_real_extracted = (haz_real_extracted_aa
                      + haz_real_extracted_ap
                      + haz_real_extracted_am)
nohaz_real_trapped = (nohaz_real_trapped_aa
                      + nohaz_real_trapped_ap
                      + nohaz_real_trapped_am)



In [84]:

    
# Print the overall summary for all real groups relative to the complete real
# domain populations (haz_real, nohaz_real).
vd.print_summary(
    haz_real_extracted,
    nohaz_real_trapped,
    haz_real,
    nohaz_real,
    'real',
)









    



Number of correctly classified real PHAs 1787
Number of trapped real NHAs: 131

Mass fraction of correctly classified real PHAs: 0.897088353414
Mass fraction of trapped real NHAs: 0.0558873720137

Cummulative purity of the outlined PHA regions: 0.931699687174



In [ ]:



In [ ]: