Mine domain #1. Extract subgroups with high concentration of PHAs

import pickle
from copy import copy, deepcopy
import numpy as np
import pandas as pd
from sklearn import neighbors, svm
import matplotlib as mpl

# Import Asterion modules
import read_database as rdb
import learn_data as ld
import asterion_learn as al
import visualize_data as vd

# Plotting settings for the current notebook
# IPython magic: render matplotlib figures inline (only valid inside IPython/Jupyter).
%matplotlib inline
# font = {'size': 25}
# Global matplotlib font size applied to every figure created below.
font = {'size': 14}
mpl.rc('font', **font)
# Corner points of a unit square; not referenced again in the visible part of this file.
plotgrid = np.array([[0.0, 0.0], [1.0, 1.0]])

Load generated and real NEAs from the domain #1

# Build the dump file paths and load the hazardous / non-hazardous datasets
# (real and generated) for domain #1 and for the rest of the parameter space.
dirpath = './asteroid_data/'
real_datasets = ['haz_real', 'nohaz_real']
gen_datasets = ['haz_gen', 'nohaz_gen']
# NOTE(review): identical to gen_datasets, so the *_genu frames below are loaded
# from the same files as the *_gen frames -- confirm whether separate
# 'haz_genu' / 'nohaz_genu' dumps were intended.
genu_datasets = ['haz_gen', 'nohaz_gen']
name_suffixes = ['_dom1.p', '_dom1_rest.p']

# Suffix is the outer loop, so the order is:
# haz*_dom1, nohaz*_dom1, haz*_dom1_rest, nohaz*_dom1_rest
# which matches the 4-way unpacking below.
dumps_real = [dirpath + ds + ns for ns in name_suffixes for ds in real_datasets]
dumps_gen = [dirpath + ds + ns for ns in name_suffixes for ds in gen_datasets]
dumps_genu = [dirpath + ds + ns for ns in name_suffixes for ds in genu_datasets]

# rdb.loadObject presumably unpickles each '.p' dump -- only load trusted files.
haz_real, nohaz_real, haz_real_rest, nohaz_real_rest = map(rdb.loadObject, dumps_real)
haz_gen, nohaz_gen, haz_gen_rest, nohaz_gen_rest = map(rdb.loadObject, dumps_gen)
haz_genu, nohaz_genu, haz_genu_rest, nohaz_genu_rest = map(rdb.loadObject, dumps_genu)

# Total number of objects (hazardous + non-hazardous) inside the domain.
gen_num = sum(map(len, [haz_gen, nohaz_gen]))
real_num = sum(map(len, [haz_real, nohaz_real]))

print "Number of virtual asteroids in the domain:", gen_num
print "Number of real asteroids in the domain:", real_num

Number of virtual asteroids in the domain: 76944
Number of real asteroids in the domain: 4336

Investigate distributions of orbital parameters for the domain #1

# vd.plot_alldistcombs(haz_gen, nohaz_gen, labels=True)

Cut off non-hazardous asteroids with high values of q

# Show the virtual PHA / NHA distributions in the w-q plane together with the
# horizontal line q = q_split (drawn across w = 0..360) used for the cut below.
cutcol = ['w', 'q']
q_split = 1.066
p1, p2 = [[0.0, q_split], [360.0, q_split]]
vd.plot_distributions2d(cutcol, haz_gen, nohaz_gen, line=[p1, p2], invertaxes=[0,1], labels=True)

/usr/lib/pymodules/python2.7/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':

# Split every dataset on the 'q' column at q_split.
# The first returned frame (*_dom1) is the part analysed in the rest of the
# notebook; the second (*_dom1__) is the cut-off remainder.
# NOTE(review): which side of q_split comes first is decided inside
# ld.split_by_colval -- presumably q below the threshold; confirm.
haz_gen_dom1, haz_gen_dom1__ = ld.split_by_colval(haz_gen, 'q', q_split)
nohaz_gen_dom1, nohaz_gen_dom1__ = ld.split_by_colval(nohaz_gen, 'q', q_split)

haz_genu_dom1, haz_genu_dom1__ = ld.split_by_colval(haz_genu, 'q', q_split)
nohaz_genu_dom1, nohaz_genu_dom1__ = ld.split_by_colval(nohaz_genu, 'q', q_split)

haz_real_dom1, haz_real_dom1__ = ld.split_by_colval(haz_real, 'q', q_split)
nohaz_real_dom1, nohaz_real_dom1__ = ld.split_by_colval(nohaz_real, 'q', q_split)

# Sizes of the retained part; used as denominators for the group weights below.
gen_dom1_num = sum(map(len, [haz_gen_dom1, nohaz_gen_dom1]))
real_dom1_num = sum(map(len, [haz_real_dom1, nohaz_real_dom1]))
print gen_dom1_num
print real_dom1_num


Atiras & Atens

# Accumulators for the Atira/Aten group, later passed to vd.print_summary:
# *_extracted_* holds PHA subsets, *_trapped_* holds NHA subsets.
# NOTE(review): no statement visible in this file adds anything to these lists
# before they are summarised -- confirm where they are populated.
haz_gen_extracted_aa = []
nohaz_gen_trapped_aa = []

haz_real_extracted_aa = []
nohaz_real_trapped_aa = []


# Select the Atira-group objects from each dataset.
# rdb.get_atiras returns a (subset, count) pair.
haz_gen_atiras, haz_gen_atiras_num = rdb.get_atiras(haz_gen_dom1)
nohaz_gen_atiras, nohaz_gen_atiras_num = rdb.get_atiras(nohaz_gen_dom1)
atiras_gen_num = haz_gen_atiras_num + nohaz_gen_atiras_num

haz_real_atiras, haz_real_atiras_num = rdb.get_atiras(haz_real_dom1)
nohaz_real_atiras, nohaz_real_atiras_num = rdb.get_atiras(nohaz_real_dom1)
atiras_real_num = haz_real_atiras_num + nohaz_real_atiras_num

print "Number of virtual Atiras:", atiras_gen_num
print "Number of real Atiras:", atiras_real_num

Number of virtual Atiras: 554
Number of real Atiras: 0


# Select the Aten-group objects from each dataset.
# rdb.get_atens returns a (subset, count) pair.
haz_gen_atens, haz_gen_atens_num = rdb.get_atens(haz_gen_dom1)
nohaz_gen_atens, nohaz_gen_atens_num = rdb.get_atens(nohaz_gen_dom1)
atens_gen_num = haz_gen_atens_num + nohaz_gen_atens_num

haz_real_atens, haz_real_atens_num = rdb.get_atens(haz_real_dom1)
nohaz_real_atens, nohaz_real_atens_num = rdb.get_atens(nohaz_real_dom1)
atens_real_num = haz_real_atens_num + nohaz_real_atens_num

print "Number of virtual Atens:", atens_gen_num
print "Number of real Atens:", atens_real_num

Number of virtual Atens: 1053
Number of real Atens: 39

Atiras + Atens

haz_gen_atiras_atens = pd.concat((haz_gen_atiras, haz_gen_atens))
nohaz_gen_atiras_atens = pd.concat((nohaz_gen_atiras, nohaz_gen_atens))

haz_gen_atiras_atens_num = len(haz_gen_atiras_atens)
nohaz_gen_atiras_atens_num = len(nohaz_gen_atiras_atens)
atiras_atens_gen_num = haz_gen_atiras_atens_num + nohaz_gen_atiras_atens_num

haz_real_atiras_atens = pd.concat((haz_real_atiras, haz_real_atens))
nohaz_real_atiras_atens = pd.concat((nohaz_real_atiras, nohaz_real_atens))

haz_real_atiras_atens_num = len(haz_real_atiras_atens)
nohaz_real_atiras_atens_num = len(nohaz_real_atiras_atens)
atiras_atens_real_num = haz_real_atiras_atens_num + nohaz_real_atiras_atens_num

print "Number of virtual PHAs in the group:", haz_gen_atiras_atens_num
print "Number of virtual NHAs in the group:", nohaz_gen_atiras_atens_num
print "Number of virtual Atiras and Atens:", atiras_atens_gen_num
print "Virtual Atiras and Atens group weight:", float(atiras_atens_gen_num)/gen_dom1_num

Number of virtual PHAs in the group: 1053
Number of virtual NHAs in the group: 554
Number of virtual Atiras and Atens: 1607
Virtual Atiras and Atens group weight: 0.0225645202056

# Same report for the real Atira/Aten group; the weight is relative to the
# number of real asteroids left after the q cut.
print "Number of real PHAs in the group:", haz_real_atiras_atens_num
print "Number of real NHAs in the group:", nohaz_real_atiras_atens_num
print "Number of real Atiras and Atens:", atiras_atens_real_num
print "Real Atiras and Atens group weight:", float(atiras_atens_real_num)/real_dom1_num

Number of real PHAs in the group: 37
Number of real NHAs in the group: 2
Number of real Atiras and Atens: 39
Real Atiras and Atens group weight: 0.00986592461422

Plot distributions of 'a' and 'i' parameters

# Plot virtual PHAs against NHAs in the a-i plane for the Atira/Aten group.
cutcol = ['a', 'i']
vd.plot_distributions2d(cutcol, haz_gen_atiras_atens, nohaz_gen_atiras_atens, labels=True)

Separate PHAs from NHAs by the LinearSVC

Cut a and i columns and normalize datasets

In [16]:
# Keep only the 'a' and 'i' columns of the virtual and real (haz, nohaz) pairs.
# ld.cut_normalize returns one processed pair per input pair, in the same order,
# plus a scales object that is later handed to the plotting helper.
# NOTE(review): presumably all pairs are normalized to a common range -- confirm in ld.
cutcol = ['a', 'i']
pairs, atiras_atens_ai_sc = ld.cut_normalize(cutcol, 
                                             [haz_gen_atiras_atens, nohaz_gen_atiras_atens], 
                                             [haz_real_atiras_atens, nohaz_real_atiras_atens])

haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]

Find decision surface with SVM

# Assemble the labelled training set from the virtual PHA / NHA samples first,
# then create the linear SVM and fit it in a single expression
# (fit() returns the estimator itself).
xtrain, ytrain = ld.mix_up(haz_gen_cut, nohaz_gen_cut)
clf_atiras_atens_ai = svm.LinearSVC().fit(xtrain, ytrain)

In [18]:
# reload(al)
# clf_aa_ai = svm.LinearSVC()
# splitres = al.split_by_clf(clf_aa_ai, cutcol, haz_gen_atiras_atens, 
#                                               nohaz_gen_atiras_atens)
# haz_gen_aa_ai, nohaz_gen_aa_ai = splitres[0]
# haz_gen_aa_ai__, nohaz_gen_aa_ai__ = splitres[1]
# aa_ai_sc = splitres[2]

Estimate split quality for virtual Atiras & Atens

# Evaluate the fitted classifier on the virtual Atiras & Atens.
# al.clf_split_quality prints the purity report and returns four index
# collections which, judging by the recorded counts, are:
#   [0] PHAs inside the PHA region    [1] NHAs inside the PHA region
#   [2] PHAs inside the NHA region    [3] NHAs inside the NHA region
predicted_gen = al.clf_split_quality(clf_atiras_atens_ai, haz_gen_cut, nohaz_gen_cut)

# Objects that fall into the PHA region.
haz_gen_atiras_atens_ai = haz_gen_atiras_atens.iloc[predicted_gen[0]]
# The NHA indices refer to rows of the non-hazardous frame, so they must be
# applied to nohaz_gen_atiras_atens (previously they were applied to the
# hazardous frame, silently selecting PHAs instead of NHAs).
nohaz_gen_atiras_atens_ai = nohaz_gen_atiras_atens.iloc[predicted_gen[1]]

# Objects left in the NHA region (the "__" suffix marks the remainder).
haz_gen_atiras_atens_ai__ = haz_gen_atiras_atens.iloc[predicted_gen[2]]
nohaz_gen_atiras_atens_ai__ = nohaz_gen_atiras_atens.iloc[predicted_gen[3]]

purity of PHA region: 0.923357664234
number of PHAs in the PHA region: 1012
number of NHAs in the PHA region: 84

purity of NHA region: 0.919765166341
number of PHAs in the NHA region: 41
number of NHAs in the NHA region: 470

fraction of correctly classified PHAs: 0.96106362773

Estimate split quality for real Atiras & Atens

# Evaluate the classifier (trained on virtual data) on the real Atiras & Atens.
# The returned index collections follow the same order as for the virtual set:
#   [0] PHAs inside the PHA region    [1] NHAs inside the PHA region
#   [2] PHAs inside the NHA region    [3] NHAs inside the NHA region
predicted_real = al.clf_split_quality(clf_atiras_atens_ai, haz_real_cut, nohaz_real_cut)

# Objects that fall into the PHA region.
haz_real_atiras_atens_ai = haz_real_atiras_atens.iloc[predicted_real[0]]
# NHA indices must select rows of the non-hazardous frame (previously the
# hazardous frame was indexed here, returning the wrong objects).
nohaz_real_atiras_atens_ai = nohaz_real_atiras_atens.iloc[predicted_real[1]]

# Objects left in the NHA region.
haz_real_atiras_atens_ai__ = haz_real_atiras_atens.iloc[predicted_real[2]]
nohaz_real_atiras_atens_ai__ = nohaz_real_atiras_atens.iloc[predicted_real[3]]

purity of PHA region: 1.0
number of PHAs in the PHA region: 36
number of NHAs in the PHA region: 0

purity of NHA region: 0.666666666667
number of PHAs in the NHA region: 1
number of NHAs in the NHA region: 2

fraction of correctly classified PHAs: 0.972972972973

Plot decision boundary

# Draw the a-i decision boundary of the linear SVM together with the virtual
# samples; `scales` is the object returned by ld.cut_normalize for these columns.
vd.plot_clf2d(clf_atiras_atens_ai, cutcol, haz_cut=haz_gen_cut, nohaz_cut=nohaz_gen_cut,
              num=400, scales=atiras_atens_ai_sc, cmap='winter', figsize=(8,8))

Atiras & Atens divisions quality

Divisions quality for virtual Atiras & Atens

# The accumulator lists are still empty at this point in the visible code, so
# the summary would be computed over nothing. Fill them with the PHA-region
# subsets of the a-i split; these are exactly the subsets whose sizes (1012
# PHAs, 84 NHAs) appear in the recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_gen_extracted_aa = [haz_gen_atiras_atens_ai]
nohaz_gen_trapped_aa = [nohaz_gen_atiras_atens_ai]

# Summarise extracted PHAs / trapped NHAs against the whole virtual group.
vd.print_summary(haz_gen_extracted_aa, nohaz_gen_trapped_aa, 
                 haz_gen_atiras_atens, nohaz_gen_atiras_atens, 'virtual')

Number of correctly classified virtual PHAs 1012
Number of trapped virtual NHAs: 84

Mass fraction of correctly classified virtual PHAs: 0.96106362773
Mass fraction of trapped virtual NHAs: 0.151624548736

Cummulative purity of the outlined PHA regions: 0.923357664234

Divisions quality for real Atiras & Atens

# The accumulator lists are still empty at this point in the visible code.
# Fill them with the PHA-region subsets of the a-i split applied to real
# asteroids; their sizes (36 PHAs, 0 NHAs) match the recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_real_extracted_aa = [haz_real_atiras_atens_ai]
nohaz_real_trapped_aa = [nohaz_real_atiras_atens_ai]

# Summarise extracted PHAs / trapped NHAs against the whole real group.
vd.print_summary(haz_real_extracted_aa, nohaz_real_trapped_aa, 
                 haz_real_atiras_atens, nohaz_real_atiras_atens, 'real')

Number of correctly classified real PHAs 36
Number of trapped real NHAs: 0

Mass fraction of correctly classified real PHAs: 0.972972972973
Mass fraction of trapped real NHAs: 0.0

Cummulative purity of the outlined PHA regions: 1.0


# Accumulators for the Apollo group, later passed to vd.print_summary.
# NOTE(review): no statement visible in this file adds anything to these lists
# before they are summarised -- confirm where they are populated.
haz_gen_extracted_ap = []
nohaz_gen_trapped_ap = []

haz_real_extracted_ap = []
nohaz_real_trapped_ap = []

# Select the Apollo-group objects; rdb.get_apollos returns a (subset, count) pair.
haz_gen_apollo, haz_gen_apollo_num = rdb.get_apollos(haz_gen_dom1)
nohaz_gen_apollo, nohaz_gen_apollo_num = rdb.get_apollos(nohaz_gen_dom1)
apollo_gen_num = haz_gen_apollo_num + nohaz_gen_apollo_num

haz_real_apollo, haz_real_apollo_num = rdb.get_apollos(haz_real_dom1)
nohaz_real_apollo, nohaz_real_apollo_num = rdb.get_apollos(nohaz_real_dom1)
apollo_real_num = haz_real_apollo_num + nohaz_real_apollo_num

# Group weight = share of the q-cut virtual domain occupied by Apollos.
print "Number of virtual PHAs in the group:", haz_gen_apollo_num
print "Number of virtual NHAs in the group:", nohaz_gen_apollo_num
print "Number of virtual Apollo:", apollo_gen_num
print "Apollo group weight:", float(apollo_gen_num)/gen_dom1_num

Number of virtual PHAs in the group: 21977
Number of virtual NHAs in the group: 22929
Number of virtual Apollo: 44906
Apollo group weight: 0.630542840293

# Same report for real Apollos; the weight is relative to the q-cut real domain.
print "Number of real PHAs in the group:", haz_real_apollo_num
print "Number of real NHAs in the group:", nohaz_real_apollo_num
print "Number of real Apollo:", apollo_real_num
print "Apollo group weight:", float(apollo_real_num)/real_dom1_num

Number of real PHAs in the group: 1708
Number of real NHAs in the group: 1014
Number of real Apollo: 2722
Apollo group weight: 0.688590943587

# vd.display_allparams([apollos_haz, apollos_nohaz], vd.combs, vd.colnames)

Split Apollos by a w-q-i surface

Amplify datasets by their symmetric copies over 'w' parameter

In [30]:
# haz_gen_apollo_se = ld.add_mirror_column(haz_gen_apollo, 'w', 180.0)
# nohaz_gen_apollo_se = ld.add_mirror_column(nohaz_gen_apollo, 'w', 180.0)

# Extend the virtual Apollos with copies mirrored in 'w' about 180 degrees
# (the "double mirror" variant of the helper is used instead of the single one
# kept above for reference).
haz_gen_apollo_se = ld.add_doublemirror_column(haz_gen_apollo, 'w', 180.0)
nohaz_gen_apollo_se = ld.add_doublemirror_column(nohaz_gen_apollo, 'w', 180.0)

# Inspect the extended datasets in the w-q plane.
cutcol = ['w', 'q']
vd.plot_distributions2d(cutcol, haz_gen_apollo_se, nohaz_gen_apollo_se, invertaxes=[0,1], labels=True)

Cut off annoying tips

# Partition Apollos at q = 0.7: "_bq" keeps rows with q > 0.7, "_sq" keeps
# rows with q <= 0.7.
haz_gen_apollo_bq = haz_gen_apollo[haz_gen_apollo.q > 0.7]
nohaz_gen_apollo_bq = nohaz_gen_apollo[nohaz_gen_apollo.q > 0.7]

haz_gen_apollo_sq = haz_gen_apollo[haz_gen_apollo.q <= 0.7]
nohaz_gen_apollo_sq = nohaz_gen_apollo[nohaz_gen_apollo.q <= 0.7]

# The mirrored datasets restricted to q > 0.7 form the SVM training set below.
haz_gen_apollo_se_bq = haz_gen_apollo_se[haz_gen_apollo_se.q > 0.7]
nohaz_gen_apollo_se_bq = nohaz_gen_apollo_se[nohaz_gen_apollo_se.q > 0.7]

Cut w, q and i columns and normalize datasets

# Keep the 'w', 'q', 'i' columns of three (haz, nohaz) pairs:
#   pairs[0] - all virtual Apollos (used for evaluation)
#   pairs[1] - real Apollos (used for evaluation)
#   pairs[2] - mirrored virtual Apollos with q > 0.7 (used for training)
# apollo_wqi_sc is the scales object reused by the plotting helpers.
cutcol = ['w', 'q', 'i']
pairs, apollo_wqi_sc = ld.cut_normalize(cutcol, 
                                        [haz_gen_apollo, nohaz_gen_apollo], 
                                        [haz_real_apollo, nohaz_real_apollo], 
                                        [haz_gen_apollo_se_bq, nohaz_gen_apollo_se_bq])

haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM

# Training set: mirrored virtual Apollos with q > 0.7.
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)

# Kernel SVM on (w, q, i); label 0 gets a weight of 1.2.
# Previously tried alternative: gamma=35.0, C=0.05, class_weight={0: 1.2}
clf_apollo_wqi = svm.SVC(
    gamma=40.0,
    C=0.05,
    class_weight={0: 1.2},
).fit(xtrain, ytrain)

# # reload(al)
# cutcol = ['w', 'q', 'i']

# # clf_apollo_wqi3 = neighbors.KNeighborsClassifier()
# # splitres = al.split_by_clf(clf_apollo_wqi3, cutcol,  haz_gen_apollo_se_bq,
# #                                                      nohaz_gen_apollo_se_bq,
# #                                                      haz_gen_apollo,
# #                                                      nohaz_gen_apollo)

# clf_apollo_wqi3 = svm.SVC(gamma=40.0, C=0.05, class_weight={0: 1.2}) #class_weight={0: 1.5} 
# # clf_apollo_wqi3 = svm.NuSVC(gamma=10.0, class_weight={0: 1.5}) #class_weight={0: 1.5}
# #(20 0.5), (30 0.1) (5 0.01)
# splitres = al.split_by_clf(clf_apollo_wqi3, cutcol,  haz_gen_apollo_se_bq,
#                                                      nohaz_gen_apollo_se_bq,
#                                                      haz_gen_apollo,
#                                                      nohaz_gen_apollo)

# haz_gen_apollo_wqi3, nohaz_gen_apollo_wqi3 = splitres[0]
# haz_gen_apollo_wqi3__, nohaz_gen_apollo_wqi3__ = splitres[1]
# apollo_wqi3_sc = splitres[2]
# clf_apollo_wqi = clf_apollo_wqi3

In [36]:
# Evaluate the w-q-i classifier on all virtual Apollos.
# Judging by the recorded counts, the returned index collections are:
#   [0] PHAs in the PHA region    [1] NHAs in the PHA region
#   [2] PHAs in the NHA region    [3] NHAs in the NHA region
predicted_gen = al.clf_split_quality(clf_apollo_wqi, haz_gen_cut, nohaz_gen_cut)

# Objects inside the extracted PHA region.
haz_gen_apollo_wqi = haz_gen_apollo.iloc[predicted_gen[0]]
nohaz_gen_apollo_wqi = nohaz_gen_apollo.iloc[predicted_gen[1]]

# Remainder (NHA region); split further by the w-q-a classifier below.
haz_gen_apollo_wqi__ = haz_gen_apollo.iloc[predicted_gen[2]]
nohaz_gen_apollo_wqi__ = nohaz_gen_apollo.iloc[predicted_gen[3]]

purity of PHA region: 0.91442363188
number of PHAs in the PHA region: 17428
number of NHAs in the PHA region: 1631

purity of NHA region: 0.824002785623
number of PHAs in the NHA region: 4549
number of NHAs in the NHA region: 21298

fraction of correctly classified PHAs: 0.793010875006

Estimate split quality for real Apollos

# Evaluate the same w-q-i classifier on real Apollos (index order as for the
# virtual set: PHA-region PHAs, PHA-region NHAs, NHA-region PHAs, NHA-region NHAs).
predicted_real = al.clf_split_quality(clf_apollo_wqi, haz_real_cut, nohaz_real_cut)

# Objects inside the extracted PHA region.
haz_real_apollo_wqi = haz_real_apollo.iloc[predicted_real[0]]
nohaz_real_apollo_wqi = nohaz_real_apollo.iloc[predicted_real[1]]

# Remainder (NHA region); split further by the w-q-a classifier below.
haz_real_apollo_wqi__ = haz_real_apollo.iloc[predicted_real[2]]
nohaz_real_apollo_wqi__ = nohaz_real_apollo.iloc[predicted_real[3]]

purity of PHA region: 0.938098747237
number of PHAs in the PHA region: 1273
number of NHAs in the PHA region: 84

purity of NHA region: 0.681318681319
number of PHAs in the NHA region: 435
number of NHAs in the NHA region: 930

fraction of correctly classified PHAs: 0.745316159251

Prepare w-q domain mask to exclude out-of-domain points from the plot

In [38]:
# In-domain and out-of-domain virtual asteroids used to train the w-q mask.
# NOTE(review): `genu` is built from haz_gen / nohaz_gen while `genu_rest` uses
# the *_genu_rest frames; haz_genu / nohaz_genu may have been intended here -- confirm.
genu = pd.concat((haz_gen, nohaz_gen))
genu_rest = pd.concat((haz_genu_rest, nohaz_genu_rest))

# First two entries of the w-q-i scales, i.e. those for the 'w' and 'q' columns.
apollo_wq_sc = apollo_wqi_sc[:2]

In [39]:
# Fit a w-q classifier on in-domain vs. out-of-domain virtual asteroids; it is
# used below as a mask to hide out-of-domain points in the decision-surface plots.
# A separate name (cutcol_) is used so that `cutcol` keeps its current value.
cutcol_ = ['w', 'q']
clfmask = svm.SVC(gamma=10.0, C=1e3) # class_weight={1: 2}
clfmask = al.sgmask_clf2d_fit(clfmask, cutcol_, genu, genu_rest, apollo_wq_sc)

# Visual check of the fitted mask in the w-q plane.
vd.plot_clf2d(clfmask, cutcol_, num=250, figsize=(6,6), scales=apollo_wq_sc, 
              labels=True, cmap='Blues', invertaxes=[0, 1])

In [41]:
# cutcol_ = ['w', 'q']
# # labels = [vd.colnames[nm] for nm in cutcol]

# clfmask = svm.SVC(gamma=10.0, C=1e3) # class_weight={1: 2}
# clfmask = al.sgmask_clf(haz_gen_apollo_se, nohaz_gen_apollo_se, 
#                         haz_genu_rest, nohaz_genu_rest, clfmask, cutcol_)

# # clfmask = al.sgmask_clf(haz_gen_apollo, nohaz_gen_apollo, 
# #                         haz_genu_rest, nohaz_genu_rest, clfmask, cutcol)

# plotgrid = np.array([[0.0, 0.0], [1.0, 1.0]])
# scales = ld.dfcommon_bounds([haz_gen_apollo_se, nohaz_gen_apollo_se], cutcol_)
# # vd.plot_classifier(plotgrid, clfmask, num=200, figsize=(6,6), scales=scales, 
# #                    labels = labels, cmap='Blues', invertaxes=[0, 1])

# vd.plot_clf2d(clfmask, cutcol_, num=200, figsize=(6,6), scales=scales, 
#                    labels=True, cmap='Blues', invertaxes=[0, 1])

Plot decision surface

# cutcol = ['w', 'q', 'i']
# `cutcol` still holds ['w', 'q', 'i'] from the cut_normalize cell above.
# Plot the w-q-i decision surface, passing the domain mask fitted earlier.
vd.plot_clf3d(clf_apollo_wqi, cutcol, num=250, labels=True, figsize=(10,9), mode='2d', 
              scales=apollo_wqi_sc, clf_masks=[(clfmask, 0)], invertaxes=[0, 1])

Split rest of Apollos by a w-q-a surface

Amplify datasets by their symmetric copies over the 'w' parameter

# Extend the Apollos left over after the w-q-i split with copies mirrored in
# 'w' about 180 degrees; these extended sets are the training data for the
# second (w-q-a) classifier.
haz_gen_apollo_wqi__se = ld.add_doublemirror_column(haz_gen_apollo_wqi__, 'w', 180.0)
nohaz_gen_apollo_wqi__se = ld.add_doublemirror_column(nohaz_gen_apollo_wqi__, 'w', 180.0)

Cut w, q and a columns and normalize datasets

# reload(ld)
# Keep the 'w', 'q', 'a' columns of three (haz, nohaz) pairs:
#   pairs[0] - virtual Apollos left after the w-q-i split (evaluation)
#   pairs[1] - real Apollos left after the w-q-i split (evaluation)
#   pairs[2] - mirrored virtual leftovers (training)
cutcol = ['w', 'q', 'a']
pairs, apollo_wqa_sc = ld.cut_normalize(cutcol, [haz_gen_apollo_wqi__, nohaz_gen_apollo_wqi__], 
                                                 [haz_real_apollo_wqi__, nohaz_real_apollo_wqi__], 
                                                 [haz_gen_apollo_wqi__se, nohaz_gen_apollo_wqi__se])
haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM

# Training set: mirrored virtual Apollos remaining after the w-q-i split.
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)

# Kernel SVM on (w, q, a); label 0 gets a weight of 1.5.
clf_apollo_wqa = svm.SVC(
    gamma=40.0,
    C=0.1,
    class_weight={0: 1.5},
).fit(xtrain, ytrain)

# # reload(al)
# cutcol = ['w', 'q', 'a']
# clf_apollo_wqa3 = svm.SVC(gamma=40.0, C=0.1, class_weight={0: 1.5}) #class_weight={0: 1.5}
# # (30 0.5)
# splitres = al.split_by_clf(clf_apollo_wqa3, cutcol,  haz_gen_apollo_wqi3__se,
#                                                      nohaz_gen_apollo_wqi3__se,
#                                                      haz_gen_apollo_wqi3__,
#                                                      nohaz_gen_apollo_wqi3__)

# haz_gen_apollo_wqa3, nohaz_gen_apollo_wqa3 = splitres[0]
# haz_gen_apollo_wqa3__, nohaz_gen_apollo_wqa3__ = splitres[1]
# apollo_wqa3_sc = splitres[2]

Estimate split quality for virtual Apollos

# Evaluate the w-q-a classifier on the virtual Apollos left after the first split
# (index order: PHA-region PHAs, PHA-region NHAs, NHA-region PHAs, NHA-region NHAs).
predicted_gen = al.clf_split_quality(clf_apollo_wqa, haz_gen_cut, nohaz_gen_cut)

# The indices refer to rows of the leftover frames (*_wqi__), not of the full group.
haz_gen_apollo_wqa = haz_gen_apollo_wqi__.iloc[predicted_gen[0]]
nohaz_gen_apollo_wqa = nohaz_gen_apollo_wqi__.iloc[predicted_gen[1]]

# Objects still unclassified as PHA-region after both splits.
haz_gen_apollo_wqa__ = haz_gen_apollo_wqi__.iloc[predicted_gen[2]]
nohaz_gen_apollo_wqa__ = nohaz_gen_apollo_wqi__.iloc[predicted_gen[3]]

purity of PHA region: 0.909785483397
number of PHAs in the PHA region: 3096
number of NHAs in the PHA region: 307

purity of NHA region: 0.935261094279
number of PHAs in the NHA region: 1453
number of NHAs in the NHA region: 20991

fraction of correctly classified PHAs: 0.68058914047

Estimate split quality for real Apollos

# Evaluate the w-q-a classifier on the real Apollos left after the first split.
predicted_real = al.clf_split_quality(clf_apollo_wqa, haz_real_cut, nohaz_real_cut)

# The indices refer to rows of the leftover frames (*_wqi__).
haz_real_apollo_wqa = haz_real_apollo_wqi__.iloc[predicted_real[0]]
nohaz_real_apollo_wqa = nohaz_real_apollo_wqi__.iloc[predicted_real[1]]

# Objects still in the NHA region after both splits.
haz_real_apollo_wqa__ = haz_real_apollo_wqi__.iloc[predicted_real[2]]
nohaz_real_apollo_wqa__ = nohaz_real_apollo_wqi__.iloc[predicted_real[3]]

purity of PHA region: 0.909090909091
number of PHAs in the PHA region: 360
number of NHAs in the PHA region: 36

purity of NHA region: 0.922600619195
number of PHAs in the NHA region: 75
number of NHAs in the NHA region: 894

fraction of correctly classified PHAs: 0.827586206897

# clf_masks = [(clfmask, 0)]
# cutcol = ['w', 'q', 'a']
# labels = [vd.colnames[nm] for nm in cutcol]
# `cutcol` still holds ['w', 'q', 'a'] from the cut_normalize cell above.
# Plot the w-q-a decision surface, passing the same w-q domain mask.
vd.plot_clf3d(clf_apollo_wqa, cutcol, num=250, labels=True, figsize=(10,9), mode='2d', 
              scales=apollo_wqa_sc, clf_masks=[(clfmask, 0)], invertaxes=[0, 1])

Apollo divisions quality

Divisions quality for virtual Apollos

# The accumulator lists are still empty at this point in the visible code.
# Fill them with the PHA-region subsets of both Apollo splits (w-q-i, then
# w-q-a); their combined sizes (17428 + 3096 PHAs, 1631 + 307 NHAs) match the
# recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_gen_extracted_ap = [haz_gen_apollo_wqi, haz_gen_apollo_wqa]
nohaz_gen_trapped_ap = [nohaz_gen_apollo_wqi, nohaz_gen_apollo_wqa]

# Summarise extracted PHAs / trapped NHAs against the whole virtual Apollo group.
vd.print_summary(haz_gen_extracted_ap, nohaz_gen_trapped_ap, haz_gen_apollo, nohaz_gen_apollo, 'virtual')

Number of correctly classified virtual PHAs 20524
Number of trapped virtual NHAs: 1938

Mass fraction of correctly classified virtual PHAs: 0.933885425672
Mass fraction of trapped virtual NHAs: 0.0845217846395

Cummulative purity of the outlined PHA regions: 0.913720950939

Divisions quality for real Apollos

# The accumulator lists are still empty at this point in the visible code.
# Fill them with the PHA-region subsets of both Apollo splits applied to real
# asteroids; their combined sizes (1273 + 360 PHAs, 84 + 36 NHAs) match the
# recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_real_extracted_ap = [haz_real_apollo_wqi, haz_real_apollo_wqa]
nohaz_real_trapped_ap = [nohaz_real_apollo_wqi, nohaz_real_apollo_wqa]

# Summarise extracted PHAs / trapped NHAs against the whole real Apollo group.
vd.print_summary(haz_real_extracted_ap, nohaz_real_trapped_ap, haz_real_apollo, nohaz_real_apollo, 'real')

Number of correctly classified real PHAs 1633
Number of trapped real NHAs: 120

Mass fraction of correctly classified real PHAs: 0.956088992974
Mass fraction of trapped real NHAs: 0.118343195266

Cummulative purity of the outlined PHA regions: 0.931545921278

# Accumulators for the Amor group, later passed to vd.print_summary.
# NOTE(review): no statement visible in this file adds anything to these lists
# before they are summarised -- confirm where they are populated.
haz_gen_extracted_am = []
nohaz_gen_trapped_am = []

haz_real_extracted_am = []
nohaz_real_trapped_am = []

# Select the Amor-group objects; rdb.get_amors returns a (subset, count) pair.
haz_gen_amor, haz_gen_amors_num = rdb.get_amors(haz_gen_dom1)
nohaz_gen_amor, nohaz_gen_amors_num = rdb.get_amors(nohaz_gen_dom1)
amors_gen_num = haz_gen_amors_num + nohaz_gen_amors_num

haz_real_amor, haz_real_amors_num = rdb.get_amors(haz_real_dom1)
nohaz_real_amor, nohaz_real_amors_num = rdb.get_amors(nohaz_real_dom1)
amors_real_num = haz_real_amors_num + nohaz_real_amors_num

# Group weight = share of the q-cut virtual domain occupied by Amors.
print "Number of virtual PHAs in the group:", haz_gen_amors_num
print "Number of virtual NHAs in the group:", nohaz_gen_amors_num
print "Number of virtual Amor:", amors_gen_num
print "Amor group weight:", float(amors_gen_num)/gen_dom1_num

Number of virtual PHAs in the group: 4092
Number of virtual NHAs in the group: 20613
Number of virtual Amor: 24705
Amor group weight: 0.346892639501

# Same report for real Amors; the weight is relative to the q-cut real domain.
print "Number of real PHAs in the group:", haz_real_amors_num
print "Number of real NHAs in the group:", nohaz_real_amors_num
print "Number of real Amor:", amors_real_num
print "Amor group weight:", float(amors_real_num)/real_dom1_num

Number of real PHAs in the group: 247
Number of real NHAs in the group: 945
Number of real Amor: 1192
Amor group weight: 0.301543131799

Cut off non-hazardous areas by a w-i surface

In [59]:
# Extend the virtual Amors with copies mirrored in 'w' about 180 degrees ...
haz_gen_amor_se = ld.add_doublemirror_column(haz_gen_amor, 'w', 180.0)
nohaz_gen_amor_se = ld.add_doublemirror_column(nohaz_gen_amor, 'w', 180.0)

# ... and then extend them along 'w' by the factor `ef`, which is also passed
# to the decision-boundary plot further below.
# NOTE(review): presumably this pads the w range on both sides to reduce edge
# effects of the classifier -- confirm in ld.extend_by_copies.
ef = 0.25
haz_gen_amor_see = ld.extend_by_copies(haz_gen_amor_se, 'w', extend_factor=ef)
nohaz_gen_amor_see = ld.extend_by_copies(nohaz_gen_amor_se, 'w', extend_factor=ef)

In [61]:
# Inspect the extended virtual Amors in the w-i plane.
cutcol = ['w', 'i']
# labels = [vd.colnames[nm] for nm in cutcol]
# vd.display_param2d(cutcol, labels, [haz_gen_amor_see, nohaz_gen_amor_see])
vd.plot_distributions2d(cutcol, haz_gen_amor_see, nohaz_gen_amor_see, labels=True)

Cut w and i columns and normalize datasets

# Keep the 'w' and 'i' columns (cutcol is set in the plotting cell above) of:
#   pairs[0] - all virtual Amors (evaluation)
#   pairs[1] - real Amors (evaluation)
#   pairs[2] - mirrored and extended virtual Amors (training)
pairs, amor_wi_sc = ld.cut_normalize(cutcol, [haz_gen_amor, nohaz_gen_amor], 
                                             [haz_real_amor, nohaz_real_amor],
                                             [haz_gen_amor_see, nohaz_gen_amor_see])
haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]
haz_gen_see_cut, nohaz_gen_see_cut = pairs[2]

Train SVM

# Training set: mirrored and extended virtual Amors.
xtrain, ytrain = ld.mix_up(haz_gen_see_cut, nohaz_gen_see_cut)

# Kernel SVM on (w, i); label 1 gets a weight of 5.5.
clf_amor_wi = svm.SVC(
    gamma=30.,
    C=0.1,
    class_weight={1: 5.5},
).fit(xtrain, ytrain)

# reload(al)
# clf_amor_wi = svm.SVC(gamma=30., C=0.1, class_weight={1: 5.5})
# # clf = svm.SVC(gamma=12., C=0.02, class_weight={1: 5.5})
# # clf = svm.SVC(gamma=10., C=0.08, class_weight={1: 5.5})
# splitres = al.split_by_clf(clf_amor_wi, cutcol, haz_gen_amor_see, 
#                                                 nohaz_gen_amor_see, 
#                                                 haz_gen_amor, 
#                                                 nohaz_gen_amor)

# haz_gen_amor_wi, nohaz_gen_amor_wi = splitres[0]
# haz_gen_amor_wi__, nohaz_gen_amor_wi__ = splitres[1]
# amor_wi_sc = splitres[2]

In [65]:
# Evaluate the w-i classifier on all virtual Amors
# (index order: PHA-region PHAs, PHA-region NHAs, NHA-region PHAs, NHA-region NHAs).
predicted_gen = al.clf_split_quality(clf_amor_wi, haz_gen_cut, nohaz_gen_cut)

# PHA region: still low purity per the recorded report, refined by the
# w-om-q classifier below.
haz_gen_amor_wi = haz_gen_amor.iloc[predicted_gen[0]]
nohaz_gen_amor_wi = nohaz_gen_amor.iloc[predicted_gen[1]]

# NHA region that is cut off at this step.
haz_gen_amor_wi__ = haz_gen_amor.iloc[predicted_gen[2]]
nohaz_gen_amor_wi__ = nohaz_gen_amor.iloc[predicted_gen[3]]

purity of PHA region: 0.360488798371
number of PHAs in the PHA region: 3540
number of NHAs in the PHA region: 6280

purity of NHA region: 0.962915686933
number of PHAs in the NHA region: 552
number of NHAs in the NHA region: 14333

fraction of correctly classified PHAs: 0.865102639296

Estimate split quality for real Amors

# Evaluate the w-i classifier on real Amors (same index order as above).
predicted_real = al.clf_split_quality(clf_amor_wi, haz_real_cut, nohaz_real_cut)

# PHA region, refined by the w-om-q classifier below.
haz_real_amor_wi = haz_real_amor.iloc[predicted_real[0]]
nohaz_real_amor_wi = nohaz_real_amor.iloc[predicted_real[1]]

# NHA region that is cut off at this step.
haz_real_amor_wi__ = haz_real_amor.iloc[predicted_real[2]]
nohaz_real_amor_wi__ = nohaz_real_amor.iloc[predicted_real[3]]

purity of PHA region: 0.3648
number of PHAs in the PHA region: 228
number of NHAs in the PHA region: 397

purity of NHA region: 0.966490299824
number of PHAs in the NHA region: 19
number of NHAs in the NHA region: 548

fraction of correctly classified PHAs: 0.923076923077

Plot decision surface

# Draw the w-i decision boundary with the virtual Amors overlaid.
# `cutcol` is still ['w', 'i']; extend_factors passes the same `ef` that was
# used to extend the training data along 'w' (and 0 for 'i').
vd.plot_clf2d(clf_amor_wi, cutcol, num=400, haz_cut=haz_gen_cut, nohaz_cut=nohaz_gen_cut, s=2,
              figsize=(8,8), scales=amor_wi_sc, labels=True, cmap='winter', extend_factors=[ef, 0]) 
# plotbounds=plt_bounds

# vd.display_param2d(cutcol, labels, [haz_gen_amor_wi, nohaz_gen_amor_wi])

Extract PHA region by a w-om-q surface

In [69]:
# Extend the PHA-region Amors with copies mirrored in 'w' about 180 degrees.
# Note that the single-mirror helper is used here, unlike the double-mirror
# helper used for the earlier splits.
haz_gen_amor_wi_se = ld.add_mirror_column(haz_gen_amor_wi, 'w', 180.0)
nohaz_gen_amor_wi_se = ld.add_mirror_column(nohaz_gen_amor_wi, 'w', 180.0)

# ef = 0.25
# haz_gen_amor_wi_see = ld.extend_by_copies(haz_gen_amor_wi_se , 'w', extend_factor=ef)
# nohaz_gen_amor_wi_see = ld.extend_by_copies(nohaz_gen_amor_wi_se , 'w', extend_factor=ef)

# Inspect the mirrored subsets in the w-om plane.
cutcol = ['w', 'om']
vd.plot_distributions2d(cutcol, haz_gen_amor_wi_se, nohaz_gen_amor_wi_se, labels=True)

Cut w, om and q columns and normalize datasets

In [72]:
# Keep the 'w', 'om', 'q' columns of three (haz, nohaz) pairs:
#   pairs[0] - virtual Amors in the w-i PHA region (evaluation)
#   pairs[1] - real Amors in the w-i PHA region (evaluation)
#   pairs[2] - mirrored virtual subsets (training)
cutcol = ['w', 'om', 'q']
pairs, amor_womq_sc = ld.cut_normalize(cutcol, [haz_gen_amor_wi, nohaz_gen_amor_wi], 
                                               [haz_real_amor_wi, nohaz_real_amor_wi], 
                                               [haz_gen_amor_wi_se, nohaz_gen_amor_wi_se])
haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]
haz_gen_se_cut, nohaz_gen_se_cut = pairs[2]

Train SVM

# Training set: mirrored virtual Amors from the w-i PHA region.
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)

# Kernel SVM on (w, om, q); label 0 gets a weight of 2.4.
clf_amor_womq = svm.SVC(
    gamma=20.0,
    C=8.0,
    class_weight={0: 2.4},
).fit(xtrain, ytrain)

# cutcol = ['w', 'om', 'q']
# # clf = svm.SVC(gamma=10.0, C=1000, class_weight={0: 2.0})
# clf_amor_womq = svm.SVC(gamma=20.0, C=8.0, class_weight={0: 2.4})
# splitres = al.split_by_clf(clf_amor_womq, cutcol, haz_gen_amor_wi_se, 
#                                                   nohaz_gen_amor_wi_se,
#                                                   haz_gen_amor_wi, 
#                                                   nohaz_gen_amor_wi)

# haz_gen_amor_wqom, nohaz_gen_amor_wqom = splitres[0]
# haz_gen_amor_wqom__, nohaz_gen_amor_wqom__ = splitres[1]
# amor_wqom_sc = splitres[2]

In [75]:
# Evaluate the w-om-q classifier on the virtual Amors kept by the w-i split
# (index order: PHA-region PHAs, PHA-region NHAs, NHA-region PHAs, NHA-region NHAs).
predicted_gen = al.clf_split_quality(clf_amor_womq, haz_gen_cut, nohaz_gen_cut)

# The indices refer to rows of the w-i PHA-region frames (*_amor_wi).
haz_gen_amor_womq = haz_gen_amor_wi.iloc[predicted_gen[0]]
nohaz_gen_amor_womq = nohaz_gen_amor_wi.iloc[predicted_gen[1]]

# Objects assigned to the NHA region by this classifier.
haz_gen_amor_womq__ = haz_gen_amor_wi.iloc[predicted_gen[2]]
nohaz_gen_amor_womq__ = nohaz_gen_amor_wi.iloc[predicted_gen[3]]

purity of PHA region: 0.881790744467
number of PHAs in the PHA region: 1753
number of NHAs in the PHA region: 235

purity of NHA region: 0.771833503575
number of PHAs in the NHA region: 1787
number of NHAs in the NHA region: 6045

fraction of correctly classified PHAs: 0.495197740113

Estimate split quality for real Amors

# Evaluate the w-om-q classifier on the real Amors kept by the w-i split.
predicted_real = al.clf_split_quality(clf_amor_womq, haz_real_cut, nohaz_real_cut)

# The indices refer to rows of the w-i PHA-region frames (*_amor_wi).
haz_real_amor_womq = haz_real_amor_wi.iloc[predicted_real[0]]
nohaz_real_amor_womq = nohaz_real_amor_wi.iloc[predicted_real[1]]

# Objects assigned to the NHA region by this classifier.
haz_real_amor_womq__ = haz_real_amor_wi.iloc[predicted_real[2]]
nohaz_real_amor_womq__ = nohaz_real_amor_wi.iloc[predicted_real[3]]

purity of PHA region: 0.914728682171
number of PHAs in the PHA region: 118
number of NHAs in the PHA region: 11

purity of NHA region: 0.778225806452
number of PHAs in the NHA region: 110
number of NHAs in the NHA region: 386

fraction of correctly classified PHAs: 0.517543859649

# Plot the w-om-q decision surface; no domain mask is passed for this plot.
cutcol = ['w', 'om', 'q']
vd.plot_clf3d(clf_amor_womq, cutcol, num=250, labels=True, figsize=(10,9), mode='2d', scales=amor_womq_sc)

Amor divisions quality

Divisions quality for virtual Amors

# The accumulator lists are still empty at this point in the visible code.
# Fill them with the PHA-region subsets of the final (w-om-q) Amor split;
# their sizes (1753 PHAs, 235 NHAs) match the recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_gen_extracted_am = [haz_gen_amor_womq]
nohaz_gen_trapped_am = [nohaz_gen_amor_womq]

# Summarise extracted PHAs / trapped NHAs against the whole virtual Amor group.
vd.print_summary(haz_gen_extracted_am, nohaz_gen_trapped_am, haz_gen_amor, nohaz_gen_amor, 'virtual')

Number of correctly classified virtual PHAs 1753
Number of trapped virtual NHAs: 235

Mass fraction of correctly classified virtual PHAs: 0.428396871945
Mass fraction of trapped virtual NHAs: 0.0114005724543

Cummulative purity of the outlined PHA regions: 0.881790744467

Divisions quality for real Amors

# The accumulator lists are still empty at this point in the visible code.
# Fill them with the PHA-region subsets of the final (w-om-q) Amor split
# applied to real asteroids; their sizes (118 PHAs, 11 NHAs) match the
# recorded summary output.
# NOTE(review): assumes vd.print_summary accepts lists of DataFrames -- confirm.
haz_real_extracted_am = [haz_real_amor_womq]
nohaz_real_trapped_am = [nohaz_real_amor_womq]

# Summarise extracted PHAs / trapped NHAs against the whole real Amor group.
vd.print_summary(haz_real_extracted_am, nohaz_real_trapped_am, haz_real_amor, nohaz_real_amor, 'real')

Number of correctly classified real PHAs 118
Number of trapped real NHAs: 11

Mass fraction of correctly classified real PHAs: 0.477732793522
Mass fraction of trapped real NHAs: 0.0116402116402

Cummulative purity of the outlined PHA regions: 0.914728682171

Compute cumulative split quality

Virtual asteroids

# Concatenate the per-group accumulator lists (Atiras & Atens, Apollos, Amors)
# and summarise them against the complete virtual domain (haz_gen / nohaz_gen,
# i.e. before the q cut).
# NOTE(review): the result is only meaningful if the per-group lists have been
# populated; nothing visible in this file appends to them.
haz_gen_extracted = haz_gen_extracted_aa + haz_gen_extracted_ap + haz_gen_extracted_am
nohaz_gen_trapped = nohaz_gen_trapped_aa + nohaz_gen_trapped_ap + nohaz_gen_trapped_am

vd.print_summary(haz_gen_extracted, nohaz_gen_trapped, haz_gen, nohaz_gen, 'virtual')

Number of correctly classified virtual PHAs 23289
Number of trapped virtual NHAs: 2257

Mass fraction of correctly classified virtual PHAs: 0.858643955315
Mass fraction of trapped virtual NHAs: 0.0453021818109

Cummulative purity of the outlined PHA regions: 0.911649573319

Real asteroids

# Concatenate the per-group accumulator lists (Atiras & Atens, Apollos, Amors)
# and summarise them against the complete real domain (haz_real / nohaz_real,
# i.e. before the q cut).
# NOTE(review): the result is only meaningful if the per-group lists have been
# populated; nothing visible in this file appends to them.
haz_real_extracted = haz_real_extracted_aa + haz_real_extracted_ap + haz_real_extracted_am
nohaz_real_trapped = nohaz_real_trapped_aa + nohaz_real_trapped_ap + nohaz_real_trapped_am

vd.print_summary(haz_real_extracted, nohaz_real_trapped, haz_real, nohaz_real, 'real')

Number of correctly classified real PHAs 1787
Number of trapped real NHAs: 131

Mass fraction of correctly classified real PHAs: 0.897088353414
Mass fraction of trapped real NHAs: 0.0558873720137

Cummulative purity of the outlined PHA regions: 0.931699687174

