In [1]:
import pickle
from copy import copy, deepcopy
import numpy as np
import pandas as pd
from sklearn import neighbors, svm
import matplotlib as mpl
# Import Asterion modules
import read_database as rdb
import learn_data as ld
import asterion_learn as al
import visualize_data as vd
# Plotting settings for the current notebook
%matplotlib inline
# %matplotlib auto
# Font settings applied to matplotlib's 'font' rc group for this notebook.
# font = {'size': 25}  # larger alternative, currently disabled
font = {'size': 14}
mpl.rc('font', **font)
Load NEAs from the 2nd domain
In [2]:
# Directory that holds the serialized asteroid datasets.
dirpath = './asteroid_data/'

# File-name suffixes appended to every dataset base name below.
name_sufixes = ['_dom2.p', '_dom2_rest.p']

# Dataset base names; each list is ordered [hazardous, non-hazardous].
real_datasets = ['haz_real', 'nohaz_real']
gen_datasets = ['haz_gen', 'nohaz_gen']
# NOTE(review): the "genu" family uses exactly the same base names as the
# "gen" family, so both resolve to the same files. Confirm whether distinct
# '*_genu' dumps were intended here.
genu_datasets = list(gen_datasets)
In [3]:
# Build the dump paths of every dataset family. The suffix loop is the outer
# one, so each list is ordered
#   [haz + suffix0, nohaz + suffix0, haz + suffix1, nohaz + suffix1]
# which is the order the 4-way unpacking below relies on.
dumps_real = [''.join([dirpath, ds, ns]) for ns in name_sufixes for ds in real_datasets]
dumps_gen = [''.join([dirpath, ds, ns]) for ns in name_sufixes for ds in gen_datasets]
# NOTE(review): genu_datasets equals gen_datasets, so these paths are the same
# as dumps_gen and the files are loaded a second time.
dumps_genu = [''.join([dirpath, ds, ns]) for ns in name_sufixes for ds in genu_datasets]

# Load every dump through rdb.loadObject, in list order.
haz_real, nohaz_real, haz_real_rest, nohaz_real_rest = (
    rdb.loadObject(path) for path in dumps_real)
haz_gen, nohaz_gen, haz_gen_rest, nohaz_gen_rest = (
    rdb.loadObject(path) for path in dumps_gen)
haz_genu, nohaz_genu, haz_genu_rest, nohaz_genu_rest = (
    rdb.loadObject(path) for path in dumps_genu)
In [4]:
# Size of each population in the domain: hazardous + non-hazardous objects.
gen_num = len(haz_gen) + len(nohaz_gen)
real_num = len(haz_real) + len(nohaz_real)

# A single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print("Number of virtual asteroids in the domain: %s" % (gen_num,))
print("Number of real asteroids in the domain: %s" % (real_num,))
Investigate distributions of asteroids' orbital parameters in the 2nd domain
In [5]:
# vd.plot_alldistcombs(haz_gen, nohaz_gen, labels=True)
In [6]:
# Accumulators for the Atiras + Atens section: later cells add the subsets
# produced by each split and pass the lists to vd.print_summary.
haz_gen_extracted_aa, nohaz_gen_trapped_aa = [], []
haz_real_extracted_aa, nohaz_real_trapped_aa = [], []
Atiras
In [7]:
# rdb.get_atiras returns a (subset, count) pair for the dataset it is given.
# Virtual asteroids:
haz_gen_atiras, haz_gen_atiras_num = rdb.get_atiras(haz_gen)
nohaz_gen_atiras, nohaz_gen_atiras_num = rdb.get_atiras(nohaz_gen)
atiras_gen_num = haz_gen_atiras_num + nohaz_gen_atiras_num

# Real asteroids:
haz_real_atiras, haz_real_atiras_num = rdb.get_atiras(haz_real)
nohaz_real_atiras, nohaz_real_atiras_num = rdb.get_atiras(nohaz_real)
atiras_real_num = haz_real_atiras_num + nohaz_real_atiras_num

# print(...) with one pre-formatted argument works on Python 2 and 3 alike
# and produces the same output as the former print statement.
print("Number of virtual Atiras: %s" % (atiras_gen_num,))
print("Number of real Atiras: %s" % (atiras_real_num,))
Atens
In [8]:
# rdb.get_atens returns a (subset, count) pair for the dataset it is given.
# Virtual asteroids:
haz_gen_atens, haz_gen_atens_num = rdb.get_atens(haz_gen)
nohaz_gen_atens, nohaz_gen_atens_num = rdb.get_atens(nohaz_gen)
atens_gen_num = haz_gen_atens_num + nohaz_gen_atens_num

# Real asteroids:
haz_real_atens, haz_real_atens_num = rdb.get_atens(haz_real)
nohaz_real_atens, nohaz_real_atens_num = rdb.get_atens(nohaz_real)
atens_real_num = haz_real_atens_num + nohaz_real_atens_num

# print(...) with one pre-formatted argument works on Python 2 and 3 alike
# and produces the same output as the former print statement.
print("Number of virtual Atens: %s" % (atens_gen_num,))
print("Number of real Atens: %s" % (atens_real_num,))
Atiras + Atens
In [9]:
# Virtual asteroids: stack the Atira rows on top of the Aten rows, per class.
haz_gen_atiras_atens = pd.concat([haz_gen_atiras, haz_gen_atens])
nohaz_gen_atiras_atens = pd.concat([nohaz_gen_atiras, nohaz_gen_atens])

# Real asteroids: same merge.
haz_real_atiras_atens = pd.concat([haz_real_atiras, haz_real_atens])
nohaz_real_atiras_atens = pd.concat([nohaz_real_atiras, nohaz_real_atens])

# Group sizes per class and per population.
haz_gen_atiras_atens_num = len(haz_gen_atiras_atens)
nohaz_gen_atiras_atens_num = len(nohaz_gen_atiras_atens)
haz_real_atiras_atens_num = len(haz_real_atiras_atens)
nohaz_real_atiras_atens_num = len(nohaz_real_atiras_atens)

atiras_atens_gen_num = haz_gen_atiras_atens_num + nohaz_gen_atiras_atens_num
atiras_atens_real_num = haz_real_atiras_atens_num + nohaz_real_atiras_atens_num
In [10]:
# Report the virtual Atiras + Atens group. Each line is a single formatted
# string passed to print(...), which is valid on both Python 2 and Python 3
# and prints exactly what the former print statements did.
print("Number of virtual PHAs in the group: %s" % (haz_gen_atiras_atens_num,))
print("Number of virtual NHAs in the group: %s" % (nohaz_gen_atiras_atens_num,))
print("Number of virtual Atiras and Atens: %s" % (atiras_atens_gen_num,))
# float(...) keeps the ratio a true division under Python 2 as well.
print("Virtual Atiras and Atens group weight: %s" % (float(atiras_atens_gen_num)/gen_num,))
In [11]:
# Report the real Atiras + Atens group. Each line is a single formatted
# string passed to print(...), which is valid on both Python 2 and Python 3
# and prints exactly what the former print statements did.
print("Number of real PHAs in the group: %s" % (haz_real_atiras_atens_num,))
print("Number of real NHAs in the group: %s" % (nohaz_real_atiras_atens_num,))
print("Number of real Atiras and Atens: %s" % (atiras_atens_real_num,))
# float(...) keeps the ratio a true division under Python 2 as well.
print("Real Atiras and Atens group weight: %s" % (float(atiras_atens_real_num)/real_num,))
In [12]:
# Columns used for the 2-D view and for the line split in the next cells.
cutcol = ['a', 'q']
# Two points defining the dividing line drawn on the plot and later handed to
# ld.split_by_line (presumably given as (a, q) coordinates -- TODO confirm).
p1 = [0.545, 0.15]
p2 = [0.945, 0.95]
vd.plot_distributions2d(
    cutcol, haz_gen_atiras_atens, nohaz_gen_atiras_atens,
    line=[p1, p2], labels=True)
In [13]:
# Split the virtual group by the p1-p2 line in the cutcol plane.
# splitres[0] and splitres[1] each hold a (hazardous, non-hazardous) pair;
# the second pair is stored under the trailing-double-underscore names.
splitres = ld.split_by_line(
    haz_gen_atiras_atens, nohaz_gen_atiras_atens, [p1, p2], cutcol)
(haz_gen_atiras_atens_qa, nohaz_gen_atiras_atens_qa), \
    (haz_gen_atiras_atens_qa__, nohaz_gen_atiras_atens_qa__) = splitres[0], splitres[1]
In [14]:
# Split the real group by the same p1-p2 line in the cutcol plane.
# splitres[0] and splitres[1] each hold a (hazardous, non-hazardous) pair;
# the second pair is stored under the trailing-double-underscore names.
splitres = ld.split_by_line(
    haz_real_atiras_atens, nohaz_real_atiras_atens, [p1, p2], cutcol)
(haz_real_atiras_atens_qa, nohaz_real_atiras_atens_qa), \
    (haz_real_atiras_atens_qa__, nohaz_real_atiras_atens_qa__) = splitres[0], splitres[1]
Amplify the datasets with their symmetric copies over the w parameter
In [15]:
# Extended ("_se") versions of the virtual subsets, built by
# ld.add_doublemirror_column on column 'w' with the value 180.0.
# Only the virtual data is extended; the real data is left untouched.
haz_gen_atiras_atens_qa_se, nohaz_gen_atiras_atens_qa_se = (
    ld.add_doublemirror_column(haz_gen_atiras_atens_qa, 'w', 180.0),
    ld.add_doublemirror_column(nohaz_gen_atiras_atens_qa, 'w', 180.0),
)
Cut the w, a and i columns and normalize the datasets
In [16]:
# Feature columns of the first classifier stage.
cutcol = ['w', 'a', 'i']

# ld.cut_normalize receives three (hazardous, non-hazardous) pairs and returns
# one processed pair per input, in the same order, plus the scales object that
# is later passed to the plotting helper.
pairs, atiras_atens_wai_sc = ld.cut_normalize(
    cutcol,
    [haz_gen_atiras_atens_qa, nohaz_gen_atiras_atens_qa],        # virtual
    [haz_real_atiras_atens_qa, nohaz_real_atiras_atens_qa],      # real
    [haz_gen_atiras_atens_qa_se, nohaz_gen_atiras_atens_qa_se],  # virtual, extended
)

gen_pair, real_pair, gen_se_pair = pairs[0], pairs[1], pairs[2]
haz_gen_cut, nohaz_gen_cut = gen_pair
haz_real_cut, nohaz_real_cut = real_pair
haz_gen_se_cut, nohaz_gen_se_cut = gen_se_pair
del gen_pair, real_pair, gen_se_pair  # temporaries only; keep the namespace unchanged
Find decision surface with SVM
In [17]:
# Training set: the extended virtual subsets merged by ld.mix_up into a
# feature matrix and a label vector.
xtrain, ytrain = ld.mix_up(haz_gen_se_cut, nohaz_gen_se_cut)

# SVC with the default kernel; fit() returns the estimator itself, so the
# construction and the training can be chained.
clf_atiras_atens_wai = svm.SVC(
    gamma=80.0, C=0.1, class_weight={0: 1.5}).fit(xtrain, ytrain)
In [18]:
# cutcol = ['w', 'a', 'i']
# clf_aa_wai = svm.SVC(gamma=80.0, C=0.1, class_weight={0: 1.5}) #class_weight={0: 1.5}
# #(20 0.5), (30 0.1) (200 0.1)
# splitres = al.split_by_clf(clf_aa_wai, cutcol, haz_gen_atiras_atens_qa_se,
# nohaz_gen_atiras_atens_qa_se,
# haz_gen_atiras_atens_qa,
# nohaz_gen_atiras_atens_qa)
# haz_gen_atiras_atens_wqa, nohaz_gen_atiras_atens_wqa = splitres[0]
# haz_gen_atiras_atens_wqa__, nohaz_gen_atiras_atens_wqa__ = splitres[1]
# aa_wai_sc = splitres[2]
Estimate split quality for virtual Atiras & Atens
In [19]:
# al.clf_split_quality yields four positional selectors: [0] and [2] address
# rows of the hazardous frame, [1] and [3] rows of the non-hazardous frame.
# Selectors 0/1 give the "*_wqa" subsets, 2/3 the remaining "*_wqa__" ones.
predicted_gen = al.clf_split_quality(
    clf_atiras_atens_wai, haz_gen_cut, nohaz_gen_cut)

haz_gen_atiras_atens_wqa, haz_gen_atiras_atens_wqa__ = (
    haz_gen_atiras_atens_qa.iloc[predicted_gen[0]],
    haz_gen_atiras_atens_qa.iloc[predicted_gen[2]],
)
nohaz_gen_atiras_atens_wqa, nohaz_gen_atiras_atens_wqa__ = (
    nohaz_gen_atiras_atens_qa.iloc[predicted_gen[1]],
    nohaz_gen_atiras_atens_qa.iloc[predicted_gen[3]],
)
Estimate split quality for real Atiras & Atens
In [20]:
# Same classifier applied to the real data. al.clf_split_quality yields four
# positional selectors: [0] and [2] address rows of the hazardous frame,
# [1] and [3] rows of the non-hazardous frame.
predicted_real = al.clf_split_quality(
    clf_atiras_atens_wai, haz_real_cut, nohaz_real_cut)

haz_real_atiras_atens_wqa, haz_real_atiras_atens_wqa__ = (
    haz_real_atiras_atens_qa.iloc[predicted_real[0]],
    haz_real_atiras_atens_qa.iloc[predicted_real[2]],
)
nohaz_real_atiras_atens_wqa, nohaz_real_atiras_atens_wqa__ = (
    nohaz_real_atiras_atens_qa.iloc[predicted_real[1]],
    nohaz_real_atiras_atens_qa.iloc[predicted_real[3]],
)
Plot decision surface
In [21]:
# Draw the decision surface of the w-a-i classifier.
# Relies on cutcol still being ['w', 'a', 'i'] as set before the
# ld.cut_normalize call that produced atiras_atens_wai_sc.
vd.plot_clf3d(
    clf_atiras_atens_wai, cutcol,
    num=250, labels=True, figsize=(9,8),
    mode='2d', scales=atiras_atens_wai_sc)
In [22]:
# Record the subsets produced by the w-a-i split as the first stage of the
# Atiras + Atens summary lists.
# The lists are assigned rather than appended to, so re-running this cell
# cannot add the same subset twice and distort the later summary; on a
# top-to-bottom run the resulting contents are identical to appending to the
# empty lists created earlier.
haz_gen_extracted_aa = [haz_gen_atiras_atens_wqa]
nohaz_gen_trapped_aa = [nohaz_gen_atiras_atens_wqa]
haz_real_extracted_aa = [haz_real_atiras_atens_wqa]
nohaz_real_trapped_aa = [nohaz_real_atiras_atens_wqa]
In [23]:
# vd.display_allparams([haz_gen_atiras_atens_wqa__, nohaz_gen_atiras_atens_wqa__], vd.combs, vd.colnames)
In [24]:
# Switch back to the (a, q) columns and look at what the w-a-i classifier
# left in the "*_wqa__" subsets of the virtual data.
cutcol = ['a', 'q']
vd.plot_distributions2d(
    cutcol,
    haz_gen_atiras_atens_wqa__, nohaz_gen_atiras_atens_wqa__,
    labels=True)
Select asteroids with high values of perihelion distance
In [25]:
# Short aliases for the four "*_wqa__" subsets (virtual / real).
hg_gaa_wqa__ = haz_gen_atiras_atens_wqa__
nhg_gaa_wqa__ = nohaz_gen_atiras_atens_wqa__
hg_raa_wqa__ = haz_real_atiras_atens_wqa__
nhg_raa_wqa__ = nohaz_real_atiras_atens_wqa__

# Partition every subset at q = 0.5:
#   "*_bq" keeps rows with q > 0.5, "*_sq" keeps rows with q <= 0.5.
haz_gen_aa_wqa__bq = hg_gaa_wqa__.loc[hg_gaa_wqa__['q'] > 0.5]
haz_gen_aa_wqa__sq = hg_gaa_wqa__.loc[hg_gaa_wqa__['q'] <= 0.5]

nohaz_gen_aa_wqa__bq = nhg_gaa_wqa__.loc[nhg_gaa_wqa__['q'] > 0.5]
nohaz_gen_aa_wqa__sq = nhg_gaa_wqa__.loc[nhg_gaa_wqa__['q'] <= 0.5]

haz_real_aa_wqa__bq = hg_raa_wqa__.loc[hg_raa_wqa__['q'] > 0.5]
haz_real_aa_wqa__sq = hg_raa_wqa__.loc[hg_raa_wqa__['q'] <= 0.5]

nohaz_real_aa_wqa__bq = nhg_raa_wqa__.loc[nhg_raa_wqa__['q'] > 0.5]
nohaz_real_aa_wqa__sq = nhg_raa_wqa__.loc[nhg_raa_wqa__['q'] <= 0.5]
Cut the a and q columns and normalize the datasets
In [26]:
# Pin the feature columns of this stage explicitly. The value is the one a
# top-to-bottom run already has here, but setting it in this cell keeps the
# feature extraction correct even if a cell that rebinds cutcol to other
# columns was executed in between.
cutcol = ['a', 'q']

# ld.cut_normalize returns one processed (hazardous, non-hazardous) pair per
# input pair, in order, plus the scales object used later for plotting.
pairs, atiras_atens_aq_sc = ld.cut_normalize(
    cutcol,
    [haz_gen_aa_wqa__bq, nohaz_gen_aa_wqa__bq],    # virtual, q > 0.5
    [haz_real_aa_wqa__bq, nohaz_real_aa_wqa__bq],  # real, q > 0.5
)

# NOTE: these four names were previously bound to the w-a-i features. From
# here on they hold the a-q features, so the w-a-i evaluation cells must not
# be re-run after this cell without re-running the w-a-i cut first.
haz_gen_cut, nohaz_gen_cut = pairs[0]
haz_real_cut, nohaz_real_cut = pairs[1]
Train SVM
In [27]:
# Training set: the virtual a-q subsets merged by ld.mix_up into a feature
# matrix and a label vector.
xtrain, ytrain = ld.mix_up(haz_gen_cut, nohaz_gen_cut)

# SVC with the default kernel; fit() returns the estimator itself, so the
# construction and the training can be chained.
clf_atiras_atens_aq = svm.SVC(
    gamma=8.0, C=1000.0, class_weight={0: 1.5}).fit(xtrain, ytrain)
In [28]:
# cutcol = ['a', 'q']
# clf_aa_aq = svm.SVC(gamma=8.0, C=1000.0, class_weight={0: 1.5}) #class_weight={0: 1.5}
# #(20 0.5), (30 0.1) (200 0.1)
# splitres = al.split_by_clf(clf_aa_aq, cutcol, haz_gen_aa_wqa__bq,
# nohaz_gen_aa_wqa__bq)
# haz_gen_aa_aq, nohaz_gen_aa_aq = splitres[0]
# haz_gen_aa_aq__, nohaz_gen_aa_aq__ = splitres[1]
# aa_aq_sc = splitres[2]
Estimate split quality for virtual Atiras & Atens
In [29]:
# al.clf_split_quality yields four positional selectors: [0] and [2] address
# rows of the hazardous frame, [1] and [3] rows of the non-hazardous frame.
# Selectors 0/1 give the "*_aq" subsets, 2/3 the remaining "*_aq__" ones.
predicted_gen = al.clf_split_quality(
    clf_atiras_atens_aq, haz_gen_cut, nohaz_gen_cut)

haz_gen_atiras_atens_aq, haz_gen_atiras_atens_aq__ = (
    haz_gen_aa_wqa__bq.iloc[predicted_gen[0]],
    haz_gen_aa_wqa__bq.iloc[predicted_gen[2]],
)
nohaz_gen_atiras_atens_aq, nohaz_gen_atiras_atens_aq__ = (
    nohaz_gen_aa_wqa__bq.iloc[predicted_gen[1]],
    nohaz_gen_aa_wqa__bq.iloc[predicted_gen[3]],
)
Estimate split quality for real Atiras & Atens
In [30]:
# Same classifier applied to the real data. al.clf_split_quality yields four
# positional selectors: [0] and [2] address rows of the hazardous frame,
# [1] and [3] rows of the non-hazardous frame.
predicted_real = al.clf_split_quality(
    clf_atiras_atens_aq, haz_real_cut, nohaz_real_cut)

haz_real_atiras_atens_aq, haz_real_atiras_atens_aq__ = (
    haz_real_aa_wqa__bq.iloc[predicted_real[0]],
    haz_real_aa_wqa__bq.iloc[predicted_real[2]],
)
nohaz_real_atiras_atens_aq, nohaz_real_atiras_atens_aq__ = (
    nohaz_real_aa_wqa__bq.iloc[predicted_real[1]],
    nohaz_real_aa_wqa__bq.iloc[predicted_real[3]],
)
Plot decision surface
In [31]:
# Draw the decision surface of the a-q classifier together with the virtual
# a-q subsets. Relies on cutcol still being ['a', 'q'] at this point.
vd.plot_clf2d(
    clf_atiras_atens_aq, cutcol,
    num=400, figsize=(8, 8), scales=atiras_atens_aq_sc,
    labels=True, cmap='winter',
    haz_cut=haz_gen_cut, nohaz_cut=nohaz_gen_cut, gridlines=True)
In [32]:
# Summary lists for the Atiras + Atens group: the subsets of the w-a-i split
# followed by the subsets of the a-q split.
# Each list is rebuilt in full instead of being appended to, so re-running
# this cell cannot add the a-q subsets twice and distort the summary below;
# on a top-to-bottom run the contents are identical to the appended lists.
haz_gen_extracted_aa = [haz_gen_atiras_atens_wqa, haz_gen_atiras_atens_aq]
nohaz_gen_trapped_aa = [nohaz_gen_atiras_atens_wqa, nohaz_gen_atiras_atens_aq]
haz_real_extracted_aa = [haz_real_atiras_atens_wqa, haz_real_atiras_atens_aq]
nohaz_real_trapped_aa = [nohaz_real_atiras_atens_wqa, nohaz_real_atiras_atens_aq]
Divisions quality for virtual Atiras & Atens
In [33]:
# Summarize both split stages against the whole virtual Atiras + Atens group.
vd.print_summary(
    haz_gen_extracted_aa,
    nohaz_gen_trapped_aa,
    haz_gen_atiras_atens,
    nohaz_gen_atiras_atens,
    'virtual')
Divisions quality for real Atiras & Atens
In [34]:
# Summarize both split stages against the whole real Atiras + Atens group.
vd.print_summary(
    haz_real_extracted_aa,
    nohaz_real_trapped_aa,
    haz_real_atiras_atens,
    nohaz_real_atiras_atens,
    'real')
In [ ]:
In [35]:
# Accumulators for the Apollo section, mirroring the "*_aa" lists used for
# the Atiras + Atens group.
haz_gen_extracted_ap, nohaz_gen_trapped_ap = [], []
haz_real_extracted_ap, nohaz_real_trapped_ap = [], []
In [36]:
# rdb.get_apollos returns a (subset, count) pair for the dataset it is given.
# Virtual asteroids:
(haz_gen_apollo, haz_gen_apollo_num) = rdb.get_apollos(haz_gen)
(nohaz_gen_apollo, nohaz_gen_apollo_num) = rdb.get_apollos(nohaz_gen)
# Real asteroids:
(haz_real_apollo, haz_real_apollo_num) = rdb.get_apollos(haz_real)
(nohaz_real_apollo, nohaz_real_apollo_num) = rdb.get_apollos(nohaz_real)

# Group totals per population.
apollo_gen_num = haz_gen_apollo_num + nohaz_gen_apollo_num
apollo_real_num = haz_real_apollo_num + nohaz_real_apollo_num
Virtual Apollos
In [37]:
# Report the virtual Apollo group. Each line is a single formatted string
# passed to print(...), which is valid on both Python 2 and Python 3 and
# prints exactly what the former print statements did.
print("Number of virtual PHAs in the group: %s" % (haz_gen_apollo_num,))
print("Number of virtual NHAs in the group: %s" % (nohaz_gen_apollo_num,))
print("Number of virtual Apollo: %s" % (apollo_gen_num,))
# float(...) keeps the ratio a true division under Python 2 as well.
print("Apollo group weight: %s" % (float(apollo_gen_num)/gen_num,))
Real Apollos
In [38]:
# Report the real Apollo group. Each line is a single formatted string
# passed to print(...), which is valid on both Python 2 and Python 3 and
# prints exactly what the former print statements did.
print("Number of real PHAs in the group: %s" % (haz_real_apollo_num,))
print("Number of real NHAs in the group: %s" % (nohaz_real_apollo_num,))
print("Number of real Apollo: %s" % (apollo_real_num,))
# float(...) keeps the ratio a true division under Python 2 as well.
print("Apollo group weight: %s" % (float(apollo_real_num)/real_num,))
In [39]:
# vd.display_allparams([haz_gen_apollo, nohaz_gen_apollo], vd.combs, vd.colnames)
Amplify the datasets with their symmetric copies over the w parameter
In [40]:
# Extended ("_se") versions of the virtual Apollo subsets, built by
# ld.add_doublemirror_column on column 'w' with the value 180.0.
haz_gen_apollo_se, nohaz_gen_apollo_se = (
    ld.add_doublemirror_column(haz_gen_apollo, 'w', 180.0),
    ld.add_doublemirror_column(nohaz_gen_apollo, 'w', 180.0),
)
In [41]:
# haz_gen_apollo_see = ld.extend_by_copies(haz_gen_apollo_se, 'w', extend_factor=0.25)
# nohaz_gen_apollo_see = ld.extend_by_copies(nohaz_gen_apollo_se, 'w', extend_factor=0.25)
In [42]:
# 2-D view of the extended virtual Apollo subsets over the 'w' and 'i' columns.
cutcol = ['w', 'i']
vd.plot_distributions2d(
    cutcol, haz_gen_apollo_se, nohaz_gen_apollo_se,
    labels=True)