In [ ]:
# setup
import planet4 as p4
from planet4.dbscan import DBScanner
from planet4 import io, markings
import pandas as pd
from pathlib import Path
import pkg_resources as pr
with pr.resource_stream('planet4', 'data/test_db.csv') as f:
    data = pd.read_csv(f)
# import warnings
# with warnings.catch_warnings():
# warnings.filterwarnings("ignore",category=DeprecationWarning)
from tempfile import TemporaryDirectory
_tdir = TemporaryDirectory()
tdir = Path(_tdir.name)
imid1 = 'APF000012w'
imid2 = 'APF000012q'
imid1data = data[data.image_id==imid1]
imid2data = data[data.image_id==imid2]
fans1 = imid1data[imid1data.marking=='fan']
blotches1 = imid1data[imid1data.marking=='blotch']
fans2 = imid2data[imid2data.marking=='fan']
blotches2 = imid2data[imid2data.marking=='blotch']
In [ ]:
dbscanner = DBScanner(save_results=False)
In [ ]:
from planet4 import region_data
In [ ]:
region_data.Ithaca.season2
In [ ]:
obsid = region_data.Ithaca.season2[-1]
dbscanner.cluster_image_name(obsid)
In [ ]:
db = io.DBManager()
In [ ]:
data = db.get_image_name_markings(obsid)
In [ ]:
data.shape
In [ ]:
db.dbname
In [ ]:
dbscanner.eps_values
In [ ]:
dbscanner.cluster_image_id('12w')
In [ ]:
p4id = markings.ImageID('12w')
In [ ]:
dbscanner.reduced_data['fan'].shape
In [ ]:
%matplotlib inline
In [ ]:
dbscanner.cluster_and_plot('17a', 'blotch')
In [ ]:
dbscanner.parameter_scan('17a', 'fan', [0.1, 0.13], [50, 60, 70], size_to_scan='small')
In [ ]:
dbscanner.save_results
In [ ]:
# test_calc_fnotch
from planet4 import clustering

actual = clustering.calc_fnotch(4, 4)
assert actual == 0.5
actual = clustering.calc_fnotch(4, 0)
assert actual == 1
actual = clustering.calc_fnotch(0, 4)
assert actual == 0
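In [ ]:
# Judging by the asserts above, calc_fnotch appears to return the fan fraction of
# the combined fan and blotch votes for a cluster. A minimal sketch of that presumed
# formula (illustrative only, not the library implementation):
def fnotch_value(n_fans, n_blotches):
    """Fraction of fan markings among all markings competing for one cluster."""
    return n_fans / (n_fans + n_blotches)

assert fnotch_value(4, 4) == 0.5
assert fnotch_value(4, 0) == 1
assert fnotch_value(0, 4) == 0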
In [ ]:
# test_dbscan_xy_blotch
# using only x and y (or image_x,image_y)
coords = ['image_x','image_y']
X = blotches1[coords].values
dbscanner = clustering.DBScanner(X, min_samples=2)
assert dbscanner.n_clusters == 26
assert dbscanner.n_rejected == 25
In [ ]:
# test_dbscan_xy_fan
# using only x and y (or image_x,image_y)
coords = ['image_x','image_y']
X = fans1[coords].values
dbscanner = clustering.DBScanner(X, min_samples=2)
assert dbscanner.n_clusters == 7
assert dbscanner.n_rejected == 11
In [ ]:
# test_dbscan_xy_angle_blotch
coords = ['image_x','image_y', 'angle']
X = blotches1[coords].values
dbscanner = clustering.DBScanner(X, min_samples=2)
assert dbscanner.n_clusters == 35
assert dbscanner.n_rejected == 102
In [ ]:
# test_dbscan_xy_angle_fan
coords = ['image_x','image_y', 'angle']
X = fans1[coords].values
dbscanner = clustering.DBScanner(X, min_samples=2)
assert dbscanner.n_clusters == 6
assert dbscanner.n_rejected == 15
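In [ ]:
# For reference, a minimal sketch of the kind of clustering these tests exercise:
# plain scikit-learn DBSCAN on the x/y pixel coordinates, counting clusters and
# noise ("rejected") points. The library's DBScanner may pick eps differently and
# treat the angle dimension specially; eps=25 here is an illustrative value only.
from sklearn.cluster import DBSCAN

X = fans1[['image_x', 'image_y']].values
labels = DBSCAN(eps=25, min_samples=2).fit(X).labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_rejected = (labels == -1).sum()
n_clusters, n_rejected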
In [ ]:
# test_clustering_basics
# cm is assumed to be a planet4 ClusteringManager instance created earlier,
# with its output directory pointing at tdir.
cm.cluster_image_id(imid1, data=imid1data)
assert cm.n_classifications == 28
In [ ]:
cm.cluster_image_id(imid2, data=imid2data)
assert cm.n_classifications == 23
for subdir in ['just_clustering']:  # 'applied_cut_0.5',
    expected = tdir / subdir
    assert expected.exists() and expected.is_dir()
In [ ]:
# test_output_file_creation_just_clustering
for marking in ['blotches', 'fans']:
    expected = tdir / 'just_clustering' / (imid1 + '_' + marking + '.csv')
    assert expected.exists()
for marking in ['blotches', 'fans']:
    expected = tdir / 'just_clustering' / (imid2 + '_' + marking + '.csv')
    if marking == 'blotches':
        assert expected.exists()
    else:  # 12q, i.e. imid2 only has blotches
        assert not expected.exists()
In [ ]:
s = """
x y image_x image_y angle spread distance user_name marking classification_id
67.0 320.0 67.0 27172.0 90.0 2.01745014480398 10.0 test_user fan 50ec9c10861cf8095600017b
74.0 318.0 74.0 27170.0 90.0 2.01745014480398 10.0 test_user fan 50ec9c10861cf8095600017b
82.0 336.0 82.0 27188.0 90.0 2.01745014480398 10.0 test_user fan 50ec9c10861cf8095600017b
57.0 310.0 57.0 27162.0 90.0 2.01745014480398 10.0 test_user fan 50ec9c10861cf8095600017b
60.0 315.0 60.0 27167.0 35.21759296819272 10.434250055350423 41.617304093369626 test_user fan 50ec9c10861cf8095600017b
63.0 307.0 63.0 27159.0 62.38162109858792 2.01745014480398 437.89610639968015 test_user fan 50ec9c10861cf8095600017b
"""
In [ ]:
from io import StringIO
In [ ]:
df = pd.read_csv(StringIO(s), sep=r'\s+')
df
In [ ]:
cm.data = df
cm.pm.id_ = 'test_id'
In [ ]:
cm.cluster_data()
In [ ]:
cm.reduced_data['fan'][0]
In [ ]:
def hunt_bug(fname):
    """Return the number of fully duplicated rows in a clustering output CSV."""
    df = pd.read_csv(fname)
    return df[df.duplicated()].shape[0]
In [ ]:
# p is assumed to be a pathlib.Path pointing at the directory holding the clustering output CSVs.
fnames = p.glob('*.csv')
obsids = []
no_of_dupes = []
kind = []
for fname in fnames:
    tokens = fname.name.split('_')
    if fname.name.startswith('ESP'):
        # HiRISE obsids like ESP_xxxxxx_xxxx contain two underscores themselves
        obsids.append('_'.join(tokens[:3]))
        kind.append(tokens[3].split('.')[0])
    else:
        obsids.append(tokens[0])
        kind.append(tokens[1].split('.')[0])
    no_of_dupes.append(hunt_bug(fname))
In [ ]:
df = pd.DataFrame(dict(obsids=obsids, no_of_dupes=no_of_dupes,
kind=kind))
df
In [ ]:
%matplotlib nbagg
In [ ]:
db.get_image_id_markings('apx').iloc[0]
In [ ]:
db.get_image_id_markings('ani').iloc[0]
In [ ]:
from planet4 import plotting

plotting.plot_clustered_fans('apx', _dir=tdir)
In [ ]:
plotting.plot_clustered_fans('ani', _dir=tdir)
In [ ]:
plotting.plot_raw_fans('ani')
In [ ]:
newblotches = cm.newblotches.apply(lambda x: x.store())
In [ ]:
newblotches[newblotches.duplicated(keep=False)].head()
In [ ]:
cm.pm.fnotchdf.head()
In [ ]:
cm.pm.fnotchdf.filter(regex='_image_id').head()
In [ ]:
cm.pm.fnotchdf.iloc[2:4].T
In [ ]:
fn1 = markings.Fnotch.from_series(cm.pm.fnotchdf.iloc[2], scope='hirise')
In [ ]:
fn2 = markings.Fnotch.from_series(cm.pm.fnotchdf.iloc[3], scope='hirise')
In [ ]:
fn1.fan
In [ ]:
fn2.fan
In [ ]:
from numpy.linalg import norm

norm(fn1.fan.midpoint - fn2.fan.midpoint)
In [ ]:
p4id = markings.ImageID('apx', data=data, scope='planet4')
In [ ]:
%matplotlib inline
In [ ]:
p4id.plot_blotches()
In [ ]:
from planet4 import plotting
In [ ]:
plotting.plot_clustered_blotches('apx', _dir=tdir)
In [ ]:
pm = io.PathManager(id_='apx', datapath=tdir)
In [ ]:
pm.reduced_blotchfile
In [ ]:
tdir
In [ ]:
list(tdir.glob('just_clustering/*'))
In [ ]:
newblotches = cm.newblotches.apply(lambda x: x.store())
In [ ]:
newblotches.head()
In [ ]:
b1 = markings.Blotch(cm.newblotches.iloc[0].data)
b2 = markings.Blotch(cm.newblotches.iloc[1].data)
b1 == b2
In [ ]:
df = cm.pm.fnotchdf
In [ ]:
df.apply?
In [ ]:
df.duplicated().value_counts()
In [ ]:
final_clusters = df.apply(markings.Fnotch.from_series, axis=1).apply(lambda x: x.get_marking(0.5))
In [ ]:
pd.set_option('display.width', 10000)
In [ ]:
final_clusters.head()
In [ ]:
df.filter(regex='fan_').head()
In [ ]:
df.filter(regex='blotch_').head()
In [ ]:
from planet4 import markings
In [ ]:
fnotch = markings.Fnotch.from_series(df.iloc[0], scope='planet4')
In [ ]:
from numpy.linalg import norm
In [ ]:
norm(fnotch.blotch.center - fnotch.fan.midpoint)
In [ ]:
def filter_for_fans(x):
    if isinstance(x, markings.Fan):
        return x


def filter_for_blotches(x):
    if isinstance(x, markings.Blotch):
        return x
In [ ]:
final_clusters.apply(filter_for_blotches)
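In [ ]:
# The complementary filter: filter_for_fans returns None for blotch entries,
# so dropna keeps only the clusters that resolved to fans.
final_clusters.apply(filter_for_fans).dropna()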
In [ ]:
pd.read_csv(cm.pm.final_fanfile).duplicated().value_counts()
In [ ]:
# teardown
_tdir.cleanup()