In [1]:
import holoviews as hv
hv.notebook_extension('bokeh')



In [2]:
import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client
client = Client(scheduler_file='/scratch/tmorton/dask/scheduler.json')

In [3]:
import glob
import sys
sys.path.append('..')
from explorer.catalog import ParquetCatalog, MatchedCatalog

def get_files(rerun, filt, tracts, unforced=False):
    table = 'unforced' if unforced else 'forced'
    return ['{}/plots/{}/tract-{}/{}.parq'.format(rerun, filt, t, table) for t in tracts]

filt = 'HSC-Z'
rerun = '/scratch/tmorton/hscRerun/DM-10859'

cosmos_files = get_files(rerun, filt, tracts=[9813])
cosmos_files_unforced = get_files(rerun, filt, tracts=[9813], unforced=True)
wide_files = get_files(rerun, filt, tracts=[8766, 8767])
wide_files_unforced = get_files(rerun, filt, tracts=[8766, 8767], unforced=True)

cosmos = ParquetCatalog(cosmos_files, client=client)
cosmos_unforced = ParquetCatalog(cosmos_files_unforced, client=client)
wide = ParquetCatalog(wide_files, client=client)
wide_unforced = ParquetCatalog(wide_files_unforced, client=client)

In [4]:
import numpy as np
import fastparquet

def perturb_coords(df, scatter=0.01):
    new_df = df.copy()
    new_df.index = new_df.index.astype(int)*10
    
    n = len(new_df)
    r = np.random.randn(n)*scatter * 1./3600 * np.pi / 180
    pa = np.random.random(n)*2*np.pi
    dra = r*np.cos(pa)
    ddec = r*np.sin(pa)
    
    new_df.coord_ra += dra
    new_df.coord_dec += ddec
    return new_df

def write_test_cats():
    wide1 = ParquetCatalog([wide_files[0]])
    wide2 = ParquetCatalog([wide_files[1]])
    df1 = wide1.df_all.sample(0.01).compute()
    df2 = wide2.df_all.sample(0.01).compute()    
    fastparquet.write('test_data/8766/forced.parq', df1)
    fastparquet.write('test_data/8767/forced.parq', df2)
    xdf1 = perturb_coords(df1)
    xdf2 = perturb_coords(df2)
    fastparquet.write('test_data/8766_perturb/forced.parq', xdf1)
    fastparquet.write('test_data/8767_perturb/forced.parq', xdf2)

In [5]:
# write_test_cats()

In [6]:
from explorer.catalog import MatchedCatalog
files1 = ['test_data/8766/forced.parq', 'test_data/8767/forced.parq']
files2 = ['test_data/8766_perturb/forced.parq', 'test_data/8767_perturb/forced.parq']

cat1 = ParquetCatalog(files1)
cat2 = ParquetCatalog(files2)

matched = MatchedCatalog(cat1, cat2)

In [7]:
from explorer.functors import (Mag, MagDiff, CustomFunctor, DeconvolvedMoments, Column,
                            SdssTraceSize, PsfSdssTraceSizeDiff, HsmTraceSize,
                            PsfHsmTraceSizeDiff, CompositeFunctor)

funcs = [Mag('base_PsfFlux'), MagDiff('modelfit_CModel', 'base_PsfFlux')]
f = CompositeFunctor({k:fn for k,fn in zip(['x', 'y'], funcs)})

In [8]:
from explorer.catalog import MatchedCatalog
cosmos_matched = MatchedCatalog(cosmos, cosmos_unforced)
wide_matched = MatchedCatalog(wide, wide_unforced)

In [9]:
%time wide_matched._match_cats()


1671378 good matches, 50466 bad.
CPU times: user 9.44 s, sys: 664 ms, total: 10.1 s
Wall time: 12.3 s

In [10]:
from explorer.dataset import QADataset

matched_data = QADataset(wide_matched, funcs[0])

In [11]:
matched_data.df.head()


Out[11]:
dec label ra x y0 match_distance
id
38553275716337691 -4.557487 galaxy 36.443522 23.425706 0.010022 0.047808
38553275716337695 -4.557338 galaxy 36.464870 23.901023 -0.005784 0.052379
38553275716337697 -4.557429 galaxy 36.425067 25.368566 -0.001902 0.213202
38553275716337698 -4.557356 star 36.406413 21.921688 0.000295 0.013396
38553275716337700 -4.557428 galaxy 36.412057 24.201396 0.018878 0.086750

In [12]:
matched_data.ds


Out[12]:
:Dataset   [ra,dec,x,label]   (y0,match_distance)

In [13]:
from explorer.plots import scattersky, multi_scattersky

multi_scattersky(matched_data.ds)


WARNING:root:Bounds: Bounds is deprecated use BoundsXY instead.
WARNING:root:Bounds: Bounds is deprecated use BoundsXY instead.
WARNING:root:Bounds: Bounds is deprecated use BoundsXY instead.
WARNING:root:Bounds: Bounds is deprecated use BoundsXY instead.
Out[13]:

In [13]:
matched_data.allfuncs


Out[13]:
{'dec': <explorer.functors.DecColumn at 0x7f93d2890e48>,
 'label': <explorer.functors.StarGalaxyLabeller at 0x7f93d8082fd0>,
 'ra': <explorer.functors.RAColumn at 0x7f93d2890d30>,
 'x': <explorer.functors.Mag at 0x7f93d8082ef0>,
 'y0': <explorer.functors.MagDiff at 0x7f93d29ff438>}

In [17]:
dec = matched_data.allfuncs['dec'](wide_matched)

In [18]:
label = matched_data.allfuncs['label'](wide_matched)

In [19]:
ra = matched_data.allfuncs['ra'](wide_matched)

In [40]:
x = matched_data.allfuncs['x'](wide_matched)

In [23]:
y0 = matched_data.allfuncs['y0'](wide_matched)

In [24]:
d = {'dec':dec, 'label':label, 'ra':ra, 'x':x, 'y0':y0}

In [26]:
len(dec), len(label), len(ra), len(x), len(y0)


Out[26]:
(1721844, 1671378, 1721844, 1643578, 1610758)

In [28]:
dec.index


Out[28]:
Int64Index([38553275716337691, 38553275716337695, 38553275716337697,
            38553275716337698, 38553275716337700, 38553275716337702,
            38553275716337704, 38553275716337707, 38553275716337708,
            38553275716337709,
            ...
            38558807634232103, 38558807634232114, 38558807634232116,
            38558807634232120, 38558807634232121, 38558807634232125,
            38558807634232133, 38558807634232134, 38558807634232137,
            38558807634232140],
           dtype='int64', name='id', length=1721844)

In [29]:
ra.index


Out[29]:
Int64Index([38553275716337691, 38553275716337695, 38553275716337697,
            38553275716337698, 38553275716337700, 38553275716337702,
            38553275716337704, 38553275716337707, 38553275716337708,
            38553275716337709,
            ...
            38558807634232103, 38558807634232114, 38558807634232116,
            38558807634232120, 38558807634232121, 38558807634232125,
            38558807634232133, 38558807634232134, 38558807634232137,
            38558807634232140],
           dtype='int64', name='id', length=1721844)

In [30]:
label.index


Out[30]:
Int64Index([38553275716337698, 38553275716337702, 38553275716337704,
            38553275716337709, 38553275716337717, 38553275716337719,
            38553275716337724, 38553275716337729, 38553275716337730,
            38553275716337736,
            ...
            38558807634232088, 38558807634232091, 38558807634232092,
            38558807634232094, 38558807634232095, 38558807634232096,
            38558807634232103, 38558807634232121, 38558807634232133,
            38558807634232140],
           dtype='int64', name='id', length=1671378)

In [31]:
x.index


Out[31]:
Int64Index([38553275716337698, 38553275716337702, 38553275716337704,
            38553275716337709, 38553275716337717, 38553275716337719,
            38553275716337724, 38553275716337729, 38553275716337730,
            38553275716337736,
            ...
            38558807634232065, 38558807634232066, 38558807634232067,
            38558803339271165, 38558807634232080, 38558807634232092,
            38558807634232094, 38558807634232096, 38558807634232121,
            38558807634232133],
           dtype='int64', name='id', length=1643578)

In [32]:
y0.index


Out[32]:
Int64Index([38553275716337698, 38553275716337702, 38553275716337704,
            38553275716337709, 38553275716337717, 38553275716337719,
            38553275716337724, 38553275716337729, 38553275716337730,
            38553275716337736,
            ...
            38558807634232058, 38558807634232059, 38558807634232061,
            38558807634232062, 38558807634232063, 38558807634232064,
            38558807634232065, 38558807634232066, 38558807634232121,
            38558807634232133],
           dtype='int64', name='id', length=1610758)

In [36]:
x.name = 'x'

In [37]:
y0.name = 'y0'

In [42]:
x.index.duplicated().sum()


Out[42]:
157855

In [38]:
df = pd.DataFrame({'x':x, 'y0':y0})


---------------------------------------------------------------------------
InvalidIndexError                         Traceback (most recent call last)
<ipython-input-38-84ff2e6f3b86> in <module>()
----> 1 df = pd.DataFrame({'x':x, 'y0':y0})

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    273                                  dtype=dtype, copy=copy)
    274         elif isinstance(data, dict):
--> 275             mgr = self._init_dict(data, index, columns, dtype=dtype)
    276         elif isinstance(data, ma.MaskedArray):
    277             import numpy.ma.mrecords as mrecords

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
    409             arrays = [data[k] for k in keys]
    410 
--> 411         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    412 
    413     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5494     # figure out the index, if necessary
   5495     if index is None:
-> 5496         index = extract_index(arrays)
   5497     else:
   5498         index = _ensure_index(index)

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/frame.py in extract_index(data)
   5537 
   5538         if have_series or have_dicts:
-> 5539             index = _union_indexes(indexes)
   5540 
   5541         if have_raw_arrays:

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/indexes/api.py in _union_indexes(indexes)
     75         else:
     76             for other in indexes[1:]:
---> 77                 result = result.union(other)
     78             return result
     79     elif kind == 'array':

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/indexes/base.py in union(self, other)
   2173                 result.extend([x for x in other._values if x not in value_set])
   2174         else:
-> 2175             indexer = self.get_indexer(other)
   2176             indexer, = (indexer == -1).nonzero()
   2177 

~/.conda/envs/qa/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
   2581 
   2582         if not self.is_unique:
-> 2583             raise InvalidIndexError('Reindexing only valid with uniquely'
   2584                                     ' valued Index objects')
   2585 

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [44]:
y = f(wide_matched)

In [45]:
y2 = f(wide)

In [46]:
y3 = f(wide_unforced)

In [47]:
len(y)


Out[47]:
1610758

In [48]:
len(y2)


Out[48]:
1684340

In [49]:
len(y3)


Out[49]:
1666222

In [50]:
len(wide_matched.match_inds1)


Out[50]:
1671378

In [51]:
len(wide_matched.match_inds2)


Out[51]:
1671378

In [52]:
z = f(wide_matched, dropna=False)

In [53]:
len(z)


Out[53]:
1671378

In [54]:
z2 = f(wide, dropna=False)

In [55]:
z3 = f(wide_unforced, dropna=False)

In [56]:
len(z2)


Out[56]:
1721844

In [57]:
len(z3)


Out[57]:
1721844

In [ ]: