The search for NNs evolved: trying parallel processing

First try in cross-matching two (mock) catalogs was done in notebook xNN_mock_sources_v1, where the workflow was setup.

Now, we take the chance to better build the algorithm and workout some parallelization in python.

TOC:


In [1]:
%matplotlib inline

from matplotlib import pyplot as plt
from matplotlib import cm

import numpy

plt.rcParams['figure.figsize'] = (10.0, 10.0)

Simulation of source images


In [2]:
# first of all, let us define some parameters
#
# size of the images
sx = 5000
sy = 5000
# number of sources on each image
nsrc1 = int( 0.1 * (sx*sy)/(sx+sy) )
#nsrc2 = int( 0.5 * nsrc1 )
# typical error radius (in pixels)
rerr1 = 20
rerr2 = rerr1

In [3]:
# generate coordinate pairs for each image
def generate_positions(npts,img_shape):
    """
    Generate 'npts' points uniformly across 'image_shape'.
    
    Args:
        npts      : number of points to generate
        img_shape : (y,x) shape where to generate points

    Returns:
        Pair_Coordinates_List : list of (y,x) tuples
    """
    import numpy as np
    _sy,_sx = img_shape
    assert _sy>=5 and _sx>=5 # because I want
    indy = np.random.randint(0,_sy-1,npts)
    indx = np.random.randint(0,_sx-1,npts)
    _inds = zip(indy,indx)
    return _inds



def generate_positions_around(seeds,radius,img_shape,fraction=1.0):
    """
    """
    assert 0 < fraction <= 1.0
    
    def _gen_pos_around_individual(seed,radius,img_shape,fraction,distribution='normal'):
        """
        TODO: implement other kinds of distributions
        """
        from random import random
        if random() < fraction:
            import numpy as np
            sy,sx = seed
            y = int(np.random.normal(sy,radius))
            y = y if y > 0 and y < img_shape[0] else sy
            x = int(np.random.normal(sx,radius))
            x = x if x > 0 and x < img_shape[1] else sx
        else:
            y,x = generate_positions(1,img_shape)[0]
        return y,x
    
    _inds = []
    for seed in seeds:
        y,x = _gen_pos_around_individual(seed,radius,img_shape,fraction)
        _inds.append((y,x))
    return _inds

In [4]:
# "sources 1"
coords1 = generate_positions(nsrc1,(sy,sx))
assert isinstance(coords1,list)
assert len(coords1) == nsrc1

# "sources 2"
img_shape = (sy,sx)
coords2 = generate_positions_around(coords1,rerr1,img_shape,fraction=0.75)
assert isinstance(coords2,list)

In [5]:
# create the positions table
def create_positions_table(coords,err_radius):
    """
    """
    tab = {}
    for i,oo in enumerate(coords):
        i = i+1
        tab[i] = [i,oo[1],oo[0],err_radius]
    return tab

def tab2df(tab):
    nt = {'ID':[],'x':[],'y':[],'r':[]}
    for k,v in tab.iteritems():
        nt['ID'].append(v[0])
        nt['x'].append(v[1])
        nt['y'].append(v[2])
        nt['r'].append(v[3])
    import pandas
    df = pandas.DataFrame(nt)
    return df

In [6]:
# table for "sources 1" and a DataFrame for convenience...
tab1 = create_positions_table(coords1,rerr1)
df1 = tab2df(tab1)

# table for "sources 2"...
tab2 = create_positions_table(coords2,rerr2)
df2 = tab2df(tab2)

In [7]:
# create and draw each source on black(null) images
def draw_image_sources(tab_positions,img_shape,colormap='colorful'):
    """
    """
    def color_filling(mode='colorful'):
        def _colorful(x,y,size):
            _R = int(255 - ( int(x/256) + int(y/256)*(1 + ceil(size[0]/256)) )) #TODO: restrict total size of image to avoid _R<=0
            _G = x%256
            _B = y%256
            return (_R,_G,_B)

        def _blue(x,y,size):
            _R = 0
            _G = 0
            _B = 255
            return (_R,_G,_B)

        def _green(x,y,size):
            _R = 0
            _G = 255
            _B = 0
            return (_R,_G,_B)

        def _red(x,y,size):
            _R = 255
            _G = 0
            _B = 0
            return (_R,_G,_B)

        foos = {'blue'    : _blue,
                'red'     : _red,
                'green'   : _green,
                'colorful': _colorful}
        
        try:
            foo = foos[mode]
        except:
            foo = _colorful
        return foo
        
        
    from math import ceil
    from PIL import Image,ImageDraw
    assert(isinstance(img_shape,tuple) and len(img_shape) is 2)
    size = img_shape[::-1]
    # Modification to accomplish color codes ---
    #mode = 'L'
    mode = 'RGB'
    # ---
    color = "black"
    img = Image.new(mode,size,color)
    assert(len(tab_positions)>=1)
    #
    dictColorId = {}
    filling_foo = color_filling(colormap)
    #
    for i,src in tab_positions.items():
        assert isinstance(src,list) and src is tab_positions[i]
        assert len(src)>=4, "length of table raw %d is %d" % (i,len(src))
        assert i==src[0]
        draw = ImageDraw.Draw(img)
        x = src[1]
        assert 0<=x and x<size[0], "coordinate x is %d" % x
        y = src[2]
        assert 0<=y and y<size[1], "coordinate y is %d" % y
        r = src[3]
        assert r<size[0]/2 and r<size[1]/2
        box = (x-r,y-r,x+r,y+r)
        # Modification to accomplish color codes ---
        #fill=255
        fill = filling_foo(x,y,size)
        # ---
        dictColorId[str(fill)] = i
        draw.ellipse(box,fill=fill)
        del draw,box,x,y,r
    return img,dictColorId

In [8]:
img1,cor2id1 = draw_image_sources(tab1,(sy,sx),colormap='blue')
img2,cor2id2 = draw_image_sources(tab2,(sy,sx),colormap='red')
#img1.show()
#img2.show()

In [9]:
# cmap reference:
#
# cm api: http://matplotlib.org/api/cm_api.html
# cmaps : http://matplotlib.org/users/colormaps.html
# imshow: http://matplotlib.org/users/image_tutorial.html
#cmap = cm.get_cmap('Blues')

def pilImage_2_numpyArray(img):
    import numpy
    #img_array = numpy.array(list(img.getdata())).reshape(sx,sy,3)
    img_array = numpy.asarray(img)
    return img_array

def rgbArray_2_mono(img_arr,chanel='R'):
    chanels = {'R':0,
               'G':1,
               'B':2}
    _i = chanels[chanel]
    return img_arr[:,:,_i]

Result of the simulation for the first image


In [10]:
img1_array = pilImage_2_numpyArray(img1)
img1_mono = rgbArray_2_mono(img1_array,'B')

plt.imshow(img1_mono,cmap='Blues')
print "Catalog A:"
print "----------"
print df1


Catalog A:
----------
      ID   r     x     y
0      1  20  3708  4304
1      2  20  4304   494
2      3  20   214  3537
3      4  20  1912  4540
4      5  20  4788  4431
5      6  20  4716  4808
6      7  20  1619  4183
7      8  20  1271  2960
8      9  20   525  1717
9     10  20  1941  1433
10    11  20  2897  4946
11    12  20   298  3576
12    13  20  3630  3544
13    14  20   730  3585
14    15  20  3680  2836
15    16  20   478   378
16    17  20  1791  2246
17    18  20  4473  4642
18    19  20  1824   476
19    20  20  3910  4762
20    21  20  2148  2417
21    22  20  4068  3836
22    23  20  3078  4157
23    24  20  2408  2921
24    25  20  2120  3191
25    26  20   621  1375
26    27  20   995  4408
27    28  20  2791  3183
28    29  20  4140  3489
29    30  20  2329   999
..   ...  ..   ...   ...
220  221  20  1991  3713
221  222  20  4682  4921
222  223  20  4424  3791
223  224  20  4713  2315
224  225  20   720  4803
225  226  20  2514   107
226  227  20  3330  2022
227  228  20  1661  4995
228  229  20  1526  2256
229  230  20   480  4924
230  231  20  3010  1755
231  232  20   238   906
232  233  20   300  1156
233  234  20  1528  3477
234  235  20  2643   659
235  236  20  2494  1298
236  237  20   568   639
237  238  20  1508  3037
238  239  20  4530   902
239  240  20  2690  2556
240  241  20   673  3385
241  242  20   278  3267
242  243  20  3060  2667
243  244  20   842  2105
244  245  20  4454  3331
245  246  20  3481  3238
246  247  20  2923  3151
247  248  20  2079  3653
248  249  20  1226  2936
249  250  20  3698  3690

[250 rows x 4 columns]

Result of the simulation for the second image


In [11]:
img2_array = pilImage_2_numpyArray(img2)
img2_mono = rgbArray_2_mono(img2_array,'R')

print "Catalog B:"
print "----------"
print df2
plt.imshow(img2_mono,cmap='Reds')


Catalog B:
----------
      ID   r     x     y
0      1  20  3702  4282
1      2  20  4320   466
2      3  20  1403  3449
3      4  20  1910  4554
4      5  20  4764  4428
5      6  20  4709  4833
6      7  20  2899  1986
7      8  20  1294  2944
8      9  20  3827  3196
9     10  20  1952  1461
10    11  20  2878  4956
11    12  20   286  3578
12    13  20  3658  3568
13    14  20  3279  3774
14    15  20  1074  1742
15    16  20   502   403
16    17  20  1614  2585
17    18  20  4501  4666
18    19  20   287  4559
19    20  20  3921  4780
20    21  20  2146  2368
21    22  20  4077  3829
22    23  20  3065  4171
23    24  20  2395  2939
24    25  20  2120  3157
25    26  20  1350  4131
26    27  20   997  4409
27    28  20  2818  3196
28    29  20  4142  3502
29    30  20  2338  1012
..   ...  ..   ...   ...
220  221  20  2000  3731
221  222  20  4709  4942
222  223  20  4409  3814
223  224  20  4687  2315
224  225  20   722  4795
225  226  20  2513    86
226  227  20  3319  2040
227  228  20  3003     5
228  229  20  1533  2241
229  230  20   483  4915
230  231  20  3024  1761
231  232  20   222   904
232  233  20  4580  3161
233  234  20  1568  3484
234  235  20  2647   667
235  236  20  1719  4818
236  237  20   576   646
237  238  20  1501  3056
238  239  20  4515   906
239  240  20  3243  2625
240  241  20   674  3349
241  242  20   264  3248
242  243  20  3044  2674
243  244  20   818  2129
244  245  20  4482  3367
245  246  20  3498  3276
246  247  20  2902  3153
247  248  20  2086  3637
248  249  20  3551  2540
249  250  20  4293  3404

[250 rows x 4 columns]
Out[11]:
<matplotlib.image.AxesImage at 0x7f8855749590>

Merge images


In [12]:
def add_arrays_2_image(img1,img2):
    """
    """
    def array_2_image(arr):
        from PIL import Image
        imgout = Image.fromarray(numpy.uint8(arr))
        return imgout
    return array_2_image(img1+img2)

In [13]:
img_sum = add_arrays_2_image(img1_array,img2_array)
plt.imshow(img_sum)


Out[13]:
<matplotlib.image.AxesImage at 0x7f884983a950>

Cross-matching

Serial experiment


In [14]:
def nn_search(catA,catB, dview=None):
    """
    """
    
    import pandas
    assert isinstance(catA,pandas.DataFrame)
    assert isinstance(catB,pandas.DataFrame)
    
    A = catA.copy()
    B = catB.copy()
    
    from astropy.coordinates import SkyCoord
    from astropy import units
    norm_fact = 500.0
    Ax_norm = A.x / norm_fact
    Ay_norm = A.y / norm_fact
    A_coord = SkyCoord(ra=Ax_norm, dec=Ay_norm, unit=units.deg)

    Bx_norm = B.x / norm_fact
    By_norm = B.y / norm_fact
    B_coord = SkyCoord(ra=Bx_norm, dec=By_norm, unit=units.deg)

    if dview:
        print "Running in parallel"
        # Encapsulate some variables to sand for processing
        def make_nn_search_parallel(foo,cat2):
            def pkg_nn_search(cat1,foo=foo,cat2=cat2):
                return foo(cat1,cat2)
            return pkg_nn_search

        # A-B
        from astropy.coordinates import match_coordinates_sky
        match_eps = make_nn_search_parallel(match_coordinates_sky,B_coord)
        #
        from numpy import arange,array_split,append
        A_list = [ A_coord[idx] for idx in array_split(arange(len(A_coord)),len(dview)) ]
        A_list_out = dview.map_sync(match_eps, A_list)
        #
        match_A_nn_idx, match_A_nn_sep = None, None
        for each_out in A_list_out:
            match_idx, match_sep, _d3d = each_out
            del _d3d
            if match_A_nn_idx is None:
                assert match_A_nn_sep is None
                match_A_nn_idx = match_idx
                match_A_nn_sep = match_sep.value
            else:
                match_A_nn_idx = append(match_A_nn_idx,match_idx)
                match_A_nn_sep = append(match_A_nn_sep,match_sep.value)

        # B-A
        from astropy.coordinates import match_coordinates_sky
        match_eps = make_nn_search_parallel(match_coordinates_sky,A_coord)
        #
        from numpy import arange,array_split,append
        B_list = [ B_coord[idx] for idx in array_split(arange(len(B_coord)),len(dview)) ]
        B_list_out = dview.map_sync(match_eps, B_list)
        #
        match_B_nn_idx, match_B_nn_sep = None, None
        for each_out in B_list_out:
            match_idx, match_sep, _d3d = each_out
            del _d3d
            if match_B_nn_idx is None:
                assert match_B_nn_sep is None
                match_B_nn_idx = match_idx
                match_B_nn_sep = match_sep.value
            else:
                match_B_nn_idx = append(match_B_nn_idx,match_idx)
                match_B_nn_sep = append(match_B_nn_sep,match_sep.value)
        
    else:
        print "Running in serial"
        from astropy.coordinates import match_coordinates_sky
        match_A_nn_idx, match_A_nn_sep, _d3d = match_coordinates_sky(A_coord, B_coord)
        match_B_nn_idx, match_B_nn_sep, _d3d = match_coordinates_sky(B_coord, A_coord)
        match_A_nn_sep = match_A_nn_sep.value
        match_B_nn_sep = match_B_nn_sep.value

    A['NN_in_B'] = B.ID[match_A_nn_idx].values
    B['NN_in_A'] = A.ID[match_B_nn_idx].values

    import numpy
    A_matched_pairs = zip(numpy.arange(len(match_A_nn_idx)),
                          match_A_nn_idx )
    B_matched_pairs = set(zip(match_B_nn_idx,
                              numpy.arange(len(match_B_nn_idx))))

    duplicate_pairs = []
    duplicate_dists = []
    for i,p in enumerate(A_matched_pairs):
        if p in B_matched_pairs:
            duplicate_pairs.append(p)
            duplicate_dists.append(match_A_nn_sep[i])

    A_matched_idx,B_matched_idx = zip(*duplicate_pairs)
    import pandas
    df_matched = pandas.DataFrame({ 'A_idx':A_matched_idx,
                                    'B_idx':B_matched_idx,
                                    'separation':duplicate_dists})
    df_matched = df_matched.set_index('A_idx')

    A.columns = [ 'A_'+c for c in A.columns ]
    B.columns = [ 'B_'+c for c in B.columns ]

    B_matched = B.iloc[df_matched.B_idx]
    B_matched['A_idx'] = df_matched.index
    B_matched = B_matched.set_index('A_idx')

    B_matched['dist'] = numpy.asarray(df_matched.separation * norm_fact, dtype=int)

    df = pandas.concat([A,B_matched],axis=1)
    return df

In [15]:
%time df_match_serial = nn_search(df1,df2)
# from astropy.table import Table
# table_match = Table.from_pandas( df_match )
# table_match.show_in_notebook()


Running in serial
CPU times: user 1.26 s, sys: 36.5 ms, total: 1.29 s
Wall time: 1.28 s
/home/chbrandt/.conda/envs/booq-dev/lib/python2.7/site-packages/ipykernel/__main__.py:106: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Parallel experiment


In [16]:
# Lets us see what happens in parallel
# First, run from the command line:
#$ ipcluster start -n 2

In [17]:
try:
    import ipyparallel as ipp
    client = ipp.Client()
    client.ids
    dview = client[:]
except:
    dview = None


Waiting for connection file: ~/.ipython/profile_default/security/ipcontroller-client.json

In [18]:
%time df_match_parallel = nn_search(df1,df2,dview)


Running in serial
CPU times: user 331 ms, sys: 28.9 ms, total: 360 ms
Wall time: 331 ms
/home/chbrandt/.conda/envs/booq-dev/lib/python2.7/site-packages/ipykernel/__main__.py:106: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Are they equal?


In [19]:
df_match_parallel.equals(df_match_serial)


Out[19]:
True

[ This is for another notebook: the functions organized ;]


In [20]:
# ---

def read_catalogs(A_filename, B_filename, column_names, sample_size=0):
    """
    """

    def read_fitsio(filename,column_names=None):
        """
        """
        import fitsio
        if True:
            catalog_data = fitsio.read(filename, columns=column_names, ext=1)
        # else:
        #     catalog_data = fitsio.read(filename, ext=1)
        return catalog_data


    A_data = read_fitsio(A_filename, column_names=column_names[0])
    B_data = read_fitsio(B_filename, column_names=column_names[1])

    if sample_size:
        A_data = sample(A_data, fraction=sample_size)

    return A_data,B_data

# ---

def get_coordinates(ra,dec):
    from astropy.coordinates import SkyCoord
    from astropy import units
    return SkyCoord(ra=ra, dec=dec, unit=units.deg)

# ---

def sample(data_array, fraction=0.1):
    """
    Returns a (normal) random sample from array' rows

    Input:
     - data_array : ~numpy.ndarray
            Array from where rows are randomly chosen
     - fraction : int, float
            Number between (0,array'length); limits are non-inclusive.
            If the number is higher (inclusive) than '1', it is assumed to be
            the absolute value for the sample size; otherwise, if between (0,1)
            it is assumed to be the relative, array-length multiplication factor.
    Output:
     - sampled_array : ~numpy.ndarray
            Rows are randomly chosen from a normal distribution, all same columns
    """
    import numpy
    nrows = len(data_array)

    assert fraction > 0, "ValueError: There is no sense in asking for 'fraction' <= 0"
    assert fraction < nrows, "ValueError: There is no sense in asking for 'fraction' >= array's length"

    nsamp = fraction if fraction >= 1 else fraction*nrows
    nsamp = int(nsamp)
    idx = numpy.random.randint(0, nrows, nsamp)
    sample_data = data_array[idx]

    return sample_data

# ---

def xmatch_nn(A_coord,B_coord,parallel=False):
        if parallel:
            match_A_nn_idx, match_A_nn_sep = xmatch_nn_parallel(A_coord,B_coord)
        else:
            match_A_nn_idx, match_A_nn_sep = xmatch_nn_serial(A_coord,B_coord)
        return (match_A_nn_idx, match_A_nn_sep)

# ---

def xmatch_nn_serial(pin, neiborhood):
    """
    Nearest-Neighbor search

    Input:
     - pin : ~astropy.coordinates.SkyCoord
            reference catalog (catalog "A")
     - neiborhood : ~astropy.coordinates.SkyCoord
            matching catalog (catalog "B")

    Output:
     - tuple with ~numpy.ndarray , ~astropy.units.Quantity
            array of respective (to 'pin') index entries in 'neiborhood'
            , array of respective pair distances
    """
    A_coord = pin
    B_coord = neiborhood

    from astropy.coordinates import SkyCoord
    assert isinstance(A_coord,SkyCoord)
    assert isinstance(B_coord,SkyCoord)

    from astropy.coordinates import match_coordinates_sky
    match_A_nn_idx, match_A_nn_sep, _d3d = match_coordinates_sky(A_coord,B_coord)

    assert len(match_A_nn_idx) == len(A_coord)
    assert match_A_nn_idx.max() < len(B_coord)

    return (match_A_nn_idx, match_A_nn_sep.value)

# ---

def xmatch_nn_parallel(A_coord,B_coord,nprocs=2):
    """
    """
    
    def parallel_setup(nprocs=2):
        """
        Run xmatch in parallel

        It splits the first array in 'nprocs' for parallel processing,
        then it concatenates the outputs and return that.
        """
        #TODO: do all the verifications for parallel run

        # Lets us see what happens in parallel
        # First, run from the command line:
        #$ ipcluster start -n 2
        try:
            import ipyparallel as ipp
            client = ipp.Client()
            client.ids
            dview = client[:]
        except:
            dview = None
        return dview

    # ---

    dview = parallel_setup(nprocs=2)
    if not dview:
        return False

    print "Running in parallel"

    # Encapsulate some variables to send for processing
    def make_nn_search_parallel(foo,cat2):
        def pkg_nn_search(cat1,foo=foo,cat2=cat2):
            return foo(cat1,cat2)
        return pkg_nn_search
    # ---

    # Split array (of coordinates) in N pieces
    def split_array(A_coord,N):
        from numpy import arange,array_split
        index = arange(len(A_coord))
        A_pieces = [ A_coord[idx]   for idx in array_split( index,N ) ]
        return A_pieces

    # Join array/list of tuples in N pieces
    def join_array(A_coord_outs):
        from numpy import append
        match_A_nn_idx = None
        match_A_nn_sep = None
        for each_out in A_coord_outs:
            match_idx, match_sep, _d3d = each_out
            del _d3d
            if match_A_nn_idx is None:
                assert match_A_nn_sep is None
                match_A_nn_idx = match_idx
                match_A_nn_sep = match_sep.value
            else:
                match_A_nn_idx = append(match_A_nn_idx,match_idx)
                match_A_nn_sep = append(match_A_nn_sep,match_sep.value)
        return (match_A_nn_idx,match_A_nn_sep)
    # ---

    # A-B
    # from astropy.coordinates import match_coordinates_sky
    foo_match_coordinates = make_nn_search_parallel(xmatch_nn, B_coord)

    A_coord_pieces = split_array(A_coord,N=len(dview))

    A_coord_outs = dview.map_sync( foo_match_coordinates, A_coord_pieces )

    match_A_nn_idx,match_A_nn_sep = join_array(A_coord_outs)

    return (match_A_nn_idx,match_A_nn_sep.value)

# ---

def match_pairs(match_A_nn_idx,match_B_nn_idx,match_A_nn_sep):
    import numpy
    A_matched_pairs = zip(numpy.arange(len(match_A_nn_idx)),
                          match_A_nn_idx )
    B_matched_pairs = set(zip(match_B_nn_idx,
                          numpy.arange(len(match_B_nn_idx))))

    matched_pairs = []
    matched_dists = []
    for i,p in enumerate(A_matched_pairs):
        if p in B_matched_pairs:
            matched_pairs.append(p)
            matched_dists.append(match_A_nn_sep[i])

    A_matched_idx,B_matched_idx = zip(*matched_pairs)
    
    return (A_matched_idx, B_matched_idx, matched_dists)

# ---

def assemble_indexes(A_matched_idx,B_matched_idx,matched_dists):
    import pandas
    df_matched_idx = pandas.DataFrame({ 'A_idx':A_matched_idx,
                                        'B_idx':B_matched_idx,
                                        'separation':matched_dists})
    df_matched_idx = df_matched_idx.set_index('A_idx')
    return df_matched_idx

# ---

def merge_catalogs(A,B,df_matched_idx):

    A.columns = [ 'A_'+c for c in A.columns ]
    B.columns = [ 'B_'+c for c in B.columns ]

    B_matched = B.iloc[df_matched_idx.B_idx]
    B_matched['A_idx'] = df_matched_idx.index
    B_matched = B_matched.set_index('A_idx')

    B_matched['dist'] = numpy.asarray(df_matched_idx.separation)

    df = pandas.concat([ A,B_matched ],axis=1)
    
    return df

# ---

In [29]:
class Mock:
    @staticmethod
    def normalize_coordinates(A,B,norm_factor):

        def _normalize_coordinates(x, y, norm_fact):
            from astropy.coordinates import SkyCoord
            from astropy import units
            x_norm = x / norm_fact
            y_norm = y / norm_fact
            coord = SkyCoord(ra=x_norm, dec=y_norm, unit=units.deg)
            return coord

        A_coord = _normalize_coordinates(A.x,A.y,norm_factor)
        B_coord = _normalize_coordinates(B.x,B.y,norm_factor)
        return A_coord,B_coord

# ---
# A_filename = 'cs82/cs82.fits'
# B_filename = 'spies/spies.fits'

# column_names = (['OBJID','RA','DEC'],['id','RA_ch1','DEC_ch1'])
# A_data, B_data = read_catalogs(A_filename,B_filename,column_names)

# from astropy.table import Table
# A = Table(A_data)
# B = Table(B_data)

# A_coord = get_coordinates(A['RA'],A['DEC'])
# B_coord = get_coordinates(B['RA_ch1'],B['DEC_ch1'])

import pandas
assert isinstance(df1,pandas.DataFrame)
assert isinstance(df2,pandas.DataFrame)
A = df1.copy()
B = df2.copy()
norm_factor=500

A_coord,B_coord = Mock.normalize_coordinates(A, B, norm_factor)

match_A_nn_idx, match_A_nn_sep = xmatch_nn(A_coord, B_coord)
match_B_nn_idx, match_B_nn_sep = xmatch_nn(B_coord, A_coord)
del A_coord,B_coord

A_matched_idx, B_matched_idx, matched_dists = match_pairs(match_A_nn_idx, match_B_nn_idx, match_A_nn_sep)

import numpy
matched_dists = numpy.asarray([ _d * norm_factor for _d in matched_dists ], dtype=int)

df_matched_idx = assemble_indexes(A_matched_idx, B_matched_idx, matched_dists)
del A_matched_idx, B_matched_idx, matched_dists

matched_catalog = merge_catalogs(A,B,df_matched_idx)
del df_matched_idx


/home/chbrandt/.conda/envs/booq-dev/lib/python2.7/site-packages/ipykernel/__main__.py:222: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [30]:
from astropy.table import Table
tab = Table.from_pandas(matched_catalog)
print tab


A_ID A_r A_x  A_y   B_ID B_r   B_x    B_y   dist
---- --- ---- ---- ----- ---- ------ ------ ----
   1  20 3708 4304   1.0 20.0 3702.0 4282.0 22.0
   2  20 4304  494   2.0 20.0 4320.0  466.0 32.0
   3  20  214 3537    --   --     --     --   --
   4  20 1912 4540   4.0 20.0 1910.0 4554.0 14.0
   5  20 4788 4431   5.0 20.0 4764.0 4428.0 23.0
   6  20 4716 4808   6.0 20.0 4709.0 4833.0 25.0
   7  20 1619 4183    --   --     --     --   --
   8  20 1271 2960   8.0 20.0 1294.0 2944.0 27.0
   9  20  525 1717    --   --     --     --   --
  10  20 1941 1433    --   --     --     --   --
 ... ...  ...  ...   ...  ...    ...    ...  ...
 240  20 2690 2556    --   --     --     --   --
 241  20  673 3385 241.0 20.0  674.0 3349.0 36.0
 242  20  278 3267 242.0 20.0  264.0 3248.0 23.0
 243  20 3060 2667 243.0 20.0 3044.0 2674.0 17.0
 244  20  842 2105 244.0 20.0  818.0 2129.0 33.0
 245  20 4454 3331 245.0 20.0 4482.0 3367.0 45.0
 246  20 3481 3238 246.0 20.0 3498.0 3276.0 41.0
 247  20 2923 3151 247.0 20.0 2902.0 3153.0 20.0
 248  20 2079 3653 248.0 20.0 2086.0 3637.0 17.0
 249  20 1226 2936    --   --     --     --   --
 250  20 3698 3690    --   --     --     --   --
Length = 250 rows

In [36]:
for fat in df_match_serial.columns:
    print "Checking column {}".format(fat)
    assert fat in df_match_parallel.columns
    if fat not in matched_catalog.columns:
        print "Removing {}".format(fat)
        del df_match_serial[fat]
        del df_match_parallel[fat]
        
matched_catalog.equals(df_match_serial)


Checking column A_ID
Checking column A_r
Checking column A_x
Checking column A_y
Checking column B_ID
Checking column B_r
Checking column B_x
Checking column B_y
Checking column dist
Out[36]:
True

In [37]:
tab.show_in_notebook()


Out[37]:
<Table masked=True length=250>
A_IDA_rA_xA_yB_IDB_rB_xB_ydist
120370843041.020.03702.04282.022.0
22043044942.020.04320.0466.032.0
3202143537----------
420191245404.020.01910.04554.014.0
520478844315.020.04764.04428.023.0
620471648086.020.04709.04833.025.0
72016194183----------
820127129608.020.01294.02944.027.0
9205251717----------
102019411433----------
11202897494611.020.02878.04956.021.0
1220298357612.020.0286.03578.012.0
13203630354413.020.03658.03568.036.0
14207303585----------
152036802836----------
162047837816.020.0502.0403.034.0
172017912246112.020.01849.02270.062.0
18204473464218.020.04501.04666.036.0
19201824476158.020.01829.0604.0128.0
20203910476220.020.03921.04780.021.0
21202148241721.020.02146.02368.049.0
22204068383622.020.04077.03829.011.0
23203078415723.020.03065.04171.019.0
24202408292124.020.02395.02939.022.0
25202120319125.020.02120.03157.034.0
26206211375----------
2720995440827.020.0997.04409.02.0
28202791318328.020.02818.03196.029.0
29204140348929.020.04142.03502.013.0
3020232999930.020.02338.01012.015.0
312041884486----------
3220324220332.020.03293.0203.050.0
3320920413033.020.0894.04112.031.0
34204229392934.020.04249.03931.019.0
352032341025----------
362023104946----------
37201613257417.020.01614.02585.011.0
38203144168438.020.03146.01711.027.0
3920176423339.020.01775.0245.016.0
40201372247640.020.01361.02465.015.0
41201602496041.020.01576.04956.025.0
42202485397142.020.02462.03984.026.0
43203033441443.020.03027.04438.024.0
44202572217644.020.02554.02230.056.0
45201071105045.020.01093.0990.063.0
46201586221446.020.01579.02204.012.0
4720839297747.020.0830.02975.09.0
482042753418250.020.04293.03404.022.0
4920515104049.020.0518.01045.05.0
50203212346650.020.03219.03452.015.0
512045014072----------
52202728225052.020.02732.02240.010.0
53202851464253.020.02905.04652.054.0
54204852100254.020.04885.01005.033.0
55202048326155.020.02080.03248.034.0
56202619225656.020.02614.02258.05.0
5720115484457.020.0102.04841.013.0
58202953410558.020.02982.04072.043.0
59207341859.020.0746.022.012.0
6020740315560.020.0721.03154.018.0
61204799391861.020.04822.03940.031.0
6220290916262.020.02945.0159.036.0
632031742119----------
642026601644----------
65202779444765.020.02785.04426.021.0
662027054555----------
67201694964----------
68201197904----------
69207693071----------
702031553574----------
7120382786471.020.03829.0851.013.0
722018243128----------
73202685448673.020.02709.04483.023.0
7420340276274.020.03402.0769.07.0
75202518473475.020.02544.04732.025.0
7620157827276.020.01566.0249.025.0
772019194743----------
7820812016----------
7920304988479.020.03056.0883.07.0
8020902171215.020.01074.01742.0174.0
8120111197----------
82202747356382.020.02750.03545.018.0
8320760338883.020.0788.03367.034.0
842023331769128.020.02207.01679.0154.0
85203975139285.020.03975.01385.07.0
862067525886.020.0686.0245.017.0
87202171309487.020.02173.03124.030.0
8820622398188.020.0635.04020.041.0
892071212089.020.0737.0142.033.0
90202289414590.020.02289.04148.02.0
912034444475----------
922039103817----------
932033954126----------
942022514959----------
95204781324195.020.04753.03227.031.0
96203893396677.020.03768.03992.0126.0
972023081991----------
98203345364598.020.03359.03691.048.0
99202829282799.020.02849.02838.022.0
1002043102904100.020.04341.02872.044.0
101204421947----------
1022013802668102.020.01419.02661.039.0
103205811763103.020.0578.01772.09.0
1042010294405----------
1052010472762105.020.01076.02715.055.0
1062043323912106.020.04321.03884.030.0
107203014731107.020.0318.04735.017.0
1082047592951108.020.04756.02947.04.0
109201934147910.020.01952.01461.025.0
110203156319----------
1112039943010111.020.03995.03054.044.0
1122036352695249.020.03551.02540.0176.0
1132015473960113.020.01539.03979.020.0
11420454160480.020.0431.01593.025.0
1152026853209115.020.02673.03199.015.0
1162019423425116.020.01971.03428.028.0
1172031442881117.020.03176.02842.050.0
1182041193109118.020.04119.03093.015.0
1192015532186119.020.01554.02185.01.0
1202017271609120.020.01736.01617.012.0
1212041563927----------
1222012123475122.020.01192.03476.019.0
1232041531466123.020.04157.01478.012.0
1242030743921124.020.03055.03930.020.0
125202246772125.020.02253.0777.08.0
1262025473279126.020.02560.03275.013.0
12720264675127.020.02650.059.016.0
1282014964923109.020.01466.04956.044.0
129207604274129.020.0798.04293.042.0
1302026463911130.020.02649.03911.02.0
1312018035----------
1322018242624132.020.01815.02619.010.0
1332020864804133.020.02089.04823.019.0
134204932146134.020.0496.02147.03.0
1352041201993204.020.04115.01984.010.0
1362044503682136.020.04470.03667.024.0
137204540308137.020.04542.0302.06.0
1382023494770138.020.02374.04757.027.0
1392040052722139.020.03983.02698.032.0
1402022413881140.020.02253.03871.015.0
1412032311649141.020.03221.01651.010.0
1422040702749142.020.04086.02756.017.0
1432016544475143.020.01657.04486.011.0
1442044592833144.020.04428.02793.050.0
1452016331767145.020.01624.01763.09.0
146201732836146.020.01719.0825.017.0
1472031134699147.020.03108.04687.012.0
1482016732889148.020.01675.02890.02.0
1492012331249178.020.01234.01268.019.0
150202747398150.020.02719.0399.028.0
1512039202913151.020.03948.02895.033.0
1522013923592152.020.01403.03586.012.0
153201513279153.020.01505.0271.011.0
154203459828154.020.03476.0870.045.0
155204416683155.020.04435.0683.018.0
1562025231274----------
157204432994----------
1582031791845175.020.03106.01938.0118.0
1592041904050159.020.04217.04051.026.0
160205054352160.020.0480.04361.026.0
1612040654891161.020.04074.04865.027.0
162203254846162.020.03280.0842.026.0
163203366474163.020.03362.0480.07.0
1642018194742164.020.01810.04730.014.0
1652011152088----------
1662024074061166.020.02421.04079.022.0
1672042162454----------
168204079504168.020.04107.0513.029.0
1692028142292169.020.02805.02257.036.0
17020221662170.020.0199.0652.024.0
1712029393331----------
1722011714680----------
1732035593080173.020.03557.03106.026.0
174202211236174.020.02232.0213.031.0
1752010441084----------
176206372312----------
1772049663909177.020.04981.03904.015.0
1782012121281----------
1792041693805179.020.04186.03797.018.0
180204576941180.020.04570.0937.07.0
1812020804011181.020.02063.04015.017.0
182202674191182.020.0275.04229.038.0
183202612134183.020.02609.0100.034.0
184204424921101.020.04441.0934.021.0
1852022423621185.020.02236.03653.032.0
1862017862610----------
1872047371128----------
188201665100188.020.01629.085.038.0
1892037562839189.020.03780.02827.026.0
1902030673875190.020.03080.03878.013.0
1912040364139----------
1922017412279192.020.01734.02296.018.0
1932026491761193.020.02641.01785.025.0
194202909595194.020.02893.0572.028.0
195203058958195.020.03038.0937.028.0
196203882841----------
1972034734352----------
1982011122386198.020.01104.02364.023.0
1992023623531199.020.02360.03530.02.0
200202794873----------
2012014721183201.020.01512.01172.041.0
202205553781202.020.0553.03779.02.0
203203144537203.020.0308.04516.021.0
204203561597----------
2052031283344205.020.03092.03331.038.0
2062039064068----------
2072010472213207.020.0999.02215.047.0
208205482617208.020.0508.02607.041.0
2092017943549209.020.01802.03541.011.0
210203156556210.020.03149.0562.09.0
2112029421988211.020.02936.02007.019.0
2122043832839212.020.04401.02816.029.0
2132042904991213.020.04303.04988.013.0
214203942202214.020.0385.02185.019.0
2152016693520----------
216209991179216.020.0986.01180.013.0
217201327592217.020.01308.0606.023.0
2182010471256218.020.01032.01251.015.0
219202876845219.020.02855.0819.033.0
2202022133454----------
2212019913713221.020.02000.03731.020.0
2222046824921222.020.04709.04942.033.0
2232044243791223.020.04409.03814.027.0
2242047132315224.020.04687.02315.025.0
225207204803225.020.0722.04795.08.0
226202514107226.020.02513.086.021.0
2272033302022227.020.03319.02040.021.0
2282016614995----------
2292015262256229.020.01533.02241.016.0
230204804924230.020.0483.04915.09.0
2312030101755231.020.03024.01761.015.0
23220238906232.020.0222.0904.016.0
233203001156114.020.0293.01225.069.0
2342015283477234.020.01568.03484.040.0
235202643659235.020.02647.0667.08.0
2362024941298156.020.02515.01297.021.0
23720568639237.020.0576.0646.010.0
2382015083037238.020.01501.03056.020.0
239204530902239.020.04515.0906.015.0
2402026902556----------
241206733385241.020.0674.03349.036.0
242202783267242.020.0264.03248.023.0
2432030602667243.020.03044.02674.017.0
244208422105244.020.0818.02129.033.0
2452044543331245.020.04482.03367.045.0
2462034813238246.020.03498.03276.041.0
2472029233151247.020.02902.03153.020.0
2482020793653248.020.02086.03637.017.0
2492012262936----------
2502036983690----------

In [ ]: