this notebook tests the grouping algorithm


In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import astropy as ast
import pandas as pd
import sys

In [2]:
# progress meter for big loops
# note: progress must go from 0 to 100 because reasons

def progress_meter(progress):
    sys.stdout.write("\rloading... %.1f%%" % progress)
    sys.stdout.flush()

In [47]:
# 100 random points

test_x = np.random.rand(100) * 100
test_y = np.random.rand(100) * 100

plt.scatter(test_x, test_y)
plt.show()



In [48]:
onc_gs = pd.DataFrame()

# get distances (cartesian)

for i in range(len(test_x)):
    
    dist = []
    
    for j in range(len(test_x)):
        
        dist.append(np.sqrt(abs(test_x[i] - test_x[j])**2 + abs(test_y[i] - test_y[j])**2))
    
    onc_gs.loc[:,str(i)] = dist

# mimic same columns as actual df

onc_gs.insert(0,'y',pd.Series(test_y))
onc_gs.insert(0,'x',pd.Series(test_x))

onc_gs.insert(0,'catID',pd.Series(np.random.randint(1,1000,100)))

onc_gs.insert(0,'catname','')

onc_gs.loc[:40,'catname'] = 'A'
onc_gs.loc[40:75,'catname'] = 'B'
onc_gs.loc[75:,'catname'] = 'C'

onc_gs.insert(0,'oncflag','')
onc_gs.insert(0,'oncID',np.nan)

In [49]:
# new source numbering starts at highest A number + 1
new_source = max(onc_gs.loc[onc_gs['catname'] == 'A', 'catID'].values) + 1

dist_crit = 7

# ====

exclude = set()

for k in range(len(onc_gs)):
    
    if k not in exclude:
        
        # find where dist < dist_crit
        m = onc_gs.loc[onc_gs[str(k)] < dist_crit]

        mindex = set(m[str(k)].index.tolist())

        mindex_updated = set(m[str(k)].index.tolist())

        mindex_same = False

        iter_count = 0

        # print 'initial', mindex

        # keep adding match values until no new values are added
        while mindex_same == False:
            for x in mindex:
                y = onc_gs.loc[onc_gs[str(x)] < dist_crit]

                yindex = set(y[str(x)].index.tolist())

                # print 'new', yindex

                mindex_updated.update(yindex)

            # print 'mindex', mindex
            # print 'updated', mindex_updated

            mindex_same = (mindex == mindex_updated)

            mindex.update(mindex_updated)

            iter_count += 1

        exclude.update(mindex)
        
        num_group = len(mindex)

        match = onc_gs.loc[mindex,['catname','catID']]
        
        # check for multiple sources in same catalog (any duplicates will flag as True)
        if True: #match.duplicated(subset='catname',keep=False).any() == True:

            onc_gs.loc[mindex,'oncflag'] += 'd' + str(num_group)
        
        # use A number if it exists -- if multiple, use lowest
        if ('A' in match['catname'].values) == True:            
            onc_gs.loc[mindex,'oncID'] = min(match.loc[match['catname'] == 'A','catID'].values)

        # otherwise give it a new number
        else:
            onc_gs.loc[mindex,'oncID'] = new_source
            new_source += 1

        progress_meter(k*100./len(onc_gs))

onc_gs


loading... 97.0%
Out[49]:
oncID oncflag catname catID x y 0 1 2 3 ... 90 91 92 93 94 95 96 97 98 99
0 421.0 d4 A 795 89.188761 54.417515 0.000000 73.032555 86.879022 40.553636 ... 35.635755 73.879619 34.913466 31.840638 82.689121 23.984063 7.772157 83.798724 37.104769 60.724301
1 328.0 d2 A 328 25.613405 90.360918 73.032555 0.000000 27.438879 46.459152 ... 49.649935 26.855785 38.529088 81.725209 33.190032 53.293520 65.397839 10.766601 71.400605 13.731502
2 273.0 d3 A 273 4.329793 73.043300 86.879022 27.438879 0.000000 50.077851 ... 72.227504 14.125304 56.587754 85.531890 10.450687 63.583664 80.052453 25.458029 72.671715 38.420916
3 252.0 d3 A 252 48.861675 50.136911 40.553636 46.459152 50.077851 0.000000 ... 45.426519 36.070942 27.496105 36.204957 43.829951 16.966137 35.568254 55.960049 24.952731 40.032902
4 168.0 d6 A 840 87.878310 42.622093 11.867992 78.459631 88.914592 39.733743 ... 46.333532 75.252335 42.047129 20.983910 83.447170 25.682534 17.397330 89.121075 29.110598 67.182165
5 587.0 d3 A 587 72.231987 46.303349 18.798188 64.143289 72.977620 23.682645 ... 40.977675 59.235089 30.730338 20.911819 67.375761 10.938543 17.072954 74.617692 20.017534 53.939466
6 996.0 d1 A 996 97.084658 63.336452 11.911870 76.409828 93.261396 49.996840 ... 32.387903 81.000398 37.987471 43.410228 90.147320 33.030946 14.429707 87.079422 48.978468 63.249790
7 695.0 d2 A 695 28.925472 26.926194 66.237729 63.521130 52.266001 30.597216 ... 75.946010 40.405192 57.615896 46.625166 42.180971 45.683368 63.244471 69.814688 33.284536 62.936787
8 183.0 d7 A 504 79.770723 4.376607 50.919465 101.618486 102.012003 55.221143 ... 82.927810 88.079238 72.972106 21.695001 93.446720 52.376411 55.111430 111.169201 30.292775 93.817681
9 587.0 d3 A 817 75.356739 47.517257 15.457633 65.650427 75.474538 26.624255 ... 39.659428 61.844998 30.937768 21.864213 70.131545 12.427482 14.226063 76.221120 22.736711 54.981606
10 266.0 d1 A 266 52.445785 89.385734 50.722999 26.850094 50.815586 39.412129 ... 22.822519 43.627369 16.798069 67.784932 52.642077 36.970414 42.972041 36.778793 61.146472 13.173397
11 592.0 d5 A 592 30.794355 77.837179 62.915716 13.553091 26.895248 33.071633 ... 45.339001 19.347169 30.240491 68.736310 28.134201 41.474017 55.567995 22.971895 58.022401 14.025199
12 881.0 d2 A 881 11.045908 26.110449 83.111946 65.881217 47.410956 44.802937 ... 88.542552 39.459497 69.721636 64.488981 37.041396 61.237031 79.462212 69.649389 51.180987 68.940976
13 860.0 d4 A 860 55.360605 72.189987 38.212628 34.857978 51.037946 22.990743 ... 24.832359 39.863679 6.140191 50.720383 49.301024 20.081855 30.651902 45.616837 43.706339 23.268500
14 703.0 d4 A 703 63.786795 91.658708 45.079556 38.195444 62.303038 44.122770 ... 12.225693 54.508002 17.043634 67.042027 63.698287 36.998669 37.670135 47.756617 62.666891 24.652095
15 332.0 d1 A 332 42.821370 4.531778 68.106693 87.537165 78.583906 46.003407 ... 88.746598 66.329889 72.615033 38.938495 68.535946 54.924087 68.113650 94.852933 31.188627 84.548226
16 421.0 d4 A 737 92.094146 49.456820 5.748891 78.056608 90.878511 43.237820 ... 41.345858 77.610084 40.240509 28.997387 86.214089 27.391943 13.321141 88.812933 36.261783 65.982378
17 168.0 d6 A 288 96.970377 43.432351 13.462072 85.405549 97.257833 48.573638 ... 48.879130 83.753692 47.852129 27.850097 92.117740 33.699189 21.065658 96.143982 37.692827 73.521133
18 879.0 d2 A 879 55.435879 14.893259 51.975224 81.146457 77.416144 35.851572 ... 74.926018 63.890422 60.217444 22.796787 68.256207 40.975887 52.444387 89.715444 15.632734 75.853395
19 975.0 d2 A 975 78.963607 60.492949 11.893892 61.141963 75.681686 31.833533 ... 26.952831 63.036185 23.091693 35.007665 72.066317 14.932236 4.387438 71.908430 35.691007 48.866508
20 695.0 d2 A 870 25.396905 29.264398 68.571716 61.096904 48.584108 31.404733 ... 76.356085 37.159709 57.783229 50.266211 38.388126 47.234563 65.185025 66.941262 36.748457 61.332750
21 168.0 d6 A 168 83.236596 39.053678 16.476521 77.154811 85.916110 36.117492 ... 48.795400 72.058003 42.211141 15.456362 79.944951 23.864284 20.327101 87.688165 23.359895 66.540031
22 444.0 d2 A 444 27.428143 36.782346 64.229068 53.609296 42.992913 25.253529 ... 69.411638 30.630769 50.692633 49.375590 33.214279 41.806331 60.183481 59.851656 35.575036 53.550876
23 68.0 d1 A 68 57.100253 81.391322 41.919669 32.739505 53.426687 32.322010 ... 18.964675 44.059086 7.547415 58.706494 53.453602 27.907215 34.147704 43.296928 52.620313 19.380824
24 168.0 d6 A 369 95.222787 33.877410 21.408068 89.642918 98.972235 49.129666 ... 56.949388 85.107099 53.466873 21.337889 92.939884 36.524583 28.191882 100.259725 33.433994 78.542752
25 856.0 d5 A 856 62.569423 51.740386 26.753620 53.453650 62.013456 13.801213 ... 37.606479 48.385242 23.183999 29.130287 56.744013 3.954344 22.007165 63.812181 22.731018 43.945435
26 183.0 d7 A 183 83.940223 5.693727 49.005660 102.813184 104.277445 56.618911 ... 81.954054 90.264202 72.909736 21.658245 95.940254 52.453131 53.692028 112.550795 31.919551 94.528373
27 883.0 d4 A 883 83.133068 89.403668 35.506370 57.527628 80.483650 52.119157 ... 8.277316 71.247575 26.575400 64.201310 80.665050 39.071727 30.023002 67.225851 63.933648 43.857004
28 421.0 d4 A 421 81.640411 52.236229 7.857201 67.768114 80.061638 32.845892 ... 35.535662 66.812182 30.702411 27.274989 75.462725 16.615969 7.314425 78.494960 30.321492 56.094385
29 592.0 d5 A 876 27.743584 76.150862 65.175518 14.368833 23.619114 33.506707 ... 48.682426 15.949127 33.169683 69.525621 24.654138 43.175621 57.944014 22.595534 58.355542 17.270988
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
70 856.0 d5 B 345 58.638301 47.668488 31.287058 53.974877 59.944100 10.083428 ... 42.823618 46.019950 27.287781 27.750468 53.903651 9.612539 27.218276 64.002522 18.981780 45.646388
71 444.0 d2 B 361 27.479804 36.604364 64.228527 53.788945 43.170813 25.304432 ... 69.505514 30.815247 50.792477 49.285409 33.386300 41.836274 60.202714 60.036175 35.486125 53.713090
72 592.0 d5 B 206 34.729465 68.963783 56.368510 23.258115 30.672178 23.540826 ... 44.344210 18.990595 26.815816 59.503858 28.440043 33.654880 49.416367 32.427738 48.452268 20.551570
73 883.0 d4 B 529 86.126453 86.676158 32.403670 60.625130 82.924957 52.189848 ... 10.976753 73.226339 27.865899 61.935039 82.673817 38.222309 27.451006 70.499071 62.451056 46.906466
74 252.0 d3 B 482 46.314089 45.050049 43.886067 49.815592 50.460908 5.689144 ... 51.056465 36.335701 33.184933 35.071025 43.242458 21.208220 39.579168 58.818772 22.533931 44.515267
75 1008.0 d2 C 22 20.752985 40.651682 69.806544 49.946289 36.317188 29.665941 ... 71.587523 25.027446 52.737836 56.796282 26.208296 46.616388 65.202565 55.107079 42.996615 51.781115
76 1003.0 d2 C 922 60.657075 5.352607 56.757575 91.948201 88.061300 46.311612 ... 83.099142 74.643958 69.511129 25.168192 78.740733 49.543964 58.545483 100.580166 23.707442 86.341838
77 592.0 d5 C 458 35.739470 74.608098 57.135684 18.726680 31.448631 27.767449 ... 41.376586 21.751545 25.150095 63.087829 31.081195 35.568640 49.850861 28.662562 52.688782 14.826047
78 881.0 d2 C 926 14.166645 25.071080 80.557626 66.285679 48.970374 42.802348 ... 87.048085 40.219805 68.280161 61.369392 38.536299 59.009802 77.096215 70.527538 48.139551 68.689236
79 1012.0 d3 C 307 43.169350 84.442045 54.947781 18.526852 40.477680 34.774197 ... 32.108370 33.180537 20.142281 67.108021 42.139562 37.026608 47.234951 29.183485 58.586669 5.997558
80 1012.0 d3 C 619 40.957144 88.032875 58.790147 15.519346 39.575879 38.711570 ... 34.214678 33.717549 23.889037 71.320846 42.282019 41.230406 51.049549 25.887921 62.707333 1.940717
81 183.0 d7 C 307 77.762289 15.690274 40.377760 91.078050 93.175647 44.964612 ... 71.533239 79.101118 61.531989 10.209775 85.129352 40.968540 44.028529 100.855231 20.528500 82.802202
82 587.0 d3 C 313 71.862153 42.137063 21.237251 66.816816 74.268534 24.351993 ... 45.159795 60.353967 34.517185 16.887171 68.170228 14.205963 20.641398 77.164480 16.329874 57.082450
83 1013.0 d1 C 511 91.528830 19.700248 34.796042 96.632156 102.221100 52.410653 ... 69.432753 88.106374 63.101600 17.067576 94.993684 43.783242 40.543600 106.920560 30.824865 86.795223
84 1014.0 d1 C 286 35.212861 23.004458 62.451404 68.037066 58.801783 30.372029 ... 75.590173 46.359777 57.866842 40.407354 48.918019 43.627999 60.222449 75.016951 27.593841 66.126321
85 879.0 d2 C 596 57.760015 12.518127 52.376758 84.219384 80.734660 38.656867 ... 76.659154 67.193908 62.423657 22.100529 71.578897 42.821526 53.326286 92.873895 17.067977 78.688887
86 1000.0 d2 C 461 2.823664 23.289376 91.803546 70.837589 49.776715 53.294358 ... 96.510262 44.007224 77.661917 72.748040 39.847019 69.838463 88.114625 73.505790 59.596358 75.150159
87 183.0 d7 C 288 76.121124 13.956717 42.518694 91.589473 92.979673 45.299934 ... 73.225779 78.943778 62.782625 11.711801 84.758480 42.167686 45.973510 101.253780 20.543802 83.604890
88 703.0 d4 C 421 68.362672 97.616524 47.957068 43.360623 68.586099 51.328379 ... 12.458687 61.470805 23.949395 72.319136 70.544801 43.045133 41.016457 52.212231 68.884413 30.332684
89 421.0 d4 C 420 88.504739 51.501583 2.995086 73.928126 86.887670 39.666546 ... 38.088466 73.690761 36.172388 28.920048 82.362273 23.515312 9.494189 84.683677 34.649431 61.885739
90 883.0 d4 C 211 75.161096 87.176202 35.635755 49.649935 72.227504 45.426519 ... 0.000000 62.995533 18.849930 61.523572 72.405462 33.979846 28.936744 59.564841 59.601595 35.929835
91 342.0 d5 C 69 16.106726 65.244066 73.879619 26.855785 14.125304 36.070942 ... 62.995533 0.000000 45.803027 71.406631 9.457582 50.221566 67.356693 30.325622 58.584491 33.189123
92 860.0 d4 C 987 60.888269 74.863352 34.913466 38.529088 56.587754 27.496105 ... 18.849930 45.803027 0.000000 51.342579 55.255439 20.632950 27.163751 49.260846 45.867217 25.826650
93 1015.0 d1 C 402 75.533272 25.653756 31.840638 81.725209 85.531890 36.204957 ... 61.523572 71.406631 51.342579 0.000000 78.065982 30.815474 34.589133 91.704832 13.803836 72.992636
94 342.0 d5 C 506 6.938285 62.923387 82.689121 33.190032 10.450687 43.829951 ... 72.405462 9.457582 55.255439 78.065982 0.000000 58.847134 76.351684 33.931966 64.789026 41.546800
95 856.0 d5 C 543 65.206215 54.687277 23.984063 53.293520 63.583664 16.966137 ... 33.979846 50.221566 20.632950 30.815474 58.847134 0.000000 18.603322 63.827066 25.855852 43.012025
96 975.0 d2 C 855 83.207738 59.380758 7.772157 65.397839 80.052453 35.568254 ... 28.936744 67.356693 27.163751 34.589133 76.351684 18.603322 0.000000 76.163833 36.957271 52.985577
97 1016.0 d1 C 375 16.190581 95.569572 83.798724 10.766601 25.458029 55.960049 ... 59.564841 30.325622 49.260846 91.704832 33.931966 63.827066 76.163833 0.000000 80.879509 24.002231
98 1004.0 d3 C 294 62.144505 29.013341 37.104769 71.400605 72.671715 24.952731 ... 59.601595 58.584491 45.867217 13.803836 64.789026 25.855852 36.957271 80.879509 0.000000 64.202555
99 1012.0 d3 C 258 39.277870 89.005715 60.724301 13.731502 38.420916 40.032902 ... 35.929835 33.189123 25.826650 72.992636 41.546800 43.012025 52.985577 24.002231 64.202555 0.000000

100 rows × 106 columns


In [50]:
print onc_gs['oncflag'].value_counts()


d2    22
d1    19
d4    16
d3    15
d5    15
d7     7
d6     6
Name: oncflag, dtype: int64

In [51]:
grp_clr = {'d1':'black',\
           'd2':'pink',\
           'd3':'red',\
           'd4':'orange',\
           'd5':'green',\
           'd6':'blue',\
           'd7':'purple',\
           'd8':'tan',\
           'd10':'grey'\
          }

cat_clr = {'A':'red',\
           'B':'green',\
           'C':'blue'\
          }

f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,20))

for z in range(len(test_x)):
    ax1.scatter(test_x[z], test_y[z], color=cat_clr[onc_gs.loc[z,'catname']])
    ax2.scatter(test_x[z], test_y[z], color=grp_clr[onc_gs.loc[z,'oncflag']])
    ax1.text(test_x[z], test_y[z], s=str(z))
    ax1.axis('equal')
    ax2.axis('equal')



In [52]:
onc_gs.to_csv('/Users/alin/Documents/check_group.csv')

In [ ]: