Weighted MWP distributions

Here, we compare the distribution of bubble parameters for objects in the MWP catalog, at different confidence thresholds.

We split the catalog in 3 roughly equal partitions, with 1-1.3K objects each. We split on bubble probability, according to the ranges [0, .5), [.5, .9), [.9, 1]

The distributions are similar in many respects. Some notable differences:

  1. There are longitudes where there are disproportionately many low-probability objects. Manual inspection confirms that the probabilistic score is still calibrated in these regions, and these regions are places where users seem to be overly optimistic in identifying bubbles. Furthermore, the 3 most prominent examples all contain very bright "super-bubbles".

  2. Low-scoring bubbles have a slightly wider latitude distribution. Perhaps users are more likely to identify random wisps more easily against a lower-intensity background?

  3. Of the 106 bubbles with known distances, most (69) are hi-probability objects. Mid and low-probability objects account for 24 and 11 bubbles, repsectively. This further bolsters the interpretation of probabilities as a means to identify "real" bubbles.


In [2]:
%matplotlib inline

import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
from scipy.stats import ks_2samp

matplotlib.rc_file('matplotlibrc')
matplotlib.rcParams['interactive'] = True
matplotlib.rcParams['axes.grid'] = False
matplotlib.rcParams['axes.facecolor'] = 'white'
matplotlib.rcParams['font.size'] = 13
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = 3.4, 3.4
figures = {}

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

In [3]:
from sklearn.neighbors import NearestNeighbors
from astropy.table import Table

def _xmatch_hii(lon, lat):
    hrds = Table.read('../data/hrds_distances.txt', format='cds')
    xy = np.column_stack((hrds['GLON'], hrds['GLAT']))
    nn = NearestNeighbors(n_neighbors=1).fit(xy)
    d, ind = nn.kneighbors(np.column_stack((lon, lat)))
    d = d.ravel()
    ind = ind.ravel()
    return hrds[ind], d
    
def bubble_distances(lon, lat, size):
    hrds, d = _xmatch_hii(lon, lat)
    result = np.array(hrds['D'])
    result[d > size] = np.nan
    return result

def matched_hii(lon, lat, size):
    hrds, d = _xmatch_hii(lon, lat)
    return d <= size

In [4]:
data = pd.read_csv('../data/pdr1.csv')
data.lon[data.lon > 270] = data.lon - 360
data['diam'] = 2 * np.sqrt(data.a * data.b) * 60.
data['dist'] = bubble_distances(data.lon.values, data.lat.values, data.a.values)
data['has_hii'] = matched_hii(data.lon.values, data.lat.values, data.a.values)
data['phys_diam'] = 1000 * np.radians(data.diam / 60) * data.dist
data['x'] = data.dist * (-1 * np.sin(np.radians(data.lon)))
data['y'] = data.dist * np.cos(np.radians(data.lon))

In [5]:
low = data[data.prob < .5]
mid = data[(data.prob >= .5) & (data.prob < .9)]
hi = data[data.prob >= .9]

nlow = data[~(data.prob < .5)]
nmid = data[~((data.prob >= .5) & (data.prob < .9))]
nhi = data[~(data.prob >= .9)]
              
print low.shape, mid.shape, hi.shape


(1224, 13) (1220, 13) (1218, 13)

In [6]:
def compare(c, **kwargs):
    kwargs.setdefault('lw', 2)
    kwargs.setdefault('histtype', 'step')
    
    labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
    color = ['#D95F02', '#E6AB02', '#7570B3']
    color = ['#737373', '#FE9929', '#1D91C0']
    result = plt.hist([low[c], mid[c], hi[c]], 
                      color = color,
                      label=labels, **kwargs)
    plt.ylabel("Number of Bubbles")
    
    print "KS Tests"
    print "low: %0.2e" % ks_2samp(low[c], nlow[c])[1]
    print "mid: %0.2e" % ks_2samp(mid[c], nmid[c])[1]
    print "hi:  %0.2e" % ks_2samp(hi[c], nhi[c])[1]
    
    return result

Longitude Distribution


In [7]:
figures['lon'] = plt.figure(figsize=(5.3, 3.5), tight_layout=True)
bins, num, artists = compare('lon', bins=np.arange(-67, 67, 1), 
                             lw=2, histtype='step', stacked=False)

def arm(l1, l2, name):
    opts = dict(color='#cccccf', zorder=0)
    plt.axvspan(l1, l2, **opts)
    plt.annotate(name, xy=((l1 + l2) / 2, 110), 
                 rotation=20, ha='left', va='bottom')
    
#add offset to each histogram
for offset, artist in zip([0, 30, 60], artists):
    artist = artist[0]
    xy = artist.get_xy()
    xy[:, 1] += offset
    baseline = np.array([[xy[:, 0].min(), offset], [xy[:, 0].max(), offset]])
    xy = np.vstack((xy, baseline))
    artist.set_xy(xy)
    artist.set_fill(True)
    artist.set_facecolor(artist.get_edgecolor())
    
plt.xlabel("Galactic Longitude (Degrees)")
plt.ylabel("Number of Bubbles")

plt.xlim(65, -65)
plt.ylim(0, 90)

yt = np.array([0, 20, 30, 50, 60, 80])
yl = yt % 30
plt.yticks(yt, yl)
for x in [-42, -54, -61]:   
    plt.axvline(x - 0.6, color='k', zorder=0)
    
remove_border(left=False, bottom=False)
opts = dict(color='k', lw=1)

plt.plot([65, 65], [0, 20], **opts)
plt.plot([65, 65], [30, 50], **opts)
plt.plot([65, 65], [60, 80], **opts)
#plt.plot([65, -65], [-.5, -.5], **opts)
#plt.plot([65, -65], [29.5, 29.5], **opts)
#plt.plot([65, -65], [60, ], **opts)
plt.gca().yaxis.tick_left()

plt.annotate('P > 0.9', (10, 80), ha='left', color=artists[2][0].get_facecolor())
plt.annotate('0.5 < P < 0.9', (10, 50), ha='left', color=artists[1][0].get_facecolor())
plt.annotate('P < 0.5', (10, 20), ha='left', color=artists[0][0].get_facecolor())


KS Tests
low: 7.00e-05
mid: 2.72e-02
hi:  8.59e-02
Out[7]:
<matplotlib.text.Annotation at 0x10c4aaf10>
/Users/beaumont/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:1595: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

There are sone longitudes with overdensities of low-p objects. How significant are they?

The $l=-61$ field has 25 low-p objects, 10 mid-p objects, and 6 high-p objects.

Below we compute the probability that we would observe such an excess, assuming the true low-p rate is < the high-p rate.

The $l=-61$ field has the most significant excess, by far. We can conservatively say that it shows an excess at 99% confidence


In [8]:
for lon in [38, -42, -54, -61]:
    for df in [low, mid, hi]:
        print df[(df.lon < lon) & (df.lon > lon - 1)].shape[0],
    print


24 16 9
23 8 9
25 11 14
22 14 5

In [9]:
from scipy import stats
from scipy.stats import poisson

def prob(nhi, nmid, nlo):    
    def prob_data(llo, lmid, lhi):
        return (poisson.pmf(nlo, llo) * 
                poisson.pmf(nmid, lmid) * 
                poisson.pmf(nhi, lhi))
    
    #compute all likelihoods
    llo, lmid, lhi = np.meshgrid(np.arange(nlo * 3), 
                                 np.arange(nmid *  3), 
                                 np.arange(nhi * 3))
    grid = np.nan_to_num(prob_data(llo, lmid, lhi))
    grid /= grid.sum()
    
    #sum up prob for where high rate <= other rates
    null = lhi <= np.minimum(llo, lmid)
    p_null = grid[null].sum()

    # return single-event probability,
    # and the prob that this occurs at least once in 130 draws
    return p_null, 1 - np.exp(130 * np.log(1 - p_null))

print prob(25, 13, 11)
print prob(23, 8, 9)
print prob(27, 9, 14)
print prob(25, 10, 6)


(0.0027696530088030854, 0.30271037750389451)
(0.0004914034637593384, 0.061899461165620284)
(0.00046353489894916401, 0.058493010744103069)
(4.9446671028174121e-05, 0.0064076093165995607)

In [10]:
figures['lat'] = plt.figure(tight_layout=True)
compare('lat', bins=np.linspace(-1.1, 1.1, 30))
plt.xlabel("Galactic Latitude (Degrees)")
plt.legend(loc='upper left', frameon=False, fontsize=11)

cuts = pd.cut(data.prob, [0, .5, .8, 1])
print data.lat.mean(), data.lat.std()
data.groupby(cuts).lat.agg([np.mean, np.std])
plt.xlim(-1.1, 1.1)
plt.ylim(0, 150)
remove_border()


KS Tests
low: 1.30e-03
mid: 1.86e-04
hi:  9.75e-02
-0.0614146519498 0.418734023693

In [11]:
figures['diameter'] = plt.figure(tight_layout=True)
compare('diam', bins=np.arange(0, 12, .3))
plt.xlabel("Bubble Angular Diameter (arcmin)")
plt.legend(loc='upper right', frameon=False)
remove_border()
#plt.yscale('log')


KS Tests
low: 5.76e-09
mid: 2.80e-13
hi:  1.48e-02

In [12]:
figures['dist'] = plt.figure(tight_layout=True)
compare('dist', bins = np.arange(20), normed=True, cumulative=True)
plt.xlabel("Bubble Distance (kpc)")
plt.ylabel("Cumulative Fraction of Bubbles")
#plt.legend(loc='upper left', frameon=False)

plt.ylim(0, 1.05)
remove_border()


KS Tests
low: 1.85e-01
mid: 9.41e-01
hi:  1.03e-02

In [13]:
data.phys_diam.describe()


Out[13]:
count    126.000000
mean       5.400022
std        9.870982
min        0.000000
25%        1.940610
50%        3.408382
75%        5.376190
max       77.697027
dtype: float64

In [14]:
compare('phys_diam', bins=np.linspace(1, 20, 30))
plt.xlabel("Bubble Diameter [pc]")
labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
color = ['#737373', '#FE9929', '#1D91C0']
for l, c, y in zip(labels, color, [.7, .8, .9]):
    plt.annotate(l, (.4, y), color=c, xycoords='axes fraction', fontsize=16)
remove_border()


KS Tests
low: 1.13e-01
mid: 9.50e-01
hi:  6.53e-03

In [23]:
from skimage.io import imread
figures['map'] = plt.figure(figsize=(6, 6))
im = imread('ssc2008-10b.jpg')
im = np.flipud(im)
shp = im.shape

#sun is at (x, y) = (0, -8.5), and (xpix, ypix) = (1200, 1777)
scale = (45 * 0.306594845) / (1777 - 983)
l = -1200 * scale
r = l + shp[1] * scale
t = 1777 * scale
b = t - shp[0] * scale
plt.imshow(im, extent=[l, r, b, t], alpha=0.9)


for df, c, y, l in zip([low, mid, hi], color, [.2, .28, .36], labels):
    plt.scatter(df['x'], df['y'], df['phys_diam'] * 7, 
                c=c, linewidth=.4)
    plt.annotate(l, (.02, y), xycoords='axes fraction', color = c, fontsize=17)

plt.scatter([-14.5], [-.5], 10 * 7, color='w')
plt.annotate('10 pc', (-14, -.5), va='center', color='w')
plt.xlim(-15, 3)
plt.ylim(-3, 16)
plt.xticks([])
plt.yticks([])


Out[23]:
([], <a list of 0 Text yticklabel objects>)

In [16]:
87 + 27 + 10


Out[16]:
124

In [17]:
low['phys_diam'].dropna().shape, mid['phys_diam'].dropna().shape, hi['phys_diam'].dropna().shape


Out[17]:
((10,), (27,), (87,))

In [18]:
df['phys_diam'].dropna().shape


Out[18]:
(87,)

In [19]:
big_bad = [583, 3115, 3584]
from bubbly.extractors import RGBExtractor

fig = plt.figure()

ex = RGBExtractor()
ex.shp = (100, 100)

for b in big_bad:
    row = low.ix[b]
    lon, lat, a = row['lon'], row['lat'], row['a']
    im = ex.extract(np.round(lon), lon, lat, a * 3)
    plt.imshow(im)
    plt.show()


ERROR:astropy:ImportError: No module named cloud
ERROR: ImportError: No module named cloud [IPython.core.interactiveshell]
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-19-8509aad26733> in <module>()
      1 big_bad = [583, 3115, 3584]
----> 2 from bubbly.extractors import RGBExtractor
      3 
      4 fig = plt.figure()
      5 

/Users/beaumont/mwp/bubbly/extractors.py in <module>()
     11 from skimage.feature import daisy
     12 
---> 13 from .field import get_field
     14 from .util import normalize, ellipse, multiwavelet_from_rgb
     15 

/Users/beaumont/mwp/bubbly/field.py in <module>()
      4 import random
      5 
----> 6 from cloud import running_on_cloud
      7 from astropy.io import fits
      8 from astropy.wcs import WCS

ImportError: No module named cloud

In [ ]:
data.has_hii.sum()

In [ ]:
figures['hii_score'] = plt.figure(tight_layout=True)
plt.hist(data[data.has_hii].prob,
         bins=np.linspace(0, 1.1, 12), histtype='step', lw=2, normed=True,
         label='H$_{II}$ Region',
         color = '#DE2D26')
plt.hist(data[~data.has_hii].prob,
         bins=np.linspace(0, 1.1, 12), histtype='stepfilled', lw=2, normed=True,
         label='No H$_{II}$ Region',
         color = '#888888', edgecolor='none', zorder=0)

plt.xlim(0, 1.1)
plt.xlabel("P(Bubble | Score)")
plt.yticks([])
plt.ylabel("Normalized Fraction")
plt.legend(loc='upper left', frameon=False)
remove_border(left=False)

Longitude Spikes

A number of longitudes show spikes of low-probability MWP bubbles. For example, the $l=-42$ field. Are these regions where users are biased, or are they regions where the probabalistic score is inaccurate?

We check by looking at 23 fields at $l=-42$. 8-9 of these are possible bubbles. By summing the probabilities, we expect 7.9 real bubbles if the probabilities are accurate. Thus, the probabalistic score seems calibrated in this range, and the user annotations are off.

We repeated this analysis for the $l=-61$ field. The results are the same -- ~8 expected bubbles in the low-p sample, with 5-6 cancidates.


In [ ]:
.0714 / .0154

In [ ]:
from bubbly.extractors import RGBExtractor
import bubbly.field
reload(bubbly.field)

fig = plt.figure()

ex = RGBExtractor()
ex.shp = (100, 100)
bs = low[(low.lon < -61) & (low.lon > -62)]
bs['lon'] += 360
print bs.prob.sum(), bs.shape
ims = []
for i, r in bs.iterrows():
    try:
        im = ex.extract(np.round(r['lon']), r['lon'], r['lat'], r['diam'] / 60 * 2)
        plt.imshow(im)
        plt.title(r['prob'])
        plt.show()
    except ValueError:
        pass

In [ ]: