## Weighted MWP distributions

Here, we compare the distribution of bubble parameters for objects in the MWP catalog, at different confidence thresholds.

We split the catalog in 3 roughly equal partitions, with 1-1.3K objects each. We split on bubble probability, according to the ranges [0, .5), [.5, .9), [.9, 1]

The distributions are similar in many respects. Some notable differences:

1. There are longitudes where there are disproportionately many low-probability objects. Manual inspection confirms that the probabilistic score is still calibrated in these regions, and these regions are places where users seem to be overly optimistic in identifying bubbles. Furthermore, the 3 most prominent examples all contain very bright "super-bubbles".

2. Low-scoring bubbles have a slightly wider latitude distribution. Perhaps users are more likely to identify random wisps more easily against a lower-intensity background?

3. Of the 106 bubbles with known distances, most (69) are hi-probability objects. Mid and low-probability objects account for 24 and 11 bubbles, repsectively. This further bolsters the interpretation of probabilities as a means to identify "real" bubbles.

``````

In [2]:

%matplotlib inline

import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
from scipy.stats import ks_2samp

matplotlib.rc_file('matplotlibrc')
matplotlib.rcParams['interactive'] = True
matplotlib.rcParams['axes.grid'] = False
matplotlib.rcParams['axes.facecolor'] = 'white'
matplotlib.rcParams['font.size'] = 13
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = 3.4, 3.4
figures = {}

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks

The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)

#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')

#now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()

``````
``````

In [3]:

from sklearn.neighbors import NearestNeighbors
from astropy.table import Table

def _xmatch_hii(lon, lat):
xy = np.column_stack((hrds['GLON'], hrds['GLAT']))
nn = NearestNeighbors(n_neighbors=1).fit(xy)
d, ind = nn.kneighbors(np.column_stack((lon, lat)))
d = d.ravel()
ind = ind.ravel()
return hrds[ind], d

def bubble_distances(lon, lat, size):
hrds, d = _xmatch_hii(lon, lat)
result = np.array(hrds['D'])
result[d > size] = np.nan
return result

def matched_hii(lon, lat, size):
hrds, d = _xmatch_hii(lon, lat)
return d <= size

``````
``````

In [4]:

data.lon[data.lon > 270] = data.lon - 360
data['diam'] = 2 * np.sqrt(data.a * data.b) * 60.
data['dist'] = bubble_distances(data.lon.values, data.lat.values, data.a.values)
data['has_hii'] = matched_hii(data.lon.values, data.lat.values, data.a.values)
data['phys_diam'] = 1000 * np.radians(data.diam / 60) * data.dist
data['x'] = data.dist * (-1 * np.sin(np.radians(data.lon)))

``````
``````

In [5]:

low = data[data.prob < .5]
mid = data[(data.prob >= .5) & (data.prob < .9)]
hi = data[data.prob >= .9]

nlow = data[~(data.prob < .5)]
nmid = data[~((data.prob >= .5) & (data.prob < .9))]
nhi = data[~(data.prob >= .9)]

print low.shape, mid.shape, hi.shape

``````
``````

(1224, 13) (1220, 13) (1218, 13)

``````
``````

In [6]:

def compare(c, **kwargs):
kwargs.setdefault('lw', 2)
kwargs.setdefault('histtype', 'step')

labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
color = ['#D95F02', '#E6AB02', '#7570B3']
color = ['#737373', '#FE9929', '#1D91C0']
result = plt.hist([low[c], mid[c], hi[c]],
color = color,
label=labels, **kwargs)
plt.ylabel("Number of Bubbles")

print "KS Tests"
print "low: %0.2e" % ks_2samp(low[c], nlow[c])[1]
print "mid: %0.2e" % ks_2samp(mid[c], nmid[c])[1]
print "hi:  %0.2e" % ks_2samp(hi[c], nhi[c])[1]

return result

``````

## Longitude Distribution

``````

In [7]:

figures['lon'] = plt.figure(figsize=(5.3, 3.5), tight_layout=True)
bins, num, artists = compare('lon', bins=np.arange(-67, 67, 1),
lw=2, histtype='step', stacked=False)

def arm(l1, l2, name):
opts = dict(color='#cccccf', zorder=0)
plt.axvspan(l1, l2, **opts)
plt.annotate(name, xy=((l1 + l2) / 2, 110),
rotation=20, ha='left', va='bottom')

for offset, artist in zip([0, 30, 60], artists):
artist = artist[0]
xy = artist.get_xy()
xy[:, 1] += offset
baseline = np.array([[xy[:, 0].min(), offset], [xy[:, 0].max(), offset]])
xy = np.vstack((xy, baseline))
artist.set_xy(xy)
artist.set_fill(True)
artist.set_facecolor(artist.get_edgecolor())

plt.xlabel("Galactic Longitude (Degrees)")
plt.ylabel("Number of Bubbles")

plt.xlim(65, -65)
plt.ylim(0, 90)

yt = np.array([0, 20, 30, 50, 60, 80])
yl = yt % 30
plt.yticks(yt, yl)
for x in [-42, -54, -61]:
plt.axvline(x - 0.6, color='k', zorder=0)

remove_border(left=False, bottom=False)
opts = dict(color='k', lw=1)

plt.plot([65, 65], [0, 20], **opts)
plt.plot([65, 65], [30, 50], **opts)
plt.plot([65, 65], [60, 80], **opts)
#plt.plot([65, -65], [-.5, -.5], **opts)
#plt.plot([65, -65], [29.5, 29.5], **opts)
#plt.plot([65, -65], [60, ], **opts)
plt.gca().yaxis.tick_left()

plt.annotate('P > 0.9', (10, 80), ha='left', color=artists[2][0].get_facecolor())
plt.annotate('0.5 < P < 0.9', (10, 50), ha='left', color=artists[1][0].get_facecolor())
plt.annotate('P < 0.5', (10, 20), ha='left', color=artists[0][0].get_facecolor())

``````
``````

KS Tests
low: 7.00e-05
mid: 2.72e-02
hi:  8.59e-02

Out[7]:

<matplotlib.text.Annotation at 0x10c4aaf10>

/Users/beaumont/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:1595: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
warnings.warn("This figure includes Axes that are not "

``````

There are sone longitudes with overdensities of low-p objects. How significant are they?

The \$l=-61\$ field has 25 low-p objects, 10 mid-p objects, and 6 high-p objects.

Below we compute the probability that we would observe such an excess, assuming the true low-p rate is < the high-p rate.

The \$l=-61\$ field has the most significant excess, by far. We can conservatively say that it shows an excess at 99% confidence

``````

In [8]:

for lon in [38, -42, -54, -61]:
for df in [low, mid, hi]:
print df[(df.lon < lon) & (df.lon > lon - 1)].shape[0],
print

``````
``````

24 16 9
23 8 9
25 11 14
22 14 5

``````
``````

In [9]:

from scipy import stats
from scipy.stats import poisson

def prob(nhi, nmid, nlo):
def prob_data(llo, lmid, lhi):
return (poisson.pmf(nlo, llo) *
poisson.pmf(nmid, lmid) *
poisson.pmf(nhi, lhi))

#compute all likelihoods
llo, lmid, lhi = np.meshgrid(np.arange(nlo * 3),
np.arange(nmid *  3),
np.arange(nhi * 3))
grid = np.nan_to_num(prob_data(llo, lmid, lhi))
grid /= grid.sum()

#sum up prob for where high rate <= other rates
null = lhi <= np.minimum(llo, lmid)
p_null = grid[null].sum()

# return single-event probability,
# and the prob that this occurs at least once in 130 draws
return p_null, 1 - np.exp(130 * np.log(1 - p_null))

print prob(25, 13, 11)
print prob(23, 8, 9)
print prob(27, 9, 14)
print prob(25, 10, 6)

``````
``````

(0.0027696530088030854, 0.30271037750389451)
(0.0004914034637593384, 0.061899461165620284)
(0.00046353489894916401, 0.058493010744103069)
(4.9446671028174121e-05, 0.0064076093165995607)

``````
``````

In [10]:

figures['lat'] = plt.figure(tight_layout=True)
compare('lat', bins=np.linspace(-1.1, 1.1, 30))
plt.xlabel("Galactic Latitude (Degrees)")
plt.legend(loc='upper left', frameon=False, fontsize=11)

cuts = pd.cut(data.prob, [0, .5, .8, 1])
print data.lat.mean(), data.lat.std()
data.groupby(cuts).lat.agg([np.mean, np.std])
plt.xlim(-1.1, 1.1)
plt.ylim(0, 150)
remove_border()

``````
``````

KS Tests
low: 1.30e-03
mid: 1.86e-04
hi:  9.75e-02
-0.0614146519498 0.418734023693

``````
``````

In [11]:

figures['diameter'] = plt.figure(tight_layout=True)
compare('diam', bins=np.arange(0, 12, .3))
plt.xlabel("Bubble Angular Diameter (arcmin)")
plt.legend(loc='upper right', frameon=False)
remove_border()
#plt.yscale('log')

``````
``````

KS Tests
low: 5.76e-09
mid: 2.80e-13
hi:  1.48e-02

``````
``````

In [12]:

figures['dist'] = plt.figure(tight_layout=True)
compare('dist', bins = np.arange(20), normed=True, cumulative=True)
plt.xlabel("Bubble Distance (kpc)")
plt.ylabel("Cumulative Fraction of Bubbles")
#plt.legend(loc='upper left', frameon=False)

plt.ylim(0, 1.05)
remove_border()

``````
``````

KS Tests
low: 1.85e-01
mid: 9.41e-01
hi:  1.03e-02

``````
``````

In [13]:

data.phys_diam.describe()

``````
``````

Out[13]:

count    126.000000
mean       5.400022
std        9.870982
min        0.000000
25%        1.940610
50%        3.408382
75%        5.376190
max       77.697027
dtype: float64

``````
``````

In [14]:

compare('phys_diam', bins=np.linspace(1, 20, 30))
plt.xlabel("Bubble Diameter [pc]")
labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
color = ['#737373', '#FE9929', '#1D91C0']
for l, c, y in zip(labels, color, [.7, .8, .9]):
plt.annotate(l, (.4, y), color=c, xycoords='axes fraction', fontsize=16)
remove_border()

``````
``````

KS Tests
low: 1.13e-01
mid: 9.50e-01
hi:  6.53e-03

``````
``````

In [23]:

figures['map'] = plt.figure(figsize=(6, 6))
im = np.flipud(im)
shp = im.shape

#sun is at (x, y) = (0, -8.5), and (xpix, ypix) = (1200, 1777)
scale = (45 * 0.306594845) / (1777 - 983)
l = -1200 * scale
r = l + shp[1] * scale
t = 1777 * scale
b = t - shp[0] * scale
plt.imshow(im, extent=[l, r, b, t], alpha=0.9)

for df, c, y, l in zip([low, mid, hi], color, [.2, .28, .36], labels):
plt.scatter(df['x'], df['y'], df['phys_diam'] * 7,
c=c, linewidth=.4)
plt.annotate(l, (.02, y), xycoords='axes fraction', color = c, fontsize=17)

plt.scatter([-14.5], [-.5], 10 * 7, color='w')
plt.annotate('10 pc', (-14, -.5), va='center', color='w')
plt.xlim(-15, 3)
plt.ylim(-3, 16)
plt.xticks([])
plt.yticks([])

``````
``````

Out[23]:

([], <a list of 0 Text yticklabel objects>)

``````
``````

In [16]:

87 + 27 + 10

``````
``````

Out[16]:

124

``````
``````

In [17]:

low['phys_diam'].dropna().shape, mid['phys_diam'].dropna().shape, hi['phys_diam'].dropna().shape

``````
``````

Out[17]:

((10,), (27,), (87,))

``````
``````

In [18]:

df['phys_diam'].dropna().shape

``````
``````

Out[18]:

(87,)

``````
``````

In [19]:

from bubbly.extractors import RGBExtractor

fig = plt.figure()

ex = RGBExtractor()
ex.shp = (100, 100)

row = low.ix[b]
lon, lat, a = row['lon'], row['lat'], row['a']
im = ex.extract(np.round(lon), lon, lat, a * 3)
plt.imshow(im)
plt.show()

``````
``````

ERROR:astropy:ImportError: No module named cloud

ERROR: ImportError: No module named cloud [IPython.core.interactiveshell]

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
1 big_bad = [583, 3115, 3584]
----> 2 from bubbly.extractors import RGBExtractor
3
4 fig = plt.figure()
5

/Users/beaumont/mwp/bubbly/extractors.py in <module>()
11 from skimage.feature import daisy
12
---> 13 from .field import get_field
14 from .util import normalize, ellipse, multiwavelet_from_rgb
15

/Users/beaumont/mwp/bubbly/field.py in <module>()
4 import random
5
----> 6 from cloud import running_on_cloud
7 from astropy.io import fits
8 from astropy.wcs import WCS

ImportError: No module named cloud

``````
``````

In [ ]:

data.has_hii.sum()

``````
``````

In [ ]:

figures['hii_score'] = plt.figure(tight_layout=True)
plt.hist(data[data.has_hii].prob,
bins=np.linspace(0, 1.1, 12), histtype='step', lw=2, normed=True,
label='H\$_{II}\$ Region',
color = '#DE2D26')
plt.hist(data[~data.has_hii].prob,
bins=np.linspace(0, 1.1, 12), histtype='stepfilled', lw=2, normed=True,
label='No H\$_{II}\$ Region',
color = '#888888', edgecolor='none', zorder=0)

plt.xlim(0, 1.1)
plt.xlabel("P(Bubble | Score)")
plt.yticks([])
plt.ylabel("Normalized Fraction")
plt.legend(loc='upper left', frameon=False)
remove_border(left=False)

``````

## Longitude Spikes

A number of longitudes show spikes of low-probability MWP bubbles. For example, the \$l=-42\$ field. Are these regions where users are biased, or are they regions where the probabalistic score is inaccurate?

We check by looking at 23 fields at \$l=-42\$. 8-9 of these are possible bubbles. By summing the probabilities, we expect 7.9 real bubbles if the probabilities are accurate. Thus, the probabalistic score seems calibrated in this range, and the user annotations are off.

We repeated this analysis for the \$l=-61\$ field. The results are the same -- ~8 expected bubbles in the low-p sample, with 5-6 cancidates.

``````

In [ ]:

.0714 / .0154

``````
``````

In [ ]:

from bubbly.extractors import RGBExtractor
import bubbly.field

fig = plt.figure()

ex = RGBExtractor()
ex.shp = (100, 100)
bs = low[(low.lon < -61) & (low.lon > -62)]
bs['lon'] += 360
print bs.prob.sum(), bs.shape
ims = []
for i, r in bs.iterrows():
try:
im = ex.extract(np.round(r['lon']), r['lon'], r['lat'], r['diam'] / 60 * 2)
plt.imshow(im)
plt.title(r['prob'])
plt.show()
except ValueError:
pass

``````
``````

In [ ]:

``````