Here, we compare the distribution of bubble parameters for objects in the MWP catalog, at different confidence thresholds.
We split the catalog in 3 roughly equal partitions, with 1-1.3K objects each. We split on bubble probability, according to the ranges [0, .5), [.5, .9), [.9, 1]
The distributions are similar in many respects. Some notable differences:
There are longitudes where there are disproportionately many low-probability objects. Manual inspection confirms that the probabilistic score is still calibrated in these regions, and these regions are places where users seem to be overly optimistic in identifying bubbles. Furthermore, the 3 most prominent examples all contain very bright "super-bubbles".
Low-scoring bubbles have a slightly wider latitude distribution. Perhaps users are more likely to identify random wisps more easily against a lower-intensity background?
Of the 106 bubbles with known distances, most (69) are hi-probability objects. Mid and low-probability objects account for 24 and 11 bubbles, repsectively. This further bolsters the interpretation of probabilities as a means to identify "real" bubbles.
In [2]:
%matplotlib inline
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
from scipy.stats import ks_2samp
matplotlib.rc_file('matplotlibrc')
matplotlib.rcParams['interactive'] = True
matplotlib.rcParams['axes.grid'] = False
matplotlib.rcParams['axes.facecolor'] = 'white'
matplotlib.rcParams['font.size'] = 13
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = 3.4, 3.4
figures = {}
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
In [3]:
from sklearn.neighbors import NearestNeighbors
from astropy.table import Table
def _xmatch_hii(lon, lat):
hrds = Table.read('../data/hrds_distances.txt', format='cds')
xy = np.column_stack((hrds['GLON'], hrds['GLAT']))
nn = NearestNeighbors(n_neighbors=1).fit(xy)
d, ind = nn.kneighbors(np.column_stack((lon, lat)))
d = d.ravel()
ind = ind.ravel()
return hrds[ind], d
def bubble_distances(lon, lat, size):
hrds, d = _xmatch_hii(lon, lat)
result = np.array(hrds['D'])
result[d > size] = np.nan
return result
def matched_hii(lon, lat, size):
hrds, d = _xmatch_hii(lon, lat)
return d <= size
In [4]:
data = pd.read_csv('../data/pdr1.csv')
data.lon[data.lon > 270] = data.lon - 360
data['diam'] = 2 * np.sqrt(data.a * data.b) * 60.
data['dist'] = bubble_distances(data.lon.values, data.lat.values, data.a.values)
data['has_hii'] = matched_hii(data.lon.values, data.lat.values, data.a.values)
data['phys_diam'] = 1000 * np.radians(data.diam / 60) * data.dist
data['x'] = data.dist * (-1 * np.sin(np.radians(data.lon)))
data['y'] = data.dist * np.cos(np.radians(data.lon))
In [5]:
low = data[data.prob < .5]
mid = data[(data.prob >= .5) & (data.prob < .9)]
hi = data[data.prob >= .9]
nlow = data[~(data.prob < .5)]
nmid = data[~((data.prob >= .5) & (data.prob < .9))]
nhi = data[~(data.prob >= .9)]
print low.shape, mid.shape, hi.shape
In [6]:
def compare(c, **kwargs):
kwargs.setdefault('lw', 2)
kwargs.setdefault('histtype', 'step')
labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
color = ['#D95F02', '#E6AB02', '#7570B3']
color = ['#737373', '#FE9929', '#1D91C0']
result = plt.hist([low[c], mid[c], hi[c]],
color = color,
label=labels, **kwargs)
plt.ylabel("Number of Bubbles")
print "KS Tests"
print "low: %0.2e" % ks_2samp(low[c], nlow[c])[1]
print "mid: %0.2e" % ks_2samp(mid[c], nmid[c])[1]
print "hi: %0.2e" % ks_2samp(hi[c], nhi[c])[1]
return result
In [7]:
figures['lon'] = plt.figure(figsize=(5.3, 3.5), tight_layout=True)
bins, num, artists = compare('lon', bins=np.arange(-67, 67, 1),
lw=2, histtype='step', stacked=False)
def arm(l1, l2, name):
opts = dict(color='#cccccf', zorder=0)
plt.axvspan(l1, l2, **opts)
plt.annotate(name, xy=((l1 + l2) / 2, 110),
rotation=20, ha='left', va='bottom')
#add offset to each histogram
for offset, artist in zip([0, 30, 60], artists):
artist = artist[0]
xy = artist.get_xy()
xy[:, 1] += offset
baseline = np.array([[xy[:, 0].min(), offset], [xy[:, 0].max(), offset]])
xy = np.vstack((xy, baseline))
artist.set_xy(xy)
artist.set_fill(True)
artist.set_facecolor(artist.get_edgecolor())
plt.xlabel("Galactic Longitude (Degrees)")
plt.ylabel("Number of Bubbles")
plt.xlim(65, -65)
plt.ylim(0, 90)
yt = np.array([0, 20, 30, 50, 60, 80])
yl = yt % 30
plt.yticks(yt, yl)
for x in [-42, -54, -61]:
plt.axvline(x - 0.6, color='k', zorder=0)
remove_border(left=False, bottom=False)
opts = dict(color='k', lw=1)
plt.plot([65, 65], [0, 20], **opts)
plt.plot([65, 65], [30, 50], **opts)
plt.plot([65, 65], [60, 80], **opts)
#plt.plot([65, -65], [-.5, -.5], **opts)
#plt.plot([65, -65], [29.5, 29.5], **opts)
#plt.plot([65, -65], [60, ], **opts)
plt.gca().yaxis.tick_left()
plt.annotate('P > 0.9', (10, 80), ha='left', color=artists[2][0].get_facecolor())
plt.annotate('0.5 < P < 0.9', (10, 50), ha='left', color=artists[1][0].get_facecolor())
plt.annotate('P < 0.5', (10, 20), ha='left', color=artists[0][0].get_facecolor())
Out[7]:
There are sone longitudes with overdensities of low-p objects. How significant are they?
The $l=-61$ field has 25 low-p objects, 10 mid-p objects, and 6 high-p objects.
Below we compute the probability that we would observe such an excess, assuming the true low-p rate is < the high-p rate.
The $l=-61$ field has the most significant excess, by far. We can conservatively say that it shows an excess at 99% confidence
In [8]:
for lon in [38, -42, -54, -61]:
for df in [low, mid, hi]:
print df[(df.lon < lon) & (df.lon > lon - 1)].shape[0],
print
In [9]:
from scipy import stats
from scipy.stats import poisson
def prob(nhi, nmid, nlo):
def prob_data(llo, lmid, lhi):
return (poisson.pmf(nlo, llo) *
poisson.pmf(nmid, lmid) *
poisson.pmf(nhi, lhi))
#compute all likelihoods
llo, lmid, lhi = np.meshgrid(np.arange(nlo * 3),
np.arange(nmid * 3),
np.arange(nhi * 3))
grid = np.nan_to_num(prob_data(llo, lmid, lhi))
grid /= grid.sum()
#sum up prob for where high rate <= other rates
null = lhi <= np.minimum(llo, lmid)
p_null = grid[null].sum()
# return single-event probability,
# and the prob that this occurs at least once in 130 draws
return p_null, 1 - np.exp(130 * np.log(1 - p_null))
print prob(25, 13, 11)
print prob(23, 8, 9)
print prob(27, 9, 14)
print prob(25, 10, 6)
In [10]:
figures['lat'] = plt.figure(tight_layout=True)
compare('lat', bins=np.linspace(-1.1, 1.1, 30))
plt.xlabel("Galactic Latitude (Degrees)")
plt.legend(loc='upper left', frameon=False, fontsize=11)
cuts = pd.cut(data.prob, [0, .5, .8, 1])
print data.lat.mean(), data.lat.std()
data.groupby(cuts).lat.agg([np.mean, np.std])
plt.xlim(-1.1, 1.1)
plt.ylim(0, 150)
remove_border()
In [11]:
figures['diameter'] = plt.figure(tight_layout=True)
compare('diam', bins=np.arange(0, 12, .3))
plt.xlabel("Bubble Angular Diameter (arcmin)")
plt.legend(loc='upper right', frameon=False)
remove_border()
#plt.yscale('log')
In [12]:
figures['dist'] = plt.figure(tight_layout=True)
compare('dist', bins = np.arange(20), normed=True, cumulative=True)
plt.xlabel("Bubble Distance (kpc)")
plt.ylabel("Cumulative Fraction of Bubbles")
#plt.legend(loc='upper left', frameon=False)
plt.ylim(0, 1.05)
remove_border()
In [13]:
data.phys_diam.describe()
Out[13]:
In [14]:
compare('phys_diam', bins=np.linspace(1, 20, 30))
plt.xlabel("Bubble Diameter [pc]")
labels = ['P < 0.5', '0.5 < P < 0.9', 'P > 0.9']
color = ['#737373', '#FE9929', '#1D91C0']
for l, c, y in zip(labels, color, [.7, .8, .9]):
plt.annotate(l, (.4, y), color=c, xycoords='axes fraction', fontsize=16)
remove_border()
In [23]:
from skimage.io import imread
figures['map'] = plt.figure(figsize=(6, 6))
im = imread('ssc2008-10b.jpg')
im = np.flipud(im)
shp = im.shape
#sun is at (x, y) = (0, -8.5), and (xpix, ypix) = (1200, 1777)
scale = (45 * 0.306594845) / (1777 - 983)
l = -1200 * scale
r = l + shp[1] * scale
t = 1777 * scale
b = t - shp[0] * scale
plt.imshow(im, extent=[l, r, b, t], alpha=0.9)
for df, c, y, l in zip([low, mid, hi], color, [.2, .28, .36], labels):
plt.scatter(df['x'], df['y'], df['phys_diam'] * 7,
c=c, linewidth=.4)
plt.annotate(l, (.02, y), xycoords='axes fraction', color = c, fontsize=17)
plt.scatter([-14.5], [-.5], 10 * 7, color='w')
plt.annotate('10 pc', (-14, -.5), va='center', color='w')
plt.xlim(-15, 3)
plt.ylim(-3, 16)
plt.xticks([])
plt.yticks([])
Out[23]:
In [16]:
87 + 27 + 10
Out[16]:
In [17]:
low['phys_diam'].dropna().shape, mid['phys_diam'].dropna().shape, hi['phys_diam'].dropna().shape
Out[17]:
In [18]:
df['phys_diam'].dropna().shape
Out[18]:
In [19]:
big_bad = [583, 3115, 3584]
from bubbly.extractors import RGBExtractor
fig = plt.figure()
ex = RGBExtractor()
ex.shp = (100, 100)
for b in big_bad:
row = low.ix[b]
lon, lat, a = row['lon'], row['lat'], row['a']
im = ex.extract(np.round(lon), lon, lat, a * 3)
plt.imshow(im)
plt.show()
In [ ]:
data.has_hii.sum()
In [ ]:
figures['hii_score'] = plt.figure(tight_layout=True)
plt.hist(data[data.has_hii].prob,
bins=np.linspace(0, 1.1, 12), histtype='step', lw=2, normed=True,
label='H$_{II}$ Region',
color = '#DE2D26')
plt.hist(data[~data.has_hii].prob,
bins=np.linspace(0, 1.1, 12), histtype='stepfilled', lw=2, normed=True,
label='No H$_{II}$ Region',
color = '#888888', edgecolor='none', zorder=0)
plt.xlim(0, 1.1)
plt.xlabel("P(Bubble | Score)")
plt.yticks([])
plt.ylabel("Normalized Fraction")
plt.legend(loc='upper left', frameon=False)
remove_border(left=False)
A number of longitudes show spikes of low-probability MWP bubbles. For example, the $l=-42$ field. Are these regions where users are biased, or are they regions where the probabalistic score is inaccurate?
We check by looking at 23 fields at $l=-42$. 8-9 of these are possible bubbles. By summing the probabilities, we expect 7.9 real bubbles if the probabilities are accurate. Thus, the probabalistic score seems calibrated in this range, and the user annotations are off.
We repeated this analysis for the $l=-61$ field. The results are the same -- ~8 expected bubbles in the low-p sample, with 5-6 cancidates.
In [ ]:
.0714 / .0154
In [ ]:
from bubbly.extractors import RGBExtractor
import bubbly.field
reload(bubbly.field)
fig = plt.figure()
ex = RGBExtractor()
ex.shp = (100, 100)
bs = low[(low.lon < -61) & (low.lon > -62)]
bs['lon'] += 360
print bs.prob.sum(), bs.shape
ims = []
for i, r in bs.iterrows():
try:
im = ex.extract(np.round(r['lon']), r['lon'], r['lat'], r['diam'] / 60 * 2)
plt.imshow(im)
plt.title(r['prob'])
plt.show()
except ValueError:
pass
In [ ]: