In [14]:
import numpy as np
from scipy import stats
import pylab as pl
import healpix_util as hu
import astropy as ap
import numpy as np
from astropy.io import fits
from astropy.table import Table
import astropy.io.ascii as ascii
from astropy.constants import c
import matplotlib.pyplot as plt
import math
import scipy.special as sp
from scipy import integrate
%pylab inline
%matplotlib inline
import time


Populating the interactive namespace from numpy and matplotlib

In [2]:
hmqdat=ascii.read("./hmqdata_sorted_full.csv")

In [3]:
def bayesian_blocks(t):
    """Bayesian Blocks Implementation

    By Jake Vanderplas.  License: BSD
    Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S

    Parameters
    ----------
    t : ndarray, length N
        data to be histogrammed

    Returns
    -------
    bins : ndarray
        array containing the (N+1) bin edges

    Notes
    -----
    This is an incomplete implementation: it may fail for some
    datasets.  Alternate fitness functions and prior forms can
    be found in the paper listed above.
    """
    # copy and sort the array
    t = np.sort(t)
    N = t.size

    # create length-(N + 1) array of cell edges
    edges = np.concatenate([t[:1],
                            0.5 * (t[1:] + t[:-1]),
                            t[-1:]])
    block_length = t[-1] - edges

    # arrays needed for the iteration
    nn_vec = np.ones(N)
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)

    #-----------------------------------------------------------------
    # Start with first data cell; add one cell at each iteration
    #-----------------------------------------------------------------
    for K in range(N):
        # Compute the width and count of the final bin for all possible
        # locations of the K^th changepoint
        width = block_length[:K + 1] - block_length[K + 1]
        count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]

        # evaluate fitness function for these possibilities
        fit_vec = count_vec * (np.log(count_vec) - np.log(width))
        fit_vec -= 4  # 4 comes from the prior on the number of changepoints
        fit_vec[1:] += best[:K]

        # find the max of the fitness: this is the K^th changepoint
        i_max = np.argmax(fit_vec)
        last[K] = i_max
        best[K] = fit_vec[i_max]

    #-----------------------------------------------------------------
    # Recover changepoints by iteratively peeling off the last block
    #-----------------------------------------------------------------
    change_points =  np.zeros(N, dtype=int)
    i_cp = N
    ind = N
    while True:
        i_cp -= 1
        change_points[i_cp] = ind
        if ind == 0:
            break
        ind = last[ind - 1]
    change_points = change_points[i_cp:]

    return edges[change_points]

In [5]:
# plot a standard histogram in the background, with alpha transparency
H1 = hist(hmqdat['Z'], bins=200, histtype='stepfilled',
          alpha=0.2, normed=True)
# plot an adaptive-width histogram on top
H2 = hist(hmqdat['Z'], bins=bayesian_blocks(hmqdat['Z']), color='black',
          histtype='step', normed=True)


/Users/rohin/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:48: RuntimeWarning: divide by zero encountered in log
/Users/rohin/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6198: RuntimeWarning: invalid value encountered in true_divide
  m = (m.astype(float) / db) / m.sum()

In [13]:
hmqdat['Z']


Out[13]:
<Column name='Z' dtype='float64' length=510355>
-0.002
-0.001
-0.001
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
6.22
6.23
6.25
6.29
6.305
6.43
6.438
6.44
6.604
6.745
6.886
7.085

In [15]:
starttime = time.time()
bins=bayesian_blocks(hmqdat['Z'])
endtime = time.time()
tottime  = endtime-starttime
print "total time taken"
print tottime


/Users/rohin/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:48: RuntimeWarning: divide by zero encountered in log
total time taken
13718.4701881

In [16]:
starttime = time.time()
bins=bayesian_blocks(hmqdat['Z'])
endtime = time.time()
tottime  = endtime-starttime
print "total time taken"
print tottime


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-99f702f956ac> in <module>()
----> 1 edges

NameError: name 'edges' is not defined

In [ ]:


In [6]:
import pylab as pl
pl.hist(hmqdat['Z'], normed=True)


Out[6]:
(array([  2.65548881e-01,   2.95950725e-01,   2.92154642e-01,
          4.18603198e-01,   1.13929500e-01,   2.14963929e-02,
          2.78692785e-03,   3.48365981e-04,   1.96301466e-04,
          1.93536656e-05]),
 array([ -2.00000000e-03,   7.06700000e-01,   1.41540000e+00,
          2.12410000e+00,   2.83280000e+00,   3.54150000e+00,
          4.25020000e+00,   4.95890000e+00,   5.66760000e+00,
          6.37630000e+00,   7.08500000e+00]),
 <a list of 10 Patch objects>)

In [7]:
help(pl.hist)


Help on function hist in module matplotlib.pyplot:

hist(x, bins=None, range=None, normed=False, weights=None, cumulative=False, bottom=None, histtype=u'bar', align=u'mid', orientation=u'vertical', rwidth=None, log=False, color=None, label=None, stacked=False, hold=None, data=None, **kwargs)
    Plot a histogram.
    
    Compute and draw the histogram of *x*. The return value is a
    tuple (*n*, *bins*, *patches*) or ([*n0*, *n1*, ...], *bins*,
    [*patches0*, *patches1*,...]) if the input contains multiple
    data.
    
    Multiple data can be provided via *x* as a list of datasets
    of potentially different length ([*x0*, *x1*, ...]), or as
    a 2-D ndarray in which each column is a dataset.  Note that
    the ndarray form is transposed relative to the list form.
    
    Masked arrays are not supported at present.
    
    Parameters
    ----------
    x : (n,) array or sequence of (n,) arrays
        Input values, this takes either a single array or a sequency of
        arrays which are not required to be of the same length
    
    bins : integer or array_like or 'auto', optional
        If an integer is given, `bins + 1` bin edges are returned,
        consistently with :func:`numpy.histogram` for numpy version >=
        1.3.
    
        Unequally spaced bins are supported if `bins` is a sequence.
    
        If Numpy 1.11 is installed, may also be ``'auto'``.
    
        Default is taken from the rcParam ``hist.bins``.
    
    range : tuple or None, optional
        The lower and upper range of the bins. Lower and upper outliers
        are ignored. If not provided, `range` is (x.min(), x.max()). Range
        has no effect if `bins` is a sequence.
    
        If `bins` is a sequence or `range` is specified, autoscaling
        is based on the specified bin range instead of the
        range of x.
    
        Default is ``None``
    
    normed : boolean, optional
        If `True`, the first element of the return tuple will
        be the counts normalized to form a probability density, i.e.,
        ``n/(len(x)`dbin)``, i.e., the integral of the histogram will sum
        to 1. If *stacked* is also *True*, the sum of the histograms is
        normalized to 1.
    
        Default is ``False``
    
    weights : (n, ) array_like or None, optional
        An array of weights, of the same shape as `x`.  Each value in `x`
        only contributes its associated weight towards the bin count
        (instead of 1).  If `normed` is True, the weights are normalized,
        so that the integral of the density over the range remains 1.
    
        Default is ``None``
    
    cumulative : boolean, optional
        If `True`, then a histogram is computed where each bin gives the
        counts in that bin plus all bins for smaller values. The last bin
        gives the total number of datapoints.  If `normed` is also `True`
        then the histogram is normalized such that the last bin equals 1.
        If `cumulative` evaluates to less than 0 (e.g., -1), the direction
        of accumulation is reversed.  In this case, if `normed` is also
        `True`, then the histogram is normalized such that the first bin
        equals 1.
    
        Default is ``False``
    
    bottom : array_like, scalar, or None
        Location of the bottom baseline of each bin.  If a scalar,
        the base line for each bin is shifted by the same amount.
        If an array, each bin is shifted independently and the length
        of bottom must match the number of bins.  If None, defaults to 0.
    
        Default is ``None``
    
    histtype : {'bar', 'barstacked', 'step',  'stepfilled'}, optional
        The type of histogram to draw.
    
        - 'bar' is a traditional bar-type histogram.  If multiple data
          are given the bars are aranged side by side.
    
        - 'barstacked' is a bar-type histogram where multiple
          data are stacked on top of each other.
    
        - 'step' generates a lineplot that is by default
          unfilled.
    
        - 'stepfilled' generates a lineplot that is by default
          filled.
    
        Default is 'bar'
    
    align : {'left', 'mid', 'right'}, optional
        Controls how the histogram is plotted.
    
            - 'left': bars are centered on the left bin edges.
    
            - 'mid': bars are centered between the bin edges.
    
            - 'right': bars are centered on the right bin edges.
    
        Default is 'mid'
    
    orientation : {'horizontal', 'vertical'}, optional
        If 'horizontal', `~matplotlib.pyplot.barh` will be used for
        bar-type histograms and the *bottom* kwarg will be the left edges.
    
    rwidth : scalar or None, optional
        The relative width of the bars as a fraction of the bin width.  If
        `None`, automatically compute the width.
    
        Ignored if `histtype` is 'step' or 'stepfilled'.
    
        Default is ``None``
    
    log : boolean, optional
        If `True`, the histogram axis will be set to a log scale. If `log`
        is `True` and `x` is a 1D array, empty bins will be filtered out
        and only the non-empty (`n`, `bins`, `patches`) will be returned.
    
        Default is ``False``
    
    color : color or array_like of colors or None, optional
        Color spec or sequence of color specs, one per dataset.  Default
        (`None`) uses the standard line color sequence.
    
        Default is ``None``
    
    label : string or None, optional
        String, or sequence of strings to match multiple datasets.  Bar
        charts yield multiple patches per dataset, but only the first gets
        the label, so that the legend command will work as expected.
    
        default is ``None``
    
    stacked : boolean, optional
        If `True`, multiple data are stacked on top of each other If
        `False` multiple data are aranged side by side if histtype is
        'bar' or on top of each other if histtype is 'step'
    
        Default is ``False``
    
    Returns
    -------
    n : array or list of arrays
        The values of the histogram bins. See **normed** and **weights**
        for a description of the possible semantics. If input **x** is an
        array, then this is an array of length **nbins**. If input is a
        sequence arrays ``[data1, data2,..]``, then this is a list of
        arrays with the values of the histograms for each of the arrays
        in the same order.
    
    bins : array
        The edges of the bins. Length nbins + 1 (nbins left edges and right
        edge of last bin).  Always a single array even when multiple data
        sets are passed in.
    
    patches : list or list of lists
        Silent list of individual patches used to create the histogram
        or list of such list if multiple input datasets.
    
    Other Parameters
    ----------------
    kwargs : `~matplotlib.patches.Patch` properties
    
    See also
    --------
    hist2d : 2D histograms
    
    Notes
    -----
    Until numpy release 1.5, the underlying numpy histogram function was
    incorrect with `normed`=`True` if bin sizes were unequal.  MPL
    inherited that error.  It is now corrected within MPL when using
    earlier numpy versions.
    
    Examples
    --------
    .. plot:: mpl_examples/statistics/histogram_demo_features.py
    
    .. note::
        In addition to the above described arguments, this function can take a
        **data** keyword argument. If such a **data** argument is given, the
        following arguments are replaced by **data[<arg>]**:
    
        * All arguments with the following names: 'weights', 'x'.


In [8]:
pl.hist(hmqdat['Z'])


Out[8]:
(array([  9.60460000e+04,   1.07042000e+05,   1.05669000e+05,
          1.51404000e+05,   4.12070000e+04,   7.77500000e+03,
          1.00800000e+03,   1.26000000e+02,   7.10000000e+01,
          7.00000000e+00]),
 array([ -2.00000000e-03,   7.06700000e-01,   1.41540000e+00,
          2.12410000e+00,   2.83280000e+00,   3.54150000e+00,
          4.25020000e+00,   4.95890000e+00,   5.66760000e+00,
          6.37630000e+00,   7.08500000e+00]),
 <a list of 10 Patch objects>)

In [10]:
pl.hist(hmqdat['Z'],bins=20)


Out[10]:
(array([  4.74070000e+04,   4.86390000e+04,   5.94300000e+04,
          4.76120000e+04,   5.42920000e+04,   5.13770000e+04,
          9.76030000e+04,   5.38010000e+04,   2.89230000e+04,
          1.22840000e+04,   5.75100000e+03,   2.02400000e+03,
          6.50000000e+02,   3.58000000e+02,   1.12000000e+02,
          1.40000000e+01,   4.40000000e+01,   2.70000000e+01,
          4.00000000e+00,   3.00000000e+00]),
 array([ -2.00000000e-03,   3.52350000e-01,   7.06700000e-01,
          1.06105000e+00,   1.41540000e+00,   1.76975000e+00,
          2.12410000e+00,   2.47845000e+00,   2.83280000e+00,
          3.18715000e+00,   3.54150000e+00,   3.89585000e+00,
          4.25020000e+00,   4.60455000e+00,   4.95890000e+00,
          5.31325000e+00,   5.66760000e+00,   6.02195000e+00,
          6.37630000e+00,   6.73065000e+00,   7.08500000e+00]),
 <a list of 20 Patch objects>)

In [12]:
plt.hist(hmqdat['Z'])


Out[12]:
(array([  9.60460000e+04,   1.07042000e+05,   1.05669000e+05,
          1.51404000e+05,   4.12070000e+04,   7.77500000e+03,
          1.00800000e+03,   1.26000000e+02,   7.10000000e+01,
          7.00000000e+00]),
 array([ -2.00000000e-03,   7.06700000e-01,   1.41540000e+00,
          2.12410000e+00,   2.83280000e+00,   3.54150000e+00,
          4.25020000e+00,   4.95890000e+00,   5.66760000e+00,
          6.37630000e+00,   7.08500000e+00]),
 <a list of 10 Patch objects>)

In [ ]:
pl.hist(hmqdat['Z'],bins=20)