Michelle's Data (HeCS_pared_small.npy)


In [11]:
import numpy as np
dat = np.load('./data/HeCS_pared_small.npy')

# for n in range(len(dat)):
#     print('')
#     Ngal = dat['Ngal'][n]
#     print('cluster index:', n)
#     print('velocity standard deviation:', np.std(dat['vlos'][n][0:Ngal])) 
#     print ('cluster M500 from SZ:', dat['MSZ'][n])
#     print ('cluster M200c:', dat['Mtot'][n])
#     print ('  ')

In [2]:
import pandas as pd

datasum = pd.DataFrame(columns = ['velsigma','M500_SZ','M200c'], index = range(len(dat)))

for n in range(len(dat)):
    Ngal = dat['Ngal'][n]
    
    velsigma = np.std(dat['vlos'][n][0:Ngal])
    m500 = dat['MSZ'][n]
    m200c = dat['Mtot'][n]
    
    datasum.loc[n] = pd.Series({'velsigma':velsigma ,
                                'M500_SZ' : m500 ,
                                'M200c' : m200c })

In [3]:
## PURE POWER LAW
alpha = 0.382
sig15 = 1244

## CONTAMINATED POWER LAW
# alpha = 0.359
# sig15 = 753

M_pow = (datasum['velsigma']/sig15)**(1/alpha) * 1.e15
M_pow.name = 'M_powlaw'

ep_m200c = (M_pow - datasum['M200c'])/datasum['M200c']
ep_m200c.name = 'ep_M200c'

datacalc = pd.concat([M_pow,ep_m200c], axis=1)

In [4]:
datares = datasum.join(datacalc)

datares.head(15)


Out[4]:
velsigma M500_SZ M200c M_powlaw ep_M200c
0 843.07 2.324e+14 2.77e+14 3.61164e+14 0.30384
1 747.848 2.681e+14 2.92e+14 2.63905e+14 -0.0962168
2 426.237 1.141e+14 1.19e+14 6.0573e+13 -0.490983
3 753.334 3.43e+14 2.5e+14 2.69003e+14 0.0760117
4 690.516 2.338e+14 1.77e+14 2.14175e+14 0.210027
5 485.47 1.309e+14 2.02e+14 8.51558e+13 -0.578437
6 670.823 1.26e+14 1.34e+14 1.98552e+14 0.48173
7 706.867 2.8e+14 2.26e+14 2.27706e+14 0.00754944
8 750.748 3.437e+14 2.04e+14 2.66592e+14 0.306824
9 781.662 3.409e+14 4.04e+14 2.96295e+14 -0.266597
10 751.004 8.036e+14 4.42e+14 2.6683e+14 -0.396312
11 822.919 3.878e+14 1.47e+14 3.39e+14 1.30612
12 823.836 4.956e+14 7.84e+14 3.39991e+14 -0.566338
13 674.572 2.842e+14 3.52e+14 2.01469e+14 -0.427644
14 697.52 2.555e+14 2.37e+14 2.19908e+14 -0.0721177

In [5]:
import matplotlib as mpl
% matplotlib inline
mpl.rcParams['figure.figsize'] = [10.,10.]

plt = mpl.pyplot

In [6]:
def plothist(X,Y, n=10):
    increment = (X.max() - X.min())/n
    
    i = X.min()
    
    y = []
    err = []
    while i < X.max():
        y.append(Y[(X>=i) & (X<(i+increment))].mean())
        err.append(Y[(X>=i) & (X<(i+increment))].std())
        i += increment
        
    plt.errorbar(np.array(range(n))*increment + X.min() + increment/2.,
                 y,
                 err,
                )

In [13]:
def binnedplot(X,Y, n=10, percentiles = [15,50,85], mean=True, label='', ax = None):
    if n=='auto':
        n = len(X)/1000

    increment = (X.max() - X.min())/n
    i = X.min()

    x = []
    y = []
    
    y_percent = np.ndarray((0,len(percentiles)))
    
    err = []
    while i < X.max():
        bindata = Y[(X>=i) & (X<(i+increment))]
        
        i+=increment
        
        if len(bindata) == 0: continue
            
        x.append(i-increment/2)
        
        y.append(bindata.mean())
        
        y_p = np.percentile(bindata,percentiles)
        y_percent = np.append(y_percent, [y_p],axis=0)
        
        err.append(bindata.std())

        
    if mean: 
        if ax is None: plt.plot(x,y, label=label+'mean')
        else: ax.plot(x,y, label=label+'mean')
    y_percent = np.swapaxes(y_percent,0,1)
    
    for i in range(len(percentiles)):
        if ax is None:
            plt.plot(x, y_percent[i],label=label+str(percentiles[i]))
        else:
            ax.plot(x, y_percent[i],label=label+str(percentiles[i]))

    return x,y,err

In [14]:
X = np.log10(datares['M200c'].astype('float64'))
Y = np.log10(datares['M_powlaw'].astype('float64'))
plt.plot(X,Y,'.')
plt.xlabel('log[ M200c ]', fontsize=20)
plt.ylabel('log[ M_powlaw ]', fontsize=20)

binnedplot(X,Y,10)


Out[14]:
([14.001929604044905,
  14.116823507616388,
  14.231717411187871,
  14.346611314759354,
  14.461505218330837,
  14.57639912190232,
  14.691293025473803,
  14.806186929045285,
  14.921080832616768,
  15.035974736188251,
  15.150868639759734],
 [14.251177725958025,
  14.237763990807203,
  14.276589028835748,
  14.317937618763743,
  14.391871293539257,
  14.383437689304913,
  14.444614068503489,
  14.388299024254884,
  14.492359610334043,
  14.626183919913663,
  14.453092532338976],
 [0.12141285380427878,
  0.24940541673168148,
  0.12590306356783526,
  0.18835067105027006,
  0.1042625826734961,
  0.12728516592376263,
  0.17097077599955457,
  0.059898104615205443,
  0.081427698469216339,
  nan,
  nan])

In [8]:
X = np.log10(datares['M200c'].astype('float64'))
Y = (datares['ep_M200c'].astype('float64'))
plt.plot(X,Y,'.')
plt.xlabel('log[ M200c ]', fontsize=20)
plt.ylabel(r'$\epsilon_{M200c}$', fontsize=20)

plothist(X,Y,10)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-08400494df90> in <module>()
      5 plt.ylabel(r'$\epsilon_{M200c}$', fontsize=20)
      6 
----> 7 plothist(X,Y,10)

<ipython-input-6-0dd9c3dd2ef2> in plothist(X, Y, n)
     13     plt.errorbar(np.array(range(n))*increment + X.min() + increment/2.,
     14                  y,
---> 15                  err,
     16                 )

/usr/lib64/python3.4/site-packages/matplotlib/pyplot.py in errorbar(x, y, yerr, xerr, fmt, ecolor, elinewidth, capsize, barsabove, lolims, uplims, xlolims, xuplims, errorevery, capthick, hold, data, **kwargs)
   2850                           xlolims=xlolims, xuplims=xuplims,
   2851                           errorevery=errorevery, capthick=capthick, data=data,
-> 2852                           **kwargs)
   2853     finally:
   2854         ax._hold = washold

/usr/lib64/python3.4/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1708                     warnings.warn(msg % (label_namer, func.__name__),
   1709                                   RuntimeWarning, stacklevel=2)
-> 1710             return func(ax, *args, **kwargs)
   1711         pre_doc = inner.__doc__
   1712         if pre_doc is None:

/usr/lib64/python3.4/site-packages/matplotlib/axes/_axes.py in errorbar(self, x, y, yerr, xerr, fmt, ecolor, elinewidth, capsize, barsabove, lolims, uplims, xlolims, xuplims, errorevery, capthick, **kwargs)
   2963         if plot_line:
   2964             data_line = mlines.Line2D(x, y, **plot_line_style)
-> 2965             self.add_line(data_line)
   2966 
   2967         barcols = []

/usr/lib64/python3.4/site-packages/matplotlib/axes/_base.py in add_line(self, line)
   1757             line.set_clip_path(self.patch)
   1758 
-> 1759         self._update_line_limits(line)
   1760         if not line.get_label():
   1761             line.set_label('_line%d' % len(self.lines))

/usr/lib64/python3.4/site-packages/matplotlib/axes/_base.py in _update_line_limits(self, line)
   1779         Figures out the data limit of the given line, updating self.dataLim.
   1780         """
-> 1781         path = line.get_path()
   1782         if path.vertices.size == 0:
   1783             return

/usr/lib64/python3.4/site-packages/matplotlib/lines.py in get_path(self)
    949         """
    950         if self._invalidy or self._invalidx:
--> 951             self.recache()
    952         return self._path
    953 

/usr/lib64/python3.4/site-packages/matplotlib/lines.py in recache(self, always)
    659             y = self._y
    660 
--> 661         self._xy = np.column_stack(np.broadcast_arrays(x, y)).astype(float)
    662         self._x, self._y = self._xy.T  # views
    663 

/usr/lib64/python3.4/site-packages/numpy/lib/stride_tricks.py in broadcast_arrays(*args, **kwargs)
    247     args = [np.array(_m, copy=False, subok=subok) for _m in args]
    248 
--> 249     shape = _broadcast_shape(*args)
    250 
    251     if all(array.shape == shape for array in args):

/usr/lib64/python3.4/site-packages/numpy/lib/stride_tricks.py in _broadcast_shape(*args)
    182     # use the old-iterator because np.nditer does not handle size 0 arrays
    183     # consistently
--> 184     b = np.broadcast(*args[:32])
    185     # unfortunately, it cannot handle 32 or more arguments directly
    186     for pos in range(32, len(args), 31):

ValueError: shape mismatch: objects cannot be broadcast to a single shape

In [9]:
n = 20

Ngal = dat['Ngal'][n]
vlos = dat['vlos'][n][0:Ngal]

h = plt.hist(np.absolute(vlos), 12)
plt.xlabel(r'$v_{los}$', fontsize=20)
plt.ylabel(r'$dN/dv$', fontsize=20)

print('cluster index:', n)
print('velocity standard deviation:', np.std(dat['vlos'][n][0:Ngal])) 
print ('cluster M500 from SZ:', dat['MSZ'][n])
print ('cluster M200c:', dat['Mtot'][n])


cluster index: 20
velocity standard deviation: 690.939
cluster M500 from SZ: 2.912e+14
cluster M200c: 4.13e+14

Using HeCS to train SDM


In [10]:
type(dat)

for n in range(len(dat)):
    Ngal = dat['Ngal'][n]
    
    velsigma = np.std(dat['vlos'][n][0:Ngal])
    m500 = dat['MSZ'][n]
    m200c = dat['Mtot'][n]
    
    datasum.loc[n] = pd.Series({'velsigma':velsigma ,
                                'M500_SZ' : m500 ,
                                'M200c' : m200c })

In [11]:
dat = np.load('./data/HeCS_pared_small.npy')

featuresList = []
massList = []

numHalos = len(dat)

#loop through all of the halos
for h in range(numHalos):
        numSubs = dat['Ngal'][h]
        
        #make a numpy array that will hold, in this case, 3 features for each of the numSubs subhalos.  Be aware that SDM requires at least 2 dimensions, so if you want to use line of sight velocity, you'll have to fill in a dummy value for the 2nd dimension.  I used an array of 1.0's
        subfeat = np.zeros((numSubs, 2), 'f')
        
        #loop through the subhalos/galaxies in this halo:
        for sh in range (0, numSubs):
            #and fill in each of the features
            subfeat[sh][0] = np.abs(dat['vlos'][h][sh])
            subfeat[sh][1] = 1.0

        #append the numpy array of features to the featuresList
        featuresList.append(subfeat)
        #and the mass to the massList
        massList.append(dat['Mtot'])

#turn this into a features object:
# features = sdm.Features(featuresList, mass=massList)

#model.crossvalidate is the workhorse.  It will do the 10-fold crossvalidation to predict answers for all of your data.  It outputs root mean squared error (rmse) and mass predictions:
# rmse, predictions = model.crossvalidate(trainData, trainData.mass)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-11-4ab7e5b79063> in <module>()
     16         for sh in range (0, numSubs):
     17             #and fill in each of the features
---> 18             subfeat[sh][0] = np.abs(dat['vlos'][h][sh])
     19             subfeat[sh][1] = 1.0
     20 

/usr/lib/python2.7/site-packages/numpy-1.10.1-py2.7-linux-x86_64.egg/numpy/core/_internal.pyc in _index_fields(ary, names)
    315     if isinstance(names, basestring):
    316         try:
--> 317             return ary.getfield(dt.fields[names][0], dt.fields[names][1])
    318         except KeyError:
    319             raise ValueError("no field of name %s" % names)

/usr/lib/python2.7/site-packages/numpy-1.10.1-py2.7-linux-x86_64.egg/numpy/core/_internal.pyc in _getfield_is_safe(oldtype, newtype, offset)
    445     """
    446     new_fields = _get_all_field_offsets(newtype, offset)
--> 447     old_fields = _get_all_field_offsets(oldtype)
    448     # raises if there is a problem
    449     _check_field_overlap(new_fields, old_fields)

/usr/lib/python2.7/site-packages/numpy-1.10.1-py2.7-linux-x86_64.egg/numpy/core/_internal.pyc in _get_all_field_offsets(dtype, base_offset)
    357             sub_dtype = dtype.fields[name][0]
    358             sub_offset = dtype.fields[name][1] + base_offset
--> 359             fields.extend(_get_all_field_offsets(sub_dtype, sub_offset))
    360     else:
    361         if dtype.shape:

/usr/lib/python2.7/site-packages/numpy-1.10.1-py2.7-linux-x86_64.egg/numpy/core/_internal.pyc in _get_all_field_offsets(dtype, base_offset)
    365                 count *= dim
    366             fields.extend((typ, off + dtype.base.itemsize*j)
--> 367                            for j in range(count) for (typ, off) in sub_offsets)
    368         else:
    369             fields.append((dtype, base_offset))

/usr/lib/python2.7/site-packages/numpy-1.10.1-py2.7-linux-x86_64.egg/numpy/core/_internal.pyc in <genexpr>((j,))
    365                 count *= dim
    366             fields.extend((typ, off + dtype.base.itemsize*j)
--> 367                            for j in range(count) for (typ, off) in sub_offsets)
    368         else:
    369             fields.append((dtype, base_offset))

KeyboardInterrupt: 

In [19]:
tempres = np.load('./tempresults.npy', encoding='latin1').item()
tempres.keys()


Out[19]:
['feat', 'mass', 'predictions', 'rmse']

In [20]:
X = np.log10(tempres['mass']).astype('float64')
Y = np.log10(tempres['predictions']).astype('float64')
plt.plot(X,Y,'.')
plt.xlabel('log[ M200c ]', fontsize=20)
plt.ylabel('log[ M_SDM ]', fontsize=20)

plothist(X,Y,10)



In [14]:
X.astype


Out[14]:
<function astype>

In [4]:
x = np.array([4,5,6,67])
x = np.append(x,124)

x[x<15]


Out[4]:
array([4, 5, 6])

In [ ]:

Minghan's Data (coma.p)


In [10]:
comadat = np.load('./data/coma.p')[0]


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
/usr/lib64/python3.4/site-packages/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
    425             try:
--> 426                 return pickle.load(fid, **pickle_kwargs)
    427             except:

UnicodeDecodeError: 'ascii' codec can't decode byte 0x80 in position 6: ordinal not in range(128)

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
<ipython-input-10-eb4a20dbf1e8> in <module>()
----> 1 comadat = np.load('./data/coma.p')[0]

/usr/lib64/python3.4/site-packages/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
    427             except:
    428                 raise IOError(
--> 429                     "Failed to interpret file %s as a pickle" % repr(file))
    430     finally:
    431         if own_fid:

OSError: Failed to interpret file './data/coma.p' as a pickle

In [ ]:
vlos = []
for i in comadat:
    vlos.append(i[0])
    
h = plt.hist(vlos, 15)
plt.xlabel(r'$v_{los}$', fontsize=20)
plt.ylabel(r'$dN/dv$', fontsize=20)

velsigma_coma = np.std(vlos)
M_pow_coma = (velsigma_coma/sig15)**(1/alpha) * 1.e15

print ('cluster: COMA')
print ('velocity standard deviation:', np.std(vlos))
print ('mass from power law:', "%.4g" % M_pow_coma)

In [ ]: