Stephen Bailey, Spring 2015
Note: Our pipeline is designed around each step writing a file and having the next step read that file to keep going. At the same time, we would like to reserve the possibility of skipping the read by directly using the results that are already in memory. The loss of precision from using float in the I/O instead of double is readily reproducible in memory; I don't know of a viable way to mimic the fpack loss of precision without actually writing the file.
In [2]:
#- Loss of precision
#- Baseline: how much precision is lost simply by storing float64 data
#- as float32, independent of any FITS compression. df_precision is the
#- yardstick the later compression scans are compared against.
import numpy as np
dd = np.random.normal(scale=1, loc=1, size=(500, 4000))
df = dd.astype(np.float32)
df_precision = np.std(dd-df)
print df_precision
In [3]:
#- compress a dataset; return compression factor and loss of precision
import os
from astropy.io import fits
def testpack(data, compression_type='RICE_1', quantize_level=16.0):
    """Tile-compress *data* into a scratch FITS file and measure the cost.

    Args:
        data: 2D numpy array to compress.
        compression_type: FITS tile-compression algorithm (e.g. 'RICE_1',
            'GZIP_1').
        quantize_level: fpack quantization level; larger values quantize
            more finely (less loss, less compression).

    Returns:
        (compression, precision): the compression factor
        (uncompressed size / on-disk size) and std(data - decompressed).
    """
    #- NOTE: clobber is a writeto() option, not a CompImageHDU parameter;
    #- the spurious constructor kwarg was removed.
    hdu = fits.CompImageHDU(data,
        compression_type=compression_type, quantize_level=quantize_level)
    hdu.writeto('blat.fits', clobber=True)
    dx = fits.getdata('blat.fits')
    #- On-disk size in MB
    mbx = os.stat('blat.fits').st_size / 1024.**2
    #- Uncompressed size in MB divided by compressed size in MB
    compression = (data.size*data.dtype.itemsize/1024.**2) / mbx
    return compression, np.std(data-dx)
In [7]:
#- Compare GZIP_1 at the default quantization (q=16) against an
#- effectively lossless quantization level (q=10**9) on the random data.
print 'quantize=16', testpack(dd, 'GZIP_1', 16)
print 'quantize=10**9', testpack(dd, 'GZIP_1', 10**9)
In [6]:
#- Also test with DESI spectral data instead of pure random data
#- (hardwired filepath, not portable)
infile = '/data/desi/spectro/redux/sjb/dogwood/exposures/20150211/00000002/cframe-r0-00000002.fits'
spectra = fits.getdata(infile, 'FLUX')  # read the FLUX HDU of a calibrated frame
#- Same default-vs-lossless comparison as above, on real spectra
print 'quantize=16', testpack(spectra, 'GZIP_1', 16)
print 'quantize=10**9', testpack(spectra, 'GZIP_1', 10**9)
In [32]:
#- Scan through the quantization parameter
#- Return arrays of q, compression_factor, loss_of_precision
def scanq(data, compression_type='GZIP_1'):
    """Run testpack over a grid of quantization levels.

    Args:
        data: 2D numpy array to compress.
        compression_type: FITS tile-compression algorithm name.

    Returns:
        (qx, compression, precision) as numpy arrays: the quantization
        levels scanned, the compression factor at each level, and
        std(original - decompressed) at each level.
    """
    #- Quantization from very fine (1e9, effectively lossless) down to
    #- coarse, in half-decade steps.  (The dead `results = list()`
    #- pre-assignment was removed; the comprehension builds it directly.)
    qx = 10**np.arange(9, 4, -0.5)
    results = [testpack(data, compression_type=compression_type, quantize_level=q)
               for q in qx]
    compression, precision = zip(*results)
    return np.asarray(qx), np.asarray(compression), np.asarray(precision)
In [44]:
%pylab inline
def plotscan(data, comptype='GZIP_1', extra_noise=0.0, pre_compression=1.0):
    """Scan quantization levels for *data* and plot compression vs. precision.

    Args:
        data: 2D numpy array to compress.
        comptype: FITS tile-compression algorithm name.
        extra_noise: additional error (e.g. a prior float32 cast) folded
            into the measured precision in quadrature.
        pre_compression: factor by which the input was already shrunk
            (e.g. 2.0 for double -> float), applied to the x axis.

    Returns:
        (qx, compression, precision) arrays from the scan, with
        precision combined with extra_noise.
    """
    qx, compression, precision = scanq(data, comptype)
    #- Combine the scan's loss with any pre-existing error in quadrature
    precision = np.sqrt(precision**2 + extra_noise**2)
    total_compression = compression * pre_compression
    plot(total_compression, np.log10(precision), 'b.-')
    xlim(1, 1.2 * np.max(total_compression))
    xlabel('compression factor')
    ylabel('log10(std(delta))')
    return qx, compression, precision
In [45]:
#- Scan GZIP_1 on the random double-precision data; the red square marks
#- the (compression, precision) point of a plain float32 cast
#- (compression factor 2, precision df_precision).
qx, compression, precision = plotscan(dd, 'GZIP_1')
plot([2,], [np.log10(df_precision),], 'rs')
Out[45]:
In [46]:
#- Summarize: best compression achieved with zero loss, and how quantized
#- double compares to a plain float32 cast at matched compression (2x)
#- and at matched precision.
print 'Lossless compression', np.max(compression[precision==0.0])
px = np.std(dd-df)  # float32 cast error (same quantity as df_precision)
print 'Precision improvement vs. float', px / np.interp(2, compression, precision)
print 'Compression improvement vs. float', np.interp(px, precision, compression)/2.0
In [47]:
#- comparing compression of double vs. float
#- For the float32 input, fold in the cast error (extra_noise) and credit
#- the 2x size reduction already achieved (pre_compression) so the two
#- curves share a common x axis.
qx, compression, precision = plotscan(dd, 'GZIP_1')
qx, compression, precision = plotscan(df, 'GZIP_1', extra_noise=df_precision, pre_compression=2.0)