Getting data into Python

Outline:

  • ASCII Files: numpy.loadtxt, astropy.io.ascii, pandas.read_csv
  • Reading/Writing FITS files: astropy.io.fits, fitsio
  • IDL .sav files: scipy.io.readsav
  • Pandas

CSV data


In [1]:
import os
import numpy as np
import requests

In [2]:
# Get some CSV data from the SDSS SQL server.
# The result is cached in galaxy_colors.csv, so the query is only sent when
# that file does not exist yet.
URL = "http://skyserver.sdss.org/dr12/en/tools/search/x_sql.aspx"

cmd = """
SELECT TOP 1000
    p.u, p.g, p.r, p.i, p.z, s.class, s.z, s.zerr
FROM
    PhotoObj AS p
JOIN
    SpecObj AS s ON s.bestobjid = p.objid
WHERE
    p.u BETWEEN 0 AND 19.6 AND
    p.g BETWEEN 0 AND 20 AND
    s.class = 'GALAXY'
"""
if not os.path.exists('galaxy_colors.csv'):
    # Collapse the multi-line query onto a single line before sending it.
    cmd = ' '.join(line.strip() for line in cmd.split('\n'))
    # A timeout keeps the cell from hanging forever if the server is unresponsive.
    response = requests.get(URL, params={'cmd': cmd, 'format': 'csv'}, timeout=60)
    # Fail loudly on an HTTP error: otherwise the error page would be written
    # to galaxy_colors.csv and, because of the existence check above, never
    # be replaced by real data on later runs.
    response.raise_for_status()
    with open('galaxy_colors.csv', 'w') as f:
        f.write(response.text)

In [3]:
# Confirm that the CSV file exists and show its size
!ls -lh galaxy_colors.csv


-rw-rw-r-- 1 kyle kyle 74K Sep 24 15:54 galaxy_colors.csv

In [4]:
!more galaxy_colors.csv


#Table1
u,g,r,i,z,class,z1,zerr
19.41061,18.23754,17.58132,17.20153,16.90159,GALAXY,0.03212454,6.06623E-06
19.54964,17.95799,17.02898,16.531,16.13408,GALAXY,0.1213151,2.358919E-05
18.74425,17.37778,16.80538,16.51149,16.28756,GALAXY,0.04876465,1.378529E-05
17.55033,15.75007,15.02809,14.66306,14.34982,GALAXY,0.04028672,1.167005E-05
17.60645,16.16628,15.51308,15.15529,14.87411,GALAXY,0.0254747,1.205017E-05
19.46927,18.18101,17.59062,17.25874,16.94567,GALAXY,0.03616738,8.249292E-06
19.58999,18.23981,17.54194,17.17573,16.92423,GALAXY,0.07254888,1.603681E-05
18.52309,16.65203,15.9179,15.47603,15.16455,GALAXY,0.06675781,1.785021E-05
18.7319,17.42271,16.80514,16.47006,16.18039,GALAXY,0.03646222,1.014089E-05
19.51618,18.32554,17.63795,17.25495,17.02355,GALAXY,0.1380212,9.179801E-06
18.97668,17.5344,17.04177,16.80851,16.6608,GALAXY,0.03877712,1.343119E-05
18.28252,16.52093,15.60923,15.08733,14.62764,GALAXY,0.0406868,9.147252E-06
18.79809,17.15676,16.46291,16.05119,15.75717,GALAXY,0.03758542,1.210123E-05
19.20826,18.0934,17.64492,17.31631,17.1272,GALAXY,0.07229859,6.830511E-06
19.44436,17.63904,16.6559,16.16875,15.77238,GALAXY,0.1004212,1.922567E-05
18.52571,16.41782,15.39036,14.88511,14.47884,GALAXY,0.03984664,1.687375E-05
17.85144,15.88613,14.90528,14.39902,14.00488,GALAXY,0.0393708,2.054666E-05
18.35856,16.24755,15.2596,14.76577,14.37919,GALAXY,0.035788,1.524728E-05
19.29955,17.97023,17.24436,16.77523,16.4432,GALAXY,0.1091017,1.079899E-05
19.41176,18.22642,17.49589,17.06814,16.81949,GALAXY,0.1641361,8.652342E-06
17.9766,16.00926,15.0054,14.50633,14.12062,GALAXY,0.0371233,1.478782E-05

Using numpy.loadtxt


In [5]:
# Field layout of the structured array: the u, g, r, i, z columns are 8-byte
# floats, 'class' is a fixed-width (10-byte) byte string, and the last two
# columns are stored under the names 'redshift' and 'redshift_err'.
dtype = [(band, 'f8') for band in ('u', 'g', 'r', 'i', 'z')]
dtype += [('class', 'S10'), ('redshift', 'f8'), ('redshift_err', 'f8')]

# The first two lines of the file ("#Table1" and the column names) are skipped.
data = np.loadtxt(
    'galaxy_colors.csv',
    dtype=dtype,
    delimiter=',',
    skiprows=2,
)

In [6]:
data[:10]


Out[6]:
array([ (19.41061, 18.23754, 17.58132, 17.20153, 16.90159, b'GALAXY', 0.03212454, 6.06623e-06),
       (19.54964, 17.95799, 17.02898, 16.531, 16.13408, b'GALAXY', 0.1213151, 2.358919e-05),
       (18.74425, 17.37778, 16.80538, 16.51149, 16.28756, b'GALAXY', 0.04876465, 1.378529e-05),
       (17.55033, 15.75007, 15.02809, 14.66306, 14.34982, b'GALAXY', 0.04028672, 1.167005e-05),
       (17.60645, 16.16628, 15.51308, 15.15529, 14.87411, b'GALAXY', 0.0254747, 1.205017e-05),
       (19.46927, 18.18101, 17.59062, 17.25874, 16.94567, b'GALAXY', 0.03616738, 8.249292e-06),
       (19.58999, 18.23981, 17.54194, 17.17573, 16.92423, b'GALAXY', 0.07254888, 1.603681e-05),
       (18.52309, 16.65203, 15.9179, 15.47603, 15.16455, b'GALAXY', 0.06675781, 1.785021e-05),
       (18.7319, 17.42271, 16.80514, 16.47006, 16.18039, b'GALAXY', 0.03646222, 1.014089e-05),
       (19.51618, 18.32554, 17.63795, 17.25495, 17.02355, b'GALAXY', 0.1380212, 9.179801e-06)], 
      dtype=[('u', '<f8'), ('g', '<f8'), ('r', '<f8'), ('i', '<f8'), ('z', '<f8'), ('class', 'S10'), ('redshift', '<f8'), ('redshift_err', '<f8')])

Using astropy.io.ascii


In [8]:
from astropy.io import ascii


/home/kyle/.conda/lib/python3.4/site-packages/IPython/kernel/__init__.py:13: ShimWarning: The `IPython.kernel` package has been deprecated. You should import from ipykernel or jupyter_client instead.
  "You should import from ipykernel or jupyter_client instead.", ShimWarning)

In [9]:
# Read the CSV file with astropy. comment='#' is passed so that the leading
# "#Table1" line of the file is treated as a comment, not as the header row.
data = ascii.read('galaxy_colors.csv', format='csv', comment='#')

In [10]:
type(data)


Out[10]:
astropy.table.table.Table

In [11]:
data[:10]


Out[11]:
<Table masked=False length=10>
ugrizclassz1zerr
float64float64float64float64float64str192float64float64
19.4106118.2375417.5813217.2015316.90159GALAXY0.032124546.06623e-06
19.5496417.9579917.0289816.53116.13408GALAXY0.12131512.358919e-05
18.7442517.3777816.8053816.5114916.28756GALAXY0.048764651.378529e-05
17.5503315.7500715.0280914.6630614.34982GALAXY0.040286721.167005e-05
17.6064516.1662815.5130815.1552914.87411GALAXY0.02547471.205017e-05
19.4692718.1810117.5906217.2587416.94567GALAXY0.036167388.249292e-06
19.5899918.2398117.5419417.1757316.92423GALAXY0.072548881.603681e-05
18.5230916.6520315.917915.4760315.16455GALAXY0.066757811.785021e-05
18.731917.4227116.8051416.4700616.18039GALAXY0.036462221.014089e-05
19.5161818.3255417.6379517.2549517.02355GALAXY0.13802129.179801e-06

Using pandas


In [12]:
import pandas

In [13]:
# Read the same CSV file with pandas. comment='#' makes the parser ignore the
# leading "#Table1" line so the next line is used for the column names.
data = pandas.read_csv('galaxy_colors.csv', comment='#')

In [14]:
type(data)


Out[14]:
pandas.core.frame.DataFrame

In [15]:
data.head()


Out[15]:
u g r i z class z1 zerr
0 19.41061 18.23754 17.58132 17.20153 16.90159 GALAXY 0.032125 0.000006
1 19.54964 17.95799 17.02898 16.53100 16.13408 GALAXY 0.121315 0.000024
2 18.74425 17.37778 16.80538 16.51149 16.28756 GALAXY 0.048765 0.000014
3 17.55033 15.75007 15.02809 14.66306 14.34982 GALAXY 0.040287 0.000012
4 17.60645 16.16628 15.51308 15.15529 14.87411 GALAXY 0.025475 0.000012

In [16]:
data.describe()


Out[16]:
u g r i z z1 zerr
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 18.847712 17.362790 16.652535 16.277254 16.015587 0.079304 0.000014
std 0.705162 0.777843 0.902157 0.942336 0.988064 0.051818 0.000011
min 14.065570 12.779670 12.145150 11.757920 11.650580 0.000000 0.000000
25% 18.537655 16.973130 16.168653 15.765792 15.432625 0.050569 0.000008
50% 19.043970 17.492975 16.719680 16.321100 16.040580 0.077798 0.000012
75% 19.355695 17.924820 17.313200 16.976025 16.745777 0.093881 0.000017
max 19.599320 19.260320 24.802040 24.361810 22.826910 0.760918 0.000233

In [ ]:
# Pandas reads from *lots* of different data sources.
# Interactively, type `pandas.read_` and press <TAB> to list them; the
# expression below builds the same list and, unlike the bare incomplete
# attribute `pandas.read_`, does not raise AttributeError on "Run All".
pandas_readers = [name for name in dir(pandas) if name.startswith('read_')]
pandas_readers

Specialized text formats


In [17]:
# Get some data from CDS: the data table and the ReadMe that accompanies it.
# Each file is cached locally and only downloaded when it is missing.
prefix = "http://cdsarc.u-strasbg.fr/vizier/ftp/cats/J/ApJ/686/749/"
for fname in ["ReadMe", "table10.dat"]:
    if not os.path.exists(fname):
        # A timeout keeps the cell from hanging if the server is unresponsive.
        response = requests.get(prefix + fname, timeout=60)
        # Stop on an HTTP error instead of saving an error page under the
        # data file's name (which the existence check would then keep forever).
        response.raise_for_status()
        with open(fname, 'w') as f:
            f.write(response.text)

In [ ]:
# Show the raw contents of the data table downloaded above
!cat table10.dat

In [ ]:
# Show the ReadMe file that was downloaded together with the table
!cat ReadMe

In [20]:
# must specify the "readme" here: the ReadMe downloaded above is handed to the
# 'cds' reader together with the data file table10.dat.
# NOTE(review): presumably the ReadMe supplies the column definitions needed
# to parse the table -- confirm against the astropy CDS reader documentation.
data = ascii.read("table10.dat", format='cds', readme="ReadMe")

In [21]:
data


Out[21]:
<Table masked=True length=140>
SNJDTelBmage_BmagVmage_VmagRmage_RmagImage_Imag
dmagmagmagmagmagmagmagmag
str288float64str448float64float64float64float64float64float64float64float64
SN 1999aa2451221.81LICK 1m DEWAR215.8280.032------------
SN 1999aa2451222.67YALO15.6420.01815.680.02815.6890.0415.7170.105
SN 1999aa2451223.67YALO15.4620.02----15.4860.03615.5140.083
SN 1999aa2451225.65YALO15.2110.01715.260.02815.2760.0315.3120.025
SN 1999aa2451227.73LICK 1m DEWAR215.0060.01615.060.02415.080.018----
SN 1999aa2451229.62YALO14.9240.01714.9650.02615.0920.03----
SN 1999aa2451232.61YALO14.9080.01714.9130.02815.0620.0315.2530.025
SN 1999aa2451235.6YALO14.9190.02114.8980.03215.0370.03115.3070.029
SN 1999aa2451241.6YALO15.1830.01315.0620.02715.2660.0315.5750.024
.................................
SN 1999bp2451275.62CTIO 1.5m T2K19.1320.01818.7480.02418.5990.03119.1270.048
SN 1999bp2451281.78LICK 1m DEWAR220.1280.15----19.0910.08----
SN 1999bp2451286.43JKT 1m20.2420.0419.4380.038--------
SN 1999bp2451288.83KPNO 2.1m20.3770.03------------
SN 1999bp2451289.74LICK 1m DEWAR2----20.0860.42719.5360.205----
SN 1999bp2451293.75LICK 1m DEWAR2------------19.5730.109
SN 1999bp2451302.72CTIO 1.5m T2K----20.3060.269--------
SN 1999bp2451306.57YALO21.7230.03920.6030.04219.6460.046----
SN 1999bp2451337.81CFHT--------20.6120.0620.9040.144
SN 1999bp2451338.83CFHT22.2080.107------------

Reading FITS files

Two options: astropy.io.fits (formerly pyfits) and fitsio.


In [ ]:
# get an SDSS image (can search for images from http://dr12.sdss3.org/fields/)
if not os.path.exists("frame-g-006728-4-0121.fits.bz2"):
    !wget http://dr12.sdss3.org/sas/dr12/boss/photoObj/frames/301/6728/4/frame-g-006728-4-0121.fits.bz2
if not os.path.exists("frame-g-006728-4-0121.fits"):
    !bunzip2 frame-g-006728-4-0121.fits.bz2

astropy.io.fits


In [22]:
from astropy.io import fits

# Open the FITS file. The returned object is indexed by HDU number in the
# cells below (e.g. hdulist[0]). Note that it is never explicitly closed in
# the cells shown here; call hdulist.close() once you are done with it.
hdulist = fits.open("frame-g-006728-4-0121.fits")

In [23]:
hdulist


Out[23]:
[<astropy.io.fits.hdu.image.PrimaryHDU at 0x7f2677d77710>,
 <astropy.io.fits.hdu.image.ImageHDU at 0x7f267ce25668>,
 <astropy.io.fits.hdu.table.BinTableHDU at 0x7f267be55470>,
 <astropy.io.fits.hdu.table.BinTableHDU at 0x7f267be11d68>]

In [24]:
hdulist.info()


Filename: frame-g-006728-4-0121.fits
No.    Name         Type      Cards   Dimensions   Format
0    PRIMARY     PrimaryHDU      96   (2048, 1489)   float32   
1                ImageHDU         6   (2048,)      float32   
2                BinTableHDU     27   1R x 3C      [49152E, 2048E, 1489E]   
3                BinTableHDU     79   1R x 31C     [J, 3A, J, A, D, D, 2J, J, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, E, E]   

In [25]:
hdulist[0].data


Out[25]:
array([[ 0.00395966,  0.0116272 ,  0.02697754, ..., -0.00365067,
         0.01544189,  0.0116272 ],
       [-0.01139832, -0.00756073, -0.00756073, ...,  0.00016785,
         0.0116272 ,  0.0269165 ],
       [ 0.01547241, -0.00756073, -0.01522827, ...,  0.01544189,
         0.00016856,  0.01925659],
       ..., 
       [ 0.00098419,  0.00866699,  0.00482178, ...,  0.03155518,
         0.00483704, -0.01806641],
       [-0.02206421, -0.02972412,  0.00866699, ...,  0.01629639,
         0.01629639,  0.01248169],
       [-0.02206421, -0.03356934,  0.01249695, ...,  0.00865173,
         0.00865173,  0.02011108]], dtype=float32)

In [ ]:
hdulist[0].header

fitsio

(pip install --no-deps fitsio)

  • Faster (mainly for tables)
  • Does a better job with ASCII table extensions

In [27]:
import fitsio

In [28]:
# Open the same FITS file with fitsio; individual HDUs are accessed by index
# (f[0], f[2]) in the cells below.
f = fitsio.FITS("frame-g-006728-4-0121.fits")

In [29]:
# summary of file HDUs
f


Out[29]:
  file: frame-g-006728-4-0121.fits
  mode: READONLY
  extnum hdutype         hduname[v]
  0      IMAGE_HDU       
  1      IMAGE_HDU       
  2      BINARY_TBL      
  3      BINARY_TBL      

In [30]:
# summary of first HDU
f[0]


Out[30]:
  file: frame-g-006728-4-0121.fits
  extension: 0
  type: IMAGE_HDU
  image info:
    data type: f4
    dims: [1489,2048]

In [31]:
# Summary of 3rd HDU
f[2]


Out[31]:
  file: frame-g-006728-4-0121.fits
  extension: 2
  type: BINARY_TBL
  rows: 1
  column info:
    ALLSKY              f4  array[256,192]
    XINTERP             f4  array[2048]
    YINTERP             f4  array[1489]

In [32]:
# Actually read the data.
# Indexing an HDU (f[0]) only displays a summary of it; calling .read() is
# what returns the contents of the first HDU as an array.
data = f[0].read()
data


Out[32]:
array([[ 0.00395966,  0.0116272 ,  0.02697754, ..., -0.00365067,
         0.01544189,  0.0116272 ],
       [-0.01139832, -0.00756073, -0.00756073, ...,  0.00016785,
         0.0116272 ,  0.0269165 ],
       [ 0.01547241, -0.00756073, -0.01522827, ...,  0.01544189,
         0.00016856,  0.01925659],
       ..., 
       [ 0.00098419,  0.00866699,  0.00482178, ...,  0.03155518,
         0.00483704, -0.01806641],
       [-0.02206421, -0.02972412,  0.00866699, ...,  0.01629639,
         0.01629639,  0.01248169],
       [-0.02206421, -0.03356934,  0.01249695, ...,  0.00865173,
         0.00865173,  0.02011108]], dtype=float32)

Salvaging data from IDL

scipy.io.readsav: Formerly a separate idlsave module by Tom Robitaille.


In [34]:
from scipy.io import readsav

In [35]:
# Note: won't work unless you have this sav file!
# Unlike the other inputs, this file is not downloaded anywhere in the cells
# shown in this notebook, so the call fails if the file is not present locally.
# The result displays as a dict with the keys 'events' and 'tstart'.
data = readsav("150623434_det8_8100keV.sav")

In [36]:
data


Out[36]:
{'events': array([  5.96046448e-08,   4.04059887e-04,   1.60408020e-03, ...,
          2.36982572e+03,   2.36982581e+03,   2.36982593e+03]),
 'tstart': 456747888.71404397}

In [37]:
# Entries of the readsav result are also reachable as attributes:
# data.events is the array shown under the 'events' key.
len(data.events)


Out[37]:
6653156

Clean up downloaded files


In [ ]:
!rm galaxy_colors.csv
!rm ReadMe
!rm table10.dat
!rm frame-g-006728-4-0121.fits.bz2
!rm frame-g-006728-4-0121.fits