In [1]:
# Load the course's logging extension (provides the %%logit cell magic used below)
%load_ext logit
user_name = 'tlbl' # your 4 letter username
In [2]:
%%logit readlines, gps.csv
# Peek at the first five raw lines of the file (Python 2 print statements)
with open('gps.csv') as f:
    for i in range(5):
        print f.readline()
Make a function that loads the file and returns the contents as a list of lists like this:
[['Time', 'Latitude N', 'Longitude E', 'UTM zone32U Easting', 'UTM zone 32U northing', 'Unknown', 'Platform orientation'],
['s', 'deg', 'deg', 'm', 'm', '-', 'deg'],
['20130925 00:00:17', '54.9490087', '11.0242362', '629642.4', '6090992.3', '277034.3', '327.7'],
['20130925 00:00:46', '54.9491601', '11.0244624', '629656.4', '6091009.5', '277046.3', '325.4'],
['20130925 00:01:17', '54.9493249', '11.0247175', '629672.2', '6091028.4', '277060.0', '323.7']]
In [3]:
def read_csv(filename):
    """Load a CSV file and return (names_line, units_line, rows).

    The first two return values are the raw header lines (strings, trailing
    newline included); rows is a list of lists of strings, one inner list
    per remaining line, split on commas.
    """
    with open(filename) as f:
        names_line = f.readline()
        units_line = f.readline()
        # iterating the open file yields the remaining lines
        rows = [line.strip().split(',') for line in f]
        return names_line, units_line, rows
names, units, data = read_csv('gps.csv')
print "Names:", names
print "Units:", units
# Show the first few parsed rows as a sanity check
for r in data[:5]:
    print r
# NOTE(review): 327.5 looks like an orientation value, so this assumes the
# real gps.csv has orientation at column index 3; the 7-column sample shown
# above would put easting there instead - TODO confirm against the file.
assert data[4][3]=='327.5', "%s!=%s"%(data[4][3], '327.5')
In [4]:
%%logit read_csv, gps.csv
# run the hand-rolled reader under the %%logit magic
tmp = read_csv('gps.csv')
Do the same using the csv-module
In [5]:
import csv
def read_csv_module(filename):
    """Load a CSV file using the csv module: return (names_line, units_line, rows).

    Same contract as read_csv: the two header lines are returned as raw
    strings (trailing newline included) and rows is a list of lists of
    strings.  Unlike a plain split(','), csv.reader also copes with
    quoted fields containing commas.
    """
    with open(filename) as f:
        names_line = f.readline()
        units_line = f.readline()
        # the exercise asks for the csv module; the previous version was a
        # verbatim copy of read_csv and never used csv at all
        return names_line, units_line, list(csv.reader(f))
names, units, data = read_csv_module('gps.csv')
print "Names:", names
print "Units:", units
for r in data[:5]:
    print r
# NOTE(review): assumes orientation lives at column index 3 of the real
# gps.csv - TODO confirm (the 7-column sample above disagrees).
assert data[4][3]=='327.5', "%s!=%s"%(data[4][3], '327.5')
In [6]:
%%logit read_csv_module, gps.csv
# run the csv-module reader under the %%logit magic
tmp = read_csv_module('gps.csv')
Constructor:
datetime.datetime(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]])
Parse string:
datetime.strptime(date_string, format)
Format datetime:
datetime.strftime(format)
Format specification: http://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
Create the right format and test it on the date in row 1000:
In [7]:
from datetime import datetime
# Format matching timestamps like '20130925 00:00:17' (YYYYMMDD HH:MM:SS)
fmt = "%Y%m%d %H:%M:%S"
d = datetime.strptime(data[1000][0], fmt)
print d
assert d==datetime(2013,11,14,14,25,19)
Make a list of datetime-objects from the date/time-strings in the first column
In [8]:
dates = [datetime.strptime(data[i][0], fmt) for i in range(len(data))]
print dates[:3]
Make a list with the weekday name of the dates
In [9]:
weekdays = [datetime.strftime(dates[i], '%A') for i in range(len(data))]
print set(weekdays)
In [9]:
In [10]:
import numpy as np
#np_floats = np.array(np.zeros([len(data), len(data[0])-1]), dtype=np.float32)
# Columns 1:4 of every row, as a (n, 3) float32 array.
# NOTE(review): against the 7-column sample header this slice would be
# lat/lon/easting, but the usage below labels column 2 'orientation' -
# presumably the real gps.csv has 4 columns (time, lat, lon, orientation).
# TODO confirm against the file.
np_floats = np.array([d[1:4]for d in data], dtype=np.float32)
print np_floats;
print np_floats.shape; assert np_floats.shape == (142779, 3)
print np_floats.dtype; assert np_floats.dtype==np.float32
Make a dictionary with the keys:
dates, weekdays, latitude, longitude, orientation
and assign to each key the corresponding list
In [11]:
# Column-oriented view of the data: python lists for dates/weekdays,
# float32 numpy column views for the numeric values.
data_dict = {'dates':dates,
             'weekdays': weekdays,
             'latitude': np_floats[:, 0],
             'longitude': np_floats[:, 1],
             'orientation': np_floats[:, 2]
             }
for k, v in data_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Before we save any data, it is necessary to consider the datatype.
Dates can be saved as
| good | bad | |
|---|---|---|
| datetime-object | Easy to work with in python, pickles directly | file size, string representation not suitable for saving |
| string | Easy to read in file | file size, parsing required |
| 64 bit float (i.e. seconds since 1/1/1970) | Compact in file | difficult to read in file, parsing required |
In [12]:
# Three representations of the same timestamps:
date_objects = dates
date_strings = [ row[0] for row in data]
# seconds since the Unix epoch (1/1/1970 UTC), as floats
date_floats = [(dt - datetime.utcfromtimestamp(0)).total_seconds() for dt in date_objects]
print date_objects[0]
print date_strings[0]
print date_floats[0]
Weekdays can be saved as
In [13]:
weekday_strings = weekdays
weekday_integers = [int(datetime.strftime(d,"%w")) for d in dates]
print weekday_strings[0]
print weekday_integers[0]
Before we go on, make a function that returns the size of a file in mb
In [14]:
import os
def fsize(filename):
    """Return the size of the file *filename* in megabytes (1 MB = 1e6 bytes)."""
    n_bytes = os.path.getsize(filename)
    return n_bytes * 1e-6
print fsize('gps.csv')
# gps.csv is expected to be ~6.33 MB
assert np.round(fsize('gps.csv'),2) == 6.33
Make a function save_csv that saves its data-argument (list of list) as csv with names and units as header
In [15]:
def save_csv(filename, names, units, data):
    """Save data (list of lists) as a CSV file with a two-line header.

    names, units : comma-separated header strings (e.g. as returned by
        read_csv); written as the first two lines, each field stripped
        of surrounding whitespace.
    data : list of rows; each row is written comma-separated.

    Fixes the previous version, which wrote the names with NO separator
    at all and the units/data separated by spaces - i.e. not CSV.
    """
    # 'wb' matches the Python 2 convention used for save_csv_module below
    with open(filename, 'wb') as f:
        f.write(",".join(n.strip() for n in names.split(',')) + "\n")
        f.write(",".join(u.strip() for u in units.split(',')) + "\n")
        for row in data:
            f.write(",".join(str(d) for d in row) + "\n")
In [16]:
%%logit save_csv, tmp.csv
# round-trip: write the parsed data back to disk
save_csv('tmp.csv', names, units, data)
In [17]:
import os
# Compare the round-tripped file size with the original
print fsize('tmp.csv'), fsize('gps.csv')
# the hand-rolled writer need not reproduce the exact size, hence disabled:
#assert fsize('tmp.csv') == fsize('gps.csv')
Do the same using the csv-module
In [18]:
import csv
def save_csv_module(filename, names, units, data):
with open(filename, 'wb') as f:
writer = csv.writer(f)
writer.writerow(names.strip().split(','))
writer.writerow(units.strip().split(','))
writer.writerows(data)
%timeit save_csv_module('tmp.csv', names, units, data)
In [19]:
%%logit save_csv_module, tmp.csv
# logged run of the csv-module writer
save_csv_module('tmp.csv', names, units, data)
In [20]:
import os
print fsize('tmp.csv'), fsize('gps.csv')
# the csv-module writer should reproduce the original file size exactly
assert fsize('tmp.csv') == fsize('gps.csv')
Numpy arrays can be saved directly as csv, binary and compressed binary. Unfortunately these options do not support datetime-objects or strings.
So we need to concatenate a np_data array of date_floats, weekday_integers and np_floats
In [21]:
print np.atleast_2d(np.array(date_floats)).shape
print np.atleast_2d(np.array(weekday_integers)).shape
print np_floats.T.shape
# Stack the five 1-D columns (date_floats, weekday_integers and the three
# np_floats columns) into one (n, 5) float64 array; np.r_ just turns each
# sequence into a 1-D array here.
np_data = np.array([np.r_[date_floats], np.r_[weekday_integers], np.r_[np_floats.T[0, :]], np.r_[np_floats.T[1, :]], np.r_[np_floats.T[2, :]]]).T
print np_data.shape
assert np_data.shape==(142779,5)
print np_data.dtype
assert np_data.dtype==np.float64
Save the float array np_data as csv and binary and compressed binary using numpy
In [22]:
%%logit save_csv, tmp.csv
# text format: one whitespace-separated line per row (savetxt default)
np.savetxt('tmp.csv', np_data)
In [23]:
%%logit save_bin, tmp.npy
# raw binary .npy
np.save('tmp.npy', np_data)
In [24]:
%%logit save_zipbin, tmp.npz
np.savez('tmp.npz', np_data)
In [25]:
# Recap of the dict contents before pickling
for k, v in data_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Save data_dict as binary using cpickle - a faster implementation of pickle
See example at https://wiki.python.org/moin/UsingPickle
In [26]:
%%logit cpickle_dump, tmp.bin
import cPickle as pickle
pickle.dump(data_dict, open('tmp.bin', 'wb'))
print fsize('tmp.bin')
In [27]:
loaded_dict = pickle.load(open('tmp.bin'))
for k, v in loaded_dict.items():
print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Json is a human readable standard data format. It is quite similar to xml, but a little more compact (compression (like zip) will reduce the file size of both xml and json considerably).
The python json module is able to format e.g. a python dictionary as json, but only pure python types are supported, i.e. not datetime-objects or numpy arrays. Therefore a pure python dict is created
In [28]:
# Pure-python version of the data (strings and floats only) so that the
# json module can serialize it.
pydata_dict = {'dates': date_strings,
               'weekdays':weekdays,
               'latitude':np_data[:,2].tolist(),
               'longitude':np_data[:,3].tolist(),
               'orientation':np_data[:,4].tolist()}
for k, v in pydata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
The following function dumps the data argument into a json (pretty) formatted file
In [29]:
import json
def save_json(filename, data):
    """Dump *data* to *filename* as pretty-printed JSON.

    Keys are sorted and the output is indented 4 spaces.  json.dump
    streams directly to the file object instead of building the whole
    document in memory first as json.dumps + write would.
    """
    with open(filename, 'w') as fid:
        json.dump(data, fid, sort_keys=True, indent=4, separators=(',', ': '))
In [30]:
%%logit save_json, tmp.json
# write the pure-python dict as pretty-printed json
save_json('tmp.json', pydata_dict)
HDF5 is a hierarchical data format like json and xml, but it is binary and therefore much more compact. The drawback is that a program is required to read the files, e.g. HDFView
In order to use HDF5 from python the module h5py (http://www.h5py.org/) is required (included in winpython, but not in anaconda)
pip install h5py
HDF5 stores numpy arrays as datasets, so pydata_dict can be saved like this
In [31]:
# Recap: every value in pydata_dict is a plain python list
for k, v in pydata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
In [32]:
import h5py
def save_hdf5(filename, data):
    """Save every key/value pair of *data* as an HDF5 dataset (best effort).

    Values are converted with np.array.  Datasets h5py cannot store are
    skipped, but the failure is reported instead of silently swallowed,
    and a single bad key no longer aborts the remaining keys.  The file
    handle is closed even if an error escapes (try/finally).
    """
    f = h5py.File(filename, "w")
    try:
        for k, v in data.items():
            try:
                f.create_dataset(k, data=np.array(v))
            except Exception as e:
                # best-effort like before, but say what was skipped and why
                print("could not save dataset %r: %s" % (k, e))
    finally:
        f.close()
In [33]:
%%logit save_hdf5, tmp.hdf5
# save the pure-python dict as HDF5 datasets
save_hdf5('tmp.hdf5', pydata_dict)
It is however possible to reduce the file size if proper data types are manually selected for all lists
In [34]:
# Manually chosen compact dtypes: dates stay float64 (epoch seconds need
# the precision), weekdays 0-6 fit in int8, longitude/orientation are
# downcast to float32.
# NOTE(review): latitude is left float64 while longitude is float32 -
# presumably latitude could also be float32; TODO confirm intent.
npdata_dict = {'dates':np_data[:,0],
               'weekdays': np_data[:,1].astype(np.int8),
               'latitude':np_data[:,2],
               'longitude':np_data[:,3].astype(np.float32),
               'orientation':np_data[:,4].astype(np.float32)}
for k, v in npdata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
In [35]:
%%logit save_hdf5_astype, tmp.hdf5
# same HDF5 save, now with the compact per-column dtypes
save_hdf5('tmp.hdf5', npdata_dict)