In [1]:
%load_ext logit
user_name = 'FcoH'
In [2]:
%%logit readlines, gps.csv
with open('gps.csv') as f:
    for i in range(5):
        print f.readline()
Make a function that loads the file and returns the contents as a list of lists like this:
[['Time', 'Latitude N', 'Longitude E', 'UTM zone32U Easting', 'UTM zone 32U northing', 'Unknown', 'Platform orientation'],
['s', 'deg', 'deg', 'm', 'm', '-', 'deg'],
['20130925 00:00:17', '54.9490087', '11.0242362', '629642.4', '6090992.3', '277034.3', '327.7'],
['20130925 00:00:46', '54.9491601', '11.0244624', '629656.4', '6091009.5', '277046.3', '325.4'],
['20130925 00:01:17', '54.9493249', '11.0247175', '629672.2', '6091028.4', '277060.0', '323.7']]
In [3]:
def read_csv(filename):
    name = []
    units = []
    data = []
    with open(filename) as f:
        name.append(f.readline().split(","))
        units.append(f.readline().split(","))
        for i in f.readlines():
            data.append(i.strip().split(","))
    return name, units, data
names, units, data = read_csv('gps.csv')
print "Names:", names
print "Units:", units
for r in data[:5]:
    print r
assert data[4][3]=='327.5', "%s!=%s"%(data[4][3], '327.5')
In [4]:
%%logit read_csv, gps.csv
tmp = read_csv('gps.csv')
Do the same using the csv-module
In [5]:
import csv
def read_csv_module(filename):
    with open(filename, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return next(reader), next(reader), list(reader)
names, units, data = read_csv_module('gps.csv')
print "Names:", names
print "Units:", units
for r in data[:5]:
    print r
assert data[4][3]=='327.5', "%s!=%s"%(data[4][3], '327.5')
In [6]:
%%logit read_csv_module, gps.csv
tmp = read_csv_module('gps.csv')
Constructor:
datetime.datetime(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]])
Parse string:
datetime.strptime(date_string, format)
Format datetime:
datetime.strftime(format)
Format specification: http://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
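For illustration, a minimal round-trip sketch (parse a string, then format it back); the date string and format specifiers here are made up and are not the ones used in gps.csv:
from datetime import datetime
d = datetime.strptime("25/09/2013 14:30", "%d/%m/%Y %H:%M")  # string -> datetime
print d
print datetime.strftime(d, "%A %d %B %Y")                    # datetime -> string, e.g. 'Wednesday 25 September 2013'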
Create the right format and test it on the date in row 1000:
In [7]:
from datetime import datetime
fmt = "%Y%m%d %H:%M:%S"
d = datetime.strptime(data[1000][0], fmt)
print d
assert d==datetime(2013,11,14,14,25,19)
Make a list of datetime-objects from the date/time-strings in the first column
In [8]:
dates = [datetime.strptime(l[0], fmt) for l in data]
print dates[:3]
Make a list with the weekday names of the dates
In [9]:
weekdays = [datetime.strftime(l,"%A") for l in dates]
print set(weekdays)
In [10]:
import numpy as np
np_floats = np.array(data)[:, [1, 2, 6]].astype(np.float32)  # latitude, longitude and orientation columns
print np_floats
print np_floats.shape; assert np_floats.shape == (142779, 3)
print np_floats.dtype; assert np_floats.dtype==np.float32
Make a dictionary with the keys:
dates, weekdays, latitude, longitude, orientation
and assign to each key the corresponding list
In [11]:
data_dict = {'dates':dates,
             'weekdays':weekdays,
             'latitude':np_floats[:,0],
             'longitude':np_floats[:,1],
             'orientation':np_floats[:,2]}
for k, v in data_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Before we save any data, it is necessary to consider the datatype.
Dates can be saved as
| | good | bad |
|---|---|---|
| datetime-object | Easy to work with in python, pickles directly | file size, string representation not suitable for saving |
| string | Easy to read in file | file size, parsing required |
| 64 bit float (i.e. seconds since 1/1/1970) | Compact in file | difficult to read in file, parsing required |
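For the float option, the conversion back from seconds to a datetime is datetime.utcfromtimestamp; a minimal round-trip sketch using the first timestamp from the sample above:
from datetime import datetime
dt = datetime(2013, 9, 25, 0, 0, 17)
seconds = (dt - datetime.utcfromtimestamp(0)).total_seconds()  # datetime -> float (seconds since 1/1/1970)
print seconds
print datetime.utcfromtimestamp(seconds)                       # float -> datetime again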
In [12]:
date_objects = dates
date_strings = [ row[0] for row in data]
date_floats = [(dt - datetime.utcfromtimestamp(0)).total_seconds() for dt in date_objects]
print date_objects[0]
print date_strings[0]
print date_floats[0]
Weekdays can be saved as strings or as integers:
In [13]:
weekday_strings = weekdays
weekday_integers = [int(datetime.strftime(d,"%w")) for d in dates]
print weekday_strings[0]
print weekday_integers[0]
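If the integer representation is used, the weekday name can be recovered with the standard calendar module; a minimal sketch, assuming dates is the list defined above (note that %w counts Sunday as 0, while calendar.day_name is Monday-first, so the index is shifted):
import calendar
w = int(datetime.strftime(dates[0], "%w"))  # 0 = Sunday, ..., 6 = Saturday
print calendar.day_name[(w - 1) % 7]        # back to the name, e.g. 'Wednesday'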
Before we go on, make a function that returns the size of a file in MB
In [1]:
import os
def fsize(filename):
    return os.path.getsize(filename)/1048576.0
print fsize('gps.csv')
assert np.round(fsize('gps.csv'),2) == 6.33
Make a function save_csv that saves its data-argument (list of lists) as csv with names and units as header
In [15]:
def save_csv(filename, names, units, data):
    with open(filename,'w') as f:
        f.write(",".join(names)+"\n")
        f.write(",".join(units)+"\n")
        f.writelines([",".join(l)+"\n" for l in data])
In [16]:
%%logit save_csv, tmp.csv
save_csv('tmp.csv', names, units, data)
In [17]:
import os
print fsize('tmp.csv'), fsize('gps.csv')
assert fsize('tmp.csv') == fsize('gps.csv')
Do the same using the csv-module
In [18]:
import csv
def save_csv_module(filename, names, units, data):
    with open(filename, 'wb') as csvfile:
        writ = csv.writer(csvfile, delimiter=',')
        writ.writerow(names)
        writ.writerow(units)
        for l in data:
            writ.writerow(l)
%timeit save_csv_module('tmp.csv', names, units, data)
In [19]:
%%logit save_csv_module, tmp.csv
save_csv_module('tmp.csv', names, units, data)
In [20]:
import os
print fsize('tmp.csv'), fsize('gps.csv')
assert fsize('tmp.csv') == fsize('gps.csv')
Numpy arrays can be saved directly as csv, binary and compressed binary. Unfortunately these options do not support datetime-objects or strings,
so we need to concatenate date_floats, weekday_integers and np_floats into a single np_data array
In [21]:
print np.atleast_2d(np.array(date_floats)).shape
print np.atleast_2d(np.array(weekday_integers)).shape
print np_floats.T.shape
np_data = np.r_[np.atleast_2d(np.array(date_floats)),
                np.atleast_2d(np.array(weekday_integers)),
                np_floats.T].T.astype(np.float64)
print np_data.shape
assert np_data.shape==(142779,5)
print np_data.dtype
assert np_data.dtype==np.float64
Save the float array np_data as csv, binary and compressed binary using numpy
In [22]:
%%logit save_csv, tmp.csv
np.savetxt('tmp.csv', np_data)
In [23]:
%%logit save_bin, tmp.npy
np.save('tmp', np_data)
In [26]:
%%logit save_zipbin, tmp.npz
np.savez('tmp', np_data)
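Loading the arrays back is symmetric; a minimal sketch, assuming the three files written above exist (np.savez stores an unnamed array under the default key 'arr_0'):
loaded_txt = np.loadtxt('tmp.csv')         # text written by np.savetxt
loaded_bin = np.load('tmp.npy')            # binary written by np.save
loaded_zip = np.load('tmp.npz')['arr_0']   # compressed binary written by np.savez
print loaded_txt.shape, loaded_bin.shape, loaded_zip.shape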
In [27]:
for k, v in data_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Save data_dict as binary using cPickle - a faster implementation of pickle
See example at https://wiki.python.org/moin/UsingPickle
In [28]:
%%logit cpickle_dump, tmp.bin
import cPickle as pickle
pickle.dump(data_dict, open( "tmp.bin", "wb" ))
print fsize('tmp.bin')
In [29]:
loaded_dict = pickle.load(open('tmp.bin', 'rb'))
for k, v in loaded_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
Json is a human readable standard data format. It is quite similar to xml, but a little more compact (compression, like zip, will reduce the file size of both xml and json considerably).
The python json module is able to format e.g. a python dictionary as json, but only pure python types are supported, i.e. not datetime-objects or numpy arrays. Therefore a pure python dict is created first
In [30]:
pydata_dict = {'dates': date_strings,
               'weekdays':weekdays,
               'latitude':np_data[:,2].tolist(),
               'longitude':np_data[:,3].tolist(),
               'orientation':np_data[:,4].tolist()}
for k, v in pydata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
The following function dumps the data argument into a pretty-formatted json file
In [31]:
import json
def save_json(filename, data):
    with open(filename,'w') as fid:
        fid.write(json.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')))
In [32]:
%%logit save_json, tmp.json
save_json('tmp.json', pydata_dict)
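Reading the file back is a single json.load call; a minimal sketch, assuming tmp.json was written by the cell above:
with open('tmp.json') as fid:
    loaded_json = json.load(fid)
print loaded_json.keys()
print loaded_json['dates'][0]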
HDF5 is a hierarchical data format like json and xml, but it is binary and therefore much more compact. The drawback is that a program is required to read the files, e.g. HDFView.
In order to use HDF5 from python the module h5py (http://www.h5py.org/) is required (included in winpython, but not in anaconda):
pip install h5py
HDF5 stores numpy arrays as datasets, so pydata_dict can be saved like this:
In [33]:
for k, v in pydata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
In [34]:
import h5py
def save_hdf5(filename, data):
    f = h5py.File(filename, "w")
    try:
        for k, v in data.items():
            f.create_dataset(k, data=np.array(v))
    except Exception:
        pass
    f.close()
In [35]:
%%logit save_hdf5, tmp.hdf5
save_hdf5('tmp.hdf5', pydata_dict)
It is, however, possible to reduce the file size if proper data types are manually selected for all lists:
In [36]:
npdata_dict = {'dates':np_data[:,0],
               'weekdays': np_data[:,1].astype(np.int8),
               'latitude':np_data[:,2],
               'longitude':np_data[:,3].astype(np.float32),
               'orientation':np_data[:,4].astype(np.float32)}
for k, v in npdata_dict.items():
    print "%-20s\t%-20s\t%s" % (k, v[0], v[0].__class__)
In [37]:
%%logit save_hdf5_astype, tmp.hdf5
save_hdf5('tmp.hdf5', npdata_dict)
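The datasets can be read back with h5py as well; a minimal sketch, assuming tmp.hdf5 was written by the cell above (indexing a dataset with [:] reads it into a numpy array):
with h5py.File('tmp.hdf5', 'r') as f:
    print f.keys()             # the dataset names
    print f['latitude'][:5]    # first five latitude values as a numpy array
    print f['weekdays'].dtype  # the dtype chosen above (int8)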
In [38]: