In this notebook, we create a subset of the NYC taxi dataset. This subset only contains 0.5% of all rows.
Instructions:
1. This notebook is expected to be in the minibook-2nd-code/chapter2/cleaning/ directory.
2. The data is expected to be in the minibook-2nd-code/chapter2/data/ directory.
3. The input files are trip_data_1.csv.zip, ..., trip_fare_1.csv.zip, ... Make sure to download these files and put them in the data directory.
4. This notebook creates the trip_data_subset.csv and trip_fare_subset.csv files in the data directory.
5. Once the ZIP files are in data/, click on Cell > Run all above to launch the extraction process. It will take several minutes.

The procedure implemented here is memory-efficient in that the full ZIP files are not first extracted on disk. Unzipping occurs on the fly. We make heavy use of Python generators and iterators.
In [ ]:
# Relative path of the directory containing the data: the input ZIP files are
# looked up there and the subset CSV files are written there.
data_dir = '../data'
# Sampling stride: only keep one out of 'step' lines ('200' = 0.5%).
step = 200
# In every input file, stop after 'stop' lines (None = read until the end).
stop = None # Set to e.g. 400 to test the script and generate tiny subsets.
In [ ]:
import os
import os.path as op
import re
import zipfile
import glob
from itertools import chain, islice
In [ ]:
def _csv_filename(zip_filename):
    """Derive the name of the CSV member stored inside a ZIP archive.

    The result is the archive's base name with its last extension removed,
    e.g. ``some/dir/foo.csv.zip`` gives ``foo.csv``.
    """
    archive_name = op.basename(zip_filename)
    stem, _extension = op.splitext(archive_name)
    return stem
In [ ]:
def _iter_lines(zip_filename):
    """Yield every line of the CSV stored in a ZIP archive.

    The CSV member (named after the archive, see `_csv_filename`) is read
    straight from the archive, so nothing is extracted to disk. Lines are
    yielded exactly as read from the member. A progress message is printed
    when iteration starts.
    """
    print("Processing file {file}...".format(file=zip_filename))
    member = _csv_filename(zip_filename)
    with zipfile.ZipFile(zip_filename) as zf, zf.open(member) as fh:
        for raw_line in fh:
            yield raw_line
In [ ]:
def _iter_all_lines(files, step=None, stop=None):
    """Chain a sampled subset of the rows of several zipped CSV files.

    Every file is sliced with the same `stop` and `step`. The first file is
    read from its first line, so its header row is kept; every following
    file is read from its second line, so its header row is skipped.
    """
    sampled = []
    for index, path in enumerate(files):
        # Row 0 is the header: start there only for the very first file.
        first_row = 0 if index == 0 else 1
        sampled.append(islice(_iter_lines(path), first_row, stop, step))
    return chain.from_iterable(sampled)
In [ ]:
def _extract_number(filename):
    """Return the month number appearing in the name of a ZIP file.

    Only the base name of `filename` is searched, so digits that appear in
    the directory part of the path (e.g. ``chapter2/``) are never mistaken
    for the month number.

    Returns the first run of digits of the base name as an int, or None if
    the base name contains no digit.
    """
    match = re.search(r'(\d+)', op.basename(filename))
    if match is None:
        return None
    return int(match.group(1))
In [ ]:
def _zip_filenames(name):
    """List the ZIP archives of one kind, ordered by their number.

    The archives matched are the ``trip_<name>_*.zip`` files found in
    `data_dir`; they are sorted using `_extract_number` as the key.
    """
    pattern = op.join(data_dir, 'trip_{name}_*.zip'.format(name=name))
    matches = glob.glob(pattern)
    matches.sort(key=_extract_number)
    return matches
In [ ]:
def _make_extract(step=None, stop=None):
    """Write the subset CSV files for the 'fare' and 'data' archives.

    For each kind of file ('fare', then 'data'), the rows sampled from all
    matching ZIP archives are written to ``trip_<kind>_subset.csv`` in
    `data_dir`, and a confirmation message is printed.
    """
    for name in ('fare', 'data'):
        # Where the sampled rows of this kind of file are written.
        subset_path = op.join(data_dir,
                              'trip_{name}_subset.csv'.format(name=name))
        # Zipped CSV files to sample from.
        archives = _zip_filenames(name)
        # The output is opened in binary mode and the sampled lines are
        # written through unchanged.
        with open(subset_path, 'wb') as out:
            out.writelines(_iter_all_lines(archives, step=step, stop=stop))
        print("*** Done! {0} has been successfully created. ***\n".format(subset_path))
Make the subset data (this will take a while).
In [ ]:
# Build both subset CSV files with the 'step' and 'stop' settings chosen in
# the configuration cell.
_make_extract(step=step, stop=stop)