"Big Pandas" - Dask from the Inside

Part 3 - Pandas with many large csvs

PyData Berlin tutorial, 30 June 2017

Stephen Simmons


In [1]:
# Standard modules
import io
import logging
import lzma
import multiprocessing
import os
import ssl
import time
import urllib.request
import zipfile

# Third-party modules
import fastparquet      # Needs python-snappy
import graphviz         # To visualize Dask graphs 
import numpy as np
import pandas as pd
import psutil           # Memory stats
import dask
import dask.dataframe as dd

# Support multiple lines of output in each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Don't wrap tables
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 300

# Show matplotlib graphs inline in Jupyter notebook
%matplotlib inline

np.__version__, pd.__version__, dask.__version__


Out[1]:
('1.11.3', '0.19.2', '0.14.1')

In [2]:
def memory_usage():
    """String with current memory usage in MB. Requires `psutil` package."""
    pid = os.getpid()
    mem_bytes = psutil.Process(pid).memory_info().rss
    return "[Process %s uses %0.1fMB]" % (pid, mem_bytes/1024.0/1024.0)

memory_usage()


Out[2]:
'[Process 8921 uses 123.9MB]'

In [6]:
%%time
df = d.read_csv('flights-2016-01.xz', nrows=4, dialect="excel")


CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 13.2 ms

In [8]:
df.T


Out[8]:
0 1 2 3
Year 2016 2016 2016 2016
Quarter 1 1 1 1
Month 1 1 1 1
DayofMonth 6 7 8 9
DayOfWeek 3 4 5 6
FlightDate 2016-01-06 2016-01-07 2016-01-08 2016-01-09
UniqueCarrier AA AA AA AA
AirlineID 19805 19805 19805 19805
Carrier AA AA AA AA
TailNum N4YBAA N434AA N541AA N489AA
... ... ... ... ...
Div4TailNum NaN NaN NaN NaN
Div5Airport NaN NaN NaN NaN
Div5AirportID NaN NaN NaN NaN
Div5AirportSeqID NaN NaN NaN NaN
Div5WheelsOn NaN NaN NaN NaN
Div5TotalGTime NaN NaN NaN NaN
Div5LongestGTime NaN NaN NaN NaN
Div5WheelsOff NaN NaN NaN NaN
Div5TailNum NaN NaN NaN NaN
Unnamed: 109 NaN NaN NaN NaN

110 rows × 4 columns


In [9]:
memory_usage()


Out[9]:
'[Process 7762 uses 127.1MB]'

In [10]:
%%time
df = pd.read_csv('flights-2016-01.xz', dialect="excel")


<string>:2: DtypeWarning: Columns (77) have mixed types. Specify dtype option on import or set low_memory=False.
CPU times: user 5.48 s, sys: 1.36 s, total: 6.83 s
Wall time: 6.83 s

In [11]:
memory_usage()


Out[11]:
'[Process 7762 uses 855.6MB]'

In [12]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445827 entries, 0 to 445826
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(71), int64(21), object(18)
memory usage: 374.2+ MB

In [15]:
df.memory_usage(deep=True).sum() / 1024 / 1024


Out[15]:
745.24831008911133

In [16]:
import textwrap
print('\n'.join(textwrap.wrap(', '.join(df.columns), 60)))


Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate,
UniqueCarrier, AirlineID, Carrier, TailNum, FlightNum,
OriginAirportID, OriginAirportSeqID, OriginCityMarketID,
Origin, OriginCityName, OriginState, OriginStateFips,
OriginStateName, OriginWac, DestAirportID, DestAirportSeqID,
DestCityMarketID, Dest, DestCityName, DestState,
DestStateFips, DestStateName, DestWac, CRSDepTime, DepTime,
DepDelay, DepDelayMinutes, DepDel15, DepartureDelayGroups,
DepTimeBlk, TaxiOut, WheelsOff, WheelsOn, TaxiIn,
CRSArrTime, ArrTime, ArrDelay, ArrDelayMinutes, ArrDel15,
ArrivalDelayGroups, ArrTimeBlk, Cancelled, CancellationCode,
Diverted, CRSElapsedTime, ActualElapsedTime, AirTime,
Flights, Distance, DistanceGroup, CarrierDelay,
WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay,
FirstDepTime, TotalAddGTime, LongestAddGTime,
DivAirportLandings, DivReachedDest, DivActualElapsedTime,
DivArrDelay, DivDistance, Div1Airport, Div1AirportID,
Div1AirportSeqID, Div1WheelsOn, Div1TotalGTime,
Div1LongestGTime, Div1WheelsOff, Div1TailNum, Div2Airport,
Div2AirportID, Div2AirportSeqID, Div2WheelsOn,
Div2TotalGTime, Div2LongestGTime, Div2WheelsOff,
Div2TailNum, Div3Airport, Div3AirportID, Div3AirportSeqID,
Div3WheelsOn, Div3TotalGTime, Div3LongestGTime,
Div3WheelsOff, Div3TailNum, Div4Airport, Div4AirportID,
Div4AirportSeqID, Div4WheelsOn, Div4TotalGTime,
Div4LongestGTime, Div4WheelsOff, Div4TailNum, Div5Airport,
Div5AirportID, Div5AirportSeqID, Div5WheelsOn,
Div5TotalGTime, Div5LongestGTime, Div5WheelsOff,
Div5TailNum, Unnamed: 109

In [ ]:
%%time
def load_months(months):
    dfs = [ 
        pd.read_csv('flights-%s.xz' % month, dialect="excel")
            for month in months 
          ]
    return pd.concat(dfs)

In [18]:
df = load_months(['2015-12','2016-01','2016-02'])


/home/stephen/miniconda3/envs/py36/lib/python3.6/site-packages/IPython/core/magics/execution.py:1185: DtypeWarning: Columns (48,76,77,84,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code, glob, local_ns)
/home/stephen/miniconda3/envs/py36/lib/python3.6/site-packages/IPython/core/magics/execution.py:1185: DtypeWarning: Columns (77) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code, glob, local_ns)
/home/stephen/miniconda3/envs/py36/lib/python3.6/site-packages/IPython/core/magics/execution.py:1185: DtypeWarning: Columns (77,84) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code, glob, local_ns)
CPU times: user 18.4 s, sys: 6.66 s, total: 25.1 s
Wall time: 1min 26s

In [19]:
memory_usage()


Out[19]:
'[Process 7762 uses 1379.6MB]'

In [20]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1348946 entries, 0 to 423888
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(69), int64(21), object(20)
memory usage: 1.1+ GB

In [21]:
df.memory_usage(deep=True).sum() / 1024 / 1024


Out[21]:
2326.9082660675049

In [ ]: