In [1]:
import dask.dataframe as dd
from dask.distributed import Client

c = Client('localhost:8786')

In [2]:
c.restart()


Out[2]:

Client

Cluster

  • Workers: 4
  • Cores: 4
  • Memory: 10.03 GB

In [3]:
import numpy as np
import pandas as pd

In [4]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib
import matplotlib.pyplot as plt

import seaborn.apionly as sns
from matplotlib import rcParams

import matplotlib.dates as mdates
from matplotlib.colors import SymLogNorm as symlog

In [5]:
plt.style.use('seaborn-darkgrid')

rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 10
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'
rcParams['axes.titlesize'] = 11

rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150

rcParams['grid.color'] = 'w'
rcParams['grid.alpha'] = 0.7

rcParams['xtick.direction'] = 'out'
rcParams['xtick.major.size'] = 3
rcParams['ytick.direction'] = 'out'
rcParams['ytick.major.size'] = 3




import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaOblique.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaBold.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
                             'normal:weight=normal:stretch=normal:file='
                             '{0}/Helvetica.ttf:' +
                             fontsize2
                             ).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))

Pickups


In [6]:
pickups = dd.read_parquet('/data/NYC_taxi_2009-2016.parquet', engine='arrow', 
                        columns=['pickup_datetime', 'pickup_taxizone_id', 'trip_type'])
pickups = pickups[pickups.trip_type != 'uber']

In [7]:
pickups = pickups.drop('trip_type', axis=1)

In [8]:
pickups.columns = ['t', 'id']

In [9]:
# Binned into 6-minute (0.1 hour) intervals for easier aggregation
pickups['timeofday'] = (pickups.t.dt.hour + (pickups.t.dt.minute // 6)/10)
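
The expression above bins each timestamp into one of 240 six-minute slots per day, encoded as a decimal hour (09:00-09:05 -> 9.0, 09:06-09:11 -> 9.1, and so on). A quick standalone check of that binning on a throwaway pandas Series (not the Dask column):

import pandas as pd

t_check = pd.Series(pd.to_datetime(['2016-03-01 09:03', '2016-03-01 09:07', '2016-03-01 23:59']))
print((t_check.dt.hour + (t_check.dt.minute // 6) / 10).tolist())  # -> [9.0, 9.1, 23.9]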

In [10]:
pickups['dayofweek'] = pickups.t.dt.dayofweek
# pickups['weekofyear'] = pickups.t.dt.weekofyear
pickups['weeks_since_2009'] = (pickups.t - np.datetime64('2009-01-01 00:00:00').astype('<M8[ns]')).astype('timedelta64[W]').astype('i4')
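
weeks_since_2009 is the trip's age in whole weeks relative to 2009-01-01: the epoch is subtracted and the resulting timedelta truncated to week resolution. A minimal standalone sketch of the same casting chain, on an in-memory Series rather than the Dask column (t_check and epoch are throwaway names):

import numpy as np
import pandas as pd

t_check = pd.Series(pd.to_datetime(['2009-01-01', '2009-01-08', '2016-12-31']))
epoch = np.datetime64('2009-01-01 00:00:00').astype('<M8[ns]')
weeks = (t_check - epoch).values.astype('timedelta64[W]').astype('i4')
print(weeks)  # -> [  0   1 417], the same week index seen in the tables below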

In [11]:
def isbusday(d):
    return np.is_busday(d.t.values.astype('<M8[D]')).astype(int)

pickups['isbusday'] = pickups.map_partitions(isbusday, meta=(None, int))
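
map_partitions runs the helper once per pandas partition, and meta=(None, int) tells Dask to expect an unnamed integer result without evaluating any data. np.is_busday only accepts day-resolution datetimes, hence the cast to '<M8[D]'. A toy in-memory check of the same logic (isbusday_check and toy are throwaway names; note that by default only weekends count as non-business days, not holidays):

import numpy as np
import pandas as pd

def isbusday_check(d):
    # same logic as isbusday above
    return np.is_busday(d.t.values.astype('<M8[D]')).astype(int)

toy = pd.DataFrame({'t': pd.to_datetime(['2016-12-30',    # Friday
                                         '2016-12-31',    # Saturday
                                         '2017-01-02'])}) # Monday
print(isbusday_check(toy))  # -> [1 0 1]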

In [12]:
# pickups = pickups[(pickups.isbusday == 1) | (pickups.dayofweek > 4)]

In [13]:
pickups.tail()


Out[13]:
                           t   id  timeofday  dayofweek  weeks_since_2009  isbusday
1432504  2016-12-31 23:59:57   36       23.9          5               417         0
1432505  2016-12-31 23:59:58   76       23.9          5               417         0
1432506  2016-12-31 23:59:58  168       23.9          5               417         0
1432507  2016-12-31 23:59:58  144       23.9          5               417         0
1432508  2016-12-31 23:59:59  135       23.9          5               417         0

In [14]:
heatmap_all_pickups = (pickups[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']]
                       .groupby(['weeks_since_2009', 'dayofweek', 'timeofday']).count()
                       .groupby(['dayofweek', 'timeofday']).mean()
                       .compute().reset_index())
heatmap_all_pickups_std = (pickups[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']]
                           .groupby(['weeks_since_2009', 'dayofweek', 'timeofday']).count()
                           .compute()
                           .groupby(['dayofweek', 'timeofday']).std().reset_index())
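
The aggregation above is two-stage: count trips per (week, weekday, 6-minute bin), then average (or take the standard deviation of) those weekly counts across all ~418 weeks, giving an average-week profile. A pure-pandas illustration of the same pattern on synthetic data (fake, weekly_counts and mean_profile are throwaway names):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
fake = pd.DataFrame({
    'id': rng.randint(1, 264, size=10000),
    'weeks_since_2009': rng.randint(0, 4, size=10000),
    'dayofweek': rng.randint(0, 7, size=10000),
    'timeofday': rng.randint(0, 240, size=10000) / 10.0,
})
weekly_counts = fake.groupby(['weeks_since_2009', 'dayofweek', 'timeofday']).count()   # trips per bin, per week
mean_profile = weekly_counts.groupby(['dayofweek', 'timeofday']).mean().reset_index()  # average week
print(mean_profile.head())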

In [15]:
heatmap_all_pickups.sort_values(['dayofweek', 'timeofday'], inplace=True)
heatmap_all_pickups_std.sort_values(['dayofweek', 'timeofday'], inplace=True)

Dropoffs


In [16]:
dropoffs = dd.read_parquet('/data/all_trips_spark.parquet', engine='arrow', 
                        columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
dropoffs = dropoffs[dropoffs.trip_type != 'uber']

In [17]:
dropoffs = dropoffs.drop('trip_type', axis=1)

In [18]:
dropoffs.columns = ['t', 'id']

In [19]:
# Binned into 6-minute (0.1 hour) intervals for easier aggregation
dropoffs['timeofday'] = (dropoffs.t.dt.hour + (dropoffs.t.dt.minute // 6)/10)

In [20]:
dropoffs['dayofweek'] = dropoffs.t.dt.dayofweek
# pickups['weekofyear'] = pickups.t.dt.weekofyear
dropoffs['weeks_since_2009'] = (dropoffs.t - np.datetime64('2009-01-01 00:00:00').astype('<M8[ns]')).astype('timedelta64[W]').astype('i4')

In [21]:
dropoffs = dropoffs[dropoffs.weeks_since_2009 > 0]

In [22]:
def isbusday(d):
    return np.is_busday(d.t.values.astype('<M8[D]')).astype(int)

dropoffs['isbusday'] = dropoffs.map_partitions(isbusday, meta=(None, int))

In [23]:
# dropoffs = dropoffs[(dropoffs.isbusday == 1) | (dropoffs.dayofweek > 4)]

In [24]:
dropoffs.tail()


Out[24]:
                           t   id  timeofday  dayofweek  weeks_since_2009  isbusday
1432504  2017-01-01 00:07:47   36        0.1          6               417         0
1432505  2017-01-01 00:15:29   63        0.2          6               417         0
1432506  2017-01-01 00:39:07  161        0.6          6               417         0
1432507  2017-01-01 00:03:50  209        0.0          6               417         0
1432508  2017-01-01 00:14:30  134        0.2          6               417         0

In [25]:
heatmap_all_dropoffs = (dropoffs[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']]
                        .groupby(['weeks_since_2009', 'dayofweek', 'timeofday']).count()
                        .groupby(['dayofweek', 'timeofday']).mean()
                        .compute().reset_index())
heatmap_all_dropoffs_std = (dropoffs[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']]
                            .groupby(['weeks_since_2009', 'dayofweek', 'timeofday']).count()
                            .compute()
                            .groupby(['dayofweek', 'timeofday']).std().reset_index())

In [26]:
heatmap_all_dropoffs.sort_values(['dayofweek', 'timeofday'], inplace=True)
heatmap_all_dropoffs_std.sort_values(['dayofweek', 'timeofday'], inplace=True)

Plotting


In [148]:
!mkdir 04_temporal_figures

In [27]:
pickup_counts = np.zeros((7, 240), dtype=np.float32) 
dropoff_counts = np.zeros((7, 240), dtype=np.float32) 

pickup_counts_std = np.zeros((7, 240), dtype=np.float32) 
dropoff_counts_std = np.zeros((7, 240), dtype=np.float32) 

for weekday in range(7):
    pickup_counts[weekday, :] = heatmap_all_pickups[heatmap_all_pickups.dayofweek == weekday].id.values * 10 # convert units into counts per hour
    dropoff_counts[weekday, :] = heatmap_all_dropoffs[heatmap_all_dropoffs.dayofweek == weekday].id.values * 10 # same
    
    pickup_counts_std[weekday, :] = heatmap_all_pickups_std[heatmap_all_pickups_std.dayofweek == weekday].id.values * 10 # same, for the weekly standard deviations
    dropoff_counts_std[weekday, :] = heatmap_all_dropoffs_std[heatmap_all_dropoffs_std.dayofweek == weekday].id.values * 10 # same

In [28]:
pickup_counts = np.roll(pickup_counts.ravel(), -60).reshape(pickup_counts.shape)
dropoff_counts = np.roll(dropoff_counts.ravel(), -60).reshape(dropoff_counts.shape)
pickup_counts_std = np.roll(pickup_counts_std.ravel(), -60).reshape(pickup_counts_std.shape)
dropoff_counts_std = np.roll(dropoff_counts_std.ravel(), -60).reshape(dropoff_counts_std.shape)
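
Each row holds 240 six-minute bins for one weekday; rolling the flattened 7 x 240 array by -60 bins (6 hours) re-phases the week so that every row starts at 6 AM rather than midnight, which is why the hour tick labels in the plots below are rolled by -6 as well. A tiny check of that shift on a synthetic array:

import numpy as np

hours = np.tile(np.repeat(np.arange(24), 10), (7, 1)).astype(float)  # cell value = hour of day
shifted = np.roll(hours.ravel(), -60).reshape(hours.shape)
print(hours[0, :3], '->', shifted[0, :3])  # [0. 0. 0.] -> [6. 6. 6.]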

In [29]:
from skimage import exposure
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator

In [30]:
# p2, p98 = np.percentile(pickup_counts/pickup_counts.sum(), (2, 98))
# img_rescale = exposure.rescale_intensity(counts/counts.sum(), in_range=(p2, p98))
pickups_rescaled = exposure.equalize_hist(pickup_counts)
dropoffs_rescaled = exposure.equalize_hist(dropoff_counts)


pickups_rescaled_std = exposure.equalize_hist(pickup_counts_std)
dropoffs_rescaled_std = exposure.equalize_hist(dropoff_counts_std)

# pickups_rescaled = pickup_counts
# dropoffs_rescaled = dropoff_counts
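
equalize_hist maps each count to (approximately) its empirical CDF value, so the heavily skewed counts are spread over (0, 1] and the colormap is not dominated by a handful of rush-hour bins; that is also why the colorbar below is relabeled with percentiles of the raw counts placed at the matching percentiles of the equalized image. A standalone sketch of the effect on skewed fake data (counts_fake, eq and ranks are throwaway names):

import numpy as np
from skimage import exposure

counts_fake = np.random.RandomState(1).lognormal(mean=8, sigma=1, size=(7, 240))
eq = exposure.equalize_hist(counts_fake)
ranks = counts_fake.ravel().argsort().argsort()           # rank of each bin's count
print(eq.min(), eq.max())                                 # compressed into (0, 1]
print(np.round(np.corrcoef(eq.ravel(), ranks)[0, 1], 3))  # ~1.0: rank order is preserved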

In [150]:
plt.imshow(pickups_rescaled, aspect='auto', interpolation='none', cmap=plt.cm.inferno)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')
cb.set_ticks(np.percentile(pickups_rescaled.ravel(), q=np.arange(5, 100, 10)))
cb.set_ticklabels((np.percentile(pickup_counts.ravel(), q=np.arange(5, 100, 10)).round(-2)).astype(int).astype(str))

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.arange(0, 240, 10))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='w', lw=0.4)
for i in range(0, 240, 30):
    plt.axvline(i, color='w', lw=0.4)
    
    
plt.title("NYC Taxi Weekly Heatmap (2009-2016 Average)")
plt.savefig('04_temporal_figures/weekly_pickups_heatmap.png', dpi=300)



In [112]:
plt.imshow(dropoffs_rescaled, aspect='auto', interpolation='none', cmap=plt.cm.inferno)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')
cb.set_ticks(np.percentile(dropoffs_rescaled.ravel(), q=np.arange(5, 100, 10)))
cb.set_ticklabels((np.percentile(dropoff_counts.ravel(), q=np.arange(5, 100, 10)).round(-2)).astype(int).astype(str))

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.arange(0, 240, 10))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='w', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='w', lw=0.4)
plt.xlim(0, 240)


Out[112]:
(0, 240)

In [151]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, 0.001*(pickup_counts)[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc='lower center')
plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))
plt.ylim(0, 37)
plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups Per Hour (Thousands)")
plt.title("NYC Weekly Taxi Trips (2009-2016 Average)")

plt.savefig('04_temporal_figures/weekly_pickups_lines.png', dpi=300)



In [152]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    if i == 1:
        plt.plot(xv, 0.001*(pickup_counts)[i], label=day + " pickups", color=palette[i])
        plt.plot(xv, 0.001*(dropoff_counts)[i], label=day + " dropoffs", color=palette[i], linestyle='--')

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 08:00:00', '2009-01-01 18:00:00')
plt.legend(loc='upper center')
plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 08:00:00', '2009-01-01 18:00:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 08:00:00', '2009-01-01 18:00:00', freq='H').strftime('%H'))
plt.ylim(15, 30)
plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups or Dropoffs Per Hour (Thousands)")
plt.title("NYC Weekly Taxi Trips (2009-2016 Average)")

plt.savefig('04_temporal_figures/weekly_pickups_lines_tuesday.png', dpi=300)



In [34]:
import scipy.stats
scipy.stats.describe((pickup_counts - dropoff_counts).ravel())


Out[34]:
DescribeResult(nobs=1680, minmax=(-2293.6172, 4044.457), mean=286.418, variance=1023736.9, skewness=0.6254827380180359, kurtosis=0.41857570138961364)

In [35]:
plt.imshow(pickup_counts - dropoff_counts, aspect='auto', cmap=plt.cm.RdBu, vmin=-4041, vmax=4041)
plt.gcf().set_size_inches(8, 5)

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.arange(0, 240, 10))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='k', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='k', lw=0.4)
plt.xlim(0, 240)


Out[35]:
(0, 240)

In [115]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, (pickup_counts-dropoff_counts)[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc='upper right')

plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))

plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups - Dropoffs [Trips / Hour]")


Out[115]:
<matplotlib.text.Text at 0x7f6541285a90>

Pickup Trends


In [37]:
heatmap_all_pickups_all = (pickups[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']] \
                            .groupby(['dayofweek', 'timeofday', 'weeks_since_2009']).count().compute())

In [39]:
import statsmodels.api as sm


/home/shekhar/anaconda3/envs/py35/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [81]:
grouped = heatmap_all_pickups_all.reset_index().groupby(['dayofweek', 'timeofday'])

rows = []

for group in grouped.groups.keys():
    df = grouped.get_group(group)
    df['constant']=1
    X = df[['weeks_since_2009','constant']]
    y = df['id']
    model = sm.OLS(y,X)
    results = model.fit()
    
    rows.append(group + (results.params.weeks_since_2009, results.params.constant, 
                         results.rsquared_adj, results.pvalues.weeks_since_2009))


/home/shekhar/anaconda3/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
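
Each (dayofweek, timeofday) group gets its own ordinary least squares fit of weekly trip counts against the week index, so the fitted slope is in trips per 6-minute bin, per week; multiplying by 10 * 52 below converts it to trips per hour, per year. A self-contained sketch of one such fit on synthetic data (all variable names here are throwaway):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(2)
weeks = np.arange(418)
counts = 1500 + 2.0 * weeks + rng.normal(0, 100, size=weeks.size)  # mild upward trend

X = pd.DataFrame({'weeks_since_2009': weeks, 'constant': 1.0})
results = sm.OLS(counts, X).fit()
slope = results.params['weeks_since_2009']          # trips per bin, per week
print(slope, slope * 10 * 52)                        # ~2, and ~1040 trips/hour/year
print(results.rsquared_adj, results.pvalues['weeks_since_2009'])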

In [90]:
df_pickup_trend_slopes = pd.DataFrame.from_records(
    rows, columns=['dayofweek', 'timeofday', 'slope', 'intercept', 'rsq_adj', 'pvalue']
).sort_values(['dayofweek', 'timeofday'])

In [91]:
pickup_trends = np.zeros((7, 240), dtype=np.float32) 
pickup_trends_rsq_adj = np.zeros((7, 240), dtype=np.float32)
pickup_trends_pvalue = np.zeros((7, 240), dtype=np.float32)

for weekday in range(7):
    pickup_trends[weekday, :] = df_pickup_trend_slopes[df_pickup_trend_slopes.dayofweek == weekday].slope.values * 10 * 52 # convert slope to change in trips per hour, per year
    pickup_trends_rsq_adj[weekday, :] = df_pickup_trend_slopes[df_pickup_trend_slopes.dayofweek == weekday].rsq_adj.values
    pickup_trends_pvalue[weekday, :] = df_pickup_trend_slopes[df_pickup_trend_slopes.dayofweek == weekday].pvalue.values
    
pickup_trends = np.roll(pickup_trends.ravel(), -60).reshape(pickup_trends.shape)
pickup_trends_rsq_adj = np.roll(pickup_trends_rsq_adj.ravel(), -60).reshape(pickup_trends_rsq_adj.shape)
pickup_trends_pvalue = np.roll(pickup_trends_pvalue.ravel(), -60).reshape(pickup_trends_pvalue.shape)

In [164]:
import scipy.ndimage

pickup_trends_smoothed = scipy.ndimage.filters.gaussian_filter1d(
    pickup_trends.ravel(), sigma=3, axis=-1, mode='wrap').reshape(pickup_trends.shape)
pickup_counts_smoothed = scipy.ndimage.filters.gaussian_filter1d(
    pickup_counts.ravel(), sigma=3, axis=-1, mode='wrap').reshape(pickup_counts.shape)
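
The trend and count profiles are smoothed along the flattened weekly axis with a Gaussian kernel (sigma = 3 bins, about 18 minutes) using mode='wrap', so the end of Sunday blends into the start of Monday instead of being truncated at the array edge. A small check of the wrap behaviour on a synthetic array:

import numpy as np
import scipy.ndimage

x = np.zeros(240)
x[0] = 1.0  # spike in the very first bin
smooth_wrap = scipy.ndimage.gaussian_filter1d(x, sigma=3, mode='wrap')
smooth_edge = scipy.ndimage.gaussian_filter1d(x, sigma=3, mode='nearest')
print(smooth_wrap[-1] > 0, smooth_edge[-1] > 0)  # True False: only 'wrap' crosses the boundary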

In [162]:
plt.imshow(pickup_trends_smoothed, aspect='auto', interpolation='none', cmap=plt.cm.RdBu, vmin=-600, vmax=600)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.roll(np.arange(0, 240, 10), -0))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='k', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='k', lw=0.4)
plt.xlim(0, 240)


Out[162]:
(0, 240)

In [163]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, (pickup_trends_smoothed)[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc='lower right')

plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))

plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups Per Hour")


Out[163]:
<matplotlib.text.Text at 0x7f6539b487f0>

In [165]:
percentage_change = pickup_trends_smoothed/pickup_counts_smoothed*100
# percentage_change[pickup_trends_pvalue > 0.05] = 0.

plt.imshow(percentage_change, aspect='auto', interpolation='none', cmap=plt.cm.RdBu, vmin=-3.4, vmax=3.4)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.roll(np.arange(0, 240, 10), -0))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='k', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='k', lw=0.4)
plt.xlim(0, 240)


Out[165]:
(0, 240)

In [172]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, percentage_change[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc=(0.6, 0.6))
# plt.gcf().autofmt_xdate()
plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))
# plt.ylim(0, 37000)
plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups Per Hour")


Out[172]:
<matplotlib.text.Text at 0x7f6531f2a518>

Dropoff Trends


In [173]:
heatmap_all_dropoffs_all = (dropoffs[['id', 'timeofday', 'dayofweek', 'weeks_since_2009']] \
                            .groupby(['dayofweek', 'timeofday', 'weeks_since_2009']).count().compute())

In [174]:
import statsmodels.api as sm

In [192]:
grouped2 = heatmap_all_dropoffs_all.reset_index().groupby(['dayofweek', 'timeofday'])

rows2 = []

for group in grouped2.groups.keys():
    df = grouped2.get_group(group)
    df['constant']=1
    X = df[['weeks_since_2009','constant']]
    y = df['id']
    model = sm.OLS(y,X)
    results = model.fit()
    
    rows2.append(group + (results.params.weeks_since_2009, results.params.constant, 
                         results.rsquared_adj, results.pvalues.weeks_since_2009))


/home/shekhar/anaconda3/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys

In [193]:
d0 = grouped2.get_group((6, 6.5)).reset_index(drop=True)
# d0 = d0[d0.weeks_since_2009>-2]
sns.regplot(d0.weeks_since_2009, d0.id)


Out[193]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6530a67588>

In [194]:
df_dropoff_trend_slopes = pd.DataFrame.from_records(
    rows2, columns=['dayofweek', 'timeofday', 'slope', 'intercept', 'rsq_adj', 'pvalue']
).sort_values(['dayofweek', 'timeofday'])

In [195]:
dropoff_trends = np.zeros((7, 240), dtype=np.float32) 
dropoff_trends_rsq_adj = np.zeros((7, 240), dtype=np.float32)
dropoff_trends_pvalue = np.zeros((7, 240), dtype=np.float32)

for weekday in range(7):
    dropoff_trends[weekday, :] = df_dropoff_trend_slopes[df_dropoff_trend_slopes.dayofweek == weekday].slope.values * 10 * 52 # convert slope to change in trips per hour, per year
    dropoff_trends_rsq_adj[weekday, :] = df_dropoff_trend_slopes[df_dropoff_trend_slopes.dayofweek == weekday].rsq_adj.values
    dropoff_trends_pvalue[weekday, :] = df_dropoff_trend_slopes[df_dropoff_trend_slopes.dayofweek == weekday].pvalue.values
    
dropoff_trends = np.roll(dropoff_trends.ravel(), -60).reshape(dropoff_trends.shape)
dropoff_trends_rsq_adj = np.roll(dropoff_trends_rsq_adj.ravel(), -60).reshape(dropoff_trends_rsq_adj.shape)
dropoff_trends_pvalue = np.roll(dropoff_trends_pvalue.ravel(), -60).reshape(dropoff_trends_pvalue.shape)

In [196]:
dropoff_trends_smoothed = scipy.ndimage.filters.gaussian_filter1d(
    dropoff_trends.ravel(), sigma=2, axis=-1, mode='wrap').reshape(dropoff_trends.shape)
dropoff_counts_smoothed = scipy.ndimage.filters.gaussian_filter1d(
    dropoff_counts.ravel(), sigma=2, axis=-1, mode='wrap').reshape(dropoff_counts.shape)

In [197]:
plt.imshow(dropoff_trends_smoothed, aspect='auto', interpolation='none', cmap=plt.cm.RdBu, vmin=-700, vmax=700)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.roll(np.arange(0, 240, 10), -0))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='k', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='k', lw=0.4)
plt.xlim(0, 240)


Out[197]:
(0, 240)

In [198]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, (dropoff_trends_smoothed)[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc='lower right')

plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))

plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Pickups Per Hour")


Out[198]:
<matplotlib.text.Text at 0x7f653d7129e8>

In [199]:
percentage_change_dropoff = dropoff_trends_smoothed/dropoff_counts_smoothed*100

plt.imshow(percentage_change_dropoff, aspect='auto', interpolation='none', cmap=plt.cm.RdBu, vmin=-3.2, vmax=3.2)
plt.gcf().set_size_inches(8, 5)
cb = plt.colorbar(orientation='vertical')

plt.gca().yaxis.set_ticklabels(['', 'Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun'])

plt.xticks(np.roll(np.arange(0, 240, 10), -0))
plt.gca().xaxis.set_ticklabels(np.roll([str(x) for x in range(0, 24)], -6))
plt.xlabel("Hour")
plt.ylabel("Weekday")
plt.grid(False, which='both')

for i in range(7):
    plt.axhline(i-0.5, color='k', lw=0.4)
for i in range(0, 241, 30):
    plt.axvline(i, color='k', lw=0.4)
plt.xlim(0, 240)


Out[199]:
(0, 240)

In [201]:
xv = pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='6T').values

palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#2ecc71", "#34495e", "#000000"]

for i,day in enumerate(['Mon', 'Tue',  "Wed", 'Thu', 'Fri',  'Sat', 'Sun']):
    plt.plot(xv, percentage_change_dropoff[i], label=day, color=palette[i])

plt.gcf().set_size_inches(8, 5)
plt.xlim('2009-01-01 06:00:00', '2009-01-02 05:54:00')
plt.legend(loc=(0.6, 0.6))
# plt.gcf().autofmt_xdate()
plt.gca().xaxis.set_ticks(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').values)
plt.gca().xaxis.set_ticklabels(pd.date_range('2009-01-01 06:00:00', '2009-01-02 05:54:00', freq='H').strftime('%H'))
# plt.ylim(0, 37000)
plt.axhline(0, color='k', lw=0.5)

plt.xlabel("Hour")
plt.ylabel("Dropoffs Per Hour")


Out[201]:
<matplotlib.text.Text at 0x7f6530305400>

In [ ]: