In [1]:

    
%matplotlib inline
%config InlineBackend.figure_format='retina'

import dask.dataframe as dd
import dask.distributed
import numpy as np
import pandas as pd
# import geopandas as gpd

from matplotlib.colors import SymLogNorm as symlog
from matplotlib import rcParams

import sklearn, sklearn.cluster
import matplotlib.pyplot as plt
import palettable

import seaborn as sns

import netCDF4
import geopandas


pd.options.display.max_rows = 300
pd.options.display.max_columns = 100



In [2]:

    
client = dask.distributed.Client()



In [45]:

    
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')



In [4]:

    
rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 12
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'

rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import seaborn as sns

import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaOblique.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaBold.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
                             'normal:weight=normal:stretch=normal:file='
                             '{0}/Helvetica.ttf:' +
                             fontsize2
                             ).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))

Section 1: Sample 10 random rows from the dataset, show with human readable names for blog post as HTML



In [5]:

    
df = dd.read_parquet('/data/all_trips.parquet', index='trip_id', 
    columns='pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id'.split())









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '



In [6]:

    
df2 = df.sample(frac=1.0e-6, random_state=42).compute()



In [7]:

    
df2 = df2.dropna()



In [8]:

    
df3 = df2.merge(
    tzdf['LocationID borough zone'.split()], left_on='pickup_taxizone_id', right_on='LocationID'
)
df3['pickup_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)

df3 = df3.merge(
    tzdf['LocationID borough zone'.split()], left_on='dropoff_taxizone_id', right_on='LocationID'
)
df3['dropoff_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)



In [9]:

    
df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html().replace("""\n""", "")









    Out[9]:





'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>pickup_datetime</th>      <th>dropoff_datetime</th>      <th>pickup_taxizone_id</th>      <th>dropoff_taxizone_id</th>      <th>pickup_location</th>      <th>dropoff_location</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>2009-04-02 15:02:43</td>      <td>2009-04-02 15:22:45</td>      <td>138</td>      <td>229</td>      <td>Queens | LaGuardia Airport</td>      <td>Manhattan | Sutton Place/Turtle Bay North</td>    </tr>    <tr>      <th>1</th>      <td>2009-05-03 01:34:00</td>      <td>2009-05-03 01:36:00</td>      <td>79</td>      <td>164</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Midtown South</td>    </tr>    <tr>      <th>2</th>      <td>2010-10-09 00:48:38</td>      <td>2010-10-09 01:00:55</td>      <td>79</td>      <td>237</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Upper East Side South</td>    </tr>    <tr>      <th>3</th>      <td>2011-09-07 10:03:00</td>      <td>2011-09-07 10:09:00</td>      <td>113</td>      <td>211</td>      <td>Manhattan | Greenwich Village North</td>      <td>Manhattan | SoHo</td>    </tr>    <tr>      <th>4</th>      <td>2012-07-21 20:34:00</td>      <td>2012-07-21 20:39:00</td>      <td>211</td>      <td>114</td>      <td>Manhattan | SoHo</td>      <td>Manhattan | Greenwich Village South</td>    </tr>    <tr>      <th>5</th>      <td>2013-05-31 17:45:06</td>      <td>2013-05-31 18:41:07</td>      <td>138</td>      <td>25</td>      <td>Queens | LaGuardia Airport</td>      <td>Brooklyn | Boerum Hill</td>    </tr>    <tr>      <th>6</th>      <td>2014-08-08 09:06:15</td>      <td>2014-08-08 09:14:52</td>      <td>230</td>      <td>100</td>      <td>Manhattan | Times Sq/Theatre District</td>      <td>Manhattan | Garment District</td>    </tr>    <tr>      <th>7</th>      <td>2015-03-28 05:51:31</td>      <td>2015-03-28 06:00:05</td>      <td>263</td>      <td>107</td>      <td>Manhattan | Yorkville West</td>      <td>Manhattan | Gramercy</td>    </tr>    <tr>      <th>8</th>      <td>2015-10-11 10:20:04</td>      <td>2015-10-11 10:31:45</td>      <td>75</td>      <td>194</td>      <td>Manhattan | East Harlem South</td>      <td>Manhattan | Randalls Island</td>    </tr>    <tr>      <th>9</th>      <td>2016-07-15 00:57:44</td>      <td>2016-07-15 01:03:16</td>      <td>125</td>      <td>186</td>      <td>Manhattan | Hudson Sq</td>      <td>Manhattan | Penn Station/Madison Sq West</td>    </tr>  </tbody></table>'



In [10]:

    
from IPython.display import HTML
HTML(df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html())









    Out[10]:





  
    
      
      pickup_datetime
      dropoff_datetime
      pickup_taxizone_id
      dropoff_taxizone_id
      pickup_location
      dropoff_location
    
  
  
    
      0
      2009-04-02 15:02:43
      2009-04-02 15:22:45
      138
      229
      Queens | LaGuardia Airport
      Manhattan | Sutton Place/Turtle Bay North
    
    
      1
      2009-05-03 01:34:00
      2009-05-03 01:36:00
      79
      164
      Manhattan | East Village
      Manhattan | Midtown South
    
    
      2
      2010-10-09 00:48:38
      2010-10-09 01:00:55
      79
      237
      Manhattan | East Village
      Manhattan | Upper East Side South
    
    
      3
      2011-09-07 10:03:00
      2011-09-07 10:09:00
      113
      211
      Manhattan | Greenwich Village North
      Manhattan | SoHo
    
    
      4
      2012-07-21 20:34:00
      2012-07-21 20:39:00
      211
      114
      Manhattan | SoHo
      Manhattan | Greenwich Village South
    
    
      5
      2013-05-31 17:45:06
      2013-05-31 18:41:07
      138
      25
      Queens | LaGuardia Airport
      Brooklyn | Boerum Hill
    
    
      6
      2014-08-08 09:06:15
      2014-08-08 09:14:52
      230
      100
      Manhattan | Times Sq/Theatre District
      Manhattan | Garment District
    
    
      7
      2015-03-28 05:51:31
      2015-03-28 06:00:05
      263
      107
      Manhattan | Yorkville West
      Manhattan | Gramercy
    
    
      8
      2015-10-11 10:20:04
      2015-10-11 10:31:45
      75
      194
      Manhattan | East Harlem South
      Manhattan | Randalls Island
    
    
      9
      2016-07-15 00:57:44
      2016-07-15 01:03:16
      125
      186
      Manhattan | Hudson Sq
      Manhattan | Penn Station/Madison Sq West

Section 2: Calculate Count Matrix for Full Dataset



In [11]:

    
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'dropoff_taxizone_id'])
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '



In [12]:

    
df.head()









    Out[12]:







  
    
      
      pickup_taxizone_id
      dropoff_taxizone_id
    
    
      pickup_datetime
      
      
    
  
  
    
      2009-01-01 00:00:00
      237
      263
    
    
      2009-01-01 00:00:00
      114
      249
    
    
      2009-01-01 00:00:02
      237
      43
    
    
      2009-01-01 00:00:04
      261
      261
    
    
      2009-01-01 00:00:07
      144
      80



In [13]:

    
count_dataframe = df.reset_index().groupby(['pickup_taxizone_id', 'dropoff_taxizone_id']).count().compute()
count_dataframe.columns = ['count']
count_dataframe.shape









    Out[13]:





(60710, 1)



In [14]:

    
count_dataframe.head()









    Out[14]:







  
    
      
      
      count
    
    
      pickup_taxizone_id
      dropoff_taxizone_id
      
    
  
  
    
      1
      1
      101493
    
    
      186
      35
    
    
      266
      37311
    
    
      2
      2
      4297
    
    
      14
      20



In [15]:

    
count_matrix = np.zeros((267, 267), dtype=np.int64)
for r in count_dataframe.reset_index().itertuples():
    count_matrix[r[1], r[2]] = r[3]



In [16]:

    
count_dataframe.describe()









    Out[16]:







  
    
      
      count
    
  
  
    
      count
      6.071000e+04
    
    
      mean
      2.277015e+04
    
    
      std
      1.698617e+05
    
    
      min
      1.000000e+00
    
    
      25%
      9.000000e+00
    
    
      50%
      8.800000e+01
    
    
      75%
      1.258000e+03
    
    
      max
      2.556999e+07



In [17]:

    
count_dataframe.reset_index().head()









    Out[17]:







  
    
      
      pickup_taxizone_id
      dropoff_taxizone_id
      count
    
  
  
    
      0
      1
      1
      101493
    
    
      1
      1
      186
      35
    
    
      2
      1
      266
      37311
    
    
      3
      2
      2
      4297
    
    
      4
      2
      14
      20



In [18]:

    
# <!-- collapse=True -->
plt.imshow(count_matrix[1:-3, 1:-3].T, norm=symlog(10000), origin='upper', cmap=plt.cm.Blues)
plt.grid(False)
plt.xlabel("Dropoff Taxi Zone ID")
plt.ylabel("Pickup Taxi Zone ID")
plt.gcf().set_size_inches(4, 4)

Section 3: Calculate Pickups matrix



In [19]:

    
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '



In [20]:

    
def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')



In [21]:

    
df['pickup_ymd'] = df.map_partitions(get_year_mo_day, 'pickup_datetime', meta=('asdf', np.datetime64))









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)



In [22]:

    
df.reset_index().rename(columns=dict(index='N')).tail()









    Out[22]:







  
    
      
      N
      pickup_taxizone_id
      pickup_ymd
    
  
  
    
      1432504
      2016-12-31 23:59:57
      36
      2016-12-31 23:00:00
    
    
      1432505
      2016-12-31 23:59:58
      76
      2016-12-31 23:00:00
    
    
      1432506
      2016-12-31 23:59:58
      168
      2016-12-31 23:00:00
    
    
      1432507
      2016-12-31 23:59:58
      144
      2016-12-31 23:00:00
    
    
      1432508
      2016-12-31 23:59:59
      135
      2016-12-31 23:00:00



In [23]:

    
pickup_counts_df = df.reset_index().rename(columns=dict(index='N')).groupby(['pickup_taxizone_id', 'pickup_ymd',]).count().compute()
pickup_counts_df.sort_index(inplace=True)



In [24]:

    
pickup_counts_df.head()









    Out[24]:







  
    
      
      
      N
    
    
      pickup_taxizone_id
      pickup_ymd
      
    
  
  
    
      1
      2009-01-01 01:00:00
      1
    
    
      2009-01-01 02:00:00
      1
    
    
      2009-01-01 04:00:00
      2
    
    
      2009-01-01 05:00:00
      1
    
    
      2009-01-01 07:00:00
      1



In [25]:

    
z = pickup_counts_df.unstack(0)



In [26]:

    
z.columns = np.arange(1, 267).astype(str)



In [27]:

    
z = z.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)



In [28]:

    
z.head()









    Out[28]:







  
    
      
      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      ...
      217
      218
      219
      220
      221
      222
      223
      224
      225
      226
      227
      228
      229
      230
      231
      232
      233
      234
      235
      236
      237
      238
      239
      240
      241
      242
      243
      244
      245
      246
      247
      248
      249
      250
      251
      252
      253
      254
      255
      256
      257
      258
      259
      260
      261
      262
      263
      264
      265
      266
    
  
  
    
      2009-01-01 00:00:00
      0
      0
      0
      121
      0
      0
      42
      0
      0
      0
      0
      4
      63
      2
      0
      0
      13
      1
      1
      0
      0
      1
      3
      78
      28
      0
      0
      3
      0
      0
      0
      1
      29
      1
      0
      3
      16
      0
      0
      27
      53
      30
      207
      0
      21
      0
      0
      594
      18
      178
      ...
      2
      0
      0
      0
      0
      0
      18
      71
      5
      35
      1
      1
      379
      57
      328
      70
      211
      544
      1
      384
      459
      407
      439
      0
      3
      0
      10
      32
      0
      211
      1
      2
      409
      0
      0
      0
      0
      0
      74
      53
      4
      2
      0
      6
      70
      179
      449
      0
      0
      369
    
    
      2009-01-01 01:00:00
      1
      0
      0
      137
      0
      0
      88
      0
      0
      0
      0
      4
      63
      2
      0
      1
      11
      1
      0
      0
      0
      0
      5
      84
      51
      0
      1
      0
      0
      0
      0
      0
      78
      2
      0
      8
      22
      0
      0
      37
      79
      41
      103
      0
      39
      0
      0
      440
      37
      184
      ...
      7
      0
      0
      2
      0
      0
      35
      121
      10
      68
      0
      8
      540
      130
      360
      131
      298
      402
      1
      382
      459
      486
      549
      0
      1
      4
      24
      64
      0
      167
      3
      0
      322
      1
      0
      0
      0
      0
      97
      92
      7
      0
      0
      22
      85
      239
      627
      0
      0
      452
    
    
      2009-01-01 02:00:00
      1
      0
      0
      125
      0
      0
      115
      0
      0
      0
      0
      2
      58
      4
      0
      2
      19
      1
      0
      0
      0
      0
      4
      90
      54
      1
      0
      1
      0
      0
      0
      0
      54
      0
      0
      4
      15
      0
      0
      54
      99
      50
      121
      0
      47
      0
      0
      486
      50
      130
      ...
      11
      0
      1
      3
      0
      0
      40
      119
      6
      69
      1
      10
      496
      155
      348
      115
      288
      295
      1
      329
      267
      401
      491
      0
      1
      0
      28
      71
      0
      117
      4
      0
      335
      0
      0
      1
      0
      0
      122
      115
      6
      1
      0
      21
      55
      170
      594
      0
      0
      378
    
    
      2009-01-01 03:00:00
      0
      0
      0
      124
      0
      0
      110
      1
      1
      0
      0
      1
      33
      8
      0
      3
      18
      0
      0
      0
      0
      2
      3
      51
      61
      0
      0
      1
      1
      0
      0
      0
      55
      1
      1
      8
      22
      1
      0
      35
      67
      44
      67
      0
      40
      0
      0
      541
      42
      129
      ...
      9
      0
      0
      5
      0
      0
      34
      74
      4
      53
      2
      9
      349
      303
      322
      80
      144
      348
      1
      220
      169
      199
      339
      0
      0
      1
      15
      61
      0
      115
      2
      0
      261
      0
      1
      0
      0
      0
      80
      98
      5
      1
      0
      28
      70
      96
      437
      0
      0
      313
    
    
      2009-01-01 04:00:00
      2
      0
      0
      100
      0
      0
      105
      0
      1
      0
      0
      0
      19
      7
      0
      4
      8
      2
      3
      1
      0
      1
      0
      41
      43
      1
      0
      1
      0
      0
      0
      0
      27
      3
      0
      7
      24
      0
      0
      20
      54
      28
      34
      0
      38
      0
      1
      390
      27
      120
      ...
      3
      0
      0
      5
      0
      0
      29
      34
      3
      59
      1
      7
      233
      308
      221
      53
      116
      286
      3
      91
      79
      112
      186
      0
      0
      0
      10
      27
      0
      69
      6
      1
      241
      0
      0
      0
      0
      0
      80
      74
      5
      0
      0
      20
      37
      62
      296
      0
      0
      224
    
  

5 rows × 266 columns



In [29]:

    
import fastparquet
fastparquet.write('/data/trips_pickups_matrix.parquet', z, compression='SNAPPY')

Section 4: Calculate Dropoffs matrix



In [30]:

    
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '



In [31]:

    
def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')



In [32]:

    
df['dropoff_ymd'] = df.map_partitions(get_year_mo_day, 'dropoff_datetime', meta=('asdf', np.datetime64))









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)



In [33]:

    
df.reset_index(drop=True).tail()









    Out[33]:







  
    
      
      dropoff_datetime
      dropoff_taxizone_id
      dropoff_ymd
    
  
  
    
      1432504
      2017-01-01 00:07:47
      36
      2016-12-31 23:00:00
    
    
      1432505
      2017-01-01 00:15:29
      63
      2016-12-31 23:00:00
    
    
      1432506
      2017-01-01 00:39:07
      161
      2016-12-31 23:00:00
    
    
      1432507
      2017-01-01 00:03:50
      209
      2016-12-31 23:00:00
    
    
      1432508
      2017-01-01 00:14:30
      134
      2016-12-31 23:00:00



In [34]:

    
dropoff_counts_df = df.reset_index(drop=True).rename(columns=dict(dropoff_datetime='N')).groupby(['dropoff_taxizone_id', 'dropoff_ymd',]).count().compute()
dropoff_counts_df.sort_index(inplace=True)



In [35]:

    
dropoff_counts_df.head()









    Out[35]:







  
    
      
      
      N
    
    
      dropoff_taxizone_id
      dropoff_ymd
      
    
  
  
    
      1
      2009-01-01 01:00:00
      2
    
    
      2009-01-01 02:00:00
      3
    
    
      2009-01-01 03:00:00
      1
    
    
      2009-01-01 04:00:00
      10
    
    
      2009-01-01 05:00:00
      10



In [36]:

    
z2 = dropoff_counts_df.unstack(0)



In [37]:

    
z2 = z2.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)
  warnings.warn(msg, UserWarning)



In [38]:

    
z2.columns = np.arange(1, 267).astype(str)



In [39]:

    
z2.head()









    Out[39]:







  
    
      
      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      ...
      217
      218
      219
      220
      221
      222
      223
      224
      225
      226
      227
      228
      229
      230
      231
      232
      233
      234
      235
      236
      237
      238
      239
      240
      241
      242
      243
      244
      245
      246
      247
      248
      249
      250
      251
      252
      253
      254
      255
      256
      257
      258
      259
      260
      261
      262
      263
      264
      265
      266
    
  
  
    
      2009-01-01 00:00:00
      0
      0
      1
      128
      0
      1
      113
      0
      0
      1
      1
      1
      111
      20
      1
      8
      28
      3
      1
      6
      2
      1
      4
      84
      36
      5
      1
      6
      2
      0
      0
      0
      57
      4
      1
      10
      23
      1
      0
      32
      95
      81
      95
      0
      30
      0
      0
      434
      44
      150
      ...
      7
      3
      1
      9
      0
      0
      53
      111
      16
      56
      9
      17
      352
      50
      281
      75
      291
      366
      3
      432
      253
      407
      404
      1
      10
      2
      43
      82
      0
      188
      0
      3
      306
      3
      0
      2
      0
      0
      85
      75
      13
      6
      3
      36
      111
      282
      450
      0
      0
      379
    
    
      2009-01-01 01:00:00
      2
      0
      2
      154
      0
      1
      188
      2
      1
      0
      3
      1
      87
      25
      4
      5
      28
      4
      1
      1
      2
      4
      5
      96
      47
      4
      1
      3
      0
      0
      1
      1
      104
      5
      3
      8
      25
      0
      0
      37
      140
      91
      97
      0
      39
      1
      2
      402
      49
      147
      ...
      10
      1
      2
      24
      0
      0
      89
      161
      22
      94
      9
      23
      372
      118
      226
      96
      335
      331
      8
      395
      227
      434
      413
      0
      9
      5
      86
      106
      1
      157
      2
      2
      340
      2
      0
      5
      0
      1
      103
      84
      22
      7
      7
      41
      75
      346
      627
      0
      0
      469
    
    
      2009-01-01 02:00:00
      3
      0
      1
      134
      1
      0
      245
      0
      4
      4
      3
      3
      88
      27
      3
      10
      41
      7
      1
      7
      3
      7
      4
      89
      45
      5
      0
      7
      2
      0
      1
      3
      85
      1
      3
      15
      30
      2
      0
      31
      150
      103
      77
      0
      47
      0
      1
      414
      65
      121
      ...
      10
      0
      1
      15
      1
      0
      105
      145
      26
      103
      11
      24
      285
      162
      210
      84
      301
      243
      2
      390
      160
      358
      370
      0
      4
      2
      71
      118
      1
      115
      12
      2
      283
      1
      0
      3
      1
      0
      92
      98
      28
      3
      4
      53
      66
      283
      536
      0
      0
      408
    
    
      2009-01-01 03:00:00
      1
      0
      1
      109
      0
      0
      229
      0
      4
      1
      3
      1
      69
      40
      2
      7
      31
      7
      3
      9
      5
      13
      6
      86
      55
      3
      0
      9
      2
      1
      1
      1
      69
      2
      1
      21
      46
      0
      0
      33
      113
      100
      50
      0
      33
      0
      3
      368
      69
      143
      ...
      9
      0
      1
      19
      0
      1
      107
      113
      16
      84
      13
      33
      245
      197
      171
      85
      215
      204
      3
      267
      136
      245
      251
      0
      6
      1
      71
      105
      0
      90
      5
      2
      241
      0
      1
      5
      0
      2
      59
      81
      24
      8
      5
      56
      89
      233
      448
      0
      0
      334
    
    
      2009-01-01 04:00:00
      10
      0
      2
      103
      0
      0
      221
      1
      3
      2
      2
      1
      50
      38
      1
      5
      32
      4
      2
      2
      4
      6
      1
      56
      46
      2
      0
      9
      1
      0
      2
      2
      46
      2
      3
      23
      56
      1
      1
      25
      97
      79
      23
      0
      28
      0
      0
      258
      48
      98
      ...
      7
      1
      0
      20
      0
      0
      98
      68
      24
      82
      8
      26
      143
      157
      96
      72
      141
      110
      8
      122
      71
      198
      169
      3
      8
      1
      52
      87
      1
      39
      6
      1
      171
      3
      0
      4
      0
      3
      62
      84
      20
      4
      6
      52
      68
      143
      291
      0
      0
      246
    
  

5 rows × 266 columns



In [40]:

    
import fastparquet
fastparquet.write('/data/trips_dropoffs_matrix.parquet', z2, compression='SNAPPY')

Section 5 : Perform PCA



In [86]:

    
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')



In [87]:

    
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()



In [88]:

    
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]



In [135]:

    
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )



In [321]:

    
tzdf.zone[0]









    Out[321]:





'Newark Airport'



In [324]:

    
sns.distplot(counts_matrix.iloc[:, 263+0], kde=False)
sns.distplot(counts_matrix.iloc[:, 0], kde=False)









    Out[324]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f049a3dc710>



In [136]:

    
import sklearn, sklearn.decomposition



In [414]:

    
# pca = sklearn.decomposition.PCA(n_components=20, whiten=True)
# # pca.fit(counts_matrix.resample('1D').sum().values)
# pca.fit(counts_matrix.values)
# pca.explained_variance_ratio_



In [500]:

    
pca = sklearn.decomposition.FastICA(n_components=3, random_state=42, whiten=True)
# pca.fit(counts_matrix.resample('1D').sum().values)
yvals = pca.fit_transform(counts_matrix.values)
# pca.explained_variance_ratio_









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/fastica_.py:116: UserWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  warnings.warn('FastICA did not converge. Consider increasing '



In [501]:

    
yvals.shape









    Out[501]:





(70128, 3)



In [502]:

    
pickup_eof1, dropoff_eof1 = pca.components_[0, :263], pca.components_[0, 263:]
pickup_eof2, dropoff_eof2 = pca.components_[1, :263], pca.components_[1, 263:]
pickup_eof3, dropoff_eof3 = pca.components_[2, :263], pca.components_[2, 263:]
# pickup_eof4, dropoff_eof4 = pca.components_[3, :263], pca.components_[3, 263:]
# pickup_eof5, dropoff_eof5 = pca.components_[4, :263], pca.components_[4, 263:]



In [503]:

    
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
# tzdf['pEOF4'] = pickup_eof4
# tzdf['dEOF4'] = dropoff_eof4
# tzdf['pEOF5'] = pickup_eof5
# tzdf['dEOF5'] = dropoff_eof5



In [504]:

    
tzdf['N_dropoffs'] = dropoffs_matrix.sum(axis=0).values
tzdf['N_pickups'] = pickups_matrix.sum(axis=0).values



In [505]:

    
tzdf['log10_N_dropoffs'] = np.log10(tzdf.N_dropoffs)
tzdf['log10_N_pickups'] = np.log10(tzdf.N_pickups)



In [506]:

    
tzdf = tzdf.to_crs({'init': 'epsg:3857'})



In [507]:

    
tzdf.head()









    Out[507]:







  
    
      
      LocationID
      OBJECTID
      Shape_Area
      Shape_Leng
      borough
      geometry
      zone
      N_dropoffs
      N_pickups
      log10_N_dropoffs
      log10_N_pickups
      N_dropoffs_ranked
      N_pickups_ranked
      pEOF1
      dEOF1
      pEOF2
      dEOF2
      pEOF3
      dEOF3
      pEOF4
      dEOF4
      pEOF5
      dEOF5
    
  
  
    
      0
      1
      1
      0.000782
      0.116357
      EWR
      POLYGON ((-8258175.532737531 4967457.202992616...
      Newark Airport
      1801889
      109695
      6.255728
      5.040187
      0.667939
      0.469466
      -2.717661e-08
      -2.881359e-09
      -1.794189e-08
      -1.433908e-09
      5.950944e-08
      3.195680e-09
      -8.254247e-08
      -5.718085e-10
      -1.040261e-07
      -3.076457e-09
    
    
      1
      2
      2
      0.004866
      0.433470
      Queens
      (POLYGON ((-8217980.621910957 4959237.28547167...
      Jamaica Bay
      10387
      11702
      4.016490
      4.068260
      0.053435
      0.164122
      1.004834e-10
      7.612880e-11
      -4.304754e-11
      -6.442688e-11
      -1.523368e-10
      -1.357631e-10
      3.737542e-10
      5.648617e-10
      3.248668e-10
      4.355836e-10
    
    
      2
      3
      3
      0.000314
      0.084341
      Bronx
      POLYGON ((-8220713.534155379 4993383.154018582...
      Allerton/Pelham Gardens
      96883
      16547
      4.986248
      4.218719
      0.213740
      0.202290
      -4.847524e-11
      4.417117e-11
      1.598215e-10
      -4.446037e-11
      -2.216636e-09
      -5.661792e-11
      -5.797356e-09
      -8.936931e-10
      -2.671187e-09
      -4.799690e-10
    
    
      3
      4
      4
      0.000112
      0.043567
      Manhattan
      POLYGON ((-8234500.226961648 4971984.093397928...
      Alphabet City
      6643997
      4752651
      6.822429
      6.676936
      0.812977
      0.805344
      -7.417994e-08
      2.380392e-07
      -2.521742e-08
      6.128409e-08
      -1.776279e-07
      -2.114199e-07
      3.549393e-08
      -5.165418e-07
      1.502940e-07
      -1.212258e-07
    
    
      4
      5
      5
      0.000498
      0.092146
      Staten Island
      POLYGON ((-8257036.10884249 4948033.094989423,...
      Arden Heights
      7340
      1025
      3.865696
      3.010724
      0.034351
      0.015267
      -4.787781e-11
      7.245953e-12
      6.745541e-12
      -1.782181e-12
      -2.716417e-10
      -1.632698e-11
      -1.791913e-11
      -2.047445e-11
      3.142611e-11
      -1.017828e-11



In [526]:

    
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]



In [527]:

    
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')



In [528]:

    
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_dropoffs_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)



In [529]:

    
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_pickups_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)



In [530]:

    
tzdf2.iloc[:, -10:].describe()









    Out[530]:







  
    
      
      pEOF1
      dEOF1
      pEOF2
      dEOF2
      pEOF3
      dEOF3
      pEOF4
      dEOF4
      pEOF5
      dEOF5
    
  
  
    
      count
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
      2.420000e+02
    
    
      mean
      1.337247e-08
      1.323585e-08
      -3.038017e-08
      -3.044156e-08
      -3.483880e-08
      -3.482810e-08
      -4.837575e-08
      -4.892726e-08
      -3.441977e-08
      -3.494932e-08
    
    
      std
      3.221157e-07
      2.155272e-07
      1.011613e-07
      1.191920e-07
      1.461272e-07
      2.128391e-07
      2.563503e-07
      4.767413e-07
      2.825250e-07
      2.698045e-07
    
    
      min
      -1.041723e-06
      -1.178042e-06
      -6.680994e-07
      -7.350545e-07
      -1.130895e-06
      -1.927351e-06
      -1.568877e-06
      -4.064235e-06
      -2.427254e-06
      -1.581324e-06
    
    
      25%
      -5.394186e-09
      -5.953277e-10
      -3.959468e-09
      -3.293448e-09
      -5.268074e-08
      -1.307326e-08
      -6.383727e-08
      -2.630257e-08
      -2.527213e-08
      -7.778206e-09
    
    
      50%
      -3.556521e-10
      1.076115e-10
      -7.536052e-11
      -9.318704e-11
      -5.885055e-09
      -2.705909e-10
      -1.180033e-08
      -2.135500e-09
      -4.541858e-09
      -5.239243e-10
    
    
      75%
      4.593931e-09
      6.834840e-09
      9.943823e-10
      5.473768e-11
      -1.525962e-09
      4.124035e-12
      -4.379473e-10
      2.358084e-11
      -1.293940e-10
      3.026373e-10
    
    
      max
      3.253512e-06
      1.252968e-06
      2.057265e-07
      3.911548e-07
      8.292906e-07
      7.651686e-07
      1.753640e-06
      1.882881e-06
      1.445455e-06
      1.378802e-06



In [531]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 8), alpha=1, column='pEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [537]:

    
import pysal.esda.mapclassify



In [544]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [533]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [516]:

    
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)



In [517]:

    
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)



In [520]:

    
df4 = pd.DataFrame(data=pca.transform(counts_matrix.values)[:, :3], index=counts_matrix.index)
df4.index = df4.index.rename('timepoints')
df4.rename(columns={i:'pc%d' % (i+1) for i in range(3)}, inplace=True)
# df4.reset_index(inplace=True)



In [534]:

    
df4.plot(lw=1)
plt.xlim('2015-06-22', '2015-06-29')
plt.ylim(-0.02, 0.01)









    Out[534]:





(-0.02, 0.01)



In [522]:

    
df4.plot(lw=0.5)
# plt.xlim('2015-06-22', '2015-06-29')









    Out[522]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f04897ae518>



In [524]:

    
df4.resample('1M').mean().plot()
df4.resample('1M').std().plot()
# plt.xlim('2015-06-22', '2015-06-29')









    Out[524]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f04c4441c88>



In [402]:

    
# df4 = pd.DataFrame(data=pca.transform(counts_matrix.resample('1D').sum().values)[:, :5], index=counts_matrix.resample('1D').sum().index)
# df4.index = df4.index.rename('timepoints')
# df4.rename(columns={i:'pc%d' % i for i in range(5)}, inplace=True)
# # df4.reset_index(inplace=True)



In [403]:

    
df4.plot()
plt.xlim('2014-04-01', '2014-09-01')









    Out[403]:





(16161, 16314)



In [222]:

    
nmf = sklearn.decomposition.NMF(5, random_state=42)



In [223]:

    
nmf.fit(counts_matrix.resample('1D').sum().values)
# nmf.explained_variance_ratio_









    Out[223]:





NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, nls_max_iter=2000, random_state=42, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)



In [224]:

    
nmf.reconstruction_err_









    Out[224]:





291686.29896657454



In [225]:

    
pickup_eof1, dropoff_eof1 = nmf.components_[0, :263], nmf.components_[0, 263:]
pickup_eof2, dropoff_eof2 = nmf.components_[1, :263], nmf.components_[1, 263:]
pickup_eof3, dropoff_eof3 = nmf.components_[2, :263], nmf.components_[2, 263:]
pickup_eof4, dropoff_eof4 = nmf.components_[3, :263], nmf.components_[3, 263:]
pickup_eof5, dropoff_eof5 = nmf.components_[4, :263], nmf.components_[4, 263:]



In [226]:

    
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['dEOF4'] = dropoff_eof4
tzdf['pEOF5'] = pickup_eof5
tzdf['dEOF5'] = dropoff_eof5



In [227]:

    
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]



In [228]:

    
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')



In [235]:

    
tzdf2.iloc[:, -10:].describe()



In [236]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [240]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [241]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [243]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [244]:

    
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [259]:

    
df4 = pd.DataFrame(data=nmf.transform(counts_matrix.resample('1D').sum().values), index=counts_matrix.resample('1D').sum().index)



In [272]:

    
df5 = df4.reset_index()
df5 = df5.rename(columns={'index':'d', 0: 'pc1', 1: 'pc2', 2:'pc3', 3:'pc4', 4:'pc5'})



In [273]:

    
import plotnine as p9



In [311]:

    
(p9.ggplot(df5, p9.aes('d', 'pc1')) + p9.geom_point(color='steelblue', size=.2)) + p9.stat_smooth(
    method='lm',size=1)









    












    Out[311]:





<ggplot: (8728644232519)>



In [300]:

    
(p9.ggplot(df5, p9.aes('d', 'pc2')) + p9.geom_point()) + p9.stat_smooth(method='lowess')









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"






    












    Out[300]:





<ggplot: (8728654057252)>



In [294]:

    
(p9.ggplot(df5, p9.aes('d', 'pc3')) + p9.geom_point()) + p9.stat_smooth(method='lowess')









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"






    












    Out[294]:





<ggplot: (8728655259824)>



In [295]:

    
(p9.ggplot(df5, p9.aes('d', 'pc4')) + p9.geom_point()) + p9.stat_smooth(method='lowess')









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"






    












    Out[295]:





<ggplot: (-9223363308200697137)>



In [296]:

    
(p9.ggplot(df5, p9.aes('d', 'pc5')) + p9.geom_point()) + p9.stat_smooth(method='lowess')









    



/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"






    












    Out[296]:





<ggplot: (-9223363308201012066)>



In [411]:

    
dir(sklearn.decomposition)









    Out[411]:





['DictionaryLearning',
 'FactorAnalysis',
 'FastICA',
 'IncrementalPCA',
 'KernelPCA',
 'LatentDirichletAllocation',
 'MiniBatchDictionaryLearning',
 'MiniBatchSparsePCA',
 'NMF',
 'PCA',
 'ProjectedGradientNMF',
 'RandomizedPCA',
 'SparseCoder',
 'SparsePCA',
 'TruncatedSVD',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_online_lda',
 'base',
 'cdnmf_fast',
 'dict_learning',
 'dict_learning_online',
 'factor_analysis',
 'fastica',
 'fastica_',
 'incremental_pca',
 'kernel_pca',
 'nmf',
 'non_negative_factorization',
 'online_lda',
 'pca',
 'randomized_svd',
 'sparse_encode',
 'sparse_pca',
 'truncated_svd']



In [ ]:

	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
count	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000
mean	33.614941	33.744037	19.446174	19.522582	22.824028	23.029265	12.387393	12.510342	9.107524	8.914849
std	73.643847	77.649309	39.660845	46.353030	30.361697	36.030383	31.689024	33.863347	18.863313	27.253940
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.284592	0.012276	0.344686	0.011539	2.180188	0.232518	0.363029	0.000000	0.284869	0.000000
50%	1.089898	0.155107	1.118604	0.161643	8.617385	4.500133	0.918350	0.077347	1.153524	0.105230
75%	16.729933	7.733491	16.267112	10.980080	33.205927	29.741598	5.186028	2.094543	7.727718	3.108362
max	431.663740	436.021587	292.790759	390.687009	141.827463	168.425644	243.430495	213.360773	110.659220	257.589822

	pickup_datetime	dropoff_datetime	pickup_taxizone_id	dropoff_taxizone_id	pickup_location	dropoff_location
0	2009-04-02 15:02:43	2009-04-02 15:22:45	138	229	Queens \| LaGuardia Airport	Manhattan \| Sutton Place/Turtle Bay North
1	2009-05-03 01:34:00	2009-05-03 01:36:00	79	164	Manhattan \| East Village	Manhattan \| Midtown South
2	2010-10-09 00:48:38	2010-10-09 01:00:55	79	237	Manhattan \| East Village	Manhattan \| Upper East Side South
3	2011-09-07 10:03:00	2011-09-07 10:09:00	113	211	Manhattan \| Greenwich Village North	Manhattan \| SoHo
4	2012-07-21 20:34:00	2012-07-21 20:39:00	211	114	Manhattan \| SoHo	Manhattan \| Greenwich Village South
5	2013-05-31 17:45:06	2013-05-31 18:41:07	138	25	Queens \| LaGuardia Airport	Brooklyn \| Boerum Hill
6	2014-08-08 09:06:15	2014-08-08 09:14:52	230	100	Manhattan \| Times Sq/Theatre District	Manhattan \| Garment District
7	2015-03-28 05:51:31	2015-03-28 06:00:05	263	107	Manhattan \| Yorkville West	Manhattan \| Gramercy
8	2015-10-11 10:20:04	2015-10-11 10:31:45	75	194	Manhattan \| East Harlem South	Manhattan \| Randalls Island
9	2016-07-15 00:57:44	2016-07-15 01:03:16	125	186	Manhattan \| Hudson Sq	Manhattan \| Penn Station/Madison Sq West

	pickup_taxizone_id	dropoff_taxizone_id
pickup_datetime
2009-01-01 00:00:00	237	263
2009-01-01 00:00:00	114	249
2009-01-01 00:00:02	237	43
2009-01-01 00:00:04	261	261
2009-01-01 00:00:07	144	80

		count
pickup_taxizone_id	dropoff_taxizone_id
1	1	101493
	186	35
	266	37311
2	2	4297
2	14	20

	count
count	6.071000e+04
mean	2.277015e+04
std	1.698617e+05
min	1.000000e+00
25%	9.000000e+00
50%	8.800000e+01
75%	1.258000e+03
max	2.556999e+07

	N	pickup_taxizone_id	pickup_ymd
1432504	2016-12-31 23:59:57	36	2016-12-31 23:00:00
1432505	2016-12-31 23:59:58	76	2016-12-31 23:00:00
1432506	2016-12-31 23:59:58	168	2016-12-31 23:00:00
1432507	2016-12-31 23:59:58	144	2016-12-31 23:00:00
1432508	2016-12-31 23:59:59	135	2016-12-31 23:00:00

		N
pickup_taxizone_id	pickup_ymd
1	2009-01-01 01:00:00	1
	2009-01-01 02:00:00	1
	2009-01-01 04:00:00	2
	2009-01-01 05:00:00	1
	2009-01-01 07:00:00	1

	dropoff_datetime	dropoff_taxizone_id	dropoff_ymd
1432504	2017-01-01 00:07:47	36	2016-12-31 23:00:00
1432505	2017-01-01 00:15:29	63	2016-12-31 23:00:00
1432506	2017-01-01 00:39:07	161	2016-12-31 23:00:00
1432507	2017-01-01 00:03:50	209	2016-12-31 23:00:00
1432508	2017-01-01 00:14:30	134	2016-12-31 23:00:00

		N
dropoff_taxizone_id	dropoff_ymd
1	2009-01-01 01:00:00	2
	2009-01-01 02:00:00	3
	2009-01-01 03:00:00	1
	2009-01-01 04:00:00	10
	2009-01-01 05:00:00	10

	LocationID	OBJECTID	Shape_Area	Shape_Leng	borough	geometry	zone	N_dropoffs	N_pickups	log10_N_dropoffs	log10_N_pickups	N_dropoffs_ranked	N_pickups_ranked	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
0	1	1	0.000782	0.116357	EWR	POLYGON ((-8258175.532737531 4967457.202992616...	Newark Airport	1801889	109695	6.255728	5.040187	0.667939	0.469466	-2.717661e-08	-2.881359e-09	-1.794189e-08	-1.433908e-09	5.950944e-08	3.195680e-09	-8.254247e-08	-5.718085e-10	-1.040261e-07	-3.076457e-09
1	2	2	0.004866	0.433470	Queens	(POLYGON ((-8217980.621910957 4959237.28547167...	Jamaica Bay	10387	11702	4.016490	4.068260	0.053435	0.164122	1.004834e-10	7.612880e-11	-4.304754e-11	-6.442688e-11	-1.523368e-10	-1.357631e-10	3.737542e-10	5.648617e-10	3.248668e-10	4.355836e-10
2	3	3	0.000314	0.084341	Bronx	POLYGON ((-8220713.534155379 4993383.154018582...	Allerton/Pelham Gardens	96883	16547	4.986248	4.218719	0.213740	0.202290	-4.847524e-11	4.417117e-11	1.598215e-10	-4.446037e-11	-2.216636e-09	-5.661792e-11	-5.797356e-09	-8.936931e-10	-2.671187e-09	-4.799690e-10
3	4	4	0.000112	0.043567	Manhattan	POLYGON ((-8234500.226961648 4971984.093397928...	Alphabet City	6643997	4752651	6.822429	6.676936	0.812977	0.805344	-7.417994e-08	2.380392e-07	-2.521742e-08	6.128409e-08	-1.776279e-07	-2.114199e-07	3.549393e-08	-5.165418e-07	1.502940e-07	-1.212258e-07
4	5	5	0.000498	0.092146	Staten Island	POLYGON ((-8257036.10884249 4948033.094989423,...	Arden Heights	7340	1025	3.865696	3.010724	0.034351	0.015267	-4.787781e-11	7.245953e-12	6.745541e-12	-1.782181e-12	-2.716417e-10	-1.632698e-11	-1.791913e-11	-2.047445e-11	3.142611e-11	-1.017828e-11

	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
count	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02
mean	1.337247e-08	1.323585e-08	-3.038017e-08	-3.044156e-08	-3.483880e-08	-3.482810e-08	-4.837575e-08	-4.892726e-08	-3.441977e-08	-3.494932e-08
std	3.221157e-07	2.155272e-07	1.011613e-07	1.191920e-07	1.461272e-07	2.128391e-07	2.563503e-07	4.767413e-07	2.825250e-07	2.698045e-07
min	-1.041723e-06	-1.178042e-06	-6.680994e-07	-7.350545e-07	-1.130895e-06	-1.927351e-06	-1.568877e-06	-4.064235e-06	-2.427254e-06	-1.581324e-06
25%	-5.394186e-09	-5.953277e-10	-3.959468e-09	-3.293448e-09	-5.268074e-08	-1.307326e-08	-6.383727e-08	-2.630257e-08	-2.527213e-08	-7.778206e-09
50%	-3.556521e-10	1.076115e-10	-7.536052e-11	-9.318704e-11	-5.885055e-09	-2.705909e-10	-1.180033e-08	-2.135500e-09	-4.541858e-09	-5.239243e-10
75%	4.593931e-09	6.834840e-09	9.943823e-10	5.473768e-11	-1.525962e-09	4.124035e-12	-4.379473e-10	2.358084e-11	-1.293940e-10	3.026373e-10
max	3.253512e-06	1.252968e-06	2.057265e-07	3.911548e-07	8.292906e-07	7.651686e-07	1.753640e-06	1.882881e-06	1.445455e-06	1.378802e-06