In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import dask.dataframe as dd
import dask.distributed
import numpy as np
import pandas as pd
# import geopandas as gpd

from matplotlib.colors import SymLogNorm as symlog
from matplotlib import rcParams

import sklearn, sklearn.cluster
import matplotlib.pyplot as plt
import palettable

import seaborn as sns

import netCDF4
import geopandas


pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [2]:
client = dask.distributed.Client()

In [45]:
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')

In [4]:
rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 12
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'

rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import seaborn as sns

import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaOblique.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaBold.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
                             'normal:weight=normal:stretch=normal:file='
                             '{0}/Helvetica.ttf:' +
                             fontsize2
                             ).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))

Section 1: Sample 10 random rows from the dataset, show with human readable names for blog post as HTML


In [5]:
df = dd.read_parquet('/data/all_trips.parquet', index='trip_id', 
    columns='pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id'.split())


/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [6]:
df2 = df.sample(frac=1.0e-6, random_state=42).compute()

In [7]:
df2 = df2.dropna()

In [8]:
df3 = df2.merge(
    tzdf['LocationID borough zone'.split()], left_on='pickup_taxizone_id', right_on='LocationID'
)
df3['pickup_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)

df3 = df3.merge(
    tzdf['LocationID borough zone'.split()], left_on='dropoff_taxizone_id', right_on='LocationID'
)
df3['dropoff_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)

In [9]:
df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html().replace("""\n""", "")


Out[9]:
'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>pickup_datetime</th>      <th>dropoff_datetime</th>      <th>pickup_taxizone_id</th>      <th>dropoff_taxizone_id</th>      <th>pickup_location</th>      <th>dropoff_location</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>2009-04-02 15:02:43</td>      <td>2009-04-02 15:22:45</td>      <td>138</td>      <td>229</td>      <td>Queens | LaGuardia Airport</td>      <td>Manhattan | Sutton Place/Turtle Bay North</td>    </tr>    <tr>      <th>1</th>      <td>2009-05-03 01:34:00</td>      <td>2009-05-03 01:36:00</td>      <td>79</td>      <td>164</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Midtown South</td>    </tr>    <tr>      <th>2</th>      <td>2010-10-09 00:48:38</td>      <td>2010-10-09 01:00:55</td>      <td>79</td>      <td>237</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Upper East Side South</td>    </tr>    <tr>      <th>3</th>      <td>2011-09-07 10:03:00</td>      <td>2011-09-07 10:09:00</td>      <td>113</td>      <td>211</td>      <td>Manhattan | Greenwich Village North</td>      <td>Manhattan | SoHo</td>    </tr>    <tr>      <th>4</th>      <td>2012-07-21 20:34:00</td>      <td>2012-07-21 20:39:00</td>      <td>211</td>      <td>114</td>      <td>Manhattan | SoHo</td>      <td>Manhattan | Greenwich Village South</td>    </tr>    <tr>      <th>5</th>      <td>2013-05-31 17:45:06</td>      <td>2013-05-31 18:41:07</td>      <td>138</td>      <td>25</td>      <td>Queens | LaGuardia Airport</td>      <td>Brooklyn | Boerum Hill</td>    </tr>    <tr>      <th>6</th>      <td>2014-08-08 09:06:15</td>      <td>2014-08-08 09:14:52</td>      <td>230</td>      <td>100</td>      <td>Manhattan | Times Sq/Theatre District</td>      <td>Manhattan | Garment District</td>    </tr>    <tr>      <th>7</th>      <td>2015-03-28 05:51:31</td>      <td>2015-03-28 06:00:05</td>      <td>263</td>      <td>107</td>      <td>Manhattan | Yorkville West</td>      <td>Manhattan | Gramercy</td>    </tr>    <tr>      <th>8</th>      <td>2015-10-11 10:20:04</td>      <td>2015-10-11 10:31:45</td>      <td>75</td>      <td>194</td>      <td>Manhattan | East Harlem South</td>      <td>Manhattan | Randalls Island</td>    </tr>    <tr>      <th>9</th>      <td>2016-07-15 00:57:44</td>      <td>2016-07-15 01:03:16</td>      <td>125</td>      <td>186</td>      <td>Manhattan | Hudson Sq</td>      <td>Manhattan | Penn Station/Madison Sq West</td>    </tr>  </tbody></table>'

In [10]:
from IPython.display import HTML
HTML(df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html())


Out[10]:
pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id pickup_location dropoff_location
0 2009-04-02 15:02:43 2009-04-02 15:22:45 138 229 Queens | LaGuardia Airport Manhattan | Sutton Place/Turtle Bay North
1 2009-05-03 01:34:00 2009-05-03 01:36:00 79 164 Manhattan | East Village Manhattan | Midtown South
2 2010-10-09 00:48:38 2010-10-09 01:00:55 79 237 Manhattan | East Village Manhattan | Upper East Side South
3 2011-09-07 10:03:00 2011-09-07 10:09:00 113 211 Manhattan | Greenwich Village North Manhattan | SoHo
4 2012-07-21 20:34:00 2012-07-21 20:39:00 211 114 Manhattan | SoHo Manhattan | Greenwich Village South
5 2013-05-31 17:45:06 2013-05-31 18:41:07 138 25 Queens | LaGuardia Airport Brooklyn | Boerum Hill
6 2014-08-08 09:06:15 2014-08-08 09:14:52 230 100 Manhattan | Times Sq/Theatre District Manhattan | Garment District
7 2015-03-28 05:51:31 2015-03-28 06:00:05 263 107 Manhattan | Yorkville West Manhattan | Gramercy
8 2015-10-11 10:20:04 2015-10-11 10:31:45 75 194 Manhattan | East Harlem South Manhattan | Randalls Island
9 2016-07-15 00:57:44 2016-07-15 01:03:16 125 186 Manhattan | Hudson Sq Manhattan | Penn Station/Madison Sq West

Section 2: Calculate Count Matrix for Full Dataset


In [11]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'dropoff_taxizone_id'])
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)


/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [12]:
df.head()


Out[12]:
pickup_taxizone_id dropoff_taxizone_id
pickup_datetime
2009-01-01 00:00:00 237 263
2009-01-01 00:00:00 114 249
2009-01-01 00:00:02 237 43
2009-01-01 00:00:04 261 261
2009-01-01 00:00:07 144 80

In [13]:
count_dataframe = df.reset_index().groupby(['pickup_taxizone_id', 'dropoff_taxizone_id']).count().compute()
count_dataframe.columns = ['count']
count_dataframe.shape


Out[13]:
(60710, 1)

In [14]:
count_dataframe.head()


Out[14]:
count
pickup_taxizone_id dropoff_taxizone_id
1 1 101493
186 35
266 37311
2 2 4297
14 20

In [15]:
count_matrix = np.zeros((267, 267), dtype=np.int64)
for r in count_dataframe.reset_index().itertuples():
    count_matrix[r[1], r[2]] = r[3]

In [16]:
count_dataframe.describe()


Out[16]:
count
count 6.071000e+04
mean 2.277015e+04
std 1.698617e+05
min 1.000000e+00
25% 9.000000e+00
50% 8.800000e+01
75% 1.258000e+03
max 2.556999e+07

In [17]:
count_dataframe.reset_index().head()


Out[17]:
pickup_taxizone_id dropoff_taxizone_id count
0 1 1 101493
1 1 186 35
2 1 266 37311
3 2 2 4297
4 2 14 20

In [18]:
# <!-- collapse=True -->
plt.imshow(count_matrix[1:-3, 1:-3].T, norm=symlog(10000), origin='upper', cmap=plt.cm.Blues)
plt.grid(False)
plt.xlabel("Dropoff Taxi Zone ID")
plt.ylabel("Pickup Taxi Zone ID")
plt.gcf().set_size_inches(4, 4)


Section 3: Calculate Pickups matrix


In [19]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)


/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [20]:
def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')

In [21]:
df['pickup_ymd'] = df.map_partitions(get_year_mo_day, 'pickup_datetime', meta=('asdf', np.datetime64))


/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)

In [22]:
df.reset_index().rename(columns=dict(index='N')).tail()


Out[22]:
N pickup_taxizone_id pickup_ymd
1432504 2016-12-31 23:59:57 36 2016-12-31 23:00:00
1432505 2016-12-31 23:59:58 76 2016-12-31 23:00:00
1432506 2016-12-31 23:59:58 168 2016-12-31 23:00:00
1432507 2016-12-31 23:59:58 144 2016-12-31 23:00:00
1432508 2016-12-31 23:59:59 135 2016-12-31 23:00:00

In [23]:
pickup_counts_df = df.reset_index().rename(columns=dict(index='N')).groupby(['pickup_taxizone_id', 'pickup_ymd',]).count().compute()
pickup_counts_df.sort_index(inplace=True)

In [24]:
pickup_counts_df.head()


Out[24]:
N
pickup_taxizone_id pickup_ymd
1 2009-01-01 01:00:00 1
2009-01-01 02:00:00 1
2009-01-01 04:00:00 2
2009-01-01 05:00:00 1
2009-01-01 07:00:00 1

In [25]:
z = pickup_counts_df.unstack(0)

In [26]:
z.columns = np.arange(1, 267).astype(str)

In [27]:
z = z.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)

In [28]:
z.head()


Out[28]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 ... 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
2009-01-01 00:00:00 0 0 0 121 0 0 42 0 0 0 0 4 63 2 0 0 13 1 1 0 0 1 3 78 28 0 0 3 0 0 0 1 29 1 0 3 16 0 0 27 53 30 207 0 21 0 0 594 18 178 ... 2 0 0 0 0 0 18 71 5 35 1 1 379 57 328 70 211 544 1 384 459 407 439 0 3 0 10 32 0 211 1 2 409 0 0 0 0 0 74 53 4 2 0 6 70 179 449 0 0 369
2009-01-01 01:00:00 1 0 0 137 0 0 88 0 0 0 0 4 63 2 0 1 11 1 0 0 0 0 5 84 51 0 1 0 0 0 0 0 78 2 0 8 22 0 0 37 79 41 103 0 39 0 0 440 37 184 ... 7 0 0 2 0 0 35 121 10 68 0 8 540 130 360 131 298 402 1 382 459 486 549 0 1 4 24 64 0 167 3 0 322 1 0 0 0 0 97 92 7 0 0 22 85 239 627 0 0 452
2009-01-01 02:00:00 1 0 0 125 0 0 115 0 0 0 0 2 58 4 0 2 19 1 0 0 0 0 4 90 54 1 0 1 0 0 0 0 54 0 0 4 15 0 0 54 99 50 121 0 47 0 0 486 50 130 ... 11 0 1 3 0 0 40 119 6 69 1 10 496 155 348 115 288 295 1 329 267 401 491 0 1 0 28 71 0 117 4 0 335 0 0 1 0 0 122 115 6 1 0 21 55 170 594 0 0 378
2009-01-01 03:00:00 0 0 0 124 0 0 110 1 1 0 0 1 33 8 0 3 18 0 0 0 0 2 3 51 61 0 0 1 1 0 0 0 55 1 1 8 22 1 0 35 67 44 67 0 40 0 0 541 42 129 ... 9 0 0 5 0 0 34 74 4 53 2 9 349 303 322 80 144 348 1 220 169 199 339 0 0 1 15 61 0 115 2 0 261 0 1 0 0 0 80 98 5 1 0 28 70 96 437 0 0 313
2009-01-01 04:00:00 2 0 0 100 0 0 105 0 1 0 0 0 19 7 0 4 8 2 3 1 0 1 0 41 43 1 0 1 0 0 0 0 27 3 0 7 24 0 0 20 54 28 34 0 38 0 1 390 27 120 ... 3 0 0 5 0 0 29 34 3 59 1 7 233 308 221 53 116 286 3 91 79 112 186 0 0 0 10 27 0 69 6 1 241 0 0 0 0 0 80 74 5 0 0 20 37 62 296 0 0 224

5 rows × 266 columns


In [29]:
import fastparquet
fastparquet.write('/data/trips_pickups_matrix.parquet', z, compression='SNAPPY')

Section 4: Calculate Dropoffs matrix


In [30]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)


/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [31]:
def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')

In [32]:
df['dropoff_ymd'] = df.map_partitions(get_year_mo_day, 'dropoff_datetime', meta=('asdf', np.datetime64))


/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)

In [33]:
df.reset_index(drop=True).tail()


Out[33]:
dropoff_datetime dropoff_taxizone_id dropoff_ymd
1432504 2017-01-01 00:07:47 36 2016-12-31 23:00:00
1432505 2017-01-01 00:15:29 63 2016-12-31 23:00:00
1432506 2017-01-01 00:39:07 161 2016-12-31 23:00:00
1432507 2017-01-01 00:03:50 209 2016-12-31 23:00:00
1432508 2017-01-01 00:14:30 134 2016-12-31 23:00:00

In [34]:
dropoff_counts_df = df.reset_index(drop=True).rename(columns=dict(dropoff_datetime='N')).groupby(['dropoff_taxizone_id', 'dropoff_ymd',]).count().compute()
dropoff_counts_df.sort_index(inplace=True)

In [35]:
dropoff_counts_df.head()


Out[35]:
N
dropoff_taxizone_id dropoff_ymd
1 2009-01-01 01:00:00 2
2009-01-01 02:00:00 3
2009-01-01 03:00:00 1
2009-01-01 04:00:00 10
2009-01-01 05:00:00 10

In [36]:
z2 = dropoff_counts_df.unstack(0)

In [37]:
z2 = z2.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)


/home/shekhar/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)
  warnings.warn(msg, UserWarning)

In [38]:
z2.columns = np.arange(1, 267).astype(str)

In [39]:
z2.head()


Out[39]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 ... 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
2009-01-01 00:00:00 0 0 1 128 0 1 113 0 0 1 1 1 111 20 1 8 28 3 1 6 2 1 4 84 36 5 1 6 2 0 0 0 57 4 1 10 23 1 0 32 95 81 95 0 30 0 0 434 44 150 ... 7 3 1 9 0 0 53 111 16 56 9 17 352 50 281 75 291 366 3 432 253 407 404 1 10 2 43 82 0 188 0 3 306 3 0 2 0 0 85 75 13 6 3 36 111 282 450 0 0 379
2009-01-01 01:00:00 2 0 2 154 0 1 188 2 1 0 3 1 87 25 4 5 28 4 1 1 2 4 5 96 47 4 1 3 0 0 1 1 104 5 3 8 25 0 0 37 140 91 97 0 39 1 2 402 49 147 ... 10 1 2 24 0 0 89 161 22 94 9 23 372 118 226 96 335 331 8 395 227 434 413 0 9 5 86 106 1 157 2 2 340 2 0 5 0 1 103 84 22 7 7 41 75 346 627 0 0 469
2009-01-01 02:00:00 3 0 1 134 1 0 245 0 4 4 3 3 88 27 3 10 41 7 1 7 3 7 4 89 45 5 0 7 2 0 1 3 85 1 3 15 30 2 0 31 150 103 77 0 47 0 1 414 65 121 ... 10 0 1 15 1 0 105 145 26 103 11 24 285 162 210 84 301 243 2 390 160 358 370 0 4 2 71 118 1 115 12 2 283 1 0 3 1 0 92 98 28 3 4 53 66 283 536 0 0 408
2009-01-01 03:00:00 1 0 1 109 0 0 229 0 4 1 3 1 69 40 2 7 31 7 3 9 5 13 6 86 55 3 0 9 2 1 1 1 69 2 1 21 46 0 0 33 113 100 50 0 33 0 3 368 69 143 ... 9 0 1 19 0 1 107 113 16 84 13 33 245 197 171 85 215 204 3 267 136 245 251 0 6 1 71 105 0 90 5 2 241 0 1 5 0 2 59 81 24 8 5 56 89 233 448 0 0 334
2009-01-01 04:00:00 10 0 2 103 0 0 221 1 3 2 2 1 50 38 1 5 32 4 2 2 4 6 1 56 46 2 0 9 1 0 2 2 46 2 3 23 56 1 1 25 97 79 23 0 28 0 0 258 48 98 ... 7 1 0 20 0 0 98 68 24 82 8 26 143 157 96 72 141 110 8 122 71 198 169 3 8 1 52 87 1 39 6 1 171 3 0 4 0 3 62 84 20 4 6 52 68 143 291 0 0 246

5 rows × 266 columns


In [40]:
import fastparquet
fastparquet.write('/data/trips_dropoffs_matrix.parquet', z2, compression='SNAPPY')

Section 5 : Perform PCA


In [86]:
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')

In [87]:
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()

In [88]:
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]

In [135]:
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )

In [321]:
tzdf.zone[0]


Out[321]:
'Newark Airport'

In [324]:
sns.distplot(counts_matrix.iloc[:, 263+0], kde=False)
sns.distplot(counts_matrix.iloc[:, 0], kde=False)


Out[324]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f049a3dc710>

In [136]:
import sklearn, sklearn.decomposition

In [414]:
# pca = sklearn.decomposition.PCA(n_components=20, whiten=True)
# # pca.fit(counts_matrix.resample('1D').sum().values)
# pca.fit(counts_matrix.values)
# pca.explained_variance_ratio_

In [500]:
pca = sklearn.decomposition.FastICA(n_components=3, random_state=42, whiten=True)
# pca.fit(counts_matrix.resample('1D').sum().values)
yvals = pca.fit_transform(counts_matrix.values)
# pca.explained_variance_ratio_


/home/shekhar/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/fastica_.py:116: UserWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  warnings.warn('FastICA did not converge. Consider increasing '

In [501]:
yvals.shape


Out[501]:
(70128, 3)

In [502]:
pickup_eof1, dropoff_eof1 = pca.components_[0, :263], pca.components_[0, 263:]
pickup_eof2, dropoff_eof2 = pca.components_[1, :263], pca.components_[1, 263:]
pickup_eof3, dropoff_eof3 = pca.components_[2, :263], pca.components_[2, 263:]
# pickup_eof4, dropoff_eof4 = pca.components_[3, :263], pca.components_[3, 263:]
# pickup_eof5, dropoff_eof5 = pca.components_[4, :263], pca.components_[4, 263:]

In [503]:
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
# tzdf['pEOF4'] = pickup_eof4
# tzdf['dEOF4'] = dropoff_eof4
# tzdf['pEOF5'] = pickup_eof5
# tzdf['dEOF5'] = dropoff_eof5

In [504]:
tzdf['N_dropoffs'] = dropoffs_matrix.sum(axis=0).values
tzdf['N_pickups'] = pickups_matrix.sum(axis=0).values

In [505]:
tzdf['log10_N_dropoffs'] = np.log10(tzdf.N_dropoffs)
tzdf['log10_N_pickups'] = np.log10(tzdf.N_pickups)

In [506]:
tzdf = tzdf.to_crs({'init': 'epsg:3857'})

In [507]:
tzdf.head()


Out[507]:
LocationID OBJECTID Shape_Area Shape_Leng borough geometry zone N_dropoffs N_pickups log10_N_dropoffs log10_N_pickups N_dropoffs_ranked N_pickups_ranked pEOF1 dEOF1 pEOF2 dEOF2 pEOF3 dEOF3 pEOF4 dEOF4 pEOF5 dEOF5
0 1 1 0.000782 0.116357 EWR POLYGON ((-8258175.532737531 4967457.202992616... Newark Airport 1801889 109695 6.255728 5.040187 0.667939 0.469466 -2.717661e-08 -2.881359e-09 -1.794189e-08 -1.433908e-09 5.950944e-08 3.195680e-09 -8.254247e-08 -5.718085e-10 -1.040261e-07 -3.076457e-09
1 2 2 0.004866 0.433470 Queens (POLYGON ((-8217980.621910957 4959237.28547167... Jamaica Bay 10387 11702 4.016490 4.068260 0.053435 0.164122 1.004834e-10 7.612880e-11 -4.304754e-11 -6.442688e-11 -1.523368e-10 -1.357631e-10 3.737542e-10 5.648617e-10 3.248668e-10 4.355836e-10
2 3 3 0.000314 0.084341 Bronx POLYGON ((-8220713.534155379 4993383.154018582... Allerton/Pelham Gardens 96883 16547 4.986248 4.218719 0.213740 0.202290 -4.847524e-11 4.417117e-11 1.598215e-10 -4.446037e-11 -2.216636e-09 -5.661792e-11 -5.797356e-09 -8.936931e-10 -2.671187e-09 -4.799690e-10
3 4 4 0.000112 0.043567 Manhattan POLYGON ((-8234500.226961648 4971984.093397928... Alphabet City 6643997 4752651 6.822429 6.676936 0.812977 0.805344 -7.417994e-08 2.380392e-07 -2.521742e-08 6.128409e-08 -1.776279e-07 -2.114199e-07 3.549393e-08 -5.165418e-07 1.502940e-07 -1.212258e-07
4 5 5 0.000498 0.092146 Staten Island POLYGON ((-8257036.10884249 4948033.094989423,... Arden Heights 7340 1025 3.865696 3.010724 0.034351 0.015267 -4.787781e-11 7.245953e-12 6.745541e-12 -1.782181e-12 -2.716417e-10 -1.632698e-11 -1.791913e-11 -2.047445e-11 3.142611e-11 -1.017828e-11

In [526]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [527]:
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')

In [528]:
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_dropoffs_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)



In [529]:
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_pickups_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)



In [530]:
tzdf2.iloc[:, -10:].describe()


Out[530]:
pEOF1 dEOF1 pEOF2 dEOF2 pEOF3 dEOF3 pEOF4 dEOF4 pEOF5 dEOF5
count 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02
mean 1.337247e-08 1.323585e-08 -3.038017e-08 -3.044156e-08 -3.483880e-08 -3.482810e-08 -4.837575e-08 -4.892726e-08 -3.441977e-08 -3.494932e-08
std 3.221157e-07 2.155272e-07 1.011613e-07 1.191920e-07 1.461272e-07 2.128391e-07 2.563503e-07 4.767413e-07 2.825250e-07 2.698045e-07
min -1.041723e-06 -1.178042e-06 -6.680994e-07 -7.350545e-07 -1.130895e-06 -1.927351e-06 -1.568877e-06 -4.064235e-06 -2.427254e-06 -1.581324e-06
25% -5.394186e-09 -5.953277e-10 -3.959468e-09 -3.293448e-09 -5.268074e-08 -1.307326e-08 -6.383727e-08 -2.630257e-08 -2.527213e-08 -7.778206e-09
50% -3.556521e-10 1.076115e-10 -7.536052e-11 -9.318704e-11 -5.885055e-09 -2.705909e-10 -1.180033e-08 -2.135500e-09 -4.541858e-09 -5.239243e-10
75% 4.593931e-09 6.834840e-09 9.943823e-10 5.473768e-11 -1.525962e-09 4.124035e-12 -4.379473e-10 2.358084e-11 -1.293940e-10 3.026373e-10
max 3.253512e-06 1.252968e-06 2.057265e-07 3.911548e-07 8.292906e-07 7.651686e-07 1.753640e-06 1.882881e-06 1.445455e-06 1.378802e-06

In [531]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 8), alpha=1, column='pEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [537]:
import pysal.esda.mapclassify

In [544]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [533]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [516]:
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)

In [517]:
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)

In [520]:
df4 = pd.DataFrame(data=pca.transform(counts_matrix.values)[:, :3], index=counts_matrix.index)
df4.index = df4.index.rename('timepoints')
df4.rename(columns={i:'pc%d' % (i+1) for i in range(3)}, inplace=True)
# df4.reset_index(inplace=True)

In [534]:
df4.plot(lw=1)
plt.xlim('2015-06-22', '2015-06-29')
plt.ylim(-0.02, 0.01)


Out[534]:
(-0.02, 0.01)

In [522]:
df4.plot(lw=0.5)
# plt.xlim('2015-06-22', '2015-06-29')


Out[522]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f04897ae518>

In [524]:
df4.resample('1M').mean().plot()
df4.resample('1M').std().plot()
# plt.xlim('2015-06-22', '2015-06-29')


Out[524]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f04c4441c88>

In [402]:
# df4 = pd.DataFrame(data=pca.transform(counts_matrix.resample('1D').sum().values)[:, :5], index=counts_matrix.resample('1D').sum().index)
# df4.index = df4.index.rename('timepoints')
# df4.rename(columns={i:'pc%d' % i for i in range(5)}, inplace=True)
# # df4.reset_index(inplace=True)

In [403]:
df4.plot()
plt.xlim('2014-04-01', '2014-09-01')


Out[403]:
(16161, 16314)

In [222]:
nmf = sklearn.decomposition.NMF(5, random_state=42)

In [223]:
nmf.fit(counts_matrix.resample('1D').sum().values)
# nmf.explained_variance_ratio_


Out[223]:
NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, nls_max_iter=2000, random_state=42, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [224]:
nmf.reconstruction_err_


Out[224]:
291686.29896657454

In [225]:
pickup_eof1, dropoff_eof1 = nmf.components_[0, :263], nmf.components_[0, 263:]
pickup_eof2, dropoff_eof2 = nmf.components_[1, :263], nmf.components_[1, 263:]
pickup_eof3, dropoff_eof3 = nmf.components_[2, :263], nmf.components_[2, 263:]
pickup_eof4, dropoff_eof4 = nmf.components_[3, :263], nmf.components_[3, 263:]
pickup_eof5, dropoff_eof5 = nmf.components_[4, :263], nmf.components_[4, 263:]

In [226]:
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['dEOF4'] = dropoff_eof4
tzdf['pEOF5'] = pickup_eof5
tzdf['dEOF5'] = dropoff_eof5

In [227]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [228]:
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')

In [235]:
tzdf2.iloc[:, -10:].describe()


Out[235]:
pEOF1 dEOF1 pEOF2 dEOF2 pEOF3 dEOF3 pEOF4 dEOF4 pEOF5 dEOF5
count 242.000000 242.000000 242.000000 242.000000 242.000000 242.000000 242.000000 242.000000 242.000000 242.000000
mean 33.614941 33.744037 19.446174 19.522582 22.824028 23.029265 12.387393 12.510342 9.107524 8.914849
std 73.643847 77.649309 39.660845 46.353030 30.361697 36.030383 31.689024 33.863347 18.863313 27.253940
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.284592 0.012276 0.344686 0.011539 2.180188 0.232518 0.363029 0.000000 0.284869 0.000000
50% 1.089898 0.155107 1.118604 0.161643 8.617385 4.500133 0.918350 0.077347 1.153524 0.105230
75% 16.729933 7.733491 16.267112 10.980080 33.205927 29.741598 5.186028 2.094543 7.727718 3.108362
max 431.663740 436.021587 292.790759 390.687009 141.827463 168.425644 243.430495 213.360773 110.659220 257.589822

In [236]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [240]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [241]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [243]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [244]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)



In [259]:
df4 = pd.DataFrame(data=nmf.transform(counts_matrix.resample('1D').sum().values), index=counts_matrix.resample('1D').sum().index)

In [272]:
df5 = df4.reset_index()
df5 = df5.rename(columns={'index':'d', 0: 'pc1', 1: 'pc2', 2:'pc3', 3:'pc4', 4:'pc5'})

In [273]:
import plotnine as p9

In [311]:
(p9.ggplot(df5, p9.aes('d', 'pc1')) + p9.geom_point(color='steelblue', size=.2)) + p9.stat_smooth(
    method='lm',size=1)


Out[311]:
<ggplot: (8728644232519)>

In [300]:
(p9.ggplot(df5, p9.aes('d', 'pc2')) + p9.geom_point()) + p9.stat_smooth(method='lowess')


/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
Out[300]:
<ggplot: (8728654057252)>

In [294]:
(p9.ggplot(df5, p9.aes('d', 'pc3')) + p9.geom_point()) + p9.stat_smooth(method='lowess')


/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
Out[294]:
<ggplot: (8728655259824)>

In [295]:
(p9.ggplot(df5, p9.aes('d', 'pc4')) + p9.geom_point()) + p9.stat_smooth(method='lowess')


/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
Out[295]:
<ggplot: (-9223363308200697137)>

In [296]:
(p9.ggplot(df5, p9.aes('d', 'pc5')) + p9.geom_point()) + p9.stat_smooth(method='lowess')


/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
Out[296]:
<ggplot: (-9223363308201012066)>

In [411]:
dir(sklearn.decomposition)


Out[411]:
['DictionaryLearning',
 'FactorAnalysis',
 'FastICA',
 'IncrementalPCA',
 'KernelPCA',
 'LatentDirichletAllocation',
 'MiniBatchDictionaryLearning',
 'MiniBatchSparsePCA',
 'NMF',
 'PCA',
 'ProjectedGradientNMF',
 'RandomizedPCA',
 'SparseCoder',
 'SparsePCA',
 'TruncatedSVD',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_online_lda',
 'base',
 'cdnmf_fast',
 'dict_learning',
 'dict_learning_online',
 'factor_analysis',
 'fastica',
 'fastica_',
 'incremental_pca',
 'kernel_pca',
 'nmf',
 'non_negative_factorization',
 'online_lda',
 'pca',
 'randomized_svd',
 'sparse_encode',
 'sparse_pca',
 'truncated_svd']

In [ ]: