In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import matplotlib 
%matplotlib inline
import geopandas as gpd
import sklearn, sklearn.decomposition

# import seaborn as sns
import seaborn.apionly as sns
from matplotlib import rcParams


## Matplotlib has some pretty annoying defaults. The rest of this cell is to
## produce nice plots, for my definition of nice

plt.style.use('seaborn-darkgrid')

rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 10
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'
rcParams['axes.titlesize'] = 11

rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150

import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaOblique.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaBold.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
                             'normal:weight=normal:stretch=normal:file='
                             '{0}/Helvetica.ttf:' +
                             fontsize2
                             ).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))

%config InlineBackend.figure_format='retina'

Section 1: Load data and explore


In [2]:
tzdf = gpd.read_file('../shapefiles/taxi_zones.shp').drop(['OBJECTID', 'Shape_Area', 'Shape_Leng'], axis=1)
tzdf = tzdf.to_crs({'init': 'epsg:3857'})

In [3]:
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()

In [4]:
# drop last three columns that indicate unknown taxi zone for the trip
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]

In [5]:
dropoffs_matrix.to_csv('/data/trips_dropoffs_matrix.csv', index=False)
pickups_matrix.to_csv('/data/pickups_matrix.csv', index=False)

In [6]:
# concat along the taxizone_id dimension. This effectively makes a 526 column 
# dataframe that has both pickup and dropoff information. 
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )

In [7]:
counts_matrix.head()


Out[7]:
1 2 3 4 5 6 7 8 9 10 ... 254 255 256 257 258 259 260 261 262 263
index
2009-01-01 00:00:00 0 0 1 128 0 1 113 0 0 1 ... 0 74 53 4 2 0 6 70 179 449
2009-01-01 01:00:00 2 0 2 154 0 1 188 2 1 0 ... 0 97 92 7 0 0 22 85 239 627
2009-01-01 02:00:00 3 0 1 134 1 0 245 0 4 4 ... 0 122 115 6 1 0 21 55 170 594
2009-01-01 03:00:00 1 0 1 109 0 0 229 0 4 1 ... 0 80 98 5 1 0 28 70 96 437
2009-01-01 04:00:00 10 0 2 103 0 0 221 1 3 2 ... 0 80 74 5 0 0 20 37 62 296

5 rows × 526 columns


In [8]:
counts_matrix.to_csv('/data/counts_matrix.csv', index=False)

In [9]:
zz = pd.DataFrame(dropoffs_matrix.sum(axis=0).sort_values(ascending=False))
zz.index = zz.index.astype(np.int32)
zz.columns = ["N_dropoffs"]

Top 15 Dropoff taxizones

Notice all are in Manhattan


In [10]:
from IPython.display import HTML
HTML(zz.merge(tzdf, left_index=True, right_on='LocationID').iloc[:15].to_html())


Out[10]:
N_dropoffs LocationID borough geometry zone
160 52036413 161 Manhattan POLYGON ((-8234897.600846949 4976315.14938336,... Midtown Center
229 43646194 230 Manhattan POLYGON ((-8235819.387790391 4976346.987497879... Times Sq/Theatre District
235 43309442 236 Manhattan POLYGON ((-8232943.946566503 4979004.835525835... Upper East Side North
236 42804206 237 Manhattan POLYGON ((-8233871.646244711 4977326.197700994... Upper East Side South
169 41978640 170 Manhattan POLYGON ((-8234529.081123573 4974919.946359011... Murray Hill
161 40821468 162 Manhattan POLYGON ((-8234438.214808108 4976299.51758913,... Midtown East
233 37832622 234 Manhattan POLYGON ((-8236525.713410378 4973318.432098114... Union Sq
78 35519161 79 Manhattan POLYGON ((-8235836.706542813 4971354.640135782... East Village
185 35202663 186 Manhattan POLYGON ((-8236636.917574188 4974863.5121946, ... Penn Station/Madison Sq West
47 34811364 48 Manhattan POLYGON ((-8236660.189359007 4976319.580681437... Clinton East
141 32945653 142 Manhattan POLYGON ((-8235580.569077416 4978141.296012126... Lincoln Square East
162 31824501 163 Manhattan POLYGON ((-8235567.561283603 4978157.711087875... Midtown North
67 31089088 68 Manhattan POLYGON ((-8237867.114832911 4974132.279770708... East Chelsea
106 29481396 107 Manhattan POLYGON ((-8235681.569251416 4972805.239988495... Gramercy
140 29450688 141 Manhattan POLYGON ((-8233387.31862869 4976988.231984851,... Lenox Hill West

Top 15 Dropoff taxizones

Manhattan specifically excluded


In [11]:
HTML(zz.merge(tzdf[tzdf.borough != 'Manhattan'], left_index=True, right_on='LocationID').iloc[:15].to_html())


Out[11]:
N_dropoffs LocationID borough geometry zone
137 16669381 138 Queens (POLYGON ((-8223309.774232504 4980833.38033783... LaGuardia Airport
131 10533002 132 Queens (POLYGON ((-8218165.267816752 4962845.73596870... JFK Airport
6 7863852 7 Queens POLYGON ((-8226970.8291156 4978111.202464147, ... Astoria
180 6450704 181 Brooklyn POLYGON ((-8234951.873885442 4963173.727069839... Park Slope
255 5405726 256 Brooklyn POLYGON ((-8233004.979703741 4970145.93550765,... Williamsburg (South Side)
254 5290329 255 Brooklyn POLYGON ((-8233385.539715838 4971897.062786663... Williamsburg (North Side)
111 5138546 112 Brooklyn (POLYGON ((-8232565.655014612 4973936.97494895... Greenpoint
225 4551835 226 Queens POLYGON ((-8227612.38972376 4975402.242556782,... Sunnyside
32 4442511 33 Brooklyn POLYGON ((-8237218.444431831 4968687.819923609... Brooklyn Heights
144 4218257 145 Queens POLYGON ((-8231347.659800225 4975838.130591199... Long Island City/Hunters Point
222 3850093 223 Queens POLYGON ((-8227233.381837943 4981442.494158686... Steinway
79 3841805 80 Brooklyn POLYGON ((-8230044.586816066 4972312.023878687... East Williamsburg
128 3668753 129 Queens POLYGON ((-8223823.678054077 4977895.337014052... Jackson Heights
48 3555866 49 Brooklyn POLYGON ((-8233258.620050297 4967651.618383895... Clinton Hill
60 3171638 61 Brooklyn POLYGON ((-8229756.591469016 4965760.130422773... Crown Heights North

A plot of a few particular taxi districts


In [12]:
def plot_zone(tzid=160, ax=None, xl=True, yl=True):
    sns.distplot(counts_matrix.iloc[:, tzid-1], kde=False, label='pickups', ax=ax)
    sns.distplot(counts_matrix.iloc[:, 263+tzid-1], kde=False, color='gray', label='dropoffs', ax=ax)
    if ax is not None:
        plt.sca(ax)
    plt.legend()
    plt.title(tzdf.loc[tzid-1]['zone'])
    if xl:
        plt.xlabel("Hourly Pickups or Dropoffs")
    else:
        plt.xlabel("")
    if yl:
        plt.ylabel('N')
    else:
        plt.ylabel("")

In [13]:
f, axlist = plt.subplots(5, 2)

ids = [161, 79, 237, 234, 132, 138, 181, 65, 92, 62]

for j, (i, ax) in enumerate(zip(ids, axlist.ravel())):
    plot_zone(i, ax=ax, 
              xl=(j >= 6),
              yl=(j%2 == 0)
             )

plt.gcf().set_size_inches(8, 10)
plt.tight_layout()


PCA / ICA / NMF Decomposition Attempts

Looking at the distributions of pickups for 10 arbitrary taxi zones above, it is clear that one type of distribution (probably) cannot explain all the different distributions of pickups and dropoffs. Let's try a few different types of PCA, ICA, and NMF decompositions. The pickups_matrix, dropoffs_matrix, or compbined counts_matrix, have one axis of the matrix represent time, and the other axis represent space, so the matrix decompositions will reflect a similar duality in spatial structure.


In [14]:
pickups_matrix.head()


Out[14]:
1 2 3 4 5 6 7 8 9 10 ... 254 255 256 257 258 259 260 261 262 263
index
2009-01-01 00:00:00 0 0 0 121 0 0 42 0 0 0 ... 0 74 53 4 2 0 6 70 179 449
2009-01-01 01:00:00 1 0 0 137 0 0 88 0 0 0 ... 0 97 92 7 0 0 22 85 239 627
2009-01-01 02:00:00 1 0 0 125 0 0 115 0 0 0 ... 0 122 115 6 1 0 21 55 170 594
2009-01-01 03:00:00 0 0 0 124 0 0 110 1 1 0 ... 0 80 98 5 1 0 28 70 96 437
2009-01-01 04:00:00 2 0 0 100 0 0 105 0 1 0 ... 0 80 74 5 0 0 20 37 62 296

5 rows × 263 columns


In [15]:
dropoffs_matrix.head()


Out[15]:
1 2 3 4 5 6 7 8 9 10 ... 254 255 256 257 258 259 260 261 262 263
index
2009-01-01 00:00:00 0 0 1 128 0 1 113 0 0 1 ... 0 85 75 13 6 3 36 111 282 450
2009-01-01 01:00:00 2 0 2 154 0 1 188 2 1 0 ... 1 103 84 22 7 7 41 75 346 627
2009-01-01 02:00:00 3 0 1 134 1 0 245 0 4 4 ... 0 92 98 28 3 4 53 66 283 536
2009-01-01 03:00:00 1 0 1 109 0 0 229 0 4 1 ... 2 59 81 24 8 5 56 89 233 448
2009-01-01 04:00:00 10 0 2 103 0 0 221 1 3 2 ... 3 62 84 20 4 6 52 68 143 291

5 rows × 263 columns

Naive PCA


In [16]:
pca_pickup = sklearn.decomposition.PCA(n_components=5)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
pca_pickup.explained_variance_ratio_


Out[16]:
array([ 0.6525235 ,  0.19236085,  0.04518883,  0.01969177,  0.0167445 ])

In [17]:
pca_pickup = sklearn.decomposition.PCA(n_components=5, whiten=True)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
pca_pickup.explained_variance_ratio_


Out[17]:
array([ 0.6525235 ,  0.19236085,  0.04518883,  0.01969177,  0.0167445 ])

In [18]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]

In [19]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5

In [20]:
tzdf.head()


Out[20]:
LocationID borough geometry zone pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
0 1 EWR POLYGON ((-8258175.532737532 4967457.202992616... Newark Airport 0.000190 -0.000846 0.000280 -0.000257 0.001303
1 2 Queens (POLYGON ((-8217980.621910957 4959237.28547167... Jamaica Bay 0.000069 0.000035 -0.000006 -0.000256 -0.000235
2 3 Bronx POLYGON ((-8220713.534155379 4993383.154018582... Allerton/Pelham Gardens 0.000041 0.000004 0.000100 0.000312 0.000335
3 4 Manhattan POLYGON ((-8234500.226961649 4971984.093397928... Alphabet City 0.014967 0.052920 0.046970 -0.025640 0.000473
4 5 Staten Island POLYGON ((-8257036.10884249 4948033.094989426,... Arden Heights 0.000004 0.000005 -0.000002 -0.000016 -0.000006

In [21]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [22]:
tzdf2.iloc[:, -5:].describe()


Out[22]:
pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
count 242.000000 242.000000 242.000000 242.000000 242.000000
mean 0.026131 0.006993 0.006632 0.011919 0.008212
std 0.058853 0.064033 0.064072 0.063299 0.063888
min -0.002833 -0.280135 -0.448488 -0.186421 -0.360792
25% 0.000066 -0.000079 0.000005 -0.000013 0.000024
50% 0.000315 0.000036 0.000400 0.001285 0.001235
75% 0.008549 0.001588 0.005123 0.010949 0.013316
max 0.304526 0.569635 0.304542 0.555637 0.314465

In [23]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2, vmax=0.2, ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()



In [24]:
for i, y in enumerate(pickups_pcs.T[:]):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
plt.legend()
# plt.xlim('2014-01-01', '2014-07-01')


Out[24]:
<matplotlib.legend.Legend at 0x7fdab26aca20>

Fast ICA


In [25]:
pca_pickup = sklearn.decomposition.FastICA(n_components=5)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
# pca_pickup.explained_variance_ratio_

In [26]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]

In [27]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5

In [28]:
tzdf.head()


Out[28]:
LocationID borough geometry zone pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
0 1 EWR POLYGON ((-8258175.532737532 4967457.202992616... Newark Airport 1.011461e-08 -6.660147e-09 1.015878e-08 -1.687143e-08 5.599168e-09
1 2 Queens (POLYGON ((-8217980.621910957 4959237.28547167... Jamaica Bay -8.827358e-11 5.552652e-10 -2.273537e-09 4.105658e-09 3.441241e-09
2 3 Bronx POLYGON ((-8220713.534155379 4993383.154018582... Allerton/Pelham Gardens -3.322515e-10 -1.408956e-09 3.025205e-09 -5.944370e-09 -3.772628e-09
3 4 Manhattan POLYGON ((-8234500.226961649 4971984.093397928... Alphabet City -1.811558e-07 -4.215255e-07 -2.436830e-07 -6.394212e-08 4.739039e-07
4 5 Staten Island POLYGON ((-8257036.10884249 4948033.094989426,... Arden Heights 6.978993e-11 -3.533233e-11 -6.808323e-11 1.730704e-10 2.038738e-10

In [29]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [30]:
tzdf2.iloc[:, -5:].describe()


Out[30]:
pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
count 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02 2.420000e+02
mean -1.053331e-07 -6.192474e-08 1.147599e-07 -1.625754e-07 -1.218586e-07
std 7.702525e-07 5.883781e-07 6.077089e-07 8.887209e-07 8.945301e-07
min -6.058296e-06 -4.027633e-06 -2.629512e-06 -4.746138e-06 -7.995341e-06
25% -4.006033e-08 -5.022617e-08 5.008670e-11 -2.247355e-07 -1.267315e-07
50% -2.493026e-09 -4.479310e-09 1.168326e-08 -3.089410e-08 -1.525424e-08
75% 2.088510e-10 -5.765840e-11 1.295504e-07 -5.225069e-10 3.074416e-10
max 3.677464e-06 3.678675e-06 3.199434e-06 4.462400e-06 3.634492e-06

In [31]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()



In [32]:
for i, y in enumerate(pickups_pcs.T[:]):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
# plt.xlim('2014-01-01', '2014-07-01')


Sparse PCA


In [33]:
pca_pickup = sklearn.decomposition.SparsePCA(n_components=5, random_state=11, alpha=10)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
# pca_pickup.explained_variance_ratio_

In [34]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]

In [35]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5

In [36]:
tzdf.head()


Out[36]:
LocationID borough geometry zone pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
0 1 EWR POLYGON ((-8258175.532737532 4967457.202992616... Newark Airport -359.609481 -129.557848 -34.686735 -46.856016 77.462518
1 2 Queens (POLYGON ((-8217980.621910957 4959237.28547167... Jamaica Bay -42.479373 0.000000 0.000000 4.546199 0.000000
2 3 Bronx POLYGON ((-8220713.534155379 4993383.154018582... Allerton/Pelham Gardens -43.616401 0.000000 -0.729936 -20.931008 9.048270
3 4 Manhattan POLYGON ((-8234500.226961649 4971984.093397928... Alphabet City -18690.477425 10266.690900 -3793.638589 1768.231036 362.851097
4 5 Staten Island POLYGON ((-8257036.10884249 4948033.094989426,... Arden Heights 0.000000 0.000000 0.000000 0.000000 0.000000

In [37]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [38]:
tzdf2.iloc[:, -5:].describe()


Out[38]:
pEOF1 pEOF2 pEOF3 pEOF4 pEOF5
count 242.000000 242.000000 242.000000 242.000000 242.000000
mean -22257.706097 818.266274 -195.621151 -1267.135739 426.805824
std 46971.905586 12773.728639 6199.662273 4470.362359 3785.564924
min -208851.472277 -61336.008504 -25193.634974 -39442.401033 -22102.569752
25% -10599.141648 -0.785347 -239.434541 -1079.998212 0.000000
50% -447.223489 0.105106 -19.591950 -161.589938 49.440516
75% -105.796168 375.032304 0.000000 0.000000 610.123720
max 0.000000 109370.939153 45921.967284 8279.389433 23117.667194

In [39]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e6/(i+1), vmax=0.2e6/(i+1), ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()



In [40]:
for i, y in enumerate(pickups_pcs.T[:]):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
# plt.xlim('2014-01-01', '2014-07-01')


Poisson PCA

Direct R code : Requires running notebook with R kernel


In [1]:
pickups_matrix <- read.csv("/data/pickups_matrix.csv")
dropoffs_matrix <- read.csv("/data/trips_dropoffs_matrix.csv")
counts_matrix <- read.csv("/data/counts_matrix.csv")

In [2]:
data.matrix(head(pickups_matrix))


X1X2X3X4X5X6X7X8X9X10X254X255X256X257X258X259X260X261X262X263
10 0 0 1210 0 420 0 0 0 74 534 2 0 6 70 179449
21 0 0 1370 0 880 0 0 0 97 927 0 0 22 85 239627
31 0 0 1250 0 1150 0 0 0 1221156 1 0 21 55 170594
40 0 0 1240 0 1101 1 0 0 80 985 1 0 28 70 96437
52 0 0 1000 0 1050 1 0 0 80 745 0 0 20 37 62296
61 0 0 510 0 660 0 0 0 58 371 2 0 24 17 26120

In [36]:
library("generalizedPCA")
library(jsonlite)

In [12]:
k_pickups <- generalizedPCA(data.matrix(pickups_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)


0    4.620588 0 hours elapsed
1    4.537463 0 hours elapsed. Max 0.3 hours remain.
2    4.497068 0 hours elapsed. Max 0.3 hours remain.
3    4.467757 0 hours elapsed. Max 0.3 hours remain.
4    4.44437 0 hours elapsed. Max 0.3 hours remain.
5    4.424864 0 hours elapsed. Max 0.3 hours remain.
6    4.408139 0 hours elapsed. Max 0.3 hours remain.
7    4.39349 0 hours elapsed. Max 0.3 hours remain.
8    4.380426 0 hours elapsed. Max 0.3 hours remain.
9    4.368588 0.1 hours elapsed. Max 0.2 hours remain.
10    4.357714 0.1 hours elapsed. Max 0.2 hours remain.
11    4.347608 0.1 hours elapsed. Max 0.2 hours remain.
12    4.338124 0.1 hours elapsed. Max 0.2 hours remain.
13    4.329153 0.1 hours elapsed. Max 0.2 hours remain.
14    4.320613 0.1 hours elapsed. Max 0.2 hours remain.
15    4.312441 0.1 hours elapsed. Max 0.2 hours remain.
16    4.304587 0.1 hours elapsed. Max 0.2 hours remain.
17    4.297015 0.1 hours elapsed. Max 0.2 hours remain.
18    4.289695 0.1 hours elapsed. Max 0.2 hours remain.
19    4.282602 0.1 hours elapsed. Max 0.2 hours remain.
20    4.275717 0.1 hours elapsed. Max 0.2 hours remain.
21    4.269025 0.1 hours elapsed. Max 0.2 hours remain.
22    4.26251 0.1 hours elapsed. Max 0.2 hours remain.
23    4.256163 0.1 hours elapsed. Max 0.2 hours remain.
24    4.249971 0.1 hours elapsed. Max 0.2 hours remain.
25    4.243927 0.2 hours elapsed. Max 0.2 hours remain.
26    4.238022 0.2 hours elapsed. Max 0.1 hours remain.
27    4.23225 0.2 hours elapsed. Max 0.1 hours remain.
28    4.226603 0.2 hours elapsed. Max 0.1 hours remain.
29    4.221077 0.2 hours elapsed. Max 0.1 hours remain.
30    4.215665 0.2 hours elapsed. Max 0.1 hours remain.
31    4.210363 0.2 hours elapsed. Max 0.1 hours remain.
32    4.205165 0.2 hours elapsed. Max 0.1 hours remain.
33    4.200068 0.2 hours elapsed. Max 0.1 hours remain.
34    4.195068 0.2 hours elapsed. Max 0.1 hours remain.
35    4.19016 0.2 hours elapsed. Max 0.1 hours remain.
36    4.185341 0.2 hours elapsed. Max 0.1 hours remain.
37    4.180609 0.2 hours elapsed. Max 0.1 hours remain.
38    4.175959 0.2 hours elapsed. Max 0.1 hours remain.
39    4.171388 0.2 hours elapsed. Max 0.1 hours remain.
40    4.166895 0.2 hours elapsed. Max 0.1 hours remain.
41    4.162475 0.2 hours elapsed. Max 0.1 hours remain.
42    4.158127 0.3 hours elapsed. Max 0 hours remain.
43    4.153848 0.3 hours elapsed. Max 0 hours remain.
44    4.149636 0.3 hours elapsed. Max 0 hours remain.
45    4.145488 0.3 hours elapsed. Max 0 hours remain.
46    4.141403 0.3 hours elapsed. Max 0 hours remain.
47    4.137378 0.3 hours elapsed. Max 0 hours remain.
48    4.133412 0.3 hours elapsed. Max 0 hours remain.
49    4.129502 0.3 hours elapsed. Max 0 hours remain.
50    4.125646 0.3 hours elapsed. Max 0 hours remain.

In [13]:
attributes(k_pickups)


$names
  1. 'mu'
  2. 'U'
  3. 'PCs'
  4. 'M'
  5. 'family'
  6. 'iters'
  7. 'loss_trace'
  8. 'prop_deviance_expl'
$class
'gpca'

In [ ]:
k_pickups$prop_deviance_expl

In [32]:
write.csv(k_pickups$mu, file="/data/poisson_pca/pickups_mu.csv")
write.csv(k_pickups$U, file="/data/poisson_pca/pickups_U.csv")
write.csv(k_pickups$PCs, file="/data/poisson_pca/pickups_PCs.csv")
write.csv(k_pickups$loss_trace, file="/data/poisson_pca/pickups_loss_trace.csv")
write.csv(k_pickups$)
write(serializeJSON(k_pickups, pretty=TRUE), file="/data/poisson_pca/pickups.json")

In [ ]:
k_dropoffs <- generalizedPCA(data.matrix(dropoffs_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)

In [ ]:
write.csv(k_dropoffs$mu, file="/data/poisson_pca/dropoffs_mu.csv")
write.csv(k_dropoffs$U, file="/data/poisson_pca/dropoffs_U.csv")
write.csv(k_dropoffs$PCs, file="/data/poisson_pca/dropoffs_PCs.csv")
write.csv(k_dropoffs$loss_trace, file="/data/poisson_pca/dropoffs_loss_trace.csv")
write.csv(k_dropoffs$)
write(serializeJSON(k_dropoffs, pretty=TRUE), file="/data/poisson_pca/dropoffs.json")

In [ ]:
k_dropoffs$prop_deviance_expl

In [ ]:
k_counts <- generalizedPCA(data.matrix(counts_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)

In [ ]:
write.csv(k_counts$mu, file="/data/poisson_pca/counts_mu.csv")
write.csv(k_counts$U, file="/data/poisson_pca/counts_U.csv")
write.csv(k_counts$PCs, file="/data/poisson_pca/counts_PCs.csv")
write.csv(k_counts$loss_trace, file="/data/poisson_pca/counts_loss_trace.csv")
write.csv(k_counts$)
write(serializeJSON(k_counts, pretty=TRUE), file="/data/poisson_pca/counts.json")

In [ ]:
k_counts$prop_deviance_expl