In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import geopandas as gpd
import sklearn, sklearn.decomposition
# import seaborn as sns
import seaborn.apionly as sns
from matplotlib import rcParams
## Matplotlib has some pretty annoying defaults. The rest of this cell is to
## produce nice plots, for my definition of nice
plt.style.use('seaborn-darkgrid')
rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 10
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'
rcParams['axes.titlesize'] = 11
rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
# All six mathtext font classes point at local Helvetica variants.
mathtext_fonts = {'it': 'HelveticaOblique.ttf', 'bf': 'HelveticaBold.ttf',
                  'rm': 'Helvetica.ttf', 'tt': 'Helvetica.ttf',
                  'cal': 'Helvetica.ttf', 'sf': 'Helvetica.ttf'}
for key, fontfile in mathtext_fonts.items():
    rcParams['mathtext.' + key] = (':family=sans-serif:style=normal:variant='
                                   'normal:weight=normal:stretch=normal:'
                                   'file={0}/{1}:{2}').format(fontdirpath, fontfile, fontsize2)
%config InlineBackend.figure_format='retina'
In [2]:
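# Load the taxi zone shapefile and reproject to Web Mercator (EPSG:3857) for plotting.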
tzdf = gpd.read_file('../shapefiles/taxi_zones.shp').drop(['OBJECTID', 'Shape_Area', 'Shape_Leng'], axis=1)
tzdf = tzdf.to_crs({'init': 'epsg:3857'})
In [3]:
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()
In [4]:
# drop last three columns that indicate unknown taxi zone for the trip
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]
In [5]:
dropoffs_matrix.to_csv('/data/trips_dropoffs_matrix.csv', index=False)
pickups_matrix.to_csv('/data/pickups_matrix.csv', index=False)
In [6]:
# concat along the taxizone_id dimension. This effectively makes a 526 column
# dataframe that has both pickup and dropoff information.
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1)
In [7]:
counts_matrix.head()
Out[7]:
In [8]:
counts_matrix.to_csv('/data/counts_matrix.csv', index=False)
In [9]:
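# Total dropoffs per taxi zone over the whole record, sorted descending.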
zz = pd.DataFrame(dropoffs_matrix.sum(axis=0).sort_values(ascending=False))
zz.index = zz.index.astype(np.int32)
zz.columns = ["N_dropoffs"]
In [10]:
from IPython.display import HTML
HTML(zz.merge(tzdf, left_index=True, right_on='LocationID').iloc[:15].to_html())
Out[10]:
In [11]:
HTML(zz.merge(tzdf[tzdf.borough != 'Manhattan'], left_index=True, right_on='LocationID').iloc[:15].to_html())
Out[11]:
In [12]:
def plot_zone(tzid=160, ax=None, xl=True, yl=True):
    # counts_matrix columns 0-262 are dropoffs and 263-525 are pickups
    # (see the concat order in cell 6), so index accordingly.
    sns.distplot(counts_matrix.iloc[:, 263+tzid-1], kde=False, label='pickups', ax=ax)
    sns.distplot(counts_matrix.iloc[:, tzid-1], kde=False, color='gray', label='dropoffs', ax=ax)
    if ax is not None:
        plt.sca(ax)
    plt.legend()
    plt.title(tzdf.loc[tzid-1]['zone'])
    if xl:
        plt.xlabel("Hourly Pickups or Dropoffs")
    else:
        plt.xlabel("")
    if yl:
        plt.ylabel('N')
    else:
        plt.ylabel("")
In [13]:
f, axlist = plt.subplots(5, 2)
ids = [161, 79, 237, 234, 132, 138, 181, 65, 92, 62]
for j, (i, ax) in enumerate(zip(ids, axlist.ravel())):
    plot_zone(i, ax=ax,
              xl=(j >= 6),
              yl=(j % 2 == 0))
plt.gcf().set_size_inches(8, 10)
plt.tight_layout()
Looking at the distributions of pickups and dropoffs for the ten arbitrary taxi zones above, it is clear that no single family of distributions can plausibly describe them all. Let's try a few different decompositions: plain and whitened PCA, ICA, sparse PCA, and (below, in R) a Poisson generalized PCA. In pickups_matrix, dropoffs_matrix, and the combined counts_matrix, one axis represents time and the other represents space, so each decomposition yields paired temporal components and spatial patterns.
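To make that duality concrete, every decomposition below fits the same generic low-rank model (the notation here is mine, not from the original notebook):

$$X_{t,s} \;\approx\; \mu_s + \sum_{k=1}^{K} a_k(t)\, v_k(s),$$

where $X_{t,s}$ is the count at hour $t$ in zone $s$, $a_k$ is the $k$-th temporal principal component, and $v_k$ is the matching spatial pattern (EOF); the methods differ only in the constraints and noise model they place on $a_k$ and $v_k$.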
In [14]:
pickups_matrix.head()
Out[14]:
In [15]:
dropoffs_matrix.head()
Out[15]:
In [16]:
pca_pickup = sklearn.decomposition.PCA(n_components=5)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
pca_pickup.explained_variance_ratio_
Out[16]:
In [17]:
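# Same fit, but whiten=True rescales each component score to unit variance,
# which puts the five temporal PCs on a comparable amplitude scale.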
pca_pickup = sklearn.decomposition.PCA(n_components=5, whiten=True)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
pca_pickup.explained_variance_ratio_
Out[17]:
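As a quick sanity check (a minimal sketch added here, not a cell from the original notebook), the fitted PCA can map the five temporal PCs back through the spatial EOFs to see how much of the hourly signal a rank-5 model retains:

# inverse_transform undoes the whitening and projects the PC scores back
# through the components, yielding a rank-5 approximation of the data.
X_hat = pca_pickup.inverse_transform(pickups_pcs)  # shape: (n_hours, 263)
print('rank-5 reconstruction RMSE:', np.sqrt(np.mean((pickups_matrix.values - X_hat) ** 2)))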
In [18]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]
In [19]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5
In [20]:
tzdf.head()
Out[20]:
In [21]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
In [22]:
tzdf2.iloc[:, -5:].describe()
Out[22]:
In [23]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu,
               edgecolor='k', linewidth=0.5, vmin=-0.2, vmax=0.2, ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()
In [24]:
for i, y in enumerate(pickups_pcs.T):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
plt.legend()
# plt.xlim('2014-01-01', '2014-07-01')
Out[24]:
In [25]:
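# FastICA recovers statistically independent (maximally non-Gaussian) temporal
# signals rather than orthogonal directions of maximum variance; it has no
# explained_variance_ratio_, hence the commented-out line below.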
pca_pickup = sklearn.decomposition.FastICA(n_components=5)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
# pca_pickup.explained_variance_ratio_
In [26]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]
In [27]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5
In [28]:
tzdf.head()
Out[28]:
In [29]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
In [30]:
tzdf2.iloc[:, -5:].describe()
Out[30]:
In [31]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu,
               edgecolor='k', linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()
In [32]:
for i, y in enumerate(pickups_pcs.T):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
# plt.xlim('2014-01-01', '2014-07-01')
In [33]:
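# SparsePCA adds an L1 penalty (alpha) on the loadings, driving most spatial
# weights to exactly zero so each component highlights only a few zones.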
pca_pickup = sklearn.decomposition.SparsePCA(n_components=5, random_state=11, alpha=10)
pickups_pcs = pca_pickup.fit_transform(pickups_matrix.values)
# pca_pickup.explained_variance_ratio_
In [34]:
pickup_eof1 = pca_pickup.components_[0]
pickup_eof2 = pca_pickup.components_[1]
pickup_eof3 = pca_pickup.components_[2]
pickup_eof4 = pca_pickup.components_[3]
pickup_eof5 = pca_pickup.components_[4]
In [35]:
tzdf['pEOF1'] = pickup_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['pEOF5'] = pickup_eof5
In [36]:
tzdf.head()
Out[36]:
In [37]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
In [38]:
tzdf2.iloc[:, -5:].describe()
Out[38]:
In [39]:
fig, axlist = plt.subplots(1, 5)
for i, ax in enumerate(axlist.ravel()):
    tzdf2.plot(figsize=(8, 8), alpha=1, column='pEOF{}'.format(i+1), cmap=plt.cm.RdBu,
               edgecolor='k', linewidth=0.5, vmin=-0.2e6/(i+1), vmax=0.2e6/(i+1), ax=ax)
    ax.grid(False)
    ax.set_facecolor('xkcd:silver')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_aspect('equal', 'datalim')
fig.set_size_inches(14, 4)
plt.tight_layout()
In [40]:
for i, y in enumerate(pickups_pcs.T):
    plt.subplot(5, 1, i+1)
    plt.plot(pickups_matrix.index.values, y, label='pc{}'.format(i+1), lw=0.5)
    plt.xlim('2015-06-15', '2015-06-22')
plt.gcf().set_size_inches(11, 11)
# plt.xlim('2014-01-01', '2014-07-01')
In [1]:
pickups_matrix <- read.csv("/data/pickups_matrix.csv")
dropoffs_matrix <- read.csv("/data/trips_dropoffs_matrix.csv")
counts_matrix <- read.csv("/data/counts_matrix.csv")
In [2]:
data.matrix(head(pickups_matrix))
In [36]:
library("generalizedPCA")
library(jsonlite)
In [12]:
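# generalizedPCA with family="poisson" fits the principal components on the
# natural-parameter (log-rate) scale, which suits count data better than
# Gaussian PCA on the raw counts.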
k_pickups <- generalizedPCA(data.matrix(pickups_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)
In [13]:
attributes(k_pickups)
In [ ]:
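# Proportion of Poisson deviance explained by the rank-3 fit
# (the GLM analogue of PCA's explained variance ratio).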
k_pickups$prop_deviance_expl
In [32]:
write.csv(k_pickups$mu, file="/data/poisson_pca/pickups_mu.csv")
write.csv(k_pickups$U, file="/data/poisson_pca/pickups_U.csv")
write.csv(k_pickups$PCs, file="/data/poisson_pca/pickups_PCs.csv")
write.csv(k_pickups$loss_trace, file="/data/poisson_pca/pickups_loss_trace.csv")
write(serializeJSON(k_pickups, pretty=TRUE), file="/data/poisson_pca/pickups.json")
In [ ]:
k_dropoffs <- generalizedPCA(data.matrix(dropoffs_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)
In [ ]:
write.csv(k_dropoffs$mu, file="/data/poisson_pca/dropoffs_mu.csv")
write.csv(k_dropoffs$U, file="/data/poisson_pca/dropoffs_U.csv")
write.csv(k_dropoffs$PCs, file="/data/poisson_pca/dropoffs_PCs.csv")
write.csv(k_dropoffs$loss_trace, file="/data/poisson_pca/dropoffs_loss_trace.csv")
write(serializeJSON(k_dropoffs, pretty=TRUE), file="/data/poisson_pca/dropoffs.json")
In [ ]:
k_dropoffs$prop_deviance_expl
In [ ]:
k_counts <- generalizedPCA(data.matrix(counts_matrix), k=3, family="poisson", quiet=FALSE, max_iters=200)
In [ ]:
write.csv(k_counts$mu, file="/data/poisson_pca/counts_mu.csv")
write.csv(k_counts$U, file="/data/poisson_pca/counts_U.csv")
write.csv(k_counts$PCs, file="/data/poisson_pca/counts_PCs.csv")
write.csv(k_counts$loss_trace, file="/data/poisson_pca/counts_loss_trace.csv")
write(serializeJSON(k_counts, pretty=TRUE), file="/data/poisson_pca/counts.json")
In [ ]:
k_counts$prop_deviance_expl