In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
import dask.dataframe as dd
import dask.distributed
import numpy as np
import pandas as pd
# import geopandas as gpd
from matplotlib.colors import SymLogNorm as symlog
from matplotlib import rcParams
import sklearn, sklearn.cluster
import matplotlib.pyplot as plt
import palettable
import seaborn as sns
import netCDF4
import geopandas
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100
In [2]:
client = dask.distributed.Client()
In [45]:
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')
In [4]:
rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 12
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'
rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import seaborn as sns
import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'HelveticaOblique.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'HelveticaBold.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file='
'{0}/Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
In [5]:
df = dd.read_parquet('/data/all_trips.parquet', index='trip_id',
columns='pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id'.split())
In [6]:
df2 = df.sample(frac=1.0e-6, random_state=42).compute()
In [7]:
df2 = df2.dropna()
In [8]:
df3 = df2.merge(
tzdf['LocationID borough zone'.split()], left_on='pickup_taxizone_id', right_on='LocationID'
)
df3['pickup_location'] = df3.borough.map(str) + " | " + df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.merge(
tzdf['LocationID borough zone'.split()], left_on='dropoff_taxizone_id', right_on='LocationID'
)
df3['dropoff_location'] = df3.borough.map(str) + " | " + df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)
In [9]:
df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html().replace("""\n""", "")
Out[9]:
In [10]:
from IPython.display import HTML
HTML(df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html())
Out[10]:
In [11]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['pickup_taxizone_id', 'dropoff_taxizone_id'])
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)
In [12]:
df.head()
Out[12]:
In [13]:
count_dataframe = df.reset_index().groupby(['pickup_taxizone_id', 'dropoff_taxizone_id']).count().compute()
count_dataframe.columns = ['count']
count_dataframe.shape
Out[13]:
In [14]:
count_dataframe.head()
Out[14]:
In [15]:
count_matrix = np.zeros((267, 267), dtype=np.int64)
for r in count_dataframe.reset_index().itertuples():
count_matrix[r[1], r[2]] = r[3]
In [16]:
count_dataframe.describe()
Out[16]:
In [17]:
count_dataframe.reset_index().head()
Out[17]:
In [18]:
# <!-- collapse=True -->
plt.imshow(count_matrix[1:-3, 1:-3].T, norm=symlog(10000), origin='upper', cmap=plt.cm.Blues)
plt.grid(False)
plt.xlabel("Dropoff Taxi Zone ID")
plt.ylabel("Pickup Taxi Zone ID")
plt.gcf().set_size_inches(4, 4)
In [19]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['pickup_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
In [20]:
def get_year_mo_day(data, col):
# d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
# return d
return data.index.values.astype('M8[h]')
In [21]:
df['pickup_ymd'] = df.map_partitions(get_year_mo_day, 'pickup_datetime', meta=('asdf', np.datetime64))
In [22]:
df.reset_index().rename(columns=dict(index='N')).tail()
Out[22]:
In [23]:
pickup_counts_df = df.reset_index().rename(columns=dict(index='N')).groupby(['pickup_taxizone_id', 'pickup_ymd',]).count().compute()
pickup_counts_df.sort_index(inplace=True)
In [24]:
pickup_counts_df.head()
Out[24]:
In [25]:
z = pickup_counts_df.unstack(0)
In [26]:
z.columns = np.arange(1, 267).astype(str)
In [27]:
z = z.merge(
pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')),
how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)
In [28]:
z.head()
Out[28]:
In [29]:
import fastparquet
fastparquet.write('/data/trips_pickups_matrix.parquet', z, compression='SNAPPY')
In [30]:
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)
In [31]:
def get_year_mo_day(data, col):
# d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
# return d
return data.index.values.astype('M8[h]')
In [32]:
df['dropoff_ymd'] = df.map_partitions(get_year_mo_day, 'dropoff_datetime', meta=('asdf', np.datetime64))
In [33]:
df.reset_index(drop=True).tail()
Out[33]:
In [34]:
dropoff_counts_df = df.reset_index(drop=True).rename(columns=dict(dropoff_datetime='N')).groupby(['dropoff_taxizone_id', 'dropoff_ymd',]).count().compute()
dropoff_counts_df.sort_index(inplace=True)
In [35]:
dropoff_counts_df.head()
Out[35]:
In [36]:
z2 = dropoff_counts_df.unstack(0)
In [37]:
z2 = z2.merge(
pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')),
how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)
In [38]:
z2.columns = np.arange(1, 267).astype(str)
In [39]:
z2.head()
Out[39]:
In [40]:
import fastparquet
fastparquet.write('/data/trips_dropoffs_matrix.parquet', z2, compression='SNAPPY')
In [86]:
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')
In [87]:
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()
In [88]:
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]
In [135]:
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )
In [321]:
tzdf.zone[0]
Out[321]:
In [324]:
sns.distplot(counts_matrix.iloc[:, 263+0], kde=False)
sns.distplot(counts_matrix.iloc[:, 0], kde=False)
Out[324]:
In [136]:
import sklearn, sklearn.decomposition
In [414]:
# pca = sklearn.decomposition.PCA(n_components=20, whiten=True)
# # pca.fit(counts_matrix.resample('1D').sum().values)
# pca.fit(counts_matrix.values)
# pca.explained_variance_ratio_
In [500]:
pca = sklearn.decomposition.FastICA(n_components=3, random_state=42, whiten=True)
# pca.fit(counts_matrix.resample('1D').sum().values)
yvals = pca.fit_transform(counts_matrix.values)
# pca.explained_variance_ratio_
In [501]:
yvals.shape
Out[501]:
In [502]:
pickup_eof1, dropoff_eof1 = pca.components_[0, :263], pca.components_[0, 263:]
pickup_eof2, dropoff_eof2 = pca.components_[1, :263], pca.components_[1, 263:]
pickup_eof3, dropoff_eof3 = pca.components_[2, :263], pca.components_[2, 263:]
# pickup_eof4, dropoff_eof4 = pca.components_[3, :263], pca.components_[3, 263:]
# pickup_eof5, dropoff_eof5 = pca.components_[4, :263], pca.components_[4, 263:]
In [503]:
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
# tzdf['pEOF4'] = pickup_eof4
# tzdf['dEOF4'] = dropoff_eof4
# tzdf['pEOF5'] = pickup_eof5
# tzdf['dEOF5'] = dropoff_eof5
In [504]:
tzdf['N_dropoffs'] = dropoffs_matrix.sum(axis=0).values
tzdf['N_pickups'] = pickups_matrix.sum(axis=0).values
In [505]:
tzdf['log10_N_dropoffs'] = np.log10(tzdf.N_dropoffs)
tzdf['log10_N_pickups'] = np.log10(tzdf.N_pickups)
In [506]:
tzdf = tzdf.to_crs({'init': 'epsg:3857'})
In [507]:
tzdf.head()
Out[507]:
In [526]:
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
In [527]:
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')
In [528]:
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_dropoffs_ranked', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
In [529]:
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_pickups_ranked', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
In [530]:
tzdf2.iloc[:, -10:].describe()
Out[530]:
In [531]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 8), alpha=1, column='pEOF1', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
In [537]:
import pysal.esda.mapclassify
In [544]:
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF2', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)