In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [4]:
import os
# os.listdir() returns directory entries in arbitrary order; sort them so the
# per-file processing order (and anything logged per file) is reproducible.
files = sorted(os.listdir('/bigdata/all_trips.parquet/'))
import fastparquet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [5]:
# Number of entries found in the trips parquet directory.
len(files)
Out[5]:
In [9]:
# Count pickups per taxi zone in every file of the trips dataset, keeping only
# trips with pickup_datetime before 2016-07-01.
#
# The per-file counts are collected in a dict and combined into `alldf` once,
# after the loop. Compared with merging into a pre-existing `alldf` on every
# iteration this:
#   * makes the cell idempotent -- re-running it rebuilds `alldf` from scratch
#     instead of merging a second copy of every file into the old frame;
#   * gives every column a unique name (the file name) instead of relying on
#     merge suffixes (`_x`/`_y`), which collide once more than three frames
#     with the same column name have been merged;
#   * avoids re-copying the growing frame on every iteration.
#
# NOTE(review): every entry of `files` is opened as a parquet file. If the
# directory also holds non-data entries (e.g. dataset-level metadata files),
# they should be filtered out before this loop -- confirm against the
# directory contents.
counts_by_file = {}
for f in files:
    d = fastparquet.ParquetFile(os.path.join('/bigdata/all_trips.parquet/', f))
    df = d.to_pandas(columns=['pickup_taxizone_id', 'pickup_datetime'])
    df = df[df.pickup_datetime < '2016-07-01']
    # Progress log: file name and number of trips kept from it.
    print("{} {}".format(f, df.shape[0]))
    counts_by_file[f] = df.groupby('pickup_taxizone_id')['pickup_datetime'].count()

# One row per taxi zone, one column per file. Zones that do not appear in a
# given file are NaN in that file's column (the Series are aligned on the
# union of their indexes).
alldf = pd.DataFrame(counts_by_file)
In [19]:
# Per-zone totals: sum the count columns of alldf row-wise, treating NaN
# (zone absent from a file) as zero, and keep the base-10 log alongside.
trip_totals = np.nansum(alldf.values, axis=1)
zz = pd.DataFrame(
    {'N': trip_totals, 'logN': np.log10(trip_totals)},
    index=alldf.index,
)
In [20]:
import seaborn
# Distribution of log10(trips) per taxi zone, drawn as a normalised histogram
# with bin edges every 0.5 from 0 to 6.5. Missing logN values are plotted as 0.
log_counts = zz.logN.fillna(0)
bin_edges = np.arange(0, 7., 0.5)
seaborn.distplot(log_counts, bins=bin_edges, norm_hist=True)
# Optional axis styling, currently disabled:
# plt.xticks(np.linspace(0, 8, 17));
# plt.xlabel("Log10(Taxi Trips)")
# plt.ylabel("Frequency")
# plt.gcf().set_size_inches(8, 4)
Out[20]:
In [31]:
import geopandas as gpd
import matplotlib.pyplot as plt
# Load the taxi-zone shapefile and attach the per-zone trip totals from zz,
# matching each zone's LocationID against zz's index. This is a default
# (inner) merge, so zones without a matching entry in zz are dropped.
tz = gpd.read_file('../shapefiles/taxi_zones.shp').merge(
    zz,
    left_on='LocationID',
    right_index=True,
)
In [32]:
# Total trip count over the zones present in tz after the merge with zz.
tz.N.sum()
Out[32]:
In [33]:
# Choropleth of log10(trips) for every zone in tz; the colour scale is
# clamped to [2, 6.5].
ax = tz.plot(
    column='logN',
    cmap=plt.cm.viridis,
    linewidth=0.5,
    vmin=2,
    vmax=6.5,
)
ax.figure.set_size_inches(12, 9)
In [34]:
# Alias only: z refers to the same object as tz (no copy is made here).
z = tz
In [35]:
# Keep only zones whose borough is neither Staten Island nor EWR.
excluded_boroughs = ['Staten Island', 'EWR']
z = z[~z.borough.isin(excluded_boroughs)]
In [37]:
# Choropleth of log10(trips) for the zones in z, with the colour scale
# clamped to [2, 6.5].
ax = z.plot(
    column='logN',
    cmap=plt.cm.viridis,
    linewidth=0.5,
    vmin=2,
    vmax=6.5,
)
fig = ax.figure
fig.set_size_inches(12, 9)
fig.tight_layout()
In [ ]: