In [ ]:
%%time
# Open the NYC taxi 2015 HDF5 file with vaex; the %%time cell magic above
# reports how long the whole cell takes.
# NOTE(review): the path is specific to one machine — adjust before running.
ds = vaex.open("/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5")
# Shell escape: list the same file with a human-readable size.
!ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5
In [ ]:
# Display the dataset; a notebook cell renders the value of its last expression.
ds
In [ ]:
# Access a single column as an attribute of the dataset.
ds.trip_distance
In [ ]:
# Apply a NumPy function directly to the column object.
# NOTE(review): with vaex this presumably yields a lazy expression rather than
# a computed array — confirm against the vaex documentation.
np.log10(ds.trip_distance)
In [ ]:
# Count with no expression given.
ds.count()
In [ ]:
# Count for one specific column.
# NOTE(review): presumably counts only non-missing values — confirm.
ds.count(ds.pickup_latitude)
In [ ]:
# Mean of the pickup latitude over the dataset.
ds.mean(ds.pickup_latitude)
In [ ]:
# Count binned by pickup_latitude, with the binning range set to [40.5, 41].
ds.count(binby=ds.pickup_latitude, limits=[40.5, 41])
In [ ]:
# `_` is IPython's most recent displayed output, i.e. the binned counts from
# the previous cell, so this cell must be run directly after it.
plt.plot(_)
In [ ]:
# 2-D binned count over pickup longitude and latitude, requested with
# shape=128; no limits are passed here. The resulting array shape is printed.
counts2d = ds.count(binby=[ds.pickup_longitude, ds.pickup_latitude], shape=128)
print(counts2d.shape)
In [ ]:
# Show log10(counts + 1); the +1 avoids log10(0) for empty bins.
# NOTE(review): the transpose with origin='lower' presumably puts the first
# binby axis (longitude) on the horizontal axis, increasing upwards in
# latitude — confirm.
plt.imshow(np.log10(counts2d+1).T, origin='lower')
In [ ]:
# Ask vaex for limits of both coordinates using the "98%" spec, keep them in
# `limits` for the plotting cells further down, and display them.
limits = ds.limits([ds.pickup_longitude, ds.pickup_latitude], "98%")
limits
In [ ]:
%%time
# Timed 2-D plot of pickup longitude vs latitude with f="log1p", shape=512
# and the previously computed `limits`.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, f="log1p",
limits=limits, figsize=(10,8), shape=512, colormap="viridis")
In [ ]:
# Disabled benchmark: would time a shape=128 binned count restricted to
# `limits`, with the columns given by name as strings.
# %%timeit
# counts2d = ds.count(binby=["pickup_longitude", "pickup_latitude"], shape=128, limits=limits)#, limits=[[-90, 90], [-180, 180]])
In [ ]:
# Same axes and limits, but the plotted quantity is the mean of total_amount
# (what=vaex.stat.mean(...)), with the colour range fixed to 0..50.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, what=vaex.stat.mean(ds.total_amount),
vmin=0, vmax=50, shape=512, figsize=(10,8), limits=limits, colormap="Greys")
In [ ]:
# Minimum and maximum of trip_distance.
ds.trip_distance.minmax()
In [ ]:
# 1-D plot of trip_distance restricted to the range [0, 50].
ds.plot1d(ds.trip_distance, limits=[0, 50])
In [ ]:
# Rebind `ds` to the rows with 0 < trip_distance < 40; every later cell that
# uses `ds` works on this filtered dataset.
ds = ds[(ds.trip_distance > 0) & (ds.trip_distance < 40)] # no memory copy! and not wasting 46 GB of memory
In [ ]:
# Per-bin mean of total_amount / trip_distance, colour range fixed to 0..15.
# The filter above excludes trip_distance == 0, so the divisor is non-zero.
ds.plot(ds.pickup_longitude, ds.pickup_latitude,
what=vaex.stat.mean(ds.total_amount/ds.trip_distance),
vmin=0, vmax=15,
shape=512, figsize=(10,8), limits=limits, colormap="Greys")
In [ ]:
# Do not do this: dividing the raw ds.data arrays would build a full result
# array in memory.
#ratio = ds.data.total_amount/ds.data.trip_distance
# Size such an array would take: number of elements times 8 bytes, in GiB.
# NOTE(review): assumes 8-byte values — confirm the column dtype.
print(len(ds.data.total_amount) * 8 / 1024**3, "GB")
In [ ]:
# The same ratio written on the dataset's column objects instead of the
# ds.data arrays.
ds.total_amount/ds.trip_distance
In [ ]:
# Store the ratio on the dataset under the name 'ratio'. The disabled line
# below is the equivalent string-based add_virtual_column() spelling.
#ds.add_virtual_column("ratio", "total_amount/trip_distance")
ds['ratio'] = ds.total_amount / ds.trip_distance
In [ ]:
# Two spellings of the same aggregate: via the dataset and via the column.
ds.mean(ds.ratio), ds.ratio.mean()
In [ ]:
def arc_distance(theta_1, phi_1, theta_2, phi_2, radius=6400):
    """Return the great-circle distance between two points on a sphere.

    Implements the haversine formula. All angles are given in degrees.
    ``theta`` is the latitude-like angle (its cosine scales the ``phi``
    term) and ``phi`` is the longitude-like angle.

    Only arithmetic operators and NumPy functions are applied to the
    arguments, so scalars and NumPy arrays both work; other objects work
    if they support those operations.

    Parameters
    ----------
    theta_1, phi_1 : latitude and longitude of the first point, in degrees.
    theta_2, phi_2 : latitude and longitude of the second point, in degrees.
    radius : sphere radius; the result is in the same unit. The default of
        6400 roughly approximates Earth's radius in km (the mean radius is
        about 6371 km) and is kept for backward compatibility.

    Returns
    -------
    The central angle between the two points (radians) times ``radius``.
    """
    # Haversine term: sin^2(dtheta/2) + cos(theta_1) cos(theta_2) sin^2(dphi/2).
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
            + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180)
            * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    # arctan2 form of 2*asin(sqrt(temp)).
    distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return distance * radius
In [ ]:
# Sanity check: both points have theta = 0 and their phi values are 180
# degrees apart, so the result should be half a great circle,
# pi * 6400 (about 20106).
arc_distance(0, 0, 0, 180)
In [ ]:
# arc_distance() applies cos() to its theta arguments to scale the phi
# difference (haversine form), so theta must be the latitude and phi the
# longitude. Pass latitude first for both the pickup and the dropoff point.
ds["arc_distance"] = arc_distance(ds.pickup_latitude, ds.pickup_longitude,
                                  ds.dropoff_latitude, ds.dropoff_longitude)
In [ ]:
%%time
# Time the mean of the arc_distance column defined above; this is the
# baseline for the JIT comparison in the following cells.
ds.arc_distance.mean()
In [ ]:
# Add a second column built from the same expression passed through
# jit_numba(). The disabled line is the jit_pythran() alternative.
ds['arc_distance_jit'] = ds.arc_distance.jit_numba()
# ds['arc_distance_jit'] = ds.arc_distance.jit_pythran()
In [ ]:
%%time
# Time the same aggregate on the JIT-based column.
ds.arc_distance_jit.mean()
In [ ]:
# 'extra' = trip_distance * 1.6 minus the JIT-computed arc distance.
# NOTE(review): 1.6 looks like a miles-to-km factor to match arc_distance's
# default radius — confirm the unit of trip_distance.
ds['extra'] = (ds.trip_distance*1.6 - ds.arc_distance_jit)
In [ ]:
# Set the dataset's selection to rows whose pickup and dropoff longitudes
# differ.
ds.select(ds.pickup_longitude != ds.dropoff_longitude)
In [ ]:
# Mean and min/max of 'extra' with selection=True.
# NOTE(review): presumably this restricts the aggregate to the selection made
# by ds.select(...) in the previous cell — confirm.
ds.extra.mean(selection=True), ds.extra.minmax(selection=True)
In [ ]:
# 1-D plot of 'extra' with selection=True, restricted to [-5, 10].
ds.plot1d(ds.extra, selection=True, limits=[-5, 10])
In [ ]:
# Per-bin mean of 'extra' over pickup position with selection=True, colour
# range fixed to 0..3, reusing `limits` from earlier in the notebook.
ds.plot(ds.pickup_longitude, ds.pickup_latitude, what=vaex.stat.mean(ds.extra),
selection=True, vmin=0, vmax=3,
shape=512, figsize=(10,8), limits=limits, colormap="Greys")