Step 0: reading in data

vaex reads 'anything':

  • ds = vaex.open('super_fast.hdf5')
  • ds = vaex.open('gadget_is_fine.hdf5')
  • ds = vaex.from_pandas(df)
  • ds = vaex.from_astropy_table(table)
  • ds = vaex.from_ascii('takes_hours.asc')
  • ds = vaex.from_csv('this_may_be_slow.csv')
  • ds = vaex.from_arrays(x=x, y=y)

In [ ]:
%%time
ds = vaex.open("/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5")
!ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5

In [ ]:
# Bare expression: the notebook displays the dataset's repr as the cell output.
ds

In [ ]:
# Access the `trip_distance` attribute of the dataset and display it.
ds.trip_distance

In [ ]:
# Apply NumPy's log10 directly to the column object.
# NOTE(review): presumably this yields a lazy expression, not a computed array — confirm.
np.log10(ds.trip_distance)

0 dimensional


In [ ]:
# Count with no arguments — presumably the total number of rows (confirm).
ds.count()

In [ ]:
# Count for one expression (pickup_latitude).
# NOTE(review): presumably counts only non-missing values — confirm.
ds.count(ds.pickup_latitude)

In [ ]:
# Mean of pickup_latitude over the dataset, shown as the cell output.
ds.mean(ds.pickup_latitude)

1 dimensional


In [ ]:
# Same count, now binned along pickup_latitude restricted to [40.5, 41].
# No `shape` is given, so the number of bins is whatever the library defaults to.
ds.count(binby=ds.pickup_latitude, limits=[40.5, 41])

In [ ]:
# Compute the binned counts explicitly instead of relying on IPython's `_`
# (the last cell output), which silently becomes something else as soon as
# any other cell is executed in between.
latitude_counts = ds.count(binby=ds.pickup_latitude, limits=[40.5, 41])
plt.plot(latitude_counts)

2 dimensional


In [ ]:
# Bin the pickups on both coordinates at once (shape=128, no explicit limits)
# and report the shape of the resulting grid.
counts2d = ds.count(
    shape=128,
    binby=[ds.pickup_longitude, ds.pickup_latitude],
)
print(counts2d.shape)

In [ ]:
# The +1 keeps log10 finite for empty bins (count 0 -> log10(1) = 0).
# NOTE(review): the transpose plus origin='lower' presumably puts longitude on x
# and latitude increasing upwards on y — confirm against the grid's axis order.
plt.imshow(np.log10(counts2d+1).T, origin='lower')

In [ ]:
# Ask the dataset for limits on both pickup coordinates using the "98%" spec;
# the result is reused as `limits=` by the plotting cells further down.
# NOTE(review): presumably the range containing 98% of the data — confirm.
limits = ds.limits([ds.pickup_longitude, ds.pickup_latitude], "98%")
limits

In [ ]:
%%time
ds.plot(ds.pickup_longitude, ds.pickup_latitude, f="log1p",
        limits=limits, figsize=(10,8), shape=512, colormap="viridis")

In [ ]:
# %%timeit
# counts2d = ds.count(binby=["pickup_longitude", "pickup_latitude"], shape=128, limits=limits)#, limits=[[-90, 90], [-180, 180]])

Where to pick up customers?


In [ ]:
# Map of the mean total_amount per pickup-location bin, colour scale fixed to [0, 50].
ds.plot(
    ds.pickup_longitude,
    ds.pickup_latitude,
    what=vaex.stat.mean(ds.total_amount),
    limits=limits,
    shape=512,
    vmin=0,
    vmax=50,
    figsize=(10,8),
    colormap="Greys",
)

In [ ]:
# Inspect the extremes of trip_distance before choosing a filter range.
ds.trip_distance.minmax()

In [ ]:
# 1-D plot of trip_distance restricted to the range [0, 50].
ds.plot1d(ds.trip_distance, limits=[0, 50])

In [ ]:
# Keep only trips with 0 < trip_distance < 40. `ds` is rebound, so every cell
# below this one operates on the filtered dataset.
ds = ds[(ds.trip_distance > 0) & (ds.trip_distance < 40)] # no memory copy! and not wasting 46 GB of memory

In [ ]:
# Map of the mean total_amount / trip_distance per pickup-location bin,
# colour scale fixed to [0, 15].
ds.plot(
    ds.pickup_longitude,
    ds.pickup_latitude,
    what=vaex.stat.mean(ds.total_amount/ds.trip_distance),
    limits=limits,
    shape=512,
    vmin=0,
    vmax=15,
    figsize=(10,8),
    colormap="Greys",
)

Lazy expressions and virtual columns


In [ ]:
# Do not do this: dividing the underlying data arrays directly would
# (presumably) materialise a complete new column in memory.
#ratio = ds.data.total_amount/ds.data.trip_distance
# Size such a column would have, at 8 bytes per value
# (assumes a 64-bit dtype — TODO confirm).
print(len(ds.data.total_amount) * 8 / 1024**3, "GB")

In [ ]:
# The same division written on the dataset's column attributes instead of `ds.data`.
# NOTE(review): presumably this builds a lazy expression without allocating — confirm.
ds.total_amount/ds.trip_distance

In [ ]:
# Register the ratio under the name 'ratio' via item assignment, so later cells
# can use `ds.ratio`. The commented-out call is presumably the equivalent
# explicit-API spelling (confirm).
#ds.add_virtual_column("ratio", "total_amount/trip_distance")
ds['ratio'] = ds.total_amount / ds.trip_distance

In [ ]:
# Two spellings of the same statistic, displayed side by side as a tuple.
ds.mean(ds.ratio), ds.ratio.mean()

In [ ]:
def arc_distance(theta_1, phi_1, theta_2, phi_2, radius=6400):
    """Great-circle distance between two points on a sphere (haversine formula).

    All angles are in degrees. ``theta`` plays the role of the latitude and
    ``phi`` of the longitude: the cosine factor is applied to the ``theta``
    values and scales the ``phi`` difference term.

    Only arithmetic operators and NumPy ufuncs are used, so the function
    accepts plain scalars, NumPy arrays, or any object that supports those
    operations.

    Parameters
    ----------
    theta_1, phi_1 : latitude and longitude of the first point, in degrees.
    theta_2, phi_2 : latitude and longitude of the second point, in degrees.
    radius : radius of the sphere; the result is expressed in the same unit.
        The default of 6400 preserves the previously hard-coded value
        (a rounded Earth radius in km; the mean radius is about 6371 km).

    Returns
    -------
    The central angle between the two points multiplied by ``radius``.
    """
    # Haversine term: sin^2(dtheta/2) + cos(theta_1) * cos(theta_2) * sin^2(dphi/2)
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    # Central angle in radians; arctan2 is used instead of arcsin(sqrt(temp)).
    central_angle = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return central_angle * radius

In [ ]:
# Sanity check: theta is 0 for both points and phi differs by 180 degrees, so the
# central angle is pi and the result should be pi * 6400 (about 20106).
arc_distance(0, 0, 0, 180)

In [ ]:
# arc_distance applies the cosine factor to its theta arguments, i.e. theta is
# the latitude and phi the longitude, so each point must be passed as
# (latitude, longitude). Passing longitude first yields wrong distances.
ds["arc_distance"] = arc_distance(ds.pickup_latitude,  ds.pickup_longitude,
                                  ds.dropoff_latitude, ds.dropoff_longitude)

In [ ]:
%%time
# Time the mean of the arc_distance column as defined from the plain expression.
ds.arc_distance.mean()

In [ ]:
# Store the result of jit_numba() on the arc_distance expression as a second
# column, so the two variants can be timed against each other.
# The pythran alternative is left commented out.
ds['arc_distance_jit'] = ds.arc_distance.jit_numba()
# ds['arc_distance_jit'] = ds.arc_distance.jit_pythran()

In [ ]:
%%time
# Same statistic on the JIT variant, for comparison with the non-JIT timing.
ds.arc_distance_jit.mean()

In [ ]:
# Difference between the recorded trip distance and the arc distance.
# NOTE(review): the factor 1.6 presumably converts miles to km so both terms
# share the unit implied by arc_distance's 6400 scaling — confirm units.
ds['extra'] = (ds.trip_distance*1.6 - ds.arc_distance_jit)

In [ ]:
# Select trips whose pickup and drop-off longitudes differ; the following
# cells refer to this selection with selection=True.
ds.select(ds.pickup_longitude != ds.dropoff_longitude)

In [ ]:
# Mean and min/max of `extra`, both evaluated with selection=True, shown as a tuple.
ds.extra.mean(selection=True), ds.extra.minmax(selection=True)

In [ ]:
# 1-D plot of `extra` for the selection, restricted to the range [-5, 10].
ds.plot1d(ds.extra, selection=True, limits=[-5, 10])

In [ ]:
# Map of the mean `extra` per pickup-location bin for the selection,
# colour scale fixed to [0, 3].
ds.plot(
    ds.pickup_longitude,
    ds.pickup_latitude,
    what=vaex.stat.mean(ds.extra),
    selection=True,
    limits=limits,
    shape=512,
    vmin=0,
    vmax=3,
    figsize=(10,8),
    colormap="Greys",
)