In [ ]:
from dask_jobqueue import LSFCluster
from dask.distributed import Client
import dask.dataframe as dd
import dask.array as da

In [ ]:
# Choose a cluster based on the available job queues. Type 'bqueues' on pegasus to list them.

cluster = LSFCluster(cores=8, memory='25 GB', queue='general', walltime='00:10')
#cluster = LSFCluster(cores=8, memory='250 GB', queue='bigmem', walltime='00:10')

# Click on 'Manual Scaling' and choose 20 workers.
cluster
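
In [ ]:
# Equivalent to the 'Manual Scaling' step above, done from code instead of the
# widget (a sketch; 20 workers is just the target used in this notebook):
cluster.scale(20)

# Or let dask-jobqueue grow and shrink the worker pool with the workload:
# cluster.adapt(minimum=0, maximum=20)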

In [ ]:
client = Client(cluster)
# Click on the link for the Dask Dashboard
client
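
In [ ]:
# Workers arrive as LSF schedules the batch jobs, so they can take a while to
# connect. A sketch that blocks until the requested 20 workers are available:
client.wait_for_workers(n_workers=20)

# The dashboard URL is also available programmatically:
client.dashboard_link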

In [ ]:
# One point every 10 milliseconds for one year, partitioned (one pandas
# DataFrame per partition) by day: ~10 billion points, a ~100 GB data set.
df = dd.demo.make_timeseries(start='2000-01-01', end='2000-12-31',
                             dtypes={'x': float, 'y': float, 'id': int},
                             freq='10ms', partition_freq='24h')
df
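
In [ ]:
# Optional sanity check of the size estimate above (a sketch; these walk the
# whole dataset, so they are not free):
print(df.npartitions, 'partitions')
print(len(df), 'rows')
print(df.memory_usage(deep=True).sum().compute() / 1e9, 'GB')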

In [ ]:
df.head()

In [ ]:
# Load the data (persist) into memory
df = df.persist()
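
In [ ]:
# persist() returns immediately and the load continues in the background.
# A sketch that blocks until the data is actually in distributed memory:
from dask.distributed import wait
wait(df)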

In [ ]:
# Calculate the mean of the x and y columns for different ids
%time df.groupby('id')[['x', 'y']].mean().compute()
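
In [ ]:
# The same groupby supports several aggregations at once via pandas-style agg
# (a sketch; mean and std are just example reducers):
%time df.groupby('id')[['x', 'y']].agg(['mean', 'std']).compute()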

In [ ]:
# Calculate a rolling 1-minute window and take the standard deviation of each window.
# Remove the first and last day of data. Find the index that corresponds to the maximum.
%time df.x.rolling('1min').std().loc['2000-01-02':'2000-12-30'].idxmax().compute()
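
In [ ]:
# Because the index is a DatetimeIndex with known divisions, time-based
# resampling also works (a sketch; the hourly mean is just an example):
%time df.x.resample('1h').mean().compute()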

In [ ]:
# Calculate the singular value decomposition (SVD). df.values can have unknown
# chunk sizes along the rows, so convert with lengths=True to compute them first:
arr = df.to_dask_array(lengths=True) + 1
u, s, v = da.linalg.svd(arr)
%time s.compute()
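
In [ ]:
# For larger or wider problems an approximate SVD can be much cheaper than the
# exact one. A sketch using dask.array's randomized SVD (k=3 keeps all three
# singular values of this narrow array, so it is only illustrative here):
u2, s2, v2 = da.linalg.svd_compressed(arr, k=3)
%time s2.compute()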