In [1]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, visualize
import bokeh.plotting as bp
In [2]:
# Register a global dask progress bar so every subsequent compute() reports progress.
progress_bar = ProgressBar()
progress_bar.register()

# Send bokeh plots (used by the profiler visualisations) to the notebook output.
bp.output_notebook()
In [3]:
# Column names assigned to the tab-separated input files, in file order.
cols = [
    'square_id', 'timestamp', 'country_code',
    'sms_in', 'sms_out', 'call_in', 'call_out', 'internet',
]

# Explicit dtype for every column so the reader does not have to infer them.
# NOTE(review): plain `int` cannot represent missing values; this assumes
# square_id, timestamp and country_code are never empty - confirm against the data.
dtypes = {
    'square_id': int,
    'timestamp': int,
    'country_code': int,  # key must match the name in `cols` (was misspelled 'countrycode')
    'sms_in': float,
    'sms_out': float,
    'call_in': float,
    'call_out': float,
    'internet': float,
}

# Guard against the two definitions drifting apart again.
assert set(dtypes) == set(cols), "dtypes keys must match the column names in cols"
In [4]:
# The txt files add up to 20.8 GB, so they are read from an external SSD.
# NOTE(review): header=0 combined with names= makes the parser treat the first
# line of each file as a header row and replace it with `cols` - confirm that
# the files really start with a header line, otherwise that row is dropped.
df = dd.read_csv(
    '/Volumes/Samsung_T5/data/cells/*.txt',
    sep="\t",
    header=0,
    names=cols,
    dtype=dtypes,
)
In [5]:
# Preview the first two rows of the dask DataFrame.
df.head(n=2)
Out[5]:
In [6]:
# Preview the last two rows of the dask DataFrame.
df.tail(n=2)
Out[6]:
In [7]:
# Count the non-null values in every column while profiling the computation.
with Profiler() as prof, ResourceProfiler() as rprof:
    column_counts = df.count()
    records = column_counts.compute()
print(records)
# Plot the collected task and resource profiles.
visualize([prof, rprof])
Out[7]:
In [8]:
# A dask DataFrame is partitioned into chunks along the index.
# The npartitions attribute reports how many partitions it has.
df.npartitions
Out[8]:
In [9]:
# Average the internet traffic per (square_id, country_code) pair and keep the
# five pairs with the highest mean, profiling the computation along the way.
group_keys = [df['square_id'], df['country_code']]
with Profiler() as prof, ResourceProfiler() as rprof:
    mean_internet = df['internet'].groupby(group_keys).mean()
    top_internet_cells = mean_internet.nlargest(5).compute()
print(top_internet_cells)
# Plot the collected task and resource profiles.
visualize([prof, rprof])
Out[9]:
In [ ]: