notebook.community

Edit and run



In [1]:

    
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, visualize
import bokeh.plotting as bp



In [2]:

    
ProgressBar().register()
bp.output_notebook()









    





    
        
        Loading BokehJS ...



In [3]:

    
cols = ['square_id', 'timestamp', 'country_code',
        'sms_in', 'sms_out','call_in','call_out', 'internet']

dtypes = {'square_id': int, 'timestamp': int, 'countrycode': int, 
          'sms_in': float,'sms_out': float, 'call_in': float, 'call_out': float, 'internet': float}



In [4]:

    
#reads file from external SSD because of txt files account for 20,8GB
df = dd.read_csv('/Volumes/Samsung_T5/data/cells/*.txt', 
                 header=0, 
                 names=cols, 
                 dtype=dtypes, 
                 sep="\t")



In [5]:

    
df.head(2)









    



[########################################] | 100% Completed |  1.2s






    Out[5]:







  
    
      
      square_id
      timestamp
      country_code
      sms_in
      sms_out
      call_in
      call_out
      internet
    
  
  
    
      0
      1
      1383260400000
      39
      0.141864
      0.156787
      0.160938
      0.052275
      11.028366
    
    
      1
      1
      1383261000000
      0
      0.136588
      NaN
      NaN
      0.027300
      NaN



In [6]:

    
df.tail(2)









    



[########################################] | 100% Completed |  0.2s






    Out[6]:







  
    
      
      square_id
      timestamp
      country_code
      sms_in
      sms_out
      call_in
      call_out
      internet
    
  
  
    
      137425
      9999
      1388616600000
      0
      0.085995
      NaN
      NaN
      NaN
      NaN
    
    
      137426
      9999
      1388616600000
      39
      0.328587
      0.50038
      NaN
      0.085995
      13.823244



In [7]:

    
#Show count of data in dask dataframe
with Profiler() as prof, ResourceProfiler() as rprof:
    records = df.count().compute()
print(records)
visualize([prof, rprof])









    



[########################################] | 100% Completed |  2min 46.7s
square_id       319896227
timestamp       319896227
country_code    319896227
sms_in          173720327
sms_out          99966079
call_in          98596516
call_out        138075046
internet        160681340
dtype: int64






    






    







    














    Out[7]:




Column(
id = 'dadf63c7-0298-4ed0-aa26-f285b4db0411', …)
children = [ToolbarBox(id='5a30c8fd-47a1-45ad-afe2-3ee706239c8c', ...), Column(id='2a5b53d1-2017-4acf-8da6-0bd809b606db', ...)],
css_classes = [],
disabled = False,
height = None,
js_event_callbacks = {},
js_property_callbacks = {},
name = None,
sizing_mode = 'fixed',
subscribed_events = [],
tags = [],
width = None)



In [8]:

    
#The dask DataFrame is partitioned into chunks along the index. 
#To see how many partitions, you can use the npartitions attribute.
df.npartitions









    Out[8]:





353



In [9]:

    
#show highest internet traffic squares by country code
with Profiler() as prof, ResourceProfiler() as rprof:
    top_internet_cells = df.internet.groupby([df.square_id, df.country_code]).mean().nlargest(5).compute()
print(top_internet_cells)
visualize([prof, rprof])









    



[########################################] | 100% Completed |  3min 43.1s
square_id  country_code
5161       39              1405.944113
5059       39              1238.826801
5259       39              1166.016916
5061       39              1053.595438
5258       39               968.056671
Name: internet, dtype: float64






    






    







    














    Out[9]:




Column(
id = '6f005b26-1ad6-4029-850b-5f2eb654a9e2', …)
children = [ToolbarBox(id='7d45d2ae-9406-49d8-8049-d1113a341019', ...), Column(id='cf6473a8-dbb6-45bf-87a8-0d975089dff9', ...)],
css_classes = [],
disabled = False,
height = None,
js_event_callbacks = {},
js_property_callbacks = {},
name = None,
sizing_mode = 'fixed',
subscribed_events = [],
tags = [],
width = None)



In [ ]:

	square_id	timestamp	country_code	sms_in	sms_out	call_in	call_out	internet
0	1	1383260400000	39	0.141864	0.156787	0.160938	0.052275	11.028366
1	1	1383261000000	0	0.136588	NaN	NaN	0.027300	NaN

	square_id	timestamp	country_code	sms_in	sms_out	call_in	call_out	internet
137425	9999	1388616600000	0	0.085995	NaN	NaN	NaN	NaN
137426	9999	1388616600000	39	0.328587	0.50038	NaN	0.085995	13.823244