In [1]:
from ipyparallel import Client  # IPython.parallel now lives in the ipyparallel package
# create a load-balanced view of the ipcluster;
# ipcluster must be running on the notebook server
pc = Client()
lbv = pc.load_balanced_view()
pc.clear()  # make sure engine namespaces are clean
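As a quick sanity check (not in the original notebook), it can help to confirm that engines are actually registered before dispatching any work to them:
In [ ]:
# optional sanity check: confirm engines are up before dispatching work
print('%d engines available' % len(pc.ids))
assert len(pc.ids) > 0, 'start ipcluster before running the rest of this notebook'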
In [2]:
%%px
# import the required packages on each parallel engine
# (utils must be importable on the engines as well)
import numpy as np
import pandas as pd
import re
from utils import loadmat_url
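loadmat_url comes from the notebook's local utils module, which isn't shown here. A minimal sketch of what such a helper might look like, assuming scipy and requests are available (the real implementation may differ):
In [ ]:
# illustrative sketch only; the real helper lives in utils.py
import requests
from io import BytesIO
from scipy.io import loadmat

def loadmat_url(url):
    # fetch a .mat file over HTTP and parse it in memory;
    # squeeze_me=True (an assumption) collapses 1x1 and 1-D arrays
    resp = requests.get(url)
    resp.raise_for_status()
    return loadmat(BytesIO(resp.content), squeeze_me=True)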
In [3]:
@lbv.parallel()
def counts(args):
    # compute cell counts per bin;
    # each arg is a (date, bin PID) tuple
    date, pid = args
    cs_url = pid + '_class_scores.mat'
    try:
        mat = loadmat_url(cs_url)
    except Exception:
        # the load failed; return None, which the caller
        # must be prepared to handle
        return None
    # get the list of class names
    classes = mat['class2useTB'][:-1]
    # create a DataFrame from the score matrix, indexed by ROI number
    scores = pd.DataFrame(mat['TBscores'], index=mat['roinum'], columns=classes)
    # for each ROI, find its highest-scoring class, then count ROIs per class
    counts = scores.idxmax(axis=1).value_counts()
    # include classes that have zero counts
    zeroes = pd.Series(np.zeros(len(classes)), index=classes)
    row = (zeroes + counts).fillna(0)
    # note: with newer versions of pandas the preceding two steps can be done using Categorical:
    # row = pd.Categorical(scores.idxmax(axis=1), categories=classes).value_counts()
    # row.index = row.index.astype(str)
    row = row.to_frame().T
    # add the bin LID, which will be used later to merge in sample volumes
    row.insert(0, 'lid', re.sub(r'.*/', '', pid))
    # index the row DataFrame by date
    row.index = [date]
    return row
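The idxmax/value_counts idiom above is the heart of counts(); a toy example with made-up scores and class names shows what it produces:
In [ ]:
# toy demonstration of the counting idiom used above (made-up data)
import pandas as pd
demo = pd.DataFrame(
    [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]],
    index=[1, 2, 3],               # ROI numbers
    columns=['classA', 'classB'])  # class names
# idxmax picks each ROI's best class; value_counts tallies ROIs per class
demo.idxmax(axis=1).value_counts()
# -> classA: 2, classB: 1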
In [4]:
import requests
# use the dashboard metrics endpoint to get a list of dates and bin PIDs over a
# selected time range (any metric works; we only need the date/PID of each record)
START = '2016-05-01'
END = '2016-05-10'
feed_url = 'http://ifcb-data.whoi.edu/mvco/api/feed/temperature/start/%s/end/%s' % (START, END)
feed = requests.get(feed_url).json()
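Each feed record is expected to carry at least 'date' and 'pid' fields (plus the metric value, which is unused here); peeking at one record confirms the shape:
In [ ]:
# inspect one feed record; counts() only uses the 'date' and 'pid' fields
feed[0]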
In [ ]:
import pandas as pd
from utils import progress_map
# construct the (date, pid) argument tuples for counts()
args = [(e['date'], e['pid']) for e in feed]
# collect the result rows, skipping bins whose class scores failed to load
rows = [row for row in progress_map(counts, args) if row is not None]
# concatenate them into a single time-series DataFrame
ts = pd.concat(rows)
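progress_map is another helper from the local utils module. A plausible sketch, assuming it wraps the parallel function's map() and yields results as they complete with a simple progress counter (the real helper may differ):
In [ ]:
# illustrative sketch only; the real helper lives in utils.py
import sys

def progress_map(parallel_fn, args):
    # submit all the work at once via the load-balanced view ...
    amr = parallel_fn.map(args)
    # ... then yield results as they complete, printing a progress counter
    for i, result in enumerate(amr):
        sys.stdout.write('\r%d/%d' % (i + 1, len(args)))
        sys.stdout.flush()
        yield result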
In [6]:
import pandas as pd
from utils import loadmat_url
# now we need the sample volume for each bin. this is in a sidecar file
# which has been staged to Google Drive.
# given a file ID, generate a direct Google Drive link
def gdrive_link(file_id):
    return 'https://drive.google.com/uc?export=download&id=%s' % file_id
# ml_analyzed for 2016 is in Google Drive
file_id = '0BzoJnj-e6BWpQmRCWk1tdS1tUTQ'
# load it
mat = loadmat_url(gdrive_link(file_id))
# make it into a DataFrame
vol = pd.DataFrame({
    'lid': mat['filelist_all'],
    'volume': mat['ml_analyzed'],
})
vol.head()
Out[6]:
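Before merging, an optional check (not in the original notebook) of how many count rows have a matching volume record; unmatched LIDs will be dropped by the join below:
In [ ]:
# optional sanity check: fraction of count rows with a matching volume record
ts['lid'].isin(vol['lid']).mean()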
In [7]:
# now join the per-bin sample volumes to the class counts on the bin LID,
# keeping the date index of the counts DataFrame
merged = ts.join(vol.set_index('lid'), on='lid', how='inner')
merged.pop('lid') # don't need the LID anymore
vols = merged.pop('volume') # but we do need the volume
# finally, divide the class counts by the volumes to get abundance
abundance = merged.divide(vols,axis=0)
# type-coerce the index to be a datetime index
abundance.index = pd.to_datetime(abundance.index)
abundance.head()
Out[7]:
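The divide(..., axis=0) call broadcasts the volume Series across the rows; a toy illustration with made-up numbers:
In [ ]:
# toy illustration of divide(..., axis=0): each row of counts is divided
# elementwise by the matching sample volume (made-up numbers)
demo_counts = pd.DataFrame({'classA': [10., 20.], 'classB': [30., 40.]})
demo_vols = pd.Series([5., 10.])
demo_counts.divide(demo_vols, axis=0)
# -> row 0 divided by 5, row 1 divided by 10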
In [8]:
%matplotlib inline
# downsample to 12-hour bins, averaging within each bin, to smooth the series
smoothed = abundance.resample('12h').mean()
# plot the smoothed abundance of one class ('dino30')
smoothed.plot(y='dino30')
Out[8]: