In [1]:
from IPython.parallel import Client  # in IPython >= 4 this module lives in the ipyparallel package

# create a load balanced view of the ipcluster
# ipcluster must be running on the notebook server
pc = Client()
lbv = pc.load_balanced_view()
pc.clear() # make sure namespaces are clean
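
A quick sanity check that engines are actually registered (the `ipcluster start -n 4` invocation is just an example; any engine count works):

print(len(pc.ids))  # number of engines currently registered with the controller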

In [2]:
%%px
# import required packages into the parallel engines
import numpy as np
import pandas as pd
import re

from utils import loadmat_url
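
The utils module is not shown in this notebook; loadmat_url is assumed to fetch a .mat file over HTTP and parse it with scipy, roughly like this sketch (hypothetical implementation):

import io
import requests
from scipy.io import loadmat

def loadmat_url(url):
    # fetch the .mat file and parse it from an in-memory buffer;
    # squeeze_me collapses 1x1 and 1xN MATLAB arrays to scalars/vectors
    r = requests.get(url)
    r.raise_for_status()
    return loadmat(io.BytesIO(r.content), squeeze_me=True)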

In [3]:
@lbv.parallel()
def counts(args):
    # compute cell counts per bin
    # each arg is the date and the bin PID
    date, pid = args
    cs_url = pid + '_class_scores.mat'
    try:
        mat = loadmat_url(cs_url)
    except Exception:
        # the load failed; return None, which the caller must be
        # prepared to handle
        return None
    # get the list of class names (the final entry, a catch-all class, is dropped)
    classes = mat['class2useTB'][:-1]
    # create a DataFrame from the score matrix indexed by roi number
    scores = pd.DataFrame(mat['TBscores'], index=mat['roinum'], columns=classes)
    # use idxmax and value_counts to count how many ROIs score highest in each class
    counts = scores.idxmax(axis=1).value_counts()
    # value_counts omits classes with zero counts, so add them back as explicit zeroes
    zeroes = pd.Series(np.zeros(len(classes)), index=classes)
    row = (zeroes + counts).fillna(0)
    # note that with newer versions of pandas, the preceding operations can be
    # done in one step using Categorical:
    # row = pd.Categorical(scores.idxmax(axis=1), categories=classes).value_counts()
    # row.index = row.index.astype(str)
    row = row.to_frame().T
    # add the bin LID (the PID with its URL prefix stripped); it is used
    # later to merge in the sample volume
    row.insert(0, 'lid', re.sub(r'.*/', '', pid))
    # index the row DataFrame by date
    row.index = [date]
    return row
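
To see what the idxmax/value_counts step does, here is a toy example with made-up scores:

import numpy as np
import pandas as pd

toy_classes = ['diatom', 'dino', 'ciliate']
toy_scores = pd.DataFrame(np.random.rand(5, 3), columns=toy_classes)
# idxmax(axis=1) picks each ROI's highest-scoring class; value_counts
# tallies them, omitting classes that never win (which is why the
# zeroes Series is added back in counts() above)
print(toy_scores.idxmax(axis=1).value_counts())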

In [4]:
import requests

# use the dashboard metrics endpoint to get the date and bin PID of every
# bin in a selected time range (the temperature values themselves are not used)
START='2016-05-01'
END='2016-05-10'

feed_url = 'http://ifcb-data.whoi.edu/mvco/api/feed/temperature/start/%s/end/%s' % (START, END)
feed = requests.get(feed_url).json()
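
Each element of feed is a JSON object; only its 'date' and 'pid' keys are used below (other fields vary and are not shown here):

print(feed[0])  # e.g. a dict with at least 'date' and 'pid' keys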

In [5]:
from utils import progress_map

# construct the arguments for counts()
args = [(e['date'], e['pid']) for e in feed]

# initialize variable to hold result DataFrame
ts = None
for row in progress_map(counts, args):
    # create DataFrame or append new row to it
    if row is not None:
        ts = row if ts is None else ts.append(row)
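
Note that DataFrame.append was removed in pandas 2.0; with newer pandas the equivalent (and faster) pattern is to collect the rows and concatenate once:

import pandas as pd

rows = [row for row in progress_map(counts, args) if row is not None]
ts = pd.concat(rows) if rows else None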

In [6]:
import pandas as pd
from utils import loadmat_url

# now we need the sample volume for each bin. this is in a sidecar file
# which has been staged to Google Drive.

# given a file ID, generate a direct Google Drive link
def gdrive_link(file_id):
    return 'https://drive.google.com/uc?export=download&id=%s' % file_id

# ml_analyzed for 2016 is in Google Drive
file_id = '0BzoJnj-e6BWpQmRCWk1tdS1tUTQ'

# load it
mat = loadmat_url(gdrive_link(file_id))
# make it into a DataFrame
vol = pd.DataFrame({
    'lid': mat['filelist_all'],
    'volume': mat['ml_analyzed']
})
vol.head()


Out[6]:
                     lid    volume
0  IFCB1_2016_064_145530       NaN
1  IFCB1_2016_064_145922  2.371047
2  IFCB1_2016_064_151621  3.332269
3  IFCB1_2016_064_153617  0.027548
4  IFCB1_2016_064_153934  3.386921
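
Note the NaN volume in the first row: bins without a measured volume will produce NaN abundances after the division below. If that is unwanted, such bins can be dropped first (optional; not part of the original pipeline):

vol = vol.dropna(subset=['volume'])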

In [7]:
# now merge the class counts with the volume on the bin LID to associate
# the appropriate volume with each bin. merging on a column would discard
# ts's date index, so reset it first and restore it afterwards
merged = ts.reset_index().merge(vol, on='lid').set_index('index')
merged.pop('lid') # don't need the LID anymore
vols = merged.pop('volume') # but we do need the volume
# finally, divide the class counts by the volumes to get abundance (cells per mL)
abundance = merged.divide(vols, axis=0)
# convert the index to a DatetimeIndex (works across pandas versions)
abundance.index = pd.to_datetime(abundance.index)
abundance.head()


Out[7]:
Asterionellopsis Cerataulina Ceratium Chaetoceros Corethron Coscinodiscus Cylindrotheca DactFragCerataul Dactyliosolen Dictyocha ... bad ciliate_mix clusterflagellate detritus dino30 kiteflagellates mix mix_elongated pennate tintinnid
2016-05-01 00:19:07 1.880973 0.000000 0 5.374209 1.343552 0.000000 0.806131 0.000000 0.268710 0.268710 ... 0.000000 0.537421 0.537421 63.415667 11.017129 0.000000 436.385778 79.269584 0.000000 0.268710
2016-05-01 00:42:19 0.282197 0.000000 0 9.594708 1.128789 0.282197 0.564395 0.282197 0.000000 0.282197 ... 0.000000 1.693184 0.000000 68.009549 10.723497 0.000000 531.941911 92.560714 0.282197 0.564395
2016-05-01 01:05:33 1.987228 0.000000 0 6.245574 0.851669 0.000000 0.567779 0.000000 0.000000 0.000000 ... 0.283890 5.961684 0.283890 59.332952 11.071699 0.000000 562.385541 87.154145 0.000000 0.851669
2016-05-01 01:28:44 1.122173 0.000000 0 8.135753 0.561086 0.280543 0.280543 0.280543 0.561086 0.280543 ... 0.280543 5.049778 0.280543 48.814518 11.221728 0.561086 561.086412 77.149382 0.000000 0.280543
2016-05-01 01:51:57 2.272412 0.284052 0 12.782320 0.000000 0.000000 0.284052 0.000000 0.000000 0.000000 ... 0.284052 7.385340 0.284052 52.265485 13.066371 0.284052 592.247480 94.021063 0.000000 0.284052

5 rows × 50 columns
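
divide(vols, axis=0) divides each row of the counts table by the matching entry of the volume Series. A toy illustration:

counts_toy = pd.DataFrame({'a': [10, 20], 'b': [30, 40]})
vols_toy = pd.Series([2.0, 4.0])
print(counts_toy.divide(vols_toy, axis=0))
#      a     b
# 0  5.0  15.0
# 1  5.0  10.0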


In [8]:
%matplotlib inline

# smooth by averaging into 12-hour bins (newer pandas requires an explicit
# aggregation after resample)
smoothed = abundance.resample('12h').mean()
smoothed.plot(y='dino30')


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f63ac70db90>
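
resample('12h').mean() averages within fixed 12-hour bins; if you want smoothing that keeps the original sample times, a time-based rolling mean (newer pandas) is an alternative:

abundance['dino30'].rolling('12h').mean().plot()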