In [1]:
from ipyparallel import Client  # IPython.parallel now lives in the ipyparallel package
# create a load-balanced view of the ipcluster;
# ipcluster must be running on the notebook server
pc = Client()
lbv = pc.load_balanced_view()
pc.clear()  # make sure engine namespaces are clean
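As a quick sanity check (not in the original notebook), it can help to confirm that engines are actually registered before dispatching any work to them:
In [ ]:
# optional sanity check: confirm engines are up before dispatching work
print('%d engines available' % len(pc.ids))
assert len(pc.ids) > 0, 'start ipcluster before running the rest of this notebook'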
In [2]:
%%px
# import the required packages on each parallel engine
# (utils must be importable on the engines as well)
import numpy as np
import pandas as pd
import re
from utils import loadmat_url
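loadmat_url comes from the notebook's local utils module, which isn't shown here. A minimal sketch of what such a helper might look like, assuming scipy and requests are available (the real implementation may differ):
In [ ]:
# illustrative sketch only; the real helper lives in utils.py
import requests
from io import BytesIO
from scipy.io import loadmat

def loadmat_url(url):
    # fetch a .mat file over HTTP and parse it in memory;
    # squeeze_me=True (an assumption) collapses 1x1 and 1-D arrays
    resp = requests.get(url)
    resp.raise_for_status()
    return loadmat(BytesIO(resp.content), squeeze_me=True)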
In [3]:
@lbv.parallel()
def counts(args):
    # compute cell counts per bin;
    # each arg is a (date, bin PID) tuple
    date, pid = args
    cs_url = pid + '_class_scores.mat'
    try:
        mat = loadmat_url(cs_url)
    except Exception:
        # the load failed; return None, which the caller
        # must be prepared to handle
        return None
    # get the list of class names
    classes = mat['class2useTB'][:-1]
    # create a DataFrame from the score matrix, indexed by ROI number
    scores = pd.DataFrame(mat['TBscores'], index=mat['roinum'], columns=classes)
    # for each ROI, find its highest-scoring class, then count ROIs per class
    counts = scores.idxmax(axis=1).value_counts()
    # include classes that have zero counts
    zeroes = pd.Series(np.zeros(len(classes)), index=classes)
    row = (zeroes + counts).fillna(0)
    # note: with newer versions of pandas the preceding two steps can be done using Categorical:
    # row = pd.Categorical(scores.idxmax(axis=1), categories=classes).value_counts()
    # row.index = row.index.astype(str)
    row = row.to_frame().T
    # add the bin LID, which will be used later to merge in sample volumes
    row.insert(0, 'lid', re.sub(r'.*/', '', pid))
    # index the row DataFrame by date
    row.index = [date]
    return row
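The idxmax/value_counts idiom above is the heart of counts(); a toy example with made-up scores and class names shows what it produces:
In [ ]:
# toy demonstration of the counting idiom used above (made-up data)
import pandas as pd
demo = pd.DataFrame(
    [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]],
    index=[1, 2, 3],               # ROI numbers
    columns=['classA', 'classB'])  # class names
# idxmax picks each ROI's best class; value_counts tallies ROIs per class
demo.idxmax(axis=1).value_counts()
# -> classA: 2, classB: 1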
In [4]:
import requests
# use the dashboard metrics endpoint to get a list of dates and bin PIDs over a
# selected time range (any metric works; we only need the date/PID of each record)
START = '2016-05-01'
END = '2016-05-10'
feed_url = 'http://ifcb-data.whoi.edu/mvco/api/feed/temperature/start/%s/end/%s' % (START, END)
feed = requests.get(feed_url).json()
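Each feed record is expected to carry at least 'date' and 'pid' fields (plus the metric value, which is unused here); peeking at one record confirms the shape:
In [ ]:
# inspect one feed record; counts() only uses the 'date' and 'pid' fields
feed[0]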
In [ ]:
import pandas as pd
from utils import progress_map
# construct the (date, pid) argument tuples for counts()
args = [(e['date'], e['pid']) for e in feed]
# collect the result rows, skipping bins whose class scores failed to load
rows = [row for row in progress_map(counts, args) if row is not None]
# concatenate them into a single time-series DataFrame
ts = pd.concat(rows)
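progress_map is another helper from the local utils module. A plausible sketch, assuming it wraps the parallel function's map() and yields results as they complete with a simple progress counter (the real helper may differ):
In [ ]:
# illustrative sketch only; the real helper lives in utils.py
import sys

def progress_map(parallel_fn, args):
    # submit all the work at once via the load-balanced view ...
    amr = parallel_fn.map(args)
    # ... then yield results as they complete, printing a progress counter
    for i, result in enumerate(amr):
        sys.stdout.write('\r%d/%d' % (i + 1, len(args)))
        sys.stdout.flush()
        yield result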
In [6]:
import pandas as pd
from utils import loadmat_url
# now we need the sample volume for each bin. this is in a sidecar file
# which has been staged to Google Drive.
# given a file ID, generate a direct Google Drive link
def gdrive_link(file_id):
    return 'https://drive.google.com/uc?export=download&id=%s' % file_id
# ml_analyzed for 2016 is in Google Drive
file_id = '0BzoJnj-e6BWpQmRCWk1tdS1tUTQ'
# load it
mat = loadmat_url(gdrive_link(file_id))
# make it into a DataFrame
vol = pd.DataFrame({
    'lid': mat['filelist_all'],
    'volume': mat['ml_analyzed'],
})
vol.head()
Out[6]:
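Before merging, an optional check (not in the original notebook) of how many count rows have a matching volume record; unmatched LIDs will be dropped by the join below:
In [ ]:
# optional sanity check: fraction of count rows with a matching volume record
ts['lid'].isin(vol['lid']).mean()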
In [7]:
# now join the per-bin sample volumes to the class counts on the bin LID,
# keeping the date index of the counts DataFrame
merged = ts.join(vol.set_index('lid'), on='lid', how='inner')
merged.pop('lid') # don't need the LID anymore
vols = merged.pop('volume') # but we do need the volume
# finally, divide the class counts by the volumes to get abundance
abundance = merged.divide(vols,axis=0)
# type-coerce the index to be a datetime index
abundance.index = pd.to_datetime(abundance.index)
abundance.head()
Out[7]:
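The divide(..., axis=0) call broadcasts the volume Series across the rows; a toy illustration with made-up numbers:
In [ ]:
# toy illustration of divide(..., axis=0): each row of counts is divided
# elementwise by the matching sample volume (made-up numbers)
demo_counts = pd.DataFrame({'classA': [10., 20.], 'classB': [30., 40.]})
demo_vols = pd.Series([5., 10.])
demo_counts.divide(demo_vols, axis=0)
# -> row 0 divided by 5, row 1 divided by 10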
In [8]:
%matplotlib inline
# downsample to 12-hour bins, averaging within each bin, to smooth the series
smoothed = abundance.resample('12h').mean()
# plot the smoothed abundance of one class ('dino30')
smoothed.plot(y='dino30')
Out[8]: