In [11]:
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime


Installed autotime.py. To use it, type:
  %load_ext autotime
/home/eecs/vaishaal/anaconda2/lib/python2.7/site-packages/IPython/core/magics/extension.py:47: UserWarning: %install_ext` is deprecated, please distribute your extension(s)as a python packages.
  "as a python packages.", UserWarning)

In [86]:
%pylab inline
from snakebite.client import Client
import pyaml as yaml
import sys
import pythonrun
reload(pythonrun)
import os
import seaborn as sns
from sklearn import metrics
import scala_python_endive_wrapper
reload(scala_python_endive_wrapper)
from scala_python_endive_wrapper import *


Populating the interactive namespace from numpy and matplotlib
time: 13.5 ms
WARNING: pylab import has clobbered these variables: ['power', 'random', 'fft', 'linalg', 'info']
`%matplotlib` prevents importing * from pylab and numpy

In [2]:
hdfsclient = Client("amp-spark-master.amp", 8020, use_trash=False)
notebook_stdout = sys.stdout
notebook_stderr = sys.stderr
terminal_stdout = open('/dev/stdout', 'w')
terminal_stderr = open('/dev/stderr', 'w')
logpath = "/tmp/log"

In [24]:
executor_mem = "100g"
cores_per_executor = 32
num_executors = 14

In [51]:
# CHANGE ME
dataset_creation_config = \
{
    "labels": "/user/vaishaal/endive-data/ChIPseq/labels/EGR1.train.labels.tsv", \
    "reference": "/home/eecs/akmorrow/ADAM/endive/workfiles/hg19.2bit", \
    "dnase": "/user/vaishaal/endive-data/DNASE/peaks/relaxed/", \
    "aggregatedSequenceOutput": "/user/vaishaal/endive-data/aggregated/EGR1/"
}

EGR1_PATH = "/user/vaishaal/endive-data/aggregated/EGR1/EGR1"
ATF2_PATH = "/user/vaishaal/endive-data/aggregated/ATF2/ATF2"


time: 6.71 ms

In [21]:
# Run scala job send output to terminal to create windows
sys.stdout = terminal_stdout
sys.stderr = terminal_stderr
pythonrun.run(dataset_creation_config, 
              logpath,
              dataset_creation_pipeline_class, 
              pipeline_jar, 
              executor_mem,
              cores_per_executor, 
              num_executors,
              use_yarn=True)

In [28]:
sys.stdout = notebook_stdout
sys.stderr = notebook_stderr
os.system("mkdir -p {0}".format(kernel_pipeline_config["predictionsOutput"]))


Out[28]:
0
time: 12.2 ms

In [ ]:
# this is where the magic happens
((y_train, y_train_pred), (y_test, y_test_pred)) = run_kernel_pipeline(EGR1_PATH,
                                                                       "/tmp/filters.csv", 
                                                                       logpath, 
                                                                       filter_gen_gen=make_gaussian_filter_gen,
                                                                       reg=1e-8,
                                                                       negativeSamplingFreq=0.1)

ROC Metrics


In [93]:
sys.stdout = notebook_stdout
sys.stderr = notebook_stderr
fpr, tpr, thresh = metrics.roc_curve(y_train, y_train_pred)
train_auc = metrics.roc_auc_score(y_train, y_train_pred)
plot(fpr, tpr, label="train")

fpr, tpr, thresh = metrics.roc_curve(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_pred)

plot(fpr, tpr, label="test")

plt.legend(loc=4)
plt.figure()
print("Train AUC {0}".format(train_auc))
print("Test AUC {0}".format(test_auc))


Train AUC 0.905775637466
Test AUC 0.890266688574
<matplotlib.figure.Figure at 0x7f58e70c67d0>
time: 2.73 s

PR Metrics


In [68]:
sys.stdout = notebook_stdout
sys.stderr = notebook_stderr
fpr, tpr, thresh = metrics.precision_recall_curve(y_train, y_train_pred)
train_auc = metrics.average_precision_score(y_train, y_train_pred)
plot(fpr, tpr, label="train")

fpr, tpr, thresh = metrics.precision_recall_curve(y_test, y_test_pred)
test_auc = metrics.average_precision_score(y_test, y_test_pred)

plot(fpr, tpr, label="test")

plt.legend(loc=4)
plt.figure()
print("Train PR AUC {0}".format(train_auc))
print("Test PR AUC {0}".format(test_auc))


Train PR AUC 0.757754546452
Test PR AUC 0.121105773929
<matplotlib.figure.Figure at 0x7f58d264d5d0>
time: 2.96 s

In [92]:
print("HI")


HI
time: 1.83 ms

In [ ]: