In [ ]:
%pylab inline
# HDFS client class; an instance is created in the next cell.
from snakebite.client import Client
# NOTE(review): this binds the `pyaml` package to the name `yaml`, which a
# reader may mistake for PyYAML — confirm the alias is intended.
import pyaml as yaml
import sys
# Import, then immediately reload(): reload() re-executes the module's code in
# the running kernel.
import pythonrun
reload(pythonrun)
import os
import seaborn as sns
from sklearn import metrics
# Same import-then-reload pattern. The star import is placed AFTER the reload
# on purpose: names copied by `from ... import *` are not rebound by reload()
# itself, so the from-statement has to be re-executed to see the reloaded
# definitions.
import scala_python_endive_wrapper
reload(scala_python_endive_wrapper)
from scala_python_endive_wrapper import *
import pandas as pd
# NOTE(review): duplicate of the `import os` a few lines up; harmless.
import os

In [ ]:
# Passed as the `logpath` argument to the pipeline calls in later cells.
logpath = "/home/eecs/akmorrow/ADAM/endive"

# snakebite client for the HDFS namenode at the given host and port.
hdfsclient = Client("amp-bdg-master.amplab.net", 8020, use_trash=False)

# Keep references to whatever sys.stdout / sys.stderr are bound to right now.
notebook_stdout, notebook_stderr = sys.stdout, sys.stderr

# Separate writable handles on the process-level stdout/stderr devices.
# They are left open deliberately; nothing in this cell closes them.
terminal_stdout = open('/dev/stdout', 'w')
terminal_stderr = open('/dev/stderr', 'w')

In [ ]:
# Input path handed to the featurization pipeline as its first argument.
EGR1_PATH = "/data/anv/DREAMDATA/aggregated/full_dnase/EGR1"

# HDFS URI handed to cross_validate as its first argument.
featurized = "hdfs://amp-spark-master.amp:8020/user/akmorrow/featurized/wavelets/EGR1_dnase_all_dim_256"

# Executor sizing forwarded to the featurization pipeline call:
# memory per executor, cores per executor, number of executors.
executor_mem, cores_per_executor, num_executors = "100g", 8, 50

In [ ]:
# Run the featurization pipeline once and keep its return value in `results`.
featureOutput = "/data/anv/featurized/EGR1_dnase"
results = []
res = run_kitchensink_featurize_pipeline(
    EGR1_PATH,
    "fakePath",  # literal placeholder string for the second positional argument
    logpath,
    featureOutput,
    num_executors=num_executors,
    cores_per_executor=cores_per_executor,
    executor_mem=executor_mem,
    # total core count across all executors
    num_partitions=cores_per_executor * num_executors,
    sample=0.01,
    seed=0
)
results.append(res)

In [ ]:
# Chromosomes handed to cross_validate; currently only chr15.
# A wider list was previously sketched here as:
#   chromosomes = map(lambda x: "chr{0}".format(x+1), range(23))
#   chromosomes.append("chrX")
# NOTE(review): that snippet generates chr1..chr23 before appending chrX —
# confirm whether chr23 is intended before re-enabling it.
chromosomes = ["chr15"]

# Cell-type names for EGR1, presumably converted to the wrapper's enum values
# by string_to_enum_celltypes — verify against the wrapper module.
# CHANGE ME BASED ON ENUM + TF
egr1_cell_types = string_to_enum_celltypes([
    'GM12878',
    'H1hESC',
    'HCT116',
    'MCF7',
])

In [ ]:
# Re-execute the wrapper module so this cell runs against its current source.
reload(scala_python_endive_wrapper)

# reload() does NOT rebind names that were copied into this namespace earlier
# by `from scala_python_endive_wrapper import *`, so a bare `cross_validate`
# would still be the pre-reload function and the reload above would have no
# effect on this call. Calling through the module object uses the reloaded
# definition.
# NOTE(review): objects built before the reload (e.g. egr1_cell_types) are not
# refreshed either; re-run the cell that creates them if the reload changed
# the definitions they depend on.
# NOTE(review): num_executors=32 here differs from the notebook-level
# num_executors variable (50) — left as written; confirm it is deliberate.
results_df = scala_python_endive_wrapper.cross_validate(
    featurized,
    hdfsclient,
    chromosomes,
    egr1_cell_types,
    logpath,
    numHoldOutChr=1,
    numHoldOutCell=1,
    num_folds=1,
    negativeSamplingFreqs=[0.001],
    mixtureWeights=[-1.0],
    cores_per_executor=8,
    num_executors=32,
    regs=[1e-2])

In [ ]: