In [ ]:
%pylab inline
from snakebite.client import Client
import pyaml as yaml
import sys
import pythonrun
reload(pythonrun)
import os
import seaborn as sns
from sklearn import metrics
import scala_python_endive_wrapper
reload(scala_python_endive_wrapper)
from scala_python_endive_wrapper import *
import pandas as pd
import os
In [ ]:
# Snakebite HDFS client; it is handed to cross_validate() in a later cell.
hdfsclient = Client(
    "amp-bdg-master.amplab.net",
    8020,
    use_trash=False,
)

# Keep references to the notebook's current output streams
# (presumably so output can be switched back after a redirect -- confirm).
notebook_stdout, notebook_stderr = sys.stdout, sys.stderr

# Independent write handles onto the process-level stdout/stderr devices.
terminal_stdout = open('/dev/stdout', 'w')
terminal_stderr = open('/dev/stderr', 'w')

# Passed as the log-path argument to both pipeline calls in later cells.
logpath = "/home/eecs/akmorrow/ADAM/endive"
In [ ]:
# Resource settings forwarded to run_kitchensink_featurize_pipeline in the next cell.
executor_mem = "100g"
cores_per_executor = 8
# cores_per_executor * num_executors is also used there as num_partitions.
num_executors = 50
# First positional argument of run_kitchensink_featurize_pipeline.
EGR1_PATH = "/data/anv/DREAMDATA/aggregated/full_dnase/EGR1"
# First positional argument of cross_validate.
# NOTE(review): this URI names amp-spark-master.amp, while hdfsclient is
# connected to amp-bdg-master.amplab.net -- confirm both point at the same HDFS.
featurized = "hdfs://amp-spark-master.amp:8020/user/akmorrow/featurized/wavelets/EGR1_dnase_all_dim_256"
In [ ]:
# Run the featurization pipeline on the EGR1 input and collect whatever it
# returns in `results`.
results = []
featureOutput = "/data/anv/featurized/EGR1_dnase"
res = run_kitchensink_featurize_pipeline(
    EGR1_PATH,
    "fakePath",
    logpath,
    featureOutput,
    seed=0,
    sample=0.01,
    # Cluster sizing comes from the constants defined in the settings cell.
    executor_mem=executor_mem,
    cores_per_executor=cores_per_executor,
    num_executors=num_executors,
    # One partition per executor core.
    num_partitions=num_executors * cores_per_executor,
)
results.append(res)
In [ ]:
# Disabled alternative that builds "chr1".."chr23" plus "chrX":
# chromosomes = map(lambda x: "chr{0}".format(x+1), range(23))
# chromosomes.append("chrX")
# NOTE(review): range(23) also emits "chr23"; if these are human chromosomes,
# range(22) was presumably intended -- confirm before re-enabling.
# Active setting: a single chromosome.
chromosomes=["chr15"]
# CHANGE ME BASED ON ENUM + TF
# Cell-type names are converted with string_to_enum_celltypes and the result
# is passed to cross_validate in the next cell.
egr1_cell_types = string_to_enum_celltypes(['GM12878', 'H1hESC', 'HCT116', 'MCF7'])
In [ ]:
# Pick up any edits made to the wrapper module since it was first imported.
reload(scala_python_endive_wrapper)

# reload() does not rebind names that were copied into this namespace by
# `from scala_python_endive_wrapper import *`, so a bare `cross_validate`
# would still be the pre-reload function. Call it through the module object
# so the freshly reloaded implementation is the one that runs.
# NOTE(review): egr1_cell_types was built before this reload; re-run that cell
# if the wrapper's cell-type conversion changed.
results_df = scala_python_endive_wrapper.cross_validate(
    featurized,
    hdfsclient,
    chromosomes,
    egr1_cell_types,
    logpath,
    # Hold out one chromosome and one cell type, single fold.
    numHoldOutChr=1,
    numHoldOutCell=1,
    num_folds=1,
    # Single-value grids: one setting per hyperparameter.
    negativeSamplingFreqs=[0.001],
    mixtureWeights=[-1.0],
    regs=[1e-2],
    # Cluster sizing for this call is given as literals here; num_executors
    # differs from the 50 used for the featurization run.
    cores_per_executor=8,
    num_executors=32,
)
In [ ]: