In [1]:
# Environment at time of execution
%load_ext watermark
%pylab inline
%watermark -a "Anthony Abercrombie" -d -t -v -p numpy,pandas,matplotlib -g
In [2]:
from __future__ import print_function
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dotenv
In [3]:
import os
import sys
import dotenv
import subprocess
import glob
from tqdm import tqdm
# Relative path from this notebook to the project root
PROJ_ROOT = os.path.join(os.path.pardir, os.path.pardir)
# Make local Python modules under src/ importable
sys.path.append(os.path.join(PROJ_ROOT, "src"))
# Load AWS credentials and cluster settings from .env as environment variables
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
SPARK_HOME = os.environ.get("SPARK_HOME")
ec2_keypair = os.environ.get("ec2_keypair")
ec2_keypair_pem = os.environ.get("ec2_keypair_pem")
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
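For reference, dotenv.load_dotenv reads simple KEY=value pairs from the .env file at the project root. A minimal sketch with placeholder values, mirroring the lookups above (never commit real credentials):

AWS_ACCESS_KEY=AKIA...
AWS_SECRET_ACCESS_KEY=...
SPARK_HOME=/path/to/spark
ec2_keypair=my-keypair
ec2_keypair_pem=/path/to/my-keypair.pem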
In [4]:
SPARK_HOME
Out[4]:
In [5]:
def spinup_spark_ec2(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name):
    # Build the spark-ec2 command that launches a cluster
    bash_command = '{}/ec2/spark-ec2 -k {} -i {} -s {} launch {}'.format(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name)
    return bash_command
In [6]:
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 1, 'spark_ec2_cluster')
x = spinup_spark_ec2(*args)
In [7]:
x
Out[7]:
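The launch command is only built as a string above; a minimal sketch of actually running it with the subprocess module imported earlier (assumes SPARK_HOME points at a Spark distribution that still ships the ec2/spark-ec2 script, i.e. Spark < 2.0):

In [ ]:
# Run the generated launch command; check_call raises if spark-ec2 exits nonzero
subprocess.check_call(x, shell=True)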
In [ ]:
# Usage template for spark-ec2:
# <SPARK_HOME>/ec2/spark-ec2 -k <keypair> -i <key-file> -s <num-slaves> launch <cluster-name>
# Scratch cell: a TensorFlow graph run locally, then mapped over a Spark
# DataFrame with TensorFrames (assumes a live sqlContext on the cluster)
import tensorflow as tf
import tensorframes as tfs
df = sqlContext.createDataFrame([(3, 5)], ['x', 'y'])  # toy rows matching the placeholder names
x = tf.placeholder(tf.int32, name='x')
y = tf.placeholder(tf.int32, name='y')
output = tf.add(x, 3 * y, name='z')
session = tf.Session()  # tf.Session, not tf.session
output_value = session.run(output, {x: 3, y: 5})
output_df = tfs.map_rows(output, df)
output_df.collect()
In [12]:
def connect_master_node(SPARK_HOME, keypair, keyfile, region, cluster_name):
    # Build the spark-ec2 command that logs into the cluster's master node
    bash_cmd = '{}/ec2/spark-ec2 -k {} -i {} --region={} login {}'.format(SPARK_HOME, keypair, keyfile, region, cluster_name)
    return bash_cmd
# --region expects an EC2 region ('us-west-2'), not an availability zone ('us-west-2b')
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2', 'spark_ec2_cluster')
y = connect_master_node(*args)
In [13]:
y
Out[13]:
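The login command follows the same pattern as launch. Since login opens an interactive SSH session it is usually easier to paste the string into a terminal, but running it in-notebook would look like:

In [ ]:
# Opens an SSH session to the cluster's master node via spark-ec2 login
subprocess.check_call(y, shell=True)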
In [ ]: