Import Dependencies


In [1]:
# Environment at time of execution
%load_ext watermark
%pylab inline
%watermark -a "Anthony Abercrombie" -d -t -v -p numpy,pandas,matplotlib -g


Populating the interactive namespace from numpy and matplotlib
Anthony Abercrombie 2017-01-28 22:09:39 

CPython 3.5.2
IPython 5.1.0

numpy 1.11.2
pandas 0.19.2+0.g825876c.dirty
matplotlib 1.5.1
Git hash: 5ab329262c0bf29337f14c3e151cd0f2fdb1c34e

In [2]:
from __future__ import print_function

import os

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import dotenv

In [3]:
import os
import sys
import dotenv
import subprocess
import glob
from tqdm import tqdm

# File path to the project root
PROJ_ROOT = os.path.join(os.path.pardir, os.pardir)
# add local python functions
sys.path.append(os.path.join(PROJ_ROOT, "src"))

# Load AWS keys as environment variables
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
SPARK_HOME = os.environ.get("SPARK_HOME")
ec2_keypair = os.environ.get("ec2_keypair")
ec2_keypair_pem = os.environ.get("ec2_keypair_pem")


# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1

Where is spark-ec2?


In [4]:
SPARK_HOME


Out[4]:
'~/spark-1.6.3-bin-hadoop2.6'
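
SPARK_HOME points at a Spark 1.6.3 binary distribution, where the spark-ec2 script ships in the ec2/ subdirectory. A quick check that the script is where the commands below expect it (a sketch; the ec2/ location is an assumption about this particular build):

In [ ]:
# Sketch: confirm the spark-ec2 script exists under SPARK_HOME.
# Assumes the Spark 1.x layout, with the script in SPARK_HOME/ec2/.
spark_ec2_path = os.path.join(os.path.expanduser(SPARK_HOME), 'ec2', 'spark-ec2')
os.path.exists(spark_ec2_path)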

In [5]:
def spinup_spark_ec2(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name):
    """Build the spark-ec2 command that launches a cluster with the given keypair and size."""
    bash_command = '{}/ec2/spark-ec2 -k {} -i {} -s {} launch {}'.format(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name)
    return bash_command

In [6]:
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 1, 'spark_ec2_cluster')
x = spinup_spark_ec2(*args)

In [7]:
x


Out[7]:
'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem -s 1 launch spark_ec2_cluster'
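
The cell above only builds the launch command; nothing executes it. One way to actually run it from the notebook, as a sketch that assumes the keypair, .pem file, and AWS credentials loaded earlier are valid, is to hand the string to subprocess (already imported above):

In [ ]:
# Sketch: run the constructed spark-ec2 launch command and stream its output.
# Assumes the keypair/.pem referenced in the command exist and AWS credentials are set.
proc = subprocess.Popen(x, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in proc.stdout:
    print(line.decode().rstrip())
proc.wait()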

In [ ]:
# spark-ec2 usage template that spinup_spark_ec2 fills in
'{SPARK_HOME}/ec2/spark-ec2 -k <keypair> -i <key-file> -s <num-slaves> launch <cluster-name>'

Numerical DataFlow with Spark and TensorFlow

Check out TensorFrames from Databricks.

import tensorflow as tf
import tensorframes as tfs

# A Spark DataFrame whose column names match the placeholders below
df = sqlContext.createDataFrame([(i, i + 1) for i in range(10)], ['x', 'y'])

# Build a small TensorFlow graph: z = x + 3*y
x = tf.placeholder(tf.int32, name='x')
y = tf.placeholder(tf.int32, name='y')
output = tf.add(x, 3 * y, name='z')

# Run the graph once locally with plain TensorFlow
session = tf.Session()
output_value = session.run(output, {x: 3, y: 5})

# Apply the same graph to every row of the DataFrame with TensorFrames
output_df = tfs.map_rows(output, df)
output_df.collect()
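
TensorFrames is not bundled with Spark, so the example above assumes the package is already available on the cluster. A minimal sketch of starting PySpark with TensorFrames from spark-packages (the version coordinates are an assumption; use the release that matches your Spark and Scala build):

In [ ]:
# Sketch: build a pyspark command that pulls in TensorFrames via --packages.
# The package version below is an assumption, not a verified release.
tensorframes_pkg = 'databricks:tensorframes:0.2.3-s_2.10'  # assumed coordinates
pyspark_cmd = '{}/bin/pyspark --packages {}'.format(SPARK_HOME, tensorframes_pkg)
pyspark_cmd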

Connect to master node


In [12]:
def connect_master_node(SPARK_HOME, keypair, keyfile, region, cluster_name):
    """Build the spark-ec2 command that logs into the cluster's master node."""
    bash_cmd = '{}/ec2/spark-ec2 -k {} -i {} --region={} login {}'.format(SPARK_HOME, keypair, keyfile, region, cluster_name)
    return bash_cmd
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2b', 'spark_ec2_cluster')
y = connect_master_node(*args)

In [13]:
y


Out[13]:
'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem --region=us-west-2b login spark_ec2_cluster'

AttributeError: 'NoneType' object has no attribute 'get_all_zones'

The login fails because --region was given an availability zone ('us-west-2b') rather than a region name. boto cannot resolve 'us-west-2b', so spark-ec2 ends up calling get_all_zones() on a None connection. Passing the region 'us-west-2' (optionally with --zone=us-west-2b) should avoid the error.
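
A sketch of the corrected call, assuming the cluster was launched in us-west-2:

In [ ]:
# Sketch: pass the region (not the availability zone) to spark-ec2's --region flag.
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2', 'spark_ec2_cluster')
y = connect_master_node(*args)
y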


In [ ]: