Import Dependencies


In [1]:
# Environment at time of execution
%load_ext watermark
%pylab inline
%watermark -a "Anthony Abercrombie" -d -t -v -p numpy,pandas,matplotlib -g


Populating the interactive namespace from numpy and matplotlib
Anthony Abercrombie 2017-01-28 22:09:39 

CPython 3.5.2
IPython 5.1.0

numpy 1.11.2
pandas 0.19.2+0.g825876c.dirty
matplotlib 1.5.1
Git hash: 5ab329262c0bf29337f14c3e151cd0f2fdb1c34e

In [2]:
from __future__ import print_function

import os

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import dotenv

In [3]:
import os
import sys
import dotenv
import subprocess
import glob
from tqdm import tqdm

# File path to the project root
PROJ_ROOT = os.path.join(os.path.pardir, os.pardir)
# add local python functions
sys.path.append(os.path.join(PROJ_ROOT, "src"))

# Load AWS keys as environment variables
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
SPARK_HOME = os.environ.get("SPARK_HOME")
ec2_keypair = os.environ.get("ec2_keypair")
ec2_keypair_pem = os.environ.get("ec2_keypair_pem")


# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1

Where is spark-ec2?


In [4]:
SPARK_HOME


Out[4]:
'~/spark-1.6.3-bin-hadoop2.6'
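
SPARK_HOME points at a Spark 1.6.3 binary distribution, where the spark-ec2 script ships in the ec2/ subdirectory. A quick check that the script is where the commands below expect it (a sketch; the ec2/ location is an assumption about this particular build):

In [ ]:
# Sketch: confirm the spark-ec2 script exists under SPARK_HOME.
# Assumes the Spark 1.x layout, with the script in SPARK_HOME/ec2/.
spark_ec2_path = os.path.join(os.path.expanduser(SPARK_HOME), 'ec2', 'spark-ec2')
os.path.exists(spark_ec2_path)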

In [5]:
def spinup_spark_ec2(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name):
    """Build the spark-ec2 command that launches a cluster with the given keypair and size."""
    bash_command = '{}/ec2/spark-ec2 -k {} -i {} -s {} launch {}'.format(SPARK_HOME, keypair, keyfile, num_slaves, cluster_name)
    return bash_command

In [6]:
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 1, 'spark_ec2_cluster')
x = spinup_spark_ec2(*args)

In [7]:
x


Out[7]:
'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem -s 1 launch spark_ec2_cluster'
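
The cell above only builds the launch command; nothing executes it. One way to actually run it from the notebook, as a sketch that assumes the keypair, .pem file, and AWS credentials loaded earlier are valid, is to hand the string to subprocess (already imported above):

In [ ]:
# Sketch: run the constructed spark-ec2 launch command and stream its output.
# Assumes the keypair/.pem referenced in the command exist and AWS credentials are set.
proc = subprocess.Popen(x, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in proc.stdout:
    print(line.decode().rstrip())
proc.wait()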

In [ ]:
# spark-ec2 usage template that spinup_spark_ec2 fills in
'{SPARK_HOME}/ec2/spark-ec2 -k <keypair> -i <key-file> -s <num-slaves> launch <cluster-name>'

Numerical DataFlow with Spark and TensorFlow

Check out TensorFrames from Databricks.

import tensorflow as tf
import tensorframes as tfs

# A Spark DataFrame whose column names match the placeholders below
df = sqlContext.createDataFrame([(i, i + 1) for i in range(10)], ['x', 'y'])

# Build a small TensorFlow graph: z = x + 3*y
x = tf.placeholder(tf.int32, name='x')
y = tf.placeholder(tf.int32, name='y')
output = tf.add(x, 3 * y, name='z')

# Run the graph once locally with plain TensorFlow
session = tf.Session()
output_value = session.run(output, {x: 3, y: 5})

# Apply the same graph to every row of the DataFrame with TensorFrames
output_df = tfs.map_rows(output, df)
output_df.collect()
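
TensorFrames is not bundled with Spark, so the example above assumes the package is already available on the cluster. A minimal sketch of starting PySpark with TensorFrames from spark-packages (the version coordinates are an assumption; use the release that matches your Spark and Scala build):

In [ ]:
# Sketch: build a pyspark command that pulls in TensorFrames via --packages.
# The package version below is an assumption, not a verified release.
tensorframes_pkg = 'databricks:tensorframes:0.2.3-s_2.10'  # assumed coordinates
pyspark_cmd = '{}/bin/pyspark --packages {}'.format(SPARK_HOME, tensorframes_pkg)
pyspark_cmd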

Connect to master node


In [12]:
def connect_master_node(SPARK_HOME, keypair, keyfile, region, cluster_name):
    """Build the spark-ec2 command that logs into the cluster's master node."""
    bash_cmd = '{}/ec2/spark-ec2 -k {} -i {} --region={} login {}'.format(SPARK_HOME, keypair, keyfile, region, cluster_name)
    return bash_cmd
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2b', 'spark_ec2_cluster')
y = connect_master_node(*args)

In [13]:
y


Out[13]:
'~/spark-1.6.3-bin-hadoop2.6/ec2/spark-ec2 -k aws-key-fast-ai -i ~/.ssh/aws-key-fast-ai.pem --region=us-west-2b login spark_ec2_cluster'

AttributeError: 'NoneType' object has no attribute 'get_all_zones'

The login fails because --region was given an availability zone ('us-west-2b') rather than a region name. boto cannot resolve 'us-west-2b', so spark-ec2 ends up calling get_all_zones() on a None connection. Passing the region 'us-west-2' (optionally with --zone=us-west-2b) should avoid the error.
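
A sketch of the corrected call, assuming the cluster was launched in us-west-2:

In [ ]:
# Sketch: pass the region (not the availability zone) to spark-ec2's --region flag.
args = (SPARK_HOME, ec2_keypair, ec2_keypair_pem, 'us-west-2', 'spark_ec2_cluster')
y = connect_master_node(*args)
y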


In [ ]: