Before starting

On the client (where DSB is available):


In [ ]:
dsb install conda nltk
dsb install conda requests
dsb cmd "sudo /home/ubuntu/anaconda/bin/python -m nltk.downloader -d /usr/share/nltk_data all"

On the head node:


In [ ]:
!sudo -u hdfs hadoop fs -mkdir /user/ubuntu
!sudo -u hdfs hadoop fs -chown ubuntu /user/ubuntu
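
A quick listing verifies that the home directory exists and is owned by ubuntu (just a sanity check; hadoop fs -ls is the standard way to inspect HDFS paths):

In [ ]:
!sudo -u hdfs hadoop fs -ls /user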

PySpark


In [1]:
import os
import sys

In [2]:
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

In [3]:
os.environ["PYSPARK_PYTHON"] = "/home/ubuntu/anaconda/bin/python"

In [4]:
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.8.2.1-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [5]:
from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn-client')
conf.setAppName('spark-yarn')
sc = SparkContext(conf=conf)
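
As an optional sanity check, the context's version and master can be printed to confirm it is up and pointed at YARN (master should come back as yarn-client, matching the config above):

In [ ]:
print(sc.version)
print(sc.master)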

Test


In [6]:
def noop(x):
    # ignore the element; just report which worker ran the task
    import socket
    return socket.gethostname()

In [7]:
rdd = sc.parallelize(range(1000), 100)
hosts = rdd.map(noop).distinct().collect()
print(hosts)


['ip-172-31-8-22', 'ip-172-31-6-49', 'ip-172-31-9-114']

Requests


In [8]:
import requests

In [9]:
urls = sc.parallelize(['http://google.com', 'http://yahoo.com'])

In [10]:
html = urls.map(lambda x: requests.get(x).text)
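
A bare requests.get will raise inside the executor on a slow or unreachable site and fail the whole task. A slightly more defensive variant is sketched below; the 10-second timeout and empty-string placeholder are arbitrary choices, not part of the original run:

In [ ]:
def fetch(url):
    # return the page body, or an empty string if the request fails
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return ''

# html = urls.map(fetch)   # drop-in replacement for the cell above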

In [11]:
responses = html.collect()

In [12]:
len(responses)


Out[12]:
2

In [13]:
responses[0][:500]


Out[13]:
u'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title>'

In [15]:
html.saveAsTextFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')
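
saveAsTextFile writes one part-NNNNN file per partition and fails if the target directory already exists, so a listing is an easy way to confirm the write went through:

In [ ]:
!hadoop fs -ls /user/ubuntu/crawl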

NLTK


In [22]:
import nltk
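
Since word_tokenize runs on the executors, it is worth checking that the workers can import nltk and locate the tokenizer data pushed to /usr/share/nltk_data earlier. A small probe job (assuming the punkt tokenizer was part of the download):

In [ ]:
def probe(_):
    import nltk
    # raises LookupError on any worker that cannot find the data
    return str(nltk.data.find('tokenizers/punkt'))

sc.parallelize(range(10), 10).map(probe).distinct().collect()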

In [23]:
html = sc.textFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')

In [24]:
tokens = html.map(lambda line: nltk.word_tokenize(line))

In [26]:
words = tokens.collect()  # one list of tokens per line of the crawl

In [27]:
len(words)


Out[27]:
1107
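
From here an actual word count is one step away: flatten the per-line token lists and reduce by key. A standard flatMap/reduceByKey sketch, not part of the original run:

In [ ]:
word_counts = (tokens.flatMap(lambda toks: toks)
                     .map(lambda w: (w, 1))
                     .reduceByKey(lambda a, b: a + b))

# ten most frequent tokens
word_counts.takeOrdered(10, key=lambda kv: -kv[1])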
