Before starting

On the client (where DSB is available):


In [ ]:
dsb install conda nltk
dsb install conda requests
dsb cmd "sudo /home/ubuntu/anaconda/bin/python -m nltk.downloader -d /usr/share/nltk_data all"

On the head node:


In [ ]:
!sudo -u hdfs hadoop fs -mkdir /user/ubuntu
!sudo -u hdfs hadoop fs -chown ubuntu /user/ubuntu
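
A quick listing verifies that the home directory exists and is owned by ubuntu (just a sanity check; hadoop fs -ls is the standard way to inspect HDFS paths):

In [ ]:
!sudo -u hdfs hadoop fs -ls /user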

PySpark


In [1]:
import os
import sys

In [2]:
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

In [3]:
os.environ["PYSPARK_PYTHON"] = "/home/ubuntu/anaconda/bin/python"

In [4]:
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.8.2.1-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [5]:
from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn-client')
conf.setAppName('spark-yarn')
sc = SparkContext(conf=conf)
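
As an optional sanity check, the context's version and master can be printed to confirm it is up and pointed at YARN (master should come back as yarn-client, matching the config above):

In [ ]:
print(sc.version)
print(sc.master)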

Test


In [6]:
def noop(x):
    # ignore the element; just report which worker ran the task
    import socket
    return socket.gethostname()

In [7]:
rdd = sc.parallelize(range(1000), 100)
hosts = rdd.map(noop).distinct().collect()
print(hosts)


['ip-172-31-8-22', 'ip-172-31-6-49', 'ip-172-31-9-114']

Requests


In [8]:
import requests

In [9]:
urls = sc.parallelize(['http://google.com', 'http://yahoo.com'])

In [10]:
html = urls.map(lambda x: requests.get(x).text)
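
A bare requests.get will raise inside the executor on a slow or unreachable site and fail the whole task. A slightly more defensive variant is sketched below; the 10-second timeout and empty-string placeholder are arbitrary choices, not part of the original run:

In [ ]:
def fetch(url):
    # return the page body, or an empty string if the request fails
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return ''

# html = urls.map(fetch)   # drop-in replacement for the cell above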

In [11]:
responses = html.collect()

In [12]:
len(responses)


Out[12]:
2

In [13]:
responses[0][:500]


Out[13]:
u'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title>'

In [15]:
html.saveAsTextFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')
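
saveAsTextFile writes one part-NNNNN file per partition and fails if the target directory already exists, so a listing is an easy way to confirm the write went through:

In [ ]:
!hadoop fs -ls /user/ubuntu/crawl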

NLTK


In [22]:
import nltk
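
Since word_tokenize runs on the executors, it is worth checking that the workers can import nltk and locate the tokenizer data pushed to /usr/share/nltk_data earlier. A small probe job (assuming the punkt tokenizer was part of the download):

In [ ]:
def probe(_):
    import nltk
    # raises LookupError on any worker that cannot find the data
    return str(nltk.data.find('tokenizers/punkt'))

sc.parallelize(range(10), 10).map(probe).distinct().collect()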

In [23]:
html = sc.textFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')

In [24]:
tokens = html.map(lambda line: nltk.word_tokenize(line))

In [26]:
words = tokens.collect()  # one list of tokens per line of the crawl

In [27]:
len(words)


Out[27]:
1107
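
From here an actual word count is one step away: flatten the per-line token lists and reduce by key. A standard flatMap/reduceByKey sketch, not part of the original run:

In [ ]:
word_counts = (tokens.flatMap(lambda toks: toks)
                     .map(lambda w: (w, 1))
                     .reduceByKey(lambda a, b: a + b))

# ten most frequent tokens
word_counts.takeOrdered(10, key=lambda kv: -kv[1])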
