On the client machine (where DSB is available):
In [ ]:
dsb install conda nltk
dsb install conda requests
dsb cmd "sudo /home/ubuntu/anaconda/bin/python -m nltk.downloader -d /usr/share/nltk_data all"
On the head node:
In [ ]:
!sudo -u hdfs hadoop fs -mkdir /user/ubuntu
!sudo -u hdfs hadoop fs -chown ubuntu /user/ubuntu
In [1]:
import os
import sys
In [2]:
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
In [3]:
os.environ["PYSPARK_PYTHON"] = "/home/ubuntu/anaconda/bin/python"
In [4]:
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.8.2.1-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")
In [5]:
from pyspark import SparkConf
from pyspark import SparkContext
conf = SparkConf()
conf.setMaster('yarn-client')
conf.setAppName('spark-yarn')
sc = SparkContext(conf=conf)
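A quick sanity check (added here, not part of the original run) that the context came up against YARN:
In [ ]:
# Confirm the Spark version and that the master is yarn-client.
print(sc.version)
print(sc.master)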
In [6]:
def noop(x):
    # Ignore the input; report which worker host executed the task.
    import socket
    return socket.gethostname()
In [7]:
rdd = sc.parallelize(range(1000), 100)
hosts = rdd.map(noop).distinct().collect()
print(hosts)
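The same pattern can also confirm that the executors picked up the Anaconda interpreter set via PYSPARK_PYTHON; a small check added here, not from the original notebook:
In [ ]:
def worker_python(_):
    # Report the interpreter path each executor is actually using.
    import sys
    return sys.executable

# Every entry should be '/home/ubuntu/anaconda/bin/python' if the
# environment variable was propagated correctly.
print(rdd.map(worker_python).distinct().collect())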
In [8]:
import requests
In [9]:
urls = sc.parallelize(['http://google.com', 'http://yahoo.com'])
In [10]:
# requests.get runs on the executors, so the package must be installed
# cluster-wide (done with dsb above).
html = urls.map(lambda x: requests.get(x).text)
In [11]:
responses = html.collect()
In [12]:
len(responses)
Out[12]:
In [13]:
responses[0][:500]
Out[13]:
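With the bare requests.get above, a single failing URL fails its task and, after retries, the whole job. A minimal defensive sketch (the fetch helper and the 10-second timeout are additions for illustration, not part of the original notebook):
In [ ]:
def fetch(url):
    # Return an empty string instead of raising, so one bad URL
    # does not kill the job; the timeout value is an assumption.
    import requests
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return ''

safe_html = urls.map(fetch)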
In [15]:
html.saveAsTextFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')
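Listing the output directory afterwards (a check added here) shows the part files written by saveAsTextFile:
In [ ]:
!hadoop fs -ls hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl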
In [22]:
import nltk
In [23]:
html = sc.textFile('hdfs://ec2-54-175-19-187.compute-1.amazonaws.com:8020/user/ubuntu/crawl')
In [24]:
tokens = html.map(lambda x: nltk.word_tokenize(x))  # one token list per line of HTML
In [26]:
words = tokens.collect()
In [27]:
len(words)
Out[27]:
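The tokenized RDD above stops short of an actual count; a short sketch of the usual flatMap/reduceByKey word count over the same crawl (the lowercasing and top-10 ordering are choices made here, not in the original notebook):
In [ ]:
# Flatten per-line token lists into words, then count occurrences.
word_counts = (html.flatMap(nltk.word_tokenize)
                   .map(lambda w: (w.lower(), 1))
                   .reduceByKey(lambda a, b: a + b))

# Ten most frequent tokens in the crawled pages.
print(word_counts.takeOrdered(10, key=lambda kv: -kv[1]))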
In [ ]: