In [1]:
import os
In [2]:
from pyspark import SparkContext
In [3]:
SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])
In [4]:
sc = SparkContext(os.environ.get("CLUSTER_URL"), 'pyspark-demo')
In [5]:
data = sc.parallelize(range(1000))
In [6]:
ret = data.filter(lambda x: x > 10 and x < 20)
In [7]:
ret.collect()
Out[7]:
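collect() pulls the whole result back to the driver, which is fine for this toy range but would not be for a large RDD; as a rough sketch, the same data RDD could be summarised with count() or take() instead:
In [ ]:
# Actions that avoid materialising the full result on the driver:
data.filter(lambda x: 10 < x < 20).count()   # number of matching elements
data.filter(lambda x: 10 < x < 20).take(5)   # only the first five matches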
A simple (and useless) crawler
In [8]:
import requests
In [9]:
urls = sc.parallelize(['http://google.com', 'http://yahoo.com'])
In [10]:
html = urls.map(lambda x: requests.get(x).text)
In [11]:
responses = html.collect()
In [12]:
len(responses)
Out[12]:
In [13]:
responses[0][:500]
Out[13]:
In [14]:
responses[1][:500]
Out[14]:
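requests.get inside map opens a fresh connection for every URL. For anything beyond a couple of URLs, one option (a sketch, not what was run here; fetch_partition and fetched are made-up names) is mapPartitions with a shared requests.Session per partition:
In [ ]:
def fetch_partition(urls_iter):
    import requests                   # imported on the worker process
    session = requests.Session()      # one connection pool reused for the whole partition
    for url in urls_iter:
        yield session.get(url).text

fetched = urls.mapPartitions(fetch_partition)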
In [16]:
html.saveAsTextFile('hdfs://10.0.8.149:8020/user/dsb/crawl')
In [17]:
!hadoop fs -ls -R
In [18]:
!hadoop fs -tail crawl/part-00003
This doesn't do anything useful, it just shows that NLTK is working :)
In [5]:
import nltk
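word_tokenize depends on NLTK's punkt tokenizer models, which have to be present on every worker, not only on the driver. Assuming they are not already baked into the executor image, they can be fetched on each machine like this:
In [ ]:
# Run on every worker node; downloading only on the driver is not enough.
nltk.download('punkt')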
In [6]:
html = sc.textFile('hdfs://10.0.8.149:8020/user/dsb/crawl')
In [9]:
counts = html.map(lambda x: nltk.word_tokenize(x))
In [11]:
words = counts.collect()
In [12]:
len(words)
Out[12]:
In [14]:
len(words[0])
Out[14]:
In [17]:
words[0][:20]
Out[17]:
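The RDD above is called counts, but nothing is counted yet. A sketch of the obvious follow-up, a word count over the same html RDD (not part of the original run):
In [ ]:
word_counts = (html
               .flatMap(nltk.word_tokenize)           # one record per token
               .map(lambda w: (w.lower(), 1))
               .reduceByKey(lambda a, b: a + b))
word_counts.takeOrdered(10, key=lambda kv: -kv[1])    # ten most frequent tokens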
In [ ]: