In [1]:
import os

In [2]:
from pyspark import SparkContext

In [3]:
SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

In [4]:
sc = SparkContext(os.environ.get("CLUSTER_URL"), 'pyspark-demo')

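The spark.executor.uri property tells the Mesos workers where to fetch the Spark distribution, and it has to be set before the SparkContext is created. As a sketch, the same setup could be expressed with a single SparkConf (same environment variables assumed):

from pyspark import SparkConf, SparkContext

# Hypothetical equivalent: bundle master URL, app name, and executor URI
# into one SparkConf instead of calling setSystemProperty separately.
conf = (SparkConf()
        .setMaster(os.environ.get("CLUSTER_URL"))
        .setAppName("pyspark-demo")
        .set("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]))
sc = SparkContext(conf=conf)
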
In [5]:
data = sc.parallelize(range(1000))

In [6]:
ret = data.filter(lambda x: x > 10 and x < 20)

In [7]:
ret.collect()


Out[7]:
[11, 12, 13, 14, 15, 16, 17, 18, 19]

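Transformations like filter() and map() are lazy: Spark only builds up the lineage, and nothing runs on the cluster until an action such as collect(), take(), or count() is called. A small sketch (expected result in the comment):

# No cluster work happens here -- this line only defines the computation.
squares = data.filter(lambda x: x % 2 == 0).map(lambda x: x * x)
squares.take(5)  # action: triggers execution; should return [0, 4, 16, 36, 64]
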
Python libraries + HDFS

A simple (and useless) crawler


In [8]:
import requests

In [9]:
urls = sc.parallelize(['http://google.com', 'http://yahoo.com'])

In [10]:
html = urls.map(lambda x: requests.get(x).text)

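Note that requests has to be importable on every worker node for this map to run. A hypothetical refinement with mapPartitions(), reusing one HTTP session per partition instead of opening a fresh connection per URL:

def fetch_partition(urls):
    # One requests.Session per partition, shared by every URL in it.
    session = requests.Session()
    for url in urls:
        yield session.get(url).text

html = urls.mapPartitions(fetch_partition)
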
In [11]:
responses = html.collect()

In [12]:
len(responses)


Out[12]:
2

In [13]:
responses[0][:500]


Out[13]:
u'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="/images/google_favicon_128.png" itemprop="image"><title>Google</title><script>(function(){window.google={kEI:\'tcgMVNapCuLisASx7YHYCQ\',kEXPI:\'17259,4000116,4007661,4008'

In [14]:
responses[1][:500]


Out[14]:
u'<!DOCTYPE html> <html lang="en-US" class="dev-desktop uni-purple-border  bkt901 https  uni-dark-purple sasb-space" style=""> <!-- m2 template  --> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <link rel="dns-prefetch" href="//s.yimg.com"><link rel="dns-prefetch" href="//y.analytics.yahoo.com"><link rel="dns-prefetch" href="//geo.query.yahoo.com"><link rel="dns-prefetch" href="//csc.beap.bc.yahoo.com"> <title>Yahoo</title> <meta http-equiv="X-UA-Compatible" content="c'

In [16]:
html.saveAsTextFile('hdfs://10.0.8.149:8020/user/dsb/crawl')

In [17]:
!hadoop fs -ls -R


log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
drwxr-xr-x   - dsb supergroup          0 2014-09-07 21:06 crawl
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/_SUCCESS
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00000
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00001
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00002
-rw-r--r--   3 dsb supergroup      18506 2014-09-07 21:06 crawl/part-00003
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00004
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00005
-rw-r--r--   3 dsb supergroup          0 2014-09-07 21:06 crawl/part-00006
-rw-r--r--   3 dsb supergroup     336018 2014-09-07 21:06 crawl/part-00007

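One part-XXXXX file is written per partition of the RDD; parallelize() spread the two URLs across the default number of partitions (eight here), so most of the files are empty. If a single output file is wanted, the partitions can be collapsed before writing. A sketch (the output path is hypothetical):

# Merge everything into one partition so HDFS gets a single part file.
html.coalesce(1).saveAsTextFile('hdfs://10.0.8.149:8020/user/dsb/crawl-single')
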
In [18]:
!hadoop fs -tail crawl/part-00003


log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
ction(e){throw e};if(google.timers&&google.timers.load.t){google.timers.load.t.xjsls=new Date().getTime();}google.dljp('/xjs/_/js/k\x3dxjs.hp.en_US.slTuXGyXTUw.O/m\x3dsb_he,pcc/rt\x3dj/d\x3d1/t\x3dzcms/rs\x3dAItRSTNoMR_1dWYQIljmoBFN-c3G40TfYw');google.xjs=1;}google.pmc={"sb_he":{"agen":true,"cgen":true,"client":"heirloom-hp","dh":true,"ds":"","eqch":true,"fl":true,"host":"google.com","jam":0,"jsonp":true,"msgs":{"cibl":"Clear Search","dym":"Did you mean:","lcky":"I\u0026#39;m Feeling Lucky","lml":"Learn more","oskt":"Input tools","psrc":"This search was removed from your \u003Ca href=\"/history\"\u003EWeb History\u003C/a\u003E","psrl":"Remove","sbit":"Search by image","srch":"Google Search"},"ovr":{},"pq":"","qcpw":false,"refoq":true,"scd":10,"sce":5,"stok":"xOxLf1m5MTLwPHezXyZeDftDYNo"},"pcc":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}</script></div></body></html>

NLTK

This doesn't do anything useful; it just shows that NLTK is working :)


In [5]:
import nltk

In [6]:
html = sc.textFile('hdfs://10.0.8.149:8020/user/dsb/crawl')

In [9]:
tokens = html.map(lambda x: nltk.word_tokenize(x))

In [11]:
words = tokens.collect()

In [12]:
len(words)


Out[12]:
1128

In [14]:
len(words[0])


Out[14]:
599

In [17]:
words[0][:20]


Out[17]:
[u'<',
 u'!',
 u'doctype',
 u'html',
 u'>',
 u'<',
 u'html',
 u'itemscope=',
 u"''",
 u"''",
 u'itemtype=',
 u"''",
 u'http',
 u':',
 u'//schema.org/WebPage',
 u"''",
 u'lang=',
 u"''",
 u'en',
 u"''"]

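To turn the tokens into something closer to a real word count, the per-line token lists can be flattened and reduced by key. A sketch, assuming nltk (and its tokenizer data) is installed on every worker:

from operator import add

word_counts = (html.flatMap(lambda line: nltk.word_tokenize(line))
                   .map(lambda token: (token.lower(), 1))
                   .reduceByKey(add))
word_counts.takeOrdered(10, key=lambda kv: -kv[1])  # ten most frequent tokens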