In [3]:
import os
import sys
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
print sys.path
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
In [13]:
def parseRaw(json_map):
    """Reduce one decoded record to a ``(url, html)`` pair.

    Args:
        json_map: mapping that provides the keys ``'url'`` and ``'html'``.

    Returns:
        A 2-tuple ``(json_map['url'], json_map['html'])``.

    Raises:
        KeyError: if either key is absent (``'url'`` is looked up first).
    """
    return (json_map['url'], json_map['html'])
In [38]:
import json
import pprint
# Pretty-printer (2-space indent) used by later cells to show results.
pp = pprint.PrettyPrinter(indent=2)

# Each line of the input file is decoded with json.loads and then reduced to
# a (url, html) pair by parseRaw.
path = "./pixnet.txt"
all_content = (
    sc.textFile(path)
    .map(json.loads)
    .map(parseRaw)
)
In [39]:
def _imgsrc_key(host):
    """Turn a URL network location into an ``imgsrc_<domain>`` counter key.

    Returns None when ``host`` contains no dot (e.g. the empty netloc of a
    relative URL). A host with exactly one dot is kept whole; otherwise
    everything up to and including the first dot is dropped.
    """
    if '.' not in host:
        return None
    if host.count('.') > 1:
        host = host[host.index('.') + 1:]
    return 'imgsrc_' + host


def parseImgSrc(x):
    """Collect ``imgsrc_<domain>`` keys for every image source in an HTML page.

    The ``src`` attributes of all ``<img>`` elements are processed first,
    followed by the ``src`` attributes of all ``<input>`` elements, so the
    result keeps that order. Sources whose URL has no dotted host are skipped.

    Args:
        x: HTML markup of one page.

    Returns:
        A list of ``'imgsrc_' + domain`` strings, one per usable source
        (duplicates are kept). This is best-effort: if the page cannot be
        parsed, or lxml cannot be imported, the failure is printed and
        whatever was collected so far (possibly an empty list) is returned.
    """
    # (XPath query, message printed when one of its src values cannot be handled)
    queries = (
        ('//img/@src', 'Error Parse At:'),
        ('//input[@src]/@src', 'Error parseImgSrc At:'),
    )
    urls = []
    try:
        # Imports stay function-local, as in the original notebook cell.
        import lxml.html
        try:
            from urlparse import urlparse  # Python 2
        except ImportError:
            from urllib.parse import urlparse  # Python 3
        root = lxml.html.fromstring(x)
        for query, error_label in queries:
            for src in root.xpath(query):
                try:
                    key = _imgsrc_key(urlparse(src).netloc)
                except Exception:
                    # One bad URL must not discard the rest of the page.
                    print('%s %s' % (error_label, src))
                    continue
                if key is not None:
                    urls.append(key)
    except Exception as err:
        # Report instead of silently passing; %r keeps the message printable.
        print('Error parseImgSrc: %r' % (err,))
    return urls
In [41]:
# One entry per page: the list that parseImgSrc builds from the page's HTML
# (second element of each (url, html) pair).
image_list = all_content.map(lambda page: parseImgSrc(page[1]))

# Show only the first page's list, capped at ten entries.
pp.pprint(image_list.first()[:10])
In [47]:
img_src_count = all_content.map(
lambda x :parseImgSrc(x[1])).flatMap(
lambda x: x).countByValue()
for i in img_src_count:
print i , ':' , img_src_count[i]
請參照以下文件：[pyspark.RDD API 文件](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD)
In [64]:
from operator import add
# Pair every extracted key with 1, sum the ones per key, order the
# (key, count) pairs by count from highest to lowest, and collect them
# as a list.
(all_content
 .flatMap(lambda page: parseImgSrc(page[1]))
 .map(lambda host_key: (host_key, 1))
 .reduceByKey(add)
 .sortBy(lambda pair: pair[1], ascending=False)
 .collect())
Out[64]:
[('imgsrc_pixfs.net', 219),
('imgsrc_agoda.net', 103),
('imgsrc_static.flickr.com', 53),
('imgsrc_staticflickr.com', 28),
('imgsrc_pimg.tw', 19),
('imgsrc_facebook.com', 12),
('imgsrc_sitebro.com', 10),
('imgsrc_linkwithin.com', 5),
('imgsrc_cloudfront.net', 5),
('imgsrc_prchecker.info', 5),
('imgsrc_visit-japan.jp', 5),
('imgsrc_yimg.com', 2),
('imgsrc_zenfs.com', 2),
('imgsrc_googleusercontent.com', 1)]