In [1]:
# Install the third-party packages jieba and BeautifulSoup4 with pip, run through sudo.
# NOTE(review): neither package is imported in the cells visible here -- presumably
# later cells use them; confirm before removing.
!sudo pip install jieba
!sudo pip install BeautifulSoup4
In [2]:
# Blog posts to download; every URL shares the same prefix and differs only
# in the numeric post id.
_POST_URL_PREFIX = 'http://chahabi77.pixnet.net/blog/post/'
urllist = [
    _POST_URL_PREFIX + post_id
    for post_id in (
        '436715527',
        '403682269',
        '354943724',
        '386442944',
        '235296791',
    )
]
In [3]:
import json
import urllib2
from contextlib import closing

# Download every page listed in `urllist` and write one JSON record per line
# to ./pixnet.txt, each record holding the page's "html" and its "url".
# The `with` block guarantees the output file is closed even when a download
# raises part-way through the loop.
with open('./pixnet.txt', "w") as f:
    for u in urllist:
        line = {}
        # Python 2 urllib2 responses are not context managers, so closing()
        # is used to release the connection once the body has been read.
        with closing(urllib2.urlopen(u)) as response:
            html = response.read()
        # Remove CR/LF so each record occupies exactly one line of the file.
        html = html.replace('\r', '').replace('\n', '')
        line['html'] = html
        line['url'] = u
        line_str = json.dumps(line)
        f.write(line_str + "\r\n")
In [4]:
import json
pixnet = sc.textFile('./pixnet.txt',use_unicode=False).map(
lambda x : json.loads(x)).map(lambda x : (x['url'],x['html']))
print "URL:", pixnet.first()[0]
print "資料筆數: ", pixnet.count()
print "HTML 前 200 字元:", pixnet.first()[1][:200]
In [6]:
count_nummber = pixnet.filter(lambda x : u"好吃" in x[1] ).count()
if count_nummber == 5 : print "你答對了"
In [7]:
def word_count(text, keyword=u"好吃"):
    """Count the non-overlapping occurrences of ``keyword`` in ``text``.

    Args:
        text: The string to search.
        keyword: The substring to count. Defaults to the keyword this
            notebook analyses, so one-argument callers such as
            ``rdd.mapValues(word_count)`` behave exactly as before.

    Returns:
        The number of non-overlapping occurrences, as returned by
        ``text.count(keyword)``.
    """
    return text.count(keyword)
print "好吃出現了",word_count(u"老師好吃好吃好吃好吃!!!!"),"次"
In [8]:
# Apply word_count to each post's HTML, keeping the URL as the key, and
# display the resulting (url, count) pairs as the cell output.
counts_per_post = pixnet.mapValues(word_count)
counts_per_post.collect()
Out[8]:
In [9]:
total_count = pixnet.mapValues(word_count).map(lambda x : x[1]).reduce(lambda x,y: x+y)
if total_count == 59 : print "你答對了"
else : print "答錯了!你的答案是 %d, 正確答案是59" % (total_count)
In [ ]: