In [1]:
def parseRaw(json_map):
    """Reduce one decoded JSON record to a ``(url, html)`` pair.

    Args:
        json_map: Mapping that provides the ``'url'`` and ``'html'`` keys.

    Returns:
        A 2-tuple ``(url, html)`` holding the values stored under those keys.
        A missing key propagates as the mapping's own lookup error
        (``KeyError`` for a plain dict).
    """
    # 'url' is looked up before 'html', so a record missing both reports 'url'.
    return (json_map['url'], json_map['html'])
In [2]:
import json
import pprint
# Pretty-printer kept around for inspecting nested records interactively.
pp = pprint.PrettyPrinter(indent=2)

# Input dump: every line is decoded as one JSON document below.
path = "./pixnet.txt"

# Decode each line and keep only its (url, html) pair.
# `sc` is not defined in this notebook; presumably the SparkContext injected by
# the PySpark kernel -- confirm before running on a plain Python kernel.
all_content = sc.textFile(path).map(lambda line: parseRaw(json.loads(line)))
In [3]:
def getContent(x, parser='html.parser'):
    """Extract the multi-character Chinese terms from one HTML document.

    The markup is reduced to its text, newlines / carriage returns / spaces /
    tabs are removed, the text is segmented with jieba, and only terms longer
    than one character that pass ``checkword`` are kept.

    Args:
        x: HTML markup of a single page.
        parser: Name of the BeautifulSoup tree builder to use. Passing it
            explicitly keeps the parse identical on every machine; without it
            BeautifulSoup picks whichever parser happens to be installed and
            emits a warning.

    Returns:
        list: the retained terms, in the order jieba produced them.
    """
    # Imported inside the function, as in the original; presumably so the
    # modules are resolved on the Spark workers that execute it -- confirm.
    from bs4 import BeautifulSoup
    import jieba

    text = BeautifulSoup(x, parser).getText()
    # Strip exactly these four whitespace characters so that jieba sees one
    # continuous run of text.
    for blank in ('\n', '\r', ' ', '\t'):
        text = text.replace(blank, '')

    return [term for term in jieba.cut(text)
            if len(term) > 1 and checkword(term)]
def checkword(x):
    """Tell whether every character of ``x`` lies in U+4E00..U+9FFF.

    Args:
        x: The string to inspect.

    Returns:
        bool: ``True`` when no character falls outside that range (this
        includes the empty string), ``False`` otherwise.
    """
    for ch in x:
        # Stop at the first character outside the accepted range.
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True
In [4]:
# Tokenise every page and keep the result cached: `parsed` is the input of
# several actions in this notebook, and without cache() each of them would
# re-run the HTML parsing and word segmentation over the whole corpus.
parsed = all_content.mapValues(getContent).cache()

# Fetch the first record once instead of launching a separate Spark job for
# each printed field.
first_url, first_terms = parsed.first()
print(u'url: %s' % first_url)
# Preview the first ten terms on a single line.
print(u'term: ' + u' '.join(first_terms[:10]))
In [5]:
# Flatten the per-page term lists into one stream and show its first ten items.
for token in parsed.flatMap(lambda kv: kv[1]).take(10):
    print(token)
In [6]:
from operator import add
# Word count over every term of every page, ordered from the most to the
# least frequent.
top_term = (
    parsed
    .flatMap(lambda kv: kv[1])
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# Show the ten most frequent terms next to their counts.
for word, count in top_term.take(10):
    print(u'%s %s' % (word, count))
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
In [8]:
# Hand the 30 most frequent (term, count) pairs to the cloud directly so that
# word size reflects the real counts. Joining the terms into one string and
# calling generate() makes WordCloud re-tokenise the text, in which every term
# occurs exactly once, so the counts computed above would be discarded.
# NOTE(review): generate_from_frequencies takes a dict in current wordcloud
# releases -- confirm against the installed version.
top_frequencies = dict(top_term.take(30))
wordcloud = WordCloud(
    # Font file shipped next to the notebook; presumably chosen because it
    # contains the Chinese glyphs the default font lacks -- confirm.
    font_path='./cwTeXQFangsong-Medium.ttf',
    background_color='white',
    width=600,
    height=600
).generate_from_frequencies(top_frequencies)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
In [10]:
# Word count over every term of every page, sorted ascending: despite its
# name, `top_term2` starts with the least frequent terms.
top_term2 = (
    parsed
    .flatMap(lambda kv: kv[1])
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=True)
)
In [11]:
# Display the first (term, count) pair of `top_term2`, i.e. the entry that the
# ascending sort placed at the front.
top_term2.first()
Out[11]:
In [12]:
# Hand the first 40 (term, count) pairs of the ascending ranking to the cloud
# directly. Joining the terms into one string and calling generate() makes
# WordCloud re-tokenise text that is already segmented and recount it from
# scratch, ignoring the counts computed by the Spark job.
# NOTE(review): generate_from_frequencies takes a dict in current wordcloud
# releases -- confirm against the installed version.
rare_frequencies = dict(top_term2.take(40))
wordcloud = WordCloud(
    # Font file shipped next to the notebook; presumably chosen because it
    # contains the Chinese glyphs the default font lacks -- confirm.
    font_path='./cwTeXQFangsong-Medium.ttf',
    background_color='white',
    width=600,
    height=600
).generate_from_frequencies(rare_frequencies)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
In [ ]: