In [1]:
    
def parseRaw(json_map):
    """Pick the page URL and its raw HTML out of one decoded record.

    Args:
        json_map: mapping that provides the keys ``'url'`` and ``'html'``.

    Returns:
        A ``(url, html)`` 2-tuple.

    Raises:
        KeyError: when ``json_map`` is a dict and one of the keys is absent.
    """
    # 'url' is looked up before 'html', so a record lacking both reports 'url'.
    return (json_map['url'], json_map['html'])
    
In [2]:
    
import json
import pprint
# Pretty-printer kept available for inspecting nested values interactively.
pp = pprint.PrettyPrinter(indent=2)

# Input file; every line is handed to json.loads below, so each line has to be
# a complete JSON document.
path = "./pixnet.txt"

# `sc` is not defined in this notebook -- presumably the SparkContext supplied
# by the PySpark kernel (TODO confirm).
# Pipeline: read lines -> decode JSON -> keep (url, html) pairs via parseRaw.
all_content = (
    sc.textFile(path)
    .map(json.loads)
    .map(parseRaw)
)
    
In [3]:
    
def getContent(x, parser='html.parser'):
    """Extract the multi-character, all-CJK terms from an HTML document.

    The markup is stripped with BeautifulSoup, newlines, carriage returns,
    spaces and tabs are removed from the remaining text, and the result is
    segmented with jieba. Only terms longer than one character that pass
    ``checkword`` are kept.

    Args:
        x: HTML markup of one page.
        parser: name of the BeautifulSoup tree builder to use. Defaults to the
            standard-library ``'html.parser'`` so that every machine running
            this function parses pages the same way; pass e.g. ``'lxml'`` to
            opt into another installed parser.

    Returns:
        list of the retained terms, in the order jieba produced them.
    """
    # Imported inside the function, as in the original cell -- presumably so
    # the import happens wherever Spark executes this function (TODO confirm).
    from bs4 import BeautifulSoup
    import jieba

    # Without an explicit parser BeautifulSoup picks whichever one happens to
    # be installed (and warns about it), so results could differ per machine.
    soup = BeautifulSoup(x, parser)

    # get_text() is the documented spelling of the getText() alias.
    text = soup.get_text()
    for whitespace in ('\n', '\r', ' ', '\t'):
        text = text.replace(whitespace, '')

    return [
        term
        for term in jieba.cut(text)
        if len(term) > 1 and checkword(term)
    ]
def checkword(x):
    """Tell whether every character of ``x`` lies in U+4E00..U+9FFF.

    Args:
        x: the text to inspect, examined one character at a time.

    Returns:
        True when no character falls outside the range (which includes the
        case of an empty ``x``), False otherwise.
    """
    for ch in x:
        # Stop at the first character outside the accepted range.
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True
    
In [4]:
    
parsed = all_content.mapValues(lambda x : getContent(x))
print 'url:',parsed.first()[0]
print 'term:',
for term in parsed.first()[1][:10] :
    print term ,
    
    
In [5]:
    
for i in parsed.map(lambda x: x[1]).flatMap(lambda x : x).take(10):
    print i
    
    
In [6]:
    
from operator import add
top_term = parsed.map(
    lambda x: x[1]).flatMap(
    lambda x : x).map(
    lambda x: (x,1)).reduceByKey(
    add).sortBy(
    lambda x: x[1],ascending=False)
for term in top_term.take(10):
    print term[0] , term[1]
    
    
In [7]:
    
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
    
In [8]:
    
# Space-separated text made of the 30 terms at the head of top_term.
# NOTE(review): each term occurs exactly once in this string, so the counts
# computed upstream are not passed on to WordCloud -- confirm this is intended.
top_words = top_term.map(lambda pair: pair[0]).take(30)
no_urls_no_tags = " ".join(top_words)

# The font file is given explicitly; presumably needed so the CJK glyphs
# render (TODO confirm).
cloud_options = dict(
    font_path='./cwTeXQFangsong-Medium.ttf',
    background_color='white',
    width=600,
    height=600,
)
wordcloud = WordCloud(**cloud_options).generate(no_urls_no_tags)

# Display the cloud as an image without axes.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
    
    
In [10]:
    
# Per-term occurrence counts ordered ascending: despite the name, the head of
# this RDD holds the least frequent terms.
top_term2 = (
    parsed
    .flatMap(lambda pair: pair[1])       # one element per term occurrence
    .map(lambda term: (term, 1))
    .reduceByKey(add)                    # (term, total occurrences)
    .sortBy(lambda pair: pair[1], ascending=True)
)
    
In [11]:
    
# Display the first (term, count) pair of the ascending ordering.
top_term2.first()
    
    Out[11]:
In [12]:
    
# Space-separated text made of the 40 terms at the head of top_term2.
# NOTE(review): each term occurs exactly once in this string, so the counts
# computed upstream are not passed on to WordCloud -- confirm this is intended.
rare_words = top_term2.map(lambda pair: pair[0]).take(40)
no_urls_no_tags = " ".join(rare_words)

# The font file is given explicitly; presumably needed so the CJK glyphs
# render (TODO confirm).
cloud_options = dict(
    font_path='./cwTeXQFangsong-Medium.ttf',
    background_color='white',
    width=600,
    height=600,
)
wordcloud = WordCloud(**cloud_options).generate(no_urls_no_tags)

# Display the cloud as an image without axes.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
    
    
In [ ]: