In [1]:
    
def parseRaw(json_map):
    """Pull the (url, html) pair out of one parsed JSON record."""
    url = json_map['url']
    content = json_map['html']
    return (url, content)
    
In [2]:
    
import json
import pprint

pp = pprint.PrettyPrinter(indent=2)

# One JSON object per line; `sc` is the SparkContext provided by the PySpark shell / notebook kernel.
path = "./raingod.txt"
all_content = sc.textFile(path).map(json.loads).map(parseRaw)
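Each line of raingod.txt is assumed to be one JSON object carrying a url and an html field. The record below is purely illustrative (hypothetical URL and markup, not taken from the real file), but it shows the shape parseRaw expects and doubles as a quick local check.

In [ ]:

# Illustrative record only; the real file holds full crawled article pages.
sample_line = '{"url": "http://example.com/post/1", "html": "<div class=\\"article-content\\">hello</div>"}'
parseRaw(json.loads(sample_line))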
    
In [19]:
    
def getContent(x):
    # Import inside the function so the modules are available on the Spark workers.
    from bs4 import BeautifulSoup
    import jieba.posseg as pseg

    soup = BeautifulSoup(x, 'html.parser')
    # The article body lives in the first <div class="article-content"> block.
    article = soup.find_all("div", {"class": "article-content"})[0]

    # Drop whitespace characters before segmenting.
    text = article.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace('\t', '')

    # Keep nouns ('n' in the POS flag) that are at least two CJK characters long.
    r = list()
    for term in pseg.cut(text):
        if len(term.word) > 1 and checkword(term.word) and 'n' in term.flag:
            r.append(term.word)
    return r

def checkword(x):
    # True when every character is a CJK Unified Ideograph.
    return all(u'\u4e00' <= c <= u'\u9fff' for c in x)
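Before mapping getContent over the whole RDD, it can be exercised on a tiny hand-written snippet (the HTML and sentence below are made up) to confirm that bs4 and jieba are installed and that the noun filter behaves as expected.

In [ ]:

# Made-up snippet; needs bs4 and jieba on the driver as well as on the workers.
getContent(u'<div class="article-content">今天天氣很好，我們去公園散步。</div>')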
    
In [20]:
    
parsed = all_content.mapValues(getContent)
first_doc = parsed.first()
print 'url:', first_doc[0]
print 'term:',
for term in first_doc[1][:10]:
    print term,
    
    
In [21]:
    
parsed.values().flatMap(lambda terms: terms).take(10)
    
In [22]:
    
from operator import add

# Count each noun across all documents and sort by frequency, descending.
top_term = (parsed.values()
                  .flatMap(lambda terms: terms)
                  .map(lambda term: (term, 1))
                  .reduceByKey(add)
                  .sortBy(lambda x: x[1], ascending=False))

for term in top_term.take(10):
    print term[0], term[1]
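Both parsed and top_term are reused by the cells that follow; without caching, each later action re-parses and re-segments every document. An optional tweak, using Spark's standard cache():

In [ ]:

# Keep the expensive intermediate RDDs in memory for the later word-cloud cells.
parsed.cache()
top_term.cache()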
    
    
In [23]:
    
%matplotlib inline
    
In [24]:
    
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
    
In [25]:
    
wordcloud = WordCloud(
    font_path='./cwTeXQFangsong-Medium.ttf',  # CJK-capable font, needed to render the Chinese terms
    background_color='white',
    width=600,
    height=600
)
# Recent wordcloud releases expect a {word: frequency} dict here;
# dict() also accepts the (term, count) tuples coming out of Spark.
wordcloud.fit_words(dict(top_term.take(300)))
    
In [26]:
    
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
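If the cloud should also end up on disk rather than only inline, WordCloud can write the image directly; the filename below is arbitrary.

In [ ]:

# Save the same rendering as a PNG next to the notebook (arbitrary filename).
wordcloud.to_file('./wordcloud.png')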
    
    
In [2]:
    
from gensim.models import Word2Vec

# Load a previously trained word2vec model from disk.
wmodels = Word2Vec.load('../data/word2vec.model')
    
In [13]:
    
m = ['日本', '星野']
f = open('./google-word2vec-visual/data/word2vec.csv', 'w')
f.write('name,value\r\n')
for word, score in wmodels.most_similar(positive=m, topn=40):
    print word, score
    # Match the comma-separated header and encode the unicode term for Python 2.
    f.write(word.encode('utf-8') + ',' + str(score) + '\r\n')
f.close()
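Note that gensim 1.0+ moves vector queries onto the model's wv attribute, and gensim 4.x drops the old alias entirely, so on newer installs the equivalent call is:

In [ ]:

# Equivalent query on gensim >= 1.0 (required from gensim 4.x onward).
wmodels.wv.most_similar(positive=m, topn=40)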
    
    