In [1]:
def parseRaw(json_map):
    """Reduce one decoded record to a ``(url, html)`` pair.

    Args:
        json_map: mapping that provides the keys ``'url'`` and ``'html'``.

    Returns:
        A 2-tuple ``(json_map['url'], json_map['html'])``.

    Raises:
        KeyError: if either key is missing (``'url'`` is looked up first).
    """
    return (json_map['url'], json_map['html'])
In [2]:
import json
import pprint
# Pretty-printer with a two-space indent, kept around for inspecting values.
pp = pprint.PrettyPrinter(indent=2)

# Each line of the input file is decoded as JSON and reduced to a
# (url, html) pair by parseRaw.
# `sc` is not defined in this notebook -- presumably the SparkContext that the
# PySpark kernel injects; confirm before running on a plain kernel.
path = "./raingod.txt"
all_content = (
    sc.textFile(path)
    .map(json.loads)
    .map(parseRaw)
)
In [19]:
def getContent(x, parser='html.parser'):
    """Extract the candidate terms from one article page.

    The markup is parsed, the text of the first
    ``<div class="article-content">`` is taken, newlines, carriage returns,
    spaces and tabs are removed, and the remaining text is segmented with
    jieba's part-of-speech tagger.

    Args:
        x: HTML markup of the page.
        parser: name of the BeautifulSoup parser to use. Naming one explicitly
            keeps the parse result the same on every machine instead of
            depending on which optional parser happens to be installed.

    Returns:
        list: the segmented words, in document order, that are longer than
        one character, pass ``checkword`` and whose part-of-speech flag
        contains ``'n'``. An empty list is returned when the page has no
        ``article-content`` div, so one odd page cannot abort the whole job.
    """
    # Imports stay function-local as in the original -- presumably so they
    # are resolved on the Spark executors that run this function; confirm.
    from bs4 import BeautifulSoup
    import jieba.posseg as pseg

    soup = BeautifulSoup(x, parser)
    article = soup.find("div", {"class": "article-content"})
    if article is None:
        return []

    text = article.getText()
    for unwanted in ('\n', '\r', ' ', '\t'):
        text = text.replace(unwanted, '')

    # NOTE(review): `'n' in flag` is a substring test, so it also admits
    # flags that merely contain an "n" (not only those starting with it) --
    # confirm this is the intended filter.
    return [
        term.word
        for term in pseg.cut(text)
        if len(term.word) > 1 and checkword(term.word) and 'n' in term.flag
    ]
def checkword(x):
    """Tell whether every character of ``x`` lies in U+4E00..U+9FFF.

    Args:
        x: the word to test (iterated character by character).

    Returns:
        bool: ``False`` as soon as a character outside the range is found,
        ``True`` otherwise -- including for an empty ``x``.
    """
    for ch in x:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True
In [20]:
parsed = all_content.mapValues(lambda x : getContent(x))
print 'url:',parsed.first()[0]
print 'term:',
for term in parsed.first()[1][:10] :
print term ,
In [21]:
# Flatten the per-article term lists into one stream and show ten terms.
parsed.flatMap(lambda pair: pair[1]).take(10)
Out[21]:
In [22]:
from operator import add
top_term = parsed.map(
lambda x: x[1]).flatMap(
lambda x : x).map(
lambda x: (x,1)).reduceByKey(
add).sortBy(
lambda x: x[1],ascending=False)
for term in top_term.take(10):
print term[0] , term[1]
In [23]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [24]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
In [25]:
# Fetch the 300 most frequent (term, count) pairs with a single Spark action:
# every `take` on `top_term` re-runs the whole count-and-sort pipeline, so the
# 30-term string below is sliced from this result instead of queried again.
top_pairs = top_term.take(300)

# Space-separated string of the 30 most frequent terms.
no_urls_no_tags = " ".join(pair[0] for pair in top_pairs[:30])

# 600x600 word cloud on a white background, drawn with the font file that
# sits next to the notebook.
wordcloud = WordCloud(
    font_path='./cwTeXQFangsong-Medium.ttf',
    background_color='white',
    width=600,
    height=600
)

# Size the words by their counts; left as the last expression so the cell
# still displays the returned value.
wordcloud.fit_words(top_pairs)
Out[25]:
In [26]:
# Draw the word cloud on the current axes, without ticks or frame.
axes = plt.gca()
axes.imshow(wordcloud)
axes.axis('off')
plt.show()
In [2]:
from gensim.models import Word2Vec
# Load a saved Word2Vec model from disk.
# NOTE(review): gensim's `load` is understood to unpickle the file -- only
# point this at model files from a trusted source; confirm against gensim docs.
wmodels = Word2Vec.load('../data/word2vec.model')
In [13]:
m= ['日本','星野']
f = open('./google-word2vec-visual/data/word2vec.csv','w')
f.write('name,value\r\n')
for i in wmodels.most_similar(positive=m,topn=40):
print i[0],i[1]
f.write(str(i[0]) + " " + str(i[1]) + "\r\n")
f.close()
In [ ]: