In [1]:
# Display the SparkContext to confirm it is available. `sc` is not created
# anywhere in this notebook, so it is presumably injected by the PySpark
# shell/kernel at start-up -- TODO confirm.
sc
Out[1]:
In [3]:
import os

# Location of the Reddit comment dump. The later cells apply json.loads to
# every line, so the file is expected to hold one JSON object per line.
# Set the REDDIT_DATA_PATH environment variable to run the notebook on another
# machine; the default is the path this notebook was originally written with.
REDDIT_DATA_PATH = os.environ.get(
    'REDDIT_DATA_PATH',
    '/Users/adrianopagano/Desktop/Big_Dive/BigDive5/Data/reddit_2014-12-10k.json'
)

# RDD of raw text lines (still unparsed strings).
docs = sc.textFile(REDDIT_DATA_PATH)
In [4]:
# Peek at the first raw line of the file (returned as a one-element list)
# to see what a record looks like before it is parsed.
docs.take(1)
Out[4]:
In [5]:
import json
In [53]:
# Decode each line as JSON, keep only its 'body' field, and show the first
# two values as a sanity check.
docs.map(lambda line: json.loads(line)['body']).take(2)
Out[53]:
In [7]:
import re
# Word matcher used by the word-count cell: starting at a word boundary
# (Unicode-aware because of the inline (?u) flag), take a run of at least
# three ASCII letters. Written as a raw string so the backslash needs no
# escaping; the resulting pattern text is unchanged.
pattern = r'(?u)\b[A-Za-z]{3,}'
In [8]:
# Ten most frequent words over all comment bodies.
# Words are whatever `pattern` matches, lower-cased before counting.
# The result is a list of (count, word) tuples, highest count first; the order
# among words with equal counts is not defined.
(
    docs.map(json.loads)
    .map(lambda comment: comment['body'])
    .flatMap(lambda text: re.findall(pattern, text))
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda x, y: x + y)
    # Swap to (count, word) so sortByKey orders by frequency. Index access is
    # used instead of `lambda (a, b): ...` because tuple parameters are a
    # SyntaxError on Python 3.
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .take(10)
)
Out[8]:
In [40]:
# Split the subreddits present in the data into "serious" and "funny" groups
# by case-insensitive keyword match on the subreddit name.

# Deduplicate on the cluster so only the distinct names reach the driver.
total_subreddit = set(
    docs.map(json.loads)
    .map(lambda x: x['subreddit'])
    .distinct()
    .collect()
)

topics_serious = ['philosophy', 'history', 'politics', 'science']
topics_funny = ['funny', 'memes', 'humour', 'comic']


def _mentions_topic(name, topics):
    """Return True if at least one keyword in `topics` occurs in `name`.

    The comparison ignores case. An explicit loop is used instead of
    any(<generator>) because `%pylab inline`, used elsewhere in this notebook,
    star-imports numpy, whose `any` does not evaluate generators the way the
    builtin does.
    """
    lowered = name.lower()
    for topic in topics:
        if topic.lower() in lowered:
            return True
    return False


# Each subreddit is listed at most once per group, even when its name contains
# several keywords (previously it was appended once per matching keyword,
# which inflated the counts below). Sorted for reproducible output.
subreddit_serious = sorted(
    s for s in total_subreddit if _mentions_topic(s, topics_serious)
)
subreddit_funny = sorted(
    s for s in total_subreddit if _mentions_topic(s, topics_funny)
)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('%s %s' % (subreddit_serious, subreddit_funny))
print('Number of serious subreddits: ' + str(len(subreddit_serious)))
print('Number of funny subreddits: ' + str(len(subreddit_funny)))
In [11]:
# Enable inline rendering of matplotlib figures.
# NOTE(review): besides selecting the inline backend, %pylab star-imports
# numpy and matplotlib names into the notebook namespace.
%pylab inline
In [52]:
%pylab inline
import matplotlib.pyplot as plt
import seaborn
num_serious = docs.map(json.loads)\
.filter(lambda x: x['subreddit'] in subreddit_serious) \
.map(lambda x: (x['subreddit'], 1)) \
.groupByKey() \
.map(lambda pair: len(pair[0]))\
.collect()
num_funny = docs.map(json.loads)\
.filter(lambda x: x['subreddit'] in subreddit_funny) \
.map(lambda x: (x['subreddit'], 1)) \
.groupByKey() \
.map(lambda pair: len(pair[0]))\
.collect()
fig, ax = plt.subplots(1,2)
ax[0].bar(range(len(num_serious)), sorted(num_serious))
ax[1].bar(range(len(num_funny)), sorted(num_funny))
ax[0].set_xlabel('Serious subreddit')
ax[1].set_xlabel('Funny subreddit')
ax[0].set_ylabel('Comments')
ax[0].set_ylim([0,25])
Out[52]:
In [ ]:
In [ ]: