In [1]:
%pylab inline
In [27]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import time
import string
import re
# Display the SparkContext object injected by the PySpark shell/kernel.
sc
Out[27]:
In [3]:
# Load the book as an RDD of lines (one element per line of the file).
text = sc.textFile('Jungle_Book.txt')
In [4]:
def splitlines(text):
    """Split a line of text into word tokens.

    Uses split() with no argument so that runs of whitespace (multiple
    spaces, tabs) do not produce empty-string tokens — split(' ') would
    emit '' for every consecutive space and inflate the word counts.
    """
    return text.split()
# Materialise every space-split token on the driver as a plain Python list
# (collect() pulls the whole RDD into local memory — fine for one book).
splitted = text.flatMap(splitlines).collect()
In [36]:
start = time.time()  # wall-clock start of the Spark word-count timing
def get_freq(word):
    """Map a word to a (word, 1) pair for reduce-based counting."""
    return (word, 1)
def get_count(a, b):
    """Combine two partial counts (reducer for reduceByKey)."""
    total = a + b
    return total
def switch_tuple(t):
    """Swap a 2-tuple's elements so the count becomes the sort key."""
    first, second = t
    return (second, first)
# Word -> count, flipped to (count, word) and sorted descending by count.
# Keyword args make sortByKey(0, 1) readable: descending, single partition.
freqs = text.flatMap(splitlines).map(get_freq).reduceByKey(get_count) \
            .map(switch_tuple).sortByKey(ascending=False, numPartitions=1)
# Spark transformations are lazy: without an action here, the timing below
# would only measure DAG construction, not the actual computation, making
# the comparison against pure Python meaningless.
freqs.count()
time_spark = time.time() - start
In [17]:
# count() is an action: it triggers a full Spark job over the RDD.
freqs.count() #total number of unique words in the text
Out[17]:
In [9]:
# Pure-Python baseline: count word occurrences with a plain dict,
# timed for comparison against the Spark pipeline above.
start = time.time()
freqs_2 = {}
for word in splitted:
    freqs_2[word] = freqs_2.get(word, 0) + 1
time_py = time.time() - start
In [10]:
# Parenthesized single-argument print is valid in both Python 2 and 3,
# unlike the bare print statement.
print('Time Spark: ' + str(time_spark))
print('Time Python: ' + str(time_py))
In [18]:
# take(20) asks Spark for only the first 20 elements; collect()[:20]
# would ship the entire sorted RDD to the driver just to slice it.
freqs_20 = freqs.take(20)
print(freqs_20)
In [19]:
# Unzip the (count, word) pairs into parallel lists for plotting.
words = []
frequencies = []
for count, word in freqs_20:
    words.append(word)
    frequencies.append(count)
In [20]:
# Bar chart of the 20 most frequent tokens, using the object-oriented
# Axes API: plt.bar(..., axes=ax) relied on a deprecated Artist kwarg,
# and tick.label is a deprecated attribute — tick_params is the
# supported way to size tick labels.
fig, ax = plt.subplots(figsize=(20, 20))
ax.bar(range(len(frequencies)), frequencies)
ax.set_xticks(range(len(frequencies)))
ax.set_xticklabels(words, rotation=70, fontsize=14)
ax.tick_params(axis='y', labelsize=14)
In [22]:
# After switch_tuple the pairs are (count, word), so pair[1] is the word;
# matches like "time," / "time." show punctuation still clings to tokens.
freqs.filter(lambda pair: pair[1].startswith('time')).take(10) # this shows that we have to parse the text
Out[22]:
In [78]:
# Regex experiments for cleaning tokens. Raw strings (r'...') keep the
# backslashes literal — '\w' without r works by accident and raises
# DeprecationWarning on newer Pythons for unrecognized escapes.
wre = re.compile(r'\w')          # matches word characters
print(wre.sub('', 'This. Is, 22 A Text{}'))
notwre = re.compile(r'[^\w]')    # matches NON-word characters
print(notwre.sub('', 'This. Is, 22 A Text{}'))
# Final cleaner: strip punctuation (but keep spaces) and digit runs.
notwre = re.compile(r'([^\w\s]|\d+)')
print(notwre.sub('', 'This. Is, 22 A Text{}').lower())
In [91]:
# Word frequencies after normalisation: treat '--' as a separator,
# lowercase, strip punctuation/digits via notwre, and ignore tokens of
# two characters or fewer before counting.
(text.map(lambda line: line.replace('--', ' ').lower())
     .flatMap(lambda line: line.split())
     .map(lambda token: notwre.sub('', token))
     .filter(lambda token: len(token) > 2)
     .map(lambda token: (token, 1))
     .reduceByKey(lambda a, b: a + b)
     .sortBy(lambda pair: pair[1], False)
     .take(20))
# to parse the text for 'stopwords' it is possible to use the list found at: http://www.ranks.nl/stopwords
Out[91]:
In [95]:
import pymongo
import csv

cli = pymongo.MongoClient('mongodb://localhost:27017')

# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
with open('/Users/adrianopagano/Desktop/Big_Dive/BigDive5/Data/Baby_Names__Beginning_2007.csv') as csvfile:
    babies = csv.reader(csvfile)
    next(babies)  # skip the header row
    docs = [{'year': row[0],
             'firstname': row[1],
             'county': row[2],
             'sex': row[3],
             'count': row[4]}
            for row in babies]

# insert_many replaces the deprecated Collection.insert and batches the
# writes instead of doing one round-trip per CSV row.
cli.testdb.babies.insert_many(docs)
In [96]:
# Collection.count() was deprecated and later removed from PyMongo;
# count_documents({}) is the supported way to count all documents.
cli.testdb.babies.count_documents({})
Out[96]:
In [97]:
# NOTE(review): mongoRDD is not part of stock PySpark — presumably provided
# by a MongoDB-Spark connector (e.g. mongo-hadoop/pymongo-spark) configured
# for this kernel; verify the connector setup before re-running.
babies_rdd = sc.mongoRDD('mongodb://localhost:27017/testdb.babies') # this creates a Spark RDD from the mongoDB collections
In [99]:
# Peek at two raw documents from the collection.
babies_rdd.take(2)
Out[99]:
In [104]:
# Exact-match filter on the firstname field (assumes names are stored
# upper-case in this dataset — TODO confirm against the source CSV).
babies_rdd.filter(lambda d: d['firstname']=='MICHAEL').take(5)
Out[104]:
In [ ]:
# NOTE(review): leftover scratch cell — `babies` is the csv.reader from the
# insert cell; its underlying file is already closed/exhausted here, so this
# output is stale. Consider deleting this cell.
babies
In [101]:
# (firstname, county) pairs — one element per source document.
names_to_counties = babies_rdd.map(lambda d: (d['firstname'], d['county']))
names_to_counties.take(5)
Out[101]:
In [102]:
# Group every county occurrence under its name. NOTE(review): groupByKey
# shuffles all values; if only the group sizes are needed downstream,
# reduceByKey/countByKey would avoid materialising the value lists.
grouped_names=names_to_counties.groupByKey()
grouped_names.take(5)
Out[102]:
In [105]:
# len(pair[1]) counts occurrences (the same county repeats across years
# and sexes), not distinct counties — top 10 by raw occurrence count.
grouped_names.map(lambda pair: (pair[0], len(pair[1]))).sortBy(lambda pair: pair[1], False).take(10) # pair[1] is a list of all counties where the name appeared
Out[105]:
In [112]:
# Total births per first name, summed over all years, counties and sexes.
names_freqs = babies_rdd.map(lambda doc: (doc['firstname'], int(doc['count']))) \
                        .reduceByKey(lambda a, b: a + b)
# Ten most frequent names overall (descending by total count).
names_freqs.sortBy(lambda pair: pair[1], False).take(10)
Out[112]:
In [118]:
# Total births per year, ordered by year ascending. The original chained
# two sortBy calls; the first (descending by count) was immediately
# overridden by the sort on year, so it did a pointless extra shuffle
# and is dropped here — results are identical since years are unique keys.
babies_rdd.map(lambda d: (int(d['year']), int(d['count']))) \
          .reduceByKey(lambda a, b: a + b) \
          .sortBy(lambda pair: pair[0], True) \
          .collect()
Out[118]:
In [140]:
# Most popular name per county: key by (county, name) and sum the counts,
# then re-key by county with (total, name) values so that max() on the
# tuples picks the highest count (ties broken by the lexicographically
# later name — acceptable here). take(5) shows a sample of the winners.
babies_rdd.map(lambda d: ((d['county'].lower(), d['firstname'].lower()), int(d['count']))) \
.reduceByKey(lambda x,y: x+y) \
.map(lambda pair: (pair[0][0], (pair[1], pair[0][1]))) \
.reduceByKey(lambda freq1, freq2: max(freq1, freq2)).take(5)
Out[140]:
In [ ]: