In [84]:
import pandas as pd
%pylab inline
from mpltools import style
style.use('ggplot')
pylab.rcParams['figure.figsize'] = 16, 12
In [3]:
import collections
import itertools
import multiprocessing


class SimpleMapReduce(object):

    def __init__(self, map_func, reduce_func, num_workers=None):
        # map_func turns one input item into a list of (key, value) pairs;
        # reduce_func turns one (key, [values]) pair into a single result.
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        # Group all mapped (key, value) pairs by key.
        partitioned_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partitioned_data[key].append(value)
        return partitioned_data.items()

    def __call__(self, inputs, chunksize=1):
        # Map in parallel, partition by key, then reduce in parallel.
        map_responses = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        partitioned_data = self.partition(itertools.chain(*map_responses))
        reduced_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduced_values
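A quick end-to-end sanity check of SimpleMapReduce on in-memory data (not part of the original notebook): letters_map and letters_reduce are hypothetical helpers, and the example assumes a fork-based multiprocessing start method so that functions defined in the notebook are visible to the worker pool.
In [ ]:
def letters_map(text):
    # emit a (letter, 1) pair for every character of the input string
    return [(ch, 1) for ch in text]

def letters_reduce(item):
    # item is (letter, [1, 1, ...]); collapse it into (letter, total)
    letter, counts = item
    return (letter, sum(counts))

toy = SimpleMapReduce(letters_map, letters_reduce)
toy(['abc', 'aab'])  # -> [('a', 3), ('b', 2), ('c', 1)], order may vary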
In [104]:
import string


def file_to_words(filename):
    # Map step: read a file and emit a (word, 1) pair for every word we keep.
    STOP_WORDS = set(['arnab', 'rahul', 'that', 'have', 'this', 'what', 'there', 'want', 'with', 'dont', 'into',
                      'your', 'yours', 'about', 'them', 'here', 'should', 'when', 'take', 'said', 'they', 'would',
                      'because', 'about', 'been', 'them', 'were', 'think', 'like', 'from', 'there', 'done', 'which', 'make',
                      'then', 'going', 'need', 'just', 'view', 'look', 'things', 'these', 'where', 'much', 'being',
                      'thing', 'does', 'give', 'back', 'bring', 'doing', 'feel', 'more', 'give', 'case', 'asking',
                      'some', 'taking', 'needs', 'says', 'modi', 'narendra', 'goswami', 'will', 'gandhi'])
    # Translation table that maps every punctuation character to a space.
    TR = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # print multiprocessing.current_process().name, 'reading', filename
    output = []
    with open(filename, 'rt') as f:
        for line in f:
            line = line.translate(TR)  # strip punctuation
            for word in line.split():
                word = word.lower()
                if word.isalpha() and len(word) > 3 and word not in STOP_WORDS:
                    output.append((word, 1))
    return output


def count_words(item):
    # Reduce step: item is (word, [1, 1, ...]); return (word, total occurrences).
    word, occurrences = item
    return (word, sum(occurrences))
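To make the reduce input concrete: after partitioning, every word maps to a list of 1s, one per occurrence. The pair below is made up purely for illustration.
In [ ]:
count_words(('india', [1, 1, 1, 1]))  # -> ('india', 4)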
In [105]:
import operator


def word_counts(input_files):
    # Run the MapReduce job over the input files and sort words by frequency, highest first.
    mapper = SimpleMapReduce(file_to_words, count_words)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1), reverse=True)
    return word_counts
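Before running on the real transcripts, the pipeline can be exercised on a tiny throwaway file; sample.txt is a hypothetical path created just for this check.
In [ ]:
with open('sample.txt', 'w') as f:
    f.write('growth growth development development development india')
word_counts(['sample.txt'])  # -> [('development', 3), ('growth', 2), ('india', 1)]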
In [106]:
modi = pd.DataFrame(word_counts(['data/modi.txt']), columns=['word', 'count'])
rahul = pd.DataFrame(word_counts(['data/rahul.txt']), columns=['word', 'count'])
In [107]:
# Inner join: keep only the words that appear in both transcripts.
common_text = pd.merge(modi, rahul, on='word', suffixes=['_modi', '_rahul'], how='inner')
In [108]:
# Restrict the comparison to words each speaker used more than 10 times.
greater_10 = common_text[(common_text.count_modi > 10) & (common_text.count_rahul > 10)]
In [ ]:
greater_10.plot(x='word', kind='bar')
plt.xlabel('Words')
plt.ylabel('Word Count')
plt.title('Word count of Modi Vs Rahul')