In [84]:
import pandas as pd
# NOTE(review): %pylab is deprecated in favour of `%matplotlib inline`
# plus explicit `import matplotlib.pyplot as plt` / `import numpy as np`.
# Kept as-is because later cells rely on the names it injects into the
# interactive namespace (`pylab`, `plt`).
%pylab inline
from mpltools import style
style.use('ggplot')
# Default figure size for every plot in this notebook (width, height in inches).
pylab.rcParams['figure.figsize'] = 16, 12


Populating the interactive namespace from numpy and matplotlib

In [3]:
import collections
import itertools
import multiprocessing

class SimpleMapReduce(object):
    """Minimal map/shuffle/reduce driver backed by a multiprocessing.Pool.

    map_func: callable applied to each input item; must return an iterable
        of (key, value) pairs and be picklable (it crosses process boundaries).
    reduce_func: callable applied to each (key, [values]) pair.
    num_workers: pool size; None lets multiprocessing use the CPU count.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        """Shuffle step: group the mappers' (key, value) pairs by key.

        Returns the dict's items view — (key, [values]) pairs.
        """
        partitioned_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partitioned_data[key].append(value)
        return partitioned_data.items()

    def close(self):
        """Shut down the worker pool and wait for the workers to exit.

        The original never released the pool, leaking worker processes for
        every instance created. The instance is unusable after close().
        """
        self.pool.close()
        self.pool.join()

    # Context-manager support so `with SimpleMapReduce(...) as mr:` cleans up.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __call__(self, inputs, chunksize=1):
        """Run map, shuffle, reduce over `inputs`; return the reduced list."""
        map_responses = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        partitioned_data = self.partition(itertools.chain(*map_responses))
        reduced_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduced_values

In [104]:
import string

def file_to_words(filename):
    """Map step: read `filename` and emit one (word, 1) pair per kept token.

    Punctuation is replaced by spaces before splitting, tokens are
    lowercased, and tokens that are non-alphabetic, shorter than 4
    characters, or in the stop-word list are dropped.
    """
    # Hand-picked stop words (duplicate entries in the original literal
    # removed — membership is identical since it was already a set).
    STOP_WORDS = frozenset([
        'arnab', 'rahul', 'that', 'have', 'this', 'what', 'there', 'want',
        'with', 'dont', 'into', 'your', 'yours', 'about', 'them', 'here',
        'should', 'when', 'take', 'said', 'they', 'would', 'because',
        'been', 'were', 'think', 'like', 'from', 'done', 'which', 'make',
        'then', 'going', 'need', 'just', 'view', 'look', 'things', 'these',
        'where', 'much', 'being', 'thing', 'does', 'give', 'back', 'bring',
        'doing', 'feel', 'more', 'case', 'asking', 'some', 'taking',
        'needs', 'says', 'modi', 'narendra', 'goswami', 'will', 'gandhi',
    ])

    # string.maketrans was removed in Python 3; str.maketrans builds the
    # same punctuation -> space translation table.
    TR = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    output = []

    # `with` closes the file on exit — the original's explicit f.close()
    # inside the with-block was redundant and has been dropped.
    with open(filename, 'rt') as f:
        for line in f:
            line = line.translate(TR)  # strip punctuation
            for word in line.split():
                word = word.lower()
                if word.isalpha() and len(word) > 3 and word not in STOP_WORDS:
                    output.append((word, 1))
    return output
    
def count_words(item):
    """Reduce step: collapse (word, [1, 1, ...]) into (word, total_count)."""
    return (item[0], sum(item[1]))

In [105]:
import operator

def word_counts(input_files):
    mapper = SimpleMapReduce(file_to_words, count_words)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()
    return word_counts

In [106]:
# Build a per-speaker word-count table from each raw transcript.
# Each frame has one row per distinct word with its total count,
# most-frequent words first (word_counts sorts descending by count).
modi = pd.DataFrame(word_counts(['data/modi.txt']), columns=['word', 'count'])
rahul = pd.DataFrame(word_counts(['data/rahul.txt']), columns=['word', 'count'])

In [107]:
# Inner join on 'word' keeps only words appearing in BOTH transcripts;
# the colliding 'count' columns become count_modi / count_rahul.
common_text = pd.merge(modi, rahul, on='word', suffixes=['_modi', '_rahul'], how='inner')

In [108]:
# Keep only words used more than 10 times by EACH speaker to de-noise the plot.
# NOTE(review): 10 is an arbitrary cutoff — consider lifting it into a named
# constant (e.g. MIN_COUNT) in a config cell.
greater_10 = common_text[(common_text.count_modi > 10) & (common_text.count_rahul > 10)]

In [ ]:
# Bar chart comparing each common word's frequency in the two transcripts.
# Fix: `x` takes a column LABEL, not a Series — the original passed the
# whole `greater_10.word` Series, which pandas treats as data rather than
# as the x-axis column selector.
greater_10.plot(x='word', kind='bar')
plt.xlabel('Words')
plt.ylabel('Word Count')
plt.title('Word count of Modi Vs Rahul')