In [126]:
from nltk import word_tokenize,sent_tokenize,FreqDist
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
import glob
import operator
import re
import pandas as pd

In [65]:
#utility function to remove stopwords and other simbols
def removeStopwords(s):
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    regex = re.compile('[^a-zA-Z]')
    return [w for w in s if w not in stopwords_list and len(regex.sub('', w))>2]

In [112]:
#utility function to create the word dictionary of frequent words
def createWordDict(list_of_tokenized_sentences):
    all_words = [item for sublist in list_of_tokenized_sentences for item in sublist]
    freq_dist = FreqDist(all_words)
    sorted_x = sorted(freq_dist.items(), key=operator.itemgetter(1),reverse=True)
    toReturn = {}
    for pair in sorted_x:
        if pair[1]>1:
            toReturn[pair[0]]={}
    return toReturn

In [116]:
#utility function to create a list of tokens without stopwords for a given sentence
def createCleanSentence(sentence):
    word_list = word_tokenize(sentence.lower())
    clean_word_list = removeStopwords(word_list)
    return clean_word_list

In [115]:
# main function to create a dictionary with the requested results
# input : the documents path, see example below
def createHashtags(document_path):
    files = glob.glob(document_path)
    documents = []
    for f in files:
        with open(f, 'r') as input_f:
            documents.append(input_f.read())
    all_documents = '\n'.join(documents)
    sentence_list = sent_tokenize(all_documents.decode('utf-8'))
    clean_sentence_list = []
    for s in sentence_list:
        clean_sentence_list.append(createCleanSentence(s))
    word_dict = createWordDict(clean_sentence_list)
    frequent_words = word_dict.keys()
    for doc,doc_name in zip(documents,files):
        doc_sentence_list = sent_tokenize(doc.decode('utf-8'))
        doc_clean_sentence_list = []
        for s in doc_sentence_list:
            doc_clean_sentence_list.append(createCleanSentence(s))
        for clean_s,sen in zip(doc_clean_sentence_list,doc_sentence_list):
            for word in clean_s:
                if word in frequent_words:
                    if doc in word_dict[word]:
                        word_dict[word][doc_name].append(sen)
                    else:
                        word_dict[word][doc_name]=[sen]
        
    return word_dict

In [158]:
#utility function to save the results into a csv file with elements separated by delimiter: ";--;"
def saveHashtags(hashtag_dict):
    delim = ';--;'
    header = 'Word;--;Document;--;Sentence'
    with open('output.csv','w') as out:
        out.write(header+'\n')
        for word,mapping in hashtag_dict.iteritems():
            for doc,sents in mapping.iteritems():
                for s in sents:
                    toWrite = word+delim+doc+delim+s.replace('\n','')+'\n'
                    out.write(toWrite.encode('utf-8'))

In [159]:
res = createHashtags('*.txt')

In [160]:
saveHashtags(res)

In [161]:
#Lets read the created output and put it into a dataframe object
read_output = pd.read_csv('output.csv',delimiter=';--;',engine='python')

In [162]:
# Now we can look at the results in a readable format, and we can group the result if we want
read_output.head()


Out[162]:
Word Document Sentence
0 neighbors doc3.txt We know that we've been called in churches and...
1 neighbors doc2.txt I've seen it in the workers who would rather c...
2 teach doc3.txt You teach law school, you're a civil rights at...
3 teach doc5.txt We can teach the soldiers to fight and police ...
4 ended doc4.txt His ideas about how Kenya should progress ofte...

In [165]:
# Lets group the results by word and show the first result
grouped = read_output.groupby('Word')
for name, g in grouped:
    print name
    print g.head()
    break


ability
         Word  Document                                           Sentence
2050  ability  doc4.txt  Of course, in the end, one of the strongest we...
2051  ability  doc6.txt  I also saw their ability to ensure that shortc...
2052  ability  doc5.txt  But we need to also move towards more conditio...

In [ ]:


In [ ]: