notebook.community

Edit and run



In [126]:

    
from nltk import word_tokenize,sent_tokenize,FreqDist
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
import glob
import operator
import re
import pandas as pd



In [65]:

    
#utility function to remove stopwords and other simbols
def removeStopwords(s):
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    regex = re.compile('[^a-zA-Z]')
    return [w for w in s if w not in stopwords_list and len(regex.sub('', w))>2]



In [112]:

    
#utility function to create the word dictionary of frequent words
def createWordDict(list_of_tokenized_sentences):
    all_words = [item for sublist in list_of_tokenized_sentences for item in sublist]
    freq_dist = FreqDist(all_words)
    sorted_x = sorted(freq_dist.items(), key=operator.itemgetter(1),reverse=True)
    toReturn = {}
    for pair in sorted_x:
        if pair[1]>1:
            toReturn[pair[0]]={}
    return toReturn



In [116]:

    
#utility function to create a list of tokens without stopwords for a given sentence
def createCleanSentence(sentence):
    word_list = word_tokenize(sentence.lower())
    clean_word_list = removeStopwords(word_list)
    return clean_word_list



In [115]:

    
# main function to create a dictionary with the requested results
# input : the documents path, see example below
def createHashtags(document_path):
    files = glob.glob(document_path)
    documents = []
    for f in files:
        with open(f, 'r') as input_f:
            documents.append(input_f.read())
    all_documents = '\n'.join(documents)
    sentence_list = sent_tokenize(all_documents.decode('utf-8'))
    clean_sentence_list = []
    for s in sentence_list:
        clean_sentence_list.append(createCleanSentence(s))
    word_dict = createWordDict(clean_sentence_list)
    frequent_words = word_dict.keys()
    for doc,doc_name in zip(documents,files):
        doc_sentence_list = sent_tokenize(doc.decode('utf-8'))
        doc_clean_sentence_list = []
        for s in doc_sentence_list:
            doc_clean_sentence_list.append(createCleanSentence(s))
        for clean_s,sen in zip(doc_clean_sentence_list,doc_sentence_list):
            for word in clean_s:
                if word in frequent_words:
                    if doc in word_dict[word]:
                        word_dict[word][doc_name].append(sen)
                    else:
                        word_dict[word][doc_name]=[sen]
        
    return word_dict



In [158]:

    
#utility function to save the results into a csv file with elements separated by delimiter: ";--;"
def saveHashtags(hashtag_dict):
    delim = ';--;'
    header = 'Word;--;Document;--;Sentence'
    with open('output.csv','w') as out:
        out.write(header+'\n')
        for word,mapping in hashtag_dict.iteritems():
            for doc,sents in mapping.iteritems():
                for s in sents:
                    toWrite = word+delim+doc+delim+s.replace('\n','')+'\n'
                    out.write(toWrite.encode('utf-8'))



In [159]:

    
res = createHashtags('*.txt')



In [160]:

    
saveHashtags(res)



In [161]:

    
#Lets read the created output and put it into a dataframe object
read_output = pd.read_csv('output.csv',delimiter=';--;',engine='python')



In [162]:

    
# Now we can look at the results in a readable format, and we can group the result if we want
read_output.head()









    Out[162]:






  
    
      
      Word
      Document
      Sentence
    
  
  
    
      0
      neighbors
      doc3.txt
      We know that we've been called in churches and...
    
    
      1
      neighbors
      doc2.txt
      I've seen it in the workers who would rather c...
    
    
      2
      teach
      doc3.txt
      You teach law school, you're a civil rights at...
    
    
      3
      teach
      doc5.txt
      We can teach the soldiers to fight and police ...
    
    
      4
      ended
      doc4.txt
      His ideas about how Kenya should progress ofte...



In [165]:

    
# Lets group the results by word and show the first result
grouped = read_output.groupby('Word')
for name, g in grouped:
    print name
    print g.head()
    break









    



ability
         Word  Document                                           Sentence
2050  ability  doc4.txt  Of course, in the end, one of the strongest we...
2051  ability  doc6.txt  I also saw their ability to ensure that shortc...
2052  ability  doc5.txt  But we need to also move towards more conditio...



In [ ]:



In [ ]:

	Word	Document	Sentence
0	neighbors	doc3.txt	We know that we've been called in churches and...
1	neighbors	doc2.txt	I've seen it in the workers who would rather c...
2	teach	doc3.txt	You teach law school, you're a civil rights at...
3	teach	doc5.txt	We can teach the soldiers to fight and police ...
4	ended	doc4.txt	His ideas about how Kenya should progress ofte...