In [126]:
from nltk import word_tokenize,sent_tokenize,FreqDist
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
import glob
import operator
import re
import pandas as pd
In [65]:
#utility function to remove stopwords, punctuation and other non-alphabetic symbols
def removeStopwords(s):
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    regex = re.compile('[^a-zA-Z]')
    # keep tokens that are not stopwords/punctuation and have more than two letters
    return [w for w in s if w not in stopwords_list and len(regex.sub('', w)) > 2]
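In [ ]:
# Illustrative sanity check (not part of the original run): stopwords, punctuation and
# tokens shorter than three letters should be dropped.
# Assumes the NLTK stopwords corpus is available (nltk.download('stopwords')).
removeStopwords(['the', 'sentence', 'is', ',', 'about', 'hashtags', 'ok'])
# expected result: ['sentence', 'hashtags']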
In [112]:
#utility function to create the word dictionary of frequent words
def createWordDict(list_of_tokenized_sentences):
    # flatten the tokenized sentences into a single list of words
    all_words = [item for sublist in list_of_tokenized_sentences for item in sublist]
    freq_dist = FreqDist(all_words)
    sorted_x = sorted(freq_dist.items(), key=operator.itemgetter(1), reverse=True)
    toReturn = {}
    # keep only words that occur more than once; each maps to an (initially empty) dict
    for pair in sorted_x:
        if pair[1] > 1:
            toReturn[pair[0]] = {}
    return toReturn
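In [ ]:
# Illustrative example (not part of the original run): only words occurring more than
# once across all sentences get an entry; the nested dicts are filled in later.
createWordDict([['data', 'science'], ['data', 'mining'], ['science', 'rocks']])
# expected result (key order may vary): {'data': {}, 'science': {}}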
In [116]:
#utility function to create a list of tokens without stopwords for a given sentence
def createCleanSentence(sentence):
    word_list = word_tokenize(sentence.lower())
    clean_word_list = removeStopwords(word_list)
    return clean_word_list
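In [ ]:
# Illustrative example (not part of the original run): lowercase, tokenize and strip
# stopwords from a raw sentence.
# Assumes the NLTK punkt tokenizer models are available (nltk.download('punkt')).
createCleanSentence('Hashtags are extracted from the frequent words.')
# expected result: ['hashtags', 'extracted', 'frequent', 'words']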
In [115]:
# main function to create a dictionary with the requested results
# input : the documents path (glob pattern), see example below
def createHashtags(document_path):
    files = glob.glob(document_path)
    documents = []
    for f in files:
        with open(f, 'r') as input_f:
            documents.append(input_f.read())
    # build the frequent-word dictionary over all documents combined
    all_documents = '\n'.join(documents)
    sentence_list = sent_tokenize(all_documents.decode('utf-8'))
    clean_sentence_list = []
    for s in sentence_list:
        clean_sentence_list.append(createCleanSentence(s))
    word_dict = createWordDict(clean_sentence_list)
    frequent_words = set(word_dict.keys())
    # for each document, map every frequent word to the sentences it appears in
    for doc, doc_name in zip(documents, files):
        doc_sentence_list = sent_tokenize(doc.decode('utf-8'))
        doc_clean_sentence_list = []
        for s in doc_sentence_list:
            doc_clean_sentence_list.append(createCleanSentence(s))
        for clean_s, sen in zip(doc_clean_sentence_list, doc_sentence_list):
            for word in clean_s:
                if word in frequent_words:
                    if doc_name in word_dict[word]:
                        word_dict[word][doc_name].append(sen)
                    else:
                        word_dict[word][doc_name] = [sen]
    return word_dict
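In [ ]:
# Illustrative smoke test (not part of the original run; file names are made up):
# write two tiny documents and inspect the mapping for one frequent word.
with open('demo_a.txt', 'w') as f:
    f.write('Python is great. I use Python every day.')
with open('demo_b.txt', 'w') as f:
    f.write('Python scripts automate boring tasks.')
demo_res = createHashtags('demo_*.txt')
demo_res['python']
# expected result (key order may vary):
# {'demo_a.txt': ['Python is great.', 'I use Python every day.'],
#  'demo_b.txt': ['Python scripts automate boring tasks.']}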
In [158]:
#utility function to save the results into a csv file with fields separated by the delimiter ";--;"
def saveHashtags(hashtag_dict):
    delim = ';--;'
    header = 'Word;--;Document;--;Sentence'
    with open('output.csv', 'w') as out:
        out.write(header + '\n')
        for word, mapping in hashtag_dict.iteritems():
            for doc, sents in mapping.iteritems():
                for s in sents:
                    toWrite = word + delim + doc + delim + s.replace('\n', '') + '\n'
                    out.write(toWrite.encode('utf-8'))
In [159]:
res = createHashtags('*.txt')
In [160]:
saveHashtags(res)
In [161]:
# Let's read the created output and put it into a dataframe object
read_output = pd.read_csv('output.csv', delimiter=';--;', engine='python')
In [162]:
# Now we can look at the results in a readable format, and we can group them if we want
read_output.head()
Out[162]: (first rows of the DataFrame, with columns Word, Document and Sentence)
In [165]:
# Let's group the results by word and show the first group
grouped = read_output.groupby('Word')
for name, g in grouped:
    print name
    print g.head()
    break
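In [ ]:
# A possible follow-up (not part of the original run): count how many sentences were
# collected for each word and show the most common ones.
read_output.groupby('Word').size().sort_values(ascending=False).head()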