In [1]:
#import numpy as np
#import matplotlib.pyplot as plt
#import pylab as P
import codecs
from thanks import Thanks
thanks = Thanks(codecs.open("gratitude_tweets.txt", "r", encoding="utf-8").read())  # assumes the tweet file is UTF-8 encoded
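In [ ]:
# Quick sanity-check cell on the corpus size after loading (a minimal sketch;
# assumes thanks.thanks is the list of parsed tweets, as used in the cells below).
print len(thanks.thanks)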
In [ ]:
thanks.pos_frequency("noun")[0:50]
In [ ]:
thanks.pos_frequency("nouns").plot()
In [ ]:
thanks.reset_filter()
#print thanks.word_filter(["appreciate"])
#print thanks.word_collocation_filter([["appreciate","you"],["thank","you"]])
thanks.pos_frequency("modifiers")
In [ ]:
thanks.reset_filter()
print thanks.word_filter(["brother", "sister", "mother"])
#for msg in thanks.thanks:
# print msg.line
thanks.pos_frequency("nouns")
In [ ]:
import nltk
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
thanks.reset_filter()
for thank in thanks.thanks:
    thank.length = len(thank.line)  # character length of the raw tweet text
    thank.num_sentences = len(sent_detector.tokenize(thank.line))  # sentence count via Punkt
print thanks.thanks[1].length
print thanks.thanks[1].num_sentences
print len(thanks.thanks[1].tokens)
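In [ ]:
# A quick sanity check of the Punkt sentence tokenizer used above, on a
# made-up example string (not drawn from the corpus), to confirm it splits
# multi-sentence tweets the way the per-tweet counts assume.
sample = "Thank you so much! You really made my day."
print sent_detector.tokenize(sample)
# expected output: ['Thank you so much!', 'You really made my day.']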
In [ ]:
import matplotlib.pyplot as plt
wordcount_dist = []
for thank in thanks.thanks:
    # per-tweet attributes set above:
    #   character length: thank.length
    #   number of sentences: thank.num_sentences
    #   tokens (word list): thank.tokens
    wordcount_dist.append(thank.length)  # character length, not word count
plt.hist(wordcount_dist)
plt.show()
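In [ ]:
# Optional follow-up: summary statistics for the character-length distribution
# plotted above (a minimal sketch; reuses the wordcount_dist list from the
# previous cell).
import numpy as np
print "mean length:", np.mean(wordcount_dist)
print "median length:", np.median(wordcount_dist)
print "max length:", max(wordcount_dist)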
In [ ]: