In [1]:
#import numpy as np
#import matplotlib.pyplot as plt
#import pylab as P
import codecs
from thanks import Thanks

thanks = Thanks(codecs.open("gratitude_tweets.txt", "r", encoding="utf-8").read())
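
As a quick sanity check, a minimal sketch assuming the Thanks object exposes the parsed tweets as the .thanks list and per-tweet .line text used in the later cells:

In [ ]:
# Count the parsed tweets and peek at the first one.
# Assumption: the .thanks list and .line attribute, as used in later cells.
print len(thanks.thanks)
print thanks.thanks[0].line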

In [ ]:
thanks.pos_frequency("noun")[0:50]

In [ ]:
thanks.pos_frequency("nouns").plot()

In [ ]:
thanks.reset_filter()
#print thanks.word_filter(["appreciate"])
#print thanks.word_collocation_filter([["appreciate","you"],["thank","you"]])
thanks.pos_frequency("modifiers")

In [ ]:
thanks.reset_filter()
print thanks.word_filter(["brother", "sister", "mother"])
#for msg in thanks.thanks:
#    print msg.line
thanks.pos_frequency("nouns")

In [ ]:
import nltk
# Punkt sentence tokenizer, used to split each tweet into sentences
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
thanks.reset_filter()
for thank in thanks.thanks:
    thank.length = len(thank.line)  # length in characters
    thank.num_sentences = len(sent_detector.tokenize(thank.line))

print thanks.thanks[1].length
print thanks.thanks[1].num_sentences
print len(thanks.thanks[1].tokens)

In [ ]:
import matplotlib.pyplot as plt

# Distribution of tweet lengths in words, using the per-tweet attributes
# set in the cell above (length in characters, num_sentences, tokens).
wordcount_dist = []
for thank in thanks.thanks:
    wordcount_dist.append(len(thank.tokens))  # word count in tokens

plt.hist(wordcount_dist)
plt.xlabel("words per tweet")
plt.ylabel("number of tweets")
plt.show()
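
The same per-tweet attributes support a sentence-count view; a minimal sketch, assuming num_sentences was set by the Punkt tokenizer cell above:

In [ ]:
import matplotlib.pyplot as plt

# Sketch: distribution of sentences per tweet, based on the num_sentences
# attribute computed with the Punkt sentence tokenizer earlier.
sentence_dist = [thank.num_sentences for thank in thanks.thanks]
plt.hist(sentence_dist)
plt.xlabel("sentences per tweet")
plt.ylabel("number of tweets")
plt.show()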

In [ ]: