In [ ]:
from pprint import pprint
In [ ]:
# we'll use data from a job that collected tweets about parenting
tweet_bodies = [line.strip() for line in open('tweet_bodies.txt')]
In [ ]:
# sanity checks
pprint(len(tweet_bodies))
In [ ]:
# sanity checks
pprint(tweet_bodies[:10])
In [ ]:
# let's do some quick deduplication
from duplicate_filter import duplicateFilter
## set the similarity threshold at 90%
dup_filter = duplicateFilter(0.9)
deduped_tweet_bodies = []
for tweet_id, tweet_body in enumerate(tweet_bodies):
    if not dup_filter.isDup(tweet_id, tweet_body):
        deduped_tweet_bodies.append(tweet_body)
pprint(deduped_tweet_bodies[:10])
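The duplicate_filter module isn't listed here; the cell below is only a rough sketch of how such a filter might behave, flagging a tweet as a duplicate when its token-set Jaccard similarity to any previously seen tweet reaches the threshold. It's an assumption, not the actual implementation.
In [ ]:
# Hypothetical stand-in for duplicate_filter.duplicateFilter (assumed behavior,
# not the real module): compare each new tweet's token set against every tweet
# seen so far and call it a duplicate if the Jaccard overlap meets the threshold.
class JaccardDuplicateFilter:
    def __init__(self, threshold):
        self.threshold = threshold
        self.seen = {}  # id -> set of lowercased tokens

    def isDup(self, doc_id, text):
        tokens = set(text.lower().split())
        for other_tokens in self.seen.values():
            union = tokens | other_tokens
            if union and len(tokens & other_tokens) / len(union) >= self.threshold:
                return True
        self.seen[doc_id] = tokens
        return False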
In [ ]:
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
tokenized_deduped_tweet_bodies = [tt.tokenize(body) for body in deduped_tweet_bodies]
In [ ]:
# sanity checks
len(tokenized_deduped_tweet_bodies)
In [ ]:
pprint(tokenized_deduped_tweet_bodies[:2])
The default configuration of NLTK's pos_tag is the greedy averaged perceptron tagger (https://explosion.ai/blog/part-of-speech-pos-tagger-in-python)
In [ ]:
from nltk.tag import pos_tag as pos_tagger
tagged_tokenized_deduped_tweet_bodies = [ pos_tagger(tokens) for tokens in tokenized_deduped_tweet_bodies]
In [ ]:
pprint(tagged_tokenized_deduped_tweet_bodies[:2])
In [ ]:
# let's look at the taxonomy of tags; in our case derived from the Penn Treebank project
# (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf)
import nltk
nltk.help.upenn_tagset()
In [ ]:
# let's peek at the tag dictionary for our tagger
from nltk.tag.perceptron import PerceptronTagger
t = PerceptronTagger()
pprint(list(t.tagdict.items())[:10])
We must choose which parts of speech to evaluate. Let's focus on adjectives, which are useful for sentiment analysis, and proper nouns, which provide a set of potential events and topics.
In [ ]:
adjective_tags = ['JJ','JJR','JJS']
pn_tags = ['NNP','NNPS']
tag_types = [('adj',adjective_tags),('PN',pn_tags)]
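As a quick illustrative aggregate (not needed for the rest of the notebook), we can count how many tagged tokens fall into each of these tag types:
In [ ]:
# count tagged tokens per tag type (adjectives vs. proper nouns)
from collections import Counter
tag_type_counts = Counter()
for tagged_tokens in tagged_tokenized_deduped_tweet_bodies:
    for token, tag in tagged_tokens:
        for type_name, tags in tag_types:
            if tag in tags:
                tag_type_counts[type_name] += 1
pprint(tag_type_counts)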
In [ ]:
# print format: "POS: TOKEN --> TWEET TEXT"
for body, tweet_tokens, tagged_tokens in zip(deduped_tweet_bodies, tokenized_deduped_tweet_bodies, tagged_tokenized_deduped_tweet_bodies):
    for token, tag in tagged_tokens:
        if tag in adjective_tags:
        #if tag in pn_tags:
            print_str = '{}: {} --> {}'.format(tag, token, body)
            print(print_str)
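The per-tweet printout is hard to scan; a frequency count of the adjective tokens (again, just an illustration) gives a quicker overview:
In [ ]:
# most frequent adjective tokens according to the NLTK tagger
from collections import Counter
adj_counts = Counter(token.lower()
                     for tagged_tokens in tagged_tokenized_deduped_tweet_bodies
                     for token, tag in tagged_tokens
                     if tag in adjective_tags)
pprint(adj_counts.most_common(20))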
These seem like dreadful results. Let's try a different NLP engine.
Download:
http://nlp.stanford.edu/software/stanford-corenlp-full-2016-10-31.zip
Then unzip. Start up the server from the unzipped directory:
$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
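Before wrapping the server with a client library, it's worth confirming it is reachable. The cell below is an optional check (assuming the server is running on localhost:9000 as started above) that posts one tweet directly to the HTTP endpoint:
In [ ]:
# optional: confirm the CoreNLP server is up by posting a single tweet
import json
import requests
props = {'annotators': 'pos', 'outputFormat': 'json'}
resp = requests.post('http://localhost:9000',
                     params={'properties': json.dumps(props)},
                     data=deduped_tweet_bodies[0].encode('utf-8'))
pprint(resp.status_code)
pprint(list(resp.json().keys()))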
In [ ]:
from corenlp_pywrap import pywrap
cn = pywrap.CoreNLP(url='http://localhost:9000', annotator_list=["pos"])
In [ ]:
corenlp_results = []
for tweet_body in deduped_tweet_bodies:
    try:
        corenlp_results.append( cn.basic(tweet_body, out_format='json').json() )
    except UnicodeEncodeError:
        corenlp_results.append( {'sentences': []} )
In [ ]:
# pull out the tokens and tags
corenlp_tagged_tokenized_deduped_tweet_bodies = [
    [(token['word'], token['pos'])
     for sentence in result['sentences']
     for token in sentence['tokens']]
    for result in corenlp_results
]
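The comprehension above relies on each result having sentences, each with a list of tokens carrying 'word' and 'pos' fields; a quick peek confirms the extraction looks like the NLTK output from earlier.
In [ ]:
# sanity checks
pprint(corenlp_tagged_tokenized_deduped_tweet_bodies[:2])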
In [ ]:
# print format: "POS: TOKEN --> TWEET TEXT"
for body, tagged_tokens in zip(deduped_tweet_bodies, corenlp_tagged_tokenized_deduped_tweet_bodies):
    for token, tag in tagged_tokens:
        #if tag in pn_tags:
        if tag in adjective_tags:
            print_str = '{}: {} --> {}'.format(tag, token, body)
            print(print_str)
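To put the two taggers side by side, a rough tally (illustrative only) of how many adjective hits each produced:
In [ ]:
# rough comparison: number of adjective tokens found by each tagger
nltk_adj = sum(1 for tagged in tagged_tokenized_deduped_tweet_bodies
               for token, tag in tagged if tag in adjective_tags)
corenlp_adj = sum(1 for tagged in corenlp_tagged_tokenized_deduped_tweet_bodies
                  for token, tag in tagged if tag in adjective_tags)
pprint({'nltk_adjectives': nltk_adj, 'corenlp_adjectives': corenlp_adj})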
For Tweet bodies:
Next steps:
In [ ]: