In [1]:
import re
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
import corpii
In [6]:
from urllib import urlopen
You can uncomment some lines below to use a different text source.
In [7]:
text = nltk.clean_html(corpii.load_pres_debates().raw())
# code to get text from alternate source
#text = urlopen("www.url.com").read()
Now let's tokenize, split into sentences, and tag the text.
In [8]:
# tokenize
token_regex= """(?x)
# taken from ntlk book example
([A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(-\w+)* # words with optional internal hyphens
| \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():-_`] # these are separate tokens
"""
tokens = nltk.regexp_tokenize(text, token_regex)
In [9]:
# get sentences
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
sents = list(sent_tokenizer.sentences_from_tokens(tokens))
In [10]:
#Create tagger
def build_backoff_tagger (train_sents):
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
return t2
tagger = build_backoff_tagger(brown.tagged_sents())
In [11]:
sent_tags = [tagger.tag(s) for s in sents]
In [12]:
tags = [t for s in sent_tags for t in s]
In [93]:
# we're going to use frequency distributions a lot, so let's create a nice way of looking at those
def fd_view(fd, n=10):
"""Prints a nice format of items in FreqDist fd[0:n]"""
print "{:<16}|{:<16}|{:<16}".format("Word", "Count", "Frequency")
print "========================================================="
for i in fd.items()[0:n]:
print "{:<16}|{:<16,d}|{:<16.3%}".format(i[0], i[1], fd.freq(i[0]))
In [105]:
def common_nouns(tags, min_length=4, pos=r"N.*"):
"""Takes a tagset and returns a frequency distribution of the words
that are at least min_length and whose tag matches pos"""
fd_nouns = nltk.FreqDist([ t[0].lower() for t in tags if len(t[0]) >= min_length and re.match(pos, t[1])])
return fd_nouns
In [106]:
# Let's look for common nouns
# I was getting some noise from very short tokens, so I'm excluding them.
fd_tokens = common_nouns(tags, 4)
In [107]:
#First let's see what some of these tokens are
fd_view(fd_tokens, 10)
That's actually not too bad for a corpus of presidential debates.
Let's try the same thing on the brown news corpus.
In [108]:
fd_brown = common_nouns(brown.tagged_words(categories='news'), 4)
fd_view(fd_brown, 10)
hmmmmm... a lot of things that sound newsy in there. A little scattered, but the news corpus probably covers a lot of ground.
In [18]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
In [19]:
def bigram_collocs(tokens, mfreq=3, measure=bigram_measures.pmi, n=10):
"""
Look for collocations in token list:tokens
args:
mfreq - minimum frequency to be included
measure - The measure to be used to find collocations
n - The number of matches to show
"""
finder = nltk.BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(mfreq)
return finder.nbest(measure, 10)
In [20]:
bigram_collocs(tokens)
Out[20]:
In [21]:
bigram_collocs(brown.words(categories=['news']))
Out[21]:
That isn't too helpful, maybe the chi measure?
In [22]:
bigram_collocs(tokens, measure=bigram_measures.chi_sq)
Out[22]:
In [23]:
bigram_collocs(brown.words(categories=['news']), measure=bigram_measures.chi_sq)
Out[23]:
I think one problem with applying the collocations metric to my corpus is that it is very large and covers several debates where different topics are important, so it doesn't create a very coherent result. I will try running it against individual debates to see if that's more interesting.
In [24]:
debates = corpii.load_pres_debates()
In [25]:
for debate in debates.fileids():
d = nltk.clean_html(debates.raw(fileids=[debate]))
d_tokens = nltk.regexp_tokenize(debate, token_regex)
collocs = bigram_collocs(d_tokens, n=5, mfreq=2)
if collocs:
print collocs
else:
print "No collocations found in {}".format(debate)
Well that didn't work, even with a lowered minimum frequency. (There are results when mfreq is lowered to 1, but they're pretty useless)
In [22]:
def get_hypernyms(synsets):
"""
Takes a list of synsets (as generated by wn.synsets) and returns a list of all hypernyms.
"""
hypernyms = set()
for synset in synsets:
for path in synset.hypernym_paths():
hypernyms.update([h for h in path if h != synset])
return hypernyms
In [23]:
def common_hypernyms(wordforms, min_depth=3):
"""
Takes a list of wordforms and extracts all hypernyms associated with the wordforms.
Returns a frequency distribution of of the sysnsets extracted.
arguments:
wordforms - Wordforms to be processed
min_depth - A filter to only include synsets of a certain depth.
Unintuitively, max_depth is used to calculate the depth of a synset.
"""
hypernyms = []
for l in wordforms:
hset = get_hypernyms(wn.synsets(l, pos=wn.NOUN))
hypernyms.extend(h for h in hset if h.max_depth()>=min_depth)
return nltk.FreqDist(hypernyms)
In [24]:
def fd_hypernyms(fd, depth=None, min_depth=3, pos=None):
"""
Takes a frequency distribution and analyzes the hypernyms of the wordforms contained therein.
Returns a weighted
fd - frequency distribution
depth - How far down fd to look
min_depth - A filter to only include synsets of a certain depth.
Unintuitively, max_depth is used to calculate the depth of a synset.
pos - part of speech to limit sysnsets to
"""
hypernyms = {}
for wf in fd.keys()[0:depth]:
freq = fd.freq(wf)
hset = get_hypernyms(wn.synsets(wf, pos=pos))
for h in hset:
if h.max_depth()>=min_depth:
if h in hypernyms:
hypernyms[h] += freq
else:
hypernyms[h] = freq
hlist = hypernyms.items()
hlist.sort(key=lambda s: s[1], reverse=True)
return hlist
In [25]:
debate_concepts = fd_hypernyms(fd_tokens, pos=wn.NOUN, min_depth=7)
In [26]:
[s[0].definition for s in debate_concepts[0:10]]
Out[26]:
In [27]:
def concept_printer(concepts, n=10):
"Prints first n concepts in concept list generated by fd_hypernyms"
print "{:<20} | {:<10} | {}".format("Concept", "Noun Freq", "Definition")
print "===================================================================="
for s in debate_concepts[0:10]:
print "{:<20} | {:<10.3%} | {}".format(s[0].lemma_names[0], s[1], s[0].definition)
In [28]:
concept_printer(debate_concepts, 10)
In [29]:
syn = debate_concepts[2][0]
In [30]:
syn.lemma_names
Out[30]:
In [31]:
brown_concepts = fd_hypernyms(fd_brown, pos=wn.NOUN, min_depth=7)
In [32]:
concept_printer(brown_concepts)
In [101]:
def get_propnoun_fd(sents):
"""
Finds proper nouns from tagged sentences and returns a frequency distribution of those nouns.
"""
grammar = r"""
NPROP: {<NP>+|<NP><IN.*|DT.*><NP>+}
"""
noun_parser = nltk.RegexpParser(grammar)
trees = [t for s in sents for t in noun_parser.parse(s).subtrees() if t.node == "NPROP"]
fd = nltk.FreqDist([" ".join([w[0] for w in t]) for t in trees])
return fd
In [117]:
fd_debate_np = get_propnoun_fd(sent_tags)
In [103]:
fd_view(fd_debate_np)
In [104]:
fd_brown_np = get_propnoun_fd(brown.tagged_sents(categories=['news']))
fd_view(fd_brown_np)
In [110]:
porter = nltk.stem.PorterStemmer()
def common_verbs(tags, min_length=4, pos=r"V.*"):
"""Takes a tagset and returns a frequency distribution of the words
that are at least min_length and whose tag matches pos"""
fd_verbs = nltk.FreqDist([ porter.stem(t[0].lower()) for t in tags if len(t[0]) >= min_length and re.match(pos, t[1])])
return fd_verbs
In [111]:
fd_debate_vb = common_verbs(tags)
fd_view(fd_debate_vb)
In [118]:
def get_verb_phrases(sent_tags):
grammar = r"""
NPROP: {<NP>+|<NP><IN.*|DT.*><NP>+}
VPHRASE: {<V.*><DET>?<NPROP>}
"""
parser = nltk.RegexpParser(grammar)
trees = [t for s in sent_tags for t in parser.parse(s).subtrees() if t.node == "VPHRASE"]
return trees
In [157]:
def get_verb_subjects_fd(vphrases, verb, stem=True):
"""
Takes a list of verb phrases, as made by get_verb_phrases and a verb
and returns a frequency distribution of the subjects of that verb.
"""
if stem:
verb = porter.stem(verb)
subjects = []
for phrase in vphrases:
v = phrase[0][0]
if stem:
v = porter.stem(v)
if v == verb:
print phrase
subjects.append(" ".join([w[0] for w in phrase[1]]))
return nltk.FreqDist(subjects)
In [158]:
vphrases = get_verb_phrases(sent_tags)
In [152]:
def get_top_verb_subjects(sent_tags, n=10):
"""
Takes a list of tagged sentences, finds the most common verbs and
creates a frequency distribution for the subjects of the most used verbs.
n - the number of verbs to test
"""
vphrases = get_verb_phrases(sent_tags)
verbs = common_verbs([t for s in sent_tags for t in s])
v_subjects = {}
for v in verbs.keys()[0:n]:
v_subjects[v] = get_verb_subjects_fd(vphrases, v)
return v_subjects
In [159]:
debates_vsubjs = get_top_verb_subjects(sent_tags)
In [156]:
for k in debates_vsubjs.keys():
print k
fd_view(debates_vsubjs[k])
print "********************************************"