Exercise 1)


In [1]:
phrase = ['My', 'very', 'unimaginative', 'phrase', '.']

In [2]:
phrase + phrase


Out[2]:
['My',
 'very',
 'unimaginative',
 'phrase',
 '.',
 'My',
 'very',
 'unimaginative',
 'phrase',
 '.']

In [3]:
phrase[-1:]


Out[3]:
['.']

In [4]:
phrase * 3


Out[4]:
['My',
 'very',
 'unimaginative',
 'phrase',
 '.',
 'My',
 'very',
 'unimaginative',
 'phrase',
 '.',
 'My',
 'very',
 'unimaginative',
 'phrase',
 '.']

In [5]:
phrase.sort()
phrase


Out[5]:
['.', 'My', 'phrase', 'unimaginative', 'very']

Exercise 2)


In [6]:
import nltk
persuasion = nltk.corpus.gutenberg.words('austen-persuasion.txt')
len(persuasion)


Out[6]:
98171

In [7]:
len(set(persuasion))


Out[7]:
6132

Exercise 3)


In [8]:
from nltk.corpus import brown
brown.words(categories=['religion', 'lore'])


Out[8]:
[u'As', u'a', u'result', u',', u'although', u'we', ...]

Exercise 4)


In [9]:
from nltk.corpus import state_union
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower() == target)
cfd.plot()


Exercise 5)


In [10]:
from nltk.corpus import wordnet as wn
def relations(noun):
    noun_synset = wn.synset(noun)
    print 'Member Meronyms:\n'
    print noun_synset.member_meronyms()
    print '\nPart Meronyms:\n'
    print noun_synset.part_meronyms()
    print '\nSubstance Meronyms:\n'
    print noun_synset.substance_meronyms()
    print '\nMember Holonyms:\n'
    print noun_synset.member_holonyms()
    print '\nPart Holonyms:\n'
    print noun_synset.part_holonyms()
    print '\nSubstance Holonyms:\n'
    print noun_synset.substance_holonyms()
relations('tree.n.01')


Member Meronyms:
 
[]

Part Meronyms:

[Synset('burl.n.02'), Synset('crown.n.07'), Synset('limb.n.02'), Synset('stump.n.01'), Synset('trunk.n.01')]

Substance Meronyms:

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

Member Holonyms:

[Synset('forest.n.01')]

Part Holonyms:

[]

Substance Holonyms:

[]

In [11]:
relations('honey.n.01')


Member Meronyms:
 
[]

Part Meronyms:

[]

Substance Meronyms:

[]

Member Holonyms:

[]

Part Holonyms:

[]

Substance Holonyms:

[Synset('mead.n.03'), Synset('oenomel.n.01')]

In [12]:
relations('wood.n.01')


Member Meronyms:
 
[]

Part Meronyms:

[]

Substance Meronyms:

[Synset('lignin.n.01')]

Member Holonyms:

[]

Part Holonyms:

[]

Substance Holonyms:

[Synset('beam.n.02'), Synset('chopping_block.n.01'), Synset('lumber.n.01'), Synset('spindle.n.02')]

Exercise 6)

Words can be spelled the same in two languages yet have different translations, so a single dictionary keyed on spelling alone would clash. With the means given so far, this can be handled by tagging each word with its language ID, as in the cells below.


In [13]:
from nltk.corpus import swadesh
it2en = [(i + '-it', e) for (i, e) in swadesh.entries(['it', 'en'])]

In [14]:
translate = dict(it2en)
translate['madre-it']


Out[14]:
u'mother'

In [15]:
de2en = [(d + '-de', e) for (d, e) in swadesh.entries(['de', 'en'])]
translate.update(dict(de2en))
translate['Hund-de']


Out[15]:
u'dog'

Exercise 7)


In [16]:
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('however')


Displaying 25 of 131 matches:
 her many enjoyments . The danger , however , was at present so unperceived , t
ion would offend . Miss Churchill , however , being of age , and with the full 
n . From the expense of the child , however , he was soon relieved . The boy ha
 -- and been very well brought up . However , I do not mean to set up my opinio
f and predict . It was not likely , however , that any body should have equalle
to be borne . We will not despair , however . Weston may grow cross from the wa
is so very handsome and agreeable . However , I do really think Mr . Martin a v
 accepted after all . This letter , however , was written , and sealed , and se
e him ." " And if I did , ( which , however , I am far from allowing ) I should
 slightingly . Waiving that point , however , and supposing her to be , as you 
e was not so materially cast down , however , but that a little time and the re
ld inspire him ." The very next day however produced some proof of inspiration 
and staid to look through herself ; however , she called me back presently , an
t turn up . His ostensible reason , however , was to ask whether Mr . Woodhouse
l and cross . This does not apply , however , to Miss Bates ; she is only too g
and sufferings of the poor family , however , were the first subject on meeting
ting for her . She gained on them , however , involuntarily : the child ' s pac
ould close it . It was not closed , however , it still remained ajar ; but by e
 believes himself secure ." Still , however , though every thing had not been a
ght advance rapidly if they would , however ; they must advance somehow or othe
 offence came not . The beginning , however , of every visit displayed none but
eed !-- and my memory is very bad . However , it was an exceeding good , pretty
first day . Emma ' s sense of right however had decided it ; and besides the co
le fatigued . I could have wished , however , as you know , that you had seen M
" Our little friend Harriet Smith , however , is just such another pretty kind 

In [17]:
from nltk.corpus import inaugural
fileid = inaugural.fileids()[-3]  # third-from-last address, 2001-Bush
print fileid
inaugural = nltk.Text(nltk.corpus.inaugural.words(fileid))
inaugural.concordance('however')


2001-Bush.txt
Displaying 1 of 1 matches:
 And the proliferation of prisons , however necessary , is no substitute for ho

Exercise 8)


In [18]:
names = nltk.corpus.names
cfd = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()



In [19]:
len(names.words('female.txt'))


Out[19]:
5001

In [20]:
len(names.words('male.txt'))


Out[20]:
2943

Exercise 9)


In [21]:
religion = brown.words(fileids='cd12')

## Vocabulary:
print len(set(religion))


849

In [22]:
from nltk.corpus import webtext
movie = webtext.words(fileids='pirates.txt')
print len(set(movie))


3129

In [23]:
from __future__ import division

## Vocabulary richness:
print len(set(religion)) / len(religion)


0.368010403121

In [24]:
print len(set(movie)) / len(movie)


0.137969046254

In [25]:
movie_text = nltk.Text(movie)
religion_text = nltk.Text(religion)

## Word use:
movie_text.concordance('love')


Displaying 12 of 12 matches:
ACK SPARROW : Shhh ! GIBBS : For the love of mother and child , Jack , what ' 
IA DALMA : A wo -* man *. He fell in love . GIBBS : No - no - no - no , I hear
 heard it was the * sea * he fell in love with . TIA DALMA : Same story , diff
id I happen to mention ... he ' s in love . With a girl . Due to be married . 
SWANN : I ' m here to find the man I love . JACK SPARROW : I ' m deeply flatte
ttered , son , but my first and only love is the sea . [ Norrington vomits ove
 truth ? JACK SPARROW : Every word , love . And what you want most in this wor
ROW : A truly discomforting notion , love . GIBBS : And bad . Bad for every mo
ry certain . JACK SPARROW : One word love : curiosity . You long for freedom .
the right thing . JACK SPARROW : I * love * those moments . I like to wave at 
RROW : Pretty much . Time and tide , love . [ Will stoops down with the key to
ACK SPARROW : We ' re not free yet , love . ELIZABETH SWANN : You came back . 

In [26]:
religion_text.concordance('love')


No matches

In [27]:
movie_text.concordance('bear')


Displaying 1 of 1 matches:
, and they ' re not valid until they bear my signature and my seal . ELIZABETH

In [28]:
religion_text.concordance('bear')


Displaying 2 of 2 matches:
rch and university help him bring to bear upon his situation ? ? Recently , a 
 to bring his particular insights to bear upon the question of discrimination 

Exercise 10)


In [29]:
from __future__ import division

def third_of_tokens(text):
    # keep only tokens that contain at least one letter
    words_in_text = [w for w in text if any(c.isalpha() for c in w)]

    fd = nltk.FreqDist(words_in_text)
    third = len(words_in_text) / 3
    count = 0
    third_words = []

    # accumulate the most frequent words until they cover a third of all tokens
    for word, num in fd.most_common():
        if count >= third:
            break
        count = count + num
        third_words.append(word)
    print third_words
    print len(third_words)

third_of_tokens(movie)


[u'the', u'a', u'to', u'of', u'Jack', u's', u'I', u'you', u'and', u'is', u'SPARROW', u'JACK', u'in', u'on', u'it', u'his', u'TURNER', u'WILL', u'Will', u'with', u'SWANN', u'ELIZABETH', u'at', u'from', u'GIBBS', u'that', u'Elizabeth', u't', u'up', u'as']
30

In [30]:
third_of_tokens(religion)


[u'the', u'of', u'and', u'in', u'to', u'a', u'as', u'that', u'or', u'is', u'for', u'their', u'realtors', u'by', u'social', u'this', u'such', u'they', u'have', u'be']
20

In [31]:
third_of_tokens(emma)


[u'to', u'the', u'and', u'of', u'I', u'a', u'was', u'her', u'it', u'in', u'not', u'be', u'she', u'that', u'you', u'had', u'as', u'he', u'for', u'have', u'is', u'with', u'Mr', u'very']
24

In [32]:
third_of_tokens(inaugural)


[u'and', u'of', u'the', u'our', u'to', u'a', u'we', u'is', u'in', u'not', u'will', u'are', u'that', u'And', u'We', u'it']
16

Exercise 11)


In [33]:
pronouns = ['I', 'you', 'he', 'she', 'it', 'we', 'they']
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor', 'editorial', 'belles_lettres', 'government']
cfd.tabulate(conditions=genres, samples=pronouns)


                   I  you   he  she   it   we they 
           news  179   55  451   42  363   77  205 
       religion  155  100  137   10  264  176  115 
        hobbies  154  383  155   21  476  100  177 
science_fiction   98   81  139   36  129   30   53 
        romance  951  456  702  496  573   78  168 
          humor  239  131  146   58  162   32   70 
      editorial  201   83  268   41  386  167  148 
 belles_lettres  845  188 1174  178 1059  398  488 
     government   97   74  120    0  218  112   92 

In [34]:
wh = ['what', 'when', 'who', 'why', 'where']
cfd.tabulate(conditions=genres, samples=wh)


                 what  when   who   why where 
           news    76   128   268     9    58 
       religion    64    53   100    14    20 
        hobbies    78   119   103    10    72 
science_fiction    27    21    13     4    10 
        romance   121   126    89    34    54 
          humor    36    52    48     9    15 
      editorial    84   103   172    10    40 
 belles_lettres   244   252   452    36   107 
     government    43    56    74     6    46 

Exercise 12)


In [35]:
count_distinct = 0
dublettes = []
prev = ''
for entry in nltk.corpus.cmudict.entries():
    if ((entry[0] == prev) and (entry[0] not in dublettes)):
        dublettes.append(entry[0])
    else: 
        count_distinct = count_distinct + 1
        prev = entry[0]
print count_distinct
print (len(dublettes) / count_distinct) * 100


125033
6.96136220038
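
Note that a word with three or more pronunciations falls through to the else branch from its third entry onwards and is counted as distinct again, so count_distinct is slightly inflated. A FreqDist over the entry words avoids this; a minimal sketch (its figures will differ slightly from the ones above):

fd = nltk.FreqDist(word for word, pron in nltk.corpus.cmudict.entries())
print len(fd)                                      # distinct words
print (len([w for w in fd if fd[w] > 1]) / len(fd)) * 100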

Exercise 13)


In [36]:
all_syns = list(wn.all_synsets('n'))
no_hyponyms = [s for s in all_syns if len(s.hyponyms()) == 0]
print (len(no_hyponyms) / len(all_syns)) * 100


79.6711928393

Exercise 14)


In [37]:
def supergloss(s):
    gloss = 'definition: ' + s.definition() + '\n\n'
    gloss = gloss + 'Hypernyms:\n'
    for hypernym in s.hypernyms():
        gloss = gloss + hypernym.name() + ': ' + hypernym.definition() + '\n'
    gloss = gloss + '\nHyponyms:\n'
    for hyponym in s.hyponyms():
        gloss = gloss + hyponym.name() + ': ' + hyponym.definition() + '\n'
    return gloss

In [38]:
print supergloss(wn.synset('bicycle.n.01'))


definition: a wheeled vehicle that has two wheels and is moved by foot pedals

Hypernyms:
wheeled_vehicle.n.01: a vehicle that moves on wheels and usually has a container for transporting things or people

Hyponyms:
bicycle-built-for-two.n.01: a bicycle with two sets of pedals and two seats
mountain_bike.n.01: a bicycle with a sturdy frame and fat tires; originally designed for riding in mountainous country
ordinary.n.04: an early bicycle with a very large front wheel and small back wheel
push-bike.n.01: a bicycle that must be pedaled
safety_bicycle.n.01: bicycle that has two wheels of equal size; pedals are connected to the rear wheel by a multiplying gear
velocipede.n.01: any of several early bicycles with pedals on the front wheel


In [39]:
print supergloss(wn.synset('believe.v.01'))


definition: accept as true; take to be true

Hypernyms:
accept.v.01: consider or hold as true

Hyponyms:
believe.v.04: follow a credo; have a faith; be a believer
believe_in.v.01: have a firm conviction as to the goodness of something
buy.v.05: accept as true
swallow.v.08: believe or accept without questioning or challenge
trust.v.01: have confidence or faith in
understand.v.04: believe to be the case

Exercise 15)


In [40]:
fd = nltk.FreqDist(brown.words())

In [41]:
triple_words = [w for w in fd.keys() if fd[w] > 2]
print len(brown.words())
print len(triple_words)


1161192
22339

Exercise 16)


In [42]:
def lexical_diversity(text):
    # average number of tokens per word type (higher means less diverse)
    return len(text) / len(set(text))
for genre in nltk.corpus.brown.categories():
    print genre + ': ' + str(lexical_diversity(brown.words(categories=genre)))


adventure: 7.81406355646
belles_lettres: 9.39666684762
editorial: 6.22891809909
fiction: 7.36271769512
government: 8.57071262682
hobbies: 6.89945538333
humor: 4.32429738888
learned: 10.7887775076
lore: 7.60525408536
mystery: 8.18805499857
news: 6.9858274281
religion: 6.18217479994
reviews: 4.71875724554
romance: 8.28466635116
science_fiction: 4.4757191463

Exercise 17)


In [43]:
def most_frequent_content_words(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content_words = [w.lower() for w in text
                     if w.lower() not in stopwords
                     and any(c.isalpha() for c in w)]
    fd = nltk.FreqDist(content_words)
    return [w for w, num in fd.most_common(50)]
print most_frequent_content_words(emma)


[u'mr', u'emma', u'could', u'would', u'mrs', u'miss', u'must', u'harriet', u'much', u'said', u'one', u'weston', u'every', u'well', u'thing', u'knightley', u'elton', u'think', u'little', u'never', u'good', u'know', u'might', u'woodhouse', u'say', u'jane', u'quite', u'time', u'great', u'nothing', u'dear', u'fairfax', u'always', u'man', u'thought', u'churchill', u'soon', u'see', u'may', u'shall', u'without', u'first', u'frank', u'father', u'sure', u'indeed', u'like', u'made', u'ever', u'body']

In [44]:
print most_frequent_content_words(movie)


[u'jack', u'sparrow', u'elizabeth', u'turner', u'gibbs', u'swann', u'jones', u'norrington', u'pintel', u'ragetti', u'back', u'davy', u'one', u'beckett', u'ship', u'chest', u'black', u'ha', u'pearl', u'crew', u'deck', u'tia', u'cannibal', u'flying', u'man', u'scene', u'lord', u'key', u'dutchman', u'captain', u'know', u'cutler', u'come', u'us', u'men', u'want', u'dalma', u'find', u'sword', u'oh', u'hand', u'behind', u'compass', u'beach', u'see', u'bottle', u'look', u'kraken', u'cage', u'head']

Exercise 18)


In [45]:
def most_frequent_bigrams(text):
    stopwords = nltk.corpus.stopwords.words('english')
    bigrams = [b for b in nltk.bigrams(text)
               if b[0] not in stopwords and b[1] not in stopwords
               and any(c.isalpha() for c in b[0])
               and any(c.isalpha() for c in b[1])]
    fd = nltk.FreqDist(bigrams)
    return [b for b, num in fd.most_common(50)]

print most_frequent_bigrams(emma)


[(u'Miss', u'Woodhouse'), (u'Frank', u'Churchill'), (u'Miss', u'Fairfax'), (u'I', u'shall'), (u'every', u'thing'), (u'Miss', u'Bates'), (u'Jane', u'Fairfax'), (u'I', u'think'), (u'every', u'body'), (u'I', u'hope'), (u'I', u'could'), (u'I', u'must'), (u'young', u'man'), (u'I', u'cannot'), (u'great', u'deal'), (u'I', u'would'), (u'Emma', u'could'), (u'said', u'Emma'), (u'I', u'know'), (u'Miss', u'Smith'), (u'John', u'Knightley'), (u'I', u'never'), (u'I', u'dare'), (u'dare', u'say'), (u'Miss', u'Taylor'), (u'I', u'assure'), (u'I', u'believe'), (u'But', u'I'), (u'My', u'dear'), (u'I', u'suppose'), (u'said', u'Mr'), (u'I', u'thought'), (u'She', u'could'), (u'I', u'wish'), (u'dear', u'Emma'), (u'Robert', u'Martin'), (u'Harriet', u'Smith'), (u'Maple', u'Grove'), (u'Colonel', u'Campbell'), (u'It', u'would'), (u'cried', u'Emma'), (u'She', u'would'), (u'young', u'woman'), (u'And', u'I'), (u'body', u'else'), (u'young', u'lady'), (u'I', u'always'), (u'If', u'I'), (u'She', u'must'), (u'good', u'deal')]

In [46]:
print most_frequent_bigrams(movie)


[(u'JACK', u'SPARROW'), (u'WILL', u'TURNER'), (u'ELIZABETH', u'SWANN'), (u'DAVY', u'JONES'), (u'CUTLER', u'BECKETT'), (u'LORD', u'CUTLER'), (u'Black', u'Pearl'), (u'Flying', u'Dutchman'), (u'Jack', u'Sparrow'), (u'TIA', u'DALMA'), (u'CANNIBAL', u'ISLAND'), (u'Davy', u'Jones'), (u'FLYING', u'DUTCHMAN'), (u'bamboo', u'pole'), (u'I', u'want'), (u'Jack', u'grabs'), (u'Mister', u'Turner'), (u'I', u'think'), (u'left', u'hand'), (u'Jack', u'looks'), (u'CANNIBAL', u'CROWD'), (u'Edinburgh', u'Trader'), (u'Ha', u'ha'), (u'You', u'know'), (u'And', u'I'), (u'Port', u'Royal'), (u'S', u'PARROT'), (u'East', u'India'), (u'slow', u'motion'), (u'black', u'spot'), (u'Jack', u'picks'), (u'Jack', u'walks'), (u'turns', u'around'), (u'PORT', u'ROYAL'), (u'aerial', u'view'), (u'BLACK', u'PEARL'), (u'scuttled', u'ship'), (u'I', u'know'), (u'motion', u'scene'), (u'music', u'stops'), (u'I', u'would'), (u'India', u'Trading'), (u'Trading', u'Company'), (u'small', u'chest'), (u'Will', u'sees'), (u'I', u'thought'), (u'hermit', u'crab'), (u'William', u'Turner'), (u'WOUNDED', u'SAILOR'), (u'looks', u'back')]

Exercise 19)


In [47]:
def table(words, genres):
    # counting only the requested genres is enough for the tabulation
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in genres
        for word in brown.words(categories=genre))
    cfd.tabulate(conditions=genres, samples=words)
    
table(['perhaps', 'maybe', 'possibly', 'surely', 'certainly', 'absolutely'], ['news', 'religion', 'government', 'learned', 'fiction', 'romance', 'humor'])


              perhaps      maybe   possibly     surely  certainly absolutely 
      news         11          1          3          2          6          1 
  religion          5          0          7          3          6          1 
government          4          1          2          0          2          0 
   learned         30          2          7          1         19          4 
   fiction         11          7          3          3          8          2 
   romance         15         15          4          3          4          2 
     humor          6          1          1          0          3          0 

Exercise 20)


In [48]:
def word_freq(word, genre):
    fd = nltk.FreqDist(brown.words(categories=genre))
    return fd[word]

word_freq('God', 'religion')


Out[48]:
131

In [49]:
word_freq('God', 'government')


Out[49]:
3
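
word_freq rebuilds a FreqDist on every call. For repeated queries it is cheaper to build a single conditional frequency distribution up front, as in Exercise 11; a sketch (genre_word_cfd is a name introduced here):

genre_word_cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
print genre_word_cfd['religion']['God']
print genre_word_cfd['government']['God']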

Exercise 21)


In [50]:
prondict = nltk.corpus.cmudict.dict()
    
def guess_syllables(text):
    count_syllables = 0
    for word in text:
        if any(c.isalpha() for c in word):
            try:
                pron = prondict[word.lower()][0]
            except KeyError:
                print '"' + word.lower() + '" does not exist in CMU!'
                continue
            else:
                # in the CMU phone set only vowels carry a stress digit,
                # so each digit-bearing phone marks one syllable
                for syllable in pron:
                    if any(c.isnumeric() for c in syllable):
                        count_syllables = count_syllables + 1
    return count_syllables

guess_syllables(['She', 'sells', 'seashells', 'by', 'the', 'seashore'])


Out[50]:
8

In [51]:
guess_syllables(['This', 'is', 'an', 'absolutely', 'fantastic', 'pythonic', 'program', '.'])


"pythonic" does not exist in CMU!
Out[51]:
12

In [52]:
guess_syllables(religion_text)


"well-being" does not exist in CMU!
"one-sixth" does not exist in CMU!
"self-evident" does not exist in CMU!
"re-evaluation" does not exist in CMU!
"nareb" does not exist in CMU!
"self-images" does not exist in CMU!
"nareb's" does not exist in CMU!
"non-white" does not exist in CMU!
"all-white" does not exist in CMU!
"client-service" does not exist in CMU!
"etc." does not exist in CMU!
"pervasively" does not exist in CMU!
"nareb" does not exist in CMU!
"illumine" does not exist in CMU!
"residentially" does not exist in CMU!
"negroes'" does not exist in CMU!
"well-kept" does not exist in CMU!
"etc." does not exist in CMU!
"anti-discriminatory" does not exist in CMU!
"anti-discrimination" does not exist in CMU!
"unenforcible" does not exist in CMU!
"reformism" does not exist in CMU!
"permissibility" does not exist in CMU!
"all-white" does not exist in CMU!
"all-inclusive" does not exist in CMU!
"skillfulness" does not exist in CMU!
Out[52]:
3595
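
Most of the misses above are hyphenated compounds. A variant could fall back to looking up the hyphen-separated parts; a rough sketch (parts that are still unknown are skipped silently, so the count is an approximation):

def guess_syllables_compounds(text):
    count_syllables = 0
    for word in text:
        if not any(c.isalpha() for c in word):
            continue
        # look up hyphenated compounds part by part
        for part in word.lower().split('-'):
            if part in prondict:
                for phone in prondict[part][0]:
                    if any(c.isnumeric() for c in phone):
                        count_syllables = count_syllables + 1
    return count_syllables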

Exercise 22)


In [53]:
def hedge(text):
    text_hedged = []
    count = 0
    for word in text:
        text_hedged.append(word)
        count = count + 1
        if count == 3:
            text_hedged.append('like')
            count = 0
    return text_hedged

hedge(['She', 'sells', 'seashells', 'by', 'the', 'seashore', 'the', 'shells', 'she', 'sells', 'are', 'seashells'])


Out[53]:
['She',
 'sells',
 'seashells',
 'like',
 'by',
 'the',
 'seashore',
 'like',
 'the',
 'shells',
 'she',
 'like',
 'sells',
 'are',
 'seashells',
 'like']

Exercise 23)


In [1]:
# a
%pylab inline
def zipf(text):
    fd = nltk.FreqDist(text)
    rank = 1
    freqs = []
    ranks = []
    for sample, count in fd.most_common(200):
        if any(c.isalpha() for c in sample):
            freqs.append(fd.freq(sample))
            ranks.append(rank)
            rank = rank + 1
    pylab.plot(ranks, freqs)


Populating the interactive namespace from numpy and matplotlib

In [55]:
zipf(emma)



In [56]:
zipf(religion_text)



In [3]:
# b
import random
import nltk
# build a 100,000-character random string; the spaces delimit "words"
random_text = ''.join(random.choice('abcdefg ') for i in range(100000))
zipf(random_text.split())
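
Zipf's law predicts frequency roughly proportional to 1/rank, which shows up as a straight line on log-log axes. A minimal variant of zipf (only the plotting call changes) makes that easier to see:

def zipf_loglog(text):
    fd = nltk.FreqDist(text)
    rank = 1
    freqs = []
    ranks = []
    for sample, count in fd.most_common(200):
        if any(c.isalpha() for c in sample):
            freqs.append(fd.freq(sample))
            ranks.append(rank)
            rank = rank + 1
    pylab.loglog(ranks, freqs)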


Exercise 24)


In [4]:
# a
def generate_model(text, num=15, n=50):
    # seed with one of the n most frequent words, then repeatedly
    # emit the most likely successor in the bigram distribution
    words = [w for w, count in nltk.FreqDist(text).most_common(n)]
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
    word = random.choice(words)
    for i in range(num):
        print word,
        word = cfd[word].max()

generate_model(nltk.corpus.genesis.words('english-kjv.txt'))


Jacob , and the land of the land of the land of the land of

In [59]:
generate_model(emma)


Mr . I am sure I am sure I am sure I am sure I

In [60]:
generate_model(movie)


You ' s a man ' s a man ' s a man ' s

In [61]:
def generate_model_random(text, num=15, n=50):
    words = [w for w, count in nltk.FreqDist(text).most_common(n)]
    word = random.choice(words)
    for i in range(num):
        print word,
        word = random.choice(words)
generate_model_random(emma)


her Mrs my ' was so ' to my .-- would s the you his

In [62]:
generate_model_random(movie, 25, 200)


down ] Dutchman men look how know grabs CUTLER be ] when had if You that through DALMA RAGETTI when now off key JACK sea

In [6]:
# b
from nltk.corpus import brown
generate_model(brown.words(categories='news'))


they are not be a year . The President Kennedy , and the first time

In [64]:
generate_model_random(brown.words(categories='news'))


was but be or at would has this . an their '' he ; the

In [7]:
# c
generate_model(brown.words(categories=['news', 'romance']))


`` I was a little more than the first time , and the first time

In [66]:
generate_model_random(brown.words(categories=['news', 'romance']))


he the she from would a -- up is by . him was as but

In [67]:
generate_model(brown.words(categories=['belles_lettres', 'science_fiction']))


all the `` I have been a few days , and the `` I have

In [68]:
generate_model(brown.words(categories=['news', 'romance']), 100, 100)


he had been a little more than the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time , and the first time
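
The repetition arises because cfd[word].max() always returns the same most likely successor, so the chain falls into a cycle, while generate_model_random above discards the bigram statistics entirely. A middle ground is to sample the next word in proportion to its bigram frequency; a sketch (generate_model_sampled is a name introduced here):

def generate_model_sampled(text, num=15):
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
    word = random.choice(list(cfd.conditions()))
    for i in range(num):
        print word,
        fd = cfd[word]
        # weighted random choice: walk the cumulative counts
        target = random.randint(1, fd.N())
        total = 0
        for successor in fd:
            total = total + fd[successor]
            if total >= target:
                word = successor
                break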

Exercise 25)


In [70]:
from nltk.corpus import udhr
def find_language(word):
    languages = []
    for fileid in udhr.fileids():
        if fileid.endswith('-Latin1') and word in udhr.words(fileid):
            languages.append(fileid[:-7])  # strip the '-Latin1' suffix
    return languages
            
print find_language('and')


[u'Albanian_Shqip', u'Bemba', u'Cebuano', u'English', u'NigerianPidginEnglish', u'Romani']

In [73]:
print find_language('in')


[u'Afaan_Oromo_Oromiffa', u'Afrikaans', u'Aguaruna', u'Amarakaeri', u'Cakchiquel', u'Chayahuita', u'Corsican', u'Dutch_Nederlands', u'English', u'Frisian', u'Friulian_Friulano', u'German_Deutsch', u'Hausa_Haoussa', u'Huasteco', u'Interlingua', u'IrishGaelic_Gaeilge', u'Italian', u'Italian_Italiano', u'Kapampangan', u'Latin_Latina', u'Latin_Latina-v2', u'Marshallese', u'Mazateco', u'Mikmaq_Micmac-Mikmaq', u'Mixteco', u'Oromiffa_AfaanOromo', u'Paez', u'Picard', u'Qechi_Kekchi', u'Rhaeto-Romance_Rumantsch', u'Sammarinese', u'Sardinian', u'Somali', u'Tenek_Huasteco', u'Tojol-abal', u'Totonaco', u'Tzeltal', u'Tzotzil', u'Uzbek', u'Walloon_Wallon', u'Waray', u'Wayuu']
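
find_language re-reads every Latin-1 file on each call. When many words are to be looked up, it is faster to build a reverse index once; a sketch using the same fileid convention:

from collections import defaultdict

word_languages = defaultdict(list)
for fileid in udhr.fileids():
    if fileid.endswith('-Latin1'):
        for w in set(udhr.words(fileid)):
            word_languages[w].append(fileid[:-7])

print word_languages['and']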

Exercise 26)


In [74]:
num_hyponyms = 0
sum_hyponyms = 0
for synset in wn.all_synsets('n'):
    hyponyms = synset.hyponyms()
    if len(hyponyms) > 0:
        num_hyponyms = num_hyponyms + 1
        sum_hyponyms = sum_hyponyms + len(hyponyms)
        
print sum_hyponyms / num_hyponyms


4.54382076319

Exercise 27)


In [82]:
def average_polysemy(category):
    seen_words = set()
    num_poly = 0
    sum_poly = 0
    for synset in wn.all_synsets(category):
        # sample only the first 20000 lemmas to keep the runtime down
        if num_poly > 20000:
            break
        for lemma in synset.lemmas():
            lemma_name = lemma.name()
            if lemma_name not in seen_words:
                seen_words.add(lemma_name)
                num_poly = num_poly + 1
                sum_poly = sum_poly + len(wn.synsets(lemma_name, category))
    return sum_poly / num_poly

average_polysemy('n')


Out[82]:
1.5455227238638067

In [83]:
average_polysemy('v')


Out[83]:
2.1866273523545225

In [84]:
average_polysemy('a')


Out[84]:
1.4336283185840708

In [88]:
average_polysemy('r')


Out[88]:
1.2532916759651864
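
The 20000-lemma cap keeps the runtime down but makes the figures approximate. Newer NLTK releases expose wn.all_lemma_names, which allows an exact (if slower) computation; a sketch assuming that method is available:

def average_polysemy_exact(category):
    lemma_names = list(wn.all_lemma_names(category))
    return sum(len(wn.synsets(name, category))
               for name in lemma_names) / len(lemma_names)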

Exercise 28)


In [92]:
pairs = [('car', 'automobile'), ('gem', 'jewel'), ('journey', 'voyage'), ('boy', 'lad'), ('coast', 'shore'), 
         ('asylum', 'madhouse'), ('magician', 'wizard'), ('midday', 'noon'), ('furnace', 'stove'), ('food', 'fruit'), 
         ('bird', 'cock'), ('bird', 'crane'), ('tool', 'implement'), ('brother', 'monk'), ('lad', 'brother'), 
         ('crane', 'implement'), ('journey', 'car'), ('monk', 'oracle'), ('cemetery', 'woodland'), ('food', 'rooster'), 
         ('coast', 'hill'), ('forest', 'graveyard'), ('shore', 'woodland'), ('monk', 'slave'), ('coast', 'forest'), 
         ('lad', 'wizard'), ('chord', 'smile'), ('glass', 'magician'), ('rooster', 'voyage'), ('noon', 'string')]
from operator import itemgetter

lch = []
for word1, word2 in pairs:
    lch.append((word1, word2,
                wn.lch_similarity(wn.synsets(word1)[0], wn.synsets(word2)[0])))
sorted(lch, key=itemgetter(2), reverse=True)


Out[92]:
[('car', 'automobile', 3.6375861597263857),
 ('midday', 'noon', 3.6375861597263857),
 ('coast', 'shore', 2.9444389791664407),
 ('tool', 'implement', 2.9444389791664407),
 ('boy', 'lad', 2.538973871058276),
 ('journey', 'voyage', 2.2512917986064953),
 ('coast', 'hill', 2.0281482472922856),
 ('shore', 'woodland', 2.0281482472922856),
 ('monk', 'slave', 2.0281482472922856),
 ('lad', 'wizard', 2.0281482472922856),
 ('magician', 'wizard', 1.845826690498331),
 ('lad', 'brother', 1.6916760106710724),
 ('gem', 'jewel', 1.55814461804655),
 ('asylum', 'madhouse', 1.55814461804655),
 ('brother', 'monk', 1.55814461804655),
 ('monk', 'oracle', 1.55814461804655),
 ('bird', 'crane', 1.4403615823901665),
 ('cemetery', 'woodland', 1.4403615823901665),
 ('glass', 'magician', 1.4403615823901665),
 ('crane', 'implement', 1.3350010667323402),
 ('food', 'fruit', 1.2396908869280152),
 ('coast', 'forest', 1.2396908869280152),
 ('chord', 'smile', 1.2396908869280152),
 ('furnace', 'stove', 1.072636802264849),
 ('forest', 'graveyard', 0.9985288301111273),
 ('bird', 'cock', 0.8649974374866046),
 ('food', 'rooster', 0.8649974374866046),
 ('noon', 'string', 0.8043728156701697),
 ('journey', 'car', 0.6418538861723948),
 ('rooster', 'voyage', 0.4595323293784402)]

In [93]:
path = []
for word1, word2 in pairs:
    path.append((word1, word2,
                 wn.path_similarity(wn.synsets(word1)[0], wn.synsets(word2)[0])))
sorted(path, key=itemgetter(2), reverse=True)


Out[93]:
[('car', 'automobile', 1.0),
 ('midday', 'noon', 1.0),
 ('coast', 'shore', 0.5),
 ('tool', 'implement', 0.5),
 ('boy', 'lad', 0.3333333333333333),
 ('journey', 'voyage', 0.25),
 ('coast', 'hill', 0.2),
 ('shore', 'woodland', 0.2),
 ('monk', 'slave', 0.2),
 ('lad', 'wizard', 0.2),
 ('magician', 'wizard', 0.16666666666666666),
 ('lad', 'brother', 0.14285714285714285),
 ('gem', 'jewel', 0.125),
 ('asylum', 'madhouse', 0.125),
 ('brother', 'monk', 0.125),
 ('monk', 'oracle', 0.125),
 ('bird', 'crane', 0.1111111111111111),
 ('cemetery', 'woodland', 0.1111111111111111),
 ('glass', 'magician', 0.1111111111111111),
 ('crane', 'implement', 0.1),
 ('food', 'fruit', 0.09090909090909091),
 ('coast', 'forest', 0.09090909090909091),
 ('chord', 'smile', 0.09090909090909091),
 ('furnace', 'stove', 0.07692307692307693),
 ('forest', 'graveyard', 0.07142857142857142),
 ('bird', 'cock', 0.0625),
 ('food', 'rooster', 0.0625),
 ('noon', 'string', 0.058823529411764705),
 ('journey', 'car', 0.05),
 ('rooster', 'voyage', 0.041666666666666664)]
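
The two rankings agree exactly: lch_similarity is a monotone transform of the same shortest-path length that path_similarity is based on, so sorting by either score yields the same order. A quick check:

print ([x[:2] for x in sorted(lch, key=itemgetter(2), reverse=True)] ==
       [x[:2] for x in sorted(path, key=itemgetter(2), reverse=True)])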