Exercise 1


In [1]:
# MACHINE TRANSLATION:
# see e.g. http://www.aclweb.org/anthology/R11-1077, https://nlp.stanford.edu/courses/cs224n/2010/reports/bipins.pdf
# data: parallel corpora, aligned at sentence level (automatically or manually)
# size: usually assumed the larger the better, 2nd paper: 100,00 documents
# reasons for large amount: probability that a word or combination of words has been seen during training increases

Exercise 2


In [2]:
import nltk
from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
test, devtest, training = names[:500], names[500:1000], names[1000:]

def gender_features1(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    features["suffix2"] = name[-2:].lower()
    return features

train_set = [(gender_features1(n), g) for (n,g) in training]
devtest_set = [(gender_features1(n), g) for (n,g) in devtest]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)


0.786

In [3]:
def error_analysis(gender_features):
    errors = []
    for (name, tag) in devtest:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append((tag, guess, name))
    print 'no. of errors: ', len(errors)        
        
    for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
        print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)        
        
error_analysis(gender_features1)


no. of errors:  107
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Beret                         
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Betsey                        
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brear                         
correct=female   guess=male     name=Cherry                        
correct=female   guess=male     name=Christen                      
correct=female   guess=male     name=Christian                     
correct=female   guess=male     name=Cody                          
correct=female   guess=male     name=Con                           
correct=female   guess=male     name=Deerdre                       
correct=female   guess=male     name=Demetris                      
correct=female   guess=male     name=Dorry                         
correct=female   guess=male     name=Elsbeth                       
correct=female   guess=male     name=Em                            
correct=female   guess=male     name=Emmy                          
correct=female   guess=male     name=Esme                          
correct=female   guess=male     name=Estell                        
correct=female   guess=male     name=Ethyl                         
correct=female   guess=male     name=Florence                      
correct=female   guess=male     name=Friederike                    
correct=female   guess=male     name=Gabriell                      
correct=female   guess=male     name=Gunvor                        
correct=female   guess=male     name=Gwennie                       
correct=female   guess=male     name=Gwyneth                       
correct=female   guess=male     name=Harmony                       
correct=female   guess=male     name=Hester                        
correct=female   guess=male     name=Hope                          
correct=female   guess=male     name=Janifer                       
correct=female   guess=male     name=Janot                         
correct=female   guess=male     name=Jody                          
correct=female   guess=male     name=Judith                        
correct=female   guess=male     name=Kim                           
correct=female   guess=male     name=Kore                          
correct=female   guess=male     name=Margery                       
correct=female   guess=male     name=Margret                       
correct=female   guess=male     name=Marion                        
correct=female   guess=male     name=Marry                         
correct=female   guess=male     name=Meg                           
correct=female   guess=male     name=Meggan                        
correct=female   guess=male     name=Mercy                         
correct=female   guess=male     name=Olympe                        
correct=female   guess=male     name=Patsy                         
correct=female   guess=male     name=Philis                        
correct=female   guess=male     name=Philly                        
correct=female   guess=male     name=Pier                          
correct=female   guess=male     name=Raquel                        
correct=female   guess=male     name=Roseann                       
correct=female   guess=male     name=Sapphire                      
correct=female   guess=male     name=Scarlet                       
correct=female   guess=male     name=Shannon                       
correct=female   guess=male     name=Sharyl                        
correct=female   guess=male     name=Solange                       
correct=female   guess=male     name=Sophey                        
correct=female   guess=male     name=Van                           
correct=female   guess=male     name=Wendy                         
correct=female   guess=male     name=Wilone                        
correct=female   guess=male     name=Winnah                        
correct=male     guess=female   name=Abby                          
correct=male     guess=female   name=Allan                         
correct=male     guess=female   name=Arnie                         
correct=male     guess=female   name=Bailey                        
correct=male     guess=female   name=Baily                         
correct=male     guess=female   name=Benjy                         
correct=male     guess=female   name=Bubba                         
correct=male     guess=female   name=Charlie                       
correct=male     guess=female   name=Dwane                         
correct=male     guess=female   name=Elisha                        
correct=male     guess=female   name=Emanuel                       
correct=male     guess=female   name=Fonsie                        
correct=male     guess=female   name=Franklyn                      
correct=male     guess=female   name=Hannibal                      
correct=male     guess=female   name=Jeramie                       
correct=male     guess=female   name=Jere                          
correct=male     guess=female   name=Jermayne                      
correct=male     guess=female   name=Jerrie                        
correct=male     guess=female   name=Jimmie                        
correct=male     guess=female   name=Jude                          
correct=male     guess=female   name=Kevin                         
correct=male     guess=female   name=Kyle                          
correct=male     guess=female   name=Lawrence                      
correct=male     guess=female   name=Lazare                        
correct=male     guess=female   name=Lin                           
correct=male     guess=female   name=Lonnie                        
correct=male     guess=female   name=Michele                       
correct=male     guess=female   name=Micky                         
correct=male     guess=female   name=Moishe                        
correct=male     guess=female   name=Neil                          
correct=male     guess=female   name=Noble                         
correct=male     guess=female   name=Odie                          
correct=male     guess=female   name=Prentice                      
correct=male     guess=female   name=Reza                          
correct=male     guess=female   name=Ricki                         
correct=male     guess=female   name=Ronnie                        
correct=male     guess=female   name=Samuele                       
correct=male     guess=female   name=Sly                           
correct=male     guess=female   name=Sydney                        
correct=male     guess=female   name=Tann                          
correct=male     guess=female   name=Terence                       
correct=male     guess=female   name=Uli                           
correct=male     guess=female   name=Vail                          
correct=male     guess=female   name=Val                           
correct=male     guess=female   name=Vasili                        

In [4]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    features["suffix2"] = name[-2:].lower()
    features["suffix3"] = name[-3:].lower()
    return features

train_set = [(gender_features2(n), g) for (n,g) in training]
devtest_set = [(gender_features2(n), g) for (n,g) in devtest]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)


0.792

In [5]:
error_analysis(gender_features2)


no. of errors:  104
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Beret                         
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Betsey                        
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brear                         
correct=female   guess=male     name=Cal                           
correct=female   guess=male     name=Cherry                        
correct=female   guess=male     name=Christen                      
correct=female   guess=male     name=Christian                     
correct=female   guess=male     name=Cody                          
correct=female   guess=male     name=Con                           
correct=female   guess=male     name=Demetris                      
correct=female   guess=male     name=Dorry                         
correct=female   guess=male     name=Ealasaid                      
correct=female   guess=male     name=Elsbeth                       
correct=female   guess=male     name=Em                            
correct=female   guess=male     name=Emmy                          
correct=female   guess=male     name=Esme                          
correct=female   guess=male     name=Estell                        
correct=female   guess=male     name=Ethyl                         
correct=female   guess=male     name=Florence                      
correct=female   guess=male     name=France                        
correct=female   guess=male     name=Friederike                    
correct=female   guess=male     name=Gabriell                      
correct=female   guess=male     name=Gillan                        
correct=female   guess=male     name=Gunvor                        
correct=female   guess=male     name=Gwyneth                       
correct=female   guess=male     name=Harmony                       
correct=female   guess=male     name=Hester                        
correct=female   guess=male     name=Hope                          
correct=female   guess=male     name=Janifer                       
correct=female   guess=male     name=Janot                         
correct=female   guess=male     name=Jody                          
correct=female   guess=male     name=Judith                        
correct=female   guess=male     name=Kim                           
correct=female   guess=male     name=Margery                       
correct=female   guess=male     name=Margret                       
correct=female   guess=male     name=Marion                        
correct=female   guess=male     name=Marry                         
correct=female   guess=male     name=Meg                           
correct=female   guess=male     name=Meggan                        
correct=female   guess=male     name=Mercy                         
correct=female   guess=male     name=Murial                        
correct=female   guess=male     name=Olympe                        
correct=female   guess=male     name=Patsy                         
correct=female   guess=male     name=Paule                         
correct=female   guess=male     name=Philis                        
correct=female   guess=male     name=Pier                          
correct=female   guess=male     name=Raquel                        
correct=female   guess=male     name=Sal                           
correct=female   guess=male     name=Scarlet                       
correct=female   guess=male     name=Shannon                       
correct=female   guess=male     name=Sharyl                        
correct=female   guess=male     name=Sophey                        
correct=female   guess=male     name=Van                           
correct=female   guess=male     name=Wendy                         
correct=female   guess=male     name=Wilone                        
correct=female   guess=male     name=Winnah                        
correct=male     guess=female   name=Abby                          
correct=male     guess=female   name=Allan                         
correct=male     guess=female   name=Arnie                         
correct=male     guess=female   name=Bailey                        
correct=male     guess=female   name=Baily                         
correct=male     guess=female   name=Benjy                         
correct=male     guess=female   name=Bubba                         
correct=male     guess=female   name=Carroll                       
correct=male     guess=female   name=Charlie                       
correct=male     guess=female   name=Dwane                         
correct=male     guess=female   name=Elisha                        
correct=male     guess=female   name=Fonsie                        
correct=male     guess=female   name=Franklyn                      
correct=male     guess=female   name=Hannibal                      
correct=male     guess=female   name=Herrmann                      
correct=male     guess=female   name=Jeramie                       
correct=male     guess=female   name=Jere                          
correct=male     guess=female   name=Jermayne                      
correct=male     guess=female   name=Jerrie                        
correct=male     guess=female   name=Jimmie                        
correct=male     guess=female   name=Jude                          
correct=male     guess=female   name=Kyle                          
correct=male     guess=female   name=Lazare                        
correct=male     guess=female   name=Lin                           
correct=male     guess=female   name=Lonnie                        
correct=male     guess=female   name=Michele                       
correct=male     guess=female   name=Micky                         
correct=male     guess=female   name=Neil                          
correct=male     guess=female   name=Noble                         
correct=male     guess=female   name=Odie                          
correct=male     guess=female   name=Penny                         
correct=male     guess=female   name=Prentice                      
correct=male     guess=female   name=Reza                          
correct=male     guess=female   name=Ricki                         
correct=male     guess=female   name=Ronnie                        
correct=male     guess=female   name=Samuele                       
correct=male     guess=female   name=Sydney                        
correct=male     guess=female   name=Tann                          
correct=male     guess=female   name=Tobie                         
correct=male     guess=female   name=Uli                           
correct=male     guess=female   name=Vail                          
correct=male     guess=female   name=Vasili                        

In [6]:
def gender_features3(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    features["suffix2"] = name[-2:].lower()
    features["suffix3"] = name[-3:].lower()
    features["prefix3"] = name[:3].lower()
    return features

train_set = [(gender_features3(n), g) for (n,g) in training]
devtest_set = [(gender_features3(n), g) for (n,g) in devtest]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)


0.8

In [7]:
error_analysis(gender_features3)


no. of errors:  100
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Beret                         
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brear                         
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Cal                           
correct=female   guess=male     name=Cherry                        
correct=female   guess=male     name=Christen                      
correct=female   guess=male     name=Christian                     
correct=female   guess=male     name=Cody                          
correct=female   guess=male     name=Con                           
correct=female   guess=male     name=Demetris                      
correct=female   guess=male     name=Dorry                         
correct=female   guess=male     name=Ealasaid                      
correct=female   guess=male     name=Em                            
correct=female   guess=male     name=Emmy                          
correct=female   guess=male     name=Esme                          
correct=female   guess=male     name=Ethyl                         
correct=female   guess=male     name=France                        
correct=female   guess=male     name=Friederike                    
correct=female   guess=male     name=Gabriell                      
correct=female   guess=male     name=Gillan                        
correct=female   guess=male     name=Gunvor                        
correct=female   guess=male     name=Gwyneth                       
correct=female   guess=male     name=Harmony                       
correct=female   guess=male     name=Hester                        
correct=female   guess=male     name=Hope                          
correct=female   guess=male     name=Janifer                       
correct=female   guess=male     name=Janot                         
correct=female   guess=male     name=Jody                          
correct=female   guess=male     name=Judith                        
correct=female   guess=male     name=Kim                           
correct=female   guess=male     name=Margery                       
correct=female   guess=male     name=Margret                       
correct=female   guess=male     name=Marion                        
correct=female   guess=male     name=Meg                           
correct=female   guess=male     name=Mercy                         
correct=female   guess=male     name=Murial                        
correct=female   guess=male     name=Nicol                         
correct=female   guess=male     name=Olympe                        
correct=female   guess=male     name=Patsy                         
correct=female   guess=male     name=Paule                         
correct=female   guess=male     name=Philis                        
correct=female   guess=male     name=Philly                        
correct=female   guess=male     name=Pier                          
correct=female   guess=male     name=Raquel                        
correct=female   guess=male     name=Sal                           
correct=female   guess=male     name=Scarlet                       
correct=female   guess=male     name=Shannon                       
correct=female   guess=male     name=Solange                       
correct=female   guess=male     name=Sophey                        
correct=female   guess=male     name=Van                           
correct=female   guess=male     name=Wendy                         
correct=female   guess=male     name=Wilone                        
correct=female   guess=male     name=Winnah                        
correct=male     guess=female   name=Abby                          
correct=male     guess=female   name=Allan                         
correct=male     guess=female   name=Arnie                         
correct=male     guess=female   name=Baily                         
correct=male     guess=female   name=Bubba                         
correct=male     guess=female   name=Carroll                       
correct=male     guess=female   name=Charlie                       
correct=male     guess=female   name=Dory                          
correct=male     guess=female   name=Elisha                        
correct=male     guess=female   name=Esteban                       
correct=male     guess=female   name=Florian                       
correct=male     guess=female   name=Fonsie                        
correct=male     guess=female   name=Franklyn                      
correct=male     guess=female   name=Hannibal                      
correct=male     guess=female   name=Herrmann                      
correct=male     guess=female   name=Jeramie                       
correct=male     guess=female   name=Jere                          
correct=male     guess=female   name=Jermayne                      
correct=male     guess=female   name=Jerrie                        
correct=male     guess=female   name=Jimmie                        
correct=male     guess=female   name=Jo                            
correct=male     guess=female   name=Jude                          
correct=male     guess=female   name=Kyle                          
correct=male     guess=female   name=Lin                           
correct=male     guess=female   name=Lonnie                        
correct=male     guess=female   name=Michele                       
correct=male     guess=female   name=Micky                         
correct=male     guess=female   name=Neil                          
correct=male     guess=female   name=Noble                         
correct=male     guess=female   name=Noe                           
correct=male     guess=female   name=Odie                          
correct=male     guess=female   name=Prince                        
correct=male     guess=female   name=Reza                          
correct=male     guess=female   name=Ricki                         
correct=male     guess=female   name=Ronnie                        
correct=male     guess=female   name=Samuele                       
correct=male     guess=female   name=Sydney                        
correct=male     guess=female   name=Tann                          
correct=male     guess=female   name=Uli                           
correct=male     guess=female   name=Val                           
correct=male     guess=female   name=Vasili                        

In [8]:
def gender_features4(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    features["suffix2"] = name[-2:].lower()
    features["suffix3"] = name[-3:].lower()
    features["prefix3"] = name[:3].lower()
    features["num_vowels"] = len([letter for letter in name if letter in 'aeiouy'])
    return features

train_set = [(gender_features4(n), g) for (n,g) in training]
devtest_set = [(gender_features4(n), g) for (n,g) in devtest]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)


0.802

In [9]:
# final performance test:
test_set = [(gender_features4(n), g) for (n,g) in test]
print nltk.classify.accuracy(classifier, test_set)


0.82

In [10]:
# performance slightly worse than in dev-test -> features reflect some idiosyncracies of dev-test

Exercise 3)


In [11]:
from nltk.corpus import senseval
instances = senseval.instances('serve.pos')
size = int(len(instances) * 0.1)
training, test = instances[size:], instances[:size]

In [12]:
training[0]


Out[12]:
SensevalInstance(word=u'serve-v', position=11, context=[('oh', 'UH'), (',', ','), ('that', 'DT'), ("'s", 'VBZ'), ('no', 'DT'), ('good', 'NN'), ('.', '.'), ('that', 'DT'), ("'s", 'VBZ'), ('where', 'WRB'), ('they', 'PRP'), ('serve', 'VB'), ('cardboard', 'NN'), ('instead', 'RB'), ('of', 'IN'), ('meat', 'NN'), ('.', '.')], senses=('SERVE10',))

In [13]:
def sense_features(instance):
    features = {}
    features["word-type"] = instance.word
    features["word-tag"] = instance.context[instance.position][1] 
    features["prev-word"] = instance.context[instance.position-1][0]
    features["prev-word-tag"] = instance.context[instance.position-1][1]
    features["next-word"] = instance.context[instance.position+1][0]
    features["next-word-tag"] = instance.context[instance.position+1][1]
    return features

train_set = [(sense_features(instance), instance.senses) for instance in training]
test_set = [(sense_features(instance), instance.senses) for instance in test]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)


0.807780320366

Exercise 4)


In [14]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(30)


0.77
Most Informative Features
        contains(doubts) = True              pos : neg    =      9.8 : 1.0
          contains(sans) = True              neg : pos    =      8.8 : 1.0
          contains(hugo) = True              pos : neg    =      7.8 : 1.0
  contains(effortlessly) = True              pos : neg    =      7.5 : 1.0
     contains(dismissed) = True              pos : neg    =      7.1 : 1.0
    contains(mediocrity) = True              neg : pos    =      6.9 : 1.0
   contains(overwhelmed) = True              pos : neg    =      6.4 : 1.0
        contains(fabric) = True              pos : neg    =      6.4 : 1.0
         contains(wires) = True              neg : pos    =      6.2 : 1.0
          contains(wits) = True              pos : neg    =      5.8 : 1.0
       contains(topping) = True              pos : neg    =      5.8 : 1.0
           contains(ugh) = True              neg : pos    =      5.7 : 1.0
     contains(uplifting) = True              pos : neg    =      5.7 : 1.0
   contains(bruckheimer) = True              neg : pos    =      5.6 : 1.0
        contains(bounce) = True              neg : pos    =      5.6 : 1.0
          contains(lang) = True              pos : neg    =      5.1 : 1.0
   contains(understands) = True              pos : neg    =      4.5 : 1.0
      contains(matheson) = True              pos : neg    =      4.4 : 1.0
   contains(controversy) = True              pos : neg    =      4.3 : 1.0
       contains(quicker) = True              neg : pos    =      4.3 : 1.0
       contains(maxwell) = True              neg : pos    =      4.3 : 1.0
         contains(locks) = True              neg : pos    =      4.3 : 1.0
          contains(tsui) = True              neg : pos    =      4.3 : 1.0
       contains(admired) = True              pos : neg    =      4.2 : 1.0
    contains(cronenberg) = True              pos : neg    =      3.9 : 1.0
    contains(derivative) = True              neg : pos    =      3.8 : 1.0
      contains(attorney) = True              pos : neg    =      3.8 : 1.0
   contains(existential) = True              pos : neg    =      3.7 : 1.0
       contains(bandits) = True              pos : neg    =      3.7 : 1.0
     contains(restoring) = True              pos : neg    =      3.7 : 1.0

In [15]:
# most of them already indicate some judgment in themselves ('ugh', 'mediocrity') or belong to typical phrases that
# indicate one direction of judgement ('understands' -> '... understands how to create atmosphere' or something like that)
# some seem to be names of actors etc. which tend to be judged one direction or the other
# surprising -> '33', 'wires'

Exercise 5)


In [ ]: