In [1]:
import nltk
from nltk.corpus import names
import random
import re

A feature recognition function


In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Samantha')


Out[2]:
{'last_letter': 'a'}

Create name datasets


In [38]:
def create_name_data():
    male_names = [(name, 'male') for name in names.words('male.txt')]
    female_names = [(name, 'female') for name in names.words('female.txt')]
    allnames = male_names + female_names
    
    # Randomize the order of male and female names, and de-alphabatize
    random.shuffle(allnames)
    return allnames

names_data = create_name_data()

First Pass at Training and Testing Data


In [4]:
# This function allows experimentation with different feature definitions
# items is a list of (key, value) pairs from which features are extracted and training sets are made
def create_training_sets (feature_function, items):
    # Create the features sets.  Call the function that was passed in.
    # For names, key is the name, and value is the gender
    featuresets = [(feature_function(key), value) for (key, value) in items]
    
    # Divided training and testing in half.  Could divide in other proportions instead.
    halfsize = int(float(len(featuresets)) / 2.0)
    train_set, test_set = featuresets[halfsize:], featuresets[:halfsize]
    return train_set, test_set

Train the classifier on the training data, with the first definition of features


In [5]:
# pass in a function name
train_set, test_set = create_training_sets(gender_features, names_data)
cl = nltk.NaiveBayesClassifier.train(train_set)

Test the classifier on some examples


In [6]:
print cl.classify(gender_features('Carl'))
print cl.classify(gender_features('Carla'))
print cl.classify(gender_features('Carly'))
print cl.classify(gender_features('Carlo'))
print cl.classify(gender_features('Carlos'))


female
female
female
male
male

In [7]:
print cl.classify(gender_features('Carli'))
print cl.classify(gender_features('Carle'))
print cl.classify(gender_features('Charles'))
print cl.classify(gender_features('Carlie'))
print cl.classify(gender_features('Charlie'))


female
female
male
female
female

Run the NLTK evaluation function on the test set


In [8]:
print "%.3f" % nltk.classify.accuracy(cl, test_set)


0.760

Run the NLTK feature inspection function on the classifier


In [9]:
cl.show_most_informative_features(15)


Most Informative Features
             last_letter = 'k'              male : female =     35.5 : 1.0
             last_letter = 'a'            female : male   =     33.9 : 1.0
             last_letter = 'f'              male : female =     14.5 : 1.0
             last_letter = 'p'              male : female =     12.2 : 1.0
             last_letter = 'w'              male : female =     12.2 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'm'              male : female =      8.9 : 1.0
             last_letter = 'd'              male : female =      8.4 : 1.0
             last_letter = 'z'              male : female =      7.6 : 1.0
             last_letter = 'r'              male : female =      7.4 : 1.0
             last_letter = 'o'              male : female =      7.4 : 1.0
             last_letter = 'i'            female : male   =      3.9 : 1.0
             last_letter = 'g'              male : female =      3.9 : 1.0
             last_letter = 's'              male : female =      3.8 : 1.0
             last_letter = 't'              male : female =      3.7 : 1.0

Let's add some more features to improve results


In [10]:
def gender_features2(word):
    features = {}
    word = word.lower()
    features['last'] = word[-1]
    features['first'] = word[:1]
    features['second'] = word[1:2] # get the 'h' in Charlie?
    return features
gender_features2('Samantha')


Out[10]:
{'first': 's', 'last': 'a', 'second': 'a'}

We wrote the code so that we can easily pass in the new feature function.


In [11]:
train_set2, test_set2 = create_training_sets(gender_features2, names_data)
cl2 = nltk.NaiveBayesClassifier.train(train_set2)
print "%.3f" % nltk.classify.accuracy(cl2, test_set2)


0.771

Let's hand check some of the harder cases ... oops some are right but some are now wrong.


In [12]:
print cl2.classify(gender_features2('Carli'))
print cl2.classify(gender_features2('Carle'))
print cl2.classify(gender_features2('Charles')) 
print cl2.classify(gender_features2('Carlie'))
print cl2.classify(gender_features2('Charlie'))


female
female
male
female
female

We can see the influence of some of the new features


In [13]:
cl2.show_most_informative_features(15)


Most Informative Features
                    last = 'k'              male : female =     35.5 : 1.0
                    last = 'a'            female : male   =     33.9 : 1.0
                    last = 'f'              male : female =     14.5 : 1.0
                    last = 'p'              male : female =     12.2 : 1.0
                    last = 'w'              male : female =     12.2 : 1.0
                    last = 'v'              male : female =      9.9 : 1.0
                    last = 'm'              male : female =      8.9 : 1.0
                    last = 'd'              male : female =      8.4 : 1.0
                    last = 'z'              male : female =      7.6 : 1.0
                    last = 'r'              male : female =      7.4 : 1.0
                    last = 'o'              male : female =      7.4 : 1.0
                   first = 'w'              male : female =      4.1 : 1.0
                    last = 'i'            female : male   =      3.9 : 1.0
                    last = 'g'              male : female =      3.9 : 1.0
                  second = 'z'              male : female =      3.8 : 1.0

We really need a development set to test our features on before testing on the real test set. So let's redo our division of the data. In this case we do the dividing up before applying the feature selection so we can keep track of the names.


In [14]:
def create_training_sets3 (feature_function, items):
    # Create the features sets.  Call the function that was passed in.
    # For names, key is the name, and value is the gender
    featuresets = [(feature_function(key), value) for (key, value) in items]
    
    # Divide data into thirds
    third = int(float(len(featuresets)) / 3.0)
    return items[0:third], items[third:third*2], items[third*2:], featuresets[0:third], featuresets[third:third*2], featuresets[third*2:]
    
train_items, dev_items, test_items, train_features, dev_features, test_features = create_training_sets3(gender_features2, names_data)

In [15]:
cl3 = nltk.NaiveBayesClassifier.train(train_features)
# This is code from the NLTK chapter
errors = []
for (name, tag) in dev_items:
    guess = cl3.classify(gender_features2(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

Print out the correct vs. the guessed answer for the errors, in order to inspect those that were wrong.


In [45]:
for (tag, guess, name) in sorted(errors)[:50]: 
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)


correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Abbe                          
correct=female   guess=male     name=Abby                          
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anais                         
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ardith                        
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Ashleigh                      
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Astrix                        
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Austin                        
correct=female   guess=male     name=Averyl                        
correct=female   guess=male     name=Avivah                        
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Barb                          
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Beulah                        
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Bren                          
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Bridgett                      
correct=female   guess=male     name=Brigid                        
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Brittany                      
correct=female   guess=male     name=Brittney                      
correct=female   guess=male     name=Brooks                        
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Caro                          
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Charlot                       
correct=female   guess=male     name=Charlott                      
correct=female   guess=male     name=Chris                         
correct=female   guess=male     name=Clem                          
correct=female   guess=male     name=Cleo                          
correct=female   guess=male     name=Clo                           
correct=female   guess=male     name=Corliss                       
correct=female   guess=male     name=Cyb                           
correct=female   guess=male     name=Dagmar                        

Exercise Rewrite the functions above to add some additional features, and then rerun the classifier to evaluate if they improve or degrade results. But don't overfit!

Ideas for features:

  • name length
  • pairs of characters
  • your idea goes here

Features added:

-first three characters of name
-last three characters of name
-length of name
-the extracted vowels of a name. eg for Paul it would be 'au'

The starting and ending strings definitely increased the accuracy. Both length and the vowels check appeared to increase accuracy but I re-randomized the data a few times and found they didn't really hold up.


In [39]:
def gender_features3(word):
    features = {}
    word = word.lower()
    features['last'] = word[-1]
    features['first'] = word[0]
    features['end'] = word[-3:]
    features['start']= word[:2]
    features['length'] = len(word)
    features['vowels'] = "".join([c for c in word if c in 'aeiou'])
    return features

train_items, dev_items, test_items, train_features4, dev_features, test_features = create_training_sets3(gender_features3, names_data)
cl4 = nltk.NaiveBayesClassifier.train(train_features4)

In [18]:
gender_features3('Samantha')


Out[18]:
{'end': 'tha',
 'first': 's',
 'last': 'a',
 'length': 8,
 'start': 'sa',
 'vowels': 'aaa'}

In [40]:
nltk.classify.accuracy(cl4, test_features)


Out[40]:
0.8058912386706949

In [41]:
cl4.show_most_informative_features(20)


Most Informative Features
                    last = 'a'            female : male   =     42.4 : 1.0
                  vowels = 'ue'             male : female =     17.9 : 1.0
                  vowels = 'eo'             male : female =     16.6 : 1.0
                    last = 'm'              male : female =     15.4 : 1.0
                  vowels = 'eia'          female : male   =     15.0 : 1.0
                     end = 'nne'          female : male   =     14.4 : 1.0
                    last = 'k'              male : female =     14.1 : 1.0
                     end = 'son'            male : female =     12.6 : 1.0
                     end = 'ert'            male : female =     12.6 : 1.0
                  vowels = 'au'             male : female =     12.5 : 1.0
                   start = 'ka'           female : male   =     12.0 : 1.0
                  vowels = 'io'             male : female =     10.7 : 1.0
                    last = 'o'              male : female =      9.5 : 1.0
                    last = 'v'              male : female =      8.6 : 1.0
                    last = 'b'              male : female =      7.9 : 1.0
                    last = 'd'              male : female =      7.4 : 1.0
                   start = 'wa'             male : female =      7.3 : 1.0
                  vowels = 'uo'             male : female =      7.0 : 1.0
                  vowels = 'aia'          female : male   =      6.9 : 1.0
                   start = 'th'             male : female =      6.9 : 1.0

In [44]:
errors4 = []
for (name, tag) in dev_items:
    guess = cl4.classify(gender_features3(name))
    if guess != tag:
        errors4.append( (tag, guess, name) )

for (tag, guess, name) in sorted(errors)[:50]:
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)


correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Abbe                          
correct=female   guess=male     name=Abby                          
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anais                         
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ardith                        
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Ashleigh                      
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Astrix                        
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Austin                        
correct=female   guess=male     name=Averyl                        
correct=female   guess=male     name=Avivah                        
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Barb                          
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Beulah                        
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Bren                          
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Bridgett                      
correct=female   guess=male     name=Brigid                        
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Brittany                      
correct=female   guess=male     name=Brittney                      
correct=female   guess=male     name=Brooks                        
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Caro                          
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Charlot                       
correct=female   guess=male     name=Charlott                      
correct=female   guess=male     name=Chris                         
correct=female   guess=male     name=Clem                          
correct=female   guess=male     name=Cleo                          
correct=female   guess=male     name=Clo                           
correct=female   guess=male     name=Corliss                       
correct=female   guess=male     name=Cyb                           
correct=female   guess=male     name=Dagmar                        

In [43]:
for i in range(1,16):
    ds = create_name_data()
    train_items, dev_items, test_items, train_features4, dev_features, test_features2 = create_training_sets3(gender_features3, ds)
    cl4 = nltk.NaiveBayesClassifier.train(train_features)
    accuracy = nltk.classify.accuracy(cl4, test_features2)
    print "Run {} accuracy: {:0.3%}".format(i, accuracy)


Run 1 accuracy: 79.456%
Run 2 accuracy: 77.795%
Run 3 accuracy: 78.361%
Run 4 accuracy: 79.305%
Run 5 accuracy: 77.530%
Run 6 accuracy: 78.663%
Run 7 accuracy: 77.190%
Run 8 accuracy: 78.814%
Run 9 accuracy: 78.399%
Run 10 accuracy: 78.361%
Run 11 accuracy: 77.795%
Run 12 accuracy: 77.757%
Run 13 accuracy: 78.965%
Run 14 accuracy: 78.512%
Run 15 accuracy: 78.965%