In [1]:
import nltk
from nltk.corpus import names
import random
import re
A feature extraction function
In [2]:
def gender_features(word):
    """Map a name to its feature dict: just the final character."""
    # In English, a name's last letter is a strong gender cue.
    final_char = word[-1]
    return {'last_letter': final_char}

gender_features('Samantha')
Out[2]:
Create name datasets
In [38]:
def create_name_data(seed=None):
    """Build a shuffled list of (name, gender) pairs from the NLTK names corpus.

    seed: optional int for a reproducible shuffle. Default None keeps the
    original nondeterministic behavior (fresh system-entropy shuffle).
    Returns a list of (name, 'male'|'female') tuples.
    """
    male_names = [(name, 'male') for name in names.words('male.txt')]
    female_names = [(name, 'female') for name in names.words('female.txt')]
    allnames = male_names + female_names
    # Shuffle so the data is not alphabetized or grouped by gender
    # (otherwise a half/thirds split would be badly biased).
    random.Random(seed).shuffle(allnames)
    return allnames

names_data = create_name_data()
First Pass at Training and Testing Data
In [4]:
# This function allows experimentation with different feature definitions.
# items is a list of (key, value) pairs from which features are extracted
# and training/testing sets are made.
def create_training_sets(feature_function, items):
    """Featurize items and split them 50/50 into train and test sets.

    feature_function: maps a key (e.g. a name) to a feature dict.
    items: list of (key, value) pairs; for names, key is the name and
    value is the gender.
    Returns (train_set, test_set): the second half of the featurized data
    is used for training, the first half for testing.
    """
    # Create the feature sets by calling the function that was passed in.
    featuresets = [(feature_function(key), value) for (key, value) in items]
    # Integer floor division replaces the old int(float(...) / 2.0) dance.
    # Could divide in other proportions instead.
    halfsize = len(featuresets) // 2
    train_set, test_set = featuresets[halfsize:], featuresets[:halfsize]
    return train_set, test_set
Train the classifier on the training data, with the first definition of features
In [5]:
# Pass the feature function itself; create_training_sets calls it on each name.
train_set, test_set = create_training_sets(gender_features, names_data)
# Train a Naive Bayes classifier on the last-letter feature.
cl = nltk.NaiveBayesClassifier.train(train_set)
Test the classifier on some examples
In [6]:
# print-as-function works identically in Python 2 and Python 3
# for a single expression, and keeps the notebook forward-portable.
print(cl.classify(gender_features('Carl')))
print(cl.classify(gender_features('Carla')))
print(cl.classify(gender_features('Carly')))
print(cl.classify(gender_features('Carlo')))
print(cl.classify(gender_features('Carlos')))
In [7]:
# Harder cases — names whose last letter is a weak or misleading cue.
# (print-as-function form is valid in both Python 2 and Python 3.)
print(cl.classify(gender_features('Carli')))
print(cl.classify(gender_features('Carle')))
print(cl.classify(gender_features('Charles')))
print(cl.classify(gender_features('Carlie')))
print(cl.classify(gender_features('Charlie')))
Run the NLTK evaluation function on the test set
In [8]:
# Overall accuracy on the held-out half (print-function form is Py2/Py3 compatible).
print("%.3f" % nltk.classify.accuracy(cl, test_set))
Run the NLTK feature inspection function on the classifier
In [9]:
# Show the 15 features the classifier found most discriminative between genders.
cl.show_most_informative_features(15)
Let's add some more features to improve results
In [10]:
def gender_features2(word):
    """Extract three letter features: last, first, and second character."""
    lowered = word.lower()
    return {
        'last': lowered[-1],
        'first': lowered[:1],
        'second': lowered[1:2],  # catches the 'h' in Charlie
    }

gender_features2('Samantha')
Out[10]:
We wrote the code so that we can easily pass in the new feature function.
In [11]:
# Reuse the generic split/train pipeline with the richer feature function.
train_set2, test_set2 = create_training_sets(gender_features2, names_data)
cl2 = nltk.NaiveBayesClassifier.train(train_set2)
# print-function form is valid in both Python 2 and Python 3.
print("%.3f" % nltk.classify.accuracy(cl2, test_set2))
Let's hand check some of the harder cases ... oops some are right but some are now wrong.
In [12]:
# Re-check the hard cases with the richer features
# (print-function form is Py2/Py3 compatible).
print(cl2.classify(gender_features2('Carli')))
print(cl2.classify(gender_features2('Carle')))
print(cl2.classify(gender_features2('Charles')))
print(cl2.classify(gender_features2('Carlie')))
print(cl2.classify(gender_features2('Charlie')))
We can see the influence of some of the new features
In [13]:
# Inspect which of the new first/second-letter features became informative.
cl2.show_most_informative_features(15)
We really need a development set to test our features on before testing on the real test set. So let's redo our division of the data. In this case we do the dividing up before applying the feature selection so we can keep track of the names.
In [14]:
def create_training_sets3(feature_function, items):
    """Featurize items and split both items and features into thirds.

    feature_function: maps a key (e.g. a name) to a feature dict.
    items: list of (key, value) pairs.
    Returns six parallel lists:
        (train_items, dev_items, test_items,
         train_features, dev_features, test_features)
    The raw item splits are returned alongside the feature splits so the
    original names remain available for error analysis on the dev set.
    """
    # Create the feature sets by calling the function that was passed in.
    featuresets = [(feature_function(key), value) for (key, value) in items]
    # Divide data into thirds; floor division replaces int(float(...) / 3.0).
    third = len(featuresets) // 3
    return (items[0:third], items[third:third * 2], items[third * 2:],
            featuresets[0:third], featuresets[third:third * 2], featuresets[third * 2:])
# Three-way split: train for fitting, dev for error analysis, test held out.
train_items, dev_items, test_items, train_features, dev_features, test_features = create_training_sets3(gender_features2, names_data)
In [15]:
cl3 = nltk.NaiveBayesClassifier.train(train_features)
# Error analysis on the development set (adapted from the NLTK book):
# collect every misclassified dev name as (correct_tag, guessed_tag, name).
errors = []
for name, tag in dev_items:
    predicted = cl3.classify(gender_features2(name))
    if predicted != tag:
        errors.append((tag, predicted, name))
Print out the correct vs. the guessed answer for the errors, in order to inspect those that were wrong.
In [45]:
# Show up to 50 errors, sorted, so misclassification patterns stand out.
# (print-function form is valid in both Python 2 and Python 3.)
for (tag, guess, name) in sorted(errors)[:50]:
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
Exercise Rewrite the functions above to add some additional features, and then rerun the classifier to evaluate if they improve or degrade results. But don't overfit!
Ideas for features:
-first three characters of name
-last three characters of name
-length of name
-the extracted vowels of a name, e.g. for Paul it would be 'au'
The starting and ending strings definitely increased the accuracy. Both length and the vowels check appeared to increase accuracy but I re-randomized the data a few times and found they didn't really hold up.
In [39]:
def gender_features3(word):
    """Richer features: first/last letters, 2-char prefix, 3-char suffix,
    name length, and the concatenated vowels of the name."""
    lowered = word.lower()
    vowel_string = ''.join(c for c in lowered if c in 'aeiou')
    return {
        'last': lowered[-1],
        'first': lowered[0],
        'end': lowered[-3:],
        'start': lowered[:2],
        'length': len(lowered),
        'vowels': vowel_string,
    }
# Re-split with the richer feature set; the item splits are overwritten,
# while the new training features get a distinct name (train_features4).
train_items, dev_items, test_items, train_features4, dev_features, test_features = create_training_sets3(gender_features3, names_data)
cl4 = nltk.NaiveBayesClassifier.train(train_features4)
In [18]:
# Inspect the full feature dict for one example name.
gender_features3('Samantha')
Out[18]:
In [40]:
# Accuracy of the richer-feature classifier on the held-out test third.
nltk.classify.accuracy(cl4, test_features)
Out[40]:
In [41]:
# Inspect which of the new suffix/prefix/length/vowel features matter most.
cl4.show_most_informative_features(20)
In [44]:
# Error analysis for cl4 on the development set.
errors4 = []
for (name, tag) in dev_items:
    guess = cl4.classify(gender_features3(name))
    if guess != tag:
        errors4.append((tag, guess, name))
# Bug fix: report errors4 (this classifier's mistakes) — the original
# printed the earlier `errors` list from the cl3 analysis by mistake.
for (tag, guess, name) in sorted(errors4)[:50]:
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
In [43]:
# Repeat the experiment over fresh shuffles to see how stable accuracy is.
for i in range(1, 16):
    ds = create_name_data()
    train_items, dev_items, test_items, train_features4, dev_features, test_features2 = create_training_sets3(gender_features3, ds)
    # Bug fix: train on train_features4 from THIS shuffle. The original
    # trained on the stale `train_features` (gender_features2 features from
    # an earlier cell) while evaluating on gender_features3 test features.
    cl4 = nltk.NaiveBayesClassifier.train(train_features4)
    accuracy = nltk.classify.accuracy(cl4, test_features2)
    print("Run {} accuracy: {:0.3%}".format(i, accuracy))