This code is provided as supplementary material for the lecture Machine Learning and Optimization in Communications (MLOC).
This code illustrates how to use the scikit-learn library to perform classification in the name competition (assign "+" if the second letter of the name is a vowel).
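For reference, the target rule itself fits in a few lines of Python. The following is a minimal sketch; the helper name true_label and the vowel set are our own assumptions, not part of the competition material.
In [ ]:
def true_label(name):
    # hypothetical ground-truth rule, assumed from the task description:
    # '+' if the second letter is a vowel, '-' otherwise
    return '+' if len(name) > 1 and name[1].lower() in 'aeiou' else '-'

print(true_label('Xavier'))  # second letter 'a' is a vowel -> '+'
print(true_label('Brian'))   # second letter 'r' is not     -> '-'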
In [7]:
import numpy as np
import string
from sklearn.ensemble import GradientBoostingClassifier
Load the data and build the training set and the test set (reading from two separate files).
In [30]:
def read_file(filename):
    # each line is assumed to start with the label character, followed by a separator and the name
    with open(filename) as f:
        content = f.readlines()
    y = [line[0] for line in content]           # first character is the label ('+' or '-')
    X = [line[2:].strip() for line in content]  # remainder of the line is the name
    return X, y
X_train,y_train = read_file('Names_data_train.txt')
X_test,y_test = read_file('Names_data_test.txt')
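The data files themselves are not shown here; judging from read_file, each line consists of the label character, a separator, and the name. A quick self-contained check of that assumption (the file name toy.txt and the sample names are made up):
In [ ]:
# write a tiny synthetic file in the assumed format and read it back
with open('toy.txt', 'w') as f:
    f.write('+ Anna\n- Brian\n')

X_toy, y_toy = read_file('toy.txt')
print(X_toy)  # ['Anna', 'Brian']
print(y_toy)  # ['+', '-']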
A simple class that converts each string into numbers (one character code per position) and then trains a classifier using the gradient boosting technique. The resulting gradient boosting classifier is essentially a rule-based system: an ensemble of shallow decision trees whose output is derived from simple threshold rules on the inputs.
The main take-away message is that rule-based systems perform extremely well if the underlying data is generated by a process governed by simple rules, as is often encountered in practice. For example, medical diagnosis often follows very clear rules and hence frequently benefits from training gradient boosting classifiers.
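To illustrate this point on data where we control the generating rule, here is a minimal sketch (synthetic data of our own making, not part of the lecture material): the labels follow a single threshold rule, and a default GradientBoostingClassifier recovers it almost perfectly.
In [ ]:
rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 1, size=(1000, 2))
# simple generating rule: label 1 whenever the first feature exceeds 0.5
y_demo = (X_demo[:, 0] > 0.5).astype(int)

demo_clf = GradientBoostingClassifier().fit(X_demo[:800], y_demo[:800])
print('held-out accuracy:', demo_clf.score(X_demo[800:], y_demo[800:]))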
In [31]:
class Gradient_Boosting_Estimator():
    '''
    Class for training a gradient boosting, rule-based estimator on the letters.
    Parameter is the number of letters of the word to consider.
    '''
    def __init__(self, letters):
        self.letters = letters
        self.gbes = GradientBoostingClassifier()

    # encode the strings as numeric features and fit the gradient boosting classifier
    def fit(self, X, y):
        # convert the labels to numeric entries ('+' -> 1, everything else -> 0)
        ty = np.zeros(len(y))
        for k in range(len(y)):
            if y[k] == '+':
                ty[k] = 1
        tX = np.empty((0, self.letters))
        for mys in X:
            if len(mys) < self.letters:
                # pad with spaces if the string is too short
                mys += ' ' * (self.letters - len(mys))
            tX = np.vstack((tX, [ord(x) for x in mys[0:self.letters]]))
        # fit the classifier (taken from the scikit-learn library)
        self.gbes.fit(tX, ty)

    # encode each string in the same way and let the fitted classifier predict the label
    def predict(self, X):
        rety = ['+' for _ in X]
        for idx, elem_X in enumerate(X):
            # pad with spaces if the string is too short
            elem_X += ' ' * max(0, self.letters - len(elem_X))
            elem_numeric = np.array([ord(x) for x in elem_X[0:self.letters]])
            rv = self.gbes.predict(elem_numeric.reshape(1, -1))
            if rv[0] == 0:
                rety[idx] = '-'
        return rety
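As a side note on the implementation: the row-by-row np.vstack in fit copies the growing array on every iteration. For larger datasets, a vectorized encoding is preferable; a sketch of the same padding-and-conversion step (the helper name encode_names is our own invention):
In [ ]:
def encode_names(X, letters):
    # pad (or truncate) every name to exactly `letters` characters,
    # then map each character to its integer code
    padded = [(s + ' ' * letters)[:letters] for s in X]
    return np.array([[ord(c) for c in s] for s in padded], dtype=float)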
In [32]:
clf = Gradient_Boosting_Estimator(10)
clf.fit(X_train,y_train)
In [33]:
y = clf.predict(X_test)
errors = 0
for idx, value in enumerate(y_test):
    print(value, 'predicted as:', y[idx], ' (', X_test[idx], ')')
    if value != y[idx]:
        errors += 1
print('Prediction errors: %d (error rate %1.2f %%)' % (errors, errors/len(y)*100))
In [34]:
# find the number of letters that minimizes the prediction errors
for letter in range(1, 10):
    clf = Gradient_Boosting_Estimator(letter)
    clf.fit(X_train, y_train)
    y = clf.predict(X_test)
    errors = 0
    for idx, k in enumerate(y_test):
        if k != y[idx]:
            errors += 1
    print('%d letters: %d prediction errors (error rate %1.2f %%)' % (letter, errors, errors*100/len(y_test)))
In [36]:
# Train with 5 letters
clf = Gradient_Boosting_Estimator(5)
clf.fit(X_train,y_train)
In [37]:
print(clf.predict(['Xavier Jones']))
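If training succeeded, the classifier should reproduce the second-letter rule on unseen names. A quick check against the hypothetical true_label rule from the sketch at the top (the sample names are made up):
In [ ]:
for name in ['Xavier Jones', 'Brian Miller', 'Diana Prince']:
    print(name, '->', clf.predict([name])[0], '(rule says', true_label(name), ')')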