Load the important packages:
In [1]:
import numpy as np
import sklearn
Load the training data:
In [2]:
import csv
def load_train_data(filename):
    """Load the training TSV file.

    Each row is: tags (space-separated, possibly empty) <TAB> description.
    The header row is skipped.

    Parameters:
        filename: path to the training TSV file.

    Returns:
        (X, y) where X is an array of description strings and y is an
        object array of per-sample tag lists (rows have varying lengths).
    """
    X = []
    y = []
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')
        next(reader, None)  # ignore header row
        for row in reader:
            X.append(row[1])
            y.append(row[0].split())
    # dtype=object is required: the tag lists are ragged, and modern NumPy
    # raises an error on implicit ragged-array creation.
    return np.array(X), np.array(y, dtype=object)
X, y = load_train_data('data/train.tsv')
Show some input and output data:
In [3]:
# Peek at the first training sample (Python 2 print statements).
print 'Input:', X[0]
print
print 'Output:', y[0]
Preprocessing steps are applied differently for input vectors and target vectors.
First, we need to transform the input text into a numerical representation. This is done by generating a vector where each position is the number of occurrences for a given word in the data.
For instance, given the text hello. this is my first line. this is my next line. this is the final one, its count vector, considering that the first position is with respect to this, the second is line and the third is final, is [3, 2, 1]. The count vectorizer does not use a stop-word list; it only keeps terms that appear in at least 2 training documents and in at most 95% of them.
Next, we apply tf-idf to weight the words according to their importance. Too frequent or too rare words are less important than the others.
Usually, the output is given as a list of tags for each description, such as [['part-time-job', 'salary', 'supervising-job'], ['2-4-years-experience-needed', 'hourly-wage']]. However, since some tags are mutually exclusive (only one can exist at a time), we take that into account. For instance, no description can be both 'part-time-job' and 'full-time-job' at the same time.
Therefore, the target vector is split into several vectors, one for each mutually exclusive set of tags, in a format such as:
{
'job': [['part-time-job'], ['full-time-job'], ['part-time-job']],
'wage': [['salary'], [], []],
'degree': [[], [], []],
'experience': [[], [], []],
'supervising': [[], [], ['supervising-job']]
}
With the split target vectors, we will be able to train one model for each tag type.
After that, each tag type target label will be encoded in numerical format, where each tag will be replaced by an integer. For instance, [['part-time-job'], ['full-time-job'], [], ['part-time-job'], []] may be encoded to [1, 2, 0, 1, 0].
Define input data preprocessor as bag-of-words and tf-idf feature extraction:
CountVectorizer: Transforms text into a vector of occurrence counts for each word found in the training set (bag-of-words representation), discarding terms that are too rare or too frequent (min_df/max_df). TfidfTransformer: Reweights the bag-of-words counts by inverse document frequency, down-weighting very common terms.
In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words over unigrams and bigrams, keeping terms that pass the
# document-frequency filters, followed by tf-idf weighting.
bag_of_words = CountVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))
tf_idf = TfidfTransformer()
X_preprocessor = Pipeline([('count', bag_of_words), ('tfidf', tf_idf)])
Define a label encoder for each mutually exclusive tag type. Each tag-type target will be encoded as an integer class, one per possible tag (including the "no tag" case).
In [5]:
from sklearn.preprocessing import LabelEncoder

# One integer label encoder per mutually exclusive tag type.
y_preprocessors = {
    tag_type: LabelEncoder()
    for tag_type in ('job', 'wage', 'degree', 'experience', 'supervising')
}
Separate the target vector y into one vector for each mutually exclusive tag type:
>>> y = [['part-time-job', 'salary'], ['full-time-job'], ['part-time-job', 'supervising-job']]
>>> split_y = split_exclusive_tags(y)
>>> split_y
{
'job': [['part-time-job'], ['full-time-job'], ['part-time-job']],
'wage': [['salary'], [], []],
'degree': [[], [], []],
'experience': [[], [], []],
'supervising': [[], [], ['supervising-job']]
}
This is a useful step when training one model for each exclusive tag type.
In [7]:
# Separate targets for mutually exclusive tags
def split_exclusive_tags(y):
    """Split each sample's tag list into one list per mutually exclusive tag type.

    Parameters:
        y: iterable of per-sample tag lists.

    Returns:
        dict mapping each tag type ('job', 'wage', 'degree', 'experience',
        'supervising') to a list, parallel to y, of per-sample tag lists
        containing only that type's tags.
    """
    # Which tags belong to each mutually exclusive tag type.
    tag_groups = {
        'job': ('part-time-job', 'full-time-job'),
        'wage': ('hourly-wage', 'salary'),
        'degree': ('associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed'),
        'experience': ('1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed'),
        'supervising': ('supervising-job',),
    }
    split_y = {tag_type: [] for tag_type in tag_groups}
    for target in y:
        for tag_type, group in tag_groups.items():
            # List comprehension (not filter()) so the result is a concrete
            # list on both Python 2 and Python 3; filter() is lazy on 3.
            split_y[tag_type].append([tag for tag in target if tag in group])
    return split_y
Define classifier as SVM with one-vs-all strategy for multilabel classification.
In [6]:
# F1 score: 0.511
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# One one-vs-rest linear SVM per mutually exclusive tag type.
models = {
    tag_type: OneVsRestClassifier(LinearSVC())
    for tag_type in ('job', 'wage', 'degree', 'experience', 'supervising')
}
For each mutually exclusive tag type, we train one multiclass model capable of deciding which tag (or even none) is appropriate for the given input.
Initially, an attempt of a single multilabel model was used, which would be able to output multiple labels at once. However, considering that the input space was huge for this situation, better results were achieved by using multiclass models, one for each mutually exclusive tag type. Thus the output would be the output for each tag type model aggregated in a single vector.
In [10]:
def fit_models(models, X_preprocessor, y_preprocessors, X, y):
    """Fit one multiclass model per mutually exclusive tag type.

    Parameters:
        models: dict mapping tag type to an (unfitted) classifier.
        X_preprocessor: input transformer (bag-of-words + tf-idf pipeline).
        y_preprocessors: dict mapping tag type to a label encoder.
        X: array of raw description strings.
        y: array of per-sample tag lists.
    """
    print('Fitting models')
    split_y = split_exclusive_tags(y)
    # The input preprocessor is shared by every tag type: fit it and
    # transform X exactly once (the original re-ran transform(X) inside
    # the loop, discarding this result each iteration).
    X_processed = X_preprocessor.fit_transform(X)
    for tag_type, model in models.items():
        # One label encoder and one model per mutually exclusive tag type.
        y_processed = y_preprocessors[tag_type].fit_transform(split_y[tag_type])
        model.fit(X_processed, y_processed)
Predict the output by executing the model for each tag type:
In [11]:
def predict_models(models, X_preprocessor, y_preprocessors, X):
    """Predict tags for each sample by running every tag-type model.

    Parameters:
        models: dict mapping tag type to a fitted classifier.
        X_preprocessor: fitted input transformer.
        y_preprocessors: dict mapping tag type to a fitted label encoder.
        X: array of raw description strings.

    Returns:
        list, parallel to X, of tag lists aggregating each tag-type
        model's prediction.
    """
    print('Predicting with models')
    # The preprocessed input is identical for every model: compute it once
    # instead of once per tag type inside the loop.
    X_processed = X_preprocessor.transform(X)
    output = [[] for _ in X]
    for tag_type, model in models.items():
        model_output = model.predict(X_processed)
        tag_type_output = y_preprocessors[tag_type].inverse_transform(model_output)
        # Aggregate the outputs for all tag types into one list per sample.
        for i, out in enumerate(tag_type_output):
            if isinstance(out, (list, tuple)):
                output[i].extend(out)
            else:
                output[i].append(out)
    return output
Calculate the F1 score given the target vector and the model output.
In [12]:
def calculate_f1_score(y_test, y_output):
    """Compute per-tag and overall (micro-averaged) precision, recall, F1.

    Parameters:
        y_test: iterable of target tag lists.
        y_output: iterable of predicted tag lists (same length as y_test).

    Returns:
        overall F1 score as a float; 0.0 when undefined (no positives).
    """
    print('Calculating F1 score')
    tags = ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed',
            'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed',
            '5-plus-years-experience-needed', 'supervising-job']

    def safe_div(num, den):
        # Guard against 0/0 (tag never present / never predicted): the
        # original produced NaN here, which also corrupted the argmin
        # "worst tag" report below.
        return num / den if den else 0.0

    true_positive = np.zeros(len(tags))
    true_negative = np.zeros(len(tags))
    false_positive = np.zeros(len(tags))
    false_negative = np.zeros(len(tags))
    for target, output in zip(y_test, y_output):
        for i, tag in enumerate(tags):
            # The four membership combinations are exhaustive, so no
            # "unknown situation" branch is needed.
            if tag in target and tag in output:
                true_positive[i] += 1
            elif tag not in target and tag not in output:
                true_negative[i] += 1
            elif tag in target:
                false_negative[i] += 1
            else:
                false_positive[i] += 1
    tags_precision = np.zeros(len(tags))
    tags_recall = np.zeros(len(tags))
    tags_f1_score = np.zeros(len(tags))
    for i in range(len(tags)):
        tags_precision[i] = safe_div(true_positive[i], true_positive[i] + false_positive[i])
        tags_recall[i] = safe_div(true_positive[i], true_positive[i] + false_negative[i])
        tags_f1_score[i] = safe_div(2 * tags_precision[i] * tags_recall[i], tags_precision[i] + tags_recall[i])
    min_tags_precision = np.argmin(tags_precision)
    min_tags_recall = np.argmin(tags_recall)
    min_tags_f1_score = np.argmin(tags_f1_score)
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('')
    print('{:30s} | {:5s} | {:5s} | {:5s}'.format('Tag', 'Prec.', 'Rec. ', 'F1'))
    for i in range(len(tags)):
        print('{:30s} | {:.3f} | {:.3f} | {:.3f}'.format(
            tags[i], tags_precision[i], tags_recall[i], tags_f1_score[i]))
    print('')
    print('Worst precision: {}'.format(tags[min_tags_precision]))
    print('Worst recall: {}'.format(tags[min_tags_recall]))
    print('Worst F1 score: {}'.format(tags[min_tags_f1_score]))
    print('')
    # Micro-averaged metrics over all tags.
    precision = safe_div(np.sum(true_positive), np.sum(true_positive) + np.sum(false_positive))
    recall = safe_div(np.sum(true_positive), np.sum(true_positive) + np.sum(false_negative))
    f1_score = safe_div(2 * precision * recall, precision + recall)
    print('General:')
    print('Precision: {:.3f}'.format(precision))
    print('Recall: {:.3f}'.format(recall))
    print('F1 score: {:.3f}'.format(f1_score))
    return f1_score
Evaluate model with 5-fold cross-validation using the F1 score metric:
In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
# 5-fold cross-validation of the full fit/predict/score cycle.
scores = []
k_fold = KFold(n_splits=5)
for i, (train, validation) in enumerate(k_fold.split(X)):
    # KFold yields index arrays selecting each fold's train/validation rows.
    X_train, X_validation, y_train, y_validation = X[train], X[validation], y[train], y[validation]
    fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)
    y_output = predict_models(models, X_preprocessor, y_preprocessors, X_validation)
    score = calculate_f1_score(y_validation, y_output)
    scores.append(score)
    print '#{0} F1 score: {1:.3f}'.format(i, score)
    print
# NOTE(review): this assignment shadows sklearn.metrics.f1_score imported
# above (which is otherwise unused here); the model-saving cell later in
# the notebook reads this variable, so do not rename it casually.
f1_score = np.mean(scores)
print 'Total F1 score: {0:.3f}'.format(f1_score)
Load the data:
In [14]:
def load_test_data(filename):
    """Read the test TSV file and return its descriptions as an array.

    The header row is discarded; each remaining row's first column is
    taken as the description text.
    """
    descriptions = []
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')
        next(reader, None)  # skip header row
        for row in reader:
            descriptions.append(row[0])
    return np.array(descriptions)
# Reload the full labeled training set and the unlabeled test descriptions.
X_train, y_train = load_train_data('data/train.tsv')
X_test = load_test_data('data/test.tsv')
Train the model with all training data:
In [15]:
fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)
Predict output from test data:
In [16]:
y_output = predict_models(models, X_preprocessor, y_preprocessors, X_test)
Show some output data:
In [17]:
print y_output[:10]
Save output data:
In [18]:
def save_output(filename, output):
    """Write predictions as a TSV: a 'tags' header line, then one
    space-joined tag list per sample (blank line when a sample has no tags).

    Parameters:
        filename: destination path.
        output: iterable of per-sample tag lists.
    """
    with open(filename, 'w') as fd:
        fd.write('tags\n')
        # Plain iteration: the enumerate() index was never used.
        for tags in output:
            fd.write(' '.join(tags))
            fd.write('\n')
save_output('data/tags.tsv', y_output)
Save preprocessors and model:
In [19]:
import pickle
def save(filename, obj):
    """Pickle obj to filename.

    Binary mode is required by pickle for correct output on Python 3 and
    on Windows; the context manager guarantees the file handle is closed
    (the bare open() call here previously leaked it).
    """
    with open(filename, 'wb') as fd:
        pickle.dump(obj, fd)
# Persist preprocessors and models; the classifier filename embeds the
# cross-validation F1 score computed earlier in the notebook.
save('models/X_preprocessor.pickle', X_preprocessor)
save('models/y_preprocessor.pickle', y_preprocessors)
save('models/clf_{0:.3f}_f1_score.pickle'.format(f1_score), models)
In [20]:
def load(filename):
    """Unpickle and return the object stored at filename.

    Binary mode is required by pickle on Python 3; the context manager
    guarantees the handle is closed (the bare open() call here
    previously leaked it).
    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    with open(filename, 'rb') as fd:
        return pickle.load(fd)
# Restore a previously saved classifier and preprocessors from disk.
models = load('models/clf_0.461_f1_score.pickle')
# NOTE(review): assigned to 'X_preprocessors' (plural) while the rest of
# the notebook uses 'X_preprocessor' — the loaded transformer is likely
# never used; confirm the intended variable name.
X_preprocessors = load('models/X_preprocessor.pickle')
y_preprocessors = load('models/y_preprocessor.pickle')