In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import time
import re
from collections import Counter
from scipy import sparse
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 15)
In [2]:
# Utility function to remove HTML tags from text
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext
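A quick sanity check on a made-up snippet: tags are replaced by spaces, so words that were only separated by markup stay separated.

In [ ]:
cleanhtml('<p>Senior <b>Python</b> Developer</p>')
# -> ' Senior  Python  Developer '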
In [20]:
# Utility function to transform text into a term-frequency vector representation
def text2vector(text, count_vect):
    X_counts = count_vect.transform(text)
    # use_idf=False means plain L2-normalised term frequencies, so the
    # transformer learns nothing from the data and fitting it here is harmless
    tf_transformer = TfidfTransformer(use_idf=False)
    X_tf = tf_transformer.fit_transform(X_counts)
    return X_tf
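A minimal usage sketch on a toy corpus (not the job dataset), just to show the contract: the CountVectorizer must already be fitted before it is passed in.

In [ ]:
toy_vect = CountVectorizer()
toy_vect.fit(['data scientist role', 'sales manager role'])
text2vector(['data scientist'], toy_vect).toarray()
# -> array([[0.70710678, 0., 0., 0., 0.70710678]])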
In [4]:
# Utility function to report the best scores from a RandomizedSearchCV run
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [18]:
# Utility function to evaluate a classifier with shuffle-split cross validation
def trainWithCrossValidation(clf, data, labels, test_size, n_splits):
    ss = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
    for train, test in ss.split(data):
        X_train = [data[i] for i in train]
        X_test = [data[i] for i in test]
        y_train = [labels[i] for i in train]
        y_test = [labels[i] for i in test]
        # column 0 is the job title, column 1 is the job description
        X_train_titles = [x[0] for x in X_train]
        X_train_description = [x[1] for x in X_train]
        X_test_titles = [x[0] for x in X_test]
        X_test_description = [x[1] for x in X_test]
        # fit the vocabulary on the training descriptions only, to avoid leaking test data
        count_vect = CountVectorizer()
        count_vect.fit(X_train_description)
        X_train_titles_vec = text2vector(X_train_titles, count_vect)
        X_train_description_vec = text2vector(X_train_description, count_vect)
        X_test_titles_vec = text2vector(X_test_titles, count_vect)
        X_test_description_vec = text2vector(X_test_description, count_vect)
        # concatenate the title and description vectors side by side
        x_train_vec = sparse.hstack([X_train_titles_vec, X_train_description_vec])
        x_test_vec = sparse.hstack([X_test_titles_vec, X_test_description_vec])
        clf.fit(x_train_vec, y_train)
        prediction = clf.predict(x_test_vec)
        print(accuracy_score(y_test, prediction))
        print(confusion_matrix(y_test, prediction))
In [5]:
#loading the dataset
dataset = pd.read_csv('reed_data_scientist_task_jobs_dataset.csv')
In [6]:
# Let's see what the dataset looks like
dataset.head()
Out[6]:
In [7]:
# Let's check how big the dataset is and whether there are any NaN values or duplicates
dataset.shape
Out[7]:
In [8]:
dataset.dropna().drop_duplicates().shape
Out[8]:
In [9]:
# Let's check whether the dataset is balanced; luckily, it is!
dataset.job_sector.value_counts()
Out[9]:
With problems based on text data there is a temptation to reach for word2vec, as it usually works so well!
But one thing to keep in mind with word2vec and similar approaches is that they need a lot of data to shine.
In this problem there is not enough training data to properly train a specialized word2vec model,
so I am going to use the standard bag-of-words approach.
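As a toy illustration of the bag-of-words idea (made-up sentences, not the real dataset; get_feature_names_out assumes a recent scikit-learn):

In [ ]:
# each document becomes a vector of term counts over a fixed vocabulary
toy = CountVectorizer()
toy_counts = toy.fit_transform(['java developer', 'primary school teacher', 'java teacher'])
print(toy.get_feature_names_out())  # ['developer' 'java' 'primary' 'school' 'teacher']
print(toy_counts.toarray())         # one row of counts per document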
In [10]:
# Let's make a copy of the original dataset and transform the job_sector labels into numerical values
data = dataset.copy()
mymap = {'Education': 0, 'Engineering': 1, 'Legal': 2, 'IT & Telecoms': 3, 'Sales': 4, 'Transport & Logistics': 5}
data['labels'] = data.job_sector.apply(lambda s: mymap.get(s, s))
In [11]:
data['labels'].value_counts()
Out[11]:
In [12]:
# Let's remove all the HTML tags from the job_description
data.job_description = data.job_description.apply(cleanhtml)
In [13]:
data.head()
Out[13]:
In [14]:
# Let's select just the data we need to work with
data_for_classifier = data[['job_title','job_description']]
labels_for_classifier = data['labels']
In [200]:
# Let's split the data into train and test sets, and remember, the answer is 42!
X_train, X_test, y_train, y_test = train_test_split(data_for_classifier.values, labels_for_classifier.tolist(), test_size=0.2, random_state=42)
In [201]:
# Separate the titles from the descriptions so that their vector representations can be joined later
X_train_titles = [x[0] for x in X_train]
X_train_description = [x[1] for x in X_train]
X_test_titles = [x[0] for x in X_test]
X_test_description = [x[1] for x in X_test]
In [202]:
# Let's check that the class distribution is still balanced between train and test
print(Counter(y_train))
print(Counter(y_test))
In [203]:
# Let's fit the CountVectorizer on the training descriptions and use it to transform both training and test data
count_vect = CountVectorizer()
count_vect.fit(X_train_description)
X_train_titles_vec = text2vector(X_train_titles,count_vect)
X_train_description_vec = text2vector(X_train_description,count_vect)
X_test_titles_vec = text2vector(X_test_titles,count_vect)
X_test_description_vec = text2vector(X_test_description,count_vect)
In [204]:
# Let's join the title vectors and description vectors into one bigger, stronger vector!
x_train_vec = sparse.hstack([X_train_titles_vec, X_train_description_vec])
x_test_vec = sparse.hstack([X_test_titles_vec, X_test_description_vec])
print(x_train_vec.shape)
print(x_test_vec.shape)
In [205]:
# SVM classifier with a linear kernel
clf1 = SVC(kernel='linear').fit(x_train_vec, y_train)
prediction1 = clf1.predict(x_test_vec)
print(accuracy_score(y_test, prediction1))
confusion_matrix(y_test, prediction1)
Out[205]:
In [21]:
# SVM classifier with cross validation
clf1 = SVC(kernel='linear')
trainWithCrossValidation(clf1, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [206]:
# Multinomial Naive Bayes classifier
clf2 = MultinomialNB().fit(x_train_vec, y_train)
prediction2 = clf2.predict(x_test_vec)
print(accuracy_score(y_test, prediction2))
confusion_matrix(y_test, prediction2)
Out[206]:
In [22]:
# Multinomial Naive Bayes classifier with cross validation
clf2 = MultinomialNB()
trainWithCrossValidation(clf2, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [207]:
# Stochastic Gradient Descent classifier (n_iter was renamed max_iter in recent scikit-learn)
clf3 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42).fit(x_train_vec, y_train)
prediction3 = clf3.predict(x_test_vec)
print(accuracy_score(y_test, prediction3))
confusion_matrix(y_test, prediction3)
Out[207]:
In [23]:
# Stochastic Gradient Descent classifier with cross validation
clf3 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
trainWithCrossValidation(clf3, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [208]:
# Neural network: the multilayer perceptron
clf4 = MLPClassifier(random_state=42).fit(x_train_vec, y_train)
prediction4 = clf4.predict(x_test_vec)
print(accuracy_score(y_test, prediction4))
confusion_matrix(y_test, prediction4)
Out[208]:
In [24]:
# Neural network: the multilayer perceptron with cross validation
clf4 = MLPClassifier(random_state=42)
trainWithCrossValidation(clf4, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [209]:
# Random forest classifier
clf5 = RandomForestClassifier(n_estimators=200).fit(x_train_vec, y_train)
prediction5 = clf5.predict(x_test_vec)
print(accuracy_score(y_test, prediction5))
confusion_matrix(y_test, prediction5)
Out[209]:
In [25]:
# Random forest classifier with cross validation
clf5 = RandomForestClassifier(n_estimators=200)
trainWithCrossValidation(clf5, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [210]:
# KNN classifier
clf6 = KNeighborsClassifier(n_neighbors=15).fit(x_train_vec, y_train)
prediction6 = clf6.predict(x_test_vec)
print(accuracy_score(y_test, prediction6))
confusion_matrix(y_test, prediction6)
Out[210]:
In [26]:
# KNN classifier with cross validation
clf6 = KNeighborsClassifier(n_neighbors=15)
trainWithCrossValidation(clf6, data_for_classifier.values, labels_for_classifier.tolist(), 0.2, 4)
In [211]:
# Randomized hyper-parameter search over the three most promising classifiers
c1 = SVC()
k = ['rbf', 'linear', 'poly', 'sigmoid']
c = range(1, 100)
g = np.arange(1e-4, 1e-2, 0.0001).tolist()
param_dist1 = dict(kernel=k, C=c, gamma=g)

c2 = MLPClassifier(random_state=42)
batch_size = range(10, 300, 10)
max_iter = range(10, 300, 10)
learning_rate_init = [0.0001, 0.001, 0.01, 0.1, 0.2]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
activation = ['identity', 'logistic', 'tanh', 'relu']
solver = ['adam', 'sgd', 'lbfgs']
hidden_layer_sizes = range(10, 300, 10)
param_dist2 = dict(batch_size=batch_size, max_iter=max_iter, learning_rate_init=learning_rate_init,
                   momentum=momentum, activation=activation, solver=solver,
                   hidden_layer_sizes=hidden_layer_sizes)

c3 = RandomForestClassifier()
n_estimators = range(10, 500, 10)
criterion = ['gini', 'entropy']
n_jobs = [-1]
param_dist3 = dict(n_estimators=n_estimators, criterion=criterion, n_jobs=n_jobs)

classifiers = [
    ['SVC', c1, param_dist1],
    ['MLPC', c2, param_dist2],
    ['RandomForest', c3, param_dist3],
]

for name, clf, params in classifiers:
    print(name)
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=params,
                                       n_iter=n_iter_search, n_jobs=-1)
    start = time.time()
    random_search.fit(x_train_vec, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time.time() - start), n_iter_search))
    report(random_search.cv_results_)
RandomizedSearchCV took 237.22 seconds for 20 candidate parameter settings.
Model with rank: 1 Mean validation score: 0.926 (std: 0.002) Parameters: {'kernel': 'sigmoid', 'C': 98, 'gamma': 0.008}
Model with rank: 2 Mean validation score: 0.924 (std: 0.002) Parameters: {'kernel': 'rbf', 'C': 70, 'gamma': 0.0034}
Model with rank: 3 Mean validation score: 0.923 (std: 0.001) Parameters: {'kernel': 'rbf', 'C': 59, 'gamma': 0.0034}
RandomizedSearchCV took 8403.01 seconds for 20 candidate parameter settings.
Model with rank: 1 Mean validation score: 0.942 (std: 0.007) Parameters: {'solver': 'adam', 'activation': 'tanh', 'max_iter': 190, 'batch_size': 260, 'learning_rate_init': 0.0001, 'momentum': 0.4, 'hidden_layer_sizes': 40}
Model with rank: 2 Mean validation score: 0.940 (std: 0.006) Parameters: {'solver': 'adam', 'activation': 'tanh', 'max_iter': 30, 'batch_size': 110, 'learning_rate_init': 0.0001, 'momentum': 0.9, 'hidden_layer_sizes': 170}
Model with rank: 3 Mean validation score: 0.937 (std: 0.005) Parameters: {'solver': 'adam', 'activation': 'identity', 'max_iter': 130, 'batch_size': 110, 'learning_rate_init': 0.01, 'momentum': 0.0, 'hidden_layer_sizes': 60}
RandomizedSearchCV took 159.82 seconds for 20 candidate parameter settings.
Model with rank: 1 Mean validation score: 0.934 (std: 0.009) Parameters: {'n_estimators': 360, 'n_jobs': -1, 'criterion': 'gini'}
Model with rank: 2 Mean validation score: 0.933 (std: 0.009) Parameters: {'n_estimators': 430, 'n_jobs': -1, 'criterion': 'gini'}
Model with rank: 2 Mean validation score: 0.933 (std: 0.009) Parameters: {'n_estimators': 310, 'n_jobs': -1, 'criterion': 'gini'}
Model with rank: 2 Mean validation score: 0.933 (std: 0.007) Parameters: {'n_estimators': 170, 'n_jobs': -1, 'criterion': 'gini'}
As shown in the results, the multilayer perceptron achieved the best performance on this type of problem.
Given more computational power and time, it would probably be possible to tune it further and to try different, more complex architectures.
Given a bigger dataset, it would be interesting to use word2vec to preprocess the text data. Every industry has its own technical vocabulary, and a solution like word2vec would be able to associate those specific words with the industry sector, creating a 'meaning' space for each industry.
As for the practical use of these results, a good solution would be to show the client a drop-down list with the three most probable industries given by the classifier, or a selection box where the user just has to click on the appropriate industry sector; a rough sketch of that idea follows below.
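A minimal sketch of that idea, assuming the MLP clf4 fitted on x_train_vec and the count_vect from the cells above; inverse_map and top3_sectors are hypothetical helpers introduced purely for illustration:

In [ ]:
# Hypothetical sketch: the three most probable sectors for a new posting,
# assuming clf4 (MLPClassifier) and count_vect are already fitted as above
inverse_map = {v: k for k, v in mymap.items()}

def top3_sectors(title, description):
    # build the same [title | description] TF vector used during training
    vec = sparse.hstack([text2vector([title], count_vect),
                         text2vector([description], count_vect)])
    probs = clf4.predict_proba(vec)[0]
    top = probs.argsort()[::-1][:3]
    return [(inverse_map[clf4.classes_[i]], round(probs[i], 3)) for i in top]

top3_sectors('Maths Teacher', 'A secondary school is looking for an experienced maths teacher.')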