In [17]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Default size (width, height) applied to every figure drawn below.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
# Single shared Porter stemmer instance, used by the stemming cell further down.
porter_stemmer = PorterStemmer()

In [2]:
# Load the training set (the CSV has to be downloaded into ./data/ first).
train_csv_path = './data/Train_rev1.csv'
data = pd.read_csv(train_csv_path)
# Preview the first rows as the cell output.
data.head()


Out[2]:
Id Title FullDescription LocationRaw LocationNormalized ContractType ContractTime Company Category SalaryRaw SalaryNormalized SourceName
0 12612628 Engineering Systems Analyst Engineering Systems Analyst Dorking Surrey Sal... Dorking, Surrey, Surrey Dorking NaN permanent Gregory Martin International Engineering Jobs 20000 - 30000/annum 20-30K 25000 cv-library.co.uk
1 12612830 Stress Engineer Glasgow Stress Engineer Glasgow Salary **** to **** We... Glasgow, Scotland, Scotland Glasgow NaN permanent Gregory Martin International Engineering Jobs 25000 - 35000/annum 25-35K 30000 cv-library.co.uk
2 12612844 Modelling and simulation analyst Mathematical Modeller / Simulation Analyst / O... Hampshire, South East, South East Hampshire NaN permanent Gregory Martin International Engineering Jobs 20000 - 40000/annum 20-40K 30000 cv-library.co.uk
3 12613049 Engineering Systems Analyst / Mathematical Mod... Engineering Systems Analyst / Mathematical Mod... Surrey, South East, South East Surrey NaN permanent Gregory Martin International Engineering Jobs 25000 - 30000/annum 25K-30K negotiable 27500 cv-library.co.uk
4 12613647 Pioneer, Miser Engineering Systems Analyst Pioneer, Miser Engineering Systems Analyst Do... Surrey, South East, South East Surrey NaN permanent Gregory Martin International Engineering Jobs 20000 - 30000/annum 20-30K 25000 cv-library.co.uk

plot salary

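The normalized salary is the target variable. Its histogram looks fairly straight when the y-axis is log-scaled, i.e. the frequencies fall off roughly exponentially with salary, which suggests that a simple linear regression on categories isn't going to cut it.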


In [3]:
# Histogram of the target variable, with a log-scaled frequency axis.
data['SalaryNormalized'].hist()
plt.xlabel(u'salary (£)')
plt.ylabel('frequency')
plt.yscale('log');


explore job title

Find all unique words that appear in the job titles, and count the most common terms after removing English stop words.


In [10]:
# Stop words are fetched once. The list is kept under its original name for
# later cells; a set is built from it so that the per-word membership tests
# below are O(1) instead of a linear scan of the list.
cachedStopWords = stopwords.words("english")
stop_word_set = set(cachedStopWords)

# Normalise every title once: runs of non-alphanumeric characters become a
# single space, missing titles become '', and everything is lower-cased.
clean_titles = (
    data['Title']
    .replace(r'[^0-9a-zA-Z]+', ' ', regex=True)
    .fillna('')
    .str.lower()
)

# All titles joined into one string (also the input of the word cloud cell).
text = ' '.join(clean_titles)

# Vocabulary of distinct words seen in any title.
wordset = set()
for title_words in clean_titles.str.split():
    wordset.update(title_words)
# Arbitrary sample of the vocabulary: set ordering carries no meaning.
print(list(wordset)[:100])

# split() with no argument collapses runs of whitespace. Splitting on a single
# ' ' would emit '' for every doubled space left behind by punctuation or by
# empty titles, and '' would then be counted as a "term".
most_common_terms = Counter(
    w for w in text.split() if w not in stop_word_set
).most_common(50)


['webdeveloper', 'mecahic', 'smjs', 'tickdata', 'hopsital', 'holyrood', 'four', 'opengl', 'woods', 'nursingrgn', 'nhiberna', 'cyprus', 'verwood', 'whizzkidz', 'authorit', '2550mmch', 'analytic', 'staffmin', 'eligible', 'electricity', 'aggregations', 'profound', 'mecca', 'opener', 'inwards', 'farsley', 'dependency', 'appointers', 'delhi', 'weekslive', 'osat', 'deli', 'commercialisation', 'interpreters', 'regional', 'dell', 'bratislava', 'loughton', 'hdtv', 'executiveconsumer', 'unify', 'cjun', 'dementiaessex', 'designunigraphics', 'commision', 'internally', 'stadia', 'umts', 'timefashion', 'reliabilty', 'combermere', 'rabbitmq', 'piling', 'recruitmentnottingham', 'ehorb', 'auntomation', 'telesalestilehurst', 'wisbeach', 'succession', 'basick', 'mlh', 'libjingle', 'croatian', 'charter', 'javasript', 'nigh', 'tired', 'miller', 'bankers', 'watbio', 'shorthaul', 'pulse', 'feasibility', 'elegant', 'second', '275', 'sustaining', 'workless', 'esol', 'cheetham', 'sterile', 'managercyps', 'presswork', 'executivetravel', 'southbank', 'cheflondonmichelin', 'partiemichelin', 'cooking', 'illford', 'minolta', 'teignmouth', 'gujarati', 'bonuses', 'optimization', 'secretarypartner', 'shopware', 'locationexcel', 'increasing', 'admiral']

In [11]:
# Bar chart of the most common title terms and their counts.
labels, values = zip(*most_common_terms)

indexes = np.arange(len(labels))
width = 1.0

# Centre the bars on the integer positions explicitly and put the ticks on the
# same positions. The former `indexes + width * 0.5` tick offset only lined up
# with edge-aligned bars, which is not the default in matplotlib >= 2.0.
plt.bar(indexes, values, width, align='center')
plt.xticks(indexes, labels, rotation='vertical')
plt.xlabel('term')
plt.ylabel('count')
plt.title('Most common job-title terms')
plt.show()


plot job title as a word cloud


In [12]:
from wordcloud import WordCloud

# WordCloud lays words out using a random number generator; seeding it makes
# the figure reproducible across kernel restarts.
wordcloud = WordCloud(
    max_font_size=40,
    relative_scaling=.5,
    random_state=42,
).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")


Out[12]:
(-0.5, 399.5, 199.5, -0.5)

Stem words to reduce similarity amongst features

Stemming maps related word forms (e.g. engineer/engineering) to a single common stem. The Porter stemmer is used here; see the Wikipedia article on stemming: https://en.wikipedia.org/wiki/Stemming


These words along with location and words from job descriptions will be used as features in the fitting.


In [18]:
# Count stemmed title terms.
# - text.split() (no argument) collapses runs of whitespace, so doubled spaces
#   in `text` do not produce '' tokens that would be counted as a term.
# - Raw words are counted first and each distinct word is stemmed only once;
#   the totals per stem are the same as stemming every occurrence.
stop_word_set = set(cachedStopWords)  # O(1) membership tests
stem_counts = Counter()
for word, count in Counter(text.split()).items():
    if word not in stop_word_set:
        stem_counts[porter_stemmer.stem(word)] += count
most_common_terms = stem_counts.most_common(50)

In [19]:
# Bar chart of the most common stemmed title terms and their counts.
labels, values = zip(*most_common_terms)

indexes = np.arange(len(labels))
width = 1.0

# Centre the bars on the integer positions explicitly and put the ticks on the
# same positions. The former `indexes + width * 0.5` tick offset only lined up
# with edge-aligned bars, which is not the default in matplotlib >= 2.0.
plt.bar(indexes, values, width, align='center')
plt.xticks(indexes, labels, rotation='vertical')
plt.xlabel('stemmed term')
plt.ylabel('count')
plt.title('Most common stemmed job-title terms')
plt.show()