In [17]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, reused for the stemmed term counts below.
porter_stemmer = PorterStemmer()
# Default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
In [2]:
#load data (don't forget to download first)
data = pd.read_csv('./data/Train_rev1.csv')
data.head()
Out[2]:
In [3]:
data.SalaryNormalized.hist(); plt.ylabel('frequency'); plt.xlabel(u'salary (£)'); plt.yscale('log');
In [10]:
cachedStopWords = stopwords.words("english") #cache stop words to speed-up removing them.
wordset = set()
text = ' '.join(
data['Title'].replace(r'[^0-9a-zA-Z]+',' ',regex=True)
.fillna('').str.lower()
)
data['Title'].replace(r'[^0-9a-zA-Z]+',' ',regex=True).fillna('').str.lower().str.split().apply(wordset.update)
print(list(wordset)[1:100])
most_common_terms = Counter([w for w in text.split(' ') if w not in cachedStopWords]).most_common(50)
In [11]:
labels, values = zip(*most_common_terms)
indexes = np.arange(len(labels))
width = 1.0
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels, rotation='vertical')
plt.show()
In [12]:
from wordcloud import WordCloud
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
Out[12]:
[Stemming](https://en.wikipedia.org/wiki/Stemming) maps related word forms (e.g. engineer/engineering) to a single stem, so they are counted as one term. The Porter stemmer is used here.
These stemmed title words, together with the location and words from the job descriptions, will be used as features when fitting the model.
In [18]:
most_common_terms = Counter([porter_stemmer.stem(w) for w in text.split(' ') if w not in cachedStopWords]).most_common(50)
In [19]:
labels, values = zip(*most_common_terms)
indexes = np.arange(len(labels))
width = 1.0
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels, rotation='vertical')
plt.show()