In [17]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, reused for the stemmed term counts below.
porter_stemmer = PorterStemmer()
# Default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
In [2]:
#load data (don't forget to download first)
data = pd.read_csv('./data/Train_rev1.csv')
data.head()
Out[2]:
In [3]:
data.SalaryNormalized.hist(); plt.ylabel('frequency'); plt.xlabel(u'salary (£)'); plt.yscale('log');
In [10]:
cachedStopWords = stopwords.words("english") #cache stop words to speed-up removing them.
wordset = set()
text = ' '.join(
data['Title'].replace(r'[^0-9a-zA-Z]+',' ',regex=True)
.fillna('').str.lower()
)
data['Title'].replace(r'[^0-9a-zA-Z]+',' ',regex=True).fillna('').str.lower().str.split().apply(wordset.update)
print(list(wordset)[1:100])
most_common_terms = Counter([w for w in text.split(' ') if w not in cachedStopWords]).most_common(50)
In [11]:
labels, values = zip(*most_common_terms)
indexes = np.arange(len(labels))
width = 1.0
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels, rotation='vertical')
plt.show()
In [12]:
from wordcloud import WordCloud
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
Out[12]:
[Stemming](https://en.wikipedia.org/wiki/Stemming) maps related word forms (e.g. engineer/engineering) to a single stem, so they are counted as one term. The Porter stemmer is used here.
These stemmed title words, together with the location and words from the job descriptions, will be used as features when fitting the model.
In [18]:
most_common_terms = Counter([porter_stemmer.stem(w) for w in text.split(' ') if w not in cachedStopWords]).most_common(50)
In [19]:
labels, values = zip(*most_common_terms)
indexes = np.arange(len(labels))
width = 1.0
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels, rotation='vertical')
plt.show()