In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *
In [308]:
def skill_length_hist(skill_df):
    """Plot the distribution of skill length (number of words per skill).

    Parameters
    ----------
    skill_df : pandas.DataFrame
        Must contain an integer-valued column 'n_word'.

    Shows a histogram with a log-scaled y axis and one bar per skill length,
    then closes the figure. Nothing is returned.
    """
    min_n_word = int(np.min(skill_df['n_word']))
    max_n_word = int(np.max(skill_df['n_word']))
    # k distinct lengths need k + 1 bin edges. With only `max_n_word` as the
    # last edge, the final bin [max-1, max] would merge the two longest lengths.
    bin_edges = range(min_n_word, max_n_word + 2)
    plt.hist(skill_df['n_word'], bins=bin_edges, facecolor='blue',
             log=True, align='left', rwidth=.5)
    plt.xlabel('No. of words in skill (skill length)')
    plt.ylabel('No. of skills (log scale)')
    plt.title('Distribution of skill length')
    plt.xticks(range(min_n_word, max_n_word+1))
    plt.grid(True)
    # plt.savefig(REPORT_DIR + 'skill_length.pdf')
    plt.show()
    plt.close()
# end
def freq(skills=None, docs=None, max_n_word=1):
    """Count how often each skill occurs in a collection of documents.

    Parameters
    ----------
    skills : iterable of str
        Used as the fixed vocabulary of the CountVectorizer.
    docs : iterable of str
        Documents to scan.
    max_n_word : int
        Upper bound of the n-gram range (1, max_n_word).

    Returns
    -------
    (df, doc_term_mat)
        df has columns 'skill' and 'occurrence' (total count over all docs);
        doc_term_mat is the document-by-skill count matrix.
    """
    started = time()
    print('Counting occurrence of skills with length <= %d ...' % max_n_word)
    vectorizer = text_manip.CountVectorizer(vocabulary=skills,
                                            ngram_range=(1, max_n_word))
    doc_term_mat = vectorizer.fit_transform(docs)
    print('Done after %.1fs' % (time() - started))

    # Column sums give the total occurrence of each skill over all documents.
    column_totals = np.asarray(doc_term_mat.sum(axis=0)).ravel()
    df = pd.DataFrame({'skill': skills})
    df['occurrence'] = column_totals
    return df, doc_term_mat
def skillsPresentInJD(df):
    """Split a skill table on its 'occurrence' column.

    Returns a pair (rows with occurrence > 0, rows with occurrence == 0).
    """
    counts = df['occurrence']
    present = df[counts > 0]
    absent = df[counts == 0]
    return present, absent
def n_match_skills(df):
    """Return how many rows of `df` have a positive 'occurrence'."""
    has_match = df['occurrence'] > 0
    return int(has_match.sum())
def get_top_words(n_top_words, word_dist, feature_names):
    """Return the highest-weighted words of one topic.

    Parameters
    ----------
    n_top_words : int
        Number of words to keep.
    word_dist : numpy array
        Unnormalised weight of each word.
    feature_names : sequence
        Word belonging to each position of `word_dist`.

    Returns a DataFrame with columns 'top_words' and 'word_probs'
    (weights divided by their sum), ordered from largest weight down.
    """
    norm_word_dist = np.divide(word_dist, sum(word_dist))
    # Ascending argsort, reversed, then truncated: largest weights first.
    top_idx = word_dist.argsort()[::-1][:n_top_words]
    return pd.DataFrame({'top_words': [feature_names[i] for i in top_idx],
                         'word_probs': [norm_word_dist[i] for i in top_idx]})
In [2]:
# Project root and the folders derived from it: cleaned input data and report output.
# NOTE(review): HOME_DIR is an absolute local path; adjust it before running elsewhere.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'
In [304]:
# Load the job-description (JD) table and remember how many rows it has.
jd_csv_path = DATA_DIR + 'jd_df.csv'
jd_df = pd.read_csv(jd_csv_path)
n_jd = len(jd_df.index)
In [305]:
# Re-join the tokens that words_in_doc() yields for each JD into one string.
jd_df['clean_text'] = [' '.join(tokens) for tokens in map(words_in_doc, jd_df['text'])]
print('Some sample JD records:')
jd_df.head(3)
Out[305]:
In [306]:
# Lower-case the cleaned JD text with the vectorised .str accessor. Unlike
# apply(str.lower), it accepts unicode values on Python 2 and passes missing
# values through instead of raising a TypeError.
jd_docs = jd_df['clean_text'].str.lower()
In [3]:
# Load both skill lists and lower-case them the same way. The JD text is
# lower-cased before matching, so a vocabulary entry containing upper-case
# letters could never be found.
# NOTE(review): a no-op if the LinkedIn file is already lower-case -- confirm.
linkedin_skill_df = pd.read_csv(DATA_DIR + 'LinkedInSkillsList_10.csv')
linkedin_skills = linkedin_skill_df['skill'].str.lower()
onet_skill_df = pd.read_csv(DATA_DIR + 'onet_skills_list_all.csv')
onet_skills = onet_skill_df['skill'].str.lower()
In [4]:
# Union of both skill lists. pd.concat replaces Series.append, which newer
# pandas releases no longer provide. Missing values are dropped and the unique
# skills are sorted so the vocabulary order (and every matrix column order
# derived from it) is the same on every run.
skills = pd.concat([linkedin_skills, onet_skills], ignore_index=True)
skills = sorted(set(skills.dropna()))
pd.DataFrame({'skill': skills}).to_csv(DATA_DIR + 'all_skills.csv')
In [10]:
# One row per skill together with its length in words, then a quantile summary.
skill_df = pd.DataFrame({'skill': skills})
skill_df['n_word'] = [n_word(s) for s in skill_df['skill']]
quantile(skill_df['n_word'])
Out[10]:
In [11]:
# Histogram of the number of words per skill.
skill_length_hist(skill_df)
Based on the quartile summary and the distribution, we can try the following options:
In [396]:
# One-row summary of the data set sizes, saved to stats.csv and displayed.
stats_row = {
    '# JDs': n_jd,
    '# LinkedIn skills': len(linkedin_skills),
    '# ONET skills': len(onet_skills),
    'Total no. of unique skills': len(skills),
    'min skill length': min(skill_df['n_word']),
    'max skill length': max(skill_df['n_word']),
}
stats = pd.DataFrame(stats_row, index=[0])
stats.to_csv(DATA_DIR + 'stats.csv')
stats
Out[396]:
In [310]:
# Occurrence table and doc-term matrix when only 1-word n-grams are extracted from the JDs.
unigram_skills, doc_unigram_freq = freq(skills, jd_docs, max_n_word=1)
In [311]:
# Same counting with n-grams of up to 3 words.
trigram_skills, doc_trigram_freq = freq(skills, jd_docs, max_n_word=3)
In [379]:
# sevengram_skills, doc_7gram_freq = freq(skills, jd_docs, max_n_word=7)
In [312]:
# Number of skills found at least once for each n-gram setting.
unigram_match = n_match_skills(df=unigram_skills)
trigram_match = n_match_skills(df=trigram_skills)
# sevengram_match = n_match_skills(df=sevengram_skills)
pd.DataFrame({'# matching skills': [unigram_match, trigram_match]},
             index=['Tokens_1', 'Tokens_3'])
Out[312]:
As the difference between Tokens_3 and Tokens_7 is negligible, we can just use the former for analysis. This means we only include 1-, 2- and 3-gram skills in our subsequent analysis.
In [380]:
# Build the JD-by-skill count matrix with the vectorizer's default (uni-gram) range.
t0 = time()
print('Counting occurrence of uni-gram skills...')
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
elapsed = time() - t0
print('Done after %.1fs' % elapsed)
In [381]:
## For each doc, "its no. of unique uni-grams = no. of non-zero counts" in its row in doc-term mat
def n_non_zero(r, sp_mat):
    """Return the number of non-zero entries in row `r` of sparse matrix `sp_mat`."""
    _, col_idx = sp_mat.getrow(r).nonzero()
    return len(col_idx)
# Number of distinct uni-gram skills found in each JD (one entry per matrix row).
n_unigram = [n_non_zero(r, doc_unigram_freq) for r in range(n_jd)]
# sum(n_unigram) == len(doc_unigram_freq.nonzero()[0]) # sanity check
Out[381]:
In [395]:
# Attach the per-JD count of distinct uni-gram skills and summarise it.
jd_df['n_unigram'] = n_unigram
# print(...) with parentheses matches the rest of the notebook and works on Python 2 and 3.
print(quantile(n_unigram))
In [394]:
# pull up some JDs to check
tmp = jd_df.query('n_unigram == 2')
# print(...) with parentheses matches the rest of the notebook and works on Python 2 and 3.
print(tmp.shape)
tmp[['job_id', 'clean_text', 'n_unigram']].head(10)
Out[394]:
Based on the quartile summary, $50$% of documents contain $\leq 14$ uni-gram skills and $75$% of them contain $\leq 22$ uni-gram skills (which looks reasonable).
In [387]:
# Histogram of the number of distinct uni-gram skills per JD.
fig = plt.figure(figsize=(10, 6))
# One bin per integer count. Using the unique observed values as edges gives
# unevenly wide bins and merges the two largest counts into the last bin.
count_edges = np.arange(min(n_unigram), max(n_unigram) + 2)
ns, bins, patches = plt.hist(n_unigram, bins=count_edges, align='left', rwidth=.5)
plt.xlabel('No. of unique uni-gram skills in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of uni-gram skills in job descriptions')
plt.grid(True)
# Save before show(): some backends release the figure once it has been shown.
fig.savefig(REPORT_DIR + 'unigram_in_jd.pdf')
plt.show()
plt.close(fig)
In [386]:
# Share of JDs in which no uni-gram skill was found.
no_unigram_ratio = n_unigram.count(0)/float(n_jd)
print('Ratio of JDs with no uni-gram skills: %.3f' % round(no_unigram_ratio, 3))
In [313]:
# Length of each JD in words and the total count of uni-gram skill tokens in it.
jd_df['length'] = [n_words_in_doc(d) for d in jd_df['text']]
# print sum(jd_df['length'] == 0)
jd_df['unigram_freq'] = doc_unigram_freq.sum(axis=1).A1
# Work on an explicit copy: dropping and adding columns on the frame returned
# by query() triggers pandas' SettingWithCopy warning and may not stick.
clean_jd_df = jd_df.query('length > 0').copy()
del clean_jd_df['text']
# Multiply by 1. to force float division (Python 2 integer semantics).
clean_jd_df['unigram_ratio'] = np.divide(clean_jd_df['unigram_freq'], clean_jd_df['length']*1.)
clean_jd_df = clean_jd_df.sort_values(by='unigram_ratio', ascending=False)
In [383]:
# Quantile summary of the per-JD uni-gram ratio.
# print(...) with parentheses matches the rest of the notebook and works on Python 2 and 3.
print(quantile(clean_jd_df['unigram_ratio'], dec=2))
# clean_jd_df.to_csv(DATA_DIR + 'jd_df.csv')
In [318]:
# Histogram of the share of each JD's words that are uni-gram skill tokens.
unigram_ratio = clean_jd_df['unigram_ratio']
plt.hist(unigram_ratio)
plt.xlabel('Ratio of unigram skill tokens in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of ratio of unigram skills')
unigram_ratio_pdf = REPORT_DIR + 'unigram_ratio.pdf'
plt.savefig(unigram_ratio_pdf)
plt.show()
plt.close()
Decision: Based on the distribution, in half of the JDs uni-gram skills make up $\leq 20$% of the words (and in 75% of the JDs they make up $\leq 26$% of the words). Thus, we may first want to try the sub-dataset of JDs where uni-gram skills occupy at least $20$% of the words.
In [429]:
# Inspect which columns jd_df currently has.
jd_df.columns
Out[429]:
In [430]:
# Keep JDs with at least 5 distinct uni-gram skills, longest JDs first.
sub_df = jd_df.query('n_unigram >= 5')
sub_df = sub_df.sort_values(by='length', ascending=False)
# print(...) with parentheses matches the rest of the notebook and works on Python 2 and 3.
print(sub_df.shape)
# sub_df.head(10)
In [431]:
# Recount uni-gram skills on the sub-dataset only; this rebinds doc_unigram_freq.
unigram_df, doc_unigram_freq = freq(skills, max_n_word=1, docs=sub_df['clean_text'])
doc_unigram_freq.shape
Out[431]:
In [432]:
# Restrict the vocabulary to skills that occur in the sub-dataset, then rebuild
# the doc-term matrix with only those columns.
occur_unigrams = unigram_df.query('occurrence > 0')['skill']
unigram_df, doc_unigram_freq = freq(skills=occur_unigrams, docs=sub_df['clean_text'])
doc_unigram_freq.shape
Out[432]:
In [434]:
# For every skill (matrix column), count the documents that contain it.
n_unigram_skill = doc_unigram_freq.shape[1]
# Transpose once, outside the loop, and convert to CSR so getrow() is cheap.
# The original rebuilt the transposed matrix for every single skill.
skill_doc_freq = doc_unigram_freq.transpose().tocsr()
n_document = [n_non_zero(r, skill_doc_freq) for r in range(n_unigram_skill)]
In [447]:
# Skills ranked by the number of documents containing them; the last line
# counts skills that appear in exactly one document.
skill_doc_counts = pd.DataFrame({'skill' : occur_unigrams, 'n_doc' : n_document})
tmp_df = skill_doc_counts.sort_values(by='n_doc', ascending=False)
sum(tmp_df['n_doc'] == 1)
Out[447]:
In [435]:
# Quantile summary of the number of documents per skill.
quantile(n_document)
Out[435]:
In [439]:
# Log-scale histogram of the number of documents per skill.
# NOTE(review): the bin edges are the unique observed values, so bins are unevenly
# wide and the last bin covers the two largest values -- confirm this is intended.
plt.hist(n_document, bins=np.unique(n_document), log=True)
# plt.xlabel('No. of documents containing the skill')
# plt.ylabel
plt.show()
In [406]:
# Split the rows of the doc-term matrix into training and test parts.
n_ins = len(sub_df.index)
train_idx, test_idx = mkPartition(n_instances=n_ins)
X_train = doc_unigram_freq[train_idx, :]
X_test = doc_unigram_freq[test_idx, :]
In [411]:
# ks, n_top_words = range(2, 11), 10
# Candidate numbers of topics (5, 10, 15, 20) and the number of top words kept per topic.
ks, n_top_words = range(5, 25, 5), 10
# Folder that receives this run's result files.
RES_DIR = REPORT_DIR + 'r4/'
In [412]:
# Fit one NMF model per candidate number of topics and record its training
# reconstruction error.
rnmf = dict((k, NMF(n_components=k, random_state=0)) for k in ks)
print( "Fitting NMF using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins) ) # (random initialization)
print('No. of topics, Error, Running time')
rnmf_error = []
for k in ks:
    model = rnmf[k]
    t0 = time()
    model.fit(X_train)
    elapsed = time() - t0
    err = model.reconstruction_err_
    print('%d, %0.1f, %0.1fs' % (k, err, elapsed))
    rnmf_error.append(err)
# end
First, we choose the best no. of topics $k^*$ for random NMF as the one that minimizes the error of predicting test data. For that, we compute the error for different $k$'s by the following function.
In [357]:
from numpy import linalg as la
def cal_test_err(models, X=None, n_topics=None):
    """Compute the reconstruction error of fitted NMF models on held-out data.

    Parameters
    ----------
    models : dict
        Maps a number of topics k to a fitted model exposing `components_`
        and `transform`.
    X : matrix, optional
        Held-out document-term matrix. Defaults to the notebook-level `X_test`.
    n_topics : iterable of int, optional
        Keys of `models` to evaluate, in order. Defaults to the notebook-level `ks`.

    Returns
    -------
    list of float
        norm(X - W_test * H) for each k, where H is the topic matrix learned
        on the training data and W_test is the projection of X onto it.
    """
    if X is None:
        X = X_test
    if n_topics is None:
        n_topics = ks

    test_error = []
    print('No. of topics, Test error, Running time')
    for k in n_topics:
        t0 = time()
        model = models[k]
        H = model.components_
        # transform() keeps H fixed and only solves for W. The previous
        # fit_transform(X_test, H=H) re-fitted the model on the test data
        # (H is only an initial guess, and only when init='custom'),
        # overwriting components_ and pairing a new W with the old H.
        W_test = model.transform(X)
        err = la.norm(X - np.dot(W_test, H))
        test_error.append(err)
        print('%d, %0.1f, %0.1fs' % (k, err, time() - t0))
    return test_error
In [413]:
# Test error of every fitted NMF model, in the order of ks.
print('Calculating test errors of random NMF to choose best no. of topics...')
rnmf_test_error = cal_test_err(models=rnmf)
In [414]:
# Pick the number of topics with the smallest test error and normalise each
# row (topic) of its component matrix with normalize().
# NOTE: the name best_k is re-used later for the LDA model selection.
best_k = ks[np.argmin(rnmf_test_error)]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]
rnmf_word_dists = pd.DataFrame(rnmf_best.components_).apply(normalize, axis=1)
# rnmf_word_dists.to_csv(RES_DIR + 'rnmf_word_dists.csv', index=False)
We now validate NMF results in the following ways:
We manually label each topic based on its top 10 words.
In [360]:
# count_vectorizer = text_manip.CountVectorizer(vocabulary= occur_skills['skill'], ngram_range=(1, 3))
# count_vectorizer.fit_transform(jd_docs)
# nmf_features = count_vectorizer.get_feature_names()
# nmf_top_words = top_words_df(rnmf_best, n_top_words, nmf_features)
# nmf_top_words.to_csv(REPORT_DIR + 'nmf_top_words.csv', index=False)
# pd.DataFrame(nmf_features).to_csv(REPORT_DIR + 'nmf_features.csv')
In [361]:
# H = rnmf_best.components_
# W_test = pd.DataFrame(rnmf_best.fit_transform(X_test, H=H))
# W_test.to_csv(REPORT_DIR + 'nmf_doc_topic_distr.csv', index=False)
In [415]:
# Fit one online LDA model per candidate number of topics and record the
# score() it reports on the training data.
# NOTE(review): newer scikit-learn releases call the `n_topics` argument
# `n_components` -- confirm against the installed version.
scores = []
lda = {}
for k in ks:
    lda[k] = LatentDirichletAllocation(n_topics=k, max_iter=5, learning_method='online',
                                       learning_offset=50., random_state=0) # verbose=1
print("Fitting LDA using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins))
print('No. of topics, Log-likelihood, Running time')
for k in ks:
    model = lda[k]
    t0 = time()
    model.fit(X_train)
    s = model.score(X_train)
    print('%d, %0.1f, %0.1fs' % (k, s, time() - t0))
    scores.append(s)
# end
In [416]:
# Perplexity of every LDA model on the test data, saved next to the other results.
perp = []
for k in ks:
    perp.append(lda[k].perplexity(X_test))
perp_df = pd.DataFrame({'No. of topics': ks, 'Perplexity': perp})
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)
In [417]:
# Number of topics with the lowest test perplexity.
# NOTE: this overwrites the best_k chosen earlier for NMF.
best_k = ks[np.argmin(perp)]
print('Best no. of topics: %d' %best_k)
In [365]:
# lda_best = lda[best_k]
# lda_word_dists = pd.DataFrame(lda_best.components_).apply(normalize, axis=1)
# lda_word_dists.to_csv(RES_DIR + 'lda_word_dists.csv', index=False)
# lda_features = count_vectorizer.get_feature_names()
# lda_topics = top_words_df(lda_best, n_top_words, lda_features)
# lda_topics.to_csv(REPORT_DIR + 'lda_topics.csv', index=False)
# pd.DataFrame(lda_features).to_csv(RES_DIR + 'lda_features.csv')
In [366]:
# doc_topic_distr = lda_best.transform(X_test)
# pd.DataFrame(doc_topic_distr).to_csv(RES_DIR + 'lda_doc_topic_distr.csv', index=False)
In [421]:
# Put all model metrics on training & test datasets into 2 data frames
model_list = ['LDA', 'randomNMF']
train_columns = {'No. of topics': ks,
                 'LDA': np.divide(scores, 10**6),  # scaled to units of 10^6
                 'randomNMF': rnmf_error}
test_columns = {'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error}
train_metric = pd.DataFrame(train_columns)
test_metric = pd.DataFrame(test_columns)
In [424]:
# Display the candidate numbers of topics.
ks
Out[424]:
In [423]:
# One panel per model showing its training metric against the number of topics.
fig = plt.figure(figsize=(10, 6))
lda_ylabel = r'Log likelihood ($\times 10^6$)'
nmf_ylabel = r'$\| X_{train} - W_{train} H \|_2$'
for i, model in enumerate(model_list):
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)
    # train metric
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    plt.xlabel('No. of topics')
    plt.ylabel(lda_ylabel if model == 'LDA' else nmf_ylabel)
    plt.grid(True)
    # test metric
    # plt.subplot(2, 2, i+3)
    # plt.title(model)
    # plt.plot(ks, test_metric[model], 'r')
    # plt.xlabel('No. of topics')
    # if model == 'LDA':
    #     plt.ylabel(r'Perplexity')
    # else:
    #     plt.ylabel(r'$\| X_{test} - W_{test} H \|_2$')
    # plt.grid(True)
# end
plt.show()
# fig.savefig(RES_DIR + 'new_lda_vs_nmf.pdf')
plt.close(fig)
In [369]:
# Count 2- and 3-word skills in the JDs.
t0 = time()
print('Counting occurrence of multi-gram skills...')
multi_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills,
                                                   ngram_range=(2, 3))
multigram_doc_mat = multi_gram_vectorizer.fit_transform(jd_docs)
elapsed = time() - t0
print('Done after %.1fs' % elapsed)
In [370]:
# Number of distinct multi-gram skills in each JD.
# The multi-gram matrix is named `multigram_doc_mat` where it is built;
# `doc_multigram_freq` was never defined and raised a NameError.
n_multigram = [n_non_zero(r, multigram_doc_mat) for r in range(n_jd)]
multigram_df = pd.DataFrame({'jd_id': jd_df['job_id'], 'n_multigram': n_multigram})
quantile(n_multigram)
Out[370]:
In [371]:
# Histogram of the number of multi-gram skills per JD.
plt.hist(n_multigram, bins=np.unique(n_multigram))
plt.xlabel('No. of multi-grams in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of multi-grams in job descriptions')
plt.grid(True)
# Save before show(): once the figure has been shown, pyplot no longer has a
# current figure holding the plot, and savefig() would write an empty page.
plt.savefig(REPORT_DIR + 'multigram_in_jd.pdf')
plt.show()
plt.close()
In [372]:
# NOTE: `occur_skills` is created in a cell further down this notebook (In [18]);
# that cell must have been run first. `unigram_skills` is rebound here to a
# Series of skill names.
unigram_skills, bigram_skills = occur_skills.query('n_word == 1')['skill'], occur_skills.query('n_word == 2')['skill']


def _word_offset(word, phrase):
    """Return the character offset of the first whole-word occurrence of
    `word` in the space-separated `phrase`, or -1 if it is not one of its words."""
    offset = 0
    for token in phrase.split(' '):
        if token == word:
            return offset
        offset += len(token) + 1
    return -1


## for each unigram, find no. of bigrams containing it (aka super-bigrams)
def super_bigrams(unigram='business', bigrams=bigram_skills):
    """Return the bigrams that contain `unigram` as a whole word.

    The result has columns 'bigram' and 'idx_of_given_unigram' (character
    offset of the unigram inside the bigram). Whole-word matching is used
    because a plain substring search also matched, e.g., 'art' inside 'smart'.
    """
    idx = [_word_offset(unigram, s) for s in bigrams]
    df = pd.DataFrame({'bigram': bigrams, 'idx_of_given_unigram': idx})
    return df.query('idx_of_given_unigram > -1') # those bigrams not containing the unigram give -1 indices


def n_super_bigrams(unigram='business', bigrams=bigram_skills):
    """Return the number of bigrams that contain `unigram` as a whole word."""
    return sum(1 for s in bigrams if _word_offset(unigram, s) > -1)
In [373]:
# super_bigrams(unigram='business')
# The list below keeps the name `n_super_bigrams` because later cells read it,
# but that rebinding shadows the function of the same name. Counting through
# super_bigrams() keeps this cell re-runnable; calling n_super_bigrams() here
# failed with "'list' object is not callable" on the second run.
n_super_bigrams = [len(super_bigrams(ug)) for ug in unigram_skills]
overlap_df = pd.DataFrame({'unigram': unigram_skills, 'n_super_bigrams': n_super_bigrams})
quantile(n_super_bigrams)
Out[373]:
In [399]:
# Histogram of the number of super-bigrams per unigram skill.
plt.hist(n_super_bigrams)
plt.show()
plt.close()
In [402]:
# Unigram skills that are part of at least one bigram skill, most-overlapped first.
overlapped_unigrams = overlap_df.query('n_super_bigrams > 0')
overlapped_unigrams = overlapped_unigrams.sort_values(by='n_super_bigrams', ascending=False)
# overlapped_unigrams.head(10)
n_overlap = overlapped_unigrams.shape[0]
n_unigram_skills = overlap_df.shape[0]
# print(...) with parentheses matches the rest of the notebook and works on Python 2 and 3.
print(n_overlap)
print(n_unigram_skills)
# Share of unigram skills that overlap with some bigram skill.
n_overlap*1./n_unigram_skills
Out[402]:
In [375]:
# Order the uni-gram skills from most to least frequent.
unigram_df = unigram_df.sort_values(by='occurrence', ascending=False)
In [376]:
# First 10 rows of unigram_df in its current order.
top10 = unigram_df[['skill', 'occurrence']].head(10)
# top10.reset_index(inplace=True, drop=True)
top10
Out[376]:
In [377]:
# Last 10 rows of unigram_df in its current order, re-indexed from 0.
bottom10 = unigram_df[['skill', 'occurrence']].tail(10).reset_index(drop=True)
print(bottom10)
In [19]:
# Quantile summary of occurrence for 1-word skills.
quantile(occur_skills.query('n_word == 1')['occurrence'])
Out[19]:
In [20]:
# Quantile summary of occurrence for 2-word skills.
quantile(occur_skills.query('n_word == 2')['occurrence'])
Out[20]:
In [21]:
# Quantile summary of occurrence for 3-word skills.
quantile(occur_skills.query('n_word == 3')['occurrence'])
Out[21]:
In [22]:
# Log-scale histogram of skill frequency, with frequency shown in thousands.
occurrence_in_thousands = occur_skills['occurrence']/10**3
n, bins, patches = plt.hist(x=occurrence_in_thousands, bins=50,
                            facecolor='blue', alpha=0.75, log=True)
plt.title('Histogram of skill frequency')
plt.xlabel(r'Frequency ($\times 10^3$)') # in thousand
plt.ylabel('No. of skills (log scale)')
plt.ylim(1, 10**4)
plt.grid(True)
skill_occur_pdf = REPORT_DIR + 'skill_occur.pdf'
plt.savefig(skill_occur_pdf)
plt.show()
plt.close()
In [378]:
# bi_gram_skills.sort_values(by='occurrence', inplace=True, ascending=False)
# print('10 most common bi-gram skills')
# print(bi_gram_skills.head(10))
In [18]:
# Split the trigram skill table into occurring / non-occurring skills and rank
# the occurring ones by frequency.
present_skills, no_occur_skills = skillsPresentInJD(df=trigram_skills)
occur_skills = present_skills.sort_values(by='occurrence', ascending=False) # inplace=True,
In [94]:
# occur_skills['skill'].head(3)
# Rebuild the doc-skill matrix using only the skills that actually occur.
# `docs` must be passed explicitly: freq() defaults it to None, and the
# vectorizer cannot process None.
trigram_skills, doc_trigram_freq = freq(occur_skills['skill'], docs=jd_docs, max_n_word=3)
In [98]:
# Number of columns (skills) in the rebuilt doc-skill matrix.
print('No. of skills in the new doc-skill matrix after re-building: %d' %doc_trigram_freq.shape[1] )
In [97]:
# Save the trigram skill table and the stored values of the doc-skill matrix.
# NOTE(review): `.data` holds only the stored non-zero counts; their row/column
# positions are not written, so the matrix cannot be rebuilt from this CSV alone --
# confirm whether the indices should be saved as well.
trigram_skills.to_csv(REPORT_DIR + 'trigram_skills.csv')
pd.DataFrame(doc_trigram_freq.data).to_csv(REPORT_DIR + 'doc_trigram_freq.csv')
# unigram_skills.to_csv(REPORT_DIR + 'unigram_skills.csv')
# pd.DataFrame(doc_unigram_freq.data).to_csv(REPORT_DIR + 'doc_unigram_freq.csv')
# sevengram_skills.to_csv(REPORT_DIR + 'sevengram_skills.csv')