Preparations

  • Import libraries:

In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc

from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *  # project helpers used below: quantile, mkPartition, extractJDs, extractSkills, top_words_df, normalize, cal_test_err, plotMetrics
  • Load data

In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'

# job descriptions (JDs)
init_posts = pd.read_csv(DATA_DIR + 'jd_df.csv')

In [8]:
skill_df = pd.read_csv(REPORT_DIR + 'skill_stats.csv')
skill_df.head(3)


Out[8]:
        skill  n_word    freq
0    business       1  170302
1  management       1  161636
2     support       1  156517

In [20]:
init_skills = skill_df['skill']
jd_docs = list(init_posts['clean_text'].apply(str.lower))

n_skill, n_jd = len(init_skills), init_posts.shape[0]
print('Initial no. of skills: %d' %n_skill)
print('Initial no. of JDs: %d' %n_jd)  # garbage JDs with no text were already removed


Initial no. of skills: 44919
Initial no. of JDs: 263411

Distribution of unigram, bigram and trigram skills


In [27]:
uni_gram_skills = list(skill_df.query('n_word == 1')['skill'])
bi_gram_skills = list(skill_df.query('n_word == 2')['skill'])
tri_gram_skills = list(skill_df.query('n_word == 3')['skill'])

pd.DataFrame({'n_unigram_skill': len(uni_gram_skills), 'n_bigram_skill': len(bi_gram_skills), 
              'n_trigram_skill': len(tri_gram_skills)}, index=[0])


Out[27]:
   n_bigram_skill  n_trigram_skill  n_unigram_skill
0           20386            10778             7537

  • Keep only skills occurring at least once in the JDs:

In [31]:
skills = pd.Series.unique(skill_df.query('freq > 0')['skill'])
len(skills)


Out[31]:
14829
No. of unique uni-grams per document

In [ ]:
t0 = time()
print('Counting occurrences of uni-gram skills...')
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))

# For each doc, its no. of unique uni-grams equals the no. of non-zero counts
# in its row of the doc-term matrix
def n_non_zero(r, sp_mat):
    return len(sp_mat.getrow(r).nonzero()[1])
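
For instance, n_non_zero(0, doc_unigram_freq) gives the no. of distinct uni-gram skills in the first JD; the binarized row sums in the next cell compute the same quantity for all JDs at once.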

In [12]:
# binary_vectorizer = text_manip.CountVectorizer(vocabulary=skills, binary=True)
# print('Marking unique unigram skills in JDs...')
# t0 = time()
# doc_unigram_occurrence = binary_vectorizer.fit_transform(jd_docs)
# print('Done after %.1fs' %(time() - t0))
# init_posts['n_uniq_unigram'] = doc_unigram_occurrence.sum(axis=1).A1
quantile(init_posts['n_uniq_unigram'])


Out[12]:
   min  25%  50% (median)   75%    max
0  0.0  8.0          14.0  22.0  119.0
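
quantile is a my_util helper; a minimal sketch that would reproduce the five-number summary above:

In [ ]:
# Hedged sketch of my_util.quantile: five-number summary as a one-row DataFrame.
def quantile(series):
    qs = series.quantile([0, .25, .5, .75, 1])
    return pd.DataFrame([qs.values],
                        columns=['min', '25%', '50% (median)', '75%', 'max'])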

In [ ]:
plt.hist(init_posts['n_uniq_unigram'], bins=np.unique(init_posts['n_uniq_unigram']))
plt.xlabel('no. of unique unigrams in JD')
plt.ylabel('no. of JDs')

plt.show()

No. of unique skills per JD

Here each skill can be a uni-, bi-, or tri-gram (i.e. at most 3 words long).
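
As a quick illustration (with a tiny hypothetical vocabulary), a vocabulary-restricted CountVectorizer with an ngram_range spanning 1 to 3 words treats a multi-word skill as a single feature:

In [ ]:
# Hypothetical two-skill vocabulary, just for illustration.
demo_vect = text_manip.CountVectorizer(vocabulary=['java', 'project management'],
                                       ngram_range=(1, 3), binary=True)
demo_vect.fit_transform(['project management with java']).toarray()
# -> array([[1, 1]]): both 'java' and 'project management' are marked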

Remove skills never occurring in JDs

This step was already done in a previous run, so there is no need to repeat it.

From now on, we work with the set of occurring skills (occur_skills in the code below).


In [ ]:
# Count no. of unique skills in each JD with a binary vectorizer;
# occur_skills and max_n_word (the max skill length in words) carry over from the previous run
binary_vectorizer = text_manip.CountVectorizer(vocabulary=occur_skills, ngram_range=(1, max_n_word), binary=True)
t0 = time()
print('Marking occurrence of skills with length <= %d ...' %max_n_word)
doc_skill_occurrence = binary_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))

init_posts['n_uniq_skill'] = doc_skill_occurrence.sum(axis=1).A1
quantile(init_posts['n_uniq_skill'])

Filtering

There are two goals: i) remove JDs containing too few skills, and ii) remove skills occurring in too few JDs. Since each removal can re-violate the other goal, we repeat the following process (implemented by the helpers sketched after this list) until it converges, i.e. until a pass removes less than 2% of the remaining posts and skills:

  • Count the no. of unique skills in each JD
  • Remove JDs with $\leq 1$ skill
  • Count the no. of JDs containing each skill
  • Remove skills occurring in $\leq 1$ JD
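
extractJDs and extractSkills are imported from my_util and not defined in this notebook. A minimal sketch of what they might look like, assuming the same binary-vectorizer counting used above (signatures inferred from the calls below):

In [ ]:
# Hedged sketches of the my_util filtering helpers (assumed behavior).
def extractJDs(posts, skills, min_n_skill=2):
    # keep only posts containing at least min_n_skill unique skills
    vect = text_manip.CountVectorizer(vocabulary=skills,
                                      ngram_range=(1, max_n_word), binary=True)
    occurrence = vect.fit_transform(posts['clean_text'])
    posts = posts.copy()
    posts['n_uniq_skill'] = occurrence.sum(axis=1).A1
    return posts[posts['n_uniq_skill'] >= min_n_skill]

def extractSkills(skills, posts, min_n_jd=2):
    # keep only skills occurring in at least min_n_jd posts
    vect = text_manip.CountVectorizer(vocabulary=skills,
                                      ngram_range=(1, max_n_word), binary=True)
    occurrence = vect.fit_transform(posts['clean_text'])
    skill_df = pd.DataFrame({'skill': np.asarray(skills),
                             'n_jd_with_skill': occurrence.sum(axis=0).A1})
    return skill_df[skill_df['n_jd_with_skill'] >= min_n_jd]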

In [ ]:
n_iter, posts = 0, init_posts
n_post = posts.shape[0]

# stop when a pass retains at least 98% of both posts and skills
stop_cond, thres = False, .98
while not stop_cond:
    n_iter = n_iter + 1
    print('Iteration %d' %n_iter)
    # keep only posts with >= 2 unique skills
    new_posts = extractJDs(posts, skills, min_n_skill=2)
    n_new_post = new_posts.shape[0]
    print('No. of posts after filtering: %d' %n_new_post)

    # keep only skills occurring in >= 2 of the remaining posts
    skill_df = extractSkills(skills, new_posts, min_n_jd=2)
    new_skills = skill_df['skill']
    print('No. of skills after filtering: %d' %len(new_skills))
    stop_cond = (n_new_post >= thres*n_post) and (len(new_skills) >= thres*len(skills))

    posts = new_posts
    n_post = posts.shape[0]
    skills = new_skills
# end
  • Save the hard-earned JDs and skills after all these filters:

In [ ]:
# print(posts['n_uniq_skill'].min())
# print(skill_df['n_jd_with_skill'].min())
posts.to_csv(DATA_DIR + 'filtered/posts.csv', index=False)
skill_df.to_csv(DATA_DIR + 'filtered/skills.csv', index=False)
  • Sample job postings:

In [ ]:
posts = posts.sort_values(by='n_uniq_skill', ascending=False)
posts.head()

In [ ]:
# Sanity check: pull up the skills occurring in the JD with the most skills
# post_with_most_skill = init_posts.query('job_id == "JOB-2015-0196805"')

Set global arguments:

  • no. of topics: k in {4, 6, ..., 22}
  • no. of top words to be printed out in results
  • directory to save results

In [ ]:
RES_DIR = REPORT_DIR + 'r6/'
n_top_words = 10

In [ ]:
# ks = range(5, 25, 5)
ks = range(4, 24, 2)

Skill Clustering by NMF


In [ ]:
tf_idf_vect = text_manip.TfidfVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
n_instance, n_feat = posts.shape[0], len(skills)
t0 = time()
print('Building tf-idf features for %d JDs using %d features (skills)...' %(n_instance, n_feat))
doc_skill_tfidf = tf_idf_vect.fit_transform(posts['clean_text'])
print('Done after %.1fs' %(time() - t0))

  • Split the tf-idf matrix into training (80%) and test (20%) sets:

In [ ]:
train_idx, test_idx = mkPartition(n_instance, p=80)
X_train, X_test = doc_skill_tfidf[train_idx, :], doc_skill_tfidf[test_idx, :]
n_train, n_test = X_train.shape[0], X_test.shape[0]
print('Train set has %d JDs and test set has %d JDs' %(n_train, n_test))

In [ ]:
stats = pd.DataFrame({'n_train': n_train, 'n_test': n_test, 'n_jd (train & test)': n_post, 'n_skill': len(skills)}, index=[0])
stats.to_csv(RES_DIR + 'stats.csv', index=False)
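
mkPartition, used in the split above, also comes from my_util; a minimal sketch, assuming a random p% / (100-p)% split of row indices:

In [ ]:
# Hedged sketch of my_util.mkPartition (assumed: random index split).
def mkPartition(n_instance, p=80):
    idx = np.random.permutation(n_instance)
    n_train = n_instance * p // 100
    return idx[:n_train], idx[n_train:]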

Training


In [ ]:
rnmf = {k: NMF(n_components=k, init='random', random_state=0) for k in ks}
print('Fitting NMF using random initialization...')
print('No. of topics, Error, Running time')
rnmf_error = []

for k in ks:
    t0 = time()
    rnmf[k].fit(X_train)
    elapsed = time() - t0
    err = rnmf[k].reconstruction_err_
    print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
    rnmf_error.append(err)
# end
  • Save each model

In [ ]:
nmf_features = tf_idf_vect.get_feature_names()
pd.DataFrame(nmf_features).to_csv(RES_DIR + 'nmf_features.csv', index=False)

for k in ks:
    top_words = top_words_df(n_top_words, model=rnmf[k], feature_names=nmf_features)
    top_words.to_csv(RES_DIR + 'nmf_{}_topics.csv'.format(k), index=False)
    # each component (row) of NMF, once normalized, is a word distribution
    word_dist = pd.DataFrame(rnmf[k].components_).apply(normalize, axis=1)
    word_dist.to_csv(RES_DIR + 'nmf_word_dist_{}topics.csv'.format(k), index=False)
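
top_words_df and normalize are my_util helpers as well; minimal sketches consistent with how they are called above:

In [ ]:
# Hedged sketches (assumed behavior inferred from usage).
def normalize(row):
    # rescale a nonnegative row (pandas Series) so it sums to 1
    return row / row.sum()

def top_words_df(n_top_words, model, feature_names):
    # one row per topic: the n_top_words features with the largest weights
    rows = []
    for t, comp in enumerate(model.components_):
        top_idx = comp.argsort()[::-1][:n_top_words]
        rows.append({'topic': t,
                     'top_words': ', '.join(feature_names[i] for i in top_idx)})
    return pd.DataFrame(rows)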

Evaluation on test data


In [ ]:
print('Calculating test errors of random NMF ...')
rnmf_test_error = cal_test_err(mf_models=rnmf)
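
cal_test_err is another my_util helper; a minimal sketch, assuming it measures the Frobenius reconstruction error of each fitted model on the held-out X_test (X_test and ks taken from the enclosing scope):

In [ ]:
# Hedged sketch of my_util.cal_test_err (assumes X_test and ks in scope).
def cal_test_err(mf_models):
    test_error = []
    print('No. of topics, Test error, Running time')
    for k in ks:
        t0 = time()
        W_test = mf_models[k].transform(X_test)       # doc-topic weights for test JDs
        H = mf_models[k].components_                  # topic-skill matrix
        err = np.linalg.norm(X_test - W_test.dot(H))  # Frobenius reconstruction error
        print('%d, %0.1f, %0.1fs' %(k, err, time() - t0))
        test_error.append(err)
    return test_error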

In [ ]:
best_k = ks[np.argmin(rnmf_test_error)]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]

In [ ]:
nmf_fig = plotMetrics(train_metric=rnmf_error, test_metric=rnmf_test_error, model_name='NMF')
nmf_fig.savefig(RES_DIR + 'nmf.pdf')
plt.close(nmf_fig)
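
plotMetrics is likewise imported from my_util; a plausible minimal version plotting the train and test curves side by side (ks from the enclosing scope):

In [ ]:
# Hedged sketch of my_util.plotMetrics (assumes ks in scope).
def plotMetrics(train_metric, test_metric, model_name):
    fig = plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(ks, train_metric, '--')
    plt.xlabel('No. of topics')
    plt.ylabel('Training error')
    plt.title(model_name)
    plt.subplot(1, 2, 2)
    plt.plot(ks, test_metric, 'r')
    plt.xlabel('No. of topics')
    plt.ylabel('Test error')
    plt.title(model_name)
    return fig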

Skill Clustering by LDA

LDA models raw term counts (its generative process draws word occurrences), so we build count features rather than reusing the tf-idf matrix:

In [ ]:
t0 = time()
print('Building count features for LDA from %d JDs and %d skills...' %(n_post, len(skills)))
count_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
doc_skill_freq = count_vectorizer.fit_transform(posts['clean_text'])
print('Done after %.1fs' %(time() - t0))
doc_skill_freq.shape

In [ ]:
lda_X_train, lda_X_test = doc_skill_freq[train_idx, :], doc_skill_freq[test_idx, :]

Training


In [ ]:
lda_scores = []
lda = {k: LatentDirichletAllocation(n_topics=k, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)  # verbose=1
       for k in ks}

print("Fitting LDA ...")
print('No. of topics, Log-likelihood, Running time')

for k in ks:
    t0 = time()
    lda[k].fit(lda_X_train)
    s = lda[k].score(lda_X_train)
    print('%d, %0.1f, %0.1fs' %(k, s, time() - t0))
    lda_scores.append(s)
# end

Evaluation of LDA on test set by perplexity (lower is better)


In [ ]:
perp = [lda[k].perplexity(lda_X_test) for k in ks]
perp_df = pd.DataFrame({'No. of topics': ks, 'Perplexity': perp})
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)

In [ ]:
lda_best_k = ks[np.argmin(perp)]
print('Best no. of topics for LDA: %d' %lda_best_k)
perp_df
  • Save LDA models

In [ ]:
lda_feats = count_vectorizer.get_feature_names()
pd.DataFrame(lda_feats).to_csv(RES_DIR + 'lda_feats.csv', index=False)

for k in ks:
    word_dist = pd.DataFrame(lda[k].components_).apply(normalize, axis=1)
    word_dist.to_csv(RES_DIR + 'lda_word_dist_{}topics.csv'.format(k), index=False)
    
    lda_topics = top_words_df(n_top_words, model=lda[k], feature_names=lda_feats)
    lda_topics.to_csv(RES_DIR + 'lda_{}topics.csv'.format(k), index=False)

Model Comparison


In [ ]:
# Put all model metrics on the training & test sets into 2 data frames
model_list = ['LDA', 'randomNMF']

train_metric = pd.DataFrame({'No. of topics': ks, 'LDA': np.divide(lda_scores, 10**6), 'randomNMF': rnmf_error})
test_metric = pd.DataFrame({'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error})

In [ ]:
fig = plt.figure(figsize=(10, 6))

for i, model in enumerate(model_list):
    # train metric (top row)
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Log likelihood ($\times 10^6$)')
    else:
        plt.ylabel(r'$\| X_{train} - W_{train} H \|_F$')
    plt.grid(True)
    plt.xticks(ks)

    # test metric (bottom row)
    plt.subplot(2, 2, i+3)
    plt.title(model)
    plt.plot(ks, test_metric[model], 'r')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Perplexity')
    else:
        plt.ylabel(r'$\| X_{test} - W_{test} H \|_F$')
    plt.grid(True)
    plt.xticks(ks)
# end
fig.savefig(RES_DIR + 'lda_vs_nmf.pdf')
plt.show()
plt.close(fig)