Preparations

Import libraries:



In [1]:

    
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc

from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *

Load data



In [2]:

    
# Project root and the sub-directories this notebook reads from / writes to.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = '%sdata/clean/' % HOME_DIR
REPORT_DIR = '%sreports/skill_cluster/' % HOME_DIR

# Job descriptions (JDs): the raw table every later step starts from.
jd_csv_path = DATA_DIR + 'jd_df.csv'
init_posts = pd.read_csv(jd_csv_path)



In [20]:

    
# `skill_df` is needed right here, but the only cell that reads it from disk sits
# further down the notebook. Load it in this cell so that a fresh
# "Restart Kernel -> Run All" does not fail with a NameError.
skill_df = pd.read_csv(REPORT_DIR + 'skill_stats.csv')

# Initial skill vocabulary and the lower-cased text of every job description.
init_skills = skill_df['skill']
jd_docs = list(init_posts['clean_text'].apply(str.lower))

n_skill, n_jd = len(init_skills) , init_posts.shape[0]
print('Initial no. of skills: %d' %n_skill)
print('Initial no. of JDs: %d' %n_jd) # some garbage JDs with no text already removed









    



Initial no. of skills: 44919
Initial no. of JDs: 263411

Distribution of unigram, bigram and trigram skills



In [8]:

    
# Per-skill statistics table; the preview shows `skill`, `n_word` and `freq` columns.
skill_stats_path = REPORT_DIR + 'skill_stats.csv'
skill_df = pd.read_csv(skill_stats_path)
skill_df.head(3)



In [27]:

    
# Split the skill vocabulary by phrase length (number of words in the skill).
n_word_col = skill_df['n_word']
uni_gram_skills = list(skill_df.loc[n_word_col == 1, 'skill'])
bi_gram_skills = list(skill_df.loc[n_word_col == 2, 'skill'])
tri_gram_skills = list(skill_df.loc[n_word_col == 3, 'skill'])

# One-row summary table with the size of each group.
n_gram_counts = {
    'n_unigram_skill': len(uni_gram_skills),
    'n_bigram_skill': len(bi_gram_skills),
    'n_trigram_skill': len(tri_gram_skills),
}
pd.DataFrame(n_gram_counts, index=[0])









    Out[27]:






  
    
      
      n_bigram_skill
      n_trigram_skill
      n_unigram_skill
    
  
  
    
      0
      20386
      10778
      7537



In [31]:

    
# Distinct skills that occur at least once (freq > 0).
skills = skill_df.query('freq > 0')['skill'].unique()
len(skills)









    Out[31]:





14829

No. of unique uni-grams per document



In [ ]:

    
# Count how often each vocabulary term appears in each JD.
# No ngram_range is passed to the vectorizer here, unlike the multi-word vectorizers used later.
t0 = time()
print('Counting occurrence of uni-gram skills...')
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
elapsed = time() - t0
print('Done after %.1fs' % elapsed)

## For each doc, "its no. of unique uni-grams = no. of non-zero counts" in its row in doc-term mat
def n_non_zero(r, sp_mat):
    """Return the number of non-zero entries in row `r` of the sparse matrix `sp_mat`.

    For a document-term matrix this is the number of distinct vocabulary terms
    that appear in document `r`.
    """
    row = sp_mat.getrow(r)
    _, col_idx = row.nonzero()
    return len(col_idx)



In [12]:

    
# Number of distinct uni-gram skills per JD.
# The column used to be produced by code that was commented out, so on a fresh kernel
# the `quantile` call below failed unless the column already existed. Compute it when
# it is missing; when it is already present (e.g. stored in the CSV) reuse it as-is.
if 'n_uniq_unigram' not in init_posts.columns:
    # binary=True marks presence/absence, so a row sum is the number of distinct terms.
    binary_vectorizer = text_manip.CountVectorizer(vocabulary=skills, binary=True)
    print('Marking unique unigram skills in JDs...')
    t0 = time()
    doc_unigram_occurrence = binary_vectorizer.fit_transform(jd_docs)
    print('Done after %.1fs' %(time() - t0))
    init_posts['n_uniq_unigram'] = doc_unigram_occurrence.sum(axis=1).A1
quantile(init_posts['n_uniq_unigram'])









    Out[12]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      0.0
      8.0
      14.0
      22.0
      119.0



In [ ]:

    
# Histogram of the number of distinct uni-gram skills per JD.
# The values live in the `n_uniq_unigram` column of `init_posts`; the bare name
# `n_uniq_unigram` was never assigned, so bind it explicitly before plotting.
n_uniq_unigram = init_posts['n_uniq_unigram']
plt.hist(n_uniq_unigram, bins=np.unique(n_uniq_unigram))
plt.xlabel('no. of unique unigrams in JD')
plt.ylabel('no. of JDs')

plt.show()

No. of unique skills per JD

Here each skill can be a uni-, bi-, or tri-gram (i.e. a skill has at most 3 words).

Remove skills never occurring in JDs

This step was already done in a previous run, so there is no need to repeat it.



In [ ]:

From now on, we work with the set of occurring skills.



In [ ]:

    
# Count no. of unique skills in each JD by binary vectorizer
binary_vectorizer = text_manip.CountVectorizer(vocabulary=occur_skills, ngram_range=(1, max_n_word), binary=True)
t0 = time()
print('Marking occurrence of skills with length <= %d ...' %max_n_word)
doc_skill_occurrence = binary_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))

init_posts['n_uniq_skill'] = doc_skill_occurrence.sum(axis=1).A1
quantile(init_posts['n_uniq_skill'])

Filtering

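There are two goals: i) to remove JDs with too few skills, and ii) to remove skills occurring in too few JDs. Removing JDs can push a skill below its threshold and vice versa, so we repeat the following process until the two goals are (approximately) satisfied; in the code below, the loop stops once a single pass keeps at least 98% of both the JDs and the skills.

Count no. of unique skills in each JD
Remove JDs with $\leq 1$ skills
Count no. of JDs containing each skill
Remove skills occurring in $\leq 1$ JDs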



In [ ]:

    
# Alternate between filtering JDs and filtering skills until one pass changes little.
# NOTE: `skills` and `skill_df` are rebound by this cell, so it is not idempotent on re-run.
posts = init_posts
n_post = posts.shape[0]
thres = .98  # stop once a pass retains at least this share of both JDs and skills
n_iter, stop_cond = 0, False

while not stop_cond:
    n_iter += 1
    print('Iteration %d' %n_iter)

    # Filter JDs (min_n_skill=2).
    new_posts = extractJDs(posts, skills, min_n_skill=2)
    n_new_post = new_posts.shape[0]
    print('No. of posts after filtering: %d' %n_new_post)

    # Filter skills against the surviving JDs (min_n_jd=2).
    skill_df = extractSkills(skills, new_posts, min_n_jd=2)
    new_skills = skill_df['skill']
    print('No. of skills after filtering: %d' %len(new_skills) )

    posts_stable = n_new_post >= thres*n_post
    skills_stable = len(new_skills) >= thres*len(skills)
    stop_cond = posts_stable and skills_stable

    # The survivors become the input of the next pass.
    posts, skills = new_posts, new_skills
    n_post = posts.shape[0]
# end

Save the hard-earned JDs and skills after all these filters:



In [ ]:

    
# Optional sanity checks on the filtered data:
# print(min(posts['n_uniq_skill']))
# print(min(skill_df['n_jd_with_skill']))

# Persist the filtered JDs and skills.
filtered_dir = DATA_DIR + 'filtered/'
posts.to_csv(filtered_dir + 'posts.csv', index=False)
skill_df.to_csv(filtered_dir + 'skills.csv', index=False)

Sample job postings:



In [ ]:

    
# Put the JDs with the most distinct skills first, then preview the top rows.
posts = posts.sort_values('n_uniq_skill', ascending=False)
posts.head(5)



In [ ]:

    
# Sanity check by pulling up the skills occurring in the JD with the most skills
# post_with_most_skill = init_posts.query('job_id == {}'.format('JOB-2015-0196805') )



In [ ]:

    
# Split the JDs into train / test row indices and slice the tf-idf matrix accordingly.
# NOTE(review): `n_instance` and `doc_skill_tfidf` are only created in the tf-idf cell of the
# "Skill Clustering by NMF" section, which appears later in the notebook; on a fresh kernel
# this cell has to run after that one. Consider moving it below the tf-idf cell.
# NOTE(review): p=80 presumably means 80% of instances go to training - confirm in my_util.
train_idx, test_idx = mkPartition(n_instance, p=80)
X_train = doc_skill_tfidf[train_idx, :]
X_test = doc_skill_tfidf[test_idx, :]
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print('Train set has %d JDs and test set has %d JDs' %(n_train, n_test))



In [ ]:

    
# One-row summary of the data set sizes, saved alongside the results.
# NOTE(review): `RES_DIR` is assigned in a later cell ("Set global arguments"); run that first.
size_summary = {
    'n_train': n_train,
    'n_test': n_test,
    'n_jd (train & test)': n_post,
    'n_skill': len(skills),
}
stats = pd.DataFrame(size_summary, index=[0])
stats.to_csv(RES_DIR + 'stats.csv', index=False)

Set global arguments:

no. of topics: k in {4, 6, ..., 22}
no. of top words to be printed out in result
directory to save results



In [ ]:

    
# Directory that receives the result files of this run (tables and figures).
RES_DIR = '%sr6/' % REPORT_DIR
# No. of top words listed per topic in the saved topic tables.
n_top_words = 10



In [ ]:

    
# Candidate numbers of topics (k) tried for both NMF and LDA: 4, 6, ..., 22.
# Earlier, coarser grid kept for reference:
# ks  = range(5, 25, 5)
ks = range(4, 24, 2)

Skill Clustering by NMF



In [ ]:

    
# Build the JD x skill tf-idf matrix, restricted to the filtered skill vocabulary.
n_instance = posts.shape[0]
n_feat = len(skills)
tf_idf_vect = text_manip.TfidfVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))

t0 = time()
print('Building tf_idf for %d JDs using %d features (skills)...' %(n_instance, n_feat))
doc_skill_tfidf = tf_idf_vect.fit_transform(posts['clean_text'])
elapsed = time() - t0
print('Done after %.1fs' % elapsed)

Training



In [ ]:

    
# One NMF model per candidate no. of topics, all with the same fixed seed.
rnmf = {}
for k in ks:
    rnmf[k] = NMF(n_components=k, random_state=0)

print( "Fitting NMF using random initialization..." )
print('No. of topics, Error, Running time')

# Training reconstruction error of each model, in the same order as `ks`.
rnmf_error = []
for k in ks:
    nmf_k = rnmf[k]
    t0 = time()
    nmf_k.fit(X_train)
    elapsed = time() - t0
    err = nmf_k.reconstruction_err_
    print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
    rnmf_error.append(err)
# end

Save each model



In [ ]:

    
# Feature (skill) names in the column order of the tf-idf matrix.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - adjust if the environment is upgraded.
nmf_features = tf_idf_vect.get_feature_names()
pd.DataFrame(nmf_features).to_csv(RES_DIR + 'nmf_features.csv', index=False)

for k in ks:
    nmf_k = rnmf[k]

    # Top words of every topic.
    top_words = top_words_df(n_top_words, model=nmf_k, feature_names=nmf_features)
    topics_path = RES_DIR + 'nmf_{}_topics.csv'.format(k)
    top_words.to_csv(topics_path, index=False)

    # Each NMF component is treated as one word distribution; `normalize` is applied per row.
    components = pd.DataFrame(nmf_k.components_)
    word_dist = components.apply(normalize, axis=1)
    word_dist_path = RES_DIR + 'nmf_word_dist_{}topics.csv'.format(k)
    word_dist.to_csv(word_dist_path, index=False)

Evaluation on test data



In [ ]:

    
# Test error of every fitted NMF model.
# NOTE(review): `cal_test_err` only receives the models, so it presumably reads the test
# matrix from notebook globals and returns one error per k in the order of `ks` - confirm.
print('Calculating test errors of random NMF ...')
rnmf_test_error = cal_test_err(mf_models=rnmf)



In [ ]:

    
# Pick the no. of topics whose model has the smallest test error.
best_pos = np.argmin(rnmf_test_error)
best_k = ks[best_pos]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]



In [ ]:

    
# Plot NMF train vs. test error, save the figure as PDF,
# then close it so it does not stay open in pyplot.
nmf_fig = plotMetrics(train_metric=rnmf_error, test_metric=rnmf_test_error, model_name='NMF')
nmf_fig.savefig(RES_DIR + 'nmf.pdf')
plt.close(nmf_fig)

Skill Clustering by LDA



In [ ]:

    
# LDA works on raw counts, so build a JD x skill count matrix over the same vocabulary.
t0 = time()
print('Building count features for LDA from %d JDs and %d skills...' %(n_post, len(skills)))
count_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
doc_skill_freq = count_vectorizer.fit_transform(posts['clean_text'])
elapsed = time() - t0
print('Done after %.1fs' % elapsed)
doc_skill_freq.shape



In [ ]:

    
# Reuse the train / test row indices of the NMF split so both models see the same JDs.
lda_X_train = doc_skill_freq[train_idx, :]
lda_X_test = doc_skill_freq[test_idx, :]

Training



In [ ]:

    
# One online-LDA model per candidate no. of topics, all with the same fixed seed.
# NOTE(review): `n_topics` is the older scikit-learn spelling of this argument; newer
# releases call it `n_components` - adjust if the environment is upgraded.
lda = {}
for k in ks:
    lda[k] = LatentDirichletAllocation(n_topics=k, max_iter=5, learning_method='online',
                                       learning_offset=50., random_state=0)  # verbose=1

print("Fitting LDA ...")
print('No. of topics, Log-likelihood, Running time')

# Training score of each model, in the same order as `ks`.
lda_scores = []
for k in ks:
    lda_k = lda[k]
    t0 = time()
    lda_k.fit(lda_X_train)
    train_score = lda_k.score(lda_X_train)
    print('%d, %0.1f, %0.1fs' %(k, train_score, time() - t0))
    lda_scores.append(train_score)
# end

Evaluation of LDA on test set by perplexity



In [ ]:

    
# Test-set perplexity of every LDA model, in the order of `ks`.
perp = []
for k in ks:
    perp.append(lda[k].perplexity(lda_X_test))

perp_table = {'No. of topics': ks, 'Perplexity': perp}
perp_df = pd.DataFrame(perp_table)
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)



In [ ]:

    
# The LDA model with the lowest test perplexity determines the best no. of topics.
lda_best_pos = np.argmin(perp)
lda_best_k = ks[lda_best_pos]
print('Best no. of topics for LDA: %d' %lda_best_k)
perp_df

Save LDA models



In [ ]:

    
# Feature (skill) names in the column order of the count matrix.
# NOTE(review): unlike the NMF feature file, this CSV is written with the default index
# column; kept as-is so the output does not change.
lda_feats = count_vectorizer.get_feature_names()
pd.DataFrame(lda_feats).to_csv(RES_DIR + 'lda_feats.csv')

for k in ks:
    lda_k = lda[k]

    # Word distribution of every topic; `normalize` is applied per row.
    components = pd.DataFrame(lda_k.components_)
    word_dist = components.apply(normalize, axis=1)
    word_dist_path = RES_DIR + 'lda_word_dist_{}topics.csv'.format(k)
    word_dist.to_csv(word_dist_path, index=False)

    # Top words of every topic.
    lda_topics = top_words_df(n_top_words, model=lda_k, feature_names=lda_feats)
    topics_path = RES_DIR + 'lda_{}topics.csv'.format(k)
    lda_topics.to_csv(topics_path, index=False)

Model Comparison



In [ ]:

    
# Put all model metrics on training & test datasets into 2 data frames
model_list = ['LDA', 'randomNMF']

train_metric = pd.DataFrame({'No. of topics': ks, 'LDA': np.divide(lda_scores, 10**6), 'randomNMF': rnmf_error})
test_metric = pd.DataFrame({'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error, })



In [ ]:

    
# 2 x 2 grid: one column per model, train metric on the top row, test metric on the bottom.
fig = plt.figure(figsize=(10, 6))

for col, model in enumerate(model_list):
    is_lda = (model == 'LDA')
    # (subplot position, metric table, line format, y-axis label) for the two panels of this model
    panels = (
        (col + 1, train_metric, '--',
         r'Log likelihood ($\times 10^6$)' if is_lda else r'$\| X_{train} - W_{train} H \|_2$'),
        (col + 3, test_metric, 'r',
         r'Perplexity' if is_lda else r'$\| X_{test} - W_{test} H \|_2$'),
    )
    # Figure-level spacing between the panels.
    plt.subplots_adjust(wspace=.5, hspace=.5)

    for pos, metric, line_fmt, y_label in panels:
        plt.subplot(2, 2, pos)
        plt.title(model)
        plt.plot(ks, metric[model], line_fmt)
        plt.xlabel('No. of topics')
        plt.ylabel(y_label)
        plt.grid(True)
        plt.xticks(ks)
# end

plt.show()
fig.savefig(RES_DIR + 'lda_vs_nmf.pdf')
plt.close(fig)

	skill	n_word	freq
0	business	1	170302
1	management	1	161636
2	support	1	156517