Preparations

Import libraries:



In [1]:

    
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc

from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *

Define needed helpers:



In [308]:

    
def skill_length_hist(skill_df):
    min_n_word = np.min(skill_df['n_word'])
    max_n_word = np.max(skill_df['n_word'])
    n, bins, patches = plt.hist(skill_df['n_word'], bins= range(min_n_word, max_n_word+1), facecolor='blue', 
                                log=True, align='left', rwidth=.5)
    
    plt.xlabel('No. of words in skill (skill length)')
    plt.ylabel('No. of skills (log scale)')
    plt.title('Distribution of skill length')
    plt.xticks(range(min_n_word, max_n_word+1))
    plt.grid(True)
#     plt.savefig(REPORT_DIR + 'skill_length.pdf')

    plt.show()
    plt.close()
# end 

def freq(skills=None, docs=None, max_n_word=1):
    t0 = time()
    print('Counting occurrence of skills with length <= %d ...' %max_n_word)
    
    count_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))  
    doc_term_mat = count_vectorizer.fit_transform(docs)
    print('Done after %.1fs' %(time() - t0))

    # Sum over all documents to obtain total occurrence of each skill token
    token_counts = np.asarray(doc_term_mat.sum(axis=0)).ravel()
    
    df = pd.DataFrame({'skill': skills})
    df['occurrence'] = token_counts
    return df, doc_term_mat

def skillsPresentInJD(df):
    occur_skills = df.query('occurrence > 0')
    no_occur_skills = df.query('occurrence == 0')
    return occur_skills, no_occur_skills

def n_match_skills(df):
    occur_skills = df.query('occurrence > 0')
    return occur_skills.shape[0]

def get_top_words(n_top_words, word_dist, feature_names):
    norm_word_dist = np.divide(word_dist, sum(word_dist))
    sorting_idx = word_dist.argsort()
    top_words = [feature_names[i] for i in sorting_idx[:-n_top_words - 1:-1]]
    probs = [norm_word_dist[i] for i in sorting_idx[:-n_top_words - 1:-1]]
    
    return pd.DataFrame({'top_words': top_words, 'word_probs': probs})



In [2]:

    
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'

Load clean job descriptions (w/o html tags):



In [304]:

    
jd_df = pd.read_csv(DATA_DIR + 'jd_df.csv')
n_jd = jd_df.shape[0]



In [305]:

    
jd_df['clean_text'] = [' '.join(words_in_doc(d)) for d in jd_df['text']]
print('Some sample JD records:')
jd_df.head(3)









    



Some sample JD records:






    Out[305]:






  
    
      
      job_id
      job_description_clob
      job_other_requirements_clob
      month_year
      text
      length
      unigram_freq
      unigram_ratio
      clean_text
    
  
  
    
      105469
      JOB-2014-0110950
      Bookkeeping
      O levels , LCCI Intermediate, 2 years experien...
      Nov-14
      Bookkeeping
      1
      4
      4.0
      Bookkeeping
    
    
      4620
      JOB-2014-0007628
      Testing
      Testing
      Jul-14
      Testing
      1
      1
      1.0
      Testing
    
    
      115833
      JOB-2014-0121378
      <p>\r\r\r\r\r\r\n\t&bull; Java<br />\r\r\r\r\r...
      <p>\r\r\r\r\r\r\n\t&bull; Involved/experience ...
      Dec-14
      \r\r\r\r\r\r\n\t• Java\r\r\r\r\r\r\n\t• Spring...
      10
      3
      0.3
      Java Spring Hibernate REST API JSON XML Tomcat...

Get the text of JD records for further analysis. We need to use lower cases for JDs so that we can match with lowercased skills later.



In [306]:

    
jd_docs = jd_df['clean_text'].apply(str.lower)

Load skill lists obtained from LinkedIn & ONET:



In [3]:

    
linkedin_skill_df = pd.read_csv(DATA_DIR + 'LinkedInSkillsList_10.csv')
linkedin_skills = linkedin_skill_df['skill']

onet_skill_df = pd.read_csv(DATA_DIR + 'onet_skills_list_all.csv')
onet_skills = onet_skill_df['skill'].apply(str.lower)

Join two skill lists & remove duplicated skills:



In [4]:

    
skills = linkedin_skills.append(onet_skills)
skills = list(set(skills))
pd.DataFrame({'skill': skills}).to_csv(DATA_DIR + 'all_skills.csv')

Average no. of words in skills



In [10]:

    
skill_df = pd.DataFrame({'skill': skills})
skill_df['n_word'] = skill_df['skill'].apply(n_word)
quantile(skill_df['n_word'])









    Out[10]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      1.0
      2.0
      2.0
      3.0
      13.0

Distribution of skill length (no. of words in skill):



In [11]:

    
skill_length_hist(skill_df)

Based on the quartile summary and the distribution, we can try the following options:

including only 1-gram, 2-gram, 3-gram skills in our vocabulary (as 75% of skills have no more than 3 words)
including up to 7-gram skills in our vocabulary (as skills with more than 7 words only occuppy a small portion)

Statistics of data



In [396]:

    
stats = pd.DataFrame({'# JDs': n_jd, 
                      '# LinkedIn skills': len(linkedin_skills), '# ONET skills': len(onet_skills), 
                      'Total no. of unique skills': len(skills), 
                      'min skill length': min(skill_df['n_word']), 'max skill length': max(skill_df['n_word'])}, 
                     index=[0])

stats.to_csv(DATA_DIR + 'stats.csv')
stats









    Out[396]:






  
    
      
      # JDs
      # LinkedIn skills
      # ONET skills
      Total no. of unique skills
      max skill length
      min skill length
    
  
  
    
      0
      263477
      18251
      27025
      44919
      13
      1

1. Skill occurrence

Each skill is considered as a token. We count occurrence of each skill in documents and return the counts in a term-document matrix.

Tokens_1 = {skills with length = 1} (uni-gram skills)
Tokens_3 = {skills with length <= 3}
Tokens_7 = {skills with length <= 7}



In [310]:

    
unigram_skills, doc_unigram_freq = freq(skills, jd_docs, max_n_word=1)









    



Counting occurrence of skills with length <=1 ...
Done after 31.0s



In [311]:

    
trigram_skills, doc_trigram_freq = freq(skills, jd_docs, max_n_word=3)









    



Counting occurrence of skills with length <=3 ...
Done after 97.3s



In [379]:

    
# sevengram_skills, doc_7gram_freq = freq(skills, jd_docs, max_n_word=7)



In [312]:

    
unigram_match, trigram_match = n_match_skills(df=unigram_skills), n_match_skills(df=trigram_skills)
# sevengram_match = n_match_skills(df=sevengram_skills)

pd.DataFrame({'# matching skills': [unigram_match, trigram_match]}, 
            index=['Tokens_1', 'Tokens_3'])









    Out[312]:






  
    
      
      # matching skills
    
  
  
    
      Tokens_1
      5212
    
    
      Tokens_3
      14829

As the difference between Tokens_3 and Tokens_7 is negligible, we can just use the former for analysis. This means we only include 1-, 2- and 3-gram skills in our subsequent analysis.

2. Skill occurrence per document

Uni-gram skills



In [380]:

    
t0 = time()
print('Counting occurrence of uni-gram skills...')
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)  
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))









    



Counting occurrence of uni-gram skills...
Done after 30.3s

No. of unique uni-grams per document



In [381]:

    
## For each doc, "its no. of unique uni-grams = no. of non-zero counts" in its row in doc-term mat
def n_non_zero(r, sp_mat):
    return len(sp_mat.getrow(r).nonzero()[1])

n_unigram = [n_non_zero(r, doc_unigram_freq) for r in range(n_jd)]
# sum(n_unigram) == len(doc_unigram_freq.nonzero()[0]) # sanity check









    Out[381]:






  
    
      
      jd_id
      n_unigram
    
  
  
    
      190543
      JOB-2015-0196805
      119
    
    
      210604
      JOB-2015-0217075
      106
    
    
      260098
      JOB-2015-0267654
      103
    
    
      260090
      JOB-2015-0267646
      103
    
    
      33133
      JOB-2014-0036543
      99
    
    
      203037
      JOB-2015-0209415
      98
    
    
      216119
      JOB-2015-0222694
      98
    
    
      256206
      JOB-2015-0263242
      98
    
    
      203547
      JOB-2015-0209933
      97
    
    
      140713
      JOB-2015-0146455
      97



In [395]:

    
jd_df['n_unigram'] = n_unigram
print quantile(n_unigram)









    



   min  25%  50% (median)   75%    max
0  0.0  8.0          14.0  22.0  119.0



In [394]:

    
# pull up some JDs to check
tmp = jd_df.query('n_unigram == 2')
print tmp.shape
tmp[['job_id', 'clean_text', 'n_unigram']].head(10)









    



(6464, 10)






    Out[394]:






  
    
      
      job_id
      clean_text
      n_unigram
    
  
  
    
      307
      JOB-2014-0003245
      Design sportswear shoe
      2
    
    
      234060
      JOB-2015-0240867
      HVAC ENGINEERING
      2
    
    
      148984
      JOB-2015-0154808
      Jewelry designer
      2
    
    
      39364
      JOB-2014-0044267
      Peformance Testing Mobile Testing
      2
    
    
      234248
      JOB-2015-0241058
      Project Management
      2
    
    
      162273
      JOB-2015-0168245
      CARPET SALES
      2
    
    
      92784
      JOB-2014-0098164
      CALIBRATION TESTING
      2
    
    
      18577
      JOB-2014-0021820
      Food Preparation Food Serving
      2
    
    
      193710
      JOB-2015-0200007
      Basic accounting
      2
    
    
      116694
      JOB-2014-0122242
      VBA MACRO
      2

Based on the quartile summary, $50$% of documents contain $\leq 14$ uni-gram skills and $75$% of them contain $\leq 22$ uni-gram skills (which looks reasonable).



In [387]:

    
fig = plt.figure(figsize=(10, 6))
ns, bins, patches = plt.hist(n_unigram, bins=np.unique(n_unigram), rwidth=.5)
plt.xlabel('No. of unique uni-gram skills in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of uni-gram skills in job descriptions')
plt.grid(True)

plt.show()
fig.savefig(REPORT_DIR + 'unigram_in_jd.pdf')
plt.close(fig)



In [386]:

    
print('Ratio of JDs with no uni-gram skills: %.3f' %round(n_unigram.count(0)/float(n_jd), 3))









    



Ratio of JDs with no uni-gram skills: 0.007

Ratio of unigram skill tokens in JDs



In [313]:

    
jd_df['length'] = [n_words_in_doc(d) for d in jd_df['text']]
# print sum(jd_df['length'] == 0)

jd_df['unigram_freq'] = doc_unigram_freq.sum(axis=1).A1
clean_jd_df = jd_df.query('length > 0')
del clean_jd_df['text']
clean_jd_df['unigram_ratio'] = np.divide(clean_jd_df['unigram_freq'], clean_jd_df['length']*1.)
clean_jd_df = clean_jd_df.sort_values(by='unigram_ratio', ascending=False)



In [383]:

    
print quantile(clean_jd_df['unigram_ratio'], dec=2)
# clean_jd_df.to_csv(DATA_DIR + 'jd_df.csv')









    



   min   25%  50% (median)   75%  max
0  0.0  0.16           0.2  0.26  1.0



In [318]:

    
plt.hist(clean_jd_df['unigram_ratio'])
plt.xlabel('Ratio of unigram skill tokens in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of ratio of unigram skills')
plt.savefig(REPORT_DIR + 'unigram_ratio.pdf')

plt.show()
plt.close()

Decision: Based on the distribution, half of JDs have $\leq 20$% of their words are uni-gram skills (and 75% of them having unigram skills occupying $\leq 26$% of their words). Thus, we may first want to try on the sub dataset of JDs where uni-gram skills occuppy at least $20$%.



In [429]:

    
jd_df.columns









    Out[429]:





Index([u'job_id', u'job_description_clob', u'job_other_requirements_clob',
       u'month_year', u'text', u'length', u'unigram_freq', u'unigram_ratio',
       u'clean_text', u'n_unigram'],
      dtype='object')



In [430]:

    
sub_df = jd_df.query('n_unigram >= 5')
sub_df = sub_df.sort_values(by='length', ascending=False)
print sub_df.shape
# sub_df.head(10)









    



(232559, 10)

First, we try using only uni-gram skills as features.



In [431]:

    
unigram_df, doc_unigram_freq  = freq(skills, max_n_word=1, docs=sub_df['clean_text'])
doc_unigram_freq.shape









    



Counting occurrence of skills with length <=1 ...
Done after 29.9s






    Out[431]:





(232559, 44919)



In [432]:

    
occur_unigrams = unigram_df.query('occurrence > 0')['skill']
unigram_df, doc_unigram_freq = freq(skills=occur_unigrams, docs=sub_df['clean_text'])
doc_unigram_freq.shape









    



Counting occurrence of skills with length <=1 ...
Done after 29.7s






    Out[432]:





(232559, 5196)



In [434]:

    
n_unigram_skill = doc_unigram_freq.shape[1]
n_document = [n_non_zero(r, doc_unigram_freq.transpose()) for r in range(n_unigram_skill)]



In [447]:

    
tmp_df = pd.DataFrame({'skill' : occur_unigrams, 'n_doc' : n_document}) 
tmp_df = tmp_df.sort_values(by='n_doc', ascending=False)
sum(tmp_df['n_doc'] == 1)









    Out[447]:





311



In [435]:

    
quantile(n_document)









    Out[435]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      1.0
      8.0
      37.0
      211.2
      84111.0



In [439]:

    
plt.hist(n_document, bins=np.unique(n_document), log=True)
# plt.xlabel('No. of documents containing the skill')
# plt.ylabel

plt.show()

Skill Clustering by NMF & LDA

Split into training and test sets



In [406]:

    
n_ins = sub_df.shape[0]
train_idx, test_idx = mkPartition(n_instances=n_ins)
X_train, X_test = doc_unigram_freq[train_idx, :], doc_unigram_freq[test_idx, :]

Set global arguments:

no. of topics: k in {5, 10, ..., 50}
no. of top words to be printed out in result
directory to save results



In [411]:

    
# ks, n_top_words = range(2, 11), 10
ks, n_top_words = range(5, 25, 5), 10
RES_DIR = REPORT_DIR + 'r4/'

A. Using NMF

Trainning NMF using random initialization



In [412]:

    
rnmf = {k: NMF(n_components=k, random_state=0) for k in ks}
print( "Fitting NMF using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins) ) # (random initialization)
print('No. of topics, Error, Running time')
rnmf_error = []

for k in ks:
    t0 = time()
    rnmf[k].fit(X_train)
    elapsed = time() - t0
    err = rnmf[k].reconstruction_err_
    print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
    rnmf_error.append(err)
# end









    



Fitting NMF using 4359 uni-gram skills on 62158 job descriptions...
No. of topics, Error, Running time
5, 1427.5, 3.7s
10, 1340.7, 1.6s
15, 1282.7, 5.3s
20, 1234.2, 4.5s

Evaluating NMF on test data

First, we choose the best no. of topics $k^*$ for random NMF as the one that minimizes the error of predicting test data. For that, we compute the error for different $k$'s by the following function.



In [357]:

    
from numpy import linalg as la

def cal_test_err(models): 
    test_error = []
    print('No. of topics, Test error, Running time')

    for k in ks:
        t0 = time()
        H = models[k].components_
        W_test = models[k].fit_transform(X_test, H=H)
        err = la.norm(X_test - np.matmul(W_test, H))
        
#         sp_W_test = csr_matrix(W_test)
#         sp_H = csc_matrix(H)
#         print(sp_W_test.shape)
#         print(sp_H.shape)
#         err = la.norm(X_test - sp_W_test * sp_H)
        test_error.append(err)
        print('%d, %0.1f, %0.1fs' %(k, err, time() - t0))
    
    return test_error



In [413]:

    
print('Calculating test errors of random NMF to choose best no. of topics...')
rnmf_test_error = cal_test_err(models=rnmf)









    



Calculating test errors of random NMF to choose best no. of topics...
No. of topics, Test error, Running time
5, 783.8, 1.0s
10, 709.2, 1.1s
15, 744.6, 2.3s
20, 781.4, 2.3s



In [414]:

    
best_k = ks[np.argmin(rnmf_test_error)]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]
rnmf_word_dists = pd.DataFrame(rnmf_best.components_).apply(normalize, axis=1)
# rnmf_word_dists.to_csv(RES_DIR + 'rnmf_word_dists.csv', index=False)









    



The best no. of topics is 10

We now validate NMF results in the following ways:

if learnt topics make sense
if topic it predicts for each JD in test set makes sense

Learnt topics

We manually label each topic based on its top 10 words.



In [360]:

    
# count_vectorizer = text_manip.CountVectorizer(vocabulary= occur_skills['skill'], ngram_range=(1, 3))
# count_vectorizer.fit_transform(jd_docs)

# nmf_features = count_vectorizer.get_feature_names()
# nmf_top_words = top_words_df(rnmf_best, n_top_words, nmf_features)

# nmf_top_words.to_csv(REPORT_DIR + 'nmf_top_words.csv', index=False)
# pd.DataFrame(nmf_features).to_csv(REPORT_DIR + 'nmf_features.csv')

Topic prediction on test JDs



In [361]:

    
# H = rnmf_best.components_
# W_test = pd.DataFrame(rnmf_best.fit_transform(X_test, H=H))
# W_test.to_csv(REPORT_DIR + 'nmf_doc_topic_distr.csv', index=False)

B. Using LDA

Trainning



In [415]:

    
scores = []
lda = {k: LatentDirichletAllocation(n_topics=k, max_iter=5, learning_method='online', learning_offset=50.,
                                   random_state=0) # verbose=1
 for k in ks}

print("Fitting LDA using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins))
print('No. of topics, Log-likelihood, Running time')

for k in ks:
    t0 = time()
    lda[k].fit(X_train)
    s = lda[k].score(X_train)
    print('%d, %0.1f, %0.1fs' %(k, s, time() - t0))
    scores.append(s)
# end









    



Fitting LDA using 4359 uni-gram skills on 62158 job descriptions...
No. of topics, Log-likelihood, Running time
5, -9766356.7, 40.8s
10, -14281289.0, 42.0s
15, -19006623.0, 45.2s
20, -23848096.5, 47.3s

Perplexity of LDA on test set



In [416]:

    
perp = [lda[k].perplexity(X_test) for k in ks]
perp_df = pd.DataFrame({'No. of topics': ks, 'Perplexity': perp})
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)

Choose the best no. of topics as the one minimizing perplexity.



In [417]:

    
best_k = ks[np.argmin(perp)]
print('Best no. of topics: %d' %best_k)









    



Best no. of topics: 5

Save the best LDA model:



In [365]:

    
# lda_best = lda[best_k]
# lda_word_dists = pd.DataFrame(lda_best.components_).apply(normalize, axis=1)
# lda_word_dists.to_csv(RES_DIR + 'lda_word_dists.csv', index=False)

# lda_features = count_vectorizer.get_feature_names()
# lda_topics = top_words_df(lda_best, n_top_words, lda_features)

# lda_topics.to_csv(REPORT_DIR + 'lda_topics.csv', index=False)
# pd.DataFrame(lda_features).to_csv(RES_DIR + 'lda_features.csv')

Using the best LDA to perform topic prediction on test JDs



In [366]:

    
# doc_topic_distr  = lda_best.transform(X_test)
# pd.DataFrame(doc_topic_distr).to_csv(RES_DIR + 'lda_doc_topic_distr.csv', index=False)

C. Model Comparison



In [421]:

    
# Put all model metrics on training & test datasets into 2 data frames
model_list = ['LDA', 'randomNMF']

train_metric = pd.DataFrame({'No. of topics': ks, 'LDA': np.divide(scores, 10**6), 'randomNMF': rnmf_error})

test_metric = pd.DataFrame({'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error, })



In [424]:

    
ks









    Out[424]:





[5, 10, 15, 20]

Performance of models for different number of topics



In [423]:

    
fig = plt.figure(figsize=(10, 6))

for i, model in enumerate(model_list):
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)  
    #     train metric
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Log likelihood ($\times 10^6$)')
    else:
        plt.ylabel(r'$\| X_{train} - W_{train} H \|_2$')
    plt.grid(True)
    
    #     test metric
#     plt.subplot(2, 2, i+3)
#     plt.title(model)
#     plt.plot(ks, test_metric[model], 'r')
#     plt.xlabel('No. of topics')
#     if model == 'LDA':
#         plt.ylabel(r'Perplexity')
#     else:
#         plt.ylabel(r'$\| X_{test} - W_{test} H \|_2$')
#     plt.grid(True)
        
# end
plt.show()
# fig.savefig(RES_DIR + 'new_lda_vs_nmf.pdf')
plt.close(fig)

Multi-gram skills



In [369]:

    
t0 = time()
print('Counting occurrence of multi-gram skills...')
multi_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(2, 3))
multigram_doc_mat = multi_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))









    



Counting occurrence of multi-gram skills...
Done after 71.5s

Distribution of multigrams in documents



In [370]:

    
n_multigram = [n_non_zero(r, doc_multigram_freq) for r in range(n_jd)]
multigram_df = pd.DataFrame({'jd_id': jd_df['job_id'], 'n_multigram': n_multigram})

quantile(n_multigram)









    Out[370]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      0.0
      1.0
      2.0
      5.0
      45.0



In [371]:

    
plt.hist(n_multigram, bins=np.unique(n_multigram))
plt.xlabel('No. of multi-grams in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of multi-grams in job descriptions')
plt.grid(True)

plt.show()
plt.savefig(REPORT_DIR + 'multigram_in_jd.pdf')
plt.close()

Overlapping between uni-gram & multi-gram skills



In [372]:

    
unigram_skills, bigram_skills = occur_skills.query('n_word == 1')['skill'], occur_skills.query('n_word == 2')['skill']
## for each unigram, find no. of bigrams containing it (aka super-bigrams)
def super_bigrams(unigram='business', bigrams=bigram_skills):
    idx = [s.find(unigram) for s in bigrams]
    df = pd.DataFrame({'bigram': bigrams, 'idx_of_given_unigram': idx})
    return df.query('idx_of_given_unigram > -1') # those bigrams not containing the unigram give -1 indices
    
def n_super_bigrams(unigram='business', bigrams=bigram_skills):
    idx = [s.find(unigram) for s in bigrams]
    return len(bigrams) - idx.count(-1)



In [373]:

    
# super_bigrams(unigram='business')
n_super_bigrams = [n_super_bigrams(ug) for ug in unigram_skills]
overlap_df = pd.DataFrame({'unigram': unigram_skills, 'n_super_bigrams': n_super_bigrams})

quantile(n_super_bigrams)









    Out[373]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      0.0
      0.0
      0.0
      1.0
      1278.0



In [399]:

    
plt.hist(n_super_bigrams)

plt.show()
plt.close()



In [402]:

    
overlapped_unigrams = overlap_df.query('n_super_bigrams > 0')
overlapped_unigrams = overlapped_unigrams.sort_values(by='n_super_bigrams', ascending=False)
# overlapped_unigrams.head(10)

n_overlap = overlapped_unigrams.shape[0]
n_unigram_skills = overlap_df.shape[0]
print n_overlap
print n_unigram_skills
n_overlap*1./n_unigram_skills









    



1888
5212






    Out[402]:





0.3622409823484267

10 most common skills:



In [375]:

    
unigram_df = unigram_df.sort_values(by='occurrence', ascending=False)



In [376]:

    
top10 = unigram_df[['skill', 'occurrence']].head(10)
# top10.reset_index(inplace=True, drop=True)
top10









    Out[376]:






  
    
      
      skill
      occurrence
    
  
  
    
      31013
      management
      102822
    
    
      22914
      support
      101531
    
    
      2176
      business
      93456
    
    
      22585
      project
      85198
    
    
      11806
      team
      79446
    
    
      15184
      sales
      64598
    
    
      7465
      development
      56053
    
    
      20970
      design
      53787
    
    
      11387
      responsible
      48099
    
    
      2235
      customer
      46398

10 least common skills:



In [377]:

    
bottom10 = unigram_df[['skill', 'occurrence']].tail(10)
bottom10.reset_index(inplace=True, drop=True)
print(bottom10)









    



         skill  occurrence
0       trawls           1
1         sdsf           1
2    jprofiler           1
3      pulleys           1
4        toefl           1
5     griddles           1
6  respiration           1
7         cmbs           1
8       gambit           1
9     rabbitmq           1

Frequency of uni-gram skills in JDs



In [19]:

    
quantile(occur_skills.query('n_word == 1')['occurrence'])









    Out[19]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      1.0
      10.0
      47.0
      275.0
      170302.0

Frequency of bi-gram skills in JDs



In [20]:

    
quantile(occur_skills.query('n_word == 2')['occurrence'])









    Out[20]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      1.0
      5.0
      19.0
      81.0
      12700.0

Frequency of tri-gram skills in JDs



In [21]:

    
quantile(occur_skills.query('n_word == 3')['occurrence'])









    Out[21]:






  
    
      
      min
      25%
      50% (median)
      75%
      max
    
  
  
    
      0
      1.0
      2.0
      7.0
      27.0
      2502.0

Distribution of skill frequency



In [22]:

    
n, bins, patches = plt.hist(x=occur_skills['occurrence']/10**3, bins=50, facecolor='blue', alpha=0.75, log=True)
plt.title('Histogram of skill frequency')
plt.xlabel(r'Frequency ($\times 10^3$)') # in thousand
plt.ylabel('No. of skills (log scale)')
plt.ylim(1, 10**4)
plt.grid(True)
plt.savefig(REPORT_DIR + 'skill_occur.pdf')

plt.show()
plt.close()



In [378]:

    
# bi_gram_skills.sort_values(by='occurrence', inplace=True, ascending=False)
# print('10 most common bi-gram skills')
# print(bi_gram_skills.head(10))

Skills which actually occur in JDs from JobBanks

Filter out all skills that never occur in JDs to reduce size of doc-skill feature matrix.



In [18]:

    
occur_skills, no_occur_skills = skillsPresentInJD(df=trigram_skills)
occur_skills = occur_skills.sort_values(by='occurrence',  ascending=False) # inplace=True,

Re-build the doc-skill matrix where each skill occurs at least once.



In [94]:

    
# occur_skills['skill'].head(3)
trigram_skills, doc_trigram_freq = freq(occur_skills['skill'], max_n_word=3)









    



Counting occurrence of skills with length <=3 ...
Done after 98.7s



In [98]:

    
print('No. of skills in the new doc-skill matrix after re-building: %d' %doc_trigram_freq.shape[1] )









    



No. of skills in the new doc-skill matrix after re-building: 14829



In [97]:

    
trigram_skills.to_csv(REPORT_DIR + 'trigram_skills.csv')
pd.DataFrame(doc_trigram_freq.data).to_csv(REPORT_DIR + 'doc_trigram_freq.csv')

# unigram_skills.to_csv(REPORT_DIR + 'unigram_skills.csv')
# pd.DataFrame(doc_unigram_freq.data).to_csv(REPORT_DIR + 'doc_unigram_freq.csv')
# sevengram_skills.to_csv(REPORT_DIR + 'sevengram_skills.csv')

	job_id	job_description_clob	job_other_requirements_clob	month_year	text	length	unigram_freq	unigram_ratio	clean_text
105469	JOB-2014-0110950	Bookkeeping	O levels , LCCI Intermediate, 2 years experien...	Nov-14	Bookkeeping	1	4	4.0	Bookkeeping
4620	JOB-2014-0007628	Testing	Testing	Jul-14	Testing	1	1	1.0	Testing
115833	JOB-2014-0121378	<p>\r\r\r\r\r\r\n\t• Java<br />\r\r\r\r\r...	<p>\r\r\r\r\r\r\n\t• Involved/experience ...	Dec-14	\r\r\r\r\r\r\n\t• Java\r\r\r\r\r\r\n\t• Spring...	10	3	0.3	Java Spring Hibernate REST API JSON XML Tomcat...

	jd_id	n_unigram
190543	JOB-2015-0196805	119
210604	JOB-2015-0217075	106
260098	JOB-2015-0267654	103
260090	JOB-2015-0267646	103
33133	JOB-2014-0036543	99
203037	JOB-2015-0209415	98
216119	JOB-2015-0222694	98
256206	JOB-2015-0263242	98
203547	JOB-2015-0209933	97
140713	JOB-2015-0146455	97

	job_id	clean_text	n_unigram
307	JOB-2014-0003245	Design sportswear shoe	2
234060	JOB-2015-0240867	HVAC ENGINEERING	2
148984	JOB-2015-0154808	Jewelry designer	2
39364	JOB-2014-0044267	Peformance Testing Mobile Testing	2
234248	JOB-2015-0241058	Project Management	2
162273	JOB-2015-0168245	CARPET SALES	2
92784	JOB-2014-0098164	CALIBRATION TESTING	2
18577	JOB-2014-0021820	Food Preparation Food Serving	2
193710	JOB-2015-0200007	Basic accounting	2
116694	JOB-2014-0122242	VBA MACRO	2

	skill	occurrence
31013	management	102822
22914	support	101531
2176	business	93456
22585	project	85198
11806	team	79446
15184	sales	64598
7465	development	56053
20970	design	53787
11387	responsible	48099
2235	customer	46398