Preparations

  • Import libraries:

In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc

from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *
  • Define needed helpers:

In [308]:
def skill_length_hist(skill_df):
    min_n_word = np.min(skill_df['n_word'])
    max_n_word = np.max(skill_df['n_word'])
    n, bins, patches = plt.hist(skill_df['n_word'], bins= range(min_n_word, max_n_word+1), facecolor='blue', 
                                log=True, align='left', rwidth=.5)
    
    plt.xlabel('No. of words in skill (skill length)')
    plt.ylabel('No. of skills (log scale)')
    plt.title('Distribution of skill length')
    plt.xticks(range(min_n_word, max_n_word+1))
    plt.grid(True)
#     plt.savefig(REPORT_DIR + 'skill_length.pdf')

    plt.show()
    plt.close()
# end 

def freq(skills=None, docs=None, max_n_word=1):
    t0 = time()
    print('Counting occurrence of skills with length <= %d ...' %max_n_word)
    
    count_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))  
    doc_term_mat = count_vectorizer.fit_transform(docs)
    print('Done after %.1fs' %(time() - t0))

    # Sum over all documents to obtain total occurrence of each skill token
    token_counts = np.asarray(doc_term_mat.sum(axis=0)).ravel()
    
    df = pd.DataFrame({'skill': skills})
    df['occurrence'] = token_counts
    return df, doc_term_mat

def skillsPresentInJD(df):
    occur_skills = df.query('occurrence > 0')
    no_occur_skills = df.query('occurrence == 0')
    return occur_skills, no_occur_skills

def n_match_skills(df):
    occur_skills = df.query('occurrence > 0')
    return occur_skills.shape[0]

def get_top_words(n_top_words, word_dist, feature_names):
    norm_word_dist = np.divide(word_dist, sum(word_dist))
    sorting_idx = word_dist.argsort()
    top_words = [feature_names[i] for i in sorting_idx[:-n_top_words - 1:-1]]
    probs = [norm_word_dist[i] for i in sorting_idx[:-n_top_words - 1:-1]]
    
    return pd.DataFrame({'top_words': top_words, 'word_probs': probs})

In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'
Load clean job descriptions (w/o html tags):

In [304]:
jd_df = pd.read_csv(DATA_DIR + 'jd_df.csv')
n_jd = jd_df.shape[0]

In [305]:
jd_df['clean_text'] = [' '.join(words_in_doc(d)) for d in jd_df['text']]
print('Some sample JD records:')
jd_df.head(3)


Some sample JD records:
Out[305]:
job_id job_description_clob job_other_requirements_clob month_year text length unigram_freq unigram_ratio clean_text
105469 JOB-2014-0110950 Bookkeeping O levels , LCCI Intermediate, 2 years experien... Nov-14 Bookkeeping 1 4 4.0 Bookkeeping
4620 JOB-2014-0007628 Testing Testing Jul-14 Testing 1 1 1.0 Testing
115833 JOB-2014-0121378 <p>\r\r\r\r\r\r\n\t&bull; Java<br />\r\r\r\r\r... <p>\r\r\r\r\r\r\n\t&bull; Involved/experience ... Dec-14 \r\r\r\r\r\r\n\t• Java\r\r\r\r\r\r\n\t• Spring... 10 3 0.3 Java Spring Hibernate REST API JSON XML Tomcat...
  • Get the text of JD records for further analysis. We need to use lower cases for JDs so that we can match with lowercased skills later.

In [306]:
jd_docs = jd_df['clean_text'].apply(str.lower)
Load skill lists obtained from LinkedIn & ONET:

In [3]:
linkedin_skill_df = pd.read_csv(DATA_DIR + 'LinkedInSkillsList_10.csv')
linkedin_skills = linkedin_skill_df['skill']

onet_skill_df = pd.read_csv(DATA_DIR + 'onet_skills_list_all.csv')
onet_skills = onet_skill_df['skill'].apply(str.lower)
  • Join two skill lists & remove duplicated skills:

In [4]:
skills = linkedin_skills.append(onet_skills)
skills = list(set(skills))
pd.DataFrame({'skill': skills}).to_csv(DATA_DIR + 'all_skills.csv')
Average no. of words in skills

In [10]:
skill_df = pd.DataFrame({'skill': skills})
skill_df['n_word'] = skill_df['skill'].apply(n_word)
quantile(skill_df['n_word'])


Out[10]:
min 25% 50% (median) 75% max
0 1.0 2.0 2.0 3.0 13.0
Distribution of skill length (no. of words in skill):

In [11]:
skill_length_hist(skill_df)


Based on the quartile summary and the distribution, we can try the following options:

  • including only 1-gram, 2-gram, 3-gram skills in our vocabulary (as 75% of skills have no more than 3 words)
  • including up to 7-gram skills in our vocabulary (as skills with more than 7 words only occuppy a small portion)
Statistics of data

In [396]:
stats = pd.DataFrame({'# JDs': n_jd, 
                      '# LinkedIn skills': len(linkedin_skills), '# ONET skills': len(onet_skills), 
                      'Total no. of unique skills': len(skills), 
                      'min skill length': min(skill_df['n_word']), 'max skill length': max(skill_df['n_word'])}, 
                     index=[0])

stats.to_csv(DATA_DIR + 'stats.csv')
stats


Out[396]:
# JDs # LinkedIn skills # ONET skills Total no. of unique skills max skill length min skill length
0 263477 18251 27025 44919 13 1

1. Skill occurrence

Each skill is considered as a token. We count occurrence of each skill in documents and return the counts in a term-document matrix.

  • Tokens_1 = {skills with length = 1} (uni-gram skills)
  • Tokens_3 = {skills with length <= 3}
  • Tokens_7 = {skills with length <= 7}

In [310]:
unigram_skills, doc_unigram_freq = freq(skills, jd_docs, max_n_word=1)


Counting occurrence of skills with length <=1 ...
Done after 31.0s

In [311]:
trigram_skills, doc_trigram_freq = freq(skills, jd_docs, max_n_word=3)


Counting occurrence of skills with length <=3 ...
Done after 97.3s

In [379]:
# sevengram_skills, doc_7gram_freq = freq(skills, jd_docs, max_n_word=7)

In [312]:
unigram_match, trigram_match = n_match_skills(df=unigram_skills), n_match_skills(df=trigram_skills)
# sevengram_match = n_match_skills(df=sevengram_skills)

pd.DataFrame({'# matching skills': [unigram_match, trigram_match]}, 
            index=['Tokens_1', 'Tokens_3'])


Out[312]:
# matching skills
Tokens_1 5212
Tokens_3 14829

As the difference between Tokens_3 and Tokens_7 is negligible, we can just use the former for analysis. This means we only include 1-, 2- and 3-gram skills in our subsequent analysis.

2. Skill occurrence per document

Uni-gram skills


In [380]:
t0 = time()
print('Counting occurrence of uni-gram skills...')
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)  
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))


Counting occurrence of uni-gram skills...
Done after 30.3s
No. of unique uni-grams per document

In [381]:
## For each doc, "its no. of unique uni-grams = no. of non-zero counts" in its row in doc-term mat
def n_non_zero(r, sp_mat):
    return len(sp_mat.getrow(r).nonzero()[1])

n_unigram = [n_non_zero(r, doc_unigram_freq) for r in range(n_jd)]
# sum(n_unigram) == len(doc_unigram_freq.nonzero()[0]) # sanity check


Out[381]:
jd_id n_unigram
190543 JOB-2015-0196805 119
210604 JOB-2015-0217075 106
260098 JOB-2015-0267654 103
260090 JOB-2015-0267646 103
33133 JOB-2014-0036543 99
203037 JOB-2015-0209415 98
216119 JOB-2015-0222694 98
256206 JOB-2015-0263242 98
203547 JOB-2015-0209933 97
140713 JOB-2015-0146455 97

In [395]:
jd_df['n_unigram'] = n_unigram
print quantile(n_unigram)


   min  25%  50% (median)   75%    max
0  0.0  8.0          14.0  22.0  119.0

In [394]:
# pull up some JDs to check
tmp = jd_df.query('n_unigram == 2')
print tmp.shape
tmp[['job_id', 'clean_text', 'n_unigram']].head(10)


(6464, 10)
Out[394]:
job_id clean_text n_unigram
307 JOB-2014-0003245 Design sportswear shoe 2
234060 JOB-2015-0240867 HVAC ENGINEERING 2
148984 JOB-2015-0154808 Jewelry designer 2
39364 JOB-2014-0044267 Peformance Testing Mobile Testing 2
234248 JOB-2015-0241058 Project Management 2
162273 JOB-2015-0168245 CARPET SALES 2
92784 JOB-2014-0098164 CALIBRATION TESTING 2
18577 JOB-2014-0021820 Food Preparation Food Serving 2
193710 JOB-2015-0200007 Basic accounting 2
116694 JOB-2014-0122242 VBA MACRO 2

Based on the quartile summary, $50$% of documents contain $\leq 14$ uni-gram skills and $75$% of them contain $\leq 22$ uni-gram skills (which looks reasonable).


In [387]:
fig = plt.figure(figsize=(10, 6))
ns, bins, patches = plt.hist(n_unigram, bins=np.unique(n_unigram), rwidth=.5)
plt.xlabel('No. of unique uni-gram skills in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of uni-gram skills in job descriptions')
plt.grid(True)

plt.show()
fig.savefig(REPORT_DIR + 'unigram_in_jd.pdf')
plt.close(fig)



In [386]:
print('Ratio of JDs with no uni-gram skills: %.3f' %round(n_unigram.count(0)/float(n_jd), 3))


Ratio of JDs with no uni-gram skills: 0.007
Ratio of unigram skill tokens in JDs

In [313]:
jd_df['length'] = [n_words_in_doc(d) for d in jd_df['text']]
# print sum(jd_df['length'] == 0)

jd_df['unigram_freq'] = doc_unigram_freq.sum(axis=1).A1
clean_jd_df = jd_df.query('length > 0')
del clean_jd_df['text']
clean_jd_df['unigram_ratio'] = np.divide(clean_jd_df['unigram_freq'], clean_jd_df['length']*1.)
clean_jd_df = clean_jd_df.sort_values(by='unigram_ratio', ascending=False)

In [383]:
print quantile(clean_jd_df['unigram_ratio'], dec=2)
# clean_jd_df.to_csv(DATA_DIR + 'jd_df.csv')


   min   25%  50% (median)   75%  max
0  0.0  0.16           0.2  0.26  1.0

In [318]:
plt.hist(clean_jd_df['unigram_ratio'])
plt.xlabel('Ratio of unigram skill tokens in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of ratio of unigram skills')
plt.savefig(REPORT_DIR + 'unigram_ratio.pdf')

plt.show()
plt.close()


Decision: Based on the distribution, half of JDs have $\leq 20$% of their words are uni-gram skills (and 75% of them having unigram skills occupying $\leq 26$% of their words). Thus, we may first want to try on the sub dataset of JDs where uni-gram skills occuppy at least $20$%.


In [429]:
jd_df.columns


Out[429]:
Index([u'job_id', u'job_description_clob', u'job_other_requirements_clob',
       u'month_year', u'text', u'length', u'unigram_freq', u'unigram_ratio',
       u'clean_text', u'n_unigram'],
      dtype='object')

In [430]:
sub_df = jd_df.query('n_unigram >= 5')
sub_df = sub_df.sort_values(by='length', ascending=False)
print sub_df.shape
# sub_df.head(10)


(232559, 10)
  • First, we try using only uni-gram skills as features.

In [431]:
unigram_df, doc_unigram_freq  = freq(skills, max_n_word=1, docs=sub_df['clean_text'])
doc_unigram_freq.shape


Counting occurrence of skills with length <=1 ...
Done after 29.9s
Out[431]:
(232559, 44919)

In [432]:
occur_unigrams = unigram_df.query('occurrence > 0')['skill']
unigram_df, doc_unigram_freq = freq(skills=occur_unigrams, docs=sub_df['clean_text'])
doc_unigram_freq.shape


Counting occurrence of skills with length <=1 ...
Done after 29.7s
Out[432]:
(232559, 5196)

In [434]:
n_unigram_skill = doc_unigram_freq.shape[1]
n_document = [n_non_zero(r, doc_unigram_freq.transpose()) for r in range(n_unigram_skill)]

In [447]:
tmp_df = pd.DataFrame({'skill' : occur_unigrams, 'n_doc' : n_document}) 
tmp_df = tmp_df.sort_values(by='n_doc', ascending=False)
sum(tmp_df['n_doc'] == 1)


Out[447]:
311

In [435]:
quantile(n_document)


Out[435]:
min 25% 50% (median) 75% max
0 1.0 8.0 37.0 211.2 84111.0

In [439]:
plt.hist(n_document, bins=np.unique(n_document), log=True)
# plt.xlabel('No. of documents containing the skill')
# plt.ylabel

plt.show()


Skill Clustering by NMF & LDA

Split into training and test sets


In [406]:
n_ins = sub_df.shape[0]
train_idx, test_idx = mkPartition(n_instances=n_ins)
X_train, X_test = doc_unigram_freq[train_idx, :], doc_unigram_freq[test_idx, :]

Set global arguments:

  • no. of topics: k in {5, 10, ..., 50}
  • no. of top words to be printed out in result
  • directory to save results

In [411]:
# ks, n_top_words = range(2, 11), 10
ks, n_top_words = range(5, 25, 5), 10
RES_DIR = REPORT_DIR + 'r4/'

A. Using NMF

Trainning NMF using random initialization


In [412]:
rnmf = {k: NMF(n_components=k, random_state=0) for k in ks}
print( "Fitting NMF using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins) ) # (random initialization)
print('No. of topics, Error, Running time')
rnmf_error = []

for k in ks:
    t0 = time()
    rnmf[k].fit(X_train)
    elapsed = time() - t0
    err = rnmf[k].reconstruction_err_
    print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
    rnmf_error.append(err)
# end


Fitting NMF using 4359 uni-gram skills on 62158 job descriptions...
No. of topics, Error, Running time
5, 1427.5, 3.7s
10, 1340.7, 1.6s
15, 1282.7, 5.3s
20, 1234.2, 4.5s

Evaluating NMF on test data

First, we choose the best no. of topics $k^*$ for random NMF as the one that minimizes the error of predicting test data. For that, we compute the error for different $k$'s by the following function.


In [357]:
from numpy import linalg as la

def cal_test_err(models): 
    test_error = []
    print('No. of topics, Test error, Running time')

    for k in ks:
        t0 = time()
        H = models[k].components_
        W_test = models[k].fit_transform(X_test, H=H)
        err = la.norm(X_test - np.matmul(W_test, H))
        
#         sp_W_test = csr_matrix(W_test)
#         sp_H = csc_matrix(H)
#         print(sp_W_test.shape)
#         print(sp_H.shape)
#         err = la.norm(X_test - sp_W_test * sp_H)
        test_error.append(err)
        print('%d, %0.1f, %0.1fs' %(k, err, time() - t0))
    
    return test_error

In [413]:
print('Calculating test errors of random NMF to choose best no. of topics...')
rnmf_test_error = cal_test_err(models=rnmf)


Calculating test errors of random NMF to choose best no. of topics...
No. of topics, Test error, Running time
5, 783.8, 1.0s
10, 709.2, 1.1s
15, 744.6, 2.3s
20, 781.4, 2.3s

In [414]:
best_k = ks[np.argmin(rnmf_test_error)]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]
rnmf_word_dists = pd.DataFrame(rnmf_best.components_).apply(normalize, axis=1)
# rnmf_word_dists.to_csv(RES_DIR + 'rnmf_word_dists.csv', index=False)


The best no. of topics is 10

We now validate NMF results in the following ways:

  • if learnt topics make sense
  • if topic it predicts for each JD in test set makes sense
Learnt topics

We manually label each topic based on its top 10 words.


In [360]:
# count_vectorizer = text_manip.CountVectorizer(vocabulary= occur_skills['skill'], ngram_range=(1, 3))
# count_vectorizer.fit_transform(jd_docs)

# nmf_features = count_vectorizer.get_feature_names()
# nmf_top_words = top_words_df(rnmf_best, n_top_words, nmf_features)

# nmf_top_words.to_csv(REPORT_DIR + 'nmf_top_words.csv', index=False)
# pd.DataFrame(nmf_features).to_csv(REPORT_DIR + 'nmf_features.csv')
Topic prediction on test JDs

In [361]:
# H = rnmf_best.components_
# W_test = pd.DataFrame(rnmf_best.fit_transform(X_test, H=H))
# W_test.to_csv(REPORT_DIR + 'nmf_doc_topic_distr.csv', index=False)

B. Using LDA

Trainning


In [415]:
scores = []
lda = {k: LatentDirichletAllocation(n_topics=k, max_iter=5, learning_method='online', learning_offset=50.,
                                   random_state=0) # verbose=1
 for k in ks}

print("Fitting LDA using %d uni-gram skills on %d job descriptions..." % (len(occur_unigrams), n_ins))
print('No. of topics, Log-likelihood, Running time')

for k in ks:
    t0 = time()
    lda[k].fit(X_train)
    s = lda[k].score(X_train)
    print('%d, %0.1f, %0.1fs' %(k, s, time() - t0))
    scores.append(s)
# end


Fitting LDA using 4359 uni-gram skills on 62158 job descriptions...
No. of topics, Log-likelihood, Running time
5, -9766356.7, 40.8s
10, -14281289.0, 42.0s
15, -19006623.0, 45.2s
20, -23848096.5, 47.3s

Perplexity of LDA on test set


In [416]:
perp = [lda[k].perplexity(X_test) for k in ks]
perp_df = pd.DataFrame({'No. of topics': ks, 'Perplexity': perp})
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)
  • Choose the best no. of topics as the one minimizing perplexity.

In [417]:
best_k = ks[np.argmin(perp)]
print('Best no. of topics: %d' %best_k)


Best no. of topics: 5
  • Save the best LDA model:

In [365]:
# lda_best = lda[best_k]
# lda_word_dists = pd.DataFrame(lda_best.components_).apply(normalize, axis=1)
# lda_word_dists.to_csv(RES_DIR + 'lda_word_dists.csv', index=False)

# lda_features = count_vectorizer.get_feature_names()
# lda_topics = top_words_df(lda_best, n_top_words, lda_features)

# lda_topics.to_csv(REPORT_DIR + 'lda_topics.csv', index=False)
# pd.DataFrame(lda_features).to_csv(RES_DIR + 'lda_features.csv')

Using the best LDA to perform topic prediction on test JDs


In [366]:
# doc_topic_distr  = lda_best.transform(X_test)
# pd.DataFrame(doc_topic_distr).to_csv(RES_DIR + 'lda_doc_topic_distr.csv', index=False)

C. Model Comparison


In [421]:
# Put all model metrics on training & test datasets into 2 data frames
model_list = ['LDA', 'randomNMF']

train_metric = pd.DataFrame({'No. of topics': ks, 'LDA': np.divide(scores, 10**6), 'randomNMF': rnmf_error})

test_metric = pd.DataFrame({'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error, })

In [424]:
ks


Out[424]:
[5, 10, 15, 20]

Performance of models for different number of topics


In [423]:
fig = plt.figure(figsize=(10, 6))

for i, model in enumerate(model_list):
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)  
    #     train metric
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Log likelihood ($\times 10^6$)')
    else:
        plt.ylabel(r'$\| X_{train} - W_{train} H \|_2$')
    plt.grid(True)
    
    #     test metric
#     plt.subplot(2, 2, i+3)
#     plt.title(model)
#     plt.plot(ks, test_metric[model], 'r')
#     plt.xlabel('No. of topics')
#     if model == 'LDA':
#         plt.ylabel(r'Perplexity')
#     else:
#         plt.ylabel(r'$\| X_{test} - W_{test} H \|_2$')
#     plt.grid(True)
        
# end
plt.show()
# fig.savefig(RES_DIR + 'new_lda_vs_nmf.pdf')
plt.close(fig)


Multi-gram skills


In [369]:
t0 = time()
print('Counting occurrence of multi-gram skills...')
multi_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(2, 3))
multigram_doc_mat = multi_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))


Counting occurrence of multi-gram skills...
Done after 71.5s
Distribution of multigrams in documents

In [370]:
n_multigram = [n_non_zero(r, doc_multigram_freq) for r in range(n_jd)]
multigram_df = pd.DataFrame({'jd_id': jd_df['job_id'], 'n_multigram': n_multigram})

quantile(n_multigram)


Out[370]:
min 25% 50% (median) 75% max
0 0.0 1.0 2.0 5.0 45.0

In [371]:
plt.hist(n_multigram, bins=np.unique(n_multigram))
plt.xlabel('No. of multi-grams in job description')
plt.ylabel('No. of job descriptions')
plt.title('Distribution of multi-grams in job descriptions')
plt.grid(True)

plt.show()
plt.savefig(REPORT_DIR + 'multigram_in_jd.pdf')
plt.close()


Overlapping between uni-gram & multi-gram skills


In [372]:
unigram_skills, bigram_skills = occur_skills.query('n_word == 1')['skill'], occur_skills.query('n_word == 2')['skill']
## for each unigram, find no. of bigrams containing it (aka super-bigrams)
def super_bigrams(unigram='business', bigrams=bigram_skills):
    idx = [s.find(unigram) for s in bigrams]
    df = pd.DataFrame({'bigram': bigrams, 'idx_of_given_unigram': idx})
    return df.query('idx_of_given_unigram > -1') # those bigrams not containing the unigram give -1 indices
    
def n_super_bigrams(unigram='business', bigrams=bigram_skills):
    idx = [s.find(unigram) for s in bigrams]
    return len(bigrams) - idx.count(-1)

In [373]:
# super_bigrams(unigram='business')
n_super_bigrams = [n_super_bigrams(ug) for ug in unigram_skills]
overlap_df = pd.DataFrame({'unigram': unigram_skills, 'n_super_bigrams': n_super_bigrams})

quantile(n_super_bigrams)


Out[373]:
min 25% 50% (median) 75% max
0 0.0 0.0 0.0 1.0 1278.0

In [399]:
plt.hist(n_super_bigrams)

plt.show()
plt.close()



In [402]:
overlapped_unigrams = overlap_df.query('n_super_bigrams > 0')
overlapped_unigrams = overlapped_unigrams.sort_values(by='n_super_bigrams', ascending=False)
# overlapped_unigrams.head(10)

n_overlap = overlapped_unigrams.shape[0]
n_unigram_skills = overlap_df.shape[0]
print n_overlap
print n_unigram_skills
n_overlap*1./n_unigram_skills


1888
5212
Out[402]:
0.3622409823484267
10 most common skills:

In [375]:
unigram_df = unigram_df.sort_values(by='occurrence', ascending=False)

In [376]:
top10 = unigram_df[['skill', 'occurrence']].head(10)
# top10.reset_index(inplace=True, drop=True)
top10


Out[376]:
skill occurrence
31013 management 102822
22914 support 101531
2176 business 93456
22585 project 85198
11806 team 79446
15184 sales 64598
7465 development 56053
20970 design 53787
11387 responsible 48099
2235 customer 46398
10 least common skills:

In [377]:
bottom10 = unigram_df[['skill', 'occurrence']].tail(10)
bottom10.reset_index(inplace=True, drop=True)
print(bottom10)


         skill  occurrence
0       trawls           1
1         sdsf           1
2    jprofiler           1
3      pulleys           1
4        toefl           1
5     griddles           1
6  respiration           1
7         cmbs           1
8       gambit           1
9     rabbitmq           1
Frequency of uni-gram skills in JDs

In [19]:
quantile(occur_skills.query('n_word == 1')['occurrence'])


Out[19]:
min 25% 50% (median) 75% max
0 1.0 10.0 47.0 275.0 170302.0
Frequency of bi-gram skills in JDs

In [20]:
quantile(occur_skills.query('n_word == 2')['occurrence'])


Out[20]:
min 25% 50% (median) 75% max
0 1.0 5.0 19.0 81.0 12700.0
Frequency of tri-gram skills in JDs

In [21]:
quantile(occur_skills.query('n_word == 3')['occurrence'])


Out[21]:
min 25% 50% (median) 75% max
0 1.0 2.0 7.0 27.0 2502.0
Distribution of skill frequency

In [22]:
n, bins, patches = plt.hist(x=occur_skills['occurrence']/10**3, bins=50, facecolor='blue', alpha=0.75, log=True)
plt.title('Histogram of skill frequency')
plt.xlabel(r'Frequency ($\times 10^3$)') # in thousand
plt.ylabel('No. of skills (log scale)')
plt.ylim(1, 10**4)
plt.grid(True)
plt.savefig(REPORT_DIR + 'skill_occur.pdf')

plt.show()
plt.close()



In [378]:
# bi_gram_skills.sort_values(by='occurrence', inplace=True, ascending=False)
# print('10 most common bi-gram skills')
# print(bi_gram_skills.head(10))

Skills which actually occur in JDs from JobBanks

  • Filter out all skills that never occur in JDs to reduce size of doc-skill feature matrix.

In [18]:
occur_skills, no_occur_skills = skillsPresentInJD(df=trigram_skills)
occur_skills = occur_skills.sort_values(by='occurrence',  ascending=False) # inplace=True,
  • Re-build the doc-skill matrix where each skill occurs at least once.

In [94]:
# occur_skills['skill'].head(3)
trigram_skills, doc_trigram_freq = freq(occur_skills['skill'], max_n_word=3)


Counting occurrence of skills with length <=3 ...
Done after 98.7s

In [98]:
print('No. of skills in the new doc-skill matrix after re-building: %d' %doc_trigram_freq.shape[1] )


No. of skills in the new doc-skill matrix after re-building: 14829

In [97]:
trigram_skills.to_csv(REPORT_DIR + 'trigram_skills.csv')
pd.DataFrame(doc_trigram_freq.data).to_csv(REPORT_DIR + 'doc_trigram_freq.csv')

# unigram_skills.to_csv(REPORT_DIR + 'unigram_skills.csv')
# pd.DataFrame(doc_unigram_freq.data).to_csv(REPORT_DIR + 'doc_unigram_freq.csv')
# sevengram_skills.to_csv(REPORT_DIR + 'sevengram_skills.csv')