Preparations

  • Import libraries:

In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc

from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *

# my own modules
from my_util import *
from helpers import *
  • Load data:

In [ ]:
# Project directory layout.
# NOTE(review): hardcoded absolute Windows path — consider making this
# configurable (env var / pathlib) so the notebook runs on other machines.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'  # cleaned input data (posts.csv, skills.csv)
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'  # output tables / figures

Filtering

There are two goals: i) to remove JDs with too few skills, and ii) to remove skills occurring in too few JDs. Thus, we repeat the following process until the two goals are satisfied.

  • Count no. of unique skills in each JD
  • Remove JDs with $\le 1$ skills
  • Count no. of JDs containing each skill
  • Remove skills occurring in $\le 1$ JDs

In [ ]:
# posts, skills = helpers.filtering(init_posts, init_skills=skills)

In [ ]:
# job descriptions (JDs)
# Job descriptions (JDs): load the cleaned posts and lower-case their text
# for vocabulary matching downstream.
posts = pd.read_csv(DATA_DIR + 'posts.csv')
jd_docs = posts['clean_text'].str.lower().tolist()

In [ ]:
# Skill vocabulary: one row per skill.
skill_df = pd.read_csv(DATA_DIR + 'skills.csv')
skills = skill_df['skill']

# Dataset sizes (len(posts) == posts.shape[0]).
n_skill = len(skills)
n_jd = len(posts)
print('No. of skills: %d' %n_skill)
print('No. of JDs: %d' %n_jd) # some garbage JDs with no text already removed
print('3 most popular skills')
skill_df.head(3)

In [ ]:
# Tail of skill_df holds the rarest skills (per the print label).
print('3 least popular skills')
skill_df.tail(3)

In [ ]:
# skill_df['n_word'] = skill_df['skill'].apply(my_util.n_word)
# skill_df.to_csv(DATA_DIR + 'skills.csv', index=False)

In [ ]:
# NOTE(review): this cell previously read from `occur_df`, which is not
# defined until much later in the notebook (and, once defined there, has no
# 'n_word' column). `skill_df` is the frame that carries 'n_word'
# (see the commented computation above and the queries below), so it is
# used here instead.
print('Maximum length (no. of words) of skills: %d' 
      % max(skill_df['n_word']))

# Partition the vocabulary by n-gram length.
uni_gram_skills = skill_df.query('n_word == 1')['skill']
bi_gram_skills = skill_df.query('n_word == 2')['skill']
tri_gram_skills = skill_df.query('n_word == 3')['skill']

print('Among them, we have:' )
pd.DataFrame([len(uni_gram_skills), len(bi_gram_skills), len(tri_gram_skills)],
            index=['n_unigram_skill', 'n_bigram_skill', 'n_trigram_skill'])

Occurrence of unigram, bigram and trigram skills in JDs


In [ ]:
# NOTE(review): `my_util.quantile` raised NameError — the imports cell uses
# `from my_util import *`, which binds `quantile` but not the module name
# `my_util` (a later cell already calls bare `quantile`). Fixed to the bare name.
unigram_occur = skill_df.query('n_word == 1')['n_jd_with_skill']
unigram_quant = quantile(unigram_occur)

bigram_occur = skill_df.query('n_word == 2')['n_jd_with_skill']
bigram_quant = quantile(bigram_occur)

trigram_occur = skill_df.query('n_word == 3')['n_jd_with_skill']
trigram_quant = quantile(trigram_occur)

# One summary row per n-gram class.
occur_quant = pd.concat([unigram_quant, bigram_quant, trigram_quant])
occur_quant.index = ['unigram skills', 'bigram skills', 'trigram skills']
occur_quant

In [ ]:
# Persist the occurrence quantile table for the report.
occur_quant.to_csv(REPORT_DIR + 'skill_occur.csv')
  • Distributions:

In [ ]:
# Bundle per-n-gram occurrence counts, then plot the unigram distribution.
# Log-scaled x-axis: occurrence counts appear heavy-tailed.
gram_occur = {1: unigram_occur, 2: bigram_occur, 3: trigram_occur}
plt.hist(x=gram_occur[1], bins=np.unique(gram_occur[1]))

plt.xscale('log')
plt.title('1-gram')
plt.xlabel('# JDs containing skill')
plt.ylabel('# skills')

plt.show()
plt.close()

In [ ]:
def plot1(df=gram_occur, w=10, h=6):
    """Plot one histogram panel per n-gram size (1, 2, 3) showing how many
    JDs contain each skill, on a log-scaled x-axis.

    df: dict mapping n -> per-skill JD occurrence counts (defaults to the
        gram_occur dict defined above).
    w, h: figure width and height in inches.
    Returns the matplotlib Figure (after displaying it).
    """
    fig = plt.figure(figsize=(w, h))
    for n in (1, 2, 3):
        plt.subplot(3, 1, n)
        plt.subplots_adjust(wspace=.5, hspace=.5)
        plt.hist(x=df[n], bins=np.unique(df[n]))
        plt.xscale('log')
        plt.xlabel('# JDs containing skill')
        plt.ylabel('# {}-gram skills'.format(n))
    plt.show()
    return fig

In [ ]:
# Render the 3-panel occurrence figure; savefig lines kept (commented) for re-export.
fig1 = plot1(w=7, h=10)
# fig1.savefig(REPORT_DIR + 'skill_occur.pdf')
# fig1.savefig(REPORT_DIR + 'skill_occur.jpeg')
plt.close(fig1)  # free the figure's memory once displayed

In [ ]:
# Quick visual sanity check of the loaded posts.
print('Sample job posts')
posts.head()

In [ ]:
# Distribution over JDs of the number of unique skills each one contains.
uniq_skill_counts = posts['n_uniq_skill']
n, bins, patches = plt.hist(x=uniq_skill_counts, bins=np.unique(uniq_skill_counts))
plt.xlabel('# skills in JD')
plt.ylabel('# JDs')

plt.show()

In [ ]:
def countOccur_ngram(n=1, docs=None, vocab=None, ids=None):
    """Count, per JD, how many distinct n-gram skills occur in it.

    Generalized (backward-compatibly) from implicit globals to parameters:
    n: n-gram length to consider.
    docs: iterable of documents to scan (defaults to global jd_docs).
    vocab: skill vocabulary for the vectorizer (defaults to global skills).
    ids: job ids aligned with docs (defaults to posts['job_id']).
    Returns a DataFrame with columns 'job_id' and 'n_{n}gram'.
    """
    if docs is None:
        docs = jd_docs
    if vocab is None:
        vocab = skills
    if ids is None:
        ids = posts['job_id']
    t0 = time()
    print('Marking occurrence of {}-gram skills...'.format(n))
    # binary=True: we only mark presence/absence of each skill per document.
    vectorizer = text_manip.CountVectorizer(vocabulary=vocab, binary=True, ngram_range=(n,n))
    doc_ngram_occurrence = vectorizer.fit_transform(docs)
    print('Done after %.1fs' %(time() - t0))
    # Row sums of the binary doc-term matrix = number of distinct skills per doc
    # (.A1 flattens the 1-column np.matrix result to a 1-D array).
    n_ngram_by_jd = doc_ngram_occurrence.sum(axis=1).A1
    return pd.DataFrame({'job_id': ids, 'n_{}gram'.format(n): n_ngram_by_jd})
  • Consider only n-grams separately:

In [ ]:
# NOTE(review): these assignments shadow the earlier Series of the same names
# (built from skill_df above) with per-JD count DataFrames — cells above that
# use unigram_occur etc. will break if re-run after this point.
unigram_occur = countOccur_ngram(n=1)
bigram_occur = countOccur_ngram(n=2)
trigram_occur = countOccur_ngram(n=3)

In [ ]:
# Join the three per-n-gram count frames into one table keyed by job_id.
occur_df = unigram_occur.merge(bigram_occur, on='job_id').merge(trigram_occur, on='job_id')

In [ ]:
# Total skills per JD = its 1-, 2- and 3-gram skill counts added together.
occur_df['n_skill'] = sum(occur_df['n_{}gram'.format(k)] for k in (1, 2, 3))
occur_df.head()

In [ ]:
from my_util import *  # redundant with the top import, kept so this cell runs standalone
# Quantile summary of skill counts per JD, one row per n-gram size.
ngram_cols = ['n_1gram', 'n_2gram', 'n_3gram']
n_skill_by_jd = pd.concat([quantile(occur_df[col]) for col in ngram_cols])
n_skill_by_jd.index = ['n_unigram', 'n_bigram', 'n_trigram']
n_skill_by_jd

In [ ]:
# Persist the per-JD skill-count quantile table for the report.
n_skill_by_jd.to_csv(REPORT_DIR + 'skill_by_jd.csv')

In [ ]:
def plot2(w=7, h=10):
    """Three-panel histogram: for each n in 1..3, the distribution over JDs
    of how many n-gram skills they contain.

    w, h: figure width and height in inches.
    Returns the matplotlib Figure (after displaying it).
    """
    fig = plt.figure(figsize=(w,h))
    for n in (1, 2, 3):
        plt.subplot(3, 1, n)
        plt.subplots_adjust(hspace=.25)

        ngram_counts = occur_df['n_{}gram'.format(n)]
        counts, bin_edges, patches = plt.hist(x=ngram_counts, bins=np.unique(ngram_counts), align='left')
        plt.xlim(0, max(bin_edges)+1)

        # Trigram counts are small; force integer ticks 0..10 for readability.
        if n == 3:
            plt.xticks(range(11))
        plt.xlabel('# {}-gram skills'.format(n), fontsize='large')
        plt.ylabel('# JDs', fontsize='large')

    plt.show()
    return fig

In [ ]:
# Render and export the per-JD skill-count figure, then free its memory.
fig2 = plot2()
fig2.savefig(REPORT_DIR + 'n_skill_in_jd.pdf')
fig2.savefig(REPORT_DIR + 'n_skill_in_jd.jpeg')
plt.close(fig2)