In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
# my own modules
from my_util import *
from helpers import *
In [ ]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
REPORT_DIR = HOME_DIR + 'reports/skill_cluster/'
There are two goals: (i) remove JDs containing too few skills, and (ii) remove skills occurring in too few JDs. The two interact: dropping JDs lowers each skill's occurrence count, and dropping skills lowers each JD's skill count. We therefore repeat the filtering process until both conditions hold at the same time; a rough sketch of such a filter follows the commented call below.
In [ ]:
# posts, skills = helpers.filtering(init_posts, init_skills=skills)
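The real implementation lives in the author's `helpers` module (commented call above) and is not shown in this notebook. Purely as an illustration, here is a minimal sketch of the alternating filter, assuming the JD-skill occurrences are held in a binary scipy CSR matrix; the threshold names `min_n_skill` and `min_n_jd` are hypothetical, not the project's actual parameters.
In [ ]:
# Illustrative sketch only -- NOT the real helpers.filtering.
# Assumes doc_skill is a binary scipy.sparse CSR matrix
# (rows = JDs, columns = skills); thresholds are hypothetical.
def filtering_sketch(doc_skill, min_n_skill=2, min_n_jd=5):
    keep_rows = np.ones(doc_skill.shape[0], dtype=bool)
    keep_cols = np.ones(doc_skill.shape[1], dtype=bool)
    while True:
        sub = doc_skill[keep_rows][:, keep_cols]
        row_ok = sub.sum(axis=1).A1 >= min_n_skill   # goal (i): enough skills per JD
        col_ok = sub.sum(axis=0).A1 >= min_n_jd      # goal (ii): enough JDs per skill
        if row_ok.all() and col_ok.all():
            return keep_rows, keep_cols              # both goals satisfied at once
        # map local failures back to global indices and drop them
        keep_rows[np.where(keep_rows)[0][~row_ok]] = False
        keep_cols[np.where(keep_cols)[0][~col_ok]] = False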
In [ ]:
# job descriptions (JDs)
posts = pd.read_csv(DATA_DIR + 'posts.csv')
jd_docs = list(posts['clean_text'].apply(str.lower))
In [ ]:
skill_df = pd.read_csv(DATA_DIR + 'skills.csv')
skills = skill_df['skill']
n_skill, n_jd = len(skills), posts.shape[0]
print('No. of skills: %d' %n_skill)
print('No. of JDs: %d' %n_jd) # some garbage JDs with no text already removed
print('3 most popular skills')
skill_df.head(3)
In [ ]:
print('3 least popular skills')
skill_df.tail(3)
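Note that `head(3)` and `tail(3)` show the most and least popular skills only if `skills.csv` is already sorted by document frequency. If it is not, an explicit sort on the `n_jd_with_skill` column (used below) would guarantee it:
In [ ]:
# assumption: popularity = no. of JDs containing the skill
skill_df = skill_df.sort_values('n_jd_with_skill', ascending=False).reset_index(drop=True)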
In [ ]:
# skill_df['n_word'] = skill_df['skill'].apply(my_util.n_word)
# skill_df.to_csv(DATA_DIR + 'skills.csv', index=False)
In [ ]:
print('Maximum length (no. of words) of skills: %d'
      % skill_df['n_word'].max())
uni_gram_skills = skill_df.query('n_word == 1')['skill']
bi_gram_skills = skill_df.query('n_word == 2')['skill']
tri_gram_skills = skill_df.query('n_word == 3')['skill']
print('Among them, we have:')
pd.DataFrame([len(uni_gram_skills), len(bi_gram_skills), len(tri_gram_skills)],
             index=['n_unigram_skill', 'n_bigram_skill', 'n_trigram_skill'])
In [ ]:
unigram_occur = skill_df.query('n_word == 1')['n_jd_with_skill']
unigram_quant = quantile(unigram_occur)
bigram_occur = skill_df.query('n_word == 2')['n_jd_with_skill']
bigram_quant = quantile(bigram_occur)
trigram_occur = skill_df.query('n_word == 3')['n_jd_with_skill']
trigram_quant = quantile(trigram_occur)
occur_quant = pd.concat([unigram_quant, bigram_quant, trigram_quant])
occur_quant.index = ['unigram skills', 'bigram skills', 'trigram skills']
occur_quant
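`quantile` comes from the author's `my_util` module, which is not shown. Given how its outputs are stacked with `pd.concat` and then re-indexed with one label per skill group, it plausibly returns a one-row DataFrame of distribution quantiles; a hypothetical stand-in with that shape (the quantile levels are an assumption) might look like:
In [ ]:
# Hypothetical stand-in for my_util.quantile (exact levels unknown):
# returns a one-row DataFrame so pd.concat yields one row per group.
def quantile_sketch(series, probs=(.25, .5, .75, .9, .99)):
    vals = series.quantile(list(probs)).values
    return pd.DataFrame([vals], columns=['{:.0%}'.format(p) for p in probs])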
In [ ]:
occur_quant.to_csv(REPORT_DIR + 'skill_occur.csv')
In [ ]:
gram_occur = {1: unigram_occur, 2: bigram_occur, 3: trigram_occur}
plt.hist(x=gram_occur[1], bins=np.unique(gram_occur[1]))
plt.xscale('log')
plt.title('1-gram')
plt.xlabel('# JDs containing skill')
plt.ylabel('# skills')
plt.show()
plt.close()
In [ ]:
def plot1(df=gram_occur, w=10, h=6):
    fig = plt.figure(figsize=(w, h))
    for i in range(3):
        n = i + 1
        plt.subplot(3, 1, n)
        plt.subplots_adjust(wspace=.5, hspace=.5)
        plt.hist(x=df[n], bins=np.unique(df[n]))
        plt.xscale('log')
        plt.xlabel('# JDs containing skill')
        plt.ylabel('# {}-gram skills'.format(n))
    plt.show()
    return fig
In [ ]:
fig1 = plot1(w=7, h=10)
# fig1.savefig(REPORT_DIR + 'skill_occur.pdf')
# fig1.savefig(REPORT_DIR + 'skill_occur.jpeg')
plt.close(fig1)
In [ ]:
print('Sample job posts')
posts.head()
In [ ]:
n, bins, patches = plt.hist(x=posts['n_uniq_skill'], bins=np.unique(posts['n_uniq_skill']))
# plt.title('Skill count')
plt.xlabel('# skills in JD')
plt.ylabel('# JDs')
plt.show()
In [ ]:
def countOccur_ngram(n=1):
    t0 = time()
    print('Marking occurrence of {}-gram skills...'.format(n))
    # binary=True: mark presence/absence of each skill, not raw counts;
    # ngram_range=(n, n): only the n-word skills in the vocabulary can match
    vectorizer = text_manip.CountVectorizer(vocabulary=skills, binary=True, ngram_range=(n, n))
    doc_ngram_occurrence = vectorizer.fit_transform(jd_docs)
    print('Done after %.1fs' % (time() - t0))
    # no. of distinct n-gram skills present in each JD
    n_ngram_by_jd = doc_ngram_occurrence.sum(axis=1).A1
    return pd.DataFrame({'job_id': posts['job_id'], 'n_{}gram'.format(n): n_ngram_by_jd})
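With a fixed `vocabulary`, `CountVectorizer` only ever scores the listed skills, and `binary=True` records presence/absence instead of raw counts; skills whose length falls outside `ngram_range` simply get zero. A toy check (hypothetical documents and skills, using `ngram_range=(1, 2)` so both skill lengths can match in one pass):
In [ ]:
toy_docs = ['we need java and machine learning', 'java java java']
toy_skills = ['java', 'machine learning']
v = text_manip.CountVectorizer(vocabulary=toy_skills, binary=True, ngram_range=(1, 2))
print(v.fit_transform(toy_docs).toarray())
# [[1 1]
#  [1 0]]  <- 'java' marked once in doc 2 despite three occurrences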
In [ ]:
unigram_by_jd = countOccur_ngram(n=1)
bigram_by_jd = countOccur_ngram(n=2)
trigram_by_jd = countOccur_ngram(n=3)
In [ ]:
occur_df = pd.merge(unigram_by_jd, bigram_by_jd, on='job_id')
occur_df = pd.merge(occur_df, trigram_by_jd, on='job_id')
In [ ]:
occur_df['n_skill'] = occur_df['n_1gram'] + occur_df['n_2gram'] + occur_df['n_3gram']
occur_df.head()
In [ ]:
n_skill_by_jd = pd.concat([quantile(occur_df['n_1gram']),
                           quantile(occur_df['n_2gram']),
                           quantile(occur_df['n_3gram'])])
n_skill_by_jd.index = ['n_unigram', 'n_bigram', 'n_trigram']
n_skill_by_jd
In [ ]:
n_skill_by_jd.to_csv(REPORT_DIR + 'skill_by_jd.csv')
In [ ]:
def plot2(w=7, h=10):
    fig = plt.figure(figsize=(w, h))
    for i in range(3):
        n = i + 1
        plt.subplot(3, 1, n)
        plt.subplots_adjust(hspace=.25)
        ngram_occur = occur_df['n_{}gram'.format(n)]
        m, bins, patches = plt.hist(x=ngram_occur, bins=np.unique(ngram_occur), align='left')
        plt.xlim(0, max(bins) + 1)
        if n == 3:
            plt.xticks(range(11))
        # plt.title('{}-gram'.format(n))
        plt.xlabel('# {}-gram skills'.format(n), fontsize='large')
        plt.ylabel('# JDs', fontsize='large')
    plt.show()
    return fig
In [ ]:
fig2 = plot2()
fig2.savefig(REPORT_DIR + 'n_skill_in_jd.pdf')
fig2.savefig(REPORT_DIR + 'n_skill_in_jd.jpeg')
plt.close(fig2)