In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_manip
import matplotlib.pyplot as plt
import gc
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from scipy.sparse import *
from my_util import *
In [2]:
import os

# Project root. Defaults to the original location; set the JOB_ANALYTICS_HOME
# environment variable to run the notebook from another machine or folder.
HOME_DIR = os.environ.get('JOB_ANALYTICS_HOME', 'd:/larc_projects/job_analytics/')
# os.path.join leaves the default value unchanged (it already ends with '/') and
# inserts a separator when an overriding root lacks the trailing slash.
DATA_DIR = os.path.join(HOME_DIR, 'data/clean/')
REPORT_DIR = os.path.join(HOME_DIR, 'reports/skill_cluster/')
# job descriptions (JDs)
init_posts = pd.read_csv(DATA_DIR + 'jd_df.csv')
In [20]:
init_skills = skill_df['skill']
jd_docs = list(init_posts['clean_text'].apply(str.lower))
n_skill, n_jd = len(init_skills) , init_posts.shape[0]
print('Initial no. of skills: %d' %n_skill)
print('Initial no. of JDs: %d' %n_jd) # some garbage JDs with no text already removed
In [8]:
skill_df = pd.read_csv(REPORT_DIR + 'skill_stats.csv')
skill_df.head(3)
Out[8]:
In [27]:
# Split the skill vocabulary by its word count (column `n_word`).
uni_gram_skills = list(skill_df.loc[skill_df['n_word'] == 1, 'skill'])
bi_gram_skills = list(skill_df.loc[skill_df['n_word'] == 2, 'skill'])
tri_gram_skills = list(skill_df.loc[skill_df['n_word'] == 3, 'skill'])

# One-row summary table of how many skills fall in each group.
n_gram_counts = {
    'n_unigram_skill': len(uni_gram_skills),
    'n_bigram_skill': len(bi_gram_skills),
    'n_trigram_skill': len(tri_gram_skills)
}
pd.DataFrame(n_gram_counts, index=[0])
Out[27]:
In [31]:
# Distinct skills whose `freq` is positive.
has_freq = skill_df['freq'] > 0
skills = skill_df.loc[has_freq, 'skill'].unique()
len(skills)
Out[31]:
In [ ]:
# Build a JD-by-term count matrix restricted to the fixed vocabulary `skills`,
# and report how long it took.
t0 = time()
print('Counting occurrence of uni-gram skills...')
# No ngram_range is passed here, so the vectorizer's default n-gram setting applies.
uni_gram_vectorizer = text_manip.CountVectorizer(vocabulary=skills)
doc_unigram_freq = uni_gram_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))
## For each doc, "its no. of unique uni-grams = no. of non-zero counts" in its row in doc-term mat
def n_non_zero(r, sp_mat):
    """Return the number of non-zero entries in row `r` of the sparse matrix `sp_mat`."""
    _, nz_cols = sp_mat.getrow(r).nonzero()
    return nz_cols.size
In [12]:
# Number of distinct vocabulary terms found in each JD.
# The column used to be produced by code that was commented out, so on a fresh
# kernel the quantile call below failed with a KeyError. It is now computed
# whenever it is missing and reused when it is already present.
if 'n_uniq_unigram' not in init_posts.columns:
    # binary=True marks presence/absence, so a row sum is the count of distinct terms.
    binary_vectorizer = text_manip.CountVectorizer(vocabulary=skills, binary=True)
    print('Marking unique unigram skills in JDs...')
    t0 = time()
    doc_unigram_occurrence = binary_vectorizer.fit_transform(jd_docs)
    print('Done after %.1fs' %(time() - t0))
    init_posts['n_uniq_unigram'] = doc_unigram_occurrence.sum(axis=1).A1
# NOTE(review): `quantile` is not defined in this notebook (presumably from my_util).
quantile(init_posts['n_uniq_unigram'])
Out[12]:
In [ ]:
# Histogram of the per-JD count of distinct uni-grams, with one bin edge per observed value.
# The data argument was the undefined name `n_uniq_unigram`; the values live in
# the `n_uniq_unigram` column of `init_posts`.
n_uniq_per_jd = init_posts['n_uniq_unigram']
plt.hist(n_uniq_per_jd, bins=np.unique(n_uniq_per_jd))
plt.xlabel('no. of unique unigrams in JD')
plt.ylabel('no. of JDs')
plt.show()
Here each skill can be a uni-, bi-, or tri-gram, i.e. it consists of at most 3 words (`n_word <= 3`).
In [ ]:
From now on, we work with the set of occurring skills.
In [ ]:
# NOTE(review): `max_n_word` and `occur_skills` were not defined in any visible
# cell, so this cell raised NameError on a fresh kernel. The values below are
# assumptions to confirm: skills are split into 1-, 2- and 3-word groups earlier
# in the notebook (hence 3), and `skills` already holds the skills with freq > 0.
max_n_word = 3
occur_skills = skills

# Count no. of unique skills in each JD by binary vectorizer
binary_vectorizer = text_manip.CountVectorizer(vocabulary=occur_skills, ngram_range=(1, max_n_word), binary=True)
t0 = time()
print('Marking occurrence of skills with length <= %d ...' %max_n_word)
doc_skill_occurrence = binary_vectorizer.fit_transform(jd_docs)
print('Done after %.1fs' %(time() - t0))
# binary=True marks presence/absence, so a row sum is the number of distinct skills in the JD.
init_posts['n_uniq_skill'] = doc_skill_occurrence.sum(axis=1).A1
# NOTE(review): `quantile` is not defined in this notebook (presumably from my_util).
quantile(init_posts['n_uniq_skill'])
There are two goals: i) to remove JDs with too few skills, and ii) to remove skills occurring in too few JDs. Thus, we repeat the following process until the two goals are satisfied.
In [ ]:
# Alternate the JD filter and the skill filter until one pass keeps at least
# `thres` of the JDs and at least `thres` of the skills.
# NOTE(review): extractJDs / extractSkills are not defined in this notebook
# (presumably from my_util); their exact filtering rules are not visible here.
thres = .98
posts = init_posts
n_post = posts.shape[0]
n_iter = 0
stop_cond = False
while not stop_cond:
    n_iter += 1
    print('Iteration %d' %n_iter)

    # Filter JDs with the current skill set.
    new_posts = extractJDs(posts, skills, min_n_skill=2)
    n_new_post = new_posts.shape[0]
    print('No. of posts after filtering: %d' %n_new_post)

    # Filter skills against the surviving JDs.
    skill_df = extractSkills(skills, new_posts, min_n_jd=2)
    new_skills = skill_df['skill']
    print('No. of skills after filtering: %d' %len(new_skills) )

    # Converged when neither set shrank below the threshold fraction.
    posts_stable = n_new_post >= thres*n_post
    stop_cond = posts_stable and (len(new_skills) >= thres*len(skills))

    # Carry the filtered sets into the next pass.
    posts, skills = new_posts, new_skills
    n_post = posts.shape[0]
# end
In [ ]:
# print min(posts['n_uniq_skill'])
# print min(skill_df['n_jd_with_skill'])
# Write the filtered JDs and the filtered skill table to CSV (without the index column).
for frame, rel_path in ((posts, 'filtered/posts.csv'), (skill_df, 'filtered/skills.csv')):
    frame.to_csv(DATA_DIR + rel_path, index=False)
In [ ]:
# Order JDs from most to fewest distinct skills and preview the top rows.
posts = posts.sort_values('n_uniq_skill', ascending=False)
posts.head()
In [ ]:
# Sanity check: pull up the skills occurring in the JD with the most skills
# post_with_most_skill = init_posts.query('job_id == {}'.format('JOB-2015-0196805') )
In [ ]:
train_idx, test_idx = mkPartition(n_instance, p=80)
X_train, X_test = doc_skill_tfidf[train_idx, :], doc_skill_tfidf[test_idx, :]
n_train, n_test = X_train.shape[0], X_test.shape[0]
print('Train set has %d JDs and test set has %d JDs' %(n_train, n_test))
In [ ]:
stats = pd.DataFrame({'n_train': n_train, 'n_test': n_test, 'n_jd (train & test)': n_post, 'n_skill': len(skills)}, index=[0])
stats.to_csv(RES_DIR + 'stats.csv', index=False)
In [ ]:
RES_DIR = REPORT_DIR + 'r6/'
n_top_words = 10
In [ ]:
# ks = range(5, 25, 5)
ks = range(4, 24, 2)
In [ ]:
tf_idf_vect = text_manip.TfidfVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
n_instance, n_feat = posts.shape[0], len(skills)
t0 =time()
print('Building tf_idf for %d JDs using %d features (skills)...' %(n_instance, n_feat))
doc_skill_tfidf = tf_idf_vect.fit_transform(posts['clean_text'])
print('Done after %.1fs' %(time()-t0))
In [ ]:
# Fit one NMF model per candidate topic count on the training matrix and
# collect each model's reconstruction error, in `ks` order.
rnmf = {}
for k in ks:
    rnmf[k] = NMF(n_components=k, random_state=0)

print('Fitting NMF using random initialization...')
print('No. of topics, Error, Running time')
rnmf_error = []
for k in ks:
    model = rnmf[k]
    t0 = time()
    model.fit(X_train)
    elapsed = time() - t0
    err = model.reconstruction_err_
    print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
    rnmf_error.append(err)
# end
In [ ]:
# Feature (skill) names in column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides and
# convert to a plain list so downstream code sees the same type as before.
_nmf_name_getter = getattr(tf_idf_vect, 'get_feature_names_out', None) or tf_idf_vect.get_feature_names
nmf_features = list(_nmf_name_getter())
pd.DataFrame(nmf_features).to_csv(RES_DIR + 'nmf_features.csv', index=False)

# For every fitted model, save its top words per topic and its per-topic word weights.
# NOTE(review): top_words_df and normalize are not defined in this notebook
# (presumably from my_util); their exact output is not visible here.
for k in ks:
    top_words = top_words_df(n_top_words, model=rnmf[k],feature_names=nmf_features)
    top_words.to_csv(RES_DIR + 'nmf_{}_topics.csv'.format(k), index=False)

    # each word dist is a component in NMF
    word_dist = pd.DataFrame(rnmf[k].components_).apply(normalize, axis=1)
    word_dist.to_csv(RES_DIR + 'nmf_word_dist_{}topics.csv'.format(k), index=False)
In [ ]:
# Compute the held-out error of the fitted NMF models.
# NOTE(review): cal_test_err is not defined in this notebook (presumably from
# my_util). Later cells index the result with np.argmin against `ks`, so it is
# expected to return one value per model in `ks` order -- confirm.
print('Calculating test errors of random NMF ...')
rnmf_test_error = cal_test_err(mf_models=rnmf)
In [ ]:
# Select the topic count whose NMF model has the smallest test error.
best_pos = np.argmin(rnmf_test_error)
best_k = ks[best_pos]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]
In [ ]:
# Plot the NMF train and test metrics, save the figure as PDF, then close it.
# NOTE(review): plotMetrics is not defined in this notebook (presumably from
# my_util); the result is used as a matplotlib figure (savefig, plt.close).
nmf_fig = plotMetrics(train_metric=rnmf_error, test_metric=rnmf_test_error, model_name='NMF')
nmf_fig.savefig(RES_DIR + 'nmf.pdf')
plt.close(nmf_fig)
In [ ]:
# Build the JD-by-skill count matrix used as LDA input (raw counts rather than
# the TF-IDF weights used for NMF), over the same vocabulary and n-gram range.
t0 = time()
print('Building count features for LDA from %d JDs and %d skills...' %(n_post, len(skills)))
count_vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
doc_skill_freq = count_vectorizer.fit_transform(posts['clean_text'])
print('Done after %.1fs' %(time() - t0))
# Display the matrix dimensions as the cell output.
doc_skill_freq.shape
In [ ]:
# Reuse the train/test row indices of the TF-IDF split for the count matrix.
lda_X_train = doc_skill_freq[train_idx, :]
lda_X_test = doc_skill_freq[test_idx, :]
In [ ]:
def _new_lda(k):
    """Create an online LDA model with `k` topics on any scikit-learn version.

    The topic-count keyword was renamed from `n_topics` to `n_components`
    (the old name was removed in scikit-learn 0.21). Try the current name
    first and fall back to the old one on releases that reject it.
    """
    common = dict(max_iter=5, learning_method='online', learning_offset=50.,
                  random_state=0)  # verbose=1
    try:
        return LatentDirichletAllocation(n_components=k, **common)
    except TypeError:
        return LatentDirichletAllocation(n_topics=k, **common)


# Fit one LDA model per candidate topic count and collect each model's score
# on the training matrix, in `ks` order.
lda_scores = []
lda = {k: _new_lda(k) for k in ks}
print("Fitting LDA ...")
print('No. of topics, Log-likelihood, Running time')
for k in ks:
    t0 = time()
    lda[k].fit(lda_X_train)
    s = lda[k].score(lda_X_train)
    # The reported time covers both fitting and scoring.
    print('%d, %0.1f, %0.1fs' %(k, s, time() - t0))
    lda_scores.append(s)
# end
In [ ]:
# Perplexity of every LDA model on the test matrix, in `ks` order, saved as CSV.
perp = []
for k in ks:
    perp.append(lda[k].perplexity(lda_X_test))
perp_cols = {'No. of topics': ks, 'Perplexity': perp}
perp_df = pd.DataFrame(perp_cols)
perp_df.to_csv(RES_DIR + 'perplexity.csv', index=False)
In [ ]:
# Select the topic count with the lowest test perplexity, then show the table.
lowest_pos = np.argmin(perp)
lda_best_k = ks[lowest_pos]
print('Best no. of topics for LDA: %d' %lda_best_k)
perp_df
In [ ]:
# Feature (skill) names in column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides and
# convert to a plain list so downstream code sees the same type as before.
_lda_name_getter = getattr(count_vectorizer, 'get_feature_names_out', None) or count_vectorizer.get_feature_names
lda_feats = list(_lda_name_getter())
# index=False added for consistency with every other CSV written by this
# notebook (e.g. nmf_features.csv); previously a stray index column was written.
pd.DataFrame(lda_feats).to_csv(RES_DIR + 'lda_feats.csv', index=False)

# For every fitted model, save its per-topic word weights and its top words per topic.
# NOTE(review): top_words_df and normalize are not defined in this notebook
# (presumably from my_util); their exact output is not visible here.
for k in ks:
    word_dist = pd.DataFrame(lda[k].components_).apply(normalize, axis=1)
    word_dist.to_csv(RES_DIR + 'lda_word_dist_{}topics.csv'.format(k), index=False)

    lda_topics = top_words_df(n_top_words, model=lda[k], feature_names=lda_feats)
    lda_topics.to_csv(RES_DIR + 'lda_{}topics.csv'.format(k), index=False)
In [ ]:
# Collect the train-side and test-side metrics of both models into two frames,
# one row per candidate topic count.
model_list = ['LDA', 'randomNMF']

# LDA training scores are rescaled by 10**6 for plotting.
scaled_lda_scores = np.divide(lda_scores, 10**6)
train_cols = {'No. of topics': ks, 'LDA': scaled_lda_scores, 'randomNMF': rnmf_error}
train_metric = pd.DataFrame(train_cols)

test_cols = {'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error}
test_metric = pd.DataFrame(test_cols)
In [ ]:
def _finish_panel(y_label):
    """Apply the axis labels, grid and x ticks shared by every panel to the current axes."""
    plt.xlabel('No. of topics')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.xticks(ks)


# 2x2 grid: one column per model, train metric on the top row and test metric below.
fig = plt.figure(figsize=(10, 6))
for i, model in enumerate(model_list):
    is_lda = (model == 'LDA')

    # train metric
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    train_label = r'Log likelihood ($\times 10^6$)' if is_lda else r'$\| X_{train} - W_{train} H \|_2$'
    _finish_panel(train_label)

    # test metric
    plt.subplot(2, 2, i+3)
    plt.title(model)
    plt.plot(ks, test_metric[model], 'r')
    test_label = r'Perplexity' if is_lda else r'$\| X_{test} - W_{test} H \|_2$'
    _finish_panel(test_label)
# end

plt.show()
fig.savefig(RES_DIR + 'lda_vs_nmf.pdf')
plt.close(fig)