In [1]:
import ja_helpers; from ja_helpers import *  # the star import presumably provides pd, np, plt, mmread/mmwrite, time, NMF, etc. used below
In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'; DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/skill_cluster/new/'
In [3]:
skill_df = pd.read_csv(DATA_DIR + 'skill_index.csv')
doc_skill = mmread(DATA_DIR + 'doc_skill.mtx')
In [4]:
skills = skill_df['skill']
print('# skills from the skill index: %d' % len(skills))
n_doc, n_skill = doc_skill.shape
print('# skills in matrix doc-skill: %d' % n_skill)
print('# documents in matrix doc-skill: %d' % n_doc)
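A quick sanity check, not in the original notebook, that the two sources agree on the number of skills:
In [ ]:
# Optional sanity check (added): the skill index should match the matrix columns
assert len(skills) == n_skill, 'skill index and doc-skill matrix disagree'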
In [ ]:
## Needed by the cluster-assignment cells below, which use doc_index
doc_index = pd.read_csv(DATA_DIR + 'doc_index.csv')
jd_docs = doc_index['doc']; print('# JDs: %d' % len(jd_docs))
In [7]:
ks = range(15, 35, 5)  # i.e. [15, 20, 25, 30]; set ks = [15] for a single run
n_top_words = 10
In [8]:
print('# docs: {}, # skills: {}'.format(n_doc, n_skill))
in_train, in_test = mkPartition(n_doc, p=80)
doc_skill = doc_skill.tocsr()
lda_X_train, lda_X_test = doc_skill[in_train, :], doc_skill[in_test, :]
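`mkPartition` comes from `ja_helpers`; a minimal sketch under the assumption that `p` is the training percentage and the function returns row indices for the two splits:
In [ ]:
# Hypothetical sketch of mkPartition (the real implementation is in ja_helpers)
import numpy as np

def mkPartition_sketch(n, p=80, seed=0):
    idx = np.random.RandomState(seed).permutation(n)  # shuffle row indices
    n_train = n * p // 100                            # p% of rows go to training
    return idx[:n_train], idx[n_train:]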
In [9]:
beta = 0.1  # topic-word Dirichlet prior; alternatively 200/W, with W the vocabulary size
lda = trainLDA(beta, ks, trainning_set=lda_X_train)  # note: the helper's keyword is spelled 'trainning_set'
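`trainLDA` lives in `ja_helpers`; a minimal sketch of what it presumably does, assuming one scikit-learn LDA model is fitted per value of k with `beta` as the topic-word prior:
In [ ]:
# Hypothetical sketch of trainLDA (the real implementation is in ja_helpers)
from sklearn.decomposition import LatentDirichletAllocation

def trainLDA_sketch(beta, ks, trainning_set):  # keyword spelling follows the helper
    models = {}
    for k in ks:
        lda_k = LatentDirichletAllocation(n_components=k, topic_word_prior=beta,
                                          learning_method='batch', random_state=0)
        models[k] = lda_k.fit(trainning_set)
    return models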
In [12]:
LDA_DIR = RES_DIR + 'lda/'
In [16]:
for k in ks:
    doc_topic_distr = lda[k].transform(doc_skill)
    fname = RES_DIR + 'doc_{}topic_distr.mtx'.format(k)
    mmwrite(fname, doc_topic_distr)  # mmwrite accepts a filename directly
In [10]:
perp_df = testLDA(lda, ks, test_set=lda_X_test)
perp_df
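`testLDA` presumably evaluates each fitted model on the held-out split; a minimal sketch using scikit-learn's built-in perplexity (lower is better):
In [ ]:
# Hypothetical sketch of testLDA (the real implementation is in ja_helpers)
def testLDA_sketch(lda, ks, test_set):
    rows = [{'No. of topics': k, 'Perplexity': lda[k].perplexity(test_set)} for k in ks]
    return pd.DataFrame(rows)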
In [ ]:
perp_df.to_csv(LDA_DIR + 'perplexity.csv', index=False)
In [17]:
for k in ks:
    # word_dist = pd.DataFrame(lda[k].components_).apply(normalize, axis=1)
    # word_dist.to_csv(LDA_DIR + 'lda_word_dist_{}topics.csv'.format(k), index=False)
    lda_topics = top_words_df(n_top_words=n_top_words, model=lda[k], feature_names=skills)
    lda_topics.to_csv(LDA_DIR + '{}topics.csv'.format(k), index=False)
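`top_words_df` presumably ranks the vocabulary by topic weight; a minimal sketch, assuming one column per topic:
In [ ]:
# Hypothetical sketch of top_words_df (the real implementation is in ja_helpers)
def top_words_df_sketch(n_top_words, model, feature_names):
    feature_names = np.asarray(feature_names)
    cols = {}
    for t, dist in enumerate(model.components_):
        cols['Topic %d' % t] = feature_names[np.argsort(dist)[::-1][:n_top_words]]
    return pd.DataFrame(cols)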
In [ ]:
for k in ks:
    topic_word_dist = lda[k].components_
    fname = LDA_DIR + 'word_dist_{}_topics.mtx'.format(k)
    mmwrite(fname, topic_word_dist)
In [ ]:
clusters = pd.read_csv(LDA_DIR + 'cluster.csv')['cluster']
n_cluster = len(clusters)
In [ ]:
doc_index.to_csv(DATA_DIR + 'doc_index.csv', index=False)
In [ ]:
doc_topic_distr = lda[15].transform(doc_skill)  # doc-topic mixtures from the k = 15 model
mmwrite(LDA_DIR + 'doc_topic_distr.mtx', doc_topic_distr)
In [ ]:
thres = 0.4  # or 0.5
t0 = time()
# doc_index['top_clusters'] = doc_index.apply(getTopTopics_GT, axis=1, doc_topic_distr=doc_topic_distr, thres=0.5)
# doc_index['n_top_cluster_40'] = doc_index.apply(getTopTopics_GT, axis=1, doc_topic_distr=doc_topic_distr, thres=thres)
doc_index['prob_top_cluster'] = doc_index.apply(getTopTopicProb, axis=1, doc_topic_distr=doc_topic_distr)
print('Done after %.1fs' % (time() - t0))
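A minimal sketch of the two helpers used above, assuming each row of doc_index sits at the same position as its document in doc_topic_distr, and that n_top_cluster counts the top clusters needed to cover the probability threshold:
In [ ]:
# Hypothetical sketches (the real implementations are in ja_helpers)
def getTopTopicProb_sketch(row, doc_topic_distr):
    return doc_topic_distr[row.name, :].max()  # probability of the single best cluster

def getTopTopics_GT_sketch(row, doc_topic_distr, thres=0.4):
    distr = np.sort(doc_topic_distr[row.name, :])[::-1]       # probabilities, descending
    return int(np.searchsorted(np.cumsum(distr), thres) + 1)  # clusters needed to reach thres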
In [ ]:
res = doc_index.query('n_skill >= 2').sort_values('n_skill', ascending=False)
print('No. of JDs in result: %d' % res.shape[0])
res.head()
In [ ]:
n_sample = 100
res.head(n_sample).to_csv(LDA_DIR + 'new/cluster_100top_docs.csv', index=False)
res.tail(n_sample).to_csv(LDA_DIR + 'new/cluster_100bottom_docs.csv', index=False)
# res.to_csv(LDA_DIR + 'new/cluster_assign2.csv', index=False)
In [ ]:
res.rename(columns={'n_top_cluster_40': 'n_top_cluster'}, inplace=True)
We want to see when the cluster assignment to a job post is clear or fuzzy. An assignment is clear when the list of top clusters assigned to the post contains at most 3 clusters, and fuzzy when it contains more than 3.
In [ ]:
clear_assign = res.query('n_top_cluster <= 3'); fuzzy_assign = res.query('n_top_cluster > 3')
print('# posts with clear assignment: %d' %clear_assign.shape[0])
print('Distribution of skills in these posts:')
quantile(clear_assign['n_skill'])
These posts contain many skills: only 25% of them contain at most 31 skills, so each post in the remaining 75% contains at least 31 skills. We can contrast this quartile with the skill distribution over all job posts below.
In [ ]:
print('Distribution of skills in all posts:')
quantile(res['n_skill'])
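`quantile` presumably prints a five-number summary of the series; a minimal sketch:
In [ ]:
# Hypothetical sketch of quantile (the real implementation is in ja_helpers)
def quantile_sketch(s):
    return s.describe()[['min', '25%', '50%', '75%', 'max']]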
In [ ]:
fig = plotSkillDist(res)
plt.savefig(LDA_DIR + 'fig/n_skill_hist.jpg')
plt.show(); plt.close()
In [ ]:
res = pd.read_csv(LDA_DIR + 'new/cluster_assign.csv')
In [ ]:
res.describe().round(2)
In [ ]:
g1 = res.query('n_skill < 7'); g2 = res.query('n_skill >= 7 & n_skill < 12')
g3 = res.query('n_skill >= 12 & n_skill < 18'); g4 = res.query('n_skill >= 18')
print('# posts in the 4 groups: %d, %d, %d, %d' % (g1.shape[0], g2.shape[0], g3.shape[0], g4.shape[0]))
In [ ]:
bp = mixtureSizePlot(g1, g2, g3, g4)
plt.savefig(LDA_DIR + 'fig/boxplot_mixture_size.pdf'); plt.show(); plt.close()
The box plot compares the distribution of mixture sizes across the four groups.
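A minimal sketch of mixtureSizePlot, assuming it draws side-by-side box plots of n_top_cluster for the four skill-count groups:
In [ ]:
# Hypothetical sketch of mixtureSizePlot (the real implementation is in ja_helpers)
def mixtureSizePlot_sketch(g1, g2, g3, g4):
    data = [g['n_top_cluster'] for g in (g1, g2, g3, g4)]
    return plt.boxplot(data, labels=['<7', '7-11', '12-17', '>=18'])  # labels: skill-count bins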
In [ ]:
thres = 0.4
fig = errorBarPlot(res, thres=thres)
plt.savefig(LDA_DIR + 'fig/mixture_size_thres{}.jpg'.format(int(thres*100)))
plt.show(); plt.close()
In [ ]:
fig = topClusterProbPlot(g1, g2, g3, g4)
plt.savefig(LDA_DIR + 'fig/top_cluster_prob.jpg')
plt.show(); plt.close()
In [ ]:
NMF_DIR = RES_DIR + 'new/nmf/'
In [ ]:
## TODO
tf_idf_vect = text_manip.TfidfVectorizer(vocabulary=skills, ngram_range=(1, max_n_word))
n_instance, n_feat = posts.shape[0], len(skills)
t0 = time()
print('Building tf-idf for %d JDs using %d features (skills)...' % (n_instance, n_feat))
doc_skill_tfidf = tf_idf_vect.fit_transform(posts['clean_text'])
print('Done after %.1fs' % (time() - t0))
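`text_manip` and `max_n_word` are not defined in this section; presumably something like the following, where the longest skill (in words) bounds the vectorizer's n-gram range:
In [ ]:
# Assumed definitions (defined elsewhere in the original notebook)
import sklearn.feature_extraction.text as text_manip
max_n_word = max(len(s.split()) for s in skills)  # longest skill, in words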
In [ ]:
rnmf = {k: NMF(n_components=k, random_state=0) for k in ks}
print('Fitting NMF using random initialization...')
print('No. of topics, Error, Running time')
rnmf_error = []
for k in ks:
    t0 = time()
    rnmf[k].fit(X_train)  # X_train: the training split of the tf-idf matrix
    elapsed = time() - t0
    err = rnmf[k].reconstruction_err_
    print('%d, %0.1f, %0.1fs' % (k, err, elapsed))
    rnmf_error.append(err)
# end
In [ ]:
nmf_features = tf_idf_vect.get_feature_names()
pd.DataFrame(nmf_features).to_csv(RES_DIR + 'nmf_features.csv', index=False)
for k in ks:
    top_words = top_words_df(n_top_words, model=rnmf[k], feature_names=nmf_features)
    top_words.to_csv(RES_DIR + 'nmf_{}_topics.csv'.format(k), index=False)
    # each word distribution is a component of the NMF model
    word_dist = pd.DataFrame(rnmf[k].components_).apply(normalize, axis=1)
    word_dist.to_csv(RES_DIR + 'nmf_word_dist_{}topics.csv'.format(k), index=False)
In [ ]:
print('Calculating test errors of random NMF ...')
rnmf_test_error = cal_test_err(mf_models=rnmf)
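`cal_test_err` presumably projects the held-out matrix onto each model's topics and measures the reconstruction error; a minimal sketch, with the test split passed explicitly (the real helper seems to pick it up from the enclosing scope):
In [ ]:
# Hypothetical sketch of cal_test_err (the real implementation is in ja_helpers)
from numpy.linalg import norm

def cal_test_err_sketch(mf_models, X_test):
    errs = []
    for k in sorted(mf_models):
        W_test = mf_models[k].transform(X_test)     # doc-topic weights for held-out docs
        H = mf_models[k].components_                # topic-word matrix
        errs.append(norm(np.asarray(X_test - W_test.dot(H))))  # Frobenius norm of residual
    return errs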
In [ ]:
best_k = ks[np.argmin(rnmf_test_error)]
print('The best no. of topics is %d' %best_k)
rnmf_best = rnmf[best_k]
In [ ]:
nmf_fig = plotMetrics(train_metric=rnmf_error, test_metric=rnmf_test_error, model_name='NMF')
nmf_fig.savefig(RES_DIR + 'nmf.pdf')
plt.close(nmf_fig)
In [ ]:
# Put all model metrics on training & test datasets into 2 data frames
# (lda_scores and perp are assumed to hold the per-k training log likelihoods
# and the per-k test perplexities from perp_df, computed earlier)
model_list = ['LDA', 'randomNMF']
train_metric = pd.DataFrame({'No. of topics': ks, 'LDA': np.divide(lda_scores, 10**6), 'randomNMF': rnmf_error})
test_metric = pd.DataFrame({'No. of topics': ks, 'LDA': perp, 'randomNMF': rnmf_test_error})
In [ ]:
fig = plt.figure(figsize=(10, 6))
for i, model in enumerate(model_list):
    plt.subplot(2, 2, i+1)
    plt.subplots_adjust(wspace=.5, hspace=.5)
    # train metric (top row)
    plt.title(model)
    plt.plot(ks, train_metric[model], '--')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Log likelihood ($\times 10^6$)')
    else:
        plt.ylabel(r'$\| X_{train} - W_{train} H \|_2$')
    plt.grid(True)
    plt.xticks(ks)
    # test metric (bottom row)
    plt.subplot(2, 2, i+3)
    plt.title(model)
    plt.plot(ks, test_metric[model], 'r')
    plt.xlabel('No. of topics')
    if model == 'LDA':
        plt.ylabel(r'Perplexity')
    else:
        plt.ylabel(r'$\| X_{test} - W_{test} H \|_2$')
    plt.grid(True)
    plt.xticks(ks)
# end
plt.show()
fig.savefig(RES_DIR + 'lda_vs_nmf.pdf')
plt.close(fig)