Steps of skill clustering:
We can try matrix factorization (MF) on either the count matrix or the tf-idf matrix. However, when building these matrices we need to take care of the "duplication" problem.
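The duplication issue is not detailed here; one plausible reading is that a skill mentioned several times in the same post should only be counted once. A toy, hypothetical sketch of that deduplication step (the doc_skills list below is made up purely for illustration):

# Hypothetical sketch: count each skill at most once per document
doc_skills = [['python', 'sql', 'python'], ['java', 'sql']]   # toy input
deduped = [sorted(set(s)) for s in doc_skills]                # [['python', 'sql'], ['java', 'sql']]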
In [1]:
# Explicit imports for the names used below (some may already come via the star import)
from time import time
import pandas as pd
from scipy.io import mmread
from sklearn.decomposition import NMF
import my_util as my_util
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *
import random as rd
In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
SKILL_DAT = HOME_DIR + 'data/clean/skill_cluster/'
SKILL_RES = HOME_DIR + 'results/' + 'skill_cluster/new/'
First, we try it on the count matrix, since that matrix is already available.
In [3]:
# Load the skill index and the document-by-skill count matrix (Matrix Market format)
skill_df = pd.read_csv(SKILL_DAT + 'skill_index.csv')
skills = skill_df['skill']
doc_skill = mmread(SKILL_DAT + 'doc_skill.mtx')
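If we later want the tf-idf variant mentioned above, it can be derived from this count matrix; a minimal sketch using scikit-learn's TfidfTransformer (this transformer is my assumption, not a helper from the notebook):

from sklearn.feature_extraction.text import TfidfTransformer

# Sketch: derive a tf-idf matrix from the count matrix for a later MF run
doc_skill_tfidf = TfidfTransformer().fit_transform(doc_skill)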
There are various ways to initialize NMF, including random initialization and SVD-based initialization. We first try NMF with random initialization, denoted rnmf.
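For reference, the two initializations are exposed through the init parameter of scikit-learn's NMF; a minimal sketch (k = 20 is just an example):

nmf_random = NMF(n_components=20, init='random', random_state=0)  # random initialization
nmf_nndsvd = NMF(n_components=20, init='nndsvd', random_state=0)  # SVD-based initialization (NNDSVD)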
In [4]:
# Numbers of factors (clusters) to try: 10, 20, 30, 40, 50
ks = range(10, 60, 10)
In [5]:
rnmf = {k: NMF(n_components=k, init='random', random_state=0) for k in ks}
print( "Fitting NMF using random initialization..." )
print('No. of factors, Error, Running time')
rnmf_error = []
for k in ks:
t0 = time()
rnmf[k].fit(doc_skill)
elapsed = time() - t0
err = rnmf[k].reconstruction_err_
print('%d, %0.1f, %0.1fs' %(k, err, elapsed))
rnmf_error.append(err)
# end
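To compare the runs, we can plot the reconstruction error against the number of factors; a minimal sketch with matplotlib (the plot and the output file name are my additions, not part of the original notebook):

import matplotlib.pyplot as plt

# Sketch: reconstruction error vs. number of factors, to eyeball a reasonable k
plt.plot(list(ks), rnmf_error, marker='o')
plt.xlabel('No. of factors (k)')
plt.ylabel('Reconstruction error')
plt.savefig(SKILL_RES + 'nmf/rnmf_error_vs_k.pdf')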
In [6]:
# Save learned factor-skill matrices
nmf_dir = SKILL_RES + 'nmf/'
for k in ks:
    fname = '{}factor_skill.csv'.format(k)
    pd.DataFrame(rnmf[k].components_).to_csv(nmf_dir + fname, index=False)
    print('saved {}factor-skill matrix'.format(k))
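To interpret a factor as a skill cluster, we can list its highest-weight skills from components_; a minimal sketch (the choices k = 20 and top_n = 10 are examples, not fixed by the notebook):

# Sketch: print the top-weighted skills of each factor for one chosen k
k = 20
top_n = 10
for f, row in enumerate(rnmf[k].components_):
    top_idx = row.argsort()[::-1][:top_n]
    print('Factor %d: %s' % (f, ', '.join(skills[i] for i in top_idx)))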