In [1]:
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *

Build feature matrix

The feature matrix is a JD-skill matrix (JD = job description) in which each entry $e(d, s)$ is the number of times skill $s$ occurs in job description $d$.


In [ ]:
# Build the JD-skill count matrix from the job descriptions and the skill list.
doc_skill = buildDocSkillMat(jd_docs, skill_df, folder=SKILL_DIR)

# Persist the matrix in Matrix Market format. The target path is handed to
# mmwrite directly instead of a handle opened in text mode ('w'): assuming
# mmwrite is scipy.io.mmwrite (TODO confirm -- it comes from the star import),
# it emits bytes, so a text-mode handle raises TypeError on Python 3. Given a
# path, mmwrite opens and closes the file itself.
mmwrite(SKILL_DIR + 'doc_skill.mtx', doc_skill)

Get skills in each JD

Using the JD-skill matrix, we can retrieve the skills that occur in each JD.


In [ ]:
# Look up, for every document, the skills recorded in the JD-skill matrix.
extracted_skill_df = getSkills4Docs(
    docs=doc_index['doc'],
    doc_term=doc_skill,
    skills=skills,
)

In [ ]:
# Attach the extracted skills to the document index, aligning rows on the
# index of both frames (default inner join).
df = doc_index.merge(
    extracted_skill_df,
    left_index=True,
    right_index=True,
)

In [ ]:
# Report the dimensions of the merged frame, then preview its first rows.
merged_shape = df.shape
print(merged_shape)
df.head(5)

In [ ]:
# Sanity check (disabled): uncomment the line below to dump the first 3 rows of
# the merged frame to CSV for manual inspection.
# df.head(3).to_csv(LDA_DIR + 'tmp/skills_3_sample_docs.csv', index=False)

In [ ]:
# Save the merged frame so that later runs do not need to extract the skills again.
df.to_csv(path_or_buf=SKILL_DIR + 'doc_index.csv')