In [1]:

    
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *

Build feature matrix

The feature matrix is a JD-skill matrix (JD = job description) in which each entry $e(d, s)$ is the number of times skill $s$ occurs in job description $d$.



In [ ]:

    
# Build the JD-skill matrix (entry (d, s) = occurrences of skill s in job description d).
doc_skill = buildDocSkillMat(jd_docs, skill_df, folder=SKILL_DIR)

# Persist the matrix in Matrix Market format.
# The handle is opened in binary mode: assuming `mmwrite` is scipy.io.mmwrite
# (pulled in by the star import -- TODO confirm), it writes bytes to a supplied
# file object, so a text-mode handle ('w') raises TypeError on Python 3.
with open(SKILL_DIR + 'doc_skill.mtx', 'wb') as f:
    mmwrite(f, doc_skill)

Get skills in each JD

Using the matrix, we can retrieve skills in each JD.



In [ ]:

    
# Look up, for every document in doc_index['doc'], the skills recorded in the
# JD-skill matrix built above.
extracted_skill_df = getSkills4Docs(
    docs=doc_index['doc'],
    doc_term=doc_skill,
    skills=skills,
)



In [ ]:

    
# Attach the extracted skills to the document index; rows are matched on the
# index of both frames (default inner join).
df = pd.merge(
    left=doc_index,
    right=extracted_skill_df,
    left_index=True,
    right_index=True,
)



In [ ]:

    
# Quick look at the merged frame: its (rows, columns) shape, then the first five rows.
print(df.shape)
df.head(n=5)



In [ ]:

    
# Sanity check (disabled): uncomment the next line to dump the first 3 rows to CSV for manual inspection.
# df.head(3).to_csv(LDA_DIR + 'tmp/skills_3_sample_docs.csv', index=False)



In [ ]:

    
# Save the merged frame so that skill extraction does not have to be re-run later.
df.to_csv(path_or_buf=SKILL_DIR + 'doc_index.csv')