In [1]:
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *
In [ ]:
# Build the doc-skill matrix (presumably documents x skills -- confirm against
# buildDocSkillMat in cluster_skill_helpers) and persist it next to the other
# skill artifacts.
doc_skill = buildDocSkillMat(jd_docs, skill_df, folder=SKILL_DIR)

# Pass the target path instead of a handle opened with 'w'. Assuming mmwrite is
# scipy.io.mmwrite (pulled in by the star import), it opens the file itself in
# the mode it needs; a text-mode handle can fail on Python 3 with SciPy
# releases that write bytes to the stream.
mmwrite(SKILL_DIR + 'doc_skill.mtx', doc_skill)
In [ ]:
# Use the doc-skill matrix to obtain the skills for each entry of
# doc_index['doc'] (presumably one result row per document -- confirm against
# getSkills4Docs in cluster_skill_helpers).
extracted_skill_df = getSkills4Docs(
    docs=doc_index['doc'],
    doc_term=doc_skill,
    skills=skills,
)
In [ ]:
# Attach the extracted skills to the document index by aligning the two frames
# on their row index; only index labels present in both frames are kept.
df = pd.merge(
    doc_index,
    extracted_skill_df,
    how='inner',
    left_index=True,
    right_index=True,
)
In [ ]:
# Quick look at the merged frame: its (rows, columns) shape, then the first
# five rows as the cell's displayed value.
print(df.shape)
df.head(n=5)
In [ ]:
# Sanity check: uncomment the line below to dump the first 3 merged rows to a CSV for manual inspection.
# df.head(3).to_csv(LDA_DIR + 'tmp/skills_3_sample_docs.csv', index=False)
In [ ]:
# Save the merged frame so later runs do not need to extract skills again.
# The row index is written out as well (index=True is the pandas default).
df.to_csv(SKILL_DIR + 'doc_index.csv', index=True)