In [1]:
import ja_helpers  # keep the module handle so it can be reloaded later
from ja_helpers import *

Load data


In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'

In [5]:
skill_df = pd.read_csv(DATA_DIR + 'skill_index.csv')  # skill index (vocabulary of skills to count)

Build feature matrix

The matrix is a JD-Skill matrix where each entry $e(d, s)$ is the number of times skill $s$ occurs in job description $d$.

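For intuition, here is a minimal sketch of such a count matrix built with scikit-learn's CountVectorizer restricted to a toy skill vocabulary. The sample documents and skills below are made up for illustration; the actual matrix is built by buildDocSkillMat, which counts 3-gram, 2-gram and 1-gram skills in separate passes (see its log output further down).


In [ ]:
# Toy sketch of the JD-skill count matrix e(d, s); not the project's actual pipeline.
from sklearn.feature_extraction.text import CountVectorizer

sample_jds = ['python developer with sql and machine learning experience',
              'sql analyst with a machine learning background']
sample_skills = ['python', 'sql', 'machine learning']  # hypothetical skill vocabulary

vec = CountVectorizer(vocabulary=sample_skills, ngram_range=(1, 2))
sample_doc_skill = vec.fit_transform(sample_jds)  # sparse JD x skill matrix of counts
print(sample_doc_skill.toarray())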

In [ ]:
# Build the JD-skill count matrix; intermediate matrices are saved under DATA_DIR
doc_skill = buildDocSkillMat(jd_docs, skill_df, folder=DATA_DIR)

# Save the sparse matrix in Matrix Market format
with open(DATA_DIR + 'doc_skill.mtx', 'w') as f:
    mmwrite(f, doc_skill)

Get skills in each JD

Using the matrix, we can retrieve skills in each JD.

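As a rough sketch of what this retrieval looks like, the nonzero entries of each row of the sparse matrix give that JD's skills and their counts. The helper below is illustrative only and assumes rows follow doc_index order and columns follow skill_df order; getSkills4Docs is the project's actual implementation.


In [ ]:
# Illustrative only: read skills off one row of the JD-skill matrix.
import scipy.sparse as sp

def skills_in_row(doc_skill, skill_names, row):
    """Return (skill, count) pairs for one JD, i.e. one row of the matrix."""
    row_vec = sp.csr_matrix(doc_skill)[row]
    return [(skill_names[j], int(row_vec[0, j])) for j in row_vec.nonzero()[1]]

# e.g. skills_in_row(doc_skill, skill_names, row=0), where skill_names lists skills in column order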

In [ ]:
extracted_skill_df = getSkills4Docs(docs=doc_index['doc'], doc_term=doc_skill, skills=skills)

In [ ]:
# Attach the extracted skills to the document index (merge on the shared index)
df = pd.merge(doc_index, extracted_skill_df, left_index=True, right_index=True)

In [ ]:
print(df.shape)
df.head()

In [ ]:
df.to_csv(DATA_DIR + 'doc_index.csv')  # save so the skills need not be extracted again later
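
In later sessions, the saved file can simply be read back instead of re-running the extraction; a minimal sketch (assuming the default index written by to_csv):


In [ ]:
# Reuse previously extracted skills instead of extracting again
df = pd.read_csv(DATA_DIR + 'doc_index.csv', index_col=0)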

Extract features of new documents


In [9]:
reload(ja_helpers)  # pick up any changes made to ja_helpers
from ja_helpers import *

In [8]:
# load the SF framework documents as docs
pst_docs = pd.read_csv(DATA_DIR + 'SF/pst.csv')
pst_docs


Out[8]:
        title                                                doc
0  senior_pst  Developing the Child Holistically: Child Devel...
1         pst  Developing the Child Holistically: Child Devel...

In [10]:
pst_skill = buildDocSkillMat(pst_docs, skill_df, folder=None)  # folder=None: skip saving intermediate matrices


No folder passed, will not save intermediate matrices.
Counting occurrence of 3-gram skills...
Done after 0.0s
Removing tri-grams from JDs to avoid duplications...
Done
Counting occurrence of 2-gram skills...
Done after 0.0s
Removing bi-grams from JDs...
Done
Counting occurrence of 1-gram skills...
Done after 0.0s

In [11]:
# Save the SF doc-skill matrix in Matrix Market format
with open(DATA_DIR + 'pst_skill.mtx', 'w') as f:
    mmwrite(f, pst_skill)
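
Both saved matrices can be read back in later sessions with scipy's mmread; a short sketch:


In [ ]:
# Reload the saved sparse matrices (mmread returns COO; convert to CSR for row access)
from scipy.io import mmread

doc_skill = mmread(DATA_DIR + 'doc_skill.mtx').tocsr()
pst_skill = mmread(DATA_DIR + 'pst_skill.mtx').tocsr()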