Building JobTitle-Skill matrix

Running LDA on the document-skill matrix, where each document is a single job post, still does not give good results. What is the problem here?

It seems that the job-post level contains too much noise:

  • other information unrelated to skills, e.g. salary, location, working time, required experience.

Thus, we now try putting all posts of the same job title together, so that the aggregated skill signal can outweigh the noise.


In [ ]:
import cluster_skill_helpers as cluster_skill_helpers

from cluster_skill_helpers import *

In [ ]:
# Directory layout for the project: cleaned data, skill-cluster inputs,
# and the report output location, all under the project home.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
SKILL_DIR = DATA_DIR + 'skill_cluster/'
RES_DIR = HOME_DIR + 'results/reports/skill_cluster/'

In [ ]:
# Load the cleaned job postings (presumably one row per post, with at
# least 'title', 'job_id' and 'doc' columns -- confirm against jobs.csv).
jobs = pd.read_csv(DATA_DIR + 'jobs.csv')

In [ ]:
# Load the skill vocabulary; the 'skill' column is used later to map
# matrix column indices back to skill names.
skill_df = pd.read_csv(SKILL_DIR + 'skill_df.csv')

Collapse all posts of the same job title into a single document


In [ ]:
# Group posts by job title and collapse each group into one document:
# job ids joined with ',' and post texts joined with the literal marker
# 'next_doc' (kept as-is; downstream code appears to rely on it).
by_job_title = jobs.groupby('title')
job_title_df = by_job_title.agg({'job_id': lambda x: ','.join(x),
                                 'doc': lambda x: 'next_doc'.join(x)})

# Prefix the aggregated columns (-> agg_job_id, agg_doc) and turn the
# 'title' index back into a regular column.
# FIX: the original called the non-existent method `.job_title_dfet_index()`
# (a mangled paste of `.reset_index()`), which raised AttributeError.
job_title_df = job_title_df.add_prefix('agg_').reset_index()
job_title_df.head()

In [ ]:
# Number of distinct job titles produced by the grouping above.
n_job_title = by_job_title.ngroups
print('# job titles: %d' %n_job_title)

In [ ]:
# Re-import the helper module to pick up code edits made while the
# notebook is running. NOTE(review): bare `reload` is the Python 2
# builtin; on Python 3 this needs `importlib.reload` -- confirm the
# interpreter version.
reload(cluster_skill_helpers)
from cluster_skill_helpers import *

In [ ]:
# One aggregated document per job title (all posts of that title joined).
jd_docs = job_title_df['agg_doc']

In [ ]:
# NOTE: this version of the skill list still contains stopwords.
# Build the (document x skill) occurrence matrix; buildDocSkillMat comes
# from cluster_skill_helpers (implementation not visible here).
doc_skill = buildDocSkillMat(jd_docs, skill_df)
  • Concatenate the matrices doc_unigram, doc_bigram and doc_trigram to obtain occurrences of all skills:

In [ ]:
from scipy.sparse import hstack
# Stack the unigram/bigram/trigram skill-occurrence matrices side by side
# so the columns cover all skills. NOTE(review): doc_unigram, doc_bigram
# and doc_trigram are not defined in this notebook chunk -- presumably
# produced by buildDocSkillMat; confirm before running.
jobtitle_skill = hstack([doc_unigram, doc_bigram, doc_trigram])

In [ ]:
# Persist the sparse job-title x skill matrix in MatrixMarket format.
# FIX: pass the path straight to mmwrite instead of a text-mode file
# handle -- mmwrite writes bytes, so an explicit open(..., 'w') breaks
# under Python 3; letting scipy open/close the file is portable.
mmwrite(SKILL_DIR + 'jobtitle_skill.mtx', jobtitle_skill)

In [ ]:
# Sanity-check the matrix dimensions: (n_job_title, n_skills).
jobtitle_skill.shape

In [ ]:
# Densify for np.apply_along_axis below; fine for a modest number of
# titles but memory-hungry if the matrix is large.
jobtitle_skill = jobtitle_skill.toarray()

In [ ]:
# Peek at one aggregated row to verify the title / agg_ columns.
job_title_df.head(1)

In [ ]:
# For every job title (row), find the column index of its single most
# frequent skill.
idx_of_top_skill = np.apply_along_axis(np.argmax, 1, jobtitle_skill)

skills = skill_df['skill']
# FIX: `job_titles` was referenced here before being assigned (it was
# only defined in a later cell); derive it from job_title_df up front.
job_titles = job_title_df['title']
top_skill_by_job_title = pd.DataFrame({'job_title': job_titles,
                                       'idx_of_top_skill': idx_of_top_skill})
# Map each column index back to the skill name at that position.
top_skill_by_job_title['top_skill'] = top_skill_by_job_title['idx_of_top_skill'].apply(lambda i: skills[i])

In [ ]:
# Inspect the top skill found for the first 30 job titles.
top_skill_by_job_title.head(30)

In [ ]:
# Reload the saved job-title x skill matrix from disk, then convert the
# COO result to CSR so single rows can be sliced efficiently below.
jobtitle_skill = mmread(SKILL_DIR + 'jobtitle_skill.mtx').tocsr()
jobtitle_skill.shape

In [ ]:
# Job-title labels, aligned row-for-row with jobtitle_skill.
job_titles = job_title_df['title']

In [ ]:
# For each row (one job title) of jobtitle_skill, extract the non-zero
# skill frequencies and keep the k most frequent skills.
# FIX: removed the `global k` statement -- `global` is a no-op at module
# level; assigning k at top level already makes it a module global that
# the function body can read.
k = 3

def getTopK_Skills(idx):
    """Return a k-row DataFrame of the most frequent skills for the job
    title at row `idx` of jobtitle_skill.

    Relies on module-level globals: k, job_titles, jobtitle_skill (a CSR
    matrix), skills, and scipy.sparse's `find`.
    """
    title = job_titles[idx]
    print('Finding top-{} skills of job title {}...'.format(k, title))

    # find() returns (row_indices, col_indices, values) of the non-zero
    # entries; only the column indices and values are needed here.
    skill_occur = jobtitle_skill.getrow(idx)
    tmp = find(skill_occur)
    nz_indices = tmp[1]
    values = tmp[2]
    res = pd.DataFrame({'job_title': title,
                        'skill_found_in_jd': skills[nz_indices],
                        'occur_freq': values})

    # Most frequent first, then truncate to the top k.
    res.sort_values('occur_freq', ascending=False, inplace=True)
    return res.head(k)

In [ ]:
# getTopK_Skills(0)

In [ ]:
# Collect the top-k skill table of every job title and stack them into a
# single DataFrame (pd.concat accepts any iterable of frames).
frames = [getTopK_Skills(i) for i in range(n_job_title)]
res = pd.concat(frames)
res.head(30)

In [ ]:
# Save the per-title top-3 skill table for reporting.
res.to_csv(RES_DIR + 'top3_skill_by_jobtitle.csv', index=False)