Running LDA on the document-skill matrix, where each document is a single job post, still does not give good results. What is the problem here?
It seems that individual job posts are too noisy.
Thus, we now pool all posts sharing the same job title, so that the aggregated skill counts can outweigh the per-post noise.
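The intuition on toy data (the title and skills below are made up purely for illustration): any single post mentions its real skills only once or twice amid incidental terms, but pooling posts of the same title makes the shared skills dominate.
In [ ]:
import pandas as pd

# Toy illustration (made-up data): each post alone is noisy, but pooling
# posts of the same title makes the shared skill ('sql') clearly dominant
toy = pd.DataFrame({'title': ['data engineer'] * 3,
                    'doc': ['sql etl teamwork', 'sql spark', 'sql hive etl']})
pooled = toy.groupby('title')['doc'].agg(' '.join)
print(pooled['data engineer'])  # 'sql' occurs 3 times; noise terms once each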
In [ ]:
import numpy as np
import pandas as pd
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *
In [ ]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
SKILL_DIR = DATA_DIR + 'skill_cluster/'
RES_DIR = HOME_DIR + 'results/reports/skill_cluster/'
In [ ]:
jobs = pd.read_csv(DATA_DIR + 'jobs.csv')
In [ ]:
skill_df = pd.read_csv(SKILL_DIR + 'skill_df.csv')
In [ ]:
by_job_title = jobs.groupby('title')
# Concatenate the job ids and JD texts of all posts sharing a title;
# 'next_doc' serves as a separator token between the joined documents
job_title_df = by_job_title.agg({'job_id': lambda x: ','.join(x), 'doc': lambda x: 'next_doc'.join(x)})
job_title_df = job_title_df.add_prefix('agg_').reset_index()
job_title_df.head()
In [ ]:
n_job_title = by_job_title.ngroups
print('# job titles: %d' %n_job_title)
In [ ]:
# Pick up any edits made to the helper module without restarting the kernel
reload(cluster_skill_helpers)
from cluster_skill_helpers import *
In [ ]:
jd_docs = job_title_df['agg_doc']
In [ ]:
# This version of the skill vocabulary still contains stopwords
doc_skill = buildDocSkillMat(jd_docs, skill_df)
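For reference, a minimal sketch of the kind of matrix buildDocSkillMat is expected to return; the helper's actual internals may differ, and using CountVectorizer with a fixed skill vocabulary is only an assumption here.
In [ ]:
# Sketch only (assumes the helper amounts to counting occurrences of the
# known skills, incl. bi-/tri-grams, in each aggregated JD)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(vocabulary=skill_df['skill'].unique(), ngram_range=(1, 3))
doc_skill_sketch = cv.fit_transform(jd_docs)  # sparse (n_job_title x n_skill) counts
doc_skill_sketch.shape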
In [ ]:
from scipy.sparse import hstack
# Stack the unigram, bigram and trigram count matrices (presumably built
# inside buildDocSkillMat above) column-wise into one job-title x skill matrix
jobtitle_skill = hstack([doc_unigram, doc_bigram, doc_trigram])
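A quick toy check of what hstack does here: it concatenates sparse matrices column-wise, so the row count (job titles) is preserved while the skill columns are appended.
In [ ]:
from scipy.sparse import csr_matrix, hstack

a = csr_matrix(np.ones((2, 3)))
b = csr_matrix(np.ones((2, 5)))
hstack([a, b]).shape  # (2, 8): rows preserved, columns appended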
In [ ]:
from scipy.io import mmwrite

with open(SKILL_DIR + 'jobtitle_skill.mtx', 'w') as f:
    mmwrite(f, jobtitle_skill)
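Saving in Matrix Market (.mtx) format keeps the matrix sparse on disk, so later sessions can reload it with mmread (see the read-back cell below) instead of recomputing it.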
In [ ]:
jobtitle_skill.shape
In [ ]:
# Densify for the row-wise argmax below; the sparse .mtx copy saved above
# remains the persistent version
jobtitle_skill = jobtitle_skill.toarray()
In [ ]:
job_title_df.head(1)
In [ ]:
# Index of the most frequent skill of each job title (row-wise argmax)
idx_of_top_skill = np.apply_along_axis(np.argmax, 1, jobtitle_skill)
skills = skill_df['skill']
job_titles = job_title_df['title']
top_skill_by_job_title = pd.DataFrame({'job_title': job_titles, 'idx_of_top_skill': idx_of_top_skill})
top_skill_by_job_title['top_skill'] = top_skill_by_job_title['idx_of_top_skill'].apply(lambda i: skills[i])
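The index-to-skill mapping above only makes sense if the rows of skill_df line up with the columns of jobtitle_skill; a cheap sanity check on at least the dimensions:
In [ ]:
# Cheap consistency check: the skill list must have one entry per matrix column
assert jobtitle_skill.shape[1] == len(skills)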
In [ ]:
top_skill_by_job_title.head(30)
In [ ]:
from scipy.io import mmread

with open(SKILL_DIR + 'jobtitle_skill.mtx', 'r') as f:
    jobtitle_skill = mmread(f)

jobtitle_skill = jobtitle_skill.tocsr()
jobtitle_skill.shape
In [ ]:
job_titles = job_title_df['title']
In [ ]:
# For each row (a job title) in jobtitle_skill, pull out the non-zero
# skill frequencies and keep the k most frequent skills
from scipy.sparse import find

k = 3

def getTopK_Skills(idx):
    title = job_titles[idx]
    print('Finding top-{} skills of job title {}...'.format(k, title))
    skill_occur = jobtitle_skill.getrow(idx)
    _, nz_indices, values = find(skill_occur)  # column indices and counts of non-zero entries
    res = pd.DataFrame({'job_title': title, 'skill_found_in_jd': skills[nz_indices], 'occur_freq': values})
    res.sort_values('occur_freq', ascending=False, inplace=True)
    return res.head(k)
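Working through find() restricts the sort to each row's non-zero entries, which is much cheaper than sorting a full vocabulary-sized row per job title.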
In [ ]:
# getTopK_Skills(0)
In [ ]:
frames = map(getTopK_Skills, range(n_job_title))
res = pd.concat(frames)  # concat() accepts an arbitrary number of frames
res.head(30)
In [ ]:
res.to_csv(RES_DIR + 'top3_skill_by_jobtitle.csv', index=False)