Given a parsed job title whose position, domain and function are known, we find the top-$k$ titles most similar to it.
In [1]:
import gc
import ja_helpers as ja_helpers; from ja_helpers import *
In [2]:
# Make sure automatic garbage collection is switched on.
gc.enable()
In [3]:
# Project root, plus the cleaned-data and results folders beneath it.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'; RES_DIR = HOME_DIR + 'results/'
In [4]:
# Result folders for the per-domain and per-function job profiles ('new/' subfolder).
domain_dir = RES_DIR + 'job_prof/by_domain/' + 'new/'
func_dir = RES_DIR + 'job_prof/by_func/' + 'new/'
In [8]:
# Load the title table. keep_default_na=False stops pandas from turning its
# default NA markers (e.g. empty cells) into NaN, so such cells stay as strings.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv', keep_default_na=False)
In [9]:
title_df.head()
Out[9]:
In [10]:
# Lookup from the `non_std_title` column to the `title` column
# (presumably non-standard spelling -> standard form; confirm against the CSV).
stdForm = dict(zip(title_df.non_std_title, title_df.title))
In [11]:
# Per-title lookups, all keyed by the `title` column. If a title occurs in
# several rows, dict() keeps the value from the last such row.
positions = dict(zip(title_df.title, title_df.position))
domains = dict(zip(title_df.title, title_df.domain))
functions = dict(zip(title_df.title, title_df.pri_func))
In [12]:
# Report the number of distinct values in the `title` column.
print('# std titles: %d' % title_df.title.nunique())
# print
In [24]:
# Re-import the helper module and refresh the names pulled in from it.
# reload(my_util); from my_util import *
reload(ja_helpers); from ja_helpers import *
In [10]:
# Entries found under each 'sims/' folder. getDomainSims/getFuncSims test
# membership in these before reading a CSV. (Exact format returned by
# listFiles is not visible here; TODO confirm it matches clean()'s output.)
avail_domains = listFiles(domain_dir + 'sims/')
avail_funcs = listFiles(func_dir + 'sims/')
# Empty frame with the similarity-table columns, used as the "no data" result.
empty_sim = pd.DataFrame({'t1': [], 't2': [], 'topic_sim': []})
In [36]:
def getDomainSims(job_title):
    '''
    Load the similarity table of the domain that a job title belongs to.

    @param job_title: title to look up in the `domains` mapping
    @return: data frame read from '<domain_dir>sims/<domain>.csv', or
             `empty_sim` when the title has no known domain or the cleaned
             domain name is not among `avail_domains`
    '''
    # Unknown title: nothing to load.
    if job_title not in domains:
        return empty_sim

    domain_name = clean(domains[job_title])
    # Only read the file when it is listed as available.
    if domain_name not in avail_domains:
        return empty_sim

    sims_path = domain_dir + 'sims/{}.csv'.format(domain_name)
    return pd.read_csv(sims_path)
def getFuncSims(job_title):
    '''
    Load the similarity table of the primary function of a job title.

    @param job_title: title to look up in the `functions` mapping
    @return: data frame read from '<func_dir>sims/<function>.csv', or
             `empty_sim` when the title has no known primary function or the
             cleaned function name is not among `avail_funcs`
    '''
    # Unknown title: nothing to load.
    if job_title not in functions:
        return empty_sim

    func_name = clean(functions[job_title])
    # Only read the file when it is listed as available.
    if func_name not in avail_funcs:
        return empty_sim

    sims_path = func_dir + 'sims/{}.csv'.format(func_name)
    return pd.read_csv(sims_path)
## Main function
def queryTopkSim(job_title, k=2):
    '''
    Find the k job titles with the highest topic similarity to a given title.

    The title is first passed through camelCase(). The similarity tables for
    its domain and for its primary function are then loaded, and every row
    that mentions the title (as t1 or as t2) is handed to buildTopkFrom().

    @param job_title: the query title
    @param k: number of similar titles to return (default 2)
    @return: the result of buildTopkFrom() on the matching rows, or
             `empty_sim` when no row mentions the title
    '''
    job_title = camelCase(job_title)

    ## Load sim scores of relevant titles (by domain or function)
    dom_sims = getDomainSims(job_title)
    func_sims = getFuncSims(job_title)

    # Bind the title through `@job_title` instead of pasting it into the query
    # text, so a title containing quote characters cannot break the expression.
    q = 't1 == @job_title or t2 == @job_title'

    # pd.concat replaces DataFrame.append, which pandas deprecated in 1.4 and
    # removed in 2.0; the row-wise stacking is the same.
    rel_sims = pd.concat([dom_sims.query(q), func_sims.query(q)])

    if rel_sims.empty:
        return empty_sim
    return buildTopkFrom(rel_sims, k, job_title)
def fanOut(titles, fan_width):
    '''
    Run queryTopkSim() on every title and stack the results row-wise.

    @param titles: iterable of job titles to expand
    @param fan_width: passed as `k` to each queryTopkSim() call
    @return: one data frame holding all per-title results; when `titles` is
             empty, an empty frame with columns t1, t2, topic_sim
    '''
    frames = [queryTopkSim(t, k=fan_width) for t in titles]
    if not frames:
        # pd.concat raises ValueError on an empty list, so return an empty
        # frame with the similarity-table columns instead.
        return pd.DataFrame({'t1': [], 't2': [], 'topic_sim': []})
    return pd.concat(frames)
def relevantTitles(job_title='Software Developer'):
    '''
    Collect the titles that share a domain and/or primary function with a title.

    @param job_title: the query title; it is passed through camelCase()
                      before being looked up in `domains` and `functions`
    @return: np.unique() of the titles returned by titlesIn() for the domain
             and by titlesWithFunc() for the function; only the lookups that
             gave a non-empty value contribute
    '''
    job_title = camelCase(job_title)
    # A missing key and an empty value are treated alike: both are falsy.
    dom = domains.get(job_title)
    func = functions.get(job_title)

    if dom and func:
        rel_titles = np.concatenate([titlesIn(dom, title_df),
                                     titlesWithFunc(func, title_df)])
    elif dom:
        rel_titles = titlesIn(dom, title_df)
    elif func:
        rel_titles = titlesWithFunc(func, title_df)
    else:
        rel_titles = []
    return np.unique(rel_titles)
In [37]:
# Titles sharing a domain or primary function with 'software developer'.
relevantTitles('software developer')
Out[37]:
In [29]:
# Query the top 5 for 'software developer', then query the top 3 for each
# value in the result's `t2` column (presumably the neighbouring title;
# confirm against buildTopkFrom's output).
swr_dev_sims = queryTopkSim('software developer', k=5)
swr_dev_titles = swr_dev_sims.t2
fanOut(swr_dev_titles, fan_width=3)
Out[29]:
In [21]:
# Spot checks of the top-k query on a few individual titles.
queryTopkSim('software programmer', k=5)
Out[21]:
In [25]:
queryTopkSim('software engineer', k=10)
Out[25]:
In [26]:
queryTopkSim('mechanical engineer', k=10)
Out[26]:
In [27]:
# queryTopkSim('electrical engineer', k=10)
Out[27]:
In [13]:
# Load the similarity table stored for the 'executive' function.
exe_sims = pd.read_csv(func_dir + 'sims/executive.csv')
exe_sims.head()
Out[13]:
In [34]:
# Single-title call to topkByFunction, then the same call applied to every
# title returned by titlesWithFunc('executive', ...); the resulting table is
# written to '<func_dir>topk/executive.csv' without the index column.
topkByFunction(k=5, job_title='Financial Accounting Executive', func_sims=exe_sims)
exe_titles = titlesWithFunc('executive', title_df)
exe_top5 = pd.DataFrame({'title': exe_titles})
exe_top5['top5'] = exe_top5['title'].apply(topkByFunction, k=5, func_sims=exe_sims)
exe_top5.to_csv(func_dir + 'topk/executive.csv', index=False)
Out[34]:
In [44]:
# Top-10 queries for three executive titles; only the 'Sales Executive'
# result is kept in a variable for the next cell.
queryTopkSim('Accounts Executive', k=10)
queryTopkSim('Marketing Executive', k=10)
queryTopkSim('Sales Executive', k=10)
sales_exe_top10 = queryTopkSim('Sales Executive', k=10)
Out[44]:
In [61]:
# Query the top 2 for each `t2` value of the 'Sales Executive' result and
# take the `t2` column of the stacked output as candidate titles.
fan = fanOut(sales_exe_top10.t2, fan_width=2)
cand_titles = fan.t2
cand_titles
Out[61]:
Out[61]:
In [ ]:
In [ ]:
In [ ]:
# TESTS
# swr_sims = pasteCols('t2', 'topic_sim', swr_sims, 'title_n_sim')
# swr_sims.head()
# swapCols('t1', 't2', swr_sims.head())