Connecting Job Titles by Topic Similarity

Given a parsed job title with known position, domain and function, we find the top-$k$ titles most similar to it.

Preps


In [1]:
import gc
import ja_helpers as ja_helpers; from ja_helpers import *

In [2]:
# Make sure automatic garbage collection is switched on for this session.
gc.enable()

In [3]:
# Base folder of the project; DATA_DIR and RES_DIR are derived from it.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'

In [4]:
# Result folders of the job profiles grouped by domain and by function
# (the 'new/' sub-folder of each).
domain_dir = RES_DIR + 'job_prof/by_domain/' + 'new/'
func_dir = RES_DIR + 'job_prof/by_func/' + 'new/'
  • Load parsed titles with at least 2 posts. Make sure to turn off the default NA handling so that empty strings are not treated as NAs.

In [8]:
# keep_default_na=False stops pandas from converting empty cells to NaN, so
# blank fields (e.g. a missing domain or function) stay as empty strings.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv', keep_default_na=False)

In [9]:
title_df.head()


Out[9]:
title n_post domain position pri_func sec_func non_std_title
0 Academic Assistant 13 academic assistant Academic Assistant
1 Academic Consultant 13 academic consultant Academic Consultant
2 Academic Coordinator 12 academic coordinator Academic Coordinator
3 Academic Director 6 academic director Academic Director
4 Academic Instructor 13 academic instructor Academic Instructor

In [10]:
# Lookup from each non-standard title to its standardized form.
stdForm = {raw: std for raw, std in zip(title_df.non_std_title, title_df.title)}

In [11]:
# Lookup tables from a standardized title to its parsed position, domain
# and primary function, built the same way for each of the three columns.
positions, domains, functions = (
    dict(zip(title_df['title'], title_df[col]))
    for col in ('position', 'domain', 'pri_func')
)

In [12]:
# Report how many distinct standardized titles were loaded.
print('# std titles: %d' % title_df.title.nunique())


# std titles: 2839

Run


In [24]:
# Re-import ja_helpers so that edits to the module are picked up without
# restarting the kernel, then refresh the names brought in by the star import.
# (Same pattern, currently disabled, for my_util:)
# reload(my_util); from my_util import *
reload(ja_helpers); from ja_helpers import *

In [10]:
# Listings of the two similarity folders. getDomainSims/getFuncSims test a
# cleaned domain/function name for membership here before reading its CSV.
# NOTE(review): listFiles comes from ja_helpers; presumably it returns the
# file names without the '.csv' extension -- confirm.
avail_domains = listFiles(domain_dir + 'sims/')
avail_funcs = listFiles(func_dir + 'sims/')

# Zero-row frame with the similarity columns, used whenever no scores exist.
empty_sim = pd.DataFrame({'t1': [], 't2': [], 'topic_sim': []})

In [36]:
def getDomainSims(job_title):
    '''
    Load the similarity scores stored for the domain of the given job title.
    @return: frame read from '<domain>.csv' under domain_dir + 'sims/';
             empty_sim if the title is not in domains or its cleaned domain
             is not listed in avail_domains
    '''
    # A title missing from the lookup has no domain to load scores for.
    if job_title not in domains:
        return empty_sim

    domain_key = clean(domains[job_title])
    # Only touch the disk when the cleaned domain is listed as available.
    if domain_key not in avail_domains:
        return empty_sim
    return pd.read_csv(domain_dir + 'sims/{}.csv'.format(domain_key))

def getFuncSims(job_title):
    '''
    Load the similarity scores stored for the primary function of the given
    job title.
    @return: frame read from '<function>.csv' under func_dir + 'sims/';
             empty_sim if the title is not in functions or its cleaned
             function is not listed in avail_funcs
    '''
    # No entry in the lookup means there is no function to load scores for.
    if job_title not in functions:
        return empty_sim

    func_key = clean(functions[job_title])
    # Skip the read unless the cleaned function is listed as available.
    if func_key not in avail_funcs:
        return empty_sim
    return pd.read_csv(func_dir + 'sims/{}.csv'.format(func_key))

## Main function
def queryTopkSim(job_title, k=2):
    '''
    Find the titles with the highest topic similarity to a given job title.

    The title is first normalized with camelCase. Similarity rows are then
    taken from both the domain file and the primary-function file of the
    title, keeping every row in which the title appears as t1 or as t2.

    @param job_title: job title to look up
    @param k: number of similar titles requested (forwarded to buildTopkFrom)
    @return: the frame built by buildTopkFrom from the relevant rows; an
             empty frame with the columns of empty_sim if no row mentions
             the title
    '''
    job_title = camelCase(job_title)

    # Load sim scores of relevant titles (by domain or function)
    dom_sims = getDomainSims(job_title)
    func_sims = getFuncSims(job_title)

    def _rowsInvolving(sims):
        # Boolean masks are used rather than a formatted DataFrame.query
        # string, which breaks when the title itself contains a double quote.
        return sims[(sims['t1'] == job_title) | (sims['t2'] == job_title)]

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    rel_sims = pd.concat([_rowsInvolving(dom_sims), _rowsInvolving(func_sims)])

    if rel_sims.empty:
        # Hand back a copy so callers cannot alter the shared empty_sim frame.
        return empty_sim.copy()
    return buildTopkFrom(rel_sims, k, job_title)

def fanOut(titles, fan_width):
    '''
    Query the most similar titles for each of the given titles and stack the
    per-title results into one frame.

    @param titles: iterable of job titles to expand
    @param fan_width: number of similar titles requested per input title
    @return: concatenation of the queryTopkSim results, in input order; an
             empty frame with the columns of empty_sim if titles is empty
    '''
    frames = [queryTopkSim(t, k=fan_width) for t in titles]
    # pd.concat raises ValueError on an empty list, which happens whenever an
    # upstream query found nothing and passes on an empty column of titles.
    if not frames:
        return empty_sim.copy()
    return pd.concat(frames)

def relevantTitles(job_title='Software Developer'):
    '''
    List the titles related to a job title through its domain and/or its
    primary function.
    @return: array of unique titles gathered via titlesIn (same domain) and
             titlesWithFunc (same function); empty if the title has neither
    '''
    job_title = camelCase(job_title)

    # .get gives None for unknown titles; blank entries are falsy as well,
    # so both cases are treated as "not available" below.
    dom = domains.get(job_title)
    func = functions.get(job_title)

    if dom and func:
        rel_titles = np.concatenate([titlesIn(dom, title_df), titlesWithFunc(func, title_df)])
    elif dom:
        rel_titles = titlesIn(dom, title_df)
    elif func:
        rel_titles = titlesWithFunc(func, title_df)
    else:
        rel_titles = []

    return np.unique(rel_titles)

In [37]:
relevantTitles('software developer')


Out[37]:
array(['Application Developer', 'Assistant Software Engineer',
       'Business Systems Developer', 'Concept Design Developer',
       'Curriculum Developer', 'Database Developer', 'Game Developer',
       'Hypertext Preprocessor Developer', 'Java Developer',
       'Multimedia Developer', 'Project Developer',
       'Senior Application Developer', 'Senior Database Developer',
       'Senior Game Developer', 'Senior Hypertext Preprocessor Developer',
       'Senior Java Developer', 'Senior Software Analyst',
       'Senior Software Architect', 'Senior Software Consultant',
       'Senior Software Developer', 'Senior Software Engineer',
       'Senior Software Manager', 'Senior Software Specialist',
       'Senior Solutions Developer', 'Senior Systems Developer',
       'Senior Technical Developer', 'Senior Web Application Developer',
       'Senior Web Developer', 'Software Analyst',
       'Software And Applications Developer', 'Software Architect',
       'Software Consultant', 'Software Developer', 'Software Engineer',
       'Software Manager', 'Software Programmer', 'Software Specialist',
       'Solutions Developer', 'Staff Software Engineer',
       'Systems Developer', 'Technical Developer',
       'Web Application Developer', 'Web Developer', 'Website Developer'], dtype=object)

In [29]:
# Take the 5 titles most similar to 'software developer' (column t2 of the
# result), then expand each of them to its own 3 most similar titles.
swr_dev_sims = queryTopkSim('software developer', k=5)
swr_dev_titles = swr_dev_sims.t2
fanOut(swr_dev_titles, fan_width=3)


Out[29]:
t1 t2 topic_sim title_n_sim
0 Senior Hypertext Preprocessor Developer Hypertext Preprocessor Developer 0.901 Hypertext Preprocessor Developer(0.901)
1 Senior Hypertext Preprocessor Developer Multimedia Developer 0.893 Multimedia Developer(0.893)
2 Senior Hypertext Preprocessor Developer Software And Applications Developer 0.886 Software And Applications Developer(0.886)
3 Concept Design Developer Hypertext Preprocessor Developer 0.884 Hypertext Preprocessor Developer(0.884)
7 Concept Design Developer Senior Hypertext Preprocessor Developer 0.876 Senior Hypertext Preprocessor Developer(0.876)
10 Concept Design Developer Software And Applications Developer 0.873 Software And Applications Developer(0.873)
0 Hypertext Preprocessor Developer Senior Hypertext Preprocessor Developer 0.901 Senior Hypertext Preprocessor Developer(0.901)
3 Hypertext Preprocessor Developer Concept Design Developer 0.884 Concept Design Developer(0.884)
4 Hypertext Preprocessor Developer Multimedia Developer 0.879 Multimedia Developer(0.879)
2 Software And Applications Developer Senior Hypertext Preprocessor Developer 0.886 Senior Hypertext Preprocessor Developer(0.886)
6 Software And Applications Developer Hypertext Preprocessor Developer 0.877 Hypertext Preprocessor Developer(0.877)
10 Software And Applications Developer Concept Design Developer 0.873 Concept Design Developer(0.873)
316 Assistant Software Engineer Information Visualisation Engineer 0.906 Information Visualisation Engineer(0.906)
687 Assistant Software Engineer Solutions Quality Engineer 0.899 Solutions Quality Engineer(0.899)
974 Assistant Software Engineer Solar Engineer 0.895 Solar Engineer(0.895)

In [21]:
queryTopkSim('software programmer', k=5)


Out[21]:
t1 t2 topic_sim title_n_sim
0 Software Programmer Assistant Software Engineer 0.869 Assistant Software Engineer(0.869)
3 Software Programmer Systems Programmer 0.879 Systems Programmer(0.879)
22 Software Programmer Analyst Programmer 0.866 Analyst Programmer(0.866)
19 Software Programmer Lead Programmer 0.866 Lead Programmer(0.866)
28 Software Programmer Technical Programmer 0.860 Technical Programmer(0.86)

Engineer Jobs


In [25]:
queryTopkSim('software engineer', k=10)


Out[25]:
t1 t2 topic_sim title_n_sim
3456 Software Engineer Information Visualisation Engineer 0.880 Information Visualisation Engineer(0.88)
6816 Software Engineer Solutions Quality Engineer 0.869 Solutions Quality Engineer(0.869)
11335 Software Engineer Software Design Engineer 0.859 Software Design Engineer(0.859)
4 Software Engineer Assistant Software Engineer 0.858 Assistant Software Engineer(0.858)
12734 Software Engineer Assistant Software Engineer 0.857 Assistant Software Engineer(0.857)
13243 Software Engineer Solar Engineer 0.856 Solar Engineer(0.856)
15014 Software Engineer Senior Embedded Software Engineer 0.853 Senior Embedded Software Engineer(0.853)
16017 Software Engineer Senior Mobile Engineer 0.851 Senior Mobile Engineer(0.851)
15990 Software Engineer Environmental Affairs Engineer 0.851 Environmental Affairs Engineer(0.851)
17001 Software Engineer Embedded Software Engineer 0.850 Embedded Software Engineer(0.85)

In [26]:
queryTopkSim('mechanical engineer', k=10)


Out[26]:
t1 t2 topic_sim title_n_sim
2181 Mechanical Engineer Power Plant Service Engineer 0.886 Power Plant Service Engineer(0.886)
3607 Mechanical Engineer Senior Tunnel Engineer 0.879 Senior Tunnel Engineer(0.879)
3605 Mechanical Engineer Rail Track Engineer 0.879 Rail Track Engineer(0.879)
3734 Mechanical Engineer Dredging Engineer 0.878 Dredging Engineer(0.878)
4260 Mechanical Engineer Assistant Site Engineer 0.877 Assistant Site Engineer(0.877)
4557 Mechanical Engineer Senior Building Engineer 0.876 Senior Building Engineer(0.876)
4857 Mechanical Engineer Architectural Engineer 0.875 Architectural Engineer(0.875)
4831 Mechanical Engineer Senior Site Engineer 0.875 Senior Site Engineer(0.875)
4774 Mechanical Engineer Cost Schedule Engineer 0.875 Cost Schedule Engineer(0.875)
4607 Mechanical Engineer Layout Engineer 0.875 Layout Engineer(0.875)

In [27]:
# queryTopkSim('electrical engineer', k=10)


Out[27]:
t1 t2 topic_sim title_n_sim
1749 Electrical Engineer Dredging Engineer 0.888 Dredging Engineer(0.888)
1973 Electrical Engineer Assistant Site Engineer 0.887 Assistant Site Engineer(0.887)
2598 Electrical Engineer Architectural Engineer 0.884 Architectural Engineer(0.884)
2607 Electrical Engineer Construction Engineer 0.884 Construction Engineer(0.884)
2735 Electrical Engineer Senior Tunnel Engineer 0.883 Senior Tunnel Engineer(0.883)
2879 Electrical Engineer Rail Track Engineer 0.882 Rail Track Engineer(0.882)
2902 Electrical Engineer Senior Site Engineer 0.882 Senior Site Engineer(0.882)
2920 Electrical Engineer Power Plant Service Engineer 0.882 Power Plant Service Engineer(0.882)
3239 Electrical Engineer Geotechnical Engineer 0.881 Geotechnical Engineer(0.881)
3253 Electrical Engineer Cost Schedule Engineer 0.880 Cost Schedule Engineer(0.88)

Executive Jobs


In [13]:
# Read the similarity file stored for the 'executive' function and preview it.
exe_sims = pd.read_csv(func_dir + 'sims/executive.csv')
exe_sims.head()


Out[13]:
t1 t2 topic_sim
0 Financial Accounting Executive Senior Accounting Executive 0.933
1 Financial Accounting Executive Tax Executive 0.927
2 Restaurant Executive Catering Executive 0.926
3 Restaurant Executive Food & Beverage Executive 0.911
4 Financial Accounting Executive Accounts Executive 0.911

In [34]:
# NOTE(review): the result of this call is neither assigned nor the cell's
# last expression, so it is computed and then dropped without being shown.
topkByFunction(k=5, job_title='Financial Accounting Executive', func_sims=exe_sims)

# One row per title returned by titlesWithFunc for the 'executive' function.
exe_titles = titlesWithFunc('executive', title_df)
exe_top5 = pd.DataFrame({'title': exe_titles})

# For every such title, store what topkByFunction returns with k=5 against
# the executive similarity frame, then save the table next to the sims.
exe_top5['top5'] = exe_top5['title'].apply(topkByFunction, k=5, func_sims=exe_sims)
exe_top5.to_csv(func_dir + 'topk/executive.csv', index=False)


Out[34]:
title top5
0 Accounts Executive Financial Accounting Executive(0.911), Tax Exe...
1 Sales Executive Senior Product Development Executive(0.888), C...
2 Marketing Executive Promotions Executive(0.866), Senior Distributi...
3 Executive Corporate Planning Executive(0.858), Governmen...
4 Human Resource Executive Senior Human Resource Executive(0.874), Payrol...

In [44]:
# NOTE(review): a cell only renders its last expression, and the last line
# here is an assignment -- the three bare calls below compute results that
# are discarded. 'Sales Executive' is also queried twice; only the assigned
# result is used later.
queryTopkSim('Accounts Executive', k=10)
queryTopkSim('Marketing Executive', k=10)
queryTopkSim('Sales Executive', k=10)
sales_exe_top10 = queryTopkSim('Sales Executive', k=10)


Out[44]:
t1 t2 topic_sim title_n_sim
77 Sales Executive Senior Product Development Executive 0.888 Senior Product Development Executive(0.888)
185 Sales Executive Key Account Executive 0.881 Key Account Executive(0.881)
192 Sales Executive Corporate Planning Executive 0.881 Corporate Planning Executive(0.881)
221 Sales Executive Sales Representative (Medical and Pharmaceutic... 0.881 Sales Representative (Medical and Pharmaceutic...
113 Sales Executive Sales Clerk 0.880 Sales Clerk(0.88)

In [61]:
# Expand each title found for 'Sales Executive' to its 2 most similar titles
# and keep the t2 column of the stacked result as the candidate titles.
fan = fanOut(sales_exe_top10.t2, fan_width=2)
cand_titles = fan.t2
cand_titles


Out[61]:
8                    Senior Account Management Executive
11                              Senior Product Executive
14                  Senior Product Development Executive
32                                Export Sales Executive
23                                  Government Executive
27                                   Ticketing Executive
11                                       Logistics Clerk
108                                     Sales Supervisor
0                                      Sales Coordinator
109                                          Sales Clerk
38                                 Assistant  Supervisor
55                                     Branch Supervisor
219    Sales Representative (Medical and Pharmaceutic...
256                                     Sales Supervisor
43                                     Sales Coordinator
2                               Assistant Sales Engineer
0                                      Ticketing Officer
27                          Corporate Planning Executive
Name: t2, dtype: object

In [ ]:


In [ ]:


In [ ]:
# TESTS

# swr_sims = pasteCols('t2', 'topic_sim', swr_sims, 'title_n_sim')
# swr_sims.head()
# swapCols('t1', 't2', swr_sims.head())