Connecting Job Titles by Topic Similarity

Given a parsed job title with known position, domain and function, we find the top-$k$ titles most similar to it.

Preparation



In [1]:

    
import gc
import ja_helpers as ja_helpers; from ja_helpers import *



In [2]:

    
# Explicitly turn on automatic garbage collection for this session.
gc.enable()



In [3]:

    
# Project root; the cleaned input data and the generated results live beneath it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'



In [4]:

    
# Result folders of the job profiles, grouped by domain and by function
# (the 'new/' sub-folder of each).
domain_dir = ''.join([RES_DIR, 'job_prof/by_domain/', 'new/'])
func_dir = ''.join([RES_DIR, 'job_prof/by_func/', 'new/'])

Load parsed titles with at least 2 posts. Make sure to turn off the default NA handling so that empty strings are not treated as NAs.



In [8]:

    
# keep_default_na=False: empty cells are read as '' instead of NaN.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv', keep_default_na=False)



In [9]:

    
title_df.head()









    Out[9]:






  
    
      
      title
      n_post
      domain
      position
      pri_func
      sec_func
      non_std_title
    
  
  
    
      0
      Academic Assistant
      13
      academic
      
      assistant
      
      Academic Assistant
    
    
      1
      Academic Consultant
      13
      academic
      
      consultant
      
      Academic Consultant
    
    
      2
      Academic Coordinator
      12
      academic
      
      coordinator
      
      Academic Coordinator
    
    
      3
      Academic Director
      6
      academic
      
      director
      
      Academic Director
    
    
      4
      Academic Instructor
      13
      academic
      
      instructor
      
      Academic Instructor



In [10]:

    
# Lookup: non-standard title -> its standard title.
stdForm = dict(zip(title_df.non_std_title, title_df.title))



In [11]:

    
# Lookups keyed by standard title: parsed position, domain and primary function.
positions, domains, functions = (
    dict(zip(title_df['title'], title_df[col]))
    for col in ('position', 'domain', 'pri_func')
)



In [12]:

    
# Number of distinct standard titles in the table.
print('# std titles: %d' % title_df['title'].nunique())









    



# std titles: 2839

Run



In [24]:

    
# reload(my_util); from my_util import *
# Development aid: re-import ja_helpers so that edits to the module take effect
# without restarting the kernel.
# NOTE(review): the bare `reload` builtin exists only in Python 2; on Python 3
# it has to come from importlib.
reload(ja_helpers); from ja_helpers import *



In [10]:

    
# Entries found in the two 'sims/' folders; getDomainSims/getFuncSims only read
# a similarity file when the cleaned domain/function name is among them.
# NOTE(review): presumably listFiles returns file names without the '.csv'
# extension, since they are compared against clean(...) values — confirm in ja_helpers.
avail_domains = listFiles(domain_dir + 'sims/')
avail_funcs = listFiles(func_dir + 'sims/')

# Empty frame with the columns of a similarity file, used as the "no data" result.
empty_sim = pd.DataFrame({'t1': [], 't2': [], 'topic_sim': []})



In [36]:

    
def getDomainSims(job_title):
    '''
    Load the pairwise topic-similarity scores stored for the domain of a job title.

    @param job_title: job title, looked up in the global `domains` dict
    @return: frame read from <domain_dir>sims/<domain>.csv; the shared `empty_sim`
             frame when the title is unknown or its cleaned domain is not listed
             in `avail_domains`
    '''
    # Test membership on the dict itself: no intermediate .keys() list is built or scanned.
    if job_title not in domains:
        return empty_sim

    dom = clean(domains[job_title])
    if dom not in avail_domains:
        return empty_sim

    # Build the path only once we know the file is there to be read.
    return pd.read_csv(domain_dir + 'sims/{}.csv'.format(dom))

def getFuncSims(job_title):
    '''
    Load the pairwise topic-similarity scores stored for the primary function of a job title.

    @param job_title: job title, looked up in the global `functions` dict
    @return: frame read from <func_dir>sims/<function>.csv; the shared `empty_sim`
             frame when the title is unknown or its cleaned function is not listed
             in `avail_funcs`
    '''
    # Test membership on the dict itself: no intermediate .keys() list is built or scanned.
    if job_title not in functions:
        return empty_sim

    pri_func = clean(functions[job_title])
    if pri_func not in avail_funcs:
        return empty_sim

    # Build the path only once we know the file is there to be read.
    return pd.read_csv(func_dir + 'sims/{}.csv'.format(pri_func))

## Main function
def queryTopkSim(job_title, k=2):
    '''
    Find the k job titles with the highest topic similarity to a given job title.

    Similarity rows are gathered from the file of the title's domain and the
    file of its primary function, keeping every row in which the title appears
    as either `t1` or `t2`.

    @param job_title: job title to query; normalised with camelCase() first
    @param k: number of similar titles to return
    @return: result of buildTopkFrom() on the relevant rows; an empty frame with
             columns t1, t2, topic_sim when no row mentions the title
    '''
    job_title = camelCase(job_title)

    def _rows_mentioning(sims):
        # Boolean masks instead of a formatted query string: a title containing
        # a double quote can no longer break the expression. Empty frames are
        # passed through untouched (their columns are not string-typed).
        if sims.empty:
            return sims
        return sims[(sims['t1'] == job_title) | (sims['t2'] == job_title)]

    # Load sim scores of relevant titles (by domain, then by function).
    candidates = [_rows_mentioning(getDomainSims(job_title)),
                  _rows_mentioning(getFuncSims(job_title))]
    parts = [part for part in candidates if not part.empty]

    if not parts:
        # Hand out a copy so callers cannot alter the shared empty template.
        return empty_sim.copy()

    # pd.concat replaces DataFrame.append (removed in pandas 2.0); the original
    # row labels are kept, as before.
    # NOTE(review): a pair present in both the domain and the function file
    # contributes two rows — nothing is de-duplicated here.
    rel_sims = pd.concat(parts)
    return buildTopkFrom(rel_sims, k, job_title)

def fanOut(titles, fan_width):
    '''
    Collect the `fan_width` most similar titles of every title in `titles`.

    @param titles: iterable of job titles
    @param fan_width: number of similar titles to retrieve per title
    @return: the per-title queryTopkSim() frames stacked into one frame (row
             labels are not reset); an empty frame with columns t1, t2,
             topic_sim when `titles` is empty
    '''
    frames = [queryTopkSim(t, k=fan_width) for t in titles]
    # pd.concat raises ValueError on an empty list, which happens when fanning
    # out from an empty query result.
    if not frames:
        return empty_sim.copy()
    return pd.concat(frames)

def relevantTitles(job_title='Software Developer'):
    '''
    List the titles that share the domain or the primary function of a job title.

    @param job_title: job title to look up; normalised with camelCase() first
    @return: sorted array of the unique titles obtained from titlesIn() for the
             title's domain and titlesWithFunc() for its primary function;
             empty when the title has neither
    '''
    job_title = camelCase(job_title)
    # dict.get yields None for unknown titles; an empty-string domain/function
    # is falsy and therefore treated as missing, too.
    dom = domains.get(job_title)
    func = functions.get(job_title)

    # Exactly one of the four cases applies, so a single if/elif chain suffices.
    if dom and func:
        rel_titles = np.concatenate([titlesIn(dom, title_df), titlesWithFunc(func, title_df)])
    elif dom:
        rel_titles = titlesIn(dom, title_df)
    elif func:
        rel_titles = titlesWithFunc(func, title_df)
    else:
        rel_titles = []

    return np.unique(rel_titles)



In [37]:

    
relevantTitles('software developer')









    Out[37]:





array(['Application Developer', 'Assistant Software Engineer',
       'Business Systems Developer', 'Concept Design Developer',
       'Curriculum Developer', 'Database Developer', 'Game Developer',
       'Hypertext Preprocessor Developer', 'Java Developer',
       'Multimedia Developer', 'Project Developer',
       'Senior Application Developer', 'Senior Database Developer',
       'Senior Game Developer', 'Senior Hypertext Preprocessor Developer',
       'Senior Java Developer', 'Senior Software Analyst',
       'Senior Software Architect', 'Senior Software Consultant',
       'Senior Software Developer', 'Senior Software Engineer',
       'Senior Software Manager', 'Senior Software Specialist',
       'Senior Solutions Developer', 'Senior Systems Developer',
       'Senior Technical Developer', 'Senior Web Application Developer',
       'Senior Web Developer', 'Software Analyst',
       'Software And Applications Developer', 'Software Architect',
       'Software Consultant', 'Software Developer', 'Software Engineer',
       'Software Manager', 'Software Programmer', 'Software Specialist',
       'Solutions Developer', 'Staff Software Engineer',
       'Systems Developer', 'Technical Developer',
       'Web Application Developer', 'Web Developer', 'Website Developer'], dtype=object)



In [29]:

    
# Top-5 neighbours of 'software developer', then the top-3 neighbours of each of them.
swr_dev_sims = queryTopkSim('software developer', k=5)
swr_dev_titles = swr_dev_sims['t2']
fanOut(swr_dev_titles, fan_width=3)









    Out[29]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      0
      Senior Hypertext Preprocessor Developer
      Hypertext Preprocessor Developer
      0.901
      Hypertext Preprocessor Developer(0.901)
    
    
      1
      Senior Hypertext Preprocessor Developer
      Multimedia Developer
      0.893
      Multimedia Developer(0.893)
    
    
      2
      Senior Hypertext Preprocessor Developer
      Software And Applications Developer
      0.886
      Software And Applications Developer(0.886)
    
    
      3
      Concept Design Developer
      Hypertext Preprocessor Developer
      0.884
      Hypertext Preprocessor Developer(0.884)
    
    
      7
      Concept Design Developer
      Senior Hypertext Preprocessor Developer
      0.876
      Senior Hypertext Preprocessor Developer(0.876)
    
    
      10
      Concept Design Developer
      Software And Applications Developer
      0.873
      Software And Applications Developer(0.873)
    
    
      0
      Hypertext Preprocessor Developer
      Senior Hypertext Preprocessor Developer
      0.901
      Senior Hypertext Preprocessor Developer(0.901)
    
    
      3
      Hypertext Preprocessor Developer
      Concept Design Developer
      0.884
      Concept Design Developer(0.884)
    
    
      4
      Hypertext Preprocessor Developer
      Multimedia Developer
      0.879
      Multimedia Developer(0.879)
    
    
      2
      Software And Applications Developer
      Senior Hypertext Preprocessor Developer
      0.886
      Senior Hypertext Preprocessor Developer(0.886)
    
    
      6
      Software And Applications Developer
      Hypertext Preprocessor Developer
      0.877
      Hypertext Preprocessor Developer(0.877)
    
    
      10
      Software And Applications Developer
      Concept Design Developer
      0.873
      Concept Design Developer(0.873)
    
    
      316
      Assistant Software Engineer
      Information Visualisation Engineer
      0.906
      Information Visualisation Engineer(0.906)
    
    
      687
      Assistant Software Engineer
      Solutions Quality Engineer
      0.899
      Solutions Quality Engineer(0.899)
    
    
      974
      Assistant Software Engineer
      Solar Engineer
      0.895
      Solar Engineer(0.895)



In [21]:

    
queryTopkSim('software programmer', k=5)









    Out[21]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      0
      Software Programmer
      Assistant Software Engineer
      0.869
      Assistant Software Engineer(0.869)
    
    
      3
      Software Programmer
      Systems Programmer
      0.879
      Systems Programmer(0.879)
    
    
      22
      Software Programmer
      Analyst Programmer
      0.866
      Analyst Programmer(0.866)
    
    
      19
      Software Programmer
      Lead  Programmer
      0.866
      Lead  Programmer(0.866)
    
    
      28
      Software Programmer
      Technical Programmer
      0.860
      Technical Programmer(0.86)

Engineer Jobs



In [25]:

    
queryTopkSim('software engineer', k=10)









    Out[25]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      3456
      Software Engineer
      Information Visualisation Engineer
      0.880
      Information Visualisation Engineer(0.88)
    
    
      6816
      Software Engineer
      Solutions Quality Engineer
      0.869
      Solutions Quality Engineer(0.869)
    
    
      11335
      Software Engineer
      Software Design Engineer
      0.859
      Software Design Engineer(0.859)
    
    
      4
      Software Engineer
      Assistant Software Engineer
      0.858
      Assistant Software Engineer(0.858)
    
    
      12734
      Software Engineer
      Assistant Software Engineer
      0.857
      Assistant Software Engineer(0.857)
    
    
      13243
      Software Engineer
      Solar Engineer
      0.856
      Solar Engineer(0.856)
    
    
      15014
      Software Engineer
      Senior Embedded Software Engineer
      0.853
      Senior Embedded Software Engineer(0.853)
    
    
      16017
      Software Engineer
      Senior Mobile Engineer
      0.851
      Senior Mobile Engineer(0.851)
    
    
      15990
      Software Engineer
      Environmental Affairs Engineer
      0.851
      Environmental Affairs Engineer(0.851)
    
    
      17001
      Software Engineer
      Embedded Software Engineer
      0.850
      Embedded Software Engineer(0.85)



In [26]:

    
queryTopkSim('mechanical engineer', k=10)









    Out[26]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      2181
      Mechanical Engineer
      Power Plant Service Engineer
      0.886
      Power Plant Service Engineer(0.886)
    
    
      3607
      Mechanical Engineer
      Senior Tunnel Engineer
      0.879
      Senior Tunnel Engineer(0.879)
    
    
      3605
      Mechanical Engineer
      Rail Track Engineer
      0.879
      Rail Track Engineer(0.879)
    
    
      3734
      Mechanical Engineer
      Dredging Engineer
      0.878
      Dredging Engineer(0.878)
    
    
      4260
      Mechanical Engineer
      Assistant Site Engineer
      0.877
      Assistant Site Engineer(0.877)
    
    
      4557
      Mechanical Engineer
      Senior Building Engineer
      0.876
      Senior Building Engineer(0.876)
    
    
      4857
      Mechanical Engineer
      Architectural Engineer
      0.875
      Architectural Engineer(0.875)
    
    
      4831
      Mechanical Engineer
      Senior Site Engineer
      0.875
      Senior Site Engineer(0.875)
    
    
      4774
      Mechanical Engineer
      Cost Schedule Engineer
      0.875
      Cost Schedule Engineer(0.875)
    
    
      4607
      Mechanical Engineer
      Layout Engineer
      0.875
      Layout Engineer(0.875)



In [27]:

    
# NOTE(review): the call is commented out, so the output shown for this cell
# comes from an earlier run; re-enable it to reproduce that output.
# queryTopkSim('electrical engineer', k=10)









    Out[27]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      1749
      Electrical Engineer
      Dredging Engineer
      0.888
      Dredging Engineer(0.888)
    
    
      1973
      Electrical Engineer
      Assistant Site Engineer
      0.887
      Assistant Site Engineer(0.887)
    
    
      2598
      Electrical Engineer
      Architectural Engineer
      0.884
      Architectural Engineer(0.884)
    
    
      2607
      Electrical Engineer
      Construction Engineer
      0.884
      Construction Engineer(0.884)
    
    
      2735
      Electrical Engineer
      Senior Tunnel Engineer
      0.883
      Senior Tunnel Engineer(0.883)
    
    
      2879
      Electrical Engineer
      Rail Track Engineer
      0.882
      Rail Track Engineer(0.882)
    
    
      2902
      Electrical Engineer
      Senior Site Engineer
      0.882
      Senior Site Engineer(0.882)
    
    
      2920
      Electrical Engineer
      Power Plant Service Engineer
      0.882
      Power Plant Service Engineer(0.882)
    
    
      3239
      Electrical Engineer
      Geotechnical Engineer
      0.881
      Geotechnical Engineer(0.881)
    
    
      3253
      Electrical Engineer
      Cost Schedule Engineer
      0.880
      Cost Schedule Engineer(0.88)

Executive Jobs



In [13]:

    
# Similarity scores stored for the 'executive' function (columns t1, t2, topic_sim).
exe_sims = pd.read_csv(func_dir + 'sims/executive.csv')
exe_sims.head()









    Out[13]:






  
    
      
      t1
      t2
      topic_sim
    
  
  
    
      0
      Financial Accounting Executive
      Senior Accounting Executive
      0.933
    
    
      1
      Financial Accounting Executive
      Tax Executive
      0.927
    
    
      2
      Restaurant Executive
      Catering Executive
      0.926
    
    
      3
      Restaurant Executive
      Food & Beverage Executive
      0.911
    
    
      4
      Financial Accounting Executive
      Accounts Executive
      0.911



In [34]:

    
# NOTE(review): this result is neither stored nor the cell's last expression,
# so it is computed and then dropped.
topkByFunction(k=5, job_title='Financial Accounting Executive', func_sims=exe_sims)

# One row per title returned by titlesWithFunc('executive', ...).
exe_titles = titlesWithFunc('executive', title_df)
exe_top5 = pd.DataFrame({'title': exe_titles})

# topkByFunction is applied to every title with k=5 against the executive scores;
# the table is then written to <func_dir>topk/executive.csv.
exe_top5['top5'] = exe_top5['title'].apply(topkByFunction, k=5, func_sims=exe_sims)
exe_top5.to_csv(func_dir + 'topk/executive.csv', index=False)









    Out[34]:






  
    
      
      title
      top5
    
  
  
    
      0
      Accounts Executive
      Financial Accounting Executive(0.911), Tax Exe...
    
    
      1
      Sales Executive
      Senior Product Development Executive(0.888), C...
    
    
      2
      Marketing Executive
      Promotions Executive(0.866), Senior Distributi...
    
    
      3
      Executive
      Corporate Planning Executive(0.858), Governmen...
    
    
      4
      Human Resource Executive
      Senior Human Resource Executive(0.874), Payrol...



In [44]:

    
# NOTE(review): the first three results are discarded — a cell displays only
# its last expression — and 'Sales Executive' is queried a second time below.
queryTopkSim('Accounts Executive', k=10)
queryTopkSim('Marketing Executive', k=10)
queryTopkSim('Sales Executive', k=10)
# Kept for the fan-out in the next cell.
sales_exe_top10 = queryTopkSim('Sales Executive', k=10)









    Out[44]:






  
    
      
      t1
      t2
      topic_sim
      title_n_sim
    
  
  
    
      77
      Sales Executive
      Senior Product Development Executive
      0.888
      Senior Product Development Executive(0.888)
    
    
      185
      Sales Executive
      Key Account Executive
      0.881
      Key Account Executive(0.881)
    
    
      192
      Sales Executive
      Corporate Planning Executive
      0.881
      Corporate Planning Executive(0.881)
    
    
      221
      Sales Executive
      Sales Representative (Medical and Pharmaceutic...
      0.881
      Sales Representative (Medical and Pharmaceutic...
    
    
      113
      Sales Executive
      Sales Clerk
      0.880
      Sales Clerk(0.88)



In [61]:

    
# Second-level candidates: the top-2 neighbours of each title similar to 'Sales Executive'.
fan = fanOut(sales_exe_top10.t2, fan_width=2)
cand_titles = fan.t2
cand_titles









    Out[61]:





8                    Senior Account Management Executive
11                              Senior Product Executive
14                  Senior Product Development Executive
32                                Export Sales Executive
23                                  Government Executive
27                                   Ticketing Executive
11                                       Logistics Clerk
108                                     Sales Supervisor
0                                      Sales Coordinator
109                                          Sales Clerk
38                                 Assistant  Supervisor
55                                     Branch Supervisor
219    Sales Representative (Medical and Pharmaceutic...
256                                     Sales Supervisor
43                                     Sales Coordinator
2                               Assistant Sales Engineer
0                                      Ticketing Officer
27                          Corporate Planning Executive
Name: t2, dtype: object



In [ ]:



In [ ]:



In [ ]:

    
# TESTS

# swr_sims = pasteCols('t2', 'topic_sim', swr_sims, 'title_n_sim')
# swr_sims.head()
# swapCols('t1', 't2', swr_sims.head())

	title	n_post	domain	pri_func	non_std_title
0	Academic Assistant	13	academic	assistant	Academic Assistant
1	Academic Consultant	13	academic	consultant	Academic Consultant
2	Academic Coordinator	12	academic	coordinator	Academic Coordinator
3	Academic Director	6	academic	director	Academic Director
4	Academic Instructor	13	academic	instructor	Academic Instructor

	t1	t2	topic_sim	title_n_sim
0	Senior Hypertext Preprocessor Developer	Hypertext Preprocessor Developer	0.901	Hypertext Preprocessor Developer(0.901)
1	Senior Hypertext Preprocessor Developer	Multimedia Developer	0.893	Multimedia Developer(0.893)
2	Senior Hypertext Preprocessor Developer	Software And Applications Developer	0.886	Software And Applications Developer(0.886)
3	Concept Design Developer	Hypertext Preprocessor Developer	0.884	Hypertext Preprocessor Developer(0.884)
7	Concept Design Developer	Senior Hypertext Preprocessor Developer	0.876	Senior Hypertext Preprocessor Developer(0.876)
10	Concept Design Developer	Software And Applications Developer	0.873	Software And Applications Developer(0.873)
0	Hypertext Preprocessor Developer	Senior Hypertext Preprocessor Developer	0.901	Senior Hypertext Preprocessor Developer(0.901)
3	Hypertext Preprocessor Developer	Concept Design Developer	0.884	Concept Design Developer(0.884)
4	Hypertext Preprocessor Developer	Multimedia Developer	0.879	Multimedia Developer(0.879)
2	Software And Applications Developer	Senior Hypertext Preprocessor Developer	0.886	Senior Hypertext Preprocessor Developer(0.886)
6	Software And Applications Developer	Hypertext Preprocessor Developer	0.877	Hypertext Preprocessor Developer(0.877)
10	Software And Applications Developer	Concept Design Developer	0.873	Concept Design Developer(0.873)
316	Assistant Software Engineer	Information Visualisation Engineer	0.906	Information Visualisation Engineer(0.906)
687	Assistant Software Engineer	Solutions Quality Engineer	0.899	Solutions Quality Engineer(0.899)
974	Assistant Software Engineer	Solar Engineer	0.895	Solar Engineer(0.895)

	t1	t2	topic_sim	title_n_sim
0	Software Programmer	Assistant Software Engineer	0.869	Assistant Software Engineer(0.869)
3	Software Programmer	Systems Programmer	0.879	Systems Programmer(0.879)
22	Software Programmer	Analyst Programmer	0.866	Analyst Programmer(0.866)
19	Software Programmer	Lead Programmer	0.866	Lead Programmer(0.866)
28	Software Programmer	Technical Programmer	0.860	Technical Programmer(0.86)

	t1	t2	topic_sim	title_n_sim
3456	Software Engineer	Information Visualisation Engineer	0.880	Information Visualisation Engineer(0.88)
6816	Software Engineer	Solutions Quality Engineer	0.869	Solutions Quality Engineer(0.869)
11335	Software Engineer	Software Design Engineer	0.859	Software Design Engineer(0.859)
4	Software Engineer	Assistant Software Engineer	0.858	Assistant Software Engineer(0.858)
12734	Software Engineer	Assistant Software Engineer	0.857	Assistant Software Engineer(0.857)
13243	Software Engineer	Solar Engineer	0.856	Solar Engineer(0.856)
15014	Software Engineer	Senior Embedded Software Engineer	0.853	Senior Embedded Software Engineer(0.853)
16017	Software Engineer	Senior Mobile Engineer	0.851	Senior Mobile Engineer(0.851)
15990	Software Engineer	Environmental Affairs Engineer	0.851	Environmental Affairs Engineer(0.851)
17001	Software Engineer	Embedded Software Engineer	0.850	Embedded Software Engineer(0.85)

	t1	t2	topic_sim	title_n_sim
2181	Mechanical Engineer	Power Plant Service Engineer	0.886	Power Plant Service Engineer(0.886)
3607	Mechanical Engineer	Senior Tunnel Engineer	0.879	Senior Tunnel Engineer(0.879)
3605	Mechanical Engineer	Rail Track Engineer	0.879	Rail Track Engineer(0.879)
3734	Mechanical Engineer	Dredging Engineer	0.878	Dredging Engineer(0.878)
4260	Mechanical Engineer	Assistant Site Engineer	0.877	Assistant Site Engineer(0.877)
4557	Mechanical Engineer	Senior Building Engineer	0.876	Senior Building Engineer(0.876)
4857	Mechanical Engineer	Architectural Engineer	0.875	Architectural Engineer(0.875)
4831	Mechanical Engineer	Senior Site Engineer	0.875	Senior Site Engineer(0.875)
4774	Mechanical Engineer	Cost Schedule Engineer	0.875	Cost Schedule Engineer(0.875)
4607	Mechanical Engineer	Layout Engineer	0.875	Layout Engineer(0.875)

	t1	t2	topic_sim	title_n_sim
1749	Electrical Engineer	Dredging Engineer	0.888	Dredging Engineer(0.888)
1973	Electrical Engineer	Assistant Site Engineer	0.887	Assistant Site Engineer(0.887)
2598	Electrical Engineer	Architectural Engineer	0.884	Architectural Engineer(0.884)
2607	Electrical Engineer	Construction Engineer	0.884	Construction Engineer(0.884)
2735	Electrical Engineer	Senior Tunnel Engineer	0.883	Senior Tunnel Engineer(0.883)
2879	Electrical Engineer	Rail Track Engineer	0.882	Rail Track Engineer(0.882)
2902	Electrical Engineer	Senior Site Engineer	0.882	Senior Site Engineer(0.882)
2920	Electrical Engineer	Power Plant Service Engineer	0.882	Power Plant Service Engineer(0.882)
3239	Electrical Engineer	Geotechnical Engineer	0.881	Geotechnical Engineer(0.881)
3253	Electrical Engineer	Cost Schedule Engineer	0.880	Cost Schedule Engineer(0.88)

	t1	t2	topic_sim
0	Financial Accounting Executive	Senior Accounting Executive	0.933
1	Financial Accounting Executive	Tax Executive	0.927
2	Restaurant Executive	Catering Executive	0.926
3	Restaurant Executive	Food & Beverage Executive	0.911
4	Financial Accounting Executive	Accounts Executive	0.911

	title	top5
0	Accounts Executive	Financial Accounting Executive(0.911), Tax Exe...
1	Sales Executive	Senior Product Development Executive(0.888), C...
2	Marketing Executive	Promotions Executive(0.866), Senior Distributi...
3	Executive	Corporate Planning Executive(0.858), Governmen...
4	Human Resource Executive	Senior Human Resource Executive(0.874), Payrol...

	t1	t2	topic_sim	title_n_sim
77	Sales Executive	Senior Product Development Executive	0.888	Senior Product Development Executive(0.888)
185	Sales Executive	Key Account Executive	0.881	Key Account Executive(0.881)
192	Sales Executive	Corporate Planning Executive	0.881	Corporate Planning Executive(0.881)
221	Sales Executive	Sales Representative (Medical and Pharmaceutic...	0.881	Sales Representative (Medical and Pharmaceutic...
113	Sales Executive	Sales Clerk	0.880	Sales Clerk(0.88)