Connecting Job Titles by Their Similarity Scores

import my_util as my_util; from my_util import *
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *

import os
import random
from time import time
import gc

# Turn on auto garbage collection
# NOTE(review): no gc call follows this comment in the visible source, although
# `gc` is imported above -- confirm whether a gc.enable() line was lost.

# Folder constants. Paths below are built by plain string concatenation, so
# every folder constant ends with a trailing slash.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
SKILL_DAT = HOME_DIR + 'data/clean/skill_cluster/' 
SKILL_RES = HOME_DIR + 'results/' + 'skill_cluster/new/'
# Base folder for the figures and per-industry results written further down.
JOB_PROF = SKILL_RES + 'job_prof/'
  • Data loading:

# Title statistics table; only its 'title' column is used here.
title_stats = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = title_stats['title']
print('# titles: %d' % len(titles))

# Job posts before standardizing job titles, re-indexed by their 'index' column.
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
print(df.shape)  # call form prints the same tuple on Python 2 and Python 3
df.set_index('index', inplace=True)

# Document-topic distribution matrix; its second dimension is the topic count.
with open(SKILL_RES + 'doc_20topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)

n_topic = doc_topic_distr.shape[1]
print('# topics: %d' % n_topic)
topic_df = pd.read_csv(SKILL_RES + 'lda/20_topics.csv')
# Build a real list rather than a lazy map object, so the labels can be reused
# by several plotting calls on Python 3 as well.
labels = [str.upper(label) for label in topic_df['label']]

# Parsed titles:
parsed_titles = pd.read_csv(DATA_DIR + 'parsed_titles.csv')
print('# titles parsed: {}'.format(parsed_titles.shape[0]))

# Domains
domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')

After fixing some bugs in job-title parsing, we re-parsed the problematic job titles with a script. Thus, we need to reload the newly parsed titles.

# Reload the re-parsed titles. byRelevanceSims() reads this module-level
# title_df when selecting titles by domain or primary function.
title_df = pd.read_csv(DATA_DIR + 'titles_2posts_up.csv')
print('# parsed titles: %d' % len(title_df.title))


from my_util import *

In [329]:
from cluster_skill_helpers import *

def vizPostPair(i, sim_df, labels, abbv_title=''):
    """Draw the figure for the i-th row of sim_df and save it as a PDF.

    @param i:          0-based position of the row in sim_df; the file name
                       uses i+1
    @param sim_df:     table whose i-th row is handed to vizDists4Pair()
    @param labels:     labels forwarded to vizDists4Pair()
    @param abbv_title: prefix of the output file name
    The figure is written under SKILL_RES + 'fig/' and then closed.
    """
    pair_row = sim_df.iloc[i]
    fig = vizDists4Pair(pair_row, df, doc_topic_distr, labels)
    out_path = SKILL_RES + 'fig/{}_p{}.pdf'.format(abbv_title, i+1)
    fig.savefig(out_path)
    plt.close()

In [334]:
## Funcs for sim cals
def byRelevanceSims(domain='software', pri_func='engineer', min_post=2, verbose=False):
    """Pairwise sims among the titles of one domain or one primary function.

    @param domain:   domain used to pick titles via titlesIn(), or None
    @param pri_func: primary function used to pick titles via titlesWith(),
                     or None
    @param min_post: threshold forwarded to titlesWithAtLeast()
    @param verbose:  forwarded to simsAmong()
    @return: the result of simsAmong() for the chosen titles
    @raise ValueError: if neither domain nor pri_func is given

    If both selectors are truthy, pri_func wins (the same outcome as the
    original pair of sequential `if` statements).
    NOTE(review): both defaults are set, so byRelevanceSims('marketing')
    selects by pri_func='engineer'; pass pri_func=None for a domain run.
    """
    if not (domain or pri_func):
        raise ValueError('either domain or pri_func must be given')

    if pri_func:
        candidates = titlesWith(pri_func, title_df)
    else:
        candidates = titlesIn(domain, title_df)
    # NOTE(review): the second argument of titlesWithAtLeast() was cut off in
    # the source; min_post is assumed -- confirm against the helper's signature.
    chosen_titles = titlesWithAtLeast(candidates, min_post)

    n_title = len(chosen_titles)
    print('\t# titles with at least {} posts: {}'.format(min_post, n_title))
    sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose)
    return sims

def calSims(domain='software', pri_func='engineer', verbose=False):
    """Compute, time and save pairwise title sims for a domain or a function.

    @param domain:   domain to process, or None
    @param pri_func: primary function to process, or None
                     (either domain or pri_func, not both)
    @param verbose:  forwarded to byRelevanceSims()
    @return: the sims table, sorted by 'topic_sim' descending when not empty
    @raise ValueError: if neither domain nor pri_func is given

    A non-empty result is written to '<domain_dir or func_dir>sims/new/<name>.csv'.
    If both selectors are truthy, pri_func wins: that is the result the
    original code ended up returning and saving, but the domain sims are no
    longer computed first and thrown away.
    NOTE(review): both defaults are set, so calSims('marketing') processes
    pri_func='engineer'; pass pri_func=None explicitly for a domain run.
    """
    if not (domain or pri_func):
        raise ValueError('either domain or pri_func must be given')

    t0 = time()
    if pri_func:
        print('Primary function: {}'.format(pri_func))
        sims = byRelevanceSims(domain=None, pri_func=pri_func, verbose=verbose)
    else:
        print('Domain: {}'.format(domain))
        sims = byRelevanceSims(domain=domain, pri_func=None, verbose=verbose)
    elapse = round(time() - t0, 1)
    print('\tFinished sim cals after {}s'.format(elapse))

    if not sims.empty:
        sims = sims.sort_values('topic_sim', ascending=False)
        # domain_dir / func_dir are not defined in the visible source;
        # presumably set elsewhere -- TODO confirm.
        if pri_func:
            fname = func_dir + 'sims/new/{}.csv'.format(clean(pri_func))
        else:
            fname = domain_dir + 'sims/new/{}.csv'.format(clean(domain))
        sims.to_csv(fname, index=False)
        print('\tSaved sims to file: {}'.format(fname))
    return sims

def topkSim(title, k, sims):
    """Return the k titles most similar to `title` as 'name(score)' items.

    @param title: job title to look up in either column of sims
    @param k:     number of most similar titles to keep
    @param sims:  table with columns 't1', 't2' (titles, assumed to be
                  strings since they are concatenated with '(') and 'topic_sim'
    @return: comma-joined string 'other_title(topic_sim),...' ordered by
             topic_sim descending
    """
    as_first = sims[sims['t1'] == title]
    # Where `title` is the second member of the pair, swap the two columns so
    # that the partner title always ends up in 't2'.
    as_second = sims[sims['t2'] == title].rename(columns={'t1': 't2', 't2': 't1'})
    res = pd.concat([as_first, as_second])
    # Series.map(str) instead of the builtin map(): on Python 3 the builtin
    # returns an iterator that cannot be added to a Series.
    res['title_n_sim'] = res['t2'] + '(' + res['topic_sim'].map(str) + ')'
    res = res.sort_values('topic_sim', ascending=False)
    return ','.join(res.head(k)['title_n_sim'])

def _saveTopkSim(name, sims, k, out_dir):
    """Shared worker: build the top-k similar-title table and save it as CSV.

    @param name:    domain or primary function; used (after clean()) in the
                    output file name
    @param sims:    pairwise sims table with columns 't1', 't2', 'topic_sim'
    @param k:       number of most similar titles kept per title
    @param out_dir: folder prefix under which 'topk/<name>_top<k>.csv' is written
    """
    # Every title appearing on either side of a pair, in order of first
    # appearance. A plain set is avoided: its order is arbitrary and recent
    # pandas versions refuse to build a column from a set.
    chosen_titles = pd.concat([sims['t1'], sims['t2']]).unique()
    topk_res = pd.DataFrame({'title': chosen_titles})
    col_topk = 'top_{}_sim'.format(k)
    topk_res[col_topk] = topk_res['title'].apply(topkSim, k=k, sims=sims)

    fname = out_dir + 'topk/{}_top{}.csv'.format(clean(name), k)
    topk_res.to_csv(fname, index=False)
    print('\tSaved top-{} similar title result'.format(k))


def findTopkSimByDomain(domain, sims, k=2):
    """Save, for each title in sims, its top-k most similar titles (by domain).

    @param domain: domain the sims belong to; names the output file
    @param sims:   pairwise sims table with columns 't1', 't2', 'topic_sim'
    @param k:      number of most similar titles kept per title
    The CSV goes under domain_dir + 'topk/'. domain_dir is not defined in the
    visible source; presumably set elsewhere -- TODO confirm.
    """
    _saveTopkSim(domain, sims, k, domain_dir)


def findTopkSimByFunc(pri_func, sims, k=2):
    """Save, for each title in sims, its top-k most similar titles (by function).

    @param pri_func: primary function the sims belong to; names the output file
    @param sims:     pairwise sims table with columns 't1', 't2', 'topic_sim'
    @param k:        number of most similar titles kept per title
    The CSV goes under func_dir + 'topk/'. func_dir is not defined in the
    visible source; presumably set elsewhere -- TODO confirm.
    """
    _saveTopkSim(pri_func, sims, k, func_dir)

def domainsInRange(min_n_title, max_n_title):
    """Return the domains whose title count lies in [min_n_title, max_n_title).

    Reads the module-level by_domain_agg table (columns 'domain', 'n_title').
    @param min_n_title: inclusive lower bound on n_title
    @param max_n_title: exclusive upper bound on n_title
    @return: the 'domain' column of the matching rows
    """
    q = '{} <= n_title < {}'.format(min_n_title, max_n_title)
    domains = by_domain_agg.query(q)['domain']
    msg = '# domains with # titles in range [{}, {}): {}'.format(min_n_title, max_n_title, len(domains))
    # The summary was built but never shown; report it so the count is visible.
    print(msg)
    return domains

Given the parsed titles, we now compute pairwise sims among them based on:

  • domain
  • primary funcs.

Similarity By Job's Domain

# Get needed stats from agg
by_domain_agg = pd.read_csv(DATA_DIR + 'stats/domains.csv')

n_domain = by_domain_agg.shape[0]
print('# domains: {}'.format(n_domain))

First, we look at the top-10 domains with most no. of titles.

Among the top-10, Marketing is the domain with the fewest titles, so we use it for a test run of the similarity functions.


In [ ]:
# Trial run on the 'marketing' domain.
# NOTE(review): calSims() also has pri_func='engineer' as a default, and the
# pri_func result is the one it returns when both are set -- confirm that
# pri_func=None is not needed here.
mkt_sims = calSims('marketing')
findTopkSimByDomain('marketing', mkt_sims)
  • 1st run: 'Senior Marketing Engineer' has highest sim with 'Senior Marketing Representative'!!! Why?

In [ ]:
# posts by Senior Marketing Engineer
sme_posts = df[df.title == 'Senior Marketing Engineer']
print('# posts by Senior Marketing Engineer: %d' %sme_posts.shape[0])

# posts by Senior Marketing Representative
smr_posts = df[df.title == 'Senior Marketing Representative']
print('# posts by Senior Marketing Representative: %d' %smr_posts.shape[0])

Ans: forgot to filter out titles with only 1 post.

We fixed this bug and re-ran the computation, which resolved the issue. Now we can run it for the other domains in the top-10.

Top-9 Domains:

# NOTE(review): sim_dir is not used anywhere in the visible source.
sim_dir = JOB_PROF + 'by_domain/sims/'

# First 9 rows of by_domain_agg; for each domain compute the sims and save the
# top-5 most similar titles per title.
top9_domains = by_domain_agg.head(9)['domain']
for dom in top9_domains:
    sims = calSims(dom)
    findTopkSimByDomain(dom, sims, k=5)

Other domains

We decide the priority for domains to consider based on the no. of job titles under domain. Thus we look at the distribution of job titles in domains.

On avg, no. of job titles per domain is $2.7$.

In [ ]:
# Figure for the distribution of the number of job titles per domain; the mean
# is shown in the x-axis label.
# NOTE(review): no plotting call (e.g. a histogram) appears between the figure
# creation and savefig in the visible source -- confirm whether one was lost.
fig = plt.figure()
mean_n_title = round(by_domain_agg.n_title.mean(), 1)
xl = '# job titles' + r'$(\mu = {})$'.format(mean_n_title)
plt.xlabel(xl, fontsize=16); 
plt.ylabel('# domains', fontsize=16)
plt.savefig(JOB_PROF + 'title_dist_by_domain.pdf'); plt.close()

Re-compute sims after title standardization

Reload data after standardizing job titles

  • Job posts:

# Job posts reloaded after title standardization, re-indexed by 'index'.
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
print(df.shape)  # call form prints the same tuple on Python 2 and Python 3
df.set_index('index', inplace=True)

title_stats = getTitleStats(df)
# title_stats.to_csv(DATA_DIR + 'stats/job_titles.csv', index=False)

print('# job titles after standardizing: %d' %title_stats.shape[0])

# swr_sims = calSims(domain='software', pri_func=None)
# NOTE(review): swr_sims is only assigned in the commented-out line above, so
# the next call relies on a value left over from an earlier run.
viz(swr_sims, domain='software')

In [333]:
# mkt_sims = calSims(domain='marketing', pri_func=None, verbose=True)
# viz(mkt_sims, domain='marketing')
# Reuses the mkt_sims computed earlier; only the top-5 table is regenerated.
findTopkSimByDomain(domain='marketing', k=5, sims=mkt_sims)

  • Domains with at least 20 job titles:

# Domains whose n_title is 20 or more.
domain_20_ = by_domain_agg.query('20 <= n_title')['domain']
print('# domains with at least 20 job titles: %d' %len(domain_20_))

# viz() is not defined in the visible source; presumably it comes from one of
# the star imports -- TODO confirm.
for dom in domain_20_:
    sims = calSims(dom)
    print('\tViz pairwise sims among job titles')
    viz(sims, domain=dom)
#     findTopkSimByDomain(dom, sims, k=5)
  • Domains with # titles in range [10, 20):

domain_10_20 = domainsInRange(10, 20)

# Only the sims are computed (and saved by calSims); the top-k step is disabled.
for dom in domain_10_20:
    sims = calSims(dom)
#     findTopkSimByDomain(dom, sims, k=5)
  • Domains with # titles in range [2, 10):

# Domains with 2 <= n_title < 10; not iterated directly, split further below.
domain_2_10 = domainsInRange(2, 10)

This is a large number of domains, so we need to break the range down further.

dom_5_10 = domainsInRange(5, 10)

# 1st half
# NOTE(review): unlike the 2nd half, this loop calls findTopkSimByDomain even
# when sims is empty.
for dom in dom_5_10[:64] :
    sims = calSims(dom)
    findTopkSimByDomain(dom, sims, k=3)

# 2nd half
# NOTE(review): the slice starts at 76 while the 1st half stops at 64, so
# positions 64..75 are covered by neither loop; the trailing "# 64" suggests
# the start was edited by hand -- confirm this is intended.
for dom in dom_5_10[76:]: # 64
    sims = calSims(dom)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=3)

dom_2_5 = domainsInRange(2, 5)

# Position of one specific domain within dom_2_5 (the result is not stored).
list(dom_2_5).index('personnel / human resource')

In [ ]:
# Process dom_2_5 in two batches: positions 100..299, then 300 onwards.
# NOTE(review): positions 0..99 are not processed anywhere in the visible source.
for dom in dom_2_5[100:300]:
    sims = calSims(dom)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=2)

for dom in dom_2_5[300:]:
    sims = calSims(dom)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=2)

Should we group similar domains together?

We can spot similar domain names e.g. account and accounts, financial and finance. Should we group them together?

Similarity By Job's Primary Function

# Per-primary-function stats; functionInRange() reads this module-level table.
by_func_agg = pd.read_csv(DATA_DIR + 'stats/pri_funcs.csv')

n_func = by_func_agg.shape[0]
print('# primary functions: %d' %n_func)

# Ascending sort by number of titles, in place.
by_func_agg.sort_values('n_title', inplace=True)

# by_func_agg.query('pri_func == "developer"')
# by_func_agg.query('pri_func == "teacher"')
# Bare expression: the row is only displayed in an interactive session.
by_func_agg.query('pri_func == "programmer"')

# Functions with largest no. of titles

chosen_funcs = by_func_agg.query('n_title >= 2')['pri_func']
print('# funcs with at least 2 job titles: %d' %len(chosen_funcs))

# Trial run on one primary function; domain is explicitly switched off.
teacher_sims = calSims(pri_func='teacher', domain=None)
findTopkSimByFunc('teacher', teacher_sims, k=5)

def functionInRange(min_n_title, max_n_title=None):
    """Return the primary functions whose title count lies in a given range.

    Reads the module-level by_func_agg table (columns 'pri_func', 'n_title').
    @param min_n_title: inclusive lower bound on n_title
    @param max_n_title: exclusive upper bound on n_title; None means no
                        upper bound
    @return: list of the matching 'pri_func' values
    """
    bounded = max_n_title is not None
    # The bounded and unbounded cases need different queries; previously the
    # unbounded query always overwrote the bounded one, ignoring max_n_title.
    if bounded:
        q = '{} <= n_title < {}'.format(min_n_title, max_n_title)
    else:
        q = '{} <= n_title'.format(min_n_title)
    funcs = list(by_func_agg.query(q)['pri_func'])

    text = '#pri-funcs having # titles in range [{}, {}): {}'
    upper = max_n_title if bounded else float('inf')
    msg = text.format(min_n_title, upper, len(funcs))
    # The summary was built but never shown; report it so the count is visible.
    print(msg)
    return funcs

func_2_5 = functionInRange(2, 5)

func_5_100 = functionInRange(5, 100)

# NOTE(review): pri_func is not defined at module level in the visible source,
# and sims here is whatever an earlier cell left behind -- this pair of lines
# looks like a stray fragment of the loop below.
print('\tViz pairwise sims among job titles')
viz(sims, pri_func=pri_func)

for func in func_5_100:
    sims = calSims(pri_func=func, domain=None)
    print('\tViz pairwise sims among job titles')
    viz(sims, pri_func=func)
#     findTopkSimByFunc(func, sims)

func_100_ = functionInRange(min_n_title=100)

# NOTE(review): this reads from 'sims/' while calSims() writes to 'sims/new/';
# func_dir itself is not defined in the visible source -- confirm both.
supervisor_sims = pd.read_csv(func_dir + 'sims/supervisor.csv')
findTopkSimByFunc('supervisor', k=5, sims=supervisor_sims)

# Number of distinct secondary functions among the parsed titles.
by_sec_func = parsed_titles.groupby('sec_func')
n_sec_func = by_sec_func.ngroups
print('# secondary funcs: %d' %n_sec_func)

Similarity of Job Titles in one Industry

We investigate topic similarity of job titles in a given industry.


  • a job title can appear in several industries in different job posts and the content of the posts can vary a lot. How to define/limit to posts of a job title in a given industry?

def calTitleSims(row):
    """Compute and save pairwise title sims for one industry.

    @param:  row contains full name ('industry') and short name ('short_name')
             of an industry
    @return: None. The title stats and the pairwise sims bw titles with at
             least 2 posts in the industry are written to title_stats.csv and
             sims.csv under JOB_PROF + '<short_name>/'.
    """
    def titleSims(industry='Education', short_name='edu'):
        # Pairwise sims, sorted by topic_sim descending, for the industry's
        # titles. Reads `directory` from the enclosing scope; it is assigned
        # below, before this helper is called.
        posts = df[df.industry == industry]
        stats = getTitleStats(posts)
        stats.to_csv(directory + 'title_stats.csv', index=False)
        # only analyze titles with at least 2 posts in the industry
        titles = list(stats.query('n_post >= 2')['title'])
        sims = simsOfTitles(titles, industry)
        return sims.sort_values('topic_sim', ascending=False)

    industry = row['industry']; short_name = row['short_name']
    print('\nIndustry %s:' %industry)
    directory = JOB_PROF + '{}/'.format(short_name)
    if not os.path.exists(directory): os.makedirs(directory)
    title_sims = titleSims(industry, short_name)
    title_sims.to_csv(directory + 'sims.csv', index=False)
    print('Saved pairwise sims of job titles in {}'.format(industry))

by_industry = df.groupby('industry')

# Per industry: number of posts (row count of job_id), number of distinct
# titles and number of distinct employers; columns are renamed to n_* names.
agg_df = by_industry.agg({'job_id': len, 'title': 'nunique', 'employer_id': 'nunique'})
agg_df = agg_df.rename(columns={'employer_id': 'n_employer', 'job_id': 'n_post',
                               'title': 'n_title'})
  • Sort industries by number of job titles:

agg_df.sort_values('n_title', ascending=False, inplace=True)
# NOTE(review): agg_df is built by groupby('industry').agg(...), so the
# industry name sits in the index; with index=False it is not written to the
# CSV and is absent after the reload below -- confirm this is intended.
agg_df.to_csv(DATA_DIR + 'stats/industry_stat.csv', index=False)

agg_df = pd.read_csv(DATA_DIR + 'stats/industry_stat.csv')

Industries in the 1st quartile

# Industries whose number of titles is at or below the 25% quantile.
q1 = agg_df.n_title.quantile(.25)
q1_industries = agg_df[agg_df.n_title <= q1]

def simWrapper(industries):
    # NOTE(review): only the row count is computed and nothing is returned;
    # the rest of the body appears to be missing from the visible source.
    n_industry = industries.shape[0]
Similarity of Job Titles in Agriculture and Fishing

Some titles do not belong to Agriculture and Fishing, e.g., Signal Processing Engineer, Account Clerk, Accounts Analyst. Assigning the employer's industry to the post (and hence to its title) appears to be problematic!

Similarity of Job Titles in Mining and Quarrying

Similarity of Job Titles in Electricity, Gas and Air-Conditioning Supply

electric_sims = pd.read_csv(JOB_PROF + 'electric/sims.csv')

# NOTE(review): nothing is drawn on the axes before saving, and electric_dir is
# not defined in the visible source -- confirm whether lines were lost here.
fig, ax = plt.subplots()
plt.savefig(electric_dir + 'sim_dist.pdf'); plt.close()

In [ ]:
# Industries with n_title in (q1, q2], i.e. between the 25% and 50% quantiles.
q2 = agg_df.n_title.quantile(.50)
q2_industries = agg_df.query('{} < n_title and n_title <= {}'.format(q1, q2))

# NOTE(review): this needs an 'index' column, which nothing in the visible
# source creates (e.g. via reset_index) -- confirm.
del q2_industries['index']

# Industries with n_title in (q2, q3]; the result is only displayed, not stored.
q3 = agg_df.n_title.quantile(.75)
agg_df.query('{} < n_title and n_title <= {}'.format(q2, q3))

# NOTE(review): in the visible source titleSims is only defined as a helper
# nested inside calTitleSims, so it is not reachable at module level.
edu_sims = titleSims(industry='Education', short_name='edu')
edu_sims.to_csv(JOB_PROF + 'edu/sims.csv', index=False)

# Rebinds title_df (read as a global by byRelevanceSims) to the job_titles
# stats table.
title_df = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = list(title_df['title'])
print('# titles: %d' %len(titles))

Similarity By Job's Domain

domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')
n_domain = domain_df.shape[0]
print('# domains: %d' %n_domain)

# Domains having more than one job title.
domains_2 = domain_df.query('n_title > 1')['domain']
print('# domains with at least 2 job titles: %d' %len(domains_2))

# NOTE(review): byRelevanceSims() also defaults pri_func to 'engineer', and the
# pri_func selection overrides the domain one when both are set -- confirm
# that pri_func=None is not needed here.
mkt_sims = byRelevanceSims('marketing')

In [ ]:
n_pri_func =
print('# primary functions: %d' %n_pri_func)