Connecting Job Titles by Their Similarity Scores



In [ ]:

    
import my_util as my_util; from my_util import *
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *

import os
import random
from time import time
import gc



In [ ]:

    
# Turn on auto garbage collection
gc.enable()



In [ ]:

    
# Project root. Set the JOB_ANALYTICS_HOME environment variable to run the
# notebook from another checkout; the fallback is the original local path.
# A trailing slash is enforced because every path below is built by plain
# string concatenation.
HOME_DIR = os.environ.get('JOB_ANALYTICS_HOME',
                          'd:/larc_projects/job_analytics/').rstrip('/\\') + '/'
DATA_DIR = HOME_DIR + 'data/clean/'
SKILL_DAT = DATA_DIR + 'skill_cluster/'
SKILL_RES = HOME_DIR + 'results/' + 'skill_cluster/new/'
JOB_PROF = SKILL_RES + 'job_prof/'

Data loading:



In [ ]:

    
# Job-title statistics; only the 'title' column is used here.
title_stats = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = title_stats['title']
print('# titles: %d' % len(titles))

# Job posts as they were before standardizing job titles.
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
# print() as a function, like every other print in this notebook; the shape is
# reported before the 'index' column is moved into the row index.
print(df.shape)
df.set_index('index', inplace=True)

# Document-topic distribution, stored in Matrix Market format.
with open(SKILL_RES + 'doc_20topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)

n_topic = doc_topic_distr.shape[1]
print('# topics: %d' % n_topic)
topic_df = pd.read_csv(SKILL_RES + 'lda/20_topics.csv')
# Build a real list: map() only returns a list on Python 2.
labels = [label.upper() for label in topic_df['label']]

# Parsed titles:
parsed_titles = pd.read_csv(DATA_DIR + 'parsed_titles.csv')
print('# titles parsed: {}'.format(parsed_titles.shape[0]))

# Domains
domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')

After fixing some bugs in job-title parsing, we re-parsed the problematic job titles with the title_parse.py script. We therefore need to reload the newly parsed titles.



In [ ]:

    
# Reload the re-parsed titles; title_df is the frame the similarity helpers
# below read as a global.
title_df = pd.read_csv(DATA_DIR + 'titles_2posts_up.csv')
print('# parsed titles: %d' % title_df['title'].shape[0])

Helpers



In [ ]:

    
reload(my_util)
from my_util import *



In [329]:

    
reload(cluster_skill_helpers)
from cluster_skill_helpers import *



In [ ]:

    
def vizPostPair(i, sim_df, labels, abbv_title=''):
    '''
    Plot the i-th row of sim_df with vizDists4Pair, save the figure as a PDF
    under SKILL_RES + 'fig/' and show it.

    @param i: positional (0-based) row of sim_df; the file name uses i+1
    @param sim_df: frame whose rows are passed to vizDists4Pair
    @param labels: labels forwarded to vizDists4Pair
    @param abbv_title: prefix of the output file name
    '''
    pair = sim_df.iloc[i]
    figure = vizDists4Pair(pair, df, doc_topic_distr, labels)

    out_path = SKILL_RES + 'fig/{}_p{}.pdf'.format(abbv_title, i+1)
    figure.savefig(out_path)

    plt.show()
    plt.close()



In [334]:

    
## Funcs for sim cals
def byRelevanceSims(domain='software', pri_func='engineer', min_post=2, verbose=False):
    '''
    Pairwise similarities among the job titles of one domain or of one
    primary function.

    @param domain: domain whose titles are compared; only used when pri_func
                   is falsy
    @param pri_func: primary function whose titles are compared. It defaults
                     to 'engineer' and takes precedence over domain, so pass
                     pri_func=None to select titles by domain.
    @param min_post: threshold forwarded to titlesWithAtLeast
    @param verbose: forwarded to simsAmong
    @return: the result of simsAmong for the chosen titles
    @raise ValueError: if neither domain nor pri_func is given
    '''
    # pri_func has always won when both are set (the domain titles used to be
    # computed first and then overwritten), so test it first and skip the
    # discarded domain lookup.
    if pri_func:
        candidates = titlesWith(pri_func, title_df)
    elif domain:
        candidates = titlesIn(domain, title_df)
    else:
        raise ValueError('Either domain or pri_func must be given')

    chosen_titles = titlesWithAtLeast(candidates, min_post)
    n_title = len(chosen_titles)
    print('\t# titles with at least {} posts: {}'.format(min_post, n_title))

    return simsAmong(chosen_titles, doc_topic_distr, df, verbose)

def calSims(domain='software', pri_func='engineer', verbose=False):
    '''
    Compute the pairwise similarities for one domain or one primary function,
    report the elapsed time and, when the result is not empty, sort it by
    'topic_sim' (descending) and save it as a CSV file.

    @param domain: domain to process; only used when pri_func is falsy
    @param pri_func: primary function to process. It defaults to 'engineer'
                     and takes precedence over domain, so pass pri_func=None
                     to process a domain.
    @param verbose: forwarded to byRelevanceSims
    @return: the similarities (sorted when not empty)
    @raise ValueError: if neither domain nor pri_func is given
    '''
    # pri_func wins when both are set: that is the result the previous version
    # ended up returning and saving, after computing and discarding the domain
    # similarities first.
    if pri_func:
        print('Primary function: {}'.format(pri_func))
        name, out_dir = pri_func, func_dir
        domain = None
    elif domain:
        print('Domain: {}'.format(domain))
        name, out_dir = domain, domain_dir
    else:
        raise ValueError('Either domain or pri_func must be given')

    t0 = time()
    sims = byRelevanceSims(domain=domain, pri_func=pri_func, verbose=verbose)
    elapse = round(time() - t0, 1)
    print('\tFinished sim cals after {}s'.format(elapse))

    if not sims.empty:
        sims = sims.sort_values('topic_sim', ascending=False)
        fname = out_dir + 'sims/new/{}.csv'.format(clean(name))
        sims.to_csv(fname, index=False)
        print('\tSaved sims to file: {}'.format(fname))
    return sims

def topkSim(title, k, sims):
    '''
    Describe the k titles most similar to the given title.

    @param title: title to look up in columns 't1' and 't2' of sims
    @param k: maximum number of similar titles to report
    @param sims: frame with columns 't1', 't2' and 'topic_sim'
    @return: comma-separated string of 'other_title(topic_sim)' items, ordered
             by decreasing topic_sim; empty string if title is not in sims
    '''
    as_t1 = sims[sims['t1'] == title]
    # For rows where the title sits in t2, swap the two column names so that
    # the *other* title is always found in t2.
    as_t2 = sims[sims['t2'] == title].rename(columns={'t1': 't2', 't2': 't1'})

    res = pd.concat([as_t1, as_t2])
    if res.empty:
        return ''

    # Series.map(str) instead of the builtin map(): the builtin only returns a
    # list (which a Series can be added to) on Python 2.
    res['title_n_sim'] = res['t2'] + '(' + res['topic_sim'].map(str) + ')'
    res = res.sort_values('topic_sim', ascending=False)
    return ','.join(res['title_n_sim'].head(k))

def _saveTopkSims(name, sims, k, out_dir):
    '''
    Shared worker of findTopkSimByDomain and findTopkSimByFunc: for every
    title occurring in sims, find its top-k similar titles with topkSim and
    save the table to out_dir + 'topk/<clean(name)>_top<k>.csv'.
    '''
    # A list in order of first appearance rather than a set: a set is not a
    # valid DataFrame column (modern pandas rejects it as unordered) and would
    # make the row order of the saved file arbitrary.
    chosen_titles = pd.concat([sims['t1'], sims['t2']]).drop_duplicates().tolist()
    topk_res = pd.DataFrame({'title': chosen_titles})
    col_topk = 'top_{}_sim'.format(k)
    topk_res[col_topk] = topk_res['title'].apply(topkSim, k=k, sims=sims)

    fname = out_dir + 'topk/{}_top{}.csv'.format(clean(name), k)
    topk_res.to_csv(fname, index=False)
    print('\tSaved top-{} similar title result'.format(k))

def findTopkSimByDomain(domain, sims, k=2):
    '''
    Save the top-k similar titles of every title in sims under domain_dir.

    @param domain: domain name, used (after clean) in the output file name
    @param sims: frame with columns 't1', 't2' and 'topic_sim'
    @param k: number of similar titles kept per title
    '''
    _saveTopkSims(domain, sims, k, domain_dir)

def findTopkSimByFunc(pri_func, sims, k=2):
    '''
    Save the top-k similar titles of every title in sims under func_dir.

    @param pri_func: primary function, used (after clean) in the output file name
    @param sims: frame with columns 't1', 't2' and 'topic_sim'
    @param k: number of similar titles kept per title
    '''
    _saveTopkSims(pri_func, sims, k, func_dir)



In [ ]:

    
def domainsInRange(min_n_title, max_n_title):
    '''
    Select the domains of by_domain_agg whose n_title lies in the half-open
    range [min_n_title, max_n_title), and print how many were found.

    @return: the 'domain' column of the matching rows
    '''
    in_range = by_domain_agg.query('{} <= n_title < {}'.format(min_n_title, max_n_title))
    selected = in_range['domain']
    print('# domains with # titles in range [{}, {}): {}'.format(
        min_n_title, max_n_title, len(selected)))
    return selected

Given the parsed titles, we now compute pairwise sims among them based on:

domain
primary funcs.

Similarity By Job's Domain



In [ ]:

    
# Get needed stats from agg
by_domain_agg = pd.read_csv(DATA_DIR + 'stats/domains.csv')

n_domain = by_domain_agg.shape[0]
print('# domains: {}'.format(n_domain))

First, we look at the top-10 domains with most no. of titles.



In [ ]:

    
by_domain_agg.head(10)

Among the top-10 domains, Marketing has the fewest titles, so we use it for a test run of the similarity-calculation functions.

Marketing



In [ ]:

    
# pri_func defaults to 'engineer' and overrides domain inside calSims, so it
# has to be cleared explicitly to get the sims of the marketing domain.
mkt_sims = calSims(domain='marketing', pri_func=None)
findTopkSimByDomain('marketing', mkt_sims)

1st run: 'Senior Marketing Engineer' has highest sim with 'Senior Marketing Representative'!!! Why?



In [ ]:

    
# Count the posts advertised under each of the two suspiciously similar titles.
sme_posts = df.loc[df['title'] == 'Senior Marketing Engineer']
print('# posts by Senior Marketing Engineer: %d' % len(sme_posts))

smr_posts = df.loc[df['title'] == 'Senior Marketing Representative']
print('# posts by Senior Marketing Representative: %d' % len(smr_posts))

Answer: we forgot to filter out titles with only one post.

After fixing this bug and re-running, the issue disappeared. Now we can run the calculation for the other domains in the top-10.

Top-9 Domains:



In [ ]:

    
sim_dir = JOB_PROF + 'by_domain/sims/'



In [ ]:

    
top9_domains = by_domain_agg.head(9)['domain']
for dom in top9_domains:
    # pri_func must be cleared: its default ('engineer') overrides domain.
    sims = calSims(domain=dom, pri_func=None)
    # Nothing to rank when no pair of titles was found for the domain.
    if not sims.empty: findTopkSimByDomain(dom, sims, k=5)

Other domains

We prioritize domains by the number of job titles they contain, so we first look at the distribution of job titles across domains.



In [ ]:

    
by_domain_agg.describe().round(1)

On average, there are $2.7$ job titles per domain.



In [ ]:

    
# Histogram of the number of job titles per domain; the mean is shown in the
# x-axis label and the figure is saved under JOB_PROF.
fig = plt.figure()
plt.hist(by_domain_agg['n_title'])
mean_n_title = round(by_domain_agg['n_title'].mean(), 1)
xl = '# job titles' + r'$(\mu = {})$'.format(mean_n_title)
plt.xlabel(xl, fontsize=16)
plt.ylabel('# domains', fontsize=16)
plt.savefig(JOB_PROF + 'title_dist_by_domain.pdf')
plt.show()
plt.close()

Re-compute sims after title standardization

Reload data after standardizing job titles

Job posts:



In [ ]:

    
# Reload the job posts (the file now holds standardized job titles).
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
# print() as a function, like the other prints in this notebook; the shape is
# reported before the 'index' column is moved into the row index.
print(df.shape)
df.set_index('index', inplace=True)



In [ ]:

    
title_stats = getTitleStats(df)
# title_stats.to_csv(DATA_DIR + 'stats/job_titles.csv', index=False)
title_stats.head()



In [ ]:

    
print('# job titles after standardizing: %d' %title_stats.shape[0])



In [ ]:

    
# NOTE(review): by_n_post is not assigned in any visible cell of this notebook;
# presumably it comes from a deleted cell or a star-import — confirm, otherwise
# this cell fails on a fresh kernel.
by_n_post.head()



In [ ]:

    
# NOTE(review): swr_sims is only assigned by the commented-out line below, so
# this cell relies on state left over from an earlier run; viz is not defined
# in this notebook either (presumably provided by a star-import — confirm).
# swr_sims = calSims(domain='software', pri_func=None)
viz(swr_sims, domain='software')



In [333]:

    
# mkt_sims = calSims(domain='marketing', pri_func=None, verbose=True)
# viz(mkt_sims, domain='marketing')
findTopkSimByDomain(domain='marketing', k=5, sims=mkt_sims)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-333-285350730adb> in <module>()
      1 # mkt_sims = calSims(domain='marketing', pri_func=None, verbose=True)
      2 # viz(mkt_sims, domain='marketing')
----> 3 findTopkSimByDomain(domain='marketing', k=5, sims=mkt_sims)

<ipython-input-331-0b9843d47897> in findTopkSimByDomain(domain, sims, k)
     52 
     53     chosen_titles = titlesIn(domain, title_df)
---> 54     topk_res = pd.DataFrame({'title': chosen_titles})
     55     col_topk = 'top_{}_sim'.format(k)
     56     topk_res[col_topk] = topk_res['title'].apply(topkSim, k=k, sims=sims)

C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.pyc in __init__(self, data, index, columns, dtype, copy)
    222                                  dtype=dtype, copy=copy)
    223         elif isinstance(data, dict):
--> 224             mgr = self._init_dict(data, index, columns, dtype=dtype)
    225         elif isinstance(data, ma.MaskedArray):
    226             import numpy.ma.mrecords as mrecords

C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _init_dict(self, data, index, columns, dtype)
    358             arrays = [data[k] for k in keys]
    359 
--> 360         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    361 
    362     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5229     # figure out the index, if necessary
   5230     if index is None:
-> 5231         index = extract_index(arrays)
   5232     else:
   5233         index = _ensure_index(index)

C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\frame.pyc in extract_index(data)
   5268 
   5269         if not indexes and not raw_lengths:
-> 5270             raise ValueError('If using all scalar values, you must pass'
   5271                              ' an index')
   5272 

ValueError: If using all scalar values, you must pass an index

Domains with at least 20 job titles:



In [ ]:

    
domain_20_ = by_domain_agg.query('20 <= n_title')['domain']
print('# domains with at least 20 job titles: %d' %len(domain_20_))

for dom in domain_20_:
    # pri_func must be cleared: its default ('engineer') overrides domain.
    sims = calSims(domain=dom, pri_func=None)
    print('\tViz pairwise sims among job titles')
    viz(sims, domain=dom)
#     findTopkSimByDomain(dom, sims, k=5)

Domains with # titles in range [10, 20):



In [ ]:

    
domain_10_20 = domainsInRange(10, 20)



In [ ]:

    
for dom in domain_10_20:
    # pri_func must be cleared: its default ('engineer') overrides domain.
    sims = calSims(domain=dom, pri_func=None)
#     findTopkSimByDomain(dom, sims, k=5)

Domains with # titles in range [2, 10):



In [ ]:

    
domain_2_10 = domainsInRange(2, 10)

This is a large number. We need to break down more.



In [ ]:

    
dom_5_10 = domainsInRange(5, 10)



In [ ]:

    
# 1st half
for dom in dom_5_10[:64] :
    sims = calSims(dom)
    findTopkSimByDomain(dom, sims, k=3)



In [ ]:

    
# 2nd half
for dom in dom_5_10[76:]: # 64
    sims = calSims(dom)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=3)



In [ ]:

    
dom_2_5 = domainsInRange(2, 5)



In [ ]:

    
list(dom_2_5).index('personnel / human resource')



In [ ]:

    
# 1st half
for dom in dom_2_5[100:300]:
    sims = calSims(dom)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=2)



In [ ]:

    
# 2nd half
for dom in dom_2_5[300:]:
    # pri_func must be cleared: its default ('engineer') overrides domain.
    sims = calSims(domain=dom, pri_func=None)
    if not sims.empty: findTopkSimByDomain(dom, sims, k=2)

Should we group similar domain together?

We can spot similar domain names e.g. account and accounts, financial and finance. Should we group them together?

Similarity By Job's Primary Function



In [ ]:

    
by_func_agg = pd.read_csv(DATA_DIR + 'stats/pri_funcs.csv')
by_func_agg.describe().round(1)



In [ ]:

    
n_func = by_func_agg.shape[0]
print('# primary functions: %d' %n_func)



In [ ]:

    
by_func_agg.sort_values('n_title', inplace=True)



In [ ]:

    
# by_func_agg.query('pri_func == "developer"')
# by_func_agg.query('pri_func == "teacher"')
by_func_agg.query('pri_func == "programmer"')



In [ ]:

    
# Functions with largest no. of titles
by_func_agg.tail()



In [ ]:

    
chosen_funcs = by_func_agg.query('n_title >= 2')['pri_func']
print('# funcs with at least 2 job titles: %d' %len(chosen_funcs))



In [ ]:

    
chosen_funcs.tail()



In [ ]:

    
teacher_sims = calSims(pri_func='teacher', domain=None)
findTopkSimByFunc('teacher', teacher_sims, k=5)



In [ ]:

    
def functionInRange(min_n_title, max_n_title=None):
    '''
    Select the primary functions of by_func_agg whose n_title lies in
    [min_n_title, max_n_title), and print how many were found.

    @param min_n_title: inclusive lower bound on n_title
    @param max_n_title: exclusive upper bound on n_title; None means no upper bound
    @return: list of the matching 'pri_func' values
    '''
    # Compare with None rather than relying on truthiness, so that an explicit
    # upper bound of 0 is honoured instead of being treated as "no bound".
    bounded = max_n_title is not None
    if bounded:
        q = '{} <= n_title < {}'.format(min_n_title, max_n_title)
    else:
        q = '{} <= n_title'.format(min_n_title)

    funcs = list(by_func_agg.query(q)['pri_func'])

    # float('inf') prints as 'inf' just like the former np.infty, an alias that
    # no longer exists in NumPy 2.
    upper = max_n_title if bounded else float('inf')
    text = '#pri-funcs having # titles in range [{}, {}): {}'
    print(text.format(min_n_title, upper, len(funcs)))

    return funcs



In [ ]:

    
func_2_5 = functionInRange(2, 5)



In [ ]:

    
func_5_100 = functionInRange(5, 100)



In [ ]:

    
# NOTE(review): pri_func is not assigned at notebook level in any visible cell,
# and sims is whatever the last executed loop left behind; this cell looks like
# a leftover of the loop body in the next cell — confirm before running.
print('\tViz pairwise sims among job titles')
viz(sims, pri_func=pri_func)



In [ ]:

    
# Compute, save and visualize the sims of every primary function that has
# 5 to 99 titles; domain is cleared so that titles are selected by function.
for func in func_5_100:
    sims = calSims(domain=None, pri_func=func)
    print('\tViz pairwise sims among job titles')
    viz(sims, pri_func=func)
#     findTopkSimByFunc(func, sims)



In [ ]:

    
func_100_ = functionInRange(min_n_title=100)



In [ ]:

    
# Load previously saved sims for the 'supervisor' function and derive the
# top-5 similar titles of each title.
# NOTE(review): calSims saves to func_dir + 'sims/new/<name>.csv', whereas this
# reads from func_dir + 'sims/' — presumably an older result file; confirm
# which one is intended.
supervisor_sims = pd.read_csv(func_dir + 'sims/supervisor.csv')
findTopkSimByFunc('supervisor', k=5, sims=supervisor_sims)



In [ ]:

    
by_sec_func = parsed_titles.groupby('sec_func')
n_sec_func = by_sec_func.ngroups
print('# secondary funcs: %d' %n_sec_func)

Similarity of Job Titles in one Industry

We investigate topic similarity of job titles in a given industry.

Challenges:

A job title can appear in several industries, in different job posts whose content can vary a lot. How should we define (or restrict) the set of posts of a job title within a given industry?



In [ ]:

    
def titleSims(industry='Education', short_name='edu'):
    '''
    Pairwise sims between the titles having at least 2 posts in an industry.

    Defined at notebook level (it used to be nested inside calTitleSims) so
    that cells calling titleSims(...) directly can find it.

    @param industry: full industry name, matched against df.industry
    @param short_name: short industry name, used as the output sub-directory
                       of JOB_PROF
    @return: sims sorted by decreasing 'topic_sim'
    '''
    directory = JOB_PROF + '{}/'.format(short_name)
    if not os.path.exists(directory): os.makedirs(directory)

    posts = df[df.industry == industry]
    stats = getTitleStats(posts)
    stats.to_csv(directory + 'title_stats.csv', index=False)
    # only analyze titles with at least 2 posts in the industry
    titles = list(stats.query('n_post >= 2')['title'])
    sims = simsOfTitles(titles, industry)
    return sims.sort_values('topic_sim', ascending=False)

def calTitleSims(row):
    '''
    Compute and save the pairwise title sims of one industry.

    @param:  row contains full name and short name of an industry
             (keys 'industry' and 'short_name')
    @return: None; title_stats.csv and sims.csv are written to
             JOB_PROF + '<short_name>/'
    '''
    industry = row['industry']; short_name = row['short_name']
    print('\nIndustry %s:' %industry)
    # titleSims creates the output directory if needed.
    title_sims = titleSims(industry, short_name)

    directory = JOB_PROF + '{}/'.format(short_name)
    title_sims.to_csv(directory + 'sims.csv', index=False)
    print('Saved pairwise sims of job titles in {}'.format(industry))



In [ ]:

    
# Per-industry counts: number of posts, distinct titles and distinct employers.
by_industry = df.groupby('industry')

agg_df = (
    by_industry
    .agg({'job_id': len, 'title': 'nunique', 'employer_id': 'nunique'})
    .reset_index()
    .rename(columns={'employer_id': 'n_employer', 'job_id': 'n_post',
                     'title': 'n_title'})
)

Sort industries by number of job titles:



In [ ]:

    
agg_df.sort_values('n_title', ascending=False, inplace=True)
agg_df.to_csv(DATA_DIR + 'stats/industry_stat.csv', index=False)



In [ ]:

    
agg_df = pd.read_csv(DATA_DIR + 'stats/industry_stat.csv')

Industries in 1st Quartile



In [ ]:

    
q1 = agg_df.n_title.quantile(.25)
q1_industries = agg_df[agg_df.n_title <= q1]



In [ ]:

    
def simWrapper(industries, pause=3):
    '''
    Run calTitleSims on every row of industries, pausing between industries.

    @param industries: frame with one industry per row (see calTitleSims for
                       the columns it reads)
    @param pause: seconds to sleep after each industry (default 3)
    '''
    # In this notebook `time` is the function bound by `from time import time`,
    # so `time.sleep` does not exist; import sleep locally instead.
    from time import sleep

    for _, row in industries.iterrows():
        calTitleSims(row)
        sleep(pause)



In [ ]:

    
simWrapper(q1_industries)

Similarity of Job Titles in Agriculture and Fishing

Some titles do not belong to Agriculture and Fishing, e.g., Signal Processing Engineer, Account Clerk, Accounts Analyst. Assigning the employer's industry to the post (and hence to its title) appears to be problematic.

Similarity of Job Titles in Mining and Quarrying

Similarity of Job Titles in Electricity, Gas and Air-Conditioning Supply



In [ ]:

    
electric_sims = pd.read_csv(JOB_PROF + 'electric/sims.csv')



In [ ]:

    
electric_sims.shape



In [ ]:

    
# electric_dir was never defined; use the directory that electric/sims.csv is
# read from (JOB_PROF + 'electric/').
electric_dir = JOB_PROF + 'electric/'

fig, ax = plt.subplots()
vizTopicSim(electric_sims)
fig.subplots_adjust(bottom=0.2)
plt.savefig(electric_dir + 'sim_dist.pdf')
plt.show(); plt.close()

Industries in 2nd Quartile



In [ ]:

    
q2 = agg_df.n_title.quantile(.50)
q2_industries = agg_df.query('{} < n_title and n_title <= {}'.format(q1, q2))



In [ ]:

    
# Renumber the rows from 0. drop=True discards the old index directly instead
# of materializing it as an 'index' column and deleting it afterwards, and
# rebinding the name makes the cell safe to re-run.
q2_industries = q2_industries.reset_index(drop=True)



In [ ]:

    
q2_industries



In [ ]:

    
calTitleSims(q2_industries.iloc[3])



In [ ]:

Industries in 3rd Quartile



In [ ]:

    
q3 = agg_df.n_title.quantile(.75)
agg_df.query('{} < n_title and n_title <= {}'.format(q2, q3))



In [ ]:

    
edu_sims = titleSims(industry='Education', short_name='edu')
edu_sims.to_csv(JOB_PROF + 'edu/sims.csv', index=False)



In [ ]:

    
edu_sims.head(10)

Industries in 4th Quartile



In [ ]:



In [ ]:

    
title_df = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = list(title_df['title'])
print('# titles: %d' %len(titles))

Similarity By Job's Domain



In [ ]:

    
domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')
n_domain = domain_df.shape[0]
print('# domains: %d' %n_domain)



In [ ]:

    
domains_2 = domain_df.query('n_title > 1')['domain']
print('# domains with at least 2 job titles: %d' %len(domains_2))



In [ ]:

    
# pri_func defaults to 'engineer' and overrides domain inside byRelevanceSims,
# so it has to be cleared explicitly to get the sims of the marketing domain.
mkt_sims = byRelevanceSims(domain='marketing', pri_func=None)

Similarity By Job's Primary Function



In [ ]:

    
# The assignment had no right-hand side (a syntax error). Count the rows of
# the primary-function stats file, mirroring how n_domain is obtained from
# stats/domains.csv.
n_pri_func = pd.read_csv(DATA_DIR + 'stats/pri_funcs.csv').shape[0]
print('# primary functions: %d' %n_pri_func)