In [ ]:
import my_util as my_util; from my_util import *
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *
import os
import random
from time import time
import gc
In [ ]:
# Turn on automatic garbage collection.
# NOTE(review): CPython enables the collector by default, so this only has an
# effect if collection was disabled earlier in the session.
gc.enable()
In [ ]:
# Project root and the data / result folders derived from it.
# Every path keeps its trailing slash so file names can be appended directly.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = '{}data/clean/'.format(HOME_DIR)
SKILL_DAT = '{}skill_cluster/'.format(DATA_DIR)
SKILL_RES = '{}results/skill_cluster/new/'.format(HOME_DIR)
JOB_PROF = '{}job_prof/'.format(SKILL_RES)
In [ ]:
# Per-title statistics.
title_stats = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = title_stats['title']
print('# titles: %d' % len(titles))

# Before standardizing job titles
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
# print() with one argument prints the same on Python 2 and Python 3.
print(df.shape)
df.set_index('index', inplace=True)

# Document-topic matrix; its number of columns is used as the topic count.
# mmread accepts the path directly, so no manual file handle is needed.
doc_topic_distr = mmread(SKILL_RES + 'doc_20topic_distr.mtx')
n_topic = doc_topic_distr.shape[1]
print('# topics: %d' % n_topic)

topic_df = pd.read_csv(SKILL_RES + 'lda/20_topics.csv')
# Build a real list: on Python 3 map() is a one-shot iterator, which would be
# empty the second time the labels are used.
labels = [label.upper() for label in topic_df['label']]

# Parsed titles:
parsed_titles = pd.read_csv(DATA_DIR + 'parsed_titles.csv')
print('# titles parsed: {}'.format(parsed_titles.shape[0]))

# Domains
domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')
After fixing some bugs in the job-title parsing, we re-parsed the problematic job titles with the title_parse.py script. Thus, we need to reload the newly parsed titles.
In [ ]:
# Reload the re-parsed titles; title_df is read as a global by byRelevanceSims.
title_df = pd.read_csv('{}titles_2posts_up.csv'.format(DATA_DIR))
n_parsed_title = len(title_df.title)
print('# parsed titles: %d' % n_parsed_title)
In [ ]:
# Re-execute my_util so edits to the module are picked up in this session,
# then refresh the names that were pulled in by the star import.
reload(my_util)
from my_util import *
In [329]:
# Re-execute cluster_skill_helpers so edits to the module are picked up in
# this session, then refresh the names pulled in by the star import.
reload(cluster_skill_helpers)
from cluster_skill_helpers import *
In [ ]:
def vizPostPair(i, sim_df, labels, abbv_title=''):
    '''
    Plot the i-th pair of sim_df, save the figure as a PDF and display it.

    @param i: 0-based row position in sim_df; the file name uses i + 1
    @param sim_df: table whose rows are handed to vizDists4Pair
    @param labels: labels forwarded to vizDists4Pair
    @param abbv_title: prefix of the output file name
    '''
    pair = sim_df.iloc[i]
    fig = vizDists4Pair(pair, df, doc_topic_distr, labels)
    out_file = SKILL_RES + 'fig/{}_p{}.pdf'.format(abbv_title, i + 1)
    fig.savefig(out_file)
    plt.show()
    plt.close()
In [334]:
## Funcs for sim cals
def byRelevanceSims(domain='software', pri_func='engineer', min_post=2, verbose=False):
    '''
    Compute pairwise similarities among the titles of one domain or of one
    primary function.

    @param domain: domain used to select titles (via titlesIn); when given it
        takes precedence over pri_func
    @param pri_func: primary function used to select titles (via titlesWith);
        only used when domain is None/empty
    @param min_post: threshold forwarded to titlesWithAtLeast
    @param verbose: forwarded to simsAmong
    @return: whatever simsAmong returns for the chosen titles
    @raise ValueError: if neither domain nor pri_func is given
    '''
    # Exactly one selector is used. Previously both branches ran when both
    # arguments were set, so a call such as byRelevanceSims('marketing')
    # silently returned the sims of the default 'engineer' function.
    if domain:
        candidates = titlesIn(domain, title_df)
    elif pri_func:
        candidates = titlesWith(pri_func, title_df)
    else:
        raise ValueError('Either domain or pri_func must be given')

    chosen_titles = titlesWithAtLeast(candidates, min_post)
    n_title = len(chosen_titles)
    print('\t# titles with at least {} posts: {}'.format(min_post, n_title))
    return simsAmong(chosen_titles, doc_topic_distr, df, verbose)
def calSims(domain='software', pri_func='engineer', verbose=False):
    '''
    Compute pairwise title similarities for one domain or one primary
    function, save them (sorted by topic_sim, descending) and return them.

    @param domain: domain to analyse; when given it takes precedence over
        pri_func
    @param pri_func: primary function to analyse; only used when domain is
        None/empty (pass domain=None to analyse a primary function)
    @param verbose: forwarded to byRelevanceSims
    @return: the similarity table; it is only sorted and written to disk
        when it is not empty
    @raise ValueError: if neither domain nor pri_func is given
    '''
    # Decide once which mode is used. Previously both branches ran when both
    # arguments were set, so calSims('marketing') computed the marketing sims,
    # threw them away and returned/saved the default 'engineer' sims instead.
    by_domain = bool(domain)
    if not by_domain and not pri_func:
        raise ValueError('Either domain or pri_func must be given')

    t0 = time()
    if by_domain:
        print('Domain: {}'.format(domain))
        sims = byRelevanceSims(domain=domain, pri_func=None, verbose=verbose)
    else:
        print('Primary function: {}'.format(pri_func))
        sims = byRelevanceSims(domain=None, pri_func=pri_func, verbose=verbose)
    elapse = round(time() - t0, 1)
    print('\tFinished sim cals after {}s'.format(elapse))

    if not sims.empty:
        sims = sims.sort_values('topic_sim', ascending=False)
        # The output folder is resolved only when there is something to save.
        if by_domain:
            fname = domain_dir + 'sims/new/{}.csv'.format(clean(domain))
        else:
            fname = func_dir + 'sims/new/{}.csv'.format(clean(pri_func))
        sims.to_csv(fname, index=False)
        print('\tSaved sims to file: {}'.format(fname))
    return sims
def topkSim(title, k, sims):
    '''
    Return the k titles most similar to the given title as one string.

    @param title: title to look up in either the 't1' or the 't2' column
    @param k: maximum number of similar titles to report
    @param sims: DataFrame with columns 't1', 't2' and 'topic_sim'
    @return: comma-separated 'other_title(topic_sim)' items ordered by
        decreasing topic_sim; an empty string if the title has no pair
    '''
    as_t1 = sims[sims['t1'] == title]
    # Swap the two title columns in a single rename so that the partner title
    # is always found in 't2'.
    as_t2 = sims[sims['t2'] == title].rename(columns={'t1': 't2', 't2': 't1'})

    res = pd.concat([as_t1, as_t2])
    top = res.sort_values('topic_sim', ascending=False).head(k)
    # Format only the k rows that are reported. Adding a Series to the result
    # of map(str, ...) only works on Python 2, where map() returns a list.
    items = ['{}({})'.format(other, sim)
             for other, sim in zip(top['t2'], top['topic_sim'])]
    return ','.join(items)
def _saveTopkSim(name, sims, k, out_dir):
    '''
    Shared implementation of findTopkSimByDomain and findTopkSimByFunc: find
    the top-k similar titles of every title in sims and write them to
    out_dir + 'topk/<clean(name)>_top<k>.csv'.
    '''
    # Unique titles in order of first appearance. A set was used before, which
    # gives an arbitrary row order and is rejected by the DataFrame
    # constructor of current pandas versions.
    all_titles = pd.concat([sims['t1'], sims['t2']], ignore_index=True)
    chosen_titles = all_titles.drop_duplicates().reset_index(drop=True)

    topk_res = pd.DataFrame({'title': chosen_titles})
    col_topk = 'top_{}_sim'.format(k)
    topk_res[col_topk] = topk_res['title'].apply(topkSim, k=k, sims=sims)

    fname = out_dir + 'topk/{}_top{}.csv'.format(clean(name), k)
    topk_res.to_csv(fname, index=False)
    print('\tSaved top-{} similar title result'.format(k))


def findTopkSimByDomain(domain, sims, k=2):
    '''
    Save, for every title occurring in sims, its k most similar titles.

    @param domain: domain name; used (after clean()) in the output file name
    @param sims: DataFrame with columns 't1', 't2' and 'topic_sim'
    @param k: number of similar titles kept per title
    @return: None; the result is written under domain_dir + 'topk/'
    '''
    _saveTopkSim(domain, sims, k, domain_dir)


def findTopkSimByFunc(pri_func, sims, k=2):
    '''
    Save, for every title occurring in sims, its k most similar titles.

    @param pri_func: primary function; used (after clean()) in the output
        file name
    @param sims: DataFrame with columns 't1', 't2' and 'topic_sim'
    @param k: number of similar titles kept per title
    @return: None; the result is written under func_dir + 'topk/'
    '''
    _saveTopkSim(pri_func, sims, k, func_dir)
In [ ]:
def domainsInRange(min_n_title, max_n_title):
    '''
    Select the domains whose number of titles lies in
    [min_n_title, max_n_title) and print how many there are.

    @param min_n_title: inclusive lower bound on n_title
    @param max_n_title: exclusive upper bound on n_title
    @return: the 'domain' column of the matching rows of by_domain_agg
    '''
    n_title = by_domain_agg['n_title']
    in_range = (min_n_title <= n_title) & (n_title < max_n_title)
    domains = by_domain_agg.loc[in_range, 'domain']
    print('# domains with # titles in range [{}, {}): {}'.format(
        min_n_title, max_n_title, len(domains)))
    return domains
Given the parsed titles, we now compute pairwise similarities among them, grouping the titles either by domain or by primary function:
In [ ]:
# Load the per-domain aggregate statistics and report the number of rows.
by_domain_agg = pd.read_csv('{}stats/domains.csv'.format(DATA_DIR))
n_domain = len(by_domain_agg)
print('# domains: {}'.format(n_domain))
First, we look at the 10 domains with the largest number of titles.
In [ ]:
by_domain_agg.head(10)
In [ ]:
# Pass pri_func=None explicitly so that the 'engineer' default of calSims
# cannot interfere with the domain query.
mkt_sims = calSims(domain='marketing', pri_func=None)
findTopkSimByDomain('marketing', mkt_sims)
In [ ]:
# Number of posts carrying the title 'Senior Marketing Engineer'.
is_sme = df['title'] == 'Senior Marketing Engineer'
sme_posts = df[is_sme]
print('# posts by Senior Marketing Engineer: %d' % len(sme_posts))

# Number of posts carrying the title 'Senior Marketing Representative'.
is_smr = df['title'] == 'Senior Marketing Representative'
smr_posts = df[is_smr]
print('# posts by Senior Marketing Representative: %d' % len(smr_posts))
In [ ]:
sim_dir = JOB_PROF + 'by_domain/sims/'
In [ ]:
# Similarities and top-5 similar titles for the first 9 domains of
# by_domain_agg.
top9_domains = by_domain_agg.head(9)['domain']
for dom in top9_domains:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    # Same guard as the other domain loops: nothing to rank without pairs.
    if not sims.empty:
        findTopkSimByDomain(dom, sims, k=5)
In [ ]:
by_domain_agg.describe().round(1)
On average, each domain has $2.7$ job titles.
In [ ]:
# Histogram of the number of job titles per domain; the mean (rounded to one
# decimal) is shown in the x-axis label and the plot is saved as a PDF.
fig = plt.figure()
plt.hist(by_domain_agg['n_title'])
mean_n_title = round(by_domain_agg['n_title'].mean(), 1)
xl = '# job titles' + r'$(\mu = {})$'.format(mean_n_title)
plt.xlabel(xl, fontsize=16)
plt.ylabel('# domains', fontsize=16)
plt.savefig(JOB_PROF + 'title_dist_by_domain.pdf')
plt.show()
plt.close()
In [ ]:
# Reload the post index from disk and key it by the 'index' column.
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
# print() with one argument prints the same on Python 2 and Python 3.
print(df.shape)
df.set_index('index', inplace=True)
In [ ]:
# Recompute the title statistics from the reloaded posts; this replaces the
# title_stats table that was read from stats/job_titles.csv earlier.
title_stats = getTitleStats(df)
# title_stats.to_csv(DATA_DIR + 'stats/job_titles.csv', index=False)
title_stats.head()
In [ ]:
print('# job titles after standardizing: %d' %title_stats.shape[0])
In [ ]:
# NOTE(review): by_n_post is not assigned anywhere in the visible notebook;
# presumably it comes from a removed cell or a star import -- confirm.
by_n_post.head()
In [ ]:
# NOTE(review): swr_sims is only assigned by the commented-out line below, so
# this cell relies on a value left over from an earlier run of that line.
# swr_sims = calSims(domain='software', pri_func=None)
viz(swr_sims, domain='software')
In [333]:
# mkt_sims = calSims(domain='marketing', pri_func=None, verbose=True)
# viz(mkt_sims, domain='marketing')
# Reuses the mkt_sims computed in an earlier cell (the recomputation above is
# commented out) and saves the top-5 similar titles per title.
findTopkSimByDomain(domain='marketing', k=5, sims=mkt_sims)
In [ ]:
# Domains with at least 20 job titles: compute and plot their pairwise sims.
domain_20_ = by_domain_agg.query('20 <= n_title')['domain']
print('# domains with at least 20 job titles: %d' % len(domain_20_))
for dom in domain_20_:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    print('\tViz pairwise sims among job titles')
    viz(sims, domain=dom)
    # findTopkSimByDomain(dom, sims, k=5)
In [ ]:
domain_10_20 = domainsInRange(10, 20)
In [ ]:
# Compute (and, inside calSims, save) the sims of every domain in domain_10_20.
for dom in domain_10_20:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    # findTopkSimByDomain(dom, sims, k=5)
In [ ]:
domain_2_10 = domainsInRange(2, 10)
This is a large number of domains, so we need to break this range down further.
In [ ]:
dom_5_10 = domainsInRange(5, 10)
In [ ]:
# 1st half
for dom in dom_5_10[:64]:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    # Same guard as the second half: nothing to rank without pairs.
    if not sims.empty:
        findTopkSimByDomain(dom, sims, k=3)
In [ ]:
# 2nd half
# NOTE(review): the slice starts at 76 while the first half stops at 64 (see
# the trailing "# 64"), so positions 64-75 are not processed by a fresh run --
# confirm this is intended.
for dom in dom_5_10[76:]:  # 64
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    if not sims.empty:
        findTopkSimByDomain(dom, sims, k=3)
In [ ]:
dom_2_5 = domainsInRange(2, 5)
In [ ]:
# Position of this domain within dom_2_5.
# NOTE(review): presumably used to choose the slice bounds of the loops below -- confirm.
list(dom_2_5).index('personnel / human resource')
In [ ]:
# 1st half
# NOTE(review): the slice starts at 100, so the first 100 domains of dom_2_5
# are not processed by a fresh run -- confirm this is intended.
for dom in dom_2_5[100:300]:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    if not sims.empty:
        findTopkSimByDomain(dom, sims, k=2)
In [ ]:
# Remaining domains of dom_2_5, from position 300 onwards.
for dom in dom_2_5[300:]:
    # pri_func=None keeps the 'engineer' default of calSims out of the way.
    sims = calSims(domain=dom, pri_func=None)
    if not sims.empty:
        findTopkSimByDomain(dom, sims, k=2)
In [ ]:
by_func_agg = pd.read_csv(DATA_DIR + 'stats/pri_funcs.csv')
by_func_agg.describe().round(1)
In [ ]:
n_func = by_func_agg.shape[0]
print('# primary functions: %d' %n_func)
In [ ]:
# Sort in place by number of titles (ascending), so the functions with the
# most titles end up in the last rows.
by_func_agg.sort_values('n_title', inplace=True)
In [ ]:
# by_func_agg.query('pri_func == "developer"')
# by_func_agg.query('pri_func == "teacher"')
by_func_agg.query('pri_func == "programmer"')
In [ ]:
# Functions with largest no. of titles
by_func_agg.tail()
In [ ]:
chosen_funcs = by_func_agg.query('n_title >= 2')['pri_func']
print('# funcs with at least 2 job titles: %d' %len(chosen_funcs))
In [ ]:
chosen_funcs.tail()
In [ ]:
# Sims among the titles of the 'teacher' primary function; domain=None is
# passed explicitly so that the 'software' default of calSims is not used.
teacher_sims = calSims(pri_func='teacher', domain=None)
# Save the top-5 similar titles of every title in teacher_sims.
findTopkSimByFunc('teacher', teacher_sims, k=5)
In [ ]:
def functionInRange(min_n_title, max_n_title=None):
    '''
    Select the primary functions whose number of titles lies in
    [min_n_title, max_n_title) and print how many there are.

    @param min_n_title: inclusive lower bound on n_title
    @param max_n_title: exclusive upper bound on n_title; None means no
        upper bound
    @return: list of the matching 'pri_func' values of by_func_agg
    '''
    # Compare with None so that an explicit bound of 0 is not mistaken for
    # "no upper bound".
    bounded = max_n_title is not None
    if bounded:
        q = '{} <= n_title < {}'.format(min_n_title, max_n_title)
    else:
        q = '{} <= n_title'.format(min_n_title)
    funcs = list(by_func_agg.query(q)['pri_func'])

    # float('inf') prints as 'inf', like the np.infty alias used before,
    # which no longer exists in NumPy 2.
    upper = max_n_title if bounded else float('inf')
    text = '#pri-funcs having # titles in range [{}, {}): {}'
    print(text.format(min_n_title, upper, len(funcs)))
    return funcs
In [ ]:
func_2_5 = functionInRange(2, 5)
In [ ]:
func_5_100 = functionInRange(5, 100)
In [ ]:
# NOTE(review): pri_func is not assigned at top level anywhere in the visible
# notebook and sims is whatever an earlier cell left behind; this looks like
# a stray copy of the loop body in the next cell -- confirm before running.
print('\tViz pairwise sims among job titles')
viz(sims, pri_func=pri_func)
In [ ]:
# For every primary function in func_5_100: compute (and save) the pairwise
# sims among its titles, then plot them.
for func in func_5_100:
    sims = calSims(domain=None, pri_func=func)
    print('\tViz pairwise sims among job titles')
    viz(sims, pri_func=func)
    # findTopkSimByFunc(func, sims)
In [ ]:
func_100_ = functionInRange(min_n_title=100)
In [ ]:
# Load previously saved sims for 'supervisor' instead of recomputing them.
# NOTE(review): calSims writes to func_dir + 'sims/new/', while this reads
# from func_dir + 'sims/' -- confirm this is the intended (older?) file.
supervisor_sims = pd.read_csv(func_dir + 'sims/supervisor.csv')
findTopkSimByFunc('supervisor', k=5, sims=supervisor_sims)
In [ ]:
# Group the parsed titles by secondary function and report the group count.
by_sec_func = parsed_titles.groupby('sec_func')
n_sec_func = by_sec_func.ngroups
print('# secondary funcs: %d' %n_sec_func)
In [ ]:
def calTitleSims(row):
    '''
    Compute and save the pairwise sims between the titles of one industry.

    @param row: record holding the full name ('industry') and the short name
        ('short_name') of an industry
    @return: None; title_stats.csv and sims.csv are written to the folder
        JOB_PROF + '<short_name>/'
    '''
    industry, short_name = row['industry'], row['short_name']
    print('\nIndustry %s:' % industry)

    # Output folder of the industry, created on first use.
    out_dir = JOB_PROF + '{}/'.format(short_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Title statistics restricted to the posts of this industry.
    industry_posts = df[df.industry == industry]
    stats = getTitleStats(industry_posts)
    stats.to_csv(out_dir + 'title_stats.csv', index=False)

    # only analyze titles with at least 2 posts in the industry
    frequent_titles = list(stats.query('n_post >= 2')['title'])
    title_sims = simsOfTitles(frequent_titles, industry)
    title_sims = title_sims.sort_values('topic_sim', ascending=False)

    title_sims.to_csv(out_dir + 'sims.csv', index=False)
    print('Saved pairwise sims of job titles in {}'.format(industry))
In [ ]:
# Per-industry counts: number of posts, distinct titles and distinct employers.
by_industry = df.groupby('industry')
agg_df = (
    by_industry
    .agg({'job_id': len, 'title': 'nunique', 'employer_id': 'nunique'})
    .reset_index()
    .rename(columns={'employer_id': 'n_employer', 'job_id': 'n_post',
                     'title': 'n_title'})
)
In [ ]:
agg_df.sort_values('n_title', ascending=False, inplace=True)
agg_df.to_csv(DATA_DIR + 'stats/industry_stat.csv', index=False)
In [ ]:
agg_df = pd.read_csv(DATA_DIR + 'stats/industry_stat.csv')
In [ ]:
# Industries whose number of titles is at or below the first quartile.
q1 = agg_df.n_title.quantile(.25)
q1_industries = agg_df[agg_df.n_title <= q1]
In [ ]:
def simWrapper(industries):
    '''
    Run calTitleSims on every row of industries, pausing 3 seconds after
    each industry.

    @param industries: DataFrame whose rows provide the 'industry' and
        'short_name' fields read by calTitleSims
    @return: None
    '''
    # Imported locally: the module-level name `time` is bound to the function
    # time.time (`from time import time`), so `time.sleep` raises
    # AttributeError.
    from time import sleep

    for _, industry_row in industries.iterrows():
        calTitleSims(industry_row)
        sleep(3)
In [ ]:
simWrapper(q1_industries)
Some titles do not belong to Agriculture and Fishing, e.g., Signal Processing Engineer, Account Clerk, Accounts Analyst. Assigning the industry of the employer to the post (and its title) seems to be problematic!
In [ ]:
electric_sims = pd.read_csv(JOB_PROF + 'electric/sims.csv')
In [ ]:
electric_sims.shape
In [ ]:
# Plot the sims of the 'electric' industry and save the figure.
fig, ax = plt.subplots()
vizTopicSim(electric_sims)
# Leave extra room below the axes.
fig.subplots_adjust(bottom=0.2)
# electric_dir is not defined anywhere in the visible notebook; build the path
# from JOB_PROF instead, i.e. the folder electric/sims.csv was read from.
# NOTE(review): confirm this is the folder electric_dir used to point to.
plt.savefig(JOB_PROF + 'electric/sim_dist.pdf')
plt.show()
plt.close()
In [ ]:
# Industries whose number of titles lies in (q1, q2], i.e. above the first
# quartile and up to the median.
q2 = agg_df.n_title.quantile(.50)
q2_industries = agg_df.query('{} < n_title and n_title <= {}'.format(q1, q2))
In [ ]:
# Renumber the rows from 0 and discard the old index in one step. Rebinding
# the name avoids mutating the result of query() in place.
q2_industries = q2_industries.reset_index(drop=True)
In [ ]:
q2_industries
In [ ]:
calTitleSims(q2_industries.iloc[3])
In [ ]:
In [ ]:
q3 = agg_df.n_title.quantile(.75)
agg_df.query('{} < n_title and n_title <= {}'.format(q2, q3))
In [ ]:
# NOTE(review): in the visible notebook titleSims only appears inside the
# calTitleSims cell (where it reads a `directory` variable); a top-level
# titleSims must exist for this cell to run -- confirm.
edu_sims = titleSims(industry='Education', short_name='edu')
edu_sims.to_csv(JOB_PROF + 'edu/sims.csv', index=False)
In [ ]:
edu_sims.head(10)
In [ ]:
In [ ]:
# NOTE(review): this rebinds title_df (earlier loaded from
# titles_2posts_up.csv), which byRelevanceSims reads as a global -- confirm
# that the calls below are meant to use this table.
title_df = pd.read_csv(DATA_DIR + 'stats/job_titles.csv')
titles = list(title_df['title'])
print('# titles: %d' %len(titles))
In [ ]:
domain_df = pd.read_csv(DATA_DIR + 'stats/domains.csv')
n_domain = domain_df.shape[0]
print('# domains: %d' %n_domain)
In [ ]:
domains_2 = domain_df.query('n_title > 1')['domain']
print('# domains with at least 2 job titles: %d' %len(domains_2))
In [ ]:
# Pass pri_func=None explicitly so that the 'engineer' default of
# byRelevanceSims cannot interfere with the domain query.
mkt_sims = byRelevanceSims(domain='marketing', pri_func=None)
In [ ]:
# The right-hand side of this assignment was missing (a syntax error).
# NOTE(review): assumed to be the number of rows of by_func_agg, as in the
# earlier cell that prints the same message -- confirm.
n_pri_func = by_func_agg.shape[0]
print('# primary functions: %d' % n_pri_func)