Similarity By Job's Primary Function


import os
import gc

import ja_helpers as ja_helpers
from ja_helpers import *

# Project locations; every directory string ends with a trailing slash so
# file names can be appended directly.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'

# Table re-indexed by its 'index' column (its row count is reported below as
# the number of posts loaded).
# NOTE(review): pd and mmread are not imported here; presumably they arrive
# through `from ja_helpers import *` -- confirm.
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv').set_index('index')

# Matrix stored in the 20-topic Matrix Market file.
with open(RES_DIR + 'doc_20topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)

title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv')
# title_df = title_df.drop_duplicates('title')
# title_df['index_by_title'] = title_df['title']
# title_df = title_df.set_index('index_by_title')

# Per-primary-function statistics.
by_func_agg = pd.read_csv(DATA_DIR + 'stats_pri_funcs.csv')

# Pick up any edits made to the helper module since it was first imported.
reload(ja_helpers)
from ja_helpers import *

# Output locations for per-function results and for temporary files.
func_dir = RES_DIR + 'job_prof/by_func/' + 'new/'
tmp_dir = RES_DIR + 'tmp/'

def saveSim(sims, folder, name):
    """Write `sims` to `<folder>sims/<clean(name)>.csv` and print the path.

    `sims` must offer a pandas-style `to_csv`; the index is not written.
    `folder` is concatenated as-is, so it has to end with a path separator.
    """
    csv_name = 'sims/{}.csv'.format(clean(name))
    out_path = folder + csv_name
    sims.to_csv(out_path, index=False)
    print('\tSaved sim scores to file: {}'.format(out_path))

def functionInRange(min_n_title, max_n_title=None, func_stats=None):
    """Return the primary functions whose title count lies in a range.

    Parameters
    ----------
    min_n_title : number
        Inclusive lower bound on the `n_title` column.
    max_n_title : number, optional
        Exclusive upper bound on `n_title`. `None` (default) means there is
        no upper bound.
    func_stats : DataFrame, optional
        Table holding the `pri_func` and `n_title` columns. Defaults to the
        module-level `by_func_agg`.

    Returns
    -------
    list
        The `pri_func` values of the matching rows, in table order.

    A one-line summary of the range and the number of matches is printed.
    """
    if func_stats is None:
        func_stats = by_func_agg

    n_titles = func_stats['n_title']
    in_range = n_titles >= min_n_title
    # Test against None (not truthiness) so an explicit bound is always used.
    if max_n_title is not None:
        in_range = in_range & (n_titles < max_n_title)
    funcs = list(func_stats.loc[in_range, 'pri_func'])

    # float('inf') renders as 'inf' and does not rely on the np.infty alias.
    upper = float('inf') if max_n_title is None else max_n_title
    text = '#pri-funcs having # titles in range [{}, {}): {}'
    print(text.format(min_n_title, upper, len(funcs)))
    return funcs

def functionWiseSims(functions, verbose=False, bsize=50):
    """Compute, save and plot title similarities for each primary function.

    For every entry of `functions`, the titles returned by `titlesWithFunc`
    are passed to `simsAmong`. A non-empty result is sorted by `topic_sim`
    (descending), written via `saveSim` under `func_dir`, and the figure
    produced by `viz` is saved as `<func_dir>fig/<clean(func)>_sims.pdf`.

    Parameters
    ----------
    functions : iterable
        Primary functions to process, one at a time.
    verbose : bool
        Forwarded to `simsAmong`.
    bsize : int
        Groups with fewer than `bsize` titles use the plain `simsAmong` call;
        larger groups also receive `bsize` and `tmp_dir`.
        NOTE(review): presumably this selects batched processing with
        intermediate files in `tmp_dir` -- confirm in ja_helpers.

    Returns nothing; results are written to disk and progress is printed.
    """
    for func in functions:
        print('Primary function: {}'.format(func))
        chosen_titles = titlesWithFunc(func, title_df)

        # Exactly one of the two variants must run; without the `else` the
        # small-group result was recomputed and overwritten every time.
        if len(chosen_titles) < bsize:
            sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose)
        else:
            sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose,
                             bsize, tmp_dir)

        # Nothing to save or plot for this function.
        if sims.empty:
            continue

        sims = sims.sort_values('topic_sim', ascending=False)
        saveSim(sims, func_dir, func)

        # viz draws the figure that plt.savefig then writes out.
        viz(sims)
        fname = func_dir + 'fig/{}_sims.pdf'.format(clean(func))
        plt.savefig(fname)
        plt.close()
        print('\t Saved sim dist to file {}'.format(fname))

# Report the size of each loaded table.
print('# posts loaded: %d' % len(df))
n_title = title_df['title'].nunique()
print('# job titles: %d' % n_title)

n_func = len(by_func_agg)
print('# primary functions: %d' % n_func)

# Primary functions that cover at least two distinct job titles.
chosen_funcs = by_func_agg.loc[by_func_agg['n_title'] >= 2, 'pri_func']
print('# funcs with at least 2 job titles: %d' % len(chosen_funcs))

# posts loaded: 71338
# job titles: 2839
# primary functions: 239
# funcs with at least 2 job titles: 111

Perform Calculations

# Compute, save and plot title similarities for the 'manager' function only.
functionWiseSims(functions=['manager'], verbose=False)

Primary function: manager
# job titles: 545. For job titles with > 100 posts, only sample 100 posts.
Done caching sampled posts for titles with more than 100
	 Calculated sims of Manager to all subseq titles
	 Calculated sims of Project Manager to all subseq titles
	 Calculated sims of Assistant Manager to all subseq titles
	 Calculated sims of Business Development Manager to all subseq titles
	 Calculated sims of Information Technology Project Manager to all subseq titles
	 Calculated sims of Senior Manager to all subseq titles
	 Calculated sims of Finance Manager to all subseq titles
	 Calculated sims of Account Sales Manager to all subseq titles
	 Calculated sims of Marketing Manager to all subseq titles
	 Calculated sims of Account Manager to all subseq titles
	 Calculated sims of Sales Manager to all subseq titles
	 Calculated sims of Product Manager to all subseq titles
	 Calculated sims of Information Technology Manager to all subseq titles
Primary function: tutor
# job titles: 2. For job titles with > 100 posts, only sample 100 posts.
Done caching sampled posts for titles with more than 100
	 Calculated sims of Beauty Tutor to all subseq titles
	 Calculated sims of Tutor to all subseq titles
	Finished sim cals after 0.0s
	Saved sim scores to file: d:/larc_projects/job_analytics/results/job_prof/by_func/new/sims/tutor.csv
	 Saved sim dist to file d:/larc_projects/job_analytics/results/job_prof/by_func/new/fig/tutor_sims.pdf

# First ten rows of the stats table, ordered by ascending n_title and
# renumbered 0..9 (the old row labels are discarded).
func_top_10 = (by_func_agg.head(10)
               .sort_values('n_title')
               .reset_index(drop=True))

pri_func n_post non_std_title n_title
0 technician 1109 84 76
1 assistant 3676 82 79
2 specialist 1411 89 89
3 director 1250 96 91
4 consultant 2779 100 99
5 officer 2650 127 124
6 analyst 4870 138 137
7 executive 7463 220 217
8 engineer 9525 382 364
9 manager 16318 570 545

# functionWiseSims(functions=['assistant'], k=5, verbose=True)

# Process the functions in rows 3 and 4 of func_top_10 (slice end is exclusive).
functionWiseSims(functions=func_top_10[3:5]['pri_func'], verbose=True)

# Same for rows 5 and 6.
functionWiseSims(functions=func_top_10[5:7]['pri_func'], verbose=True)

In [ ]:
# Process the function in row 7 of func_top_10. A bare integer inside [] is
# looked up as a column label on a DataFrame, so select the row with a
# one-row slice, matching the [3:5] and [5:7] calls above.
functionWiseSims(functions=func_top_10[7:8]['pri_func'], verbose=True)

# Compute, save and plot title similarities for the 'engineer' function.
functionWiseSims(functions=['engineer'], verbose=False)

# Reload the saved scores and redraw the figure from the CSV.
eng_sims = pd.read_csv(func_dir + 'sims/engineer.csv')
fig = viz(eng_sims)
fname = func_dir + 'fig/engineer_sims.pdf'
plt.savefig(fname)
plt.close()

# Bucket primary functions by their number of job titles. Per the message
# text inside functionInRange, the intended ranges are [min, max).
func_2_5 = functionInRange(2, 5)

func_5_100 = functionInRange(5, 100)

# No max_n_title is passed here, i.e. no upper bound is requested.
func_100_ = functionInRange(min_n_title=100)
# functionWiseSims(func_100_)