Similarity By Job's Primary Function

Preparation


In [1]:
import os
import gc

import ja_helpers as ja_helpers
from ja_helpers import *

In [2]:
# Make sure automatic garbage collection is switched on (it already is by default in CPython).
gc.enable()

In [3]:
# Project root. Set the JA_HOME environment variable to run the notebook from
# another checkout; without it the original hard-coded location is used.
HOME_DIR = os.environ.get('JA_HOME', 'd:/larc_projects/job_analytics/')
# Sub-paths are built by plain string concatenation, so guarantee exactly one
# trailing '/' whatever form the override was given in.
HOME_DIR = HOME_DIR.rstrip('/\\') + '/'
DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'

In [4]:
# Post table, re-indexed by its own 'index' column.
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv').set_index('index')

# Matrix Market file from the results folder (presumably the per-post
# distribution over 20 topics, going by the file name -- TODO confirm).
with open(RES_DIR + 'doc_20topic_distr.mtx', 'r') as mtx_file:
    doc_topic_distr = mmread(mtx_file)

In [5]:
# Job-title table; later cells read its 'title' column and pass the frame to titlesWithFunc.
# NOTE(review): the file name suggests titles with at least 2 posts -- confirm.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv')
# Disabled: de-duplicate on 'title' and index the frame by title.
# title_df = title_df.drop_duplicates('title')
# title_df['index_by_title'] = title_df['title']
# title_df = title_df.set_index('index_by_title')

# One row per primary function; later cells use its 'pri_func' and 'n_title' columns.
by_func_agg = pd.read_csv(DATA_DIR + 'stats_pri_funcs.csv')

In [12]:
# Pick up edits made to ja_helpers, then refresh the star-imported names.
# NOTE(review): bare `reload` is a builtin only on Python 2; on Python 3 it is importlib.reload.
reload(ja_helpers); from ja_helpers import *

In [14]:
# Output locations, both under RES_DIR: per-function results and scratch space
# handed to simsAmong for large functions.
func_dir = ''.join([RES_DIR, 'job_prof/by_func/', 'new/'])
tmp_dir = ''.join([RES_DIR, 'tmp/'])

def saveSim(sims, folder, name):
    """Write similarity scores to ``<folder>sims/<clean(name)>.csv``.

    Parameters
    ----------
    sims : object with a ``to_csv`` method (a DataFrame in this notebook)
        Scores to save; written without the index.
    folder : str
        Output root. It is concatenated directly with ``'sims/'``, so it
        must already end with a path separator.
    name : str
        Label turned into the file name via ``clean``.

    The ``sims`` sub-folder is created when it does not exist yet, since
    ``to_csv`` does not create missing parent directories.
    """
    sim_dir = folder + 'sims/'
    if not os.path.isdir(sim_dir):
        os.makedirs(sim_dir)

    sim_file = sim_dir + '{}.csv'.format(clean(name))
    sims.to_csv(sim_file, index=False)
    print('\tSaved sim scores to file: {}'.format(sim_file))

def functionInRange(min_n_title, max_n_title=None):
    """Return the primary functions whose title count lies in a range.

    Selects the rows of the module-level ``by_func_agg`` frame with
    ``min_n_title <= n_title < max_n_title`` and returns their ``pri_func``
    values as a list. A one-line summary of the range and the number of
    matches is printed.

    Parameters
    ----------
    min_n_title : number
        Inclusive lower bound on ``n_title``.
    max_n_title : number or None
        Exclusive upper bound; ``None`` (the default) means no upper bound.
    """
    n_titles = by_func_agg['n_title']
    in_range = n_titles >= min_n_title

    # Test against None rather than truthiness: an upper bound of 0 is a
    # real (empty) range, not a request for "no upper bound".
    if max_n_title is not None:
        in_range = in_range & (n_titles < max_n_title)
        upper = max_n_title
    else:
        # float('inf') prints as 'inf' and avoids the np.infty alias,
        # which no longer exists in NumPy 2.
        upper = float('inf')

    funcs = list(by_func_agg.loc[in_range, 'pri_func'])
    text = '#pri-funcs having # titles in range [{}, {}): {}'
    print(text.format(min_n_title, upper, len(funcs)))
    return funcs

def functionWiseSims(functions, verbose=False, bsize=50):
    """Compute, save and plot title similarities for each primary function.

    For every function the titles returned by ``titlesWithFunc`` are passed
    to ``simsAmong``. A non-empty result is sorted by ``topic_sim``
    (descending), written with ``saveSim`` under ``func_dir``, and the plot
    produced by ``viz`` is saved as ``<func_dir>fig/<clean(func)>_sims.pdf``.

    Parameters
    ----------
    functions : iterable of str, or a single str
        Primary function names. A bare string is treated as one function
        instead of being iterated character by character.
    verbose : bool
        Forwarded to ``simsAmong``.
    bsize : int
        Functions with at least this many titles are handed to ``simsAmong``
        together with ``bsize`` and ``tmp_dir``; smaller ones are not.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str
    if isinstance(functions, string_types):
        functions = [functions]

    for func in functions:
        print('Primary function: {}'.format(func))

        chosen_titles = titlesWithFunc(func, title_df)

        if len(chosen_titles) < bsize:
            sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose)
        else:
            sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose, bsize, tmp_dir)

        if sims.empty:
            # Nothing to save or plot for this function.
            continue

        sims = sims.sort_values('topic_sim', ascending=False)
        saveSim(sims, func_dir, func)

        # viz draws onto the current matplotlib figure, which is saved and
        # closed below; its return value is not needed here.
        viz(sims)
        fig_dir = func_dir + 'fig/'
        if not os.path.isdir(fig_dir):
            os.makedirs(fig_dir)
        fname = fig_dir + '{}_sims.pdf'.format(clean(func))
        plt.savefig(fname)
        plt.close()
        print('\t Saved sim dist to file {}'.format(fname))

In [8]:
# Headline counts for the loaded tables.
print('# posts loaded: %d' % len(df.index))
n_title = title_df['title'].nunique()
print('# job titles: %d' % n_title)

n_func = len(by_func_agg.index)
print('# primary functions: %d' % n_func)

# Keep only functions that have at least two job titles.
chosen_funcs = by_func_agg.query('n_title >= 2')['pri_func']
print('# funcs with at least 2 job titles: %d' % len(chosen_funcs))


# posts loaded: 71338
# job titles: 2839
# primary functions: 239
# funcs with at least 2 job titles: 111

Perform Calculations


In [11]:
# NOTE(review): the saved output below ends in a KeyboardInterrupt, and its
# traceback shows an earlier functionWiseSims(functions, verbose) signature.
functionWiseSims(functions=['manager'], verbose=False)


Primary function: manager
# job titles: 545. For job titles with > 100 posts, only sample 100 posts.
Done caching sampled posts for titles with more than 100
	 Calculated sims of Manager to all subseq titles
	 Calculated sims of Project Manager to all subseq titles
	 Calculated sims of Assistant Manager to all subseq titles
	 Calculated sims of Business Development Manager to all subseq titles
	 Calculated sims of Information Technology Project Manager to all subseq titles
	 Calculated sims of Senior Manager to all subseq titles
	 Calculated sims of Finance Manager to all subseq titles
	 Calculated sims of Account Sales Manager to all subseq titles
	 Calculated sims of Marketing Manager to all subseq titles
	 Calculated sims of Account Manager to all subseq titles
	 Calculated sims of Sales Manager to all subseq titles
	 Calculated sims of Product Manager to all subseq titles
	 Calculated sims of Information Technology Manager to all subseq titles
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-11-dd710b53e90c> in <module>()
----> 1 functionWiseSims(functions=['manager'], verbose=False)

<ipython-input-7-97bd42da5770> in functionWiseSims(functions, verbose)
     27         chosen_titles = titlesWithFunc(func, title_df)
     28 
---> 29         sims = simsAmong(chosen_titles, doc_topic_distr, df, verbose)
     30 
     31         if not sims.empty:

D:\larc_projects\job_analytics\src\ja_helpers.pyc in simsAmong(titles, doc_topic_distr, df, verbose)
    631     print(msg.format(n_title))
    632 
--> 633     if n_title > 1:
    634         t0 = time()
    635 

D:\larc_projects\job_analytics\src\ja_helpers.pyc in sims2SubseqTitle(i, titles, doc_topic_distr, cached_posts, verbose)
    616     res['topic_sim'] = res['t1'].apply(titleSim, t2=focus_title, 
    617                                        doc_topic_distr=doc_topic_distr,
--> 618                                        cached_posts=cached_posts, verbose=verbose)
    619 
    620     print('\t Calculated sims of {} to all subseq titles'.format(focus_title))

D:\conda\lib\site-packages\pandas\core\series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2292             else:
   2293                 values = self.asobject
-> 2294                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2295 
   2296         if len(mapped) and isinstance(mapped[0], Series):

pandas\src\inference.pyx in pandas.lib.map_infer (pandas\lib.c:66124)()

D:\conda\lib\site-packages\pandas\core\series.pyc in <lambda>(x)
   2280 
   2281         if kwds or args and not isinstance(func, np.ufunc):
-> 2282             f = lambda x: func(x, *args, **kwds)
   2283         else:
   2284             f = func

D:\larc_projects\job_analytics\src\ja_helpers.pyc in titleSim(t1, t2, doc_topic_distr, df, cached_posts, verbose)
    603         n1, n2 = posts1.shape[0], posts2.shape[0]
    604         print('\t{} ({} posts) vs. {} ({} posts)'.format(t1, n1, t2, n2))
--> 605     return postSimScore(posts1, posts2, doc_topic_distr)
    606 
    607 def sims2SubseqTitle(i, titles, doc_topic_distr, cached_posts=None, verbose=False):

D:\larc_projects\job_analytics\src\ja_helpers.pyc in postSimScore(posts1, posts2, doc_topic_distr)
    587     n1, n2 = posts1.shape[0], posts2.shape[0]
    588     if (n1 > 0) and (n2 > 0):
--> 589         res = crossSimScores(posts1, posts2, doc_topic_distr, verbose=False)
    590         topic_sim = round(res['topic_sim'].mean(), 3)
    591         return topic_sim  # return res

D:\larc_projects\job_analytics\src\ja_helpers.pyc in crossSimScores(posts1, posts2, doc_topic_distr, verbose)
    576     n1 = posts1.shape[0]; n2 = posts2.shape[0]
    577 
--> 578     frames = [sims2Set(posts1.iloc[i], posts2) for i in xrange(n1)]
    579     res = pd.concat(frames);
    580     return res

D:\larc_projects\job_analytics\src\ja_helpers.pyc in sims2Set(p, posts)
    567     def sims2Set(p, posts):
    568         n_post = posts.shape[0]
--> 569         frames = [sims(p, posts.iloc[i]) for i in xrange(n_post)]
    570         # global count;  count += 1
    571         # if (count % 10 == 0) and verbose:

D:\conda\lib\site-packages\pandas\core\indexing.pyc in __getitem__(self, key)
   1310             return self._getitem_tuple(key)
   1311         else:
-> 1312             return self._getitem_axis(key, axis=0)
   1313 
   1314     def _getitem_axis(self, key, axis=0):

D:\conda\lib\site-packages\pandas\core\indexing.pyc in _getitem_axis(self, key, axis)
   1628                 self._is_valid_integer(key, axis)
   1629 
-> 1630             return self._get_loc(key, axis=axis)
   1631 
   1632     def _convert_to_indexer(self, obj, axis=0, is_setter=False):

D:\conda\lib\site-packages\pandas\core\indexing.pyc in _get_loc(self, key, axis)
    103 
    104     def _get_loc(self, key, axis=0):
--> 105         return self.obj._ixs(key, axis=axis)
    106 
    107     def _slice(self, obj, axis=0, kind=None):

D:\conda\lib\site-packages\pandas\core\frame.pyc in _ixs(self, i, axis)
   1974                     copy = True
   1975                 else:
-> 1976                     new_values = self._data.fast_xs(i)
   1977                     if is_scalar(new_values):
   1978                         return new_values

D:\conda\lib\site-packages\pandas\core\internals.pyc in fast_xs(self, loc)
   3493 
   3494         # non-unique (GH4726)
-> 3495         if not items.is_unique:
   3496             result = self._interleave()
   3497             if self.ndim == 2:

KeyboardInterrupt: 

In [10]:
# TEST: quick end-to-end run on a small function (2 job titles in the saved output).
functionWiseSims(functions=['tutor'])


Primary function: tutor
# job titles: 2. For job titles with > 100 posts, only sample 100 posts.
Done caching sampled posts for titles with more than 100
	 Calculated sims of Beauty Tutor to all subseq titles
	 Calculated sims of Tutor to all subseq titles
	Finished sim cals after 0.0s
	Saved sim scores to file: d:/larc_projects/job_analytics/results/job_prof/by_func/new/sims/tutor.csv
	 Saved sim dist to file d:/larc_projects/job_analytics/results/job_prof/by_func/new/fig/tutor_sims.pdf

In [9]:
# First ten rows of the per-function stats, ordered by title count and
# renumbered 0..9 so positional slices in later cells are predictable.
func_top_10 = (
    by_func_agg
    .head(10)
    .sort_values('n_title')
    .reset_index(drop=True)
)
func_top_10


Out[9]:
pri_func n_post non_std_title n_title
0 technician 1109 84 76
1 assistant 3676 82 79
2 specialist 1411 89 89
3 director 1250 96 91
4 consultant 2779 100 99
5 officer 2650 127 124
6 analyst 4870 138 137
7 executive 7463 220 217
8 engineer 9525 382 364
9 manager 16318 570 545

In [ ]:
# NOTE(review): disabled call; `k` is not a parameter of functionWiseSims as defined in this notebook.
# functionWiseSims(functions=['assistant'], k=5, verbose=True)

In [ ]:
# Rows 3-4 of func_top_10 ('director' and 'consultant' in Out[9]).
functionWiseSims(functions=func_top_10[3:5]['pri_func'], verbose=True)

In [ ]:
# Rows 5-6 of func_top_10 ('officer' and 'analyst' in Out[9]).
functionWiseSims(functions=func_top_10[5:7]['pri_func'], verbose=True)

In [ ]:
# Row 7 of func_top_10 ('executive' in Out[9]). `func_top_10[7]` would be a
# column lookup on the DataFrame (KeyError: there is no column 7), so use a
# one-row slice, which also keeps 'pri_func' an iterable Series of names.
functionWiseSims(functions=func_top_10[7:8]['pri_func'], verbose=True)

In [ ]:
functionWiseSims(functions=['engineer'], verbose=False)

# Read the saved engineer scores back, redraw the plot, overwrite the PDF
# and also show the figure inline.
eng_sims = pd.read_csv(func_dir + 'sims/engineer.csv')
fig = viz(eng_sims)
fname = func_dir + 'fig/engineer_sims.pdf'
plt.savefig(fname)
plt.show()
plt.close()

In [ ]:
# Functions with 2 <= n_title < 5.
func_2_5 = functionInRange(2, 5)
functionWiseSims(func_2_5)

In [ ]:
# Functions with 5 <= n_title < 100.
func_5_100 = functionInRange(5, 100)
functionWiseSims(func_5_100)

In [ ]:
# Functions with at least 100 titles; only listed here, the similarity run is left disabled.
func_100_ = functionInRange(min_n_title=100)
# functionWiseSims(func_100_)