In [16]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp

from my_util import *
from job_rec_helpers import *
In [2]:
def canon(strings):
    # Canonicalize strings: lowercase, then strip surrounding whitespace
    return [s.lower().strip() for s in strings]
In [3]:
# Global vars
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'D:/larc_projects/job_analytics/results/'
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'
In [4]:
apps = pd.read_csv(DATA_DIR + 'full_apps.csv')
In [6]:
# apps['job_title'] = canon(apps['job_title'])
apps.head()
Out[6]:
In [ ]:
# Flag rows whose job title is purely numeric (to be filtered out below)
apps['job_title_is_number'] = apps['job_title'].apply(is_number)
In [ ]:
print('before filter: %d' % apps.shape[0])
tmp = apps.query('job_title_is_number == False')
print('after filter: %d' % tmp.shape[0])
In [ ]:
n_application = apps.shape[0]
n_applicant = apps['uid'].nunique()
n_job = apps['job_id'].nunique()
n_job_title = apps['job_title'].nunique()
n_company = apps['reg_no_uen_ep'].nunique()

stats = pd.DataFrame({'n_application': n_application, 'n_applicant': n_applicant,
                      'n_job': n_job, 'n_job_title': n_job_title,
                      'n_company': n_company}, index=[0])
stats
In [ ]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)
In [7]:
agg_apps = pd.read_csv(AGG_DIR + 'timed_apps.csv')
print(agg_apps.shape)
In [8]:
agg_apps.sort_values('n_apply', ascending=False, inplace=True)
In [9]:
# top 10 extreme cases
agg_apps.head(10)
Out[9]:
Because the number of active days varies across users, we compute the average apply frequency as n_apply divided by n_active_day.
In [10]:
agg_apps['apply_freq'] = agg_apps['n_apply']/agg_apps['n_active_day']
In [14]:
agg_apps['apply_freq'] = np.round(agg_apps['apply_freq'], 2)
agg_apps.sort_values(by='apply_freq', ascending=False, inplace=True)
agg_apps.head()
Out[14]:
In [13]:
quantile(agg_apps['apply_freq'])
Out[13]:
In [15]:
agg_apps.to_csv(AGG_DIR + 'timed_apps.csv', index=False)
There are two ways to build the applicant-job matrix: (1) construct a sparse matrix directly from the (uid, job_title, n_apply) triples, or (2) treat each applicant's applied job titles as a document and count them with CountVectorizer, using the job titles themselves as the vocabulary.
The drawback of the 2nd way is that, because job titles vary in length (# words), CountVectorizer must repeatedly split each document into n-grams of every length up to the longest title, which is time-consuming.
We thus use the 1st way.
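As a toy illustration of the first way (made-up indices, not the real data), placing (user index, item index, count) triples straight into a sparse matrix looks like this:
In [ ]:
# Toy illustration of the triple-based construction (made-up indices, not the real data)
toy_rows = [0, 0, 1]   # internal user indices
toy_cols = [2, 0, 1]   # internal item indices
toy_vals = [3, 1, 5]   # n_apply counts
sp.csr_matrix((toy_vals, (toy_rows, toy_cols)), shape=(2, 3)).toarray()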
In [17]:
user_ids = np.unique(agg_apps['uid'])
index_of_users = {uid: i for i, uid in enumerate(user_ids)}
item_ids = np.unique(agg_apps['job_title'])
index_of_items = {item: i for i, item in enumerate(item_ids)}
In [27]:
n_user = len(index_of_users)
n_item = len(index_of_items)
In [34]:
# Given index_of_users and index_of_items, build the user-item matrix from a data frame
# with columns (user_col, item_col, rating_col) holding (uid, item_id, rating) triples.
def buildUserItemMat(df, user_col='uid', item_col='item_id', rating_col='rating'):
    print('Mapping user ids to internal user indices...')
    row_ind = list(df.apply(lambda r: index_of_users[r[user_col]], axis=1))
    print('Mapping item ids to internal item indices...')
    col_ind = list(df.apply(lambda r: index_of_items[r[item_col]], axis=1))
    ratings = list(df[rating_col])
    n_user, n_item = len(index_of_users), len(index_of_items)
    user_item_mat = sp.csr_matrix((ratings, (row_ind, col_ind)), shape=(n_user, n_item))
    print('User-Item matrix built')
    return user_item_mat
In [33]:
user_apply_job = buildUserItemMat(df=agg_apps, user_col='uid', item_col='job_title', rating_col='n_apply')
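A quick sanity check (a sketch, assuming the column names used above) is to confirm that a single matrix entry matches the aggregated n_apply for its (uid, job_title) pair:
In [ ]:
# Sanity check (sketch): one matrix entry should equal the aggregated n_apply
# for the corresponding (uid, job_title) pair.
r = agg_apps.iloc[0]
i, j = index_of_users[r['uid']], index_of_items[r['job_title']]
user_apply_job[i, j] == r['n_apply']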
In [48]:
from scipy.io import mmwrite
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
In [42]:
df = pd.DataFrame({'uid': list(index_of_users.keys()), 'u_index': list(index_of_users.values())})
df.sort_values('u_index', inplace=True)
df.to_csv(DATA_DIR + 'user_dict.csv', index=False)
In [47]:
# index_of_items.keys()[:3]
df = pd.DataFrame({'job_title': list(index_of_items.keys()), 'item_index': list(index_of_items.values())})
df.sort_values('item_index', inplace=True)
df.to_csv(DATA_DIR + 'item_dict.csv', index=False)
In [ ]:
by_job_title = agg_apps[['job_title', 'n_apply']].groupby('job_title').sum()
by_job_title = by_job_title.add_prefix('total_').reset_index()
In [ ]:
# top-10 popular job titles
by_job_title.sort_values('total_n_apply', ascending=False, inplace=True)
by_job_title.head(10)
In [ ]:
by_user = agg_apps[['uid', 'n_apply']].groupby('uid').sum()
by_user = by_user.add_prefix('total_').reset_index()
In [ ]:
# top-10 hard-working job hunters
by_user.sort_values('total_n_apply', ascending=False, inplace=True)
by_user.head(10)
In [ ]:
by_job_title.head(10).to_csv(RES_DIR + 'top10_job_titles.csv', index=False)
In [ ]:
by_user.head(10).to_csv(RES_DIR + 'top10_job_hunters.csv', index=False)
Quartiles of the number of times an applicant applies for a specific job title:
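The quantile() helper comes from my_util via the wildcard import above; a minimal stand-in, assuming it reports the five-number summary (min, 25%, 50%, 75%, max), could look like:
In [ ]:
# Minimal stand-in for my_util.quantile (assumption: it reports the five-number summary)
def quantile_sketch(x):
    return pd.Series(np.percentile(x, [0, 25, 50, 75, 100]),
                     index=['min', '25%', '50%', '75%', 'max'])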
In [ ]:
quantile(agg_apps['n_apply'])
As expected, in most cases (at least 50%), an applicant applies just once for a given job title. However, there is at least one extreme case in which an applicant applied 582 times for a single job title. Let's therefore look more closely at the distribution of $N_{apply}$.
In [ ]:
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)
plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
# plt.savefig(DATA_DIR + 'apply_freq.pdf')
plt.show()
plt.close()
To get a more complete picture of these extreme cases, let's examine them directly:
In [ ]:
extremes = pd.read_csv(RES_DIR + 'extremes.csv')
print('No. of extreme cases: {}'.format(extremes.shape[0]))
extremes.head(3)
In [ ]:
quantile(extremes['n_active_day'])
In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
In [ ]:
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(apps_by_job_title['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
loglog(apps_by_job_title['n_job'], xl='# Jobs applied', yl='# applicants')
# plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
In [ ]:
apps_by_comp = pd.read_csv(AGG_DIR + 'apps_by_comp.csv')
apps_by_comp.shape
In [ ]:
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')
# plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()
In [ ]:
apps_by_job_comp = pd.read_csv(AGG_DIR + 'apps_by_job_comp.csv')
apps_by_job_comp.shape
In [ ]:
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
# plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()
In [ ]:
job_comp = apps[['job_title', 'organisation_name_ep']].drop_duplicates()
print('No. of job-company pairs: {}'.format(job_comp.shape[0]))
In [ ]:
def getRecords(uids, df):
    # Return the rows of df belonging to the given user ids
    return df[df['uid'].isin(uids)]
In [ ]:
print('No. of applicants: {}'.format(n_applicant))
print('No. of job titles: {}'.format(n_job_title))
Thus the dimensions of the applicant-apply-job matrix should be 68144 $\times$ 5794.
In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
# sanity check
print(apps_by_job_title.shape[0] == n_applicant)
apps_by_job_title.head()
In [ ]:
import sklearn.feature_extraction.text as text_manip
import scipy.sparse as sp
In [ ]:
docs = apps_by_job_title['job_titles']
job_titles = apps['job_title'].unique()
job_title_len = [n_word(t) for t in job_titles]
max_len = max(job_title_len)
print('max no. of words in a job title: {}'.format(max_len))
In [ ]:
quantile(job_title_len)
In [ ]:
plt.hist(job_title_len, bins=np.unique(job_title_len))
plt.xlabel('# words in job title')
plt.ylabel('# job titles')
plt.show()
In [ ]:
# 2nd way: each applicant's concatenated job titles form a document; the vocabulary is
# the set of job titles, so the n-gram range must cover multi-word titles.
count_vec = text_manip.CountVectorizer(vocabulary=job_titles, ngram_range=(1, 6))
In [ ]:
t0 = time()
print('Building applicant-apply-job matrix...')
user_apply_job = count_vec.fit_transform(docs)
print('Done after {}s'.format(time()-t0))
In [ ]:
# sparsity of applicant-apply-job
float(user_apply_job.nnz)/(n_applicant * n_job_title)
In [ ]:
nrow, ncol = user_apply_job.shape
print('Dimension of applicant-apply-job matrix: {} x {}'.format(nrow, ncol))
In [ ]:
feats = count_vec.get_feature_names()
# sum([1 for j in first_user_job_titles if j in feats])
In [ ]:
from scipy.io import mmwrite
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
In [ ]:
# first_user_job_titles = docs[0].split(',')
# n_job_in_vocab = sum([1 for j in first_user_job_titles if j in vocab])
# print('Total # jobs: %d' %len(first_user_job_titles))
# print('# jobs in vocab: %d' %n_job_in_vocab)
# all(j in vocab for j in first_user_job_titles)
In [ ]:
quantile(apps_by_job_comp['n_apply'])
In [ ]:
apps_by_job_comp.rename(columns={'organisation_name_ep': 'employer_name', 'reg_no_uen_ep': 'employer_id'}, inplace=True)
apps_by_job_comp.query('n_apply >= 50')
In [ ]:
apps.query('uid == 103204').query('job_title == "analyst"').query('reg_no_uen_ep=="196800306E"').to_csv(RES_DIR + 'tmp.csv')
In [ ]:
apps_by_job_comp['job_employer'] = apps_by_job_comp['job_title'] + ' at ' + apps_by_job_comp['employer_name']
apps_by_job_comp.head()
In [ ]:
uniq_job_employers = np.unique(apps_by_job_comp['job_employer'])
len(uniq_job_employers)
In [ ]:
users = np.unique(apps['uid'])
len(users)
In [ ]:
job_employer_idx = {je: i for i, je in enumerate(uniq_job_employers)}
index_of_users = {uid: i for i, uid in enumerate(users)}
In [ ]:
apps_by_job_comp.apply(putTriple, axis=1)
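putTriple is defined in job_rec_helpers and is not shown here; a plausible stand-in, assuming it collects (user index, job-employer index, n_apply) triples for a user-apply-job-at-company matrix analogous to user_apply_job, might look like:
In [ ]:
# Hypothetical stand-in for job_rec_helpers.putTriple (assumption: it collects
# (user index, job-employer index, n_apply) triples for a later sparse-matrix build).
rows, cols, vals = [], [], []
def putTriple(r):
    rows.append(index_of_users[r['uid']])
    cols.append(job_employer_idx[r['job_employer']])
    vals.append(r['n_apply'])
# The triples could then be assembled just like user_apply_job:
# sp.csr_matrix((vals, (rows, cols)), shape=(len(index_of_users), len(job_employer_idx)))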