In [16]:
from time import time
from my_util import *
from job_rec_helpers import *

# Explicit imports for the libraries used below (they may also be pulled in
# by the wildcard imports above)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp

In [2]:
def canon(strings):
    # Canonicalize strings: lowercase, then strip surrounding whitespace
    return [s.lower().strip() for s in strings]

In [3]:
# Global vars
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'd:/larc_projects/job_analytics/results/'
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'

In [4]:
apps = pd.read_csv(DATA_DIR + 'full_apps.csv')

In [6]:
# apps['job_title'] = canon(apps['job_title'])
apps.head()


Out[6]:
|   | uid    | job_id           | job_title               | apply_date | reg_no_uen_ep | employer_creation_date | organisation_name_ep              | ssic_code_ep | ssic_description_ep                                | ssic_group_ep                                  | third_party_entity_ep | job_title_is_number |
|---|--------|------------------|-------------------------|------------|---------------|------------------------|-----------------------------------|--------------|----------------------------------------------------|------------------------------------------------|-----------------------|---------------------|
| 0 | 7      | JOB-2015-0223128 | housekeeping supervisor | 2015-07-01 | 52865867X     | Jul 8, 2014            | THE FULLERTON HOTEL               | 55101        | Hotels with restaurant                             | Accommodation and Food Service Activities      | Y                     | False               |
| 1 | 21073  | JOB-2015-0223128 | housekeeping supervisor | 2015-05-07 | 52865867X     | Jul 8, 2014            | THE FULLERTON HOTEL               | 55101        | Hotels with restaurant                             | Accommodation and Food Service Activities      | Y                     | False               |
| 2 | 46634  | JOB-2015-0223128 | housekeeping supervisor | 2015-05-01 | 52865867X     | Jul 8, 2014            | THE FULLERTON HOTEL               | 55101        | Hotels with restaurant                             | Accommodation and Food Service Activities      | Y                     | False               |
| 3 | 100427 | JOB-2015-0223128 | housekeeping supervisor | 2015-07-24 | 52865867X     | Jul 8, 2014            | THE FULLERTON HOTEL               | 55101        | Hotels with restaurant                             | Accommodation and Food Service Activities      | Y                     | False               |
| 4 | 39     | JOB-2014-0134411 | account assistant       | 2015-06-16 | 200203771R    | Jul 5, 2014            | THE SHICHIDA METHOD (S) PTE. LTD. | 82999        | Other business support services activities nec... | Administrative and Support Service Activities  | N                     | False               |

In [ ]:
apps['job_title_is_number'] = apps['job_title'].apply(is_number)
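Here is_number comes from the wildcard imports above; a plausible stand-in (hypothetical, assuming it simply tests whether a job title parses as a number) would be:

In [ ]:
def is_number(s):
    # Treat a title as numeric if the whole string parses as a float
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        return False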

In [ ]:
print('before filter: %d' % apps.shape[0])
tmp = apps.query('job_title_is_number == False')
print('after filter: %d' % tmp.shape[0])

Basic statistics


In [ ]:
n_application = apps.shape[0]
n_applicant = apps['uid'].nunique()
n_job = apps['job_id'].nunique()
n_job_title = apps['job_title'].nunique()
n_company = apps['reg_no_uen_ep'].nunique()

stats = pd.DataFrame({'n_application': n_application, 'n_applicant': n_applicant,
                      'n_job': n_job, 'n_job_title': n_job_title,
                      'n_company': n_company}, index=[0])
stats

In [ ]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)

Applicant-apply-Job matrix

A. Number of times an applicant applies for a specific job title (position).
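The aggregate loaded below comes from a precomputed file. For reference, a minimal sketch of how such a table could be derived from apps (hypothetical; the actual timed_apps.csv was produced elsewhere and may define total_duration_in_day differently):

In [ ]:
tmp = apps.assign(apply_date=pd.to_datetime(apps['apply_date']))
agg = (tmp.groupby(['uid', 'job_title'])['apply_date']
          .agg([('n_apply', 'count'),
                ('first_apply_date', 'min'),
                ('last_apply_date', 'max'),
                ('n_active_day', 'nunique')])
          .reset_index())
# Calendar span between the first and last application of each (applicant, job title) pair
agg['total_duration_in_day'] = (agg['last_apply_date'] - agg['first_apply_date']).dt.days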


In [7]:
agg_apps = pd.read_csv(AGG_DIR + 'timed_apps.csv')
print(agg_apps.shape)


(775693, 7)

In [8]:
agg_apps.sort_values('n_apply', ascending=False, inplace=True)

In [9]:
# top 10 extreme cases
agg_apps.head(10)


Out[9]:
|        | uid    | job_title                | n_apply | first_apply_date | last_apply_date | n_active_day | total_duration_in_day |
|--------|--------|--------------------------|---------|------------------|-----------------|--------------|-----------------------|
| 134055 | 33833  | Accounts Executive       | 582     | 2015-01-05       | 2015-09-30      | 133          | 268                   |
| 134072 | 33833  | Accountant               | 534     | 2015-01-03       | 2015-09-30      | 133          | 270                   |
| 570708 | 108289 | Accounts Assistant       | 502     | 2015-01-21       | 2015-06-29      | 103          | 159                   |
| 570703 | 108289 | Accounts Executive       | 430     | 2015-01-21       | 2015-06-29      | 99           | 159                   |
| 116932 | 30771  | Project Manager          | 408     | 2015-04-20       | 2015-09-20      | 41           | 153                   |
| 538682 | 103204 | Analyst                  | 398     | 2015-01-15       | 2015-07-16      | 84           | 182                   |
| 356522 | 71838  | Business Analyst         | 387     | 2015-01-05       | 2015-09-21      | 86           | 259                   |
| 538795 | 103204 | Human Resource Executive | 374     | 2015-01-02       | 2015-07-18      | 139          | 197                   |
| 514460 | 99370  | Human Resource Executive | 336     | 2015-01-12       | 2015-09-10      | 85           | 241                   |
| 553234 | 105636 | Accountant               | 333     | 2015-01-14       | 2015-09-04      | 81           | 233                   |

Since the number of active days varies across applicants, we compute the average apply frequency by dividing n_apply by n_active_day.


In [10]:
agg_apps['apply_freq'] = agg_apps['n_apply']/agg_apps['n_active_day']

In [14]:
agg_apps['apply_freq'] = np.round(agg_apps['apply_freq'], 2)
agg_apps.sort_values(by='apply_freq', ascending=False, inplace=True)
agg_apps.head()


Out[14]:
|        | uid    | job_title              | n_apply | first_apply_date | last_apply_date | n_active_day | total_duration_in_day | apply_freq |
|--------|--------|------------------------|---------|------------------|-----------------|--------------|-----------------------|------------|
| 740410 | 136703 | Accountant             | 109     | 2015-06-22       | 2015-06-24      | 2            | 2                     | 54.50      |
| 740360 | 136703 | Project Engineer       | 54      | 2015-06-25       | 2015-06-25      | 1            | 0                     | 54.00      |
| 668807 | 124950 | Recruitment Consultant | 52      | 2015-09-28       | 2015-09-28      | 1            | 0                     | 52.00      |
| 740406 | 136703 | Accounts Executive     | 151     | 2015-06-15       | 2015-06-24      | 3            | 9                     | 50.33      |
| 740398 | 136703 | Accounts Assistant     | 145     | 2015-06-15       | 2015-06-24      | 3            | 9                     | 48.33      |
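The quantile() helper used next is one of the wildcard imports; a hypothetical stand-in matching the layout of Out[13] below would be:

In [ ]:
def quantile(series):
    # Five-number summary in the column layout used throughout this notebook
    s = pd.Series(series)
    return pd.DataFrame([[s.min(), s.quantile(.25), s.median(), s.quantile(.75), s.max()]],
                        columns=['min', '25%', '50% (median)', '75%', 'max'])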

In [13]:
quantile(agg_apps['apply_freq'])


Out[13]:
|   | min | 25% | 50% (median) | 75% | max  |
|---|-----|-----|--------------|-----|------|
| 0 | 1.0 | 1.0 | 1.0          | 1.0 | 54.5 |

In [15]:
agg_apps.to_csv(AGG_DIR + 'timed_apps.csv', index=False)

E. Build applicant-apply-job matrix

  • Jobs are considered at job title level.
  • Each entry $e_{u,j}$ of the matrix is the number of times (frequency) applicant $u$ applied for job title $j$.

This is similar to building the user-item matrix.

There are two ways to build the matrix:

  • define a function that converts the data frame agg_apps to matrix form;
  • use CountVectorizer() from the module sklearn.feature_extraction.text, treating each user's application history as a document and each job title he applied for as a word in that document.

The drawback of the 2nd way is that, because job titles differ in length (number of words), CountVectorizer() must repeatedly split each document into n-grams to cover them all, which is time-consuming.

We therefore use the 1st way.

Map user ids and item ids into internal user and item indices

The indices will be used to build user-item matrix later.


In [17]:
user_ids = np.unique(agg_apps['uid'])
index_of_users = {uid: i for i, uid in enumerate(user_ids)}

item_ids = np.unique(agg_apps['job_title'])
index_of_items = {item: i for i, item in enumerate(item_ids)}

In [27]:
n_user = len(index_of_users)
n_item = len(index_of_items)

In [34]:
# Given index_of_users and index_of_items, build a user-item matrix from a df
# with columns (user_col, item_col, rating_col) containing triples (uid, item_id, rating)

def buildUserItemMat(df, user_col='uid', item_col='item_id', rating_col='rating'):
    print('Mapping user ids to internal user indices...')
    row_ind = list(df[user_col].map(index_of_users))
    print('Mapping item ids to internal item indices...')
    col_ind = list(df[item_col].map(index_of_items))
    ratings = list(df[rating_col])

    n_user, n_item = len(index_of_users), len(index_of_items)
    user_item_mat = sp.csr_matrix((ratings, (row_ind, col_ind)), shape=(n_user, n_item))
    print('User-Item matrix built')
    return user_item_mat
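Note that scipy's csr_matrix((data, (row, col))) constructor sums duplicate (row, col) entries; agg_apps already has one row per (uid, job_title) pair, so no unintended aggregation happens here.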

In [33]:
user_apply_job = buildUserItemMat(df=agg_apps, user_col='uid', item_col='job_title', rating_col='n_apply')


Mapping user ids to internal user indices...
Mapping item ids to internal item indices...
User-Item matrix built

In [48]:
from scipy.io import mmwrite

mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)

In [42]:
df = pd.DataFrame(list(index_of_users.items()), columns=['uid', 'u_index'])
df.sort_values('u_index', inplace=True)
df.to_csv(DATA_DIR + 'user_dict.csv', index=False)

In [47]:
df = pd.DataFrame(list(index_of_items.items()), columns=['job_title', 'item_index'])
df.sort_values('item_index', inplace=True)
df.to_csv(DATA_DIR + 'item_dict.csv', index=False)

In [ ]:
by_job_title = agg_apps[['job_title', 'n_apply']].groupby('job_title').sum()
by_job_title = by_job_title.add_prefix('total_').reset_index()

In [ ]:
# top-10 popular job titles
by_job_title.sort_values('total_n_apply', ascending=False, inplace=True)
by_job_title.head(10)

In [ ]:
by_user = agg_apps[['uid', 'n_apply']].groupby('uid').sum()
by_user = by_user.add_prefix('total_').reset_index()

In [ ]:
# top-10 hardest-working job hunters
by_user.sort_values('total_n_apply', ascending=False, inplace=True)
by_user.head(10)

In [ ]:
by_job_title.head(10).to_csv(RES_DIR + 'top10_job_titles.csv', index=False)

In [ ]:
by_user.head(10).to_csv(RES_DIR + 'top10_job_hunters.csv', index=False)

Quartiles of the number of times an applicant applies for a specific job:


In [ ]:
quantile(agg_apps['n_apply'])

As expected, in most cases (at least 50%), an applicant applies just once for a specific job title. However, there is at least one extreme case where an applicant applied 582 times for a single job title. Let's therefore look more closely at the distribution of $N_{apply}$.


In [ ]:
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)

plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
# plt.savefig(DATA_DIR + 'apply_freq.pdf')

plt.show()
plt.close()

From the histogram, we can see cases where a user applied for a single job title at least 100 times. Let's take a closer look at these extreme cases.

Extreme cases

To get a more complete picture of these extreme cases, let's look at:

  • No. of active days: already aggregated;
  • companies: the employers they applied to (a hypothetical derivation of extremes.csv is sketched below).
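A hypothetical derivation of extremes.csv (the threshold of 100 applications follows the histogram discussion above; the actual file may use different criteria):

In [ ]:
# Keep (applicant, job title) pairs with an unusually high number of applications
extremes = agg_apps.query('n_apply >= 100')
extremes.to_csv(RES_DIR + 'extremes.csv', index=False)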

In [ ]:
extremes = pd.read_csv(RES_DIR + 'extremes.csv')
print('No. of extreme cases: {}'.format(extremes.shape[0]))
extremes.head(3)

In [ ]:
quantile(extremes['n_active_day'])

B. Number of different job titles an applicant applies for


In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')

In [ ]:
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(apps_by_job_title['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)

plt.subplot(1,2,2)
loglog(apps_by_job_title['n_job'], xl='# Jobs applied', yl='# applicants')

# plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
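The loglog() helper is also wildcard-imported; judging from its usage, a hypothetical stand-in could plot the count distribution on log-log axes:

In [ ]:
def loglog(series, xl='', yl=''):
    # Number of occurrences of each value, with both axes on log scale
    counts = pd.Series(series).value_counts()
    plt.loglog(counts.index, counts.values, marker='.', linestyle='None')
    plt.xlabel(xl)
    plt.ylabel(yl)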

C. Number of companies an applicant applies to


In [ ]:
apps_by_comp = pd.read_csv(AGG_DIR + 'apps_by_comp.csv')
apps_by_comp.shape

In [ ]:
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')

# plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()

D. Number of (job title, company) pairs an applicant applies for


In [ ]:
apps_by_job_comp = pd.read_csv(AGG_DIR + 'apps_by_job_comp.csv')
apps_by_job_comp.shape

In [ ]:
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
# plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()

In [ ]:
job_comp = apps[['job_title', 'organisation_name_ep']].drop_duplicates()
print('No. of job-company pairs: {}'.format(job_comp.shape[0]))

In [ ]:
def getRecords(uids, df):
    # Return the application records of the given users
    return df[df['uid'].isin(uids)]

In [ ]:
print('No. of applicants: {}'.format(n_applicant))
print('No. of job titles: {}'.format(n_job_title))

Thus the dimensions of the applicant-apply-job matrix should be 68144 $\times$ 5794.


In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
# sanity check
print(apps_by_job_title.shape[0] == n_applicant)
apps_by_job_title.head()

In [ ]:
import sklearn.feature_extraction.text as text_manip
import scipy.sparse as sp

In [ ]:
docs = apps_by_job_title['job_titles']
job_titles = apps['job_title'].unique()

# Materialize the word counts as a list so they can be reused below
job_title_len = [n_word(t) for t in job_titles]
max_len = max(job_title_len)
print('max no. of words in a job title: {}'.format(max_len))
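n_word is another wildcard-imported helper; presumably it counts whitespace-separated words, e.g.:

In [ ]:
def n_word(s):
    # Number of whitespace-separated words in a string
    return len(s.split())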

In [ ]:
quantile(job_title_len)

In [ ]:
plt.hist(job_title_len, bins=np.unique(job_title_len))
plt.xlabel('# words in job title')
plt.ylabel('# job titles')

plt.show()

In [ ]:
# ngram_range spans 1..6 words, presumably enough to cover the longest job titles in the vocabulary
count_vec = text_manip.CountVectorizer(vocabulary=job_titles, ngram_range=(1,6))

In [ ]:
t0 = time()
print('Building applicant-apply-job matrix...')
user_apply_job = count_vec.fit_transform(docs)
print('Done after {}s'.format(time()-t0))

In [ ]:
# sparsity of applicant-apply-job
float(user_apply_job.nnz)/(n_applicant * n_job_title)

In [ ]:
nrow, ncol = user_apply_job.shape[0], user_apply_job.shape[1]
print('Dimension of applicant-apply-job matrix: {} x {}'.format(nrow, ncol))

In [ ]:
feats = count_vec.get_feature_names()
# sum([1 for j in first_user_job_titles if j in feats])

In [ ]:
from scipy.io import mmwrite

mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)

In [ ]:
# first_user_job_titles = docs[0].split(',')

# n_job_in_vocab = sum([1 for j in first_user_job_titles if j in vocab])
# print('Total # jobs: %d' %len(first_user_job_titles))
# print('# jobs in vocab: %d' %n_job_in_vocab)
# all(j in vocab for j in first_user_job_titles)

F. Build applicant-apply-(job, employer) matrix


In [ ]:
quantile(apps_by_job_comp['n_apply'])

In [ ]:
apps_by_job_comp.rename(columns={'organisation_name_ep': 'employer_name', 'reg_no_uen_ep': 'employer_id'}, inplace=True)
apps_by_job_comp.query('n_apply >= 50')

In [ ]:
apps.query('uid == 103204 and job_title == "analyst" and reg_no_uen_ep == "196800306E"').to_csv(RES_DIR + 'tmp.csv')

In [ ]:
apps_by_job_comp['job_employer'] = apps_by_job_comp['job_title'] + ' at ' + apps_by_job_comp['employer_name']
apps_by_job_comp.head()

In [ ]:
uniq_job_employers = np.unique(apps_by_job_comp['job_employer'])
len(uniq_job_employers)

In [ ]:
users = np.unique(apps['uid'])
len(users)

In [ ]:
job_employer_idx = {je: i for i, je in enumerate(uniq_job_employers)}
index_of_users = {uid: i for i, uid in enumerate(users)}

In [ ]:
apps_by_job_comp.apply(putTriple, axis=1)  # putTriple comes from the wildcard imports above
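putTriple is not defined in this notebook. A hedged alternative that reuses buildUserItemMat, with (job title, employer) pairs as the items:

In [ ]:
# Sketch only: rebind the global item index to (job title, employer) pairs,
# then reuse buildUserItemMat; not necessarily the original approach.
index_of_items = job_employer_idx
user_apply_job_comp = buildUserItemMat(df=apps_by_job_comp, user_col='uid',
                                       item_col='job_employer', rating_col='n_apply')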