In [23]:
import job_rec_helpers

from time import time
from my_util import *
from job_rec_helpers import *

In [44]:
# Development convenience: re-execute the helper module (presumably after it
# was edited on disk), then repeat the star import so the names already bound
# in this session point at the reloaded definitions.
reload(job_rec_helpers)
from job_rec_helpers import *

In [2]:
# Directory constants shared by the cells below (absolute paths, each ending
# with a trailing slash so file names can simply be appended).
RES_DIR = 'd:/larc_projects/job_analytics/results/'
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
# Both output sub-folders hang off the results root.
AGG_DIR, FIG_DIR = RES_DIR + 'agg/', RES_DIR + 'figs/'

In [ ]:
# Load the full application log from DATA_DIR into a DataFrame.
# `pd` is not imported explicitly in this notebook; presumably it is provided
# by one of the star imports at the top -- confirm.
apps = pd.read_csv(DATA_DIR + 'full_apps.csv')

In [10]:
# Sanity-check the load: print (n_rows, n_cols), then preview the first rows.
# print(...) with a single argument produces the same output under the
# Python 2 print statement and the Python 3 print function, and matches the
# print(...) calls already used further down in this notebook.
print(apps.shape)
apps.head()


(1506897, 12)
Out[10]:
uid job_id job_title apply_date reg_no_uen_ep employer_creation_date organisation_name_ep ssic_code_ep ssic_description_ep ssic_group_ep third_party_entity_ep job_title_is_number
0 7 JOB-2015-0223128 housekeeping supervisor 2015-07-01 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y False
1 21073 JOB-2015-0223128 housekeeping supervisor 2015-05-07 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y False
2 46634 JOB-2015-0223128 housekeeping supervisor 2015-05-01 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y False
3 100427 JOB-2015-0223128 housekeeping supervisor 2015-07-24 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y False
4 39 JOB-2014-0134411 account assistant 2015-06-16 200203771R Jul 5, 2014 THE SHICHIDA METHOD (S) PTE. LTD. 82999 Other business support services activities nec... Administrative and Support Service Activities N False

In [11]:
# Keep only the rows whose `job_title_is_number` flag is False, then report
# the resulting shape. In the recorded run the shape is unchanged from the
# load, i.e. no rows were dropped by this filter.
# print(...) with a single argument is used so the cell produces the same
# output under Python 2 and Python 3, consistent with the other print(...)
# calls in this notebook.
apps = apps.query('job_title_is_number == False')
print(apps.shape)


(1506897, 12)

Applicant-apply-Job matrix

  • Jobs are considered at job title level.
  • Each entry $e_{u,j}$ of the matrix is either the total number of times applicant $u$ applied for job title $j$ or the corresponding application frequency.

Here we are using the total number of times instead of frequency.


In [32]:
# Build the id -> internal-index lookups used for the matrix: users are keyed
# by the `uid` column and items by the `job_title` column of `apps`.
# NOTE: `index_of_items` is rebound later in this notebook to an index over
# the `job_emp` column, so this job-title-level index is not kept afterwards.
index_of_users = mkUserIndex(df=apps, user_col='uid')
index_of_items = mkItemIndex(df=apps, item_col='job_title')

In [16]:
# Report the number of distinct applicants and distinct job titles in `apps`.
# The counts are computed directly from the DataFrame because `user_ids` and
# `item_ids` are not defined anywhere in this notebook, so referencing them
# raises NameError on a fresh kernel (Restart & Run All).
# Series.nunique() ignores missing values.
print('# users: %d' % apps['uid'].nunique())
print('# job titles: %d' % apps['job_title'].nunique())


# users: 68144
# job titles: 5794

In [18]:
# Load the previously saved applicant x job-title matrix from a Matrix Market
# (.mtx) file; `mmread` comes from the scipy.io star import.
# NOTE(review): the recorded output below reports 5829 columns, while the
# job-title count printed earlier is 5794 -- confirm that this saved matrix
# was built from the same item index as the one currently in use.
from scipy.io import *
user_apply_job = mmread(DATA_DIR + 'user_apply_job.mtx')

In [19]:
# Summarise the loaded matrix; per the recorded output the helper prints its
# dimensions, the number of non-zero entries and the largest entry.
printInfo(user_apply_job)


Dims of user-apply-job matrix: (68144, 5829)
# non-zero entries: 775480
Max entry: 582

Applicant-apply-(Job, Employer) matrix


In [20]:
# Load the pre-aggregated application table from AGG_DIR and show its shape.
# Per the preview further down, each row holds a uid, a job title, an
# employer (reg no + name) and an `n_apply` count.
apps_by_job_emp = pd.read_csv(AGG_DIR + 'apps_by_job_emp.csv')
apps_by_job_emp.shape


Out[20]:
(1352961, 5)

In [28]:
# Derive a combined item label of the form "<job title> at <employer name>"
# so that each (job title, employer) pair can be treated as a single item.
apps_by_job_emp['job_emp'] = (
    apps_by_job_emp['job_title']
    + ' at '
    + apps_by_job_emp['organisation_name_ep']
)
# Preview the table with the new column.
apps_by_job_emp.head()


Out[28]:
uid job_title reg_no_uen_ep organisation_name_ep n_apply job_emp
0 103204 Analyst 196800306E DBS BANK LTD. 132 Analyst at DBS BANK LTD.
1 103204 Information Technology Specialist 196800306E DBS BANK LTD. 90 Information Technology Specialist at DBS BANK ...
2 112664 Research Assistant 200604346E NATIONAL UNIVERSITY OF SINGAPORE 90 Research Assistant at NATIONAL UNIVERSITY OF S...
3 108289 Call Centre Agent 199907051E CREDIT MANAGEMENT CONSULTANCY (ASIA) PTE LTD 72 Call Centre Agent at CREDIT MANAGEMENT CONSULT...
4 76182 Information Technology Specialist 196800306E DBS BANK LTD. 64 Information Technology Specialist at DBS BANK ...

In [33]:
# Re-create the item index over the combined `job_emp` labels. This rebinds
# `index_of_items`, replacing the job-title-level index built earlier, so any
# later cell using `index_of_items` now sees (job, employer) items.
index_of_items = mkItemIndex(df=apps_by_job_emp, item_col='job_emp')

In [45]:
# Build the applicant x (job, employer) matrix from the aggregated table,
# using `uid` for rows, `job_emp` for columns and `n_apply` as the entry value.
# The recorded run reports 68144 users and 89071 items in the indices.
# NOTE(review): the return type of buildUserItemMat is not visible here --
# presumably a sparse matrix; confirm against job_rec_helpers.
user_apply_job_emp = buildUserItemMat(df=apps_by_job_emp, 
                                      index_of_users=index_of_users, index_of_items=index_of_items, 
                                      user_col='uid', item_col='job_emp', rating_col='n_apply')


# users in index: 68144
# items in index: 89071
Mapping user ids to internal user indices...
Mapping item ids to internal item indices...
User-Item matrix built

In [ ]: