In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from time import time

from my_util import *
from job_rec_helpers import *
  • HELPERS:

In [337]:
# Global vars
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'd:/larc_projects/job_analytics/results/'
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'

In [ ]:
apps = pd.read_csv(DATA_DIR + 'apps_with_time.csv')
apps.shape

In [183]:
# Rm noise (numbers) in job_title column
apps['is_number'] = map(is_number, apps['job_title'])
apps = apps.query('is_number == False')

In [198]:
# del apps['user.id']; del apps['user.index']; del apps['item.index']; del apps['freq']
apps.rename(columns={'job.id': 'job_id', 'job.title': 'job_title', 'apply.date': 'apply_date'}, 
            inplace=True)

apps.to_csv(DATA_DIR + 'apps_with_time.csv', index=False)
apps.head(3)


Out[198]:
uid job_id job_title apply_date is_number
0 7 JOB-2015-0223128 Housekeeping Supervisor 2015-07-01 False
1 39 JOB-2014-0134411 Account Assistant 2015-06-16 False
2 39 JOB-2015-0179225 Audit Assistant 2015-06-16 False

Basic statistics


In [267]:
n_applicant = apps['uid'].nunique(); n_application = apps.shape[0]
n_job = len(np.unique(apps['job_id'])); n_job_title = len(np.unique(apps['job_title']))
n_company = posts['company_registration_number_uen_ep'].nunique()

In [268]:
stats = pd.DataFrame({'n_application': n_application, 'n_applicant': n_applicant, 
                      'n_job': n_job, 'n_job_title': n_job_title, 'n_company': n_company}, index=[0])
stats


Out[268]:
n_applicant n_application n_company n_job n_job_title
0 68350 1531701 15203 177307 5829

In [269]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)

Applicant-apply-Job matrix

A. Number of times an applicant applies a specific job title (position).


In [199]:
agg_apps = apps.groupby(by=['uid', 'job_title']).agg({'job_id': 'nunique', 'apply_date': 'nunique'})

# convert to DF
agg_apps = agg_apps.add_prefix('n_').reset_index()
agg_apps['n_apply'] = agg_apps['n_job_id']
agg_apps.head(3)


Out[199]:
uid job_title n_job_id n_apply_date n_apply
0 7 Housekeeping Supervisor 1 1 1
1 39 Account Assistant 3 2 3
2 39 Accounts Assistant 6 3 6

Let's look at the quartiles of the number of times an applicant applies for a specific job.


In [103]:
quantile(df['n_apply'])


Out[103]:
min 25% 50% (median) 75% max
0 1.0 1.0 1.0 2.0 582.0

As expected, for most of the cases (50%), an applicant applies just once for a specific job. However, we can also see at least 1 extreme case where an applicant applies 582 times for just a job title. Thus, let's look more closely at the distribution of $N_{apply}$.


In [ ]:
plt.hist(df['n_apply'], bins=np.unique(df['n_apply']), log=True)

plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
plt.savefig(DATA_DIR + 'apply_freq.pdf')

plt.show()
plt.close()

From the histogram, we can see that there are cases when a user applies for a job titles at least 100 times. Let's look closer at those extreme cases.

Extreme cases (a user applies the same job title at least 100 times)


In [218]:
extremes = agg_apps.query('n_apply >= 100')
extremes.sort_values(by='n_apply', ascending=False, inplace=True)
extremes.head()


C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[218]:
uid job_title n_job_id n_apply_date n_apply
133988 33833 Accounts Executive 582 133 582
133960 33833 Accountant 534 133 534
570379 108289 Accounts Assistant 502 103 502
570385 108289 Accounts Executive 430 99 430
117092 30771 Project Manager 408 41 408

In [105]:
print('No. of extreme cases: {}'.format(extremes.shape[0]))


No. of extreme cases: 220

To get a more complete picture on these extreme cases, let's put in apply dates and companies of those jobs.

  • Get dates and compute duration of extreme applications:

In [84]:
# ext_users = np.unique(extremes['uid'])
df = apps[apps['uid'].isin(extremes['uid'])]
df = df[df['job_title'].isin(extremes['job_title'])]
ext_apps = df

ext_apps.head(1)


Out[84]:
uid job_id job_title apply_date
4788 1513 JOB-2014-0097548 Quantity Surveyor 2015-04-15

In [225]:
res = calDuration(ext_apps)

res = pd.merge(res, extremes, left_index=True, right_on=['uid', 'job_title'])
res.sort_values(by='uid', inplace=True)
res = res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]


Finished grouping by user-job cases. Totally we have 1877 cases.
Calculating duration of application for each case...
Done after: 0.40700006485s

In [226]:
res.sort_values('n_apply', ascending=False, inplace=True)
res.head()


Out[226]:
uid job_title n_apply first_apply_date last_apply_date n_active_day total_duration_in_day
133988 33833 Accounts Executive 582 2015-01-05 2015-09-30 133 268
133960 33833 Accountant 534 2015-01-03 2015-09-30 133 270
570379 108289 Accounts Assistant 502 2015-01-21 2015-06-29 103 159
570385 108289 Accounts Executive 430 2015-01-21 2015-06-29 99 159
117092 30771 Project Manager 408 2015-04-20 2015-09-20 41 153

In [228]:
res.tail()


Out[228]:
uid job_title n_apply first_apply_date last_apply_date n_active_day total_duration_in_day
553082 105624 Network Engineer/Manager 102 2015-04-08 2015-06-23 17 76
47954 17000 Safety Coordinator 101 2015-01-01 2015-09-30 44 272
646235 121057 Human Resource Executive 101 2015-01-08 2015-06-25 30 168
134052 33833 Finance Executive 100 2015-01-14 2015-09-30 60 259
433860 85126 Marketing Executive 100 2015-01-21 2015-05-12 44 111

In [239]:
tmp = apps.query('uid==33833')[['uid', 'job_title', 'job_id']] .groupby('job_title').agg({'job_id': 'nunique'})
tmp = tmp.add_prefix('n_').reset_index()
tmp.rename(columns={'n_job_id': 'n_apply'}, inplace=True)
tmp.sort_values('n_apply', ascending=False, inplace=True)


Out[239]:
job_title n_apply
38 Accounts Executive 582
10 Accountant 534
5 Account Executive 282
103 Finance Manager 213
32 Accounts Assistant 103
102 Finance Executive 100
49 Administrative Accounts Executive 77
40 Accounts Officer 68
30 Accounts Administrator 55
21 Accounting Executive 47
1 Account Assistant 47
63 Assistant Finance Manager 46
146 Senior Accountant (General) 44
23 Accounting Manager (Finance Department) 36
39 Accounts Manager 35
22 Accounting Manager 34
18 Accounting Associate Professional (eg Assistan... 25
33 Accounts Associate 20
61 Assistant Accountant 19
145 Senior Account Executive 19
104 Finance Officer 18
24 Accounting Officer 17
168 Senior Finance Manager 17
167 Senior Finance Executive 16
107 Financial Accountant 16
74 Category Finance Manager 16
14 Accountant, Company 15
110 Financial Controller 15
29 Accounts Administrative Manager 13
17 Accounting Assistant 11
... ... ...
19 Accounting Clerk 1
16 Accounting Analyst 1
9 Account Support Executive 1
7 Account Management Executive 1
4 Account Development Manager 1
3 Account Development Executive 1
80 Company Secretary (Executive) 1
84 Corporate Secretarial Executive 1
85 Credit Administrative Assistant 1
87 Credit Control Clerk 1
128 Operations Head 1
127 Operations Executive 1
126 Operations Accountant 1
125 Manager, Administration 1
123 Management Accountant 1
121 Learning Support Assistant 1
120 Key Account Executive 1
119 Human Resource Specialist 1
118 Human Resource Manager 1
115 Human Resource & Administrative Manager 1
113 General Ledger Bookkeeper 1
112 Fund Accountant 1
109 Financial Consultant 1
105 Finance Support Manager 1
99 Finance Associate 1
98 Finance Assistant Officer 1
96 Executive, Management 1
92 Deputy Manager 1
91 Deputy Finance Director 1
177 Training Officer 1

178 rows × 2 columns


In [240]:
quantile(res['n_active_day'])


Out[240]:
min 25% 50% (median) 75% max
0 2.0 28.8 44.0 67.2 139.0

In [241]:
res.to_csv(RES_DIR + 'extremes.csv')
  • Dates/duration of all applications:

In [227]:
apps_with_duration = calDuration(apps)

apps_with_duration.head()


Finished grouping by user-job cases. Totally we have 775693 cases.
Calculating duration of application for each case...
Done after: 147.805999994s
Out[227]:
first_apply_date last_apply_date n_active_day total_duration_in_day
uid job_title
7 Housekeeping Supervisor 2015-07-01 2015-07-01 1 0
39 Account Assistant 2015-06-12 2015-06-16 2 4
Accounts Assistant 2015-06-11 2015-06-16 3 5
Accounts Executive 2015-06-16 2015-06-16 1 0
Accounts Payable (Or Receivable) Bookkeeper 2015-06-11 2015-06-11 1 0

In [244]:
all_res = pd.merge(apps_with_duration, agg_apps, left_index=True, right_on=['uid', 'job_title'])
all_res.sort_values(by='uid', inplace=True)
all_res = all_res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]

all_res.head()


Out[244]:
uid job_title n_apply first_apply_date last_apply_date n_active_day total_duration_in_day
0 7 Housekeeping Supervisor 1 2015-07-01 2015-07-01 1 0
15 39 Shipping Clerk (Freight) 1 2015-06-11 2015-06-11 1 0
14 39 Senior Accounts Assistant 1 2015-06-16 2015-06-16 1 0
13 39 Operations Officer 1 2015-06-11 2015-06-11 1 0
12 39 Data Entry Clerk 2 2015-06-16 2015-06-16 1 0

In [334]:
all_res.shape


Out[334]:
(775693, 7)

In [335]:
all_res.to_csv(AGG_DIR + 'timed_apps.csv', index=False)

In [249]:
normal = all_res.query('n_apply < 100')
extremes = res

In [253]:
plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
plt.hist(extremes['n_active_day'], bins=np.unique(extremes['n_active_day']))
plt.title('Extreme cases')
plt.xlabel('# active days')
plt.ylabel('# user-apply-job cases')

plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
plt.hist(normal['n_active_day'], bins=np.unique(normal['n_active_day']), 
         log=True)
plt.title('Normal cases')
plt.xlabel('# active days')
plt.ylabel('# user-apply-job cases')

plt.savefig(RES_DIR + 'n_active_day.pdf')
plt.show()
plt.close()


B. Number of different job titles an applicant applies


In [ ]:
agg_job_title = apps[['uid', 'job_title']].groupby('uid').agg({'job_title': 'nunique'})
agg_job_title = agg_job_title.add_prefix('n_').reset_index()

agg_job_title.sort_values('n_job_title', ascending=False, inplace=True)
# agg_job_title.head()

agg_job_id = apps[['uid', 'job_id']].groupby('uid').agg({'job_id': 'nunique'})
agg_job_id = agg_job_id.add_prefix('n_').reset_index()
agg_job_id.sort_values('n_job_id', ascending=False, inplace=True)

agg_df = pd.merge(agg_job_title, agg_job_id)

In [272]:
agg_df.rename(columns={'n_job_id': 'n_job'}, inplace=True)
agg_df.head()


Out[272]:
uid n_job_title n_job job_titles
0 26451 1034 4194 Business Consultant,Brand Development Manager,...
1 103204 1001 7754 Human Resource & Administration Manager,Admini...
2 14322 788 3649 Procurement Officer,Marine Operations Executiv...
3 141505 742 2068 Safety, Health & Environment Coordinator,Assis...
4 100500 647 2825 Software Consultant,Investment Analyst,Procure...

In [281]:
plt.close('all')

In [283]:
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(agg_df['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)

plt.subplot(1,2,2)
loglog(agg_df['n_job'], xl='# Jobs applied', yl='# applicants')

plt.savefig(FIG_DIR + 'figs/applied_jobs.pdf')
plt.show()
plt.close()



In [184]:
print apps.shape[0]


1531701

In [336]:
# Join all job titles of each user for reference
t0 = time()
tmp = apps[['uid', 'job_title']].groupby('uid').agg({'job_title': paste})
print('Finished joining job titles after {}s'.format(time()-t0))
tmp = tmp.add_suffix('s').reset_index()

apps_by_job_title = pd.merge(apps_by_job_title, tmp)
apps_by_job_title.sort_values('n_job_title', ascending=False, inplace=True)

apps_by_job_title.to_csv(AGG_DIR + 'apps_by_job_title.csv', index=False)


Finished joining job titles after 2.87000012398s

C. Number of company an applicant applies

Merge necessary files to get a full dataset


In [285]:
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
print(posts.shape)
posts = dot2dash(posts)

posts.head()


Out[285]:
company_registration_number_uen_ep job_id no_of_applications company_registration_number_uen_job_posting job_posting_date_history job_expiry_date_history job_expiry_year job_ssoc_code_list job_ssoc_description_list no_of_vacancies job_monthly_min_sal job_monthly_max_sal no_of_times_job_posting_reopened job_posting_status job_employment_type_list job_experience_required_years job_level
0 002 JOB-2015-0145758 1.0 002 2015-01-13 2015-02-12 2015 31275 Architectural Assistant 2 4000.0 6000.0 0 Closed Full Time, Permanent 3.0 Manager
1 002 JOB-2015-0145763 3.0 002 2015-01-13 2015-02-12 2015 31271 Draftsman 3 3500.0 4500.0 0 Closed Full Time, Permanent 3.0 Executive
2 003 JOB-2015-0172574 3.0 003 2015-02-23 2015-03-25 2015 39229 Administrative Accounts Executive 1 NaN NaN 0 Closed Full Time, Permanent 5.0 NaN
3 04130400D JOB-2015-0173823 NaN 04130400D 2015-02-24 2015-03-26 2015 74399 Garland Maker 3 3500.0 4500.0 1 Closed Full Time 1.0 Fresh/entry level,Non-executive
4 07479300D JOB-2015-0144684 93.0 07479300D 2015-01-12 2015-02-11 2015 21319 Information Technology Manager 1 5000.0 7500.0 1 Closed Full Time 5.0 Senior Management

In [304]:
# Extract just job id and employer id
job_and_employer = posts[['job_id', 'company_registration_number_uen_ep']].drop_duplicates()
job_and_employer.head(1)


Out[304]:
job_id company_registration_number_uen_ep
0 JOB-2015-0145758 002

In [295]:
# Load employer details (names, desc,...)
employer_detail = pd.read_csv(DATA_DIR + 'employers.csv')
employer_detail.drop_duplicates(inplace=True)
print(employer_detail.shape)

employer_detail = dot2dash(employer_detail)
employer_detail.head(1)


Out[295]:
(21513, 7)

In [306]:
# Merge to add employer details
job_and_employer = job_and_employer.rename(columns={'company_registration_number_uen_ep': 'reg_no_uen_ep'})
df = pd.merge(apps, pd.merge(job_and_employer, employer_detail))
print(df.shape)

df.sort_values(by='organisation_name_ep', inplace=True, na_position='first')
df.head()


Out[306]:
job_id reg_no_uen_ep employer_creation_date organisation_name_ep ssic_code_ep ssic_description_ep ssic_group_ep third_party_entity_ep
173957 JOB-2015-0216458 197302448M Jul 8, 2014 "K" LINE (SINGAPORE) PTE LTD 52222 Shipping agencies (freight) Transportation and Storage N\r
173956 JOB-2015-0216414 197302448M Jul 8, 2014 "K" LINE (SINGAPORE) PTE LTD 52222 Shipping agencies (freight) Transportation and Storage N\r
173959 JOB-2015-0216978 197302448M Jul 8, 2014 "K" LINE (SINGAPORE) PTE LTD 52222 Shipping agencies (freight) Transportation and Storage N\r
173958 JOB-2015-0216499 197302448M Jul 8, 2014 "K" LINE (SINGAPORE) PTE LTD 52222 Shipping agencies (freight) Transportation and Storage N\r
24352 JOB-2015-0196481 198701829Z Jul 23, 2014 "K" LINE LOGISTICS (SINGAPORE) PTE. LTD. 52292 Freight forwarding, packing and crating services Transportation and Storage N\r

In [308]:
# del df['is_number']
# df.to_csv(DATA_DIR + 'full_apps.csv', index=False)
df.head()


Out[308]:
uid job_id job_title apply_date reg_no_uen_ep employer_creation_date organisation_name_ep ssic_code_ep ssic_description_ep ssic_group_ep third_party_entity_ep
0 7 JOB-2015-0223128 Housekeeping Supervisor 2015-07-01 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y\r
1 21073 JOB-2015-0223128 Housekeeping Supervisor 2015-05-07 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y\r
2 46634 JOB-2015-0223128 Housekeeping Supervisor 2015-05-01 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y\r
3 100427 JOB-2015-0223128 Housekeeping Supervisor 2015-07-24 52865867X Jul 8, 2014 THE FULLERTON HOTEL 55101 Hotels with restaurant Accommodation and Food Service Activities Y\r
4 39 JOB-2014-0134411 Account Assistant 2015-06-16 200203771R Jul 5, 2014 THE SHICHIDA METHOD (S) PTE. LTD. 82999 Other business support services activities nec... Administrative and Support Service Activities N\r

In [338]:
user_apply_comp = df[['uid', 'reg_no_uen_ep', 'organisation_name_ep']]
user_apply_comp['n_apply'] = ''


C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [340]:
apps_by_comp = user_apply_comp.groupby(['uid', 'reg_no_uen_ep', 'organisation_name_ep']).count()
apps_by_comp = apps_by_comp.reset_index()
apps_by_comp.sort_values('n_apply', ascending=False, inplace=True)
apps_by_comp.head()

apps_by_comp.to_csv(AGG_DIR + 'apps_by_comp.csv', index=False)

In [ ]:
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')

plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()

In [323]:
quantile(user_apply_comp['n_apply'])


Out[323]:
min 25% 50% (median) 75% max
0 1.0 1.0 1.0 1.0 328.0

D. Number of (job title, company) an applicant applies


In [342]:
tmp = df[['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']]
tmp['n_apply'] = ''

apps_by_job_comp = tmp.groupby(['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']).count()
apps_by_job_comp = apps_by_job_comp.reset_index()
apps_by_job_comp.sort_values('n_apply', ascending=False, inplace=True)
print apps_by_job_comp.shape
apps_by_job_comp.head()


C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
(1352961, 5)
Out[342]:
uid job_title reg_no_uen_ep organisation_name_ep n_apply
958000 103204 Analyst 196800306E DBS BANK LTD. 132
960614 103204 Information Technology Specialist 196800306E DBS BANK LTD. 90
1064171 112664 Research Assistant 200604346E NATIONAL UNIVERSITY OF SINGAPORE 90
1018071 108289 Call Centre Agent 199907051E CREDIT MANAGEMENT CONSULTANCY (ASIA) PTE LTD 72
685189 76182 Information Technology Specialist 196800306E DBS BANK LTD. 64

In [343]:
apps_by_job_comp.to_csv(AGG_DIR + 'apps_by_job_comp.csv', index=False)

In [ ]:
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()

In [332]:
job_comp = df[['job_title', 'organisation_name_ep']].drop_duplicates()
job_comp.shape


Out[332]:
(89071, 2)