In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from my_util import *
from job_rec_helpers import *
In [337]:
# Global vars: all filesystem locations in one place so the rest of the
# notebook never hardcodes a path.
# NOTE(review): absolute Windows paths tie this notebook to one machine;
# consider deriving them from a configurable base directory.
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'D:/larc_projects/job_analytics/results/'  # was lowercase 'd:' — normalized drive-letter case
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'
In [ ]:
# Load the pre-cleaned application log (one row per application event).
apps = pd.read_csv(DATA_DIR + 'apps_with_time.csv')
# Sanity check: (n_rows, n_cols).
apps.shape
In [183]:
# Rm noise (numbers) in job_title column.
# Fix: the original used a bare `map(is_number, ...)`, which is Python-2-only
# here — on Python 3 `map` returns a lazy iterator, which breaks column
# assignment. `Series.map` behaves identically on both versions.
apps['is_number'] = apps['job_title'].map(is_number)
apps = apps.query('is_number == False')
In [198]:
# del apps['user.id']; del apps['user.index']; del apps['item.index']; del apps['freq']
# Normalize dotted column names to the snake_case used throughout the notebook.
apps.rename(columns={'job.id': 'job_id', 'job.title': 'job_title', 'apply.date': 'apply_date'},
inplace=True)
# Persist the cleaned table back over the source CSV (destructive overwrite).
apps.to_csv(DATA_DIR + 'apps_with_time.csv', index=False)
apps.head(3)
Out[198]:
In [267]:
# Headline corpus statistics.
n_applicant = apps['uid'].nunique(); n_application = apps.shape[0]
n_job = len(np.unique(apps['job_id'])); n_job_title = len(np.unique(apps['job_title']))
# NOTE(review): `posts` is only loaded much further down (full_job_posts.csv
# cell) — this line relies on out-of-order execution. Run the posts-loading
# cell before this one on a fresh kernel.
n_company = posts['company_registration_number_uen_ep'].nunique()
In [268]:
# Collect the headline counts into a one-row summary frame for display/export.
stats = pd.DataFrame([{'n_application': n_application, 'n_applicant': n_applicant,
                       'n_job': n_job, 'n_job_title': n_job_title, 'n_company': n_company}])
stats
Out[268]:
In [269]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)
In [199]:
# Per (applicant, job title): number of distinct jobs and distinct days applied.
agg_apps = apps.groupby(by=['uid', 'job_title']).agg({'job_id': 'nunique', 'apply_date': 'nunique'})
# convert to DF
agg_apps = agg_apps.add_prefix('n_').reset_index()
# Alias: "number of applications" = number of distinct jobs applied to under this title.
agg_apps['n_apply'] = agg_apps['n_job_id']
agg_apps.head(3)
Out[199]:
Let's look at the quartiles of the number of times an applicant applies for a specific job.
In [103]:
quantile(df['n_apply'])
Out[103]:
As expected, in most cases (50%), an applicant applies just once for a specific job. However, we can also see at least one extreme case where an applicant applied 582 times to a single job title. Thus, let's look more closely at the distribution of $N_{apply}$.
In [ ]:
# Log-scaled histogram of N_apply across applicant-job pairs.
# Fixes from review: (1) `df` is not defined yet on a fresh run — the data is
# `agg_apps`; (2) the figure was being saved into the *data* directory
# rather than the figures directory.
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)
plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
plt.savefig(FIG_DIR + 'apply_freq.pdf')
plt.show()
plt.close()
In [218]:
# Extreme cases: an applicant hitting the same job title 100+ times.
# Fix: sorting the `query()` slice in place triggered SettingWithCopyWarning;
# build the sorted frame in one chained expression instead.
extremes = (agg_apps.query('n_apply >= 100')
            .sort_values(by='n_apply', ascending=False))
extremes.head()
Out[218]:
In [105]:
print('No. of extreme cases: {}'.format(extremes.shape[0]))
To get a more complete picture of these extreme cases, let's add the apply dates and the companies of those jobs.
In [84]:
# ext_users = np.unique(extremes['uid'])
# Restrict the raw application log to the extreme (uid, job_title) PAIRS.
# Fix: the original filtered uid and job_title with two independent isin()
# calls, which also keeps non-extreme combinations (e.g. an extreme user's
# applications to a title that is only extreme for a *different* user).
# An inner merge on the pair keeps exactly the extreme cases.
df = apps.merge(extremes[['uid', 'job_title']].drop_duplicates(),
                on=['uid', 'job_title'])
ext_apps = df
ext_apps.head(1)
Out[84]:
In [225]:
# Attach per-pair timing info (first/last apply date, active days, duration).
# `calDuration` comes from job_rec_helpers; presumably it aggregates by
# (uid, job_title) and returns those pairs as the index — TODO confirm.
res = calDuration(ext_apps)
res = pd.merge(res, extremes, left_index=True, right_on=['uid', 'job_title'])
res.sort_values(by='uid', inplace=True)
# Fix a readable column order for inspection/export.
res = res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]
In [226]:
# Show the heaviest extreme cases first.
res = res.sort_values('n_apply', ascending=False)
res.head()
Out[226]:
In [228]:
res.tail()
Out[228]:
In [239]:
# Drill into one extreme applicant (uid 33833): distinct jobs applied per title.
one_user = apps.query('uid==33833')[['uid', 'job_title', 'job_id']]
tmp = one_user.groupby('job_title').agg({'job_id': 'nunique'})
tmp = tmp.add_prefix('n_').reset_index()
tmp.rename(columns={'n_job_id': 'n_apply'}, inplace=True)
tmp.sort_values('n_apply', ascending=False, inplace=True)
Out[239]:
In [240]:
quantile(res['n_active_day'])
Out[240]:
In [241]:
res.to_csv(RES_DIR + 'extremes.csv')
In [227]:
# Timing features for ALL (uid, job_title) pairs, not just the extremes.
apps_with_duration = calDuration(apps)
apps_with_duration.head()
Out[227]:
In [244]:
# Same merge as for the extremes, now over every (uid, job_title) pair.
all_res = pd.merge(apps_with_duration, agg_apps, left_index=True, right_on=['uid', 'job_title'])
all_res.sort_values(by='uid', inplace=True)
# Fix a readable column order for inspection/export.
all_res = all_res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]
all_res.head()
Out[244]:
In [334]:
all_res.shape
Out[334]:
In [335]:
all_res.to_csv(AGG_DIR + 'timed_apps.csv', index=False)
In [249]:
# Split the population: "normal" pairs (< 100 applications) vs the extremes
# computed earlier.
normal = all_res[all_res['n_apply'] < 100]
extremes = res
In [253]:
# Side-by-side distributions of active days: extremes vs normal cases.
fig, (ax_ext, ax_norm) = plt.subplots(1, 2, figsize=(10, 6))
ax_ext.hist(extremes['n_active_day'], bins=np.unique(extremes['n_active_day']))
ax_ext.set_title('Extreme cases')
ax_ext.set_xlabel('# active days')
ax_ext.set_ylabel('# user-apply-job cases')
fig.subplots_adjust(wspace=.5)
# Log scale on the normal cases: counts span several orders of magnitude.
ax_norm.hist(normal['n_active_day'], bins=np.unique(normal['n_active_day']),
             log=True)
ax_norm.set_title('Normal cases')
ax_norm.set_xlabel('# active days')
ax_norm.set_ylabel('# user-apply-job cases')
fig.savefig(RES_DIR + 'n_active_day.pdf')
plt.show()
plt.close()
In [ ]:
def count_unique_per_user(frame, col):
    """Per uid, count distinct values of `col`.

    Returns a DataFrame with columns ['uid', 'n_<col>'], sorted descending
    on the count. Factored out because the original cell copy-pasted the
    identical pipeline for 'job_title' and 'job_id'.
    """
    out = frame[['uid', col]].groupby('uid').agg({col: 'nunique'})
    out = out.add_prefix('n_').reset_index()
    out.sort_values('n_' + col, ascending=False, inplace=True)
    return out

# How many distinct job titles / distinct jobs each applicant applied to.
agg_job_title = count_unique_per_user(apps, 'job_title')
agg_job_id = count_unique_per_user(apps, 'job_id')
# Inner join on the common 'uid' column: one row per applicant, both counts.
agg_df = pd.merge(agg_job_title, agg_job_id)
In [272]:
# Clearer name: 'n_job' = number of distinct jobs applied to.
agg_df.rename(columns={'n_job_id': 'n_job'}, inplace=True)
agg_df.head()
Out[272]:
In [281]:
plt.close('all')
In [283]:
# Log-log plots: how many applicants applied to k job titles / k jobs.
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(agg_df['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
loglog(agg_df['n_job'], xl='# Jobs applied', yl='# applicants')
# Bug fix: FIG_DIR already ends in 'figs/', so the original path duplicated
# the directory ('.../figs/figs/applied_jobs.pdf') and pointed at a folder
# that likely does not exist.
plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
In [184]:
print apps.shape[0]
In [336]:
# Join all job titles of each user for reference
t0 = time()
# `paste` (from job_rec_helpers) presumably concatenates each group's titles
# into a single string — TODO confirm against the helper's source.
tmp = apps[['uid', 'job_title']].groupby('uid').agg({'job_title': paste})
print('Finished joining job titles after {}s'.format(time()-t0))
tmp = tmp.add_suffix('s').reset_index()
# NOTE(review): `apps_by_job_title` is not defined anywhere above in this
# notebook — this cell depends on out-of-order/hidden kernel state and will
# fail on a fresh Restart & Run All.
apps_by_job_title = pd.merge(apps_by_job_title, tmp)
apps_by_job_title.sort_values('n_job_title', ascending=False, inplace=True)
apps_by_job_title.to_csv(AGG_DIR + 'apps_by_job_title.csv', index=False)
In [285]:
# Load the full job postings (includes employer registration numbers).
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
print(posts.shape)
# `dot2dash` (from my_util) presumably normalizes column names — TODO confirm.
posts = dot2dash(posts)
posts.head()
Out[285]:
In [304]:
# Extract just job id and employer id (deduplicated job -> employer mapping).
id_cols = ['job_id', 'company_registration_number_uen_ep']
job_and_employer = posts[id_cols].drop_duplicates()
job_and_employer.head(1)
Out[304]:
In [295]:
# Load employer details (names, desc,...)
employer_detail = pd.read_csv(DATA_DIR + 'employers.csv')
employer_detail.drop_duplicates(inplace=True)
print(employer_detail.shape)
# Normalize column names with the same helper used for `posts`.
employer_detail = dot2dash(employer_detail)
employer_detail.head(1)
Out[295]:
In [306]:
# Merge to add employer details
# Shorten the unwieldy registration-number column name first.
job_and_employer = job_and_employer.rename(columns={'company_registration_number_uen_ep': 'reg_no_uen_ep'})
# Two-step inner merge (pandas default): employer info onto jobs, then onto
# applications; join keys are the shared column names — TODO confirm they
# are exactly the id columns intended.
df = pd.merge(apps, pd.merge(job_and_employer, employer_detail))
print(df.shape)
# Rows with missing employer names first so they are easy to spot.
df.sort_values(by='organisation_name_ep', inplace=True, na_position='first')
df.head()
Out[306]:
In [308]:
# del df['is_number']
# df.to_csv(DATA_DIR + 'full_apps.csv', index=False)
# Peek at the merged applications + employer table.
df.head()
Out[308]:
In [338]:
# Per-user, per-company application slice.
# Fix: `.copy()` makes this an independent frame — assigning a new column
# on the raw `df[[...]]` slice triggered SettingWithCopyWarning.
user_apply_comp = df[['uid', 'reg_no_uen_ep', 'organisation_name_ep']].copy()
# Placeholder column so the groupby().count() in the next cell yields a
# column named 'n_apply'.
user_apply_comp['n_apply'] = ''
In [340]:
# Applications per (user, company): group and count rows.
# Idiom fix: groupby(...).size() counts group rows directly — it is exactly
# what the ''-placeholder column + count() combination was emulating, since
# the placeholder is never null.
apps_by_comp = (user_apply_comp
                .groupby(['uid', 'reg_no_uen_ep', 'organisation_name_ep'])
                .size().reset_index(name='n_apply'))
apps_by_comp.sort_values('n_apply', ascending=False, inplace=True)
apps_by_comp.head()
apps_by_comp.to_csv(AGG_DIR + 'apps_by_comp.csv', index=False)
In [ ]:
# Power-law check: distribution of per-(user, company) application counts.
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')
plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()
In [323]:
quantile(user_apply_comp['n_apply'])
Out[323]:
In [342]:
# Applications per (user, job title, company).
# .copy() avoids SettingWithCopyWarning on the placeholder assignment below.
tmp = df[['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']].copy()
# Placeholder column so count() produces a column named 'n_apply'.
tmp['n_apply'] = ''
apps_by_job_comp = tmp.groupby(['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']).count()
apps_by_job_comp = apps_by_job_comp.reset_index()
apps_by_job_comp.sort_values('n_apply', ascending=False, inplace=True)
# Fix: Python-2-only print statement -> function form (works on 2 and 3).
print(apps_by_job_comp.shape)
apps_by_job_comp.head()
Out[342]:
In [343]:
apps_by_job_comp.to_csv(AGG_DIR + 'apps_by_job_comp.csv', index=False)
In [ ]:
# Distribution of per-(user, title, company) application counts.
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()
In [332]:
# Distinct (job title, employer) combinations present in the merged data.
job_comp = df.loc[:, ['job_title', 'organisation_name_ep']].drop_duplicates()
job_comp.shape
Out[332]: