In [16]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp

from my_util import *
from job_rec_helpers import *
In [2]:
def canon(strings):
    # Canonicalize strings: lowercase, then strip surrounding whitespace
    return [s.lower().strip() for s in strings]
In [3]:
# Global vars
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'D:/larc_projects/job_analytics/results/'
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'
In [4]:
apps = pd.read_csv(DATA_DIR + 'full_apps.csv')
In [6]:
# apps['job_title'] = canon(apps['job_title'])
apps.head()
Out[6]:
In [ ]:
# Flag rows whose job title is purely numeric (to be filtered out below)
apps['job_title_is_number'] = apps['job_title'].apply(is_number)
In [ ]:
print('before filter: %d' % apps.shape[0])
tmp = apps.query('job_title_is_number == False')
print('after filter: %d' % tmp.shape[0])
In [ ]:
n_application = apps.shape[0]
n_applicant = apps['uid'].nunique()
n_job = apps['job_id'].nunique()
n_job_title = apps['job_title'].nunique()
n_company = apps['reg_no_uen_ep'].nunique()

stats = pd.DataFrame({'n_application': n_application, 'n_applicant': n_applicant,
                      'n_job': n_job, 'n_job_title': n_job_title,
                      'n_company': n_company}, index=[0])
stats
In [ ]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)
In [7]:
agg_apps = pd.read_csv(AGG_DIR + 'timed_apps.csv')
print(agg_apps.shape)
In [8]:
agg_apps.sort_values('n_apply', ascending=False, inplace=True)
In [9]:
# top 10 extreme cases
agg_apps.head(10)
Out[9]:
Because the number of active days varies across users, we compute the average apply frequency as n_apply divided by n_active_day.
In [10]:
agg_apps['apply_freq'] = agg_apps['n_apply']/agg_apps['n_active_day']
In [14]:
agg_apps['apply_freq'] = np.round(agg_apps['apply_freq'], 2)
agg_apps.sort_values(by='apply_freq', ascending=False, inplace=True)
agg_apps.head()
Out[14]:
In [13]:
quantile(agg_apps['apply_freq'])
Out[13]:
In [15]:
agg_apps.to_csv(AGG_DIR + 'timed_apps.csv', index=False)
There are two ways to build the applicant-job matrix: (1) construct a sparse matrix directly from the (uid, job_title, n_apply) triples, or (2) treat each applicant's applied job titles as a document and count them with CountVectorizer, using the job titles themselves as the vocabulary.
The drawback of the 2nd way is that, because job titles vary in length (# words), CountVectorizer must repeatedly split each document into n-grams of every length up to the longest title, which is time-consuming.
We thus use the 1st way.
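As a toy illustration of the first way (made-up indices, not the real data), placing (user index, item index, count) triples straight into a sparse matrix looks like this:
In [ ]:
# Toy illustration of the triple-based construction (made-up indices, not the real data)
toy_rows = [0, 0, 1]   # internal user indices
toy_cols = [2, 0, 1]   # internal item indices
toy_vals = [3, 1, 5]   # n_apply counts
sp.csr_matrix((toy_vals, (toy_rows, toy_cols)), shape=(2, 3)).toarray()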
In [17]:
user_ids = np.unique(agg_apps['uid'])
index_of_users = {uid: i for i, uid in enumerate(user_ids)}
item_ids = np.unique(agg_apps['job_title'])
index_of_items = {item: i for i, item in enumerate(item_ids)}
In [27]:
n_user = len(index_of_users)
n_item = len(index_of_items)
In [34]:
# Given index_of_users and index_of_items, build the user-item matrix from a data frame
# with columns (user_col, item_col, rating_col) holding (uid, item_id, rating) triples.
def buildUserItemMat(df, user_col='uid', item_col='item_id', rating_col='rating'):
    print('Mapping user ids to internal user indices...')
    row_ind = list(df.apply(lambda r: index_of_users[r[user_col]], axis=1))
    print('Mapping item ids to internal item indices...')
    col_ind = list(df.apply(lambda r: index_of_items[r[item_col]], axis=1))
    ratings = list(df[rating_col])
    n_user, n_item = len(index_of_users), len(index_of_items)
    user_item_mat = sp.csr_matrix((ratings, (row_ind, col_ind)), shape=(n_user, n_item))
    print('User-Item matrix built')
    return user_item_mat
In [33]:
user_apply_job = buildUserItemMat(df=agg_apps, user_col='uid', item_col='job_title', rating_col='n_apply')
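A quick sanity check (a sketch, assuming the column names used above) is to confirm that a single matrix entry matches the aggregated n_apply for its (uid, job_title) pair:
In [ ]:
# Sanity check (sketch): one matrix entry should equal the aggregated n_apply
# for the corresponding (uid, job_title) pair.
r = agg_apps.iloc[0]
i, j = index_of_users[r['uid']], index_of_items[r['job_title']]
user_apply_job[i, j] == r['n_apply']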
In [48]:
from scipy.io import mmwrite
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
In [42]:
df = pd.DataFrame({'uid': list(index_of_users.keys()), 'u_index': list(index_of_users.values())})
df.sort_values('u_index', inplace=True)
df.to_csv(DATA_DIR + 'user_dict.csv', index=False)
In [47]:
# index_of_items.keys()[:3]
df = pd.DataFrame({'job_title': list(index_of_items.keys()), 'item_index': list(index_of_items.values())})
df.sort_values('item_index', inplace=True)
df.to_csv(DATA_DIR + 'item_dict.csv', index=False)
In [ ]:
by_job_title = agg_apps[['job_title', 'n_apply']].groupby('job_title').sum()
by_job_title = by_job_title.add_prefix('total_').reset_index()
In [ ]:
# top-10 popular job titles
by_job_title.sort_values('total_n_apply', ascending=False, inplace=True)
by_job_title.head(10)
In [ ]:
by_user = agg_apps[['uid', 'n_apply']].groupby('uid').sum()
by_user = by_user.add_prefix('total_').reset_index()
In [ ]:
# top-10 hard-working job hunters
by_user.sort_values('total_n_apply', ascending=False, inplace=True)
by_user.head(10)
In [ ]:
by_job_title.head(10).to_csv(RES_DIR + 'top10_job_titles.csv', index=False)
In [ ]:
by_user.head(10).to_csv(RES_DIR + 'top10_job_hunters.csv', index=False)
Quartiles of the number of times an applicant applies for a specific job title:
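The quantile() helper comes from my_util via the wildcard import above; a minimal stand-in, assuming it reports the five-number summary (min, 25%, 50%, 75%, max), could look like:
In [ ]:
# Minimal stand-in for my_util.quantile (assumption: it reports the five-number summary)
def quantile_sketch(x):
    return pd.Series(np.percentile(x, [0, 25, 50, 75, 100]),
                     index=['min', '25%', '50%', '75%', 'max'])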
In [ ]:
quantile(agg_apps['n_apply'])
As expected, in most cases (at least 50%), an applicant applies just once for a given job title. However, there is at least one extreme case in which an applicant applied 582 times for a single job title. Let's therefore look more closely at the distribution of $N_{apply}$.
In [ ]:
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)
plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
# plt.savefig(DATA_DIR + 'apply_freq.pdf')
plt.show()
plt.close()
To get a more complete picture of these extreme cases, let's examine them directly:
In [ ]:
extremes = pd.read_csv(RES_DIR + 'extremes.csv')
print('No. of extreme cases: {}'.format(extremes.shape[0]))
extremes.head(3)
In [ ]:
quantile(extremes['n_active_day'])
In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
In [ ]:
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(apps_by_job_title['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
loglog(apps_by_job_title['n_job'], xl='# Jobs applied', yl='# applicants')
# plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
In [ ]:
apps_by_comp = pd.read_csv(AGG_DIR + 'apps_by_comp.csv')
apps_by_comp.shape
In [ ]:
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')
# plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()
In [ ]:
apps_by_job_comp = pd.read_csv(AGG_DIR + 'apps_by_job_comp.csv')
apps_by_job_comp.shape
In [ ]:
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
# plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()
In [ ]:
job_comp = apps[['job_title', 'organisation_name_ep']].drop_duplicates()
print('No. of job-company pairs: {}'.format(job_comp.shape[0]))
In [ ]:
def getRecords(uids, df):
    # Return the rows of df belonging to the given user ids
    return df[df['uid'].isin(uids)]
In [ ]:
print('No. of applicants: {}'.format(n_applicant))
print('No. of job titles: {}'.format(n_job_title))
Thus the dimensions of the applicant-apply-job matrix should be 68144 $\times$ 5794.
In [ ]:
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
# sanity check
print(apps_by_job_title.shape[0] == n_applicant)
apps_by_job_title.head()
In [ ]:
import sklearn.feature_extraction.text as text_manip
import scipy.sparse as sp
In [ ]:
docs = apps_by_job_title['job_titles']
job_titles = apps['job_title'].unique()
job_title_len = [n_word(t) for t in job_titles]
max_len = max(job_title_len)
print('max no. of words in a job title: {}'.format(max_len))
In [ ]:
quantile(job_title_len)
In [ ]:
plt.hist(job_title_len, bins=np.unique(job_title_len))
plt.xlabel('# words in job title')
plt.ylabel('# job titles')
plt.show()
In [ ]:
# 2nd way: each applicant's concatenated job titles form a document; the vocabulary is
# the set of job titles, so the n-gram range must cover multi-word titles.
count_vec = text_manip.CountVectorizer(vocabulary=job_titles, ngram_range=(1, 6))
In [ ]:
t0 = time()
print('Building applicant-apply-job matrix...')
user_apply_job = count_vec.fit_transform(docs)
print('Done after {}s'.format(time()-t0))
In [ ]:
# sparsity of applicant-apply-job
float(user_apply_job.nnz)/(n_applicant * n_job_title)
In [ ]:
nrow, ncol = user_apply_job.shape
print('Dimension of applicant-apply-job matrix: {} x {}'.format(nrow, ncol))
In [ ]:
feats = count_vec.get_feature_names()
# sum([1 for j in first_user_job_titles if j in feats])
In [ ]:
from scipy.io import mmwrite
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
In [ ]:
# first_user_job_titles = docs[0].split(',')
# n_job_in_vocab = sum([1 for j in first_user_job_titles if j in vocab])
# print('Total # jobs: %d' %len(first_user_job_titles))
# print('# jobs in vocab: %d' %n_job_in_vocab)
# all(j in vocab for j in first_user_job_titles)
In [ ]:
quantile(apps_by_job_comp['n_apply'])
In [ ]:
apps_by_job_comp.rename(columns={'organisation_name_ep': 'employer_name', 'reg_no_uen_ep': 'employer_id'}, inplace=True)
apps_by_job_comp.query('n_apply >= 50')
In [ ]:
apps.query('uid == 103204').query('job_title == "analyst"').query('reg_no_uen_ep=="196800306E"').to_csv(RES_DIR + 'tmp.csv')
In [ ]:
apps_by_job_comp['job_employer'] = apps_by_job_comp['job_title'] + ' at ' + apps_by_job_comp['employer_name']
apps_by_job_comp.head()
In [ ]:
uniq_job_employers = np.unique(apps_by_job_comp['job_employer'])
len(uniq_job_employers)
In [ ]:
users = np.unique(apps['uid'])
len(users)
In [ ]:
job_employer_idx = {je: i for i, je in enumerate(uniq_job_employers)}
index_of_users = {uid: i for i, uid in enumerate(users)}
In [ ]:
apps_by_job_comp.apply(putTriple, axis=1)
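putTriple is defined in job_rec_helpers and is not shown here; a plausible stand-in, assuming it collects (user index, job-employer index, n_apply) triples for a user-apply-job-at-company matrix analogous to user_apply_job, might look like:
In [ ]:
# Hypothetical stand-in for job_rec_helpers.putTriple (assumption: it collects
# (user index, job-employer index, n_apply) triples for a later sparse-matrix build).
rows, cols, vals = [], [], []
def putTriple(r):
    rows.append(index_of_users[r['uid']])
    cols.append(job_employer_idx[r['job_employer']])
    vals.append(r['n_apply'])
# The triples could then be assembled just like user_apply_job:
# sp.csr_matrix((vals, (rows, cols)), shape=(len(index_of_users), len(job_employer_idx)))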