In [16]:
    
from time import time
from my_util import *
from job_rec_helpers import *
import scipy.sparse as sp
    
In [2]:
    
def canon(strings):
    # Canonical form of job titles: lower-case, then strip surrounding whitespace
    return map(str.strip, map(str.lower, strings))
    
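canon simply lower-cases and strips each string; for example (an illustration only, not part of the original run):
In [ ]:
    
canon(['  Software ENGINEER ', 'Analyst'])  # -> ['software engineer', 'analyst']
    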
In [3]:
    
# Global vars
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'd:/larc_projects/job_analytics/results/'
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'
    
In [4]:
    
apps = pd.read_csv(DATA_DIR + 'full_apps.csv')
    
In [6]:
    
# apps['job_title'] = canon(apps['job_title'])
apps.head()
    
    Out[6]:
In [ ]:
    
# Flag applications whose job title is purely numeric (to be filtered out below)
apps['job_title_is_number'] = map(is_number, apps['job_title'])
    
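is_number comes from my_util via the wildcard import above; a minimal stand-in, assuming it merely flags strings that parse as a number, could be:
In [ ]:
    
# Hypothetical stand-in for my_util.is_number (the real helper is not shown here)
def is_number(s):
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        return False
    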
In [ ]:
    
print('before filter: %d' % apps.shape[0])
tmp = apps.query('job_title_is_number == False')
print('after filter: %d' % tmp.shape[0])
    
In [ ]:
    
n_application = apps.shape[0]
n_applicant = apps['uid'].nunique()
n_job = apps['job_id'].nunique()
n_job_title = apps['job_title'].nunique()
n_company = apps['reg_no_uen_ep'].nunique()
stats = pd.DataFrame({'n_application': n_application, 'n_applicant': n_applicant,
                      'n_job': n_job, 'n_job_title': n_job_title,
                      'n_company': n_company}, index=[0])
stats
    
In [ ]:
    
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)
    
In [7]:
    
agg_apps = pd.read_csv(AGG_DIR + 'timed_apps.csv')
print(agg_apps.shape)
    
    
In [8]:
    
agg_apps.sort_values('n_apply', ascending=False, inplace=True)
    
In [9]:
    
# top 10 extreme cases
agg_apps.head(10)
    
    Out[9]:
As the number of active days varies from user to user, we compute the average apply frequency by dividing n_apply by n_active_day.
In [10]:
    
agg_apps['apply_freq'] = agg_apps['n_apply']/agg_apps['n_active_day']
    
In [14]:
    
agg_apps['apply_freq'] = np.round(agg_apps['apply_freq'], 2)
agg_apps.sort_values(by='apply_freq', ascending=False, inplace=True)
agg_apps.head()
    
    Out[14]:
In [13]:
    
quantile(agg_apps['apply_freq'])
    
    Out[13]:
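quantile() comes from the helper modules imported above; a minimal stand-in, assuming it simply reports the five-number summary of a numeric series, might be:
In [ ]:
    
import numpy as np
import pandas as pd

# Hypothetical stand-in for the imported quantile() helper
def quantile(values):
    qs = np.percentile(np.asarray(values, dtype=float), [0, 25, 50, 75, 100])
    return pd.Series(qs, index=['min', '25%', '50%', '75%', 'max'])
    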
In [15]:
    
agg_apps.to_csv(AGG_DIR + 'timed_apps.csv', index=False)
    
There are two ways to build the applicant-job matrix: (1) build a sparse matrix directly from the (uid, job_title, n_apply) triples, or (2) treat each applicant's applied job titles as one document and count them with CountVectorizer(), using the job titles themselves as the vocabulary.
The drawback of the 2nd way is that, because job titles differ in length (# words), CountVectorizer() has to repeatedly split each document into n-grams of every length to cover them all, which is time-consuming.
We thus use the 1st way.
In [17]:
    
user_ids = np.unique(agg_apps['uid'])
index_of_users = { user_ids[i]:i for i in range(len(user_ids)) }
item_ids = np.unique(agg_apps['job_title'])
index_of_items = { item_ids[i]:i for i in range(len(item_ids))}
    
In [27]:
    
n_user = len(index_of_users)
n_item = len(index_of_items)
    
In [34]:
    
def buildUserItemMat(df, user_col='uid', item_col='item_id', rating_col='rating'):
    """Build a sparse user-item matrix from a df whose columns (user_col, item_col, rating_col)
    hold (uid, item_id, rating) triples, using the global index_of_users and index_of_items."""
    print('Mapping user ids to internal user indices...')
    row_ind = list(df.apply(lambda r: index_of_users[r[user_col]], axis=1))
    print('Mapping item ids to internal item indices...')
    col_ind = list(df.apply(lambda r: index_of_items[r[item_col]], axis=1))
    ratings = list(df[rating_col])

    n_user, n_item = len(index_of_users), len(index_of_items)
    user_item_mat = sp.csr_matrix((ratings, (row_ind, col_ind)), shape=(n_user, n_item))
    print('User-Item matrix built')
    return user_item_mat
    
In [33]:
    
user_apply_job = buildUserItemMat(df=agg_apps, user_col='uid', item_col='job_title', rating_col='n_apply')
    
    
In [48]:
    
from scipy.io import *
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
    
In [42]:
    
df = pd.DataFrame({'uid': index_of_users.keys(), 'u_index': index_of_users.values()})
df.sort_values('u_index', inplace=True)
df.to_csv(DATA_DIR + 'user_dict.csv', index=False)
    
In [47]:
    
# index_of_items.keys()[:3]
df = pd.DataFrame({'job_title': index_of_items.keys(), 'item_index': index_of_items.values()})
df.sort_values('item_index', inplace=True)
df.to_csv(DATA_DIR + 'item_dict.csv', index=False)
    
In [ ]:
    
by_job_title = agg_apps[['job_title', 'n_apply']].groupby('job_title').sum()
by_job_title = by_job_title.add_prefix('total_').reset_index()
    
In [ ]:
    
# top-10 popular job titles
by_job_title.sort_values('total_n_apply', ascending=False, inplace=True)
by_job_title.head(10)
    
In [ ]:
    
by_user = agg_apps[['uid', 'n_apply']].groupby('uid').sum()
by_user = by_user.add_prefix('total_').reset_index()
    
In [ ]:
    
# top-10 hard working job hunters
by_user.sort_values('total_n_apply', ascending=False, inplace=True)
by_user.head(10)
    
In [ ]:
    
by_job_title.head(10).to_csv(RES_DIR + 'top10_job_titles.csv', index=False)
    
In [ ]:
    
by_user.head(10).to_csv(RES_DIR + 'top10_job_hunters.csv', index=False)
    
Quartiles of the number of times an applicant applies for a specific job:
In [ ]:
    
quantile(agg_apps['n_apply'])
    
As expected, in most cases (at least 50%), an applicant applies just once for a specific job. However, there is at least one extreme case where an applicant applied 582 times for a single job title. Thus, let's look more closely at the distribution of $N_{apply}$.
In [ ]:
    
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)
plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
# plt.savefig(DATA_DIR + 'apply_freq.pdf')
plt.show()
plt.close()
    
To get a more complete picture of these extreme cases, let's look at them directly:
In [ ]:
    
extremes = pd.read_csv(RES_DIR + 'extremes.csv')
print('No. of extreme cases: {}'.format(extremes.shape[0]))
extremes.head(3)
    
In [ ]:
    
quantile(extremes['n_active_day'])
    
In [ ]:
    
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
    
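loglog() is another plotting helper pulled in by the wildcard imports; a plausible stand-in, assuming it plots how many applicants take each value of the given count variable on log-log axes, could be:
In [ ]:
    
import numpy as np
import matplotlib.pyplot as plt

# Hypothetical stand-in for the imported loglog() helper
def loglog(counts, xl='', yl=''):
    values, freqs = np.unique(counts, return_counts=True)
    plt.loglog(values, freqs, marker='.', linestyle='none')
    plt.xlabel(xl)
    plt.ylabel(yl)
    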
In [ ]:
    
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(apps_by_job_title['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
loglog(apps_by_job_title['n_job'], xl='# Jobs applied', yl='# applicants')
# plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
    
In [ ]:
    
apps_by_comp = pd.read_csv(AGG_DIR + 'apps_by_comp.csv')
apps_by_comp.shape
    
In [ ]:
    
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')
# plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()
    
In [ ]:
    
apps_by_job_comp = pd.read_csv(AGG_DIR + 'apps_by_job_comp.csv')
apps_by_job_comp.shape
    
In [ ]:
    
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
# plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()
    
In [ ]:
    
job_comp = apps[['job_title', 'organisation_name_ep']].drop_duplicates()
print('No. of job-company pairs: {}'.format(job_comp.shape[0]))
    
In [ ]:
    
def getRecords(uids, df):
    return df[ df['uid'].isin(uids)]
    
In [ ]:
    
print('No. of applicants: {}'.format(n_applicant))
print('No. of job titles: {}'.format(n_job_title))
    
Thus, the applicant-apply-job matrix should have dimensions 68144 $\times$ 5794.
In [ ]:
    
apps_by_job_title = pd.read_csv(AGG_DIR + 'apps_by_job_title.csv')
# sanity check
print(apps_by_job_title.shape[0] == n_applicant)
apps_by_job_title.head()
    
In [ ]:
    
import sklearn.feature_extraction.text as text_manip
import scipy.sparse as sp
    
In [ ]:
    
docs = apps_by_job_title['job_titles']
job_titles = apps['job_title'].unique()
max_len = max(map(n_word, job_titles))
print('max no. of words in a job title: {}'.format(max_len))
job_title_len = map(n_word, job_titles)
    
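n_word is also provided by the helper modules; a one-line stand-in, assuming whitespace tokenisation, would be:
In [ ]:
    
# Hypothetical stand-in for the imported n_word() helper
def n_word(s):
    return len(s.split())
    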
In [ ]:
    
quantile(job_title_len)
    
In [ ]:
    
plt.hist(job_title_len, bins=np.unique(job_title_len))
plt.xlabel('# words in job title')
plt.ylabel('# job titles')
plt.show()
    
In [ ]:
    
count_vec = text_manip.CountVectorizer(vocabulary=job_titles, ngram_range=(1,6))
    
In [ ]:
    
t0 = time()
print('Building applicant-apply-job matrix...')
user_apply_job = count_vec.fit_transform(docs)
print('Done after {}s'.format(time()-t0))
    
In [ ]:
    
# sparsity of applicant-apply-job
float(user_apply_job.nnz)/(n_applicant * n_job_title)
    
In [ ]:
    
nrow, ncol = user_apply_job.shape[0], user_apply_job.shape[1]
print('Dimension of applicant-apply-job matrix: {} x {}'.format(nrow, ncol))
    
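As a quick check (a minimal sketch, assuming n_applicant and n_job_title from the earlier cells are still in scope), the printed shape should match the counts computed above:
In [ ]:
    
assert (nrow, ncol) == (n_applicant, n_job_title), 'unexpected matrix dimensions'
    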
In [ ]:
    
feats = count_vec.get_feature_names()
# sum([1 for j in first_user_job_titles if j in feats])
    
In [ ]:
    
from scipy.io import *
mmwrite(DATA_DIR + 'user_apply_job.mtx', user_apply_job)
    
In [ ]:
    
# first_user_job_titles = docs[0].split(',')
# n_job_in_vocab = sum([1 for j in first_user_job_titles if j in vocab])
# print('Total # jobs: %d' %len(first_user_job_titles))
# print('# jobs in vocab: %d' %n_job_in_vocab)
# all(j in vocab for j in first_user_job_titles)
    
In [ ]:
    
quantile(apps_by_job_comp['n_apply'])
    
In [ ]:
    
apps_by_job_comp.rename(columns={'organisation_name_ep': 'employer_name', 'reg_no_uen_ep': 'employer_id'}, inplace=True)
apps_by_job_comp.query('n_apply >= 50')
    
In [ ]:
    
apps.query('uid == 103204').query('job_title == "analyst"').query('reg_no_uen_ep=="196800306E"').to_csv(RES_DIR + 'tmp.csv')
    
In [ ]:
    
apps_by_job_comp['job_employer'] = apps_by_job_comp['job_title'] + ' at ' + apps_by_job_comp['employer_name']
apps_by_job_comp.head()
    
In [ ]:
    
uniq_job_employers = np.unique(apps_by_job_comp['job_employer'])
len(uniq_job_employers)
    
In [ ]:
    
users = np.unique(apps['uid'])
len(users)
    
In [ ]:
    
job_employer_idx = { uniq_job_employers[i]:i for i in range(len(uniq_job_employers))}
index_of_users = { users[i]:i for i in range(len(users)) }
    
In [ ]:
    
apps_by_job_comp.apply(putTriple, axis=1)
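putTriple comes from job_rec_helpers and is not shown here; a hypothetical sketch, assuming it collects (user index, job-at-company index, n_apply) triples for a later sparse matrix, might look like:
In [ ]:
    
# Hypothetical sketch of a putTriple-style helper (the real one lives in job_rec_helpers)
row_ind, col_ind, ratings = [], [], []

def putTriple(r):
    row_ind.append(index_of_users[r['uid']])
    col_ind.append(job_employer_idx[r['job_employer']])
    ratings.append(r['n_apply'])

# After the apply() call above, the collected triples could be assembled into a
# user x job-at-company matrix:
user_apply_job_comp = sp.csr_matrix((ratings, (row_ind, col_ind)),
                                    shape=(len(index_of_users), len(job_employer_idx)))
    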