In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from my_util import *
from job_rec_helpers import *
In [337]:
# Global vars: all filesystem locations in one place so the rest of the
# notebook never hardcodes a path.
# NOTE(review): absolute Windows paths tie this notebook to one machine;
# consider deriving them from a configurable base directory.
DATA_DIR = 'D:/larc_projects/job_analytics/data/clean/'
RES_DIR = 'D:/larc_projects/job_analytics/results/'  # was lowercase 'd:' — normalized drive-letter case
AGG_DIR = RES_DIR + 'agg/'
FIG_DIR = RES_DIR + 'figs/'
In [ ]:
# Load the pre-cleaned application log (one row per application event).
apps = pd.read_csv(DATA_DIR + 'apps_with_time.csv')
# Sanity check: (n_rows, n_cols).
apps.shape
In [183]:
# Rm noise (numbers) in job_title column.
# Fix: the original used a bare `map(is_number, ...)`, which is Python-2-only
# here — on Python 3 `map` returns a lazy iterator, which breaks column
# assignment. `Series.map` behaves identically on both versions.
apps['is_number'] = apps['job_title'].map(is_number)
apps = apps.query('is_number == False')
In [198]:
# del apps['user.id']; del apps['user.index']; del apps['item.index']; del apps['freq']
# Normalize dotted column names to the snake_case used throughout the notebook.
apps.rename(columns={'job.id': 'job_id', 'job.title': 'job_title', 'apply.date': 'apply_date'},
inplace=True)
# Persist the cleaned table back over the source CSV (destructive overwrite).
apps.to_csv(DATA_DIR + 'apps_with_time.csv', index=False)
apps.head(3)
Out[198]:
In [267]:
# Headline corpus statistics.
n_applicant = apps['uid'].nunique(); n_application = apps.shape[0]
n_job = len(np.unique(apps['job_id'])); n_job_title = len(np.unique(apps['job_title']))
# NOTE(review): `posts` is only loaded much further down (full_job_posts.csv
# cell) — this line relies on out-of-order execution. Run the posts-loading
# cell before this one on a fresh kernel.
n_company = posts['company_registration_number_uen_ep'].nunique()
In [268]:
# Collect the headline counts into a one-row summary frame for display/export.
stats = pd.DataFrame([{'n_application': n_application, 'n_applicant': n_applicant,
                       'n_job': n_job, 'n_job_title': n_job_title, 'n_company': n_company}])
stats
Out[268]:
In [269]:
stats.to_csv(DATA_DIR + 'stats/stats.csv', index=False)
In [199]:
# Per (applicant, job title): number of distinct jobs and distinct days applied.
agg_apps = apps.groupby(by=['uid', 'job_title']).agg({'job_id': 'nunique', 'apply_date': 'nunique'})
# convert to DF
agg_apps = agg_apps.add_prefix('n_').reset_index()
# Alias: "number of applications" = number of distinct jobs applied to under this title.
agg_apps['n_apply'] = agg_apps['n_job_id']
agg_apps.head(3)
Out[199]:
Let's look at the quartiles of the number of times an applicant applies for a specific job.
In [103]:
quantile(df['n_apply'])
Out[103]:
As expected, in most cases (50%), an applicant applies just once for a specific job. However, we can also see at least one extreme case where an applicant applied 582 times to a single job title. Thus, let's look more closely at the distribution of $N_{apply}$.
In [ ]:
# Log-scaled histogram of N_apply across applicant-job pairs.
# Fixes from review: (1) `df` is not defined yet on a fresh run — the data is
# `agg_apps`; (2) the figure was being saved into the *data* directory
# rather than the figures directory.
plt.hist(agg_apps['n_apply'], bins=np.unique(agg_apps['n_apply']), log=True)
plt.xlabel(r'$N_{apply}$')
plt.ylabel('# applicant-job pairs (log scale)')
plt.savefig(FIG_DIR + 'apply_freq.pdf')
plt.show()
plt.close()
In [218]:
# Extreme cases: an applicant hitting the same job title 100+ times.
# Fix: sorting the `query()` slice in place triggered SettingWithCopyWarning;
# build the sorted frame in one chained expression instead.
extremes = (agg_apps.query('n_apply >= 100')
            .sort_values(by='n_apply', ascending=False))
extremes.head()
Out[218]:
In [105]:
print('No. of extreme cases: {}'.format(extremes.shape[0]))
To get a more complete picture of these extreme cases, let's add the apply dates and the companies of those jobs.
In [84]:
# ext_users = np.unique(extremes['uid'])
# Restrict the raw application log to the extreme (uid, job_title) PAIRS.
# Fix: the original filtered uid and job_title with two independent isin()
# calls, which also keeps non-extreme combinations (e.g. an extreme user's
# applications to a title that is only extreme for a *different* user).
# An inner merge on the pair keeps exactly the extreme cases.
df = apps.merge(extremes[['uid', 'job_title']].drop_duplicates(),
                on=['uid', 'job_title'])
ext_apps = df
ext_apps.head(1)
Out[84]:
In [225]:
# Attach per-pair timing info (first/last apply date, active days, duration).
# `calDuration` comes from job_rec_helpers; presumably it aggregates by
# (uid, job_title) and returns those pairs as the index — TODO confirm.
res = calDuration(ext_apps)
res = pd.merge(res, extremes, left_index=True, right_on=['uid', 'job_title'])
res.sort_values(by='uid', inplace=True)
# Fix a readable column order for inspection/export.
res = res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]
In [226]:
# Show the heaviest extreme cases first.
res = res.sort_values('n_apply', ascending=False)
res.head()
Out[226]:
In [228]:
res.tail()
Out[228]:
In [239]:
# Drill into one extreme applicant (uid 33833): distinct jobs applied per title.
one_user = apps.query('uid==33833')[['uid', 'job_title', 'job_id']]
tmp = one_user.groupby('job_title').agg({'job_id': 'nunique'})
tmp = tmp.add_prefix('n_').reset_index()
tmp.rename(columns={'n_job_id': 'n_apply'}, inplace=True)
tmp.sort_values('n_apply', ascending=False, inplace=True)
Out[239]:
In [240]:
quantile(res['n_active_day'])
Out[240]:
In [241]:
res.to_csv(RES_DIR + 'extremes.csv')
In [227]:
# Timing features for ALL (uid, job_title) pairs, not just the extremes.
apps_with_duration = calDuration(apps)
apps_with_duration.head()
Out[227]:
In [244]:
# Same merge as for the extremes, now over every (uid, job_title) pair.
all_res = pd.merge(apps_with_duration, agg_apps, left_index=True, right_on=['uid', 'job_title'])
all_res.sort_values(by='uid', inplace=True)
# Fix a readable column order for inspection/export.
all_res = all_res[['uid', 'job_title', 'n_apply', 'first_apply_date', 'last_apply_date', 'n_active_day', 'total_duration_in_day']]
all_res.head()
Out[244]:
In [334]:
all_res.shape
Out[334]:
In [335]:
all_res.to_csv(AGG_DIR + 'timed_apps.csv', index=False)
In [249]:
# Split the population: "normal" pairs (< 100 applications) vs the extremes
# computed earlier.
normal = all_res[all_res['n_apply'] < 100]
extremes = res
In [253]:
# Side-by-side distributions of active days: extremes vs normal cases.
fig, (ax_ext, ax_norm) = plt.subplots(1, 2, figsize=(10, 6))
ax_ext.hist(extremes['n_active_day'], bins=np.unique(extremes['n_active_day']))
ax_ext.set_title('Extreme cases')
ax_ext.set_xlabel('# active days')
ax_ext.set_ylabel('# user-apply-job cases')
fig.subplots_adjust(wspace=.5)
# Log scale on the normal cases: counts span several orders of magnitude.
ax_norm.hist(normal['n_active_day'], bins=np.unique(normal['n_active_day']),
             log=True)
ax_norm.set_title('Normal cases')
ax_norm.set_xlabel('# active days')
ax_norm.set_ylabel('# user-apply-job cases')
fig.savefig(RES_DIR + 'n_active_day.pdf')
plt.show()
plt.close()
In [ ]:
def count_unique_per_user(frame, col):
    """Per uid, count distinct values of `col`.

    Returns a DataFrame with columns ['uid', 'n_<col>'], sorted descending
    on the count. Factored out because the original cell copy-pasted the
    identical pipeline for 'job_title' and 'job_id'.
    """
    out = frame[['uid', col]].groupby('uid').agg({col: 'nunique'})
    out = out.add_prefix('n_').reset_index()
    out.sort_values('n_' + col, ascending=False, inplace=True)
    return out

# How many distinct job titles / distinct jobs each applicant applied to.
agg_job_title = count_unique_per_user(apps, 'job_title')
agg_job_id = count_unique_per_user(apps, 'job_id')
# Inner join on the common 'uid' column: one row per applicant, both counts.
agg_df = pd.merge(agg_job_title, agg_job_id)
In [272]:
# Clearer name: 'n_job' = number of distinct jobs applied to.
agg_df.rename(columns={'n_job_id': 'n_job'}, inplace=True)
agg_df.head()
Out[272]:
In [281]:
plt.close('all')
In [283]:
# Log-log plots: how many applicants applied to k job titles / k jobs.
fig = plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
loglog(agg_df['n_job_title'], xl='# Job titles applied', yl='# applicants')
plt.subplots_adjust(wspace=.5)
plt.subplot(1,2,2)
loglog(agg_df['n_job'], xl='# Jobs applied', yl='# applicants')
# Bug fix: FIG_DIR already ends in 'figs/', so the original path duplicated
# the directory ('.../figs/figs/applied_jobs.pdf') and pointed at a folder
# that likely does not exist.
plt.savefig(FIG_DIR + 'applied_jobs.pdf')
plt.show()
plt.close()
In [184]:
print apps.shape[0]
In [336]:
# Join all job titles of each user for reference
t0 = time()
# `paste` (from job_rec_helpers) presumably concatenates each group's titles
# into a single string — TODO confirm against the helper's source.
tmp = apps[['uid', 'job_title']].groupby('uid').agg({'job_title': paste})
print('Finished joining job titles after {}s'.format(time()-t0))
tmp = tmp.add_suffix('s').reset_index()
# NOTE(review): `apps_by_job_title` is not defined anywhere above in this
# notebook — this cell depends on out-of-order/hidden kernel state and will
# fail on a fresh Restart & Run All.
apps_by_job_title = pd.merge(apps_by_job_title, tmp)
apps_by_job_title.sort_values('n_job_title', ascending=False, inplace=True)
apps_by_job_title.to_csv(AGG_DIR + 'apps_by_job_title.csv', index=False)
In [285]:
# Load the full job postings (includes employer registration numbers).
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
print(posts.shape)
# `dot2dash` (from my_util) presumably normalizes column names — TODO confirm.
posts = dot2dash(posts)
posts.head()
Out[285]:
In [304]:
# Extract just job id and employer id (deduplicated job -> employer mapping).
id_cols = ['job_id', 'company_registration_number_uen_ep']
job_and_employer = posts[id_cols].drop_duplicates()
job_and_employer.head(1)
Out[304]:
In [295]:
# Load employer details (names, desc,...)
employer_detail = pd.read_csv(DATA_DIR + 'employers.csv')
employer_detail.drop_duplicates(inplace=True)
print(employer_detail.shape)
# Normalize column names with the same helper used for `posts`.
employer_detail = dot2dash(employer_detail)
employer_detail.head(1)
Out[295]:
In [306]:
# Merge to add employer details
# Shorten the unwieldy registration-number column name first.
job_and_employer = job_and_employer.rename(columns={'company_registration_number_uen_ep': 'reg_no_uen_ep'})
# Two-step inner merge (pandas default): employer info onto jobs, then onto
# applications; join keys are the shared column names — TODO confirm they
# are exactly the id columns intended.
df = pd.merge(apps, pd.merge(job_and_employer, employer_detail))
print(df.shape)
# Rows with missing employer names first so they are easy to spot.
df.sort_values(by='organisation_name_ep', inplace=True, na_position='first')
df.head()
Out[306]:
In [308]:
# del df['is_number']
# df.to_csv(DATA_DIR + 'full_apps.csv', index=False)
# Peek at the merged applications + employer table.
df.head()
Out[308]:
In [338]:
# Per-user, per-company application slice.
# Fix: `.copy()` makes this an independent frame — assigning a new column
# on the raw `df[[...]]` slice triggered SettingWithCopyWarning.
user_apply_comp = df[['uid', 'reg_no_uen_ep', 'organisation_name_ep']].copy()
# Placeholder column so the groupby().count() in the next cell yields a
# column named 'n_apply'.
user_apply_comp['n_apply'] = ''
In [340]:
# Applications per (user, company): group and count rows.
# Idiom fix: groupby(...).size() counts group rows directly — it is exactly
# what the ''-placeholder column + count() combination was emulating, since
# the placeholder is never null.
apps_by_comp = (user_apply_comp
                .groupby(['uid', 'reg_no_uen_ep', 'organisation_name_ep'])
                .size().reset_index(name='n_apply'))
apps_by_comp.sort_values('n_apply', ascending=False, inplace=True)
apps_by_comp.head()
apps_by_comp.to_csv(AGG_DIR + 'apps_by_comp.csv', index=False)
In [ ]:
# Power-law check: distribution of per-(user, company) application counts.
loglog(apps_by_comp['n_apply'], xl='# applications', yl='# user-apply-company cases')
plt.savefig(FIG_DIR + 'user_comp.pdf')
plt.show()
plt.close()
In [323]:
quantile(user_apply_comp['n_apply'])
Out[323]:
In [342]:
# Applications per (user, job title, company).
# .copy() avoids SettingWithCopyWarning on the placeholder assignment below.
tmp = df[['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']].copy()
# Placeholder column so count() produces a column named 'n_apply'.
tmp['n_apply'] = ''
apps_by_job_comp = tmp.groupby(['uid', 'job_title', 'reg_no_uen_ep', 'organisation_name_ep']).count()
apps_by_job_comp = apps_by_job_comp.reset_index()
apps_by_job_comp.sort_values('n_apply', ascending=False, inplace=True)
# Fix: Python-2-only print statement -> function form (works on 2 and 3).
print(apps_by_job_comp.shape)
apps_by_job_comp.head()
Out[342]:
In [343]:
apps_by_job_comp.to_csv(AGG_DIR + 'apps_by_job_comp.csv', index=False)
In [ ]:
# Distribution of per-(user, title, company) application counts.
loglog(apps_by_job_comp['n_apply'], xl='# applications', yl='# user-apply-job-at-company cases')
plt.savefig(FIG_DIR + 'user_job_comp.pdf')
plt.show()
plt.close()
In [332]:
# Distinct (job title, employer) combinations present in the merged data.
job_comp = df.loc[:, ['job_title', 'organisation_name_ep']].drop_duplicates()
job_comp.shape
Out[332]: