This notebook computes all the statistics needed for the project.



In [1]:

    
import my_util as my_util; from my_util import *



In [2]:

    
# Project root, and the folder under it that holds the cleaned CSV files.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = ''.join([HOME_DIR, 'data/clean/'])



In [7]:

    
# Job-title table that the domain and function statistics below aggregate;
# a later cell reports its row count as the number of job titles with >= 2 posts.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv')

Helpers



In [13]:

    
def distTitle(agg_df, for_domain=False, for_func=False):
    """Draw a histogram of `agg_df.n_title` on a new figure and return the figure.

    The x-axis label carries the mean of `n_title`, rounded to one decimal.
    The y-axis label is '# domains' when `for_domain` is set and '# functions'
    when `for_func` is set; with neither flag no y-axis label is written.
    """
    figure = plt.figure()

    title_counts = agg_df.n_title
    plt.hist(title_counts)

    avg_titles = round(title_counts.mean(), 1)
    mean_note = r'$(\mu = {})$'.format(avg_titles)
    plt.xlabel('# job titles' + mean_note, fontsize=16)

    # Checked in this order, so `for_func` wins if both flags are set.
    if for_domain:
        plt.ylabel('# domains', fontsize=16)
    if for_func:
        plt.ylabel('# functions', fontsize=16)

    plt.grid(True)
    return figure

def aggBy(col, title_df):
    """Aggregate title statistics for each distinct value of `col`.

    Prints the number of groups, then returns one row per group.

    Parameters
    ----------
    col : str
        Name of the column of `title_df` to group by (e.g. 'domain').
    title_df : pandas.DataFrame
        Must contain `col`, 'title', 'non_std_title' and 'n_post'.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns `col`, 'n_title' (number of
        distinct 'title' values), 'non_std_title' (number of distinct
        'non_std_title' values) and 'n_post' (total of 'n_post').
    """
    by_col = title_df.groupby(col)
    print('# {}: {}'.format(col, by_col.ngroups))

    # The 'sum' alias selects the groupby sum directly. Passing the builtin
    # `sum` yields the same totals but raises a FutureWarning in recent pandas.
    agg_df = by_col.agg({'title': 'nunique',
                         'non_std_title': 'nunique',
                         'n_post': 'sum'})

    # Only 'title' is renamed; 'non_std_title' deliberately keeps its name so
    # the output schema (and the CSV files written from it) stays unchanged.
    return agg_df.rename(columns={'title': 'n_title'}).reset_index()

Distribution of job posts among job titles



In [3]:

    
# Per-title statistics table; `titles` is its 'title' column, used here to count rows.
title_stats = pd.read_csv(DATA_DIR + 'stats_job_titles.csv')
titles = title_stats['title']
print('# titles: %d' %len(titles))









    



# titles: 4717



In [7]:

    
# Precomputed job-post distribution table; previewed and plotted in later cells.
by_n_post = pd.read_csv(DATA_DIR + 'stats_job_post_dist.csv')



In [8]:

    
# Preview the first rows of the distribution table.
by_n_post.head()









    Out[8]:






  
    
      
      n_post
      n_title
      n_title_after_std
    
  
  
    
      0
      1.0
      1334.0
      1334.0
    
    
      1
      2.0
      646.0
      619.0
    
    
      2
      3.0
      431.0
      414.0
    
    
      3
      4.0
      333.0
      323.0
    
    
      4
      5.0
      243.0
      235.0

Job posts distribution among standard job titles



In [ ]:

    
# Number of titles for each post count, one row per distinct n_post value.
by_n_post_after_std = (
    title_stats
    .groupby('n_post')
    .agg({'title': len})
    .rename(columns={'title': 'n_title_after_std'})
    .reset_index()
)
quantile(by_n_post_after_std.n_post)



In [ ]:

    
# Plot the job-post distribution, save it as a PDF, then show and close the figure.
# NOTE(review): neither vizJobPostDist nor RES_DIR is defined in the cells visible
# here; presumably both come from the `my_util` star import — confirm before re-running.
fig = vizJobPostDist(by_n_post)
plt.savefig(RES_DIR + 'fig/dist_job_post_by_title.pdf')
plt.show(); plt.close()



In [5]:

    
# Row count of title_df, reported as the number of job titles with >= 2 posts.
print('# job titles with >= 2 posts: {}'.format(title_df.shape[0]) )









    



# job titles with >= 2 posts: 2983

Statistics for Domains

Note: Only the domains of job titles with >= 2 posts are included.



In [14]:

    
# Per-domain aggregates; aggBy also prints the number of distinct domains.
by_domain_agg = aggBy('domain', title_df)









    



# domain: 1021



In [15]:

    
# Rank domains by their number of distinct titles and persist the ranked table.
by_domain_agg = by_domain_agg.sort_values('n_title', ascending=False)
by_domain_agg.to_csv(DATA_DIR + 'stats_domains.csv', index=False)

# Summary statistics rounded to one decimal: written to disk and shown as the cell output.
domain_desc = by_domain_agg.describe().round(1)
domain_desc.to_csv(DATA_DIR + 'tmp/domain_desc.csv')
domain_desc









    Out[15]:






  
    
      
      n_post
      non_std_title
      n_title
    
  
  
    
      count
      1021.0
      1021.0
      1021.0
    
    
      mean
      57.3
      2.7
      2.6
    
    
      std
      187.7
      3.6
      3.4
    
    
      min
      2.0
      1.0
      1.0
    
    
      25%
      4.0
      1.0
      1.0
    
    
      50%
      10.0
      1.0
      1.0
    
    
      75%
      33.0
      3.0
      2.0
    
    
      max
      2068.0
      36.0
      35.0



In [17]:

    
# Close every figure that is still open.
plt.close('all')



In [16]:

    
# Histogram of the number of titles per domain, saved as a PDF next to the data.
fig = distTitle(by_domain_agg, for_domain=True)
fig.set_tight_layout(True)
plt.savefig(DATA_DIR + 'title_dist_by_domain.pdf')
plt.show()
plt.close()









    



D:\conda\lib\site-packages\matplotlib\figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

Why is the number of job titles in IT reduced so much after standardization?



In [21]:

    
# List every 'information technology' row ordered by std_title, so that raw titles
# mapped to the same standardized title appear next to each other.
# NOTE(review): this reads a 'std_title' column, whereas aggBy aggregates
# 'non_std_title'; the two cells appear to rely on different versions of
# title_df — confirm the schema before re-running from a fresh kernel.
title_df.query('domain == "information technology"').sort_values('std_title')









    Out[21]:






  
    
      
      avg_n_skill
      domain
      n_employer
      n_post
      position
      pri_func
      sec_func
      title
      std_title
    
  
  
    
      386
      19.1
      information technology
      32.0
      35.0
      assistant
      manager
      NaN
      Assistant Information Technology Manager
      Assistant Information Technology Manager
    
    
      143
      16.1
      information technology
      67.0
      80.0
      NaN
      administrator
      NaN
      Application Administrator - Information Techno...
      Information Technology Administrator
    
    
      53
      16.3
      information technology
      90.0
      186.0
      NaN
      analyst
      NaN
      Information Technology Analyst
      Information Technology Analyst
    
    
      500
      18.7
      information technology
      22.0
      27.0
      NaN
      analyst
      NaN
      Security Analyst, Information Technology
      Information Technology Analyst
    
    
      1070
      17.3
      information technology
      11.0
      11.0
      NaN
      auditor
      NaN
      Auditor, Information Technology
      Information Technology Auditor
    
    
      1277
      12.6
      information technology
      8.0
      9.0
      NaN
      auditor
      NaN
      Information Technology Auditor
      Information Technology Auditor
    
    
      44
      15.6
      information technology
      57.0
      227.0
      NaN
      consultant
      NaN
      Information Technology Consultant
      Information Technology Consultant
    
    
      1027
      14.2
      information technology
      8.0
      12.0
      NaN
      coordinator
      NaN
      Information Technology Coordinator
      Information Technology Coordinator
    
    
      841
      21.5
      information technology
      13.0
      16.0
      NaN
      director
      NaN
      Information Technology Director
      Information Technology Director
    
    
      687
      13.0
      information technology
      18.0
      20.0
      NaN
      engineer
      NaN
      Field Engineer, Information Technology (It)
      Information Technology Engineer
    
    
      52
      14.3
      information technology
      84.0
      190.0
      NaN
      engineer
      NaN
      Information Technology Engineer
      Information Technology Engineer
    
    
      124
      13.3
      information technology
      76.0
      92.0
      NaN
      executive
      NaN
      Information Technology Executive
      Information Technology Executive
    
    
      2430
      6.7
      information technology
      3.0
      3.0
      NaN
      junior
      NaN
      Information Technology Junior Research Scientist
      Information Technology Junior
    
    
      67
      18.3
      information technology
      109.0
      156.0
      NaN
      manager
      NaN
      Information Technology Manager
      Information Technology Manager
    
    
      275
      16.9
      information technology
      36.0
      48.0
      NaN
      manager
      NaN
      Project Manager, Information Technology
      Information Technology Manager
    
    
      1532
      16.1
      information technology
      7.0
      7.0
      NaN
      master
      NaN
      Information Technology Master Data Management ...
      Information Technology Master
    
    
      2431
      22.7
      information technology
      3.0
      3.0
      NaN
      master
      NaN
      Information Technology Master Data Management ...
      Information Technology Master
    
    
      1187
      11.2
      information technology
      8.0
      10.0
      NaN
      specialist
      NaN
      Support Specialist, Information Technology
      Information Technology Specialist
    
    
      1699
      16.2
      information technology
      6.0
      6.0
      NaN
      specialist
      NaN
      Security Specialist, Information Technology
      Information Technology Specialist
    
    
      29
      14.0
      information technology
      40.0
      292.0
      NaN
      specialist
      NaN
      Information Technology Specialist
      Information Technology Specialist
    
    
      3232
      11.0
      information technology
      2.0
      2.0
      NaN
      specialist
      NaN
      Quality Assurance Specialist, Information Tech...
      Information Technology Specialist
    
    
      2423
      16.0
      information technology
      3.0
      3.0
      NaN
      supervisor
      NaN
      Information Technology Supervisor
      Information Technology Supervisor
    
    
      591
      13.3
      information technology
      23.0
      23.0
      NaN
      technician
      NaN
      Information Technology Technician
      Information Technology Technician
    
    
      1946
      10.6
      information technology
      5.0
      5.0
      NaN
      trainer
      NaN
      Information Technology Trainer (Extracurriculum)
      Information Technology Trainer
    
    
      1771
      12.2
      information technology
      4.0
      5.0
      senior
      analyst
      NaN
      Senior Security Analyst, Information Technology
      Senior Information Technology Analyst
    
    
      482
      17.9
      information technology
      14.0
      27.0
      senior
      analyst
      NaN
      Senior Information Technology Analyst
      Senior Information Technology Analyst
    
    
      3131
      15.5
      information technology
      2.0
      2.0
      senior
      auditor
      NaN
      Senior Information Technology Auditor
      Senior Information Technology Auditor
    
    
      2142
      13.0
      information technology
      4.0
      4.0
      senior
      auditor
      NaN
      Senior Auditor, Information Technology
      Senior Information Technology Auditor
    
    
      545
      16.6
      information technology
      12.0
      25.0
      senior
      consultant
      NaN
      Senior Information Technology Consultant
      Senior Information Technology Consultant
    
    
      2243
      17.5
      information technology
      1.0
      4.0
      senior
      coordinator
      NaN
      Senior Information Technology Coordinator
      Senior Information Technology Coordinator
    
    
      1753
      22.2
      information technology
      5.0
      5.0
      senior
      engineer
      NaN
      Senior Information Technology Engineer
      Senior Information Technology Engineer
    
    
      1229
      18.6
      information technology
      9.0
      9.0
      senior
      executive
      NaN
      Senior Information Technology Executive
      Senior Information Technology Executive
    
    
      997
      19.8
      information technology
      10.0
      13.0
      senior
      manager
      NaN
      Senior Information Technology Manager
      Senior Information Technology Manager
    
    
      925
      16.6
      information technology
      11.0
      14.0
      senior
      manager
      NaN
      Senior Project Manager, Information Technology
      Senior Information Technology Manager
    
    
      1935
      16.8
      information technology
      5.0
      5.0
      senior
      manager
      NaN
      Senior Manager, Information Technology
      Senior Information Technology Manager
    
    
      999
      16.4
      information technology
      8.0
      13.0
      senior
      specialist
      NaN
      Senior Information Technology Specialist
      Senior Information Technology Specialist
    
    
      2080
      30.8
      information technology
      4.0
      4.0
      senior
      specialist
      NaN
      Senior Security Specialist, Information Techno...
      Senior Information Technology Specialist
    
    
      2026
      17.5
      information technology
      2.0
      4.0
      senior
      specialist
      NaN
      Senior Support Specialist, Information Technology
      Senior Information Technology Specialist
    
    
      2517
      9.7
      information technology
      2.0
      3.0
      vice
      president
      NaN
      Vice President, Information Technology
      Vice Information Technology President

Statistics for functions

Note: Functions are limited to those of job titles with >= 2 posts.



In [17]:

    
# Per-function aggregates, ranked by number of distinct titles, then persisted.
by_func_agg = aggBy('pri_func', title_df).sort_values('n_title', ascending=False)
by_func_agg.to_csv(DATA_DIR + 'stats_pri_funcs.csv', index=False)

# Summary statistics rounded to one decimal: written to disk and shown as the cell output.
func_desc = by_func_agg.describe().round(1)
func_desc.to_csv(DATA_DIR + 'tmp/func_desc.csv')
func_desc









    



# pri_func: 239






    Out[17]:






  
    
      
      n_post
      non_std_title
      n_title
    
  
  
    
      count
      239.0
      239.0
      239.0
    
    
      mean
      293.7
      12.5
      11.9
    
    
      std
      1392.2
      49.5
      47.6
    
    
      min
      2.0
      1.0
      1.0
    
    
      25%
      4.0
      1.0
      1.0
    
    
      50%
      11.0
      2.0
      1.0
    
    
      75%
      66.5
      5.0
      4.0
    
    
      max
      16318.0
      570.0
      545.0



In [18]:

    
# Ten functions with the most distinct titles (by_func_agg is sorted by n_title, descending).
by_func_agg.head(10)









    Out[18]:






  
    
      
      pri_func
      n_post
      non_std_title
      n_title
    
  
  
    
      130
      manager
      16318
      570
      545
    
    
      85
      engineer
      9525
      382
      364
    
    
      91
      executive
      7463
      220
      217
    
    
      10
      analyst
      4870
      138
      137
    
    
      141
      officer
      2650
      127
      124
    
    
      51
      consultant
      2779
      100
      99
    
    
      73
      director
      1250
      96
      91
    
    
      191
      specialist
      1411
      89
      89
    
    
      17
      assistant
      3676
      82
      79
    
    
      212
      technician
      1109
      84
      76



In [19]:

    
# Histogram of the number of titles per function, saved as a PDF next to the data.
fig = distTitle(by_func_agg, for_func=True)
fig.set_tight_layout(True)
plt.savefig(DATA_DIR + 'title_dist_by_func.pdf')
plt.show()
plt.close()



In [31]:

    
# Number of rows whose domain is 'information technology'.
(title_df.domain == 'information technology').sum()









    Out[31]:





39



In [33]:

    
# Number of distinct standardized titles whose primary function is 'technician'.
title_df.loc[title_df.pri_func == 'technician', 'std_title'].nunique()









    Out[33]:





86



In [43]:

    
# Job table; the later merge takes its 'job_id' and 'doc' columns.
job_df = pd.read_csv(DATA_DIR + 'jobs.csv')

# Call-style print runs on both Python 2 and Python 3 and matches the other cells;
# the bare `print x` statement is a SyntaxError on Python 3.
print(job_df.shape)
job_df.head(1)









    



(137564, 4)






    Out[43]:






  
    
      
      job_id
      title
      employer_name
      doc
    
  
  
    
      0
      JOB-2015-0145758
      Architectural Assistant
      HDA Architects
      to assist the project architect in developing ...



In [36]:

    
# Full job-post table.
# NOTE(review): a later cell writes the merged frame back to this same
# 'job_posts.csv' path, so what is loaded here depends on whether that cell has run.
full_job_df = pd.read_csv(DATA_DIR + 'job_posts.csv')



In [38]:

    
# Call-style print runs on both Python 2 and Python 3 and matches the other cells;
# the bare `print x` statement is a SyntaxError on Python 3.
print(full_job_df.shape)
full_job_df.head(1)









    



(249386, 17)






    Out[38]:






  
    
      
      employer_id
      job_id
      no_of_applications
      job_posting_date_history
      job_expiry_date_history
      job_expiry_year
      job_ssoc_code_list
      title
      no_of_vacancies
      job_monthly_min_sal
      job_monthly_max_sal
      no_of_times_job_posting_reopened
      job_posting_status
      job_employment_type_list
      job_experience_required_years
      job_level
      employer_name
    
  
  
    
      0
      002
      JOB-2015-0145758
      1.0
      2015-01-13
      2015-02-12
      2015
      31275
      Architectural Assistant
      2
      4000.0
      6000.0
      0
      Closed
      Full Time, Permanent
      3.0
      Manager
      HDA Architects



In [44]:

    
# Attach the 'doc' text to every job post. No `on=` is given, so pandas joins on
# the columns the two frames share, and the default join keeps only matching rows.
# NOTE(review): the recorded outputs show the row count dropping (249386 -> 176723)
# and job_id values repeating after this merge — confirm both are intended.
full_job_df = pd.merge(full_job_df, job_df[['job_id', 'doc']])
# Call-style print runs on both Python 2 and Python 3; `print x` fails on Python 3.
print(full_job_df.shape)









    



(176723, 18)



In [47]:

    
# Compare the total row count with the number of distinct job ids to expose
# repeated job_id rows, then preview the first row of the merged frame.
print('# job ids including dups: %d' %len(full_job_df.job_id))
print('# unique job ids: %d' % full_job_df.job_id.nunique())
full_job_df.head(1)









    



# job ids including dups: 176723
# unique job ids: 137554






    Out[47]:






  
    
      
      employer_id
      job_id
      no_of_applications
      job_posting_date_history
      job_expiry_date_history
      job_expiry_year
      job_ssoc_code_list
      title
      no_of_vacancies
      job_monthly_min_sal
      job_monthly_max_sal
      no_of_times_job_posting_reopened
      job_posting_status
      job_employment_type_list
      job_experience_required_years
      job_level
      employer_name
      doc
    
  
  
    
      0
      002
      JOB-2015-0145758
      1.0
      2015-01-13
      2015-02-12
      2015
      31275
      Architectural Assistant
      2
      4000.0
      6000.0
      0
      Closed
      Full Time, Permanent
      3.0
      Manager
      HDA Architects
      to assist the project architect in developing ...



In [46]:

    
# Persist the merged job-post table.
# NOTE(review): this overwrites 'job_posts.csv', the same file full_job_df was
# read from, so the original un-merged input is lost and re-running the notebook
# starts from the merged data — consider writing to a separate file.
full_job_df.to_csv(DATA_DIR + 'job_posts.csv', index=False)

	n_post	n_title	n_title_after_std
0	1.0	1334.0	1334.0
1	2.0	646.0	619.0
2	3.0	431.0	414.0
3	4.0	333.0	323.0
4	5.0	243.0	235.0

	n_post	non_std_title	n_title
count	1021.0	1021.0	1021.0
mean	57.3	2.7	2.6
std	187.7	3.6	3.4
min	2.0	1.0	1.0
25%	4.0	1.0	1.0
50%	10.0	1.0	1.0
75%	33.0	3.0	2.0
max	2068.0	36.0	35.0

	avg_n_skill	domain	n_employer	n_post	position	pri_func	sec_func	title	std_title
386	19.1	information technology	32.0	35.0	assistant	manager	NaN	Assistant Information Technology Manager	Assistant Information Technology Manager
143	16.1	information technology	67.0	80.0	NaN	administrator	NaN	Application Administrator - Information Techno...	Information Technology Administrator
53	16.3	information technology	90.0	186.0	NaN	analyst	NaN	Information Technology Analyst	Information Technology Analyst
500	18.7	information technology	22.0	27.0	NaN	analyst	NaN	Security Analyst, Information Technology	Information Technology Analyst
1070	17.3	information technology	11.0	11.0	NaN	auditor	NaN	Auditor, Information Technology	Information Technology Auditor
1277	12.6	information technology	8.0	9.0	NaN	auditor	NaN	Information Technology Auditor	Information Technology Auditor
44	15.6	information technology	57.0	227.0	NaN	consultant	NaN	Information Technology Consultant	Information Technology Consultant
1027	14.2	information technology	8.0	12.0	NaN	coordinator	NaN	Information Technology Coordinator	Information Technology Coordinator
841	21.5	information technology	13.0	16.0	NaN	director	NaN	Information Technology Director	Information Technology Director
687	13.0	information technology	18.0	20.0	NaN	engineer	NaN	Field Engineer, Information Technology (It)	Information Technology Engineer
52	14.3	information technology	84.0	190.0	NaN	engineer	NaN	Information Technology Engineer	Information Technology Engineer
124	13.3	information technology	76.0	92.0	NaN	executive	NaN	Information Technology Executive	Information Technology Executive
2430	6.7	information technology	3.0	3.0	NaN	junior	NaN	Information Technology Junior Research Scientist	Information Technology Junior
67	18.3	information technology	109.0	156.0	NaN	manager	NaN	Information Technology Manager	Information Technology Manager
275	16.9	information technology	36.0	48.0	NaN	manager	NaN	Project Manager, Information Technology	Information Technology Manager
1532	16.1	information technology	7.0	7.0	NaN	master	NaN	Information Technology Master Data Management ...	Information Technology Master
2431	22.7	information technology	3.0	3.0	NaN	master	NaN	Information Technology Master Data Management ...	Information Technology Master
1187	11.2	information technology	8.0	10.0	NaN	specialist	NaN	Support Specialist, Information Technology	Information Technology Specialist
1699	16.2	information technology	6.0	6.0	NaN	specialist	NaN	Security Specialist, Information Technology	Information Technology Specialist
29	14.0	information technology	40.0	292.0	NaN	specialist	NaN	Information Technology Specialist	Information Technology Specialist
3232	11.0	information technology	2.0	2.0	NaN	specialist	NaN	Quality Assurance Specialist, Information Tech...	Information Technology Specialist
2423	16.0	information technology	3.0	3.0	NaN	supervisor	NaN	Information Technology Supervisor	Information Technology Supervisor
591	13.3	information technology	23.0	23.0	NaN	technician	NaN	Information Technology Technician	Information Technology Technician
1946	10.6	information technology	5.0	5.0	NaN	trainer	NaN	Information Technology Trainer (Extracurriculum)	Information Technology Trainer
1771	12.2	information technology	4.0	5.0	senior	analyst	NaN	Senior Security Analyst, Information Technology	Senior Information Technology Analyst
482	17.9	information technology	14.0	27.0	senior	analyst	NaN	Senior Information Technology Analyst	Senior Information Technology Analyst
3131	15.5	information technology	2.0	2.0	senior	auditor	NaN	Senior Information Technology Auditor	Senior Information Technology Auditor
2142	13.0	information technology	4.0	4.0	senior	auditor	NaN	Senior Auditor, Information Technology	Senior Information Technology Auditor
545	16.6	information technology	12.0	25.0	senior	consultant	NaN	Senior Information Technology Consultant	Senior Information Technology Consultant
2243	17.5	information technology	1.0	4.0	senior	coordinator	NaN	Senior Information Technology Coordinator	Senior Information Technology Coordinator
1753	22.2	information technology	5.0	5.0	senior	engineer	NaN	Senior Information Technology Engineer	Senior Information Technology Engineer
1229	18.6	information technology	9.0	9.0	senior	executive	NaN	Senior Information Technology Executive	Senior Information Technology Executive
997	19.8	information technology	10.0	13.0	senior	manager	NaN	Senior Information Technology Manager	Senior Information Technology Manager
925	16.6	information technology	11.0	14.0	senior	manager	NaN	Senior Project Manager, Information Technology	Senior Information Technology Manager
1935	16.8	information technology	5.0	5.0	senior	manager	NaN	Senior Manager, Information Technology	Senior Information Technology Manager
999	16.4	information technology	8.0	13.0	senior	specialist	NaN	Senior Information Technology Specialist	Senior Information Technology Specialist
2080	30.8	information technology	4.0	4.0	senior	specialist	NaN	Senior Security Specialist, Information Techno...	Senior Information Technology Specialist
2026	17.5	information technology	2.0	4.0	senior	specialist	NaN	Senior Support Specialist, Information Technology	Senior Information Technology Specialist
2517	9.7	information technology	2.0	3.0	vice	president	NaN	Vice President, Information Technology	Vice Information Technology President

	n_post	non_std_title	n_title
count	239.0	239.0	239.0
mean	293.7	12.5	11.9
std	1392.2	49.5	47.6
min	2.0	1.0	1.0
25%	4.0	1.0	1.0
50%	11.0	2.0	1.0
75%	66.5	5.0	4.0
max	16318.0	570.0	545.0

	pri_func	n_post	non_std_title	n_title
130	manager	16318	570	545
85	engineer	9525	382	364
91	executive	7463	220	217
10	analyst	4870	138	137
141	officer	2650	127	124
51	consultant	2779	100	99
73	director	1250	96	91
191	specialist	1411	89	89
17	assistant	3676	82	79
212	technician	1109	84	76