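This notebook computes and exports the summary statistics needed for the project: the distribution of job posts over job titles, and the number of job titles per domain and per primary function.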


In [1]:
import my_util as my_util; from my_util import *

In [2]:
import os

# Project root. Defaults to the original workstation path, but can be
# overridden with the JOB_ANALYTICS_HOME environment variable so the notebook
# can run on another machine without editing this cell.
HOME_DIR = os.environ.get('JOB_ANALYTICS_HOME', 'd:/larc_projects/job_analytics/')
# Paths below are built by plain string concatenation, so the root must end
# with a path separator.
if not HOME_DIR.endswith(('/', '\\')):
    HOME_DIR += '/'

# Directory holding the cleaned CSV inputs; the statistics files are also
# written here.
DATA_DIR = HOME_DIR + 'data/clean/'

In [7]:
# Table of job titles; a later cell reports its row count as the number of
# job titles with >= 2 posts.
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv')

Helpers


In [13]:
def distTitle(agg_df, for_domain=False, for_func=False):
    """Plot a histogram of the number of job titles per group.

    Parameters
    ----------
    agg_df : object exposing an ``n_title`` attribute
        Aggregated table (e.g. the frame returned by ``aggBy``).
    for_domain : bool
        If true, label the y axis '# domains'.
    for_func : bool
        If true, label the y axis '# functions'. Applied after
        ``for_domain``, so it wins when both flags are set.

    Returns
    -------
    The newly created matplotlib figure. The x label reports the mean of
    ``n_title`` rounded to one decimal.
    """
    label_size = 16

    fig = plt.figure()
    ax = plt.gca()  # axes of the figure just created

    n_titles = agg_df.n_title
    ax.hist(n_titles)

    avg_titles = round(n_titles.mean(), 1)
    x_text = '# job titles' + r'$(\mu = {})$'.format(avg_titles)
    ax.set_xlabel(x_text, fontsize=label_size)

    # Same order as the flags are declared: a later match overrides an earlier one.
    for wanted, y_text in ((for_domain, '# domains'), (for_func, '# functions')):
        if wanted:
            ax.set_ylabel(y_text, fontsize=label_size)

    ax.grid(True)
    return fig

def aggBy(col, title_df):
    """Aggregate job-title statistics per value of ``col``.

    Prints the number of groups, then returns one row per group.

    Parameters
    ----------
    col : str
        Name of the column of ``title_df`` to group by.
    title_df : pandas.DataFrame
        Must contain the columns ``title``, ``non_std_title`` and ``n_post``
        in addition to ``col``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``col``, ``n_title`` (number of distinct ``title`` values),
        ``non_std_title`` (number of distinct ``non_std_title`` values) and
        ``n_post`` (sum of ``n_post``).
    """
    by_col = title_df.groupby(col)
    print('# {}: {}'.format(col, by_col.ngroups))

    # Use the string alias 'sum' rather than the Python builtin: pandas maps
    # the builtin to the same grouped sum but deprecates passing it directly.
    agg_df = by_col.agg({'title': 'nunique',
                         'non_std_title': 'nunique',
                         'n_post': 'sum'})

    # Only 'title' is renamed. The previous mapping also listed 'std_title',
    # which is never an aggregated column, so that entry had no effect.
    # NOTE(review): 'non_std_title' keeps its name although it now holds a
    # count; it is left unchanged so existing outputs keep the same header.
    agg_df = agg_df.rename(columns={'title': 'n_title'}).reset_index()
    return agg_df

Distribution of job posts among job titles


In [3]:
# Load the per-title statistics and report how many title rows they contain.
title_stats = pd.read_csv(DATA_DIR + 'stats_job_titles.csv')
titles = title_stats['title']
print('# titles: %d' %len(titles))


# titles: 4717

In [7]:
# Precomputed distribution table: one row per n_post value, with title counts
# (columns n_title and n_title_after_std, as shown by the head() cell).
by_n_post = pd.read_csv(DATA_DIR + 'stats_job_post_dist.csv')

In [8]:
# Preview the first rows of the distribution table.
by_n_post.head()


Out[8]:
n_post n_title n_title_after_std
0 1.0 1334.0 1334.0
1 2.0 646.0 619.0
2 3.0 431.0 414.0
3 4.0 333.0 323.0
4 5.0 243.0 235.0

Job posts distribution among standard job titles


In [ ]:
# Count the title_stats rows for each n_post value, expose the count as
# 'n_title_after_std', then pass the n_post values to quantile().
by_n_post_after_std = (
    title_stats
    .groupby('n_post')
    .agg({'title': len})
    .rename(columns={'title': 'n_title_after_std'})
    .reset_index()
)
quantile(by_n_post_after_std.n_post)

In [ ]:
# Plot the job-post distribution and save it as a PDF.
# NOTE(review): vizJobPostDist and RES_DIR are not defined in the visible
# cells; presumably both come from `from my_util import *` -- confirm.
fig = vizJobPostDist(by_n_post)
plt.savefig(RES_DIR + 'fig/dist_job_post_by_title.pdf')
plt.show(); plt.close()

In [5]:
# One row of title_df per job title, so the row count is the number of titles.
n_multi_post_titles = len(title_df)
print('# job titles with >= 2 posts: {}'.format(n_multi_post_titles))


# job titles with >= 2 posts: 2983

Statistics for Domains

Note: Only the domains of job titles with >= 2 posts are included.


In [14]:
# Per-domain aggregation of title_df (also prints the number of domains).
by_domain_agg = aggBy('domain', title_df)


# domain: 1021

In [15]:
# Rank domains by number of titles. Rebinding the name instead of sorting with
# inplace=True avoids mutating a frame an earlier cell created.
by_domain_agg = by_domain_agg.sort_values('n_title', ascending=False)
by_domain_agg.to_csv(DATA_DIR + 'stats_domains.csv', index=False)

# Compute the rounded summary once; it is both written to disk and displayed.
domain_desc = by_domain_agg.describe().round(1)
domain_desc.to_csv(DATA_DIR + 'tmp/domain_desc.csv')
domain_desc


Out[15]:
n_post non_std_title n_title
count 1021.0 1021.0 1021.0
mean 57.3 2.7 2.6
std 187.7 3.6 3.4
min 2.0 1.0 1.0
25% 4.0 1.0 1.0
50% 10.0 1.0 1.0
75% 33.0 3.0 2.0
max 2068.0 36.0 35.0

In [17]:
# Close every open matplotlib figure before drawing new ones.
plt.close('all')

In [16]:
# Histogram of the number of titles per domain, saved as a PDF in DATA_DIR.
fig = distTitle(by_domain_agg, for_domain=True)
fig.set_tight_layout(True)
fig.savefig(DATA_DIR + 'title_dist_by_domain.pdf')
plt.show()
plt.close()


D:\conda\lib\site-packages\matplotlib\figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

Why is the number of job titles in IT reduced so much after standardization?


In [21]:
# All titles in the "information technology" domain, ordered by standardized
# title so that rows sharing a std_title sit next to each other.
is_it_domain = title_df.domain == 'information technology'
title_df[is_it_domain].sort_values('std_title')


Out[21]:
avg_n_skill domain n_employer n_post position pri_func sec_func title std_title
386 19.1 information technology 32.0 35.0 assistant manager NaN Assistant Information Technology Manager Assistant Information Technology Manager
143 16.1 information technology 67.0 80.0 NaN administrator NaN Application Administrator - Information Techno... Information Technology Administrator
53 16.3 information technology 90.0 186.0 NaN analyst NaN Information Technology Analyst Information Technology Analyst
500 18.7 information technology 22.0 27.0 NaN analyst NaN Security Analyst, Information Technology Information Technology Analyst
1070 17.3 information technology 11.0 11.0 NaN auditor NaN Auditor, Information Technology Information Technology Auditor
1277 12.6 information technology 8.0 9.0 NaN auditor NaN Information Technology Auditor Information Technology Auditor
44 15.6 information technology 57.0 227.0 NaN consultant NaN Information Technology Consultant Information Technology Consultant
1027 14.2 information technology 8.0 12.0 NaN coordinator NaN Information Technology Coordinator Information Technology Coordinator
841 21.5 information technology 13.0 16.0 NaN director NaN Information Technology Director Information Technology Director
687 13.0 information technology 18.0 20.0 NaN engineer NaN Field Engineer, Information Technology (It) Information Technology Engineer
52 14.3 information technology 84.0 190.0 NaN engineer NaN Information Technology Engineer Information Technology Engineer
124 13.3 information technology 76.0 92.0 NaN executive NaN Information Technology Executive Information Technology Executive
2430 6.7 information technology 3.0 3.0 NaN junior NaN Information Technology Junior Research Scientist Information Technology Junior
67 18.3 information technology 109.0 156.0 NaN manager NaN Information Technology Manager Information Technology Manager
275 16.9 information technology 36.0 48.0 NaN manager NaN Project Manager, Information Technology Information Technology Manager
1532 16.1 information technology 7.0 7.0 NaN master NaN Information Technology Master Data Management ... Information Technology Master
2431 22.7 information technology 3.0 3.0 NaN master NaN Information Technology Master Data Management ... Information Technology Master
1187 11.2 information technology 8.0 10.0 NaN specialist NaN Support Specialist, Information Technology Information Technology Specialist
1699 16.2 information technology 6.0 6.0 NaN specialist NaN Security Specialist, Information Technology Information Technology Specialist
29 14.0 information technology 40.0 292.0 NaN specialist NaN Information Technology Specialist Information Technology Specialist
3232 11.0 information technology 2.0 2.0 NaN specialist NaN Quality Assurance Specialist, Information Tech... Information Technology Specialist
2423 16.0 information technology 3.0 3.0 NaN supervisor NaN Information Technology Supervisor Information Technology Supervisor
591 13.3 information technology 23.0 23.0 NaN technician NaN Information Technology Technician Information Technology Technician
1946 10.6 information technology 5.0 5.0 NaN trainer NaN Information Technology Trainer (Extracurriculum) Information Technology Trainer
1771 12.2 information technology 4.0 5.0 senior analyst NaN Senior Security Analyst, Information Technology Senior Information Technology Analyst
482 17.9 information technology 14.0 27.0 senior analyst NaN Senior Information Technology Analyst Senior Information Technology Analyst
3131 15.5 information technology 2.0 2.0 senior auditor NaN Senior Information Technology Auditor Senior Information Technology Auditor
2142 13.0 information technology 4.0 4.0 senior auditor NaN Senior Auditor, Information Technology Senior Information Technology Auditor
545 16.6 information technology 12.0 25.0 senior consultant NaN Senior Information Technology Consultant Senior Information Technology Consultant
2243 17.5 information technology 1.0 4.0 senior coordinator NaN Senior Information Technology Coordinator Senior Information Technology Coordinator
1753 22.2 information technology 5.0 5.0 senior engineer NaN Senior Information Technology Engineer Senior Information Technology Engineer
1229 18.6 information technology 9.0 9.0 senior executive NaN Senior Information Technology Executive Senior Information Technology Executive
997 19.8 information technology 10.0 13.0 senior manager NaN Senior Information Technology Manager Senior Information Technology Manager
925 16.6 information technology 11.0 14.0 senior manager NaN Senior Project Manager, Information Technology Senior Information Technology Manager
1935 16.8 information technology 5.0 5.0 senior manager NaN Senior Manager, Information Technology Senior Information Technology Manager
999 16.4 information technology 8.0 13.0 senior specialist NaN Senior Information Technology Specialist Senior Information Technology Specialist
2080 30.8 information technology 4.0 4.0 senior specialist NaN Senior Security Specialist, Information Techno... Senior Information Technology Specialist
2026 17.5 information technology 2.0 4.0 senior specialist NaN Senior Support Specialist, Information Technology Senior Information Technology Specialist
2517 9.7 information technology 2.0 3.0 vice president NaN Vice President, Information Technology Vice Information Technology President

Statistics for functions

Note: Functions are limited to those of job titles with >= 2 posts.


In [17]:
# Aggregate per primary function and rank by number of titles. The sort is
# chained rather than done with inplace=True, so re-running the cell always
# starts from a fresh aggregation.
by_func_agg = aggBy('pri_func', title_df).sort_values('n_title', ascending=False)

by_func_agg.to_csv(DATA_DIR + 'stats_pri_funcs.csv', index=False)

# Compute the rounded summary once; it is both written to disk and displayed.
func_desc = by_func_agg.describe().round(1)
func_desc.to_csv(DATA_DIR + 'tmp/func_desc.csv')
func_desc


# pri_func: 239
Out[17]:
n_post non_std_title n_title
count 239.0 239.0 239.0
mean 293.7 12.5 11.9
std 1392.2 49.5 47.6
min 2.0 1.0 1.0
25% 4.0 1.0 1.0
50% 11.0 2.0 1.0
75% 66.5 5.0 4.0
max 16318.0 570.0 545.0

In [18]:
# Top 10 rows; by_func_agg was sorted by n_title in descending order above.
by_func_agg.head(10)


Out[18]:
pri_func n_post non_std_title n_title
130 manager 16318 570 545
85 engineer 9525 382 364
91 executive 7463 220 217
10 analyst 4870 138 137
141 officer 2650 127 124
51 consultant 2779 100 99
73 director 1250 96 91
191 specialist 1411 89 89
17 assistant 3676 82 79
212 technician 1109 84 76

In [19]:
# Histogram of the number of titles per primary function, saved as a PDF.
fig = distTitle(by_func_agg, for_func=True)
fig.set_tight_layout(True)
fig.savefig(DATA_DIR + 'title_dist_by_func.pdf')
plt.show()
plt.close()



In [31]:
# Number of title_df rows whose domain is "information technology".
(title_df.domain == 'information technology').sum()


Out[31]:
39

In [33]:
# Number of distinct standardized titles whose primary function is 'technician'.
is_technician = title_df.pri_func == 'technician'
title_df.loc[is_technician, 'std_title'].nunique()


Out[33]:
86

In [43]:
# Load the job table that carries the 'doc' text column merged in below.
job_df = pd.read_csv(DATA_DIR + 'jobs.csv')

# print() call form: valid on Python 3 and prints the same tuple on Python 2.
print(job_df.shape)
job_df.head(1)


(137564, 4)
Out[43]:
job_id title employer_name doc
0 JOB-2015-0145758 Architectural Assistant HDA Architects to assist the project architect in developing ...

In [36]:
# Load the full job-post table.
# NOTE(review): a later cell writes the merged frame back to this same path,
# so what this file contains depends on whether that cell has been run.
full_job_df = pd.read_csv(DATA_DIR + 'job_posts.csv')

In [38]:
# print() call form: valid on Python 3 and prints the same tuple on Python 2.
print(full_job_df.shape)
full_job_df.head(1)


(249386, 17)
Out[38]:
employer_id job_id no_of_applications job_posting_date_history job_expiry_date_history job_expiry_year job_ssoc_code_list title no_of_vacancies job_monthly_min_sal job_monthly_max_sal no_of_times_job_posting_reopened job_posting_status job_employment_type_list job_experience_required_years job_level employer_name
0 002 JOB-2015-0145758 1.0 2015-01-13 2015-02-12 2015 31275 Architectural Assistant 2 4000.0 6000.0 0 Closed Full Time, Permanent 3.0 Manager HDA Architects

In [44]:
# Attach the 'doc' text to each job post, joining explicitly on job_id.
# The guard makes the cell safe to re-run: once 'doc' is present (after a
# re-run, or when job_posts.csv was already overwritten with the merged
# frame), merging again would join on extra columns or duplicate 'doc'.
# NOTE(review): job_id is not unique here (the next cell reports more rows
# than unique ids), so this inner join can multiply rows.
if 'doc' not in full_job_df.columns:
    full_job_df = pd.merge(full_job_df, job_df[['job_id', 'doc']], on='job_id')
print(full_job_df.shape)


(176723, 18)

In [47]:
# Compare the total number of job ids with the number of distinct ones to see
# how many duplicates the merged frame holds.
job_ids = full_job_df['job_id']
print('# job ids including dups: %d' % len(job_ids))
print('# unique job ids: %d' % job_ids.nunique())
full_job_df.head(1)


# job ids including dups: 176723
# unique job ids: 137554
Out[47]:
employer_id job_id no_of_applications job_posting_date_history job_expiry_date_history job_expiry_year job_ssoc_code_list title no_of_vacancies job_monthly_min_sal job_monthly_max_sal no_of_times_job_posting_reopened job_posting_status job_employment_type_list job_experience_required_years job_level employer_name doc
0 002 JOB-2015-0145758 1.0 2015-01-13 2015-02-12 2015 31275 Architectural Assistant 2 4000.0 6000.0 0 Closed Full Time, Permanent 3.0 Manager HDA Architects to assist the project architect in developing ...

In [46]:
# Persist the merged frame (job posts plus 'doc').
# NOTE(review): this writes to the same path that the earlier
# pd.read_csv(DATA_DIR + 'job_posts.csv') cell reads from, so the original
# input file is replaced and a fresh run will load the merged data instead.
# Consider writing to a separate file -- confirm what downstream code expects.
full_job_df.to_csv(DATA_DIR + 'job_posts.csv', index=False)