This notebook computes all the statistics needed for the project (title, domain, and function counts, plus job-post distributions).
In [1]:
import my_util as my_util; from my_util import *
In [2]:
# Project root and cleaned-data directory.
# NOTE(review): absolute Windows path — consider making this configurable
# (e.g. an environment variable) so the notebook runs on other machines.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
In [7]:
# Job titles appearing in at least 2 posts (presumably one row per title
# with domain/function/n_post columns — verify against the CSV).
title_df = pd.read_csv(DATA_DIR + 'new_titles_2posts_up.csv')
In [13]:
def distTitle(agg_df, for_domain=False, for_func=False, ylabel=None):
    """Plot a histogram of per-group job-title counts.

    Args:
        agg_df: DataFrame with an ``n_title`` column (e.g. output of aggBy).
        for_domain: if True, label the y-axis '# domains'.
        for_func: if True, label the y-axis '# functions'.
        ylabel: explicit y-axis label; when given it overrides both flags.
            (Backward-compatible generalization of the two boolean flags.)

    Returns:
        The matplotlib Figure containing the histogram.
    """
    fig = plt.figure()
    plt.hist(agg_df.n_title)
    # Embed the mean title count in the x-label, e.g. "# job titles (mu = 12.3)".
    mean_n_title = round(agg_df.n_title.mean(), 1)
    xl = '# job titles' + r'$(\mu = {})$'.format(mean_n_title)
    plt.xlabel(xl, fontsize=16)
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize=16)
    else:
        if for_domain: plt.ylabel('# domains', fontsize=16)
        if for_func: plt.ylabel('# functions', fontsize=16)
    plt.grid(True)
    return fig
def aggBy(col, title_df):
    """Aggregate title-level statistics per value of ``col``.

    Args:
        col: column of ``title_df`` to group by (e.g. 'domain', 'pri_func').
        title_df: DataFrame with 'title', 'non_std_title' and 'n_post' columns.

    Returns:
        DataFrame with one row per group and columns: ``col``, ``n_title``
        (distinct titles), ``n_non_std_title`` (distinct non-standard titles),
        and ``n_post`` (total posts). Also prints the number of groups.
    """
    by_col = title_df.groupby(col)
    print('# {}: {}'.format(col, by_col.ngroups))
    agg_df = by_col.agg({'title': 'nunique',
                         'non_std_title': 'nunique',
                         'n_post': 'sum'})
    # BUG FIX: the old rename mapped 'std_title' -> 'n_std_title', but
    # 'std_title' is never produced by the agg above (the rename was a
    # no-op) and the aggregated 'non_std_title' count was left unrenamed.
    agg_df = agg_df.rename(columns={'title': 'n_title',
                                    'non_std_title': 'n_non_std_title'}).reset_index()
    return agg_df
In [3]:
# Per-title statistics (one row per standardized title — verify),
# used below to recompute the post-count distribution after standardization.
title_stats = pd.read_csv(DATA_DIR + 'stats_job_titles.csv')
titles = title_stats['title']
print('# titles: %d' %len(titles))
In [7]:
# Precomputed distribution of job posts per title (compared below with
# the after-standardization version — presumably computed before std).
by_n_post = pd.read_csv(DATA_DIR + 'stats_job_post_dist.csv')
In [8]:
# Quick sanity check of the loaded distribution.
by_n_post.head()
Out[8]:
In [ ]:
# Post-count distribution after title standardization: number of titles
# having each n_post value. NOTE(review): 'title': len counts rows, which
# equals the number of titles only if title_stats has one row per title —
# confirm, otherwise use 'nunique'.
by_n_post_after_std = title_stats.groupby('n_post').agg({'title': len})
by_n_post_after_std = by_n_post_after_std.rename(columns={'title': 'n_title_after_std'}).reset_index()
# quantile() comes from the my_util star-import.
quantile(by_n_post_after_std.n_post)
In [ ]:
# Plot and save the posts-per-title distribution. vizJobPostDist and
# RES_DIR come from the my_util star-import (RES_DIR is not defined in
# this notebook — verify it exists, and that RES_DIR + 'fig/' exists).
fig = vizJobPostDist(by_n_post)
plt.savefig(RES_DIR + 'fig/dist_job_post_by_title.pdf')
plt.show(); plt.close()
In [5]:
# title_df has one row per title with >= 2 posts, so row count = title count.
print('# job titles with >= 2 posts: {}'.format(title_df.shape[0]) )
In [14]:
# Aggregate title statistics per domain.
by_domain_agg = aggBy('domain', title_df)
In [15]:
# Rank domains by number of distinct titles, then persist the ranking and
# its summary statistics. Reassignment instead of inplace=True: identical
# result, avoids the pandas inplace anti-pattern and keeps the cell
# idempotent under re-runs.
by_domain_agg = by_domain_agg.sort_values('n_title', ascending=False)
by_domain_agg.to_csv(DATA_DIR + 'stats_domains.csv', index=False)
by_domain_agg.describe().round(1).to_csv(DATA_DIR + 'tmp/domain_desc.csv')
# Last expression: display the summary table in the notebook.
by_domain_agg.describe().round(1)
Out[15]:
In [17]:
# Close any figures left open by earlier cells to free memory.
plt.close('all')
In [16]:
# Histogram of distinct-title counts per domain, saved alongside the data.
fig = distTitle(by_domain_agg, for_domain=True)
fig.set_tight_layout(True)
plt.savefig(DATA_DIR + 'title_dist_by_domain.pdf')
plt.show(); plt.close()
Why is the number of job titles in IT reduced so much after standardization?
In [21]:
# Inspect IT titles sorted by standardized form to see which raw titles
# were collapsed together by standardization.
title_df.query('domain == "information technology"').sort_values('std_title')
Out[21]:
Note: the functions below are limited to those of job titles with >= 2 posts.
In [17]:
# Aggregate title statistics per primary function, rank by number of
# distinct titles, and persist the ranking plus its summary statistics.
by_func_agg = aggBy('pri_func', title_df)
# Reassignment instead of inplace=True: identical result, avoids the
# pandas inplace anti-pattern and keeps the cell idempotent under re-runs.
by_func_agg = by_func_agg.sort_values('n_title', ascending=False)
by_func_agg.to_csv(DATA_DIR + 'stats_pri_funcs.csv', index=False)
by_func_agg.describe().round(1).to_csv(DATA_DIR + 'tmp/func_desc.csv')
# Last expression: display the summary table in the notebook.
by_func_agg.describe().round(1)
Out[17]:
In [18]:
# Top-10 primary functions by number of distinct titles.
by_func_agg.head(10)
Out[18]:
In [19]:
# Histogram of distinct-title counts per primary function.
fig = distTitle(by_func_agg, for_func=True)
fig.set_tight_layout(True)
plt.savefig(DATA_DIR + 'title_dist_by_func.pdf')
plt.show(); plt.close()
In [31]:
# Number of title rows in the IT domain (True counts as 1 in sum).
sum(title_df.domain == 'information technology')
Out[31]:
In [33]:
# Distinct standardized titles whose primary function is 'technician'.
title_df.std_title[title_df.pri_func == 'technician'].nunique()
Out[33]:
In [43]:
# Job records with the cleaned document text ('doc' column used below).
job_df = pd.read_csv(DATA_DIR + 'jobs.csv')
# FIX: use the print() function form for Python 2/3 compatibility,
# consistent with the rest of this notebook.
print(job_df.shape)
job_df.head(1)
Out[43]:
In [36]:
# Full job-post records; merged with job_df below to attach document text.
full_job_df = pd.read_csv(DATA_DIR + 'job_posts.csv')
In [38]:
# FIX: use the print() function form for Python 2/3 compatibility,
# consistent with the rest of this notebook.
print(full_job_df.shape)
full_job_df.head(1)
Out[38]:
In [44]:
# Attach each post's document text. NOTE(review): without an explicit
# `on=`, merge joins on all shared columns — presumably just 'job_id';
# confirm full_job_df has no pre-existing 'doc' column. This is also an
# inner join, so posts with no matching job_id are dropped.
full_job_df = pd.merge(full_job_df, job_df[['job_id', 'doc']])
# FIX: print() function form for Python 2/3 compatibility.
print(full_job_df.shape)
In [47]:
# Sanity check: compare total vs unique job ids to quantify duplicates
# introduced (or preserved) by the merge above.
print('# job ids including dups: %d' %len(full_job_df.job_id))
print('# unique job ids: %d' % full_job_df.job_id.nunique())
full_job_df.head(1)
Out[47]:
In [46]:
# WARNING(review): this overwrites the job_posts.csv file loaded above,
# making the cell non-idempotent — consider writing to a new filename.
full_job_df.to_csv(DATA_DIR + 'job_posts.csv', index=False)