In [1]:
import ja_helpers as ja_helpers
In [3]:
def _industryDir(short_name):
    '''
    Build the output folder of an industry and make sure it exists.
    @param short_name: short name of the industry, used as folder name under JOB_PROF
    @return: the folder path JOB_PROF + '<short_name>/'
    '''
    directory = JOB_PROF + '{}/'.format(short_name)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory


def titleSims(industry='Education', short_name='edu'):
    '''
    Compute pairwise sims bw job titles of one industry.
    Defined at module level (not nested in calTitleSims) so that it can also
    be called directly from a notebook cell.
    Side effect: saves the frame returned by getTitleStats to
    'title_stats.csv' in the industry folder.
    @param industry: full name of the industry, matched against df.industry
    @param short_name: short name of the industry, selects the output folder
    @return: pairwise sims bw titles with at least 2 posts in the industry,
             sorted by 'topic_sim' in descending order
    '''
    directory = _industryDir(short_name)
    posts = df[df.industry == industry]
    stats = getTitleStats(posts)
    stats.to_csv(directory + 'title_stats.csv', index=False)
    # only analyze titles with at least 2 posts in the industry
    titles = list(stats.query('n_post >= 2')['title'])
    sims = simsOfTitles(titles, industry)
    return sims.sort_values('topic_sim', ascending=False)


def calTitleSims(row):
    '''
    Compute pairwise title sims of one industry and save them to 'sims.csv'
    in the industry folder.
    @param: row contains full name (key 'industry') and short name
            (key 'short_name') of an industry
    @return: pairwise sims bw titles with at least 2 posts in the industry
    '''
    industry = row['industry']
    short_name = row['short_name']
    print('\nIndustry %s:' %industry)

    directory = _industryDir(short_name)
    title_sims = titleSims(industry, short_name)
    title_sims.to_csv(directory + 'sims.csv', index=False)
    print('Saved pairwise sims of job titles in {}'.format(industry))
    # the docstring always promised the sims; previously nothing was returned
    return title_sims
def industryWiseSim(industries):
    '''
    Run calTitleSims on every row of the given table, in row order.
    @param industries: table with one industry per row; each row is passed
                       to calTitleSims as-is
    @return: None
    '''
    for position in range(len(industries)):
        current_row = industries.iloc[position]
        calTitleSims(current_row)
        # pause for 3 seconds before moving on to the next industry
        time.sleep(3)
In [ ]:
# Per-industry counts: number of posts (length of job_id), number of distinct
# titles and number of distinct employers.
by_industry = df.groupby('industry')
agg_df = (
    by_industry
    .agg({'job_id': len, 'title': 'nunique', 'employer_id': 'nunique'})
    .reset_index()
    .rename(columns={'job_id': 'n_post', 'title': 'n_title', 'employer_id': 'n_employer'})
)
In [ ]:
# Order industries from most to fewest distinct titles, then persist the table.
agg_df = agg_df.sort_values(by='n_title', ascending=False)
agg_df.to_csv(DATA_DIR + 'stats/industry_stat.csv', index=False)
In [ ]:
# Industries whose number of distinct titles is at most the 25th percentile.
q1 = agg_df['n_title'].quantile(0.25)
q1_industries = agg_df.loc[agg_df['n_title'] <= q1]
In [ ]:
# Compute and save title sims for every industry in the first quartile
# (by number of distinct titles), one industry at a time.
# NOTE(review): calTitleSims reads row['short_name'], but the aggregation
# visible in this notebook only yields industry/n_post/n_title/n_employer --
# confirm that a 'short_name' column is added somewhere before this call.
industryWiseSim(q1_industries)
Some titles do not belong to Agriculture and Fishing, e.g., Signal Processing Engineer, Account Clerk, Accounts Analyst. Assigning the industry of the employer to the post (and its title) seems to be problematic!
In [ ]:
# Load the saved pairwise title sims from the 'electric' folder and look at
# their distribution.
electric_sims = pd.read_csv(JOB_PROF + 'electric/sims.csv')
# describe() is not the last expression of the cell, so its result would be
# silently dropped; print it to actually show the summary.
print(electric_sims.describe())

fig, ax = plt.subplots()
vizTopicSim(electric_sims)  # presumably draws on the current axes -- TODO confirm
fig.subplots_adjust(bottom=0.2)  # enlarge the bottom margin of the figure
# NOTE(review): electric_dir is not defined in the visible cells -- confirm it
# is set elsewhere (the sims above are read from JOB_PROF + 'electric/').
plt.savefig(electric_dir + 'sim_dist.pdf')
plt.show(); plt.close()
In [ ]:
# Industries whose number of distinct titles lies in (q1, q2], i.e. between
# the 25th and the 50th percentile.
q2 = agg_df['n_title'].quantile(0.50)
q2_industries = agg_df[(agg_df['n_title'] > q1) & (agg_df['n_title'] <= q2)]
In [ ]:
# Renumber the rows from 0 (the old index is discarded), then compute and
# save title sims for every industry of the second quartile.
q2_industries = q2_industries.reset_index(drop=True)
industryWiseSim(q2_industries)
In [ ]:
# Show the industries whose number of distinct titles lies in (q2, q3], i.e.
# between the 50th and the 75th percentile.
q3 = agg_df['n_title'].quantile(0.75)
agg_df[(agg_df['n_title'] > q2) & (agg_df['n_title'] <= q3)]
In [ ]:
# Compute the pairwise title sims of the Education industry and save them to
# JOB_PROF + 'edu/sims.csv'.
# NOTE(review): this cell needs `titleSims` to be callable at notebook scope --
# confirm it is defined there before running.
edu_sims = titleSims(industry='Education', short_name='edu')
edu_sims.to_csv(JOB_PROF + 'edu/sims.csv', index=False)
In [ ]: