In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import mmread
from collections import Counter

import my_util
import cluster_skill_helpers
from cluster_skill_helpers import *
import random as rd
In [2]:
# a top-level `global` is a no-op; kept only to flag that the helpers below read doc_topic_distr
global doc_topic_distr
In [3]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
SKILL_DAT = HOME_DIR + 'data/clean/skill_cluster/'
SKILL_RES = HOME_DIR + 'results/' + 'skill_cluster/new/'
JOB_PROF = SKILL_RES + 'job_prof/'
In [4]:
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
print(df.shape)
df.set_index('index', inplace=True)
In [5]:
with open(SKILL_RES + 'doc_20topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)
In [6]:
# reload(my_util)
# from my_util import *
reload(cluster_skill_helpers)
from cluster_skill_helpers import *
In [19]:
def changeTitle(df, t1='Teacher, Kindergarten', t2='Kindergarten Teacher'):
    '''Return a copy of df with all occurrences of title t1 renamed to t2.'''
    out_df = df.copy()
    out_df.loc[out_df.title == t1, 'title'] = t2
    return out_df
def vizPair(i, sim_df, labels=None, abbv_title=''):
    '''Visualize the topic distributions of the i-th post pair in sim_df and save the figure.'''
    fig = vizDists4Pair(sim_df.iloc[i], df, doc_topic_distr, labels)
    fig.savefig(SKILL_RES + 'fig/{}_p{}.pdf'.format(abbv_title, i + 1))
    plt.show(); plt.close()
def calSimScores(job_title='Research Fellow', industry=None, df=df, out_fmt='data_frame'):  # e.g. industry='Education'
    if industry is not None:
        posts = df[(df['title'] == job_title) & (df['industry'] == industry)]
    else:
        posts = df[df['title'] == job_title]
    # posts = rmBadPosts(posts, job_title)
    # Sample for efficiency if there are too many posts
    if posts.shape[0] > 100:
        posts = posts.sample(100)
    # print('{} in {}: {} posts'.format(job_title, industry, posts.shape[0]))
    return pairwiseSim(posts, doc_topic_distr, out_fmt, verbose=False)
def consistency(job_title, industry, save_sim=False, abbv_job='', abbv_industry=''):
    '''
    @brief: calculate the consistency score of the given job title in the given industry
            as the average of pairwise similarities among its posts
    @param: save_sim=True to also save the pairwise sims
    '''
    sims = calSimScores(job_title, industry)
    if save_sim:
        fname = JOB_PROF + 'consistency/{}_{}_sims.csv'.format(abbv_industry, abbv_job)
        sims.to_csv(fname, index=False)
    cscore = round(sims['topic_sim'].mean(), 3)
    return cscore
def cScoreAtRow(row):
    '''
    @return: consistency score of the (job_title, industry) pair in the given row
    '''
    count = row.name
    if count % 100 == 0:
        print('{} pairs and counting...'.format(count))
    job_title, industry = row['title'], row['industry']
    sims = calSimScores(job_title, industry)
    cscore = round(sims['topic_sim'].mean(), 3)
    return cscore
def simScore(t1, t2):
    '''Average topic similarity between posts of title t1 and posts of title t2.'''
    print('{} vs. {}'.format(t1, t2))
    posts1 = df[df.title == t1]; posts2 = df[df.title == t2]
    ## Rm lousy posts with too few skills from both sets
    # posts1 = rmBadPosts(posts1, t1)
    # posts2 = rmBadPosts(posts2, t2)
    ## Sample for efficiency (if too many posts)
    n1, n2 = posts1.shape[0], posts2.shape[0]
    if n1 > 100: posts1 = posts1.sample(100)
    if n2 > 100: posts2 = posts2.sample(100)
    if (n1 > 0) and (n2 > 0):  # guard against empty sets (e.g. if bad posts were removed)
        res = crossSimScores(posts1, posts2, doc_topic_distr, verbose=False)
        return round(res['topic_sim'].mean(), 3)  # use `return res` instead to get all pairwise scores
    return np.nan
def AF_clustering(posts, job_title, sim_mat):
    af = cluster.AffinityPropagation(affinity='precomputed').fit(sim_mat)  # preference=-50,
    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    print('# posts to be clustered by Affinity Propagation model: {}'.format(posts.shape[0]))
    print('Estimated number of clusters: %d' % n_clusters_)
    # Representatives (cluster centers)
    reps = posts.iloc[cluster_centers_indices]
    reps.to_csv(JOB_PROF + 'clusters/{}_reps.csv'.format(job_title), index=False)
    # Attach cluster labels to the posts
    res = posts.copy()
    res['af_label'] = af.labels_
    return res.sort_values('af_label')
def plotCluster(c, job_title, cluster_res):
    posts = cluster_res.query('af_label == {}'.format(c))
    n_post = posts.shape[0]
    if n_post % 2 == 1:
        print('n_post={} is odd, dropping 1 post'.format(n_post))
        n_post -= 1
        posts = posts.iloc[1:]
    w = 12; h = 3*n_post/4 if n_post >= 8 else 6
    fig = vizTopicDists(posts, doc_topic_distr, figsize=(w, h))
    i = c + 1
    title = 'Topic distribution of {} posts in {}th cluster'.format(job_title, i)
    fig.suptitle(title, fontsize=20)
    fig.savefig(SKILL_RES + 'fig/c{}_{}.pdf'.format(i, job_title))
    return fig
In [ ]:
good_df = df.query('n_skill >= 10')
good_df.shape[0]
We need basic statistics of job titles to understand more about them. Given a title, we want to know its number of posts, number of distinct employers, and average number of skills per post, as sketched below.
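getTitleStats comes from the helper module; here is a minimal sketch of what we assume it computes, under a hypothetical name title_stats (the real implementation lives in cluster_skill_helpers):
In [ ]:
def title_stats(df):
    # Assumed behavior: per-title post count, distinct-employer count and mean skill count
    out = df.groupby('title').agg({'job_id': len, 'employer_id': 'nunique', 'n_skill': 'mean'})
    out = out.rename(columns={'job_id': 'n_post', 'employer_id': 'n_employer',
                              'n_skill': 'avg_n_skill'})
    return out.reset_index()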
In [ ]:
stats = getTitleStats(df)
stats.describe().round(1)
stats.to_csv(SKILL_RES + 'stats.csv', index=False)
In [ ]:
k = 20; fname = SKILL_RES + 'lda/{}_topics.csv'.format(k)
doc_topic_distr = topic_distr[k]  # topic_distr is built a few cells below (cells were run out of order)
topic_df = pd.read_csv(fname)
labels = map(str.upper, topic_df['label'])
In [ ]:
with open(SKILL_RES + 'doc_topic_distr.mtx', 'r') as f:
    doc_15topic_distr = mmread(f)
with open(SKILL_RES + 'doc_20topic_distr.mtx', 'r') as f:
    doc_20topic_distr = mmread(f)
with open(SKILL_RES + 'doc_30topic_distr.mtx', 'r') as f:
    doc_30topic_distr = mmread(f)
In [ ]:
print(doc_15topic_distr.shape)
print(doc_20topic_distr.shape)
print(doc_30topic_distr.shape)
In [ ]:
# Allow us to loop over doc-topic dists wrt diff no. of topics
topic_distr = {15: doc_15topic_distr, 20: doc_20topic_distr, 30: doc_30topic_distr}
In [ ]:
rd.seed(1234567) # time()
size = 500; posts = good_df.sample(size)
In [ ]:
sims_15 = pairwiseSim(posts, doc_15topic_distr)
sims_20 = pairwiseSim(posts, doc_20topic_distr)
sims_30 = pairwiseSim(posts, doc_30topic_distr)
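pairwiseSim is provided by cluster_skill_helpers. As a rough sketch of its data-frame output, assuming (purely for illustration) that topic_sim is the cosine similarity between the topic-distribution rows of two posts:
In [ ]:
from itertools import combinations
from scipy.spatial.distance import cosine

def pairwise_topic_sims(posts, distr):
    # Hypothetical stand-in for pairwiseSim; assumes posts.index holds row
    # positions into the doc-topic matrix and uses cosine similarity
    distr = distr.toarray() if hasattr(distr, 'toarray') else distr
    rows = [{'doc1': i, 'doc2': j, 'topic_sim': 1 - cosine(distr[i], distr[j])}
            for i, j in combinations(posts.index, 2)]
    return pd.DataFrame(rows)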
In [ ]:
sims_15.topic_sim.describe().round(3)
sims_20.topic_sim.describe().round(3)
sims_30.topic_sim.describe().round(3)
In [ ]:
sims_15.sort_values('topic_sim', inplace=True)
vizPair(0, sims_15)
vizPair(1, sims_15)
In [ ]:
medium = sims_15.query('0.7 < topic_sim and topic_sim <= 0.8')
medium.sort_values('topic_sim', inplace=True)
vizPair(0, medium, abbv_title='medium')
last = medium.shape[0]-1
vizPair(last, medium, abbv_title='medium')
large = sims_15.query('topic_sim > 0.8')
vizPair(0, large, abbv_title='large')
n_pair = sims_15.shape[0]; last = n_pair - 1
vizPair(last, sims_15)
In [ ]:
sims = {15: sims_15, 20: sims_20, 30: sims_30}
ks = [15, 20, 30]
In [413]:
for k in ks:
    fig = plotSimDists(sims[k])
    fig.suptitle('Sample size: {} posts'.format(size), fontsize=20)
    fname = SKILL_RES + 'fig/sim_dists_{}topics.pdf'.format(k)
    # fig.set_tight_layout(True)
    fig.savefig(fname)
    plt.show(); plt.close()
The plots show that the topic similarity distribution $sim_{topic}$ changes gradually with the number of topics $k$ and with the sample size.
We will now look at the consistency score of a given job title in a given industry. First, let's see how many (job title, industry) pairs we have and get stats for the pairs.
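Concretely, for a pair whose posts are $p_1, \dots, p_n$ (sampled down to at most 100 posts in calSimScores), the consistency score computed by consistency() above is the mean pairwise topic similarity, rounded to 3 decimals:
$$c = \frac{2}{n(n-1)} \sum_{i < j} sim_{topic}(p_i, p_j)$$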
In [6]:
df.head(1)
Out[6]:
In [8]:
by_job_and_industry = df.groupby(['title', 'industry'])
agg_df = by_job_and_industry.agg({'job_id': len, 'employer_id': 'nunique', 'n_skill': 'mean'})
agg_df = agg_df.rename(columns={'job_id': 'n_post', 'employer_id': 'n_employer',
'n_skill': 'avg_n_skill'})
agg_df = agg_df.reset_index()
agg_df.describe().round(1)
Out[8]:
In [12]:
agg_df.sort_values('n_post', inplace=True)
In [61]:
agg_df.head(10)
Out[61]:
In [10]:
res = agg_df.query('n_post >= 100').copy()
print('# pairs to analyze: %d' %res.shape[0])
In [11]:
cScoreAtRow(res.iloc[0])
Out[11]:
In [96]:
res['cscore'] = res.apply(cScoreAtRow, axis=1)
In [102]:
res.cscore.describe().round(3)
Out[102]:
In [99]:
res = res.sort_values('cscore', ascending=False)
In [100]:
res.head()
Out[100]:
In [101]:
res.tail()
Out[101]:
In [103]:
res.to_csv(JOB_PROF + 'cscore_jobs_100posts.csv', index=False)
In [60]:
def vizCScores(res):
    fig = plt.figure(figsize=(6, 5))
    plt.hist(res.cscore)
    avg, std = round(res.cscore.mean(), 3), round(res.cscore.std(), 3)
    xl = 'Consistency score ' + r'$(\mu = {}, \sigma = {})$'.format(avg, std)
    plt.xlabel(xl, fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.grid(True)
    return fig
In [ ]:
res = agg_df.query('2 <= n_post < 100').copy()
res.reset_index(inplace=True)
In [59]:
_ = agg_df.query('2 <= n_post')
print('# pairs with at least 2 posts: %d' % _.shape[0])
In [29]:
del res['index']
res.head()
Out[29]:
In [30]:
res['cscore'] = res.apply(cScoreAtRow, axis=1)
In [33]:
res_55 = pd.read_csv(JOB_PROF + 'cscore_jobs_100posts.csv')
res = pd.concat([res, res_55])
res.to_csv(JOB_PROF + 'cscore_all.csv', index=False)
In [61]:
fig = vizCScores(res)
fig.savefig(JOB_PROF + 'cscore_dist.pdf')
plt.show(); plt.close()
In [36]:
res = res.sort_values('cscore', ascending=False)
In [49]:
res.query('cscore == 1')
Out[49]:
As these cases have only 2 or 3 posts, they are very likely re-posts. Let's check:
In [52]:
def checkRepost(in_df):
    '''Check whether the posts of each (title, industry) pair in in_df are identical, i.e. re-posts.'''
    for i in range(in_df.shape[0]):
        row = in_df.iloc[i]
        title, industry = row['title'], row['industry']
        docs = set(df[(df.title == title) & (df.industry == industry)]['doc'])
        if len(docs) == 1:
            print(i, True)
        else:
            print(docs)
In [53]:
tmp = res.query('cscore == 1')
checkRepost(tmp)
Indeed, all of them are re-posts, some with slight edits (cases 5, 7 and 17).
In [48]:
res.query('cscore < 1').head()
Out[48]:
In [54]:
checkRepost(res.query('cscore < 1').head())
In [70]:
info_df = agg_df.query('industry == "Information and Communications"')
info_top50 = info_df.head(50).copy()
info_top50['cscore'] = info_top50['title'].apply(consistency, industry='Information and Communications')
info_top50.to_csv(JOB_PROF + 'consistency/infocom_cscore.csv', index=False)
info_top50.cscore.describe()
Out[70]:
In [79]:
info_top50.sort_values('cscore', ascending=False, inplace=True)
In [80]:
info_top50.head()
Out[80]:
In [81]:
info_top50.tail()
Out[81]:
In [75]:
fin_df = agg_df.query('industry == "Financial and Insurance Activities"')
fin_top50 = fin_df.head(50).copy()
fin_top50['cscore'] = fin_top50['title'].apply(consistency, industry='Financial and Insurance Activities')
fin_top50.to_csv(JOB_PROF + 'consistency/fin50_cscore.csv', index=False)
Out[75]:
In [77]:
fin_top50.cscore.describe()
Out[77]:
In [88]:
fin_top50 = fin_top50.sort_values('cscore', ascending=False)
In [84]:
fin_top50.head()
Out[84]:
In [85]:
fin_top50.tail()
Out[85]:
In [86]:
fin_res = fin_df.query('n_post >= 2').copy()
print('# cscores to be computed: %d' %fin_res.shape[0])
fin_res['cscore'] = fin_res['title'].apply(consistency, industry='Financial and Insurance Activities')
fin_res.to_csv(JOB_PROF + 'consistency/fin_cscore.csv', index=False)
In [87]:
fin_res.cscore.describe()
Out[87]:
In [ ]:
# Plot dist of cscore
In [74]:
ent_df = agg_df.query('industry == "Arts, Entertainment and Recreation"')
ent_res = ent_df.query('n_post >= 2').copy()
ent_res['cscore'] = ent_res['title'].apply(consistency, industry='Arts, Entertainment and Recreation')
ent_res.to_csv(JOB_PROF + 'consistency/art_cscore.csv', index=False)
ent_res.cscore.describe()
Out[74]:
In [58]:
sse_cscore = consistency('Senior Software Engineer', 'Wholesale and Retail Trade', abbv_job='sse', abbv_industry='wholesale')
aa_cscore = consistency('Administrative Assistant', 'Wholesale and Retail Trade', abbv_job='aa', abbv_industry='wholesale')
me_cscore = consistency('Marketing Executive', 'Wholesale and Retail Trade', abbv_job='me', abbv_industry='wholesale')
In [22]:
agg_df.query('industry == "Financial and Insurance Activities"').head()
Out[22]:
In [416]:
fin_se = calSimScores('Software Engineer', 'Financial and Insurance Activities', df)
In [417]:
print(fin_se.topic_sim.describe().round(2))
print(fin_se.skill_sim.describe().round(2))
In [ ]:
fin_se = fin_se.sort_values('topic_sim', ascending=False)
# del fin_se['index']
In [ ]:
fin_se.head()
In [ ]:
fin_se.head().to_csv(SKILL_RES + 'fin_se_top5.csv', index=False)
In [ ]:
fin_se = fin_se.sort_values('skill_sim', ascending=False)
fin_se.head()
In [ ]:
np.corrcoef(fin_se.skill_sim, fin_se.topic_sim)[0,1]
In [ ]:
posts = getPostsInPairs(fin_se.head())
fig = vizTopicDists(posts, doc_topic_distr, figsize=(12, 6))
plt.savefig(SKILL_RES + 'fig/fin_se_top5.pdf')
plt.show(); plt.close()
In [ ]:
fin_se.tail().to_csv(SKILL_RES + 'fin_se_bottom5.csv', index=False)
In [418]:
fin_man = calSimScores('Manager', 'Financial and Insurance Activities', df)
print(fin_man.topic_sim.describe().round(2))
print(fin_man.skill_sim.describe().round(2))
In [ ]:
fin_man = fin_man.sort_values('topic_sim', ascending=False); del fin_man['index']
In [ ]:
fin_man.head().to_csv(SKILL_RES + 'fin_man_top5.csv', index=False)
fin_man.tail().to_csv(SKILL_RES + 'fin_man_bottom5.csv', index=False)
In [ ]:
posts = getPostsInPairs(fin_man.tail(), df)
In [ ]:
top5 = fin_man.query('employer1 != employer2 and skill_sim <= 0.8').head()
In [ ]:
fig = vizTopicDists(posts, doc_topic_distr, figsize=(12, 6))
plt.savefig(SKILL_RES + 'fig/fin_man_bottom5.pdf')
plt.show(); plt.close()
In [ ]:
rf_sims = calSimScores(job_title='Research Fellow', industry='Education', df=df)
In [ ]:
print(rf_sims.topic_sim.describe().round(3))
print(rf_sims.skill_sim.describe().round(2))
In [ ]:
rf_sims = rf_sims.sort_values('topic_sim', ascending=False)
rf_sims.head()
In this section, we try clustering models that can work directly on a precomputed similarity matrix, namely Affinity Propagation and Spectral Clustering (see the toy illustration below).
After training each model, we analyze the returned clusters by examining their sizes and plotting the topic distributions of the posts they contain.
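Both models are available in scikit-learn and accept affinity='precomputed'; a minimal illustration on a toy similarity matrix (standing in for our topic-similarity matrices):
In [ ]:
import numpy as np
import sklearn.cluster as cluster

# Toy 3x3 symmetric similarity matrix: posts 0 and 1 are similar, post 2 is not
toy_sim = np.array([[1.0, 0.9, 0.1],
                    [0.9, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])
af = cluster.AffinityPropagation(affinity='precomputed').fit(toy_sim)
sc = cluster.SpectralClustering(n_clusters=2, affinity='precomputed').fit(toy_sim)
print(af.labels_); print(sc.labels_)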
In [ ]:
import sklearn.cluster as cluster
In [ ]:
# dir to store results
JOB_PROF = SKILL_RES + 'job_prof/'
In [ ]:
se_sims = calSimScores('Software Engineer', df=df, out_fmt='matrix_topic_sim') # 'Financial and Insurance Activities',
In [ ]:
se_posts = df[df.title == 'Software Engineer']
se_cluster = AF_clustering(se_posts, job_title='se', sim_mat=se_sims)
In [ ]:
se_cluster.groupby('af_label').size().sort_values(ascending=False)
In [ ]:
fig = plotCluster(0, job_title='SE',cluster_res=se_cluster)
# plt.show(); plt.close()
In [ ]:
for i in range(9):
    fig = plotCluster(i, job_title='SE', cluster_res=se_cluster)
In [ ]:
fig = plotCluster(10, job_title='SE',cluster_res=se_cluster)
plt.show(); plt.close()
In [ ]:
fig = plotCluster(22, job_title='SE', cluster_res=se_cluster)  # plotCluster sets its own figure size
plt.show(); plt.close()
In [ ]:
se_sims_df = calSimScores('Software Engineer')
In [ ]:
se_sims_df.sort_values('topic_sim', ascending=False, inplace=True)
se_sims_df.head()
In [ ]:
se_sims_df.query('skill_sim < 1 and employer1 != employer2').head()
In [ ]:
dev_titles = set([s for s in df.title if ('Developer' in s)])
dev_titles
In [421]:
dev_posts = df[(df.title == 'Software Developer') & (df.industry == 'Financial and Insurance Activities')]
print('# posts of Software Developer in Finance: %d' %dev_posts.shape[0])
In [ ]:
sd_sims_df = calSimScores('Software Developer')
In [ ]:
sd_sims_df.topic_sim.describe().round(3)
Result: the similarities among Software Developer posts are also high, with a mean of 0.83.
In [ ]:
man_posts = df[(df.title == 'Manager')] # (df.industry == 'Financial and Insurance Activities')
man_sims = calSimScores('Manager', industry=None, df=df, out_fmt='matrix_topic_sim')
In [ ]:
man_cluster = AF_clustering(man_posts, job_title='Manager', sim_mat=man_sims)
In [ ]:
man_cluster.groupby('af_label').size().sort_values(ascending=False)
In [ ]:
for c in [7, 9, 11, 14, 22]:
    plotCluster(c, job_title='fin_man', cluster_res=man_cluster)
plt.close('all')
In [ ]:
man_sim_df = calSimScores('Manager')
In [ ]:
assoc_sim_df = calSimScores('Associate')
In [ ]:
assoc_sim_df.topic_sim.describe().round(3)
In [ ]:
fig = plotSimDists(assoc_sim_df, job_title='Associate')
fig.savefig(SKILL_RES + 'fig/assoc_sim_dists.pdf')
plt.show(); plt.close()
In [ ]:
se_sims_df.topic_sim.describe().round(3)
In [ ]:
man_sim_df.topic_sim.describe().round(3)
In [ ]:
plt.close('all')
In [ ]:
fig = plotSimDists(sim_df=se_sims_df, job_title='Software Engineer')
plt.savefig(SKILL_RES + 'fig/se_sim_dists.pdf')
plt.show(); plt.close()
In [ ]:
fig = plotSimDists(sim_df=man_sim_df, job_title='Manager')
fig.savefig(SKILL_RES + 'fig/man_sim_dists.pdf')
plt.show(); plt.close()
In [ ]:
rf_sim_mat = calSimScores(job_title='Research Fellow', industry='Education', df=df, out_fmt='matrix_topic_sim')
In [ ]:
rf_posts = df[(df.title=='Research Fellow') & (df.industry == 'Education')]
print('# posts of Research Fellow: %d' %rf_posts.shape[0])
In [ ]:
rf_cluster = AF_clustering(rf_posts, job_title='rf', sim_mat=rf_sim_mat)
In [ ]:
rf_cluster.groupby('af_label').size().sort_values(ascending=False)
In [ ]:
rf_c0 = plotCluster(0, job_title='RF', cluster_res=rf_cluster)
rf_c1 = plotCluster(1, job_title='RF', cluster_res=rf_cluster)
plt.close('all')
In [ ]:
# NB: this relies on the variant of simScore that returns the full pairwise sims (res), not the mean
se_and_sd = simScore('Software Engineer', 'Software Developer')
In [ ]:
se_and_sd = se_and_sd.sort_values('topic_sim', ascending=False)
se_and_sd.reset_index(inplace=True); del se_and_sd['index']
se_and_sd.head()
In [ ]:
fig = plotSimDists(se_and_sd, 'SE_and_SD')
fig.savefig(SKILL_RES + 'fig/se_and_sd_sims.pdf')
plt.show(); plt.close()
In [ ]:
plt.close('all')
In [ ]:
vizPair(0, se_and_sd, abbv_title='se_vs_sd')
In [ ]:
last = se_and_sd.shape[0] - 1
vizPair(last, se_and_sd, abbv_title='se_vs_sd')
In [ ]:
se_and_man = simScore('Software Engineer', 'Manager')
In [ ]:
se_and_man.topic_sim.describe().round(3)
In [ ]:
fig = plotSimDists(se_and_man)
fig.savefig(SKILL_RES + 'fig/se_and_man_sims.pdf')
plt.show(); plt.close()
In [ ]:
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack', affinity="precomputed")
In [ ]:
# fin_se_sims: precomputed topic-similarity matrix; its defining cell appears to be missing, presumably:
fin_se_sims = calSimScores('Software Engineer', 'Financial and Insurance Activities', df, out_fmt='matrix_topic_sim')
spectral.fit(fin_se_sims)
In [ ]:
fin_se_posts = df[(df.title == 'Software Engineer') & (df.industry == 'Financial and Insurance Activities')].copy()
fin_se_posts['cluster'] = spectral.labels_
fin_se_posts = fin_se_posts.sort_values('cluster')
In [ ]:
# fin_man_sims: precomputed topic-similarity matrix; its defining cell appears to be missing, presumably:
fin_man_sims = calSimScores('Manager', 'Financial and Insurance Activities', df, out_fmt='matrix_topic_sim')
spectral.fit(fin_man_sims)
In [ ]:
fin_man_posts = df[(df.title == 'Manager') & (df.industry == 'Financial and Insurance Activities')].copy()
In [ ]:
fin_man_posts['cluster'] = spectral.labels_
fin_man_posts = fin_man_posts.sort_values('cluster')
In this section, we use the skill frameworks from SkillsFuture (SF) as a source to evaluate our topic model as well as our proposed consistency score. Currently available frameworks cover 3 sectors: (i) Hotel and Accommodation Services, (ii) Early Childcare and Education, and (iii) Precision Engineering. Given a job title t in one of these sectors, we proceed with the following steps.
In [ ]:
df = df[df.title.notnull()]
# standardize employer_name
df.employer_name = df.employer_name.apply(lambda s: s.replace('PTE LTD', 'PTE. LTD.')
                                           .replace('PTE. LIMITED', 'PTE. LTD.')
                                           .replace('PRE-SCHOOL', 'PRESCHOOL'))
df.to_csv(SKILL_DAT + 'filter_doc_index.csv')
The list of titles for PST may be formed by looking at titles from certain pre-schools in Singapore. We tried the top pre-schools listed at https://skoolopedia.com/preschool-singapore-2015-infographic/. The first try returned empty results! Checking against employer names in the data showed that we need to append 'PTE. LTD.' to the school names. We then added schools found in the data.
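The helper found() is defined in cluster_skill_helpers; a minimal sketch of its assumed behavior (True iff any keyword occurs in the given name):
In [ ]:
def found(keys, name):
    # Assumed behavior of the `found` helper from cluster_skill_helpers
    return any(k in name for k in keys)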
In [ ]:
keys = map(str.upper, ['PreSchool', 'Skool', 'CHILDCARE', 'Kid', 'toddler', 'infant'])
guessed_preschools = set([s for s in df.employer_name if found(keys, s)])
print('# guessed preschools: %d' %len(guessed_preschools))
guessed_preschools
In [ ]:
# ['Shaws CDLC', 'childfirst', 'kiddiWinkie', 'little footprints', 'brighton montessori', 'posso', 'little skool-house',
# 'little village', 'mulberry', 'learning vision', 'Star Learners', 'global eduhub', 'sunflower childcare', 'frobel']
preschool_keys = ['E-BRIDGE', 'ETONHOUSE PRESCHOOL', 'MINDCHAMPS', 'LECLARE', "Pat's Schoolhouse",
'SKOOL4KIDZ', 'VIVA KIDS', 'JUST KIDS LEARNING PLACE', 'BIBINOGS KIDS ACADEMY',
'CREATIVE LAND CHILDCARE', 'Lorna Whiston',
'Carpe Diem', 'Crestar', 'nurture edu', 'safari house']
preschool_keys = map(str.upper, preschool_keys)
preschools = [s for s in df.employer_name if found(preschool_keys, s)]
preschool_posts = df[df.employer_name.isin(preschools)]
print('# posts from preschool employers: %d, distributed as follows:' %preschool_posts.shape[0])
sortDown(preschool_posts.groupby('employer_name').size())
In [ ]:
titles = set(preschool_posts['title'])
titles
Among the titles, we can only find 2 titles for pre-school teacher (Child Care Teacher and Pre-Primary Education Teacher). The reason is that our set of preschools is not exhaustive. How can we fix this?
Another way to search for PST titles is to look at the job titles containing Teacher and manually narrow them down to pre-school teachers, as follows.
In [ ]:
idx = [i for i,s in enumerate(df.title) if ('Teacher' in s)]
teacher_df = df.iloc[idx]
In [ ]:
print('# posts of titles containing kw Teacher: %d' %teacher_df.shape[0])
In [ ]:
teacher_stat = getTitleStats(teacher_df)
teacher_stat.to_csv(SKILL_RES + 'pst/teachers.csv', index=False)
Based on this, we guessed the following titles for PST.
In [ ]:
cc_teachers = ['Pre-School Teacher', 'Kindergarten Teacher', 'Child Care Teacher', 'Pre-Primary Education Teacher',
'Teacher, Kindergarten', 'Teacher, Child Care', 'Day Care Teacher']
In [ ]:
teacher_stat.describe().round(1)
In [ ]:
teacher_stat[teacher_stat.title.isin(cc_teachers)]
In [ ]:
pst_posts = df[df.title == 'Pre-School Teacher']
pst_posts.to_csv(SKILL_RES + 'pst/posts.csv', index=False)
In [ ]:
pst_sims = pairwiseSim(pst_posts, doc_topic_distr)
In [ ]:
print(pst_sims.topic_sim.describe().round(3))
pst_sims.skill_sim.describe().round(2)
In [ ]:
fig = plotSimDists(pst_sims, sci_fmt=False)
# fig.suptitle('Pre-School Teacher (13 posts)', fontsize=20)
fig.savefig(SKILL_RES + 'fig/pst_sims.pdf')
plt.show(); plt.close()
In [ ]:
pst_sims.query('skill_sim >= 0.6')
In [ ]:
pst_sims.sort_values('topic_sim', ascending=False, inplace=True)
pst_sims.to_csv(SKILL_RES + 'pst/pst_sims.csv', index=False)
In [ ]:
pst_sims = pst_sims.query('skill_sim < 0.6')
vizPair(0, pst_sims, labels, abbv_title='pst')
In [ ]:
last = pst_sims.shape[0] - 1
vizPair(last, pst_sims, labels, abbv_title='pst')
In [ ]:
# employers having PST positions
pst_employers = np.unique(pst_posts.employer_name)
print('# PST employers: %d' %len(pst_employers))
# pst_employers
In [ ]:
posts_of_pst_employers = df[df.employer_name.isin(pst_employers)]
print('# posts of PST employers: {}'.format(posts_of_pst_employers.shape[0]))
In [ ]:
other_titles_df = getTitleStats(posts_of_pst_employers).query('title != "Pre-School Teacher"') # n_post > 1
other_titles = other_titles_df['title']
In [ ]:
teachers = teacher_stat.title
Titles from PST employers will not include all Teacher titles.
In [ ]:
# set(teachers).difference(other_titles)
In [394]:
rel_titles = set(teachers.append(other_titles))
rel_posts = df[df.title.isin(rel_titles)]
In [396]:
# merge diff versions of some titles
rel_posts = changeTitle(rel_posts, 'Teacher, Kindergarten', 'Kindergarten Teacher')
rel_posts = changeTitle(rel_posts, 'Teacher, Child Care', 'Child Care Teacher')
In [397]:
rel_titles = set(rel_posts.title)
print('# relevant titles: %d' %len(rel_titles))
print('# titles retrieved by kw teacher: {}'.format(len(teachers)))
print('# titles retrieved by PST employers: {}'.format(len(other_titles)))
In [398]:
res = getTitleStats(rel_posts)
In [399]:
res['topic_sim_with_pst'] = res['title'].apply(simScore, t2='Pre-School Teacher')
In [400]:
res.sort_values('topic_sim_with_pst', ascending=False, inplace=True)
res.avg_n_skill = res.avg_n_skill.round(1)
In [ ]:
res.to_csv(SKILL_RES + 'pst/sims.csv', index=False)
In [408]:
res.describe().round(3)
Out[408]:
The description shows us the following:
In [ ]:
res.query('n_post >= 6').describe().round(3)
In [407]:
res.query('n_post >= 6').head(10)
Out[407]:
In [405]:
df[df.title == 'Speech and Drama Teacher'].iloc[0].doc
Out[405]:
In [ ]:
res.tail()
In [ ]:
tmp = teacher_df.query('title == "Student Teacher"')
print(tmp.iloc[0]['employer_name'])
print(tmp.iloc[0]['doc'])
In [ ]:
# cc_df presumably holds posts of the child-care teacher titles, e.g. df[df.title.isin(cc_teachers)];
# its defining cell appears to be missing
cc_sims = pairwiseSim(cc_df, doc_topic_distr)
In [ ]:
print('# pairs: %d' %cc_sims.shape[0])
In [ ]:
# del cc_sims['index']
cc_sims.topic_sim.describe().round(3)
In [ ]:
fig = plotSimDists(cc_sims, 'Pre-School Teacher')
fig.savefig(SKILL_RES + 'fig/pst_sims.pdf')
plt.show(); plt.close()
In [ ]:
cc_sims = cc_sims.sort_values('skill_sim', ascending=False)
In [ ]:
cc_sims.query('skill_sim >= 0.8').to_csv(SKILL_RES + 'job_prof/pst_variants.csv', index=False)
In [ ]:
cc_sims.query('(employer1 != employer2) and (topic_sim >= 0.9) and (skill_sim < 0.8)')
In [ ]:
niwa_df = cc_sims.query('employer1 == "NIWA SCHOOLHOUSE"')
print(niwa_df.shape[0])
In [ ]:
niwa_df.head()
In [ ]:
tmp = niwa_df.head()
plt.close('all')
In [ ]:
vizPair(0, niwa_df)
In [ ]:
vizPair(1, niwa_df)
Result: The topic similarity scores are very high with a mean value of 0.99!
The SF framework for PST has 10 categories of skills (listed below). When we manually labeled the top-100 popular skills using these categories, we detected another group of skills, namely language skills, e.g., Chinese, Mandarin; we labeled this group Language Skill. We also found several skills which do not fit any category, so we created an Others group for them. In short, we have the following 12 categories of skills.
In [ ]:
categs = {'c-dev': 'Child Development', 'ped': 'Curriculum & Pedagogy',
'env': 'Learning environment', 'rel' : 'Interaction & Relationships',
'safety' : 'Health, Safety & Nutrition', 'family & com' : 'Family & Community partnerships',
'prof dev' : 'Professional Mastery', 'prof val' : 'Professional Values & Ethics',
'teamwork' : 'Teamwork & Collaboration', 'plan' : 'Visioning & Planning',
'lang': 'Language Skill', 'others' : 'Others'}
In [ ]:
skill_sets = map(lambda s: set(s.split(',')), cc_df.occur_skills)
cc_skills = unionAll(skill_sets)
print('# skills in child care posts: %d' %len(cc_skills))
freqs = [freq(sk, skill_sets) for sk in cc_skills]
cc_skill_df = pd.DataFrame({'skill': list(cc_skills), 'freq': freqs})
# cc_skill_df.sort_values('freq', ascending=False).to_csv(SKILL_RES + 'cc_skills.csv', index=False)
The final result is as follows.
In [ ]:
cc_skill_df = pd.read_csv(SKILL_RES + 'cc_skills.csv')
In [ ]:
# top 100 skills
top_100 = pd.DataFrame(cc_skill_df.head(100))
top_100['Category'] = top_100.label.apply(lambda x: categs[x])
print('Distribution of categories among top-100 skills')
sortDown(top_100.groupby('Category').size())
In [ ]:
top_100['skill_freq'] = top_100.skill + ' (' + top_100.freq.astype(str) + ')'
top_100.head(1)
In [ ]:
tmp = top_100.groupby('Category').agg({'skill_freq': joinEntries, 'skill': len})
tmp = tmp.reset_index().rename(columns = {'skill_freq': 'skills (freq)', 'skill': 'n_skill'})
tmp
In [ ]:
tmp.to_csv(JOB_PROF + 'cc_skill_categ.csv', index=False)
In [ ]:
print([sk for sk in cc_skills if 'child' in sk])
print([sk for sk in cc_skills if 'curriculum' in sk])
In [ ]:
# query employers in HAS in ds
hotel_kws = map(str.upper, ['hotel', 'hostel', 'motel', 'lodging', 'resort'])
names = [s for s in df.employer_name if found(hotel_kws, s)]
hotels = pd.DataFrame({'name': names}).drop_duplicates()
print('# employers in HAS: %d' %hotels.shape[0])
In [ ]:
hotels.to_csv(SKILL_RES + 'hotels.csv', index=False)
In [ ]:
# query all posts of the employers
has_posts = df[df.employer_name.isin(hotels.name)]  # match against the name column, not the whole DataFrame
print('# posts in HAS: %d, distributed as follows:' %has_posts.shape[0])
# sortDown(has_posts.groupby('employer_name').size())
In [ ]:
# query possible job titles for the employers in HAS
has_title = set(has_posts.title)
print('# titles in the HAS employers: %d' %len(has_title))
In [ ]:
has_title_stats = stats[stats.title.isin(has_title)]
has_title_stats.to_csv(SKILL_RES + 'has_title_stats.csv', index=False)
In [ ]:
has_title_stats.describe().round(1)
In [ ]:
HK_DIR = SKILL_RES + 'job_prof/hk/'
In [ ]:
# based on __keywords__ in titles suggested by SF
hk_kw = ['Housekeeper', 'Housekeeping Attendant', 'Room']
hk_titles = set([t for t in has_posts.title if found(hk_kw, t)])
hk_titles
Only 'Housekeeper (Hotels and Other Establishments)' and 'Housekeeping Attendant' are exact matches for the HK track, so we keep only these two titles. We then change the former to the latter so that HK posts have a single consistent title.
In [ ]:
df = changeTitle(df, 'Housekeeper (Hotels and Other Establishments)', 'Housekeeping Attendant')
In [380]:
hk_kw = ['Housekeep']
hk_titles = set([t for t in df.title if found(hk_kw, t)])
hk_titles
Out[380]:
As HK Manager and HK Supervisor are similar, we merge them together.
In [381]:
df = changeTitle(df, 'Housekeeping Manager', 'Housekeeping Supervisor')
After the merge, we need to recompute the job title statistics, as they have changed.
In [382]:
stats = getTitleStats(df)
stats.to_csv(SKILL_RES + 'stats.csv', index=False)
In [383]:
hk_titles = set([t for t in df.title if found(hk_kw, t)])
hk_titles
Out[383]:
In [ ]:
_ = df[df.title.isin(hk_titles)]
print('# posts: %d' %_.shape[0])
In [ ]:
hk_posts = df[df.title == 'Housekeeping Attendant']
print('# HK posts: %d' %hk_posts.shape[0])
In [ ]:
hk_sims = pairwiseSim(hk_posts, doc_topic_distr)
hk_sims.topic_sim.describe().round(2)
Again, the topic similarity is high with a mean value of $0.89$.
In [ ]:
fig = plotSimDists(hk_sims, sci_fmt=False)
fig.set_tight_layout(True)
fig.savefig(SKILL_RES + 'fig/HK/hk_sims.pdf')
plt.show(); plt.close()
In [ ]:
hk_sims = hk_sims.sort_values('topic_sim', ascending=False)
In [ ]:
vizPair(0, hk_sims, labels, abbv_title='hk')
The two posts above are from hotels of the same group: Park Hotel Group.
In [ ]:
vizPair(1, hk_sims, labels, abbv_title='hk')
In this case, a job agency reposted the job for the employer.
In [ ]:
vizPair(2, hk_sims, labels, abbv_title='hk')
In [ ]:
last = hk_sims.shape[0]-1
vizPair(last, hk_sims, labels, abbv_title='hk')
In [ ]:
hk_employers = hk_posts.employer_name.drop_duplicates()
hk_employers.to_csv(HK_DIR + 'employers.csv', index=False)
In [ ]:
print('# employers having HK positions: %d' %len(hk_employers))
In [ ]:
# posts from HK employers
posts_of_hk_employers = df[df.employer_name.isin(hk_employers)]
print('# posts from HK employers: %d' %posts_of_hk_employers.shape[0])
In [ ]:
titles_from_hk_employers = set(posts_of_hk_employers['title'])
print('# titles from HK employers: %d' %len(titles_from_hk_employers))
In [ ]:
rel_titles = titles_from_hk_employers.union(hk_titles)
print('# relevant titles: %d' %len(rel_titles))
In [ ]:
hk_titles.difference(titles_from_hk_employers)
As we already cover a large number of relevant titles, we may not need to retrieve more titles based on keywords.
But first let us look at basic stats of the titles.
In [385]:
rel_title_stats = stats[stats.title.isin(rel_titles)]
rel_title_stats.describe().round(1)
Out[385]:
In [ ]:
rel_title_stats.to_csv(HK_DIR + 'rel_title_stats.csv', index=False)
From the summary, we see that:
In [384]:
hk_title_stats = stats[stats.title.isin(hk_titles)]
hk_title_stats
Out[384]:
As 512 titles are too many, we need to narrow them down. We want to keep all titles with the keyword 'Housekeep' except the last one. Combining the stats of the HK titles with those of the relevant titles, we have 2 options for the filtering threshold $\theta$:
In [386]:
thetas = [22, 38]
for t in thetas:
    _ = rel_title_stats.query('n_post >= {}'.format(t))
    print('# titles to be compared if threshold is {}: {}'.format(t, _.shape[0]))
In [387]:
def sims2RelTitles(min_post):
    '''Topic similarity of each relevant title (with at least min_post posts) to Housekeeping Attendant.'''
    print('Picked titles with at least {} posts'.format(min_post))
    res = rel_title_stats.query('n_post >= {}'.format(min_post)).copy()
    res['topic_sim_with_hk'] = res['title'].apply(simScore, t2='Housekeeping Attendant')
    res = res.round(2)
    res = res.sort_values('topic_sim_with_hk', ascending=False)
    res.reset_index(inplace=True); del res['index']
    return res
In [388]:
res_38 = sims2RelTitles(min_post=38)
In [392]:
res_38.head(10).to_csv(HK_DIR + 'top_sim_titles.csv', index=False)
In [393]:
res_38.head(10)
Out[393]:
In [391]:
res_38.describe().round(2)
Out[391]:
In [ ]:
# limit to titles with > 22 posts
res = rel_title_stats.query('n_post > 22').copy()
print('# titles with > 22 posts: %d' % res.shape[0])
res['topic_sim_with_hk'] = res['title'].apply(simScore, t2='Housekeeping Attendant')
In [ ]:
res = res.round(2)
res = res.sort_values('topic_sim_with_hk', ascending=False)
res.head(10)
In [ ]:
res.to_csv(HK_DIR + 'sims_to_hk.csv', index=False)
In [ ]:
# c presumably holds a Counter of skill frequencies; its defining cell appears to be missing
skill_df = pd.DataFrame({'skill': c.keys(), 'freq': c.values()})
skill_df = skill_df.sort_values('freq', ascending=False)
In [ ]:
hk_skills = skillFreq(hk_posts)
print('# skills in HK posts: %d' %hk_skills.shape[0])
hk_skills.head()
In [ ]:
hk_skills.to_csv(SKILL_RES + 'job_prof/hk_skills.csv', index=False)
In [ ]:
# Machinist/Technician are suggested by SF
tech_kw = ['Machinist', 'Technician']
tech_titles = [t for t in df.title if found(tech_kw, t)]
c = Counter(tech_titles)
In [ ]:
tech_titles = pd.DataFrame({'title': c.keys(), 'n_post': c.values()}).sort_values('n_post', ascending=False)
In [ ]:
tech_titles.n_post.describe().round(1)
In [ ]:
tech_titles = tech_titles.query('n_post > 10')
print('# titles: %d' %tech_titles.shape[0])
In [ ]:
tech_titles.tail()
In [ ]:
tech_posts = df[df.title.isin(tech_titles.title)]
print('# posts for Technician: %d' %tech_posts.shape[0])
In [ ]:
getTitleStats(tech_posts).to_csv(SKILL_RES + 'job_prof/tech_titles.csv', index=False)
In [ ]:
sortDown(tech_posts.groupby('title').size())
In [ ]:
tech_sims = pairwiseSim(tech_posts, doc_topic_distr)
In [ ]:
# plot dists of the sims
fig = plotSimDists(tech_sims, 'Technician jobs')
fig.savefig(SKILL_RES + 'fig/tech_jobs_sim.pdf')
plt.show(); plt.close()
In [ ]:
tech_sims.skill_sim.describe().round(2)
In [ ]:
tech_sims.query('skill_sim == 1')
In [ ]:
tech_sims.topic_sim.describe().round(2)
In [ ]:
tech_sims = tech_sims.sort_values('topic_sim', ascending=False)
In [ ]:
n_pair = tech_sims.shape[0]; last = n_pair - 1
vizPair(last, tech_sims)
In [ ]:
fm_posts = df[df.title == 'Fashion Merchandiser']
fm_sims = pairwiseSim(fm_posts, doc_topic_distr)
In [ ]:
fm = fm_sims.sort_values('skill_sim', ascending=False)
fm.head().to_csv(SKILL_RES + 'job_prof/fm_variants.csv', index=False)