Clustering Job Posts

Given a job title (in an industry), we would like to group its job posts into clusters by their similarity in topic distributions. The clusters are speculated to reveal us some interesting insights (e.g. trends) on the job title.

In [1]:
import my_util as my_util
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *

import random as rd

In [2]:
global doc_topic_distr

In [3]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
SKILL_DAT = HOME_DIR + 'data/clean/skill_cluster/' 
SKILL_RES = HOME_DIR + 'results/' + 'skill_cluster/new/'
JOB_PROF = SKILL_RES + 'job_prof/'

In [4]:
df = pd.read_csv(SKILL_DAT + 'filter_doc_index.csv')
print df.shape
df.set_index('index', inplace=True)

(71338, 9)

In [5]:
with(open(SKILL_RES + 'doc_20topic_distr.mtx', 'r')) as f:
    doc_topic_distr = mmread(f)

In [6]:
# reload(my_util)
# from my_util import *
from cluster_skill_helpers import *

In [19]:
def changeTitle(df, t1='Teacher, Kindergarten', t2='Kindergarten Teacher'):
    n_post = df.shape[0]
    idx = [i for i in range(n_post) if df.iloc[i].title == t1]
    out_df = df.copy()
    out_df.title.iloc[idx] = t2
    return out_df

def vizPair(i, sim_df, labels, abbv_title=''):
    fig = vizDists4Pair(sim_df.iloc[i], df, doc_topic_distr, labels)
    fig.savefig(SKILL_RES + 'fig/{}_p{}.pdf'.format(abbv_title, i+1)); plt.close()
def calSimScores(job_title='Research Fellow', industry=None, df=df, out_fmt='data_frame'): # 'Education'
    posts = df[(df['title'] == job_title) & (df['industry'] == industry)] if industry is not None else df[df['title'] == job_title]
#     posts = rmBadPosts(posts, job_title)
    n_post = posts.shape[0]
    if n_post > 100: posts = posts.sample(100)
    n_post = posts.shape[0]
#     print('{} in {}: {} posts'.format(job_title, industry, n_post))
    return pairwiseSim(posts, doc_topic_distr, out_fmt, verbose=False)

def consistency(job_title, industry, save_sim=False, abbv_job='', abbv_industry=''):
    @brief: calculate consistency score of given job title in given industry as avg of job post sims
    @param: save_sim=True if want to save the sims
    sims = calSimScores(job_title, industry)
    if save_sim:
        fname = JOB_PROF + 'consistency/{}_{}_sims.csv'.format(abbv_industry, abbv_job)
        sims.to_csv(fname, index=False)
    cscore = round(sims['topic_sim'].mean(), 3)
    return cscore

def cScoreAtRow(row):
    @return: consistency score of pair (job_title, industry) in the given row
    count =
    if (count % 100 == 0): print('{} pairs and counting...'.format(count))
    job_title, industry = row['title'], row['industry']
    sims = calSimScores(job_title, industry)
    cscore = round(sims['topic_sim'].mean(), 3)
    return cscore

def simScore(t1, t2):
    print('{} vs. {}'.format(t1, t2))
    posts1 = df[df.title == t1]; posts2 = df[df.title == t2]
##     Rm lousy posts with too few skills from both sets
#     posts1 = rmBadPosts(posts1, t1)
#     posts2 = rmBadPosts(posts2, t2)
##     Sample for efficiency (if too many posts)
    n1, n2 = posts1.shape[0], posts2.shape[0]
    if n1 > 100: posts1 = posts1.sample(100)
    if n2 > 100: posts2 = posts2.sample(100)
    if (n1 > 0) and (n2 > 0): # needed to avoid empty sets if bad posts are removed
        res = crossSimScores(posts1, posts2, doc_topic_distr, verbose=False)
        topic_sim = round(res['topic_sim'].mean(), 3)
        return topic_sim  # return res
        # print('Topic similarity score bw {} and {}: {}'.format(t1, t2, topic_sim))
    return np.nan

def AF_clustering(posts, job_title, sim_mat):
    af = cluster.AffinityPropagation(affinity='precomputed').fit(sim_mat) # preference=-50,
    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    n_post = posts.shape[0]
    print('# posts to be clustered by Affinity Propagation model: {}'.format(n_post))
    print('Estimated number of clusters: %d' %n_clusters_)
    # Representatives (cluster centers)
    reps = posts.iloc[cluster_centers_indices]
    reps.to_csv(JOB_PROF + 'clusters/{}_reps.csv'.format(job_title), index=False)
    # Retrieve labels of posts
    res = posts.copy()
    res['af_label'] = af.labels_
    return res.sort_values('af_label')

def plotCluster(c, job_title, cluster_res): # figsize=(12,6)
    posts = cluster_res.query('af_label == {}'.format(c))
    n_post = posts.shape[0]
    if (n_post % 2 == 1):
        print('n_post={} is odd number, drop 1 post'.format(n_post)); n_post -= 1
        posts = posts.iloc[1:]

    w = 12; h = 3*n_post/4 if n_post >= 8 else 6
    fig = vizTopicDists(posts, doc_topic_distr, figsize=(w, h))
    i=c+1; title = 'Topic distribution of {} posts in {}th cluster'.format(job_title, i)
    fig.suptitle(title, fontsize=20)
    fig.savefig(SKILL_RES + 'fig/c{}_{}.pdf'.format(i, job_title))
    return fig

In [ ]:
good_df = df.query('n_skill >= 10')

We need to get basic stats of job titles to understand more about them. Given a title, we need to know:

  • How many posts for the title in whole ds
  • avg n_skill in the posts
  • n_employer having the title

In [ ]:
stats = getTitleStats(df)
stats.to_csv(SKILL_RES + 'stats.csv', index=False)

In [ ]:
k = 20;  fname = SKILL_RES + 'lda/{}_topics.csv'.format(k)
doc_topic_distr = topic_distr[k]
topic_df = pd.read_csv(fname) 
labels = map(str.upper, topic_df['label'])

Range of Job Post Similarity Scores

We will look at distribution of topic similarity scores of certain samples in data. We investigate how the distribution varies with n_topic and sample size.

In [ ]:
with(open(SKILL_RES + 'doc_topic_distr.mtx', 'r')) as f:
    doc_15topic_distr = mmread(f)

with(open(SKILL_RES + 'doc_20topic_distr.mtx', 'r')) as f:
    doc_20topic_distr = mmread(f)

with(open(SKILL_RES + 'doc_30topic_distr.mtx', 'r')) as f:
    doc_30topic_distr = mmread(f)

In [ ]:

In [ ]:
# Allow us to loop over doc-topic dists wrt diff no. of topics
topic_distr = {15: doc_15topic_distr, 20: doc_20topic_distr, 30: doc_30topic_distr}

In [ ]:
rd.seed(1234567) # time()
size = 500; posts = good_df.sample(size)

In [ ]:
sims_15 = pairwiseSim(posts, doc_15topic_distr)
sims_20 = pairwiseSim(posts, doc_20topic_distr)
sims_30 = pairwiseSim(posts, doc_30topic_distr)

In [ ]:

In [ ]:
sims_15.sort_values('topic_sim', inplace=True)
vizPair(0, sims_15)
vizPair(1, sims_15)

In [ ]:
medium = sims_15.query('0.7 < topic_sim and topic_sim <= 0.8')
medium.sort_values('topic_sim', inplace=True)

vizPair(0, medium, abbv_title='medium')

last = medium.shape[0]-1
vizPair(last, medium, abbv_title='medium')

large = sims_15.query('topic_sim > 0.8')
vizPair(0, large, abbv_title='large')

n_pair = sims_15.shape[0]; last = n_pair - 1
vizPair(last, sims_15)

In [ ]:
sims = {15: sims_15, 20: sims_20, 30: sims_30}
ks = [15, 20, 30]

In [413]:
for k in ks:
    fig = plotSimDists(sims[k])
    fig.suptitle('Sample size: {} posts'.format(size), fontsize=20)
    fname = SKILL_RES + 'fig/sim_dists_{}topics.pdf'.format(k)
#     fig.set_tight_layout(True)
    fig.savefig(fname); plt.close()

The plots show that topic similarity distribution $sim_{topic}$ changes gradually with the number of topics $k$ and with sample size. For a sample of size 300, we have

  • $k=15$: $sim_{topic}$ is distributed in the range $ [0.63, 0.93] $
  • $k=20$: $sim_{topic}$ is distributed in the range $ [0.62, 0.92] $
  • $k=30$: $sim_{topic}$ is distributed in the range $ [0.6, 0.9] $

Similarity of Job Posts and Consistency Score

We will look at consistency score of a given job title in a given industry. First, let's see how many pairs (job title, industry) we have and get stats for the pairs.

In [6]:

doc job_id title n_skill occur_skills employer_id employer_name industry
102501 oversee site operations for ground improvement... JOB-2015-0145129 Site Engineer 5 consultant,installation,ground improvement,lan... 199800294Z KIARATEX EXPORTS PTE. LTD. Wholesale and Retail Trade

In [8]:
by_job_and_industry = df.groupby(['title', 'industry'])

agg_df = by_job_and_industry.agg({'job_id': len, 'employer_id': 'nunique', 'n_skill': 'mean'})

agg_df = agg_df.rename(columns={'job_id': 'n_post', 'employer_id': 'n_employer', 
                               'n_skill': 'avg_n_skill'})
agg_df = agg_df.reset_index()

n_employer avg_n_skill n_post
count 15471.0 15471.0 15471.0
mean 3.3 14.9 4.6
std 6.2 8.5 13.6
min 1.0 2.0 1.0
25% 1.0 9.0 1.0
50% 1.0 13.0 2.0
75% 3.0 18.5 4.0
max 170.0 87.0 525.0

In [12]:
agg_df.sort_values('n_post', inplace=True)

In [61]:

title industry n_employer avg_n_skill n_post
0 Analyst Financial and Insurance Activities 78 19.613333 525
1 Research Fellow Education 4 11.977876 452
2 Associate Financial and Insurance Activities 53 26.796569 408
3 Administrative Assistant Administrative and Support Service Activities 124 9.773810 336
4 Research Assistant Education 7 11.843511 262
5 Manager Financial and Insurance Activities 48 21.367089 237
6 Vice President Financial and Insurance Activities 30 26.600858 233
7 Research Associate Education 5 13.545045 222
8 Application Developer Information and Communications 100 14.820276 217
9 Recruitment Consultant Administrative and Support Service Activities 114 13.995305 213
  • First, analyze tuples (job title, industry) with $\ge 100 $ posts.

In [10]:
res = agg_df.query('n_post >= 100').copy()
print('# pairs to analyze: %d' %res.shape[0])

# pairs to analyze: 55

In [11]:

Accountant in Administrative and Support Service Activities: 100 posts

In [96]:
res['cscore'] = res.apply(cScoreAtRow, axis=1)

Analyst in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Research Fellow in Education
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Associate in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Administrative Assistant in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Research Assistant in Education
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Manager in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Vice President in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Research Associate in Education
Computing pairwise similarity scores among 100 job posts,
Done after 7.8s
Application Developer in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Recruitment Consultant in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Researcher in Education
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Information Technology Specialist in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Software Developer in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Quantity Surveyor in Construction
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Software Engineer in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.1s
Business Analyst in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Assistant Vice President in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.2s
Accounts Assistant in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Business Analyst in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Business Analyst in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Accounts Executive in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.1s
Sales Executive in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.1s
Accountant in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Application Developer in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Senior Software Engineer in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Engineer, Software in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.7s
Consultant in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 7.8s
Analyst in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
SAP Consultant in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Assistant Manager in Education
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Human Resource Executive in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Sales Engineer in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.8s
Application Consultant in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Information Technology Project Manager in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.0s
Assistant Manager in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 7.9s
Engineer in Manufacturing
Computing pairwise similarity scores among 100 job posts,
Done after 8.1s
Administrative Assistant in Professional, Scientific and Technical Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.2s
Information Technology Consultant in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.1s
Sales Coordinator in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.2s
Manager in Professional, Scientific and Technical Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.4s
Project Manager in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.8s
SAP Consultant in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 8.5s
Customer Service Executive in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 9.0s
Information Technology Engineer in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.9s
Marketing Executive in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 8.8s
Management Assistant Officer in Education
Computing pairwise similarity scores among 100 job posts,
Done after 8.9s
Relationship Manager in Financial and Insurance Activities
Computing pairwise similarity scores among 100 job posts,
Done after 9.0s
Senior Software Engineer in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 9.8s
Project Manager in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 9.3s
Information System Engineer in Administrative and Support Service Activities
Computing pairwise similarity scores among 100 job posts,
Done after 9.5s
Executive in Education
Computing pairwise similarity scores among 100 job posts,
Done after 9.2s
Project Engineer in Construction
Computing pairwise similarity scores among 100 job posts,
Done after 9.4s
Manager in Education
Computing pairwise similarity scores among 100 job posts,
Done after 9.4s
Sales Executive in Wholesale and Retail Trade
Computing pairwise similarity scores among 100 job posts,
Done after 9.5s
Software Consultant in Information and Communications
Computing pairwise similarity scores among 100 job posts,
Done after 10.6s

In [102]:

count    55.000
mean      0.851
std       0.031
min       0.772
25%       0.836
50%       0.851
75%       0.874
max       0.906
Name: cscore, dtype: float64

In [99]:
res = res.sort_values('cscore', ascending=False)

In [100]:

title industry n_employer avg_n_skill n_post cscore
51 Project Engineer Construction 96 8.500000 106 0.906
38 Sales Coordinator Administrative and Support Service Activities 46 9.145299 117 0.902
20 Accounts Executive Administrative and Support Service Activities 79 10.514620 171 0.901
13 Quantity Surveyor Construction 170 10.680412 194 0.899
17 Accounts Assistant Administrative and Support Service Activities 88 9.951351 185 0.889

In [101]:

title industry n_employer avg_n_skill n_post cscore
39 Manager Professional, Scientific and Technical Activities 47 16.730435 115 0.796
16 Assistant Vice President Financial and Insurance Activities 29 21.259459 185 0.789
5 Manager Financial and Insurance Activities 48 21.367089 237 0.788
6 Vice President Financial and Insurance Activities 30 26.600858 233 0.787
2 Associate Financial and Insurance Activities 53 26.796569 408 0.772

In [103]:
res.to_csv(JOB_PROF + 'cscore_jobs_100posts.csv', index=False)

Distribution of c-scores

In [60]:
def vizCScores(res):
    fig = plt.figure(figsize=(6,5))
    avg, std = round(res.cscore.mean(), 3), round(res.cscore.std(), 3)
    xl = 'Consistency score' + r'$(\mu = {}, \sigma = {})$'.format(avg, std)
    plt.xlabel(xl, fontsize=16); 
    plt.ylabel('Count', fontsize=16)
    return fig

In [ ]:
res = agg_df.query('2 <= n_post < 100')

In [59]:
_ = agg_df.query('2 <= n_post')
print('# pairs with at least 2 posts: %d' % _.shape[0])

# pairs with at least 2 posts: 8037

In [29]:
del res['index']

title industry n_employer avg_n_skill n_post
0 Business Executive Professional, Scientific and Technical Activities 2 23.0 2
1 Communications Executive Administrative and Support Service Activities 2 13.5 2
2 Other Finance Dealers and Brokers Financial and Insurance Activities 2 8.5 2
3 Operations Assistant Wholesale and Retail Trade 2 9.5 2
4 Business Applications Manager Administrative and Support Service Activities 2 33.5 2

In [30]:
res['cscore'] = res.apply(cScoreAtRow, axis=1)

0 pairs and counting...
100 pairs and counting...
200 pairs and counting...
300 pairs and counting...
400 pairs and counting...
500 pairs and counting...
600 pairs and counting...
700 pairs and counting...
800 pairs and counting...
900 pairs and counting...
1000 pairs and counting...
1100 pairs and counting...
1200 pairs and counting...
1300 pairs and counting...
1400 pairs and counting...
1500 pairs and counting...
1600 pairs and counting...
1700 pairs and counting...
1800 pairs and counting...
1900 pairs and counting...
2000 pairs and counting...
2100 pairs and counting...
2200 pairs and counting...
2300 pairs and counting...
2400 pairs and counting...
2500 pairs and counting...
2600 pairs and counting...
2700 pairs and counting...
2800 pairs and counting...
2900 pairs and counting...
3000 pairs and counting...
3100 pairs and counting...
3200 pairs and counting...
3300 pairs and counting...
3400 pairs and counting...
3500 pairs and counting...
3600 pairs and counting...
3700 pairs and counting...
3800 pairs and counting...
3900 pairs and counting...
4000 pairs and counting...
4100 pairs and counting...
4200 pairs and counting...
4300 pairs and counting...
4400 pairs and counting...
4500 pairs and counting...
4600 pairs and counting...
4700 pairs and counting...
4800 pairs and counting...
4900 pairs and counting...
5000 pairs and counting...
5100 pairs and counting...
5200 pairs and counting...
5300 pairs and counting...
5400 pairs and counting...
5500 pairs and counting...
5600 pairs and counting...
5700 pairs and counting...
5800 pairs and counting...
5900 pairs and counting...
6000 pairs and counting...
6100 pairs and counting...
6200 pairs and counting...
6300 pairs and counting...
6400 pairs and counting...
6500 pairs and counting...
6600 pairs and counting...
6700 pairs and counting...
6800 pairs and counting...
6900 pairs and counting...
7000 pairs and counting...
7100 pairs and counting...
7200 pairs and counting...
7300 pairs and counting...
7400 pairs and counting...
7500 pairs and counting...
7600 pairs and counting...
7700 pairs and counting...
7800 pairs and counting...
7900 pairs and counting...
C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\ SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:
  if __name__ == '__main__':

In [33]:
res_55 = pd.read_csv(JOB_PROF + 'cscore_jobs_100posts.csv')
res = pd.concat([res, res_55])
res.to_csv(JOB_PROF + 'cscore_all.csv', index=False)

In [61]:
fig = vizCScores(res)
fig.savefig(JOB_PROF + 'cscore_dist.pdf'); plt.close()

In [36]:
res = res.sort_values('cscore', ascending=False)
  • Analyze cases where cscore is 1 (posts are 100% consistent):

In [49]:
res.query('cscore == 1')

title industry n_employer avg_n_skill n_post cscore
2709 Application Architect Transportation and Storage 2 18.0 2 1.0
2619 Junior Copywriter Professional, Scientific and Technical Activities 2 15.0 2 1.0
334 Operations Administrator Professional, Scientific and Technical Activities 2 19.0 2 1.0
2028 Corporate Secretarial Assistant Wholesale and Retail Trade 2 8.0 2 1.0
1658 Web Developer Health and Social Services 2 11.0 2 1.0
1359 Data Center Operator Wholesale and Retail Trade 1 14.0 2 1.0
407 Book Editor Information and Communications 2 5.0 2 1.0
2141 Application Technician Administrative and Support Service Activities 2 4.0 2 1.0
656 Senior Client Relations Officer Financial and Insurance Activities 2 23.0 2 1.0
1232 Graphic Web Designer Other Service Activities 2 15.0 2 1.0
2620 Customer Service Consultant Other Service Activities 2 6.0 2 1.0
868 Sales Support Specialist Professional, Scientific and Technical Activities 2 9.0 2 1.0
153 Communication Specialist Professional, Scientific and Technical Activities 2 8.0 2 1.0
851 Parent Liaison Manager Health and Social Services 2 5.0 2 1.0
2772 Accountant, Company Construction 3 22.0 3 1.0
2452 Senior Materials Engineer Professional, Scientific and Technical Activities 2 21.0 2 1.0
259 Business Operations Manager Accommodation and Food Service Activities 2 6.0 2 1.0
1083 Storeman Manufacturing 2 6.0 2 1.0
2190 Architectural Designer Manufacturing 2 10.0 2 1.0
1128 Sourcing Manager Professional, Scientific and Technical Activities 2 24.0 2 1.0
1710 Warehouse Manager Construction 2 7.0 2 1.0

As these cases have only 2, 3 posts, they are very likely to be re-posts. Let's see:

In [52]:
def checkRepost(in_df):
    for i in range(in_df.shape[0]):
        row = in_df.iloc[i]
        title = row['title']; industry = row['industry']
        docs = set(df[(df.title == title) & (df.industry == industry)]['doc'])
        if (len(docs) == 1):
            print (i, True)  
        else: print(docs)

In [53]:
tmp = res.query('cscore == 1')

(0, True)
(1, True)
(2, True)
(3, True)
(4, True)
set(['perform day to day centre computer operations batch processing printing reports systems backup tape management server reboots facilties and infrastructure checks in the data centre handle escalations of server network issues and provide troubleshooting support pro actively monitor the data centre systems uptime and connectivity to ensure system availability to prevent any down time and coordinate problem resolution with vendor or second level support groups ensure physical security procedures are followed strictly provide server troubleshooting support', 'ability to perform day to day data center computer operations batch processing printing reports systems backup tape management server reboots facilities and infrastructure checks in data center demonstrate initative to pro actively monitor the data center systems up time and connectivity to ensure system availability to prevent any down time coordinate problem resolution with vendor or second level support groups to log and submit problem management record assigned to appropriate party to manage physical security procedures strictly'])
(6, True)
set(['our mnc client is seeking committed talent to join them as responsibilities provide technical and application support to the performance coatings industry in the australasia region assist in supporting new products development and conduct applications testing in the laboratory support day to day lab testing operations and maintain laboratory equipment assist the application manager in daily duties', 'provide technical and application support to the performance coatings industry in the australasia region assist in supporting new products development and conduct applications testing in the laboratory support day to day lab testing operations and maintain laboratory equipment assist the application manager in daily duties'])
(8, True)
(9, True)
(10, True)
(11, True)
(12, True)
(13, True)
(14, True)
(15, True)
(16, True)
set(['organizing maintaining store basic store keeping duties picking packing stock taking and inventory management basic data entry', 'organizing and maintaining store basic store keeping duties picking packing stock taking and inventory management basic data entry'])
(18, True)
(19, True)
(20, True)

Gotcha: All of them are re-posts, some with a bit editing (cases 5, 7 and 17).

In [48]:
res.query('cscore < 1').head()

title industry n_employer avg_n_skill n_post cscore
2557 Junior Sous Chef Professional, Scientific and Technical Activities 1 14.0 2 0.996
1392 Advisory Software Engineer Professional, Scientific and Technical Activities 1 7.5 2 0.994
2237 Mechatronics Technician Professional, Scientific and Technical Activities 1 8.5 2 0.994
2155 Logistics Clerk Administrative and Support Service Activities 1 6.5 2 0.994
2317 Motor Vehicle Cleaner / Polisher Other Service Activities 1 8.5 2 0.994

In [54]:
checkRepost(res.query('cscore < 1').head())

set(['reporting to the executive chef designates the incumbent shall be responsible to handle kitchen operations in the kitchen assist the pastry chef in overseeing the preparation of both pastry and western cuisine assist the head chef in the planning and development of menus and recipes supervise train and develop staff ensuring consistency in work performance ensure the quality control and presentation of all food items ensure proper handling and storage of all food items in accordance with hotel standards and sanitation health regulations assist the head chef in maintaining food costs and labor costs in the outlet', 'reporting to the executive chef designates the incumbent shall be responsible to handle kitchen operations in the cafe kitchen assist the sous chef in overseeing the preparation of both local and western cuisine assist the sous chef in the planning and development of menus and recipes supervise train and develop staff ensuring consistency in work performance ensure the quality control and presentation of all food items ensure proper handling and storage of all food items in accordance with hotel standards and sanitation health regulations assist the sous chef in maintaining food costs and labor costs in the outlet'])
set(['degree or diploma in computer engineering or electrical electronics engineering strong software engineering skills 4 years in jquery standalone applications web services and database optimize application for maximum speed and scalability assure that all user input is validated before submitting to back end', 'degree or diploma in computer engineering or electrical electronics engineering strong software engineering skills 4 years in cg perl standalone applications web services and database'])
set(['this is in collaboration with ite where you will study 1 2 days and work 3 4 days school fees are paid for 1 job will include building and assembly of special vehicles and equipment 2 the trainee will be taught welding general fabrication and composites lamination 3 the trainee will be taught to integrate electronics and electrical systems to vehicle 4 the trainee will be taught how to assemble wiring harnesses', '1 job will include building and assembly of special vehicles and equipment 2 the trainee will be taught welding general fabrication and composites lamination 3 the trainee will be taught to integrate electronics and electrical systems to vehicle 4 the trainee will be taught how to assemble wiring harnesses'])
set(['coordinating of logistic shipment related activities other administrative support as and when assigned such as data entry filling and documenting you will work with a supervisor and training will be provided minimum n o levels or equivalent some simple logistic shipment working experience proficient in ms offices able to commence immediately preferred company information a engineering company located near to commonwealth mrt working hour mon thurs 8 30am 6pm friday 8 30am 5 45pm 2 months work 1 saturday sat 8 30am 12 30pm to apply interested and suitable candidates please email your cv in ms word by click apply job please include the following information in your cv 1 current salary 2 expected salary 3 reasons for leaving 4 availability', 'data entry assist in logistic duties willing to learn training will be provided 1 2 years of working experience able to commence immediately preferred responsible and good working altutide company information industry engineering working location a few minutes walk from commonwealth mrt working hour mon thurs 8 30am 6pm friday 8 30am 5 45pm 2 months work 1 saturday sat 8 30am 12 30pm salary range about 1600 1800 negotiable depends on working experience to apply interested and suitable candidates please email your cv in ms word by click apply job please include the following information in your cv 1 current salary 2 expected salary 3 reasons for leaving 4 availability'])
set(['we are a reputable car washing company dealing with government agencies our company have been awarded numerous government contracts and we are looking to expand you will be responsible for the maintenence and cleanliness of our clieant vehicles job include washing of vehicles and reporting to the transport officer supervising a team of car washers transporting the team members to different washing locations ensure monthly washing target are met developing and conducting work safety and company training programe report directly to manager we are looking for dedicated and hardworking individuals to join our dynamic team of mobile car washers the positions are only open to singaporean and pr our salary package include 13 month target bonus annual leave benefits and transport benefits', 'we are a reputable car washing company dealing with government agencies our company have been awarded numerous government contracts and we are looking to expand you will be responsible for the maintenence and cleanliness of our clieant vehicles job include washing of vehicles and reporting to the transport officer we are looking for dedicated and hardworking individuals to join our dynamic team of mobile car washers the positions are only open to singaporean and pr our salary package include 13 month target bonus annual leave benefits and transport benefits'])

Niche vs. General Job Titles

InfoCom industry

In [70]:
info_df = agg_df.query('industry == "Information and Communications"')

info_top50 = info_df.iloc[range(50)]
info_top50['cscore'] = info_top50['title'].apply(consistency, industry='Information and Communications')
info_top50.to_csv(JOB_PROF + 'consistency/infocom_cscore.csv', index=False)


count    50.00000
mean      0.84618
std       0.02295
min       0.77500
25%       0.83800
50%       0.85000
75%       0.85875
max       0.88000
Name: cscore, dtype: float64

In [79]:
info_top50.sort_values('cscore', ascending=False, inplace=True)

In [80]:

title industry n_employer avg_n_skill n_post cscore
257 Analyst Programmer Information and Communications 22 9.555556 36 0.880
160 Marketing Executive Information and Communications 42 14.720000 50 0.878
41 SAP Consultant Information and Communications 33 11.149123 114 0.878
173 Administrative Assistant Information and Communications 44 12.333333 48 0.876
178 Sales Executive Information and Communications 37 11.608696 46 0.875

In [81]:

title industry n_employer avg_n_skill n_post cscore
26 Consultant Information and Communications 24 16.964539 141 0.807
282 Architect Information and Communications 19 22.606061 33 0.807
74 Analyst Information and Communications 39 18.768293 82 0.805
95 Engineer Information and Communications 21 25.819444 72 0.781
81 Manager Information and Communications 22 20.155844 77 0.775

Financial and Insurance Activities

In [75]:
fin_df = agg_df.query('industry == "Financial and Insurance Activities"')

fin_top50 = fin_df.iloc[range(50)]
fin_top50['cscore'] = fin_top50['title'].apply(consistency, industry='Financial and Insurance Activities')
fin_top50.to_csv(JOB_PROF + 'consistency/fin50_cscore.csv', index=False)

count    66.000000
mean      0.933258
std       0.031988
min       0.828000
25%       0.907500
50%       0.938000
75%       0.954000
max       0.986000
Name: cscore, dtype: float64

Senior Software Engineer vs. Marketing Executive (Wholesale and Retail Trade)

Denote Senior Software Engineer as SSE, Administrative Assistant as AA, and Marketing Executive as ME.

In [58]:
sse_cscore = consistency('Senior Software Engineer', 'Wholesale and Retail Trade', abbv_job='sse', abbv_industry='wholesale')
aa_cscore = consistency('Administrative Assistant', 'Wholesale and Retail Trade', abbv_job='aa', abbv_industry='wholesale')
me_sims = consistency('Marketing Executive', 'Wholesale and Retail Trade', abbv_job='me', abbv_industry='wholesale')

Job title Senior Software Engineer:
Computing pairwise similarity scores among 70 job posts,
each post is compared with subseq posts...
Done after 3.8s
Job title Administrative Assistant:
Computing pairwise similarity scores among 98 job posts,
each post is compared with subseq posts...
Done after 7.6s
Job title Marketing Executive:
Computing pairwise similarity scores among 66 job posts,
each post is compared with subseq posts...
Done after 3.5s

Vice President vs. Information Technology Specialist in Financial and Insurance Activities

In [22]:
agg_df.query('industry == "Financial and Insurance Activities"').head()

title industry n_employer avg_n_skill n_post
0 Analyst Financial and Insurance Activities 78 19.613333 525
2 Associate Financial and Insurance Activities 53 26.796569 408
5 Manager Financial and Insurance Activities 48 21.367089 237
6 Vice President Financial and Insurance Activities 30 26.600858 233
11 Information Technology Specialist Financial and Insurance Activities 8 12.866029 209

Software Engineer in Finance

In [416]:
fin_se = calSimScores('Software Engineer', 'Financial and Insurance Activities', df)

Computing pairwise similarity scores among 18 job posts,
each post is compared with subseq posts...
	 0 posts and counting...
Done after 0.3s

In [417]:
print fin_se.topic_sim.describe().round(2)
print fin_se.skill_sim.describe().round(2)

count    170.00
mean       0.86
std        0.06
min        0.76
25%        0.83
50%        0.85
75%        0.88
max        1.00
Name: topic_sim, dtype: float64
count    170.00
mean       0.13
std        0.30
min        0.00
25%        0.00
50%        0.03
75%        0.05
max        1.00
Name: skill_sim, dtype: float64

In [ ]:
fin_se = fin_se.sort_values('topic_sim', ascending=False) 
# del fin_se['index']

In [ ]:

In [ ]:
fin_se.head().to_csv(SKILL_RES + 'fin_se_top5.csv', index=False)

In [ ]:
fin_se = fin_se.sort_values('skill_sim', ascending=False)

In [ ]:
np.corrcoef(fin_se.skill_sim, fin_se.topic_sim)[0,1]
Plot cluster dists of the first 5 pairs:

In [ ]:
posts = getPostsInPairs(fin_se.head())
fig = vizTopicDists(posts, doc_topic_distr, figsize=(12, 6))
plt.savefig(SKILL_RES + 'fig/fin_se_top5.pdf'); plt.close()

In [ ]:
fin_se.tail().to_csv(SKILL_RES + 'fin_se_bottom5.csv', index=False)

Manager in Finance

In [418]:
fin_man = calSimScores('Manager', 'Financial and Insurance Activities', df)

print fin_man.topic_sim.describe().round(2)
print fin_man.skill_sim.describe().round(2)

Computing pairwise similarity scores among 176 job posts,
each post is compared with subseq posts...
	 0 posts and counting...
	 50 posts and counting...
	 100 posts and counting...
	 150 posts and counting...
Done after 29.3s
count    15575.00
mean         0.78
std          0.07
min          0.52
25%          0.73
50%          0.78
75%          0.83
max          1.00
Name: topic_sim, dtype: float64
count    15575.00
mean         0.04
std          0.11
min          0.00
25%          0.00
50%          0.03
75%          0.05
max          1.00
Name: skill_sim, dtype: float64

In [ ]:
fin_man = fin_man.sort_values('topic_sim', ascending=False); del fin_man['index']

In [ ]:
fin_man.head().to_csv(SKILL_RES + 'fin_man_top5.csv', index=False)
fin_man.tail().to_csv(SKILL_RES + 'fin_man_bottom5.csv', index=False)

In [ ]:
posts = getPostsInPairs(fin_man.tail(), df)

In [ ]:
top5 = fin_man.query('employer1 != employer2 and skill_sim <= 0.8').head()

In [ ]:
fig = vizTopicDists(posts, doc_topic_distr, figsize=(12, 6))
plt.savefig(SKILL_RES + 'fig/fin_man_bottom5.pdf'); plt.close()

Research Fellow

In [ ]:
rf_sims = calSimScores(job_title='Research Fellow', industry='Education', df=df)

In [ ]:
print rf_sims.topic_sim.describe().round(3)
print rf_sims.skill_sim.describe().round(2)

In [ ]:
rf_sims = rf_sims.sort_values('topic_sim', ascending=False)

Clustering Job Posts

In this section, we will try the following clustering models which can work directly on precomputed similarity matrix.

  • Affinity Propagation (AP): can learn number of clusters from data
  • Spectral Clustering (SC): need to preset number of clusters, which can be guessed using AP result.

After training each model, we will analyze the returned clusters by:

  • metrics (from sklearn.metrics) such as homogeneity, completeness or mutual information
  • manually looking at representative posts in each cluster. The representative posts are provided by AP as cluster centers.

In [ ]:
import sklearn.cluster as cluster

In [ ]:
# dir to store results
JOB_PROF = SKILL_RES + 'job_prof/'

Affinity Propagation

  • Software Engineer:

In [ ]:
se_sims = calSimScores('Software Engineer', df=df, out_fmt='matrix_topic_sim') # 'Financial and Insurance Activities',

In [ ]:
se_posts = df[df.title == 'Software Engineer']
se_cluster = AF_clustering(se_posts, job_title='se', sim_mat=se_sims)

In [ ]:

In [ ]:

In [ ]:
fig = plotCluster(0, job_title='SE',cluster_res=se_cluster)
#; plt.close()

In [ ]:
for i in range(9):
    fig = plotCluster(i, job_title='SE',cluster_res=se_cluster)

In [ ]:
fig = plotCluster(10, job_title='SE',cluster_res=se_cluster); plt.close()

In [ ]:
fig = plotCluster(22, job_title='SE', cluster_res=se_cluster, figsize=(12, 6)); plt.close()

Comparing Different Job Titles

We compare job titles to see if their job posts are consistent (thru the topic sim distribution among the job posts).

In [ ]:
se_sims_df = calSimScores('Software Engineer')

In [ ]:
se_sims_df.sort_values('topic_sim', ascending=False, inplace=True)

In [ ]:
se_sims_df.query('skill_sim < 1 and employer1 != employer2').head()
  • Software Developer:

In [ ]:
dev_titles = set([s for s in df.title if ('Developer' in s)])

In [421]:
dev_posts = df[(df.title == 'Software Developer') & (df.industry == 'Financial and Insurance Activities')]
print('# posts of Software Developer in Finance: %d' %dev_posts.shape[0])

# posts of Software Developer in Finance: 6

In [ ]:
sd_sims_df = calSimScores('Software Developer')

In [ ]:

Result: The similarity of posts in Software Developer are also high with a mean of 0.83 .

  • Manager:

In [ ]:
man_posts = df[(df.title == 'Manager')] # (df.industry == 'Financial and Insurance Activities')
man_sims = calSimScores('Manager', industry=None, df=df, out_fmt='matrix_topic_sim')

In [ ]:
man_cluster = AF_clustering(man_posts, job_title='Manager', sim_mat=man_sims)

In [ ]:

In [ ]:
for c in [7, 9, 11, 14, 22]:
    plotCluster(c, job_title='fin_man', cluster_res=man_cluster)


In [ ]:
man_sim_df = calSimScores('Manager')
  • Associate:

In [ ]:
assoc_sim_df = calSimScores('Associate')

In [ ]:

In [ ]:
fig = plotSimDists(assoc_sim_df, job_title='Associate')
fig.savefig(SKILL_RES + 'fig/assoc_sim_dists.pdf'); plt.close()

In [ ]:

In [ ]:

In [ ]:

In [ ]:
fig = plotSimDists(sim_df=se_sims_df, job_title='Software Engineer')
plt.savefig(SKILL_RES + 'fig/se_sim_dists.pdf'); plt.close()

In [ ]:
fig = plotSimDists(sim_df=man_sim_df, job_title='Manager')
fig.savefig(SKILL_RES + 'fig/man_sim_dists.pdf'); plt.close()
  • Research Fellow

In [ ]:
rf_sim_mat = calSimScores(job_title='Research Fellow', industry='Education', df=df, out_fmt='matrix_topic_sim')

In [ ]:
rf_posts = df[(df.title=='Research Fellow') & (df.industry == 'Education')]
print('# posts of Research Fellow: %d' %rf_posts.shape[0])

In [ ]:
rf_cluster = AF_clustering(rf_posts, job_title='rf', sim_mat=rf_sim_mat)

In [ ]:

In [ ]:
rf_c0 = plotCluster(0, job_title='RF', cluster_res=rf_cluster)
rf_c1 = plotCluster(1, job_title='RF', cluster_res=rf_cluster)

Find "Synonym" Job Titles

  • Software Engineer vs. Software Developer

In [ ]:
se_and_sd = simScore('Software Engineer', 'Software Developer')

In [ ]:
se_and_sd = se_and_sd.sort_values('topic_sim', ascending=False)
se_and_sd.reset_index(inplace=True); del se_and_sd['index']

In [ ]:
fig = plotSimDists(se_and_sd, 'SE_and_SD')
fig.savefig(SKILL_RES + 'fig/se_and_sd_sims.pdf'); plt.close()

In [ ]:

In [ ]:
vizPair(0, se_and_sd, abbv_title='se_vs_sd')

In [ ]:
last = se_and_sd.shape[0] - 1
vizPair(last, se_and_sd, abbv_title='se_vs_sd')
  • Software Engineer vs. Manager

In [ ]:
se_and_man = simScore('Software Engineer', 'Manager')

In [ ]:

In [ ]:
fig = plotSimDists(se_and_man)
fig.savefig(SKILL_RES + 'fig/se_and_man_sims.pdf'); plt.close()

Spectral Clustering

In [ ]:
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack', affinity="precomputed")
  • SE in Finance:

In [ ]:

In [ ]:
fin_se_posts = df[(df.title == 'Software Engineer') & (df.industry == 'Financial and Insurance Activities')]
fin_se_posts['cluster'] = spectral.labels_
fin_se_posts = fin_se_posts.sort_values('cluster')
  • Manager in Finance:

In [ ]:

In [ ]:
fin_man_posts = df[(df.title == 'Manager') & (df.industry == 'Financial and Insurance Activities')]

In [ ]:
fin_man_posts['cluster'] = spectral.labels_
fin_man_posts = fin_man_posts.sort_values('cluster')

Evaluation using the Frameworks from SkillsFuture

In this section, we use the skill frameworks from SkillsFuture (SF) as a source to evaluate our topic model as well as our proposed consistency score. Currently available frameworks are for 3 sectors: (i) Hotel and Accomodation services, (ii) Early Childcare and Education, and (iii) Precision Engineering. Given a job title t in one of the three sectors, we proceed by the following steps.

  • Obtain an exhaustive list of synonym titles for t
  • Using the titles to retrieve posts for t
  • Measure the similarity among the retrieved posts
  • Categorize the skills in these posts using the SF framework for t

In [ ]:
df = df[-df.title.isnull()]
# standardize employer_name
df.employer_name = df.employer_name.apply(lambda s: s.replace('PTE LTD', 'PTE. LTD.').replace('PTE. LIMITED', 'PTE. LTD.')
                                          .replace('PRE-SCHOOL', 'PRESCHOOL') )

df.to_csv(SKILL_DAT + 'filter_doc_index.csv')

Early Childhood Care and Education (ECCE)

Pre-School Teachers (PST)

The list of titles for PST may be formed by looking at the titles from certain pre-schools in SG. We tried with the top pre-schools obtained from First try return empty results! Checking with employer name in data shows that we need to append 'PTE. LTD.' to school names. We then added schools found in data.

In [ ]:
keys = map(str.upper, ['PreSchool', 'Skool', 'CHILDCARE', 'Kid', 'toddler', 'infant'])

guessed_preschools = set([s for s in df.employer_name if found(keys, s)])
print('# guessed preschools: %d' %len(guessed_preschools))


In [ ]:
# ['Shaws CDLC', 'childfirst',  'kiddiWinkie', 'little footprints', 'brighton montessori', 'posso', 'little skool-house', 
# 'little village', 'mulberry', 'learning vision', 'Star Learners', 'global eduhub', 'sunflower childcare', 'frobel']
preschool_keys = ['E-BRIDGE', 'ETONHOUSE PRESCHOOL', 'MINDCHAMPS', 'LECLARE', "Pat's Schoolhouse", 
                  'CREATIVE LAND CHILDCARE', 'Lorna Whiston', 
                  'Carpe Diem', 'Crestar', 'nurture edu', 'safari house']
preschool_keys = map(str.upper, preschool_keys)

preschools = [s for s in df.employer_name if found(preschool_keys, s)]
preschool_posts = df[df.employer_name.isin(preschools)]
print('# posts from preschool employers: %d, distributed as follows:' %preschool_posts.shape[0])

In [ ]:
titles = set(preschool_posts['title'])

Among the titles, we can only find 2 titles for pre-school teacher [Child Care Teacher, Pre-Primary Education Teacher]. The reason is because the set of preschools are not exhaustive. How to fix this?

Another way to search for titles of PST is to look at the job titles for Teacher and manually narrow down to Pre-school Teacher as follow.

In [ ]:
idx = [i for i,s in enumerate(df.title) if ('Teacher' in s)]
teacher_df = df.iloc[idx]

In [ ]:
print('# posts of titles containing kw Teacher: %d' %teacher_df.shape[0])

In [ ]:
teacher_stat = getTitleStats(teacher_df)
teacher_stat.to_csv(SKILL_RES + 'pst/teachers.csv', index=False)

Based on this, we guessed the following titles for PST.

In [ ]:
cc_teachers = ['Pre-School Teacher', 'Kindergarten Teacher', 'Child Care Teacher', 'Pre-Primary Education Teacher',
            'Teacher, Kindergarten', 'Teacher, Child Care', 'Day Care Teacher']

In [ ]:

In [ ]:
Seed set of PST

In [ ]:
pst_posts = df[df.title == 'Pre-School Teacher']
pst_posts.to_csv(SKILL_RES + 'pst/posts.csv', index=False)

In [ ]:
pst_sims = pairwiseSim(pst_posts, doc_topic_distr)

In [ ]:
print pst_sims.topic_sim.describe().round(3)

In [ ]:
fig = plotSimDists(pst_sims, sci_fmt=False)
# fig.suptitle('Pre-School Teacher (13 posts)', fontsize=20)
fig.savefig(SKILL_RES + 'fig/pst_sims.pdf'); plt.close()

In [ ]:
pst_sims.query('skill_sim >= 0.6')

In [ ]:
pst_sims.sort_values('topic_sim', ascending=False, inplace=True)
pst_sims.to_csv(SKILL_RES + 'pst/pst_sims.csv', index=False)

In [ ]:
pst_sims = pst_sims.query('skill_sim < 0.6')
vizPair(0, pst_sims, labels, abbv_title='pst')

In [ ]:
last = pst_sims.shape[0] - 1
vizPair(last, pst_sims, labels, abbv_title='pst')
Relevant titles for PST

In [ ]:
# employers having PST positions
pst_employers = np.unique(pst_posts.employer_name)
print('# PST employers: %d' %len(pst_employers))
# pst_employers
  • Get other titles from the PST employers:

In [ ]:
posts_of_pst_employers = df[df.employer_name.isin(pst_employers)]
print('# posts of PST employers: {}'.format(posts_of_pst_employers.shape[0]))

In [ ]:
other_titles_df = getTitleStats(posts_of_pst_employers).query('title != "Pre-School Teacher"') # n_post > 1
other_titles = other_titles_df['title']

In [ ]:
teachers = teacher_stat.title

Titles from PST employers will not include all Teacher titles.

In [ ]:
# set(teachers).difference(other_titles)
Similarity scores of relevant titles with PST

In [394]:
rel_titles = set(teachers.append(other_titles))
rel_posts = df[df.title.isin(rel_titles)]

In [396]:
# merge diff versions of some titles
rel_posts = changeTitle(rel_posts, 'Teacher, Kindergarten', 'Kindergarten Teacher')
rel_posts = changeTitle(rel_posts, 'Teacher, Child Care', 'Child Care Teacher')

In [397]:
rel_titles = set(rel_posts.title)
print('# relevant titles: %d' %len(rel_titles))
print('# titles retrieved by kw teacher: {}'.format(len(teachers)))
print('# titles retrieved by PST employers: {}'.format(len(other_titles)))

# relevant titles: 128
# titles retrieved by kw teacher: 44
# titles retrieved by PST employers: 99

In [398]:
res = getTitleStats(rel_posts)

In [399]:
res['topic_sim_with_pst'] = res['title'].apply(simScore, t2='Pre-School Teacher')

Production Planner vs. Housekeeping Attendant
1st title: 38 posts, 2nd title: 41 posts
10 posts and counting...
20 posts and counting...
30 posts and counting...
Done after 2.0s
C:\Users\mdluu.2011\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\ SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:

In [392]:
res_38.head(10).to_csv(HK_DIR + 'top_sim_titles.csv', index=False)

In [393]:

title n_employer avg_n_skill n_post topic_sim_with_hk
0 Assistant Restaurant Manager 49 10.8 52 0.89
1 Housekeeping Supervisor 27 10.8 38 0.89
2 Housekeeping Attendant 31 11.0 41 0.89
3 Chef 74 12.3 87 0.88
4 Chef de Partie (Restaurant) 61 11.4 65 0.88
5 Restaurant Manager 97 14.8 110 0.88
6 Waiter 47 10.3 66 0.88
7 Cook 92 10.0 103 0.88
8 Sous Chef 61 14.0 65 0.87
9 Interior Designer 55 10.1 66 0.86

In [391]:

n_employer avg_n_skill n_post topic_sim_with_hk
count 191.00 191.00 191.00 191.00
mean 93.84 13.85 138.95 0.81
std 82.45 3.27 143.29 0.03
min 25.00 7.90 38.00 0.75
25% 44.50 11.30 54.00 0.79
50% 62.00 13.30 78.00 0.81
75% 111.00 16.20 162.50 0.83
max 615.00 23.00 993.00 0.89

In [ ]:

In [ ]:
# limit to titles with > 22 posts.
_ = rel_title_stats.query('n_post > 22')
print('# titles with > 22 posts: %d' %_.shape[0])

res = rel_title_stats.query('n_post > 22')
res['topic_sim_with_hk'] = res['title'].apply(simScore, t2='Housekeeping Attendant')

In [ ]:
res = res.round(2)
res = res.sort_values('topic_sim_with_hk', ascending=False)

In [ ]:
res.to_csv(HK_DIR + 'sims_to_hk.csv', index=False)
  • Categorize the skills in these posts using the SF framework for HK track:

In [ ]:
skill_df = pd.DataFrame({'skill': c.keys(), 'freq': c.values()})
skill_df = skill_df.sort_values('freq', ascending=False)

In [ ]:
hk_skills = skillFreq(hk_posts)
print('# skills in HK posts: %d' %hk_skills.shape[0])

In [ ]:
hk_skills.to_csv(SKILL_RES + 'job_prof/hk_skills.csv', index=False)

Precision Engineering - Technician Track

  • Find an exhaustive list of job titles for Technician:

In [ ]:
# Machinist/Technician are suggested by SF
tech_kw = ['Machinist', 'Technician']
tech_titles = [t for t in df.title if found(tech_kw, t)]
c = Counter(tech_titles)

In [ ]:
tech_titles = pd.DataFrame({'title': c.keys(), 'n_post': c.values()}).sort_values('n_post', ascending=False)

In [ ]:

In [ ]:
tech_titles = tech_titles.query('n_post > 10')
print('# titles: %d' %tech_titles.shape[0])

In [ ]:
  • Using the titles to retrieve posts for Technician:

In [ ]:
tech_posts = df[df.title.isin(tech_titles.title)]
print('# posts for Technician: %d' %tech_posts.shape[0])

In [ ]:
getTitleStats(tech_posts).to_csv(SKILL_RES + 'job_prof/tech_titles.csv', index=False)

In [ ]:
  • Measure the similarity among the retrieved posts:

In [ ]:
tech_sims = pairwiseSim(tech_posts, doc_topic_distr)

In [ ]:
# plot dists of the sims
fig = plotSimDists(tech_sims, 'Technician jobs')
fig.savefig(SKILL_RES + 'fig/tech_jobs_sim.pdf'); plt.close()

In [ ]:

In [ ]:
tech_sims.query('skill_sim == 1')

In [ ]:

In [ ]:
tech_sims = tech_sims.sort_values('topic_sim', ascending=False)

In [ ]:
n_pair = tech_sims.shape[0]; last = n_pair - 1
vizPair(last, tech_sims)
  • Categorize the skills in these posts using the SF framework for Technician:

Flagging Variants/Branches of an Employer

In [ ]:
fm_posts = df[df.title == 'Fashion Merchandiser']
fm_sims = pairwiseSim(fm_posts, doc_topic_distr)

In [ ]:
fm = fm_sims.sort_values('skill_sim', ascending=False)
fm.head().to_csv(SKILL_RES + 'job_prof/fm_variants.csv', index=False)