In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from scipy.io import mmread

import my_util
import cluster_skill_helpers
from cluster_skill_helpers import *

In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'; DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/'
SKILL_DAT = DATA_DIR + 'skill_cluster/'; SKILL_RES = RES_DIR + 'skill_cluster/new/'
LDA_DIR = RES_DIR + 'lda/'  # assumption: LDA_DIR is used below but was never defined in this notebook

In [ ]:
df = pd.read_csv(SKILL_DAT + 'doc_index.csv')

Removing reposts


In [ ]:
df = df.sort_values(['employer_name', 'doc'])
print('# posts before filtering dups: %d' % df.shape[0])

In [ ]:
df.head(10)

In [ ]:
df = df.drop_duplicates(['employer_name', 'doc'])
print('# posts after filtering dups: %d' %df.shape[0])

In [ ]:
df.head(10)

In [ ]:
df = df.reset_index()
df.head()

In [ ]:
df.to_csv(SKILL_DAT + 'uniq_doc_index.csv', index=False)

A careful check also reveals a renamed job title (from Marine Superintendent to Fleet Manager, employer: "K" LINE LOGISTICS SINGAPORE) even though the posts are otherwise the same. This is interesting in its own right and may warrant further analysis later. First, however, we need to re-plot the cluster distributions to verify that we really got rid of the dups.


In [3]:
df = pd.read_csv(SKILL_DAT + 'uniq_doc_index.csv')
df = df.set_index('index')

In [8]:
with open(LDA_DIR + 'doc_topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)

In [4]:
# Global settings for all cluster plots
abbv_clusters = ['FIN', 'HWR', 'SCI', 'PROD', 'CON', 'LEG', 'CUS', 'LOG', 'MKT', 'DEV', 'MAN', 'HOS', 'AUD', 'COM', 'HR']
x = range(1, 16); labels = abbv_clusters

In [5]:
def getPosts2Check(job_title, industry, n_pair=10):
    sub_df = df[(df['title'] == job_title) & (df['industry'] == industry)]

    # Pairwise similarity scores between posts; drop pairs with identical
    # distributions (sim_score == 1), then take the n_pair most similar pairs
    scores = calScore(sub_df, doc_topic_distr)
    scores = scores.query('sim_score < 1').sort_values('sim_score', ascending=False)
    k_pairs = scores.head(n_pair)

    ids2check = np.unique(list(k_pairs['job_id1']) + list(k_pairs['job_id2']))
    print('# posts to check: %d' % len(ids2check))

    posts2check = sub_df[sub_df['job_id'].isin(ids2check)]
    return posts2check.sort_values('employer_name')
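calScore is provided by cluster_skill_helpers and its internals are not shown in this notebook. As a rough, hypothetical sketch only: assuming it computes cosine similarity between the (dense) topic distributions of every pair of posts, it might look like cal_score_sketch below; the real helper may differ.

# Hypothetical sketch, NOT the actual cluster_skill_helpers code.
# Assumes doc_topic_distr is dense (e.g. mmread(f).toarray()) and that
# its rows align with the index of the posts dataframe.
import itertools

def cal_score_sketch(sub_df, doc_topic_distr):
    rows = []
    for (i1, p1), (i2, p2) in itertools.combinations(sub_df.iterrows(), 2):
        v1, v2 = doc_topic_distr[i1], doc_topic_distr[i2]
        cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        rows.append({'job_id1': p1['job_id'], 'job_id2': p2['job_id'],
                     'sim_score': cos})
    return pd.DataFrame(rows)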

In [6]:
def plotClusterDists(posts, figsize):
    n_post = posts.shape[0]
    fig, axarr = plt.subplots(n_post, sharex=True, figsize=figsize)

    for r in range(n_post):
        plt.subplot(n_post, 1, r+1)
        plotClusterDistAtRow(r, posts, doc_topic_distr)
        # Show ylabel at the middle
        if r == (n_post // 2 - 1):  # integer division (avoids a float index in Python 3)
            plt.ylabel('Probability', fontsize=24)

    ## Fine tune the fig    
    fig.subplots_adjust(hspace=.5)

    # Hide xticks on all subplots except the last one
    hide_xticks(fig)

    # Provide xtick labels only for the last subplot
    plt.xticks(x, labels, rotation=45)
    # Show xlabel only at the last subplot    
    plt.xlabel('Skill Clusters', fontsize=20)
    return fig

In [10]:
posts2check = getPosts2Check(job_title="Software Engineer", industry="Financial and Insurance Activities", n_pair=5)
fig = plotClusterDists(posts2check, figsize=(6, 10))
# plt.savefig(LDA_DIR + 'fig/se_in_fin2.pdf')
plt.show(); plt.close()


Computing pair similarity scores among 31 job posts,
each post is compared with subseq posts...
	 0 posts and counting...
	 10 posts and counting...
	 20 posts and counting...
Done after 0.6s
# posts to check: 8

Duplicates still seem to exist! Let us examine further.

  • Sample posts for SE:

In [ ]:
sample_employers = ["Capital Match Holdings Pte. Ltd.", "Comfortdelgro Corporation Limited", "Fujitsu Asia Pte Ltd", 
             "Millennium Capital Management (Singapore) Pte. Ltd."]

sample_employers = map(str.upper, sample_employers)
sample_se_posts = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
se_fig = plotClusterDists(sample_se_posts)

plt.show(); plt.close()

In [ ]:
dups = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
dups = dups.sort_values('employer_name')
  • Save duplicated and unique versions to files for further analysis:

In [ ]:
dups.to_csv(RES_DIR + 'tmp/se_dups.csv', index=False)
dups.drop_duplicates(['doc']).to_csv(RES_DIR + 'tmp/se_no_dups2.csv', index=False)

Reading the job posts of Capital Match Holdings shows that they are in fact different, although the differences in their contents are subtle. The cluster distribution of the last post differs a bit from the first two because the last one has no salary details, while the first two offer salary and equity. A closer look at the skill sets in these posts reveals:

  • skill set of the first 2 posts = skill set of the last post + {equity}
  • the salary in the 1st post is higher than in the 2nd (4K vs. 3K)
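A quick set comparison makes this kind of check easy. A hypothetical sketch (it assumes occur_skills is a comma-separated string, which may not match the actual format):

# Hypothetical: skill set of a post; the occur_skills format is an assumption.
def skill_set(post):
    return set(s.strip() for s in post['occur_skills'].split(','))

# e.g. skill_set(dups.iloc[0]) - skill_set(dups.iloc[-1]) should give
# {'equity'}, per the observation above.

  • Other examples of suspicious dups: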

In [ ]:
se_in_infocom = getPosts2Check(job_title='Software Engineer', industry='Information and Communications')

In [ ]:
fig = plotClusterDists(se_in_infocom, figsize=(6, 10))  # figsize added (value assumed); plotClusterDists requires it
plt.savefig(LDA_DIR + 'fig/se_in_infocom2.pdf')
plt.show(); plt.close()
  • Sample posts for Administrative Assistant:

In [ ]:
posts = getPosts2Check(job_title="Administrative Assistant", industry="Financial and Insurance Activities", n_pair=5)

In [ ]:
fig = plotClusterDists(posts, figsize=(6, 12))
# plt.savefig(LDA_DIR + 'fig/admin_in_fin.jpg')
plt.show(); plt.close()

In [ ]:
# Manager
posts = getPosts2Check(job_title="Manager", industry="Financial and Insurance Activities", n_pair=5)

In [ ]:
fig = plotClusterDists(posts, figsize=(6, 12))
plt.savefig(LDA_DIR + 'fig/man_in_fin.jpg')
plt.savefig(LDA_DIR + 'fig/man_in_fin.pdf')
plt.show(); plt.close()

These examples tell us that we should also filter out posts that are almost identical. Such posts can be detected from their sets of common skills (Jaccard similarity), as done in the next section.

Detecting Almost-Identical Posts by Common Skills

In this section we use Jaccard similarity to detect posts that are almost identical. We expect such posts to usually come from the same company, so we first analyze the posts within a single company.
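For reference, pairwiseJacSim comes from cluster_skill_helpers; the jacSim it reports for a pair of posts is the size of the intersection of their skill sets over the size of their union (cf. the intersection_size and union_size columns in the output below), as in this minimal sketch:

# Minimal sketch of Jaccard similarity between two skill sets
# (pairwiseJacSim computes this for all pairs of posts).
def jaccard(skills1, skills2):
    s1, s2 = set(skills1), set(skills2)
    if not s1 and not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1 | s2))

# e.g. jaccard({'java', 'sql'}, {'java', 'sql', 'spring'}) == 2/3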


In [ ]:
cmh = df[df.employer_name == str.upper("Capital Match Holdings Pte. Ltd.")]
pairwiseJacSim(df=cmh)

This example confirms that posts from the same company can be very similar: the share of overlapping skills can exceed 90%. We want to remove such pairs of posts because they can cause consistency scores to be over-estimated.

The filtering process for a given DF of posts is as follows (implemented in filterSimPosts below).

  • Initialize an empty list keep.
  • Compute the Jaccard similarities of the first post with the remaining posts (remain).
  • Add the 1st post to keep, and remove from remain both that post and any post that is too similar to it.
  • Repeat until no posts are left in remain.

In [38]:
def filterSimPosts(i, employer='Capital Match Holdings Pte. Ltd.', sim_thres=.9):

    def doFilter(df):
        keep, remain = [], df
        t0 = time()
        # While at least 2 posts remain, we still need to check
        while remain.shape[0] >= 2:
            # Jaccard similarities of the first remaining post vs. the others
            res = jacSim2Others(0, remain)

            job_id = remain.iloc[0]['job_id']
            keep.append(job_id)
            # In remain, keep only posts that are sig. diff. from post[0]
            sig_diff_posts = list(res.query('jacSim < {}'.format(sim_thres))['job_id2'])
            remain = remain[remain['job_id'].isin(sig_diff_posts)]

        # If only 1 post remains, keep it: no remaining post is too similar to it
        if remain.shape[0] == 1:
            keep.append(remain.iloc[0]['job_id'])

        print('\t done after {}s'.format(round(time() - t0, 1)))
        return df[df['job_id'].isin(keep)]

    sub_df = df[df.employer_name == employer.upper()]; n_post = sub_df.shape[0]
    print('\t{}, {}, {} posts'.format(i, employer, n_post))
    return doFilter(sub_df)

In [12]:
tmp = filterSimPosts(0, employer='Capital Match Holdings Pte. Ltd.', sim_thres=.9)
pairwiseJacSim(tmp)


	0, Capital Match Holdings Pte. Ltd., 5 posts, 0.0s elapsed
Computing pairwise Jaccard similarities among 3 job posts,
each post is compared against its subseq posts...
Done after 0.0s
Out[12]:
employer intersection_size jacSim job_id1 job_id2 skills1 skills2 union_size
0 CAPITAL MATCH HOLDINGS PTE. LTD. 7 0.44 JOB-2015-0198405 JOB-2015-0198229 {referrals, investors, credit, institutional i... {web, investors, institutional investors, capi... 16
1 CAPITAL MATCH HOLDINGS PTE. LTD. 5 0.29 JOB-2015-0198405 JOB-2015-0256490 {referrals, investors, credit, institutional i... {investors, processing, competitive, banking, ... 17
2 CAPITAL MATCH HOLDINGS PTE. LTD. 5 0.28 JOB-2015-0198229 JOB-2015-0256490 {web, investors, institutional investors, capi... {investors, processing, competitive, banking, ... 18

In [ ]:
tmp = filterSimPosts(1, employer='Apar Technologies Pte. Ltd.', sim_thres=.9)
pairwiseJacSim(tmp)

In [13]:
by_employer = df.groupby('employer_name').agg({'job_id': 'nunique'})
by_employer = by_employer.add_prefix('n_').reset_index()

# by_employer.n_job_id.describe().round(1)

by_employer = by_employer.sort_values('n_job_id')
employers = list(by_employer.query('n_job_id >= 2')['employer_name'])
len(employers)


Out[13]:
6838

In [14]:
print('Filtering too similar posts in {} companies...'.format(len(employers)))


Filtering too similar posts in 6838 companies...

In [15]:
def filterPosts4Employers(start=0, end=100):
    t0 = time()
    frames = [filterSimPosts(i+start, emp, sim_thres=.9) for i, emp in enumerate(employers[start:end])]
    filtered_df = pd.concat(frames)
    print('Done after %.1fs' %(time() - t0))
    return filtered_df

In [16]:
filtered_df_1 = filterPosts4Employers(start=0, end=3000)


	0, KIARATEX EXPORTS PTE LTD, 2 posts, 0.0s elapsed
	100, KPS CONSTRUCTION & ENGINEERING PTE. LTD., 2 posts, 0.0s elapsed
	200, TANESEI TRADING PTE. LTD., 2 posts, 0.0s elapsed
	300, IT-GRID TECHNOLOGIES PTE. LTD., 2 posts, 0.0s elapsed
	400, KALTIMEX ENERGY (SINGAPORE) PTE LTD, 2 posts, 0.0s elapsed
	500, PRESTIGE BIOSYSTEMS PTE LTD, 2 posts, 0.0s elapsed
	600, ST. ANDREW'S DENTAL SURGEONS GROUP PTE. LTD., 2 posts, 0.0s elapsed
	700, MEGAWAY ENGINEERING & TRADING PTE LTD, 2 posts, 0.0s elapsed
	800, NELCO PRODUCTS PTE. LTD., 2 posts, 0.0s elapsed
	900, LYS ENERGY SOLUTIONS PTE. LTD., 2 posts, 0.0s elapsed
	1000, SINOHYDRO-SEMBCORP JOINT VENTURE, 2 posts, 0.0s elapsed
	1100, ELTEK POWER PTE. LTD., 2 posts, 0.0s elapsed
	1200, HTE (S) PTE. LTD., 2 posts, 0.0s elapsed
	1300, BANFF CYBER TECHNOLOGIES PTE. LTD., 2 posts, 0.0s elapsed
	1400, AYA FUJI INTERNATIONAL (M & E) PTE LTD, 2 posts, 0.0s elapsed
	1500, ESPZEN PTE. LTD., 2 posts, 0.0s elapsed
	1600, BRADY ENGINEERING PTE. LTD., 2 posts, 0.0s elapsed
	1700, ALLIANCE HEALTHCARE GROUP PTE LTD, 2 posts, 0.0s elapsed
	1800, C P WORLD PTE. LTD., 2 posts, 0.0s elapsed
	1900, SINGAPORE INSTITUTE OF MANAGEMENT, 3 posts, 0.0s elapsed
	2000, SOUTHERN GLOBE CORPORATION PTE. LTD., 3 posts, 0.0s elapsed
	2100, NET SOFTWARE TECHNOLOGY PTE. LTD., 3 posts, 0.0s elapsed
	2200, DELUGE FIRE PROTECTION (S.E.A.) PTE LTD, 3 posts, 0.0s elapsed
	2300, DAVID WOOD & ASSOCIATES PTE. LTD., 3 posts, 0.0s elapsed
	2400, SHEVS IFT CONSULTANTS PTE. LTD., 3 posts, 0.0s elapsed
	2500, COLUMBIA SHIPMANAGEMENT (SINGAPORE) PTE LTD, 3 posts, 0.0s elapsed
	2600, HRNET ONE PTE LTD, 3 posts, 0.0s elapsed
	2700, AVA INSURANCE AGENCY PTE. LTD., 3 posts, 0.0s elapsed
	2800, THE BANK OF NOVA SCOTIA, 3 posts, 0.0s elapsed
	2900, J. F. HILLEBRAND SINGAPORE PTE. LTD., 3 posts, 0.0s elapsed
Done after 27.0s

In [17]:
filtered_df_2 = filterPosts4Employers(start=3000, end=5000)


	3000, LIAN SIONG ALUMINIUM & TRADING PTE. LTD., 4 posts, 0.0s elapsed
	3100, ANIMAL RECOVERY VETERINARY CENTRE PTE. LTD., 4 posts, 0.0s elapsed
	3200, UHP TECHNOLOGY PTE. LTD., 4 posts, 0.0s elapsed
	3300, SWEE HUP ENGINEERING RESOURCES PTE LTD, 4 posts, 0.0s elapsed
	3400, SOCIETY FOR WINGS, 4 posts, 0.0s elapsed
	3500, FAGERDALA SINGAPORE PTE LTD, 4 posts, 0.0s elapsed
	3600, SINGAPORE SAIZERIYA PTE. LTD., 4 posts, 0.0s elapsed
	3700, NATIONAL HEALTHCARE GROUP PHARMACY, 4 posts, 0.0s elapsed
	3800, D.PBU PTE. LTD., 5 posts, 0.0s elapsed
	3900, GENESYS TELECOMMUNICATIONS LABORATORIES ASIA PTE LTD, 5 posts, 0.0s elapsed
	4000, PULSESYNC PTE. LTD., 5 posts, 0.0s elapsed
	4100, LOTUS ASIA TOURS PTE. LTD., 5 posts, 0.0s elapsed
	4200, SHOPPY PTE. LTD., 5 posts, 0.0s elapsed
	4300, SHIN KHAI CONSTRUCTION PTE. LTD., 6 posts, 0.0s elapsed
	4400, DNATA SINGAPORE PTE. LTD., 6 posts, 0.0s elapsed
	4500, APP SYSTEMS SERVICES PTE. LTD., 6 posts, 0.0s elapsed
	4600, BLUM & CO PTE LTD, 6 posts, 0.0s elapsed
	4700, GATEWAY TECHNOLOGIES SERVICES PTE LTD, 7 posts, 0.0s elapsed
	4800, SINGAPORE ASSOCIATION OF THE VISUALLY HANDICAPPED, 7 posts, 0.0s elapsed
	4900, GRAPHENE SERVICES PTE. LTD., 7 posts, 0.0s elapsed
Done after 57.3s

In [18]:
filtered_df_3 = filterPosts4Employers(start=5000, end=6000)


	5000, NOOR HOLDINGS PTE. LTD., 8 posts, 0.1s elapsed
	5100, YOUNG WOMEN'S CHRISTIAN ASSOCIATION OF SINGAPORE, 8 posts, 0.1s elapsed
	5200, TAYLOR & FRANCIS ASIA PACIFIC, 9 posts, 0.1s elapsed
	5300, VINTEDGE PTE LTD, 9 posts, 0.1s elapsed
	5400, KGI FRASER SECURITIES PTE. LTD., 10 posts, 0.1s elapsed
	5500, PROCTER & GAMBLE EUROPE SA SINGAPORE BRANCH, 10 posts, 0.1s elapsed
	5600, SPARKY ANIMATION PTE. LTD., 11 posts, 0.1s elapsed
	5700, SULZER CHEMTECH PTE. LTD., 12 posts, 0.1s elapsed
	5800, KEPPEL SEA SCAN PTE LTD, 13 posts, 0.2s elapsed
	5900, FIRMENICH ASIA PTE LTD, 15 posts, 0.2s elapsed
Done after 99.0s

In [23]:
# Defensive save of the partial results so far
# (assumption: filtered_df was not yet defined here, so concat batches 1-3 first)
filtered_df = pd.concat([filtered_df_1, filtered_df_2, filtered_df_3])
filtered_df.to_csv(SKILL_DAT + 'filter_doc_index.csv', index=False)

In [27]:
filtered_df_4 = filterPosts4Employers(start=6000, end=len(employers)-5)


	6000, SWISS REINSURANCE COMPANY LTD, 16 posts, 0.2s elapsed
	6100, LEAP NETWORKS PTE. LTD., 18 posts, 0.2s elapsed
	6200, JOHNSON CONTROLS HOLDING (S) PTE. LTD., 20 posts, 0.3s elapsed
	6300, SG REHAB PTE LTD, 24 posts, 0.2s elapsed
	6400, ALEXANDER MANN BPO SOLUTIONS (SINGAPORE) PTE. LTD., 29 posts, 0.6s elapsed
	6500, LUCASFILM ANIMATION COMPANY SINGAPORE B.V., 37 posts, 0.9s elapsed
	6600, PARKROYAL KITCHENER HOTEL PTE. LTD., 48 posts, 1.3s elapsed
	6700, ON DEMAND AGILITY SOLUTIONS PTE. LTD., 72 posts, 3.2s elapsed
	6800, ADECCO PERSONNEL PTE LTD, 210 posts, 27.2s elapsed
Done after 3510.4s

In [63]:
filtered_df_6 = filterPosts4Employers(start=len(employers)-5, end=len(employers)-4)


	6833, DBS BANK LTD., 644 posts
	 done after 255.1s
Done after 255.1s

In [39]:
filtered_df_5 = filterPosts4Employers(start=len(employers)-4, end=len(employers))


	6834, MACHSPEED HUMAN RESOURCES PTE. LTD., 664 posts
	 done after 171.1s
	6835, STANDARD CHARTERED BANK, 668 posts
	 done after 263.7s
	6836, NATIONAL UNIVERSITY OF SINGAPORE, 1255 posts
	 done after 946.2s
	6837, NANYANG TECHNOLOGICAL UNIVERSITY, 1292 posts
	 done after 928.0s
Done after 2309.1s

In [64]:
filtered_df = pd.concat([filtered_df_1, filtered_df_2, filtered_df_3, filtered_df_4, filtered_df_5, filtered_df_6])
print(filtered_df.shape)

filtered_df = filtered_df.reset_index()
filtered_df.to_csv(SKILL_DAT + 'filter_doc_index.csv', index=False)


(71339, 8)

In [58]:
# Drop rows with missing values (e.g. the DBS post below whose occur_skills is NaN)
df = df.dropna()
df.to_csv(SKILL_DAT + 'uniq_doc_index.csv', index=False)

In [61]:
dbs = df[df.employer_name == 'DBS BANK LTD.']

In [65]:
# print out to find the row with problem
# for r, skill_str in enumerate(dbs.occur_skills):
#     print((r, skill_str.split()))

Problem at row 541: it has an invalid value (occur_skills is NaN, as shown below)!


In [55]:
dbs.iloc[541]
# dbs = dbs.drop(dbs.index[541])


Out[55]:
doc              support the roll out and monitoring of best pr...
job_id                                            JOB-2015-0147783
title                            Information Technology Specialist
n_skill                                                          0
occur_skills                                                   NaN
employer_id                                             196800306E
employer_name                                        DBS BANK LTD.
industry                        Financial and Insurance Activities
Name: 99655, dtype: object

In [24]:
sample_employers = ["Capital Match Holdings Pte. Ltd.", "Comfortdelgro Corporation Limited", "Fujitsu Asia Pte Ltd", 
                     "Millennium Capital Management (Singapore) Pte. Ltd."]

In [25]:
def plotSamplePosts(employers, df, figsize):
    sample_employers = list(map(str.upper, employers))
    sample_posts = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
    return plotClusterDists(sample_posts, figsize)

In [26]:
fig = plotSamplePosts(sample_employers, df=filtered_df, figsize=(6, 8))
plt.savefig(LDA_DIR + 'fig/se_in_fin3.jpg')
plt.savefig(LDA_DIR + 'fig/se_in_fin3.pdf')
plt.show(); plt.close()



In [ ]:
filtered_df.shape[0]

Statistics check

In [ ]:
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
print(posts.shape)
posts.head()

In [ ]:
jd_df = pd.read_csv('d:/larc_projects/job_analytics/data/raw/jd.csv')
print(jd_df.shape)
jd_df.head()

In [ ]:
tmp = pd.merge(posts, jd_df)
tmp = tmp.drop_duplicates()
tmp.shape

In [ ]:
tmp = tmp.drop_duplicates(['employer_name', 'job_description_clob'])
tmp.shape