In [1]:
import my_util
import cluster_skill_helpers
from cluster_skill_helpers import *
# pd, np, plt, mmread, and time used below are assumed to be re-exported by the helpers
In [2]:
HOME_DIR = 'd:/larc_projects/job_analytics/'; DATA_DIR = HOME_DIR + 'data/clean/'
RES_DIR = HOME_DIR + 'results/';
SKILL_DAT = DATA_DIR + 'skill_cluster/'; SKILL_RES = RES_DIR + 'skill_cluster/new/'
In [ ]:
df = pd.read_csv(SKILL_DAT + 'doc_index.csv')
In [ ]:
df = df.sort_values(['employer_name', 'doc'])
print('# posts before filtering dups: %d' % df.shape[0])
In [ ]:
df.head(10)
In [ ]:
df = df.drop_duplicates(['employer_name', 'doc'])
print('# posts after filtering dups: %d' %df.shape[0])
In [ ]:
df.head(10)
In [ ]:
df = df.reset_index()
df.head()
In [ ]:
df.to_csv(SKILL_DAT + 'uniq_doc_index.csv', index=False)
A careful check also reveals a renamed job title (from Marine Superintendent to Fleet Manager, employer: "K" LINE LOGISTICS SINGAPORE) even though the posts are identical. This is interesting in its own right and may merit further analysis later. First, however, we need to re-plot the cluster distributions to verify that the duplicates are really gone.
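As an aside, such renamed reposts could be surfaced systematically before deduplication. A minimal sketch on the raw doc_index frame (the helper name is hypothetical; column names follow the cells above):
In [ ]:
def findRenamedReposts(raw_df):
    """Hypothetical helper: posts with identical text (same employer, same doc)
    that appear under more than one job title."""
    n_titles = (raw_df.groupby(['employer_name', 'doc'])['title']
                      .nunique()
                      .reset_index(name='n_titles'))
    renamed = n_titles.query('n_titles > 1')
    # Join back to recover the conflicting titles for inspection
    return raw_df.merge(renamed[['employer_name', 'doc']])

# Usage sketch:
# raw = pd.read_csv(SKILL_DAT + 'doc_index.csv')
# findRenamedReposts(raw).sort_values(['employer_name', 'doc', 'title'])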
In [3]:
df = pd.read_csv(SKILL_DAT + 'uniq_doc_index.csv')
df = df.set_index('index')
In [8]:
# LDA_DIR is assumed to be defined elsewhere; it also holds the figures saved below
with open(LDA_DIR + 'doc_topic_distr.mtx', 'r') as f:
    doc_topic_distr = mmread(f)
In [4]:
# Global settings for all cluster plots
abbv_clusters = ['FIN', 'HWR', 'SCI', 'PROD', 'CON', 'LEG', 'CUS', 'LOG', 'MKT', 'DEV', 'MAN', 'HOS', 'AUD', 'COM', 'HR']
x = range(1, 16); labels = abbv_clusters
In [5]:
def getPosts2Check(job_title, industry, n_pair=10):
    sub_df = df[(df['title'] == job_title) & (df['industry'] == industry)]
    scores = calScore(sub_df, doc_topic_distr)
    # Drop exact duplicates due to reposting (pairs with sim_score == 1)
    scores = scores.query('sim_score < 1').sort_values('sim_score', ascending=False)
    k_pairs = scores.head(n_pair)
    ids2check = np.unique(list(k_pairs['job_id1']) + list(k_pairs['job_id2']))
    print('# posts to check: %d' % len(ids2check))
    posts2check = sub_df[sub_df['job_id'].isin(ids2check)]
    return posts2check.sort_values('employer_name')
    # return k_pairs
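calScore comes from cluster_skill_helpers and is not shown in this notebook. A minimal sketch of what it presumably computes, assuming sim_score is the cosine similarity between the cluster distributions of a pair of posts and that the frame's index aligns with the rows of doc_topic_distr:
In [ ]:
from itertools import combinations

def calScoreSketch(sub_df, doc_topic_distr):
    """Hypothetical re-implementation: pairwise cosine similarity between
    the cluster distributions of the given posts."""
    # mmread may return a sparse matrix; densify for row access
    distr = np.asarray(doc_topic_distr.todense()) if hasattr(doc_topic_distr, 'todense') \
        else np.asarray(doc_topic_distr)
    rows = []
    for (i1, p1), (i2, p2) in combinations(sub_df.iterrows(), 2):
        v1, v2 = distr[i1], distr[i2]
        sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        rows.append({'job_id1': p1['job_id'], 'job_id2': p2['job_id'], 'sim_score': sim})
    return pd.DataFrame(rows)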
In [6]:
def plotClusterDists(posts, figsize=(6, 10)):
    n_post = posts.shape[0]
    fig, axarr = plt.subplots(n_post, sharex=True, figsize=figsize)
    for r in range(n_post):
        plt.subplot(n_post, 1, r + 1)
        plotClusterDistAtRow(r, posts, doc_topic_distr)
        # Show the ylabel only on the middle subplot
        if r == (n_post // 2 - 1):
            plt.ylabel('Probability', fontsize=24)
    ## Fine-tune the figure
    fig.subplots_adjust(hspace=.5)
    # Hide xticks on all subplots except the last one
    hide_xticks(fig)
    # Provide xtick labels only for the last subplot
    plt.xticks(x, labels, rotation=45)
    # Show the xlabel only on the last subplot
    plt.xlabel('Skill Clusters', fontsize=20)
    return fig
In [10]:
posts2check = getPosts2Check(job_title="Software Engineer", industry="Financial and Insurance Activities", n_pair=5)
fig = plotClusterDists(posts2check, figsize=(6, 10))
# plt.savefig(LDA_DIR + 'fig/se_in_fin2.pdf')
plt.show(); plt.close()
Duplicates still seem to exist! Let us examine further.
In [ ]:
sample_employers = ["Capital Match Holdings Pte. Ltd.", "Comfortdelgro Corporation Limited", "Fujitsu Asia Pte Ltd",
"Millennium Capital Management (Singapore) Pte. Ltd."]
sample_employers = [s.upper() for s in sample_employers]
sample_se_posts = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
se_fig = plotClusterDists(sample_se_posts)
plt.show(); plt.close()
In [ ]:
dups = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
dups = dups.sort_values('employer_name')
In [ ]:
dups.to_csv(RES_DIR + 'tmp/se_dups.csv', index=False)
dups.drop_duplicates(['doc']).to_csv(RES_DIR + 'tmp/se_no_dups2.csv', index=False)
Reading the Capital Match Holdings posts shows that they are in fact different, though the differences in content are subtle. The cluster distribution of the last post differs slightly from the first two because the last one gives no salary details, while the first two specify salary and equity. A closer look at the skill sets in these posts reveals:
In [ ]:
se_in_infocom = getPosts2Check(job_title='Software Engineer', industry='Information and Communications')
In [ ]:
fig = plotClusterDists(se_in_infocom)
plt.savefig(LDA_DIR + 'fig/se_in_infocom2.pdf')
plt.show(); plt.close()
In [ ]:
posts = getPosts2Check(job_title="Administrative Assistant", industry="Financial and Insurance Activities", n_pair=5)
In [ ]:
fig = plotClusterDists(posts, figsize=(6, 12))
# plt.savefig(LDA_DIR + 'fig/admin_in_fin.jpg')
plt.show(); plt.close()
In [ ]:
# Manager
posts = getPosts2Check(job_title="Manager", industry="Financial and Insurance Activities", n_pair=5)
In [ ]:
fig = plotClusterDists(posts, figsize=(6, 12))
plt.savefig(LDA_DIR + 'fig/man_in_fin.jpg')
plt.savefig(LDA_DIR + 'fig/man_in_fin.pdf')
plt.show(); plt.close()
These examples tell us that we should also filter out posts that are almost identical. Such posts can be detected from their sets of common skills (Jaccard similarity), as in the next section.
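pairwiseJacSim used below is also a helper from cluster_skill_helpers. A minimal sketch of what it presumably computes, assuming each post's skills are stored space-separated in the occur_skills column:
In [ ]:
from itertools import combinations

def pairwiseJacSimSketch(df):
    """Hypothetical re-implementation: Jaccard similarity for every pair of posts."""
    rows = []
    for (_, p1), (_, p2) in combinations(df.iterrows(), 2):
        s1 = set(p1['occur_skills'].split())
        s2 = set(p2['occur_skills'].split())
        rows.append({'job_id1': p1['job_id'], 'job_id2': p2['job_id'],
                     'jacSim': len(s1 & s2) / float(len(s1 | s2))})
    return pd.DataFrame(rows)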
In [ ]:
cmh = df[df.employer_name == str.upper("Capital Match Holdings Pte. Ltd.")]
pairwiseJacSim(df=cmh)
This example confirms that posts from the same company can be very similar: the percentage of overlapping skills can exceed 90%. We want to remove such pairs of posts because they can cause the consistency score to be over-estimated.
The filtering process for a given data frame of posts is as follows.
In [38]:
def filterSimPosts(i, employer='Capital Match Holdings Pte. Ltd.', sim_thres=.9):
    def doFilter(df):
        keep, remain = [], df
        t0 = time()
        # While at least 2 posts remain, we need to keep checking
        while remain.shape[0] >= 2:
            res = jacSim2Others(0, remain)
            job_id = remain.iloc[0]['job_id']
            keep.append(job_id)
            # In the remainder, keep only posts significantly different from post[0]
            sig_diff_posts = list(res.query('jacSim < {}'.format(sim_thres))['job_id2'])
            remain = remain[remain['job_id'].isin(sig_diff_posts)]
        # If only 1 post remains, we can just keep it: no remaining post is too similar to it
        if remain.shape[0] == 1:
            keep.append(remain.iloc[0]['job_id'])
        print('\t done after {}s'.format(round(time() - t0, 1)))
        return df[df['job_id'].isin(keep)]

    sub_df = df[df.employer_name == employer.upper()]; n_post = sub_df.shape[0]
    # if (i % 100 == 0):
    print('\t{}, {}, {} posts'.format(i, employer, n_post))
    return doFilter(sub_df)
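Likewise, jacSim2Others is not defined in this notebook. A sketch of what it presumably returns, reusing the occur_skills assumption above: the Jaccard similarity of post i's skill set against every other post in the frame.
In [ ]:
def jacSim2OthersSketch(i, df):
    """Hypothetical re-implementation: Jaccard similarity of post i vs. all other posts."""
    s_i = set(df.iloc[i]['occur_skills'].split())
    rows = []
    for j in range(df.shape[0]):
        if j == i:
            continue
        s_j = set(df.iloc[j]['occur_skills'].split())
        rows.append({'job_id2': df.iloc[j]['job_id'],
                     'jacSim': len(s_i & s_j) / float(len(s_i | s_j))})
    return pd.DataFrame(rows)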
In [12]:
tmp = filterSimPosts(0, employer='Capital Match Holdings Pte. Ltd.', sim_thres=.9)
pairwiseJacSim(tmp)
Out[12]:
In [ ]:
tmp = filterSimPosts(1, employer='Apar Technologies Pte. Ltd.', sim_thres=.9)
pairwiseJacSim(tmp)
In [13]:
by_employer = df.groupby('employer_name').agg({'job_id': 'nunique'})
by_employer = by_employer.add_prefix('n_').reset_index()
# by_employer.n_job_id.describe().round(1)
by_employer = by_employer.sort_values('n_job_id')
employers = list(by_employer.query('n_job_id >= 2')['employer_name'])
len(employers)
Out[13]:
In [14]:
print('Filtering too similar posts in {} companies...'.format(len(employers)))
In [15]:
def filterPosts4Employers(start=0, end=100):
    t0 = time()
    frames = [filterSimPosts(i + start, emp, sim_thres=.9) for i, emp in enumerate(employers[start:end])]
    filtered_df = pd.concat(frames)
    print('Done after %.1fs' % (time() - t0))
    return filtered_df
In [16]:
filtered_df_1 = filterPosts4Employers(start=0, end=3000)
In [17]:
filtered_df_2 = filterPosts4Employers(start=3000, end=5000)
In [18]:
filtered_df_3 = filterPosts4Employers(start=5000, end=6000)
In [23]:
# Defensive save: filtered_df here presumably holds the frames filtered so far
filtered_df.to_csv(SKILL_DAT + 'filter_doc_index.csv', index=False)
In [27]:
filtered_df_4 = filterPosts4Employers(start=6000, end=len(employers)-5)
In [63]:
filtered_df_6 = filterPosts4Employers(start=len(employers)-5, end=len(employers)-4)
In [39]:
filtered_df_5 = filterPosts4Employers(start=len(employers)-4, end=len(employers))
In [64]:
filtered_df = pd.concat([filtered_df_1, filtered_df_2, filtered_df_3, filtered_df_4, filtered_df_5, filtered_df_6])
print(filtered_df.shape)
filtered_df = filtered_df.reset_index()
filtered_df.to_csv(SKILL_DAT + 'filter_doc_index.csv', index=False)
In [58]:
# Drop rows with missing values before re-saving
df = df.dropna()
df.to_csv(SKILL_DAT + 'uniq_doc_index.csv', index=False)
In [61]:
dbs = df[df.employer_name == 'DBS BANK LTD.']
In [65]:
# Print out to find the problematic row:
# for r, skill_str in enumerate(dbs.occur_skills):
#     print((r, skill_str.split()))
Problem in row 541: it has an invalid value!
In [55]:
dbs.iloc[541]
# dbs = dbs.drop(dbs.index[541])
Out[55]:
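Rather than eyeballing a printout, the offending rows can also be surfaced with a vectorized check. A sketch assuming the invalid value is a missing (NaN) entry in occur_skills:
In [ ]:
# Hypothetical check: rows with a missing occur_skills value
bad_rows = dbs[dbs['occur_skills'].isnull()]
print(bad_rows[['job_id', 'title']])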
In [24]:
sample_employers = ["Capital Match Holdings Pte. Ltd.", "Comfortdelgro Corporation Limited", "Fujitsu Asia Pte Ltd",
"Millennium Capital Management (Singapore) Pte. Ltd."]
In [25]:
def plotSamplePosts(employers, df, figsize):
    sample_employers = [e.upper() for e in employers]
    sample_posts = df[(df.employer_name.isin(sample_employers)) & (df.title == "Software Engineer")]
    return plotClusterDists(sample_posts, figsize)
In [26]:
fig = plotSamplePosts(sample_employers, df=filtered_df, figsize=(6, 8))
plt.savefig(LDA_DIR + 'fig/se_in_fin3.jpg')
plt.savefig(LDA_DIR + 'fig/se_in_fin3.pdf')
plt.show(); plt.close()
In [ ]:
filtered_df.shape[0]
In [ ]:
# Statistics check
In [ ]:
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
print(posts.shape)
posts.head()
In [ ]:
jd_df = pd.read_csv('d:/larc_projects/job_analytics/data/raw/jd.csv')
print(jd_df.shape)
jd_df.head()
In [ ]:
tmp = pd.merge(posts, jd_df)
tmp = tmp.drop_duplicates()
tmp.shape
In [ ]:
tmp = tmp.drop_duplicates(['employer_name', 'job_description_clob'])
tmp.shape