In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import my_util
from my_util import *
In [ ]:
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
# job descriptions (JDs)
init_posts = pd.read_csv(DATA_DIR + 'jd_df.csv')
In [ ]:
skill_df = pd.read_csv(DATA_DIR + 'skill_index.csv')
init_skills = skill_df['skill']
jd_docs = list(init_posts['clean_text'].apply(str.lower))
n_skill, n_jd = len(init_skills) , init_posts.shape[0]
print('Initial no. of skills: %d' %n_skill)
print('Initial no. of JDs: %d' %n_jd) # some garbage JDs with no text already removed
skill_df.head(3)
There are two goals: (i) to remove JDs containing too few skills, and (ii) to remove skills occurring in too few JDs. Thus, we repeat the following filtering process until both goals are satisfied, i.e. until the numbers of posts and skills each change by less than 2% between iterations.
In [ ]:
n_iter, posts, skills = 0, init_posts, init_skills
n_post = posts.shape[0]
stop_cond, thres = False, .98
while not stop_cond:
    n_iter = n_iter + 1
    print('Iteration %d' % n_iter)
    # Keep only JDs containing at least 2 skills
    new_posts = extractJDs(posts, skills, min_n_skill=2)
    n_new_post = new_posts.shape[0]
    print('No. of posts after filtering: %d' % n_new_post)
    # Keep only skills occurring in at least 2 JDs
    skill_df = extractSkills(skills, new_posts, min_n_jd=2)
    new_skills = skill_df['skill']
    print('No. of skills after filtering: %d' % len(new_skills))
    # Stop when both counts change by less than 2% between iterations
    stop_cond = (n_new_post >= thres*n_post) and (len(new_skills) >= thres*len(skills))
    posts = new_posts
    n_post = posts.shape[0]
    skills = new_skills
# end of while loop
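The helpers extractJDs and extractSkills are not defined in this notebook (they presumably come from the my_util star import). A minimal sketch of what they are assumed to do, using simple substring matching and the column names n_uniq_skill and n_jd_with_skill seen below:
In [ ]:
# Hedged sketches of the two filtering helpers (assumed behaviour, not the actual implementation).
def extractJDs(posts, skills, min_n_skill=2):
    """Keep only JDs whose clean_text mentions at least min_n_skill distinct skills."""
    posts = posts.copy()
    posts['n_uniq_skill'] = posts['clean_text'].apply(
        lambda text: sum(s in str(text) for s in skills))
    return posts[posts['n_uniq_skill'] >= min_n_skill]

def extractSkills(skills, posts, min_n_jd=2):
    """Keep only skills that occur in at least min_n_jd of the given JDs."""
    docs = posts['clean_text'].astype(str)
    n_jd_with_skill = [docs.str.contains(s, regex=False).sum() for s in skills]
    res = pd.DataFrame({'skill': list(skills), 'n_jd_with_skill': n_jd_with_skill})
    return res[res['n_jd_with_skill'] >= min_n_jd]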
In [ ]:
# print(min(posts['n_uniq_skill']))
# print(min(skill_df['n_jd_with_skill']))
posts.to_csv(DATA_DIR + 'filtered/posts.csv', index=False)
skill_df.to_csv(DATA_DIR + 'filtered/skills.csv', index=False)
In [ ]:
posts = posts.sort_values(by='n_uniq_skill', ascending=False)
posts.head()
In [ ]:
# Sanity check by pulling up the skills occurring in the JD with the most skills
# post_with_most_skill = init_posts.query('job_id == "{}"'.format('JOB-2015-0196805'))
In [ ]:
# doc_skill_tfidf: document-skill TF-IDF matrix (assumed to be built in an earlier step)
n_instance = doc_skill_tfidf.shape[0]
train_idx, test_idx = mkPartition(n_instance, p=80)
X_train, X_test = doc_skill_tfidf[train_idx, :], doc_skill_tfidf[test_idx, :]
n_train, n_test = X_train.shape[0], X_test.shape[0]
print('Train set has %d JDs and test set has %d JDs' %(n_train, n_test))
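mkPartition itself is not shown (it presumably comes from the my_util star import); a minimal sketch, assuming it returns shuffled row indices for a p% / (100 - p)% train/test split:
In [ ]:
# Hedged sketch of mkPartition (assumed behaviour, not the actual my_util implementation):
# randomly split the row indices 0..n_instance-1 into a p% train part and the rest for test.
def mkPartition(n_instance, p=80):
    idx = np.random.permutation(n_instance)
    n_train = int(n_instance * p / 100.0)
    return idx[:n_train], idx[n_train:]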
In [ ]:
# RES_DIR: results directory (assumed to be defined in an earlier step)
stats = pd.DataFrame({'n_train': n_train, 'n_test': n_test, 'n_jd (train & test)': n_post, 'n_skill': len(skills)}, index=[0])
stats.to_csv(RES_DIR + 'stats.csv', index=False)
In [ ]:
from ja_helpers import toIDF
In [ ]:
# doc_skill: document-skill occurrence matrix (assumed to be built in an earlier step)
idf = toIDF(terms=skills, doc_term_mat=doc_skill)
# Convert natural-log IDF to base 10 before sorting: log10(x) = ln(x) * log10(e)
idf['idf_log10'] = idf['idf'] * np.log10(np.e)
idf.sort_values('idf_log10', inplace=True)
# SKILL_DIR: output directory for skill results (assumed to be defined earlier)
idf.to_csv(SKILL_DIR + 'skill_idf.csv', index=False)
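toIDF is imported above from ja_helpers but not shown here; a minimal sketch, assuming it computes the standard natural-log IDF ln(N / df_t) per term, which is what the log-10 conversion above presumes:
In [ ]:
# Hedged sketch of toIDF (the real one lives in ja_helpers), assuming the standard
# definition idf(t) = ln(N / df_t), with df_t = number of docs containing term t.
def toIDF(terms, doc_term_mat):
    n_doc = doc_term_mat.shape[0]
    df_t = np.asarray((doc_term_mat > 0).sum(axis=0)).ravel()  # document frequency per term
    return pd.DataFrame({'term': list(terms), 'idf': np.log(float(n_doc) / df_t)})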
In [ ]:
quantile(idf['idf_log10'])
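The quantile helper (also used further below) is likewise not defined in this notebook; a sketch under the assumption that it simply reports the quartiles of a numeric series:
In [ ]:
# Hedged sketch of the quantile helper (assumed to come from the my_util star import):
# report the five-number summary (min, quartiles, max) of a numeric series.
def quantile(series):
    return series.quantile([0, .25, .5, .75, 1.0])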
In [ ]:
idf_log10 = idf['idf_log10']
n, bins, patches = plt.hist(idf_log10, bins=np.unique(idf_log10))
plt.xlabel('IDF of term (log-10 scale)')
plt.ylabel('# terms')
plt.grid(True)
plt.savefig(SKILL_DIR + 'idf_hist.pdf')
plt.show()
plt.close()
In [ ]:
# Terms which occur in at least 10% of docs (idf_log10 = log10(N/df) <= 1 means df >= N/10)
idf.query('idf_log10 <= 1')
Setting the IDF threshold at 1 did not catch stop words such as 'com' and 'can', so I increase the threshold. Terms with idf_log10 <= 1.35 occur in at least 10^(-1.35), i.e. roughly 4.5%, of the JDs and are treated as stop words.
In [ ]:
idf.query('idf_log10 <= 1.35').to_csv(SKILL_DIR + 'stop_words.csv', index=False)
In [ ]:
df = pd.read_csv(SKILL_DIR + 'stop_words.csv')
stop_words = df['term']
In [ ]:
skill_df = skill_df[~skill_df['skill'].isin(stop_words)]
print(skill_df.shape)
skill_df.to_csv(SKILL_DIR + 'skill_index.csv', index=False)
In [ ]:
job_posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
job_posts.head(5)
In [ ]:
by_job_id = job_posts[['job_id', 'job_posting_date_history']].groupby('job_id')
res = by_job_id.agg({'job_posting_date_history': lambda x:len(np.unique(x))})
res = res.rename(columns={'job_posting_date_history': 'n_post_date'}).reset_index()
res.sort_values('n_post_date', ascending=False, inplace=True)
res.head()
In [ ]:
quantile(res['n_post_date'])
In [ ]:
repost_jobs = res.query('n_post_date > 1')
print('# jobs reposted: %d' %repost_jobs.shape[0])
In [ ]:
jobs = job_posts[['job_id', 'title', 'employer_name']].drop_duplicates()
print('# records in jobs before merging: %d' % jobs.shape[0])
# job_desc: job description table (assumed to be loaded in an earlier step)
jobs = pd.merge(jobs, job_desc)
print('# records in jobs after merging: %d' %jobs.shape[0])
jobs_wo_title = job_posts[job_posts['title'].isnull()]
n_job_wo_title = jobs_wo_title.shape[0]
print('# job posts in WDA without title: %d' %n_job_wo_title)
jobs_wo_title
In [ ]:
jobs.to_csv(DATA_DIR + 'jobs.csv', index=False)
jobs.head()
In [ ]:
employers = pd.read_csv(DATA_DIR + 'employers.csv')
print(employers.shape)
In [ ]:
employers.rename(columns={'company_registration_number_uen_ep': 'employer_id',
                          'organisation_name_ep': 'employer_name',
                          'ssic_group_ep': 'industry'}, inplace=True)
# Standardize employer names to uppercase (case-mismatch problem detected below)
employers['employer_name'] = employers['employer_name'].str.upper()
employers = employers.drop_duplicates()
employers.shape
In [ ]:
# Handle the problem with PRIORITY CONSULTANTS (detected below)
employers.query('employer_name == "PRIORITY CONSULTANTS"')
In [ ]:
# Drop one of the two PRIORITY CONSULTANTS records (hard-coded row index from the query above)
employers = employers.drop(10278)
employers.query('employer_name == "PRIORITY CONSULTANTS"')
In [ ]:
employers.to_csv(DATA_DIR + 'employers.csv', index=False)
In [ ]:
posts = pd.read_csv(DATA_DIR + 'full_job_posts.csv')
posts.head()
In [ ]:
# doc_index: index of the JD documents (assumed to be built in an earlier step)
df = mergeKeepLeftIndex(doc_index, posts[['job_id', 'employer_id']])
df = df.drop_duplicates()
df.shape
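mergeKeepLeftIndex is also not shown; a plausible sketch, assuming it is a thin wrapper around pd.merge that preserves the left frame's row index (a plain merge would reset it, which would break the index-based duplication checks below):
In [ ]:
# Hedged sketch of mergeKeepLeftIndex (assumed behaviour): an inner merge on the
# common columns that keeps the left frame's original row index.
def mergeKeepLeftIndex(left, right):
    res = left.reset_index().merge(right)   # 'index' column holds left's row labels
    return res.set_index('index').rename_axis(None)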
In [ ]:
df = mergeKeepLeftIndex(df, employers[['employer_id', 'employer_name', 'industry']])
df = df.drop_duplicates()
df.shape[0]
In [ ]:
df.to_csv(SKILL_DIR + 'doc_index.csv', index=False)
In [ ]:
# First, verify that duplication exists
print(len(df.index))
print(len(df.index.unique()))
# Then detect the duplicated index values
import collections
print([(item, count) for item, count in collections.Counter(df.index).items() if count > 1])
df.iloc[25569:25571, :]
The problem is caused by upper- vs. lower-case variants of the same employer name! That is why we need to standardize them.
Lesson learnt: watch out for case-sensitivity problems in the data.
After handling this, we repeat the above process and check for duplicates again.
In [ ]:
# tmp: result of re-running the merge above after standardizing the employer names
print([(item, count) for item, count in collections.Counter(tmp.index).items() if count > 1])
In [ ]:
tmp.iloc[29403:29405, :]
This time the duplication arises because the company PRIORITY CONSULTANTS registered itself under two different industries: Administrative Service and Scientific Activities.