We first focus on building the document-skill matrix, where each entry $f(d,s)$ is the frequency with which skill $s$ occurs in document $d$.
Challenge: naive counting can over-estimate the frequencies. This is due to overlap between skills: a unigram skill can be part of a bigram or trigram skill. For example, 'analytics' is a skill by itself, but it also occurs inside 'data analytics'; thus a document in which the skill 'data analytics' occurs 10 times would also be counted as containing the skill 'analytics' 10 times.
Solution: To overcome this, we propose counting with removal, as follows. We count trigram skills first and remove them from the documents; we then count bigram skills and remove them as well; finally we count unigram skills. This way we avoid the overlap.
In [ ]:
import cluster_skill_helpers as cluster_skill_helpers
from cluster_skill_helpers import *
In [ ]:
# Project directory layout; every other path in this notebook is derived
# from HOME_DIR.
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
SKILL_DIR = DATA_DIR + 'skill_cluster/'
RES_DIR = HOME_DIR + 'results/reports/skill_cluster/'
In [ ]:
# Load the job-description table and order it so the JDs with the most
# distinct skills come first.
jd_df = pd.read_csv(DATA_DIR + 'jd_df.csv')
jd_df = jd_df.sort_values(by='n_uniq_skill', ascending=False)
In [22]:
# Lower-case every JD so skill matching is case-insensitive
# (Series.map with str.lower is elementwise, like apply).
jd_docs = jd_df['clean_text'].map(str.lower)
In [ ]:
# Lookup table pairing each job_id with its cleaned, lower-cased document text.
doc_index = pd.DataFrame({'job_id': jd_df['job_id'], 'doc': jd_docs})
doc_index.head()
In [ ]:
# Persist the lookup table; index=False keeps pandas row numbers out of the CSV.
doc_index.to_csv(SKILL_DIR + 'doc_index.csv', index=False)
In [ ]:
# Skill vocabulary: one row per known skill, with metadata columns
# (at least 'skill' and 'n_word', used below).
skill_df = pd.read_csv(DATA_DIR + 'skill_cluster/skill_df.csv')
skills = skill_df['skill']
In [ ]:
def _skills_with_n_words(n):
    # Skills whose name has exactly n words; np.unique dedups and sorts.
    return np.unique(skill_df[skill_df['n_word'] == n]['skill'])

# Partition the vocabulary by n-gram length for the counting-with-removal passes.
trigram_skills = _skills_with_n_words(3)
bigram_skills = _skills_with_n_words(2)
unigram_skills = _skills_with_n_words(1)
# One-off exports of each list, kept for reference:
# pd.DataFrame({'trigram': trigram_skills}).to_csv(SKILL_DIR + 'trigrams.csv', index=False)
# pd.DataFrame({'bigram': bigram_skills}).to_csv(SKILL_DIR + 'bigrams.csv', index=False)
# pd.DataFrame({'unigram': unigram_skills}).to_csv(SKILL_DIR + 'unigrams.csv', index=False)
In [23]:
# Re-import the helper module so edits to cluster_skill_helpers.py take effect
# without restarting the kernel. NOTE(review): `reload` is a builtin only in
# Python 2; Python 3 needs `from importlib import reload`.
reload(cluster_skill_helpers)
from cluster_skill_helpers import *
In [ ]:
# Pass 1 of counting-with-removal: count trigram skills on the still-intact docs.
doc_trigram = buildDocSkillMat(n=3, jd_docs=jd_docs, skills=trigram_skills)
In [24]:
print('Removing tri-grams from JDs to avoid duplications...')
# Strip every matched trigram skill from the docs so its unigram/bigram
# substrings are not double-counted by the later passes.
jd_docs = jd_docs.apply(rmSkills, skills = trigram_skills)
In [25]:
# Pick up any further edits to the helper module (Python 2 builtin `reload`).
reload(cluster_skill_helpers)
from cluster_skill_helpers import *
In [26]:
# Pass 2: count bigram skills on the trigram-free docs, then strip them too.
doc_bigram = buildDocSkillMat(n=2, jd_docs=jd_docs, skills=bigram_skills)
print('Removing bi-grams from JDs...')
jd_docs = jd_docs.apply(rmSkills, skills = bigram_skills)
In [27]:
# Pass 3: count unigram skills on docs with tri- and bigram skills removed.
doc_unigram = buildDocSkillMat(n=1, jd_docs=jd_docs, skills=unigram_skills)
In [28]:
# Save each document-by-skill count matrix in MatrixMarket format.
# mmwrite accepts a target filename directly, so no explicit open() is needed.
mmwrite(SKILL_DIR + 'doc_trigram.mtx', doc_trigram)
mmwrite(SKILL_DIR + 'doc_bigram.mtx', doc_bigram)
mmwrite(SKILL_DIR + 'doc_unigram.mtx', doc_unigram)
In [29]:
from scipy.sparse import hstack
# Full document-skill matrix; column order is [unigrams | bigrams | trigrams],
# which must match the order used when writing skill_index.csv below.
doc_skill = hstack([doc_unigram, doc_bigram, doc_trigram])
# Sanity checks: row count preserved, column count is the sum of the parts.
assert doc_skill.shape[0] == doc_unigram.shape[0]
assert doc_skill.shape[1] == doc_unigram.shape[1] + doc_bigram.shape[1] + doc_trigram.shape[1]
In [30]:
# Persist the combined matrix; mmwrite takes the filename directly.
mmwrite(SKILL_DIR + 'doc_skill.mtx', doc_skill)
In [ ]:
# Skill index aligned with doc_skill's columns: unigrams, then bigrams,
# then trigrams — the same order as the hstack above.
skills = np.concatenate((unigram_skills, bigram_skills, trigram_skills))
pd.DataFrame({'skill': skills}).to_csv(SKILL_DIR + 'skill_index.csv', index=False)
In [ ]:
# Baseline for comparison: naive counting with a single CountVectorizer pass
# over the ORIGINAL documents.
# BUG FIX: at this point jd_docs has already had tri- and bigram skills
# stripped by the removal passes above, so counting it would just reproduce
# the de-overlapped counts. Re-derive the untouched lower-cased text from
# jd_df (which was never modified) so the naive baseline really exhibits the
# overlap-induced over-estimation we want to measure.
raw_docs = jd_df['clean_text'].apply(str.lower)
vectorizer = text_manip.CountVectorizer(vocabulary=skills, ngram_range=(1,3))
t0 = time()
print('Naive counting...')
naive_doc_skill = vectorizer.fit_transform(raw_docs)
# BUG FIX: '%f.1s' was a malformed format (full float followed by '.1s');
# '%.1fs' prints the elapsed seconds with one decimal.
print('Done after %.1fs' % (time() - t0))
# Per-skill totals over all documents (.A1 flattens the 1-row sum matrix).
s_freq = naive_doc_skill.sum(axis=0).A1
naive_skill_df = pd.DataFrame({'skill': skills, 'freq': s_freq})
# Attach per-skill metadata (n_word, n_jd_with_skill) from the vocabulary table.
naive_skill_df = pd.merge(naive_skill_df, skill_df)
In [ ]:
# Keep only the reporting columns, in display order.
report_cols = ['skill', 'n_word', 'freq', 'n_jd_with_skill']
naive_skill_df = naive_skill_df[report_cols]
naive_skill_df.head()
In [ ]:
# Sanity check: map each row of the naive matrix back to the skills it contains.
res = vectorizer.inverse_transform(naive_doc_skill)
# res[:10]
In [ ]:
# De-overlapped per-skill totals from the counting-with-removal matrix;
# `skills` is aligned with doc_skill's columns, so the pairing is valid.
s_freq = doc_skill.sum(axis=0).A1
new_skill_df = pd.DataFrame({'skill': skills, 'new_freq': s_freq})
In [ ]:
# Side-by-side comparison of naive (`freq`) vs de-overlapped (`new_freq`) counts.
skill_df = pd.merge(naive_skill_df, new_skill_df)
skill_df = skill_df[['skill', 'n_word', 'freq', 'new_freq', 'n_jd_with_skill']]
In [ ]:
def _top_skills(n_word):
    # Skills with the given word count, most frequent (post-removal) first.
    mask = skill_df['n_word'] == n_word
    return skill_df[mask].sort_values(by='new_freq', ascending=False)

unigram_df = _top_skills(1)
bigram_df = _top_skills(2)
trigram_df = _top_skills(3)
print('# unigram skills in JDs: {}'.format(unigram_df.shape[0]))
print('# bigram skills in JDs: {}'.format(bigram_df.shape[0]))
print('# trigram skills in JDs: {}'.format(trigram_df.shape[0]))
In [ ]:
# Inspect the 10 most frequent trigram skills after de-overlapping.
trigram_df.head(10)
In [ ]:
# Inspect the 20 most frequent bigram skills after de-overlapping.
bigram_df.head(20)
In [ ]:
# Inspect the 20 most frequent unigram skills after de-overlapping.
unigram_df.head(20)
In [ ]:
# Export the per-n-gram frequency tables for reporting.
trigram_df.to_csv(SKILL_DIR + 'trigram.csv', index=False)
bigram_df.to_csv(SKILL_DIR + 'bigram.csv', index=False)
unigram_df.to_csv(SKILL_DIR + 'unigram.csv', index=False)
# top100_skills = skill_df.head(100)
# top100_skills.to_csv(RES_DIR + 'top100_skills.csv', index=False)
In [ ]:
# Candidate numbers of clusters/topics to try (5, 10, 15, 20) and how many
# top words to display per topic.
ks = range(5, 25, 5)
n_top_words = 10
In [ ]:
# n_doc: number of documents (rows); n_feat: vocabulary size (columns).
n_doc = doc_skill.shape[0]; n_feat = doc_skill.shape[1]
# presumably p=80 requests an 80/20 train/test split — TODO confirm
# mkPartition's contract in cluster_skill_helpers
train_idx, test_idx = mkPartition(n_doc, p=80)
In [ ]:
# Row-slice the document-skill matrix into train/test sets for LDA.
lda_X_train, lda_X_test = doc_skill[train_idx, :], doc_skill[test_idx, :]
In [ ]:
# # check correctness of rmSkills()
# non_zeros = find(doc_trigram)
# idx_docs_with_trigram = non_zeros[0]
# trigram_counts = non_zeros[2]
# many_trigrams = idx_docs_with_trigram[trigram_counts > 1]
# doc_with_trigram = jd_docs.iloc[many_trigrams[0]]
# print('Doc bf removing tri-gram skills:\n {}'.format(doc_with_trigram))
# res = rmSkills(trigram_skills, doc_with_trigram)
# two_spaces = [m.start() for m in re.finditer(' ', res)]
# print res[two_spaces[1]:]
# print res[two_spaces[0]:450]