In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_train = pd.read_excel('./data/job_train.xlsx')
In [8]:
# X : the 'responsibilities' and 'requirements' fields of LinkedIn job descriptions
# y : {0 : 'Data Scientist', 1 : 'Digital Marketer', 2 : 'UX/UI Designer'}
df_train.tail(3)
Out[8]:
In [9]:
X_train = df_train['X'].values
y_train = df_train['y'].values
In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(analyzer='word', stop_words='english', min_df=50)
X_train_count_vect = count_vect.fit_transform(X_train)
tfidf_transform = TfidfTransformer()
X_train_tfidf_transform = tfidf_transform.fit_transform(X_train_count_vect)
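As an aside, `CountVectorizer` followed by `TfidfTransformer` can be collapsed into a single `TfidfVectorizer`. A minimal sketch, assuming the same `stop_words` and `min_df` settings (the variable names here are hypothetical):
In [ ]:
# sketch: TfidfVectorizer does the counting and tf-idf weighting in one step
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', min_df=50)
X_train_tfidf_direct = tfidf_vect.fit_transform(X_train)
# should agree with the two-step result up to floating-point noise
print(np.allclose(X_train_tfidf_direct.toarray(), X_train_tfidf_transform.toarray()))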
In [11]:
# build columns from the vocabulary (bag of words) that appears in the samples
column = count_vect.get_feature_names_out()
X_train_count = count_vect.transform(X_train).toarray()
X_train_tfidf = X_train_tfidf_transform.toarray()
In [15]:
# frequency count
pd.DataFrame(data=X_train_count, columns=column).head(3)
Out[15]:
In [17]:
# tf-idf weights
pd.DataFrame(data=X_train_tfidf, columns=column).head(3)
Out[17]:
In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from matplotlib.pyplot import axvline, axhline
from sklearn.metrics import precision_score, accuracy_score, recall_score
In [9]:
# run cross-validation to search for the alpha that maximizes classification accuracy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
score_range = []
parameter_range = []
for parameter in np.arange(1e-5, 1e-4, 1e-5):  # alpha must be positive for MultinomialNB
    clf_MNB = MultinomialNB(alpha=parameter)
    scores = np.zeros(10)
    for i, (train_idx, validation_idx) in enumerate(cv.split(X_train_tfidf, y_train)):
        X_train_ = X_train_tfidf[train_idx]
        y_train_ = y_train[train_idx]
        X_validation = X_train_tfidf[validation_idx]
        y_validation = y_train[validation_idx]
        clf_MNB.fit(X_train_, y_train_)
        y_predicted = clf_MNB.predict(X_validation)
        scores[i] = accuracy_score(y_validation, y_predicted)
    score_range.append(round(scores.mean(), 2))
    parameter_range.append(parameter)
dictionary = dict(zip(parameter_range, score_range))
plt.figure(figsize=(10, 3))
plt.plot(parameter_range, score_range, color='#E85285',
         label='accuracy: %(n)0.2f \nalpha: %(v)0.5f' %
               {'n': max(dictionary.values()),
                'v': max(dictionary.items(), key=lambda x: x[1])[0]})
plt.xlabel('alpha')
plt.ylabel('accuracy')
plt.legend(loc='lower right', prop={'size': 12})
plt.scatter([max(dictionary.items(), key=lambda x: x[1])[0]], [max(dictionary.values())], 80, color='#E85285')
axhline(max(dictionary.values()), color='#E85285', linewidth=1, linestyle='--')
axvline(max(dictionary.items(), key=lambda x: x[1])[0], color='#E85285', linewidth=1, linestyle='--')
plt.xlim(min(parameter_range), max(parameter_range))
plt.ylim(min(score_range)*0.8, max(score_range)*1.2);
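The manual fold loop above can also be written with `cross_val_score`, which runs the same splits and scoring in one call. A minimal sketch under the same `cv` splitter (the output formatting is illustrative):
In [ ]:
# sketch: the same 10-fold accuracy sweep via cross_val_score
from sklearn.model_selection import cross_val_score
for parameter in np.arange(1e-5, 1e-4, 1e-5):
    fold_scores = cross_val_score(MultinomialNB(alpha=parameter),
                                  X_train_tfidf, y_train, cv=cv, scoring='accuracy')
    print('alpha=%.5f  mean accuracy=%.2f' % (parameter, fold_scores.mean()))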
In [10]:
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
# baseline check: the model has to perform at least as well as a dummy classifier
# since each of the 3 classes has 100 samples, the baseline accuracy is about 0.33
clf_dummy = DummyClassifier(strategy='most_frequent')
clf_dummy = clf_dummy.fit(X_train_, y_train_)
print('baseline score =>', round(clf_dummy.score(X_validation, y_validation), 2))
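To see where the 0.33 figure comes from, the class distribution can be checked directly; a minimal sketch:
In [ ]:
# sketch: with three balanced classes, the majority-class baseline is ~1/3
print(pd.Series(y_train).value_counts(normalize=True).round(2))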
In [11]:
# fit the classifier on the training split with alpha = 0.00004
clf_MNB = MultinomialNB(alpha=0.00004)
clf_MNB.fit(X_train_, y_train_)
y_predicted = clf_MNB.predict(X_validation)
In [12]:
# print the classification report to see the performance of the classifier
# the model classifies class 0 (Data Scientist) well:
# precision (0.91), recall (0.90), f1-score (0.90), and accuracy (0.90) all far outperform the baseline (0.33)
# although the four scores are very similar to each other, I will focus on accuracy in this project
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['(0) Data Scientist', '(1) Digital Marketer', '(2) UX/UI Designer']
print(classification_report(y_validation, y_predicted, target_names=target_names))
In [13]:
# the confusion matrix conveys much the same information as the report above
cm = confusion_matrix(y_validation, y_predicted)
print(cm)
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[13]:
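For reference, the per-class precision and recall in the report can be recomputed straight from the confusion matrix; a minimal sketch (rows are true labels, columns are predicted labels):
In [ ]:
# sketch: precision normalizes the diagonal by column sums (predicted totals),
# recall by row sums (true totals)
precision = np.diag(cm) / cm.sum(axis=0)
recall = np.diag(cm) / cm.sum(axis=1)
for name, p, r in zip(target_names, precision, recall):
    print('%s  precision=%.2f  recall=%.2f' % (name, p, r))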
In [14]:
# use a Pipeline to tune the parameters of CountVectorizer, TfidfTransformer, and multinomial Naive Bayes simultaneously
# the parameter tuning itself is done with GridSearchCV below
from sklearn.pipeline import Pipeline
pipeline_clf_train = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf_MNB', MultinomialNB()),
    ]
)
In [15]:
# bigrams are included so that noun phrases can be extracted, if necessary
# tfidf__use_idf: False turning out best was quite surprising
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': [True, False],
    'clf_MNB__alpha': np.arange(0, 0.00005, 0.000005),
}
gs_clf = GridSearchCV(pipeline_clf_train, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)
In [16]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print('------------------------------')
print('score :', round(gs_clf.best_score_, 2))
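Since `GridSearchCV` refits the best parameter combination on the full training data by default (`refit=True`), the fitted search object already holds a usable classifier; a minimal sketch:
In [ ]:
# sketch: predict with the refitted best pipeline, no manual re-training needed
print(gs_clf.best_estimator_.predict(X_train[:3]))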
In [17]:
# load the test data to evaluate the model's performance
df_test = pd.read_excel('./data/job_test.xlsx')
X_test = df_test['X'].values
y_test = df_test['y'].values
df_test.tail(3)
Out[17]:
In [18]:
pipeline_clf_test = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(1, 1), stop_words='english', min_df=25)),
        ('clf', MultinomialNB(alpha=1.5e-05)),
    ]
)
pipeline_clf_test = pipeline_clf_test.fit(X_train, y_train)
y_predicted = pipeline_clf_test.predict(X_test)
print("Prediction Accuracy =>", np.mean(y_test == y_predicted).round(2))
In [23]:
docs = [input()]
categories = ['Data Scientist', 'Digital Marketer', 'UX/UI Designer']
predicted = pipeline_clf_test.predict(docs)[0]
prob = pipeline_clf_test.predict_proba(docs)[0]
prob_gap = np.max(prob) - np.median(prob)
if prob_gap > 0.4:
    print("\n==== Your job ====")
    print(categories[predicted])
    print("\n=== Probability ===")
    print(prob[predicted].round(2))
else:
    print("+++ Please enter a more detailed description +++")
In [20]:
# data scientist
from wordcloud import WordCloud
word_ds = " ".join(df_train[df_train['y'] == 0]['X'])
wordcloud_ds = WordCloud(background_color='white', width=800, height=400).generate(word_ds)
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud_ds.recolor(random_state=4))
plt.xticks([])
plt.yticks([])
plt.grid(False);
In [21]:
# digital marketer
word_dm = " ".join(df_train[df_train['y'] == 1]['X'])
wordcloud_dm = WordCloud(background_color='white', width=800, height=400).generate(word_dm)
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud_dm.recolor(random_state=4))
plt.xticks([])
plt.yticks([])
plt.grid(False);
In [22]:
# UX/UI designer
word_design = " ".join(df_train[df_train['y'] == 2]['X'])
wordcloud_design = WordCloud(background_color='white', width=800, height=400).generate(word_design)
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud_design.recolor(random_state=4))
plt.xticks([])
plt.yticks([])
plt.grid(False);
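As a numeric complement to the word clouds, the most frequent vocabulary terms per class can be listed from the raw counts; a minimal sketch reusing the fitted `count_vect` and `X_train_count` from above:
In [ ]:
# sketch: top 10 most frequent terms per class, from the bag-of-words counts
vocab = count_vect.get_feature_names_out()
for label, name in enumerate(categories):
    class_counts = X_train_count[y_train == label].sum(axis=0)
    top_idx = class_counts.argsort()[::-1][:10]
    print(name, ':', ', '.join(vocab[top_idx]))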