In [1]:
## Import libraries for these exercises
%matplotlib inline
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, confusion_matrix
import pandas as pd
from tokenize_resumes import *
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
'''More TFIDF:  term-frequency times inverse document frequency'''


Out[1]:
'More TFIDF:  term-frequency times inverse document frequency'

In [2]:
## Define a method to compare two resumes (for utility purposes)

def compare_resumes(res1, res2):
    resvec1 = vectorizer.transform([res1])
    resvec2 = vectorizer.transform([res2])
    return cosine_similarity(resvec1, resvec2)

In [3]:
resume_dataset = get_resume_dataset()
train, test = train_test_split(resume_dataset)
print("train and test sizes: %d & %d" % (len(train), len(test)))


train and test sizes: 750 & 250

In [4]:
## Initialize sklearn vectorizer object
vectorizer = TfidfVectorizer(use_idf=True)

## Fit the vectorizer and get the training vectors
train_vectors = vectorizer.fit_transform(train.resumes)


## Get the test set vectors (No fit!)
test_vectors = vectorizer.transform(test.resumes)

In [5]:
## Initialize the logistic regression object
logit = LogisticRegression()


## Train the logistic regression model!

logit.fit(train_vectors, train.position)
'''Think of recall as search effectivness -- you do not want to miss true positives.'''


Out[5]:
'Think of recall as search effectivness -- you do not want to miss true positives.'

In [6]:
print(classification_report(test.position, logit.predict(test_vectors)))


                          precision    recall  f1-score   support

business development rep       0.70      0.70      0.70        60
        inside sales rep       0.72      0.76      0.74        51
     marketing associate       0.97      0.86      0.91        72
       software engineer       0.92      0.99      0.95        67

             avg / total       0.84      0.84      0.84       250


In [7]:
confusion_matrix(test.position, logit.predict(test_vectors))


Out[7]:
array([[42, 14,  1,  3],
       [11, 39,  1,  0],
       [ 6,  1, 62,  3],
       [ 1,  0,  0, 66]])

In [8]:
confusion_matrix(test.position, logit.predict(test_vectors))/confusion_matrix(test.position, logit.predict(test_vectors)).sum(axis=1)


Out[8]:
array([[ 0.7       ,  0.2745098 ,  0.01388889,  0.04477612],
       [ 0.18333333,  0.76470588,  0.01388889,  0.        ],
       [ 0.1       ,  0.01960784,  0.86111111,  0.04477612],
       [ 0.01666667,  0.        ,  0.        ,  0.98507463]])

In [9]:
accuracy_score(test.position, logit.predict(test_vectors))


Out[9]:
0.85199999999999998

In [10]:
recall_score(test.position, logit.predict(test_vectors), average='micro')


Out[10]:
0.85199999999999998

In [11]:
?confusion_matrix

In [ ]:


In [ ]: