In [1]:
## Import libraries for these exercises
%matplotlib inline
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tokenize_resumes import *
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
'''More TF-IDF: term frequency times inverse document frequency'''
Out[1]:
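As a quick illustration of what the TF-IDF weighting below produces, the following sketch runs TfidfVectorizer on a made-up three-document corpus (the strings are invented for illustration only, not part of the resume data):
In [ ]:
## Minimal TF-IDF sketch on an invented toy corpus.
## Each row of the dense matrix is an L2-normalized vector of
## term-frequency * inverse-document-frequency weights.
toy_corpus = [
    "python pandas machine learning",
    "java spring enterprise services",
    "python deep learning research",
]
toy_vectorizer = TfidfVectorizer(use_idf=True)
toy_matrix = toy_vectorizer.fit_transform(toy_corpus)
print(sorted(toy_vectorizer.vocabulary_))  ## vocabulary learned from the toy corpus
print(toy_matrix.toarray().round(2))       ## one row of TF-IDF weights per document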
In [2]:
## Define a helper to compare two resumes by the cosine similarity of their TF-IDF vectors
## (relies on the fitted `vectorizer` defined a few cells below)
def compare_resumes(res1, res2):
    ## Transform each resume string into its TF-IDF vector
    resvec1 = vectorizer.transform([res1])
    resvec2 = vectorizer.transform([res2])
    ## Cosine similarity ranges from 0 (no shared weighted terms) to 1 (identical direction)
    return cosine_similarity(resvec1, resvec2)
In [3]:
resume_dataset = get_resume_dataset()
train, test = train_test_split(resume_dataset)
print("train and test sizes: %d & %d" % (len(train), len(test)))
In [4]:
## Initialize sklearn vectorizer object
vectorizer = TfidfVectorizer(use_idf=True)
## Fit the vectorizer and get the training vectors
train_vectors = vectorizer.fit_transform(train.resumes)
## Get the test set vectors (No fit!)
test_vectors = vectorizer.transform(test.resumes)
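Now that the vectorizer is fitted, the compare_resumes helper defined earlier can be used; a usage sketch (the two test-set positions chosen here are arbitrary):
In [ ]:
## Usage sketch for compare_resumes -- index positions 0 and 1 are arbitrary examples.
## Returns a 1x1 array holding the cosine similarity of the two TF-IDF vectors.
compare_resumes(test.resumes.iloc[0], test.resumes.iloc[1])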
In [5]:
## Initialize the logistic regression object
logit = LogisticRegression()
## Train the logistic regression model!
logit.fit(train_vectors, train.position)
'''Think of recall as search effectiveness -- you do not want to miss true positives.'''
Out[5]:
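To make the recall intuition above concrete, here is a tiny hand-checkable example with made-up binary labels:
In [ ]:
## Worked recall example with invented labels: 3 actual positives, 2 of them retrieved.
y_true = [1, 1, 1, 0, 0]
y_pred = [1, 0, 1, 0, 1]
recall_score(y_true, y_pred)  ## 2 true positives / 3 actual positives = 2/3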
In [6]:
print(classification_report(test.position, logit.predict(test_vectors)))
In [7]:
confusion_matrix(test.position, logit.predict(test_vectors))
Out[7]:
In [8]:
cm = confusion_matrix(test.position, logit.predict(test_vectors))
cm / cm.sum(axis=1, keepdims=True)  ## normalize each row so entries are fractions of the true class
Out[8]:
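For reference, if a recent scikit-learn is installed (0.22 or later), the same row-normalized matrix can be requested directly; a sketch under that version assumption:
In [ ]:
## Row-normalized confusion matrix via the normalize parameter
## (requires scikit-learn >= 0.22, where this parameter was added).
confusion_matrix(test.position, logit.predict(test_vectors), normalize='true')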
In [9]:
accuracy_score(test.position, logit.predict(test_vectors))
Out[9]:
In [10]:
recall_score(test.position, logit.predict(test_vectors), average='micro')
Out[10]:
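As a sanity check on the two numbers above: for single-label multiclass data, micro-averaged recall coincides with accuracy, so these values should match.
In [ ]:
## Sanity check: micro-averaged recall equals accuracy for single-label multiclass predictions.
np.isclose(
    recall_score(test.position, logit.predict(test_vectors), average='micro'),
    accuracy_score(test.position, logit.predict(test_vectors)),
)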
In [11]:
?confusion_matrix