In [ ]:
%matplotlib inline

Logistic Regression 3-class Classifier

Shown below are the decision boundaries of a logistic-regression classifier on the iris dataset (https://en.wikipedia.org/wiki/Iris_flower_data_set). The data points are colored according to their labels.


In [ ]:
print(__doc__)


# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

h = .02  # step size in the mesh

logreg = linear_model.LogisticRegression()

# create an instance of the Logistic Regression classifier and fit the data
logreg.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

To see the predictions of our classifier we can use the predict function.


In [ ]:
logreg.predict(X)
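
Besides the hard labels from predict, LogisticRegression also exposes per-class probabilities through predict_proba. A minimal sketch (columns follow the order of logreg.classes_):


In [ ]:
# probability of each class for the first five samples;
# one column per class, ordered as logreg.classes_
logreg.predict_proba(X[:5])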

In [ ]:
logreg.score(X, Y)
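
The score method reports the mean accuracy on the given data. For clarity, a minimal sketch of the equivalent computation:


In [ ]:
# mean accuracy: the fraction of samples whose predicted label matches the true one
np.mean(logreg.predict(X) == Y)
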
  1. Apply the logistic-regression classifier to the haberman.txt dataset (the same one used in the kNN class), using all of the attributes. Don't forget to split the data into training and test sets.

  2. Implement the precision and recall functions to evaluate the results.


In [ ]:
import numpy as np
from sklearn import linear_model

data = np.loadtxt('haberman.data', delimiter=',')
# shuffle the data before splitting into training and test sets
rdata = np.random.permutation(data)
l, c = rdata.shape
X = rdata[:, :c-1]  # all columns but the last are attributes
y = rdata[:, c-1]   # the last column holds the class label

nt = int(l * 0.8)  # 80% of the samples go to the training set
X_train = X[:nt, :]
X_test = X[nt:, :]
y_train = y[:nt]
y_test = y[nt:]

logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)
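
Note that scikit-learn also ships a helper that shuffles and splits in one call. A minimal sketch of the same 80/20 split using train_test_split (assuming scikit-learn >= 0.18, where it lives in sklearn.model_selection; older releases exposed it from sklearn.cross_validation):


In [ ]:
from sklearn.model_selection import train_test_split

# equivalent 80/20 split; the helper shuffles the samples internally by default
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)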

In [ ]:
# In the class of 05-10-16 we implemented a confusion-matrix function;
# let's modify it so that we can compute the values per class
def confusionMatrix(predicted, actual, class_value):
    if len(predicted) != len(actual):
        raise ValueError("predicted and actual must have the same length")
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for i in range(len(actual)):
        if actual[i] == class_value: #labels that are equal to the actual class  (positive examples)
            if predicted[i] == class_value:
                tp += 1.0 #correctly predicted positive
            else:
                fn += 1.0 #incorrectly predicted negative
        else:              #labels that are different than the actual class (negative examples)
            if predicted[i] != class_value:
                tn += 1.0 #correctly predicted negative
            else:
                fp += 1.0 #incorrectly predicted positive
    rtn = [tp, fn, fp, tn]
    return rtn

def precision(tp, fp):
    return tp / (tp + fp)

def recall(tp, fn):
    return tp / (tp + fn)

In [ ]:
yb = logreg.predict(X_test)
# check the values for both classes
tp, fn, fp, tn = confusionMatrix(yb, y_test, 1.0)  # Class 1
print(tp, fn, fp, tn)
print("Precision: " + str(precision(tp, fp)))
print("Recall: " + str(recall(tp, fn)))

In [ ]:
tp, fn, fp, tn = confusionMatrix(yb, y_test, 2.0)  # Class 2
print(tp, fn, fp, tn)
print("Precision: " + str(precision(tp, fp)))
print("Recall: " + str(recall(tp, fn)))