Implemente um classificador Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente à qualidade de carros, disponível no UCI. Este dataset de carros possui as seguintes features e classe:
Atributos
Classes
Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes (veja aqui)
Analise a acurácia dos dois algoritmos e discuta a sua solução.
In [83]:
import csv
import random
import math
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
def separateByClass(dataset):
    """Group the rows of a labeled DataFrame by class value.

    The class label is taken from the last column of each row.

    Args:
        dataset: pandas DataFrame whose final column holds the class label.

    Returns:
        dict mapping each class value to a list of row Series.
    """
    separated = {}
    for _, row in dataset.iterrows():
        # .iloc[-1] is explicit positional access; the original row[-1]
        # relied on the deprecated integer-key fallback on a labeled Series.
        label = row.iloc[-1]
        separated.setdefault(label, []).append(row)
    return separated
def mean(numbers):
    """Return the arithmetic average of a non-empty sequence of numbers."""
    total = float(sum(numbers))
    return total / len(numbers)
def stdev(numbers):
    """Return the sample standard deviation (n-1 denominator) of numbers.

    Returns 0.0 when fewer than two values are given: the sample variance
    is undefined there, and the original code raised ZeroDivisionError for
    a singleton class. A zero result integrates cleanly with the existing
    `stdev == 0.0` guard in calculateClassProbabilities.
    """
    n = len(numbers)
    if n < 2:
        return 0.0
    avg = sum(numbers) / float(n)
    variance = sum(pow(x - avg, 2) for x in numbers) / float(n - 1)
    return math.sqrt(variance)
def summarize(dataset):
    """Compute per-column (mean, stdev) pairs for every feature column.

    `zip(*dataset)` transposes the rows into columns; the statistics of the
    final column (the class label) are discarded.
    """
    summaries = [(mean(column), stdev(column)) for column in zip(*dataset)]
    summaries.pop()  # drop the class-label column's statistics
    return summaries
def summarizeByClass(dataset):
    """Fit step: per-class (mean, stdev) feature summaries for a labeled dataset."""
    grouped = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in grouped.items()}
def calculateProbability(x, mean, stdev):
    """Gaussian probability density of x under N(mean, stdev**2)."""
    variance = math.pow(stdev, 2)
    exponent = -math.pow(x - mean, 2) / (2 * variance)
    return math.exp(exponent) / (math.sqrt(2 * math.pi) * stdev)
def calculateClassProbabilities(summaries, inputVector):
    """Score inputVector against every class's Gaussian feature summaries.

    Returns a dict mapping class value -> product of the per-feature
    Gaussian densities (the naive-Bayes likelihood, without priors).
    """
    probabilities = {}
    for classValue, featureStats in summaries.items():
        score = 1
        for position, (mu, sigma) in enumerate(featureStats):
            # A zero spread would divide by zero inside the density; swap
            # in a huge spread so the feature contributes a near-flat term.
            if sigma == 0.0:
                sigma = 10000.0
            score *= calculateProbability(inputVector[position], mu, sigma)
        probabilities[classValue] = score
    return probabilities
def predict(summaries, inputVector):
    """Return the class whose likelihood is highest for inputVector.

    Returns None when summaries is empty, as the original scan did.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    if not probabilities:
        return None
    # max() returns the first maximal key, which mirrors the original
    # strictly-greater-than comparison: ties go to the class seen first.
    return max(probabilities, key=probabilities.get)
def getPredictions(summaries, testSet):
    """Classify every row of the test DataFrame with the fitted summaries."""
    return [predict(summaries, row) for _, row in testSet.iterrows()]
def getAccuracy(testSet, predictions):
    """Return the percentage of predictions that match the true labels.

    testSet is read positionally via .iloc, so the shuffled index produced
    by train_test_split does not affect the comparison.
    """
    total = len(testSet)
    hits = sum(
        1
        for position in range(total)
        if testSet.iloc[position] == predictions[position]
    )
    return hits / float(total) * 100.0
In [84]:
filename = 'carData.csv'
# NOTE(review): read_csv treats the file's first row as the header, which is
# presumably why the class column is referenced as 'unacc' below — the first
# data record was likely consumed as column names. Verify against carData.csv
# (consider header=None with explicit column names).
dataset = pd.read_csv(filename)
# Integer-encode every column in place. LabelEncoder assigns codes to the
# sorted distinct values of each column, independently per column.
for i in range(0, dataset.shape[1]):
    dataset.iloc[:,i] = LabelEncoder().fit_transform(dataset.iloc[:,i])
# Separate the class target ('unacc' column) from the features.
Y = dataset.unacc.copy()
del dataset['unacc']
X = dataset
# 70/30 split; no random_state is set, so the split differs on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3)
# Keep a feature-only copy for scikit-learn before appending the class
# column, which the hand-made implementation expects as the last column.
X_train_sk = X_train.copy()
idx = X_train.shape[1]
new_col = Y_train
X_train.insert(loc=idx, column='Class', value=new_col)
X_train.head()
Out[84]:
In [85]:
# Fit the hand-made model: per-class (mean, stdev) for every feature column.
summaries = summarizeByClass(X_train)
Y_test.head()
Out[85]:
In [86]:
# Classify every test row with the hand-made Gaussian Naive Bayes.
y_pred = getPredictions(summaries, X_test)
In [87]:
# Evaluate the hand-made classifier.
acc = getAccuracy(Y_test, y_pred)
print('Hand-made Gaussian NB\nAccuracy = ',acc)
print("\nClassification Report:")
# LabelEncoder assigns codes alphabetically — acc=0, good=1, unacc=2,
# vgood=3 — so target_names must follow that order; the original
# ["unacc", "acc", "good", "vgood"] mislabeled every row of the report.
# Passing labels= explicitly keeps names aligned even when a rare class
# is missing from this particular test split.
print(classification_report(y_true=Y_test, y_pred=y_pred,
                            labels=[0, 1, 2, 3],
                            target_names=["acc", "good", "unacc", "vgood"]))
In [88]:
from sklearn.naive_bayes import GaussianNB

# Reference implementation: scikit-learn's GaussianNB on the same split.
# .values strips column names so fit/predict see plain numeric arrays.
naive = GaussianNB()
naive.fit(X_train_sk.values, Y_train.values)
y_pred_sk = naive.predict(X_test.values)
accSK = getAccuracy(Y_test, y_pred_sk)
print('SciKitLearn Gaussian NB\nAccuracy = ',accSK)
print("\nClassification Report:")
# target_names must follow LabelEncoder's alphabetical code order
# (acc=0, good=1, unacc=2, vgood=3); the original order mislabeled the
# report rows. labels= pins the alignment even if a class is absent here.
print(classification_report(y_true=Y_test, y_pred=y_pred_sk,
                            labels=[0, 1, 2, 3],
                            target_names=["acc", "good", "unacc", "vgood"]))
In [ ]: