Implement a Naive Bayes classifier for the problem of predicting the quality of a car. For this, we will use a car evaluation dataset available from the UCI repository. This car dataset has the following features and class:
Attributes: buying, maint, doors, persons, lug_boot, safety
Classes: unacc, acc, good, vgood
Create a version of your implementation using the Naive Bayes functions available in the scikit-learn library (see here)
Analyze the accuracy of the two algorithms and discuss your solution.
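As a reminder of what both implementations compute: Naive Bayes scores each class $c$ by Bayes' rule under the assumption that the attributes $x_1, \ldots, x_n$ are conditionally independent given the class:

$$P(c \mid x_1, \ldots, x_n) \propto P(c) \prod_{i=1}^{n} P(x_i \mid c)$$

The from-scratch version below models each $P(x_i \mid c)$ with a Gaussian fitted per attribute and class (and treats the prior $P(c)$ as uniform), while scikit-learn's MultinomialNB estimates count-based likelihoods.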
In [1]:
import pandas as pd
import sklearn as skt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
In [2]:
columns = ['buying','maint','doors','persons','lug_boot','safety','class']
dataset = pd.read_csv('carData.csv', header=None, names=columns, index_col=False)
print(dataset.head())
In [3]:
import random
def splitDataset(dataset, splitRatio):
    # Randomly move splitRatio of the instances into the training set;
    # whatever is left over becomes the test set.
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]
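A quick toy run just to show the return shape of splitDataset (the 0.67 ratio is arbitrary):

In [ ]:
# Five toy rows; int(5 * 0.67) = 3 rows go to train, the rest to test
rows = [[1], [2], [3], [4], [5]]
train, test = splitDataset(rows, 0.67)
print(len(train), len(test))  # 3 2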
In [4]:
import math
import csv

def loadCsv(filename):
    # Note: this assumes numeric attributes; the raw car data is
    # categorical, so it must be label-encoded before use.
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def separateByClass(dataset):
    # Group instances by their class label (last attribute)
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    # Sample standard deviation (n - 1 denominator)
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    # (mean, stdev) per attribute; drop the summary of the class column
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    # Gaussian density: (1 / (sqrt(2*pi) * sigma)) * exp(-(x - mu)^2 / (2 * sigma^2)).
    # Guard against zero variance (an attribute that is constant within a
    # class), which would otherwise divide by zero on this dataset.
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    # Naive independence assumption: multiply the per-attribute likelihoods
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    # Pick the class with the highest likelihood
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    # Percentage of test instances whose class matches the prediction
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0
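A quick sanity check of calculateProbability: the density of a standard normal at its mean should be $1/\sqrt{2\pi} \approx 0.3989$.

In [ ]:
print(calculateProbability(0, 0, 1))  # ≈ 0.3989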
In [5]:
# Encode every categorical column as integers
for i in range(0, dataset.shape[1]):
    dataset.iloc[:, i] = LabelEncoder().fit_transform(dataset.iloc[:, i])
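Note that LabelEncoder assigns codes in the sorted (alphabetical) order of the distinct values, not in their natural ordinal order, which is worth keeping in mind when a Gaussian is fitted over the codes. A standalone check on the buying values:

In [ ]:
# Codes follow sorted label order: high=0, low=1, med=2, vhigh=3
le = LabelEncoder()
print(le.fit_transform(['vhigh', 'high', 'med', 'low']))  # [3 0 2 1]
print(le.classes_)  # ['high' 'low' 'med' 'vhigh']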
In [6]:
dataset.head()
Out[6]:
In [7]:
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1], dataset.iloc[:,-1], test_size=0.2)
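For reproducibility it can help to pin the split; a minimal variant of the cell above (the random_state value 42 is arbitrary):

In [ ]:
# Fixing random_state makes the 80/20 split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=0.2, random_state=42)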
In [8]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# scikit-learn implementation
mult = MultinomialNB()
mult.fit(X_train.values,y_train.values)
Out[8]:
In [9]:
pred = mult.predict(X_test.values)
In [10]:
print(pred)
In [11]:
from sklearn.metrics import accuracy_score, classification_report
In [12]:
print(accuracy_score(y_true = y_test,y_pred = pred))
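Since GaussianNB is already imported alongside MultinomialNB, both scikit-learn variants can be compared on the same split; a minimal sketch:

In [ ]:
# Fit GaussianNB on the same train/test split for comparison
gauss = GaussianNB()
gauss.fit(X_train.values, y_train.values)
print(accuracy_score(y_true=y_test, y_pred=gauss.predict(X_test.values)))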
In [14]:
# Implementation based on the tutorial done in class.
# summarize expects a plain list of rows, not a DataFrame.
sumarizado = summarize(dataset.values.tolist())
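The cell above only summarizes the data; to actually compare the from-scratch classifier with scikit-learn's, the remaining tutorial steps still need to be wired together: split, summarize the training set by class, predict, and score. A minimal sketch, assuming the helper functions and the encoded dataset from the cells above (the 0.8 ratio mirrors the earlier test_size=0.2):

In [ ]:
rows = dataset.values.tolist()
trainSet, testSet = splitDataset(rows, 0.8)
summaries = summarizeByClass(trainSet)
predictions = getPredictions(summaries, testSet)
print(getAccuracy(testSet, predictions))  # accuracy in percent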
In [ ]: