In [2]:
from collections import defaultdict
from functools import reduce
import math
class NaiveBayes:
    """Multinomial Naive Bayes classifier over categorical (string) features,
    using add-one (Laplace) smoothing and log-space accumulation to avoid
    floating-point underflow."""

    def __init__(self):
        # freqFeature[feature] = number of occurrences of each feature value
        self.freqFeature = defaultdict(int)
        # freqLabel[label] = number of training rows carrying each class label
        self.freqLabel = defaultdict(int)
        # condFreqFeature[label][feature] = occurrences of feature within rows of that label
        self.condFreqFeature = defaultdict(lambda: defaultdict(int))

    def countFrequencies(self):
        """Count global feature and label frequencies over the stored training set.

        Note: the original built one huge concatenated list with
        reduce(lambda x, y: x + y, ...), which is O(n^2) and raises on an
        empty training set; nested loops count identically in O(n).
        """
        for features in self.dataSet_x:
            for f in features:
                self.freqFeature[f] += 1
        for label in self.dataSet_y:
            self.freqLabel[label] += 1

    def countCondFrequencies(self):
        """Count feature frequencies conditioned on each class label."""
        for features, label in zip(self.dataSet_x, self.dataSet_y):
            for f in features:
                self.condFreqFeature[label][f] += 1

    def train(self, dataSet_x, dataSet_y):
        """Store the training data and build the frequency tables.

        dataSet_x: list of rows, each row a list of feature values (strings).
        dataSet_y: list of class labels, aligned with dataSet_x by index.
        """
        self.dataSet_x = dataSet_x
        self.dataSet_y = dataSet_y
        self.countFrequencies()
        self.countCondFrequencies()

    def probLikelihood(self, f, l, vocabulary):
        """Smoothed conditional probability P(f | l).

        P(f | l) = (freq(f | l) + 1) / (freq(l) + vocabulary)
        where `vocabulary` is the number of distinct feature values, so an
        unseen feature never yields probability zero.
        """
        laplace = 1
        condFreq = self.condFreqFeature[l][f]
        return float(condFreq + laplace) / (self.freqLabel[l] + vocabulary)

    def predict(self, dataSet_x):
        """Return, for each row in dataSet_x, a dict mapping each class label
        to its (unnormalized) posterior probability.

        Posterior is computed in log space — sum(log P(f|l)) + log P(l) — and
        exponentiated at the end, to avoid underflow when multiplying many
        small probabilities.
        """
        probs = []
        totalTuples = len(self.dataSet_y)
        vocabulary = len(self.freqFeature)
        # Fix: iterate each distinct label once; the original looped over every
        # training row, redoing the same computation O(rows) times per prediction.
        labels = set(self.dataSet_y)
        for index, row in enumerate(dataSet_x):
            probs.append(defaultdict(float))
            for label in labels:
                logProb = 0.0
                for f in row:
                    logProb += math.log(self.probLikelihood(f, label, vocabulary))
                # Fix: the prior must be added in log space. The original added
                # the raw ratio to a sum of logs — and under Python 2 the
                # integer division made it 0, dropping the prior entirely.
                logProb += math.log(float(self.freqLabel[label]) / totalTuples)
                probs[index][label] = math.exp(logProb)
        return probs
In [4]:
import random
import math
# Car dataset
# Attribute Information:
#
# Class Values:
#
# unacc, acc, good, vgood
#
# Attributes:
#
# buying: vhigh, high, med, low.
# maint: vhigh, high, med, low.
# doors: 2, 3, 4, 5more.
# persons: 2, 4, more.
# lug_boot: small, med, big.
# safety: low, med, high.
#Returns the dataset
def readFile(path):
    """Read the car dataset at `path` into a list of feature rows.

    Each line is comma-separated: buying,maint,doors,persons,lug_boot,safety,class.
    A column-specific suffix is appended to every value so that identical raw
    values appearing in different columns (e.g. 'med') do not collide when
    counting frequencies.

    Returns a list of lists of suffixed strings; the last element of each row
    is the class label.
    """
    suffix = ['_buy', '_maint', '_doors', '_pers', '_lug', '_safety', '_class']
    dataset = []
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(path, 'r') as rawDataset:
        for line in rawDataset:
            values = line.rstrip('\n').split(',')
            # Fix: the original used `lambda (x, y): x + y`, Python-2-only
            # tuple-parameter syntax (removed by PEP 3113), and appended a lazy
            # map object under Python 3, which breaks later slicing/indexing.
            # Build a concrete list instead.
            dataset.append([value + suf for value, suf in zip(values, suffix)])
    return dataset
def main():
    """Train a NaiveBayes classifier on 70% of the car dataset and print the
    accuracy measured on the remaining 30%.

    Each run shuffles the data, so the reported accuracy varies between runs.
    """
    preparedDataset = readFile('carData')
    # Randomize the rows so every execution evaluates a different split
    random.shuffle(preparedDataset)
    # dataSet_x[i] holds the feature values of row i; dataSet_y[i] its class label
    dataSet_x = []
    dataSet_y = []
    for row in preparedDataset:
        dataSet_x.append(row[:-1])
        dataSet_y.append(row[-1])
    # 70/30 train/test split
    nTuples = len(dataSet_x)
    nToTrain = int(math.floor(nTuples * 0.7))
    dataSet_x_train = dataSet_x[:nToTrain]
    dataSet_y_train = dataSet_y[:nToTrain]
    dataSet_x_test = dataSet_x[nToTrain:]
    dataSet_y_test = dataSet_y[nToTrain:]
    naive = NaiveBayes()
    # naive.train(feature rows, class labels)
    naive.train(dataSet_x_train, dataSet_y_train)
    # naive.predict(rows to classify -> features only)
    results = naive.predict(dataSet_x_test)
    # Score the model: count how many rows were predicted correctly
    accuracy = 0.0
    for index, r in enumerate(results):
        yPredicted = max(r, key=r.get)
        if dataSet_y_test[index] == yPredicted:
            accuracy += 1.0
    # Fix: Python-2 `print` statement replaced by the print() function,
    # consistent with the Python-3 porting note elsewhere in this notebook
    print(accuracy / len(dataSet_y_test))
main()
In [ ]: