Naive Bayes - Trabalho

Questão 1

Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no UCI. Este dataset de carros possui as seguintes features e classe:

Attributos

  1. buying: vhigh, high, med, low
  2. maint: vhigh, high, med, low
  3. doors: 2, 3, 4, 5, more
  4. persons: 2, 4, more
  5. lug_boot: small, med, big
  6. safety: low, med, high

Classes

  1. unacc, acc, good, vgood

Questão 2

Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes (veja aqui)

Questão 3

Analise a acurácia dos dois algoritmos e discuta a sua solução.


In [1]:
import pandas as pd
import sklearn as skt
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split


/home/bruno/miniconda3/envs/data-science/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
columns = ['buying','maint','doors','persons','lug_boot','safety','class']
dataset = pd.read_csv('carData.csv' , header = None, names = columns, index_col=False)
print(dataset.head())


  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc

In [3]:
import random
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [4]:
import math
import csv
 
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [5]:
for i in range(0, dataset.shape[1]):
    dataset.iloc[:,i] = LabelEncoder().fit_transform(dataset.iloc[:,i])

In [6]:
dataset.head


Out[6]:
<bound method NDFrame.head of       buying  maint  doors  persons  lug_boot  safety  class
0          3      3      0        0         2       1      2
1          3      3      0        0         2       2      2
2          3      3      0        0         2       0      2
3          3      3      0        0         1       1      2
4          3      3      0        0         1       2      2
5          3      3      0        0         1       0      2
6          3      3      0        0         0       1      2
7          3      3      0        0         0       2      2
8          3      3      0        0         0       0      2
9          3      3      0        1         2       1      2
10         3      3      0        1         2       2      2
11         3      3      0        1         2       0      2
12         3      3      0        1         1       1      2
13         3      3      0        1         1       2      2
14         3      3      0        1         1       0      2
15         3      3      0        1         0       1      2
16         3      3      0        1         0       2      2
17         3      3      0        1         0       0      2
18         3      3      0        2         2       1      2
19         3      3      0        2         2       2      2
20         3      3      0        2         2       0      2
21         3      3      0        2         1       1      2
22         3      3      0        2         1       2      2
23         3      3      0        2         1       0      2
24         3      3      0        2         0       1      2
25         3      3      0        2         0       2      2
26         3      3      0        2         0       0      2
27         3      3      1        0         2       1      2
28         3      3      1        0         2       2      2
29         3      3      1        0         2       0      2
...      ...    ...    ...      ...       ...     ...    ...
1698       1      1      2        2         0       1      2
1699       1      1      2        2         0       2      1
1700       1      1      2        2         0       0      3
1701       1      1      3        0         2       1      2
1702       1      1      3        0         2       2      2
1703       1      1      3        0         2       0      2
1704       1      1      3        0         1       1      2
1705       1      1      3        0         1       2      2
1706       1      1      3        0         1       0      2
1707       1      1      3        0         0       1      2
1708       1      1      3        0         0       2      2
1709       1      1      3        0         0       0      2
1710       1      1      3        1         2       1      2
1711       1      1      3        1         2       2      0
1712       1      1      3        1         2       0      1
1713       1      1      3        1         1       1      2
1714       1      1      3        1         1       2      1
1715       1      1      3        1         1       0      3
1716       1      1      3        1         0       1      2
1717       1      1      3        1         0       2      1
1718       1      1      3        1         0       0      3
1719       1      1      3        2         2       1      2
1720       1      1      3        2         2       2      0
1721       1      1      3        2         2       0      1
1722       1      1      3        2         1       1      2
1723       1      1      3        2         1       2      1
1724       1      1      3        2         1       0      3
1725       1      1      3        2         0       1      2
1726       1      1      3        2         0       2      1
1727       1      1      3        2         0       0      3

[1728 rows x 7 columns]>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1], dataset.iloc[:,-1], test_size=0.2)

In [8]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
#Implementação do sklearn
mult = MultinomialNB()
mult.fit(X_train.values,y_train.values)


Out[8]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
pred = mult.predict(X_test.values)

In [10]:
print(pred)


[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2]

In [11]:
from sklearn.metrics import accuracy_score, classification_report

In [12]:
print(accuracy_score(y_true = y_test,y_pred = pred))


0.679190751445

In [14]:
#Implementação baseada no tutorial feito em sala de aula
sumarizado = summarize(dataset)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-14-0b59a7fcbec8> in <module>()
      1 #Implementação baseada no tutorial feito em sala de aula
----> 2 sumarizado = summarize(dataset)

<ipython-input-4-6f93b1d8380f> in summarize(dataset)
     27 
     28 def summarize(dataset):
---> 29     summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
     30     del summaries[-1]
     31     return summaries

<ipython-input-4-6f93b1d8380f> in <listcomp>(.0)
     27 
     28 def summarize(dataset):
---> 29     summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
     30     del summaries[-1]
     31     return summaries

<ipython-input-4-6f93b1d8380f> in mean(numbers)
     19 
     20 def mean(numbers):
---> 21     return sum(numbers)/float(len(numbers))
     22 
     23 def stdev(numbers):

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [ ]: