Aula 3 Video 1 - Entendendo os dados e o problema


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

Download Dataset: avaliacoes_usuario.csv


In [2]:
movies = pd.read_csv('datasets/avaliacoes_usuario.csv')

In [3]:
movies.head(5)


Out[3]:
Titulo Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento Gostou
0 Nixon (1995) 0 0 0 0 0 0 0 0 0 0 0 1 0 114.496547 7.930748 1
1 Leaving Las Vegas (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 110.140191 18.276555 1
2 Persuasion (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 105.747597 16.582232 1
3 Babe (1995) 0 0 0 0 0 0 0 0 0 0 1 1 0 126.131978 13.004553 1
4 Carrington (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 85.025469 14.418120 1

In [4]:
Counter(movies['Gostou'])


Out[4]:
Counter({0: 135, 1: 543})

In [5]:
print('Gostou {}'.format((543/(135+543))))


Gostou 0.8008849557522124

In [6]:
print('Não gostou {}'.format((135/(135+543))))


Não gostou 0.19911504424778761

In [7]:
caract = movies[movies.columns[1:16]]
gostos = movies[movies.columns[16:]]

In [8]:
#treino, teste, treino_labels, teste_labels = train_test_split(caract, gostos, test_size=0.1)
treino, teste, treino_labels, teste_labels = train_test_split(caract, gostos)

In [9]:
type(treino)


Out[9]:
pandas.core.frame.DataFrame

In [10]:
Counter(treino_labels['Gostou'])


Out[10]:
Counter({0: 100, 1: 408})

In [11]:
Counter(teste_labels['Gostou'])


Out[11]:
Counter({0: 35, 1: 135})

In [12]:
print('Gostou % {} do treino_label (deve manter a proporção +- 80%)'.format(404/(104+404)))


Gostou % 0.7952755905511811 do treino_label (deve manter a proporção +- 80%)

In [13]:
print('Gostou % {} do teste_label (deve manter a proporção +- 80%)'.format(139/(31+139)))


Gostou % 0.8176470588235294 do teste_label (deve manter a proporção +- 80%)

Aula 3 Video 2 - A regressão logística


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [15]:
type(treino)


Out[15]:
pandas.core.frame.DataFrame

In [16]:
treino[0:1]


Out[16]:
Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento
344 0 0 0 1 0 0 0 0 0 0 0 0 0 131.165837 39.479492

In [17]:
treino = np.array(treino).reshape(len(treino), 15)
teste = np.array(teste).reshape(len(teste), 15)

In [18]:
type(treino)


Out[18]:
numpy.ndarray

In [19]:
treino[0]


Out[19]:
array([   0.        ,    0.        ,    0.        ,    1.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,  131.1658368 ,   39.47949166])

In [20]:
type(treino_labels)


Out[20]:
pandas.core.frame.DataFrame

In [21]:
treino_labels.head(5)


Out[21]:
Gostou
344 1
290 1
491 1
253 1
511 1

In [22]:
treino_labels = treino_labels.values.ravel()
teste_labels = teste_labels.values.ravel()

In [23]:
type(treino_labels)


Out[23]:
numpy.ndarray

In [24]:
treino_labels[0:5]


Out[24]:
array([1, 1, 1, 1, 1])

In [25]:
treino_labels.shape


Out[25]:
(508,)

Usando regressão logistica


In [26]:
modelo = LogisticRegression()
modelo.fit(treino, treino_labels)


Out[26]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
previsoes = modelo.predict(teste)
previsoes


Out[27]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1])

In [28]:
teste_labels.shape


Out[28]:
(170,)

In [29]:
previsoes.shape


Out[29]:
(170,)

In [30]:
acuracia = accuracy_score(teste_labels, previsoes)
acuracia


Out[30]:
0.79411764705882348

In [31]:
zootopia = [0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356]
modelo.predict([zootopia])


Out[31]:
array([1])

Usando Multinomial Naive Bayes


In [32]:
modelo_NB = MultinomialNB()

In [33]:
modelo_NB.fit(treino,treino_labels)


Out[33]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
previsoes_NB = modelo_NB.predict(teste)

In [35]:
acuracia = accuracy_score(teste_labels, previsoes_NB)
acuracia


Out[35]:
0.78235294117647058

In [36]:
zootopia = [0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356]
modelo_NB.predict([zootopia])


Out[36]:
array([1])

EXTRA: Usando Gaussian Naive Bayes


In [37]:
modelo_GNB = GaussianNB()

In [38]:
modelo_GNB.fit(treino,treino_labels)


Out[38]:
GaussianNB(priors=None)

In [39]:
previsoes_GNB = modelo_GNB.predict(teste)

In [40]:
acuracia = accuracy_score(teste_labels, previsoes_GNB)
acuracia


Out[40]:
0.69411764705882351

In [41]:
zootopia = [0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356]
modelo_GNB.predict([zootopia])


Out[41]:
array([1])