Aula 4 Video 1 - Conhecendo DecisionTreeRegressor


In [201]:
import pandas as pd
import numpy as np 
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Download Dataset: movies_multilinear_reg.csv


In [202]:
filmes = pd.read_csv('datasets/movies_multilinear_reg.csv')

In [203]:
filmes.head(5)


Out[203]:
movieId Titulo Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento Bilheteria
0 1 Toy Story (1995) 0 0 0 0 0 0 0 1 1 1 1 0 1 103.468310 11.048216 5623234.602
1 2 Jumanji (1995) 0 0 0 0 0 0 0 1 0 0 1 0 1 112.337916 14.927678 5714951.757
2 3 Grumpier Old Men (1995) 0 0 0 0 1 0 0 0 1 0 0 0 0 116.245732 27.114597 9524339.124
3 4 Waiting to Exhale (1995) 0 0 0 0 1 0 0 0 1 0 0 1 0 120.317732 4.994242 6331568.779
4 5 Father of the Bride Part II (1995) 0 0 0 0 0 0 0 0 1 0 0 0 0 110.023572 19.142246 6409617.277

In [204]:
filmes.shape


Out[204]:
(9125, 18)

In [205]:
filmes_caract = filmes[filmes.columns[2:17]]
filmes_labels = filmes[filmes.columns[17:]]

In [206]:
filmes_caract.head(5)


Out[206]:
Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento
0 0 0 0 0 0 0 0 1 1 1 1 0 1 103.468310 11.048216
1 0 0 0 0 0 0 0 1 0 0 1 0 1 112.337916 14.927678
2 0 0 0 0 1 0 0 0 1 0 0 0 0 116.245732 27.114597
3 0 0 0 0 1 0 0 0 1 0 0 1 0 120.317732 4.994242
4 0 0 0 0 0 0 0 0 1 0 0 0 0 110.023572 19.142246

In [207]:
filmes_labels.head(5)


Out[207]:
Bilheteria
0 5623234.602
1 5714951.757
2 9524339.124
3 6331568.779
4 6409617.277

In [208]:
treino, teste, treino_labels, teste_labels = train_test_split(filmes_caract, filmes_labels)

In [209]:
print('Shape do treino {}, Shape do teste {}'.format(treino.shape, teste.shape))


Shape do treino (6843, 15), Shape do teste (2282, 15)

In [210]:
porc_treino = len(treino)/len(filmes_caract)*100
porc_teste =  len(teste)/len(filmes_caract)*100
print('Proporção Treino / Teste: {0:.2f}% / {1:.2f}%'.format(porc_treino, porc_teste))


Proporção Treino / Teste: 74.99% / 25.01%

In [211]:
treino_arr = np.array(treino).reshape(len(treino),15)
teste_arr = np.array(teste).reshape(len(teste),15)

In [212]:
print('Tipo e shape do array treino {} - {}'.format(type(treino_arr), treino_arr.shape))


Tipo e shape do array treino <class 'numpy.ndarray'> - (6843, 15)

In [213]:
print('Tipo e shape do array teste {} - {}'.format(type(teste_arr), teste_arr.shape))


Tipo e shape do array teste <class 'numpy.ndarray'> - (2282, 15)

In [214]:
treino_labels_arr = np.array(treino_labels).reshape(len(treino_labels),1)
teste_labels_arr = np.array(teste_labels).reshape(len(teste_labels),1)

Usando modelo DecisionTreeRegressor


In [215]:
modelo = tree.DecisionTreeRegressor()
modelo.fit(treino_arr, treino_labels_arr)


Out[215]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [216]:
score_treino = modelo.score(treino_arr, treino_labels_arr)
score_teste = modelo.score(teste_arr, teste_labels_arr)
print('DecisionTreeRegressor: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'.
      format(score_treino * 100, score_teste * 100))


DecisionTreeRegressor: Acertamos 100.00% no treino, Acertamos 66.12% no teste

Usando modelo LinearRegression


In [217]:
modelo_reg = LinearRegression()
modelo_reg.fit(treino_arr, treino_labels_arr)


Out[217]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [218]:
score_treino = modelo_reg.score(treino_arr, treino_labels_arr)
score_teste = modelo_reg.score(teste_arr, teste_labels_arr)
print('LinearRegression: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'.
      format(score_treino * 100, score_teste * 100))


LinearRegression: Acertamos 82.61% no treino, Acertamos 83.25% no teste

Aula 4 Video 2 - Classificadores e métricas


In [219]:
modelo_depth5 = tree.DecisionTreeRegressor(max_depth=5)
modelo_depth5.fit(treino_arr, treino_labels_arr)


Out[219]:
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [220]:
score_max5_treino = modelo_depth5.score(treino_arr, treino_labels_arr)
score_max5_teste = modelo_depth5.score(teste_arr, teste_labels_arr)
print('DecisionTreeRegressor(max_depth=5): Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'.
      format(score_max5_treino * 100, score_max5_treino * 100))


DecisionTreeRegressor(max_depth=5): Acertamos 80.73% no treino, Acertamos 80.73% no teste

In [221]:
zootopia = [0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356]
modelo_depth5.predict([zootopia])


Out[221]:
array([ 7899402.77132227])

Gostos do usuário

Download Dataset: avaliacoes_usuario.csv


In [222]:
gostos = pd.read_csv('datasets/avaliacoes_usuario.csv')
gostos.head(5)


Out[222]:
Titulo Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento Gostou
0 Nixon (1995) 0 0 0 0 0 0 0 0 0 0 0 1 0 114.496547 7.930748 1
1 Leaving Las Vegas (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 110.140191 18.276555 1
2 Persuasion (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 105.747597 16.582232 1
3 Babe (1995) 0 0 0 0 0 0 0 0 0 0 1 1 0 126.131978 13.004553 1
4 Carrington (1995) 0 0 0 0 1 0 0 0 0 0 0 1 0 85.025469 14.418120 1

In [223]:
caract = gostos[gostos.columns[1:16]]
labels = gostos[gostos.columns[16:]]

In [224]:
treino, teste, treino_labels, teste_labels = train_test_split(caract, labels)

In [225]:
treino = np.array(treino).reshape(len(treino), 15)
teste = np.array(teste).reshape(len(teste), 15)
treino_labels = treino_labels.values.ravel()
teste_labels = teste_labels.values.ravel()

In [226]:
print('Tipos: treino {}, teste {}, treino_labels {}, teste_labels {}'.
      format(type(treino), type(teste), type(treino_labels), type(teste_labels)))


Tipos: treino <class 'numpy.ndarray'>, teste <class 'numpy.ndarray'>, treino_labels <class 'numpy.ndarray'>, teste_labels <class 'numpy.ndarray'>

Usando LogisticRegression


In [227]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [228]:
model = LogisticRegression()
model.fit(treino, treino_labels)


Out[228]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [229]:
previsoes = model.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com LogisticRegression no teste: {0:.2f}%'.format(acuracia * 100))


Acuracia com LogisticRegression no teste: 80.00%

Usando DecissionTreeClassifier


In [230]:
modelo = tree.DecisionTreeClassifier()
modelo.fit(treino, treino_labels)


Out[230]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [231]:
previsoes = modelo.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com DecisionTreeClassifier no teste: {0:.2f}%'.format(acuracia * 100))
previsoes = modelo.predict(treino)
acuracia = accuracy_score(treino_labels, previsoes)
print('Acuracia com DecisionTreeClassifier com treino: {0:.2f}%'.format(acuracia * 100))


Acuracia com DecisionTreeClassifier no teste: 70.00%
Acuracia com DecisionTreeClassifier com treino: 100.00%

Usando DecisionTreeClassifier(max_depth=5)


In [232]:
modelo_max5 = tree.DecisionTreeClassifier(max_depth=5)
modelo_max5.fit(treino, treino_labels)
previsoes = modelo_max5.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com DecisionTreeClassifier(max_depth=5) no teste: {0:.2f}%'.format(acuracia * 100))
previsoes = modelo_max5.predict(treino)
acuracia = accuracy_score(treino_labels, previsoes)
print('Acuracia com DecisionTreeClassifier(max_depth=5) com treino: {0:.2f}%'.format(acuracia * 100))


Acuracia com DecisionTreeClassifier(max_depth=5) no teste: 75.29%
Acuracia com DecisionTreeClassifier(max_depth=5) com treino: 85.63%

In [ ]: