Aula 2 Video 1 - A regressão linear múltipla


In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

Download Dataset: movies_multilinear_reg.csv


In [81]:
movies = pd.read_csv('datasets/movies_multilinear_reg.csv')

In [82]:
type(movies)


Out[82]:
pandas.core.frame.DataFrame

In [83]:
movies.head(10)


Out[83]:
movieId Titulo Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento Bilheteria
0 1 Toy Story (1995) 0 0 0 0 0 0 0 1 1 1 1 0 1 103.468310 11.048216 5623234.602
1 2 Jumanji (1995) 0 0 0 0 0 0 0 1 0 0 1 0 1 112.337916 14.927678 5714951.757
2 3 Grumpier Old Men (1995) 0 0 0 0 1 0 0 0 1 0 0 0 0 116.245732 27.114597 9524339.124
3 4 Waiting to Exhale (1995) 0 0 0 0 1 0 0 0 1 0 0 1 0 120.317732 4.994242 6331568.779
4 5 Father of the Bride Part II (1995) 0 0 0 0 0 0 0 0 1 0 0 0 0 110.023572 19.142246 6409617.277
5 6 Heat (1995) 0 0 0 0 0 1 1 0 0 0 0 0 0 97.324845 9.977311 4956557.317
6 7 Sabrina (1995) 0 0 0 0 1 0 0 0 1 0 0 0 0 88.978893 14.257461 4654565.066
7 8 Tom and Huck (1995) 0 0 0 0 0 0 0 0 0 0 1 0 1 78.536011 8.871800 3950017.325
8 9 Sudden Death (1995) 0 0 0 0 0 0 0 0 0 0 0 0 0 104.934703 29.112800 6850971.551
9 10 GoldenEye (1995) 0 0 0 0 0 1 0 0 0 0 0 0 1 105.899475 3.695241 5157865.850

In [84]:
movies.shape


Out[84]:
(9125, 18)

In [85]:
filmes_independente = movies[movies.columns[2:17]]

In [86]:
type(filmes_independente)


Out[86]:
pandas.core.frame.DataFrame

In [87]:
filmes_dependente = movies[movies.columns[17:]]

In [88]:
type(filmes_dependente)


Out[88]:
pandas.core.frame.DataFrame

In [89]:
train, test, train_bilheteria, test_bilheteria = train_test_split(filmes_independente, filmes_dependente)

In [90]:
train.head()


Out[90]:
Documentary Sci-Fi Mystery Horror Romance Thriller Crime Fantasy Comedy Animation Children Drama Adventure Duracao Investimento
7505 0 0 0 0 0 0 1 0 1 0 0 0 0 111.247325 41.496397
909 0 0 0 0 0 0 0 0 0 0 0 1 0 129.757430 3.751061
8308 0 0 0 0 0 0 0 0 1 0 0 0 0 139.792106 32.107787
416 0 0 0 0 1 0 0 0 1 0 0 0 0 120.752073 6.473384
4728 0 0 0 0 0 0 0 0 0 0 0 0 0 96.975762 4.128282

In [91]:
train.shape[0]


Out[91]:
6843

In [92]:
test.shape[0]


Out[92]:
2282

In [93]:
modelo = LinearRegression()
modelo.fit(train,train_bilheteria)


Out[93]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

1,Toy Story (1995),0,0,0,0,0,0,0,1,1,1,1,0,1,103.4683096,11.04821649,5623234.602


In [94]:
modelo.predict([[0,0,0,0,0,0,0,1,1,1,1,0,1,103.4683096,11.04821649]])


Out[94]:
array([[ 5828627.96965683]])

In [95]:
modelo.score(train, train_bilheteria)


Out[95]:
0.82728993267807627

In [96]:
modelo.coef_


Out[96]:
array([[ 42655.97503496,   6338.33598592, -13818.76605525, -23915.64582939,
         24775.47891531,  15437.82939179,  13901.13435994,  17001.12069112,
         -5758.97665963,  14581.77792254,  12879.50700613, -10960.33103084,
        -61189.27789748,  45674.3003098 ,  98955.21832726]])

In [97]:
modelo.intercept_


Out[97]:
array([ 31992.49848156])

In [98]:
modelo.score(test, test_bilheteria)


Out[98]:
0.82868767262936649

Zootopia

movieId,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Bilheteria
9999999,Zootopia,0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356,?????

In [99]:
zootopia = [0,0,0,0,0,0,0,1,1,1,1,0,1,110,27.74456356]
modelo.predict([zootopia])


Out[99]:
array([[ 7779149.0280967]])

Split dos dados com test_size=0.3


In [100]:
train, test, train_bilheteria, test_bilheteria = train_test_split(filmes_independente, filmes_dependente, test_size=0.3)

In [101]:
modelo_30 = LinearRegression()
modelo_30.fit(train,train_bilheteria)


Out[101]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [102]:
modelo_30.score(test, test_bilheteria)


Out[102]:
0.8324125338489522

In [103]:
zootopia = [0,0,0,0,0,0,0,0,1,1,1,0,1,145.5170642,3.451632127]
modelo.predict([zootopia])


Out[103]:
array([[ 6980452.63003705]])

In [104]:
planeta_macaco = [0,1,0,0,0,0,0,0,0,0,0,0,0,150,5]
modelo.predict([planeta_macaco])


Out[104]:
array([[ 7384251.97257379]])