Topic: Challenge Set 5
Subject: Linear Regression and Train/Test Split
Date: 02/07/2017
Name: Prashant Tatineni
In [97]:
import pandas as pd
import patsy
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import train_test_split
%matplotlib inline
In [3]:
# Read the movies CSV; the path is resolved relative to the kernel's working directory.
csv_path = '2013_movies.csv'
df = pd.read_csv(csv_path)
In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)
Out[4]:
In [5]:
# Build the response (y) and design matrix (X, including an Intercept column)
# from a patsy formula, both returned as DataFrames.
formula = 'DomesticTotalGross ~ Budget + Runtime'
y, X = patsy.dmatrices(formula, data=df, return_type="dataframe")
In [10]:
# Preview the first five rows of the design matrix.
X.head(n=5)
Out[10]:
Challenge 1
In [13]:
# Baseline: regress domestic gross on the Intercept column alone.
model = sm.OLS(endog=y, exog=X['Intercept'])
fit = model.fit()
fit.summary()
Out[13]:
This intercept-only model represents the null hypothesis: it predicts the same value (the mean domestic gross) for every movie.
In [24]:
# Observation index used as the x-axis for the scatter plots. It is derived
# from y rather than hard-coded, so it stays in sync with however many rows
# the design-matrix construction produced.
records = range(len(y))
In [32]:
# Observed gross (green) and the baseline model's fitted values (default
# colour), both plotted against the observation index.
baseline_pred = fit.predict(X['Intercept'])
plt.scatter(records, y, color='g')
plt.scatter(records, baseline_pred)
Out[32]:
In [51]:
# Histogram of residuals (observed minus fitted) for the baseline model.
baseline_resid = y['DomesticTotalGross'] - fit.predict(X['Intercept'])
plt.hist(baseline_resid);
Challenge 2
In [58]:
# Regress domestic gross on Budget, keeping the Intercept column.
budget_cols = ['Intercept', 'Budget']
model = sm.OLS(endog=y, exog=X[budget_cols])
fit = model.fit()
fit.summary()
Out[58]:
In [59]:
# Observed gross (green) and fitted values (default colour) against Budget.
budget_pred = fit.predict(X[['Intercept','Budget']])
plt.scatter(X['Budget'], y, color='g')
plt.scatter(X['Budget'], budget_pred)
Out[59]:
In [61]:
# Residuals plotted against Budget. Residuals are computed as observed minus
# fitted, the same sign convention the residual histograms in this notebook
# use, so the plots can be compared directly.
plt.scatter(X['Budget'], y['DomesticTotalGross'] - fit.predict(X[['Intercept','Budget']]))
Out[61]:
For higher-budget, higher-grossing movies, the data are more spread out and the model's residuals are larger in magnitude.
Challenge 3
In [66]:
# Build response and design matrix using Rating as the only predictor.
formula3 = 'DomesticTotalGross ~ Rating'
y3, X3 = patsy.dmatrices(formula3, data=df, return_type="dataframe")
In [72]:
# Preview the first five rows of the Rating design matrix.
X3.head(n=5)
Out[72]:
In [83]:
# Regress domestic gross on the Rating design matrix.
model = sm.OLS(endog=y3, exog=X3)
fit = model.fit()
fit.summary()
Out[83]:
In [89]:
# Observation index for the Rating model's plots, sized from y3 itself rather
# than a hard-coded row count.
records3 = range(len(y3))
In [90]:
# Observed gross (green) and the Rating model's fitted values (default colour)
# against the observation index.
rating_pred = fit.predict(X3)
plt.scatter(records3, y3, color='g')
plt.scatter(records3, rating_pred)
Out[90]:
In [91]:
# Histogram of residuals (observed minus fitted) for the Rating model.
rating_resid = y3['DomesticTotalGross'] - fit.predict(X3)
plt.hist(rating_resid);
Here, the model uses only the 'Rating' category to predict domestic gross. Since there are 4 ratings, it predicts one of 4 domestic gross values (the mean gross for each rating).
Challenge 4
In [92]:
# Build response and design matrix with Budget, Runtime and Rating as predictors.
formula4 = 'DomesticTotalGross ~ Budget + Runtime + Rating'
y4, X4 = patsy.dmatrices(formula4, data=df, return_type="dataframe")
In [93]:
# Preview the first five rows of the full design matrix.
X4.head(n=5)
Out[93]:
In [94]:
# Regress domestic gross on the full design matrix (Budget, Runtime, Rating).
model = sm.OLS(endog=y4, exog=X4)
fit = model.fit()
fit.summary()
Out[94]:
In [96]:
# Observed gross (green) and fitted values (default colour) against the
# observation index. The index is sized from y4 itself instead of reusing
# `records`, which was built for a different design matrix and would only
# work while the two happen to have the same number of rows.
records4 = range(len(y4))
plt.scatter(records4, y4, color='g')
plt.scatter(records4, fit.predict(X4))
Out[96]:
Challenge 5
In [98]:
# Hold out 25% of the rows as a test set. A fixed random_state makes the
# split, and therefore the fitted coefficients and plots that follow,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size=0.25, random_state=42)
In [99]:
# Show the (rows, columns) dimensions of the held-out response.
y_test.values.shape
Out[99]:
In [100]:
# Fit the full model on the training split only.
model = sm.OLS(endog=y_train, exog=X_train)
fit = model.fit()
fit.summary()
Out[100]:
In [101]:
# Observation index for the test-set plot, sized from y_test itself so it
# follows the split rather than a hard-coded row count.
records5 = range(len(y_test))
In [102]:
# Held-out observed gross (green) and the model's predictions on the test
# design matrix (default colour), against the test-set observation index.
test_pred = fit.predict(X_test)
plt.scatter(records5, y_test, color='g')
plt.scatter(records5, test_pred)
Out[102]: