In [1]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
# formula-notation API (R-style model formulas) used for the regression models
import statsmodels.formula.api as smf
# this allows plots to appear directly in the notebook
%matplotlib inline
In [4]:
# load the wine data set from a CSV file (path is relative to the notebook's working directory)
data = pd.read_csv('data/wine.csv')
# preview the first rows to check that the file was parsed as expected
data.head()
Out[4]:
In [5]:
# show the shape of the DataFrame as a (rows, columns) tuple
data.shape
Out[5]:
In [7]:
# visualize the relationship between the features and the response using scatterplots
# The figure size is set once, when the figure is created, rather than being passed
# through the first per-axes plot call; one loop replaces three copy-pasted calls.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, ['Year', 'WinterRain', 'AGST']):
    # each panel plots one feature against Price on the shared y axis
    data.plot(kind='scatter', x=feature, y='Price', ax=ax)
Out[7]:
In [8]:
# scatterplots of the remaining features against the response (Price)
# The figure size is set once, when the figure is created, rather than being passed
# through the first per-axes plot call; one loop replaces three copy-pasted calls.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, ['HarvestRain', 'Age', 'FrancePop']):
    # each panel plots one feature against Price on the shared y axis
    data.plot(kind='scatter', x=feature, y='Price', ax=ax)
Out[8]:
In [14]:
# fit an ordinary least squares model of Price on all six candidate predictors,
# using statsmodels' R-style formula notation (`smf` is imported in the imports cell)
# NOTE(review): Year and Age may be linearly dependent (age derived from vintage year) -
# confirm against the data; if so, their individual coefficients are not identifiable
lm = smf.ols(
    formula='Price ~ Year + WinterRain + AGST + HarvestRain + Age + FrancePop',
    data=data,
).fit()
# show the fitted coefficients
lm.params
Out[14]:
In [15]:
# R-squared of the model currently bound to `lm`
lm.rsquared
Out[15]:
In [16]:
# full regression summary table for the fitted model (rendered as the cell output)
lm.summary()
Out[16]:
In [19]:
# Refit with a smaller set of predictors: Year and FrancePop are left out.
# Note that this rebinds `lm`, so the cells below refer to this reduced model.
lm = smf.ols(
    formula='Price ~ AGST + HarvestRain + WinterRain + Age',
    data=data,
).fit()
In [20]:
# R-squared of the model currently bound to `lm`
lm.rsquared
Out[20]:
In [28]:
# in-sample predictions: apply the fitted model to the same DataFrame it was fitted on
ypred = lm.predict(data)
# display the predicted values
ypred
Out[28]:
In [24]:
# load a second CSV file; presumably held-out observations with the same columns - TODO confirm
wine_test = pd.read_csv('data/wine_test.csv')
In [27]:
# predictions of the fitted model for the rows of wine_test
lm.predict(wine_test)
Out[27]:
In [ ]:
In [ ]: