In [1]:
    
# imports
import pandas as pd
import matplotlib.pyplot as plt
# this allows plots to appear directly in the notebook
%matplotlib inline
    
In [4]:
    
# Read the wine data set from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/wine.csv')
data.head(n=5)
    
    Out[4]:
In [5]:
    
# Dimensions of the DataFrame as a (rows, columns) tuple; being the cell's
# last expression, it is shown as the cell output (no print() needed).
data.shape
    
    Out[5]:
In [7]:
    
# Visualize the relationship between each feature and the response (Price)
# with one scatter panel per feature.
# - The figure size is set once, when the figure is created, instead of as a
#   side effect of the first pandas plot call.
# - sharey=True gives every panel the same Price axis so they are comparable.
# - Ending the cell on a loop keeps the Axes repr out of the cell output.
features = ['Year', 'WinterRain', 'AGST']
fig, axs = plt.subplots(1, len(features), sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, features):
    data.plot(kind='scatter', x=feature, y='Price', ax=ax)
    
    Out[7]:
    
In [8]:
    
# Scatter the remaining features against the response (Price), one panel per
# feature on a shared Price axis.
# The figure size is passed to plt.subplots so the figure is created at its
# final size rather than being resized by the first pandas plot call; looping
# over the feature names replaces three copy-pasted calls and keeps the Axes
# repr out of the cell output.
features = ['HarvestRain', 'Age', 'FrancePop']
fig, axs = plt.subplots(1, len(features), sharey=True, figsize=(16, 8))
for ax, feature in zip(axs, features):
    data.plot(kind='scatter', x=feature, y='Price', ax=ax)
    
    Out[8]:
    
In [14]:
    
# statsmodels' formula interface accepts R-style model formulas.
import statsmodels.formula.api as smf

# Ordinary least squares of Price on all six candidate predictors: build the
# model from the formula, then fit it.
# NOTE(review): Year and Age may be perfectly collinear if Age is derived
# from the vintage year - TODO confirm; if so their individual coefficients
# are not separately identifiable.
lm = (
    smf.ols('Price ~ Year + WinterRain + AGST + HarvestRain + Age + FrancePop', data=data)
    .fit()
)

# Estimated intercept and slope coefficients, shown as the cell output.
lm.params
    
    Out[14]:
In [15]:
    
# R-squared of the model currently bound to `lm`. The model was fit on `data`,
# so this is an in-sample figure, not an out-of-sample score.
lm.rsquared
    
    Out[15]:
In [16]:
    
# Full statsmodels regression report for `lm`; presumably the basis for
# choosing which predictors to keep in the reduced model fitted later.
lm.summary()
    
    Out[16]:
In [19]:
    
# Refit using a smaller number of predictors (Year and FrancePop are dropped
# relative to the six-predictor formula).
# NOTE: this rebinds `lm`, so every later cell sees the reduced model and the
# earlier six-predictor fit is no longer reachable by name.
lm = (
    smf.ols('Price ~ AGST + HarvestRain + WinterRain + Age', data=data)
    .fit()
)
    
In [20]:
    
# R-squared of the reduced four-predictor model: `lm` was rebound in the
# previous cell, so this is no longer the six-predictor fit's value.
lm.rsquared
    
    Out[20]:
In [28]:
    
# In-sample predictions: apply the current model to the same frame it was
# fit on, keep them in `ypred`, and show them as the cell output.
ypred = lm.predict(exog=data)
ypred
    
    Out[28]:
In [24]:
    
# Load the separate test file into its own DataFrame.
wine_test = pd.read_csv(filepath_or_buffer='data/wine_test.csv')
    
In [27]:
    
# Predicted prices for the rows loaded from data/wine_test.csv, using the
# model currently bound to `lm`.
lm.predict(exog=wine_test)
    
    Out[27]:
In [ ]:
    
    
In [ ]: