In [1]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm, lognorm, bernoulli
from scipy.stats.mstats import mquantiles
%matplotlib inline
plt.style.use('ggplot')
In [36]:
# simulate some data - textbook conditions (linear signal, i.i.d. standard-normal noise), not real-world...
b0, b1 = 1, 5
x = np.linspace(1, 5, 1000)
eps = norm(loc=0, scale=1).rvs(1000)
y = b0 + b1 * x + eps
plt.scatter(x, y);
In [37]:
X = np.vstack([np.ones(1000), x]).T
In [38]:
# solve the normal equations directly: w = (X'X)^-1 X'y
w = np.linalg.inv(X.T @ X) @ X.T @ y
print(w)
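Inverting X'X works fine at this size but is numerically fragile in general; a least-squares solver gives the same coefficients without forming the inverse explicitly:
In [ ]:
# same coefficients via a least-squares solve - no explicit inverse
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(w_lstsq)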
In [39]:
lm = sm.OLS(exog=X, endog=y).fit() # sm.OLS takes (endog, exog) positionally, i.e. y first; naming the arguments avoids mixing them up
lm.summary()
Out[39]:
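As a sanity check, the statsmodels estimates should match the normal-equations solution computed above:
In [ ]:
# statsmodels estimates vs. the closed-form solution
print(lm.params)
print(np.allclose(lm.params, w))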
In [40]:
# heavy-tailed, asymmetric noise: lognormal magnitudes with a 70/30 random sign
eps_ln = lognorm(s=1.3).rvs(1000)
sgn = bernoulli(p=.7).rvs(1000)
sgn[sgn == 0] = -1
y_ln = b0 + b1 * x + sgn * eps_ln
plt.scatter(x, y_ln);
In [41]:
print(np.linalg.inv(X.T @ X) @ X.T @ y_ln) # closed-form fit on the non-Gaussian data
In [42]:
lm2 = sm.OLS(y_ln, X).fit()
lm2.summary()
Out[42]:
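With heavy-tailed, asymmetric errors like these, a robust fit is one way to limit the influence of outliers; a minimal sketch using statsmodels' RLM with the Huber norm:
In [ ]:
# robust alternative: Huber weighting downweights extreme residuals
rlm = sm.RLM(y_ln, X, M=sm.robust.norms.HuberT()).fit()
print(rlm.params)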
In [43]:
# residual plot - do the residuals look random across x?
plt.scatter(X[:, 1], lm2.resid);
#plt.ylim(-25, 25);
In [44]:
# Q-Q plot: predicted quantiles vs. actual quantiles
preds = lm2.predict(X)
qs = np.arange(0, 1, 0.01)
true_q = mquantiles(y_ln, prob=qs)
pred_q = mquantiles(preds, prob=qs)
plt.plot(true_q, true_q)  # y = x reference line
plt.scatter(true_q, pred_q);
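For checking the residuals themselves against a normal reference, statsmodels also ships a ready-made Q-Q plot:
In [ ]:
# built-in Q-Q plot of residuals against a fitted normal
sm.qqplot(lm2.resid, line='45', fit=True);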
In [45]:
# residuals vs fitted
plt.scatter(preds, lm2.resid)
plt.ylim(-40, 40);
In [46]:
# influence diagnostics - Cook's distance for each observation
influence = lm2.get_influence()
(c, p) = influence.cooks_distance  # distances and corresponding p-values
plt.scatter(range(len(c)), c)
plt.title("Cook's Distance");
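statsmodels can also draw a combined influence plot (studentized residuals vs. leverage, bubbles sized by Cook's distance), packing these diagnostics into one figure:
In [ ]:
# one-figure summary of leverage, residual size, and Cook's distance
sm.graphics.influence_plot(lm2);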
In [6]:
diamonds = sm.datasets.get_rdataset('diamonds', 'ggplot2')
In [7]:
print(diamonds.__doc__)
In [8]:
df = diamonds.data # extract the data - this returns a Pandas dataframe
In [9]:
df.describe(include='all')
Out[9]:
In [10]:
plt.scatter(df.carat, df.price, alpha=0.3)
plt.xlim(0, 6)
plt.ylim(0, 20000);
In [11]:
# prepare the data for modeling
ones = np.ones(df.shape[0])
carats = np.array(df['carat'])
X = np.vstack([ones, carats]).T
y = np.array(df['price'])
In [12]:
lm = sm.OLS(y, X)
results = lm.fit()
In [13]:
results.summary()
Out[13]:
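The fitted results can also report interval estimates; get_prediction attaches confidence and prediction intervals to the fitted values (95% level here):
In [ ]:
# confidence and prediction intervals around the fitted values
results.get_prediction().summary_frame(alpha=0.05).head()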
In [20]:
# alternative - the 'formula' API, which mirrors R's formula syntax
# convenient for specifying models (including categoricals and transforms) as strings
import statsmodels.formula.api as smf
lm2 = smf.ols('price~carat', data=df)
In [ ]:
lm2.fit().summary()
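patsy formulas can apply transformations inline, so the log-log model built later in this notebook could be specified directly; a quick sketch:
In [ ]:
# transformations can live inside the formula string itself
smf.ols('np.log(price) ~ np.log(carat)', data=df).fit().params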
In [ ]:
predictions = results.predict() # with no argument, predict() returns fitted values for the data the model was trained on
In [ ]:
plt.scatter(df.carat, df.price, alpha=0.3) # previous plot
plt.plot(df.carat, predictions, color='red', label='Predicted') # predicted values
plt.legend(loc='lower right')
plt.ylim(0, 20000) # included b/c some predicted values are far from the actual data, comment out to see
plt.xlim(0,6);
In [14]:
df_transformed = df.copy() # copy so the log columns added below don't mutate the original frame
In [15]:
df_transformed['log_price'] = np.log(df_transformed.price)
In [16]:
plt.scatter(df_transformed.carat, df_transformed.log_price, alpha=0.3)
plt.xlim(0,6)
plt.ylim(5.5,10);
In [17]:
df_transformed['log_carat'] = np.log(df_transformed.carat)
In [18]:
plt.scatter(df_transformed.log_carat, df_transformed.log_price, alpha=0.3)
plt.ylim(5.5, 10);
In [21]:
model_transformed = smf.ols('log_price~log_carat', data=df_transformed)
results_transformed = model_transformed.fit()
print(results_transformed.summary())
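In a log-log model the slope reads as an elasticity; a quick numeric check of what a hypothetical 1% increase in carat implies for price:
In [ ]:
# elasticity reading: fractional price change for a 1% bump in carat
b = results_transformed.params['log_carat']
print(1.01 ** b - 1)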
In [22]:
df_transformed.describe(include='all')
Out[22]:
In [23]:
df_transformed.corr(numeric_only=True) # cut/color/clarity are categorical, so restrict to numeric columns
Out[23]:
In [24]:
from sklearn.preprocessing import StandardScaler
In [25]:
# in a multiple-variable setting, scaling may improve performance
# in a single-variable setting, it only rescales the slope
# code below only demonstrates StandardScaler usage - note it expects a 2-D array,
# so the single column is reshaped first (passing a 1-D array is deprecated/an error)
X = np.array(df_transformed['log_carat']).reshape(-1, 1)
y = np.array(df_transformed['log_price'])
X_scaled = StandardScaler().fit_transform(X).ravel()
X_scaled = np.vstack([ones, X_scaled]).T
In [26]:
X_scaled[:5]
Out[26]:
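StandardScaler is just the z-score transform; reproducing the column by hand is a useful sanity check:
In [ ]:
# manual z-scoring should match the scaler's output
manual = (X.ravel() - X.mean()) / X.std()
print(np.allclose(manual, X_scaled[:, 1]))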
In [27]:
lm3 = sm.OLS(y, X_scaled)
results = lm3.fit()
results.summary()
Out[27]:
In [28]:
from patsy import dmatrices
# note dmatrices returns (y, X) - outcome first, matching the formula (the reverse of sklearn's (X, y) convention)
y, X = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='matrix')
In [29]:
y_demo, X_demo = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='dataframe')
In [30]:
X_demo.head()
Out[30]:
In [31]:
X[:10,:10]
Out[31]:
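The dummy-coded columns are easier to identify by name; patsy stores them on the design matrix's design_info:
In [ ]:
# names of the dummy-coded design-matrix columns
print(X.design_info.column_names)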
In [32]:
multi_model = sm.OLS(y, X)
multi_results=multi_model.fit()
print(multi_results.summary())
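A quick way to gauge what the categorical terms add: compare adjusted R-squared between the single-predictor log-log model and the full dummy-coded model:
In [ ]:
# adjusted R-squared: log-log carat only vs. carat + cut + clarity + color
print(results_transformed.rsquared_adj, multi_results.rsquared_adj)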
In [ ]: