In [1]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm, lognorm, bernoulli
from scipy.stats.mstats import mquantiles

%matplotlib inline
plt.style.use('ggplot')

In [36]:
# simulate data under textbook OLS conditions: linear mean plus i.i.d. standard-normal noise
b0, b1 = 1, 5

x = np.linspace(1, 5, 1000)
eps = norm(loc=0, scale=1).rvs(1000)
y = b0 + b1 * x + eps

plt.scatter(x, y);



In [37]:
X = np.vstack([np.ones(1000), x]).T

In [38]:
# solve the normal equations: w = (X'X)^-1 X'y
w = np.linalg.inv(X.T @ X) @ X.T @ y

print(w)


[ 0.91185372  5.0345251 ]
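As an aside, explicitly inverting X'X is rarely done in practice; a minimal sketch (not in the original notebook) of the numerically safer equivalent:

# solve the normal equations directly instead of forming an explicit inverse
# (np.linalg.lstsq(X, y) is another standard option)
w_alt = np.linalg.solve(X.T @ X, X.T @ y)
print(w_alt)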

In [39]:
lm = sm.OLS(exog=X, endog=y).fit()  # sm.OLS expects (endog, exog) positionally; naming the arguments avoids swapping them
lm.summary()


Out[39]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.970
Model:                            OLS   Adj. R-squared:                  0.970
Method:                 Least Squares   F-statistic:                 3.242e+04
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:49:50   Log-Likelihood:                -1439.7
No. Observations:                1000   AIC:                             2883.
Df Residuals:                     998   BIC:                             2893.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.9119      0.090     10.144      0.000         0.735     1.088
x1             5.0345      0.028    180.057      0.000         4.980     5.089
==============================================================================
Omnibus:                        6.541   Durbin-Watson:                   1.989
Prob(Omnibus):                  0.038   Jarque-Bera (JB):                6.688
Skew:                           0.155   Prob(JB):                       0.0353
Kurtosis:                       3.254   Cond. No.                         9.70
==============================================================================

In [40]:
# skewed, heavy-tailed noise: lognormal magnitudes with a random sign (positive ~70% of the time)
eps_ln = lognorm(s=1.3).rvs(1000)
sgn = bernoulli(p=.7).rvs(1000)
sgn[sgn == 0] = -1
y_ln = b0 + b1 * x + sgn * eps_ln

plt.scatter(x, y_ln);



In [41]:
print(np.linalg.inv(X.T @ X) @ X.T @ y_ln)


[ 2.12158822  4.98243332]

In [42]:
lm2 = sm.OLS(y_ln, X).fit()
lm2.summary()


Out[42]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                     1236.
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          8.50e-177
Time:                        11:49:54   Log-Likelihood:                -3062.9
No. Observations:                1000   AIC:                             6130.
Df Residuals:                     998   BIC:                             6140.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          2.1216      0.456      4.656      0.000         1.227     3.016
x1             4.9824      0.142     35.153      0.000         4.704     5.261
==============================================================================
Omnibus:                      780.573   Durbin-Watson:                   2.061
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           239825.773
Skew:                           2.506   Prob(JB):                         0.00
Kurtosis:                      78.701   Cond. No.                         9.70
==============================================================================

In [43]:
# residual plot - do the residuals look random, or is there structure?
plt.scatter(X[:, 1], lm2.resid);
# plt.ylim(-25, 25);



In [44]:
# Q-Q plot: compare quantiles of the fitted values against quantiles of the
# observed response (note: not the usual normal Q-Q plot of the residuals)
preds = lm2.predict(X)
qs = np.arange(0, 1, 0.01)
true_q = mquantiles(y_ln, prob=qs)
pred_q = mquantiles(preds, prob=qs)


plt.plot(true_q, true_q)  # 45-degree reference line
plt.scatter(true_q, pred_q);
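For checking normality of the residuals themselves, statsmodels has a built-in helper; a minimal sketch, not part of the original cell:

# normal Q-Q plot of the residuals, with a fitted 45-degree reference line
sm.qqplot(lm2.resid, line='45', fit=True);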



In [45]:
# residuals vs fitted
plt.scatter(preds, lm2.resid)
plt.ylim(-40, 40);



In [46]:
# influence diagnostics: Cook's distance for each observation
infl = lm2.get_influence()
(c, p) = infl.cooks_distance
plt.scatter(range(len(c)), c)
plt.title("Cook's Distance");
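The leverages themselves (hat values) come from the same influence object; a small sketch to complement the Cook's distance plot above:

infl = lm2.get_influence()
plt.scatter(range(len(infl.hat_matrix_diag)), infl.hat_matrix_diag)  # diagonal of the hat matrix
plt.title('Leverage (hat values)');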



In [6]:
diamonds = sm.datasets.get_rdataset('diamonds', 'ggplot2')

In [7]:
print(diamonds.__doc__)


+------------+-------------------+
| diamonds   | R Documentation   |
+------------+-------------------+

Prices of 50,000 round cut diamonds
-----------------------------------

Description
~~~~~~~~~~~

A dataset containing the prices and other attributes of almost 54,000
diamonds. The variables are as follows:

Usage
~~~~~

::

    data(diamonds)

Format
~~~~~~

A data frame with 53940 rows and 10 variables

Details
~~~~~~~

-  price. price in US dollars ($326–$18,823)

-  carat. weight of the diamond (0.2–5.01)

-  cut. quality of the cut (Fair, Good, Very Good, Premium, Ideal)

-  colour. diamond colour, from J (worst) to D (best)

-  clarity. a measurement of how clear the diamond is (I1 (worst), SI1,
   SI2, VS1, VS2, VVS1, VVS2, IF (best))

-  x. length in mm (0–10.74)

-  y. width in mm (0–58.9)

-  z. depth in mm (0–31.8)

-  depth. total depth percentage = z / mean(x, y) = 2 * z / (x + y)
   (43–79)

-  table. width of top of diamond relative to widest point (43–95)




In [8]:
df = diamonds.data  # extract the data - this returns a Pandas dataframe

In [9]:
df.describe(include='all')


Out[9]:
carat cut color clarity depth table price x y z
count 53940.000000 53940 53940 53940 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
unique NaN 5 7 8 NaN NaN NaN NaN NaN NaN
top NaN Ideal G SI1 NaN NaN NaN NaN NaN NaN
freq NaN 21551 11292 13065 NaN NaN NaN NaN NaN NaN
mean 0.797940 NaN NaN NaN 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734
std 0.474011 NaN NaN NaN 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699
min 0.200000 NaN NaN NaN 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000
25% 0.400000 NaN NaN NaN 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000
50% 0.700000 NaN NaN NaN 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000
75% 1.040000 NaN NaN NaN 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000
max 5.010000 NaN NaN NaN 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000

In [10]:
plt.scatter(df.carat, df.price, alpha=0.3)
plt.xlim(0, 6)
plt.ylim(0, 20000);



In [11]:
# prepare the data for modeling 
ones = np.ones(df.shape[0])
carats = np.array(df['carat'])

X = np.vstack([ones, carats]).T
y = np.array(df['price'])
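The same design matrix can also be built with statsmodels' helper; a one-line sketch that should be equivalent to the manual vstack above:

X_alt = sm.add_constant(df['carat'].values)  # prepends a column of ones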

In [12]:
lm = sm.OLS(y, X)
results = lm.fit()

In [13]:
results.summary()


Out[13]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.849
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                 3.041e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:33:53   Log-Likelihood:            -4.7273e+05
No. Observations:               53940   AIC:                         9.455e+05
Df Residuals:                   53938   BIC:                         9.455e+05
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const      -2256.3606     13.055   -172.830      0.000     -2281.949 -2230.772
x1          7756.4256     14.067    551.408      0.000      7728.855  7783.996
==============================================================================
Omnibus:                    14025.341   Durbin-Watson:                   0.986
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           153030.525
Skew:                           0.939   Prob(JB):                         0.00
Kurtosis:                      11.035   Cond. No.                         3.65
==============================================================================

In [20]:
# alternative: the 'formula' API specifies models with R-style formula strings
# (not limited to simple models - see the categorical example below)
import statsmodels.formula.api as smf

lm2 = smf.ols('price ~ carat', data=df)

In [ ]:
lm2.fit().summary()
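The formula interface also handles categorical predictors and interactions directly; a sketch (a hypothetical cell, not fitted in the original):

smf.ols('price ~ carat + C(cut)', data=df).fit().summary()  # C() expands cut into dummy variables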

In [ ]:
predictions = results.predict()  # with no argument, predict() returns the fitted values for the training data

In [ ]:
plt.scatter(df.carat, df.price, alpha=0.3)  # previous plot
plt.plot(df.carat, predictions, color='red', label='Predicted')  # predicted values
plt.legend(loc='lower right')
plt.ylim(0, 20000)  # some predictions fall well outside the data range; comment out to see them
plt.xlim(0,6);

In [14]:
df_transformed = df.copy()  # copy so the added columns below don't silently modify df as well

In [15]:
df_transformed['log_price'] = np.log(df_transformed.price)

In [16]:
plt.scatter(df_transformed.carat, df_transformed.log_price, alpha=0.3)     
plt.xlim(0,6)
plt.ylim(5.5,10);



In [17]:
df_transformed['log_carat'] = np.log(df_transformed.carat)

In [18]:
plt.scatter(df_transformed.log_carat, df_transformed.log_price, alpha=0.3)
plt.ylim(5.5, 10);



In [21]:
model_transformed = smf.ols('log_price~log_carat', data=df_transformed)
results_transformed = model_transformed.fit()
print(results_transformed.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              log_price   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.510e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:38:05   Log-Likelihood:                -4424.2
No. Observations:               53940   AIC:                             8852.
Df Residuals:                   53938   BIC:                             8870.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      8.4487      0.001   6190.896      0.000         8.446     8.451
log_carat      1.6758      0.002    866.590      0.000         1.672     1.680
==============================================================================
Omnibus:                      877.676   Durbin-Watson:                   1.227
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1679.882
Skew:                           0.072   Prob(JB):                         0.00
Kurtosis:                       3.853   Cond. No.                         2.08
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
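One way to read this log-log fit: the slope is an elasticity. A short sketch using the fitted parameters above:

# log(price) ~ 8.4487 + 1.6758 * log(carat), i.e. price ~ exp(8.4487) * carat**1.6758,
# so a 1% increase in carat is associated with roughly a 1.68% increase in price
b0_hat, b1_hat = results_transformed.params
print(np.exp(b0_hat), b1_hat)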

In [22]:
df_transformed.describe(include='all')


Out[22]:
carat cut color clarity depth table price x y z log_price log_carat
count 53940.000000 53940 53940 53940 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
unique NaN 5 7 8 NaN NaN NaN NaN NaN NaN NaN NaN
top NaN Ideal G SI1 NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN 21551 11292 13065 NaN NaN NaN NaN NaN NaN NaN NaN
mean 0.797940 NaN NaN NaN 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734 7.786768 -0.394967
std 0.474011 NaN NaN NaN 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699 1.014649 0.584828
min 0.200000 NaN NaN NaN 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000 5.786897 -1.609438
25% 0.400000 NaN NaN NaN 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000 6.856462 -0.916291
50% 0.700000 NaN NaN NaN 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000 7.783641 -0.356675
75% 1.040000 NaN NaN NaN 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000 8.580027 0.039221
max 5.010000 NaN NaN NaN 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000 9.842835 1.611436

In [23]:
df_transformed.corr()


Out[23]:
carat depth table price x y z log_price log_carat
carat 1.000000 0.028224 0.181618 0.921591 0.975094 0.951722 0.953387 0.920207 0.956400
depth 0.028224 1.000000 -0.295779 -0.010647 -0.025289 -0.029341 0.094924 0.000860 0.030434
table 0.181618 -0.295779 1.000000 0.127134 0.195344 0.183760 0.150929 0.158208 0.191741
price 0.921591 -0.010647 0.127134 1.000000 0.884435 0.865421 0.861249 0.895771 0.855526
x 0.975094 -0.025289 0.195344 0.884435 1.000000 0.974701 0.970772 0.958010 0.990166
y 0.951722 -0.029341 0.183760 0.865421 0.974701 1.000000 0.952006 0.936173 0.966495
z 0.953387 0.094924 0.150929 0.861249 0.970772 0.952006 1.000000 0.935218 0.969060
log_price 0.920207 0.000860 0.158208 0.895771 0.958010 0.936173 0.935218 1.000000 0.965914
log_carat 0.956400 0.030434 0.191741 0.855526 0.990166 0.966495 0.969060 0.965914 1.000000

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
# with several predictors, standardizing puts them on a common scale (and can help conditioning);
# with a single predictor it only rescales the slope, so this is purely to demonstrate
# StandardScaler - note the deprecation warning about passing a 1-D array
X = np.array(df_transformed['log_carat'])
y = np.array(df_transformed['log_price'])

X_scaled = StandardScaler().fit_transform(X)
X_scaled = np.vstack([ones, X_scaled]).T


/Users/jwj2/anaconda/envs/snowflakes/lib/python3.5/site-packages/sklearn/preprocessing/data.py:583: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/jwj2/anaconda/envs/snowflakes/lib/python3.5/site-packages/sklearn/preprocessing/data.py:646: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
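The warning points at the fix: scikit-learn expects a 2-D feature array. A sketch of the warning-free version, which should produce the same X_scaled:

X_scaled = StandardScaler().fit_transform(X.reshape(-1, 1)).ravel()  # reshape to (n, 1), scale, flatten back
X_scaled = np.vstack([ones, X_scaled]).T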

In [26]:
X_scaled[:5]


Out[26]:
array([[ 1.        , -1.8376683 ],
       [ 1.        , -1.99322292],
       [ 1.        , -1.8376683 ],
       [ 1.        , -1.44130568],
       [ 1.        , -1.32726865]])

In [27]:
lm3 = sm.OLS(y, X_scaled)
results = lm3.fit()
results.summary()


Out[27]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.510e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:40:02   Log-Likelihood:                -4424.2
No. Observations:               53940   AIC:                             8852.
Df Residuals:                   53938   BIC:                             8870.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          7.7868      0.001   6885.265      0.000         7.785     7.789
x1             0.9801      0.001    866.590      0.000         0.978     0.982
==============================================================================
Omnibus:                      877.676   Durbin-Watson:                   1.227
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1679.882
Skew:                           0.072   Prob(JB):                         0.00
Kurtosis:                       3.853   Cond. No.                         1.00
==============================================================================

In [28]:
from patsy import dmatrices

# note the order: dmatrices returns (y, X), the reverse of the sklearn-style (X, y) you might expect
y, X = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='matrix')

In [29]:
y_demo, X_demo = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='dataframe')

In [30]:
X_demo.head()


Out[30]:
Intercept cut[T.Good] cut[T.Ideal] cut[T.Premium] cut[T.Very Good] clarity[T.IF] clarity[T.SI1] clarity[T.SI2] clarity[T.VS1] clarity[T.VS2] clarity[T.VVS1] clarity[T.VVS2] color[T.E] color[T.F] color[T.G] color[T.H] color[T.I] color[T.J] log_carat
0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.469676
1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.560648
2 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.469676
3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 -1.237874
4 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 -1.171183
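The column names above reflect patsy's treatment (dummy) coding: the first level of each factor (Fair cut, I1 clarity, D color) is absorbed into the intercept, so those columns don't appear. A quick way to list the generated design columns:

print(list(X_demo.columns))  # dummy-coded columns produced by the formula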

In [31]:
X[:10,:10]


Out[31]:
array([[ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.]])

In [32]:
multi_model = sm.OLS(y, X)
multi_results = multi_model.fit()
print(multi_results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              log_price   R-squared:                       0.983
Model:                            OLS   Adj. R-squared:                  0.983
Method:                 Least Squares   F-statistic:                 1.693e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:47:40   Log-Likelihood:                 31967.
No. Observations:               53940   AIC:                        -6.390e+04
Df Residuals:                   53921   BIC:                        -6.373e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
Intercept            7.8569      0.006   1364.426      0.000         7.846     7.868
cut[T.Good]          0.0800      0.004     20.575      0.000         0.072     0.088
cut[T.Ideal]         0.1612      0.004     45.439      0.000         0.154     0.168
cut[T.Premium]       0.1393      0.004     38.937      0.000         0.132     0.146
cut[T.Very Good]     0.1172      0.004     32.392      0.000         0.110     0.124
clarity[T.IF]        1.1137      0.006    184.695      0.000         1.102     1.126
clarity[T.SI1]       0.5930      0.005    115.165      0.000         0.583     0.603
clarity[T.SI2]       0.4279      0.005     82.637      0.000         0.418     0.438
clarity[T.VS1]       0.8123      0.005    154.523      0.000         0.802     0.823
clarity[T.VS2]       0.7422      0.005    143.342      0.000         0.732     0.752
clarity[T.VVS1]      1.0187      0.006    182.731      0.000         1.008     1.030
clarity[T.VVS2]      0.9473      0.005    174.828      0.000         0.937     0.958
color[T.E]          -0.0543      0.002    -25.622      0.000        -0.058    -0.050
color[T.F]          -0.0946      0.002    -44.160      0.000        -0.099    -0.090
color[T.G]          -0.1604      0.002    -76.492      0.000        -0.164    -0.156
color[T.H]          -0.2511      0.002   -112.850      0.000        -0.255    -0.247
color[T.I]          -0.3726      0.002   -149.504      0.000        -0.377    -0.368
color[T.J]          -0.5110      0.003   -166.236      0.000        -0.517    -0.505
log_carat            1.8837      0.001   1668.750      0.000         1.882     1.886
==============================================================================
Omnibus:                     3721.869   Durbin-Watson:                   1.245
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15890.209
Skew:                           0.210   Prob(JB):                         0.00
Kurtosis:                       5.626   Cond. No.                         33.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
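Since the response is log(price), exponentiating a dummy coefficient gives an approximate multiplicative effect on price relative to that factor's reference level, holding the other predictors fixed; the log_carat coefficient (~1.88) is again an elasticity. A rough reading of the table above:

# coefficients copied from the summary above
print(np.exp(-0.5110))  # ~0.60: J colour sells for about 60% of the price of the reference D colour
print(np.exp(1.1137))   # ~3.05: IF clarity commands about 3x the price of the reference I1 clarity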

In [ ]: