In [1]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm, lognorm, bernoulli
from scipy.stats.mstats import mquantiles

%matplotlib inline
plt.style.use('ggplot')

In [36]:
# simulate data under textbook OLS conditions: linear mean plus i.i.d. standard-normal noise
b0, b1 = 1, 5

x = np.linspace(1, 5, 1000)
eps = norm(loc=0, scale=1).rvs(1000)
y = b0 + b1 * x + eps

plt.scatter(x, y);



In [37]:
X = np.vstack([np.ones(1000), x]).T

In [38]:
# solve the normal equations: w = (X'X)^-1 X'y
w = np.linalg.inv(X.T @ X) @ X.T @ y

print(w)


[ 0.91185372  5.0345251 ]
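As an aside, explicitly inverting X'X is rarely done in practice; a minimal sketch (not in the original notebook) of the numerically safer equivalent:

# solve the normal equations directly instead of forming an explicit inverse
# (np.linalg.lstsq(X, y) is another standard option)
w_alt = np.linalg.solve(X.T @ X, X.T @ y)
print(w_alt)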

In [39]:
lm = sm.OLS(exog=X, endog=y).fit()  # sm.OLS expects (endog, exog) positionally; naming the arguments avoids swapping them
lm.summary()


Out[39]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.970
Model:                            OLS   Adj. R-squared:                  0.970
Method:                 Least Squares   F-statistic:                 3.242e+04
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:49:50   Log-Likelihood:                -1439.7
No. Observations:                1000   AIC:                             2883.
Df Residuals:                     998   BIC:                             2893.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.9119      0.090     10.144      0.000         0.735     1.088
x1             5.0345      0.028    180.057      0.000         4.980     5.089
==============================================================================
Omnibus:                        6.541   Durbin-Watson:                   1.989
Prob(Omnibus):                  0.038   Jarque-Bera (JB):                6.688
Skew:                           0.155   Prob(JB):                       0.0353
Kurtosis:                       3.254   Cond. No.                         9.70
==============================================================================

In [40]:
# skewed, heavy-tailed noise: lognormal magnitudes with a random sign (positive ~70% of the time)
eps_ln = lognorm(s=1.3).rvs(1000)
sgn = bernoulli(p=.7).rvs(1000)
sgn[sgn == 0] = -1
y_ln = b0 + b1 * x + sgn * eps_ln

plt.scatter(x, y_ln);



In [41]:
print(np.linalg.inv(X.T @ X) @ X.T @ y_ln)


[ 2.12158822  4.98243332]

In [42]:
lm2 = sm.OLS(y_ln, X).fit()
lm2.summary()


Out[42]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                     1236.
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          8.50e-177
Time:                        11:49:54   Log-Likelihood:                -3062.9
No. Observations:                1000   AIC:                             6130.
Df Residuals:                     998   BIC:                             6140.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          2.1216      0.456      4.656      0.000         1.227     3.016
x1             4.9824      0.142     35.153      0.000         4.704     5.261
==============================================================================
Omnibus:                      780.573   Durbin-Watson:                   2.061
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           239825.773
Skew:                           2.506   Prob(JB):                         0.00
Kurtosis:                      78.701   Cond. No.                         9.70
==============================================================================

In [43]:
# residual plot - do the residuals look random, or is there structure?
plt.scatter(X[:, 1], lm2.resid);
# plt.ylim(-25, 25);



In [44]:
# Q-Q plot: compare quantiles of the fitted values against quantiles of the
# observed response (note: not the usual normal Q-Q plot of the residuals)
preds = lm2.predict(X)
qs = np.arange(0, 1, 0.01)
true_q = mquantiles(y_ln, prob=qs)
pred_q = mquantiles(preds, prob=qs)


plt.plot(true_q, true_q)  # 45-degree reference line
plt.scatter(true_q, pred_q);
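For checking normality of the residuals themselves, statsmodels has a built-in helper; a minimal sketch, not part of the original cell:

# normal Q-Q plot of the residuals, with a fitted 45-degree reference line
sm.qqplot(lm2.resid, line='45', fit=True);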



In [45]:
# residuals vs fitted
plt.scatter(preds, lm2.resid)
plt.ylim(-40, 40);



In [46]:
# influence diagnostics: Cook's distance for each observation
infl = lm2.get_influence()
(c, p) = infl.cooks_distance
plt.scatter(range(len(c)), c)
plt.title("Cook's Distance");
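The leverages themselves (hat values) come from the same influence object; a small sketch to complement the Cook's distance plot above:

infl = lm2.get_influence()
plt.scatter(range(len(infl.hat_matrix_diag)), infl.hat_matrix_diag)  # diagonal of the hat matrix
plt.title('Leverage (hat values)');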



In [6]:
diamonds = sm.datasets.get_rdataset('diamonds', 'ggplot2')

In [7]:
print(diamonds.__doc__)


+------------+-------------------+
| diamonds   | R Documentation   |
+------------+-------------------+

Prices of 50,000 round cut diamonds
-----------------------------------

Description
~~~~~~~~~~~

A dataset containing the prices and other attributes of almost 54,000
diamonds. The variables are as follows:

Usage
~~~~~

::

    data(diamonds)

Format
~~~~~~

A data frame with 53940 rows and 10 variables

Details
~~~~~~~

-  price. price in US dollars ($326–$18,823)

-  carat. weight of the diamond (0.2–5.01)

-  cut. quality of the cut (Fair, Good, Very Good, Premium, Ideal)

-  colour. diamond colour, from J (worst) to D (best)

-  clarity. a measurement of how clear the diamond is (I1 (worst), SI1,
   SI2, VS1, VS2, VVS1, VVS2, IF (best))

-  x. length in mm (0–10.74)

-  y. width in mm (0–58.9)

-  z. depth in mm (0–31.8)

-  depth. total depth percentage = z / mean(x, y) = 2 * z / (x + y)
   (43–79)

-  table. width of top of diamond relative to widest point (43–95)




In [8]:
df = diamonds.data  # extract the data - this returns a Pandas dataframe

In [9]:
df.describe(include='all')


Out[9]:
carat cut color clarity depth table price x y z
count 53940.000000 53940 53940 53940 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
unique NaN 5 7 8 NaN NaN NaN NaN NaN NaN
top NaN Ideal G SI1 NaN NaN NaN NaN NaN NaN
freq NaN 21551 11292 13065 NaN NaN NaN NaN NaN NaN
mean 0.797940 NaN NaN NaN 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734
std 0.474011 NaN NaN NaN 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699
min 0.200000 NaN NaN NaN 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000
25% 0.400000 NaN NaN NaN 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000
50% 0.700000 NaN NaN NaN 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000
75% 1.040000 NaN NaN NaN 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000
max 5.010000 NaN NaN NaN 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000

In [10]:
plt.scatter(df.carat, df.price, alpha=0.3)
plt.xlim(0, 6)
plt.ylim(0, 20000);



In [11]:
# prepare the data for modeling 
ones = np.ones(df.shape[0])
carats = np.array(df['carat'])

X = np.vstack([ones, carats]).T
y = np.array(df['price'])
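The same design matrix can also be built with statsmodels' helper; a one-line sketch that should be equivalent to the manual vstack above:

X_alt = sm.add_constant(df['carat'].values)  # prepends a column of ones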

In [12]:
lm = sm.OLS(y, X)
results = lm.fit()

In [13]:
results.summary()


Out[13]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.849
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                 3.041e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:33:53   Log-Likelihood:            -4.7273e+05
No. Observations:               53940   AIC:                         9.455e+05
Df Residuals:                   53938   BIC:                         9.455e+05
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const      -2256.3606     13.055   -172.830      0.000     -2281.949 -2230.772
x1          7756.4256     14.067    551.408      0.000      7728.855  7783.996
==============================================================================
Omnibus:                    14025.341   Durbin-Watson:                   0.986
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           153030.525
Skew:                           0.939   Prob(JB):                         0.00
Kurtosis:                      11.035   Cond. No.                         3.65
==============================================================================

In [20]:
# alternative: the 'formula' API specifies models with R-style formula strings
# (not limited to simple models - see the categorical example below)
import statsmodels.formula.api as smf

lm2 = smf.ols('price ~ carat', data=df)

In [ ]:
lm2.fit().summary()
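The formula interface also handles categorical predictors and interactions directly; a sketch (a hypothetical cell, not fitted in the original):

smf.ols('price ~ carat + C(cut)', data=df).fit().summary()  # C() expands cut into dummy variables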

In [ ]:
predictions = results.predict()  # with no argument, predict() returns the fitted values for the training data

In [ ]:
plt.scatter(df.carat, df.price, alpha=0.3)  # previous plot
plt.plot(df.carat, predictions, color='red', label='Predicted')  # predicted values
plt.legend(loc='lower right')
plt.ylim(0, 20000)  # some predictions fall well outside the data range; comment out to see them
plt.xlim(0,6);

In [14]:
df_transformed = df.copy()  # copy so the added columns below don't silently modify df as well

In [15]:
df_transformed['log_price'] = np.log(df_transformed.price)

In [16]:
plt.scatter(df_transformed.carat, df_transformed.log_price, alpha=0.3)     
plt.xlim(0,6)
plt.ylim(5.5,10);



In [17]:
df_transformed['log_carat'] = np.log(df_transformed.carat)

In [18]:
plt.scatter(df_transformed.log_carat, df_transformed.log_price, alpha=0.3)
plt.ylim(5.5, 10);



In [21]:
model_transformed = smf.ols('log_price~log_carat', data=df_transformed)
results_transformed = model_transformed.fit()
print(results_transformed.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              log_price   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.510e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:38:05   Log-Likelihood:                -4424.2
No. Observations:               53940   AIC:                             8852.
Df Residuals:                   53938   BIC:                             8870.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      8.4487      0.001   6190.896      0.000         8.446     8.451
log_carat      1.6758      0.002    866.590      0.000         1.672     1.680
==============================================================================
Omnibus:                      877.676   Durbin-Watson:                   1.227
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1679.882
Skew:                           0.072   Prob(JB):                         0.00
Kurtosis:                       3.853   Cond. No.                         2.08
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
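One way to read this log-log fit: the slope is an elasticity. A short sketch using the fitted parameters above:

# log(price) ~ 8.4487 + 1.6758 * log(carat), i.e. price ~ exp(8.4487) * carat**1.6758,
# so a 1% increase in carat is associated with roughly a 1.68% increase in price
b0_hat, b1_hat = results_transformed.params
print(np.exp(b0_hat), b1_hat)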

In [22]:
df_transformed.describe(include='all')


Out[22]:
carat cut color clarity depth table price x y z log_price log_carat
count 53940.000000 53940 53940 53940 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
unique NaN 5 7 8 NaN NaN NaN NaN NaN NaN NaN NaN
top NaN Ideal G SI1 NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN 21551 11292 13065 NaN NaN NaN NaN NaN NaN NaN NaN
mean 0.797940 NaN NaN NaN 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734 7.786768 -0.394967
std 0.474011 NaN NaN NaN 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699 1.014649 0.584828
min 0.200000 NaN NaN NaN 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000 5.786897 -1.609438
25% 0.400000 NaN NaN NaN 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000 6.856462 -0.916291
50% 0.700000 NaN NaN NaN 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000 7.783641 -0.356675
75% 1.040000 NaN NaN NaN 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000 8.580027 0.039221
max 5.010000 NaN NaN NaN 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000 9.842835 1.611436

In [23]:
df_transformed.corr()


Out[23]:
carat depth table price x y z log_price log_carat
carat 1.000000 0.028224 0.181618 0.921591 0.975094 0.951722 0.953387 0.920207 0.956400
depth 0.028224 1.000000 -0.295779 -0.010647 -0.025289 -0.029341 0.094924 0.000860 0.030434
table 0.181618 -0.295779 1.000000 0.127134 0.195344 0.183760 0.150929 0.158208 0.191741
price 0.921591 -0.010647 0.127134 1.000000 0.884435 0.865421 0.861249 0.895771 0.855526
x 0.975094 -0.025289 0.195344 0.884435 1.000000 0.974701 0.970772 0.958010 0.990166
y 0.951722 -0.029341 0.183760 0.865421 0.974701 1.000000 0.952006 0.936173 0.966495
z 0.953387 0.094924 0.150929 0.861249 0.970772 0.952006 1.000000 0.935218 0.969060
log_price 0.920207 0.000860 0.158208 0.895771 0.958010 0.936173 0.935218 1.000000 0.965914
log_carat 0.956400 0.030434 0.191741 0.855526 0.990166 0.966495 0.969060 0.965914 1.000000

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
# with several predictors, standardizing puts them on a common scale (and can help conditioning);
# with a single predictor it only rescales the slope, so this is purely to demonstrate
# StandardScaler - note the deprecation warning about passing a 1-D array
X = np.array(df_transformed['log_carat'])
y = np.array(df_transformed['log_price'])

X_scaled = StandardScaler().fit_transform(X)
X_scaled = np.vstack([ones, X_scaled]).T


/Users/jwj2/anaconda/envs/snowflakes/lib/python3.5/site-packages/sklearn/preprocessing/data.py:583: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/jwj2/anaconda/envs/snowflakes/lib/python3.5/site-packages/sklearn/preprocessing/data.py:646: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
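The warning points at the fix: scikit-learn expects a 2-D feature array. A sketch of the warning-free version, which should produce the same X_scaled:

X_scaled = StandardScaler().fit_transform(X.reshape(-1, 1)).ravel()  # reshape to (n, 1), scale, flatten back
X_scaled = np.vstack([ones, X_scaled]).T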

In [26]:
X_scaled[:5]


Out[26]:
array([[ 1.        , -1.8376683 ],
       [ 1.        , -1.99322292],
       [ 1.        , -1.8376683 ],
       [ 1.        , -1.44130568],
       [ 1.        , -1.32726865]])

In [27]:
lm3 = sm.OLS(y, X_scaled)
results = lm3.fit()
results.summary()


Out[27]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.510e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:40:02   Log-Likelihood:                -4424.2
No. Observations:               53940   AIC:                             8852.
Df Residuals:                   53938   BIC:                             8870.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          7.7868      0.001   6885.265      0.000         7.785     7.789
x1             0.9801      0.001    866.590      0.000         0.978     0.982
==============================================================================
Omnibus:                      877.676   Durbin-Watson:                   1.227
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1679.882
Skew:                           0.072   Prob(JB):                         0.00
Kurtosis:                       3.853   Cond. No.                         1.00
==============================================================================

In [28]:
from patsy import dmatrices

# note the order: dmatrices returns (y, X), the reverse of the sklearn-style (X, y) you might expect
y, X = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='matrix')

In [29]:
y_demo, X_demo = dmatrices('log_price~log_carat+cut+clarity+color', data=df_transformed, return_type='dataframe')

In [30]:
X_demo.head()


Out[30]:
Intercept cut[T.Good] cut[T.Ideal] cut[T.Premium] cut[T.Very Good] clarity[T.IF] clarity[T.SI1] clarity[T.SI2] clarity[T.VS1] clarity[T.VS2] clarity[T.VVS1] clarity[T.VVS2] color[T.E] color[T.F] color[T.G] color[T.H] color[T.I] color[T.J] log_carat
0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.469676
1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.560648
2 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 -1.469676
3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 -1.237874
4 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 -1.171183
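The column names above reflect patsy's treatment (dummy) coding: the first level of each factor (Fair cut, I1 clarity, D color) is absorbed into the intercept, so those columns don't appear. A quick way to list the generated design columns:

print(list(X_demo.columns))  # dummy-coded columns produced by the formula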

In [31]:
X[:10,:10]


Out[31]:
array([[ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.]])

In [32]:
multi_model = sm.OLS(y, X)
multi_results = multi_model.fit()
print(multi_results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              log_price   R-squared:                       0.983
Model:                            OLS   Adj. R-squared:                  0.983
Method:                 Least Squares   F-statistic:                 1.693e+05
Date:                Wed, 06 Jul 2016   Prob (F-statistic):               0.00
Time:                        11:47:40   Log-Likelihood:                 31967.
No. Observations:               53940   AIC:                        -6.390e+04
Df Residuals:                   53921   BIC:                        -6.373e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
Intercept            7.8569      0.006   1364.426      0.000         7.846     7.868
cut[T.Good]          0.0800      0.004     20.575      0.000         0.072     0.088
cut[T.Ideal]         0.1612      0.004     45.439      0.000         0.154     0.168
cut[T.Premium]       0.1393      0.004     38.937      0.000         0.132     0.146
cut[T.Very Good]     0.1172      0.004     32.392      0.000         0.110     0.124
clarity[T.IF]        1.1137      0.006    184.695      0.000         1.102     1.126
clarity[T.SI1]       0.5930      0.005    115.165      0.000         0.583     0.603
clarity[T.SI2]       0.4279      0.005     82.637      0.000         0.418     0.438
clarity[T.VS1]       0.8123      0.005    154.523      0.000         0.802     0.823
clarity[T.VS2]       0.7422      0.005    143.342      0.000         0.732     0.752
clarity[T.VVS1]      1.0187      0.006    182.731      0.000         1.008     1.030
clarity[T.VVS2]      0.9473      0.005    174.828      0.000         0.937     0.958
color[T.E]          -0.0543      0.002    -25.622      0.000        -0.058    -0.050
color[T.F]          -0.0946      0.002    -44.160      0.000        -0.099    -0.090
color[T.G]          -0.1604      0.002    -76.492      0.000        -0.164    -0.156
color[T.H]          -0.2511      0.002   -112.850      0.000        -0.255    -0.247
color[T.I]          -0.3726      0.002   -149.504      0.000        -0.377    -0.368
color[T.J]          -0.5110      0.003   -166.236      0.000        -0.517    -0.505
log_carat            1.8837      0.001   1668.750      0.000         1.882     1.886
==============================================================================
Omnibus:                     3721.869   Durbin-Watson:                   1.245
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15890.209
Skew:                           0.210   Prob(JB):                         0.00
Kurtosis:                       5.626   Cond. No.                         33.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
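Since the response is log(price), exponentiating a dummy coefficient gives an approximate multiplicative effect on price relative to that factor's reference level, holding the other predictors fixed; the log_carat coefficient (~1.88) is again an elasticity. A rough reading of the table above:

# coefficients copied from the summary above
print(np.exp(-0.5110))  # ~0.60: J colour sells for about 60% of the price of the reference D colour
print(np.exp(1.1137))   # ~3.05: IF clarity commands about 3x the price of the reference I1 clarity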

In [ ]: