Regression and Prediction



In [1]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as sm
from scipy import stats
def _chisqprob(chisq, df):
    """Return the chi-square survival function value for ``chisq`` with ``df`` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Expose the helper as ``scipy.stats.chisqprob``.
# NOTE(review): presumably a shim for library code that still calls
# ``stats.chisqprob`` on a SciPy release that no longer provides it — confirm.
stats.chisqprob = _chisqprob

Load up some data



In [2]:

    
# Read the tips data and derive the two target columns used by the models below.
tips = (
    pd.read_csv('input/tips.csv')
    # Tip expressed as a percentage of the total bill.
    .assign(tip_percent=lambda frame: frame['tip'] / frame['total_bill'] * 100)
    # 1 when the tip percentage is at or above the sample mean, otherwise 0.
    .assign(tip_above_avg=lambda frame: np.where(
        frame['tip_percent'] >= frame['tip_percent'].mean(), 1, 0))
    # Turn every 'Yes'/'No' value anywhere in the frame into 1/0.
    .replace({'Yes': 1, 'No': 0})
)

How do we interpret the regression output?

The following guide explains how to interpret a correlation coefficient (r):

http://www.dummies.com/education/math/statistics/how-to-interpret-a-correlation-coefficient-r/



In [3]:

    
# R-style formula: tip percentage explained by bill size, party size and the
# categorical factors (each wrapped in C(...) so it is treated as categorical).
formula = (
    'tip_percent ~ total_bill + party_size'
    ' + C(ordered_alc_bev) + C(gender) + C(day) + C(time)'
)

# Build the ordinary-least-squares model from the formula, then fit it.
model = sm.ols(formula=formula, data=tips)
results = model.fit()

# Last expression in the cell, so the summary table is rendered as output.
results.summary()









    Out[3]:





OLS Regression Results

  Dep. Variable:        tip_percent      R-squared:             0.334


  Model:                    OLS          Adj. R-squared:        0.312


  Method:              Least Squares     F-statistic:           14.76


  Date:              Mon, 16 Apr 2018    Prob (F-statistic):  1.89e-17


  Time:                  23:47:54        Log-Likelihood:      -916.52


  No. Observations:          244         AIC:                   1851.


  Df Residuals:              235         BIC:                   1883.


  Df Model:                    8                                     


  Covariance Type:       nonrobust                                   




                             coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept                   42.5140      3.720     11.429   0.000     35.186     49.842


  C(ordered_alc_bev)[T.1]      1.7912      1.473      1.216   0.225     -1.110      4.692


  C(gender)[T.Male]            0.3495      1.461      0.239   0.811     -2.528      3.227


  C(day)[T.Sat]               -8.5745      3.188     -2.689   0.008    -14.856     -2.293


  C(day)[T.Sun]               -7.9049      3.286     -2.406   0.017    -14.378     -1.432


  C(day)[T.Thur]              -4.7105      4.048     -1.164   0.246    -12.686      3.265


  C(time)[T.Lunch]            -2.1070      4.569     -0.461   0.645    -11.108      6.895


  total_bill                  -0.6502      0.097     -6.674   0.000     -0.842     -0.458


  party_size                  -1.5466      0.916     -1.689   0.093     -3.351      0.257




  Omnibus:        172.635    Durbin-Watson:         2.119


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   2382.625


  Skew:            2.613     Prob(JB):               0.00


  Kurtosis:       17.389     Cond. No.               206.



In [4]:

    
# One hypothetical visit, keyed by the predictor names used in the OLS formula.
visit = {
    'total_bill': 15.52,
    'gender': 'Female',
    'ordered_alc_bev': 0,
    'day': 'Sun',
    'time': 'Lunch',
    'party_size': 1,
}
columns = list(visit)
data = [list(visit.values())]

# Single-row frame in the layout `results.predict` is called with.
df = pd.DataFrame(data=data, columns=columns)

# Predict with the fitted model and convert the result to a plain Python list.
predicted_tip_percent = results.predict(df)
predictions = predicted_tip_percent.tolist()
predictions









    Out[4]:





[20.8648081851993]

Logistic regression in Python (reference walkthrough): http://blog.yhat.com/posts/logistic-regression-python-rodeo.html



In [5]:

    
# Predictor columns for the classifier. `columns` is reused by the following
# cell to build its prediction frame, so the name is kept as-is.
columns = ['total_bill', 'party_size', 'ordered_alc_bev']
training_columns = tips[columns].columns

# Build the model through the lowercase formula constructor, the same style as
# the OLS cell above. The uppercase `Logit` class is not reliably exported by
# `statsmodels.formula.api`, so `sm.Logit(...)` fails on releases that omit it.
# The trailing `- 1` removes the intercept, so the design matrix holds exactly
# the three predictor columns that were previously passed in directly.
logit_formula = 'tip_above_avg ~ ' + ' + '.join(training_columns) + ' - 1'
logit = sm.logit(logit_formula, data=tips)   # Describe model

# Note: this rebinds `results`, replacing the OLS fit from the earlier cell.
results = logit.fit()                        # Fit model
results.summary()                            # Summarize model









    



Optimization terminated successfully.
         Current function value: 0.576896
         Iterations 6






    Out[5]:





Logit Regression Results

  Dep. Variable:    tip_above_avg     No. Observations:       244  


  Model:                Logit         Df Residuals:           241  


  Method:                MLE          Df Model:                 2  


  Date:           Mon, 16 Apr 2018    Pseudo R-squ.:       0.1457  


  Time:               23:47:54        Log-Likelihood:      -140.76 


  converged:            True          LL-Null:             -164.77 


                                    LLR p-value:        3.763e-11




                     coef      std err       z       P>|z|   [0.025     0.975]  


  total_bill          -0.1123      0.025     -4.524   0.000     -0.161     -0.064


  party_size           0.3599      0.172      2.095   0.036      0.023      0.697


  ordered_alc_bev      1.1323      0.280      4.039   0.000      0.583      1.682



In [6]:

    
# One hypothetical visit; values are ordered to match `columns`
# (defined in the model-fitting cell above).
data = [[10.52, 4, 0]]
df = pd.DataFrame(data=data, columns=columns)

# Predict with the fitted model and convert the result to a plain Python list.
predicted = results.predict(df)
predictions = predicted.tolist()

predictions









    Out[6]:





[0.5640802818887375]



In [7]:

    
from requests import get, post

# Three example visits sent to the local prediction service in a single call.
request_data = [
    {"total_bill": 22.50, "party_size": 3, "ordered_alc_bev": 1, "gender": "Female", "day": "Sat", "time": "Dinner"},
    {"total_bill": 18.62, "party_size": 3, "ordered_alc_bev": 1, "gender": "Female", "day": "Sat", "time": "Dinner"},
    {"total_bill": 17.14, "party_size": 3, "ordered_alc_bev": 0, "gender": "Female", "day": "Sat", "time": "Dinner"},
]

# Without a timeout, `requests` waits indefinitely, so an unresponsive service
# would hang the whole notebook run. Fail after 10 seconds instead.
response = post('http://localhost:5000/', json=request_data, timeout=10)

# Decode the JSON body; as the last expression it is rendered as cell output.
response.json()









    Out[7]:





{'total_bill': 58.26,
 'predicted_tip': 8.08,
 'tip_percentage': 0.14,
 'tip_above_average': 'No'}



In [ ]:

Dep. Variable:	tip_percent	R-squared:	0.334
Model:	OLS	Adj. R-squared:	0.312
Method:	Least Squares	F-statistic:	14.76
Date:	Mon, 16 Apr 2018	Prob (F-statistic):	1.89e-17
Time:	23:47:54	Log-Likelihood:	-916.52
No. Observations:	244	AIC:	1851.
Df Residuals:	235	BIC:	1883.
Df Model:	8
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	42.5140	3.720	11.429	0.000	35.186	49.842
C(ordered_alc_bev)[T.1]	1.7912	1.473	1.216	0.225	-1.110	4.692
C(gender)[T.Male]	0.3495	1.461	0.239	0.811	-2.528	3.227
C(day)[T.Sat]	-8.5745	3.188	-2.689	0.008	-14.856	-2.293
C(day)[T.Sun]	-7.9049	3.286	-2.406	0.017	-14.378	-1.432
C(day)[T.Thur]	-4.7105	4.048	-1.164	0.246	-12.686	3.265
C(time)[T.Lunch]	-2.1070	4.569	-0.461	0.645	-11.108	6.895
total_bill	-0.6502	0.097	-6.674	0.000	-0.842	-0.458
party_size	-1.5466	0.916	-1.689	0.093	-3.351	0.257

Omnibus:	172.635	Durbin-Watson:	2.119
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2382.625
Skew:	2.613	Prob(JB):	0.00
Kurtosis:	17.389	Cond. No.	206.

Dep. Variable:	tip_above_avg	No. Observations:	244
Model:	Logit	Df Residuals:	241
Method:	MLE	Df Model:	2
Date:	Mon, 16 Apr 2018	Pseudo R-squ.:	0.1457
Time:	23:47:54	Log-Likelihood:	-140.76
converged:	True	LL-Null:	-164.77
		LLR p-value:	3.763e-11

	coef	std err	z	P>\|z\|	[0.025	0.975]
total_bill	-0.1123	0.025	-4.524	0.000	-0.161	-0.064
party_size	0.3599	0.172	2.095	0.036	0.023	0.697
ordered_alc_bev	1.1323	0.280	4.039	0.000	0.583	1.682