Regression and Prediction


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as sm
from scipy import stats
def _chisqprob(chisq, df):
    """Return the chi-square survival function (upper-tail probability).

    Thin wrapper around ``stats.chi2.sf(chisq, df)``.
    """
    return stats.chi2.sf(chisq, df)


# Install the wrapper under the name `chisqprob` on scipy.stats.
# NOTE(review): presumably a compatibility shim for a library that still looks
# up `scipy.stats.chisqprob` -- confirm it is still required.
stats.chisqprob = _chisqprob

Load up some data


In [2]:
# Read the raw tips data from disk.
tips = pd.read_csv('input/tips.csv')

# Tip expressed as a percentage of the total bill.
tips['tip_percent'] = tips['tip'] / tips['total_bill'] * 100

# Binary flag: 1 when the tip percentage is at or above the sample mean, else 0.
mean_tip_percent = tips['tip_percent'].mean()
tips['tip_above_avg'] = np.where(tips['tip_percent'] >= mean_tip_percent, 1, 0)

# Encode every 'Yes'/'No' value in the frame as 1/0 (applies to all columns).
# Done as a re-assignment rather than inplace=True so the step is an explicit
# expression, and followed by infer_objects() so the replaced columns end up
# with a numeric dtype even on pandas versions where replace() no longer
# downcasts object columns implicitly.
tips = tips.replace({'Yes': 1, 'No': 0}).infer_objects()

In [3]:
# R-style formula: `tip_percent` is the response; terms wrapped in C(...) are
# treated as categorical (see the `[T.<level>]` rows in the summary).
formula = 'tip_percent ~ total_bill + party_size + C(ordered_alc_bev) + C(gender) + C(day) + C(time)'

# Specify the OLS model from the formula, then estimate it on the tips frame.
model = sm.ols(formula=formula, data=tips)
results = model.fit()

# Last expression of the cell: display the regression summary table.
results.summary()


Out[3]:
OLS Regression Results
Dep. Variable: tip_percent R-squared: 0.334
Model: OLS Adj. R-squared: 0.312
Method: Least Squares F-statistic: 14.76
Date: Mon, 16 Apr 2018 Prob (F-statistic): 1.89e-17
Time: 23:47:54 Log-Likelihood: -916.52
No. Observations: 244 AIC: 1851.
Df Residuals: 235 BIC: 1883.
Df Model: 8
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 42.5140 3.720 11.429 0.000 35.186 49.842
C(ordered_alc_bev)[T.1] 1.7912 1.473 1.216 0.225 -1.110 4.692
C(gender)[T.Male] 0.3495 1.461 0.239 0.811 -2.528 3.227
C(day)[T.Sat] -8.5745 3.188 -2.689 0.008 -14.856 -2.293
C(day)[T.Sun] -7.9049 3.286 -2.406 0.017 -14.378 -1.432
C(day)[T.Thur] -4.7105 4.048 -1.164 0.246 -12.686 3.265
C(time)[T.Lunch] -2.1070 4.569 -0.461 0.645 -11.108 6.895
total_bill -0.6502 0.097 -6.674 0.000 -0.842 -0.458
party_size -1.5466 0.916 -1.689 0.093 -3.351 0.257
Omnibus: 172.635 Durbin-Watson: 2.119
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2382.625
Skew: 2.613 Prob(JB): 0.00
Kurtosis: 17.389 Cond. No. 206.

In [4]:
# A single hypothetical bill to score with the model currently held in
# `results`; column names match the variables used in the formula.
columns = ['total_bill', 'gender', 'ordered_alc_bev', 'day', 'time', 'party_size']
data = [
    [15.52, 'Female', 0, 'Sun', 'Lunch', 1],
]
df = pd.DataFrame(data=data, columns=columns)

# predict() returns a pandas object; flatten it to a plain Python list
# so the cell displays bare numbers.
predicted = results.predict(df)
predictions = predicted.tolist()
predictions


Out[4]:
[20.8648081851993]

In [5]:
# Predictors for the logistic model of `tip_above_avg`.
columns = ['total_bill', 'party_size', 'ordered_alc_bev']
training_columns = tips[columns].columns

# `sm` is statsmodels.formula.api, which provides the lowercase, formula-based
# constructors (sm.ols, sm.logit, ...). The array-style `Logit` class is not
# part of that namespace in current statsmodels, so build the model from a
# formula instead, consistent with the OLS cell.
# The trailing "- 1" drops the intercept: the original design matrix was just
# these three columns with no constant term, so this keeps the same model.
logit_formula = 'tip_above_avg ~ ' + ' + '.join(training_columns) + ' - 1'

logit = sm.logit(logit_formula, data=tips)  # Describe model
results = logit.fit()                       # Fit model
results.summary()                           # Summarize model


Optimization terminated successfully.
         Current function value: 0.576896
         Iterations 6
Out[5]:
Logit Regression Results
Dep. Variable: tip_above_avg No. Observations: 244
Model: Logit Df Residuals: 241
Method: MLE Df Model: 2
Date: Mon, 16 Apr 2018 Pseudo R-squ.: 0.1457
Time: 23:47:54 Log-Likelihood: -140.76
converged: True LL-Null: -164.77
LLR p-value: 3.763e-11
coef std err z P>|z| [0.025 0.975]
total_bill -0.1123 0.025 -4.524 0.000 -0.161 -0.064
party_size 0.3599 0.172 2.095 0.036 0.023 0.697
ordered_alc_bev 1.1323 0.280 4.039 0.000 0.583 1.682

In [6]:
# One hypothetical observation; values are ordered to line up with `columns`.
data = [
    [10.52, 4, 0],
]
df = pd.DataFrame(data=data, columns=columns)

# Score it with the model held in `results` and flatten to a plain list.
predicted = results.predict(df)
predictions = predicted.tolist()

predictions


Out[6]:
[0.5640802818887375]

In [7]:
from requests import get, post
# Line items sent to the locally running prediction service as a JSON array.
request_data = [
    {"total_bill": 22.50, "party_size": 3, "ordered_alc_bev": 1, "gender": "Female", "day": "Sat", "time": "Dinner"},
    {"total_bill": 18.62, "party_size": 3, "ordered_alc_bev": 1, "gender": "Female", "day": "Sat", "time": "Dinner"},
    {"total_bill": 17.14, "party_size": 3, "ordered_alc_bev": 0, "gender": "Female", "day": "Sat", "time": "Dinner"},
]

# requests has no default timeout, so without one a stalled or absent server
# would block the kernel indefinitely.
response = post('http://localhost:5000/', json=request_data, timeout=10)

# Surface HTTP 4xx/5xx responses as a clear HTTPError instead of letting a
# non-JSON error page fail later inside .json().
response.raise_for_status()

# Last expression of the cell: the decoded JSON body of the reply.
response.json()


Out[7]:
{'total_bill': 58.26,
 'predicted_tip': 8.08,
 'tip_percentage': 0.14,
 'tip_above_average': 'No'}

In [ ]: