In [1]:
# code written in Python 3

import pandas as pd
import numpy as np

Load the customer account data, i.e., past product sales data


In [2]:
# set the path to your RetailMart.xlsx
df_accounts = pd.read_excel('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch06/RetailMart.xlsx', sheet_name=0)
df_accounts = df_accounts.drop('Unnamed: 17', axis=1) # drop empty column
df_accounts.rename(columns={'PREGNANT':'Pregnant'}, inplace=True)
df_accounts.rename(columns={'Home/Apt/ PO Box':'Residency'}, inplace=True) # use a simpler column name
df_accounts.columns = [x.strip().replace(' ', '_') for x in df_accounts.columns] # Python identifiers cannot contain spaces
df_accounts.head()


Out[2]:
Implied_Gender Residency Pregnancy_Test Birth_Control Feminine_Hygiene Folic_Acid Prenatal_Vitamins Prenatal_Yoga Body_Pillow Ginger_Ale Sea_Bands Stopped_buying_ciggies Cigarettes Smoking_Cessation Stopped_buying_wine Wine Maternity_Clothes Pregnant
0 M A 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
1 M H 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
2 M H 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
3 U H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
4 F A 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1

We need to categorise the 'Pregnant' column so that it can take only one of two possibilities: here 1 = pregnant and 0 = not pregnant


In [3]:
df_accounts['Pregnant'] = df_accounts['Pregnant'].astype('category') # set col type
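
Before modelling, it is worth a quick sanity check of the class balance (a minimal check on the column we just converted):


In [ ]:
# count pregnant (1) vs. not pregnant (0) accounts
print(df_accounts['Pregnant'].value_counts())
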

Following Greg Lamp over at the Yhat blog, we need to 'dummify' (i.e., separate out) the categorical variables: gender and residency


In [4]:
# dummify gender var
dummy_gender = pd.get_dummies(df_accounts['Implied_Gender'], prefix='Gender')
print(dummy_gender.head())


   Gender_F  Gender_M  Gender_U
0         0         1         0
1         0         1         0
2         0         1         0
3         0         0         1
4         1         0         0

In [5]:
# dummify residency var
dummy_resident = pd.get_dummies(df_accounts['Residency'], prefix='Resident')
print(dummy_resident.head())


   Resident_A  Resident_H  Resident_P
0           1           0           0
1           0           1           0
2           0           1           0
3           0           1           0
4           1           0           0
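
Note that the next cell keeps only 'Gender_M'/'Gender_U' and 'Resident_H'/'Resident_P', dropping the first level of each dummy set to avoid perfect collinearity with the intercept (the 'dummy-variable trap'). A minimal alternative, assuming pandas >= 0.18, does the same in one step:


In [ ]:
# drop_first=True removes the first dummy level automatically
dummy_gender = pd.get_dummies(df_accounts['Implied_Gender'], prefix='Gender', drop_first=True)
dummy_resident = pd.get_dummies(df_accounts['Residency'], prefix='Resident', drop_first=True)
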

In [6]:
# make a clean dataframe for the regression model
cols_to_keep = df_accounts.columns[2:len(df_accounts.columns)-1] # keep the product columns; drop 'Implied_Gender', 'Residency' and 'Pregnant'
# add the dummy vars back in (first level of each already dropped)
data = pd.concat([dummy_gender.loc[:, 'Gender_M':], dummy_resident.loc[:, 'Resident_H':], df_accounts[cols_to_keep]], axis=1)
data.insert(0, 'Intercept', 1.0) # manually add the intercept
data.head()


Out[6]:
Intercept Gender_M Gender_U Resident_H Resident_P Pregnancy_Test Birth_Control Feminine_Hygiene Folic_Acid Prenatal_Vitamins Prenatal_Yoga Body_Pillow Ginger_Ale Sea_Bands Stopped_buying_ciggies Cigarettes Smoking_Cessation Stopped_buying_wine Wine Maternity_Clothes
0 1.0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
1 1.0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
2 1.0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0
3 1.0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4 1.0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0

In [7]:
import statsmodels.api as sm

train_cols = data.columns[1:]
logit = sm.Logit(df_accounts['Pregnant'], data[train_cols])

# fit the model
result = logit.fit()


Optimization terminated successfully.
         Current function value: 0.373878
         Iterations 8

In [8]:
print('Parameters:')
print(result.params)
print(result.summary())


Parameters:
Gender_M                 -0.616638
Gender_U                 -0.021343
Resident_H               -0.367855
Resident_P               -0.209655
Pregnancy_Test            2.319395
Birth_Control            -2.400243
Feminine_Hygiene         -2.084057
Folic_Acid                4.048098
Prenatal_Vitamins         2.402392
Prenatal_Yoga             2.969468
Body_Pillow               1.256653
Ginger_Ale                1.884790
Sea_Bands                 0.940477
Stopped_buying_ciggies    1.218470
Cigarettes               -1.528444
Smoking_Cessation         1.750957
Stopped_buying_wine       1.276292
Wine                     -1.676815
Maternity_Clothes         2.003167
dtype: float64
                           Logit Regression Results                           
==============================================================================
Dep. Variable:               Pregnant   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      981
Method:                           MLE   Df Model:                           18
Date:                Mon, 05 Dec 2016   Pseudo R-squ.:                  0.4606
Time:                        00:46:36   Log-Likelihood:                -373.88
converged:                       True   LL-Null:                       -693.15
                                        LLR p-value:                6.050e-124
==========================================================================================
                             coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------
Gender_M                  -0.6166      0.177     -3.481      0.000        -0.964    -0.269
Gender_U                  -0.0213      0.295     -0.072      0.942        -0.599     0.556
Resident_H                -0.3679      0.165     -2.228      0.026        -0.691    -0.044
Resident_P                -0.2097      0.317     -0.662      0.508        -0.830     0.411
Pregnancy_Test             2.3194      0.525      4.418      0.000         1.290     3.348
Birth_Control             -2.4002      0.360     -6.670      0.000        -3.106    -1.695
Feminine_Hygiene          -2.0841      0.337     -6.182      0.000        -2.745    -1.423
Folic_Acid                 4.0481      0.764      5.300      0.000         2.551     5.545
Prenatal_Vitamins          2.4024      0.372      6.463      0.000         1.674     3.131
Prenatal_Yoga              2.9695      1.158      2.565      0.010         0.701     5.238
Body_Pillow                1.2567      0.860      1.462      0.144        -0.429     2.942
Ginger_Ale                 1.8848      0.428      4.401      0.000         1.045     2.724
Sea_Bands                  0.9405      0.671      1.401      0.161        -0.376     2.256
Stopped_buying_ciggies     1.2185      0.340      3.586      0.000         0.552     1.885
Cigarettes                -1.5284      0.365     -4.185      0.000        -2.244    -0.813
Smoking_Cessation          1.7510      0.514      3.405      0.001         0.743     2.759
Stopped_buying_wine        1.2763      0.302      4.232      0.000         0.685     1.867
Wine                      -1.6768      0.340     -4.927      0.000        -2.344    -1.010
Maternity_Clothes          2.0032      0.330      6.074      0.000         1.357     2.650
==========================================================================================
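
The coefficients are log-odds, so exponentiating them gives odds ratios, which are easier to interpret (a quick sketch; e.g., buying folic acid multiplies the odds of pregnancy by roughly exp(4.05) ≈ 57):


In [ ]:
# convert the log-odds coefficients into odds ratios
print(np.exp(result.params))
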

Logistic regression revisited with scikit-learn


In [9]:
# define X and y
X = data[train_cols]
y = df_accounts['Pregnant']

# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# train a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

# make predictions for testing set
y_pred_class = logreg.predict(X_test)

# calculate testing accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))


0.88
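
Raw accuracy is only meaningful against the null baseline of always predicting the majority class, so it is worth comparing (a quick check on the same test split):


In [ ]:
# null accuracy: the proportion of the majority class in the test set
print(y_test.value_counts(normalize=True).max())
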

In [21]:
# predict probability of pregnancy
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

import matplotlib.pyplot as plt

# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()



In [11]:
# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))


0.94394259722
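
AUC summarises performance across all possible thresholds; to inspect a single operating point we can tabulate the confusion matrix at the default 0.5 cut-off (a minimal sketch reusing the class predictions from above):


In [ ]:
# rows = actual class, columns = predicted class
print(metrics.confusion_matrix(y_test, y_pred_class))
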

In [12]:
# histogram of predicted probabilities grouped by actual response value
df = pd.DataFrame({'probability':y_pred_prob, 'actual':y_test})
df.hist(column='probability', by='actual', sharex=True, sharey=True)
plt.show()



In [13]:
# calculate cross-validated AUC
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()


Out[13]:
0.89871999999999996
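
The .mean() hides fold-to-fold variation, so printing the individual scores shows the spread (the same call without averaging):


In [ ]:
scores = cross_val_score(logreg, X, y, cv=10, scoring='roc_auc')
print(scores)       # AUC for each of the 10 folds
print(scores.std()) # spread across folds
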

Random forest feature selection


In [211]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
clf.fit(data[train_cols], df_accounts['Pregnant'])


Out[211]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [212]:
# rank the features by importance
importances = clf.feature_importances_
df_features = pd.DataFrame({"Feature": train_cols})
df_features['Importance'] = importances

df_features = df_features.sort_values(by=['Importance'], ascending=[True]) # sort from least to most important
ax = df_features.plot(kind='barh', title ="Classification Feature Importance", figsize=(15, 10), legend=False, fontsize=12)
ax.set_xlabel("Importance", fontsize=12)
ax.set_yticklabels(df_features['Feature'])
plt.show()


We can see that the purchase of folic acid is a much stronger predictor of customer pregnancy than, perhaps surprisingly, an interest in prenatal yoga (presumably more expectant mothers take folic acid than take up yoga). This information could be used to target the advertising of baby products more accurately.
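
As a hypothetical next step, the fitted scikit-learn model from above could flag customers whose predicted probability of pregnancy exceeds a chosen cut-off (the 0.8 threshold here is illustrative, not derived from the data):


In [ ]:
# flag test-set customers for the baby-products campaign (0.8 is an illustrative threshold)
targets = X_test[y_pred_prob > 0.8]
print(len(targets), 'customers flagged')
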

