Load the RetailMart.xlsx data from http://media.wiley.com/product_ancillary/6X/11186614/DOWNLOAD/ch06.zip
In [1]:
# code written for Python 3
import pandas as pd
import numpy as np
Load the customer account data, i.e., past product sales data
In [2]:
# adjust the path to point at your copy of RetailMart.xlsx
df_accounts = pd.read_excel('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch06/RetailMart.xlsx', sheet_name=0)
df_accounts = df_accounts.drop('Unnamed: 17', axis=1)  # drop empty column
df_accounts.rename(columns={'PREGNANT': 'Pregnant', 'Home/Apt/ PO Box': 'Residency'}, inplace=True)  # simpler column names
df_accounts.columns = [x.strip().replace(' ', '_') for x in df_accounts.columns]  # spaces in column names make them awkward to work with
df_accounts.head()
Out[2]:
We need to categorise the 'Pregnant' column so that it can take only one of two values: 1 = pregnant, 0 = not pregnant
In [3]:
df_accounts['Pregnant'] = df_accounts['Pregnant'].astype('category') # set col type
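As a quick sanity check (not part of the original walkthrough), we can look at how the two classes are distributed before modelling:
In [ ]:
# sketch: count pregnant (1) vs. not-pregnant (0) accounts
print(df_accounts['Pregnant'].value_counts())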
Following Greg Lamp's post on the Yhat blog, we need to 'dummify' (i.e., one-hot encode) the categorical variables: gender and residency
In [4]:
# dummify gender var
dummy_gender = pd.get_dummies(df_accounts['Implied_Gender'], prefix='Gender')
print(dummy_gender.head())
In [5]:
# dummify residency var
dummy_resident = pd.get_dummies(df_accounts['Residency'], prefix='Resident')
print(dummy_resident.head())
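As an aside, recent versions of pandas can drop one level per variable for us with drop_first=True, which avoids the dummy-variable trap (perfectly collinear dummy columns) without the manual slicing we do below; a minimal sketch:
In [ ]:
# sketch: drop_first=True removes the first level of each variable,
# leaving a set of dummies that is not perfectly collinear
pd.get_dummies(df_accounts[['Implied_Gender', 'Residency']], drop_first=True).head()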
In [6]:
# make a clean dataframe for the regression model
cols_to_keep = df_accounts.columns[2:-1]  # product columns only: drop the two raw categorical vars and 'Pregnant'
# add the dummy vars back in, leaving out the first level of each to avoid collinearity
data = pd.concat([dummy_gender.loc[:, 'Gender_M':], dummy_resident.loc[:, 'Resident_H':], df_accounts[cols_to_keep]], axis=1)
data.insert(0, 'Intercept', 1.0)  # manually add the intercept
data.head()
Out[6]:
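As an alternative to inserting the intercept by hand, statsmodels provides sm.add_constant; a minimal sketch that should be equivalent to the manual insert above:
In [ ]:
import statsmodels.api as sm
# sketch: add_constant prepends a column of 1.0s (named 'const'),
# equivalent to data.insert(0, 'Intercept', 1.0) above
sm.add_constant(data[data.columns[1:]]).head()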
In [7]:
import statsmodels.api as sm

train_cols = data.columns[1:]  # feature columns, without the intercept (sklearn adds its own later)
# statsmodels does not add an intercept automatically, so fit on the full
# 'data' frame, which includes the manually added intercept column
logit = sm.Logit(df_accounts['Pregnant'], data)
# fit the model
result = logit.fit()
In [8]:
print('Parameters:')
print(result.params)
print(result.summary())
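The fitted parameters are log-odds, which are hard to read directly. Exponentiating them (a standard transformation, not part of the original walkthrough) gives odds ratios:
In [ ]:
# sketch: an odds ratio of 2 means a one-unit increase in that feature
# roughly doubles the odds that the account holder is pregnant
print(np.exp(result.params))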
Logistic regression revisited with scikit-learn
In [9]:
# define X and y
X = data[train_cols]
y = df_accounts['Pregnant']
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# train a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)  # very large C, i.e., effectively no regularisation
logreg.fit(X_train, y_train)
# make predictions for testing set
y_pred_class = logreg.predict(X_test)
# calculate testing accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
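Accuracy alone can mislead, so it is worth comparing it against the null accuracy, i.e., the score from always predicting the most frequent class in the test set; a quick sketch:
In [ ]:
# sketch: null accuracy = proportion of the majority class;
# the model is only useful if it clearly beats this baseline
print(y_test.value_counts(normalize=True).max())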
In [21]:
# predict the probability of pregnancy
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
import matplotlib.pyplot as plt
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()
In [11]:
# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))
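AUC summarises ranking quality across all thresholds; to see what the default 0.5 cutoff actually does on the test set, here is a sketch of the confusion matrix:
In [ ]:
# sketch: rows are actual classes (0, 1), columns are predictions (0, 1)
print(metrics.confusion_matrix(y_test, y_pred_class))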
In [12]:
# histogram of predicted probabilities grouped by actual response value
df = pd.DataFrame({'probability':y_pred_prob, 'actual':y_test})
df.hist(column='probability', by='actual', sharex=True, sharey=True)
plt.show()
In [13]:
# calculate cross-validated AUC
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()
Out[13]:
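The mean on its own hides fold-to-fold variability; a short sketch that also reports the spread across the 10 folds:
In [ ]:
# sketch: mean and standard deviation of the cross-validated AUC
scores = cross_val_score(logreg, X, y, cv=10, scoring='roc_auc')
print(scores.mean(), scores.std())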
Random forest feature selection
In [211]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)  # 500 trees, use all CPU cores
clf.fit(data[train_cols], df_accounts['Pregnant'])
Out[211]:
In [212]:
# rank the features by importance
importances = clf.feature_importances_
df_features = pd.DataFrame({"Feature": train_cols})
df_features['Importance'] = importances
df_features = df_features.sort_values(by=['Importance'], ascending=[True])  # ascending, so the most important feature is plotted at the top
ax = df_features.plot(kind='barh', title ="Classification Feature Importance", figsize=(15, 10), legend=False, fontsize=12)
ax.set_xlabel("Importance", fontsize=12)
ax.set_yticklabels(df_features['Feature'])
plt.show()
We can see that the purchase of Folic Acid is a much better predictor of customer pregnancy than an interest in Prenatal Yoga, perhaps surprisingly (presumably more expectant mothers take folic acid than take up yoga). This information could be used to target advertisements for baby products more accurately.
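To quantify the gap rather than eyeball the chart, we can print the importances in descending order:
In [ ]:
# sketch: list the features numerically, most important first
print(df_features.sort_values('Importance', ascending=False).to_string(index=False))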
In [ ]: