In [1]:
from datetime import datetime as dt
import math
from string import Template
from IPython.display import display, YouTubeVideo, HTML, Image, Video
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
from pandas.tseries.offsets import *
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn as skl
from sklearn import datasets
# supporting code in python modules
import expStd
import baseVizBld
%matplotlib inline
sns.set(style="darkgrid")
plt.style.use('ggplot')
In [2]:
#%lsmagic
InteractiveShell.ast_node_interactivity = "all"
In [1]:
from traitlets.config.manager import BaseJSONConfigManager
path = "/Anaconda3/etc/jupyter/nbconfig"
cm = BaseJSONConfigManager(config_dir=path)
cm.update("livereveal", {
"theme": "simple",
"start_slideshow_at": "selected",
"scroll": True,
"slideNumber": True,
"transition": "slide",
"progress": True,
"viewDistance": 5,
})
Out[1]:
All data presented here is synthetic and has been generated using the supporting Python modules imported above (expStd and baseVizBld).
All the code required to run the examples presented here, as well as the presentation itself, can be found in a repo on GitHub. The link is available in the shared presentation documents.
Throughout the presentation I will include "toolbox" references highlighting the particular packages I'm using for each task.
All of the examples today are written in Python 3. I am presenting with a Jupyter notebook and the RISE Reveal.js extension.
Since I made up the data, we can safely assume that it passes validation.
In [3]:
HTML( "<h1>That data science venn diagram ...</h1>")
Image("https://static1.squarespace.com/static/5150aec6e4b0e340ec52710a/t/51525c33e4b0b3e0d10f77ab/1364352052403/Data_Science_VD.png")
Out[3]:
Out[3]:
In [4]:
HTML("<h1>I like this one better</h1>")
Image("https://3.bp.blogspot.com/-bvQxcwfqATQ/V-E_uTBc4VI/AAAAAAAAMGQ/Qa1Ntef-rs0E-mWx5pkVu-CPlREdvD0TwCLcB/s1600/VennDiagram2.png")
Out[4]:
Out[4]:
In [5]:
HTML("<h1> ... so why are we here today?</h1><br>")
Out[5]:
In [6]:
HTML("<h1>One more chart you've seen before</h1><br>")
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[6]:
Out[6]:
In this talk I'll be going through an example of each of these "types" of analytics and showing you how working as a team enables you to climb these stairs.
In [3]:
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[3]:
Make a dataset that emulates lapse experience for a savings product with cash value and surrender charges (a sketch of one possible generation approach follows below).
Randomly specify:
Let
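The expStd module used below is supporting code that is not shown in this notebook, so here is a minimal sketch, under my own assumptions, of how such a synthetic lapse study could be generated: a flat base lapse rate, an assumed shock lapse in the year the surrender charge expires, binomial draws for actual lapses, and a distortion parameter to deliberately break the fit. None of the rates or counts here come from expStd itself.
In [ ]:
import numpy as np
import pandas as pd

def build_synth_lapse(max_dur=20, distortion=0.0, seed=0):
    # hypothetical stand-in for expStd.buildSynthPop/process_synthExp
    rng = np.random.default_rng(seed)
    frames = []
    for chr_yrs in ['2', '4', '6', '8']:              # surrender-charge periods
        expected = np.full(max_dur, 0.05)             # assumed 5% base lapse rate
        expected[int(chr_yrs) - 1] = 0.30             # assumed shock at charge expiry
        exposed = rng.integers(500, 1500, max_dur)    # policies exposed per duration
        true_p = np.clip(expected * (1 + distortion * rng.standard_normal(max_dur)), 0, 1)
        lapses = rng.binomial(exposed, true_p)
        frames.append(pd.DataFrame({
            'chrYrs': chr_yrs,
            'duration': np.arange(1, max_dur + 1),
            'expected_lapseRate': expected,
            'actual_lapseRate': lapses / exposed,
        }))
    return pd.concat(frames).set_index(['duration', 'chrYrs'])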
In [7]:
def chargePlt(df, chrgYrs):
    ttl = Template('$n year charge')
    fig, axes = plt.subplots(ncols=2, nrows=2, figsize = (16,10))
    fig.suptitle('Actual versus Expected Lapse Rates', fontsize = 16)
    # one panel per surrender-charge period
    for ax, yrs in zip(axes.ravel(), chrgYrs):
        ax.set_title(ttl.substitute(n=yrs))
        ax.plot(df.xs(yrs, level='chrYrs').expected_lapseRate, ls='-', label = 'expected')
        ax.plot(df.xs(yrs, level='chrYrs').actual_lapseRate, ls='-', label = 'actual')
        ax.legend(loc='upper right')
        ax.set_xlabel('duration')
        ax.set_ylabel('rate')
    plt.show()
In [8]:
# good fit - simulate directly against expected
synthExp_gf = expStd.process_synthExp(expStd.buildSynthPop(10000, 5000, 1000000, 1, 20, 0))
# not a good fit - add distortion to the simulation
synthExp_ngf = expStd.process_synthExp(expStd.buildSynthPop(10000, 5000, 1000000, 1, 20, 0.1))
In [9]:
chargePlt(synthExp_gf, ['2','4','6','8'])
In [10]:
chargePlt(synthExp_ngf, ['2','4','6','8'])
In [11]:
# freeing some space
synthExp_gf = 0
synthExp_ngf = 0
In [8]:
vid = YouTubeVideo("93lrosBEW-Q", start=27, end =40, width = 1067, height = 600, autoplay=0)
display(vid)
In [10]:
HTML("<h2>Remember our trusty old venn diagram</h2><br>")
Image("https://static1.squarespace.com/static/5150aec6e4b0e340ec52710a/t/51525c33e4b0b3e0d10f77ab/1364352052403/Data_Science_VD.png")
Out[10]:
Out[10]:
In [4]:
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[4]:
In [14]:
HTML("<h3>Actuaries understand product features and the drivers of risk and value</h3> <br>")
Image("images/julia_icon.png")
Out[14]:
Out[14]:
In [15]:
HTML("<h3> \"Hackers\" (we'll call them developers from now on) bring a new perspective on how to process and represent data </h3><br>")
Image("images/eric_icon.png")
Out[15]:
Out[15]:
In [15]:
import matplotlib.patches as mpatches
def buildFullPremPlt(df):
    fig, axes = plt.subplots(ncols=1, nrows=1, figsize = (16,10))
    axes.set_title("Transactions")
    axes.set_xlabel('date')
    axes.set_ylabel('policy')
    return plt.plot_date(x = df.trxDt, y = df.polNo, marker = ',', color = 'green')

def buildIndPremPlt(df1, df2):
    fig, axes = plt.subplots(ncols=1, nrows=2, figsize = (16,10), sharex=True)
    fig.suptitle('Transactions Split by Industry', fontsize = 16)
    ax1, ax2 = axes.ravel()
    ax1.set_title("Education")
    ax2.set_title("Other")
    for ax in (ax1, ax2):
        ax.set_xlabel('date')
        ax.set_ylabel('policy')
    ax1.plot_date(x = df1.trxDt, y = df1.polNo, marker = ',', color = 'green')
    ax2.plot_date(x = df2.trxDt, y = df2.polNo, marker = ',', color = 'green')
    return fig, axes

def bldTypePlt(df, title, ax):
    ax.set_title(title)
    ax.set_xlabel('date')
    ax.set_ylabel('policy')
    # one colour and legend patch per transaction type
    type_styles = [('auto', 'yellow', 'Automatic'),
                   ('adhoc', 'c', 'Ad Hoc'),
                   ('rollover', 'm', 'Rollover'),
                   ('termination', 'k', 'Termination')]
    patches = []
    for prem_type, color, label in type_styles:
        sub = df[df['premType'] == prem_type]
        ax.plot_date(x = sub['trxDt'], y = sub['polNo'], marker = ',', color = color, label = prem_type)
        patches.append(mpatches.Patch(color = color, label = label))
    ax.legend(handles = patches, loc='upper left')
    return ax

def bldIndTypePlt(df1, df2):
    fig, axes = plt.subplots(ncols=1, nrows=2, figsize = (16,10), sharex=True)
    fig.suptitle('Transactions Split by Industry and Type', fontsize = 16)
    ax1, ax2 = axes.ravel()
    ax1 = bldTypePlt(df1, "Education", ax1)
    ax2 = bldTypePlt(df2, "Other", ax2)
    return fig, axes
In [4]:
if False:
    est = baseVizBld.buildRetPop(1500, pd.Timestamp('2002-01-01'), pd.Timestamp('2003-07-01'))
    trxes = baseVizBld.buildTrx(est)
    est.to_csv('data/premEx_pop.csv')
    trxes.to_csv('data/premEx_trx.csv')
else:
    est = pd.read_csv('data/premEx_pop.csv', index_col=0)
    # parse the transaction dates so plot_date receives datetimes, not strings
    trxes = pd.read_csv('data/premEx_trx.csv', index_col=0, parse_dates=['trxDt'])
In [5]:
if False:
    eduTrx = trxes.drop(trxes[trxes['educInd']==False].index, inplace = False)
    otherTrx = trxes.drop(trxes[trxes['educInd']==True].index, inplace = False)
else:
    eduTrx = trxes[trxes['educInd']==True]
    otherTrx = trxes[trxes['educInd']==False]
So we are in diagnostic land: why is such and such happening? We haven't been able to answer this question by looking at the data in our traditional analyses, so let's work with a hacker to dig more deeply into the data. In fact, we aren't going to do any aggregation at all!
In [8]:
buildFullPremPlt(trxes)
Out[8]:
We are building a pixel plot: every transaction is drawn as a single pixel (the ',' marker), with transaction date on the x-axis and policy number on the y-axis.
What are we seeing?
In [16]:
buildIndPremPlt(eduTrx, otherTrx)
Out[16]:
The contribution period of the "Other" group seems shorter, so the contribution end dates overlap between the older education-market policies and the younger "Other" policies.
In [17]:
bldIndTypePlt(eduTrx,otherTrx)
Out[17]:
Same plot, but now split by transaction type as well.
So by working with a developer we were able to view the data at a new level of detail, which enabled us to discover that people retire and then wait a while before moving their money.
In [23]:
trxes = 0
est = 0
eduTrx = 0
otherTrx = 0
In [5]:
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[5]:
Predictive analytics certainly requires a level of competency in writing code and engineering, and an understanding of the underlying probability, statistics, and credibility aspects of fitting and using a model. But those aren't actually the hard parts; they are skills you can be taught. What's hard about doing good predictive analytics is figuring out what is worth predicting and how the prediction would fit into anything you do operationally. That is why you need a business expert.
In [24]:
HTML("<h3>Adding a substantive expert in the business to your analytics team increases the team's ability to attack relevant business problems and provide on the ground actionable recomendations to end business users</h3>")
Image("images/rob_icon.png")
Out[24]:
Out[24]:
Imagine you work for a life insurer that sells a little in the diabetes market but would like to know whether it can expand that business.
We want to be smart and capture people who are doing well and have their disease under control; that is, people whose diabetes progression is fairly flat year over year.
We have a dataset on potential clients with diabetes, so our task is to build a model that predicts which potential clients will keep their diabetes under control, and therefore to whom we would sell.
In [18]:
from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
In [19]:
diabetes = datasets.load_diabetes()
cols = ['age','sex','bmi','avg_bp','s1','s2','s3','s4','s5','s6']
diabetes_df = pd.DataFrame(diabetes.data, columns = cols)
diabetes_df['disease_status_t1'] = diabetes.target
Data Validation and Feature Engineering
In [20]:
def cat_Status(scr):
    # bucket disease progression scores into three bands
    if scr < 151:
        return 'A'
    elif scr < 251:
        return 'B'
    else:
        return 'C'
In [21]:
diabetes_df['age_sex_cross'] = diabetes_df.age * diabetes_df.sex
diabetes_df['diseaseStatus_cat'] = diabetes_df['disease_status_t1'].apply(cat_Status)
In [22]:
xdata = diabetes_df.drop(['disease_status_t1', 'diseaseStatus_cat'], axis =1)
ydata = diabetes_df[['disease_status_t1', 'diseaseStatus_cat']]
In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size = 0.3, random_state = 0)
Let's limit our discussion to supervised models here.
Describe what you are trying to predict
What are you going to do with your predictions?
In [24]:
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
clf = neighbors.KNeighborsClassifier(15, weights='uniform')
In [25]:
HTML("<h4>Correlation matrix</h4>")
diabetes_df.corr().style
Out[25]:
Out[25]:
Teach the model how to predict what we want it to predict
In [26]:
clf.fit(X_train[['s5','bmi','s4']], y_train['diseaseStatus_cat'])
Out[26]:
Split the dataset and pick a metric
Actuaries are often really against not using all the data. I remember, early on, building a model to predict clients who were likely to make a subsequent contribution: I used a holdout of 20%, and all I got from my actuarial colleagues was "but you're going to retrain it with all the data before you deploy, right?" I think this comes from a good place, where we want to be as precise as possible and we feel like more is better, but flying blind is bad!
Model validation means splitting your dataset so that you don't test how well the model performs on the same observations you used to teach it. Imagine if the practice problems for exams were actually on the next exam!
Train-test vs. train-test-final splits
So you can see how well your model did.
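KFold was imported above but never used; as a hedged aside, here is one way k-fold cross-validation could answer the "but you're throwing data away" objection: every observation is held out exactly once, yet no fold is ever scored on data its model trained on.
In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold CV with the same features fit above; each fold trains on 80%
# of X_train and is scored on the remaining 20%
scores = cross_val_score(
    neighbors.KNeighborsClassifier(15, weights='uniform'),
    X_train[['s5','bmi','s4']], y_train['diseaseStatus_cat'],
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    scoring='accuracy')
'CV accuracy: {0:.1%} (+/- {1:.1%})'.format(scores.mean(), scores.std())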
In [27]:
import itertools
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def buildConfusionMatrix(cm, classes, ax,
                         normalize = False,
                         title = 'Confusion Matrix',
                         cmap = plt.cm.Blues
                         ):
    # optionally convert counts to row-wise rates
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    # annotate each cell with its count; white text on dark cells for readability
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment='center',
                color='white' if cm[i, j] > thresh else 'black')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
In [28]:
class_names = ['A','B','C']
cm_train = confusion_matrix(y_train['diseaseStatus_cat'],clf.predict(X_train[['s5','bmi','s4']]))
cm_test = confusion_matrix(y_test['diseaseStatus_cat'],clf.predict(X_test[['s5','bmi','s4']]))
np.set_printoptions(precision=2)
In [29]:
fig, axes = plt.subplots(ncols=2, nrows=1, figsize = (20,16))
ax1, ax2 = axes.ravel()
buildConfusionMatrix(cm_train, classes = class_names, ax = ax1, title = 'Confusion Matrix: Training Data')
buildConfusionMatrix(cm_test, classes = class_names, ax = ax2, title = 'Confusion Matrix: Testing Data')
Confusion matrix, precision, and recall
Confusion matrix: shows what your model predicted versus what the true result was (I'm going to show it as a heat map, so darker = more observations); you're aiming to see the darkest cells running in a line from the upper left to the lower right.
Precision and recall: precision is the share of predicted positives that are truly positive; recall is the share of true positives the model actually finds.
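As a toy illustration (the counts below are made up, not taken from our model), both metrics fall out of the confusion matrix directly:
In [ ]:
# assumed counts for a single class: true positives, false positives, false negatives
tp, fp, fn = 40, 10, 5
precision = tp / (tp + fp)   # of everything we flagged, the share that was right
recall = tp / (tp + fn)      # of everything truly positive, the share we found
'Precision: {0:.0%} Recall: {1:.0%}'.format(precision, recall)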
In [30]:
Image("images/precision_recall.png")
HTML("By Walber - Own work, CC BY-SA 4.0, https://commons.wikimedia.org/w/index.php?curid=36926283")
Out[30]:
Out[30]:
In [31]:
from sklearn.metrics import precision_score, recall_score
'Precision: {0:%} Recall: {1:%}'.format(
precision_score(y_test['diseaseStatus_cat'],clf.predict(X_test[['s5','bmi','s4']]), average = 'weighted'),
recall_score(y_test['diseaseStatus_cat'],clf.predict(X_test[['s5','bmi','s4']]), average = 'weighted'))
Out[31]:
In [32]:
HTML("<h2>Add BP to the predictors!</h2>")
clf2 = neighbors.KNeighborsClassifier(15, weights='uniform')
clf2.fit(X_train[['s5','bmi','s4','avg_bp']], y_train['diseaseStatus_cat'])
cm_train = confusion_matrix(y_train['diseaseStatus_cat'],clf2.predict(X_train[['s5','bmi','s4','avg_bp']]))
cm_test = confusion_matrix(y_test['diseaseStatus_cat'],clf2.predict(X_test[['s5','bmi','s4','avg_bp']]))
Out[32]:
Out[32]:
In [33]:
fig, axes = plt.subplots(ncols=2, nrows=1, figsize = (20,16))
ax1, ax2 = axes.ravel()
buildConfusionMatrix(cm_train, classes = class_names, ax = ax1, title = 'Confusion Matrix: Training Data')
buildConfusionMatrix(cm_test, classes = class_names, ax = ax2, title = 'Confusion Matrix: Testing Data')
In [34]:
from sklearn.metrics import precision_score, recall_score
'Precision: {0:%} Recall: {1:%}'.format(
precision_score(y_test['diseaseStatus_cat'],clf2.predict(X_test[['s5','bmi','s4','avg_bp']]), average = 'weighted'),
recall_score(y_test['diseaseStatus_cat'],clf2.predict(X_test[['s5','bmi','s4','avg_bp']]), average = 'weighted'))
Out[34]:
In [6]:
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[6]:
In [4]:
HTML("<h3>Good prescriptive analytics requires a skilled and effective analytics team </h3>")
Image("images/analyticsteam.png")
Out[4]:
Out[4]:
In [35]:
HTML("<h3>... it also requires allies</h3>")
Image("images/allies.png")
Out[35]:
Out[35]:
In [7]:
Image("images/ABMs.png")
Out[7]:
Digging into these "why" questions requires deep expertise around the business and its behavior; the actuary and the business person need to work together, and even draw on external resources, to develop the mental model.
... and then you have to take your mental model and put it into code, and, brother, you'd better believe that takes chops: you have to get in and engineer and problem-solve together.
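To make that concrete, here is a toy sketch, entirely my own assumption rather than the model behind these slides, of what an agent-based lapse model can look like: each policyholder agent weighs the surrender charge against a growing urge to move their money after retirement.
In [ ]:
import random

class Policyholder:
    # hypothetical agent; not the ABM used in the talk
    def __init__(self, age):
        self.age = age
        self.lapsed = False

    def step(self, surrender_charge):
        if self.lapsed:
            return
        # assumed behavior: the charge suppresses lapses; once it expires,
        # older retirees grow increasingly likely to move their money
        if surrender_charge > 0:
            p_lapse = 0.02
        else:
            p_lapse = 0.05 + 0.01 * max(0, self.age - 65)
        self.lapsed = random.random() < p_lapse

random.seed(0)
agents = [Policyholder(age=random.randint(55, 75)) for _ in range(1000)]
for year in range(10):
    charge = max(0, 5 - year)   # assumed 5-year surrender charge schedule
    for agent in agents:
        agent.step(charge)
        agent.age += 1
sum(agent.lapsed for agent in agents)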
In [8]:
Image("images/usingABMs.png")
Out[8]:
Again
In [9]:
Image("images/whenABM.png")
Out[9]:
In [36]:
Image("https://media.licdn.com/mpr/mpr/shrinknp_800_800/AAEAAQAAAAAAAAjIAAAAJGI0NzY3MGM0LTIyMTEtNDYwYy04OWQ2LTgyYmZiNDgzNTlhNw.png")
Out[36]:
In [10]:
Image("https://static1.squarespace.com/static/5150aec6e4b0e340ec52710a/t/51525c33e4b0b3e0d10f77ab/1364352052403/Data_Science_VD.png")
Out[10]:
In [11]:
Image("images/analyticsteam.png")
Out[11]: