In [70]:
%load_ext autoreload
%autoreload 2

from munging import imputation
from munging import inspection
from munging import utility
from munging import transformation
from munging import feature
from munging import performance


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [115]:
# Load the raw customer table. read_table uses a tab delimiter by default, which matches the .tsv file.
custdata = pd.read_table("data/custdata.tsv")

In [116]:
custdata.head()


Out[116]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res
0 2068 F NaN 11300 Married True Homeowner free and clear False 2 49 Michigan
1 2073 F NaN 0 Married True Rented True 3 40 Florida
2 2848 M True 4500 Never Married False Rented True 3 22 Georgia
3 5641 M True 20000 Never Married False Occupied with no rent False 0 22 New Mexico
4 6369 F True 12000 Never Married True Rented True 1 31 Florida

Missing Values


In [106]:
inspection.na_pattern(custdata)


Out[106]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res occurrence
0 - - missing - - - - - - - - 280
1 - - - - - - - - - - - 664
2 - - - - - - missing missing missing - - 8
3 - - missing - - - missing missing missing - - 48
4 0 0 328 0 0 0 56 56 56 0 0 1000

In [107]:
# Collect the columns the inspection helper reports as categorical.
# The result is reused below for categorical imputation and for listing each column's distinct values.
categorical_features = inspection.find_categorical_features(custdata)
print categorical_features


['sex' 'is.employed' 'marital.stat' 'health.ins' 'housing.type'
 'recent.move' 'state.of.res']

In [108]:
df = imputation.imput_categorical_features(custdata, categorical_features)

In [109]:
inspection.na_pattern(df)


Out[109]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res occurrence
0 - - - - - - - - - - - 944
1 - - - - - - - - missing - - 56
2 0 0 0 0 0 0 0 0 56 0 0 1000

In [110]:
# Print the distinct values of every categorical column of the imputed frame.
# In the recorded output the former gaps show up as the literal value 'missing'.
for f in categorical_features:
    print pd.unique(df[f])


['F' 'M']
['missing' True False]
['Married' 'Never Married' 'Divorced/Separated' 'Widowed']
[True False]
['Homeowner free and clear' 'Rented' 'Occupied with no rent'
 'Homeowner with mortgage/loan' 'missing']
[False True 'missing']
['Michigan' 'Florida' 'Georgia' 'New Mexico' 'New York' 'Idaho' 'Illinois'
 'North Carolina' 'Indiana' 'New Hampshire' 'South Carolina' 'Pennsylvania'
 'Virginia' 'New Jersey' 'Ohio' 'Missouri' 'California' 'Wisconsin'
 'South Dakota' 'Colorado' 'Oklahoma' 'Kentucky' 'Massachusetts' 'Texas'
 'Maryland' 'Connecticut' 'Mississippi' 'Arkansas' 'Washington'
 'Rhode Island' 'Minnesota' 'Alaska' 'Tennessee' 'Iowa' 'Kansas' 'Oregon'
 'Alabama' 'Arizona' 'West Virginia' 'Louisiana' 'Vermont' 'Maine' 'Hawaii'
 'Utah' 'Wyoming' 'Nebraska' 'Nevada' 'Delaware' 'Montana' 'North Dakota']

In [111]:
# List the columns reported as numerical, then impute them.
# NOTE: the raw custdata is passed here, not the categorical-imputed df from the cells above,
# so this rebinding of df discards the categorical imputation; the next na_pattern output
# shows the categorical gaps again together with a new num.vehicles_isna column.
numerical_features = inspection.find_numerical_features(custdata)
print numerical_features
df = imputation.imput_numerical_features(custdata, numerical_features, )


['custid' 'income' 'num.vehicles' 'age']

In [112]:
inspection.na_pattern(df)


Out[112]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res num.vehicles_isna occurrence
0 - - missing - - - - - - - - - 280
1 - - - - - - - - - - - - 664
2 - - - - - - missing missing - - - - 8
3 - - missing - - - missing missing - - - - 48
4 0 0 328 0 0 0 56 56 0 0 0 0 1000

In [112]:


In [113]:
# Impute the raw table in a single call and re-check the missing-value pattern.
# The recorded output reports zero missing cells in every column after this call.
df = imputation.imput(custdata)
inspection.na_pattern(df)


Out[113]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res num.vehicles_isna occurrence
0 - - - - - - - - - - - - 1000
1 0 0 0 0 0 0 0 0 0 0 0 0 1000

In [114]:
df.head()


Out[114]:
custid sex is.employed income marital.stat health.ins housing.type recent.move num.vehicles age state.of.res num.vehicles_isna
0 2068 F missing 11300 Married True Homeowner free and clear False 2 49 Michigan False
1 2073 F missing 0 Married True Rented True 3 40 Florida False
2 2848 M True 4500 Never Married False Rented True 3 22 Georgia False
3 5641 M True 20000 Never Married False Occupied with no rent False 0 22 New Mexico False
4 6369 F True 12000 Never Married True Rented True 1 31 Florida False

In [13]:
# Count the rows flagged by the indicator column; the recorded result (56) equals the number of
# rows with a missing num.vehicles in the first na_pattern output.
df["num.vehicles_isna"].sum()


Out[13]:
56

Transformations


In [14]:
df = imputation.imput(custdata)

In [15]:
inspection.plot_features_density(df)



In [16]:
inspection.plot_features_density(df, plot_type="hist", )



In [17]:
skewed_feats = inspection.find_features_skewed(df, 20)
skewed_feats


Out[17]:
array(['income'], 
      dtype='|S6')

In [18]:
# Only rows with strictly positive income are passed to the log transform
# (the logarithm is undefined at zero and below), then the densities are re-plotted.
log_df = transformation.log_transform(df[df.income > 0], skewed_feats)
inspection.plot_features_density(log_df)



In [19]:
## arcsinh - assume negative income is valid (e.g., debt)
# Unlike the log cell, the full df is passed in: no rows are filtered out beforehand.
arc_df = transformation.arcsinh_transform(df, skewed_feats)
inspection.plot_features_density(arc_df)



In [20]:
## signed_log - assume negative income is valid (e.g., debt)
# As with arcsinh, the unfiltered df is transformed, so zero and negative incomes are kept.
slog_df = transformation.signed_log_transform(df, skewed_feats)
inspection.plot_features_density(slog_df)



In [21]:
inspection.plot_features_density(slog_df, plot_type="hist")


Pairwise plotting to find useful single variables and correlations between variables


In [22]:
# Rebuild the working frame for the pairwise plots:
# impute the raw table, keep rows with positive income, then log-transform income.
imputed = imputation.imput(custdata)
positive_income = imputed[imputed.income > 0]
df = transformation.log_transform(positive_income, ["income"])

In [23]:
print inspection.find_numerical_features(df)
print inspection.find_categorical_features(df)


['custid' 'income' 'num.vehicles' 'age' 'log_income']
['sex' 'is.employed' 'marital.stat' 'health.ins' 'housing.type'
 'recent.move' 'state.of.res' 'num.vehicles_isna']

In [24]:
inspection.plot_feature_pair(df, "age", "log_income")



In [25]:
inspection.plot_feature_pair(df, "marital.stat", "income")



In [26]:
inspection.plot_feature_pair(df, "age", "marital.stat")



In [27]:
inspection.plot_feature_pair(df, "marital.stat", "sex")



In [28]:
inspection.plot_feature_pair(df, "sex", "marital.stat")



In [29]:
inspection.plot_feature_pair(df, "num.vehicles", "health.ins", legend = True)


Discretization of continuous variables


In [30]:
print utility.is_discrete(df, "num.vehicles")
print utility.is_discrete(df, "custid")


False
False

In [31]:
## default qcut
# No feat_bins entry is supplied for num.vehicles, so the helper's default binning applies.
# NOTE(review): presumably max_qcut caps the number of quantile bins - confirm in transformation.discretize_numerical.
r = transformation.discretize_numerical(df, ["num.vehicles"], max_qcut = 100)
print np.unique(r["discrete_num.vehicles"])
# Show the original and discretized columns side by side.
r.loc[:, ["num.vehicles", "discrete_num.vehicles"]].head()


['(1.717, 2]' '(2, 3]' '(3.364, 4]' '(4, 5]' '(5, 6]' '[0, 1]']
Out[31]:
num.vehicles discrete_num.vehicles
0 2 (1.717, 2]
2 3 (2, 3]
3 0 [0, 1]
4 1 [0, 1]
5 1 [0, 1]

In [32]:
## equal bin size cut
# An integer in feat_bins asks for that many bins for the feature (10 for income).
# NOTE: np.unique sorts the string labels lexicographically, so the printed order is not numeric.
# Only 9 labels appear in the recorded output: the bin (492006, 553503] contains no rows.
r = transformation.discretize_numerical(df, ["income"], feat_bins = {"income": 10})
print np.unique(r["discrete_income"])
# Show the original and discretized columns side by side.
r.loc[:, ["income", "discrete_income"]].head()


['(123024, 184521]' '(184521, 246018]' '(246018, 307515]'
 '(307515, 369012]' '(369012, 430509]' '(430509, 492006]'
 '(553503, 615000]' '(61527, 123024]' '[-584.97, 61527]']
Out[32]:
income discrete_income
0 11300 [-584.97, 61527]
2 4500 [-584.97, 61527]
3 20000 [-584.97, 61527]
4 12000 [-584.97, 61527]
5 180000 (123024, 184521]

In [33]:
## customized bins cut
r = transformation.discretize_numerical(df, ["age"], feat_bins = {"age": [-100, 0, 25, 30, 50, 60, 100, 150]})
print np.unique(r["discrete_age"])
r.loc[:, ["age", "discrete_age"]].head()


['(0, 25]' '(100, 150]' '(25, 30]' '(30, 50]' '(50, 60]' '(60, 100]'
 '[-100, 0]']
Out[33]:
age discrete_age
0 49 (30, 50]
2 22 (0, 25]
3 22 (0, 25]
4 31 (30, 50]
5 40 (30, 50]

In [34]:
inspection.plot_feature_pair(r, "discrete_age", "health.ins")


Feature Engineering


In [97]:
# Every categorical column except the target is used as an input to the extractor.
target = "health.ins"
categorical_feats = [name for name in inspection.find_categorical_features(r) if name != target]
extractor = feature.BiClassProbabilityFeatureExtractor()
# NOTE(review): the extractor is fit and applied on the same rows, so the scores computed
# from rr further down are in-sample - confirm whether a hold-out split is wanted.
rr = extractor.fit_transform(r, categorical_feats, target)
# Transpose the first two rows so that all columns are visible at once.
rr.head(n = 2).T


Out[97]:
0 2
custid 2068 2848
sex F M
is.employed missing True
income 11300 4500
marital.stat Married Never Married
health.ins True False
housing.type Homeowner free and clear Rented
recent.move False True
num.vehicles 2 3
age 49 22
state.of.res Michigan Georgia
num.vehicles_isna False False
log_income 9.332558 8.411833
discrete_age (30, 50] (0, 25]
health.insIsTrue_on_sex 0.8931298 0.8541667
health.insIsTrue_on_is.employed 0.9496124 0.8597663
health.insIsTrue_on_marital.stat 0.9094737 0.7365854
health.insIsTrue_on_housing.type 0.9411765 0.7854985
health.insIsTrue_on_recent.move 0.8791774 0.8214286
health.insIsTrue_on_state.of.res 0.875 0.8571429
health.insIsTrue_on_num.vehicles_isna 0.8719101 0.8719101
health.insIsTrue_on_discrete_age 0.8380682 0.7111111

Performance Calibration


In [98]:
# Draw one density plot per derived probability column (names containing '_on_'),
# comparing the column against the health.ins label.
# The grid is sized from the number of such columns instead of a fixed 4x2 layout, so the
# cell keeps working when the set of derived features changes (a fixed grid raises
# IndexError with more than eight columns and leaves blank panels with fewer).
prob_feats = [c for c in rr.columns if '_on_' in c]
ncols = 2
nrows = max(1, (len(prob_feats) + ncols - 1) // ncols)  # ceiling division, at least one row
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, figsize = (ncols * 4, nrows * 4))
axes = axes.ravel()
for iax, f in enumerate(prob_feats):
    performance.biclassification_density_plot(rr['health.ins'], rr[f], y_name="health.ins", yhat_name=f, ax = axes[iax])
# Hide panels that received no plot (only happens when the column count does not fill the grid).
for unused_ax in axes[len(prob_feats):]:
    unused_ax.set_visible(False)



In [99]:
# Sanity check: plot the label against a lightly jittered copy of itself.
# The plotting helper lives in the `performance` module imported at the top of the notebook;
# `calibration` is never imported, so the previous call raised NameError on a fresh kernel.
# A fixed-seed generator makes the jitter (and therefore the figure) reproducible.
jitter = np.random.RandomState(0).randn(rr.shape[0]) * 0.01
performance.biclassification_density_plot(rr['health.ins'], rr["health.ins"] + jitter,
                                          y_name="health.ins", yhat_name="health.ins")



In [100]:
# Manual log-likelihood check for one derived feature.
#   base_score: log-likelihood of the labels under the constant prior P(health.ins == True)
#   yhat_score: log-likelihood of the labels under the per-row probabilities in yhat
# With the recorded values, yhat_score - base_score equals the score printed for
# health.insIsTrue_on_discrete_age by biclassification_likelihood_score further down.
y = rr["health.ins"]
yhat = rr["health.insIsTrue_on_discrete_age"]
is_positive = (y == True)
prior_p = np.mean(is_positive)
prior_likelihood = np.where(is_positive, prior_p, 1 - prior_p)
yhat_likelihood = np.where(is_positive, yhat, 1 - yhat)
base_score = np.log(prior_likelihood).sum()
yhat_score = np.log(yhat_likelihood).sum()

In [101]:
base_score, yhat_score


Out[101]:
(-354.47189420192092, -324.3203395014998)

In [102]:
for f in rr.columns:
    if '_on_' in f:
        print f, performance.biclassification_likelihood_score(rr["health.ins"], 
                                                            rr[f], 
                                                            y_positive = True)


health.insIsTrue_on_sex 1.54475494326
health.insIsTrue_on_is.employed 18.9503328029
health.insIsTrue_on_marital.stat 23.7160288585
health.insIsTrue_on_housing.type 19.3856930812
health.insIsTrue_on_recent.move 1.48300076457
health.insIsTrue_on_state.of.res 20.7962012371
health.insIsTrue_on_num.vehicles_isna 0.137867366456
health.insIsTrue_on_discrete_age 30.1515547004

In [ ]: