notebook.community



In [70]:

    
%load_ext autoreload
%autoreload 2

from munging import imputation
from munging import inspection
from munging import utility
from munging import transformation
from munging import feature
from munging import performance


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

%matplotlib inline









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [115]:

    
# Load the raw customer table from a tab-separated file (path is relative to the notebook's working directory).
custdata = pd.read_table("data/custdata.tsv")



In [116]:

    
# Peek at the first rows to check the column names and spot obvious gaps (is.employed is NaN in rows 0 and 1).
custdata.head()









    Out[116]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
    
  
  
    
      0
       2068
       F
        NaN
       11300
             Married
        True
       Homeowner free and clear
       False
       2
       49
         Michigan
    
    
      1
       2073
       F
        NaN
           0
             Married
        True
                         Rented
        True
       3
       40
          Florida
    
    
      2
       2848
       M
       True
        4500
       Never Married
       False
                         Rented
        True
       3
       22
          Georgia
    
    
      3
       5641
       M
       True
       20000
       Never Married
       False
          Occupied with no rent
       False
       0
       22
       New Mexico
    
    
      4
       6369
       F
       True
       12000
       Never Married
        True
                         Rented
        True
       1
       31
          Florida

Missing Values



In [106]:

    
# Tabulate which columns are missing together: each row of the result is one missing-value pattern
# with its row count in `occurrence`; the final row gives per-column missing totals.
inspection.na_pattern(custdata)









    Out[106]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
      occurrence
    
  
  
    
      0
       -
       -
       missing
       -
       -
       -
             -
             -
             -
       -
       -
        280
    
    
      1
       -
       -
             -
       -
       -
       -
             -
             -
             -
       -
       -
        664
    
    
      2
       -
       -
             -
       -
       -
       -
       missing
       missing
       missing
       -
       -
          8
    
    
      3
       -
       -
       missing
       -
       -
       -
       missing
       missing
       missing
       -
       -
         48
    
    
      4
       0
       0
           328
       0
       0
       0
            56
            56
            56
       0
       0
       1000



In [107]:

    
# Names of the columns the helper classifies as categorical; reused by the imputation
# and inspection cells that follow.
categorical_features = inspection.find_categorical_features(custdata)
# Parenthesised form prints the same text on Python 2 and is valid syntax on Python 3.
print(categorical_features)









    



['sex' 'is.employed' 'marital.stat' 'health.ins' 'housing.type'
 'recent.move' 'state.of.res']



In [108]:

    
# Impute only the categorical columns. The unique values printed further down show the gaps
# filled with the literal label 'missing'.
# NOTE(review): custdata itself appears to be left untouched (later cells still find NaNs in it) —
# confirm the helper returns a copy.
df = imputation.imput_categorical_features(custdata, categorical_features)



In [109]:

    
# Re-check the missing pattern: after categorical imputation only num.vehicles still has gaps (56 rows).
inspection.na_pattern(df)









    Out[109]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
      occurrence
    
  
  
    
      0
       -
       -
       -
       -
       -
       -
       -
       -
             -
       -
       -
        944
    
    
      1
       -
       -
       -
       -
       -
       -
       -
       -
       missing
       -
       -
         56
    
    
      2
       0
       0
       0
       0
       0
       0
       0
       0
            56
       0
       0
       1000



In [110]:

    
# List the distinct levels of every categorical column after imputation, to confirm that
# gaps now appear as an explicit 'missing' level rather than NaN.
for f in categorical_features:
    # Parenthesised form prints the same text on Python 2 and is valid syntax on Python 3.
    print(pd.unique(df[f]))









    



['F' 'M']
['missing' True False]
['Married' 'Never Married' 'Divorced/Separated' 'Widowed']
[True False]
['Homeowner free and clear' 'Rented' 'Occupied with no rent'
 'Homeowner with mortgage/loan' 'missing']
[False True 'missing']
['Michigan' 'Florida' 'Georgia' 'New Mexico' 'New York' 'Idaho' 'Illinois'
 'North Carolina' 'Indiana' 'New Hampshire' 'South Carolina' 'Pennsylvania'
 'Virginia' 'New Jersey' 'Ohio' 'Missouri' 'California' 'Wisconsin'
 'South Dakota' 'Colorado' 'Oklahoma' 'Kentucky' 'Massachusetts' 'Texas'
 'Maryland' 'Connecticut' 'Mississippi' 'Arkansas' 'Washington'
 'Rhode Island' 'Minnesota' 'Alaska' 'Tennessee' 'Iowa' 'Kansas' 'Oregon'
 'Alabama' 'Arizona' 'West Virginia' 'Louisiana' 'Vermont' 'Maine' 'Hawaii'
 'Utah' 'Wyoming' 'Nebraska' 'Nevada' 'Delaware' 'Montana' 'North Dakota']



In [111]:

    
# Names of the columns the helper classifies as numerical.
numerical_features = inspection.find_numerical_features(custdata)
# Parenthesised form prints the same text on Python 2 and is valid syntax on Python 3.
print(numerical_features)
# Impute only the numerical columns. This starts again from the raw custdata, so the categorical
# gaps filled in the earlier cell are present again in this df (see the pattern table that follows).
df = imputation.imput_numerical_features(custdata, numerical_features)









    



['custid' 'income' 'num.vehicles' 'age']



In [112]:

    
# Re-check the missing pattern: num.vehicles is now complete and a num.vehicles_isna indicator
# column has been added, while the categorical columns still show their original gaps.
inspection.na_pattern(df)









    Out[112]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
      num.vehicles_isna
      occurrence
    
  
  
    
      0
       -
       -
       missing
       -
       -
       -
             -
             -
       -
       -
       -
       -
        280
    
    
      1
       -
       -
             -
       -
       -
       -
             -
             -
       -
       -
       -
       -
        664
    
    
      2
       -
       -
             -
       -
       -
       -
       missing
       missing
       -
       -
       -
       -
          8
    
    
      3
       -
       -
       missing
       -
       -
       -
       missing
       missing
       -
       -
       -
       -
         48
    
    
      4
       0
       0
           328
       0
       0
       0
            56
            56
       0
       0
       0
       0
       1000



In [112]:



In [113]:

    
# One-call imputation of the raw frame; the pattern table below reports zero missing values
# in every column across all 1000 rows.
df = imputation.imput(custdata)
inspection.na_pattern(df)









    Out[113]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
      num.vehicles_isna
      occurrence
    
  
  
    
      0
       -
       -
       -
       -
       -
       -
       -
       -
       -
       -
       -
       -
       1000
    
    
      1
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
       1000



In [114]:

    
# Inspect the imputed rows: is.employed gaps now read 'missing' and a num.vehicles_isna flag column is present.
df.head()









    Out[114]:






  
    
      
      custid
      sex
      is.employed
      income
      marital.stat
      health.ins
      housing.type
      recent.move
      num.vehicles
      age
      state.of.res
      num.vehicles_isna
    
  
  
    
      0
       2068
       F
       missing
       11300
             Married
        True
       Homeowner free and clear
       False
       2
       49
         Michigan
       False
    
    
      1
       2073
       F
       missing
           0
             Married
        True
                         Rented
        True
       3
       40
          Florida
       False
    
    
      2
       2848
       M
          True
        4500
       Never Married
       False
                         Rented
        True
       3
       22
          Georgia
       False
    
    
      3
       5641
       M
          True
       20000
       Never Married
       False
          Occupied with no rent
       False
       0
       22
       New Mexico
       False
    
    
      4
       6369
       F
          True
       12000
       Never Married
        True
                         Rented
        True
       1
       31
          Florida
       False



In [13]:

    
# Count the rows flagged as originally missing num.vehicles; this matches the 56 reported by na_pattern.
df["num.vehicles_isna"].sum()









    Out[13]:





56

Transformations



In [14]:

    
# Start this section from a freshly imputed frame so it does not depend on whatever `df`
# was left behind by earlier cells.
df = imputation.imput(custdata)



In [15]:

    
# Plot the distribution of the features in df with the helper's default plot type
# (presumably a density curve per feature, given the helper's name — TODO confirm).
inspection.plot_features_density(df)



In [16]:

    
# Same distributions, drawn with plot_type="hist" instead of the default.
inspection.plot_features_density(df, plot_type="hist", )



In [17]:

    
# Identify skewed features; on this data only 'income' is returned.
# NOTE(review): the meaning of the second argument (20) is not visible here — presumably a
# skewness threshold; confirm against inspection.find_features_skewed.
skewed_feats = inspection.find_features_skewed(df, 20)
skewed_feats









    Out[17]:





array(['income'], 
      dtype='|S6')



In [18]:

    
## log - only defined for positive values, so rows with income <= 0 are filtered out first
log_df = transformation.log_transform(df[df.income > 0], skewed_feats)
inspection.plot_features_density(log_df)



In [19]:

    
## arcsinh - assume negative income is valid (e.g., debt); applied to every row of df,
## with no income > 0 filter, unlike the log transform
arc_df = transformation.arcsinh_transform(df, skewed_feats)
inspection.plot_features_density(arc_df)



In [20]:

    
## signed_log - assume negative income is valid (e.g., debt); applied to every row of df,
## with no income > 0 filter, unlike the log transform
slog_df = transformation.signed_log_transform(df, skewed_feats)
inspection.plot_features_density(slog_df)



In [21]:

    
# Histogram view of the signed-log transformed frame.
inspection.plot_features_density(slog_df, plot_type="hist")

Pairwise plotting to find useful single variables and correlations between variables



In [22]:

    
# Rebuild the analysis frame in named stages: impute, keep strictly positive incomes
# (the log is undefined at zero or below), then log-transform income. The feature
# listing in the next cell shows both `income` and a new `log_income` column.
imputed = imputation.imput(custdata)
has_positive_income = imputed["income"] > 0
df = transformation.log_transform(imputed[has_positive_income], ["income"])



In [23]:

    
# Show how the helpers classify the columns of the prepared frame (log_income is numerical,
# num.vehicles_isna is categorical). Parenthesised prints give the same text on Python 2
# and are valid syntax on Python 3.
print(inspection.find_numerical_features(df))
print(inspection.find_categorical_features(df))









    



['custid' 'income' 'num.vehicles' 'age' 'log_income']
['sex' 'is.employed' 'marital.stat' 'health.ins' 'housing.type'
 'recent.move' 'state.of.res' 'num.vehicles_isna']



In [24]:

    
# Numerical vs numerical: age against log-scaled income.
inspection.plot_feature_pair(df, "age", "log_income")



In [25]:

    
# Categorical vs numerical: marital status against raw (untransformed) income.
inspection.plot_feature_pair(df, "marital.stat", "income")



In [26]:

    
# Numerical vs categorical: age against marital status (numerical feature passed first this time).
inspection.plot_feature_pair(df, "age", "marital.stat")



In [27]:

    
# Categorical vs categorical: marital status against sex.
inspection.plot_feature_pair(df, "marital.stat", "sex")



In [28]:

    
# Same categorical pair with the argument order swapped, to compare the two orientations.
inspection.plot_feature_pair(df, "sex", "marital.stat")



In [29]:

    
# Numerical vs categorical: vehicle count against the health.ins target, with the legend turned on.
inspection.plot_feature_pair(df, "num.vehicles", "health.ins", legend = True)

Discretization of continuous variables



In [30]:

    
# Ask the helper whether each column counts as discrete; both report False on this data.
# Parenthesised prints give the same text on Python 2 and are valid syntax on Python 3.
print(utility.is_discrete(df, "num.vehicles"))
print(utility.is_discrete(df, "custid"))









    



False
False



In [31]:

    
## default qcut
# No feat_bins entry is given for num.vehicles, so the helper chooses the bins itself.
# NOTE(review): max_qcut presumably caps the number of quantile bins — confirm in
# transformation.discretize_numerical.
r = transformation.discretize_numerical(df, ["num.vehicles"], max_qcut = 100)
# Distinct interval labels of the new discrete_num.vehicles column. The parenthesised
# print gives the same text on Python 2 and is valid syntax on Python 3.
print(np.unique(r["discrete_num.vehicles"]))
r.loc[:, ["num.vehicles", "discrete_num.vehicles"]].head()









    



['(1.717, 2]' '(2, 3]' '(3.364, 4]' '(4, 5]' '(5, 6]' '[0, 1]']






    Out[31]:






  
    
      
      num.vehicles
      discrete_num.vehicles
    
  
  
    
      0
       2
       (1.717, 2]
    
    
      2
       3
           (2, 3]
    
    
      3
       0
           [0, 1]
    
    
      4
       1
           [0, 1]
    
    
      5
       1
           [0, 1]



In [32]:

    
## equal bin size cut
# An integer in feat_bins requests that many equal-width bins for the column. Only nine
# labels appear in the output below, since no row falls in the (492006, 553503] bin.
r = transformation.discretize_numerical(df, ["income"], feat_bins = {"income": 10})
# Distinct interval labels of the new discrete_income column. The parenthesised
# print gives the same text on Python 2 and is valid syntax on Python 3.
print(np.unique(r["discrete_income"]))
r.loc[:, ["income", "discrete_income"]].head()









    



['(123024, 184521]' '(184521, 246018]' '(246018, 307515]'
 '(307515, 369012]' '(369012, 430509]' '(430509, 492006]'
 '(553503, 615000]' '(61527, 123024]' '[-584.97, 61527]']






    Out[32]:






  
    
      
      income
      discrete_income
    
  
  
    
      0
        11300
       [-584.97, 61527]
    
    
      2
         4500
       [-584.97, 61527]
    
    
      3
        20000
       [-584.97, 61527]
    
    
      4
        12000
       [-584.97, 61527]
    
    
      5
       180000
       (123024, 184521]



In [33]:

    
## customized bins cut
# A list in feat_bins is used as explicit bin edges for the column.
# `r` is rebuilt from df here, so only discrete_age (not the earlier discrete_* columns)
# is carried into the cells that use `r` afterwards.
r = transformation.discretize_numerical(df, ["age"], feat_bins = {"age": [-100, 0, 25, 30, 50, 60, 100, 150]})
# Distinct interval labels of the new discrete_age column. The parenthesised
# print gives the same text on Python 2 and is valid syntax on Python 3.
print(np.unique(r["discrete_age"]))
r.loc[:, ["age", "discrete_age"]].head()









    



['(0, 25]' '(100, 150]' '(25, 30]' '(30, 50]' '(50, 60]' '(60, 100]'
 '[-100, 0]']






    Out[33]:






  
    
      
      age
      discrete_age
    
  
  
    
      0
       49
       (30, 50]
    
    
      2
       22
        (0, 25]
    
    
      3
       22
        (0, 25]
    
    
      4
       31
       (30, 50]
    
    
      5
       40
       (30, 50]



In [34]:

    
# Categorical vs categorical: the binned age column against the health.ins target.
inspection.plot_feature_pair(r, "discrete_age", "health.ins")

Feature Engineering



In [97]:

    
# Use every categorical column except the target as a source feature. The extractor adds one
# "health.insIsTrue_on_<feature>" column per source (presumably the share of rows with
# health.ins == True within each category — confirm in feature.BiClassProbabilityFeatureExtractor).
# NOTE(review): the extractor is fitted and applied on the same rows, so scores computed
# from these columns later are in-sample.
target = "health.ins"
categorical_feats = [
    name for name in inspection.find_categorical_features(r) if name != target
]
rr = feature.BiClassProbabilityFeatureExtractor().fit_transform(r, categorical_feats, target)
# Transpose two rows so the many columns read top to bottom.
rr.head(n=2).T









    Out[97]:






  
    
      
      0
      2
    
  
  
    
      custid
                           2068
                2848
    
    
      sex
                              F
                   M
    
    
      is.employed
                        missing
                True
    
    
      income
                          11300
                4500
    
    
      marital.stat
                        Married
       Never Married
    
    
      health.ins
                           True
               False
    
    
      housing.type
       Homeowner free and clear
              Rented
    
    
      recent.move
                          False
                True
    
    
      num.vehicles
                              2
                   3
    
    
      age
                             49
                  22
    
    
      state.of.res
                       Michigan
             Georgia
    
    
      num.vehicles_isna
                          False
               False
    
    
      log_income
                       9.332558
            8.411833
    
    
      discrete_age
                       (30, 50]
             (0, 25]
    
    
      health.insIsTrue_on_sex
                      0.8931298
           0.8541667
    
    
      health.insIsTrue_on_is.employed
                      0.9496124
           0.8597663
    
    
      health.insIsTrue_on_marital.stat
                      0.9094737
           0.7365854
    
    
      health.insIsTrue_on_housing.type
                      0.9411765
           0.7854985
    
    
      health.insIsTrue_on_recent.move
                      0.8791774
           0.8214286
    
    
      health.insIsTrue_on_state.of.res
                          0.875
           0.8571429
    
    
      health.insIsTrue_on_num.vehicles_isna
                      0.8719101
           0.8719101
    
    
      health.insIsTrue_on_discrete_age
                      0.8380682
           0.7111111

Performance Calibration



In [98]:

    
# One density panel per engineered "<target>_on_<feature>" column, two panels per row.
# The grid is sized from the number of such columns (4 x 2 for the eight present here),
# so adding a categorical feature cannot index past the end of the axes array.
prob_feats = [c for c in rr.columns if '_on_' in c]
n_cols = 2
n_rows = max(1, (len(prob_feats) + n_cols - 1) // n_cols)  # ceiling division, at least one row
fig, axes = plt.subplots(nrows = n_rows, ncols = n_cols, figsize = (n_cols * 4, n_rows * 4))
axes = np.atleast_1d(axes).ravel()
for f, ax in zip(prob_feats, axes):
    performance.biclassification_density_plot(rr['health.ins'], rr[f], y_name="health.ins", yhat_name=f, ax = ax)



In [99]:

    
# Reference plot: the target scored against itself plus a little Gaussian noise
# (presumably to show what a near-perfect predictor looks like in this plot).
# Uses the `performance` module, which is what the notebook imports and what the
# per-feature plots call; no `calibration` name is imported anywhere in the notebook.
# A locally seeded generator keeps the jitter, and therefore the figure, reproducible
# without touching numpy's global random state.
jitter = np.random.RandomState(42).randn(rr.shape[0]) * 0.01
performance.biclassification_density_plot(rr['health.ins'], rr["health.ins"] + jitter,
                                          y_name="health.ins", yhat_name="health.ins")



In [100]:

    
# Hand-computed log-likelihoods for one engineered feature versus a constant baseline.
y = rr["health.ins"]
yhat = rr["health.insIsTrue_on_discrete_age"]
is_positive = (y == True)
# Baseline predictor: the overall share of positive rows, used for every row.
prior_p = is_positive.mean()
# Sum over rows of log P(observed label): p for positive rows, 1 - p for the rest.
base_score = np.log(np.where(is_positive, prior_p, 1 - prior_p)).sum()
yhat_score = np.log(np.where(is_positive, yhat, 1 - yhat)).sum()



In [101]:

    
# Summed log-likelihoods: constant-prior baseline vs. the discrete_age probability feature
# (values closer to zero mean a better fit).
base_score, yhat_score









    Out[101]:





(-354.47189420192092, -324.3203395014998)



In [102]:

    
# Score every engineered probability feature against the target. Judging by the
# discrete_age value (about 30.15 = -324.32 - (-354.47)), the score appears to be the
# log-likelihood gain over the constant-prior baseline computed by hand in the earlier
# cell, so larger is better.
for f in rr.columns:
    if '_on_' in f:
        score = performance.biclassification_likelihood_score(rr["health.ins"],
                                                              rr[f],
                                                              y_positive = True)
        # A single formatted string prints "name score" identically on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (f, score))









    



health.insIsTrue_on_sex 1.54475494326
health.insIsTrue_on_is.employed 18.9503328029
health.insIsTrue_on_marital.stat 23.7160288585
health.insIsTrue_on_housing.type 19.3856930812
health.insIsTrue_on_recent.move 1.48300076457
health.insIsTrue_on_state.of.res 20.7962012371
health.insIsTrue_on_num.vehicles_isna 0.137867366456
health.insIsTrue_on_discrete_age 30.1515547004



In [ ]:

	custid	sex	is.employed	income	marital.stat	health.ins	housing.type	recent.move	num.vehicles	age	state.of.res
0	2068	F	NaN	11300	Married	True	Homeowner free and clear	False	2	49	Michigan
1	2073	F	NaN	0	Married	True	Rented	True	3	40	Florida
2	2848	M	True	4500	Never Married	False	Rented	True	3	22	Georgia
3	5641	M	True	20000	Never Married	False	Occupied with no rent	False	0	22	New Mexico
4	6369	F	True	12000	Never Married	True	Rented	True	1	31	Florida

	custid	sex	is.employed	income	marital.stat	health.ins	housing.type	recent.move	num.vehicles	age	state.of.res	occurrence
0	-	-	missing	-	-	-	-	-	-	-	-	280
1	-	-	-	-	-	-	-	-	-	-	-	664
2	-	-	-	-	-	-	missing	missing	missing	-	-	8
3	-	-	missing	-	-	-	missing	missing	missing	-	-	48
4	0	0	328	0	0	0	56	56	56	0	0	1000

	income	discrete_income
0	11300	[-584.97, 61527]
2	4500	[-584.97, 61527]
3	20000	[-584.97, 61527]
4	12000	[-584.97, 61527]
5	180000	(123024, 184521]

	0	2
custid	2068	2848
sex	F	M
is.employed	missing	True
income	11300	4500
marital.stat	Married	Never Married
health.ins	True	False
housing.type	Homeowner free and clear	Rented
recent.move	False	True
num.vehicles	2	3
age	49	22
state.of.res	Michigan	Georgia
num.vehicles_isna	False	False
log_income	9.332558	8.411833
discrete_age	(30, 50]	(0, 25]
health.insIsTrue_on_sex	0.8931298	0.8541667
health.insIsTrue_on_is.employed	0.9496124	0.8597663
health.insIsTrue_on_marital.stat	0.9094737	0.7365854
health.insIsTrue_on_housing.type	0.9411765	0.7854985
health.insIsTrue_on_recent.move	0.8791774	0.8214286
health.insIsTrue_on_state.of.res	0.875	0.8571429
health.insIsTrue_on_num.vehicles_isna	0.8719101	0.8719101
health.insIsTrue_on_discrete_age	0.8380682	0.7111111