In [1]:
## enable IPython's autoreload extension; mode 2 re-imports all modules before each
## cell is executed, so edits to the local munging package are picked up without
## restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
from munging import imputation
from munging import inspection
from munging import utility
from munging import transformation
from munging import feature
from munging import performance


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

%matplotlib inline

Load Data


In [12]:
## load data: gzip-compressed, tab-separated feature table with a header row;
## the literal "NA" and empty fields are read as missing values
custdata = pd.read_table(
    "data/orange_small_train.data.gz",
    delimiter="\t",
    header=0,
    compression="gzip",
    na_values=["NA", ""],
)

## churn labels come from a separate text file and are attached as a string column:
## a label equal to 1 becomes "churn", every other value becomes "nochurn"
churn = np.loadtxt("data/orange_small_train_churn.labels.txt")
is_churner = (churn == 1)
custdata["Churn"] = np.where(is_churner, "churn", "nochurn")

custdata.head(n = 3)


Out[12]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var222 Var223 Var224 Var225 Var226 Var227 Var228 Var229 Var230 Churn
0 NaN NaN NaN NaN NaN 1526 7 NaN NaN NaN ... fXVEsaq jySVZNlOJy NaN NaN xb3V RAYp F2FyR07IdsN7I NaN NaN nochurn
1 NaN NaN NaN NaN NaN 525 0 NaN NaN NaN ... 2Kb5FSF LM8l689qOp NaN NaN fKCe RAYp F2FyR07IdsN7I NaN NaN churn
2 NaN NaN NaN NaN NaN 5236 7 NaN NaN NaN ... NKv4yOc jySVZNlOJy NaN kG3k Qu4f 02N6s8f ib5G6X1eUxUn6 am7c NaN nochurn

3 rows × 231 columns

Exploratory Data Analysis

1. feature types


In [13]:
## feature types: split the columns into a categorical and a numerical group
categorical_feats = inspection.find_categorical_features(custdata)
numerical_feats = inspection.find_numerical_features(custdata)

## report the size of each group together with the distinct dtypes found in it
## (the single-argument print(...) form prints the same text on Python 2 and Python 3,
## whereas the former `print a, b` statement is a syntax error on Python 3)
print("%d %s" % (len(categorical_feats), np.unique(custdata.dtypes[categorical_feats])))
print("%d %s" % (len(numerical_feats), np.unique(custdata.dtypes[numerical_feats])))

## better check the int64 type features, to make sure they are numerical instead of categorical
for f in numerical_feats:
    if custdata[f].dtype == np.int64:
        ## show the distinct values so that a disguised categorical code is easy to spot
        print("%s %s" % (f, custdata[f].unique()))


39 [dtype('O')]
192 [dtype('int64') dtype('float64')]
Var73 [ 36  26 130  12  82 126 114  18  72 128  76 148  48  34 132 158 140  66
  38  16  60  96   8  98 144  20  88 168  50  94  28  14  40  68  22 118
  24  64  42  32 180 120  30  44  56 146 156 186  84  54   4 100 194 108
 104  10 102 116  58  78 106  52   6  86 174 152 226 124  90 136 170  80
  62 184 110  46 242 112  74 202 198 154 178 138 172 122 134 218 208 206
 142 204 196 176 160  70  92 200 190 182 214 234 162 164 150 192 222 188
 216 166 220 260 262 224 250 210 240 238 248 230 212 258 264 228 232 256
 236 244 246 252 254]

The feature types look correct

2. na imputation


In [4]:
## missing value pattern: judging by the recorded output, na_pattern tabulates every
## distinct combination of missing / present columns with its occurrence count,
## plus a final row of per-column missing totals
## NOTE: slow on the raw data (about 4min 44s wall time in the recorded run)
%time inspection.na_pattern(custdata)


CPU times: user 4min 37s, sys: 7.26 s, total: 4min 44s
Wall time: 4min 44s
Out[4]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var223 Var224 Var225 Var226 Var227 Var228 Var229 Var230 Churn occurrence
0 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 992
1 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 29
2 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 959
3 missing missing missing missing missing missing - missing missing missing ... - missing missing - - - missing missing - 164
4 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1683
5 missing missing missing missing missing - - missing missing missing ... - missing missing - - - - missing - 103
6 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 873
7 missing missing missing missing missing - - missing missing missing ... missing missing missing - - - missing missing - 967
8 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 26
9 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 253
10 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 324
11 missing missing missing missing missing - - missing missing missing ... - missing missing - - - - missing - 1
12 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 157
13 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 3305
14 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 423
15 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 673
16 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 268
17 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 636
18 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 722
19 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1467
20 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 202
21 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 823
22 missing missing missing - missing missing missing missing missing missing ... missing missing missing - - - missing missing - 78
23 missing missing missing missing missing - - missing missing missing ... missing missing missing - - - - missing - 1
24 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 4710
25 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 968
26 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1003
27 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 375
28 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1437
29 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 96
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1271 missing missing missing missing missing missing - missing missing missing ... missing missing missing - - - missing missing - 1
1272 missing missing missing missing missing - - missing missing missing ... missing missing - - - - missing missing - 1
1273 missing missing missing missing missing - missing missing missing missing ... - missing missing - - - missing missing - 1
1274 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1
1275 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1276 missing missing missing missing missing - missing missing missing missing ... - missing missing - - - missing missing - 1
1277 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1278 missing - - missing missing missing missing missing missing missing ... missing missing missing - - - missing missing - 1
1279 - missing missing missing missing missing missing missing - missing ... missing missing missing - - - missing missing - 1
1280 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1
1281 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 1
1282 missing missing missing missing missing - - missing missing missing ... missing missing - - - - - missing - 1
1283 missing missing missing missing missing - - missing missing missing ... - missing - - - - missing missing - 1
1284 missing missing missing missing missing - - missing missing missing ... missing missing missing - - - missing missing - 2
1285 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1286 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1287 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1288 missing missing missing missing missing - - missing missing missing ... missing missing - - - - missing missing - 1
1289 missing missing missing missing missing - - missing missing missing ... missing missing - - - - missing missing - 1
1290 missing missing missing - missing missing missing missing missing missing ... missing missing missing - - - missing missing - 1
1291 missing missing missing missing missing - - missing missing missing ... - missing missing - - - - missing - 1
1292 missing - - missing missing missing missing missing missing missing ... - - missing - - - missing missing - 1
1293 missing missing missing missing missing - - missing missing missing ... - missing - - - - - missing - 1
1294 missing missing missing missing - missing missing missing missing - ... - missing missing - - - missing missing - 1
1295 missing missing missing missing missing missing - missing missing missing ... - missing - - - - - missing - 1
1296 missing missing missing missing missing missing - missing missing missing ... - missing - - - - missing missing - 1
1297 missing missing missing missing missing - - missing missing missing ... - missing missing - - - missing missing - 1
1298 missing missing missing - missing missing missing missing missing missing ... - missing missing - - - missing missing - 1
1299 missing missing missing missing missing - missing missing missing missing ... - missing - - - - - missing - 1
1300 49298 48759 48760 48421 48513 5529 5539 50000 49298 48513 ... 5211 49180 26144 0 0 0 28432 50000 0 50000

1301 rows × 232 columns

The table contains more than 1000 distinct NA patterns, so it is almost impossible to find anything interesting just by looking at it, and we are more comfortable assuming that there is no OBVIOUS pattern of NA occurrence. So let's just impute the values.


In [14]:
## impute the missing values
feat_means = {f: custdata[f].mean() for f in numerical_feats}
feat_means = {f: v if not np.isnan(v) else 0 for f, v in feat_means.items()}
%time custdata = imputation.imput(custdata, copy=False, na_numerical=feat_means)
custdata.head(n = 3)


CPU times: user 480 ms, sys: 11.5 ms, total: 491 ms
Wall time: 491 ms
Out[14]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var90_isna Var91_isna Var92_isna Var93_isna Var94_isna Var95_isna Var96_isna Var97_isna Var98_isna Var99_isna
0 11.487179 0.004029 425.298387 0.125396 238793.32885 1526 7 0 48.145299 392605.656355 ... True True True True True True True True True True
1 11.487179 0.004029 425.298387 0.125396 238793.32885 525 0 0 48.145299 392605.656355 ... True True True True False True True True True True
2 11.487179 0.004029 425.298387 0.125396 238793.32885 5236 7 0 48.145299 392605.656355 ... True True True True True True True True True True

3 rows × 420 columns


In [15]:
## sanity check after imputation: re-run the NA pattern table on the imputed frame;
## the recorded output shows a single pattern with no missing column for all 50000 rows
%time inspection.na_pattern(custdata)


CPU times: user 2.81 s, sys: 139 ms, total: 2.95 s
Wall time: 2.96 s
Out[15]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var91_isna Var92_isna Var93_isna Var94_isna Var95_isna Var96_isna Var97_isna Var98_isna Var99_isna occurrence
0 - - - - - - - - - - ... - - - - - - - - - 50000
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 50000

2 rows × 421 columns

3. transformation of skewed value


In [56]:
## draw a histogram (plot_type = "hist") for every numerical feature, to eyeball
## skewed and constant distributions before deciding on a transformation
%time inspection.plot_features_density(custdata, feat_names=numerical_feats, plot_type = "hist")


CPU times: user 11.3 s, sys: 127 ms, total: 11.4 s
Wall time: 11.4 s

There are zero-variance variables and highly skewed variables (without outliers). But we leave them for now, since we are going to use the conditional probability as new extracted features.

variable discretization and extraction


In [8]:
%time df = transformation.discretize_numerical(custdata, numerical_feats, )


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-5b42cc91e63a> in <module>()
----> 1 get_ipython().magic(u'time df = transformation.discretize_numerical(custdata, numerical_feats, )')

/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2203         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2204         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2205         return self.run_line_magic(magic_name, magic_arg_s)
   2206 
   2207     #-------------------------------------------------------------------------

/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2124                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2125             with self.builtin_trap:
-> 2126                 result = fn(*args,**kwargs)
   2127             return result
   2128 

/usr/local/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/usr/local/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/usr/local/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1127         else:
   1128             st = clock2()
-> 1129             exec(code, glob, local_ns)
   1130             end = clock2()
   1131             out = None

<timed exec> in <module>()

/home/dola/workspace/dola/practical_munging_tools/munging/transformation.pyc in discretize_numerical(df, feat_names, copy, max_qcut, feat_bins, prefix)
     79                         ## there is a bug in current numpy percentile - use pandas quantile instead
     80                         bins = np.unique(df[f].quantile(np.linspace(0., 1., max_qcut)))
---> 81                 result[prefix+f] = np.asarray(pd.cut(result[f], bins, include_lowest=True))
     82         return result

/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in cut(x, bins, right, labels, retbins, precision, include_lowest)
    105     return _bins_to_cuts(x, bins, right=right, labels=labels,
    106                          retbins=retbins, precision=precision,
--> 107                          include_lowest=include_lowest)
    108 
    109 

/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in _bins_to_cuts(x, bins, right, labels, retbins, precision, name, include_lowest)
    173                 try:
    174                     levels = _format_levels(bins, precision, right=right,
--> 175                                             include_lowest=include_lowest)
    176                 except ValueError:
    177                     increases += 1

/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in _format_levels(bins, prec, right, include_lowest)
    219 
    220         if include_lowest:
--> 221             levels[0] = '[' + levels[0][1:]
    222     else:
    223         levels = ['[%s, %s)' % (fmt(a), fmt(b))

IndexError: list index out of range

In [16]:
cprob_feater = feature.BiClassProbabilityFeatureExtractor()
%time df = cprob_feater.fit_transform(custdata, categorical_feats[:-1], by = "Churn")


CPU times: user 2.09 s, sys: 419 ms, total: 2.51 s
Wall time: 2.51 s

In [21]:
## print one likelihood score per categorical feature (label excluded by name, not by position)
## yhat is 1 minus the "ChurnIsnochurn_on_<feature>" column of df -- presumably the
## estimated churn probability; confirm against feature.BiClassProbabilityFeatureExtractor
for f in [name for name in categorical_feats if name != "Churn"]:
    score = performance.biclassification_likelihood_score(
        y=df.Churn, yhat=1-df["ChurnIsnochurn_on_"+f],
        y_positive ="churn", y_name="Churn",
        yhat_name=f, )
    ## single-argument print(...) gives the same "<feature> <score>" line on Python 2 and 3
    print("%s %s" % (f, score))


Var191 5.47552387758
Var192 397.968395057
Var193 138.053366017
Var194 10.2790058536
Var195 28.0608245591
Var196 2.43242586534
Var197 149.84187659
Var198 1578.90991846
Var199 2361.45139427
Var200 4109.96606037
Var201 10.4493228817
Var202 2616.82448231
Var203 5.40224448081
Var204 112.501306627
Var205 110.456443547
Var206 177.122347418
Var207 86.0231581
Var208 2.6955776509
Var210 54.6287477251
Var211 24.6601831132
Var212 196.381236914
Var213 7.76366404489
Var214 4109.96606037
Var215 0.396735228082
Var216 988.972226567
Var217 4552.91478616
Var218 172.797282905
Var219 18.1932619992
Var220 1578.90991846
Var221 59.7289561804
Var222 1578.90991846
Var223 2.43780559721
Var224 1.99192653855
Var225 79.9578628758
Var226 57.5026926211
Var227 77.083495083
Var228 150.525268966
Var229 93.3105502656

In [ ]: