In [1]:
## enable IPython's autoreload extension in mode 2, so that edits to imported
## modules (e.g. the local munging package) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
In [2]:
from munging import imputation
from munging import inspection
from munging import utility
from munging import transformation
from munging import feature
from munging import performance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
%matplotlib inline
In [12]:
## load data
## the features come from a gzip-compressed, tab-separated file whose first row is
## the header; "NA" and empty strings are both read as missing values
custdata = pd.read_table(
    "data/orange_small_train.data.gz",
    sep="\t",
    header=0,
    na_values=["NA", ""],
    compression="gzip",
)
## the labels live in a separate text file; a label of 1 becomes "churn",
## anything else becomes "nochurn"
churn = np.loadtxt("data/orange_small_train_churn.labels.txt")
custdata["Churn"] = np.where(churn == 1, "churn", "nochurn")
custdata.head(n=3)
Out[12]:
Var1
Var2
Var3
Var4
Var5
Var6
Var7
Var8
Var9
Var10
...
Var222
Var223
Var224
Var225
Var226
Var227
Var228
Var229
Var230
Churn
0
NaN
NaN
NaN
NaN
NaN
1526
7
NaN
NaN
NaN
...
fXVEsaq
jySVZNlOJy
NaN
NaN
xb3V
RAYp
F2FyR07IdsN7I
NaN
NaN
nochurn
1
NaN
NaN
NaN
NaN
NaN
525
0
NaN
NaN
NaN
...
2Kb5FSF
LM8l689qOp
NaN
NaN
fKCe
RAYp
F2FyR07IdsN7I
NaN
NaN
churn
2
NaN
NaN
NaN
NaN
NaN
5236
7
NaN
NaN
NaN
...
NKv4yOc
jySVZNlOJy
NaN
kG3k
Qu4f
02N6s8f
ib5G6X1eUxUn6
am7c
NaN
nochurn
3 rows × 231 columns
In [13]:
## feature types
categorical_feats = inspection.find_categorical_features(custdata)
numerical_feats = inspection.find_numerical_features(custdata)
print len(categorical_feats), np.unique(custdata.dtypes[categorical_feats])
print len(numerical_feats), np.unique(custdata.dtypes[numerical_feats])
## better check the int64 type features, to make sure they are numerical instead of categorical
for f in numerical_feats:
if custdata[f].dtype == np.int64:
print f, custdata[f].unique()
39 [dtype('O')]
192 [dtype('int64') dtype('float64')]
Var73 [ 36 26 130 12 82 126 114 18 72 128 76 148 48 34 132 158 140 66
38 16 60 96 8 98 144 20 88 168 50 94 28 14 40 68 22 118
24 64 42 32 180 120 30 44 56 146 156 186 84 54 4 100 194 108
104 10 102 116 58 78 106 52 6 86 174 152 226 124 90 136 170 80
62 184 110 46 242 112 74 202 198 154 178 138 172 122 134 218 208 206
142 204 196 176 160 70 92 200 190 182 214 234 162 164 150 192 222 188
216 166 220 260 262 224 250 210 240 238 248 230 212 258 264 228 232 256
236 244 246 252 254]
The feature types look correct
In [4]:
## missing value pattern of the raw data (imputation happens in a later cell)
## judging by the displayed result, each row is one distinct missing/present
## combination across the columns, together with its occurrence count
## NOTE: slow on the raw frame -- the recorded run took close to 5 minutes
%time inspection.na_pattern(custdata)
CPU times: user 4min 37s, sys: 7.26 s, total: 4min 44s
Wall time: 4min 44s
Out[4]:
Var1
Var2
Var3
Var4
Var5
Var6
Var7
Var8
Var9
Var10
...
Var223
Var224
Var225
Var226
Var227
Var228
Var229
Var230
Churn
occurrence
0
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
992
1
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
29
2
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
959
3
missing
missing
missing
missing
missing
missing
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
164
4
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1683
5
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
-
missing
-
103
6
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
873
7
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
967
8
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
26
9
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
253
10
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
324
11
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
-
missing
-
1
12
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
157
13
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
3305
14
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
423
15
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
673
16
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
268
17
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
636
18
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
722
19
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1467
20
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
202
21
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
823
22
missing
missing
missing
-
missing
missing
missing
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
78
23
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
missing
-
-
-
-
missing
-
1
24
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
4710
25
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
968
26
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1003
27
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
375
28
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1437
29
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
96
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1271
missing
missing
missing
missing
missing
missing
-
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
1
1272
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
-
-
-
-
missing
missing
-
1
1273
missing
missing
missing
missing
missing
-
missing
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1274
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1
1275
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1276
missing
missing
missing
missing
missing
-
missing
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1277
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1278
missing
-
-
missing
missing
missing
missing
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
1
1279
-
missing
missing
missing
missing
missing
missing
missing
-
missing
...
missing
missing
missing
-
-
-
missing
missing
-
1
1280
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1
1281
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
1
1282
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
-
-
-
-
-
missing
-
1
1283
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
1
1284
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
2
1285
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1286
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1287
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1288
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
-
-
-
-
missing
missing
-
1
1289
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
missing
missing
-
-
-
-
missing
missing
-
1
1290
missing
missing
missing
-
missing
missing
missing
missing
missing
missing
...
missing
missing
missing
-
-
-
missing
missing
-
1
1291
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
-
missing
-
1
1292
missing
-
-
missing
missing
missing
missing
missing
missing
missing
...
-
-
missing
-
-
-
missing
missing
-
1
1293
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1
1294
missing
missing
missing
missing
-
missing
missing
missing
missing
-
...
-
missing
missing
-
-
-
missing
missing
-
1
1295
missing
missing
missing
missing
missing
missing
-
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1
1296
missing
missing
missing
missing
missing
missing
-
missing
missing
missing
...
-
missing
-
-
-
-
missing
missing
-
1
1297
missing
missing
missing
missing
missing
-
-
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1298
missing
missing
missing
-
missing
missing
missing
missing
missing
missing
...
-
missing
missing
-
-
-
missing
missing
-
1
1299
missing
missing
missing
missing
missing
-
missing
missing
missing
missing
...
-
missing
-
-
-
-
-
missing
-
1
1300
49298
48759
48760
48421
48513
5529
5539
50000
49298
48513
...
5211
49180
26144
0
0
0
28432
50000
0
50000
1301 rows × 232 columns
It consists of more than 1000 NA patterns, which makes it almost impossible to find anything interesting just by looking at the table, so we are comfortable assuming that there is no OBVIOUS pattern in the occurrence of NAs. So let's just impute the values.
In [14]:
## impute the missing values
feat_means = {f: custdata[f].mean() for f in numerical_feats}
feat_means = {f: v if not np.isnan(v) else 0 for f, v in feat_means.items()}
%time custdata = imputation.imput(custdata, copy=False, na_numerical=feat_means)
custdata.head(n = 3)
CPU times: user 480 ms, sys: 11.5 ms, total: 491 ms
Wall time: 491 ms
Out[14]:
Var1
Var2
Var3
Var4
Var5
Var6
Var7
Var8
Var9
Var10
...
Var90_isna
Var91_isna
Var92_isna
Var93_isna
Var94_isna
Var95_isna
Var96_isna
Var97_isna
Var98_isna
Var99_isna
0
11.487179
0.004029
425.298387
0.125396
238793.32885
1526
7
0
48.145299
392605.656355
...
True
True
True
True
True
True
True
True
True
True
1
11.487179
0.004029
425.298387
0.125396
238793.32885
525
0
0
48.145299
392605.656355
...
True
True
True
True
False
True
True
True
True
True
2
11.487179
0.004029
425.298387
0.125396
238793.32885
5236
7
0
48.145299
392605.656355
...
True
True
True
True
True
True
True
True
True
True
3 rows × 420 columns
In [15]:
## re-check the missing value pattern on the imputed frame
%time inspection.na_pattern(custdata)
CPU times: user 2.81 s, sys: 139 ms, total: 2.95 s
Wall time: 2.96 s
Out[15]:
Var1
Var2
Var3
Var4
Var5
Var6
Var7
Var8
Var9
Var10
...
Var91_isna
Var92_isna
Var93_isna
Var94_isna
Var95_isna
Var96_isna
Var97_isna
Var98_isna
Var99_isna
occurrence
0
-
-
-
-
-
-
-
-
-
-
...
-
-
-
-
-
-
-
-
-
50000
1
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
50000
2 rows × 421 columns
In [56]:
## plot the distribution of every numerical feature, using the "hist" plot type
%time inspection.plot_features_density(custdata, feat_names=numerical_feats, plot_type = "hist")
CPU times: user 11.3 s, sys: 127 ms, total: 11.4 s
Wall time: 11.4 s
There are zero-variance variables and highly skewed variables (without outliers). But we leave them for now, since we are going to use the conditional probability as new extracted features.
In [8]:
%time df = transformation.discretize_numerical(custdata, numerical_feats, )
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-8-5b42cc91e63a> in <module>()
----> 1 get_ipython().magic(u'time df = transformation.discretize_numerical(custdata, numerical_feats, )')
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2203 magic_name, _, magic_arg_s = arg_s.partition(' ')
2204 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2205 return self.run_line_magic(magic_name, magic_arg_s)
2206
2207 #-------------------------------------------------------------------------
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2124 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2125 with self.builtin_trap:
-> 2126 result = fn(*args,**kwargs)
2127 return result
2128
/usr/local/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
/usr/local/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
/usr/local/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1127 else:
1128 st = clock2()
-> 1129 exec(code, glob, local_ns)
1130 end = clock2()
1131 out = None
<timed exec> in <module>()
/home/dola/workspace/dola/practical_munging_tools/munging/transformation.pyc in discretize_numerical(df, feat_names, copy, max_qcut, feat_bins, prefix)
79 ## there is a bug in current numpy percentile - use pandas quantile instead
80 bins = np.unique(df[f].quantile(np.linspace(0., 1., max_qcut)))
---> 81 result[prefix+f] = np.asarray(pd.cut(result[f], bins, include_lowest=True))
82 return result
/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in cut(x, bins, right, labels, retbins, precision, include_lowest)
105 return _bins_to_cuts(x, bins, right=right, labels=labels,
106 retbins=retbins, precision=precision,
--> 107 include_lowest=include_lowest)
108
109
/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in _bins_to_cuts(x, bins, right, labels, retbins, precision, name, include_lowest)
173 try:
174 levels = _format_levels(bins, precision, right=right,
--> 175 include_lowest=include_lowest)
176 except ValueError:
177 increases += 1
/usr/local/lib/python2.7/dist-packages/pandas/tools/tile.pyc in _format_levels(bins, prec, right, include_lowest)
219
220 if include_lowest:
--> 221 levels[0] = '[' + levels[0][1:]
222 else:
223 levels = ['[%s, %s)' % (fmt(a), fmt(b))
IndexError: list index out of range
In [16]:
cprob_feater = feature.BiClassProbabilityFeatureExtractor()
%time df = cprob_feater.fit_transform(custdata, categorical_feats[:-1], by = "Churn")
CPU times: user 2.09 s, sys: 419 ms, total: 2.51 s
Wall time: 2.51 s
In [21]:
for f in categorical_feats[:-1]:
print f, performance.biclassification_likelihood_score(y=df.Churn, yhat=1-df["ChurnIsnochurn_on_"+f],
y_positive ="churn", y_name="Churn",
yhat_name=f, )
Var191 5.47552387758
Var192 397.968395057
Var193 138.053366017
Var194 10.2790058536
Var195 28.0608245591
Var196 2.43242586534
Var197 149.84187659
Var198 1578.90991846
Var199 2361.45139427
Var200 4109.96606037
Var201 10.4493228817
Var202 2616.82448231
Var203 5.40224448081
Var204 112.501306627
Var205 110.456443547
Var206 177.122347418
Var207 86.0231581
Var208 2.6955776509
Var210 54.6287477251
Var211 24.6601831132
Var212 196.381236914
Var213 7.76366404489
Var214 4109.96606037
Var215 0.396735228082
Var216 988.972226567
Var217 4552.91478616
Var218 172.797282905
Var219 18.1932619992
Var220 1578.90991846
Var221 59.7289561804
Var222 1578.90991846
Var223 2.43780559721
Var224 1.99192653855
Var225 79.9578628758
Var226 57.5026926211
Var227 77.083495083
Var228 150.525268966
Var229 93.3105502656
In [ ]:
Content source: dolaameng/practical_munging_tools
Similar notebooks: