In [1]:
# Notebook setup: reload edited modules automatically (mode 2) so changes to the
# local helper package are picked up live, and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from munging import session


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
## load data
# Feature table: gzip-compressed, tab-separated, first row holds the column
# names; both "NA" and the empty string are read as missing values.
custdata = pd.read_table(
    "data/orange_small_train.data.gz",
    sep="\t",
    header=0,
    na_values=["NA", ""],
    compression="gzip",
)

# Labels come from a separate text file, one numeric value per row.
churn = np.loadtxt("data/orange_small_train_churn.labels.txt")

# A label equal to 1 becomes "churn"; every other value becomes "nochurn".
is_churner = (churn == 1)
custdata["Churn"] = np.where(is_churner, "churn", "nochurn")

# Preview the first three rows.
custdata.head(n=3)


Out[3]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var222 Var223 Var224 Var225 Var226 Var227 Var228 Var229 Var230 Churn
0 NaN NaN NaN NaN NaN 1526 7 NaN NaN NaN ... fXVEsaq jySVZNlOJy NaN NaN xb3V RAYp F2FyR07IdsN7I NaN NaN nochurn
1 NaN NaN NaN NaN NaN 525 0 NaN NaN NaN ... 2Kb5FSF LM8l689qOp NaN NaN fKCe RAYp F2FyR07IdsN7I NaN NaN churn
2 NaN NaN NaN NaN NaN 5236 7 NaN NaN NaN ... NKv4yOc jySVZNlOJy NaN kG3k Qu4f 02N6s8f ib5G6X1eUxUn6 am7c NaN nochurn

3 rows × 231 columns


In [4]:
# Show the column labels of the loaded frame (features plus the added "Churn").
custdata.columns


Out[4]:
Index([u'Var1', u'Var2', u'Var3', u'Var4', u'Var5', u'Var6', u'Var7', u'Var8', u'Var9', u'Var10', u'Var11', u'Var12', u'Var13', u'Var14', u'Var15', u'Var16', u'Var17', u'Var18', u'Var19', u'Var20', u'Var21', u'Var22', u'Var23', u'Var24', u'Var25', u'Var26', u'Var27', u'Var28', u'Var29', u'Var30', u'Var31', u'Var32', u'Var33', u'Var34', u'Var35', u'Var36', u'Var37', u'Var38', u'Var39', u'Var40', u'Var41', u'Var42', u'Var43', u'Var44', u'Var45', u'Var46', u'Var47', u'Var48', u'Var49', u'Var50', u'Var51', u'Var52', u'Var53', u'Var54', u'Var55', u'Var56', u'Var57', u'Var58', u'Var59', u'Var60', u'Var61', u'Var62', u'Var63', u'Var64', u'Var65', u'Var66', u'Var67', u'Var68', u'Var69', u'Var70', u'Var71', u'Var72', u'Var73', u'Var74', u'Var75', u'Var76', u'Var77', u'Var78', u'Var79', u'Var80', u'Var81', u'Var82', u'Var83', u'Var84', u'Var85', u'Var86', u'Var87', u'Var88', u'Var89', u'Var90', u'Var91', u'Var92', u'Var93', u'Var94', u'Var95', u'Var96', u'Var97', u'Var98', u'Var99', u'Var100', ...], dtype='object')

In [58]:
# Wrap the frame in a munging Session; the second argument names the "Churn"
# column (presumably the prediction target -- confirm against munging.session).
custsession = session.Session(custdata, 'Churn')

Data Session Exploration & Inspection


In [6]:
# Ask the session which features are numerical, which are categorical and
# which contain missing values, then print each list with its label.
numerical_feats = custsession.find_numerical_features()
categorical_feats = custsession.find_categorical_features()
na_feats = custsession.find_na_features()

feature_report = (
    ('numerical:', numerical_feats),
    ('categorical:', categorical_feats),
    ('with_missing_values:', na_feats),
)
for report_label, report_feats in feature_report:
    # "%s %s" yields the same text as the two-argument print statement
    print("%s %s" % (report_label, report_feats))


numerical: ['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var6' 'Var7' 'Var8' 'Var9' 'Var10'
 'Var11' 'Var12' 'Var13' 'Var14' 'Var15' 'Var16' 'Var17' 'Var18' 'Var19'
 'Var20' 'Var21' 'Var22' 'Var23' 'Var24' 'Var25' 'Var26' 'Var27' 'Var28'
 'Var29' 'Var30' 'Var31' 'Var32' 'Var33' 'Var34' 'Var35' 'Var36' 'Var37'
 'Var38' 'Var39' 'Var40' 'Var41' 'Var42' 'Var43' 'Var44' 'Var45' 'Var46'
 'Var47' 'Var48' 'Var49' 'Var50' 'Var51' 'Var52' 'Var53' 'Var54' 'Var55'
 'Var56' 'Var57' 'Var58' 'Var59' 'Var60' 'Var61' 'Var62' 'Var63' 'Var64'
 'Var65' 'Var66' 'Var67' 'Var68' 'Var69' 'Var70' 'Var71' 'Var72' 'Var73'
 'Var74' 'Var75' 'Var76' 'Var77' 'Var78' 'Var79' 'Var80' 'Var81' 'Var82'
 'Var83' 'Var84' 'Var85' 'Var86' 'Var87' 'Var88' 'Var89' 'Var90' 'Var91'
 'Var92' 'Var93' 'Var94' 'Var95' 'Var96' 'Var97' 'Var98' 'Var99' 'Var100'
 'Var101' 'Var102' 'Var103' 'Var104' 'Var105' 'Var106' 'Var107' 'Var108'
 'Var109' 'Var110' 'Var111' 'Var112' 'Var113' 'Var114' 'Var115' 'Var116'
 'Var117' 'Var118' 'Var119' 'Var120' 'Var121' 'Var122' 'Var123' 'Var124'
 'Var125' 'Var126' 'Var127' 'Var128' 'Var129' 'Var130' 'Var131' 'Var132'
 'Var133' 'Var134' 'Var135' 'Var136' 'Var137' 'Var138' 'Var139' 'Var140'
 'Var141' 'Var142' 'Var143' 'Var144' 'Var145' 'Var146' 'Var147' 'Var148'
 'Var149' 'Var150' 'Var151' 'Var152' 'Var153' 'Var154' 'Var155' 'Var156'
 'Var157' 'Var158' 'Var159' 'Var160' 'Var161' 'Var162' 'Var163' 'Var164'
 'Var165' 'Var166' 'Var167' 'Var168' 'Var169' 'Var170' 'Var171' 'Var172'
 'Var173' 'Var174' 'Var175' 'Var176' 'Var177' 'Var178' 'Var179' 'Var180'
 'Var181' 'Var182' 'Var183' 'Var184' 'Var185' 'Var186' 'Var187' 'Var188'
 'Var189' 'Var190' 'Var209' 'Var230']
categorical: ['Var191' 'Var192' 'Var193' 'Var194' 'Var195' 'Var196' 'Var197' 'Var198'
 'Var199' 'Var200' 'Var201' 'Var202' 'Var203' 'Var204' 'Var205' 'Var206'
 'Var207' 'Var208' 'Var210' 'Var211' 'Var212' 'Var213' 'Var214' 'Var215'
 'Var216' 'Var217' 'Var218' 'Var219' 'Var220' 'Var221' 'Var222' 'Var223'
 'Var224' 'Var225' 'Var226' 'Var227' 'Var228' 'Var229']
with_missing_values: ['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var6' 'Var7' 'Var8' 'Var9' 'Var10'
 'Var11' 'Var12' 'Var13' 'Var14' 'Var15' 'Var16' 'Var17' 'Var18' 'Var19'
 'Var20' 'Var21' 'Var22' 'Var23' 'Var24' 'Var25' 'Var26' 'Var27' 'Var28'
 'Var29' 'Var30' 'Var31' 'Var32' 'Var33' 'Var34' 'Var35' 'Var36' 'Var37'
 'Var38' 'Var39' 'Var40' 'Var41' 'Var42' 'Var43' 'Var44' 'Var45' 'Var46'
 'Var47' 'Var48' 'Var49' 'Var50' 'Var51' 'Var52' 'Var53' 'Var54' 'Var55'
 'Var56' 'Var58' 'Var59' 'Var60' 'Var61' 'Var62' 'Var63' 'Var64' 'Var65'
 'Var66' 'Var67' 'Var68' 'Var69' 'Var70' 'Var71' 'Var72' 'Var74' 'Var75'
 'Var76' 'Var77' 'Var78' 'Var79' 'Var80' 'Var81' 'Var82' 'Var83' 'Var84'
 'Var85' 'Var86' 'Var87' 'Var88' 'Var89' 'Var90' 'Var91' 'Var92' 'Var93'
 'Var94' 'Var95' 'Var96' 'Var97' 'Var98' 'Var99' 'Var100' 'Var101' 'Var102'
 'Var103' 'Var104' 'Var105' 'Var106' 'Var107' 'Var108' 'Var109' 'Var110'
 'Var111' 'Var112' 'Var114' 'Var115' 'Var116' 'Var117' 'Var118' 'Var119'
 'Var120' 'Var121' 'Var122' 'Var123' 'Var124' 'Var125' 'Var126' 'Var127'
 'Var128' 'Var129' 'Var130' 'Var131' 'Var132' 'Var133' 'Var134' 'Var135'
 'Var136' 'Var137' 'Var138' 'Var139' 'Var140' 'Var141' 'Var142' 'Var143'
 'Var144' 'Var145' 'Var146' 'Var147' 'Var148' 'Var149' 'Var150' 'Var151'
 'Var152' 'Var153' 'Var154' 'Var155' 'Var156' 'Var157' 'Var158' 'Var159'
 'Var160' 'Var161' 'Var162' 'Var163' 'Var164' 'Var165' 'Var166' 'Var167'
 'Var168' 'Var169' 'Var170' 'Var171' 'Var172' 'Var173' 'Var174' 'Var175'
 'Var176' 'Var177' 'Var178' 'Var179' 'Var180' 'Var181' 'Var182' 'Var183'
 'Var184' 'Var185' 'Var186' 'Var187' 'Var188' 'Var189' 'Var190' 'Var191'
 'Var192' 'Var194' 'Var197' 'Var199' 'Var200' 'Var201' 'Var202' 'Var203'
 'Var205' 'Var206' 'Var208' 'Var209' 'Var213' 'Var214' 'Var215' 'Var217'
 'Var218' 'Var219' 'Var223' 'Var224' 'Var225' 'Var229' 'Var230']

In [7]:
# Columns that are neither numerical nor categorical features (the recorded
# output shows only 'Churn' is left over).
# Index.difference is the explicit set-difference API; using the `-` operator
# on an Index for this purpose is deprecated/removed in later pandas releases.
custdata.columns.difference(np.union1d(numerical_feats, categorical_feats))


Out[7]:
Index([u'Churn'], dtype='object')

In [9]:
# Collect the skewed features and the features the session considers
# non-informative, then print both lists with their labels.
skewed_feats = custsession.find_skewed_features()
noninformative_feats = custsession.find_noninformative_features()

for report_label, report_feats in (('skewed numerical feats:', skewed_feats),
                                   ('noninformative_feats:', noninformative_feats)):
    # "%s %s" yields the same text as the two-argument print statement
    print("%s %s" % (report_label, report_feats))


skewed numerical feats: ['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var6' 'Var7' 'Var9' 'Var10' 'Var11'
 'Var12' 'Var13' 'Var14' 'Var17' 'Var18' 'Var19' 'Var21' 'Var22' 'Var23'
 'Var24' 'Var25' 'Var26' 'Var27' 'Var28' 'Var29' 'Var33' 'Var34' 'Var35'
 'Var36' 'Var37' 'Var38' 'Var40' 'Var41' 'Var43' 'Var44' 'Var46' 'Var47'
 'Var49' 'Var50' 'Var51' 'Var53' 'Var56' 'Var58' 'Var59' 'Var60' 'Var61'
 'Var62' 'Var63' 'Var65' 'Var66' 'Var67' 'Var68' 'Var70' 'Var71' 'Var72'
 'Var73' 'Var74' 'Var75' 'Var76' 'Var77' 'Var78' 'Var80' 'Var81' 'Var83'
 'Var84' 'Var85' 'Var88' 'Var89' 'Var90' 'Var91' 'Var93' 'Var94' 'Var95'
 'Var96' 'Var97' 'Var98' 'Var99' 'Var100' 'Var101' 'Var103' 'Var104'
 'Var105' 'Var106' 'Var107' 'Var108' 'Var109' 'Var110' 'Var111' 'Var112'
 'Var115' 'Var116' 'Var117' 'Var119' 'Var120' 'Var121' 'Var122' 'Var123'
 'Var124' 'Var125' 'Var126' 'Var127' 'Var128' 'Var129' 'Var131' 'Var132'
 'Var133' 'Var134' 'Var137' 'Var138' 'Var139' 'Var140' 'Var143' 'Var144'
 'Var145' 'Var146' 'Var148' 'Var149' 'Var150' 'Var151' 'Var152' 'Var155'
 'Var156' 'Var157' 'Var158' 'Var159' 'Var160' 'Var161' 'Var162' 'Var163'
 'Var164' 'Var165' 'Var166' 'Var170' 'Var171' 'Var172' 'Var173' 'Var174'
 'Var176' 'Var177' 'Var178' 'Var179' 'Var181' 'Var182' 'Var183' 'Var184'
 'Var186' 'Var187' 'Var189']
noninformative_feats: ['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var8' 'Var9' 'Var10' 'Var11' 'Var12'
 'Var14' 'Var15' 'Var16' 'Var17' 'Var18' 'Var19' 'Var20' 'Var23' 'Var26'
 'Var27' 'Var29' 'Var30' 'Var31' 'Var32' 'Var33' 'Var34' 'Var36' 'Var37'
 'Var39' 'Var40' 'Var41' 'Var42' 'Var43' 'Var45' 'Var46' 'Var47' 'Var48'
 'Var49' 'Var50' 'Var51' 'Var52' 'Var53' 'Var54' 'Var55' 'Var56' 'Var58'
 'Var59' 'Var60' 'Var61' 'Var62' 'Var63' 'Var64' 'Var66' 'Var67' 'Var68'
 'Var69' 'Var70' 'Var71' 'Var75' 'Var77' 'Var79' 'Var80' 'Var82' 'Var84'
 'Var86' 'Var87' 'Var88' 'Var89' 'Var90' 'Var91' 'Var92' 'Var93' 'Var95'
 'Var96' 'Var97' 'Var98' 'Var99' 'Var100' 'Var101' 'Var102' 'Var103'
 'Var104' 'Var105' 'Var106' 'Var107' 'Var108' 'Var110' 'Var111' 'Var114'
 'Var115' 'Var116' 'Var117' 'Var118' 'Var120' 'Var121' 'Var122' 'Var124'
 'Var127' 'Var128' 'Var129' 'Var130' 'Var131' 'Var135' 'Var136' 'Var137'
 'Var138' 'Var139' 'Var141' 'Var142' 'Var145' 'Var146' 'Var147' 'Var148'
 'Var150' 'Var151' 'Var152' 'Var154' 'Var155' 'Var156' 'Var157' 'Var158'
 'Var159' 'Var161' 'Var162' 'Var164' 'Var165' 'Var166' 'Var167' 'Var168'
 'Var169' 'Var170' 'Var171' 'Var172' 'Var174' 'Var175' 'Var176' 'Var177'
 'Var178' 'Var179' 'Var180' 'Var182' 'Var183' 'Var184' 'Var185' 'Var186'
 'Var187' 'Var188' 'Var189' 'Var190' 'Var191' 'Var195' 'Var196' 'Var209'
 'Var210' 'Var213' 'Var215' 'Var224' 'Var230']

In [10]:
# Frequency of each distinct value of Var196 (one of the features reported as
# non-informative above).
custdata["Var196"].value_counts()


Out[10]:
1K8T    49550
z3mO      432
JA1C       17
mKeq        1
dtype: int64

In [11]:
# Density plots for the skewed features that were not flagged as
# non-informative; the original ordering of skewed_feats is kept.
informative_skewed_feats = [feat for feat in skewed_feats
                            if feat not in noninformative_feats]
custsession.plot_feature_density(informative_skewed_feats, kind="density")



In [12]:
# Plot the Churn label against Var153.
custsession.plot_feature_pair("Churn", "Var153")



In [13]:
# Plot Var197 against the Churn label, with an explicit 6x6 figure size.
custsession.plot_feature_pair( "Var197", "Churn", figsize=(6, 6))



In [63]:
# Drop the non-informative features from the session. The returned object is
# kept as `remover`; it is not used by the cells visible here.
remover = custsession.remove_features(noninformative_feats)

# Re-run both checks on the reduced session: the recorded output shows an empty
# non-informative list and a shorter skewed list.
remaining_noninformative = custsession.find_noninformative_features()
print(remaining_noninformative)
remaining_skewed = custsession.find_skewed_features()
print(remaining_skewed)


[]
['Var6' 'Var7' 'Var13' 'Var21' 'Var22' 'Var24' 'Var25' 'Var28' 'Var35'
 'Var38' 'Var44' 'Var65' 'Var72' 'Var73' 'Var74' 'Var76' 'Var78' 'Var81'
 'Var83' 'Var85' 'Var94' 'Var109' 'Var112' 'Var119' 'Var123' 'Var125'
 'Var126' 'Var132' 'Var133' 'Var134' 'Var140' 'Var143' 'Var144' 'Var149'
 'Var160' 'Var163' 'Var173' 'Var181']

In [15]:
# Remaining feature names, followed by the (rows, columns) shape of the
# session's train and validation frames.
remaining_features = custsession.get_features()
print(remaining_features)
train_shape = custsession.get_train_data().shape
print(train_shape)
validation_shape = custsession.get_validation_data().shape
print(validation_shape)


Index([u'Var6', u'Var7', u'Var13', u'Var21', u'Var22', u'Var24', u'Var25', u'Var28', u'Var35', u'Var38', u'Var44', u'Var57', u'Var65', u'Var72', u'Var73', u'Var74', u'Var76', u'Var78', u'Var81', u'Var83', u'Var85', u'Var94', u'Var109', u'Var112', u'Var113', u'Var119', u'Var123', u'Var125', u'Var126', u'Var132', u'Var133', u'Var134', u'Var140', u'Var143', u'Var144', u'Var149', u'Var153', u'Var160', u'Var163', u'Var173', u'Var181', u'Var192', u'Var193', u'Var194', u'Var197', u'Var198', u'Var199', u'Var200', u'Var201', u'Var202', u'Var203', u'Var204', u'Var205', u'Var206', u'Var207', u'Var208', u'Var211', u'Var212', u'Var214', u'Var216', u'Var217', u'Var218', u'Var219', u'Var220', u'Var221', u'Var222', u'Var223', u'Var225', u'Var226', u'Var227', u'Var228', u'Var229', u'Churn'], dtype='object')
(35000, 73)
(15000, 73)

In [16]:
# Cross-value table of Churn by Var229; the recorded output has
# train / validation / overall churn-rate columns per Var229 value.
custsession.get_crossvalue_table(["Var229"], ["Churn"])


Out[16]:
train_churn validation_churn overall_churn
Var229
oJmt 0.076923 0.090909 0.081081
All 0.073229 0.073933 0.073440
am7c 0.055202 0.057403 0.055864
mj86 0.052739 0.059480 0.054774
sk2h 0.035714 0.100000 0.052632

In [17]:
# Same table, broken down by the (Var229, Var197) value combination.
# NOTE(review): many combinations show NaN or 0/1 rates in the recorded output,
# which presumably reflects very small groups -- confirm before relying on it.
custsession.get_crossvalue_table(["Var229", "Var197"], ["Churn"])


Out[17]:
train_churn validation_churn overall_churn
oJmt hM6W 1.000000 NaN 1.000000
am7c hSk9 1.000000 0.000000 0.500000
sk2h 0Xwj 0.500000 1.000000 0.666667
mj86 nCqp 0.500000 0.000000 0.333333
ZF5Q 0.400000 0.000000 0.333333
5IqI 0.333333 NaN 0.333333
oJmt 0Xwj 0.333333 0.000000 0.200000
mj86 sM2H 0.333333 0.000000 0.250000
Ulfj 0.333333 0.000000 0.250000
tixA 0.250000 NaN 0.250000
am7c RsSg 0.250000 0.333333 0.285714
mj86 0aHy 0.250000 0.000000 0.125000
am7c MeKL 0.250000 NaN 0.250000
mj86 PShj 0.200000 0.000000 0.166667
R5iW 0.200000 NaN 0.200000
am7c _vzJ 0.200000 0.000000 0.111111
mj86 3PdN 0.200000 0.000000 0.125000
IGdn 0.200000 0.142857 0.181818
am7c 0kpG 0.200000 0.000000 0.150000
hAOr 0.181818 0.000000 0.125000
mj86 KXDt 0.166667 NaN 0.166667
8qEx 0.160000 0.214286 0.179487
80HR 0.160000 0.000000 0.121212
am7c G6s_ 0.157895 0.076923 0.125000
Cnh8 0.142857 0.166667 0.150000
USOt 0.142857 0.055556 0.116667
IIni 0.142857 0.000000 0.125000
iJ4u 0.142857 0.000000 0.100000
EzKK 0.136364 0.333333 0.193548
mj86 uNkU 0.136364 0.111111 0.129032
... ... ... ... ...
am7c 12fX NaN 0.500000 0.500000
2D6V NaN 0.333333 0.333333
5tHj NaN 0.000000 0.000000
CNAE NaN 0.000000 0.000000
aekr NaN 0.000000 0.000000
pdeM NaN 0.000000 0.000000
tN0D NaN 0.000000 0.000000
mj86 0zDT NaN 0.000000 0.000000
19FS NaN 0.000000 0.000000
D_RP NaN 0.000000 0.000000
MrXh NaN 0.000000 0.000000
ONrL NaN 0.000000 0.000000
Tw4q NaN 0.000000 0.000000
YRnd NaN 0.000000 0.000000
ZEGa NaN 0.000000 0.000000
d6Mq NaN 0.000000 0.000000
kOrz NaN 0.000000 0.000000
tN0D NaN 0.000000 0.000000
y3E2 NaN 0.000000 0.000000
oJmt 0LaQ NaN 0.000000 0.000000
1TqK NaN 0.000000 0.000000
487l NaN 0.000000 0.000000
LXO4 NaN 0.000000 0.000000
pGTb NaN 0.000000 0.000000
xgSF NaN 1.000000 1.000000
sk2h 7ALb NaN 0.000000 0.000000
AnrR NaN 0.000000 0.000000
FgS1 NaN 0.000000 0.000000
PGNs NaN 0.000000 0.000000
pE9z NaN 0.000000 0.000000

437 rows × 3 columns


In [18]:
# Cross-value table of Churn by Var219 (train / validation / overall columns).
custsession.get_crossvalue_table(["Var219"], ["Churn"])


Out[18]:
train_churn validation_churn overall_churn
FqMWi1g 0.111111 0.000000 0.060606
FzaX 0.074215 0.075773 0.074682
All 0.073229 0.073933 0.073440
qxDb 0.071151 0.069841 0.070772
AU8_WTd 0.061069 0.037736 0.054348
AU8pNoi 0.056430 0.054755 0.055906
OFWH 0.054348 0.070922 0.059952
Lmli 0.047120 0.013699 0.037879
wwPEXoilkr 0.041935 0.043478 0.042411
tdJW_Pm 0.016949 0.105263 0.038462
JdAM 0.000000 0.000000 0.000000
lkwAXjv 0.000000 0.000000 0.000000
kgEg 0.000000 NaN 0.000000
49W0rUY 0.000000 0.000000 0.000000
49W9HeL 0.000000 NaN 0.000000
FQHxeR8 0.000000 0.000000 0.000000
AU8ltHK 0.000000 0.250000 0.100000
AU8OvAe 0.000000 0.000000 0.000000
AU8KzzF 0.000000 0.000000 0.000000
AT1N 0.000000 0.000000 0.000000
6krWwfF 0.000000 NaN 0.000000
ylgWTXl 0.000000 0.142857 0.047619
HEoH NaN 0.000000 0.000000

In [19]:
# Plot Var21 against Var22 (this pair is reported with correlation 1.0 below).
custsession.plot_feature_pair("Var21", "Var22")



In [56]:
# Plot Var21 against Var160 (absolute correlation about 0.93 in the matrix below).
custsession.plot_feature_pair("Var21", "Var160")



In [65]:
# Ask the session for redundant features. In the recorded run it prints the
# pair Var21/Var22 with value 1.0 and returns ['Var22'].
custsession.find_redundant_features()


1.0 Index([u'Var21', u'Var22'], dtype='object')
Out[65]:
['Var22']

In [54]:
# Absolute pairwise correlations of the session data.
cmatrix = custsession.data.corr().abs()

# Zero the diagonal so a feature's correlation with itself never shows up as
# the maximum of the matrix.
for diag_pos in range(len(cmatrix)):
    cmatrix.iat[diag_pos, diag_pos] = 0

In [55]:
# Inspect the mutual absolute correlations of the three related features.
correlated_trio = ["Var21", "Var22", "Var160"]
cmatrix.loc[correlated_trio, correlated_trio]


Out[55]:
Var21 Var22 Var160
Var21 0.000000 1.000000 0.926964
Var22 1.000000 0.000000 0.927281
Var160 0.926964 0.927281 0.000000

In [52]:
# Per-feature mean of the absolute correlations (column-wise mean, diagonal
# already zeroed); used below to decide which feature of a pair to drop.
mean_corr = cmatrix.mean(axis = 0)

In [53]:
# Greedy redundancy pruning: while some pair of features has an absolute
# correlation above the threshold, drop the member of that pair whose mean
# correlation with all features (``mean_corr``) is larger (the second member
# of the pair when the means are equal).
corr_threshold = 0.90
removed_feats = []

# Work on a NaN-free copy. ``cmatrix`` itself stays untouched, so the cells
# that inspect it afterwards see the full matrix and re-running this cell
# yields the same result. NaN entries would otherwise turn the maximum into
# NaN and make the pair lookup fail.
pruned_corr = cmatrix.fillna(0)

while pruned_corr.size:
    corr_values = pruned_corr.values
    # argmax picks exactly one cell (the first in row-major order), so several
    # pairs tied at the maximum no longer break the two-name unpacking. On the
    # symmetric matrix that cell has row < column, which keeps f1/f2 in the
    # same column order as before.
    row_pos, col_pos = np.unravel_index(corr_values.argmax(), corr_values.shape)
    max_corr = corr_values[row_pos, col_pos]
    if max_corr <= corr_threshold:
        break
    f1, f2 = pruned_corr.columns[row_pos], pruned_corr.columns[col_pos]
    print("%s %s" % (f1, f2))
    feat_to_remove = f1 if mean_corr[f1] > mean_corr[f2] else f2
    removed_feats.append(feat_to_remove)
    # Zero the dropped feature's row and column so it cannot be selected again.
    pruned_corr.loc[:, feat_to_remove] = 0
    pruned_corr.loc[feat_to_remove, :] = 0


Var21 Var22
Var21 Var160

In [50]:
# Features selected for removal by the pruning loop.
# NOTE(review): the recorded output below carries an older execution count than
# the loop cell, so it may not reflect the loop's latest run.
removed_feats


Out[50]:
['Var22']

In [29]:
# Row/column positions of the largest entry of the correlation matrix
# (a symmetric matrix reports each pair twice).
peak_corr = cmatrix.values.max()
np.where(cmatrix == peak_corr)


Out[29]:
(array([3, 4]), array([4, 3]))

In [32]:
# Mean absolute correlation of the columns at positions 3 and 4 -- the
# positions returned by the np.where lookup above (Var21 and Var22 in the
# recorded output).
cmatrix.iloc[:, [3, 4]].mean()


Out[32]:
Var21    0.224932
Var22    0.226035
dtype: float64

In [39]:
# Spot check: absolute correlation between Var6 and Var7.
cmatrix.loc["Var6", "Var7"]


Out[39]:
0.14989044989623401

In [ ]: