In [1]:
## auto-reload edited local modules (munging) and render figures inline
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
## project-local feature-munging helpers (Session + transform pipeline)
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
## NOTE(review): sklearn.cross_validation was deprecated in 0.18 and
## removed in 0.20 — train_test_split now lives in sklearn.model_selection
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
## load data
## raw features are tab-separated, gzipped, with "NA" or empty strings
## marking missing values; churn labels come in a separate file (+1 = churn)
churn_labels = np.loadtxt("data/orange_small_train_churn.labels.txt")
custdata = pd.read_table("data/orange_small_train.data.gz",
                         compression="gzip", na_values=["NA", ""],
                         delimiter="\t", header=0)
custdata["Churn"] = np.where(churn_labels == 1, "churn", "nochurn")
custdata.head(n=3)


Out[3]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... Var222 Var223 Var224 Var225 Var226 Var227 Var228 Var229 Var230 Churn
0 NaN NaN NaN NaN NaN 1526 7 NaN NaN NaN ... fXVEsaq jySVZNlOJy NaN NaN xb3V RAYp F2FyR07IdsN7I NaN NaN nochurn
1 NaN NaN NaN NaN NaN 525 0 NaN NaN NaN ... 2Kb5FSF LM8l689qOp NaN NaN fKCe RAYp F2FyR07IdsN7I NaN NaN churn
2 NaN NaN NaN NaN NaN 5236 7 NaN NaN NaN ... NKv4yOc jySVZNlOJy NaN kG3k Qu4f 02N6s8f ib5G6X1eUxUn6 am7c NaN nochurn

3 rows × 231 columns


In [4]:
train, test = train_test_split(np.arange(custdata.shape[0]), test_size = 0.15)
test_data = custdata.iloc[test, :]
custdata = custdata.iloc[train, :]
print custdata.shape, test_data.shape


(42500, 231) (7500, 231)

In [5]:
## wrap the training frame in a Session, naming the target column;
## `transformers` collects every fitted transform so the same pipeline
## can be replayed on the held-out test set at the end
custsession = session.Session(custdata, 'Churn')
transformers = []
## default thresholds used by the cleaning steps below
print custsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.95, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [6]:
## sanity check: every column is classified as either numerical or
## categorical — only the target column should be left over
numerical_features = custsession.get_features_of(criterion=custsession.is_numerical_feature)
categorical_features = custsession.get_features_of(criterion=custsession.is_categorical_feature)
classified = set(np.union1d(numerical_features, categorical_features))
set(custdata.columns) - classified


Out[6]:
{'Churn'}

In [7]:
## remove heavy missing value features
na_features = custsession.get_features_of(criterion=custsession.is_na_feature)
features_to_ignore = custsession.get_features_of(criterion=custsession.is_na_heavy)
remover = custsession.remove_features(features_to_ignore)
transformers.append(remover)
print len(custsession.get_all_input_features())


77

In [8]:
## list the features that still contain missing values (imputed next)
custsession.get_features_of(custsession.is_na_feature)


Out[8]:
array(['Var6', 'Var7', 'Var13', 'Var21', 'Var22', 'Var24', 'Var25',
       'Var28', 'Var35', 'Var38', 'Var44', 'Var51', 'Var65', 'Var72',
       'Var74', 'Var76', 'Var78', 'Var81', 'Var83', 'Var85', 'Var94',
       'Var109', 'Var112', 'Var119', 'Var123', 'Var125', 'Var126',
       'Var132', 'Var133', 'Var134', 'Var140', 'Var143', 'Var144',
       'Var149', 'Var153', 'Var160', 'Var163', 'Var173', 'Var181',
       'Var189', 'Var192', 'Var194', 'Var197', 'Var199', 'Var200',
       'Var201', 'Var202', 'Var203', 'Var205', 'Var206', 'Var208',
       'Var214', 'Var217', 'Var218', 'Var219', 'Var223', 'Var225', 'Var229'], 
      dtype='|S6')

In [9]:
## impute missing value features 
## auto_remove=True drops the original NA-containing columns in favour of
## the *_IMPUTED versions (plus *_IS_IMPUTED indicator columns —
## presumably; see the column names in Out[20], TODO confirm in munging)
feature_imputer = custsession.impute_features(auto_remove=True)
transformers.append(feature_imputer)
## verify no features with missing values remain
print custsession.get_features_of(custsession.is_na_feature)


[]

In [10]:
## remove non-informative (near-zero variance) feature
noninformative_feats = custsession.get_features_of(custsession.is_noninformative_feature)
remover = custsession.remove_features(noninformative_feats)
transformers.append(remover)
print custsession.get_features_of(custsession.is_noninformative_feature)


[]

In [11]:
## evenize skewed features
## log-transform numerical features whose skewness exceeds SKEWNESS_THR
## (auto_remove drops the raw versions in favour of *_LOG / *_LOG1).
## NOTE: the original cell first called get_features_of(...) and silently
## discarded the result (not the last expression, never printed) — that
## dead statement is removed.
feature_evenizer = custsession.evenize_skew_features(auto_remove=True)
transformers.append(feature_evenizer)
## display the features that remain skewed even after the transform
custsession.get_features_of(custsession.is_skewed_numerical_feature)


Out[11]:
array(['Var65_IMPUTED_LOG', 'Var78_IMPUTED_LOG1', 'Var181_IMPUTED_LOG1',
       'Var173_IMPUTED_LOG1', 'Var143_IMPUTED_LOG1', 'Var72_IMPUTED_LOG',
       'Var35_IMPUTED_LOG1', 'Var132_IMPUTED_LOG1', 'Var44_IMPUTED_LOG1'], 
      dtype='|S19')

In [12]:
## whiten features: center each numerical feature to zero mean and scale
## to unit variance (auto_remove keeps only the *_WHITE versions)
feature_whitener = custsession.whiten_features(auto_remove=True)
transformers.append(feature_whitener)
## sanity check: every numerical feature is now approximately N(0, 1)
for feat in custsession.get_features_of(custsession.is_numerical_feature):
    col = custsession.data[feat]
    assert np.isclose(col.mean(), 0, rtol=1e-1, atol=1e-1)
    assert np.isclose(col.std(), 1, rtol=1e-1, atol=1e-1)

In [13]:
## find redundant features 
## (features correlated with another above REDUNDANT_FEAT_CORR_THR);
## none at this point — see Out[13]
custsession.find_redundant_features()


Out[13]:
[]

In [14]:
## numerize categorical features
## produces *_churn_NUMERIZED columns (values look like per-level churn
## rates, cf. the Var211 crosstable below — TODO confirm in munging);
## auto_remove=False keeps the original categorical columns alongside.
## NOTE: the original cell re-fetched `categorical_features` here but the
## variable was never read again — that dead assignment is removed.
numerizer = custsession.numerize_categorical_features(auto_remove=False)
transformers.append(numerizer)

In [16]:
## explore categorical data
## churn-rate crosstable for one categorical feature, then feature-vs-
## target plot
## NOTE(review): judging by its name, print_categorial_crosstable may
## print the table itself, making the outer print redundant — confirm
print custsession.print_categorial_crosstable(feature_names=['Var211',  ])
custsession.plot_feature_pair("Var211", "Churn")


        train_churn  test_churn  overall_churn
Var211                                        
L84s       0.078597    0.072998       0.076914
All        0.074756    0.069569       0.073200
Mtgm       0.058915    0.055240       0.057820

In [17]:
## explore numerical data
## pairwise plots: numerical vs target, categorical vs numerical,
## and numerical vs numerical
custsession.plot_feature_pair("Var73_LOG_WHITE", "Churn")
custsession.plot_feature_pair("Var211", "Var73_LOG_WHITE")
custsession.plot_feature_pair('Var57_WHITE', 'Var113_WHITE')



In [15]:
## numerization may have introduced highly correlated columns — drop them
redundant_features = custsession.find_redundant_features()
remover = custsession.remove_features(redundant_features)
transformers.append(remover)
## verify no redundant features remain
custsession.find_redundant_features()


Out[15]:
[]

In [16]:
## rank numerized discrete features
## score each *_NUMERIZED column by its AUC against the churn target
numerized_features = [feat for feat in custsession.get_all_input_features()
                      if "NUMERIZED" in feat]
numerized_features_rank = custsession.rank_features(
    feature_names=numerized_features,
    by=custsession.numerized_feature_auc_metric,
    target_value="churn")

In [44]:
## find useful numerical features
## plot every (non-numerized) numerical feature against the target so
## informative ones can be picked out visually
numerical_features = custsession.get_features_of(custsession.is_numerical_feature)
numerical_features = [f for f in numerical_features if f not in numerized_features]
print len(numerical_features)
## NOTE: len(...)/4 relies on Python 2 integer division; surplus axes in
## the grid stay blank because zip stops at the shorter sequence
fig, axes = plt.subplots(nrows = len(numerical_features)/4+1, ncols=4, figsize = (4 * 4, len(numerical_features)+4))
axes = axes.ravel()
for f, ax in zip(numerical_features, axes):
    custsession.plot_feature_pair(xname = f, yname = "Churn", ax = ax, legend=False)


43

In [17]:
selected_features = np.asarray([f for f,s in numerized_features_rank[:10]]+ 
                               ["Var73_LOG_WHITE", "Var126_IMPUTED_SIGNEDLOG_WHITE", "Var144_IMPUTED_LOG1_WHITE"])
train_matrix, test_matrix = custsession.get_data(selected_features=selected_features)
print train_matrix.shape, test_matrix.shape


(29750, 14) (12750, 14)

In [30]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(train_matrix.iloc[:, :-1], train_matrix.iloc[:, -1])
print roc_auc_score(train_matrix.iloc[:, -1]=="churn", tree.predict_proba(train_matrix.iloc[:, :-1])[:, 0])
print roc_auc_score(test_matrix.iloc[:, -1]=="churn", tree.predict_proba(test_matrix.iloc[:, :-1])[:, 0])


0.836862751462
0.824788630969

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [21]:


In [19]:
## test all transformers on the new data 
## replay every fitted transform, in order, on the held-out rows
combiners = transform.TransformPipeline(transformers)
transformed_test = combiners.transform(test_data)
print transformed_test.shape
## NOTE(review): 150 output columns vs 149 input features — presumably
## the extra column is the target; confirm against TransformPipeline
print len(custsession.get_all_input_features())


(7500, 150)
149

In [20]:
## peek at the transformed test set (original categorical columns are
## still present because the numerizer ran with auto_remove=False)
transformed_test.head()


Out[20]:
Var193 Var198 Var204 Var207 Var211 Var212 Var216 Var220 Var221 Var222 ... Var217_IMPUTED_churn_NUMERIZED Var51_IS_IMPUTED_churn_NUMERIZED Var202_IMPUTED_churn_NUMERIZED Var199_IMPUTED_churn_NUMERIZED Var218_IMPUTED_churn_NUMERIZED Var219_IMPUTED_churn_NUMERIZED Var194_IMPUTED_churn_NUMERIZED Var197_IMPUTED_churn_NUMERIZED Var189_IS_IMPUTED_churn_NUMERIZED Var192_IMPUTED_churn_NUMERIZED
0 2Knk1KF 4M5uMXJ 5euo me75fM6ugJ Mtgm XfqtO3UdzaXh_ XTbPUYD IvCOZ4E oslk FjXRDbC ... 0.000000 0.075935 0.1 0.000000 0.058839 0.075574 0.077601 0.062016 0.073036 0.044248
1 RO12 4M5uMXJ 5euo me75fM6ugJ L84s NhsEn4L mAjbk_S IvCOZ4E oslk FjXRDbC ... 0.333333 0.075935 0.1 0.063158 0.058839 0.075574 0.077601 0.062016 0.073036 0.044248
2 NRCqczK fhk21Ss 5euo 7M47J5GA0pTYIFxg5uy L84s WsRVNrF85oPU_ Ekn9R0M 4UxGlow zCkv catzS2D ... 0.000000 0.075935 0.0 0.033860 0.058839 0.075574 0.077601 0.062016 0.073036 0.044248
3 2Knk1KF fhk21Ss jakt 7M47J5GA0pTYIFxg5uy L84s E1aAZ0x7vd beKMbmK 4UxGlow QKW8DRm catzS2D ... 0.000000 0.075935 0.0 0.052326 0.058839 0.075574 0.077601 0.062016 0.073036 0.044248
4 RO12 yB8VVdO TFSW me75fM6ugJ L84s NhsEn4L TDcECyH fmK1UQz oslk LsdaF5M ... 0.000000 0.075935 0.0 0.189189 0.058839 0.079091 0.077601 0.084736 0.073036 0.044248

5 rows × 150 columns


In [ ]: