In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from munging import session
from munging import transform
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
In [3]:
## load the Orange small churn data (tab-separated, gzip-compressed; "NA" and empty strings mark missing values)
custdata = pd.read_table("data/orange_small_train.data.gz",
                         compression="gzip", na_values=["NA", ""],
                         delimiter="\t", header=0)
churn = np.loadtxt("data/orange_small_train_churn.labels.txt")
custdata["Churn"] = np.where(churn==1, "churn", "nochurn")
custdata.head(n = 3)
Out[3]:
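The churn class is a small minority in this dataset, which is worth keeping in mind when reading the AUC numbers later. A quick check of the class balance:
In [ ]:
## fraction of rows in each class
print custdata["Churn"].value_counts() / float(len(custdata))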
In [4]:
train, test = train_test_split(np.arange(custdata.shape[0]), test_size = 0.15)
test_data = custdata.iloc[test, :]
custdata = custdata.iloc[train, :]
print custdata.shape, test_data.shape
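With such an imbalanced target, a stratified split keeps the churn ratio equal in both partitions and makes the held-out AUC less noisy. A sketch of that alternative (it would replace the split above, and assumes a scikit-learn version whose train_test_split accepts a stratify argument):
In [ ]:
## hypothetical stratified variant of the split above
train, test = train_test_split(np.arange(custdata.shape[0]), test_size=0.15,
                               stratify=custdata["Churn"].values)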
In [5]:
custsession = session.Session(custdata, 'Churn')
transformers = []
print custsession.get_parameters()
In [6]:
numerical_features = custsession.get_features_of(criterion=custsession.is_numerical_feature)
categorical_features = custsession.get_features_of(criterion=custsession.is_categorical_feature)
all_features = set(custdata.columns)
all_features.difference(np.union1d(numerical_features, categorical_features))
Out[6]:
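The session's feature-type detection is internal to the munging library; a rough approximation with plain pandas, using dtypes (a sketch, not the library's actual rule):
In [ ]:
## object columns as categorical, numeric dtypes as numerical
numeric_like = custdata.select_dtypes(include=[np.number]).columns.tolist()
object_like = custdata.select_dtypes(include=["object"]).columns.tolist()
print len(numeric_like), len(object_like)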
In [7]:
## remove NA-heavy features (too many missing values)
na_features = custsession.get_features_of(criterion=custsession.is_na_feature)
features_to_ignore = custsession.get_features_of(criterion=custsession.is_na_heavy)
remover = custsession.remove_features(features_to_ignore)
transformers.append(remover)
print len(custsession.get_all_input_features())
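What counts as "NA heavy" is a threshold choice inside the library; the underlying statistic is simply the per-column fraction of missing values. A pandas sketch, with the 0.5 cutoff as an illustrative assumption:
In [ ]:
## per-column missing fraction; the 0.5 threshold is an assumption
na_fraction = custdata.isnull().mean()
print (na_fraction > 0.5).sum(), "features are more than half missing"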
In [8]:
custsession.get_features_of(custsession.is_na_feature)
Out[8]:
In [9]:
## impute features with missing values
feature_imputer = custsession.impute_features(auto_remove=True)
transformers.append(feature_imputer)
print custsession.get_features_of(custsession.is_na_feature)
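The library's imputation rule is not shown here; a common scheme, sketched with plain pandas (median for numerical columns, most frequent level for categorical ones, both assumptions):
In [ ]:
## hypothetical imputation sketch on a copy of the raw frame
df = custdata.copy()
for col in df.columns:
    if df[col].dtype == object:
        mode = df[col].mode()
        if len(mode) > 0:
            df[col] = df[col].fillna(mode[0])
    else:
        df[col] = df[col].fillna(df[col].median())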
In [10]:
## remove non-informative (near-zero variance) features
noninformative_feats = custsession.get_features_of(custsession.is_noninformative_feature)
remover = custsession.remove_features(noninformative_feats)
transformers.append(remover)
print custsession.get_features_of(custsession.is_noninformative_feature)
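"Non-informative" usually means near-zero variance: almost every row shares one value, so the feature cannot help separate the classes. A sketch of that check (the 99% dominance cutoff is an assumption):
In [ ]:
## flag columns whose most frequent value dominates the non-missing rows
def near_zero_variance(s, dominance=0.99):
    counts = s.value_counts(dropna=True)
    if len(counts) <= 1:
        return True
    return counts.iloc[0] / float(counts.sum()) > dominance
print sum(near_zero_variance(custdata[c]) for c in custdata.columns)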
In [11]:
## evenize (de-skew) skewed numerical features
custsession.get_features_of(custsession.is_skewed_numerical_feature)
feature_evenizer = custsession.evenize_skew_features(auto_remove=True)
transformers.append(feature_evenizer)
custsession.get_features_of(custsession.is_skewed_numerical_feature)
Out[11]:
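Judging from the _LOG, _LOG1, and _SIGNEDLOG suffixes that appear below, evenizing applies a log-family transform chosen to fit the sign of the data. The effect on one column, sketched with scipy (the choice of Var73 and the shift are illustrative):
In [ ]:
## a log transform pulls in the long right tail of a skewed feature
x = custdata["Var73"].dropna()
print stats.skew(x)
print stats.skew(np.log1p(x - x.min()))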
In [12]:
## whiten features (zero mean, unit variance)
feature_whitener = custsession.whiten_features(auto_remove=True)
transformers.append(feature_whitener)
for f in custsession.get_features_of(custsession.is_numerical_feature):
    assert np.isclose(custsession.data[f].mean(), 0, rtol=1e-1, atol=1e-1)
    assert np.isclose(custsession.data[f].std(), 1, rtol=1e-1, atol=1e-1)
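As the assertions above confirm, whitening here is per-feature standardization. The equivalent hand-rolled version for one column:
In [ ]:
## standardization sketch: subtract the mean, divide by the standard deviation
z = (custdata["Var113"] - custdata["Var113"].mean()) / custdata["Var113"].std()
print z.mean(), z.std()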
In [13]:
## find redundant features
custsession.find_redundant_features()
Out[13]:
In [14]:
## numerize categorical features
categorical_features = custsession.get_features_of(custsession.is_categorical_feature)
numerizer = custsession.numerize_categorical_features(auto_remove=False)
transformers.append(numerizer)
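How numerize maps a categorical column to a single numeric one is library-internal; one plausible scheme, given that the numerized features are later ranked by AUC against the target, is target-rate encoding. A sketch of that idea (an assumption about the method, not its actual implementation):
In [ ]:
## hypothetical target-rate encoding: each level becomes its observed churn rate
rates = custdata.groupby("Var211")["Churn"].apply(lambda s: (s == "churn").mean())
print custdata["Var211"].map(rates).head()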
In [16]:
## explore categorical data
print custsession.print_categorial_crosstable(feature_names=['Var211', ])
custsession.plot_feature_pair("Var211", "Churn")
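The same level-by-class table can be reproduced directly with pandas:
In [ ]:
## counts of each Var211 level per class
print pd.crosstab(custdata["Var211"], custdata["Churn"])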
In [17]:
## explore numerical data
custsession.plot_feature_pair("Var73_LOG_WHITE", "Churn")
custsession.plot_feature_pair("Var211", "Var73_LOG_WHITE")
custsession.plot_feature_pair('Var57_WHITE', 'Var113_WHITE')
In [15]:
redundant_features = custsession.find_redundant_features()
remover = custsession.remove_features(redundant_features)
transformers.append(remover)
custsession.find_redundant_features()
Out[15]:
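Redundancy detection of this kind is commonly correlation-based: from every highly correlated pair, one feature is dropped. A sketch with pandas (the 0.95 cutoff is an assumption, not the library's value):
In [ ]:
## count features that are nearly collinear with an earlier column
num_df = custdata.select_dtypes(include=[np.number])
corr = num_df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
print sum((upper[c] > 0.95).any() for c in upper.columns)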
In [16]:
## rank numerized discrete features
numerized_features = [f for f in custsession.get_all_input_features() if "NUMERIZED" in f]
numerized_features_rank = custsession.rank_features(
    feature_names=numerized_features,
    by=custsession.numerized_feature_auc_metric,
    target_value="churn")
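The AUC metric behind the ranking measures how well one column separates churners on its own, using the raw values as scores. A self-contained sketch (the helper name is hypothetical):
In [ ]:
## orientation-free single-feature AUC: 0.5 means uninformative
def single_feature_auc(col, target):
    mask = col.notnull()
    auc = roc_auc_score((target[mask] == "churn").values, col[mask].values)
    return max(auc, 1.0 - auc)
print single_feature_auc(custdata["Var73"], custdata["Churn"])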
In [44]:
## find useful numerical features
numerical_features = custsession.get_features_of(custsession.is_numerical_feature)
numerical_features = [f for f in numerical_features if f not in numerized_features]
print len(numerical_features)
fig, axes = plt.subplots(nrows=len(numerical_features)/4 + 1, ncols=4,
                         figsize=(4 * 4, len(numerical_features) + 4))
axes = axes.ravel()
for f, ax in zip(numerical_features, axes):
    custsession.plot_feature_pair(xname=f, yname="Churn", ax=ax, legend=False)
In [17]:
selected_features = np.asarray([f for f, s in numerized_features_rank[:10]] +
                               ["Var73_LOG_WHITE", "Var126_IMPUTED_SIGNEDLOG_WHITE",
                                "Var144_IMPUTED_LOG1_WHITE"])
train_matrix, test_matrix = custsession.get_data(selected_features=selected_features)
print train_matrix.shape, test_matrix.shape
In [30]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(train_matrix.iloc[:, :-1], train_matrix.iloc[:, -1])
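## classes_ are sorted alphabetically, so column 0 of predict_proba is P("churn")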
print roc_auc_score(train_matrix.iloc[:, -1]=="churn", tree.predict_proba(train_matrix.iloc[:, :-1])[:, 0])
print roc_auc_score(test_matrix.iloc[:, -1]=="churn", tree.predict_proba(test_matrix.iloc[:, :-1])[:, 0])
In [19]:
## apply the recorded transformers to the held-out test data
combiners = transform.TransformPipeline(transformers)
transformed_test = combiners.transform(test_data)
print transformed_test.shape
print len(custsession.get_all_input_features())
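TransformPipeline presumably applies each recorded transformer in order; the pattern it captures is just a left fold over the frame. A minimal sketch of that pattern (not the munging implementation; assumes every transformer exposes a transform method):
In [ ]:
## minimal pipeline sketch: thread the frame through each transformer in order
def apply_pipeline(transformers, df):
    for t in transformers:
        df = t.transform(df)
    return df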
In [20]:
transformed_test.head()
Out[20]: