In [70]:
%load_ext autoreload
%autoreload 2
from munging import imputation
from munging import inspection
from munging import utility
from munging import transformation
from munging import feature
from munging import performance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
%matplotlib inline
In [115]:
custdata = pd.read_table("data/custdata.tsv")
In [116]:
custdata.head()
Out[116]:
In [106]:
inspection.na_pattern(custdata)
Out[106]:
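`inspection.na_pattern` comes from the local `munging` package and its output is not reproduced in this export. As a rough sketch of the kind of missing-value summary such a helper might return (the name `na_summary` and the per-column layout are my assumptions), something like this would do:

import pandas as pd

def na_summary(df):
    # count missing values per column and express them as a share of all rows
    counts = df.isnull().sum()
    return pd.DataFrame({"n_missing": counts,
                         "pct_missing": counts / float(len(df))})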
In [107]:
categorical_features = inspection.find_categorical_features(custdata)
print categorical_features
In [108]:
df = imputation.imput_categorical_features(custdata, categorical_features)
In [109]:
inspection.na_pattern(df)
Out[109]:
In [110]:
for f in categorical_features:
    print pd.unique(df[f])
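`imputation.imput_categorical_features` is not shown in this notebook. A minimal sketch of the idea, assuming it simply introduces an explicit placeholder level for missing categories (the helper name and the "missing" label are mine):

def impute_categorical_features(df, cols, fill_value="missing"):
    # replace NA in each categorical column with an explicit level,
    # so the fact that a value was absent stays visible to downstream models
    out = df.copy()
    for c in cols:
        out[c] = out[c].fillna(fill_value)
    return out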
In [111]:
numerical_features = inspection.find_numerical_features(custdata)
print numerical_features
df = imputation.imput_numerical_features(custdata, numerical_features)
In [112]:
inspection.na_pattern(df)
Out[112]:
In [113]:
df = imputation.imput(custdata)
inspection.na_pattern(df)
Out[113]:
In [114]:
df.head()
Out[114]:
In [13]:
df["num.vehicles_isna"].sum()
Out[13]:
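The `num.vehicles_isna` column checked above suggests that `imputation.imput_numerical_features` both fills the missing value and keeps an indicator of where it did so. A minimal sketch of that idea (mean filling is my assumption; the package could equally use the median):

def impute_numerical_features(df, cols):
    out = df.copy()
    for c in cols:
        # remember which rows were imputed, then fill with the column mean
        out[c + "_isna"] = out[c].isnull()
        out[c] = out[c].fillna(out[c].mean())
    return out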
In [14]:
df = imputation.imput(custdata)
In [15]:
inspection.plot_features_density(df)
In [16]:
inspection.plot_features_density(df, plot_type="hist")
In [17]:
skewed_feats = inspection.find_features_skewed(df, 20)
skewed_feats
Out[17]:
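`find_features_skewed` presumably ranks numeric columns by sample skewness and keeps those above the given threshold (20 here). A sketch of that logic using pandas' built-in skew (names are mine):

import numpy as np

def find_features_skewed(df, threshold):
    # absolute sample skewness of each numeric column; keep the strongly skewed ones
    skew = df.select_dtypes(include=[np.number]).skew().abs()
    return skew[skew > threshold].index.tolist()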
In [18]:
log_df = transformation.log_transform(df[df.income > 0], skewed_feats)
inspection.plot_features_density(log_df)
In [19]:
## arcsinh - assume negative income is valid (e.g., debt)
arc_df = transformation.arcsinh_transform(df, skewed_feats)
inspection.plot_features_density(arc_df)
In [20]:
## signed_log - assume negative income is valid (e.g., debt)
slog_df = transformation.signed_log_transform(df, skewed_feats)
inspection.plot_features_density(slog_df)
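The exact formulas inside `transformation` are not shown, but the three cells above correspond to standard remedies for heavy right skew. Plain `log` needs strictly positive values (hence the `income > 0` filter earlier); `arcsinh(x) = log(x + sqrt(x^2 + 1))` behaves like `log` for large |x| but is defined at zero and for negative incomes; "signed log" is commonly taken as `sign(x) * log(1 + |x|)`, which is my assumption here. A quick numerical illustration:

import numpy as np

x = np.array([-5000.0, 0.0, 300.0, 250000.0])
np.log(x[x > 0])                    # plain log, positive values only
np.arcsinh(x)                       # handles zero and negative values
np.sign(x) * np.log1p(np.abs(x))    # one common signed-log variant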
In [21]:
inspection.plot_features_density(slog_df, plot_type="hist")
In [22]:
df = imputation.imput(custdata)
df = df[df.income > 0]
df = transformation.log_transform(df, ["income"])
In [23]:
print inspection.find_numerical_features(df)
print inspection.find_categorical_features(df)
In [24]:
inspection.plot_feature_pair(df, "age", "log_income")
In [25]:
inspection.plot_feature_pair(df, "marital.stat", "income")
In [26]:
inspection.plot_feature_pair(df, "age", "marital.stat")
In [27]:
inspection.plot_feature_pair(df, "marital.stat", "sex")
In [28]:
inspection.plot_feature_pair(df, "sex", "marital.stat")
In [29]:
inspection.plot_feature_pair(df, "num.vehicles", "health.ins", legend = True)
In [30]:
print utility.is_discrete(df, "num.vehicles")
print utility.is_discrete(df, "custid")
In [31]:
## default qcut
r = transformation.discretize_numerical(df, ["num.vehicles"], max_qcut = 100)
print np.unique(r["discrete_num.vehicles"])
r.loc[:, ["num.vehicles", "discrete_num.vehicles"]].head()
Out[31]:
In [32]:
## equal bin size cut
r = transformation.discretize_numerical(df, ["income"], feat_bins = {"income": 10})
print np.unique(r["discrete_income"])
r.loc[:, ["income", "discrete_income"]].head()
Out[32]:
In [33]:
## customized bins cut
r = transformation.discretize_numerical(df, ["age"], feat_bins = {"age": [-100, 0, 25, 30, 50, 60, 100, 150]})
print np.unique(r["discrete_age"])
r.loc[:, ["age", "discrete_age"]].head()
Out[33]:
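`discretize_numerical` appears to wrap the usual pandas binning calls and write the result into a `discrete_<column>` column. The three variants above map roughly onto the following (a sketch, not the package's actual implementation):

import pandas as pd

pd.qcut(df["income"], 10)                                  # quantile bins: roughly equal counts per bin
pd.cut(df["income"], 10)                                   # equal-width bins over the observed range
pd.cut(df["age"], [-100, 0, 25, 30, 50, 60, 100, 150])     # caller-supplied bin edges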
In [34]:
inspection.plot_feature_pair(r, "discrete_age", "health.ins")
In [97]:
categorical_feats = [f for f in inspection.find_categorical_features(r) if f != "health.ins"]
rr = feature.BiClassProbabilityFeatureExtractor().fit_transform(r, categorical_feats, "health.ins")
rr.head(n = 2).T
Out[97]:
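`BiClassProbabilityFeatureExtractor` is part of the local package; generated column names such as `health.insIsTrue_on_discrete_age` suggest that, for every categorical feature, it adds the empirical probability of the positive class conditioned on that feature's level, i.e. a single-variable model of the target. A minimal sketch of that construction, fit and applied on the same frame (a real extractor would also smooth rare levels toward the prior and handle levels unseen during fitting):

import numpy as np

def prob_feature(df, cat_col, y_col, positive=True):
    # empirical P(y == positive | level of cat_col), looked up per row
    rates = df.groupby(cat_col)[y_col].apply(lambda s: np.mean(s == positive))
    return df[cat_col].map(rates)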
In [98]:
fig, axes = plt.subplots(nrows = 4, ncols = 2, figsize = (2 * 4, 4 * 4))
axes = axes.ravel()
iax = 0
for f in rr.columns:
    if '_on_' in f:
        performance.biclassification_density_plot(rr['health.ins'], rr[f], y_name="health.ins", yhat_name=f, ax = axes[iax])
        iax += 1
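`biclassification_density_plot` is again from the local package; the intent is to overlay the distribution of each probability feature separately for insured and uninsured customers, so well-separated humps mean the feature carries signal about the class. A rough matplotlib stand-in using plain histograms (names are mine):

import matplotlib.pyplot as plt

def class_conditional_plot(y, yhat, ax=None):
    if ax is None:
        ax = plt.gca()
    # distribution of the score among positive vs. negative examples
    ax.hist(yhat[y == True], bins=30, alpha=0.5, label="positive")
    ax.hist(yhat[y == False], bins=30, alpha=0.5, label="negative")
    ax.legend()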
In [99]:
performance.biclassification_density_plot(rr['health.ins'], rr["health.ins"] + np.random.randn(rr.shape[0]) * 0.01,
                                          y_name="health.ins", yhat_name="health.ins")
In [100]:
y = rr["health.ins"]
yhat = rr["health.insIsTrue_on_discrete_age"]
# baseline: log likelihood of a null model that always predicts the base rate of the positive class
prior_p = np.mean(y == True)
base_score = np.sum(np.log(np.where(y == True, prior_p, 1 - prior_p)))
# log likelihood of the single-variable probability feature used as a predictor
yhat_score = np.sum(np.log(np.where(y == True, yhat, 1 - yhat)))
In [101]:
base_score, yhat_score
Out[101]:
In [102]:
for f in rr.columns:
    if '_on_' in f:
        print f, performance.biclassification_likelihood_score(rr["health.ins"],
                                                                rr[f],
                                                                y_positive = True)
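`performance.biclassification_likelihood_score` is not reproduced here, but the manual `base_score` / `yhat_score` cell above shows the idea: sum the log-likelihood of the observed outcomes under the predicted probabilities, then compare against the prior-only baseline. A sketch of that computation as a reusable function (the function name and the `eps` clipping that guards against log(0) are mine):

import numpy as np

def likelihood_score(y, yhat, y_positive=True, eps=1e-12):
    # log likelihood of the observed labels under the predicted probabilities
    p = np.where(y == y_positive, yhat, 1.0 - yhat)
    return np.sum(np.log(np.clip(p, eps, 1.0)))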
In [ ]: