In [ ]:
import pandas as pd
import numpy as np
In [ ]:
import statsmodels
from statsmodels.imputation import mice
In [ ]:
import random
In [ ]:
# Seed both generators so the notebook is reproducible: add_nulls draws rows
# with the stdlib `random` module, while the MICE perturbation/imputation
# steps rely on NumPy's global random state.
random.seed(10)
np.random.seed(10)
In [ ]:
# Load the example dataset over HTTP (requires network access).
# NOTE(review): this is a shortened goo.gl link; consider pinning the full
# URL or a local copy so the notebook keeps working if the redirect goes away.
df = pd.read_csv("http://goo.gl/19NKXV")
In [ ]:
# Preview the first rows of the freshly loaded frame.
df.head()
In [ ]:
# Keep an untouched copy of the complete data; it is the reference against
# which the imputed column means are compared at the end of the notebook.
original = df.copy()
In [ ]:
# Non-null observation count per numeric column of the complete data.
original.describe().loc['count']
Add some missing values
In [ ]:
def add_nulls(df, n, seed=None):
    """Return a copy of ``df`` in which ``n`` randomly chosen rows are missing.

    Every column of each selected row is replaced by a missing value; the
    input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to copy and punch holes into.
    n : int
        Number of distinct rows to blank out; must satisfy ``0 <= n <= len(df)``.
    seed : int, optional
        When given, rows are drawn from a private ``random.Random(seed)``.
        By default the module-level ``random`` generator is used, so a prior
        ``random.seed(...)`` call controls the selection.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of rows.
    """
    n_rows = len(df)
    if not 0 <= n <= n_rows:
        raise ValueError(
            "n must be between 0 and {} (got {})".format(n_rows, n)
        )

    rng = random if seed is None else random.Random(seed)
    chosen = rng.sample(range(n_rows), n)

    # Build a full-shape boolean mask marking every cell of the chosen rows.
    row_hit = np.zeros(n_rows, dtype=bool)
    row_hit[chosen] = True
    cell_hit = np.repeat(row_hit[:, None], df.shape[1], axis=1)

    # DataFrame.mask returns a new frame and upcasts where needed (e.g. int to
    # float), avoiding the incompatible-dtype setitem that assigning NaN through
    # iloc into integer columns triggers on newer pandas.
    return df.mask(cell_hit)
In [ ]:
# Blank out a fixed number of randomly chosen rows in each of these columns.
# Each column is processed on its own, so the missing rows differ per column.
null_counts = [
    ('Cholesterol', 20),
    ('Smoking', 20),
    ('Education', 20),
    ('Age', 5),
    ('BMI', 5),
]
for column, count in null_counts:
    df[column] = add_nulls(df[[column]], count)
Confirm the presence of null values
In [ ]:
# The `count` row of the summary reports non-null values only, so columns that
# received nulls show a lower count than the others.
df.describe()
Create categorical variables
In [ ]:
# Convert the discrete variables to pandas categoricals, one column at a time.
categorical_columns = ('Gender', 'Smoking', 'Education')
for name in categorical_columns:
    df[name] = df[name].astype('category')
In [ ]:
# Check that Gender, Smoking and Education now have the `category` dtype.
df.dtypes
Create dummy variables
In [ ]:
# Replace every categorical column with one indicator column per category.
# NOTE(review): with the default dummy_na=False, a NaN in a categorical column
# becomes a row of all-zero indicators, so the nulls injected into Smoking and
# Education are no longer visible as missing values downstream. Confirm this
# is intended.
df = pd.get_dummies(df);
MICEData class
In [ ]:
# Wrap the frame (with its missing values) in a MICEData object, which holds
# the working copy of the data in `imp.data` and the per-column imputation models.
imp = mice.MICEData(df)
Imputation for one feature
The `conditional_formula` attribute is a dictionary containing the models that will be used to impute the data for each column. It can be updated to change the imputation model.
In [ ]:
# Show the model formula currently used to impute BMI.
imp.conditional_formula['BMI']
In [ ]:
# Snapshot BMI before imputing; a copy is needed because imputation updates
# the column inside `imp.data`.
before = imp.data.BMI.copy()
The `perturb_params` method must be called before the `impute` method, which runs the imputation. `impute` updates the specified column in the `data` attribute.
In [ ]:
# Prepare the BMI imputation model; this has to run before impute('BMI').
imp.perturb_params('BMI')
In [ ]:
# Fill the missing BMI values in `imp.data`.
imp.impute('BMI')
In [ ]:
# Snapshot BMI right after the single-column imputation. Copy it (as is done
# for `before`) so that later updates to `imp.data`, e.g. update_all, do not
# silently change the values being compared and plotted.
after = imp.data.BMI.copy()
In [ ]:
import matplotlib.pyplot as plt
In [ ]:
# Overlay BMI before (filled red) and after (hollow black) imputation. Rows
# that were observed show both markers; rows that were missing only have an
# "after" marker. No plt.clf() here: with no current figure it would create an
# extra, empty figure before subplots() creates the real one.
fig, ax = plt.subplots(1, 1)
ax.plot(before, 'or', label='before', alpha=1, ms=8)
ax.plot(after, 'ok', label='after', alpha=0.8, mfc='w', ms=8)
# Attach the legend to this axes explicitly rather than to the "current" one.
ax.legend();
In [ ]:
# Side-by-side summary statistics of BMI before and after imputation.
pd.DataFrame({'before': before.describe(), 'after': after.describe()})
In [ ]:
# Rows whose BMI differs between the two snapshots. NaN compares unequal to
# everything, so this selects the entries that were missing before imputation.
before[before != after]
In [ ]:
# The same rows as in the previous cell, now showing the imputed BMI values.
after[before != after]
In [ ]:
# Run two imputation iterations over the whole dataset instead of one column.
imp.update_all(2)
In [ ]:
# Diagnostic plot for BMI: fitted values against observed/imputed values.
imp.plot_fit_obs('BMI');
In [ ]:
# Diagnostic plot for Age: fitted values against observed/imputed values.
imp.plot_fit_obs('Age');
In [ ]:
# Column means of the complete data. numeric_only=True skips non-numeric
# columns explicitly; pandas >= 2.0 raises a TypeError on them instead of
# dropping them silently.
original.mean(numeric_only=True)
In [ ]:
# Compare each column's mean after imputation with its mean in the complete
# ("exact") data and print the relative error in percent.
exact_means = original.mean(numeric_only=True)  # computed once, not per column
for col, exact in exact_means.items():
    if col not in imp.data.columns:
        # Columns that were dummy-encoded (e.g. Smoking, Education) no longer
        # exist under their original name, so there is nothing to compare.
        continue
    imputed = imp.data[col].mean()
    # Guard against a zero reference mean and keep the error non-negative.
    rel_err = abs(exact - imputed) / abs(exact) if exact else float('nan')
    # "mean" is the mean after imputation, "exact" the mean of the full data.
    print("{:<12} mean={:>8.2f}, exact={:>8.2f}, error={:>5.2g}%".format(col, imputed, exact, rel_err * 100))
In [ ]: