In [ ]:
    
import pandas as pd
import numpy as np
    
In [ ]:
    
import statsmodels
from statsmodels.imputation import mice
    
In [ ]:
    
import random
    
In [ ]:
    
# Seed every random number generator the notebook may draw from, so that
# Restart & Run All reproduces the same results:
#   - `random` drives the row sampling in add_nulls;
#   - NumPy's global generator is presumably what statsmodels' MICE uses
#     when perturbing parameters and imputing -- TODO confirm.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)
    
In [ ]:
    
# Load the example dataset from a shortened, plain-HTTP URL (needs network access).
DATA_URL = "http://goo.gl/19NKXV"
df = pd.read_csv(DATA_URL)
    
In [ ]:
    
# Preview the first rows of the freshly loaded data.
df.head()
    
In [ ]:
    
# Keep an untouched copy of the data: `df` is modified below, and the
# imputed column means are compared against `original` at the end.
original = df.copy()
    
In [ ]:
    
# Non-null counts per summarised column, before any values are removed.
original_summary = original.describe()
original_summary.loc['count']
    
Add some missing values
In [ ]:
    
def add_nulls(df, n):
    """Return a copy of ``df`` in which ``n`` randomly chosen rows are NaN.

    Row positions are drawn without replacement with ``random.sample``, so
    the result is reproducible once ``random.seed`` has been called. The
    input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to copy; every column of a selected row is replaced by NaN.
    n : int
        Number of rows to blank out.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``n`` is negative or larger than
        the number of rows.
    """
    new = df.copy()
    rows = random.sample(range(new.shape[0]), n)
    # Use a boolean mask instead of assigning NaN through ``iloc``: ``mask``
    # upcasts integer columns as needed, whereas writing NaN into an integer
    # column in place relies on silent upcasting that recent pandas
    # versions deprecate.
    to_blank = np.zeros(new.shape, dtype=bool)
    to_blank[np.array(rows, dtype=np.intp), :] = True
    return new.mask(to_blank)
    
In [ ]:
    
# Number of rows to blank out in each column. Each column gets its own
# independent draw of row positions, in the order listed here.
nulls_per_column = {
    'Cholesterol': 20,
    'Smoking': 20,
    'Education': 20,
    'Age': 5,
    'BMI': 5,
}
for column, n_missing in nulls_per_column.items():
    df[column] = add_nulls(df[[column]], n_missing)
    
Confirm the presence of null values
In [ ]:
    
# The 'count' row of describe() gives the number of non-null values per
# summarised column; counts below the number of rows show that nulls were
# inserted.
df.describe()
    
Create categorical variables
In [ ]:
    
# Convert the discrete-valued columns to pandas' 'category' dtype in one call.
categorical_columns = ['Gender', 'Smoking', 'Education']
df = df.astype({name: 'category' for name in categorical_columns})
    
In [ ]:
    
# Check that 'Gender', 'Smoking' and 'Education' now have dtype 'category'.
df.dtypes
    
Create dummy variables
In [ ]:
    
# Replace the categorical columns with indicator (dummy) columns.
# NOTE(review): get_dummies is called with its default dummy_na=False, so a
# row whose categorical value is NaN presumably becomes an all-zero dummy
# row instead of staying missing. If so, the nulls injected into 'Smoking'
# and 'Education' are never imputed -- confirm, and restore NaN in the
# dummy columns if these variables are meant to be imputed.
# NOTE(review): this cell rebinds `df`, so it is not idempotent; running it
# twice operates on the already-encoded frame.
df = pd.get_dummies(df);
    
MICEData class
In [ ]:
    
# Wrap the dummy-encoded frame in statsmodels' MICEData; `imp` is the object
# used below to impute the missing values and to inspect the results.
imp = mice.MICEData(df)
    
 Imputation for one feature 
The conditional_formula attribute is a dictionary containing the models that will be used to impute the data for each column. This can be updated to change the imputation model.
In [ ]:
    
# Show the model entry currently registered for imputing 'BMI'.
imp.conditional_formula['BMI']
    
In [ ]:
    
# Snapshot BMI (as a copy) before imputing, for comparison with the
# imputed column afterwards.
before = imp.data.BMI.copy()
    
The perturb_params method must be called before the impute method. The impute method runs the imputation and updates the specified column in the data attribute.
In [ ]:
    
# Required step before impute('BMI').
# NOTE(review): presumably draws perturbed parameters for BMI's imputation
# model -- confirm against the statsmodels MICEData documentation.
imp.perturb_params('BMI')
    
In [ ]:
    
# Run the imputation for 'BMI'; this updates the BMI column of imp.data.
imp.impute('BMI')
    
In [ ]:
    
# Snapshot the imputed column with .copy(), mirroring `before`, so that
# later imputation passes over imp.data (e.g. imp.update_all), which
# presumably update it in place, cannot silently change `after`.
after = imp.data.BMI.copy()
    
In [ ]:
    
import matplotlib.pyplot as plt
    
In [ ]:
    
# Overlay BMI before and after imputation. Red filled markers are the values
# present before; white-filled black markers are the values after. A marker
# with no red underneath is therefore an imputed point.
# (No plt.clf() here: on a fresh kernel it would create a stray empty figure
# that plt.subplots never uses.)
fig, ax = plt.subplots(1, 1)
ax.plot(before, 'or', label='before', alpha=1, ms=8)
ax.plot(after, 'ok', label='after', alpha=0.8, mfc='w', ms=8)
ax.set(title='BMI before and after imputation', xlabel='row index', ylabel='BMI')
ax.legend();
    
In [ ]:
    
# Side-by-side summary statistics of BMI before and after imputation.
summary_before = before.describe()
summary_after = after.describe()
pd.DataFrame({'before': summary_before, 'after': summary_after})
    
In [ ]:
    
# Entries of `before` that differ from `after`. NaN compares unequal to
# everything, so this selects the BMI values that were missing before
# imputation (they show up as NaN here).
before[before != after]
    
In [ ]:
    
# The same positions taken from `after`: the values that imputation filled
# in for the entries that were NaN in `before`.
after[before != after]
    
In [ ]:
    
# NOTE(review): presumably runs 2 full imputation cycles over every column
# that has missing values -- confirm against the statsmodels
# MICEData.update_all documentation.
imp.update_all(2)
    
In [ ]:
    
# Diagnostic plot for the imputed 'BMI' column (presumably fitted vs.
# observed values -- confirm in the statsmodels docs). The trailing ';'
# suppresses the returned figure's repr.
imp.plot_fit_obs('BMI');
    
In [ ]:
    
# Same diagnostic plot for the imputed 'Age' column; the trailing ';'
# suppresses the returned figure's repr.
imp.plot_fit_obs('Age');
    
In [ ]:
    
# Column means of the untouched data. numeric_only=True restricts the
# result to numeric columns; without it, pandas >= 2.0 raises if the frame
# contains any text column instead of silently dropping it.
original.mean(numeric_only=True)
    
In [ ]:
    
# Compare each numeric column's mean in the untouched data with its mean
# after imputation, and print the relative error as a percentage.
# The means are computed once, restricted to numeric columns (pandas >= 2.0
# raises on text columns otherwise).
original_means = original.mean(numeric_only=True)
for col, x in original_means.items():
    # Columns that pd.get_dummies replaced with indicator columns no longer
    # exist under their original name in the imputed data; skip them.
    if col not in imp.data.columns:
        continue
    y = imp.data[col].mean()
    # Relative to the magnitude of the true mean; undefined when it is zero.
    e = abs(x - y) / abs(x) if x != 0 else float('nan')
    # x is the true (original) mean and y the mean after imputation; the
    # labels say so explicitly.
    print("{:<12}  original={:>8.2f}, imputed={:>8.2f}, error={:>5.2g}%".format(col, x, y, e * 100))
    
In [ ]: