Title: Missing Data In Pandas Dataframes
Slug: pandas_missing_data
Summary: Missing Data In Pandas Dataframes
Date: 2016-05-01 12:00
Category: Python
Tags: Data Wrangling
Authors: Chris Albon
In [6]:
import pandas as pd
import numpy as np
In [7]:
# Sample roster with deliberate gaps: the second record is missing in every
# field, and the third record is missing both test scores.
raw_data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)

# Spell out the column order explicitly so the displayed frame is predictable.
df = pd.DataFrame(
    data=raw_data,
    columns=[
        'first_name',
        'last_name',
        'age',
        'sex',
        'preTestScore',
        'postTestScore',
    ],
)
df
Out[7]:
In [8]:
# Keep only the rows that are complete: a row is dropped as soon as any
# one of its values is missing. The original frame is left untouched.
df_no_missing = df.dropna(axis=0, how='any')
df_no_missing
Out[8]:
In [9]:
# Drop a row only when every value in it is missing; rows that are
# partially filled in are kept.
df_cleaned = df.dropna(axis='index', how='all')
df_cleaned
Out[9]:
In [10]:
# Add a column that contains nothing but missing values, so there is an
# entirely empty column to drop in a later step. This modifies df itself.
df.loc[:, 'location'] = np.nan
df
Out[10]:
In [11]:
# Drop every column whose values are all missing (result is displayed only,
# df is not modified).
df.dropna(axis='columns', how='all')
Out[11]:
In [12]:
# Keep only the rows that have at least 5 non-missing values
# (result is displayed only, df is not modified).
df.dropna(axis=0, thresh=5)
Out[12]:
In [13]:
# Show a copy of the frame in which every missing value is replaced by 0
# (df itself is not modified).
df.fillna(value=0)
Out[13]:
In [14]:
# Impute missing preTestScore values with the mean of the observed scores.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df["preTestScore"] selection. That pattern
# is chained assignment: it raises a FutureWarning in pandas 2.x and, under
# Copy-on-Write (the default from pandas 3.0), leaves df unchanged.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
df
Out[14]:
In [15]:
# Impute missing postTestScore values with the mean postTestScore of the
# row's "sex" group. transform("mean") returns a Series aligned to df's index,
# so each gap is filled with its own group's mean. Rows whose "sex" is missing
# belong to no group and therefore stay NaN.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df["postTestScore"] selection. That pattern
# is chained assignment: it raises a FutureWarning in pandas 2.x and, under
# Copy-on-Write (the default from pandas 3.0), leaves df unchanged.
df["postTestScore"] = df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)
df
Out[15]:
In [16]:
# Keep only the rows in which both 'age' and 'sex' are present (neither is NaN)
df[df[['age', 'sex']].notna().all(axis=1)]
Out[16]: