In [ ]:
import pandas as pd
import numpy as np
% matplotlib inline
from matplotlib import pyplot as plt
from codefiles.class2 import get_data, print_analysis, plot_standard_deviations
In [ ]:
# Load the dataset once, then pull out the three columns of interest.
data = get_data()
heights, ages, gender = (
    data[column] for column in ('height', 'age', 'gender')
)
In [ ]:
# Histogram of heights: 40 bins on a wide (16x4 inch) figure, labelled
# through the Axes object that `hist` returns.
_ax = heights.hist(figsize=(16, 4), bins=40)
_ax.set_xlabel('Height')
_ax.set_ylabel('Count')
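Theory of the normal distribution: for normally distributed data, roughly 68% of values fall within one standard deviation of the mean, about 95% within two, and about 99.7% within three.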
In [ ]:
# Helper imported from codefiles.class2, called with the heights series and
# the label 'Heights'. Presumably draws the distribution with
# standard-deviation markers — confirm against the helper's implementation.
plot_standard_deviations(heights, 'Heights')
In [ ]:
# Helper imported from codefiles.class2; presumably prints summary statistics
# for the heights series — confirm against the helper's implementation.
print_analysis(heights)
Under (more than 2 standard deviations below the mean):
In [ ]:
# Heights strictly below (mean - 2 standard deviations).
heights.loc[heights.lt(heights.mean() - 2 * heights.std())]
Over (more than 2 standard deviations above the mean):
In [ ]:
# Heights strictly above (mean + 2 standard deviations).
heights.loc[heights.gt(heights.mean() + 2 * heights.std())]
Under (more than 3 standard deviations below the mean):
In [ ]:
# Heights strictly below (mean - 3 standard deviations).
heights.loc[heights.lt(heights.mean() - 3 * heights.std())]
Over (more than 3 standard deviations above the mean):
In [ ]:
# Heights strictly above (mean + 3 standard deviations).
heights.loc[heights.gt(heights.mean() + 3 * heights.std())]
In [ ]:
# Histogram of ages: 40 bins on a wide (16x4 inch) figure, labelled
# through the Axes object that `hist` returns.
_ax = ages.hist(figsize=(16, 4), bins=40)
_ax.set_xlabel('Age')
_ax.set_ylabel('Count')
In [ ]:
# Helper imported from codefiles.class2, called with the ages series and the
# label 'ages'. Presumably draws the distribution with standard-deviation
# markers — confirm against the helper's implementation.
plot_standard_deviations(ages, 'ages')
In [ ]:
# Helper imported from codefiles.class2; presumably prints summary statistics
# for the ages series — confirm against the helper's implementation.
print_analysis(ages)
What's the biggest outlier we have?
In [ ]:
# Largest value in the ages series, i.e. the most extreme value on the high side.
ages.max()
What if we used percentiles?
In [ ]:
# Quantile (as a fraction) used as the cut-off for "extreme" ages: the 99.9th percentile.
extreme_value = 0.999
In [ ]:
# Value of the ages series at the `extreme_value` quantile.
ages.quantile(extreme_value)
In [ ]:
# Boolean mask: True for every age strictly below the `extreme_value` quantile.
under_extreme_value = ages.lt(ages.quantile(extreme_value))
Well this looks a lot more usable:
In [ ]:
# Histogram of the ages selected by the `under_extreme_value` mask:
# 40 bins on a wide (16x4 inch) figure, labelled via the returned Axes.
_ax = ages.loc[under_extreme_value].hist(figsize=(16, 4), bins=40)
_ax.set_xlabel('Age')
_ax.set_ylabel('Count')
In [ ]:
# Same helper call as for the full ages series, restricted to the rows
# selected by the `under_extreme_value` mask.
plot_standard_deviations(ages.loc[under_extreme_value], 'ages')
In [ ]:
# Same helper call as for the full ages series, restricted to the rows
# selected by the `under_extreme_value` mask.
print_analysis(ages.loc[under_extreme_value])