In [0]:
# Dataset source (UCI Machine Learning Repository, "Adult" data set):
# https://archive.ics.uci.edu/ml/datasets/Adult
In [0]:
import great_expectations as ge
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
In [0]:
"""
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
"""
# Feature names split by kind, following the attribute list in the string above.
# build_transformer/apply_transformer label-encode and one-hot-encode the
# categorical columns and append the continuous columns unchanged.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
continuous_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
In [0]:
# Load both splits through great_expectations' read_csv so the frames expose the
# expect_* / validate methods used in the cells below. Paths are relative to the
# notebook's working directory.
df = ge.read_csv('../data/adult.data.b_2_train.csv')
df_test = ge.read_csv('../data/adult.data.b_2_test.csv')
df.head()
In [0]:
# (rows, columns) of the training frame.
df.shape
In [0]:
# First attempt, on the values exactly as loaded (strip_spaces has not run yet).
# NOTE(review): presumably fails because the raw strings carry surrounding
# whitespace — confirm against the cell output.
df.expect_column_values_to_be_in_set('sex', ['Female', 'Male'])
In [0]:
def strip_spaces(df):
    """Strip leading/trailing whitespace from the string values of ``df``, in place.

    Every column with object or string dtype is processed. Within such a column
    only ``str`` values are stripped, so missing values (``None``/``NaN``) and
    other non-string objects pass through unchanged. Columns are selected by
    dtype rather than by looking up ``df[column][0]``, so the frame may be
    empty, may have an index without the label ``0``, and may start with a
    missing value.

    Parameters
    ----------
    df : pandas.DataFrame (or subclass)
        Frame to clean; its columns are reassigned in place.

    Returns
    -------
    None
    """
    def _strip_if_str(value):
        # Leave NaN/None and any non-string object untouched.
        return value.strip() if isinstance(value, str) else value

    for column in df.columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            df[column] = values.map(_strip_if_str)
# Clean the string columns of both splits in place.
for frame in (df, df_test):
    strip_spaces(frame)
In [0]:
# Same check as before, repeated now that strip_spaces has been applied to df.
df.expect_column_values_to_be_in_set('sex', ['Female', 'Male'])
In [0]:
# Binary target column: 0 for the '<=50K' income label, 1 for any other value.
df['y'] = df['<=50k'].map(lambda income: 1 if income != '<=50K' else 0)
df_test['y'] = df_test['<=50k'].map(lambda income: 1 if income != '<=50K' else 0)
In [0]:
# Bar chart of the 'sex' value counts in the training split.
df['sex'].value_counts().plot(kind='bar')
In [0]:
# Build a categorical partition from the training 'sex' column, then run the
# chi-square expectation on the training frame against that same partition.
# NOTE(review): comparing the data with its own partition presumably passes
# trivially; the call looks intended to record the expectation on df so it ends
# up in get_expectation_suite() later — confirm.
sex_partition = ge.dataset.util.categorical_partition_data(df['sex'])
df.expect_column_chisquare_test_p_value_to_be_greater_than('sex', sex_partition)
In [0]:
# Compare the test split's 'sex' column with the partition built from the
# training split; the SUMMARY output format is requested.
df_test.expect_column_chisquare_test_p_value_to_be_greater_than('sex', sex_partition, output_format='SUMMARY')
In [0]:
# Histogram of the training 'age' column with matplotlib's default binning.
plt.hist(df['age'])
In [0]:
# Build a continuous partition from the training 'age' column (default settings)
# and run the bootstrapped KS expectation on the training frame against it.
age_partition = ge.dataset.util.continuous_partition_data(df['age'])
df.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('age', age_partition)
In [0]:
# Same KS expectation on the test split, against the training partition. The
# SUMMARY result is kept in `out` because the next cells plot from
# out['summary_obj'].
out = df_test.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('age', age_partition, output_format='SUMMARY')
print(out)
In [0]:
# Overlay the expected and observed CDFs reported in the summary object
# (first line: expected, second line: observed).
plt.plot(out['summary_obj']['expected_cdf']['x'], out['summary_obj']['expected_cdf']['cdf_values'])
plt.plot(out['summary_obj']['observed_cdf']['x'], out['summary_obj']['observed_cdf']['cdf_values'])
In [0]:
# Overlay the expected and observed partition weights. bins[1:] drops the first
# bin edge before plotting against the weights.
# NOTE(review): presumably there is one more edge than weights, so this plots
# each weight at its bin's right edge — confirm.
plt.plot(out['summary_obj']['expected_partition']['bins'][1:], out['summary_obj']['expected_partition']['weights'])
plt.plot(out['summary_obj']['observed_partition']['bins'][1:], out['summary_obj']['observed_partition']['weights'])
In [0]:
# Bar chart of the income-label counts ('<=50k' column) in the training split.
df['<=50k'].value_counts().plot(kind='bar')
In [0]:
# Bar chart of the 'education' value counts in the training split.
df['education'].value_counts().plot(kind='bar')
In [0]:
# Categorical partition of the training 'education' column, followed by the
# chi-square expectation on the training frame against that partition.
education_partition = ge.dataset.util.categorical_partition_data(df['education'])
df.expect_column_chisquare_test_p_value_to_be_greater_than('education', education_partition)
In [0]:
# Test split: plot the 'education' counts, then run the chi-square expectation
# against the partition built from the training split.
df_test['education'].value_counts().plot(kind='bar')
df_test.expect_column_chisquare_test_p_value_to_be_greater_than('education', education_partition)
In [0]:
# Alternative comparison of the test 'education' column with the training
# partition: KL divergence, with a threshold of 0.1.
df_test.expect_column_kl_divergence_to_be_less_than('education', education_partition, threshold=0.1)
In [0]:
# Histogram of the training 'education-num' column with matplotlib's default binning.
plt.hist(df['education-num'])
In [0]:
# Treat 'education-num' as continuous: partition with the helper's default
# settings and run the bootstrapped KS expectation on the training frame.
education_num_partition_auto = ge.dataset.util.continuous_partition_data(df['education-num'])
df.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('education-num', education_num_partition_auto)
In [0]:
# Inspect the partition produced with the default settings.
education_num_partition_auto
In [0]:
# Same column treated as categorical instead: categorical partition plus the
# chi-square expectation on the training frame.
education_num_partition_cat = ge.dataset.util.categorical_partition_data(df['education-num'])
df.expect_column_chisquare_test_p_value_to_be_greater_than('education-num', education_num_partition_cat)
In [0]:
# Chi-square check of the test split's 'education-num' against the categorical
# partition built from the training split.
df_test.expect_column_chisquare_test_p_value_to_be_greater_than('education-num', education_num_partition_cat)
In [0]:
# Continuous partition again, this time requesting uniform bins (n_bins=10),
# followed by the bootstrapped KS expectation on the training frame.
education_num_partition = ge.dataset.util.continuous_partition_data(df['education-num'], bins='uniform', n_bins=10)
df.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('education-num', education_num_partition)
In [0]:
# Per-education row counts for each target class, aligned side by side.
s1 = df['education'][df['y'] == 1].value_counts()
s1.name = 'education_y_1'
s2 = df['education'][df['y'] == 0].value_counts()
s2.name = 'education_y_0'
# concat aligns on the union of education levels; a level that is absent from
# one class would be NaN there, so fill with 0 to keep the stacked bar heights
# and bottoms well defined.
plotter = pd.concat([s1, s2], axis=1).fillna(0)
In [0]:
# Stacked bar chart: y == 0 counts at the bottom, y == 1 counts stacked on top,
# one bar per row of `plotter`, labelled with the education level.
positions = range(len(plotter))
p1 = plt.bar(positions, plotter['education_y_0'])
p2 = plt.bar(positions, plotter['education_y_1'], bottom=plotter['education_y_0'])
plt.xticks(positions, plotter.index, rotation='vertical')
plt.show()
In [0]:
# Display the expectation suite currently held by the training frame.
df.get_expectation_suite()
In [0]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
In [0]:
def build_transformer(df_train):
    """Fit the categorical encoders on ``df_train`` and build its feature matrix.

    Each column named in ``categorical_columns`` gets its own ``LabelEncoder``.
    The resulting codes are one-hot encoded together by a single
    ``OneHotEncoder``, and the columns named in ``continuous_columns`` are
    appended unchanged to the right of the one-hot block.

    Returns
    -------
    tuple
        ``(le, ohe, X_train)``: ``le`` maps column name -> fitted
        ``LabelEncoder``, ``ohe`` is the fitted ``OneHotEncoder``, and
        ``X_train`` is the combined feature array.
    """
    le = {name: LabelEncoder() for name in categorical_columns}
    codes = pd.DataFrame({
        name + '_le': le[name].fit_transform(df_train[name])
        for name in categorical_columns
    })
    ohe = OneHotEncoder()
    one_hot = ohe.fit_transform(codes)
    X_train = np.concatenate((one_hot.toarray(), df_train[continuous_columns]), axis=1)
    return le, ohe, X_train
def apply_transformer(le, ohe, df_test):
    """Build the feature matrix for ``df_test`` using already-fitted encoders.

    Parameters
    ----------
    le : dict
        Column name -> fitted ``LabelEncoder``, as returned by ``build_transformer``.
    ohe : OneHotEncoder
        Fitted one-hot encoder, as returned by ``build_transformer``.
    df_test : DataFrame
        Frame containing the ``categorical_columns`` and ``continuous_columns``.

    Returns
    -------
    The one-hot encoded categorical block with the continuous columns appended
    on the right, laid out like the matrix produced by ``build_transformer``.

    NOTE(review): nothing here handles category values that were absent when the
    encoders were fitted; presumably ``transform`` raises on them — confirm.
    """
    codes = pd.DataFrame({
        name + '_le': le[name].transform(df_test[name])
        for name in categorical_columns
    })
    one_hot = ohe.transform(codes)
    return np.concatenate((one_hot.toarray(), df_test[continuous_columns]), axis=1)
In [0]:
# Fix the seed so the fitted forest, and every score reported below, is the same
# on each Restart & Run All. Other hyperparameters are left at their defaults.
clf = RandomForestClassifier(random_state=42)
In [0]:
# Fit the encoders on the training frame and build the training feature matrix.
le, ohe, X_train = build_transformer(df)
In [0]:
# Train the classifier on the encoded training features and the binary target.
clf.fit(X_train, df['y'])
In [0]:
# Score on the same data the model was fitted on, so this is an optimistic
# figure; the other splits are scored in later cells.
clf.score(X_train, df['y'])
In [0]:
# Capture the training frame's expectation suite so it can be passed to
# validate() on the other splits.
my_expectations = df.get_expectation_suite()
In [0]:
# Inspect the captured suite.
my_expectations
In [0]:
# Validate the test split against the suite captured from the training frame.
results = df_test.validate(expectation_suite=my_expectations)
results
In [0]:
# Same validation, restricted to the failing expectations.
failures = df_test.validate(expectation_suite=my_expectations, only_return_failures=True)
failures
In [0]:
# Encode the test split with the encoders fitted on the training frame.
X_test = apply_transformer(le, ohe, df_test)
In [0]:
# Score of the fitted model on the b_2 test split.
clf.score(X_test, df_test['y'])
In [0]:
# Another split (b_1 train), cleaned, labelled and encoded exactly like the
# splits above, using the encoders fitted on `df`.
df_test_2 = ge.read_csv('../data/adult.data.b_1_train.csv')
strip_spaces(df_test_2)
# Disabled filter kept for reference.
# NOTE(review): presumably a workaround for a native-country value the fitted
# encoders had not seen — confirm before re-enabling.
#df_test_2 = df_test_2[df_test_2['native-country'] != 'Holand-Netherlands']
df_test_2['y'] = df_test_2['<=50k'].map(lambda income: 1 if income != '<=50K' else 0)
X_test_2 = apply_transformer(le, ohe, df_test_2)
In [0]:
# Score of the fitted model on the b_1 train split.
clf.score(X_test_2, df_test_2['y'])
In [0]:
# Health Screening: Preventative Checkup!
In [0]:
# Validate the b_1 train split against the suite captured from the training
# frame, keeping only the failing expectations and requesting SUMMARY output.
# Note that this rebinds `failures`.
failures = df_test_2.validate(my_expectations, only_return_failures=True, output_format='SUMMARY')
failures
In [0]:
# Bar chart of the 'sex' value counts in the b_1 train split, for comparison
# with the training-split chart near the top of the notebook.
df_test_2['sex'].value_counts().plot(kind='bar')
In [0]:
# Another split (b_1 test), cleaned, labelled and encoded exactly like the
# splits above, using the encoders fitted on `df`.
df_test_3 = ge.read_csv('../data/adult.data.b_1_test.csv')
strip_spaces(df_test_3)
# Disabled filter kept for reference.
# NOTE(review): presumably a workaround for a native-country value the fitted
# encoders had not seen — confirm before re-enabling.
#df_test_3 = df_test_3[df_test_3['native-country'] != 'Holand-Netherlands']
df_test_3['y'] = df_test_3['<=50k'].map(lambda income: 1 if income != '<=50K' else 0)
X_test_3 = apply_transformer(le, ohe, df_test_3)
In [0]:
# Score of the fitted model on the b_1 test split.
clf.score(X_test_3, df_test_3['y'])
In [0]:
#What could have gone wrong?
#
# a. The world changed.
# b. New sensor means different data.
# c. Bueller? Bueller?
# d. Biased sample of the data
In [0]:
# Re-run the suite and display the value computed in this cell, not the
# `failures` variable left over from an earlier cell.
# NOTE(review): this validates df_test_2 with the same arguments as an earlier
# cell; df_test_3 may have been the intended frame — confirm.
result = df_test_2.validate(my_expectations, only_return_failures=True, output_format='SUMMARY')
result
In [0]: