In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
在本练习中,您将使用成年人普查收入数据集,这是机器学习文献中常用的数据集。该数据由 Ronny Kohavi 和 Barry Becker 从 1994 年人口普查局数据库中提取而来。
数据集中的每个样本都包含参与 1994 年人口普查的一组个体的下列人口统计数据:
age
:个体的年龄。fnlwgt
:人口普查机构认为这组观察数据代表的个体人数。education_num
:教育类别表示法的列举。数值越大,个体的教育水平就越高。例如,education_num
为 11
表示 Assoc_voc
(职业学校的副学士学位)、education_num
为 13
表示 Bachelors
(学士学位)、education_num
为 9
表示 HS-grad
(高中毕业)。capital_gain
:个体的资本收益,以美元表示。capital_loss
:个体的资本损失,以美元表示。hours_per_week
:每周工作时数。workclass
:个体的雇主类型。例如:Private
、Self-emp-not-inc
、Self-emp-inc
、Federal-gov
、Local-gov
、State-gov
、Without-pay
和 Never-worked
。education
:个体取得的最高教育水平。marital_status
:个体的婚姻状况。例如:Married-civ-spouse、
Divorced、
Never-married、
Separated、
Widowed、
Married-spouse-absent和
Married-AF-spouse。occupation
:个体的职业。例如:tech-support
、Craft-repair
、Other-service
、Sales
、Exec-managerial
等等。relationship
:每个人在家庭关系中的角色。例如:Wife
、Own-child
、Husband
、Not-in-family
、Other-relative
和 Unmarried
。gender
:个体的性别,只能有两种选择:Female
或 Male
。race
:White
、Asian-Pac-Islander
、Amer-Indian-Eskimo
、Black
和 Other
。native_country
:个体的原籍国。例如:United-States
、Cambodia
、England
、Puerto-Rico
、Canada
、Germany
、Outlying-US(Guam-USVI-etc)
、India
、Japan
、United-States
、Cambodia
、England
、Puerto-Rico
、Canada
、Germany
、Outlying-US(Guam-USVI-etc)
、India
、Japan
等等。预测任务是确定一个人的年收入是否超过 5 万美元。
income_bracket
:此人的年收入是否超过 5 万美元。为此数据集提取的所有样本都满足以下条件:
age
为 16 岁或以上。调整后的总收入(用于计算 income_bracket
)每年超过 100 美元。fnlwgt
大于 0。hours_per_week
大于 0。
In [0]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tempfile
!pip install seaborn==0.8.1
import seaborn as sns
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from google.colab import widgets
# For facets
from IPython.core.display import display, HTML
import base64
!pip install facets-overview==1.0.0
from facets_overview.feature_statistics_generator import FeatureStatisticsGenerator
print('Modules are imported.')
In [0]:
# Column names applied to both CSV files through `names=`.
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# Parsing options shared by the two reads: fields are split on commas together
# with any surrounding whitespace, and "?" entries are read as missing values.
shared_read_options = dict(
    names=COLUMNS,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

train_raw = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    **shared_read_options)
# The first line of adult.test is skipped before parsing.
test_raw = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=[0],
    **shared_read_options)

# Keep only the rows in which every column has a value.
train_df = train_raw.dropna(how="any", axis=0)
test_df = test_raw.dropna(how="any", axis=0)

print('UCI Adult Census Income dataset loaded.')
首先,我们可以使用 Facets Overview,这是一个交互式可视化工具,可以帮助我们探索数据集。通过 Facets Overview,我们可以快速分析成年人数据集中各个值的分布情况。
In [0]:
#@title Visualize the Data in Facets
# Compute feature statistics for the training DataFrame.
fsg = FeatureStatisticsGenerator()
dataframes = [dict(table=train_df, name='trainData')]
censusProto = fsg.ProtoFromDataFrames(dataframes)

# The serialized proto is base64-encoded so that it can be embedded as a
# plain string inside the HTML snippet below.
serialized_proto = censusProto.SerializeToString()
protostr = base64.b64encode(serialized_proto).decode("utf-8")

# HTML/JS snippet that hands the encoded statistics to <facets-overview>.
HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-overview id="elem"></facets-overview>
<script>
document.querySelector("#elem").protoInput = "{protostr}";
</script>"""

html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))
查看数值特征和类别特征的缺失列后,可以看到没有缺失的特征值,所以这不是一个问题。
通过观察每个数值特征的最小/最大值和直方图,我们可以确定数据集中的任何极端离群值。对于 hours_per_week
,我们可以看到最小值是 1,这可能有点奇怪,因为大部分工作都要求每周工作多个小时。对于 capital_gain
和 capital_loss
,我们可以看到超过 90% 的值是 0。考虑到只有进行投资的个体才会登记资本收益/损失,那么,该特征有不足 10% 的样本具有非零值当然是合理的,但是,我们可能需要更仔细地验证这些特征值是否有效。
通过观察性别直方图,我们发现超过三分之二(约 67%)的样本代表男性。这有力地表明了数据偏斜现象,因为我们期望两个性别之间的划分接近 50/50。
为了进一步探索数据集,我们可以使用 Facets Dive,这个工具提供了一个交互界面,在此界面中,可视化图表中的每个项目都代表一个数据点。但是要使用 Facets Dive,我们需要将数据转换为 JSON 数组。
幸好,DataFrame 方法 to_json()
为我们解决了这个问题。
请运行下面的单元格,以将数据转换为 JSON 数组并加载 Facets Dive。
In [0]:
#@title Set the Number of Data Points to Visualize in Facets Dive
SAMPLE_SIZE = 2500 #@param

# DataFrame.sample() draws without replacement by default and raises a
# ValueError when asked for more rows than the frame holds, so the value typed
# into the form is capped at the number of available training rows.
sample_count = min(SAMPLE_SIZE, len(train_df))

# Facets Dive consumes a JSON array with one object per data point.
train_dive = train_df.sample(sample_count).to_json(orient='records')

# HTML/JS snippet that assigns the JSON records to the <facets-dive> element.
HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-dive id="elem" height="600"></facets-dive>
<script>
var data = {jsonstr};
document.querySelector("#elem").data = data;
</script>"""

html = HTML_TEMPLATE.format(jsonstr=train_dive)
display(HTML(html))
使用可视化图表左侧面板上的菜单更改数据的组织方式:
在 Faceting | X-Axis 菜单中,选择 education,在 Display | Color 和 Display | Type 菜单中,选择 income_bracket。您会如何描述教育水平和收入档次之间的关系?
接下来,在 Faceting | X-Axis 菜单中,选择 marital_status,在 Display | Color 和 Display | Type 菜单中,选择 gender。对于每个 marital-status 类别的性别分布,您能观察到哪些值得注意的现象?
在执行上述任务时,请记住以下与公平性有关的问题:
在我们的数据集中,较高的教育水平通常与较高的收入档次相关联。在教育水平为本科或以上的样本中,收入水平超过 5 万美元的人所占比例更大。
在大多数 marital-status 类别中,男性和女性值的分布接近 1:1。唯一值得注意的例外情况是“married-civ-spouse”,这时候男性超出了女性,男女比例超过 5:1。考虑到我们在任务 1 中已经发现在数据集中男性的比例特别高,那么我们现在可以推断已婚女性在数据中所占比例特别低。
In [0]:
feature = 'capital_gain / capital_loss' #@param ["", "hours_per_week", "fnlwgt", "gender", "capital_gain / capital_loss", "age"] {allow-input: false}

# Commentary printed for each feature that can be chosen in the form above.
# The empty selection has no entry, so nothing is printed for it.
FEATURE_NOTES = {
    "hours_per_week":
        '''It does seem a little strange to see 'hours_per_week' max out at 99 hours,
which could lead to data misrepresentation. One way to address this is by
representing 'hours_per_week' as a binary "working 40 hours/not working 40
hours" feature. Also keep in mind that data was extracted based on work hours
being greater than 0. In other words, this feature representation exclude a
subpopulation of the US that is not working. This could skew the outcomes of the
model.''',
    "fnlwgt":
        """'fnlwgt' represents the weight of the observations. After fitting the model
to this data set, if certain group of individuals end up performing poorly
compared to other groups, then we could explore ways of reweighting each data
point using this feature.""",
    "gender":
        """Looking at the ratio between men and women shows how disproportionate the data
is compared to the real world where the ratio (at least in the US) is closer to
1:1. This could pose a huge probem in performance across gender. Considerable
measures may need to be taken to upsample the underrepresented group (in this
case, women).""",
    "capital_gain / capital_loss":
        """Both 'capital_gain' and 'capital_loss' have very low variance, which might
suggest they don't contribute a whole lot of information for predicting income. It
may be okay to omit these features rather than giving the model more noise.""",
    "age":
        '''"age" has a lot of variance, so it might benefit from bucketing to learn
fine-grained correlations between income and age, as well as to prevent
overfitting.''',
}

selected_note = FEATURE_NOTES.get(feature)
if selected_note is not None:
    print(selected_note)
In [0]:
def csv_to_pandas_input_fn(data, batch_size=100, num_epochs=1, shuffle=False):
    """Wraps a census DataFrame in a TensorFlow pandas input function.

    Args:
      data: DataFrame containing an 'income_bracket' column; every other
        column is passed through as a feature.
      batch_size: forwarded to pandas_input_fn.
      num_epochs: forwarded to pandas_input_fn.
      shuffle: forwarded to pandas_input_fn.

    Returns:
      The object returned by tf.estimator.inputs.pandas_input_fn.
    """
    feature_frame = data.drop('income_bracket', axis=1)
    # Binary label: 1 when the bracket text contains ">50K", otherwise 0.
    income_labels = data['income_bracket'].apply(
        lambda bracket: ">50K" in bracket).astype(int)
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=income_labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=1)


print('csv_to_pandas_input_fn() defined.')
In [0]:
#@title Categorical Feature Columns
# Short aliases for the two column constructors used in this cell.
hash_bucket_column = tf.feature_column.categorical_column_with_hash_bucket
vocabulary_column = tf.feature_column.categorical_column_with_vocabulary_list

# occupation and native_country are not enumerated here; each string value is
# hashed into one of 1000 buckets instead.
occupation = hash_bucket_column("occupation", hash_bucket_size=1000)
native_country = hash_bucket_column("native_country", hash_bucket_size=1000)

# The remaining categorical features list their accepted values explicitly.
gender = vocabulary_column("gender", ["Female", "Male"])

race = vocabulary_column("race", [
    "White",
    "Asian-Pac-Islander",
    "Amer-Indian-Eskimo",
    "Other",
    "Black",
])

education = vocabulary_column("education", [
    "Bachelors", "HS-grad", "11th", "Masters",
    "9th", "Some-college", "Assoc-acdm", "Assoc-voc",
    "7th-8th", "Doctorate", "Prof-school", "5th-6th",
    "10th", "1st-4th", "Preschool", "12th",
])

marital_status = vocabulary_column("marital_status", [
    "Married-civ-spouse",
    "Divorced",
    "Married-spouse-absent",
    "Never-married",
    "Separated",
    "Married-AF-spouse",
    "Widowed",
])

relationship = vocabulary_column("relationship", [
    "Husband",
    "Not-in-family",
    "Wife",
    "Own-child",
    "Unmarried",
    "Other-relative",
])

workclass = vocabulary_column("workclass", [
    "Self-emp-not-inc",
    "Private",
    "State-gov",
    "Federal-gov",
    "Local-gov",
    "?",
    "Self-emp-inc",
    "Without-pay",
    "Never-worked",
])

print('Categorical feature columns defined.')
In [0]:
#@title Numeric Feature Columns
# Each of these features is declared with feature_column.numeric_column(), so
# no value-to-ID mapping has to be defined for it.
NUMERIC_FEATURE_KEYS = (
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
)

# One numeric column per key, unpacked in the same order as the keys above.
(age, fnlwgt, education_num,
 capital_gain, capital_loss, hours_per_week) = [
     tf.feature_column.numeric_column(feature_key)
     for feature_key in NUMERIC_FEATURE_KEYS]

print('Numeric feature columns defined.')
如果您在完成公平意识任务 3 时选择了 age
,则会注意到我们建议对 age
进行分桶(也称为分箱)可能效果更好,即将年龄相近的人员分成一组。这可能有助于模型在各年龄段中更好地泛化。因此,我们将 age
从数值特征(从技术上讲是排序特征)转换为类别特征。
In [0]:
# Cut points at which the numeric `age` column is split into buckets.
AGE_BOUNDARIES = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=AGE_BOUNDARIES)
In [0]:
# The gender column is kept in its own list so that it can be treated as the
# subgroup variable; every other selected column goes into `variables`.
subgroup_variables = [gender]
variables = [
    native_country,
    education,
    occupation,
    workclass,
    relationship,
    age_buckets,
]

# Combined list: the general variables first, then the subgroup variables.
feature_columns = list(variables)
feature_columns.extend(subgroup_variables)
准备好特征后,我们就可以尝试使用深度学习预测收入了。
为了简单起见,我们会使神经网络架构很精简,直接定义一个包含两个隐藏层的前馈神经网络。
但首先,我们必须将高维度类别特征转换为低维度的密集实值向量(称之为嵌入向量)。幸好,indicator_column
(将其视为独热编码)和 embedding_column
(可将稀疏特征转换为密集特征)可帮助我们简化此流程。
以下单元格创建了进一步定义模型所需的深度列。
In [0]:
# Categorical columns wrapped with indicator_column().
indicator_sources = [workclass, education, age_buckets, gender, relationship]
# Hash-bucket columns wrapped with embedding_column() using 8 dimensions.
embedding_sources = [native_country, occupation]

deep_columns = (
    [tf.feature_column.indicator_column(source)
     for source in indicator_sources] +
    [tf.feature_column.embedding_column(source, dimension=8)
     for source in embedding_sources])

print(deep_columns)
print('Deep columns created.')
完成所有数据预处理工作后,我们现在可以定义深度神经网络模型了。首先使用下面定义的参数。(稍后,在您定义评估指标并评估模型后,您可以返回并调整这些参数以比较结果。)
In [0]:
#@title Define Deep Neural Net Model
HIDDEN_UNITS = [1024, 512] #@param
LEARNING_RATE = 0.1 #@param
L1_REGULARIZATION_STRENGTH = 0.0001 #@param
L2_REGULARIZATION_STRENGTH = 0.0001 #@param

# A newly created temporary directory is handed to the estimator as model_dir.
model_dir = tempfile.mkdtemp()

# Optimizer configured from the form values above.
proximal_adagrad = tf.train.ProximalAdagradOptimizer(
    learning_rate=LEARNING_RATE,
    l1_regularization_strength=L1_REGULARIZATION_STRENGTH,
    l2_regularization_strength=L2_REGULARIZATION_STRENGTH)

single_task_deep_model = tf.estimator.DNNClassifier(
    feature_columns=deep_columns,
    hidden_units=HIDDEN_UNITS,
    optimizer=proximal_adagrad,
    model_dir=model_dir)

print('Deep neural net model defined.')
为了简单起见,我们将训练 1000 步,但您可以随意尝试调整此参数。
In [0]:
#@title Fit Deep Neural Net Model to the Adult Training Dataset
STEPS = 1000 #@param

# Shuffled training input built with num_epochs=None; the number of training
# steps is given by STEPS.
training_input_fn = csv_to_pandas_input_fn(
    train_df, num_epochs=None, shuffle=True)
single_task_deep_model.train(input_fn=training_input_fn, steps=STEPS);

print('Deep neural net model is done fitting.')
我们现在可以使用预留测试集评估模型的整体表现。
In [0]:
#@title Evaluate Deep Neural Net Performance
# Single unshuffled pass over the held-out test set.
evaluation_input_fn = csv_to_pandas_input_fn(
    test_df, num_epochs=1, shuffle=False)
results = single_task_deep_model.evaluate(
    input_fn=evaluation_input_fn, steps=None)

print("model directory = %s" % model_dir)
print("---- Results ----")
# Report every returned metric, ordered by metric name.
for metric_name in sorted(results.keys()):
    print("%s: %s" % (metric_name, results[metric_name]))
您可以尝试使用不同的参数重新训练模型。最后,您会发现深度神经网络在预测收入方面表现不错。
但是缺少与子群组相关的评估指标。我们将在下一部分介绍您可以在子群组级别进行评估的一些方法。
虽然评估模型的总体效果可让我们了解该模型的质量,但无法让我们充分了解模型针对不同子群组的表现如何。
在评估模型的公平性时,请务必确定预测错误在所有子群组中是否保持统一,或者某些子群组是否比其他子群组更容易出现特定预测错误。
混淆矩阵是一种用于比较不同种类模型错误的发生率的重要工具。记得在机器学习速成课程的分类单元中,我们提到:混淆矩阵是一个网格,它可以绘制模型的预测值和真实值,并将统计信息制成表格,总结了模型做出正确预测和错误预测的频率。
我们首先为收入预测模型创建二元混淆矩阵,之所以为二元,是因为我们的标签 (income_bracket
) 只包含两个可能的值(<50K
或 >50K
)。我们将收入 >50K
定义为正标签,并将收入 <50K
定义为负标签。
注意:在这种情况下,正和负不应被解释为价值评判(我们并非认为年收入高于 5 万的人比年收入低于 5 万的人更优秀)。它们只是用于区分模型可做出的两种可能预测的标准术语。
模型做出正确预测(预测结果与真实值相符)的情况被归类为 true,模型做出错误预测的情况被归类为 false。
因此,我们的混淆矩阵会列出四个可能的状态:
真正例:模型预测 >50K
,并且此结果是真实值。真负例:模型预测 <50K
,并且此结果是真实值。假正例:模型预测 >50K
,并且此结果与真实情况冲突。假负例:模型预测 <50K
,并且此结果与真实情况冲突。注意:如果需要,我们可以使用每个状态的结果计算辅助评估指标,例如精确率和召回率。
In [0]:
#@test {"output": "ignore"}
#@title Define Function to Compute Binary Confusion Matrix Evaluation Metrics
def compute_eval_metrics(references, predictions):
    """Computes binary-classification metrics from labels and predictions.

    Args:
      references: ground-truth labels, with 0 as the negative class and 1 as
        the positive class.
      predictions: predicted labels (0 or 1), aligned with `references`.

    Returns:
      A tuple (precision, recall, false_positive_rate, false_omission_rate).
      A metric whose denominator is zero is undefined and is returned as NaN.
    """
    # Pinning labels=[0, 1] keeps the matrix 2x2 even when only one class
    # occurs in the inputs; otherwise the four-way unpacking below would fail
    # for a subgroup whose labels and predictions are all the same class.
    tn, fp, fn, tp = confusion_matrix(
        references, predictions, labels=[0, 1]).ravel()

    def _rate(numerator, denominator):
        # Return NaN explicitly for an empty denominator rather than relying
        # on a divide-by-zero.
        if denominator == 0:
            return float('nan')
        return numerator / float(denominator)

    precision = _rate(tp, tp + fp)
    recall = _rate(tp, tp + fn)
    false_positive_rate = _rate(fp, fp + tn)
    false_omission_rate = _rate(fn, tn + fn)
    return precision, recall, false_positive_rate, false_omission_rate


print('Binary confusion matrix and evaluation metrics defined.')
我们还需要绘制二元混淆矩阵方面的帮助。以下函数结合使用各种第三方模块(Pandas DataFrame、Matplotlib、Seaborn)来绘制混淆矩阵。
In [0]:
#@title Define Function to Visualize Binary Confusion Matrix
def plot_confusion_matrix(confusion_matrix, class_names, figsize = (8,6)):
    """Draws a 2x2 confusion matrix as an annotated Seaborn heat map.

    Args:
      confusion_matrix: array of counts laid out as
        [[true positives, false negatives], [false positives, true negatives]].
      class_names: labels used for both the rows and the columns.
      figsize: size passed to plt.figure().

    Returns:
      The Matplotlib figure that the heat map was drawn on.
    """
    # Seaborn takes its tick labels from the DataFrame's index and columns.
    heatmap_frame = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)

    # Each cell shows its count with the name of the outcome underneath.
    cell_names = ['True Positives', 'False Negatives',
                  'False Positives', 'True Negatives']
    annotations = []
    for cell_name, count in zip(cell_names, confusion_matrix.flatten()):
        annotations.append("{0:d}\n{1}".format(count, cell_name))
    labels = np.asarray(annotations).reshape(2, 2)

    heatmap = sns.heatmap(heatmap_frame, annot=labels, fmt="")
    # Row labels stay horizontal; column labels are tilted by 45 degrees.
    for axis, rotation in ((heatmap.yaxis, 0), (heatmap.xaxis, 45)):
        axis.set_ticklabels(
            axis.get_ticklabels(), rotation=rotation, ha='right')
    plt.ylabel('References')
    plt.xlabel('Predictions')
    return fig


print('Binary confusion matrix visualization defined.')
现在我们已经定义了所有必要的函数,接下来可以使用我们的深度神经网络模型的结果计算二元混淆矩阵和评估指标。此单元格的输出是一个带标签视图,我们可以通过这一视图在混淆矩阵和评估指标表格之间切换。
In [0]:
#@title Visualize Binary Confusion Matrix and Compute Evaluation Metrics Per Subgroup
CATEGORY = "gender" #@param {type:"string"}
SUBGROUP = "Male" #@param {type:"string"}

# Select the test rows of the requested subgroup once; the same slice feeds
# both the predictions and the ground-truth labels.
subgroup_df = test_df.loc[test_df[CATEGORY] == SUBGROUP]
if subgroup_df.empty:
    # CATEGORY and SUBGROUP are free-text form fields, so a value that matches
    # no rows is reported directly instead of failing later on.
    raise ValueError(
        'No test examples found where %s == %r; check CATEGORY and SUBGROUP.'
        % (CATEGORY, SUBGROUP))

# Generate predictions for the subgroup and collect the predicted class IDs.
predictions_dict = single_task_deep_model.predict(
    input_fn=csv_to_pandas_input_fn(subgroup_df, num_epochs=1, shuffle=False))
predictions = [
    prediction_item['class_ids'][0] for prediction_item in predictions_dict]

# Ground truth: 1 when the bracket text contains '>50K', otherwise 0.
actuals = list(
    subgroup_df['income_bracket'].apply(lambda x: '>50K' in x).astype(int))
classes = ['Over $50K', 'Less than $50K']

# sklearn lays the matrix out as [[TN, FP], [FN, TP]]; flipping both axes
# gives [[TP, FN], [FP, TN]], the layout expected by plot_confusion_matrix().
# labels=[0, 1] keeps the matrix 2x2 even if the subgroup contains one class.
rotated_confusion_matrix = np.fliplr(
    confusion_matrix(actuals, predictions, labels=[0, 1]))
rotated_confusion_matrix = np.flipud(rotated_confusion_matrix)

tb = widgets.TabBar(['Confusion Matrix', 'Evaluation Metrics'], location='top')

with tb.output_to('Confusion Matrix'):
    plot_confusion_matrix(rotated_confusion_matrix, classes);

with tb.output_to('Evaluation Metrics'):
    grid = widgets.Grid(2,4)
    p, r, fpr, fomr = compute_eval_metrics(actuals, predictions)
    # Row 0 holds the metric names and row 1 the values, one metric per column.
    metric_cells = [
        ('Precision', p),
        ('Recall', r),
        ('False Positive Rate', fpr),
        ('False Omission Rate', fomr),
    ]
    for column, (metric_name, metric_value) in enumerate(metric_cells):
        with grid.output_to(0, column):
            print(' %s ' % metric_name)
        with grid.output_to(1, column):
            print(' %.4f ' % metric_value)
使用默认的模型参数,您可能会发现模型对于男性的表现要比女性好。具体而言,在运行模型后,我们发现男性的精确率和召回率(分别为 0.7490 和 0.4795)都比女性(分别为 0.6787 和 0.3716)高。
希望通过此混淆矩阵演示,您可以发现结果与整体效果指标略有不同,并强调了评估模型在各子群组中的表现(而不是总体表现)的重要性。
在您的工作中,确保您能在权衡假正例、假负例、真正例和真负例方面做出明智的决策。例如,您可能需要实现非常低的假正例率和较高的真正例率,或者可能需要实现较高的精确率,同时可以接受较低的召回率。
请根据这些权衡需求选择评估指标。