In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Learning objectives:
We'll revisit the model from the First Steps with TensorFlow exercise.
First, let's import the California housing data into a pandas DataFrame.
In [0]:
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))
california_housing_dataframe["median_house_value"] /= 1000.0
california_housing_dataframe
Next, we'll set up our input function and define the function for model training.
In [0]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Trains a linear regression model of one feature.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely

    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset, and configure batching/repeating.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Shuffle the data, if specified.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
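To see what my_input_fn actually yields, the hypothetical smoke test below (not part of the original exercise) pulls a single batch inside a TF 1.x session and prints its contents. It assumes the graph-mode APIs used throughout this notebook.
In [0]:
# Hypothetical smoke test: materialize one batch from the input function.
with tf.Graph().as_default():
    features, labels = my_input_fn(
        california_housing_dataframe[["total_rooms"]],
        california_housing_dataframe["median_house_value"],
        batch_size=4, shuffle=False, num_epochs=1)
    with tf.Session() as sess:
        # Running the iterator's output tensors yields one batch of 4 examples.
        batch_features, batch_labels = sess.run([features, labels])
        print(batch_features["total_rooms"], batch_labels)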
In [0]:
def train_model(learning_rate, steps, batch_size, input_feature):
    """Trains a linear regression model.

    Args:
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. A training step
        consists of a forward and backward pass using a single batch.
      batch_size: A non-zero `int`, the batch size.
      input_feature: A `string` specifying a column from `california_housing_dataframe`
        to use as input feature.

    Returns:
      A Pandas `DataFrame` containing targets and the corresponding predictions done
      after training the model.
    """
    periods = 10
    # Integer division keeps `steps` a valid integer under Python 3.
    steps_per_period = steps // periods

    my_feature = input_feature
    my_feature_data = california_housing_dataframe[[my_feature]].astype('float32')
    my_label = "median_house_value"
    targets = california_housing_dataframe[my_label].astype('float32')

    # Create input functions.
    training_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)

    # Create feature columns.
    feature_columns = [tf.feature_column.numeric_column(my_feature)]

    # Create a linear regressor object.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer
    )

    # Set up to plot the state of our model's line each period.
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title("Learned Line by Period")
    plt.ylabel(my_label)
    plt.xlabel(my_feature)
    sample = california_housing_dataframe.sample(n=300)
    plt.scatter(sample[my_feature], sample[my_label])
    colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    root_mean_squared_errors = []
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        linear_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period,
        )
        # Take a break and compute predictions.
        predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])
        # Compute loss.
        root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(predictions, targets))
        # Occasionally print the current loss.
        print("  period %02d : %0.2f" % (period, root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        root_mean_squared_errors.append(root_mean_squared_error)
        # Finally, track the weights and biases over time.
        # Apply some math to ensure that the data and line are plotted neatly.
        y_extents = np.array([0, sample[my_label].max()])
        weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature)[0]
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
        x_extents = (y_extents - bias) / weight
        x_extents = np.maximum(np.minimum(x_extents,
                                          sample[my_feature].max()),
                               sample[my_feature].min())
        y_extents = weight * x_extents + bias
        plt.plot(x_extents, y_extents, color=colors[period])
    print("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Create a table with calibration data.
    calibration_data = pd.DataFrame()
    calibration_data["predictions"] = pd.Series(predictions)
    # Use .values (positional order) so targets pair up with predictions even
    # though the dataframe index was permuted above; assigning the raw Series
    # would realign rows by index label and scramble the pairing.
    calibration_data["targets"] = pd.Series(targets.values)
    display.display(calibration_data.describe())

    print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)
    return calibration_data
The total_rooms and population features both count totals for a given city block.
But what if one block is more densely populated than another? We can explore how block density relates to median house value by creating a synthetic feature that is the ratio of total_rooms to population.
In the cell below, create a feature called rooms_per_person and use it as the input_feature to train_model().
What's the best performance you can get with this single feature by tweaking the learning rate? Better performance means the regression line fits the data more closely and the final RMSE is lower.
NOTE: It may help to add a few code cells below so you can try out several different learning rates and compare the results (a sketch of one such sweep follows this note). To add a new code cell, hover your cursor directly below the center of this cell and click CODE.
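As one hypothetical way to organize that experiment (not part of the original exercise), the sketch below loops over a few arbitrary candidate learning rates and recomputes the final training RMSE from the calibration data that train_model() returns. It assumes rooms_per_person has already been created as in the solution further down.
In [0]:
# Hypothetical sweep over arbitrary candidate learning rates.
results = {}
for lr in [0.01, 0.05, 0.1]:
    print("Learning rate: %g" % lr)
    calibration_data = train_model(
        learning_rate=lr,
        steps=500,
        batch_size=5,
        input_feature="rooms_per_person")
    # Final RMSE on the training data, recomputed from the returned table.
    results[lr] = math.sqrt(metrics.mean_squared_error(
        calibration_data["predictions"], calibration_data["targets"]))
print(results)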
In [0]:
#
# YOUR CODE HERE
#
california_housing_dataframe["rooms_per_person"] =
calibration_data = train_model(
    learning_rate=0.00005,
    steps=500,
    batch_size=5,
    input_feature="rooms_per_person"
)
In [0]:
california_housing_dataframe["rooms_per_person"] = (
california_housing_dataframe["total_rooms"] / california_housing_dataframe["population"])
calibration_data = train_model(
learning_rate=0.05,
steps=500,
batch_size=5,
input_feature="rooms_per_person")
In [0]:
# YOUR CODE HERE
In [0]:
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.scatter(calibration_data["predictions"], calibration_data["targets"])
Looking at the calibration data, most of the scatter points line up. The line is almost vertical; we'll come back to that later. For now, let's focus on the points that deviate from the line. Note that they are relatively few in number.
If we plot a histogram of rooms_per_person, we find that we have a few outliers in our input data.
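Before plotting, a quick numeric look at the upper tail can confirm the outliers. This is a hypothetical aside, not part of the original exercise:
In [0]:
# Summary statistics with extra upper percentiles to expose the long tail.
print(california_housing_dataframe["rooms_per_person"].describe(
    percentiles=[0.5, 0.9, 0.99]))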
In [0]:
plt.subplot(1, 2, 2)
_ = california_housing_dataframe["rooms_per_person"].hist()
In [0]:
# YOUR CODE HERE
The histogram we created in Task 2 shows that the majority of values are less than 5. Let's clip rooms_per_person to 5 and plot the histogram again to check the results.
In [0]:
california_housing_dataframe["rooms_per_person"] = (
california_housing_dataframe["rooms_per_person"]).apply(lambda x: min(x, 5))
_ = california_housing_dataframe["rooms_per_person"].hist()
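As a side note, pandas' built-in Series.clip expresses the same capping more directly than apply(); a minimal equivalent sketch:
In [0]:
# Equivalent to apply(lambda x: min(x, 5)): cap all values at 5.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["rooms_per_person"].clip(upper=5))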
To verify that clipping worked as expected, let's run the training again and print the calibration data once more.
In [0]:
calibration_data = train_model(
    learning_rate=0.05,
    steps=500,
    batch_size=5,
    input_feature="rooms_per_person")
In [0]:
_ = plt.scatter(calibration_data["predictions"], calibration_data["targets"])
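As a final hypothetical sanity check (not part of the original exercise), the correlation between predictions and targets summarizes in one number how much better the regression line now fits:
In [0]:
# Pearson correlation between predictions and targets; closer to 1 means a tighter fit.
print(calibration_data["predictions"].corr(calibration_data["targets"]))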