In [ ]:
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This notebook uses the Census Income Data Set to demonstrate how to train a model and generate local predictions.
The Census Income Data Set that this sample
uses for training is provided by the UC Irvine Machine Learning
Repository. Google hosts a copy of the data in the public GCS bucket gs://cloud-samples-data/ml-engine/sklearn/census_data/,
and it is also available from the UC Irvine dataset repository:
adult.data
adult.test
Note: Your typical development process with your own data would require you to upload your data to GCS so that you can access that data from inside your notebook. However, in this case, Google has put the data on GCS to avoid the steps of having you download the data from UC Irvine and then upload the data to GCS.
This dataset is provided by a third party. Google provides no representation, warranty, or other guarantees about the validity or any other aspects of this dataset.
First, you'll create the model (provided below). This is similar to your normal process for creating a scikit-learn model. However, there is one key difference: the notebook first downloads the data so that it is available when the model is trained.
The code in this file loads the data into a pandas DataFrame that can be used by scikit-learn. Then the model is fit against the training data. Lastly, sklearn's built-in version of joblib is used to save the model to a file that can be uploaded to AI Platform's prediction service.
In [ ]:
import pandas as pd
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
Add code to download the data (in this case, using the publicly hosted data). You will then be able to use the data when training your model.
In [ ]:
# Download the data
! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data --output adult.data
! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test --output adult.test
Read in the data
In [ ]:
# Format of the input data: every column of the census data files, including
# columns that end up unused.
COLUMNS = (
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income-level',
)

# Categorical columns: these must be turned into numerical values before
# scikit-learn can use them.
CATEGORICAL_COLUMNS = (
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
)
Load the training census dataset
In [ ]:
# Read the training file into a DataFrame. No header row is read from the
# file (header=None); the column names come from COLUMNS instead.
with open('./adult.data', 'r') as train_data:
    raw_training_data = pd.read_csv(train_data, names=COLUMNS, header=None)

# Features: everything except the column being predicted ('income-level'),
# converted from a DataFrame to a list of lists.
train_features = raw_training_data.drop(['income-level'], axis='columns').values.tolist()

# Labels: a flat list of booleans, True where 'income-level' equals ' >50K'
# (the leading space is part of the compared value).
train_labels = raw_training_data['income-level'].eq(' >50K').values.tolist()
Load the test census dataset
In [ ]:
# Read the test file into a DataFrame. The first line of the file is skipped
# (skiprows=1) and the column names come from COLUMNS.
with open('./adult.test', 'r') as test_data:
    raw_testing_data = pd.read_csv(test_data, skiprows=1, names=COLUMNS)

# Features: everything except the column being predicted ('income-level'),
# converted from a DataFrame to a list of lists.
test_features = raw_testing_data.drop(['income-level'], axis='columns').values.tolist()

# Test labels: a flat list of booleans, True where 'income-level' equals
# ' >50K.' (note the leading space and the trailing period in this file).
test_labels = raw_testing_data['income-level'].eq(' >50K.').values.tolist()
This is where your model code would go. Below is an example model using the census dataset. Since the census data set has categorical features, you need to convert them to numerical values. You'll use a list of pipelines to convert each categorical column and then use FeatureUnion to combine them before calling the RandomForestClassifier.
Each categorical column needs to be extracted individually and converted to a numerical value.
To do this, each categorical column will use a pipeline that extracts one feature column via
SelectKBest(k=1) and a LabelBinarizer()
to convert the categorical value to a numerical one.
A scores array (created below) will select and extract the feature column. The scores array is
created by iterating over the COLUMNS and checking if it is a CATEGORICAL_COLUMN.
In [ ]:
# The feature columns are every entry of COLUMNS except the last one
# ('income-level', the value being predicted). Positions in this tuple line
# up with positions inside each row of train_features.
feature_columns = COLUMNS[:-1]

# One (name, transformer) entry per categorical column, plus a final entry
# for all numerical columns; FeatureUnion concatenates their outputs.
categorical_pipelines = []
for i, col in enumerate(feature_columns):
    if col not in CATEGORICAL_COLUMNS:
        continue

    # Create a scores array to get the individual categorical column.
    # Example:
    #   data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
    #           'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
    #   scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    #
    # Returns: [['State-gov']]
    scores = [0] * len(feature_columns)
    # This column is the categorical column you want to extract.
    scores[i] = 1

    # scores_ is assigned by hand instead of calling fit(), so that
    # SelectKBest(k=1) simply picks out column i.
    skb = SelectKBest(k=1)
    skb.scores_ = scores

    # Convert the categorical column to a numerical value. The binarizer is
    # fit on the single column extracted from the training features.
    lbn = LabelBinarizer()
    lbn.fit(skb.transform(train_features))

    # Create the pipeline to extract the categorical feature
    categorical_pipelines.append(
        ('categorical-{}'.format(i), Pipeline([
            ('SKB-{}'.format(i), skb),
            ('LBN-{}'.format(i), lbn)])))

# Create pipeline to extract the numerical features.
# The mask is derived from COLUMNS / CATEGORICAL_COLUMNS rather than written
# out by hand, so it cannot drift out of sync with them. For the current
# columns it evaluates to [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0], with
# six numerical columns.
numerical_scores = [0 if name in CATEGORICAL_COLUMNS else 1
                    for name in feature_columns]
skb = SelectKBest(k=sum(numerical_scores))
skb.scores_ = numerical_scores
categorical_pipelines.append(('numerical', skb))

# Combine all the features using FeatureUnion
preprocess = FeatureUnion(categorical_pipelines)

# Create the classifier
classifier = RandomForestClassifier()

# Transform the features and fit them to the classifier. The preprocessing
# steps were prepared by hand above, so the classifier is fit directly on the
# transformed features.
classifier.fit(preprocess.transform(train_features), train_labels)

# Create the overall model as a single pipeline, so that one predict() call
# runs preprocessing followed by classification.
pipeline = Pipeline([
    ('union', preprocess),
    ('classifier', classifier)
])
Export the model to a file
In [ ]:
# Save the pipeline to a file named 'model.joblib' using joblib.
model = 'model.joblib'
joblib.dump(value=pipeline, filename=model)
In [ ]:
!ls -al model.joblib
In [ ]:
# Two rows of the test set are printed and written to JSON files as example
# inputs. Each entry is: heading, format string for the detail line, row
# index into test_features / test_labels, and output file name.
examples = (
    ('Show a person that makes <=50K:',
     '\tFeatures: {0} --> Label: {1}\n',
     0,
     'less_than_50K.json'),
    ('Show a person that makes >50K:',
     '\tFeatures: {0} --> Label: {1}',
     3,
     'more_than_50K.json'),
)

for heading, detail_format, row, filename in examples:
    print(heading)
    print(detail_format.format(test_features[row], test_labels[row]))

    # Write only the feature list (not the label) to the JSON file.
    with open(filename, 'w') as outfile:
        json.dump(test_features[row], outfile)
In [ ]:
# Run the pipeline on the test features in this notebook.
local_results = pipeline.predict(test_features)

# Wrap the predictions in a pandas Series named 'local'.
local = pd.Series(data=local_results, name='local')
In [ ]:
# Display the first 10 local predictions.
local.head(10)
In [ ]:
# Print the first 10 predictions, each next to the test label at the same
# position.
for response, label in zip(local[:10], test_labels):
    print('Prediction: {}\tLabel: {}'.format(response, label))
In [ ]:
# Cross-tabulate the test labels (rows, 'actual') against the local
# predictions (columns, 'local').
actual = pd.Series(data=test_labels, name='actual')
local_predictions = pd.Series(data=local_results, name='local')
pd.crosstab(index=actual, columns=local_predictions)