Train a model using AutoML

  • This notebook uses Google Cloud AutoML to train a multi-label text classification model that predicts GitHub issue labels

Environment setup


In [1]:
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup

notebook_setup.setup()


Adding /home/jovyan/git_kubeflow-code-intelligence/py to python path

In [2]:
import subprocess 
# TODO(jlewi): Get the project using fairing?
# PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
PROJECT = "issue-label-bot-dev"

In [3]:
!pip install --user google-cloud-automl


Requirement already satisfied: google-cloud-automl in /home/jovyan/.local/lib/python3.6/site-packages (0.10.0)
Requirement already satisfied: google-api-core[grpc]<2.0.0dev,>=1.14.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-cloud-automl) (1.16.0)
Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (1.51.0)
Requirement already satisfied: six>=1.10.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (1.14.0)
Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (2.22.0)
Requirement already satisfied: protobuf>=3.4.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (3.11.3)
Requirement already satisfied: setuptools>=34.0.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (46.1.3)
Requirement already satisfied: pytz in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (2019.1)
Requirement already satisfied: google-auth<2.0dev,>=0.4.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (1.13.1)
Requirement already satisfied: grpcio<2.0dev,>=1.8.2; extra == "grpc" in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (1.26.0)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/jovyan/.local/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/jovyan/.local/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /home/jovyan/.local/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (2019.3.9)
Requirement already satisfied: idna<2.9,>=2.5 in /home/jovyan/.local/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (2.8)
Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/jovyan/.local/lib/python3.6/site-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (3.1.1)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/jovyan/.local/lib/python3.6/site-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (0.2.5)
Requirement already satisfied: rsa<4.1,>=3.1.4 in /home/jovyan/.local/lib/python3.6/site-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (4.0)
Requirement already satisfied: pyasn1<0.5.0,>=0.4.1 in /home/jovyan/.local/lib/python3.6/site-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-automl) (0.4.5)
WARNING: You are using pip version 19.3.1; however, version 20.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

Create the AutoML dataset


In [4]:
# TODO(jlewi): How do we check if the dataset already exists and whether it already has data?
from google.cloud import automl
import logging

display_name = "kubeflow_issues_with_repo"

client = automl.AutoMlClient()

# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Specify the classification type
# Types:
# MultiLabel: Multiple labels are allowed for one example.
# MultiClass: At most one label is allowed per example.
metadata = automl.types.TextClassificationDatasetMetadata(
    classification_type=automl.enums.ClassificationType.MULTILABEL
)
dataset = automl.types.Dataset(
    display_name=display_name,
    text_classification_dataset_metadata=metadata,
)

# Create a dataset with the dataset metadata in the region.
response = client.create_dataset(project_location, dataset)

created_dataset = response.result()

# Display the dataset information
logging.info("Dataset name: {}".format(created_dataset.name))
dataset_id = created_dataset.name.split("/")[-1]
logging.info(f"Dataset id: {dataset_id}")


Dataset name: projects/976279526634/locations/us-central1/datasets/TCN4282013949513170944
Dataset id: TCN4282013949513170944
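
  • The TODO in the cell above asks how to check whether the dataset already exists; one option, sketched below but not run as part of this notebook, is to list datasets in the location and match on display_name before creating a new one

In [ ]:
# Sketch only: reuse an existing dataset with the same display name instead of creating a duplicate
existing_dataset = None
for ds in client.list_datasets(project_location):
    if ds.display_name == display_name:
        existing_dataset = ds
        break

if existing_dataset is None:
    logging.info(f"No dataset named {display_name}; create it as in the cell above")
else:
    logging.info(f"Found existing dataset: {existing_dataset.name}")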

Prepare the dataset

  • Docs for preparing the dataset
  • We need to create a CSV file that lists all the data files
  • We need to upload each document as a text file to GCS

In [5]:
from code_intelligence import github_bigquery
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=60)


  Elapsed 7.0 s. Waiting...
  Elapsed 8.09 s. Waiting...
  Elapsed 9.19 s. Waiting...
  Elapsed 10.27 s. Waiting...
  Elapsed 11.28 s. Waiting...
  Elapsed 12.35 s. Waiting...
  Elapsed 13.45 s. Waiting...
  Elapsed 14.54 s. Waiting...
Downloading: 100%|██████████| 11259/11259 [00:03<00:00, 3600.93rows/s]
Total time taken 18.49 s.
Finished at 2020-06-28 17:15:40.
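
  • github_bigquery.get_issues is a helper from this repo's code_intelligence package and its internals aren't shown here; purely for orientation, the sketch below shows the kind of query such a helper might run against the public githubarchive dataset (the table, fields, and filtering are assumptions, not the helper's actual implementation)

In [ ]:
# Hypothetical sketch only; see code_intelligence.github_bigquery.get_issues for the real query
from google.cloud import bigquery

bq_client = bigquery.Client(project=PROJECT)

sketch_query = """
SELECT
  JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') AS html_url,
  JSON_EXTRACT_SCALAR(payload, '$.issue.title') AS title,
  JSON_EXTRACT_SCALAR(payload, '$.issue.body') AS body
FROM `githubarchive.day.2020*`
WHERE type = 'IssuesEvent'
  AND org.login = 'kubeflow'
LIMIT 10
"""
# sketch_issues = bq_client.query(sketch_query).to_dataframe()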

Write the files to GCS


In [6]:
# The bucket needs to be in the same region (us-central1) and of the bucket type that AutoML requires
data_dir = f"gs://issue-label-bot-dev_automl/automl_{dataset_id}"
issues_dir = os.path.join(data_dir, "issues")

In [10]:
from code_intelligence import gcs_util
from code_intelligence import github_util
from code_intelligence import util
from google.cloud import storage

In [8]:
import pandas as pd
info = pd.DataFrame(columns=["url", "set", "labels"], index=range(recent_issues.shape[0]))

# Leave the set column empty because we will let AutoML assign examples to the train, validation, and test sets
info["set"] = ""

In [ ]:
storage_client = storage.Client()

bucket_name, _ = gcs_util.split_gcs_uri(data_dir)
bucket = storage_client.get_bucket(bucket_name)

for i in range(recent_issues.shape[0]):
    owner, repo, number = util.parse_issue_url(recent_issues.iloc[i]["html_url"])
    owner_repo = f"{owner}_{repo}"
    name = f"{owner}_{repo}_{number}.txt"
    target = os.path.join(issues_dir, name)

    issue = recent_issues.iloc[i]
    
    if gcs_util.check_gcs_object(target, storage_client=storage_client):
        logging.info(f"{target} already exists")
        
    else:
        _, obj_path = gcs_util.split_gcs_uri(target)
        blob = bucket.blob(obj_path)
        
        # Include the owner and repo in the text body because they are predictive
        doc = github_util.build_issue_doc(owner, repo, issue["title"], [issue["body"]])
        blob.upload_from_string(doc)
        logging.info(f"Created {target}")

    # Use .at rather than chained iloc indexing so the assignment writes back into info
    info.at[i, "url"] = target


Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_arena_316.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_arena_317.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_131.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_132.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_133.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_135.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_136.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_137.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_139.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_140.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_141.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_code-intelligence_142.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_86.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_89.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_92.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_94.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_96.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_97.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_98.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_common_99.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community-infra_2.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community-infra_4.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community-infra_6.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community_336.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community_338.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community_340.txt
Created gs://issue-label-bot-dev_automl/automl_TCN4282013949513170944/issues/kubeflow_community_341.txt
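
  • build_issue_doc comes from this repo's github_util module; the hypothetical stand-in below is only meant to illustrate the kind of document being uploaded, and the real helper may format things differently

In [ ]:
# Hypothetical stand-in for github_util.build_issue_doc, for illustration only
def build_issue_doc_sketch(org, repo, title, text_fragments):
    pieces = [f"{org} {repo}", title]
    pieces.extend(text_fragments)
    return "\n".join(pieces)
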
  • Create the CSV file with the data
  • We don't use pandas to_csv with its default quoting because that ends up quoting the string containing the labels e.g.

    ,gs://issue-label-bot-dev/automl_2020_0429/issues/kubeflow_website_997.txt,"area/docs, kind/feature, lifecycle/stale, priority/p2"
  • But that isn't the format AutoML expects; see the sketch just below this list for the expected unquoted form
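
  • Below is a minimal sketch of a row in the unquoted form AutoML expects; the GCS path and labels are made up, and csv.QUOTE_NONE is just one way to make sure nothing gets quoted (the real file is written further down in this notebook)

In [ ]:
import csv
import io

# Sketch: QUOTE_NONE keeps the label columns unquoted, e.g.
# ,gs://.../kubeflow_website_997.txt,area-docs,kind-feature
sketch_buffer = io.StringIO()
writer = csv.writer(sketch_buffer, quoting=csv.QUOTE_NONE)
writer.writerow(["", "gs://issue-label-bot-dev_automl/example/kubeflow_website_997.txt", "area-docs", "kind-feature"])
print(sketch_buffer.getvalue())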

Compute Target Labels

Compute a histogram of label frequency

  • AutoML requires a minimum number of examples per label (8 for training, 1 for validation, 1 for test), so filter out labels that don't appear very often

In [ ]:
from collections import Counter
label_counts = Counter()

for r in range(recent_issues.shape[0]):
    label_counts.update(recent_issues.iloc[r]["parsed_labels"])

In [ ]:
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])

In [ ]:
label_counts_df.sort_values("count", ascending=False, inplace=True)

In [ ]:
cutoff = 50
target_labels = label_counts_df.loc[label_counts_df["count"] > cutoff]

Distinguish unlabeled vs. negative examples

  • We need to distinguish between unlabeled examples and negative examples
  • For example, if an issue doesn't have the label "platform/gcp", that could be for one of two reasons

    1. The issue was never labeled
    2. The label platform/gcp doesn't apply
  • A quick hack to distinguish the two is to only include area and platform labels

    • For now, at least, if one of these labels exists on an issue it was probably applied by a human
    • This is in contrast to kind labels, which could be applied by the bot or by a GitHub issue template
  • Longer term, we could look at GitHub events to infer whether an issue was labeled by a human


In [ ]:
target_labels = target_labels[target_labels["label"].apply(lambda x: x.startswith("area") or x.startswith("platform"))]
  • Filter each issue's labels down to the target labels

In [ ]:
def label_filter(labels):
    """Keep only the labels that appear in the target label list."""
    filtered = []
    for l in labels:
        # Compare against the label column explicitly rather than the whole DataFrame values
        if l in target_labels["label"].values:
            filtered.append(l)
    return filtered

info["labels"] = recent_issues["parsed_labels"].apply(label_filter)

In [ ]:
# Compute string for automl

# AutoML doesn't allow "/"; only letters, dashes, and underscores are allowed in labels
# We need a comma separated string and we need to replace "/" with "-"
info["automl_labels"] = info["labels"].apply(lambda l: ", ".join(l).replace("/", "-"))

In [ ]:
import datetime
import io
import csv
buffer = io.StringIO()

# AutoML seems to require at least 1 label for every issue
#labeled_rows = info.loc[info["labels"] != ""]
#labeled_rows.to_csv(buffer, columns=["set", "url", "labels"], header=False, index=False)

info.to_csv(buffer, columns=["set", "url", "automl_labels"], header=False, index=False, doublequote=False)

# for i in range(labeled_rows.shape[0]):
#     row = labeled_rows.iloc[i]    
#     buffer.write(f"{row['set']}, {row['url']}, {row['labels']}\n")
    
now = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
dataset_path = os.path.join(data_dir, f"dataset_{now}.csv")
_, obj_path = gcs_util.split_gcs_uri(dataset_path)
blob = bucket.blob(obj_path)

blob.upload_from_string(buffer.getvalue())

logging.info(f"Created {dataset_path}")
  • Import the data to AutoML

In [ ]:
from google.cloud import automl

dataset_full_id = client.dataset_path(
    PROJECT, "us-central1", dataset_id
)

# Get the multiple Google Cloud Storage URIs
input_uris = [dataset_path]
gcs_source = automl.types.GcsSource(input_uris=input_uris)
input_config = automl.types.InputConfig(gcs_source=gcs_source)
# Import data from the input URI
response = client.import_data(dataset_full_id, input_config)

logging.info(f"Processing import: operation: {response.operation.name}")

# This appears to be a blocking call
logging.info("Data imported. {}".format(response.result()))

Train a model


In [ ]:
# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Leave model unset to use the default base model provided by Google
metadata = automl.types.TextClassificationModelMetadata()
model = automl.types.Model(
    display_name=display_name,
    dataset_id=dataset_id,
    text_classification_model_metadata=metadata,
)

# Create a model with the model metadata in the region.
response = client.create_model(project_location, model)

print(u"Training operation name: {}".format(response.operation.name))
print("Training started...")

In [ ]:
# This is blocking
result = response.result()

In [ ]:
result.name

Deploy a model

  • We need to deploy the model before we can send predictions.

In [ ]:
# r = client.list_models(project_location)

# for m in r:
#     logging.info(m.display_name)

In [ ]:
# Should be a value like "projects/976279526634/locations/us-central1/models/TCN654213816573231104"
model_name = result.name

In [ ]:
model_name

In [ ]:
deploy_response = client.deploy_model(model_name)

In [ ]:
final_response = deploy_response.result()

Send some predictions


In [ ]:
prediction_client = automl.PredictionServiceClient()

In [ ]:
text_snippet = automl.types.TextSnippet(
    content="tfjob isn't working. I can't run my training jobs", mime_type="text/plain"
)
payload = automl.types.ExamplePayload(text_snippet=text_snippet)

response = prediction_client.predict(model_name, payload)

for annotation_payload in response.payload:
    print(
        u"Predicted class name: {}".format(annotation_payload.display_name)
    )
    print(
        u"Predicted class score: {}".format(
            annotation_payload.classification.score
        )
    )
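
  • The model returns a score for every target label; to turn a prediction into issue labels you typically keep the high-scoring ones and map the dashes back to slashes; a small sketch follows (the 0.5 threshold is an arbitrary choice, not something AutoML prescribes)

In [ ]:
# Sketch: convert the prediction payload back into repo-style labels
threshold = 0.5
predicted_labels = [
    # assuming each original label had a single "/", only the first dash needs mapping back
    p.display_name.replace("-", "/", 1)
    for p in response.payload
    if p.classification.score >= threshold
]
logging.info(f"Predicted labels: {predicted_labels}")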

In [ ]:
response.payload.__class__

In [ ]:
automl.types

In [ ]:
from google.cloud.automl import types as automl_types

In [ ]:
# Build an AnnotationPayload by hand, e.g. to construct fake predictions for testing
annotation = automl_types.AnnotationPayload()
annotation.display_name = "area-jupyter"
annotation.classification.score = .9

In [ ]:
# Assemble a PredictResponse from the hand-built annotation
predict_response = automl_types.PredictResponse()
predict_response.payload.append(annotation)

In [ ]:
predict_response.payload

In [ ]:
annotation_payload.__class__