In [1]:
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup
notebook_setup.setup()
In [2]:
import subprocess
# TODO(jlewi): Get the project using fairing?
# PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
PROJECT = "issue-label-bot-dev"
In [3]:
!pip install --user google-cloud-automl
In [4]:
# TODO(jlewi): How do we check if the dataset already exists and whether it already has data
from google.cloud import automl
import logging
display_name = "kubeflow_issues_with_repo"
client = automl.AutoMlClient()
# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Specify the classification type
# Types:
# MultiLabel: Multiple labels are allowed for one example.
# MultiClass: At most one label is allowed per example.
metadata = automl.types.TextClassificationDatasetMetadata(
    classification_type=automl.enums.ClassificationType.MULTILABEL
)
dataset = automl.types.Dataset(
    display_name=display_name,
    text_classification_dataset_metadata=metadata,
)
# Create a dataset with the dataset metadata in the region.
response = client.create_dataset(project_location, dataset)
created_dataset = response.result()
# Display the dataset information
logging.info("Dataset name: {}".format(created_dataset.name))
dataset_id = created_dataset.name.split("/")[-1]
logging.info(f"Dataset id: {dataset_id}")
In [5]:
from code_intelligence import github_bigquery
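# Pull recently updated Kubeflow issues (and their labels) from BigQuery's GitHub data;
# max_age_days limits how far back we look.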
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=60)
In [6]:
# Need to use a bucket in the same region and type as automl
data_dir = f"gs://issue-label-bot-dev_automl/automl_{dataset_id}"
issues_dir = os.path.join(data_dir, "issues")
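If the bucket doesn't exist yet, something along these lines creates it in the same region used for the AutoML dataset above. This is a sketch; it assumes the google-cloud-storage client and a version whose create_bucket accepts a location argument.
In [ ]:
from google.cloud import storage

# Sketch: create the bucket in us-central1 so AutoML (which runs in us-central1 above)
# can read the training data from it.
bucket_name = "issue-label-bot-dev_automl"
gcs_client = storage.Client(project=PROJECT)
if gcs_client.lookup_bucket(bucket_name) is None:
    gcs_client.create_bucket(bucket_name, location="us-central1")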
In [10]:
from code_intelligence import gcs_util
from code_intelligence import github_util
from code_intelligence import util
from google.cloud import storage
In [8]:
import pandas as pd
info = pd.DataFrame(columns=["url", "set", "labels"], index=range(recent_issues.shape[0]))
# Leave the set column as an empty string because we will let AutoML assign examples to the train, eval and test sets
info["set"] = ""
In [ ]:
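# Upload each issue as its own text file to GCS, skipping files that already exist,
# and record the GCS path in the info DataFrame.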
storage_client = storage.Client()
bucket_name, _ = gcs_util.split_gcs_uri(data_dir)
bucket = storage_client.get_bucket(bucket_name)
for i in range(recent_issues.shape[0]):
    owner, repo, number = util.parse_issue_url(recent_issues.iloc[i]["html_url"])
    owner_repo = f"{owner}_{repo}"
    name = f"{owner}_{repo}_{number}.txt"
    target = os.path.join(issues_dir, name)
    issue = recent_issues.iloc[i]
    if gcs_util.check_gcs_object(target, storage_client=storage_client):
        logging.info(f"{target} already exists")
    else:
        _, obj_path = gcs_util.split_gcs_uri(target)
        blob = bucket.blob(obj_path)
        # Include the owner and repo in the text body because it is predictive
        doc = github_util.build_issue_doc(owner, repo, issue["title"], [issue["body"]])
        blob.upload_from_string(doc)
        logging.info(f"Created {target}")
    # Use .at rather than chained indexing so the assignment actually updates info
    info.at[i, "url"] = target
We don't use pandas to_csv's default settings because they end up quoting the string containing the labels, e.g.
,gs://issue-label-bot-dev/automl_2020_0429/issues/kubeflow_website_997.txt,"area/docs, kind/feature, lifecycle/stale, priority/p2"
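If quoting needs to be avoided entirely, one alternative is to write the rows by hand, in the spirit of the commented-out loop in the CSV-writing cell below. This is a sketch; it assumes info["automl_labels"] has already been filled in with the comma-separated label strings computed later.
In [ ]:
import io

# Sketch: write each CSV row manually so the label list is never wrapped in quotes.
buffer = io.StringIO()
for i in range(info.shape[0]):
    row = info.iloc[i]
    buffer.write(f"{row['set']},{row['url']},{row['automl_labels']}\n")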
In [ ]:
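# Count how often each label appears across the recent issues.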
from collections import Counter
label_counts = Counter()
for r in range(recent_issues.shape[0]):
label_counts.update(recent_issues.iloc[r]["parsed_labels"])
In [ ]:
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])
In [ ]:
label_counts_df.sort_values("count", ascending=False, inplace=True)
In [ ]:
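# Only keep labels with more than `cutoff` occurrences; rarer labels have too few
# examples to train on.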
cutoff = 50
target_labels = label_counts_df.loc[label_counts_df["count"] > cutoff]
For example, if an issue doesn't have the label "platform/gcp" that could be for one of two reasons: the label genuinely doesn't apply, or nobody has labeled the issue yet.
A quick hack to distinguish the two is to only include area and platform labels.
Longer term we could look at GitHub events to infer whether data was labeled by a human.
In [ ]:
target_labels = target_labels[target_labels["label"].apply(lambda x: x.startswith("area") or x.startswith("platform"))]
In [ ]:
def label_filter(labels):
    # Keep only the labels we decided to train on (the target_labels computed above).
    filtered = []
    for l in labels:
        if l in target_labels["label"].values:
            filtered.append(l)
    return filtered

info["labels"] = recent_issues["parsed_labels"].apply(label_filter)
In [ ]:
# Compute the label string for AutoML.
# AutoML doesn't allow "/" in labels; only letters, dashes, and underscores are allowed.
# So we need a comma-separated string with "/" replaced by "-".
info["automl_labels"] = info["labels"].apply(lambda l: ", ".join(l).replace("/", "-"))
In [ ]:
import datetime
import io
import csv
buffer = io.StringIO()
# AutoML seems to require at least 1 label for every issue
#labeled_rows = info.loc[info["labels"] != ""]
#labeled_rows.to_csv(buffer, columns=["set", "url", "labels"], header=False, index=False)
info.to_csv(buffer, columns=["set", "url", "automl_labels"], header=False, index=False, doublequote=False)
# for i in range(labeled_rows.shape[0]):
# row = labeled_rows.iloc[i]
# buffer.write(f"{row['set']}, {row['url']}, {row['labels']}\n")
now = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
dataset_path = os.path.join(data_dir, f"dataset_{now}.csv")
_, obj_path = gcs_util.split_gcs_uri(dataset_path)
blob = bucket.blob(obj_path)
blob.upload_from_string(buffer.getvalue())
logging.info(f"Created {dataset_path}")
In [ ]:
from google.cloud import automl
dataset_full_id = client.dataset_path(PROJECT, "us-central1", dataset_id)
# Get the multiple Google Cloud Storage URIs
input_uris = [dataset_path]
gcs_source = automl.types.GcsSource(input_uris=input_uris)
input_config = automl.types.InputConfig(gcs_source=gcs_source)
# Import data from the input URI
response = client.import_data(dataset_full_id, input_config)
logging.info(f"Processing import: operation: {response.operation.name}")
# This appears to be a blocking call
logging.info("Data imported. {}".format(response.result()))
In [ ]:
# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Leave model unset to use the default base model provided by Google
metadata = automl.types.TextClassificationModelMetadata()
model = automl.types.Model(
    display_name=display_name,
    dataset_id=dataset_id,
    text_classification_model_metadata=metadata,
)
# Create a model with the model metadata in the region.
response = client.create_model(project_location, model)
print(u"Training operation name: {}".format(response.operation.name))
print("Training started...")
In [ ]:
# This is blocking
result = response.result()
In [ ]:
result.name
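As a quick sanity check the trained model can be fetched by name; a sketch, assuming the same client and API version used above:
In [ ]:
# Fetch the model to confirm training finished and inspect its deployment state.
trained_model = client.get_model(result.name)
logging.info(f"Model {trained_model.display_name} deployment state: {trained_model.deployment_state}")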
In [ ]:
# r=client.list_models(project_location)
# for i in r:
# logging.info({})
In [ ]:
# Should be a value like "projects/976279526634/locations/us-central1/models/TCN654213816573231104"
model_name = result.name
In [ ]:
model_name
In [ ]:
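# Deploy the model so it can serve online predictions.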
deploy_response = client.deploy_model(model_name)
In [ ]:
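# Block until the deployment operation completes.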
final_response = deploy_response.result()
In [ ]:
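# Online predictions go through a separate PredictionServiceClient.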
prediction_client = automl.PredictionServiceClient()
In [ ]:
text_snippet = automl.types.TextSnippet(
    content="tfjob isn't working. I can't run my training jobs", mime_type="text/plain"
)
payload = automl.types.ExamplePayload(text_snippet=text_snippet)
response = prediction_client.predict(model_name, payload)
for annotation_payload in response.payload:
    print(u"Predicted class name: {}".format(annotation_payload.display_name))
    print(u"Predicted class score: {}".format(annotation_payload.classification.score))
In [ ]:
response.payload.__class__
In [ ]:
automl.types
In [ ]:
from google.cloud.automl import types as automl_types
In [ ]:
predict_response = automl_types.PredictResponse()
In [ ]:
predict_response.payload.append(annotation)
In [ ]:
predict_response.payload
In [ ]:
annotation_payload.__class__
In [ ]:
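# Manually construct an AnnotationPayload (e.g. to assemble a synthetic PredictResponse by hand).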
annotation = automl_types.AnnotationPayload()
annotation.display_name = "area-jupyter"
annotation.classification.score = .9