In [1]:
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup
notebook_setup.setup()
In [2]:
import subprocess
# TODO(jlewi): Get the project using fairing?
# PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
PROJECT = "issue-label-bot-dev"
In [3]:
!pip install --user google-cloud-automl
In [4]:
# TODO(jlewi): How do we check if the dataset already exists and whether it already has data
from google.cloud import automl
import logging
display_name = "kubeflow_issues_with_repo"
client = automl.AutoMlClient()
# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Specify the classification type
# Types:
# MultiLabel: Multiple labels are allowed for one example.
# MultiClass: At most one label is allowed per example.
metadata = automl.types.TextClassificationDatasetMetadata(
    classification_type=automl.enums.ClassificationType.MULTILABEL
)
dataset = automl.types.Dataset(
    display_name=display_name,
    text_classification_dataset_metadata=metadata,
)
# Create a dataset with the dataset metadata in the region.
response = client.create_dataset(project_location, dataset)
created_dataset = response.result()
# Display the dataset information
logging.info("Dataset name: {}".format(created_dataset.name))
dataset_id = created_dataset.name.split("/")[-1]
logging.info(f"Dataset id: {dataset_id}")
In [5]:
from code_intelligence import github_bigquery
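# Pull recently updated Kubeflow issues (and their labels) from BigQuery's GitHub data;
# max_age_days limits how far back we look.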
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=60)
In [6]:
# Need to use a bucket in the same region and type as automl
data_dir = f"gs://issue-label-bot-dev_automl/automl_{dataset_id}"
issues_dir = os.path.join(data_dir, "issues")
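If the bucket doesn't exist yet, something along these lines creates it in the same region used for the AutoML dataset above. This is a sketch; it assumes the google-cloud-storage client and a version whose create_bucket accepts a location argument.
In [ ]:
from google.cloud import storage

# Sketch: create the bucket in us-central1 so AutoML (which runs in us-central1 above)
# can read the training data from it.
bucket_name = "issue-label-bot-dev_automl"
gcs_client = storage.Client(project=PROJECT)
if gcs_client.lookup_bucket(bucket_name) is None:
    gcs_client.create_bucket(bucket_name, location="us-central1")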
In [10]:
from code_intelligence import gcs_util
from code_intelligence import github_util
from code_intelligence import util
from google.cloud import storage
In [8]:
import pandas as pd
info = pd.DataFrame(columns=["url", "set", "labels"], index=range(recent_issues.shape[0]))
# Leave the set column as an empty string because we will let AutoML assign examples to the train, eval and test sets
info["set"] = ""
In [ ]:
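# Upload each issue as its own text file to GCS, skipping files that already exist,
# and record the GCS path in the info DataFrame.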
storage_client = storage.Client()
bucket_name, _ = gcs_util.split_gcs_uri(data_dir)
bucket = storage_client.get_bucket(bucket_name)
for i in range(recent_issues.shape[0]):
    owner, repo, number = util.parse_issue_url(recent_issues.iloc[i]["html_url"])
    owner_repo = f"{owner}_{repo}"
    name = f"{owner}_{repo}_{number}.txt"
    target = os.path.join(issues_dir, name)
    issue = recent_issues.iloc[i]
    if gcs_util.check_gcs_object(target, storage_client=storage_client):
        logging.info(f"{target} already exists")
    else:
        _, obj_path = gcs_util.split_gcs_uri(target)
        blob = bucket.blob(obj_path)
        # Include the owner and repo in the text body because it is predictive
        doc = github_util.build_issue_doc(owner, repo, issue["title"], [issue["body"]])
        blob.upload_from_string(doc)
        logging.info(f"Created {target}")
    # Use .at rather than chained indexing so the assignment actually updates info
    info.at[i, "url"] = target
We don't use pandas to_csv's default settings because they end up quoting the string containing the labels, e.g.
,gs://issue-label-bot-dev/automl_2020_0429/issues/kubeflow_website_997.txt,"area/docs, kind/feature, lifecycle/stale, priority/p2"
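If quoting needs to be avoided entirely, one alternative is to write the rows by hand, in the spirit of the commented-out loop in the CSV-writing cell below. This is a sketch; it assumes info["automl_labels"] has already been filled in with the comma-separated label strings computed later.
In [ ]:
import io

# Sketch: write each CSV row manually so the label list is never wrapped in quotes.
buffer = io.StringIO()
for i in range(info.shape[0]):
    row = info.iloc[i]
    buffer.write(f"{row['set']},{row['url']},{row['automl_labels']}\n")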
In [ ]:
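# Count how often each label appears across the recent issues.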
from collections import Counter
label_counts = Counter()
for r in range(recent_issues.shape[0]):
label_counts.update(recent_issues.iloc[r]["parsed_labels"])
In [ ]:
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])
In [ ]:
label_counts_df.sort_values("count", ascending=False, inplace=True)
In [ ]:
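# Only keep labels with more than `cutoff` occurrences; rarer labels have too few
# examples to train on.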
cutoff = 50
target_labels = label_counts_df.loc[label_counts_df["count"] > cutoff]
For example, if an issue doesn't have the label "platform/gcp" that could be for one of two reasons: the label genuinely doesn't apply, or nobody has labeled the issue yet.
A quick hack to distinguish the two is to only include area and platform labels.
Longer term we could look at GitHub events to infer whether data was labeled by a human.
In [ ]:
target_labels = target_labels[target_labels["label"].apply(lambda x: x.startswith("area") or x.startswith("platform"))]
In [ ]:
def label_filter(labels):
    # Keep only the labels we decided to train on (the target_labels computed above).
    filtered = []
    for l in labels:
        if l in target_labels["label"].values:
            filtered.append(l)
    return filtered

info["labels"] = recent_issues["parsed_labels"].apply(label_filter)
In [ ]:
# Compute the label string for AutoML.
# AutoML doesn't allow "/" in labels; only letters, dashes, and underscores are allowed.
# So we need a comma-separated string with "/" replaced by "-".
info["automl_labels"] = info["labels"].apply(lambda l: ", ".join(l).replace("/", "-"))
In [ ]:
import datetime
import io
import csv
buffer = io.StringIO()
# AutoML seems to require at least 1 label for every issue
#labeled_rows = info.loc[info["labels"] != ""]
#labeled_rows.to_csv(buffer, columns=["set", "url", "labels"], header=False, index=False)
info.to_csv(buffer, columns=["set", "url", "automl_labels"], header=False, index=False, doublequote=False)
# for i in range(labeled_rows.shape[0]):
# row = labeled_rows.iloc[i]
# buffer.write(f"{row['set']}, {row['url']}, {row['labels']}\n")
now = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
dataset_path = os.path.join(data_dir, f"dataset_{now}.csv")
_, obj_path = gcs_util.split_gcs_uri(dataset_path)
blob = bucket.blob(obj_path)
blob.upload_from_string(buffer.getvalue())
logging.info(f"Created {dataset_path}")
In [ ]:
from google.cloud import automl
dataset_full_id = client.dataset_path(PROJECT, "us-central1", dataset_id)
# Get the multiple Google Cloud Storage URIs
input_uris = [dataset_path]
gcs_source = automl.types.GcsSource(input_uris=input_uris)
input_config = automl.types.InputConfig(gcs_source=gcs_source)
# Import data from the input URI
response = client.import_data(dataset_full_id, input_config)
logging.info(f"Processing import: operation: {response.operation.name}")
# This appears to be a blocking call
logging.info("Data imported. {}".format(response.result()))
In [ ]:
# A resource that represents Google Cloud Platform location.
project_location = client.location_path(PROJECT, "us-central1")
# Leave model unset to use the default base model provided by Google
metadata = automl.types.TextClassificationModelMetadata()
model = automl.types.Model(
    display_name=display_name,
    dataset_id=dataset_id,
    text_classification_model_metadata=metadata,
)
# Create a model with the model metadata in the region.
response = client.create_model(project_location, model)
print(u"Training operation name: {}".format(response.operation.name))
print("Training started...")
In [ ]:
# This is blocking
result = response.result()
In [ ]:
result.name
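As a quick sanity check the trained model can be fetched by name; a sketch, assuming the same client and API version used above:
In [ ]:
# Fetch the model to confirm training finished and inspect its deployment state.
trained_model = client.get_model(result.name)
logging.info(f"Model {trained_model.display_name} deployment state: {trained_model.deployment_state}")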
In [ ]:
# r=client.list_models(project_location)
# for i in r:
# logging.info({})
In [ ]:
# Should be a value like "projects/976279526634/locations/us-central1/models/TCN654213816573231104"
model_name = result.name
In [ ]:
model_name
In [ ]:
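# Deploy the model so it can serve online predictions.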
deploy_response = client.deploy_model(model_name)
In [ ]:
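# Block until the deployment operation completes.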
final_response = deploy_response.result()
In [ ]:
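# Online predictions go through a separate PredictionServiceClient.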
prediction_client = automl.PredictionServiceClient()
In [ ]:
text_snippet = automl.types.TextSnippet(
    content="tfjob isn't working. I can't run my training jobs", mime_type="text/plain"
)
payload = automl.types.ExamplePayload(text_snippet=text_snippet)
response = prediction_client.predict(model_name, payload)
for annotation_payload in response.payload:
    print(u"Predicted class name: {}".format(annotation_payload.display_name))
    print(u"Predicted class score: {}".format(annotation_payload.classification.score))
In [ ]:
response.payload.__class__
In [ ]:
automl.types
In [ ]:
from google.cloud.automl import types as automl_types
In [ ]:
predict_response = automl_types.PredictResponse()
In [ ]:
predict_response.payload.append(annotation)
In [ ]:
predict_response.payload
In [ ]:
annotation_payload.__class__
In [ ]:
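# Manually construct an AnnotationPayload (e.g. to assemble a synthetic PredictResponse by hand).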
annotation = automl_types.AnnotationPayload()
annotation.display_name = "area-jupyter"
annotation.classification.score = .9