The notebook assumes you have already computed the embeddings and stored them on GCS.
TODO(jlewi): I last ran this notebook in gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
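If you want to sanity-check that assumption first, the cell below is a minimal sketch (the gcs_blob_exists helper is not part of the original notebook) that verifies the embeddings object exists on GCS using the google.cloud.storage client imported later.
In [ ]:
# Minimal sketch (not part of the original notebook): confirm the precomputed
# embeddings object exists on GCS before running the rest of the notebook.
from google.cloud import storage

def gcs_blob_exists(gcs_uri):
    """Return True if a gs://bucket/path URI points at an existing object."""
    bucket_name, blob_name = gcs_uri[len("gs://"):].split("/", 1)
    client = storage.Client()
    return client.bucket(bucket_name).blob(blob_name).exists(client)

# For example, with the embeddings file used below:
# gcs_blob_exists("gs://repo-embeddings/kubeflow/2020_0428/kubeflow_issue_embeddings_2020-04-11T17:15:10.000876-07:00.hdf5")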
In [32]:
# URL of the trained language model
LANGUAGE_MODEL_URL = 'gs://issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
In [1]:
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup
notebook_setup.setup()
In [2]:
# Install mlmd sdk
!pip install --user "git+git://github.com/kubeflow/metadata.git#egg=kfmd&subdirectory=sdk/python"
In [65]:
# fairing:include-cell
import sys
from label_microservice.repo_config import RepoConfig
from label_microservice.mlp import MLPWrapper
from sklearn.neural_network import MLPClassifier
import dill as dpickle
import os
import yaml
from google.cloud import storage
import requests
import json
import numpy as np
from passlib.apps import custom_app_context as pwd_context
from collections import Counter
from kubeflow import metadata
import datetime
import logging
import pandas as pd
In [6]:
from code_intelligence import gcs_util
embeddings_file = "gs://repo-embeddings/kubeflow/2020_0428/kubeflow_issue_embeddings_2020-04-11T17:15:10.000876-07:00.hdf5"
name = os.path.basename(embeddings_file)
home = str(Path.home())  # assumption: data is staged under the user's home directory
data_dir = os.path.join(home, "data")

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

local_file = os.path.join(data_dir, name)

if not os.path.exists(local_file):
    gcs_util.copy_file(embeddings_file, local_file)
else:
    logging.info(f"File {local_file} already exists")
In [7]:
import h5py
h5_file = h5py.File(local_file, mode="r")
In [8]:
issue_embeddings = h5_file["issue_embeddings"][:]
issues = pd.read_hdf(local_file, "issues")
In [9]:
label_counts = Counter()
for r in range(issues.shape[0]):
    label_counts.update(issues.iloc[r]["parsed_labels"])
In [10]:
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])
In [11]:
label_counts_df.sort_values("count", ascending=False, inplace=True)
In [12]:
label_counts_df["index"] = range(label_counts_df.shape[0])
In [13]:
# Create a bar chart with the x-axis sorted by the values
import altair as alt
bars = alt.Chart(label_counts_df).mark_bar().encode(x=alt.X("label", sort="-y"), y=alt.Y('count'))
bars.interactive()
Out[13]:
In [ ]:
issues["parsed_labels"]
In [14]:
count_cutoff = 30
target_labels = label_counts_df[label_counts_df["count"] >= count_cutoff]["label"]
exclude_prefixes = ["lifecycle", "status"]
def keep_label(l):
    for p in exclude_prefixes:
        if l.startswith(p):
            return False
    return True
target_labels = target_labels[target_labels.apply(keep_label)]
target_labels.values.sort()
In [15]:
label_to_index = dict(zip(target_labels.values, range(target_labels.shape[0])))
In [16]:
import numpy as np
num_labels = target_labels.shape[0]
def hot_encoded(x):
    d = np.zeros([1, num_labels])
    for l in x:
        if l not in label_to_index:
            continue
        d[0, label_to_index[l]] = 1
    return d
In [17]:
issue_hot_encoded = issues["parsed_labels"].apply(hot_encoded)
issue_hot_encoded = np.concatenate(issue_hot_encoded)
In [20]:
from label_microservice import mlp
In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(issue_embeddings, issue_hot_encoded, test_size=test_size, random_state=1234)
In [117]:
precision_threshold = 0.7
recall_threshold = 0.5
workspace_name = 'train'
min_freq = 25
activation = 'relu'
alpha = 0.0001
early_stopping = True
epsilon = 1e-08
hidden_layer_sizes = (600, 600)
learning_rate = 'adaptive'
learning_rate_init = 0.001
max_iter = 3000
momentum = 0.9
n_iter_no_change = 5
random_state = 1234
solver = 'adam'
validation_fraction = 0.1

clf = MLPClassifier(activation=activation,
                    alpha=alpha,
                    early_stopping=early_stopping,
                    epsilon=epsilon,
                    hidden_layer_sizes=hidden_layer_sizes,
                    learning_rate=learning_rate,
                    learning_rate_init=learning_rate_init,
                    max_iter=max_iter,
                    momentum=momentum,
                    n_iter_no_change=n_iter_no_change,
                    random_state=random_state,
                    solver=solver,
                    validation_fraction=validation_fraction)
In [121]:
# Set class labels
clf.classes_ = target_labels
In [23]:
clf.fit(X_train, y_train)
Out[23]:
In [24]:
mlp_predictions = clf.predict_proba(X_train)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_train, target_labels)
In [25]:
mlp_predictions = clf.predict_proba(X_test)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_test, target_labels)
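For reference, per-label AUC numbers like those reported by mlp.calculate_auc can also be computed directly with sklearn. The cell below is a minimal sketch under the assumption that we want one ROC AUC per label column; it is not the label_microservice implementation, and the per_label_auc helper is illustrative.
In [ ]:
# Hedged sketch: per-label ROC AUC computed directly with sklearn.
from sklearn.metrics import roc_auc_score

def per_label_auc(probs, y_true, labels):
    """Compute ROC AUC per label column, skipping labels with no positive examples."""
    rows = []
    for j, label in enumerate(labels):
        if y_true[:, j].sum() == 0:  # AUC is undefined without positive examples
            continue
        rows.append({"label": label, "auc": roc_auc_score(y_true[:, j], probs[:, j])})
    auc_df = pd.DataFrame(rows)
    return auc_df, auc_df["auc"].mean()

# Example usage with the same inputs as the cell above:
# auc_df, mean_auc = per_label_auc(mlp_predictions, y_test, target_labels)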
In [49]:
from code_intelligence import util as code_intelligence_util
from code_intelligence import embeddings
from code_intelligence import graphql
from code_intelligence import inference
In [310]:
reload(embeddings)
Out[310]:
In [29]:
data_dir
Out[29]:
In [31]:
LANGUAGE_MODEL_URL
Out[31]:
In [36]:
def pass_through(x):
    return x

# TODO(jlewi): We should download the file if the local file doesn't exist
local_model_file = os.path.basename(LANGUAGE_MODEL_URL)
# model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
local_model_dir = os.path.join(data_dir, "language_model")

if not os.path.exists(os.path.join(local_model_dir, local_model_file)):
    if not os.path.exists(local_model_dir):
        os.makedirs(local_model_dir)
    gcs_util.copy_from_gcs(LANGUAGE_MODEL_URL, os.path.join(local_model_dir, local_model_file))
inference_wrapper = inference.InferenceWrapper(model_path=local_model_dir, model_file_name=local_model_file)
In [37]:
if not os.getenv("GITHUB_TOKEN"):
    raise ValueError("No GitHub token specified")
else:
    gh_client = graphql.GraphQLClient()
In [53]:
issue_url = "https://github.com/kubeflow/kubeflow/issues/4972"
issue_dict = embeddings.get_issue(issue_url, gh_client)
dict_for_embeddings = inference_wrapper.process_dict(issue_dict)
embedding_data = inference_wrapper.get_pooled_features(dict_for_embeddings['text']).detach().cpu().numpy()
predictions = clf.predict_proba(embedding_data)
p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
p.sort_values("probabilities", ascending=False)
Out[53]:
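It can be convenient to wrap the steps above into a small helper. The sketch below is just the previous cell repackaged as a function; it assumes gh_client, inference_wrapper, clf, and target_labels are already defined as in this notebook, and the predict_issue_labels name is illustrative.
In [ ]:
# Convenience wrapper around the single-issue prediction steps shown above.
def predict_issue_labels(issue_url):
    """Fetch an issue, embed its title/body, and return labels sorted by probability."""
    issue_dict = embeddings.get_issue(issue_url, gh_client)
    processed = inference_wrapper.process_dict(issue_dict)
    features = inference_wrapper.get_pooled_features(processed["text"]).detach().cpu().numpy()
    probs = clf.predict_proba(features)
    return pd.DataFrame({"probabilities": probs[0, :], "labels": target_labels}).sort_values(
        "probabilities", ascending=False)

# Example:
# predict_issue_labels("https://github.com/kubeflow/kubeflow/issues/4972")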
In [124]:
from sklearn.metrics import precision_recall_curve
In [350]:
# Select the points in the test set that have one of the labels of interest
label_indexes = []

for i in range(target_labels.size):
    name = target_labels.iloc[i]
    keep_label = False
    for p in ["area", "platform"]:
        if name.startswith(p):
            keep_label = True
    if keep_label:
        label_indexes.append(i)
In [351]:
has_label_of_interest = np.sum(y_test[:, label_indexes], axis=1) > 0
X_test_of_interest = X_test[has_label_of_interest, :]
y_test_of_interest = y_test[has_label_of_interest, :]
In [356]:
#y_pred = clf.predict_proba(X_test)
y_pred = clf.predict_proba(X_test_of_interest)
# Choose a ridiculously low precision_threshold; otherwise most labels will end up without a
# threshold and we will never generate predictions for those labels. Arguably, if we
# erroneously start applying labels to issues, those will hopefully get corrected by humans
# and we can eventually use that feedback to improve the model.
precision_threshold = .3
recall_threshold = .25

probability_thresholds = {}
precisions = {}
recalls = {}

results = pd.DataFrame({"label": target_labels, "precision": [None] * target_labels.size,
                        "recall": [None] * target_labels.size})

# Default to a threshold of 1 so that the label will never be applied
label_thresholds = np.ones(target_labels.size)

for label in range(target_labels.size):
    # find the best probability threshold for this label
    best_precision, best_recall, best_threshold = 0.0, 0.0, 1
    #precision, recall, threshold = precision_recall_curve(y_test[:, label], y_pred[:, label])
    precision, recall, threshold = precision_recall_curve(y_test_of_interest[:, label], y_pred[:, label])
    results["precision"].iloc[label] = precision
    results["recall"].iloc[label] = recall
    for prec, reca, thre in zip(precision[:-1], recall[:-1], threshold):
        # precision and recall must each meet their respective thresholds
        if prec >= precision_threshold and reca >= recall_threshold:
            # choose the threshold with the higher precision
            if prec > best_precision:
                best_precision = prec
                best_recall = reca
                best_threshold = thre
    # probability_thresholds is a dict {label_index: probability_threshold}.
    # If a label never satisfies both the precision and recall thresholds, its
    # threshold stays at 1, so that label is effectively never predicted.
    label_thresholds[label] = best_threshold
    probability_thresholds[label] = best_threshold
    precisions[label] = best_precision
    recalls[label] = best_recall
In [ ]:
## Plot Precision and recall for various labels
In [353]:
# TODO(jlewi): How to do multiple labels on the same graph
# Subsample, otherwise we get too many points to plot
subsample = 10

rows = []
labels_of_interest = ["area/jupyter", "area/kfctl", "area/engprod", "area/docs", "area/kustomize", "platform/gcp", "platform/aws"]

for l in labels_of_interest:
    selector = results["label"] == l
    index = np.where(selector.values)[0][0]
    row = pd.DataFrame({"precision": results.iloc[index]["precision"][::subsample],
                        "recall": results.iloc[index]["recall"][::subsample],
                        "label": l})
    rows.append(row)

roc_points = pd.concat(rows)
alt.Chart(roc_points).mark_line().encode(x="recall", y="precision", color="label").interactive()
Out[353]:
In [354]:
probability_thresholds
Out[354]:
In [300]:
# Count how many examples we have in the test set for the above labels
y_label_counts = pd.Series(y_test.sum(axis=0), index=target_labels)
y_label_counts.loc[labels_of_interest]
Out[300]:
In [308]:
# Compute fraction of issues with the labels
y_label_counts.loc[labels_of_interest] / y_test.shape[0]
Out[308]:
In [54]:
from code_intelligence import github_bigquery
import subprocess
# TODO(jlewi): Get the project using fairing?
PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
In [103]:
reload(github_bigquery)
# Fetch recent issues
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=14)
In [106]:
input_data = recent_issues[["title", "body"]]
recent_embeddings = inference_wrapper.df_to_embedding(input_data)
In [114]:
recent_predictions = clf.predict_proba(recent_embeddings)
#p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
#p.sort_values("probabilities", ascending=False)
In [370]:
np.sign([2, 0, -1])
Out[370]:
In [378]:
recent_labels = (np.sign(recent_predictions - np.tile(label_thresholds, [recent_predictions.shape[0], 1]))+1) / 2
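The sign trick above maps probabilities above a label's threshold to 1 and below to 0. An equivalent, arguably more readable formulation uses a broadcasted comparison; the two differ only for probabilities exactly equal to the threshold, where the sign trick yields 0.5. The recent_labels_alt name below is illustrative.
In [ ]:
# Equivalent thresholding via broadcasting: label_thresholds has shape (num_labels,)
# and broadcasts across the rows of recent_predictions.
recent_labels_alt = (recent_predictions > label_thresholds).astype(float)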
In [397]:
recent_labels_df=pd.DataFrame(recent_labels, columns=target_labels)
predicted_label_counts = recent_labels_df.sum(axis=0)
In [411]:
predicted_label_counts.sort_values(ascending=False)
Out[411]:
In [414]:
recent_labels_df.shape
Out[414]:
In [ ]:
* Print out issue titles and labels
* This is for qualitative analysis
In [408]:
from IPython.core.display import display, HTML

for i in range(recent_issues.shape[0]):
    title = recent_issues.iloc[i]["title"]
    url = recent_issues.iloc[i]["html_url"]
    predicted_labels = target_labels[recent_labels[i, :] > 0]
    names = ", ".join(predicted_labels)
    display(HTML(f"Issue: <a href='{url}'>{title}</a> {names}"))
In [5]:
# fairing:include-cell
class RepoMLP(object):
    """RepoMLP is a helper class for working with the sklearn multi-layer perceptron.

    It provides wrapper code to train the sklearn multi-layer perceptron on a repository's
    issue embeddings.

    TODO(jlewi): This is a wrapper around MLPWrapper, which is itself a wrapper around
    the sklearn MLPClassifier.
    """
    def __init__(self,
                 owner=None,
                 repo=None,
                 precision_threshold=0.7,
                 recall_threshold=0.5,
                 workspace_name='train',
                 min_freq=25,
                 activation='relu',
                 alpha=0.0001,
                 early_stopping=True,
                 epsilon=1e-08,
                 hidden_layer_sizes=(600, 600),
                 learning_rate='adaptive',
                 learning_rate_init=0.001,
                 max_iter=3000,
                 momentum=0.9,
                 n_iter_no_change=5,
                 random_state=1234,
                 solver='adam',
                 validation_fraction=0.1):
        self.precision_threshold = precision_threshold
        self.recall_threshold = recall_threshold
        self.min_freq = min_freq  # for filtering labels
        self.mlp_wrapper = None
        self.clf = MLPClassifier(activation=activation,
                                 alpha=alpha,
                                 early_stopping=early_stopping,
                                 epsilon=epsilon,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 learning_rate=learning_rate,
                                 learning_rate_init=learning_rate_init,
                                 max_iter=max_iter,
                                 momentum=momentum,
                                 n_iter_no_change=n_iter_no_change,
                                 random_state=random_state,
                                 solver=solver,
                                 validation_fraction=validation_fraction)
        self.all_labels = None
        self.probability_thresholds = None
        self.load_yaml(owner, repo)
        self.exec = self.create_execution(workspace_name=workspace_name)

    def load_yaml(self, owner, repo):
        config = RepoConfig(owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name
        self.model_bucket_name = config.model_bucket_name
        self.model_file = config.model_local_path
        self.model_dest = config.model_gcs_path
        self.labels_file = config.labels_local_path
        self.labels_dest = config.labels_gcs_path
        self.embeddings_bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path
        # TODO(chunhsiang): need to be able to train on multiple repos, which
        # should be defined in the yaml config.
        # For now, only train the model on the repo where the bot is installed.
        self.trained_repos = [f'{self.repo_owner}/{self.repo_name}']

    # TODO(jlewi): Delete this code?
    def download_embeddings_from_gcs(self):
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.embeddings_bucket_name)
        blob = bucket.get_blob(self.embeddings_dest)
        with open(self.embeddings_file, 'wb') as f:
            blob.download_to_file(f)

    def load_training_data(self):
        self.download_embeddings_from_gcs()
        with open(self.embeddings_file, 'rb') as f:
            data = dpickle.load(f)

        # filter labels by frequency
        c = Counter()
        for lbls in data['labels']:
            c.update(lbls)
        self.all_labels = [x for x in c if c[x] >= self.min_freq]

        X = []
        y = []
        for emb, lbls in zip(data['features'], data['labels']):
            mask = [self.all_labels.index(x) for x in lbls if c[x] >= self.min_freq]
            if mask == []:
                continue
            zer = np.zeros(len(self.all_labels))
            zer[mask] = 1
            y.append(zer)
            X.append(emb)
        return X, y

    def train(self):
        X, y = self.load_training_data()
        self.mlp_wrapper = MLPWrapper(clf=self.clf,
                                      precision_threshold=self.precision_threshold,
                                      recall_threshold=self.recall_threshold)
        # TODO(jlewi): find_probability_thresholds splits the data into test and
        # training sets and then calls fit. Why are we then calling fit again?
        # Is it just so the final model is trained on the whole dataset?
        # Get the probability thresholds before `fit` because it overwrites the classifier.
        self.mlp_wrapper.find_probability_thresholds(X, y)
        self.probability_thresholds = self.mlp_wrapper.probability_thresholds

        # train the model using the whole dataset
        self.mlp_wrapper.fit(X, y)
        self.save_model()

        # store model artifacts using kubeflow metadata
        model_name = ','.join(sorted(self.trained_repos))
        model_uri = f'gs://{self.model_bucket_name}/{self.model_dest}'
        # put all the repo names as the label keys
        model_labels = {r: '' for r in self.trained_repos}
        self.exec.log_output(metadata.Model(
            name=model_name,
            uri=model_uri,
            labels=model_labels))

    def save_model(self):
        self.mlp_wrapper.save_model(model_file=self.model_file)
        # dump label columns for prediction
        thresholds = {}
        for i in self.probability_thresholds:
            if self.probability_thresholds[i]:
                thresholds[i] = float(self.probability_thresholds[i])
            else:
                thresholds[i] = None
        label_dict = {
            'labels': self.all_labels,
            'probability_thresholds': thresholds,
        }
        with open(self.labels_file, 'w') as f:
            yaml.dump(label_dict, f)
        self.upload_model_to_gcs()

    def upload_model_to_gcs(self):
        # upload the model
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.model_dest)
        blob.upload_from_filename(self.model_file)

        # upload the label columns
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.labels_dest)
        blob.upload_from_filename(self.labels_file)

    def create_execution(self, workspace_name):
        """Return a metadata execution object in a workspace and a run for logging.

        Args:
          workspace_name: workspace name, str
        """
        workspace = metadata.Workspace(
            # connect to the metadata-service in the kubeflow namespace of the k8s cluster
            backend_url_prefix='metadata-service.kubeflow:8080',
            name=workspace_name,
            description='workspace for model training artifacts and executions')
        run = metadata.Run(
            workspace=workspace,
            name='run-' + datetime.datetime.utcnow().isoformat('T'))
        return metadata.Execution(
            name='execution-' + datetime.datetime.utcnow().isoformat('T'),
            workspace=workspace,
            run=run)
In [3]:
r = RepoMLP(workspace_name='ws1', owner='kubeflow', repo='examples')
In [4]:
r.train()
Kubeflow Fairing is a Python package that makes it easier to train and deploy machine learning models on Kubeflow.
Here we use the Kubeflow Fairing preprocessor to convert this notebook into a Python script and to create an entry point for that script. After preprocessing the notebook, we can run training from the command line like this:
$ python repo_mlp.py train
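The generated repo_mlp.py contains the cells tagged fairing:include-cell plus a Python Fire entry point. Roughly, and only as an illustrative sketch of what the preprocessor emits rather than the exact generated file, the tail of the script looks like:

# Illustrative sketch of the Fire entry point the preprocessor appends, which is
# what makes `python repo_mlp.py train` dispatch to RepoMLP.train().
import fire
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fire.Fire(RepoMLP)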
In [5]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire
In [6]:
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP')
if not preprocessor.input_files:
    preprocessor.input_files = set()
input_files = ['mlp.py', 'repo_config.py']
preprocessor.input_files = set([os.path.normpath(f) for f in input_files])
preprocessor.preprocess()
Out[6]:
In [ ]: