The notebook assumes you have already computed the embeddings and stored them on GCS
TODO(jlewi): I last ran this notebook using the image gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
In [32]:
    
# URL of the trained language model
LANGUAGE_MODEL_URL = 'gs://issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
    
In [1]:
    
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup
notebook_setup.setup()
    
    
In [2]:
    
# Install the metadata (MLMD) SDK
!pip install --user "git+https://github.com/kubeflow/metadata.git#egg=kfmd&subdirectory=sdk/python"
    
    
In [65]:
    
# fairing:include-cell
import sys
from label_microservice.repo_config import RepoConfig
from label_microservice.mlp import MLPWrapper
from sklearn.neural_network import MLPClassifier
import dill as dpickle
import os
import yaml
from google.cloud import storage
import requests
import json
import numpy as np
from passlib.apps import custom_app_context as pwd_context
from collections import Counter
from kubeflow import metadata
import datetime
import logging
import pandas as pd
    
In [6]:
    
from code_intelligence import gcs_util
embeddings_file = "gs://repo-embeddings/kubeflow/2020_0428/kubeflow_issue_embeddings_2020-04-11T17:15:10.000876-07:00.hdf5"
name = os.path.basename(embeddings_file)
home = str(Path.home())
data_dir = os.path.join(home, "data")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
local_file = os.path.join(data_dir, name)
if not os.path.exists(local_file):
    gcs_util.copy_file(embeddings_file, local_file)
else:
    logging.info(f"File {local_file} already exists")
    
    
In [7]:
    
import h5py
h5_file = h5py.File(local_file, mode="r")
    
In [8]:
    
# h5py's deprecated .value accessor is replaced by [()], which reads the full dataset
issue_embeddings = h5_file["issue_embeddings"][()]
issues = pd.read_hdf(local_file, "issues")
    
    
In [9]:
    
label_counts = Counter()
for r in range(issues.shape[0]):
    label_counts.update(issues.iloc[r]["parsed_labels"])
    
In [10]:
    
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])
    
In [11]:
    
label_counts_df.sort_values("count", ascending=False, inplace=True)
    
In [12]:
    
label_counts_df["index"] = range(label_counts_df.shape[0])
    
In [13]:
    
# Create a bar chart with the x-axis sorted by the values
import altair as alt
bars = alt.Chart(label_counts_df).mark_bar().encode(x=alt.X("label", sort="-y"), y=alt.Y('count'))
bars.interactive()
    
    Out[13]:
In [ ]:
    
issues["parsed_labels"]
    
In [14]:
    
count_cutoff = 30
target_labels = label_counts_df[label_counts_df["count"] >= count_cutoff]["label"]
exclude_prefixes = ["lifecycle", "status"]
def keep_label(l):
    for p in exclude_prefixes:
        if l.startswith(p):
            return False
        
    return True
            
target_labels = target_labels[target_labels.apply(keep_label)]
# Sort the label names so the downstream column order is deterministic
target_labels = target_labels.sort_values()
    
In [15]:
    
label_to_index = dict(zip(target_labels.values, range(target_labels.shape[0])))
    
In [16]:
    
import numpy as np 
num_labels = target_labels.shape[0]
def hot_encoded(x):
    d = np.zeros([1, num_labels])
    
    for l in x:
        if l not in label_to_index:
            continue
        d[0, label_to_index[l]] = 1
        
    return d
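A quick sanity check on the encoder: a known label maps to a 1 in its column, and anything missing from label_to_index is ignored. This assumes "area/jupyter" survived the frequency cutoff; the second label below is made up.
In [ ]:
    
hot_encoded(["area/jupyter", "not-a-real-label"])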
    
In [17]:
    
issue_hot_encoded = issues["parsed_labels"].apply(hot_encoded)
issue_hot_encoded = np.concatenate(issue_hot_encoded)
    
In [20]:
    
from label_microservice import mlp
    
In [21]:
    
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(issue_embeddings, issue_hot_encoded, test_size=test_size, random_state=1234)
    
In [117]:
    
precision_threshold=0.7
recall_threshold=0.5
workspace_name='train'
min_freq=25
activation='relu'
alpha=0.0001
early_stopping=True
epsilon=1e-08
hidden_layer_sizes=(600,600)
learning_rate='adaptive'
learning_rate_init=0.001
max_iter=3000
momentum=0.9
n_iter_no_change=5
random_state=1234
solver='adam'
validation_fraction=0.1
                
clf = MLPClassifier(activation=activation,
                     alpha=alpha,
                     early_stopping=early_stopping,
                     epsilon=epsilon,
                     hidden_layer_sizes=hidden_layer_sizes,
                     learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init,
                     max_iter=max_iter,
                     momentum=momentum,
                     n_iter_no_change=n_iter_no_change,
                     random_state=random_state,
                     solver=solver,
                     validation_fraction=validation_fraction)
    
    
In [121]:
    
# Set class labels (note: fit() will recompute classes_, so this mainly documents intent)
clf.classes_ = target_labels
    
In [23]:
    
clf.fit(X_train, y_train)
    
    Out[23]:
In [24]:
    
mlp_predictions = clf.predict_proba(X_train)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_train, target_labels)
    
    
    
In [25]:
    
mlp_predictions = clf.predict_proba(X_test)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_test, target_labels)
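For a one-number summary of test performance we can print the aggregate score returned above (how calculate_auc aggregates the per-label AUCs is internal to the mlp module):
In [ ]:
    
print(f"Aggregate test AUC: {mlp_auc}")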
    
    
    
In [49]:
    
from code_intelligence import util as code_intelligence_util
from code_intelligence import embeddings
from code_intelligence import graphql
from code_intelligence import inference
    
In [310]:
    
reload(embeddings)
    
    Out[310]:
In [29]:
    
data_dir
    
    Out[29]:
In [31]:
    
LANGUAGE_MODEL_URL
    
    Out[31]:
In [36]:
    
# pass_through needs to be defined at module scope; it is presumably referenced
# by the pickled language model when it is deserialized.
def pass_through(x):
    return x
# TODO(jlewi): We should download the file if the local file doesn't exist
local_model_file = os.path.basename(LANGUAGE_MODEL_URL)
# model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
local_model_dir = os.path.join(data_dir, "language_model")
if not os.path.exists(os.path.join(local_model_dir, local_model_file)):
    if not os.path.exists(local_model_dir):
        os.makedirs(local_model_dir)
    gcs_util.copy_from_gcs(LANGUAGE_MODEL_URL, os.path.join(local_model_dir, local_model_file))
inference_wrapper = inference.InferenceWrapper(model_path=local_model_dir, model_file_name=local_model_file)
    
In [37]:
    
if not os.getenv("GITHUB_TOKEN"):
    raise ValueError(f"No GitHub token specified")   
else:
    gh_client = graphql.GraphQLClient()
    
    
In [53]:
    
issue_url = "https://github.com/kubeflow/kubeflow/issues/4972"
issue_dict = embeddings.get_issue(issue_url, gh_client)
dict_for_embeddings = inference_wrapper.process_dict(issue_dict)
embedding_data = inference_wrapper.get_pooled_features(dict_for_embeddings['text']).detach().cpu().numpy()
predictions = clf.predict_proba(embedding_data)
p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
p.sort_values("probabilities", ascending=False)
    
    Out[53]:
In [124]:
    
from sklearn.metrics import precision_recall_curve
    
In [350]:
    
# Let's select the points in the test set that have at least one of the labels of interest
label_indexes = []
for i in range(target_labels.size):
    name = target_labels.iloc[i]
    
    keep_label = False
    for p in ["area", "platform"]:                         
        if name.startswith(p):
            keep_label = True
    
    if keep_label:
        label_indexes.append(i)
    
In [351]:
    
has_label_of_interest = np.sum(y_test[:, label_indexes], axis=1) > 0
X_test_of_interest = X_test[has_label_of_interest, :]
y_test_of_interest = y_test[has_label_of_interest, :]
    
In [356]:
    
#y_pred = clf.predict_proba(X_test)
y_pred = clf.predict_proba(X_test_of_interest)
# Choose a ridiculously low precision_threshold; otherwise most labels will end up without
# a threshold and we will never generate predictions for them. Arguably, if we erroneously
# start applying labels to issues, humans will hopefully correct them and we can eventually
# use those corrections to improve the model.
precision_threshold = .3
recall_threshold = .25
probability_thresholds = {}
precisions = {}
recalls = {}
results = pd.DataFrame({"label":  target_labels, "precision":[None] * target_labels.size,
                        "recall":[None] * target_labels.size})
# Default to a threshold of 1 so that the label will never be applied
label_thresholds = np.ones(target_labels.size)
for label in range(target_labels.size):
    # Find the best probability threshold for this label
    best_precision, best_recall, best_threshold = 0.0, 0.0, 1
    #precision, recall, threshold = precision_recall_curve(y_test[:, label], y_pred[:, label])
    precision, recall, threshold = precision_recall_curve(y_test_of_interest[:, label], y_pred[:, label])
    
    results["precision"].iloc[label] = precision
    results["recall"].iloc[label] = recall
    
    for prec, reca, thre in zip(precision[:-1], recall[:-1], threshold):
        # precision and recall must each meet their respective thresholds
        if prec >= precision_threshold and reca >= recall_threshold:
            # choose the threshold with the higher precision
            if prec > best_precision:
                best_precision = prec
                best_recall = reca
                best_threshold = thre
    # probability_thresholds is a dict {label_index: probability_threshold}.
    # A label whose threshold stays at the default of 1 is effectively excluded:
    # no threshold satisfied both the precision and recall targets, so the label
    # will never be predicted.
    label_thresholds[label] = best_threshold
    probability_thresholds[label] = best_threshold
    precisions[label] = best_precision
    recalls[label] = best_recall

results["precision"] = precision_curves
results["recall"] = recall_curves
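To eyeball the operating points the loop selected, we can tabulate each label's threshold alongside the precision and recall achieved there (all of these variables are populated by the loop above):
In [ ]:
    
chosen = pd.DataFrame({"label": target_labels.values,
                       "threshold": label_thresholds,
                       "precision": [precisions[i] for i in range(target_labels.size)],
                       "recall": [recalls[i] for i in range(target_labels.size)]})
chosen.sort_values("precision", ascending=False).head(10)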
    
In [ ]:
    
## Plot precision and recall for various labels
    
In [353]:
    
# TODO(jlewi): How to do multiple labels on the same graph
# Subsample; otherwise we get too many points to plot
subsample = 10
rows = []
labels_of_interest = ["area/jupyter", "area/kfctl", "area/engprod", "area/docs", "area/kustomize", "platform/gcp", "platform/aws"]
for l in labels_of_interest:
    selector = results["label"] == l
    index = np.where(selector.values)[0][0]
    row = pd.DataFrame({"precision": results.iloc[index]["precision"][::subsample], "recall":  results.iloc[index]["recall"][::subsample], "label": l})
    rows.append(row)
    #row = pd.DataFrame({"precision": results.iloc[index].precision, "recall":  results.iloc[index].recall})
    #row = results.loc[results["label"] == "area/jupyter"]
roc_points = pd.concat(rows)    
alt.Chart(roc_points).mark_line().encode(x="recall", y="precision", color="label").interactive()
    
    Out[353]:
In [354]:
    
probability_thresholds
    
    Out[354]:
In [300]:
    
# Count how many examples we have in the test set for the above labels
y_label_counts = pd.Series(y_test.sum(axis=0), index=target_labels)
y_label_counts.loc[labels_of_interest]
    
    Out[300]:
In [308]:
    
# Compute the fraction of test issues carrying each of these labels
y_label_counts.loc[labels_of_interest] / y_test.shape[0]
    
    Out[308]:
In [54]:
    
from code_intelligence import github_bigquery
import subprocess 
# TODO(jlewi): Get the project using fairing?
PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
    
In [103]:
    
reload(github_bigquery)
# Fetch recent issues
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=14)
    
    
In [106]:
    
input_data = recent_issues[["title", "body"]]
recent_embeddings = inference_wrapper.df_to_embedding(input_data)
    
    
In [114]:
    
recent_predictions = clf.predict_proba(recent_embeddings)
#p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
#p.sort_values("probabilities", ascending=False)
    
In [370]:
    
# Sanity check: np.sign maps positives to 1, zeros to 0, and negatives to -1
np.sign([2, 0, -1])
    
    Out[370]:
In [378]:
    
# Threshold the probabilities: subtracting each label's threshold and taking the sign
# gives +1 above threshold, -1 below, and 0 exactly at it; adding 1 and halving maps
# these to 1, 0, and 0.5. Downstream, anything > 0 counts as a predicted label.
recent_labels = (np.sign(recent_predictions - np.tile(label_thresholds, [recent_predictions.shape[0], 1])) + 1) / 2
    
In [397]:
    
recent_labels_df = pd.DataFrame(recent_labels, columns=target_labels)
predicted_label_counts = recent_labels_df.sum(axis=0)
    
In [411]:
    
predicted_label_counts.sort_values(ascending=False)
    
    Out[411]:
In [414]:
    
recent_labels_df.shape
    
    Out[414]:
In [ ]:
    
* Print out issue titles and predicted labels
  * This is for qualitative analysis
    
In [408]:
    
from IPython.display import display, HTML
for i in range(recent_issues.shape[0]):
    title = recent_issues.iloc[i]["title"]
    url = recent_issues.iloc[i]["html_url"]
    predicted_labels = target_labels[recent_labels[i,:]>0]
    names = ", ".join(predicted_labels)
    display(HTML(f"Issue: <a href='{url}'>{title}</a> {names}"))
In [5]:
    
# fairing:include-cell
class RepoMLP(object):
    """RepoMLP is a helper class to work with scklearn multi-layer perceptron. 
    
    The RepoMLP provides some wrapper code to help train the sklearn multi-layer perceptron in this case.
    
    TODO(jlewi): This is a wrapper around MLPWrapper which is a wrapper around 
    """
    def __init__(self,
                 owner=None,
                 repo=None,
                 precision_threshold=0.7,
                 recall_threshold=0.5,
                 workspace_name='train',
                 min_freq=25,
                 activation='relu',
                 alpha=0.0001,
                 early_stopping=True,
                 epsilon=1e-08,
                 hidden_layer_sizes=(600,600),
                 learning_rate='adaptive',
                 learning_rate_init=0.001,
                 max_iter=3000,
                 momentum=0.9,
                 n_iter_no_change=5,
                 random_state=1234,
                 solver='adam',
                 validation_fraction=0.1):
        self.precision_threshold = precision_threshold
        self.recall_threshold = recall_threshold
        self.min_freq = min_freq # for filtering labels
        self.mlp_wrapper = None
        self.clf = MLPClassifier(activation=activation,
                                 alpha=alpha,
                                 early_stopping=early_stopping,
                                 epsilon=epsilon,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 learning_rate=learning_rate,
                                 learning_rate_init=learning_rate_init,
                                 max_iter=max_iter,
                                 momentum=momentum,
                                 n_iter_no_change=n_iter_no_change,
                                 random_state=random_state,
                                 solver=solver,
                                 validation_fraction=validation_fraction)
        self.all_labels = None
        self.probability_thresholds = None
        self.load_yaml(owner, repo)
        self.exec = self.create_execution(workspace_name=workspace_name)
    def load_yaml(self, owner, repo):
        config = RepoConfig(owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name
        self.model_bucket_name = config.model_bucket_name
        self.model_file = config.model_local_path
        self.model_dest = config.model_gcs_path
        self.labels_file = config.labels_local_path
        self.labels_dest = config.labels_gcs_path
        self.embeddings_bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path
        
        # TODO(chunhsiang): need to be able to train on multiple repos which
        # should be defined in the yaml config
        # for now, only train model on the repo installed
        self.trained_repos = [f'{self.repo_owner}/{self.repo_name}']
    # TODO(jlewi): Delete this code?
    def download_embeddings_from_gcs(self):
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.embeddings_bucket_name)
        blob = bucket.get_blob(self.embeddings_dest)
        with open(self.embeddings_file, 'wb') as f:
            blob.download_to_file(f)
    def load_training_data(self):
        self.download_embeddings_from_gcs()
        with open(self.embeddings_file, 'rb') as f:
            data = dpickle.load(f)
        # filter labels
        c = Counter()
        for lbls in data['labels']:
            c.update(lbls)
        self.all_labels = [x for x in c if c[x] >= self.min_freq]
        X = []
        y = []
        for emb, lbls in zip(data['features'], data['labels']):
            mask = [self.all_labels.index(x) for x in lbls if c[x] >= self.min_freq]
            if not mask:
                continue
            zer = np.zeros(len(self.all_labels))
            zer[mask] = 1
            y.append(zer)
            X.append(emb)
        return X, y
    def train(self):
        X, y = self.load_training_data()
        self.mlp_wrapper = MLPWrapper(clf=self.clf,
                                      precision_threshold=self.precision_threshold,
                                      recall_threshold=self.recall_threshold)
        # TODO(jlewi): find_probability_thresholds splits the data into test and
        # training sets and then calls fit. Why are we then calling fit again?
        # Is it just so the final model is trained on the whole dataset?
        # Get the probability thresholds before `fit` because fit overwrites the classifier.
        self.mlp_wrapper.find_probability_thresholds(X, y)
        self.probability_thresholds = self.mlp_wrapper.probability_thresholds
        # train model using the whole data
        self.mlp_wrapper.fit(X, y)
        self.save_model()
        # store model artifacts using kubeflow metadata
        model_name = ','.join(sorted(self.trained_repos))
        model_uri = f'gs://{self.model_bucket_name}/{self.model_dest}'
        # put all the repo names as the label keys
        model_labels = {r:'' for r in self.trained_repos}
        self.exec.log_output(metadata.Model(
            name=model_name,
            uri=model_uri,
            labels=model_labels))
    def save_model(self):
        self.mlp_wrapper.save_model(model_file=self.model_file)
        # dump label columns for prediction
        thresholds = {}
        for i in self.probability_thresholds:
            if self.probability_thresholds[i]:
                thresholds[i] = float(self.probability_thresholds[i])
            else:
                thresholds[i] = None
        label_dict = {
            'labels': self.all_labels,
            'probability_thresholds': thresholds
        }
        with open(self.labels_file, 'w') as f:
            yaml.dump(label_dict, f)
        self.upload_model_to_gcs()
    def upload_model_to_gcs(self):
        # upload model
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.model_dest)
        blob.upload_from_filename(self.model_file)
        # upload label columns
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.labels_dest)
        blob.upload_from_filename(self.labels_file)
    def create_execution(self, workspace_name):
        """
        Return a metadata execution object in a workspace and
        a run for logging.
        Args:
          workspace_name: workspace name, str
        """
        workspace = metadata.Workspace(
            # connect to the metadata-service in the kubeflow namespace of the k8s cluster.
            backend_url_prefix='metadata-service.kubeflow:8080',
            name=workspace_name,
            description='workspace for model training artifacts and executions')
        
        run = metadata.Run(
            workspace=workspace,
            name='run-' + datetime.datetime.utcnow().isoformat('T'))
        return metadata.Execution(
            name='execution-' + datetime.datetime.utcnow().isoformat('T'),
            workspace=workspace,
            run=run)
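For reference, here is a minimal sketch of how a prediction service could load the labels file written by save_model and apply the per-label thresholds. The local path and the probabilities vector are hypothetical; the file layout (a 'labels' list plus a 'probability_thresholds' dict keyed by label index) matches what save_model writes above.
In [ ]:
    
# Hypothetical local copy of the labels file that save_model uploads to GCS.
with open("labels.yaml") as f:
    label_dict = yaml.safe_load(f)

def labels_above_threshold(probabilities, label_dict):
    """Return label names whose predicted probability clears that label's threshold.

    A None threshold means the label was excluded because no threshold met both
    the precision and recall targets, so that label is never predicted.
    """
    thresholds = label_dict['probability_thresholds']
    predicted = []
    for i, name in enumerate(label_dict['labels']):
        if thresholds[i] is not None and probabilities[i] >= thresholds[i]:
            predicted.append(name)
    return predicted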
    
In [3]:
    
r = RepoMLP(workspace_name='ws1', owner='kubeflow', repo='examples')
    
In [4]:
    
r.train()
    
Kubeflow Fairing is a Python package that makes it easier to train and deploy machine learning models on Kubeflow.
Here, we use a Kubeflow Fairing preprocessor to convert the notebook into a Python script and create an entry point for that script. After preprocessing the notebook, training can be run from the command line:
$ python repo_mlp.py train
In [5]:
    
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire
    
In [6]:
    
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP')
# Supporting files to include alongside the generated script
input_files = ['mlp.py', 'repo_config.py']
preprocessor.input_files = set([os.path.normpath(f) for f in input_files])
preprocessor.preprocess()
    
    Out[6]:
In [ ]: