The notebook assumes you have already computed the embeddings and stored them on GCS.
TODO(jlewi): I last ran this notebook in gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
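If you want to sanity-check that assumption first, the cell below is a minimal sketch (the gcs_blob_exists helper is not part of the original notebook) that verifies the embeddings object exists on GCS using the google.cloud.storage client imported later.
In [ ]:
# Minimal sketch (not part of the original notebook): confirm the precomputed
# embeddings object exists on GCS before running the rest of the notebook.
from google.cloud import storage

def gcs_blob_exists(gcs_uri):
    """Return True if a gs://bucket/path URI points at an existing object."""
    bucket_name, blob_name = gcs_uri[len("gs://"):].split("/", 1)
    client = storage.Client()
    return client.bucket(bucket_name).blob(blob_name).exists(client)

# For example, with the embeddings file used below:
# gcs_blob_exists("gs://repo-embeddings/kubeflow/2020_0428/kubeflow_issue_embeddings_2020-04-11T17:15:10.000876-07:00.hdf5")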
In [32]:
# URL of the trained language model
LANGUAGE_MODEL_URL = 'gs://issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
In [1]:
import logging
import os
from pathlib import Path
from importlib import reload
import sys
import notebook_setup
notebook_setup.setup()
In [2]:
# Install mlmd sdk
!pip install --user "git+git://github.com/kubeflow/metadata.git#egg=kfmd&subdirectory=sdk/python"
In [65]:
# fairing:include-cell
import sys
from label_microservice.repo_config import RepoConfig
from label_microservice.mlp import MLPWrapper
from sklearn.neural_network import MLPClassifier
import dill as dpickle
import os
import yaml
from google.cloud import storage
import requests
import json
import numpy as np
from passlib.apps import custom_app_context as pwd_context
from collections import Counter
from kubeflow import metadata
import datetime
import logging
import pandas as pd
In [6]:
from code_intelligence import gcs_util
embeddings_file = "gs://repo-embeddings/kubeflow/2020_0428/kubeflow_issue_embeddings_2020-04-11T17:15:10.000876-07:00.hdf5"
name = os.path.basename(embeddings_file)
home = str(Path.home())  # assumption: data is staged under the user's home directory
data_dir = os.path.join(home, "data")

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

local_file = os.path.join(data_dir, name)

if not os.path.exists(local_file):
    gcs_util.copy_file(embeddings_file, local_file)
else:
    logging.info(f"File {local_file} already exists")
In [7]:
import h5py
h5_file = h5py.File(local_file, mode="r")
In [8]:
issue_embeddings = h5_file["issue_embeddings"][:]
issues = pd.read_hdf(local_file, "issues")
In [9]:
label_counts = Counter()
for r in range(issues.shape[0]):
    label_counts.update(issues.iloc[r]["parsed_labels"])
In [10]:
#label_counts_df = pd.DataFrame({"label": label_counts.keys(), "count": label_counts.values()})
label_counts_df = pd.DataFrame(label_counts.items(), columns=["label", "count"])
In [11]:
label_counts_df.sort_values("count", ascending=False, inplace=True)
In [12]:
label_counts_df["index"] = range(label_counts_df.shape[0])
In [13]:
# Create a bar chart with the x-axis sorted by the values
import altair as alt
bars = alt.Chart(label_counts_df).mark_bar().encode(x=alt.X("label", sort="-y"), y=alt.Y('count'))
bars.interactive()
Out[13]:
In [ ]:
issues["parsed_labels"]
In [14]:
count_cutoff = 30
target_labels = label_counts_df[label_counts_df["count"] >= count_cutoff]["label"]
exclude_prefixes = ["lifecycle", "status"]
def keep_label(l):
    for p in exclude_prefixes:
        if l.startswith(p):
            return False
    return True
target_labels = target_labels[target_labels.apply(keep_label)]
target_labels.values.sort()
In [15]:
label_to_index = dict(zip(target_labels.values, range(target_labels.shape[0])))
In [16]:
import numpy as np
num_labels = target_labels.shape[0]
def hot_encoded(x):
    d = np.zeros([1, num_labels])
    for l in x:
        if l not in label_to_index:
            continue
        d[0, label_to_index[l]] = 1
    return d
In [17]:
issue_hot_encoded = issues["parsed_labels"].apply(hot_encoded)
issue_hot_encoded = np.concatenate(issue_hot_encoded)
In [20]:
from label_microservice import mlp
In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(issue_embeddings, issue_hot_encoded, test_size=test_size, random_state=1234)
In [117]:
precision_threshold = 0.7
recall_threshold = 0.5
workspace_name = 'train'
min_freq = 25
activation = 'relu'
alpha = 0.0001
early_stopping = True
epsilon = 1e-08
hidden_layer_sizes = (600, 600)
learning_rate = 'adaptive'
learning_rate_init = 0.001
max_iter = 3000
momentum = 0.9
n_iter_no_change = 5
random_state = 1234
solver = 'adam'
validation_fraction = 0.1

clf = MLPClassifier(activation=activation,
                    alpha=alpha,
                    early_stopping=early_stopping,
                    epsilon=epsilon,
                    hidden_layer_sizes=hidden_layer_sizes,
                    learning_rate=learning_rate,
                    learning_rate_init=learning_rate_init,
                    max_iter=max_iter,
                    momentum=momentum,
                    n_iter_no_change=n_iter_no_change,
                    random_state=random_state,
                    solver=solver,
                    validation_fraction=validation_fraction)
In [121]:
# Set class labels
clf.classes_ = target_labels
In [23]:
clf.fit(X_train, y_train)
Out[23]:
In [24]:
mlp_predictions = clf.predict_proba(X_train)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_train, target_labels)
In [25]:
mlp_predictions = clf.predict_proba(X_test)
mlp_df, mlp_auc = mlp.calculate_auc(mlp_predictions, y_test, target_labels)
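For reference, per-label AUC numbers like those reported by mlp.calculate_auc can also be computed directly with sklearn. The cell below is a minimal sketch under the assumption that we want one ROC AUC per label column; it is not the label_microservice implementation, and the per_label_auc helper is illustrative.
In [ ]:
# Hedged sketch: per-label ROC AUC computed directly with sklearn.
from sklearn.metrics import roc_auc_score

def per_label_auc(probs, y_true, labels):
    """Compute ROC AUC per label column, skipping labels with no positive examples."""
    rows = []
    for j, label in enumerate(labels):
        if y_true[:, j].sum() == 0:  # AUC is undefined without positive examples
            continue
        rows.append({"label": label, "auc": roc_auc_score(y_true[:, j], probs[:, j])})
    auc_df = pd.DataFrame(rows)
    return auc_df, auc_df["auc"].mean()

# Example usage with the same inputs as the cell above:
# auc_df, mean_auc = per_label_auc(mlp_predictions, y_test, target_labels)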
In [49]:
from code_intelligence import util as code_intelligence_util
from code_intelligence import embeddings
from code_intelligence import graphql
from code_intelligence import inference
In [310]:
reload(embeddings)
Out[310]:
In [29]:
data_dir
Out[29]:
In [31]:
LANGUAGE_MODEL_URL
Out[31]:
In [36]:
def pass_through(x):
    return x

# TODO(jlewi): We should download the file if the local file doesn't exist
local_model_file = os.path.basename(LANGUAGE_MODEL_URL)
# model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
local_model_dir = os.path.join(data_dir, "language_model")

if not os.path.exists(os.path.join(local_model_dir, local_model_file)):
    if not os.path.exists(local_model_dir):
        os.makedirs(local_model_dir)
    gcs_util.copy_from_gcs(LANGUAGE_MODEL_URL, os.path.join(local_model_dir, local_model_file))
inference_wrapper = inference.InferenceWrapper(model_path=local_model_dir, model_file_name=local_model_file)
In [37]:
if not os.getenv("GITHUB_TOKEN"):
    raise ValueError("No GitHub token specified")
else:
    gh_client = graphql.GraphQLClient()
In [53]:
issue_url = "https://github.com/kubeflow/kubeflow/issues/4972"
issue_dict = embeddings.get_issue(issue_url, gh_client)
dict_for_embeddings = inference_wrapper.process_dict(issue_dict)
embedding_data = inference_wrapper.get_pooled_features(dict_for_embeddings['text']).detach().cpu().numpy()
predictions = clf.predict_proba(embedding_data)
p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
p.sort_values("probabilities", ascending=False)
Out[53]:
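It can be convenient to wrap the steps above into a small helper. The sketch below is just the previous cell repackaged as a function; it assumes gh_client, inference_wrapper, clf, and target_labels are already defined as in this notebook, and the predict_issue_labels name is illustrative.
In [ ]:
# Convenience wrapper around the single-issue prediction steps shown above.
def predict_issue_labels(issue_url):
    """Fetch an issue, embed its title/body, and return labels sorted by probability."""
    issue_dict = embeddings.get_issue(issue_url, gh_client)
    processed = inference_wrapper.process_dict(issue_dict)
    features = inference_wrapper.get_pooled_features(processed["text"]).detach().cpu().numpy()
    probs = clf.predict_proba(features)
    return pd.DataFrame({"probabilities": probs[0, :], "labels": target_labels}).sort_values(
        "probabilities", ascending=False)

# Example:
# predict_issue_labels("https://github.com/kubeflow/kubeflow/issues/4972")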
In [124]:
from sklearn.metrics import precision_recall_curve
In [350]:
# Select the points in the test set that have one of the labels of interest
label_indexes = []

for i in range(target_labels.size):
    name = target_labels.iloc[i]
    keep_label = False
    for p in ["area", "platform"]:
        if name.startswith(p):
            keep_label = True
    if keep_label:
        label_indexes.append(i)
In [351]:
has_label_of_interest = np.sum(y_test[:, label_indexes], axis=1) > 0
X_test_of_interest = X_test[has_label_of_interest, :]
y_test_of_interest = y_test[has_label_of_interest, :]
In [356]:
#y_pred = clf.predict_proba(X_test)
y_pred = clf.predict_proba(X_test_of_interest)
# Choose a ridiculously low precision_threshold; otherwise most labels will end up without a
# threshold and we will never generate predictions for those labels. Arguably, if we
# erroneously start applying labels to issues, those will hopefully get corrected by humans
# and we can eventually use that feedback to improve the model.
precision_threshold = .3
recall_threshold = .25

probability_thresholds = {}
precisions = {}
recalls = {}

results = pd.DataFrame({"label": target_labels, "precision": [None] * target_labels.size,
                        "recall": [None] * target_labels.size})

# Default to a threshold of 1 so that the label will never be applied
label_thresholds = np.ones(target_labels.size)

for label in range(target_labels.size):
    # find the best probability threshold for this label
    best_precision, best_recall, best_threshold = 0.0, 0.0, 1
    #precision, recall, threshold = precision_recall_curve(y_test[:, label], y_pred[:, label])
    precision, recall, threshold = precision_recall_curve(y_test_of_interest[:, label], y_pred[:, label])
    results["precision"].iloc[label] = precision
    results["recall"].iloc[label] = recall
    for prec, reca, thre in zip(precision[:-1], recall[:-1], threshold):
        # precision and recall must each meet their respective thresholds
        if prec >= precision_threshold and reca >= recall_threshold:
            # choose the threshold with the higher precision
            if prec > best_precision:
                best_precision = prec
                best_recall = reca
                best_threshold = thre
    # probability_thresholds is a dict {label_index: probability_threshold}.
    # If a label never satisfies both the precision and recall thresholds, its
    # threshold stays at 1, so that label is effectively never predicted.
    label_thresholds[label] = best_threshold
    probability_thresholds[label] = best_threshold
    precisions[label] = best_precision
    recalls[label] = best_recall
In [ ]:
## Plot Precision and recall for various labels
In [353]:
# TODO(jlewi): How to do multiple labels on the same graph
# Subsample, otherwise we get too many points to plot
subsample = 10

rows = []
labels_of_interest = ["area/jupyter", "area/kfctl", "area/engprod", "area/docs", "area/kustomize", "platform/gcp", "platform/aws"]

for l in labels_of_interest:
    selector = results["label"] == l
    index = np.where(selector.values)[0][0]
    row = pd.DataFrame({"precision": results.iloc[index]["precision"][::subsample],
                        "recall": results.iloc[index]["recall"][::subsample],
                        "label": l})
    rows.append(row)

roc_points = pd.concat(rows)
alt.Chart(roc_points).mark_line().encode(x="recall", y="precision", color="label").interactive()
Out[353]:
In [354]:
probability_thresholds
Out[354]:
In [300]:
# Count how many examples we have in the test set for the above labels
y_label_counts = pd.Series(y_test.sum(axis=0), index=target_labels)
y_label_counts.loc[labels_of_interest]
Out[300]:
In [308]:
# Compute fraction of issues with the labels
y_label_counts.loc[labels_of_interest] / y_test.shape[0]
Out[308]:
In [54]:
from code_intelligence import github_bigquery
import subprocess
# TODO(jlewi): Get the project using fairing?
PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
In [103]:
reload(github_bigquery)
# Fetch recent issues
recent_issues = github_bigquery.get_issues("kubeflow", PROJECT, max_age_days=14)
In [106]:
input_data = recent_issues[["title", "body"]]
recent_embeddings = inference_wrapper.df_to_embedding(input_data)
In [114]:
recent_predictions = clf.predict_proba(recent_embeddings)
#p = pd.DataFrame({"probabilities": predictions[0, :], "labels": target_labels})
#p.sort_values("probabilities", ascending=False)
In [370]:
np.sign([2, 0, -1])
Out[370]:
In [378]:
recent_labels = (np.sign(recent_predictions - np.tile(label_thresholds, [recent_predictions.shape[0], 1]))+1) / 2
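The sign trick above maps probabilities above a label's threshold to 1 and below to 0. An equivalent, arguably more readable formulation uses a broadcasted comparison; the two differ only for probabilities exactly equal to the threshold, where the sign trick yields 0.5. The recent_labels_alt name below is illustrative.
In [ ]:
# Equivalent thresholding via broadcasting: label_thresholds has shape (num_labels,)
# and broadcasts across the rows of recent_predictions.
recent_labels_alt = (recent_predictions > label_thresholds).astype(float)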
In [397]:
recent_labels_df=pd.DataFrame(recent_labels, columns=target_labels)
predicted_label_counts = recent_labels_df.sum(axis=0)
In [411]:
predicted_label_counts.sort_values(ascending=False)
Out[411]:
In [414]:
recent_labels_df.shape
Out[414]:
In [ ]:
* Print out issue titles and labels
* This is for qualitative analysis
In [408]:
from IPython.core.display import display, HTML

for i in range(recent_issues.shape[0]):
    title = recent_issues.iloc[i]["title"]
    url = recent_issues.iloc[i]["html_url"]
    predicted_labels = target_labels[recent_labels[i, :] > 0]
    names = ", ".join(predicted_labels)
    display(HTML(f"Issue: <a href='{url}'>{title}</a> {names}"))
In [5]:
# fairing:include-cell
class RepoMLP(object):
    """RepoMLP is a helper class for working with the sklearn multi-layer perceptron.

    It provides wrapper code to train the sklearn multi-layer perceptron on a repository's
    issue embeddings.

    TODO(jlewi): This is a wrapper around MLPWrapper, which is itself a wrapper around
    the sklearn MLPClassifier.
    """
    def __init__(self,
                 owner=None,
                 repo=None,
                 precision_threshold=0.7,
                 recall_threshold=0.5,
                 workspace_name='train',
                 min_freq=25,
                 activation='relu',
                 alpha=0.0001,
                 early_stopping=True,
                 epsilon=1e-08,
                 hidden_layer_sizes=(600, 600),
                 learning_rate='adaptive',
                 learning_rate_init=0.001,
                 max_iter=3000,
                 momentum=0.9,
                 n_iter_no_change=5,
                 random_state=1234,
                 solver='adam',
                 validation_fraction=0.1):
        self.precision_threshold = precision_threshold
        self.recall_threshold = recall_threshold
        self.min_freq = min_freq  # for filtering labels
        self.mlp_wrapper = None
        self.clf = MLPClassifier(activation=activation,
                                 alpha=alpha,
                                 early_stopping=early_stopping,
                                 epsilon=epsilon,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 learning_rate=learning_rate,
                                 learning_rate_init=learning_rate_init,
                                 max_iter=max_iter,
                                 momentum=momentum,
                                 n_iter_no_change=n_iter_no_change,
                                 random_state=random_state,
                                 solver=solver,
                                 validation_fraction=validation_fraction)
        self.all_labels = None
        self.probability_thresholds = None
        self.load_yaml(owner, repo)
        self.exec = self.create_execution(workspace_name=workspace_name)

    def load_yaml(self, owner, repo):
        config = RepoConfig(owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name
        self.model_bucket_name = config.model_bucket_name
        self.model_file = config.model_local_path
        self.model_dest = config.model_gcs_path
        self.labels_file = config.labels_local_path
        self.labels_dest = config.labels_gcs_path
        self.embeddings_bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path
        # TODO(chunhsiang): need to be able to train on multiple repos, which
        # should be defined in the yaml config.
        # For now, only train the model on the repo where the bot is installed.
        self.trained_repos = [f'{self.repo_owner}/{self.repo_name}']

    # TODO(jlewi): Delete this code?
    def download_embeddings_from_gcs(self):
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.embeddings_bucket_name)
        blob = bucket.get_blob(self.embeddings_dest)
        with open(self.embeddings_file, 'wb') as f:
            blob.download_to_file(f)

    def load_training_data(self):
        self.download_embeddings_from_gcs()
        with open(self.embeddings_file, 'rb') as f:
            data = dpickle.load(f)

        # filter labels by frequency
        c = Counter()
        for lbls in data['labels']:
            c.update(lbls)
        self.all_labels = [x for x in c if c[x] >= self.min_freq]

        X = []
        y = []
        for emb, lbls in zip(data['features'], data['labels']):
            mask = [self.all_labels.index(x) for x in lbls if c[x] >= self.min_freq]
            if mask == []:
                continue
            zer = np.zeros(len(self.all_labels))
            zer[mask] = 1
            y.append(zer)
            X.append(emb)
        return X, y

    def train(self):
        X, y = self.load_training_data()
        self.mlp_wrapper = MLPWrapper(clf=self.clf,
                                      precision_threshold=self.precision_threshold,
                                      recall_threshold=self.recall_threshold)
        # TODO(jlewi): find_probability_thresholds splits the data into test and
        # training sets and then calls fit. Why are we then calling fit again?
        # Is it just so the final model is trained on the whole dataset?
        # Get the probability thresholds before `fit` because it overwrites the classifier.
        self.mlp_wrapper.find_probability_thresholds(X, y)
        self.probability_thresholds = self.mlp_wrapper.probability_thresholds

        # train the model using the whole dataset
        self.mlp_wrapper.fit(X, y)
        self.save_model()

        # store model artifacts using kubeflow metadata
        model_name = ','.join(sorted(self.trained_repos))
        model_uri = f'gs://{self.model_bucket_name}/{self.model_dest}'
        # put all the repo names as the label keys
        model_labels = {r: '' for r in self.trained_repos}
        self.exec.log_output(metadata.Model(
            name=model_name,
            uri=model_uri,
            labels=model_labels))

    def save_model(self):
        self.mlp_wrapper.save_model(model_file=self.model_file)
        # dump label columns for prediction
        thresholds = {}
        for i in self.probability_thresholds:
            if self.probability_thresholds[i]:
                thresholds[i] = float(self.probability_thresholds[i])
            else:
                thresholds[i] = None
        label_dict = {
            'labels': self.all_labels,
            'probability_thresholds': thresholds,
        }
        with open(self.labels_file, 'w') as f:
            yaml.dump(label_dict, f)
        self.upload_model_to_gcs()

    def upload_model_to_gcs(self):
        # upload the model
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.model_dest)
        blob.upload_from_filename(self.model_file)

        # upload the label columns
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        blob = bucket.blob(self.labels_dest)
        blob.upload_from_filename(self.labels_file)

    def create_execution(self, workspace_name):
        """Return a metadata execution object in a workspace and a run for logging.

        Args:
          workspace_name: workspace name, str
        """
        workspace = metadata.Workspace(
            # connect to the metadata-service in the kubeflow namespace of the k8s cluster
            backend_url_prefix='metadata-service.kubeflow:8080',
            name=workspace_name,
            description='workspace for model training artifacts and executions')
        run = metadata.Run(
            workspace=workspace,
            name='run-' + datetime.datetime.utcnow().isoformat('T'))
        return metadata.Execution(
            name='execution-' + datetime.datetime.utcnow().isoformat('T'),
            workspace=workspace,
            run=run)
In [3]:
r = RepoMLP(workspace_name='ws1', owner='kubeflow', repo='examples')
In [4]:
r.train()
Kubeflow Fairing is a Python package that makes it easier to train and deploy machine learning models on Kubeflow.
Here we use the Kubeflow Fairing preprocessor to convert this notebook into a Python script and to create an entry point for that script. After preprocessing the notebook, we can run training from the command line like this:
$ python repo_mlp.py train
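The generated repo_mlp.py contains the cells tagged fairing:include-cell plus a Python Fire entry point. Roughly, and only as an illustrative sketch of what the preprocessor emits rather than the exact generated file, the tail of the script looks like:

# Illustrative sketch of the Fire entry point the preprocessor appends, which is
# what makes `python repo_mlp.py train` dispatch to RepoMLP.train().
import fire
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fire.Fire(RepoMLP)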
In [5]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire
In [6]:
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP')
if not preprocessor.input_files:
    preprocessor.input_files = set()
input_files = ['mlp.py', 'repo_config.py']
preprocessor.input_files = set([os.path.normpath(f) for f in input_files])
preprocessor.preprocess()
Out[6]:
In [ ]: