Analyze Issue Label Bot

  • This notebook computes metrics to evaluate the performance of the issue label bot.

In [1]:
import altair as alt
import collections
import importlib
import logging
import sys
import os
import datetime
from dateutil import parser as dateutil_parser
import glob
import json
import numpy as np
import pandas as pd
from pandas.io import gbq

# A bit of a hack to set the path correctly
sys.path = [os.path.abspath(os.path.join(os.getcwd(), "..", "..", "py"))] + sys.path

logging.basicConfig(level=logging.INFO,
                    format=('%(levelname)s|%(asctime)s'
                            '|%(message)s|%(pathname)s|%(lineno)d|'),
                    datefmt='%Y-%m-%dT%H:%M:%S',
                    )
logging.getLogger().setLevel(logging.INFO)
alt.renderers.enable('html')


Out[1]:
RendererRegistry.enable('html')

In [2]:
import getpass
import subprocess
# Configuration Variables. Modify as desired.

PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()

Setup Authorization

If you are using a service account, activate it first. Kubeflow provides the key file path in ${GOOGLE_APPLICATION_CREDENTIALS}:

gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

If you are running with user credentials, run:

gcloud auth application-default login
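
To confirm that credentials resolve before querying, you can run a quick check (a minimal sketch, assuming the google-auth package is installed):

import google.auth

# Resolve application-default credentials and the project they are bound to
credentials, project = google.auth.default()
print(f"Credentials loaded for project: {project}")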

Query BigQuery

  • We need to query BigQuery to get the issues where we added predictions.

In [3]:
query = """
SELECT
  timestamp,
  jsonPayload.repo_owner,
  jsonPayload.repo_name,
  CAST(jsonPayload.issue_num AS NUMERIC) AS issue_num,
  jsonPayload.predictions
FROM `issue-label-bot-dev.issue_label_bot_logs_dev.stderr_*`
WHERE jsonPayload.message = "Add labels to issue."
  AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, DAY) <= 28
"""

labeled = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)


Downloading: 100%|██████████| 3791/3791 [00:01<00:00, 2651.61rows/s]
INFO|2020-06-28T01:42:14|Total time taken 7.9 s.
Finished at 2020-06-28 01:42:14.|/home/jovyan/.local/lib/python3.6/site-packages/pandas_gbq/gbq.py|378|
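
Each row's predictions field is a mapping from label name to the model's score for that label (structure inferred from how it is used below); a quick peek at one row:

# Inspect a single labeled issue and its raw predictions dict
print(labeled[["repo_owner", "repo_name", "issue_num"]].head(1))
print(labeled["predictions"][0])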

In [4]:
# Count how many times each label was added
label_counts = collections.defaultdict(int)

In [5]:
# Tally how many times each label was predicted across all the labeled issues
for i in range(labeled.shape[0]):
    predictions = labeled["predictions"][i]

    if not predictions:
        continue

    for l in predictions:
        label_counts[l] += 1
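
As a quick sanity check, the most common predictions can be listed (a minimal sketch using the label_counts tally above):

# Print the five most frequently predicted labels and their counts
for label, count in sorted(label_counts.items(), key=lambda kv: -kv[1])[:5]:
    print(label, count)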

In [6]:
# Now for each issue count whether a particular label is added
issue_labels = pd.DataFrame(index=range(labeled.shape[0]), columns=label_counts.keys())
issue_labels = issue_labels.fillna(0)

for c in ["repo_owner", "repo_name", "issue_num"]:
    issue_labels[c] = labeled[c]

for i in range(labeled.shape[0]):
    predictions = labeled["predictions"][i]
    
    if not predictions:
        continue
    
    for l, p in predictions.items():
        if not p:
            continue
            
        issue_labels.at[i, l] = 1
        
# Deduplicate the rows
# We need to group by (repo_owner, repo_name, issue_num); we should take the max of each column
# as a way of dealing with duplicates
issue_labels = issue_labels.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).max()
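
To see why max is the right aggregation, consider a toy example with a duplicated issue (hypothetical data):

toy = pd.DataFrame({
    "repo_owner": ["kubeflow", "kubeflow"],
    "repo_name": ["examples", "examples"],
    "issue_num": [1, 1],
    "area_jupyter": [1, 0],
})
# max keeps the label if any duplicate row had it set
print(toy.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).max())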

In [7]:
# Create a mapping from label prefixes to all the labels with that prefix
# e.g. area -> ["area_jupyter", "area_kfctl", ...]

label_prefixes = collections.defaultdict(list)

for l in label_counts.keys():
    pieces = l.split("_")
    if len(pieces) <= 1:
        continue

    label_prefixes[pieces[0]].append(l)
    
# Add remappings.
# The log entries associated with "Add labels to issue." log the model predictions before label remapping
# is applied; i.e. before feature is remapped to kind/feature.
# So we want to apply those mappings here before computing the stats.
#
# TODO(https://github.com/kubeflow/code-intelligence/issues/109): We should arguably load these from
# the YAML files configuring label bot.
for l in ["bug", "feature", "feature_request", "question"]:
    if l not in label_counts.keys():
        continue
    label_prefixes["kind"] = label_prefixes["kind"] + [l]

In [8]:
# For each issue, aggregate across all labels sharing a prefix to see whether the
# issue has at least one label with that prefix
issue_group_labels = pd.DataFrame(index=range(issue_labels.shape[0]), columns=label_prefixes.keys())
issue_group_labels = issue_group_labels.fillna(0)

for c in ["repo_owner", "repo_name", "issue_num"]:
    issue_group_labels[c] = issue_labels[c]

for prefix, labels in label_prefixes.items():
    issue_group_labels[prefix] = issue_labels[labels].max(axis=1)
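
The row-wise max over 0/1 indicator columns acts as a logical OR; a toy illustration with hypothetical labels:

toy = pd.DataFrame({"area_jupyter": [1, 0], "area_kfctl": [0, 0]})
# 1 if the issue has at least one area_* label, 0 otherwise
print(toy.max(axis=1))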

In [9]:
# Compute the number of issues with at least one of the specified prefixes
rows = ["area", "platform", "kind"]
num_issues = issue_group_labels.shape[0]
counts = issue_group_labels[rows].sum(axis=0)
stats = pd.DataFrame(index=range(len(rows)), columns=["label", "count", "percentage"])
stats["label"] = counts.index
stats["count"] = counts.values
stats["percentage"] = stats["count"] / num_issues * 100

In [10]:
print(f"Total # of issues {num_issues}")
print("Number and precentage of issues with labels with various prefixes")
stats


Total # of issues 569
Number and percentage of issues with each label prefix
Out[10]:
      label  count  percentage
0      area    323   56.766257
1  platform     61   10.720562
2      kind    532   93.497364

In [11]:
chart = alt.Chart(stats)
chart.mark_point().encode(
  x='label',
  y='count',
).interactive()


Out[11]:
[Altair point chart: count per label prefix]
Number of Issues Labeled Per Day

  • Make a plot of the number of issues labeled each day

In [12]:
issues_per_day = labeled[["timestamp", "repo_owner", "repo_name", "issue_num"]]
# Deduplicate the issues by taking the first (earliest) entry
issues_per_day = issues_per_day.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).min()
# Truncate each timestamp to the day it falls on
issues_per_day["day"] = issues_per_day["timestamp"].apply(lambda x: datetime.datetime(x.year, x.month, x.day))
# Copy the slice so adding the counter column doesn't trigger SettingWithCopyWarning
issue_counts = issues_per_day[["day"]].copy()
issue_counts["num_issues"] = 1
issue_counts = issue_counts.groupby(["day"], as_index=False).sum()
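
An equivalent vectorized way to truncate timestamps to the day (assuming timestamp comes back from read_gbq as a datetime64 column; note this keeps the timezone, unlike the datetime.datetime(...) construction above):

issues_per_day["day"] = issues_per_day["timestamp"].dt.floor("D")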



In [13]:
chart = alt.Chart(issue_counts)
line = chart.mark_line().encode(
  x=alt.X('day'),
  y=alt.Y('num_issues'),
)

point = line + line.mark_point()

point.interactive()


Out[13]:
[Altair line + point chart: number of issues labeled per day]