In [1]:
import altair as alt
import collections
import importlib
import logging
import sys
import os
import datetime
from dateutil import parser as dateutil_parser
import glob
import json
import numpy as np
import pandas as pd
from pandas.io import gbq
# A bit of a hack to set the path correctly
sys.path = [os.path.abspath(os.path.join(os.getcwd(), "..", "..", "py"))] + sys.path
logging.basicConfig(level=logging.INFO,
                    format=('%(levelname)s|%(asctime)s'
                            '|%(message)s|%(pathname)s|%(lineno)d|'),
                    datefmt='%Y-%m-%dT%H:%M:%S',
                    )
logging.getLogger().setLevel(logging.INFO)
alt.renderers.enable('html')
Out[1]:
In [2]:
import getpass
import subprocess
# Configuration Variables. Modify as desired.
PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
In [3]:
query = """
SELECT
timestamp,
jsonPayload.repo_owner,
jsonPayload.repo_name,
cast(jsonPayload.issue_num as numeric) as issue_num,
jsonPayload.predictions
FROM `issue-label-bot-dev.issue_label_bot_logs_dev.stderr_*`
where jsonPayload.message = "Add labels to issue."
and timestamp_diff(current_timestamp(), timestamp, day) <=28
"""
labeled=gbq.read_gbq(str(query), dialect='standard', project_id=PROJECT)
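If BigQuery access is unavailable, the downstream cells can be exercised against a small hand-built frame with the same schema. The frame below is purely illustrative (made-up repos, issue numbers, and scores, not real log data); the prediction dicts mirror the jsonPayload.predictions structure the later cells assume: label name mapped to a probability, or None when there is no score.
In [ ]:
# Hypothetical sample with the same columns as the query result, for offline testing only.
labeled_sample = pd.DataFrame({
    "timestamp": pd.to_datetime(["2020-03-01T12:00:00", "2020-03-02T08:30:00"]),
    "repo_owner": ["kubeflow", "kubeflow"],
    "repo_name": ["kubeflow", "kfctl"],
    "issue_num": [101, 202],
    "predictions": [{"area_jupyter": 0.9, "bug": 0.7}, {"feature": None}],
})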
In [4]:
# Count how many times each label was added
label_counts = collections.defaultdict(lambda: 0)
In [5]:
# We need to compute the number of issues that got labeled with an area or kind label
results = pd.DataFrame(index=range(labeled.shape[0]), columns=["area", "kind"])
results = results.fillna(0)

for i in range(labeled.shape[0]):
    predictions = labeled["predictions"][i]

    if not predictions:
        continue

    # Loop over the predictions and count how often each label occurs
    for l, p in predictions.items():
        label_counts[l] = label_counts[l] + 1
In [6]:
# Now for each issue record whether a particular label is added
issue_labels = pd.DataFrame(index=range(labeled.shape[0]), columns=label_counts.keys())
issue_labels = issue_labels.fillna(0)

for c in ["repo_owner", "repo_name", "issue_num"]:
    issue_labels[c] = labeled[c]

for i in range(labeled.shape[0]):
    predictions = labeled["predictions"][i]

    if not predictions:
        continue

    for l, p in predictions.items():
        if not p:
            continue
        issue_labels.at[i, l] = 1

# Deduplicate the rows.
# We group by (repo_owner, repo_name, issue_num) and take the max of each column
# as a way of dealing with duplicate log entries for the same issue.
issue_labels = issue_labels.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).max()
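As a sanity check of the dedup semantics: groupby(...).max() keeps a 1 in each label column whenever any of an issue's duplicate rows had that label. A toy illustration with made-up rows:
In [ ]:
# Two log entries for the same issue; after dedup the single remaining row has
# a 1 in both label columns.
_demo = pd.DataFrame({
    "repo_owner": ["kubeflow", "kubeflow"],
    "repo_name": ["kfctl", "kfctl"],
    "issue_num": [1, 1],
    "area_kfctl": [1, 0],
    "bug": [0, 1],
})
_demo.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).max()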
In [7]:
# Create a mapping from label prefixes to all the labels with that prefix,
# e.g. area -> ["area_jupyter", "area_kfctl", ...]
label_prefixes = collections.defaultdict(lambda: [])

for l in label_counts.keys():
    pieces = l.split("_")

    if len(pieces) <= 1:
        continue

    label_prefixes[pieces[0]] = label_prefixes[pieces[0]] + [l]

# Add remappings.
# The log entries associated with "Add labels to issue." record the model predictions before label
# remapping is applied; i.e. before "feature" is remapped to "kind/feature".
# So we want to apply those mappings here before computing the stats.
#
# TODO(https://github.com/kubeflow/code-intelligence/issues/109): We should arguably load these from
# the YAML files configuring label bot.
for l in ["bug", "feature", "feature_request", "question"]:
    if l not in label_counts.keys():
        continue
    label_prefixes["kind"] = label_prefixes["kind"] + [l]
In [8]:
# Now for each issue aggregate across all labels with a given prefix to see if the issue has
# at least one of the given prefix labels
issue_group_labels = pd.DataFrame(index=range(issue_labels.shape[0]), columns=label_prefixes.keys())
issue_group_labels = issue_group_labels.fillna(0)

for c in ["repo_owner", "repo_name", "issue_num"]:
    issue_group_labels[c] = issue_labels[c]

for prefix, labels in label_prefixes.items():
    issue_group_labels[prefix] = issue_labels[labels].max(axis=1)
In [9]:
# Compute the number of issues with at least one of the specified prefixes
rows = ["area", "platform", "kind"]
num_issues = issue_group_labels.shape[0]
counts = issue_group_labels[rows].sum(axis=0)

stats = pd.DataFrame(index=range(len(rows)), columns=["label", "count", "percentage"])
stats["label"] = counts.index
stats["count"] = counts.values
stats["percentage"] = stats["count"] / float(num_issues) * 100
In [10]:
print(f"Total # of issues {num_issues}")
print("Number and precentage of issues with labels with various prefixes")
stats
Out[10]:
In [11]:
chart = alt.Chart(stats)
chart.mark_point().encode(
    x='label',
    y='count',
).interactive()
Out[11]:
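The chart can also be persisted for sharing outside the notebook; saving to HTML works with vanilla Altair, and the filename here is arbitrary.
In [ ]:
label_chart = chart.mark_point().encode(
    x='label',
    y='count',
)
label_chart.save("label_counts.html")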
In [12]:
import numpy as np

issues_per_day = labeled[["timestamp", "repo_owner", "repo_name", "issue_num"]]

# Deduplicate the issues by taking the earliest entry for each issue
issues_per_day = issues_per_day.groupby(["repo_owner", "repo_name", "issue_num"], as_index=False).min()

# Compute the day each issue was labeled
issues_per_day["day"] = issues_per_day["timestamp"].apply(lambda x: datetime.datetime(x.year, x.month, x.day))

# Use .copy() so the count column is added to a new frame rather than a view of issues_per_day
issue_counts = issues_per_day[["day"]].copy()
issue_counts["num_issues"] = 1
issue_counts = issue_counts.groupby(["day"], as_index=False).sum()
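The per-row apply above can also be written with pandas' vectorized datetime accessor. A sketch, equivalent up to timezone handling (dt.floor preserves the timestamps' timezone, while the lambda produces naive datetimes):
In [ ]:
# Vectorized day bucketing: floor each timestamp to midnight instead of
# rebuilding datetime objects row by row.
issues_per_day["day"] = issues_per_day["timestamp"].dt.floor("D")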
In [13]:
chart = alt.Chart(issue_counts)

line = chart.mark_line().encode(
    x=alt.X('day'),
    y=alt.Y('num_issues'),
)

point = line + line.mark_point()
point.interactive()
Out[13]: