In [13]:
import pandas as pd
import re
import base64
import requests
import yaml
from collections import Counter

This notebook illustrates how to obtain the training data for a specific repo (issues with labels), in this case kubernetes/kubernetes

Note there are several ways to get the training data: (1) BigQuery and (2) Scraping or the API. This notebook shows how to obtain training data from BigQuery. In production scenarios we will likely use the API or scraping as not all data is contained in BigQuery.

Part 1 Acquire Data

Using BigQuery

#standardSQL

SELECT *
FROM (
  SELECT
    updated_at
    , MAX(updated_at) OVER (PARTITION BY url) as last_time
    , FORMAT("%T", ARRAY_CONCAT_AGG(labels)) as labels
    , repo, url, title, body, len_labels
  FROM(
      SELECT
          TIMESTAMP(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.updated_at'), "\"", "")) as updated_at
        , REGEXP_EXTRACT(JSON_EXTRACT(payload, '$.issue.url'), r'https://api.github.com/repos/(.*)/issues') as repo
        , JSON_EXTRACT(payload, '$.issue.url') as url
          -- extract the title and body removing parentheses, brackets, and quotes
        , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
        , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
        , REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color') as labels
        , ARRAY_LENGTH(REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color')) as len_labels
      FROM `githubarchive.month.20*`
      WHERE 
          _TABLE_SUFFIX BETWEEN '1601' and '1912'
          and type="IssuesEvent"
  )
  WHERE 
    repo = 'kubernetes/kubernetes'
  GROUP BY updated_at, repo, url, title, body, len_labels
)
WHERE last_time = updated_at and len_labels >= 1

Data from the above query is available on GCS: https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv


In [9]:
df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv')

# convert from string to list
df.labels = df.labels.apply(lambda x: eval(x)) 


# change data type of last_time to date
df['last_time'] = pd.to_datetime(df.last_time)

df.head()


Out[9]:
updated_at last_time labels repo url title body len_labels
0 2018-02-24 15:09:51 UTC 2018-02-24 15:09:51+00:00 [lifecycle/rotten, priority/backlog, sig/clust... kubernetes/kubernetes "https://api.github.com/repos/kubernetes/kuber... minions ip does not follow --hostname-override... according to 9267, the kubelet should registe... 4
1 2016-03-01 00:30:05 UTC 2016-03-01 00:30:05+00:00 [kind/flake, priority/P1, team/control-plane, ... kubernetes/kubernetes "https://api.github.com/repos/kubernetes/kuber... e2e flake: daemon set should run and stop comp... this has been flaking pretty often lately - in... 3
2 2016-02-19 18:55:16 UTC 2016-02-19 18:55:16+00:00 [area/usability, component/kubectl, kind/enhan... kubernetes/kubernetes "https://api.github.com/repos/kubernetes/kuber... can't abort kubectl edit i am running kubectl edit against an object.... 5
3 2016-02-25 00:27:07 UTC 2016-02-25 00:27:07+00:00 [area/test, kind/flake, priority/P1, team/cont... kubernetes/kubernetes "https://api.github.com/repos/kubernetes/kuber... e2e flake: limitrange it should create a lim... limitrange e2e flake:\r \r \r failure 7.61... 4
4 2016-06-14 04:14:18 UTC 2016-06-14 04:14:18+00:00 [] kubernetes/kubernetes "https://api.github.com/repos/kubernetes/kuber... unable to deploy kubernetes on local ubuntu ma... below is my error trace. i am not sure if the ... 0

Clean Data

  • Remove labels that are deprecated and no longer specified here.
  • Remove bot commands that contain the label name (otherwhise this will be data leakage).

In [11]:
def get_current_labels(url="https://raw.githubusercontent.com/kubernetes/test-infra/master/label_sync/labels.yaml"):
    """
    Get list of valid issue labels (b/c labels get deprecated over time).
    See: https://kubernetes.slack.com/archives/C1TU9EB9S/p1561570627363100
    """
    req = requests.get(url)
    yml = yaml.safe_load(req.content)
    return [x['name'] for x in yml['default']['labels']]


# remove deprecated labels
current_labels = get_current_labels()
df.labels = df.labels.apply(lambda x: [l for l in x if l in current_labels])


# get rid of bot commands
df['body'] = df.body.apply(lambda x: re.sub('(/sig|/kind|/status/triage/|priority) \S+', '', str(x)))

Filter Issues

Remove Issues that:

  • Do not have any labels
  • Do not contain at least one label that has a minimum frequency of 25

In [29]:
# remove labels that do not occur at least 25 times
c = Counter()

for row in df.labels:
    c.update(row)
    
min_threshold = 25
min_threshold_labels = [k for k in c if c[k] >= min_threshold]

df['labels'] = df.labels.apply(lambda x: [l for l in x if l in min_threshold_labels])

# filter out issues without any labels
df = df[df.labels.apply(lambda x: x != [])]

In [30]:
# remove extreanous columns
df = df[['last_time', 'repo', 'title', 'body', 'labels']]

In [31]:
print(f'Number of labeled issues after filtering and cleaning: {df.shape[0]:,}')


Number of labeled issues after filtering and cleaning: 19,549

In [32]:
df.head()


Out[32]:
last_time repo title body labels
0 2018-02-24 15:09:51+00:00 kubernetes/kubernetes minions ip does not follow --hostname-override... according to 9267, the kubelet should registe... [lifecycle/rotten, priority/backlog, sig/clust...
1 2016-03-01 00:30:05+00:00 kubernetes/kubernetes e2e flake: daemon set should run and stop comp... this has been flaking pretty often lately - in... [kind/flake, kind/flake]
3 2016-02-25 00:27:07+00:00 kubernetes/kubernetes e2e flake: limitrange it should create a lim... limitrange e2e flake:\r \r \r failure 7.61... [kind/flake]
7 2016-09-30 20:58:49+00:00 kubernetes/kubernetes k8s.io kubectl client k8s.io guestbook appl... https://k8s-gubernator.appspot.com/build/kuber... [kind/flake]
8 2016-11-22 16:40:48+00:00 kubernetes/kubernetes ci-kubernetes-e2e-gci-gke-staging: broken test... https://k8s-gubernator.appspot.com/build/kuber... [kind/flake]