In [13]:
import ast
import base64
import re
from collections import Counter

import pandas as pd
import requests
import yaml
Note that there are several ways to get the training data: (1) BigQuery, and (2) scraping or the GitHub API. This notebook shows how to obtain training data from BigQuery. In production scenarios we will likely use the API or scraping, since not all of the data we need is contained in BigQuery.
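For reference, the API route looks roughly like the sketch below (it is not used in the rest of this notebook). The helper name, the `GITHUB_TOKEN` environment variable, and the selected fields are illustrative assumptions, not part of the original pipeline.
In [ ]:
import os

def fetch_issue_page(repo='kubernetes/kubernetes', page=1, per_page=100):
    """Sketch: pull one page of issues from the GitHub REST API."""
    headers = {'Accept': 'application/vnd.github+json'}
    token = os.environ.get('GITHUB_TOKEN')  # optional token to raise the rate limit
    if token:
        headers['Authorization'] = f'token {token}'
    resp = requests.get(f'https://api.github.com/repos/{repo}/issues',
                        params={'state': 'all', 'page': page, 'per_page': per_page},
                        headers=headers)
    resp.raise_for_status()
    # the issues endpoint also returns pull requests; skip those
    return [{'title': i['title'], 'body': i['body'],
             'labels': [l['name'] for l in i['labels']]}
            for i in resp.json() if 'pull_request' not in i]
The rest of this notebook uses the BigQuery route; the query below pulls labeled kubernetes/kubernetes issues from the public githubarchive dataset.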
#standardSQL
SELECT *
FROM (
  SELECT
    updated_at
    , MAX(updated_at) OVER (PARTITION BY url) as last_time
    , FORMAT("%T", ARRAY_CONCAT_AGG(labels)) as labels
    , repo, url, title, body, len_labels
  FROM (
    SELECT
      TIMESTAMP(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.updated_at'), "\"", "")) as updated_at
      , REGEXP_EXTRACT(JSON_EXTRACT(payload, '$.issue.url'), r'https://api.github.com/repos/(.*)/issues') as repo
      , JSON_EXTRACT(payload, '$.issue.url') as url
      -- extract the title and body, removing parentheses, brackets, and quotes
      , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
      , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
      , REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color') as labels
      , ARRAY_LENGTH(REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color')) as len_labels
    FROM `githubarchive.month.20*`
    WHERE
      _TABLE_SUFFIX BETWEEN '1601' AND '1912'
      AND type = "IssuesEvent"
  )
  WHERE
    repo = 'kubernetes/kubernetes'
  GROUP BY updated_at, repo, url, title, body, len_labels
)
WHERE last_time = updated_at AND len_labels >= 1
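The query results used below were exported to a CSV file on Google Cloud Storage. If you would rather run the query yourself, a minimal sketch with the google-cloud-bigquery client is shown here; the project ID is a placeholder and `sql` is assumed to hold the query string above.
In [ ]:
# Sketch only: run the query above with the BigQuery Python client
# (requires `pip install google-cloud-bigquery` and GCP credentials).
from google.cloud import bigquery

client = bigquery.Client(project='my-gcp-project')  # placeholder project ID
df_bq = client.query(sql).to_dataframe()            # `sql` = the query string above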
In [9]:
df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv')
# convert the labels column from its string representation to a Python list
df.labels = df.labels.apply(ast.literal_eval)
# convert last_time from string to datetime
df['last_time'] = pd.to_datetime(df.last_time)
df.head()
Out[9]:
In [11]:
def get_current_labels(url="https://raw.githubusercontent.com/kubernetes/test-infra/master/label_sync/labels.yaml"):
"""
Get list of valid issue labels (b/c labels get deprecated over time).
See: https://kubernetes.slack.com/archives/C1TU9EB9S/p1561570627363100
"""
req = requests.get(url)
yml = yaml.safe_load(req.content)
return [x['name'] for x in yml['default']['labels']]
# remove deprecated labels
current_labels = get_current_labels()
df.labels = df.labels.apply(lambda x: [l for l in x if l in current_labels])
# strip bot commands (e.g. '/sig node', '/kind bug') from issue bodies
df['body'] = df.body.apply(lambda x: re.sub(r'(/sig|/kind|/status|/triage|/priority) \S+', '', str(x)))
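As a quick sanity check (not part of the original flow), the substitution can be exercised on a made-up issue body:
In [ ]:
# Illustrative check of the bot-command regex on a fabricated body string.
sample = 'kubelet crashes on restart /kind bug /sig node'
print(re.sub(r'(/sig|/kind|/status|/triage|/priority) \S+', '', sample))
# prints 'kubelet crashes on restart' followed by leftover whitespace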
In [29]:
# remove labels that do not occur at least 25 times
c = Counter()
for row in df.labels:
c.update(row)
min_threshold = 25
min_threshold_labels = [k for k in c if c[k] >= min_threshold]
df['labels'] = df.labels.apply(lambda x: [l for l in x if l in min_threshold_labels])
# filter out issues without any labels
df = df[df.labels.apply(lambda x: x != [])]
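It can be helpful to peek at which labels survive the frequency cut. This quick inspection is not part of the original cleaning steps:
In [ ]:
# Show the most frequent labels that pass the 25-occurrence threshold.
kept = Counter({k: v for k, v in c.items() if v >= min_threshold})
for label, count in kept.most_common(10):
    print(f'{label:<30} {count:,}')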
In [30]:
# remove extraneous columns
df = df[['last_time', 'repo', 'title', 'body', 'labels']]
In [31]:
print(f'Number of labeled issues after filtering and cleaning: {df.shape[0]:,}')
In [32]:
df.head()
Out[32]:
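The labels column is still a list per issue; most multi-label models expect a binary indicator matrix instead. That step is outside this notebook's scope, but a minimal sketch with scikit-learn's MultiLabelBinarizer (assuming scikit-learn is installed) looks like this:
In [ ]:
# Sketch: convert the list-valued labels column into a binary indicator matrix
# suitable for multi-label training (assumes scikit-learn is available).
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df.labels)  # shape: (num_issues, num_distinct_labels)
text = (df.title.fillna('') + ' ' + df.body.fillna('')).values  # model inputs
print(y.shape, len(mlb.classes_))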