Unlike Issue-Label Bot which predicts generic bug, feature-request and question labels, we are attempting to build the capability to predict repo-specific labels. One of the primary challenges of doing this is a dearth of labeled examples for a particular repo. Therefore, we attempt to generate features via transfer learning from a language model trained over a large corpus of GitHub issues. These features are then fed downstream to a classifier with the goal of enabling the classifier to predict personalized issue labels based upon existing hand-labeled issues present in a repository.
As an initial test, we will evaluate the ability to predict sig/ labels on the Kubernetes/Kubernetes repo.
In order to measure the efficacy of these embeddings, we will use DataRobot as a benchmark to see if adding embeddings from transfer learning improves model performance relative to TF-IDF n-gram featurization of text.
#standardSQL
-- Export the final state of every labeled kubernetes/kubernetes issue from
-- GHArchive (2016-01 through 2019-12), keeping title, body and all labels.
SELECT *
FROM (
SELECT
updated_at
-- most recent event timestamp per issue URL; used to keep only final state
, MAX(updated_at) OVER (PARTITION BY url) as last_time
, FORMAT("%T", ARRAY_CONCAT_AGG(labels)) as labels
, repo, url, title, body, len_labels
FROM(
SELECT
TIMESTAMP(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.updated_at'), "\"", "")) as updated_at
-- owner/name pulled out of the API URL
, REGEXP_EXTRACT(JSON_EXTRACT(payload, '$.issue.url'), r'https://api.github.com/repos/(.*)/issues') as repo
, JSON_EXTRACT(payload, '$.issue.url') as url
-- extract the title and body removing parentheses, brackets, and quotes
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
-- label names scraped out of the raw JSON labels array
, REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color') as labels
, ARRAY_LENGTH(REGEXP_EXTRACT_ALL(JSON_EXTRACT(payload, "$.issue.labels"), ',"name\":"(.+?)","color')) as len_labels
FROM `githubarchive.month.20*`
WHERE
_TABLE_SUFFIX BETWEEN '1601' and '1912'
and type="IssuesEvent"
)
WHERE
repo = 'kubernetes/kubernetes'
GROUP BY updated_at, repo, url, title, body, len_labels
)
-- keep only each issue's latest event, and only issues with >= 1 label
WHERE last_time = updated_at and len_labels >= 1
https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv
In [35]:
import ast
import re
from random import randint

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('max_colwidth', 1000)
In [36]:
# Load the BigQuery issue export.
df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv')
# Parse the stringified label lists, e.g. "['sig/node', 'kind/bug']".
# ast.literal_eval only accepts Python literals; the original eval() would
# execute arbitrary code embedded in the CSV.
df.labels = df.labels.apply(ast.literal_eval)
df.head()
Out[36]:
In [37]:
# Remove target leakage: Kubernetes bot (prow) commands such as "/sig node"
# embed the label directly in the issue body, so strip "command + argument".
# NOTE(review): the original alternation was '(/sig|/kind|/status/triage/|priority)',
# which matches the literal "/status/triage/" (never occurs) and bare "priority"
# (over-matches plain prose). The prow commands are /sig, /kind, /status,
# /triage and /priority — confirm against the bot's command set.
df['body'] = df.body.apply(lambda x: re.sub(r'(/sig|/kind|/status|/triage|/priority) \S+', '', str(x)))
In [38]:
def count_sig(l):
    """Return how many entries of *l* contain the substring 'sig/'."""
    return sum(1 for label in l if 'sig/' in label)
In [39]:
from matplotlib.ticker import PercentFormatter

# Histogram of how many sig/ labels each issue carries, as a share of issues.
sig_counts = df.labels.apply(count_sig)
weights = np.full(len(sig_counts), 1.0 / len(sig_counts))
plt.hist(sig_counts, weights=weights)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.title(f'Distribution of # of sig/ labels for kubernetes/kubernetes\n {len(sig_counts):,} issues pulled from GHArchive.')
plt.show()
In [40]:
from collections import Counter

# Tally how often each label appears across all issues.
c = Counter()
for issue_labels in df.labels:
    c.update(issue_labels)
In [41]:
# Summary: total unique labels, and how many of them are sig/ labels.
print(f'There are {len(c.keys())} unique labels in kubernetes/kubernetes')
nsig = sum(1 for k in c if 'sig/' in k)
print(f"number of sig labels: {nsig}")
In [74]:
# Inspect the 50 most frequent labels overall.
c.most_common(50)
Out[74]:
In [79]:
# How many labels appear on at least 100 issues?
len([(k, c[k]) for k in c if c[k] >= 100])
Out[79]:
In [43]:
# List every sig/ label together with its issue count.
sig_labels = [label for label in c if 'sig/' in label]
for l in sig_labels:
    print(f'{l}: {c[l]}')
In [10]:
# Minimum number of labeled issues a sig/ label must have to be a target.
min_freq = 30

def contains_sig(l):
    """Return True if any label in *l* is a sig/ label seen on at least
    `min_freq` issues (frequencies read from the global Counter `c`).

    Falsy input (None, empty list) returns False.
    """
    if not l:
        return False
    # The original comment said "at least 10 issues" but min_freq is 30 —
    # the threshold is min_freq. any() replaces max([...]) (same truth value,
    # clearer intent, and short-circuits).
    return any('sig/' in x and c[x] >= min_freq for x in l)
In [11]:
# Keep only issues carrying at least one sufficiently common sig/ label.
has_sig = df.labels.apply(contains_sig)
sig_df = df[has_sig]
print(f'{sig_df.shape[0]:,} issues have sig/ labels')
In [12]:
# Target label set: sig/ labels with at least min_freq labeled issues.
sig_labels = [k for k in c if 'sig/' in k and c[k] >= min_freq]
print(f'{len(sig_labels)} sig labels that have at least {min_freq} issues')
In [13]:
# Build a binary indicator matrix: one row per issue, one column per target
# sig label; then compute the raw co-occurrence grid.
rows = []
for issue_labels in sig_df.labels.values:
    row = np.zeros(len(sig_labels))
    hits = [sig_labels.index(x) for x in issue_labels if x in sig_labels]
    row[hits] = 1
    rows.append(row[None, :])
indicator_matrix = pd.DataFrame(np.concatenate(rows, axis=0), columns=sig_labels).astype(int)
corr_grid = indicator_matrix.T.dot(indicator_matrix)
In [14]:
# Zero the diagonal and upper triangle so each co-occurrence pair is counted
# once (the grid is symmetric).
# Fixed: the original chained form `corr_grid.iloc[i][i:] = 0` writes through
# an intermediate row object and is not guaranteed to modify corr_grid
# (pandas chained-assignment pitfall; fails under copy-on-write).
for i in range(len(corr_grid)):
    corr_grid.iloc[i, i:] = 0
In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
#cmap = sns.diverging_palette(220, 10, as_cmap=True)
In [16]:
#normalize correlation grid
# Divide each row by the row label's total issue count, turning raw
# co-occurrence counts into fractions of that label's issues.
for label in corr_grid:
    corr_grid.loc[label] = corr_grid.loc[label] / c[label]
In [17]:
# Heatmap of the normalized co-occurrence grid; mask near-zero cells to
# reduce visual clutter.
plt.figure(figsize=(16, 14))
plt.title('Co-Occurrence Matrix')  # fixed typo: "Co-Occurence"
sns.heatmap(corr_grid, square=True, vmin=0, vmax=.4, mask=corr_grid<=0.05)
Out[17]:
In [18]:
def part_assign():
    """Randomly assign a CV partition: 1-5 (10% of rows each) or 6 (~50%).

    Partition 6 is used as the holdout set downstream.
    """
    draw = randint(1, 10)
    return draw if draw <= 5 else 6
# Attach the indicator columns to the issue data, draw a random CV partition
# per row, and checkpoint the result to disk.
combined_sig_df = pd.concat([sig_df.reset_index(), indicator_matrix.reset_index()], axis=1)
combined_sig_df['part'] = combined_sig_df.repo.apply(lambda _: part_assign())
combined_sig_df.to_pickle('combined_sig_df.pkl')
In [19]:
# Reload the checkpoint (lets the notebook restart from here).
combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
In [20]:
#! pip install datarobot
In [21]:
import datarobot as dr
from datarobot import UserCV
from fastai.core import parallel
from datarobot import Blueprint
# User-defined partitioning: column 'part' holds fold ids 1-5, level 6 is
# the holdout partition.
ucv = UserCV(user_partition_col='part', cv_holdout_level=6, seed=123)
# NOTE(review): token is a redacted placeholder — supply a real API token.
dr.Client(token='something-something', endpoint='https://app.datarobot.com/api/v2')
Out[21]:
In [22]:
def create_dr_proj(label):
    """Create and train a baseline (text-only) DataRobot project for `label`.

    Uses only title/body text plus the user-defined partition column, trains
    the single blueprint whose name mentions 'Nystroem', and unlocks the
    holdout so its metrics can be read. Returns the datarobot Project.
    """
    temp_df = combined_sig_df[['title', 'body', 'part', label]]
    proj = dr.Project.create(sourcedata=temp_df,
                             project_name=label,
                             )
    proj.set_target(label,
                    positive_class=1,
                    partitioning_method=ucv,
                    target_type='Binary',
                    mode=dr.AUTOPILOT_MODE.MANUAL,  # train one blueprint manually, no autopilot
                    worker_count=9,
                    max_wait=600000)
    bps = proj.get_blueprints()
    # select the blueprint whose repr mentions 'Nystroem'
    bp = [b for b in bps if 'Nystroem' in str(b)][0]
    # 49.8% ~= the five 10% CV folds; partition 6 (~50%) stays held out
    proj.train(bp, sample_pct=49.8)
    proj.unlock_holdout()
    return proj
In [23]:
# Create one baseline project per target sig label. Failures are logged and
# skipped so one bad label does not abort the whole batch.
proj_list = []
for i, label in enumerate(sig_labels):
    try:
        print(f'creating project {i}: {label}')
        proj = create_dr_proj(label)
        proj_list.append(proj)
    except Exception as e:
        # was a bare `except: pass`, which also swallows KeyboardInterrupt
        # and hides real errors — log and continue instead.
        print(f'failed to create project for {label}: {e}')
In [264]:
# Score the holdout partition (part == 6) with each baseline project and
# collect the positive-class probability for its label.
predictions = []
for proj in proj_list:
    print(f'getting predictions for holdout set for {str(proj)}')
    # DataRobot stores '-' in target names as '_'; map back to the label name
    label = proj.target.replace('_', '-')
    temp_df = combined_sig_df[['title', 'body', 'part', label]]
    temp_df = temp_df[temp_df.part == 6]
    ds = proj.upload_dataset(temp_df)
    # NOTE(review): assumes get_models()[0] is the single trained model —
    # confirm ordering if more models exist.
    m = proj.get_models()[0]
    predict_job = m.request_predictions(ds.id)
    yhat = predict_job.get_result_when_complete()
    predictions.append({label: yhat['positive_probability']})
In [287]:
# Merge the per-label prediction dicts into one frame; prefix columns with
# 'p_' so they don't collide with the label indicator columns.
result = {key: value for entry in predictions for key, value in entry.items()}
baseline_holdout_predictions_df = pd.DataFrame(result)
baseline_holdout_predictions_df.columns = ['p_'+x for x in baseline_holdout_predictions_df.columns]
In [282]:
# Sanity check: exactly one prediction row per holdout (part == 6) issue.
assert len(baseline_holdout_predictions_df) == len(combined_sig_df[combined_sig_df.part == 6])
In [298]:
# Side-by-side frame of holdout issues and their baseline predictions.
holdout = combined_sig_df[combined_sig_df.part == 6].reset_index(drop=True)
predictions_df = pd.concat([holdout, baseline_holdout_predictions_df.reset_index(drop=True)], axis=1)
predictions_df['version'] = 'baseline'
In [299]:
# Checkpoint baseline holdout predictions for later comparison.
predictions_df.to_pickle('prediction_baseline_df.pkl')
In [81]:
# Language-model feature extraction setup.
import pandas as pd
from inference import InferenceWrapper, pass_through
import os
import torch
from torch.cuda import empty_cache
# Pin CUDA device enumeration to PCI bus order and expose only GPU 0.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Load the language model trained over the GitHub issue corpus.
wrapper = InferenceWrapper(model_path='/ds/lang_model/models_uxgcl1e1/',
                           model_file_name='trained_model_uxgcl1e1.pkl')
empty_cache()
In [2]:
combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
# One-time preprocessing of the dataframe into LM-ready text, cached to disk
# (kept commented out after the first run).
# text = wrapper.process_df(combined_sig_df)
# text.to_pickle('textlm_df.pkl')
In [4]:
# Load the cached LM-ready text; the embedding loop assumes no missing text.
text = pd.read_pickle('textlm_df.pkl')
assert text['text'].isna().sum() == 0
In [5]:
# Accumulator for per-document pooled feature tensors.
features = []
In [6]:
from tqdm.auto import tqdm
# Embed every document with the language model. no_grad() avoids building
# autograd state; moving results to CPU and calling empty_cache() each step
# keeps GPU memory bounded over the long loop.
with torch.no_grad():
    for t in tqdm(text['text'].values):
        feat = wrapper.get_pooled_features(t).cpu()
        features.append(feat)
        empty_cache()
In [7]:
# Stack per-document features into a single (n_docs, n_dims) numpy array.
feat_matrix = torch.cat(features, dim=0).numpy()
In [26]:
# Keep only the first 1600 feature dimensions.
# NOTE(review): presumably trimming the pooled representation to a fixed
# width for the downstream tool — confirm why 1600 specifically.
feat_matrix = feat_matrix[:, :1600]
In [27]:
# Name the embedding columns f_0..f_N and checkpoint them to CSV.
feat_df = pd.DataFrame(feat_matrix)
feat_df.columns = [f'f_{col}' for col in feat_df.columns]
feat_df.to_csv('feat_df.csv', index=False)
In [28]:
# Reload the embedding checkpoint.
feat_df = pd.read_csv('feat_df.csv')
In [29]:
# Join the LM embedding columns onto the labeled issue data, row-aligned.
lm_combined_df = pd.concat([combined_sig_df.reset_index(drop=True),
                            feat_df.reset_index(drop=True)], axis=1)
In [30]:
import datarobot as dr
from datarobot import UserCV
# Same user-defined partitioning as the baseline runs: folds 1-5, holdout 6.
ucv = UserCV(user_partition_col='part', cv_holdout_level=6, seed=123)
# NOTE(review): token is a redacted placeholder.
dr.Client(token='something', endpoint='https://app.datarobot.com/api/v2')
def create_dr_proj(label):
    """Create a DataRobot project for `label` using text plus LM features.

    Unlike the baseline variant, this includes the f_* embedding columns and
    runs QUICK autopilot instead of manually training one blueprint.
    Returns the datarobot Project.
    """
    temp_df = lm_combined_df[['title', 'body', 'part', label] + list(feat_df.columns)]
    proj = dr.Project.create(sourcedata=temp_df,
                             project_name='lm_'+label,
                             )
    proj.set_target(label,
                    positive_class=1,
                    partitioning_method=ucv,
                    target_type='Binary',
                    mode=dr.AUTOPILOT_MODE.QUICK,
                    worker_count=9,
                    max_wait=600000)
    proj.unlock_holdout()
    return proj
In [31]:
# Create one LM-feature project per target sig label; log and skip failures
# so one bad label does not abort the batch.
proj_list_lm = []
for i, label in enumerate(sig_labels):
    try:
        print(f'creating project {i}: lm_{label}')
        proj = create_dr_proj(label)
        proj_list_lm.append(proj)
    except Exception as e:
        # was a bare `except: pass` — swallowed KeyboardInterrupt and hid
        # real errors; log and continue instead.
        print(f'failed to create project for {label}: {e}')
In [1]:
import datarobot as dr
from datarobot import UserCV
# NOTE(review): token is a redacted placeholder.
dr.Client(token='something-something', endpoint='https://app.datarobot.com/api/v2')
def get_metrics(modelobj):
    """Return the holdout-partition AUC of a DataRobot model object."""
    return modelobj.metrics['AUC']['holdout']
In [2]:
# All language-model projects were created with an 'lm_' name prefix.
projects = [p for p in dr.Project.list() if p.project_name.startswith('lm_')]
In [44]:
# Scratch cell: quick str.replace sanity check (unrelated to the analysis).
'hamel'.replace('am', 'gg')
Out[44]:
In [45]:
# For each project, record holdout AUC of the best text-only model vs the
# best model that also uses the LM features.
label = []
category = []
auc = []
for proj in projects:
    print(f'getting metrics for {proj.project_name}')
    trained = [m for m in proj.get_models() if m.sample_pct > 45]
    text_only = [m for m in trained if m.featurelist_name == 'text only']
    with_embeddings = [m for m in trained if m.featurelist_name != 'text only']
    baseline_model = max(text_only, key=get_metrics)
    deep_model = max(with_embeddings, key=get_metrics)
    name = proj.project_name.replace('lm_', '')
    label.extend([name, name])
    category.extend(['baseline', 'deep'])
    auc.extend([get_metrics(baseline_model), get_metrics(deep_model)])
In [46]:
import pandas as pd
compare_df = pd.DataFrame({'label': label,
'category': category,
'auc': auc})
In [73]:
# Wide format: one row per label with baseline vs deep AUC side by side,
# plus which one wins, the gap, and the label's issue count.
pivot = compare_df.pivot(index='label', columns='category', values='auc')
pivot['winner'] = pivot.apply(lambda row: 'deep' if row.deep > row.baseline else 'baseline', axis=1)
pivot['abs diff'] = (pivot.deep - pivot.baseline).abs()
pivot['label count'] = [c[name] for name in pivot.index.values]
pivot.sort_values(by=['label count'], ascending=False)
Out[73]:
In [80]:
# Display the loaded InferenceWrapper (repr).
wrapper
In [99]:
# Size of the language model's vocabulary.
len(wrapper.learn.data.vocab.itos)
Out[99]:
In [100]:
# Checkpoint the comparison table.
# Fixed: was `pivot.to_picklei(...)`, which raises AttributeError —
# DataFrame has no method named `to_picklei`.
pivot.to_pickle('pivot_df.pkl')
Out[100]:
In [103]:
import pandas as pd
In [116]:
# NOTE(review): score_df.pkl appears to be produced outside this notebook;
# it holds a second deep-model variant's per-label scores — confirm provenance.
score_df = pd.read_pickle('score_df.pkl')
In [117]:
# Index by label to enable a label-aligned join with the comparison table.
score_df.set_index('label', inplace=True)
In [118]:
# Rename the score column to distinguish it from the first deep model.
score_df.columns = ['deep2']
In [119]:
# Left-join the second deep variant's AUC onto the comparison table.
new_pivot = pivot.join(score_df, how='left')[['baseline', 'deep', 'deep2', 'label count']]
In [121]:
def winner(x):
    """Return which model variant ('baseline', 'deep', 'deep2') has the
    highest AUC for row *x*.

    Fixed: in the original, a tie between deep and deep2 matched none of the
    branches and silently returned None; ties among the deep variants now
    resolve to 'deep'. Baseline still wins only when strictly best.
    """
    if x.baseline > x.deep and x.baseline > x.deep2:
        return 'baseline'
    if x.deep >= x.deep2:
        return 'deep'
    return 'deep2'
In [124]:
# Drop labels missing a deep2 score (NaNs introduced by the left join).
new_pivot.dropna(inplace=True)
In [125]:
# Best-scoring model variant per label.
new_pivot['winner'] = new_pivot.apply(lambda x: winner(x), axis=1)
In [130]:
# Gap between the baseline and the stronger of the two deep variants
# (negative means a deep variant beats the baseline).
new_pivot['baseline minus best deep'] = new_pivot.baseline - new_pivot[['deep', 'deep2']].max(axis=1)
In [135]:
# Magnitude of the baseline-vs-best-deep gap.
new_pivot['abs diff'] = new_pivot['baseline minus best deep'].abs()
In [136]:
# Final comparison table, most frequently occurring labels first.
new_pivot.sort_values('label count', ascending=False)
Out[136]:
In [132]:
# Column means: average AUC per model variant across the retained labels.
new_pivot.mean()
Out[132]:
In [ ]: