In [1]:
import os
import torch
from torch.cuda import empty_cache
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_colwidth', 1000)
In [2]:
from ast import literal_eval

df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/k8s_issues/000000000000.csv')
# labels are stored as stringified Python lists, so parse them safely
df.labels = df.labels.apply(literal_eval)
# remove target leakage: strip the Kubernetes bot commands (/sig, /kind, /status, /triage, /priority)
# that are used to assign the very labels we want to predict
df['body'] = df.body.apply(lambda x: re.sub(r'(/sig|/kind|/status|/triage|/priority) \S+', '', str(x)))
df['last_time'] = pd.to_datetime(df.last_time)
In [3]:
import base64
import requests
import yaml
def get_current_labels(url="https://raw.githubusercontent.com/kubernetes/test-infra/master/label_sync/labels.yaml"):
"""
Get list of valid issue labels (b/c labels get deprecated over time).
See: https://kubernetes.slack.com/archives/C1TU9EB9S/p1561570627363100
"""
req = requests.get(url)
yml = yaml.safe_load(req.content)
return [x['name'] for x in yml['default']['labels']]
In [4]:
current_labels = get_current_labels()
# remove deprecated labels
df.labels = df.labels.apply(lambda x: [l for l in x if l in current_labels])
# filter out issues without any labels
df = df[df.labels.apply(lambda x: x != [])]
print(f'Number of labeled issues after filtering: {df.shape[0]:,}')
In [5]:
df.head(1)
Out[5]:
In [6]:
from collections import Counter
c = Counter()
for row in df.labels:
    c.update(row)
In [7]:
min_threshold = 50
min_threshold_labels = [k for k in c if c[k] >= min_threshold]
print(f'{len(min_threshold_labels)} labels that occur at least {min_threshold} times.')
In [8]:
df['labels'] = df.labels.apply(lambda x: [l for l in x if l in min_threshold_labels])
df = df[df.labels.apply(lambda x: x != [])]
print(f'Number of labeled issues after filtering again: {df.shape[0]:,}')
Label counts for the remaining issues
In [9]:
for l in min_threshold_labels:
    print(f'{l}: {c[l]}')
In [10]:
df['year'] = df.last_time.apply(lambda x: x.year)
df.groupby('year')['body'].count()
Out[10]:
In [11]:
from inference import InferenceWrapper, pass_through
from sklearn.model_selection import train_test_split
In [12]:
parsed_df = InferenceWrapper.process_df(df)
Join the labels back onto the parsed text
In [13]:
assert parsed_df.shape[0] == df.shape[0]
ml_df = pd.concat([df.reset_index(drop=True), parsed_df], axis=1)[['text', 'labels']]
# fastai expects multi-label targets as a single delimited string per row (space-delimited here)
ml_df['labels'] = ml_df.labels.apply(lambda x: ' '.join(x))
assert len(ml_df) == len(parsed_df) == len(df)
In [14]:
ml_df.head(2)
Out[14]:
In [30]:
ml_df.to_pickle('ml_df.pkl')
The pretrained Encoder comes from the language model
In [2]:
from fastai.text.data import TextClasDataBunch
from inference import InferenceWrapper, pass_through
from fastai.text import text_classifier_learner
from sklearn.model_selection import train_test_split
ml_df = pd.read_pickle('ml_df.pkl')
In [3]:
train_df, val_df = train_test_split(ml_df, train_size=.8, random_state=1234)
In [4]:
print(f' # of training rows: {len(train_df):,}\n # of validation rows: {len(val_df):,}')
In [5]:
train_label_set = set()
for labels in train_df.labels:
    # labels were space-joined into a single string above, so split back into label names
    train_label_set.update(labels.split())
val_label_set = set()
for labels in val_df.labels:
    val_label_set.update(labels.split())
In [6]:
# make sure the training and validation sets contain exactly the same set of labels
diff_set = train_label_set ^ val_label_set
assert not diff_set
In [7]:
from fastai.text.transform import Tokenizer
# pass_through replaces fastai's default pre_rules with a no-op, leaving the already-processed text untouched
tokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=31)
In [8]:
from fastai.basic_train import load_learner
# load the exported, fine-tuned language model learner; its vocab is reused for the classifier DataBunch below
model_path = '/ds/lang_model/models_22zkdqlr/'
model_file_name = 'trained_model_22zkdqlr.pkl'
learn = load_learner(path=model_path, file=model_file_name)
In [9]:
data_multi_label = TextClasDataBunch.from_df(path='/ds/multi_class_model/',
                                             train_df=train_df,
                                             valid_df=val_df,
                                             tokenizer=tokenizer,
                                             text_cols='text',
                                             label_cols='labels',
                                             label_delim=' ',
                                             vocab=learn.data.vocab,
                                             bs=32)
In [24]:
data_multi_label.save()
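If the notebook is restarted, the tokenized DataBunch can be reloaded from disk instead of being rebuilt. A minimal sketch, assuming fastai v1's load_data and the default data_save.pkl filename written by save():
In [ ]:
from fastai.basic_data import load_data
# reload the DataBunch saved above (save() defaults to 'data_save.pkl' under the given path)
data_multi_label = load_data('/ds/multi_class_model/', bs=32)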
In [10]:
from fastai.text.models import AWD_LSTM, awd_lstm_lm_config
# these hyperparameters must match the encoder of the pretrained language model
emb_sz = 800
qrnn = False
bidir = False
n_layers = 4
n_hid = 2400
awd_lstm_lm_config.update(dict(emb_sz=emb_sz, qrnn=qrnn, bidir=bidir, n_layers=n_layers, n_hid=n_hid))
# drop the language-model-only keys, which the classifier config does not accept
awd_lstm_lm_config.pop('tie_weights', None)
awd_lstm_lm_config.pop('out_bias', None)
tcl = text_classifier_learner(data=data_multi_label,
                              pretrained=False,
                              arch=AWD_LSTM,
                              config=awd_lstm_lm_config)
In [11]:
tcl.load_encoder('trained_model_encoder_22zkdqlr')
In [12]:
tcl.freeze()
In [17]:
tcl.lr_find()
In [18]:
tcl.recorder.plot()
In [13]:
from torch.cuda import empty_cache
empty_cache()
In [13]:
tcl.fit_one_cycle(3, max_lr=.1)
Manual Learning Rate Annealing
In [16]:
tcl.fit(epochs=1, lr=slice(.0004))
Unfreeze and keep training
In [17]:
tcl.freeze_to(-2)
In [20]:
tcl.fit(epochs=1, lr=slice(.0001))
In [21]:
tcl.fit(epochs=1, lr=slice(.0004))
In [25]:
classifier_model_path = tcl.save(file='classifier_best_model',
                                 return_path=True)
In [26]:
classifier_model_path
Out[26]:
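To resume from this checkpoint later, the saved weights can be loaded back into a learner constructed the same way. A minimal sketch, assuming the DataBunch and learner are rebuilt as above:
In [ ]:
# load the weights saved above; fastai looks for them under '<path>/models/classifier_best_model.pth'
tcl = tcl.load('classifier_best_model')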
In [55]:
# get_preds() returns (probabilities, targets) for the validation set; note the rows may not match
# val_df's row order (fastai v1 text learners batch by sorted length), so predictions are recomputed below
val_preds = tcl.get_preds()
In [61]:
# predict each row individually so the probabilities line up with val_df's row order
val_preds2 = val_df.text.apply(lambda x: tcl.predict(x)[2].cpu().numpy())
In [73]:
val_preds2_matrix = np.stack(val_preds2.values)
In [32]:
val_proba = val_preds[0].cpu().numpy()
val_proba.shape
Out[32]:
In [54]:
val_df.head()
Out[54]:
In [86]:
i = 10
# index of the highest-probability class for validation row i
idx = np.argmax(val_preds2_matrix[i, :])
print(f'predicted label: {class_list[idx]}')
print(f'ground truth: {val_df.iloc[i].labels}')
In [87]:
class_list = tcl.data.classes
assert len(class_list) == val_proba.shape[1]
In [88]:
val_scores = {}
for i, lbl in enumerate(class_list):
    # labels are a space-delimited string here, so split to avoid substring false positives
    ground_truth = val_df.labels.apply(lambda x: lbl in x.split()).values
    predicted_probs = val_preds2_matrix[:, i]
    val_scores[lbl] = {'yhat': predicted_probs, 'y': ground_truth}
In [89]:
from sklearn.metrics import roc_auc_score as auc
In [90]:
auc_scores = []
labels = []
for lbl in val_scores:
    auc_scores.append(auc(val_scores[lbl]['y'], val_scores[lbl]['yhat']))
    labels.append(lbl)
In [92]:
assert len(auc_scores) == len(labels)
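A single summary metric is convenient alongside the per-label scores. This is an unweighted (macro) average of the per-label AUCs, so rare labels count as much as common ones:
In [ ]:
# macro-average: simple mean of the per-label AUC scores
print(f'macro-average AUC across {len(labels)} labels: {np.mean(auc_scores):.3f}')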
In [93]:
score_df = pd.DataFrame({'label':labels, 'auc': auc_scores})
score_df
Out[93]:
In [94]:
score_df.to_pickle('score_df.pkl')
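The next cell compares these AUC scores against a baseline model, but compare_df is not constructed in this notebook. A minimal sketch of how it could be assembled, assuming the baseline's per-label scores were saved elsewhere with the same 'label'/'auc' columns (the pickle filename below is hypothetical):
In [ ]:
# hypothetical file produced by a separate baseline notebook
baseline_score_df = pd.read_pickle('baseline_score_df.pkl')
compare_df = pd.concat([score_df.assign(category='deep'),
                        baseline_score_df.assign(category='baseline')],
                       ignore_index=True)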
In [73]:
pivot = compare_df.pivot(index='label', columns='category', values='auc')
pivot['winner'] = pivot.apply(lambda x: 'deep' if x.deep > x.baseline else 'baseline', axis=1)
pivot['abs diff'] = pivot.apply(lambda x: abs(x.deep - x.baseline), axis=1)
pivot['label count'] = [c[x] for x in pivot.index.values]
pivot.sort_values(by=['label count'], ascending=False)
Out[73]:
In [96]:
pred = tcl.predict(val_df.text.iloc[1])
pred
Out[96]:
In [97]:
val_df.labels.iloc[1]
Out[97]:
In [98]:
tcl.data.classes[torch.argmax(pred[1]).item()]
Out[98]:
In [100]:
pred_proba = [(v,k) for k, v in zip(tcl.data.classes, pred[2].data.tolist())]
pred_proba.sort(reverse=True)
pred_proba
Out[100]:
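To turn these per-label probabilities into label suggestions, one simple option is a fixed probability cutoff. A minimal sketch; the 0.5 threshold is an arbitrary choice, not tuned here:
In [ ]:
threshold = 0.5  # arbitrary cutoff for illustration
suggested_labels = [label for proba, label in pred_proba if proba >= threshold]
suggested_labels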