In this notebook, we show how to feed the embeddings from the language model into an MLP classifier. We take the GitHub repo kubernetes/kubernetes as an example, perform transfer learning, and show the results.
combined_sig_df.pkl https://storage.googleapis.com/issue_label_bot/notebook_files/combined_sig_df.pkl
This file contains the GitHub issue contents: titles, bodies, and labels.
feat_df.csv https://storage.googleapis.com/issue_label_bot/notebook_files/feat_df.csv
This file contains 1600-dimensional embeddings of the 14390 issues from kubernetes/kubernetes.
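To reproduce the notebook locally, both files can be downloaded into the working directory first. A minimal sketch using the standard library (the target filenames are assumptions that match the read calls below):

# Fetch the two data files into the current working directory.
# Assumes network access; filenames match the pd.read_* calls below.
import urllib.request

base = 'https://storage.googleapis.com/issue_label_bot/notebook_files/'
for fname in ('combined_sig_df.pkl', 'feat_df.csv'):
    urllib.request.urlretrieve(base + fname, fname)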
In [1]:
import pandas as pd
combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
feat_df = pd.read_csv('feat_df.csv')
In [2]:
# github issue contents
combined_sig_df.head(3)
Out[2]:
In [3]:
# embeddings of github issues [mean, max]
feat_df.head(3)
Out[3]:
In [4]:
# count the labels in the holdout set
from collections import Counter
c = Counter()
for row in combined_sig_df[combined_sig_df.part == 6].labels:
    c.update(row)
Split the data into two sets according to the part column.
There are 28 labels in total because 28 SIG labels have at least 30 issues each; this preprocessing is done in the EvaluateEmbeddings notebook.
In [5]:
train_mask = combined_sig_df.part != 6
holdout_mask = ~train_mask
In [6]:
X = feat_df[train_mask].values
label_columns = [x for x in combined_sig_df.columns if 'sig/' in x]
y = combined_sig_df[label_columns][train_mask].values
print(X.shape)
print(y.shape)
In [7]:
X_holdout = feat_df[holdout_mask].values
y_holdout = combined_sig_df[label_columns][holdout_mask].values
print(X_holdout.shape)
print(y_holdout.shape)
In [8]:
from sklearn.metrics import roc_auc_score
def calculate_auc(predictions):
    # per-label AUC on the holdout set, plus an average weighted by label frequency
    auc_scores = []
    counts = []
    for i, l in enumerate(label_columns):
        y_hat = predictions[:, i]
        y = y_holdout[:, i]
        auc = roc_auc_score(y_true=y, y_score=y_hat)
        auc_scores.append(auc)
        counts.append(c[l])
    df = pd.DataFrame({'label': label_columns, 'auc': auc_scores, 'count': counts})
    display(df)
    weightedavg_auc = df.apply(lambda x: x.auc * x['count'], axis=1).sum() / df['count'].sum()
    print(f'Weighted Average AUC: {weightedavg_auc}')
    return df, weightedavg_auc
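The weighted average reported by this function weights each label's AUC by its holdout count $n_l$ (taken from the Counter c):

$$\mathrm{AUC}_{\text{weighted}} = \frac{\sum_l n_l \cdot \mathrm{AUC}_l}{\sum_l n_l}$$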
In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
In [10]:
mlp = MLPClassifier(early_stopping=True, n_iter_no_change=5, max_iter=500, solver='adam',
                    random_state=1234)
In [11]:
mlp.fit(X, y)
Out[11]:
In [12]:
mlp_predictions = mlp.predict_proba(X_holdout)
In [13]:
mlp_df, mlp_auc = calculate_auc(mlp_predictions)
In [14]:
import numpy as np
In [15]:
def calculate_max_range_count(x):
    # histogram of the max predicted probability per issue, in 0.1-wide bins
    max_range_count = [0] * 11  # [0,0.1), [0.1,0.2), ... , [0.9,1), [1,1]
    for i in x:
        max_range_count[int(max(i) // 0.1)] += 1
    thresholds_lower = [0.1 * i for i in range(11)]
    thresholds_upper = [0.1 * (i+1) for i in range(10)] + [1]
    df = pd.DataFrame({'lower': thresholds_lower, 'upper': thresholds_upper, 'count': max_range_count})
    display(df)
    return df, max_range_count
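The same decile counts can also be computed without the Python loop. A vectorized sketch with numpy, behaviorally equivalent to the loop above (including its floor-division binning):

# bin the per-issue max probability into 11 buckets in one shot
max_probs = mlp_predictions.max(axis=1)
max_range_count = np.bincount((max_probs // 0.1).astype(int), minlength=11)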
In [16]:
_, _ = calculate_max_range_count(mlp_predictions)
In [17]:
def calculate_result(y_true, y_pred, threshold=0.0):
    # per-label precision and recall at a given probability threshold
    total_true = np.array([0] * len(y_pred[0]))
    total_pred_true = np.array([0] * len(y_pred[0]))
    pred_correct = np.array([0] * len(y_pred[0]))
    for i in range(len(y_pred)):
        y_true_label = np.where(y_true[i] == 1)[0]
        total_true[y_true_label] += 1
        y_pred_true = np.where(y_pred[i] >= threshold)[0]
        total_pred_true[y_pred_true] += 1
        for j in y_true_label:
            if j in y_pred_true:
                pred_correct[j] += 1
    # labels never predicted at this threshold produce a 0/0, i.e. NaN precision
    df = pd.DataFrame({'label': label_columns,
                       'precision': pred_correct / total_pred_true,
                       'recall': pred_correct / total_true})
    print(f'Threshold: {threshold}')
    display(df)
    return df, (pred_correct / total_pred_true), (pred_correct / total_true)
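As a cross-check, the per-label numbers from calculate_result at a fixed threshold should agree with scikit-learn's multilabel metrics on the binarized predictions. A minimal sketch, assuming a 0.5 threshold (the zero_division parameter needs scikit-learn >= 0.22; where calculate_result yields NaN for never-predicted labels, this yields 0):

from sklearn.metrics import precision_score, recall_score

# binarize the probabilities at the chosen threshold, then score per label
y_pred_binary = (mlp_predictions >= 0.5).astype(int)
prec = precision_score(y_holdout, y_pred_binary, average=None, zero_division=0)
reca = recall_score(y_holdout, y_pred_binary, average=None, zero_division=0)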
In [18]:
_, _, _ = calculate_result(y_holdout, mlp_predictions, threshold=0.0)
In [19]:
_, _, _ = calculate_result(y_holdout, mlp_predictions, threshold=0.3)
In [20]:
_, _, _ = calculate_result(y_holdout, mlp_predictions, threshold=0.5)
In [21]:
_, _, _ = calculate_result(y_holdout, mlp_predictions, threshold=0.7)
In [22]:
# params = {'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)],
#           'alpha': [.001, .01, .1, 1, 10],
#           'learning_rate': ['constant', 'adaptive'],
#           'learning_rate_init': [.001, .01, .1]}
params = {'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)],
          'alpha': [.001],
          'learning_rate': ['adaptive'],
          'learning_rate_init': [.001]}
mlp_clf = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)
gscvmlp = GridSearchCV(mlp_clf, params, cv=5, n_jobs=-1)
gscvmlp.fit(X, y)
Out[22]:
In [23]:
print(f'The best model from grid search is:\n=====================================\n{gscvmlp.best_estimator_}')
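GridSearchCV also exposes the winning hyperparameters and their mean cross-validated score directly, which can be easier to scan than the full estimator repr:

# attributes provided by GridSearchCV after fit
print(gscvmlp.best_params_)
print(gscvmlp.best_score_)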
In [24]:
mlp_tuned_predictions = gscvmlp.predict_proba(X_holdout)
In [25]:
mlp_tuned_df, mlp_tuned_auc = calculate_auc(mlp_tuned_predictions)
In [26]:
_, _ = calculate_max_range_count(mlp_tuned_predictions)
In [27]:
_, _, _ = calculate_result(y_holdout, mlp_tuned_predictions, threshold=0.0)
In [28]:
_, _, _ = calculate_result(y_holdout, mlp_tuned_predictions, threshold=0.7)
In [29]:
import dill as dpickle
In [30]:
with open('mlp_k8s.dpkl', 'wb') as f:
    dpickle.dump(gscvmlp, f)
In [31]:
import dill as dpickle
In [32]:
with open('mlp_k8s.dpkl', 'rb') as f:
    gscvmlp = dpickle.load(f)
In [33]:
mlp_tuned_predictions = gscvmlp.predict_proba(X_holdout)
In [34]:
mlp_tuned_df, mlp_tuned_auc = calculate_auc(mlp_tuned_predictions)
In [35]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from collections import Counter
import dill as dpickle
import numpy as np
import pandas as pd
In [41]:
class MLP:
    """Wrap an MLPClassifier with evaluation, threshold search, and (de)serialization helpers."""
    def __init__(self,
                 counter,  # label counts from the holdout set, used to weight the AUC
                 label_columns,
                 activation='relu',
                 alpha=0.0001,
                 early_stopping=True,
                 epsilon=1e-08,
                 hidden_layer_sizes=(100,),
                 learning_rate='constant',
                 learning_rate_init=0.001,
                 max_iter=500,
                 model_file="model.dpkl",
                 momentum=0.9,
                 n_iter_no_change=5,
                 precision_thre=0.7,
                 prob_thre=0.0,
                 random_state=1234,
                 recall_thre=0.5,
                 solver='adam',
                 validation_fraction=0.1):
        self.clf = MLPClassifier(activation=activation,
                                 alpha=alpha,
                                 early_stopping=early_stopping,
                                 epsilon=epsilon,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 learning_rate=learning_rate,
                                 learning_rate_init=learning_rate_init,
                                 max_iter=max_iter,
                                 momentum=momentum,
                                 n_iter_no_change=n_iter_no_change,
                                 random_state=random_state,
                                 solver=solver,
                                 validation_fraction=validation_fraction)
        self.model_file = model_file
        self.precision_thre = precision_thre
        self.prob_thre = prob_thre
        self.recall_thre = recall_thre
        self.counter = counter
        self.label_columns = label_columns
        self.precision = None
        self.recall = None
        self.exclusion_list = None

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def calculate_auc(self, y_holdout, predictions):
        auc_scores = []
        counts = []
        for i, l in enumerate(self.label_columns):
            y_hat = predictions[:, i]
            y = y_holdout[:, i]
            auc = roc_auc_score(y_true=y, y_score=y_hat)
            auc_scores.append(auc)
            counts.append(self.counter[l])
        df = pd.DataFrame({'label': self.label_columns, 'auc': auc_scores, 'count': counts})
        display(df)
        weightedavg_auc = df.apply(lambda x: x.auc * x['count'], axis=1).sum() / df['count'].sum()
        print(f'Weighted Average AUC: {weightedavg_auc}')
        return df, weightedavg_auc

    def calculate_max_range_count(self, prob):
        thresholds_lower = [0.1 * i for i in range(11)]
        thresholds_upper = [0.1 * (i+1) for i in range(10)] + [1]
        max_range_count = [0] * 11  # [0,0.1), [0.1,0.2), ... , [0.9,1), [1,1]
        for i in prob:
            max_range_count[int(max(i) // 0.1)] += 1
        df = pd.DataFrame({'lower': thresholds_lower, 'upper': thresholds_upper, 'count': max_range_count})
        display(df)
        return df, max_range_count

    def calculate_result(self, y_true, y_pred, display_table=True, prob_thre=0.0):
        self.prob_thre = prob_thre  # remember the threshold so the printout matches the computation
        total_true = np.array([0] * len(y_pred[0]))
        total_pred_true = np.array([0] * len(y_pred[0]))
        pred_correct = np.array([0] * len(y_pred[0]))
        for i in range(len(y_pred)):
            y_true_label = np.where(y_true[i] == 1)[0]
            total_true[y_true_label] += 1
            y_pred_true = np.where(y_pred[i] >= self.prob_thre)[0]
            total_pred_true[y_pred_true] += 1
            for j in y_true_label:
                if j in y_pred_true:
                    pred_correct[j] += 1
        self.precision = pred_correct / total_pred_true
        self.recall = pred_correct / total_true
        df = pd.DataFrame({'label': self.label_columns, 'precision': self.precision, 'recall': self.recall})
        if display_table:
            print(f'Threshold: {self.prob_thre}')
            display(df)
        return df, self.precision, self.recall

    def find_best_prob_thre(self, y_true, y_pred):
        best_prob_thre = 0
        prec_count = 0
        reca_count = 0
        print(f'Precision threshold: {self.precision_thre}\nRecall threshold: {self.recall_thre}')
        for i in range(10):  # scan thresholds 0.0, 0.1, ..., 0.9 without float drift
            thre = i / 10
            _, prec, reca = self.calculate_result(y_true, y_pred, display_table=False, prob_thre=thre)
            pc = sum(1 for p in prec if p >= self.precision_thre)
            rc = sum(1 for r in reca if r >= self.recall_thre)
            if pc > prec_count or (pc == prec_count and rc >= reca_count):
                best_prob_thre = thre
                prec_count = pc
                reca_count = rc
        self.best_prob_thre = best_prob_thre
        print(f'Best probability threshold: {best_prob_thre},\n'
              f'{min(prec_count, reca_count)} labels meet both the precision and recall thresholds')

    def get_exclusion_list(self):
        # labels whose precision or recall fell below the configured thresholds
        assert len(self.precision) == len(self.recall)
        self.exclusion_list = []
        for p, r, label in zip(self.precision, self.recall, self.label_columns):
            if p < self.precision_thre or r < self.recall_thre:
                self.exclusion_list.append(label)
        return self.exclusion_list

    def grid_search(self, params, cv=5, n_jobs=-1):
        self.clf = GridSearchCV(self.clf, params, cv=cv, n_jobs=n_jobs)

    def save_model(self):
        with open(self.model_file, 'wb') as f:
            dpickle.dump(self.clf, f)

    def load_model(self):
        with open(self.model_file, 'rb') as f:
            self.clf = dpickle.load(f)
In [42]:
c = Counter()
for row in combined_sig_df[combined_sig_df.part == 6].labels:
    c.update(row)
In [43]:
clf = MLP(c, label_columns, early_stopping=True, n_iter_no_change=5, max_iter=500,
          solver='adam', random_state=1234, precision_thre=0.7, recall_thre=0.3)
clf.fit(X, y)
mlp_predictions = clf.predict_proba(X_holdout)
mlp_df, mlp_auc = clf.calculate_auc(y_holdout, mlp_predictions)
In [44]:
_, _ = clf.calculate_max_range_count(mlp_predictions)
In [45]:
_, _, _ = clf.calculate_result(y_holdout, mlp_predictions)
In [46]:
clf.find_best_prob_thre(y_holdout, mlp_predictions)
In [47]:
_, _, _ = clf.calculate_result(y_holdout, mlp_predictions, prob_thre=0.7)
In [48]:
clf.get_exclusion_list()
Out[48]:
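One hypothetical way to use the exclusion list at serving time is to suppress the unreliable labels before emitting predictions; the mask below is an illustration, not part of the class:

# keep only the columns for labels that met both thresholds
keep = [label not in clf.exclusion_list for label in label_columns]
reliable_predictions = mlp_predictions[:, keep]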
In [42]:
params = {'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)],
          'alpha': [.001],
          'learning_rate': ['adaptive'],
          'learning_rate_init': [.001]}
clf.grid_search(params, cv=5, n_jobs=-1)
clf.fit(X, y)
mlp_predictions = clf.predict_proba(X_holdout)
mlp_df, mlp_auc = clf.calculate_auc(y_holdout, mlp_predictions)
In [43]:
clf.save_model()
clf.load_model()
mlp_predictions = clf.predict_proba(X_holdout)
mlp_df, mlp_auc = clf.calculate_auc(y_holdout, mlp_predictions)
In [51]:
new_clf = MLP(c, label_columns)
new_clf.load_model()
mlp_predictions = new_clf.predict_proba(X_holdout)
mlp_df, mlp_auc = new_clf.calculate_auc(y_holdout, mlp_predictions)
In [52]:
_, _ = new_clf.calculate_max_range_count(mlp_predictions)
In [53]:
_, _, _ = new_clf.calculate_result(y_holdout, mlp_predictions)