In [14]:
import os
# so I don't accidentally use the GPU for inference
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="-1" #prevent access to GPU for inference
import pandas as pd
pd.options.display.max_rows = 150
pd.options.display.max_colwidth = 500
import json
from mdparse import transform_pre_rules, compose
from pathlib import Path
from tqdm import tqdm_notebook
In [2]:
import torch
assert not torch.cuda.is_available()
See the query in GCP BigQuery Console
In [2]:
df = pd.read_csv(f'https://storage.googleapis.com/issue_label_bot/kubeflow_issues/000000000000.csv')
# filter for kubeflow/kubeflow
kfdf = df[df.repo.apply(lambda x: x.split('/')[1] =='kubeflow')]
Flatten list of labels
In [298]:
# unpack the lists of labels and flatten
def unpack_list(x):
    "Convert a list serialized as a string into a Python list."
    if x == '':
        # return a one-element list so downstream extend()/Counter.update() calls
        # add a single 'no_labels' item instead of individual characters
        return ['no_labels']
    else:
        return json.loads(x)
#flatten lists
labels = []
label_series = kfdf.labels.apply(lambda x: unpack_list(x))
for x in label_series:
labels.extend(x)
Top 10 / Bottom 10 labels
In [299]:
label_counts = pd.DataFrame({'labels': labels}).labels.value_counts()
display(label_counts.head(10))
display(label_counts.tail(10))
In [402]:
#Borrowed this from nb 2
def process_dict(dfdict, _):
"""process the data, but allow failure."""
t = compose(transform_pre_rules)
title = dfdict['title']
body = dfdict['body']
    try:
        text = 'xxxfldtitle ' + t(title) + ' xxxfldbody ' + t(body)
    except Exception:
        # parsing can fail (for example on a missing/NaN body); skip those issues
        return None
return {'url': dfdict['url'], 'text':text}
In [403]:
processed_issue_texts = [process_dict(x, 0) for x in kfdf.to_dict(orient='records')]
processed_issue_texts[:2]
Out[403]:
Note: you can export a lightweight learner for inference per https://docs.fast.ai/tutorial.inference.html.
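A minimal sketch of that export/load round-trip, assuming a trained learner named learn already exists and that 'lang_model_onecycle/' holds the exported file; the calls are left commented because this mirrors what the learn.export() and load_learner(path) cells further down actually run:
In [ ]:
from fastai.basic_train import load_learner

# learn.export()                                   # writes export.pkl next to the model weights
# learner = load_learner('lang_model_onecycle/')   # reloads a lightweight learner (no training data)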
In [15]:
pd.read_csv('lang_model_onecycle_resume/history.csv')
Out[15]:
In [16]:
pd.read_csv('lang_model_onecycle/history.csv')
Out[16]:
In [10]:
from fastai.text.models import AWD_LSTM
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.learner import language_model_learner
from fastai.basic_train import load_learner
path = Path('lang_model_onecycle/')
In [11]:
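# (assumption) the exported learner/DataBunch pickle references a function named
# pass_through, so it has to exist in this namespace before loading the model below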
def pass_through(x):
return x
You no longer need to execute the cell below.
In [5]:
data_lm = load_data(path, bs=128)
learn = language_model_learner(data=data_lm,
arch=AWD_LSTM,
pretrained=False)
learn.load('bestmodel')
learn.export()
Out[5]:
In [1]:
# previous Loss: [3.390915, tensor(0.3917)] for langmodel_onecycle_resume
In [6]:
learn.validate()
Out[6]:
In [13]:
learn.model
Out[13]:
In [12]:
learn = load_learner(path)
In [213]:
learn.model.reset() # so the hidden states reset between predictions
_ = learn.model.eval() # turn off dropout, etc. only need to do this after loading model.
The fastai encoder produces a tuple of two lists, raw_outputs and outputs (see the fastai AWD_LSTM source for reference). raw_outputs are the hidden states emitted for each element of the sequence, without dropout applied. Because dropout is turned off during inference with .eval(), it doesn't matter which one you use; they will be identical (if they are not, that is a bug).
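A minimal sketch that checks this claim on a short dummy string (it assumes the learner loaded above; the dummy text is just a placeholder):
In [ ]:
# with the model in eval mode, every layer's raw_output should equal its output
_x, _ = learn.data.one_item('xxxfldtitle test issue xxxfldbody test body')
learn.model.reset()
_raw, _out = learn.model[0].forward(_x)
assert all(torch.allclose(r, o) for r, o in zip(_raw, _out))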
In [14]:
ex = processed_issue_texts[0]['text']
print(ex)
In [15]:
ex_numericalized_x, ex_numericalized_y = learn.data.one_item(ex)
ex_numericalized_x
Out[15]:
The next two output tensors should be identical; this tests that the model state is being reset correctly between predictions (a programmatic version of this check follows the two cells below).
In [16]:
encoder = learn.model[0]
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)
In [17]:
learn.model.reset()
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)
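The same check expressed as an assertion rather than an eyeball comparison, using the objects defined above:
In [ ]:
# resetting the hidden state should make repeated forward passes on the same
# input produce identical representations
learn.model.reset()
rep_a = encoder.forward(ex_numericalized_x)[-1][-1]
learn.model.reset()
rep_b = encoder.forward(ex_numericalized_x)[-1][-1]
assert torch.allclose(rep_a, rep_b)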
Numericalized data
In [404]:
from tqdm import tqdm_notebook
In [405]:
# index into [0] b/c we don't care about the y value.
num_x = []
for x in tqdm_notebook(processed_issue_texts, total=len(processed_issue_texts)):
    num_x.extend(learn.data.one_item(x['text'])[0])
In [406]:
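# for each issue, keep the last LSTM layer's output: a (seq_len, 400) tensor of
# per-token hidden states (400 = the emb_sz of the default fastai AWD_LSTM config)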
reps=[]
for x in tqdm_notebook(num_x, total=len(num_x)):
encoder.reset()
reps.extend(encoder.forward(x[None, :])[-1][-1])
In [407]:
from typing import List
class IssueRepresentation:
    def __init__(self, tensor: torch.Tensor):
self.tensor=tensor
@property
def mean(self):
return torch.mean(self.tensor, 0)
@property
def max(self):
return torch.max(self.tensor, 0)[0]
@property
def last(self):
return self.tensor[-1,:]
@property
def concat(self):
return torch.cat([self.mean, self.max, self.last])
class IssueRepresentation_List:
    def __init__(self, irl: List[torch.Tensor]):
self.irl = [IssueRepresentation(x) for x in irl]
@property
def mean(self):
return torch.stack([x.mean for x in self.irl])
@property
def max(self):
return torch.stack([x.max for x in self.irl])
@property
def last(self):
return torch.stack([x.last for x in self.irl])
@property
def concat(self):
return torch.stack([x.concat for x in self.irl])
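For reference, a quick sketch (on a dummy tensor, not the real issue data) of what each pooling strategy returns:
In [ ]:
# hypothetical example: a "sequence" of 5 hidden states, each 400-dim
dummy = IssueRepresentation(torch.randn(5, 400))
print(dummy.mean.shape, dummy.max.shape, dummy.last.shape)  # three 400-dim vectors
print(dummy.concat.shape)  # 1200-dim: mean, max, and last concatenated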
In [408]:
irl = IssueRepresentation_List(reps)
In [409]:
import pickle as pkl
with open('irl.pkl', 'wb') as f:
pkl.dump(irl, f)
In [410]:
from IPython.display import display, Markdown, HTML
In [411]:
import pickle as pkl
with open('irl.pkl', 'rb') as f:
irl = pkl.load(f)
In [412]:
# boolean mask marking issues that have no labels
candidates_to_label = torch.tensor((kfdf.labels == '[]').values)
print(f'{candidates_to_label.sum()} issues w/o labels out of {len(kfdf)} total issues.')
In [413]:
no_label_reps = irl.concat[candidates_to_label]
label_reps = irl.concat[~candidates_to_label]
assert (no_label_reps.shape[0] + label_reps.shape[0]) == len(kfdf)
In [414]:
label_mask = kfdf.labels != '[]'
labeled_df = kfdf[label_mask].reset_index(drop=True)
no_label_df = kfdf[~label_mask].reset_index(drop=True)
assert len(labeled_df) + len(no_label_df) == len(kfdf)
In [415]:
from torch.nn import CosineSimilarity

class oneshotlabeler:
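    """Suggest labels for an un-labeled issue by finding its nearest labeled
    neighbor under cosine similarity of the issue representation vectors."""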
def __init__(self, vecs, refdf):
assert vecs.shape[0] == len(refdf)
self.vecs = vecs
self.refdf = refdf.reset_index(drop=True)
self.cs = CosineSimilarity()
def query(self, vec):
assert vec.ndim == 1
        sims = self.cs(vec.unsqueeze(0), self.vecs)
idxs = sims.argsort(descending=True)
ranked_sims = sims[idxs]
closest_idx = idxs[0].item()
ref_issue = self.refdf.iloc[closest_idx]
msg = []
msg.append(f'\n## Prediction:\n')
msg.append(f'**Predicted Labels**: {json.loads(ref_issue.labels)}\n')
msg.append(f'**Cosine similarity (0-1)**: {ranked_sims[0]:.2f}\n')
        msg.append(f'**Closest Issue URL**: {ref_issue.url}\n')
msg.append(f'**Closest Issue Title**: {ref_issue.title}\n')
msg.append(f'**Closest Issue Body**:\n {ref_issue.body[:600]}')
display(Markdown('\n'.join(msg)))
def random_prediction(self, no_label_df, no_label_vec):
assert len(no_label_df) == no_label_vec.shape[0]
sample = no_label_df.sample(1)
idx = sample.index.values[0]
msg = []
msg.append(f'\n## Un-Labeled Target Issue To Predict:\n')
msg.append(f'**Title:** {sample.title.values[0]}\n')
msg.append(f'**Body:**\n {sample.body.values[0][:600]}\n')
msg.append(f'**URL:** {sample.url.values[0]}')
display(Markdown('\n'.join(msg)))
self.query(no_label_vec[idx, :])
In [416]:
assert len(no_label_df) == no_label_reps.shape[0]
In [417]:
ol = oneshotlabeler(vecs=label_reps,
refdf = labeled_df)
In [449]:
ol.random_prediction(no_label_df=no_label_df,
no_label_vec=no_label_reps)
In [419]:
from collections import Counter
label_counter = Counter()
df['labels_unpacked'] = df.labels.apply(lambda x: unpack_list(x))
for labels in df.labels_unpacked:
label_counter.update(labels)
labels_to_keep = {x:label_counter[x] for x in label_counter if label_counter[x] >= 20}
See the labels that occur at least 20 times.
In [420]:
display(labels_to_keep)
print(f' Number of labels: {len(labels_to_keep)}')
label_set = set(labels_to_keep.keys())
In [456]:
h_labeled_df = df[df.labels_unpacked.apply(lambda x: len(set(x).intersection(label_set)) > 0)]
h_labeled_df.shape
Out[456]:
Only retain labels that occur at least 20 times
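A toy illustration of the intersection-based filtering used here (made-up labels, not the real data):
In [ ]:
toy_label_set = {'bug', 'question'}
print({'bug', 'area/jupyter'} & toy_label_set)  # {'bug'}  -> the issue is kept
print({'area/jupyter'} & toy_label_set)         # set()    -> the issue was filtered out above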
In [477]:
h_labeled_df['final_labels'] = h_labeled_df.labels_unpacked.apply(lambda x: set(x).intersection(label_set))
In [774]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
In [775]:
ohe_labels = mlb.fit_transform(h_labeled_df.final_labels.values.tolist())
display(ohe_labels)
print(ohe_labels.shape)
In [776]:
mlb.classes_
Out[776]:
In [401]:
# use h_labeled_df so the rows line up with ohe_labels and repo_indicators
cleaned_text = [process_dict(x, 0) for x in h_labeled_df.to_dict(orient='records')]
vecs = []
encoder = learn.model[0]
encoder.eval()
for x in tqdm_notebook(cleaned_text, total=len(cleaned_text)):
    # transform the processed text into integer ids
    x = learn.data.one_item(x['text'])[0]
    # forward pass through the encoder
    encoder.reset()
    vecs.extend(encoder.forward(x)[-1][-1])
In [451]:
vec_list = IssueRepresentation_List(vecs)
In [459]:
latent_features = vec_list.concat
display(latent_features)
print(latent_features.shape)
In [509]:
print(f'There are {h_labeled_df.repo.nunique()} repos in the dataset with labels')
In [773]:
mlb_repos = MultiLabelBinarizer()
repo_indicators = mlb_repos.fit_transform([[x] for x in h_labeled_df.repo.values.tolist()])
display(repo_indicators)
repo_indicators.shape
Out[773]:
In [504]:
import numpy as np
feature_arr = np.concatenate([repo_indicators, latent_features], axis = 1)
feature_arr.shape
Out[504]:
In [738]:
! pip install -U scikit-learn
In [744]:
import sklearn
sklearn.__version__
Out[744]:
In [746]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
etc = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=2, bootstrap=False, n_jobs=-1)
knn = KNeighborsClassifier(n_neighbors=2, weights='distance', metric='cosine', n_jobs=-1)
rc = RidgeClassifierCV(alphas=[.1, .5, 5, 10, 50, 100], normalize=True, store_cv_values=True)
In [845]:
mlp = MLPClassifier(alpha=.01,
hidden_layer_sizes=(500,),
learning_rate='adaptive',
learning_rate_init=.1,
early_stopping=True,
validation_fraction=.25)
In [846]:
clf = mlp
In [847]:
pred_index = 1
In [848]:
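# leave-one-out style sanity check: fit on every row except `pred_index`,
# then predict on that held-out row in the next cell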
clf.fit(X=np.delete(feature_arr, pred_index, axis=0),
y=np.delete(ohe_labels, pred_index, axis=0))
Out[848]:
In [849]:
preds = clf.predict_proba(feature_arr[None, pred_index])
ground_truth = ohe_labels[pred_index, :] == 1
for g,p,c in zip(ground_truth.tolist(), preds[0, :].tolist(), mlb.classes_.tolist()):
if g:
print('***', p,' ', c)
else:
print(p,' ', c)
In [850]:
clf.predict_proba(feature_arr[None, pred_index]).shape
Out[850]:
In [872]:
drdf = pd.DataFrame(feature_arr)
drdf.columns = ['f_' + str(x) for x in drdf.columns.tolist()]  # prefix the feature columns first
drdf['target'] = ohe_labels[:, 0]  # add the target after prefixing so it keeps the name 'target'
In [873]:
drdf.to_csv('drdf_test.csv')
In [865]:
feature_arr.shape
Out[865]:
In [875]:
ohe_labels[:, 0].sum()
Out[875]:
In [884]:
raw_textdrdf = pd.DataFrame({'text': [x['text'] for x in cleaned_text], 'target': ohe_labels[:, 0]})
In [885]:
raw_textdrdf.head()
Out[885]:
In [886]:
raw_textdrdf.to_csv('raw_textdrdf.csv', index=False)
In [887]:
tempdf = pd.DataFrame(feature_arr)
tempdf.columns = ['f_'+ str(x) for x in tempdf.columns.tolist()]
In [892]:
drdf_concat = pd.concat([raw_textdrdf[['text']], tempdf, pd.DataFrame({'target': ohe_labels[:, 0]})], axis=1)
In [893]:
drdf_concat.to_csv('drdf_concat.csv')
In [895]:
drdf_concat.columns
Out[895]:
In [17]:
mlb.classes_