In [14]:
import os

# so I don't 
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="-1"  #prevent access to GPU for inference

import pandas as pd
pd.options.display.max_rows = 150
pd.options.display.max_colwidth = 500
import json
from mdparse import transform_pre_rules, compose
from pathlib import Path
from tqdm import tqdm_notebook

In [2]:
import torch
assert not torch.cuda.is_available()


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-2-83338dcfe66a> in <module>
      1 import torch
----> 2 assert not torch.cuda.is_available()

AssertionError: 

Look at Data For Kubeflow/Kubeflow

Get Kubeflow Data

See the query in GCP BigQuery Console


In [2]:
df = pd.read_csv(f'https://storage.googleapis.com/issue_label_bot/kubeflow_issues/000000000000.csv')
# filter for kubeflow/kubeflow
kfdf = df[df.repo.apply(lambda x: x.split('/')[1] =='kubeflow')]

Flatten list of labels


In [298]:
# unpack the lists of labels and flatten
def unpack_list(x):
    "convert list as string into list."
    if x == '':
        return 'no_labels'
    else:
        return json.loads(x)

#flatten lists
labels = []
label_series = kfdf.labels.apply(lambda x: unpack_list(x))
for x in label_series:
    labels.extend(x)

Top 10 / Bottom 10 labels


In [299]:
label_counts = pd.DataFrame({'labels': labels}).labels.value_counts()
display(label_counts.head(10))
display(label_counts.tail(10))


priority/p1           534
priority/p2           148
area/jupyter          142
platform/gcp          128
area/kfctl            114
release/0.3.0          98
community/question     96
area/0.4.0             90
area/bootstrap         83
priority/p0            62
Name: labels, dtype: int64
platform/minikf           1
p1-important              1
platform/aws              1
area/openvino             1
area/centraldashbosard    1
area/design               1
cloud/azure               1
approved                  1
area/horovod              1
area/chainer              1
Name: labels, dtype: int64

In [402]:
#Borrowed this from nb 2
def process_dict(dfdict, _):
    """process the data, but allow failure."""
    t = compose(transform_pre_rules)
    title = dfdict['title']
    body = dfdict['body']
    try:
        text = 'xxxfldtitle '+ t(title) + ' xxxfldbody ' + t(body)
    except:
        return None
    return {'url': dfdict['url'], 'text':text}

In [403]:
processed_issue_texts = [process_dict(x, 0) for x in kfdf.to_dict(orient='rows')]
processed_issue_texts[:2]


Out[403]:
[{'url': '"https://github.com/kubeflow/kubeflow/issues/574"',
  'text': "xxxfldtitle tfjobs ui doesn't work behind iap; react app needs support iap? xxxfldbody tfjobs ui is deployed on dev.kubeflow.org. \\ r \\ r the ui shows up behind iap but its doesn't work \\ r - no tfjobs are listed \\ r - creating a job via the ui doesn't work. \\ r \\ r looking at the developer console we see requests to \\ r \\ r \\ r *URL* xxxlnkhb accounts.google.com xxxlnkhe \\ r \\ r which suggests to me the request is hitting the loadbalancer and being directed to do auth verification to sign in and its getting rejected. \\ r \\ r so i think one of two things is happening \\ r \\ r 1. the request is coming from the server running in k8s and incorrectly being redirected to the external loadbalncer and thus hitting iap when it shouldn't be \\ r 1. the request is coming from the client and the client needs to be updated to support iap. \\ r \\ r xxxatmention do you know where the request is coming from? \\ r \\ r you should be able to access it at \\ r *URL* xxxlnkhb dev.kubeflow.org xxxlnkhe"},
 {'url': '"https://github.com/kubeflow/kubeflow/issues/950"',
  'text': "xxxfldtitle gcp cluster-kubeflow.yaml isn't tested xxxfldbody this is the recommended dm and bootstrapper config for gke deployments. \\ r *URL* xxxlnkhb github.com xxxlnkhe \\ r it doesn't like that yaml file is used by our e2e tests because it wasn't updated to specify the registry when that change was made to \\ r *URL* xxxlnkhb github.com xxxlnkhe \\ r there is also another gcp bootstrapper config in that directory \\ r \\ r our e2e tests are using this dm config \\ r *URL* xxxlnkhb github.com xxxlnkhe \\ r we do need to make changes to the dm config in order to pull the registry at head. \\ r \\ r we have two options \\ r \\ r 1. we could create a separate test for *URL* xxxlnkhb github.com xxxlnkhe 1. we could make the necessary modifications to *URL* xxxlnkhb github.com xxxlnkhe \\ r we could use a simple shell script that runs jq to make the necessary field changes \\ r \\ r i prefer 2. if we use jq then we could update the instructions *URL* xxxlnkhb github.com xxxlnkhe to use the same jq commands. \\ r \\ r xxxatmention thoughts?"}]

Read in Model

Notes: you can export a lightweight learner for inference per https://docs.fast.ai/tutorial.inference.html


In [15]:
pd.read_csv('lang_model_onecycle_resume/history.csv')


Out[15]:
epoch train_loss valid_loss accuracy time
0 0 3.701002 3.515886 0.376756 15:39:27
1 1 3.691734 3.512343 0.377202 33:23:22
2 2 3.699855 3.509445 0.377600 15:40:10

In [16]:
pd.read_csv('lang_model_onecycle/history.csv')


Out[16]:
epoch train_loss valid_loss accuracy time
0 0 3.726073 3.523209 0.375969 15:40:15

In [10]:
from fastai.text.models import AWD_LSTM
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.learner import language_model_learner
from fastai.basic_train import load_learner
path = Path('lang_model_onecycle/')

In [11]:
def pass_through(x):
    return x

You don't have to execute the below cell anymore


In [5]:
data_lm = load_data(path, bs=128)

learn = language_model_learner(data=data_lm,
                               arch=AWD_LSTM,
                               pretrained=False)

learn.load('bestmodel')

learn.export()


Out[5]:
LanguageLearner(data=TextLMDataBunch;

Train: LabelList (16385650 items)
x: LMTextList
xxbos xxxfldtitle xxunk throws exception when adding product via xxup api xxxfldbody xxmaj if xxmaj google xxmaj contents xxmaj experiments is enabled , adding a product using the xxxcdb xxup v1 / products / xxxcde endpoint causes xxunk xxxfilepath line 117 to throw xxxcdb invalidargumentexception xxxcde . xxmaj with contents experiments disabled the product add completes successfully . xxmaj example valid request json : 
  xxxcdb " product " : xxxjson 
  xxxcde,xxbos xxxfldtitle xxmaj grafana xxmaj kairosdb xxmaj top n rows xxxfldbody xxmaj when xxmaj grafana pulls the data it shows all the rows returned by the query , so can we please have a query option with something like xxunk ? xxmaj so we can only show limited number of rows on xxmaj grafana unlike now it just gets and shows everything . i have also requested a xxup ui option with xxmaj grafana guys where we can choose the number of rows we want to see under the graph , cheers,xxbos xxxfldtitle xxmaj graph request : mouseover highlight curve . xxxfldbody xxmaj it would be very nice , is mouseover of an item in graph legend ( to the left ) , or mouseover a graphed value ( with checkbox like xxup xxunk ) - would plot that value with double thickness . xxmaj that would make it easy to identify line for each value .,xxbos xxxfldtitle xxmaj graphite template to handle ip - address and hostname xxxfldbody xxmaj are there any plans to improve the graphite template logic ? xxmaj we are sending graphite data that contains xxup ip address and hostname ( xxunk , xxunk , etc ) , since those values have dot 's in them , it makes it difficult to identify tags for those graphite values . xxmaj one other issue we are having is identifying multiple tags in the middle of the graphite string to create a measurement .,xxbos xxxfldtitle xxmaj graphs : xxunk xxxfldbody xxmaj project : xxmaj graphs 
  xxmaj job : xxup uat 
  xxmaj env : xxup uat 
  xxmaj region : fxlabs / xxup us_west_1 
  xxmaj result : fail 
  xxmaj status xxmaj code : 400 
  xxmaj headers : xxunk x - xxup xss - xxmaj protection=[1 ; mode = block ] , xxmaj cache - xxmaj control=[no - cache , no - store , max - age=0 , must - revalidate ] , xxmaj pragma=[no - cache ] , xxmaj expires=[0 ] , x - xxmaj frame - options=[deny ] , xxunk xxmaj transfer - xxmaj encoding=[chunked ] , date=[tue , 02 xxmaj oct 2018 xxunk xxup gmt ] } 
  xxmaj endpoint : * xxup url * xxxlnkhb xxunk xxxlnkhe 
  xxmaj request : 
  xxmaj response : 
  xxxjson 
  xxmaj logs : 
  xxmaj assertion xxxatmention = = 403 ] resolved - to [ 400 = = 403 ] result [ xxmaj failed ] --- xxup fx xxmaj bot ---
y: LMLabelList
,,,,
Path: lang_model;

Valid: LabelList (1862119 items)
x: LMTextList
xxbos xxxfldtitle xxmaj gopi xxmaj episode 98 xxxfldbody xxmaj gopi xxmaj episode 98 xxxhtml * xxup url * xxxlnkhb ift.tt xxxlnkhe xxxhtml via xxmaj juragan xxmaj sinopsis * xxup url * xxxlnkhb ift.tt xxxlnkhe xxxhtml xxmaj december 17 , 2016 at xxup xxunk,xxbos xxxfldtitle xxmaj grabber with default xxunk model xxxfldbody xxxhm xxmaj overview 
  i want to use the default xxunk model ( loaded via dll ) to work with the grabber script . xxmaj the xxunk - demo uses xxunk controller block - shaped prefabs to add the grabber script , but i would like to actually use the default models . xxmaj how would i go about doing so ? 
  xxxhm xxmaj unity xxmaj editor xxmaj version 
  xxunk 
  xxxhm xxmaj mixed xxmaj reality xxmaj toolkit xxmaj release xxmaj version 
  xxunk xxmaj hot xxmaj fix,xxbos xxxfldtitle graphql integration xxxfldbody xxmaj hi ! xxmaj this project looks really promising to build a modern webapp 😊 i 've been hoping for something like this for a long time ! xxmaj however my first criterion is integrating with a graphql backend , which i even plan to be a hosted backend ( xxmaj graphcool xxxlnkhb * xxup url * xxxlnkhe ) . i suppose it 's not possible right now to use xxmaj xxunk in conjunction with xxmaj graphcool , but could it be considered in the design ? xxmaj the best would be to come up with a standard which can be plugged into any graphql backend ( hosted or own ) . xxmaj thanks so much and all the best,xxbos xxxfldtitle xxmaj grass toolbox filter input is n't labelled xxxfldbody xxxhr xxmaj author xxmaj name : xxunk - ( xxunk - ) xxmaj original xxmaj redmine xxmaj issue : xxunk , * xxup url * xxxlnkhb issues.qgis.org xxxlnkhe 
  xxxhr xxmaj if you open the xxup grass tools and choose the module list tab , there 's a text input field below the list which filters the list of modules . xxmaj however there 's no clue that it 's a filter box ! xxmaj we had some students confused by this on a course recently . 
  xxmaj needs a xxmaj filter : label on that line . xxmaj relevant xxup ui file is xxunk i guess .,xxbos xxxfldtitle xxmaj great program -- problems with kotlin xxxfldbody xxmaj quickly trying out your program - awesome . a few quick problems : decompiling a directory of kotlin ... a crash and some mis - ordered xxunk . crash : 
  xxunk xxxlnkhb github.com xxxlnkhe xxmaj xxunk : 
  xxup xxunk xxxlnkhb github.com xxxlnkhe xxunk xxxlnkhb github.com xxxlnkhe
y: LMLabelList
,,,,
Path: lang_model;

Test: None, model=SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60003, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60003, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
), opt_func=functools.partial(<class 'torch.optim.adam.Adam'>, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[<function accuracy at 0x7fe85dc5fae8>], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('lang_model_onecycle_resume'), model_dir='models', callback_fns=[functools.partial(<class 'fastai.basic_train.Recorder'>, add_time=True, silent=False)], callbacks=[RNNTrainer
learn: LanguageLearner(data=TextLMDataBunch;

Train: LabelList (16385650 items)
x: LMTextList
xxbos xxxfldtitle xxunk throws exception when adding product via xxup api xxxfldbody xxmaj if xxmaj google xxmaj contents xxmaj experiments is enabled , adding a product using the xxxcdb xxup v1 / products / xxxcde endpoint causes xxunk xxxfilepath line 117 to throw xxxcdb invalidargumentexception xxxcde . xxmaj with contents experiments disabled the product add completes successfully . xxmaj example valid request json : 
  xxxcdb " product " : xxxjson 
  xxxcde,xxbos xxxfldtitle xxmaj grafana xxmaj kairosdb xxmaj top n rows xxxfldbody xxmaj when xxmaj grafana pulls the data it shows all the rows returned by the query , so can we please have a query option with something like xxunk ? xxmaj so we can only show limited number of rows on xxmaj grafana unlike now it just gets and shows everything . i have also requested a xxup ui option with xxmaj grafana guys where we can choose the number of rows we want to see under the graph , cheers,xxbos xxxfldtitle xxmaj graph request : mouseover highlight curve . xxxfldbody xxmaj it would be very nice , is mouseover of an item in graph legend ( to the left ) , or mouseover a graphed value ( with checkbox like xxup xxunk ) - would plot that value with double thickness . xxmaj that would make it easy to identify line for each value .,xxbos xxxfldtitle xxmaj graphite template to handle ip - address and hostname xxxfldbody xxmaj are there any plans to improve the graphite template logic ? xxmaj we are sending graphite data that contains xxup ip address and hostname ( xxunk , xxunk , etc ) , since those values have dot 's in them , it makes it difficult to identify tags for those graphite values . xxmaj one other issue we are having is identifying multiple tags in the middle of the graphite string to create a measurement .,xxbos xxxfldtitle xxmaj graphs : xxunk xxxfldbody xxmaj project : xxmaj graphs 
  xxmaj job : xxup uat 
  xxmaj env : xxup uat 
  xxmaj region : fxlabs / xxup us_west_1 
  xxmaj result : fail 
  xxmaj status xxmaj code : 400 
  xxmaj headers : xxunk x - xxup xss - xxmaj protection=[1 ; mode = block ] , xxmaj cache - xxmaj control=[no - cache , no - store , max - age=0 , must - revalidate ] , xxmaj pragma=[no - cache ] , xxmaj expires=[0 ] , x - xxmaj frame - options=[deny ] , xxunk xxmaj transfer - xxmaj encoding=[chunked ] , date=[tue , 02 xxmaj oct 2018 xxunk xxup gmt ] } 
  xxmaj endpoint : * xxup url * xxxlnkhb xxunk xxxlnkhe 
  xxmaj request : 
  xxmaj response : 
  xxxjson 
  xxmaj logs : 
  xxmaj assertion xxxatmention = = 403 ] resolved - to [ 400 = = 403 ] result [ xxmaj failed ] --- xxup fx xxmaj bot ---
y: LMLabelList
,,,,
Path: lang_model;

Valid: LabelList (1862119 items)
x: LMTextList
xxbos xxxfldtitle xxmaj gopi xxmaj episode 98 xxxfldbody xxmaj gopi xxmaj episode 98 xxxhtml * xxup url * xxxlnkhb ift.tt xxxlnkhe xxxhtml via xxmaj juragan xxmaj sinopsis * xxup url * xxxlnkhb ift.tt xxxlnkhe xxxhtml xxmaj december 17 , 2016 at xxup xxunk,xxbos xxxfldtitle xxmaj grabber with default xxunk model xxxfldbody xxxhm xxmaj overview 
  i want to use the default xxunk model ( loaded via dll ) to work with the grabber script . xxmaj the xxunk - demo uses xxunk controller block - shaped prefabs to add the grabber script , but i would like to actually use the default models . xxmaj how would i go about doing so ? 
  xxxhm xxmaj unity xxmaj editor xxmaj version 
  xxunk 
  xxxhm xxmaj mixed xxmaj reality xxmaj toolkit xxmaj release xxmaj version 
  xxunk xxmaj hot xxmaj fix,xxbos xxxfldtitle graphql integration xxxfldbody xxmaj hi ! xxmaj this project looks really promising to build a modern webapp 😊 i 've been hoping for something like this for a long time ! xxmaj however my first criterion is integrating with a graphql backend , which i even plan to be a hosted backend ( xxmaj graphcool xxxlnkhb * xxup url * xxxlnkhe ) . i suppose it 's not possible right now to use xxmaj xxunk in conjunction with xxmaj graphcool , but could it be considered in the design ? xxmaj the best would be to come up with a standard which can be plugged into any graphql backend ( hosted or own ) . xxmaj thanks so much and all the best,xxbos xxxfldtitle xxmaj grass toolbox filter input is n't labelled xxxfldbody xxxhr xxmaj author xxmaj name : xxunk - ( xxunk - ) xxmaj original xxmaj redmine xxmaj issue : xxunk , * xxup url * xxxlnkhb issues.qgis.org xxxlnkhe 
  xxxhr xxmaj if you open the xxup grass tools and choose the module list tab , there 's a text input field below the list which filters the list of modules . xxmaj however there 's no clue that it 's a filter box ! xxmaj we had some students confused by this on a course recently . 
  xxmaj needs a xxmaj filter : label on that line . xxmaj relevant xxup ui file is xxunk i guess .,xxbos xxxfldtitle xxmaj great program -- problems with kotlin xxxfldbody xxmaj quickly trying out your program - awesome . a few quick problems : decompiling a directory of kotlin ... a crash and some mis - ordered xxunk . crash : 
  xxunk xxxlnkhb github.com xxxlnkhe xxmaj xxunk : 
  xxup xxunk xxxlnkhb github.com xxxlnkhe xxunk xxxlnkhb github.com xxxlnkhe
y: LMLabelList
,,,,
Path: lang_model;

Test: None, model=SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60003, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60003, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
), opt_func=functools.partial(<class 'torch.optim.adam.Adam'>, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[<function accuracy at 0x7fe85dc5fae8>], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('lang_model_onecycle_resume'), model_dir='models', callback_fns=[functools.partial(<class 'fastai.basic_train.Recorder'>, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(
  (0): WeightDropout(
    (module): LSTM(400, 1150, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): WeightDropout(
    (module): LSTM(1150, 1150, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): WeightDropout(
    (module): LSTM(1150, 400, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): Embedding(60003, 400, padding_idx=1)
  (1): EmbeddingDropout(
    (emb): Embedding(60003, 400, padding_idx=1)
  )
  (2): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
)], add_time=True, silent=None)
alpha: 2.0
beta: 1.0], layer_groups=[Sequential(
  (0): WeightDropout(
    (module): LSTM(400, 1150, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): WeightDropout(
    (module): LSTM(1150, 1150, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): WeightDropout(
    (module): LSTM(1150, 400, batch_first=True)
  )
  (1): RNNDropout()
), Sequential(
  (0): Embedding(60003, 400, padding_idx=1)
  (1): EmbeddingDropout(
    (emb): Embedding(60003, 400, padding_idx=1)
  )
  (2): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
)], add_time=True, silent=None)

In [1]:
# previous Loss: [3.390915, tensor(0.3917)] for langmodel_onecycle_resume

In [6]:
learn.validate()


Out[6]:
[3.390915, tensor(0.3917)]

In [13]:
learn.model


Out[13]:
SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60003, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60003, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
)

In [12]:
learn = load_learner(path)

Inference


In [213]:
learn.model.reset() # so the hidden states reset between predictions
_ = learn.model.eval() # turn off dropout, etc. only need to do this after loading model.

Notes

Fastai encoder produces a tuple of two lists raw_output and output. see this reference raw_output are the hidden states emitted for each element of the sequence without dropout. Because you are turning off dropout during inference with .eval(), it really doesn't matter which one you get as they will both be the same (if they are not, this is a bug).


In [14]:
ex = processed_issue_texts[0]['text']
print(ex)


xxxfldtitle v1alpha2 implement condition update xxxfldbody we should update the conditions according to the status. \ r \ r / cc xxxatmention 

In [15]:
ex_numericalized_x,  ex_numericalized_y = learn.data.one_item(ex)
ex_numericalized_x


Out[15]:
tensor([[    2,    22, 35652,   454,  1619,   173,    23,    64,    66,   173,
             9,  2127,  1099,    13,     9,   357,    10,    50,   696,    50,
           696,    37,  1075,   118]])

The next two output tensors should be the same, this is testing that the model state is being reset correctly between predictions


In [16]:
encoder = learn.model[0]
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)


tensor([[[-0.0129,  0.0362,  0.0007,  ..., -0.0754, -0.0074,  0.0045],
         [-0.0251,  0.0263,  0.0664,  ..., -0.0272,  0.0092,  0.0330],
         [ 0.0580,  0.0300,  0.0196,  ..., -0.0416,  0.0290,  0.0129],
         ...,
         [-0.0111,  0.0130,  0.0432,  ..., -0.0640,  0.1140,  0.0357],
         [-0.0105, -0.0146,  0.0293,  ..., -0.1969,  0.2049,  0.0006],
         [-0.0057,  0.0225,  0.0220,  ..., -0.1356, -0.0231, -0.0011]]],
       grad_fn=<TransposeBackward0>)
torch.Size([1, 24, 400])

In [17]:
learn.model.reset()
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)


tensor([[[-0.0129,  0.0362,  0.0007,  ..., -0.0754, -0.0074,  0.0045],
         [-0.0251,  0.0263,  0.0664,  ..., -0.0272,  0.0092,  0.0330],
         [ 0.0580,  0.0300,  0.0196,  ..., -0.0416,  0.0290,  0.0129],
         ...,
         [-0.0111,  0.0130,  0.0432,  ..., -0.0640,  0.1140,  0.0357],
         [-0.0105, -0.0146,  0.0293,  ..., -0.1969,  0.2049,  0.0006],
         [-0.0057,  0.0225,  0.0220,  ..., -0.1356, -0.0231, -0.0011]]],
       grad_fn=<TransposeBackward0>)
torch.Size([1, 24, 400])

Get Representations

Numericalized data


In [404]:
from tqdm import tqdm_notebook

In [405]:
# index into [0] b/c we don't care about the y value.
num_x = []

for x in tqdm_notebook(processed_issue_texts, total=len(processed_issue_texts)):
    num_x.extend(learn.data.one_item(x)[0])




In [406]:
reps=[]
for x in tqdm_notebook(num_x, total=len(num_x)):
    encoder.reset()
    reps.extend(encoder.forward(x[None, :])[-1][-1])




In [407]:
from typing import List
class IssueRepresentation:
    
    def __init__(self, tensor:torch.tensor) -> torch.tensor:
        self.tensor=tensor
    
    @property
    def mean(self):
        return torch.mean(self.tensor, 0)
    
    @property
    def max(self):
        return torch.max(self.tensor, 0)[0]
    
    @property
    def last(self):
        return self.tensor[-1,:]
    
    @property
    def concat(self):
        return torch.cat([self.mean, self.max, self.last])

class IssueRepresentation_List:
    def __init__(self, irl=List[torch.tensor]):
        self.irl = [IssueRepresentation(x) for x in irl]
    
    @property
    def mean(self):
        return torch.stack([x.mean for x in self.irl])
    
    @property
    def max(self):
        return torch.stack([x.max for x in self.irl])
    
    @property
    def last(self):
        return torch.stack([x.last for x in self.irl])
    
    @property
    def concat(self):
        return torch.stack([x.concat for x in self.irl])

In [408]:
irl = IssueRepresentation_List(reps)

In [409]:
import pickle as pkl

with open('irl.pkl', 'wb') as f:
    pkl.dump(irl, f)

See if Naive One Shot Learning Works


In [410]:
from IPython.display import display, Markdown, HTML

In [411]:
import pickle as pkl

with open('irl.pkl', 'rb') as f:
    irl = pkl.load(f)

In [412]:
## == True converts it into a 0/1 indices array
candidates_to_label = torch.tensor((kfdf.labels == '[]').values) == True

print(f'{candidates_to_label.sum()} issues w/o labels out of {len(kfdf)} total issues.')


542 issues w/o labels out of 1384 total issues.

In [413]:
no_label_reps = irl.concat[candidates_to_label]
label_reps = irl.concat[~candidates_to_label]

assert (no_label_reps.shape[0] + label_reps.shape[0]) == len(kfdf)

In [414]:
label_mask = kfdf.labels != '[]'

labeled_df = kfdf[label_mask].reset_index(drop=True)
no_label_df = kfdf[~label_mask].reset_index(drop=True)

assert len(labeled_df) + len(no_label_df) == len(kfdf)

In [415]:
class oneshotlabeler:
    def __init__(self, vecs, refdf):
        assert vecs.shape[0] == len(refdf)
        self.vecs = vecs
        self.refdf = refdf.reset_index(drop=True)
        self.cs = CosineSimilarity()
    
    def query(self, vec):
        assert vec.ndim == 1
        sims = cs.forward(vec.unsqueeze(0), self.vecs)
        idxs = sims.argsort(descending=True)
        ranked_sims = sims[idxs]
        
        closest_idx = idxs[0].item()
        ref_issue = self.refdf.iloc[closest_idx]
        
        msg = []
        msg.append(f'\n## Prediction:\n')
        msg.append(f'**Predicted Labels**: {json.loads(ref_issue.labels)}\n')
        msg.append(f'**Cosine similarity (0-1)**: {ranked_sims[0]:.2f}\n')
        msg.append(f'**Closest Issue URL**: {json.loads(ref_issue.url)}\n')
        msg.append(f'**Closest Issue Title**: {ref_issue.title}\n')
        msg.append(f'**Closest Issue Body**:\n {ref_issue.body[:600]}')
        display(Markdown('\n'.join(msg)))
        
    def random_prediction(self, no_label_df, no_label_vec):
        assert len(no_label_df) == no_label_vec.shape[0]
        sample = no_label_df.sample(1)
        idx = sample.index.values[0]
        
        msg = []
        msg.append(f'\n## Un-Labeled Target Issue To Predict:\n')
        msg.append(f'**Title:** {sample.title.values[0]}\n')
        msg.append(f'**Body:**\n {sample.body.values[0][:600]}\n')
        msg.append(f'**URL:** {sample.url.values[0]}')
        display(Markdown('\n'.join(msg)))
        
        self.query(no_label_vec[idx, :])

In [416]:
assert len(no_label_df) == no_label_reps.shape[0]

In [417]:
ol = oneshotlabeler(vecs=label_reps, 
                    refdf = labeled_df)

In [449]:
ol.random_prediction(no_label_df=no_label_df,
                     no_label_vec=no_label_reps)


Un-Labeled Target Issue To Predict:

Title: \ kfctl apply k8s\ fails to deploy scheduledworkflows on mac

Body: hi, i'm trying to install kubeflow on \ docker for mac with k8s\ with these instructions from the documentation https://www.kubeflow.org/docs/started/getting-started-k8s/ .\r \r environment\r \r kubectl version\r client version: version.info{major:\ 1\ , minor:\ 10\ , gitversion:\ v1.10.11\ , gitcommit:\ 637c7e288581ee40ab4ca210618a89a555b6e7e9\ , gittreestate:\ clean\ , builddate:\ 2018-11-26t14:38:32z\ , goversion:\ go1.9.3\ , compiler:\ gc\ , platform:\ darwin/amd64\ }\r server version: version.info{major:\ 1\ , minor:\ 10\ , gitversion:\ v1.10.11\ , gitcommit:\ 637c7e288581ee40ab

URL: "https://github.com/kubeflow/kubeflow/issues/3130"

Prediction:

Predicted Labels: ['area/tfjob', 'kind/bug', 'priority/p1']

Cosine similarity (0-1): 0.95

Closest Issue URL: https://github.com/kubeflow/kubeflow/issues/2634

Closest Issue Title: kfctl apply failed for invalid spec.version when installing crd tfjobs.kubeflow.org

Closest Issue Body: i'm initializing a kubeflow testing environment following offical getting started guide https://www.kubeflow.org/docs/started/getting-started/. here are the commands just copy them from the webpage :\r \r \r export kubeflow_src=$ pwd /kubeflow\r \r mkdir ${kubeflow_src}\r cd ${kubeflow_src}\r export kubeflow_tag=v0.4.1\r \r curl https://raw.githubusercontent.com/kubeflow/kubeflow/${kubeflow_tag}/scripts/download.sh | bash\r \r export kfapp=kfapp\r ${kubeflow_src}/scripts/kfctl.sh init ${kfapp} --platform none\r cd ${kfapp}\r ${kubeflow_src}/scripts/kfctl.sh generate k8s\r ${kubeflow_src}


In [ ]:


In [ ]:


In [ ]:

Notes

  • Some labels have a fairly high N. Do we really need few shot for these?
  • Do you really want to maintain local models for each repo? If you do should be a seperate service with API endpoint to keep dependencies clean.
  • First lets see if few shot can even work?
  • Looks like we might be able to get pretty far on keyword matching and BPE

Supervised Learning On Kubeflow/* (the whole org)

1. Get Data That has at least one label that occurs > 20 times.


In [419]:
from collections import Counter, Set
label_counter = Counter()

df['labels_unpacked'] = df.labels.apply(lambda x: unpack_list(x))

for labels in df.labels_unpacked:
    label_counter.update(labels)
    
labels_to_keep = {x:label_counter[x] for x in label_counter if label_counter[x] >= 20}

See the labels that occur > 20 times.


In [420]:
display(labels_to_keep)
print(f' Number of labels: {len(labels_to_keep)}')

label_set = set(labels_to_keep.keys())


{'api/v1alpha2': 78,
 'area/operator': 51,
 'kind/enhancement': 34,
 'priority/p0': 167,
 'area/front-end': 136,
 'priority/p1': 902,
 'release/0.2.0': 67,
 'release/0.3.0': 195,
 'sprint/2018-06-11-to-06-22': 28,
 'area/0.4.0': 130,
 'area/docs': 117,
 'area/testing': 138,
 'help wanted': 122,
 'addition/feature': 42,
 'kind/bug': 130,
 'problems/bug': 28,
 'area/bootstrap': 83,
 'platform/gcp': 138,
 'testing': 90,
 'area/0.3.0': 28,
 'good first issue': 71,
 'improvement/enhancement': 40,
 'community/discussion': 30,
 'priority/p3': 40,
 'area/api': 28,
 'area/jupyter': 145,
 'sprint/2018-07-09-to-07-20': 20,
 'area/back-end': 27,
 'area/tfjob': 47,
 'priority/p2': 284,
 'area/example/code_search': 45,
 'kind/feature': 44,
 'area/0.5.0': 40,
 'community/question': 150,
 'inference': 27,
 'area/kfctl': 116,
 'area/build-release': 61,
 'area/inference': 60,
 'area/ksonnet': 24,
 'cuj/multi-user': 30,
 'cuj/build-train-deploy': 31}
 Number of labels: 41

In [456]:
h_labeled_df = df[df.labels_unpacked.apply(lambda x: len(set(x).intersection(label_set)) > 0)]
h_labeled_df.shape


Out[456]:
(1687, 7)

Only retain labels that occur at least 20 times


In [477]:
h_labeled_df['final_labels'] = h_labeled_df.labels_unpacked.apply(lambda x: set(x).intersection(label_set))


/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

2. Extract One Hot Encoded Labels


In [774]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [775]:
ohe_labels = mlb.fit_transform(h_labeled_df.final_labels.values.tolist())
display(ohe_labels)
print(ohe_labels.shape)


array([[0, 1, 0, 0, ..., 0, 0, 0, 0],
       [0, 0, 0, 0, ..., 1, 1, 0, 0],
       [0, 1, 0, 1, ..., 0, 0, 0, 0],
       [1, 1, 0, 0, ..., 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, ..., 0, 0, 0, 0],
       [0, 0, 0, 0, ..., 0, 0, 0, 0],
       [1, 1, 0, 1, ..., 0, 0, 0, 0],
       [0, 1, 0, 0, ..., 0, 0, 0, 0]])
(1687, 41)

In [776]:
mlb.classes_


Out[776]:
array(['addition/feature', 'api/v1alpha2', 'area/0.3.0', 'area/0.4.0', 'area/0.5.0', 'area/api', 'area/back-end',
       'area/bootstrap', 'area/build-release', 'area/docs', 'area/example/code_search', 'area/front-end',
       'area/inference', 'area/jupyter', 'area/kfctl', 'area/ksonnet', 'area/operator', 'area/testing', 'area/tfjob',
       'community/discussion', 'community/question', 'cuj/build-train-deploy', 'cuj/multi-user', 'good first issue',
       'help wanted', 'improvement/enhancement', 'inference', 'kind/bug', 'kind/enhancement', 'kind/feature',
       'platform/gcp', 'priority/p0', 'priority/p1', 'priority/p2', 'priority/p3', 'problems/bug', 'release/0.2.0',
       'release/0.3.0', 'sprint/2018-06-11-to-06-22', 'sprint/2018-07-09-to-07-20', 'testing'], dtype=object)

3. Extract Latent Features For Each Issue


In [401]:
cleaned_text = [process_dict(x, 0) for x in labeled_df.to_dict(orient='rows')]
vecs = []
encoder = learn.model[0]
encoder.eval()

for x in tqdm_notebook(cleaned_text, total=len(cleaned_text)):
    # transform the data to integers
    x = learn.data.one_item(x)[0]
    # forward to pass through model
    encoder.reset()
    vecs.extend(encoder.forward(x)[-1][-1])




In [451]:
vec_list = IssueRepresentation_List(vecs)

In [459]:
latent_features = vec_list.concat
display(latent_features)
print(latent_features.shape)


tensor([[ 0.0467,  0.0119,  0.0663,  ..., -0.0782, -0.0304, -0.0003],
        [ 0.0970,  0.0110, -0.0078,  ..., -0.0838, -0.0189,  0.0131],
        [ 0.0591,  0.0070,  0.0389,  ..., -0.0966, -0.0313,  0.0013],
        ...,
        [ 0.0703,  0.0162, -0.0188,  ..., -0.0941, -0.0271, -0.0014],
        [ 0.0736, -0.0151,  0.0031,  ..., -0.0619, -0.0281, -0.0016],
        [ 0.0621, -0.0010,  0.0237,  ..., -0.1155, -0.0385, -0.0002]],
       grad_fn=<StackBackward>)
torch.Size([1687, 1200])

4. Extract Repo Indicators (Additional Features)


In [509]:
print(f'There are {h_labeled_df.repo.nunique()} repos in the dataset with labels')


There are 19 repos in the dataset with labels

In [773]:
mlb_repos = MultiLabelBinarizer()

repo_indicators = mlb_repos.fit_transform([[x] for x in h_labeled_df.repo.values.tolist()])
display(repo_indicators)
repo_indicators.shape


array([[0, 0, 0, 0, ..., 0, 0, 1, 0],
       [0, 0, 0, 0, ..., 0, 0, 0, 0],
       [0, 0, 0, 0, ..., 0, 0, 1, 0],
       [0, 0, 0, 0, ..., 0, 0, 1, 0],
       ...,
       [0, 0, 0, 0, ..., 0, 0, 0, 0],
       [0, 0, 0, 0, ..., 0, 0, 1, 0],
       [0, 0, 0, 0, ..., 0, 0, 1, 0],
       [0, 0, 0, 0, ..., 0, 0, 1, 0]])
Out[773]:
(1687, 19)

5. Combine Feautre Vectors & Train Model 5-Fold CV, Keep Out-of-Fold preds

Concat Repo Indicators and Latent Features


In [504]:
import numpy as np

feature_arr = np.concatenate([repo_indicators, latent_features], axis = 1)
feature_arr.shape


Out[504]:
(1687, 1219)

In [738]:
! pip install -U scikit-learn


Collecting scikit-learn
  Downloading https://files.pythonhosted.org/packages/90/c7/401c231c445fb6fad135e92197da9c3e77983de169ff1887cc18af94498d/scikit_learn-0.21.1-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
     |████████████████████████████████| 6.7MB 19.5MB/s eta 0:00:01
Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.2.1)
Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.16.3)
Requirement already satisfied, skipping upgrade: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (0.13.2)
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.21.0
    Uninstalling scikit-learn-0.21.0:
      Successfully uninstalled scikit-learn-0.21.0
Successfully installed scikit-learn-0.21.1

In [744]:
import sklearn.neural_network.MLPClassifier
sklearn.__version__


Out[744]:
'0.21.0'

In [746]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neural_network import MLPClassifier

etc = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=2, bootstrap=False, n_jobs=-1)
knn = KNeighborsClassifier(n_neighbors=2, weights='distance', metric='cosine', n_jobs=-1)
rc = RidgeClassifierCV(alphas=[.1, .5, 5, 10, 50, 100], normalize=True, store_cv_values=True)

In [845]:
mlp = MLPClassifier(alpha=.01, 
                    hidden_layer_sizes=(500,),
                    learning_rate='adaptive', 
                    learning_rate_init=.1, 
                    early_stopping=True, 
                    validation_fraction=.25)

In [846]:
clf = mlp

In [847]:
pred_index = 1

In [848]:
clf.fit(X=np.delete(feature_arr, pred_index, axis=0),
        y=np.delete(ohe_labels, pred_index, axis=0))


Out[848]:
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(500,), learning_rate='adaptive',
              learning_rate_init=0.1, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.25, verbose=False, warm_start=False)

In [849]:
preds = clf.predict_proba(feature_arr[None, pred_index])
ground_truth = ohe_labels[pred_index, :] == 1

for g,p,c in zip(ground_truth.tolist(), preds[0, :].tolist(), mlb.classes_.tolist()):
    if g:
        print('***', p,' ', c)
    else:
        print(p,' ', c)


8.800422804439018e-05   addition/feature
0.004316295039701864   api/v1alpha2
5.649512418352858e-05   area/0.3.0
0.08840352694439761   area/0.4.0
9.026766555145957e-05   area/0.5.0
2.8259444307727876e-05   area/api
0.00434346014321032   area/back-end
0.024164074913004662   area/bootstrap
0.06470222263188   area/build-release
0.0937456735331016   area/docs
0.002476910030106788   area/example/code_search
*** 0.07005119531342831   area/front-end
0.0400206659764553   area/inference
0.08157587854703642   area/jupyter
0.0609930105802697   area/kfctl
0.015836785437453937   area/ksonnet
0.02912750816976115   area/operator
0.10465119971930756   area/testing
0.002670841714019011   area/tfjob
0.006456681244973385   community/discussion
0.007324003218792524   community/question
0.022002275579410747   cuj/build-train-deploy
0.01269626922120256   cuj/multi-user
0.05254902484630349   good first issue
0.0695335347299035   help wanted
0.00016611675694153008   improvement/enhancement
0.0017730967490715611   inference
0.0776168961705274   kind/bug
0.0069148434294968265   kind/enhancement
0.0019111186845026853   kind/feature
0.0006896978848769677   platform/gcp
0.10813692048246774   priority/p0
*** 0.6955454644727967   priority/p1
0.15374017261981285   priority/p2
0.02934454331439272   priority/p3
0.0012741408243957598   problems/bug
*** 0.005526174342339448   release/0.2.0
*** 0.04561568608796274   release/0.3.0
*** 0.005336582094342152   sprint/2018-06-11-to-06-22
0.0009323752051200633   sprint/2018-07-09-to-07-20
0.09014346870401345   testing

In [850]:
clf.predict_proba(feature_arr[None, pred_index]).shape


Out[850]:
(1, 41)

In [872]:
drdf = pd.DataFrame(feature_arr)
drdf['target'] = ohe_labels[:, 0]
drdf.columns = ['f_'+ str(x) for x in drdf.columns.tolist()]

In [873]:
drdf.to_csv('drdf_test.csv')

In [865]:
feature_arr.shape


Out[865]:
(1687, 1219)

In [875]:
ohe_labels[:, 0].sum()


Out[875]:
38

In [884]:
raw_textdrdf = pd.DataFrame({'text': [x['text'] for x in cleaned_text], 'target': ohe_labels[:, 0]})

In [885]:
raw_textdrdf.head()


Out[885]:
text target
0 xxxfldtitle v1alpha2 implement condition update xxxfldbody we should update the conditions according to the status. \ r \ r / cc xxxatmention 0
1 xxxfldtitle tfjobs ui doesn't work behind iap; react app needs support iap? xxxfldbody tfjobs ui is deployed on dev.kubeflow.org. \ r \ r the ui shows up behind iap but its doesn't work \ r - no tfjobs are listed \ r - creating a job via the ui doesn't work. \ r \ r looking at the developer console we see requests to \ r \ r \ r *URL* xxxlnkhb accounts.google.com xxxlnkhe \ r \ r which suggests to me the request is hitting the loadbalancer and being directed to do auth verification to sign i... 0
2 xxxfldtitle docs add instructions about how to contribute e2e test cases xxxfldbody ref *URL* xxxlnkhb github.com xxxlnkhe \ r i think we need to have a doc about how to write e2e test cases for operators, which will lower the barriers of participation. in the best case, the doc could be also helpful for pytorch and mxnet operators. \ r \ r / cc xxxatmention 0
3 xxxfldtitle v1alpha2 error when host name is not svc.cluster.local xxxfldbody there are some k8s clusters which have their own domains, they may not use svc.cluster.local. then the service is configured to it, thus it won't work. 1
4 xxxfldtitle gcp cluster-kubeflow.yaml isn't tested xxxfldbody this is the recommended dm and bootstrapper config for gke deployments. \ r *URL* xxxlnkhb github.com xxxlnkhe \ r it doesn't like that yaml file is used by our e2e tests because it wasn't updated to specify the registry when that change was made to \ r *URL* xxxlnkhb github.com xxxlnkhe \ r there is also another gcp bootstrapper config in that directory \ r \ r our e2e tests are using this dm config \ r *URL* xxxlnkhb github.com ... 0

In [886]:
raw_textdrdf.to_csv('raw_textdrdf.csv', index=False)

In [887]:
tempdf = pd.DataFrame(feature_arr)
tempdf.columns = ['f_'+ str(x) for x in tempdf.columns.tolist()]

In [892]:
drdf_concat  = pd.concat([raw_textdf, tempdf, pd.DataFrame({'target': ohe_labels[:, 0]})], axis=1)

In [893]:
drdf_concat.to_csv('drdf_concat.csv')

In [895]:
drdf_concat.columns


Out[895]:
Index(['text', 'f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8',
       ...
       'f_1210', 'f_1211', 'f_1212', 'f_1213', 'f_1214', 'f_1215', 'f_1216', 'f_1217', 'f_1218', 'target'], dtype='object', length=1221)

In [17]:
mlb.classes_


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-c3abe1761bb7> in <module>
----> 1 mlb.classes_

NameError: name 'mlb' is not defined

In [ ]: