Define the pipeline that computes issue embeddings and uploads them to GCS


In [1]:
# fairing:include-cell
import sys
sys.path.append("/py")
from label_microservice.repo_config import RepoConfig
from code_intelligence.embeddings import pass_through
from code_intelligence.embeddings import load_model_artifact
from code_intelligence.embeddings import get_all_issue_text
import dill as dpickle
import os
import yaml
import logging
from google.cloud import storage

In [2]:
# fairing:include-cell
class IssuesLoader(object):
    """Build the issue embeddings for one repository and publish them to GCS.

    Repo-specific settings (owner, name, bucket, local file path and GCS
    object path) are read from ``RepoConfig``. The main entry point is
    ``save_issue_embeddings``.
    """

    # Language model artifact used when ``load_lang_model`` is called without
    # an explicit URL.
    DEFAULT_MODEL_URL = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'

    def __init__(self, owner=None, repo=None):
        """Load the configuration for ``owner``/``repo``.

        Args:
            owner: repo owner, forwarded unchanged to ``RepoConfig``.
            repo: repo name, forwarded unchanged to ``RepoConfig``.
        """
        self.load_yaml(owner, repo)

    def load_yaml(self, owner, repo):
        """Copy the settings this loader needs from ``RepoConfig`` onto self.

        NOTE(review): the method name suggests a YAML-backed config; how
        ``RepoConfig`` obtains its values (and what ``None`` arguments
        resolve to) is not visible from here.
        """
        config = RepoConfig(owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name

        self.bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path

    def load_lang_model(self, model_url=None):
        """Load the language model artifact.

        Args:
            model_url: location of the model artifact; when omitted (or
                empty) ``DEFAULT_MODEL_URL`` is used.

        Returns:
            Whatever ``load_model_artifact`` returns; it is handed to
            ``get_all_issue_text`` as ``inf_wrapper``.
        """
        return load_model_artifact(model_url or self.DEFAULT_MODEL_URL)

    def save_issue_embeddings(self):
        """Compute the repo's issue embeddings and upload them, if missing.

        Does nothing when the destination object already exists in GCS.
        Otherwise the result of ``get_all_issue_text`` is pickled (with dill)
        to ``self.embeddings_file`` and that file is uploaded.
        """
        # The embedding pass is slow, so skip it when the result is already
        # stored in GCS.
        if self.check_embeddings_in_gcs():
            return

        inference_wrapper = self.load_lang_model()
        data = get_all_issue_text(owner=self.repo_owner, repo=self.repo_name,
                                  inf_wrapper=inference_wrapper)
        with open(self.embeddings_file, 'wb') as f:
            dpickle.dump(data, f)

        self.upload_embeddings_to_gcs()

    def _get_bucket(self):
        """Return a ``(client, bucket)`` pair for ``self.bucket_name``."""
        storage_client = storage.Client()
        return storage_client, storage_client.get_bucket(self.bucket_name)

    def check_embeddings_in_gcs(self):
        """Return True when ``self.embeddings_dest`` already exists in the bucket."""
        storage_client, bucket = self._get_bucket()
        return bucket.blob(self.embeddings_dest).exists(storage_client)

    def upload_embeddings_to_gcs(self):
        """Upload the local embeddings file to ``self.embeddings_dest``."""
        _, bucket = self._get_bucket()
        bucket.blob(self.embeddings_dest).upload_from_filename(self.embeddings_file)

Run locally to test the code


In [3]:
# Build a loader with the default owner/repo (both None); the actual repo
# settings come from RepoConfig — presumably a default repo, confirm there.
ldr = IssuesLoader()

In [4]:
# Returns immediately if the embeddings object already exists in GCS; otherwise
# embeds the repo's issues, pickles them locally and uploads the file.
# The run recorded below took roughly ten minutes.
ldr.save_issue_embeddings()


100%|██████████| 209/209 [09:43<00:00,  2.33s/it]

Create the entry point using Fairing


In [5]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [6]:
# Convert this notebook into a Python module that exposes the IssuesLoader
# class as its entry point.
preprocessor = ConvertNotebookPreprocessorWithFire('IssuesLoader')

# Helper modules to package next to the converted notebook. The assignment
# replaces whatever input_files held before, so no prior initialisation of the
# attribute is needed.
input_files = ['embeddings.py', 'inference.py', 'repo_config.py']
preprocessor.input_files = {os.path.normpath(f) for f in input_files}
preprocessor.preprocess()


Out[6]:
[PosixPath('issues_loader.py'),
 'inference.py',
 'repo_config.py',
 'embeddings.py']

In [ ]: