In [1]:
# fairing:include-cell
import sys
sys.path.append("/py")
from label_microservice.repo_config import RepoConfig
from code_intelligence.embeddings import pass_through
from code_intelligence.embeddings import load_model_artifact
from code_intelligence.embeddings import get_all_issue_text
import dill as dpickle
import os
import yaml
import logging
from google.cloud import storage
In [2]:
# fairing:include-cell
class IssuesLoader(object):
    """Build issue embeddings for one repository and store them in GCS.

    The repository settings (owner, name, bucket, local path and GCS object
    name) are read from a ``RepoConfig`` built from ``owner`` and ``repo``.

    Args:
        owner: Repository owner forwarded to ``RepoConfig``; defaults to None.
        repo: Repository name forwarded to ``RepoConfig``; defaults to None.
    """

    # Default location of the language model artifact used to embed issues.
    # Override on a subclass/instance, or pass ``model_url`` to
    # ``load_lang_model``, to use a different model.
    LANG_MODEL_URL = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'

    def __init__(self, owner=None, repo=None):
        self.load_yaml(owner, repo)

    def load_yaml(self, owner, repo):
        """Copy the settings exposed by ``RepoConfig(owner, repo)`` onto self.

        NOTE(review): the method name suggests the config is YAML-backed;
        that is decided inside ``RepoConfig`` and is not visible here.
        """
        config = RepoConfig(owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name
        self.bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path

    def load_lang_model(self, model_url=None):
        """Load the language model artifact via ``load_model_artifact``.

        Args:
            model_url: Optional URL of the model artifact. When omitted,
                ``self.LANG_MODEL_URL`` is used, which matches the previously
                hard-coded location.

        Returns:
            Whatever ``load_model_artifact`` returns for the URL (used by
            ``save_issue_embeddings`` as the inference wrapper).
        """
        if model_url is None:
            model_url = self.LANG_MODEL_URL
        return load_model_artifact(model_url)

    def save_issue_embeddings(self):
        """Compute the repository's issue embeddings and upload them to GCS.

        Does nothing when the destination object already exists in the
        bucket. Otherwise the data returned by ``get_all_issue_text`` is
        pickled with dill to ``self.embeddings_file`` and then uploaded.
        """
        # Skip the expensive model load / issue fetch if the result is already stored.
        if self.check_embeddings_in_gcs():
            return

        inference_wrapper = self.load_lang_model()
        data = get_all_issue_text(owner=self.repo_owner,
                                  repo=self.repo_name,
                                  inf_wrapper=inference_wrapper)

        with open(self.embeddings_file, 'wb') as f:
            dpickle.dump(data, f)

        self.upload_embeddings_to_gcs()

    def _get_bucket(self):
        """Return the bucket named ``self.bucket_name`` from a new storage client."""
        return storage.Client().get_bucket(self.bucket_name)

    def check_embeddings_in_gcs(self):
        """Return True if ``self.embeddings_dest`` already exists in the bucket."""
        # exists() without an explicit client falls back to the bucket's own
        # client, i.e. the one created in _get_bucket().
        return self._get_bucket().blob(self.embeddings_dest).exists()

    def upload_embeddings_to_gcs(self):
        """Upload the local embeddings file to ``self.embeddings_dest`` in the bucket."""
        blob = self._get_bucket().blob(self.embeddings_dest)
        blob.upload_from_filename(self.embeddings_file)
In [3]:
# Build a loader without an explicit owner/repo; RepoConfig receives None for both.
ldr = IssuesLoader()
In [4]:
# Compute and upload the embeddings; this is a no-op when the GCS object already exists.
ldr.save_issue_embeddings()
In [5]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire
In [6]:
# Preprocessor for this notebook, pointed at the `IssuesLoader` class.
# NOTE(review): presumably this exposes IssuesLoader as a Fire CLI in the
# converted module — confirm against the fairing documentation.
preprocessor = ConvertNotebookPreprocessorWithFire('IssuesLoader')

# Ensure the attribute holds a set when it is empty/unset.
# NOTE(review): the assignment further down replaces input_files
# unconditionally, so this guard looks redundant — confirm before removing.
if not preprocessor.input_files:
    preprocessor.input_files = set()

# Extra source files handed to the preprocessor, stored as normalized paths.
input_files = ['embeddings.py', 'inference.py', 'repo_config.py']
preprocessor.input_files = set(map(os.path.normpath, input_files))

preprocessor.preprocess()
Out[6]:
In [ ]: