issues_loader.ipynb is a very similar notebook
Resource specs
If the kernel dies while computing embeddings, it is most likely because you ran out of memory; one mitigation is to compute the embeddings in chunks (see the sketch after the embedding cell below).
Tesla V100 GPU, 32 vCPUs, 244 GB of memory
In [1]:
import logging
import os
from pathlib import Path
import sys
logging.basicConfig(format='%(message)s')
logging.getLogger().setLevel(logging.INFO)
home = str(Path.home())
# Installing the python packages locally doesn't appear to have them automatically
# added to the path, so we need to manually add the directory
local_py_path = os.path.join(home, ".local/lib/python3.6/site-packages")
for p in [local_py_path, os.path.abspath("../../py")]:
    if p not in sys.path:
        logging.info("Adding %s to python path", p)
        # Insert at front because we want to override any installed packages
        sys.path.insert(0, p)
In [2]:
!pip3 install --user --upgrade -r ../requirements.txt
In [3]:
from bs4 import BeautifulSoup
import requests
from fastai.core import parallel, partial
from collections import Counter
from tqdm import tqdm_notebook
import torch
from code_intelligence import embeddings
from code_intelligence import graphql
from code_intelligence import gcs_util
from google.cloud import storage
In [194]:
if not os.getenv("GITHUB_TOKEN"):
    logging.warning("No GitHub token set; defaulting to a hardcoded list of Kubeflow repositories")
    # The list of repos can be updated using the else block
    repo_names = ['arena', 'batch-predict', 'caffe2-operator', 'chainer-operator', 'code-intelligence', 'common', 'community', 'crd-validation', 'example-seldon', 'examples', 'fairing', 'features', 'frontend', 'homebrew-cask', 'homebrew-core', 'internal-acls', 'katib', 'kfctl', 'kfp-tekton', 'kfserving', 'kubebench', 'kubeflow', 'manifests', 'marketing-materials', 'metadata', 'mpi-operator', 'mxnet-operator', 'pipelines', 'pytorch-operator', 'reporting', 'testing', 'tf-operator', 'triage-issues', 'website', 'xgboost-operator']
else:
    gh_client = graphql.GraphQLClient()
    # Note: first:100 caps the results at 100 repositories; see the pagination sketch after this cell
    repo_query = """query repoQuery($org: String!) {
      organization(login: $org) {
        repositories(first:100) {
          totalCount
          edges {
            node {
              name
            }
          }
        }
      }
    }
    """
    variables = {
        "org": "kubeflow",
    }
    results = gh_client.run_query(repo_query, variables)
    repo_nodes = graphql.unpack_and_split_nodes(results, ["data", "organization", "repositories", "edges"])
    repo_names = [n["name"] for n in repo_nodes]

names_str = ", ".join([f"'{n}'" for n in sorted(repo_names)])
print(f"[{names_str}]")
In [5]:
import pandas as pd
from inference import InferenceWrapper
In [6]:
from pathlib import Path
from urllib import request as request_url
def pass_through(x):
    return x
model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
inference_wrapper = embeddings.load_model_artifact(model_url)
In [7]:
from pandas.io import gbq
import subprocess
# TODO(jlewi): Get the project using fairing?
PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()
In [166]:
# TODO(jlewi): This code should now be a function in embeddings/github_bigquery.py
query = """SELECT
JSON_EXTRACT(payload, '$.issue.html_url') as html_url,
JSON_EXTRACT(payload, '$.issue.title') as title,
JSON_EXTRACT(payload, '$.issue.body') as body,
JSON_EXTRACT(payload, "$.issue.labels") as labels,
JSON_EXTRACT(payload, "$.issue.updated_at") as updated_at,
org.login,
  type
FROM `githubarchive.month.20*`
WHERE (type="IssuesEvent" or type="IssueCommentEvent") and org.login = 'kubeflow'"""
issues_and_pulls = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)
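Note that JSON_EXTRACT returns JSON-encoded values, so the string columns above keep their surrounding double quotes; that is why the spot check below wraps the URL in quotes and why the columns are run through json.loads further down. If only the bare strings are needed, BigQuery's JSON_EXTRACT_SCALAR returns them unquoted, e.g. JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as html_url.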
In [167]:
import re
# Filter out pull requests; issue URLs end in /issues/<number> rather than /pull/<number>
pattern = re.compile(r".*issues/[\d]+")
issues_index = issues_and_pulls["html_url"].apply(lambda x: pattern.match(x) is not None)
issues = issues_and_pulls[issues_index]
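A quick optional sanity check on the split, using only names defined above:
n_issues = int(issues_index.sum())
print(f"Kept {n_issues} issue events; dropped {len(issues_and_pulls) - n_issues} pull request events")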
In [168]:
# Keep only the most recent event for each issue
latest_issues = issues.groupby("html_url", as_index=False).apply(lambda x: x.sort_values(["updated_at"]).iloc[-1])
In [169]:
# Example of fetching a specific issue
# This allows easy spot checking of the data
some_issue = "https://github.com/kubeflow/kubeflow/issues/4916"
test_issue = latest_issues.loc[latest_issues["html_url"]==f'"{some_issue}"']
test_issue
Out[169]:
In [170]:
import json
def get_labels(x):
    # Parse the JSON-encoded labels column into a list of label names
    d = json.loads(x)
    return [i["name"] for i in d]
latest_issues["parsed_labels"] = latest_issues["labels"].apply(get_labels)
In [171]:
# Decode the JSON-encoded string columns into plain strings
for f in ["html_url", "title", "body"]:
    latest_issues[f] = latest_issues[f].apply(json.loads)
In [230]:
input_data = latest_issues[["title", "body"]]
In [231]:
issue_embeddings = inference_wrapper.df_to_embedding(input_data)
In [232]:
issue_embeddings.shape
Out[232]:
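If the kernel dies on the cell above, a chunked variant keeps peak memory bounded. A minimal sketch; embed_in_chunks and the chunk size of 512 are hypothetical, and it assumes df_to_embedding returns a NumPy array for any row slice:
import numpy as np

def embed_in_chunks(wrapper, df, chunk_size=512):
    # Embed one slice at a time so peak GPU/host memory stays bounded
    pieces = []
    for start in range(0, len(df), chunk_size):
        pieces.append(wrapper.df_to_embedding(df.iloc[start:start + chunk_size]))
    return np.concatenate(pieces, axis=0)

# issue_embeddings = embed_in_chunks(inference_wrapper, input_data)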
This calls the /text endpoint on the embeddings microservice.
TODO(https://github.com/kubeflow/code-intelligence/issues/126): the label bot microservice needs to be updated to actually use the GraphQL API to match this code. Hopefully, in the interim, the model is robust to the slight deviations caused by differences in whitespace; the cosine-similarity sketch after the cells below is one way to check.
In [233]:
from code_intelligence import util as code_intelligence_util
In [234]:
issue_index = 1020
logging.info(f"Fetching issue {latest_issues.iloc[issue_index]['html_url']}")
issue_owner, issue_repo, issue_num = code_intelligence_util.parse_issue_url(latest_issues.iloc[issue_index]["html_url"].strip("\""))
In [235]:
some_issue_data = embeddings.get_issue(latest_issues.iloc[issue_index]["html_url"], gh_client)
In [224]:
some_issue_data
Out[224]:
In [236]:
print(latest_issues.iloc[issue_index]["title"])
print(some_issue_data["title"])
print(latest_issues.iloc[issue_index]["body"])
print(some_issue_data["body"])
some_issue_data["title"] == latest_issues.iloc[issue_index]["title"]
some_issue_data["body"] == latest_issues.iloc[issue_index]["body"]
Out[236]:
In [237]:
dict_for_embeddings = inference_wrapper.process_dict(some_issue_data)
In [238]:
inference_wrapper.get_pooled_features(dict_for_embeddings['text']).detach().cpu().numpy()
Out[238]:
In [239]:
issue_embeddings[issue_index,:]
Out[239]:
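To put a number on "robust to slight deviations", one option is the cosine similarity between the two vectors above. A minimal sketch, assuming get_pooled_features returns a single pooled vector (possibly with a leading batch dimension, hence the squeeze) and that issue_embeddings is a NumPy array as the indexing above suggests:
import numpy as np

api_vec = inference_wrapper.get_pooled_features(dict_for_embeddings['text']).detach().cpu().numpy().squeeze()
bq_vec = issue_embeddings[issue_index, :]
cos_sim = float(np.dot(api_vec, bq_vec) / (np.linalg.norm(api_vec) * np.linalg.norm(bq_vec)))
print(f"cosine similarity = {cos_sim:.4f}")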
In [263]:
import h5py
import datetime
now = code_intelligence_util.now().isoformat()
In [268]:
git_tag = subprocess.check_output(["git", "describe", "--tags", "--always", "--dirty"]).decode().strip()
file_name = f"kubeflow_issue_embeddings_{now}.hdf5"
local_file = os.path.join(home, file_name)
In [269]:
latest_issues.to_hdf(local_file, "issues", mode="a")
In [270]:
h5_file = h5py.File(local_file, mode="a")
In [271]:
h5_file.create_dataset("issue_embeddings", data=issue_embeddings)
Out[271]:
In [272]:
# store some metadata
h5_file.attrs["file"] = "Get-GitHub-Issues.ipynb"
h5_file.attrs["git-tag"] = git_tag
In [273]:
h5_file.close()
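Before uploading, it can be worth verifying that the file round-trips; a quick check using the key names from the cells above:
reloaded_issues = pd.read_hdf(local_file, "issues")
with h5py.File(local_file, mode="r") as f:
    reloaded_embeddings = f["issue_embeddings"][:]
assert len(reloaded_issues) == reloaded_embeddings.shape[0]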
In [274]:
# NOTE: embeddings_dir isn't defined earlier in this notebook; set it to the GCS
# directory the embeddings should be uploaded to, e.g. embeddings_dir = "gs://<your-bucket>/embeddings"
embeddings_file = os.path.join(embeddings_dir, file_name)
if gcs_util.check_gcs_object(embeddings_file):
    logging.info(f"File {embeddings_file} exists")
else:
    logging.info(f"Copying {local_file} to {embeddings_file}")
    gcs_util.copy_to_gcs(local_file, embeddings_file)
In [67]:
embeddings_file
Out[67]: