# Using Plotly v4.1.1 doesn't require account creation. Works offline.
import plotly.graph_objs as go
import itertools
import math
import pandas
from IPython.display import display, HTML
import project_stats
# Stats handle for the "KF1.0" project; its `stats` attribute is what the
# effort helpers in this file are called with.
c = project_stats.ProjectStats(project="KF1.0")
Make plots showing different groups of labels
The following section adds up the expected engineering effort required for the project. Here is the approach:
The distribution is computed both for open issues and for issues in any state, so that the total expected effort can be compared with the effort that still remains.
# Costs of effort labels in workdays per eng
# NOTE(review): "effort/1-day" and "effort/1-days" are both listed with the
# same cost — presumably both spellings occur on issues; confirm.
effort_labels_costs = {
    "effort/1-day": 1,
    "effort/1-days": 1,
    "effort/3-days": 3,
    "effort/5-days": 5,
    "effort/2-weeks": 10,
    # removed temporarily because no issues in KF1.0 refer to this label yet
    # "effort/2-weeks+" : 20
}
# Label names in insertion order; used to select the effort columns.
effort_labels = list(effort_labels_costs.keys())
def current_effort_distribution(stats_df, open_or_total='open'):
    """Return the latest per-effort-label counts.

    Args:
        stats_df: stats frame indexed first by 'open'/'total' and then by
            label name (presumably a pandas DataFrame with two-level
            columns — TODO confirm against project_stats).
        open_or_total: which top-level group to read, 'open' (default) or
            'total'.

    Returns:
        The last row (``tail(1)``) of ``stats_df[open_or_total]`` restricted
        to the columns listed in ``effort_labels``.
    """
    return stats_df[open_or_total][effort_labels].tail(1)
def open_effort_distribution(stats_df):
    """Return the latest effort-label counts for the 'open' group of stats_df."""
    return current_effort_distribution(stats_df, 'open')
def total_effort_distribution(stats_df):
    """Return the latest effort-label counts for the 'total' group of stats_df."""
    return current_effort_distribution(stats_df, 'total')
def current_effort_weeks(stats_df, open_or_total='open'):
    """Estimate the effort, in whole weeks, implied by the latest label counts.

    Each label's count (first row of ``current_effort_distribution``) is
    multiplied by its cost in ``effort_labels_costs`` and the products are
    summed to a number of workdays.

    Args:
        stats_df: stats frame passed through to ``current_effort_distribution``.
        open_or_total: group to read, 'open' (default) or 'total'.

    Returns:
        The number of weeks, rounded up, assuming 5 workdays per week.
    """
    latest = current_effort_distribution(stats_df, open_or_total)
    days = sum(
        latest.iloc[0, latest.columns.get_loc(label)] * effort_labels_costs[label]
        for label in effort_labels
    )
    # consider 5 workdays in weeks
    return math.ceil(days / 5)
def open_effort_weeks(stats_df):
    """Return the estimated effort in weeks for the 'open' group of stats_df."""
    return current_effort_weeks(stats_df, 'open')
def total_effort_weeks(stats_df):
    """Return the estimated effort in weeks for the 'total' group of stats_df."""
    return current_effort_weeks(stats_df, 'total')
# Report the effort estimates and build a per-label open/total table.
# NOTE(review): only the headings are printed here; the values from
# open_effort_weeks / total_effort_weeks are not shown in this copy — confirm
# whether the calls were lost.
print("Open effort needed (in weeks):")
print("Total expected effort for the project (in weeks):")
# One row per effort label, one column per group ('open', 'total').
df = pandas.DataFrame({}, index = effort_labels)
# .values[0] takes the single row returned by the distribution helpers.
df['open'] = open_effort_distribution(c.stats).values[0]
df['total'] = total_effort_distribution(c.stats).values[0]
# NOTE(review): the trace construction between "data = [" and the trailing
# comprehension is missing from this copy; restore it before running.
data = [
) for col in df.columns
The following section computes the progress/completion tracking stats for the components of Kubeflow targeted for 1.0. For all those components, we categorize the issues under the application requirements set by
For each Application & Category combination the following code computes the total issues filed vs. those that are closed. This information is used to produce a completion heat map across the applications and categories.
The information is extracted as follows:
- The tracking issues for all components are listed below.
- They follow a well defined template for listing requirement categories and associated Github issues.
- The parser downloads the issue body and extracts the section for each of those categories.
- It then extracts the referenced github issues from those sections and computes the stats on total referenced issues vs. closed issues.
import os
import re
from github import Github
# Tracking issues for KF 1.0
# Maps each component name to the URL of its tracking issue.
# NOTE(review): every URL is empty in this copy; parse_issue_url needs a full
# issue URL, so fill these in before running the tracking code.
kf1_tracking_issues = {
    'Kfctl': '',
    'Central Dashboard': '',
    'Notebooks Manager UI': '',
    'Notebooks Controller': '',
    'Profiles Controller': '',
    'KFServing Deployments': '',
}
# Requirement categories looked up (case-insensitively) among the '###'
# section headers of each tracking issue.
# NOTE(review): only two categories are present in this copy — confirm the
# list is complete.
kf1_tracking_cats = [
    'Configuration and deployment',
    'Logging and monitoring',
]
In [13]:
# NOTE: Setup GITHUB_TOKEN environment variable to your Github access token
gh = Github(os.environ['GITHUB_TOKEN'])
# Single-element unpacking: raises ValueError unless exactly one of the
# authenticated user's orgs matches.
# NOTE(review): the attribute compared against 'Kubeflow' was missing in the
# source; the org's display name (`name`) is assumed — confirm.
kf_org, = [o for o in gh.get_user().get_orgs() if o.name == 'Kubeflow']
def parse_issue_url(url_str):
    """Parse a GitHub issue URL to extract the repo name and the issue id.

    The repo name is taken as the third path part from the end and the issue
    id as the last one, i.e. ``.../<repo>/issues/<id>``.

    Args:
        url_str: URL for the issue. Surrounding whitespace and a trailing
            '/' are ignored.

    Returns:
        (repo_name, issue_id), with issue_id converted to int.

    Raises:
        ValueError: if the URL has fewer than three '/'-separated parts or
            its last part is not an integer.
    """
    url_parts = url_str.strip().rstrip('/').split('/')
    if len(url_parts) < 3:
        raise ValueError('Not a GitHub issue URL: %r' % (url_str,))
    repo_name = url_parts[-3]
    issue_id = int(url_parts[-1])
    return (repo_name, issue_id)
def get_issue(gh_org, repo_name, issue_id):
    """Fetch the specified GitHub issue.

    Args:
        gh_org: the GitHub org object that owns the repo; must provide
            ``get_repo(name)``.
        repo_name: string name of the repo.
        issue_id: int number of the issue.

    Returns:
        Whatever ``gh_org.get_repo(repo_name).get_issue(issue_id)`` returns.
    """
    return gh_org.get_repo(repo_name).get_issue(issue_id)
def extract_referenced_issues(content):
    """Collect a list of GitHub issues referenced in the content.

    Recognises short references (``#12``) and prefixed ones
    (``repo#12``, ``owner/repo#12``). Name parts may contain letters, digits,
    '_', '.' and '-', so repos such as ``tf-operator`` are kept whole.

    Args:
        content: string markdown representing the body of the tracking issue.

    Returns:
        List of reference strings in order of appearance; duplicates are kept.
    """
    # A name part starts and ends with an alphanumeric character so that
    # punctuation adjacent to a reference ("done.#12") is not absorbed.
    name = r'[A-Za-z0-9](?:[A-Za-z0-9_.\-]*[A-Za-z0-9])?'
    pattern = r'(?:' + name + r'(?:/' + name + r')*)?#\d+'
    return re.findall(pattern, content)
def get_issue_status(fq_issue_id, cur_repo):
    """Identify if the GitHub issue is open or closed.

    Args:
        fq_issue_id: string representing the GitHub format for the issue id,
            e.g. ``#12`` or ``owner/repo#12``.
        cur_repo: string repo name of the current repository. All short form
            issues are assumed to belong to this repository.

    Returns:
        The ``state`` attribute of the fetched issue ('open' or 'closed').
    """
    parts = fq_issue_id.split('#')
    issue_id = int(parts[-1])
    repo_name = cur_repo
    if parts[0]:
        # Prefixed reference: only the last path part (the repo) is used;
        # the issue is always looked up in kf_org.
        repo_name = parts[0].split('/')[-1]
    return get_issue(kf_org, repo_name, issue_id).state
def get_ref_issues_stats(repo_name, content):
    """Compute the state of the issues referenced in the content.

    Args:
        repo_name: repo that short-form references (``#12``) are resolved in.
        content: markdown text to scan for issue references.

    Returns:
        (closed, total_issues): number of references whose status is
        'closed', and the number of references found. A reference that
        appears more than once is counted each time.
    """
    ref_issues = extract_referenced_issues(content)
    closed = sum(
        1 for ref in ref_issues if get_issue_status(ref, repo_name) == 'closed'
    )
    return (closed, len(ref_issues))
def tracking_issue_state(issue_url):
    """Compute the tracking issue state given the tracking issue url.

    Args:
        issue_url: URL of the tracking issue.

    Returns:
        (closed, total) counts for the issues referenced anywhere in the
        tracking issue body, as returned by ``get_ref_issues_stats``.
    """
    repo_name, tr_issue_id = parse_issue_url(issue_url)
    tr_issue = get_issue(kf_org, repo_name, tr_issue_id)
    return get_ref_issues_stats(repo_name, tr_issue.body)
def get_sections(content):
    """Split tracking issue content into separate sections for each set of 1.0 requirements.

    Sections start at ``### <title>`` headers. A section is kept only when
    its title matches, ignoring case and surrounding whitespace, one of
    ``kf1_tracking_cats``.

    Args:
        content: markdown body of the tracking issue.

    Returns:
        Dict mapping the category name, spelled as in ``kf1_tracking_cats``,
        to the text following its header. If a category header appears more
        than once, the last occurrence wins.
    """
    header_re = re.compile(r'### [A-Za-z/ ]+')
    headers = header_re.findall(content)
    bodies = header_re.split(content)[1:]  # Skip content previous to first header
    # Key results by the canonical spelling so callers can index frames whose
    # columns are kf1_tracking_cats.
    canonical = {cat.lower(): cat for cat in kf1_tracking_cats}
    sections = {}
    for header, body in zip(headers, bodies):
        title = header[4:].strip()  # drop the leading '### '
        category = canonical.get(title.lower())
        if category is not None:
            sections[category] = body
    return sections
# Build a figure of closed vs. total referenced issues per tracked component.
# One row per key of kf1_tracking_issues, with columns closed/total/percent.
def plot_tracking_stats():
tracking_df = pandas.DataFrame({}, columns=['closed', 'total', 'percent'])
print('Fetching stats on...')
for k,v in kf1_tracking_issues.items():
closed, total = tracking_issue_state(v)
# NOTE(review): divides by total with no zero check, unlike tracking_heatmap;
# a tracking issue with no references would raise ZeroDivisionError.
tracking_df.loc[k] = [closed, total, closed * 100.0 / total]
# Bar labels of the form "<closed>/<total>".
bar_texts = [str(tracking_df.loc[i, 'closed']) + '/' + str(tracking_df.loc[i, 'total']) for i in tracking_df.index.values]
# NOTE(review): the trace constructor and closing brackets around these
# keyword arguments are missing from this copy; restore them before running.
data = [
orientation = 'h',
text = bar_texts,
return go.Figure(data)
def tracking_heatmap():
    """Compute and return the tracking heatmap across all components and categories.

    For every component in ``kf1_tracking_issues`` the tracking issue is
    fetched, split into requirement sections, and the percentage of closed
    referenced issues is computed per section.

    Returns:
        A ``go.Figure`` holding a heatmap of completion percentages with
        categories on x and components on y. Cells for categories that a
        tracking issue does not contain are left unset.
    """
    percent_df = pandas.DataFrame(
        {}, columns=kf1_tracking_cats, index=kf1_tracking_issues.keys()
    )
    print('Fetching stats on...')
    for component, issue_url in kf1_tracking_issues.items():
        repo_name, tr_issue_id = parse_issue_url(issue_url)
        tr_issue = get_issue(kf_org, repo_name, tr_issue_id)
        sections = get_sections(tr_issue.body)
        for category, section_body in sections.items():
            print('>>>' + category)
            closed, total = get_ref_issues_stats(repo_name, section_body)
            if total != 0:
                percent_df.loc[component, category] = closed * 100.0 / total
            else:
                # A section with no referenced issues counts as 0% complete.
                percent_df.loc[component, category] = 0.0
    fig = go.Figure(data=go.Heatmap(
        z=percent_df,
        x=percent_df.columns.values,
        y=percent_df.index.values))
    # Return the figure so the caller (or the notebook) can display it.
    return fig
def issues_missing_effort_labels(stats_df):
    """Estimate how many cards of the KF1.0 project carry no effort label.

    Args:
        stats_df: stats frame passed to ``total_effort_distribution``.

    Returns:
        Number of cards across all columns of the 'KF1.0' project minus the
        sum of the latest 'total' counts of all effort labels.

    Raises:
        IndexError: if the org has no project named 'KF1.0'.
    """
    latest = total_effort_distribution(stats_df)
    num_issues_with_effort = sum(
        latest.iloc[0, latest.columns.get_loc(label)] for label in effort_labels
    )
    # NOTE(review): the attribute compared against 'KF1.0' was missing in the
    # source; the project's `name` is assumed — confirm.
    kf1_proj = [p for p in kf_org.get_projects() if p.name == 'KF1.0'][0]
    # Every card counts as one issue (presumably all cards are issues — verify).
    all_issues = sum(1 for col in kf1_proj.get_columns() for _ in col.get_cards())
    return all_issues - num_issues_with_effort
# Print the count returned by issues_missing_effort_labels for the loaded stats.
print(issues_missing_effort_labels(c.stats), "issues are missing effort labels")