Ack: Derived from project_stats.ipynb and project_stats.py
Set the `GITHUB_TOKEN` environment variable to pass your GitHub access token to the code.
In [1]:
# Using Plotly v4.1.1 doesn't require account creation. Works offline.
# https://plot.ly/python/getting-started/
import plotly.graph_objs as go
import itertools
import math
import pandas
from IPython.display import display, HTML
In [2]:
import project_stats
In [3]:
# Build the stats collector for the "KF1.0" project and run it.
# Later cells read the resulting frame from `c.stats`
# (presumably populated by main() — confirm in project_stats.py).
c = project_stats.ProjectStats(project="KF1.0")
c.main()
Make plots showing different groups of labels
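The following section adds up the expected engineering effort required for the project. Here is the approach:
- Each `effort/*` label is assigned a cost in workdays per engineer.
- The number of issues carrying each label is multiplied by that label's cost and summed to get the total workdays, which are converted to weeks assuming 5 workdays per week (rounded up).

The distribution is computed for open issues as well as for issues in any state, so that the total expected effort can be compared with the remaining effort.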
In [4]:
# Costs of effort labels in workdays per eng
# Keys are issue label names; each value is the cost charged for one issue
# carrying that label.
effort_labels_costs = {
"effort/1-day" : 1,
# NOTE(review): "effort/1-days" looks like a variant spelling of
# "effort/1-day" — confirm both labels are actually in use.
"effort/1-days" : 1,
"effort/3-days" : 3,
"effort/5-days" : 5,
"effort/2-weeks" : 10,
# removed temporarily because no issues in KF1.0 refer to this label yet
# "effort/2-weeks+" : 20
}
# Label names in dict order; used below to select the label columns from the
# stats frame and to iterate when summing effort.
effort_labels = list(effort_labels_costs.keys())
In [5]:
def current_effort_distribution(stats_df, open_or_total='open', labels=None):
    """Return the last row of the per-label effort counts.

    stats_df: stats frame; ``stats_df[open_or_total]`` must yield a frame
        whose columns are label names.
    open_or_total: 'open' or 'total', selecting which group of columns to read.
    labels: optional list of label names to report. Defaults to the
        module-level ``effort_labels``.

    returns a one-row DataFrame with exactly one column per requested label.
    A label that has no column in the stats is reported as 0 instead of
    raising KeyError, so labels that no issue uses yet do not have to be
    removed from the cost table.
    """
    if labels is None:
        labels = effort_labels
    group = stats_df[open_or_total]
    # reindex keeps the requested order and zero-fills absent label columns.
    return group.reindex(columns=list(labels), fill_value=0).tail(1)
def open_effort_distribution(stats_df):
    """Effort-label distribution restricted to the 'open' columns of stats_df."""
    distribution = current_effort_distribution(stats_df, open_or_total='open')
    return distribution
def total_effort_distribution(stats_df):
    """Effort-label distribution read from the 'total' columns of stats_df."""
    distribution = current_effort_distribution(stats_df, open_or_total='total')
    return distribution
def current_effort_weeks(stats_df, open_or_total='open'):
    """Convert the current effort distribution into a number of work weeks.

    Each label's value is multiplied by its cost in workdays
    (``effort_labels_costs``); the sum is divided by 5 workdays per week and
    rounded up.
    """
    workdays_per_week = 5
    latest = current_effort_distribution(stats_df, open_or_total)
    total_days = 0
    for label in effort_labels:
        count = latest.iloc[0, latest.columns.get_loc(label)]
        total_days += count * effort_labels_costs[label]
    return math.ceil(total_days / workdays_per_week)
def open_effort_weeks(stats_df):
    """Work weeks computed from the 'open' effort distribution."""
    weeks = current_effort_weeks(stats_df, open_or_total='open')
    return weeks
def total_effort_weeks(stats_df):
    """Work weeks computed from the 'total' effort distribution."""
    weeks = current_effort_weeks(stats_df, open_or_total='total')
    return weeks
In [6]:
open_effort_distribution(c.stats)
Out[6]:
In [7]:
total_effort_distribution(c.stats)
Out[7]:
In [29]:
print("Open effort needed (in weeks):")
print(open_effort_weeks(c.stats))
In [28]:
print("Total expected effort for the project (in weeks):")
print(total_effort_weeks(c.stats))
In [10]:
# Grouped bar chart: one bar series for open issues and one for all issues,
# with one bar per effort label.
df = pandas.DataFrame({}, index=effort_labels)
df['open'] = open_effort_distribution(c.stats).values[0]
df['total'] = total_effort_distribution(c.stats).values[0]
data = [
    go.Bar(
        x=effort_labels,
        y=df[col].values,
        name=col,
    )
    for col in df.columns
]
# Title and axis labels let the figure stand alone when the notebook is skimmed.
go.Figure(
    data=data,
    layout=go.Layout(
        title='Issues per effort label (open vs. total)',
        xaxis=dict(title='Effort label'),
        yaxis=dict(title='Number of issues'),
    ),
)
The following section computes the progress/completion tracking stats for the components of Kubeflow targeted for 1.0. For all those components, we categorize the issues under the application requirements set by https://github.com/kubeflow/community/blob/master/guidelines/application_requirements.md
For each Application & Category combination the following code computes the total issues filed vs. those that are closed. This information is used to produce a completion heat map across the applications and categories.
The information is extracted as follows:
- The tracking issues for all components are listed below.
- They follow a well-defined template for listing requirement categories and the associated GitHub issues.
- The parser downloads the issue body and extracts the section for each of those categories.
- It then extracts the referenced GitHub issues from those sections and computes the stats on total referenced issues vs. closed issues.
In [11]:
import os
import re
from github import Github
In [12]:
# Tracking issues for KF 1.0
# Maps a component display name to the URL of its GitHub tracking issue.
# The URLs are parsed by parse_issue_url() to get the repo name and issue number.
kf1_tracking_issues = {
'Kfctl' : 'https://github.com/kubeflow/kfctl/issues/18',
'Central Dashboard' : 'https://github.com/kubeflow/kubeflow/issues/4026',
'Notebooks Manager UI' : 'https://github.com/kubeflow/kubeflow/issues/4062',
'Notebooks Controller' : 'https://github.com/kubeflow/kubeflow/issues/3656',
'Profiles Controller' : 'https://github.com/kubeflow/kubeflow/issues/4058',
'KFServing Deployments' : 'https://github.com/kubeflow/kubeflow/issues/4061',
}
# Requirement categories looked up (case-insensitively) among the "### ..."
# section headers of each tracking issue body; also the heatmap columns.
kf1_tracking_cats = [
'Configuration and deployment',
'Logging and monitoring',
'CI/CD',
'Docs'
]
In [13]:
# NOTE: Set the GITHUB_TOKEN environment variable to your GitHub access token.
if not os.environ.get('GITHUB_TOKEN'):
    raise KeyError(
        'GITHUB_TOKEN environment variable is not set; '
        'export a GitHub access token before running this notebook')
gh = Github(os.environ['GITHUB_TOKEN'])
# Look the organization up by its login (the "kubeflow" in the issue URLs)
# instead of scanning the token owner's memberships: the scan only works for
# members of the org and fails with an opaque unpacking error otherwise.
kf_org = gh.get_organization('kubeflow')
kf_org.name
Out[13]:
In [14]:
def parse_issue_url(url_str):
    """Parse a GitHub issue URL to extract the repo name and the issue id.

    url_str: URL of the issue, e.g.
        'https://github.com/kubeflow/kfctl/issues/18'. Surrounding whitespace,
        a trailing slash, a '?query' and a '#fragment' are ignored.

    return (repo_name, issue_id) with issue_id as an int.

    Raises IndexError if the URL has too few path components and ValueError
    if the last component is not an integer.
    """
    # Strip decorations that would otherwise end up in the last path segment.
    path = url_str.strip().split('#', 1)[0].split('?', 1)[0].rstrip('/')
    url_parts = path.split('/')
    # Expected tail: .../<repo>/issues/<number>
    repo_name = url_parts[-3]
    issue_id = int(url_parts[-1])
    return (repo_name, issue_id)
In [15]:
def get_issue(gh_org, repo_name, issue_id):
    """Look up one issue in a repository of the given organization.

    gh_org: organization object exposing ``get_repo(name)``.
    repo_name: string, name of the repository inside the organization.
    issue_id: int, issue number within that repository.

    returns whatever ``get_issue(issue_id)`` of the repository object returns.
    """
    repository = gh_org.get_repo(repo_name)
    issue = repository.get_issue(issue_id)
    return issue
In [16]:
def extract_referenced_issues(content):
    """Collect the GitHub issue references found in the content.

    content: string, markdown body of the tracking issue. ``None`` or an
        empty string (an issue without a body) yields an empty list.

    returns a list of references in document order, each either '#123' or
    prefixed with a repository path such as 'kubeflow/tf-operator#123'.
    Duplicates are kept.
    """
    if not content:
        return []
    # Repository names may contain upper-case letters, digits, '-', '_' and
    # '.'; a lower-case-only class would cut 'kubeflow/tf-operator#1' down to
    # 'operator#1' and point at the wrong repository.
    ref_issues = re.findall(r'[A-Za-z0-9_.\-/]*#\d+', content)
    return ref_issues
In [17]:
def get_issue_status(fq_issue_id, cur_repo):
    """Return the state of a referenced GitHub issue.

    fq_issue_id: string, issue reference in GitHub short form ('#12') or
        with a repository prefix ('org/repo#12').
    cur_repo: string, repository assumed for references without a prefix.

    returns the ``state`` attribute of the fetched issue
    ('open' or 'closed').
    """
    pieces = fq_issue_id.split('#')
    prefix, number = pieces[0], int(pieces[-1])
    # Only the last path component of the prefix names the repository.
    repo_name = prefix.split('/')[-1] if prefix else cur_repo
    return get_issue(kf_org, repo_name, number).state
In [18]:
def get_ref_issues_stats(repo_name, content):
    """Count the issues referenced in content and how many of them are closed.

    repo_name: string, repository used for references without a prefix.
    content: string, markdown text to scan for issue references.

    returns (closed, total_issues).
    """
    references = extract_referenced_issues(content)
    closed_count = sum(
        1 for ref in references if get_issue_status(ref, repo_name) == 'closed')
    return (closed_count, len(references))
In [19]:
def tracking_issue_state(issue_url):
    """Compute (closed, total) for the issues referenced by a tracking issue.

    issue_url: string, URL of the tracking issue.
    """
    repo, issue_number = parse_issue_url(issue_url)
    body = get_issue(kf_org, repo, issue_number).body
    return get_ref_issues_stats(repo, body)
In [20]:
def get_sections(content, categories=None):
    """Split tracking issue content into one section per 1.0 requirement category.

    content: string, markdown body of the tracking issue (``None`` is treated
        as empty).
    categories: optional list of category names to look for. Defaults to the
        module-level ``kf1_tracking_cats``.

    returns a dict mapping the canonical category name (exactly as spelled in
    ``categories``) to the text between that '### ' header and the next one.
    Headers are matched case-insensitively and ignoring surrounding spaces,
    so '### docs ' lands under 'Docs' rather than under a stray key. Text
    before the first header and sections of other headers are dropped.
    """
    if categories is None:
        categories = kf1_tracking_cats
    canonical = {cat.lower(): cat for cat in categories}

    text = content or ''
    header_re = re.compile(r'### [A-Za-z/ ]+')
    headers = header_re.findall(text)
    bodies = header_re.split(text)[1:]  # Skip content previous to first header

    sections = {}
    for header, body in zip(headers, bodies):
        title = header[4:].strip().lower()
        if title in canonical:
            sections[canonical[title]] = body
    return sections
In [21]:
def plot_tracking_stats():
    """Build a horizontal bar chart of completion percentage per component.

    For every entry of ``kf1_tracking_issues`` the tracking issue is fetched
    and the referenced issues are counted. Each bar shows the percentage of
    closed issues and is annotated with 'closed/total'.

    returns the plotly Figure.
    """
    tracking_df = pandas.DataFrame({}, columns=['closed', 'total', 'percent'])
    print('Fetching stats on...')
    for k, v in kf1_tracking_issues.items():
        print(k)
        closed, total = tracking_issue_state(v)
        # A tracking issue that references no issues yet counts as 0%
        # complete (same convention as tracking_heatmap) instead of raising
        # ZeroDivisionError.
        percent = closed * 100.0 / total if total != 0 else 0.0
        tracking_df.loc[k] = [closed, total, percent]
    print("Done")
    bar_texts = [
        str(tracking_df.loc[i, 'closed']) + '/' + str(tracking_df.loc[i, 'total'])
        for i in tracking_df.index.values
    ]
    data = [
        go.Bar(
            y=tracking_df.index.values,
            x=tracking_df['percent'],
            orientation='h',
            text=bar_texts,
        )
    ]
    return go.Figure(data)
In [22]:
plot_tracking_stats()
In [23]:
def tracking_heatmap():
    """Compute and display the completion heatmap over components and categories.

    For each tracking issue the body is split into category sections, and the
    closed/total counts of the issues referenced in every section are
    recorded. The closed, total and percent tables are displayed as HTML,
    followed by a heatmap of the percentages.
    """
    def _empty_frame():
        # One row per component, one column per requirement category.
        return pandas.DataFrame(
            {}, columns=kf1_tracking_cats, index=kf1_tracking_issues.keys())

    closed_df, total_df, percent_df = _empty_frame(), _empty_frame(), _empty_frame()

    print('Fetching stats on...')
    for component, issue_url in kf1_tracking_issues.items():
        print(component)
        print(issue_url)
        repo_name, tracking_id = parse_issue_url(issue_url)
        body = get_issue(kf_org, repo_name, tracking_id).body
        for category, section_text in get_sections(body).items():
            print('>>>' + category)
            closed, total = get_ref_issues_stats(repo_name, section_text)
            closed_df.loc[component, category] = closed
            total_df.loc[component, category] = total
            # Sections without referenced issues are reported as 0%.
            percent_df.loc[component, category] = (
                closed * 100.0 / total if total != 0 else 0.0)
    print("Done")

    for frame in (closed_df, total_df, percent_df):
        display(HTML(frame.to_html()))

    heatmap = go.Heatmap(
        z=percent_df,
        x=percent_df.columns.values,
        y=percent_df.index.values)
    fig = go.Figure(data=heatmap)
    fig.show()
In [24]:
tracking_heatmap()
In [25]:
def issues_missing_effort_labels(stats_df, project_name='KF1.0'):
    """Estimate how many project cards have no effort label.

    stats_df: stats frame accepted by ``total_effort_distribution``.
    project_name: name of the organization project whose cards are counted.
        Defaults to 'KF1.0'.

    returns the number of cards across all columns of the project minus the
    number of issues that carry one of ``effort_labels``.

    Raises IndexError if the organization has no project with that name.
    """
    effort_row = total_effort_distribution(stats_df)
    num_issues_with_effort = sum(
        effort_row.iloc[0, effort_row.columns.get_loc(label)]
        for label in effort_labels)

    matching = [p for p in kf_org.get_projects() if p.name == project_name]
    if not matching:
        raise IndexError(
            'No project named %r found in the organization' % (project_name,))

    # NOTE(review): every card is counted; if the board also holds note cards
    # that are not issues, this over-counts — confirm.
    all_issues = sum(
        1
        for column in matching[0].get_columns()
        for _ in column.get_cards())
    return all_issues - num_issues_with_effort

print(issues_missing_effort_labels(c.stats), "issues are missing effort labels")