This notebook reports kubeflow test flakiness. We leverage kettle, which uploads test metadata into BigQuery upon publishing in GCS buckets via Prow. We also customize flakiness queries used in k8s infra-test metrics to compute daily flakiness metrics for kubeflow test jobs, junit tests, and workflow results.
We aim to use this report to evaluate kubeflow flakiness and as clues to debug the flakiness in the kubeflow test pipeline. At this stage, we conclude that kubeflow test metadata are not sufficient to provide clues for debugging flakiness. Also, we currently generate the report for the kubeflow PRs which trigger presubmit tests. Later we will expand it for all types of kubeflow tests.
Caveat: flakiness metrics are correct for commits for which a conclusion is made, i.e., either it passed or failed and that is reflected in the logs. If a commit failed and future runs reveal that the failure was a flake, it won't be reflected in the current flakiness metrics. Therefore, we should look at the flakiness report over a relatively long time window (e.g., a week).
In [74]:
%load_ext google.cloud.bigquery
In [76]:
%%bigquery daily_flakiness
/* Daily flakiness metrics per kubeflow presubmit job.
   A (job, pr-num, commit) triple is considered to have flaked when some of
   its runs passed and some failed; commits that always pass or always fail
   do not count as flakes. */
with recent_runs as (
    /* All runs of any kubeflow presubmit job over the past 672 hours (28 days),
       noting the PR number, commit, and whether the run passed. */
    select
        job,
        regexp_extract(path, r'/(\d+)\/') as num, /* pr number */
        regexp_extract(m.value, r'[^,]+,\d+:([a-f0-9]+)"') as commit, /* first commit id from the repo flag */
        EXTRACT(DATE FROM started) as start_date,
        result
    from `k8s-gubernator.build.all`, UNNEST(metadata) as m
    where
        started > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 672 HOUR)
        and m.key = "repos"
        and STRPOS(job, 'kubeflow') > 0
        and STRPOS(job, 'pr:') > 0
),
commit_runs as (
    /* Count the runs and passes for each (job, pr-num, commit). */
    select
        job,
        max(start_date) as start_date,
        num,
        commit,
        sum(if(result = 'SUCCESS', 1, 0)) as passed,
        count(result) as runs
    from recent_runs
    group by job, num, commit
),
commit_flakes as (
    /* Determine whether a (job, pr-num, commit) flaked. */
    select
        job,
        start_date,
        num,
        commit,
        if(passed = runs or passed = 0, 0, 1) as flaked,
        if(passed = runs or passed = 0, 0, runs - passed) as flaky_runs,
        if(passed = 0, runs, 0) as failed,
        passed,
        CAST(runs as INT64) as runs
    from commit_runs
)
select
    job,
    start_date,
    /* share of runs that agree with the commit's eventual outcome */
    round(sum(if(flaked = 1, passed, runs)) / sum(runs), 3) as build_consistency,
    /* share of distinct commits that did not flake */
    round(1 - sum(flaked) / count(distinct commit), 3) as commit_consistency,
    round(sum(flaked) / count(distinct commit), 3) as flake_rate,
    sum(flaked) as flakes,
    sum(runs) as runs,
    sum(passed) as passed,
    sum(flaky_runs) as flaky_runs,
    sum(failed) as failed
from commit_flakes
group by job, start_date
order by
    flakes desc,
    build_consistency,
    commit_consistency,
    job
Out[76]:
In [78]:
# Aggregate the per-(job, day) flakiness rows into one row per day and
# plot the daily mean flake rate.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

# How each metric is collapsed across jobs for a given day.
daily_agg_spec = {
    'flake_rate': 'mean',
    'flakes': 'sum',
    'runs': 'sum',
}
overal_flakes = (
    pd.DataFrame(daily_flakiness)
    .groupby("start_date", as_index=False)
    .agg(daily_agg_spec)
)

matplotlib.rc('font', size=16)
# One x tick per aggregated day, labelled with its date below.
ax = overal_flakes.flake_rate.plot(
    xticks=overal_flakes.index, figsize=(14, 8), rot=45)
plt.title('Daily flake rate')
plt.xlabel('time')
plt.ylabel('flake_rate')
ax.set_xticklabels(overal_flakes['start_date'])
plt.ylim([0, 1])
plt.show()
As illustrated by the plot, build_consistency and commit_consistency are correlated, as expected. Note that build_consistency is computed with respect to the total number of runs, whereas commit_consistency is computed with respect to the number of distinct commits. This means that if the number of flaky runs increases (i.e., running retest), then build_consistency becomes lower than commit_consistency.
In [79]:
# Plot the daily mean build and commit consistency on a shared axis.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

# One row per day with the average of each consistency metric across jobs.
consistency_agg_spec = {
    'build_consistency': 'mean',
    'commit_consistency': 'mean',
}
overal_consistency = (
    pd.DataFrame(daily_flakiness)
    .groupby("start_date", as_index=False)
    .agg(consistency_agg_spec)
)

matplotlib.rc('font', size=16)
# The second plot call draws onto the same current axes as the first.
ax1 = overal_consistency.build_consistency.plot(
    xticks=overal_consistency.index, figsize=(14, 8), rot=45)
ax2 = overal_consistency.commit_consistency.plot(
    xticks=overal_consistency.index, figsize=(14, 8), rot=45)
plt.title('Daily build and commit consistency')
plt.xlabel('time')
plt.ylabel('percentage of consistency')
ax1.set_xticklabels(overal_consistency['start_date'])
plt.legend()
plt.ylim([0, 1])
plt.show()
In [80]:
# Stacked bar chart of passed / failed / flaky run counts per job,
# summed over the whole reporting window.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Sum run outcomes over all days for each job.
job_flakiness = pd.DataFrame(daily_flakiness).groupby("job", as_index=False).agg(
    {'passed': 'sum',
     'failed': 'sum',
     'flaky_runs': 'sum'
     })
matplotlib.rc('font', size=16)
ax = job_flakiness[['passed', 'failed', 'flaky_runs']].plot(
    kind='bar', stacked=True, xticks=job_flakiness.index, figsize=(14, 8), rot=90)
ax.set_xticklabels(job_flakiness['job'])
plt.title('job flakiness')  # fixed typo: title previously read 'job flakines'
plt.xlabel('job')
plt.ylabel('number of runs')
plt.show()
Below we compute the failure rate per junit test as well as per workflow test. Kettle stores junit metadata in the column "test". Workflow test results are stored in the column "metadata", which is a map. If a key ends with "-phase", the key contains the workflow test name and the value indicates whether it succeeded or failed.
In [200]:
%%bigquery teststats
/* Failure counts and failure percentage per junit test across all
   kubeflow presubmit runs.
   NOTE: the 'filurepercentage' alias is kept as-is since it is the
   query's output column name. */
CREATE TEMP FUNCTION removeFirstChar(x STRING)
RETURNS STRING
LANGUAGE js AS """
if (x.charAt(0) == '-') {
x=x.substr(1);
}
return x;
""";
with junit_counts as (
    /* Count runs and failures for each junit test name (leading '-' stripped). */
    SELECT
        removeFirstChar(t.name) AS testname,
        SUM(CASE WHEN t.failed=TRUE THEN 1 ELSE 0 END) AS failures,
        COUNT(*) AS runs,
        ROUND(SUM(CASE WHEN t.failed=TRUE THEN 1 ELSE 0 END)/COUNT(*)*100, 2) AS filurepercentage
    FROM `k8s-gubernator.build.all`, UNNEST(test) as t
    WHERE job LIKE '%kubeflow-presubmit%'
    GROUP BY testname
)
SELECT
    testname,
    runs,
    failures,
    filurepercentage
FROM junit_counts
/* exclude junit entries named after the presubmit job itself */
WHERE testname not LIKE '%kubeflow-presubmit%'
ORDER BY filurepercentage DESC
Out[200]:
In [201]:
%%bigquery teststats
/* Failure counts and failure percentage per workflow test across all
   kubeflow presubmit runs. Workflow results live in metadata entries whose
   key ends with "-phase": the key carries the workflow test name and the
   value carries the outcome ("Succeeded" on success). */
CREATE TEMP FUNCTION getWorkflowTestName(x STRING)
RETURNS STRING
LANGUAGE js AS """
var r=/\\d/;
var y=x.replace("-e2e","-endtoend");
var fd=r.exec(y);
y=y.substring(0, y.indexOf(fd) - 1);
y=y.replace("-endtoend","-e2e");
return y;
""";
with phase_entries as (
    /* Every "-phase" metadata entry of a kubeflow presubmit run. */
    SELECT m.key AS mkey, m.value AS mvalue
    FROM `k8s-gubernator.build.all`, UNNEST(metadata) as m
    WHERE job LIKE '%kubeflow-presubmit%' and ENDS_WITH(m.key, "-phase")
),
named_results as (
    /* Strip the numeric suffix from the key to recover the workflow test name. */
    SELECT getWorkflowTestName(mkey) AS testname, mvalue
    FROM phase_entries
)
SELECT
    testname,
    /* anything other than "Succeeded" counts as a failure */
    SUM(CASE WHEN mvalue ="Succeeded" THEN 0 ELSE 1 END) AS failures,
    COUNT(*) AS runs,
    ROUND(SUM(CASE WHEN mvalue ="Succeeded" THEN 0 ELSE 1 END)/COUNT(*)*100, 2) AS filurepercentage
FROM named_results
GROUP BY testname
ORDER BY filurepercentage DESC
Out[201]:
The following is a list of daily flakiness metrics used for junit tests.
In [ ]:
Here we first compute junit results along with the kubeflow test job result per commit, and filter for commits which have flakes. A commit is said to flake if some of its runs pass and some fail. Here are the columns:
In [91]:
%%bigquery testkfisready
/* Per-(junit test, commit) pass/fail stats for kubeflow presubmit jobs,
   restricted to flaky commits: the final filter keeps only rows where the
   job both passed and failed at least once for the commit AND the junit
   test failed at least once. Build paths and test elapsed times are
   aggregated into arrays so individual runs can be inspected. */
CREATE TEMP FUNCTION removeFirstChar(x STRING)
RETURNS STRING
LANGUAGE js AS """
if (x.charAt(0) == '-') {
x=x.substr(1);
}
return x;
""";
SELECT
testname,
job,
runs,
test_failed,
test_passed,
job_passed,
job_failed,
num,
path,
commit,
test_elapsed_time
FROM(
/* Aggregate the per-run rows into one row per (testname, commit, pr-num, job). */
SELECT
testname,
count(*) runs,
sum(job_passed) job_passed,
count(*) - sum(job_passed) job_failed,
max(start_date) start_date,
sum(test_failed) test_failed,
sum(test_passed) test_passed,
num,
array_agg(path) path,
commit,
job,
array_agg(test_elapsed_time) test_elapsed_time
FROM(
SELECT /* collect stats per (commit, testname)*/
t.name testname,
CASE WHEN t.failed=TRUE THEN 1 ELSE 0 END test_failed,
CASE WHEN t.failed=TRUE THEN 0 ELSE 1 END test_passed,
job_passed,
t.time test_elapsed_time,
num,
path path,
job,
start_date start_date,
regexp_extract(commit, r'[^,]+,\d+:([a-f0-9]+)"') commit /* extract the first commit id from the repo flag */
FROM( /* collect kubeflow commit rows */
SELECT
path,
m.value commit,
test,
job,
EXTRACT(DATE FROM started) start_date,
regexp_extract(path, r'/(\d+)\/') as num, /* pr number */
CASE WHEN result='SUCCESS' THEN 1 ELSE 0 END job_passed
FROM
`k8s-gubernator.build.all`, UNNEST(metadata) as m
WHERE
job LIKE '%kubeflow-presubmit%'
/* 700 hours: roughly the past 29 days */
and started > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 700 HOUR)
and (m.key = "repos") and STRPOS(job,'kubeflow') > 0 and STRPOS(job,'pr:') > 0
), UNNEST(test) as t
/* exclude junit entries whose name contains the presubmit job name */
where t.name not LIKE '%kubeflow-presubmit%'
)
GROUP BY testname,commit,num,job
/* keep only flaky commits on which this junit test actually failed */
) where job_passed>0 and job_failed>0 and test_failed>0
Out[91]:
In [40]:
# Render the flaky-test table with full, untruncated cell contents
# (the path/commit arrays are long).
import pandas as pd
# None is the supported way to disable column-width truncation;
# -1 is deprecated in pandas >= 1.0.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(testkfisready)
Out[40]:
In [82]:
%%bigquery testkfisready
/* Per-(workflow test, commit) pass/fail stats for kubeflow presubmit jobs
   over the past 168 hours (7 days), restricted by the final filter to flaky
   commits (job passed and failed at least once) where the workflow test
   failed at least once. */
CREATE TEMP FUNCTION getWorkflowTestName(x STRING)
RETURNS STRING
LANGUAGE js AS """
var r=/\\d/;
var y=x.replace("-e2e","-endtoend");
var fd=r.exec(y);
y=y.substring(0, y.indexOf(fd) - 1);
y=y.replace("-endtoend","-e2e");
return y;
""";
SELECT
testname,
runs,
wf_passed,
wf_failed,
passed,
failed,
start_date,
num,
job,
commit,
path,
/* flags are redundant under the final WHERE filter but kept for clarity */
IF(failed>0 and passed>0 and wf_failed>0, 1,0) wf_flake,
if(failed>0 and passed> 0, 1,0) flake
FROM(
/* Aggregate per-run rows into one row per (testname, pr-num, job, commit). */
SELECT
testname,
count(*) runs,
sum(wf_passed) wf_passed,
sum(wf_failed) wf_failed,
sum(passed) passed,
sum(failed) failed,
max(start_date) start_date,
num,
array_agg(path) path,
job,
commit
FROM(
/* One row per "-phase" metadata entry, i.e. per workflow test result. */
SELECT
getWorkflowTestName(m.key) testname,
IF(m.value="Succeeded", 1, 0) wf_passed,
IF(m.value="Succeeded", 0, 1) wf_failed,
regexp_extract(commit, r'[^,]+,\d+:([a-f0-9]+)"') commit,
job,
path,
start_date,
num,
passed,
failed
FROM(
/* Kubeflow presubmit runs with their job-level pass/fail outcome. */
SELECT
path,
test,
job,
metadata,
EXTRACT(DATE FROM started) start_date,
regexp_extract(path, r'/(\d+)\/') as num, /* pr number */
CASE WHEN result='SUCCESS' THEN 1 ELSE 0 END passed,
CASE WHEN result='SUCCESS' THEN 0 ELSE 1 END failed,
(SELECT m.value From UNNEST(metadata) as m where m.key = "repos") as commit
FROM
`k8s-gubernator.build.all`
WHERE
job LIKE '%kubeflow-presubmit%'
and started > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 168 HOUR)
and STRPOS(job,'kubeflow') > 0 and STRPOS(job,'pr:') > 0
), UNNEST(metadata) as m
where
job LIKE '%kubeflow-presubmit%' and ENDS_WITH(m.key, "-phase")
)
group by testname,job,commit,num
/* keep only flaky commits where the workflow test failed at least once */
)WHERE failed>0 and passed>0 and wf_failed>0
Out[82]:
In [71]:
%%bigquery testkfisready
/* Summary of workflow-test flakiness per test name over the past 600 hours
   (25 days): totals of runs, passes, failures, workflow passes/failures and
   flake counts, ordered by the number of flakes in which the workflow test
   itself failed. */
CREATE TEMP FUNCTION getWorkflowTestName(x STRING)
RETURNS STRING
LANGUAGE js AS """
var r=/\\d/;
var y=x.replace("-e2e","-endtoend");
var fd=r.exec(y);
y=y.substring(0, y.indexOf(fd) - 1);
y=y.replace("-endtoend","-e2e");
return y;
""";
/* Collapse per-commit flake rows into one summary row per workflow test. */
SELECT
testname,
sum(runs) runs,
sum(passed) passed,
sum(failed) failed,
sum(wf_passed) wf_passed,
sum(wf_failed) wf_failed,
sum(if(wf_failed>0 and flake>0,1,0)) wf_flakes,
sum(flake) flakes
FROM(
/* Mark each (testname, commit) row as a flake when the job both passed
   and failed for that commit. */
SELECT
testname,
runs,
wf_passed,
wf_failed,
passed,
failed,
start_date,
num,
job,
commit,
path,
if(failed>0 and passed> 0, 1,0) flake
FROM(
/* Aggregate per-run rows into one row per (testname, pr-num, job, commit). */
SELECT
testname,
count(*) runs,
sum(wf_passed) wf_passed,
sum(wf_failed) wf_failed,
sum(passed) passed,
sum(failed) failed,
max(start_date) start_date,
num,
array_agg(path) path,
job,
commit
FROM(
/* One row per "-phase" metadata entry, i.e. per workflow test result. */
SELECT
getWorkflowTestName(m.key) testname,
IF(m.value="Succeeded", 1, 0) wf_passed,
IF(m.value="Succeeded", 0, 1) wf_failed,
regexp_extract(commit, r'[^,]+,\d+:([a-f0-9]+)"') commit,
job,
path,
start_date,
num,
passed,
failed
FROM(
/* Kubeflow presubmit runs with their job-level pass/fail outcome;
   rows without a parsable PR number are dropped. */
SELECT
path,
test,
job,
metadata,
EXTRACT(DATE FROM started) start_date,
regexp_extract(path, r'/(\d+)\/') as num, /* pr number */
CASE WHEN result='SUCCESS' THEN 1 ELSE 0 END passed,
CASE WHEN result='SUCCESS' THEN 0 ELSE 1 END failed,
(SELECT m.value From UNNEST(metadata) as m where m.key = "repos") as commit
FROM
`k8s-gubernator.build.all`
WHERE
job LIKE '%kubeflow-presubmit%'
and started > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 600 HOUR)
and STRPOS(job,'kubeflow') > 0 and STRPOS(job,'pr:') > 0
and regexp_extract(path, r'/(\d+)\/') is not null
), UNNEST(metadata) as m
where
job LIKE '%kubeflow-presubmit%' and ENDS_WITH(m.key, "-phase")
)
group by testname,job,commit,num
/* keep only flaky commits where the workflow test failed at least once */
)WHERE failed>0 and passed>0 and wf_failed>0
) group by testname
order by wf_flakes DESC
Out[71]:
In [ ]:
%%bigquery testkfisready
/* Same workflow-flakiness summary as the previous cell (past 600 hours),
   but additionally aggregates the contributing PR numbers and commit ids
   into arrays, and drops rows with a NULL commit or PR number. */
CREATE TEMP FUNCTION getWorkflowTestName(x STRING)
RETURNS STRING
LANGUAGE js AS """
var r=/\\d/;
var y=x.replace("-e2e","-endtoend");
var fd=r.exec(y);
y=y.substring(0, y.indexOf(fd) - 1);
y=y.replace("-endtoend","-e2e");
return y;
""";
/* Collapse per-commit flake rows into one summary row per workflow test,
   keeping the PR numbers and commits involved. */
SELECT
testname,
sum(runs) runs,
sum(passed) passed,
sum(failed) failed,
sum(wf_passed) wf_passed,
sum(wf_failed) wf_failed,
sum(if(wf_failed>0 and flake>0,1,0)) wf_flakes,
sum(flake) flakes,
array_agg(num) num,
array_agg(commit) co
FROM(
/* Mark each (testname, commit) row as a flake when the job both passed
   and failed for that commit. */
SELECT
testname,
runs,
wf_passed,
wf_failed,
passed,
failed,
start_date,
num,
job,
commit,
path,
if(failed>0 and passed> 0, 1,0) flake
FROM(
/* Aggregate per-run rows into one row per (testname, pr-num, job, commit). */
SELECT
testname,
count(*) runs,
sum(wf_passed) wf_passed,
sum(wf_failed) wf_failed,
sum(passed) passed,
sum(failed) failed,
max(start_date) start_date,
num,
array_agg(path) path,
job,
commit
FROM(
/* One row per "-phase" metadata entry, i.e. per workflow test result. */
SELECT
getWorkflowTestName(m.key) testname,
IF(m.value="Succeeded", 1, 0) wf_passed,
IF(m.value="Succeeded", 0, 1) wf_failed,
regexp_extract(commit, r'[^,]+,\d+:([a-f0-9]+)"') commit,
job,
path,
start_date,
num,
passed,
failed
FROM(
/* Kubeflow presubmit runs with their job-level pass/fail outcome;
   rows without a parsable PR number are dropped. */
SELECT
path,
test,
job,
metadata,
EXTRACT(DATE FROM started) start_date,
regexp_extract(path, r'/(\d+)\/') as num, /* pr number */
CASE WHEN result='SUCCESS' THEN 1 ELSE 0 END passed,
CASE WHEN result='SUCCESS' THEN 0 ELSE 1 END failed,
(SELECT m.value From UNNEST(metadata) as m where m.key = "repos") as commit
FROM
`k8s-gubernator.build.all`
WHERE
job LIKE '%kubeflow-presubmit%'
and started > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 600 HOUR)
and STRPOS(job,'kubeflow') > 0 and STRPOS(job,'pr:') > 0
and regexp_extract(path, r'/(\d+)\/') is not null
), UNNEST(metadata) as m
where
job LIKE '%kubeflow-presubmit%' and ENDS_WITH(m.key, "-phase")
)
group by testname,job,commit,num
/* keep only flaky commits with a known commit id and PR number where the
   workflow test failed at least once */
)WHERE failed>0 and passed>0 and wf_failed>0 and commit is not null and num is not null
) group by testname
order by wf_flakes DESC