In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

In [0]:
# Install analysis dependencies and fetch the FuzzBench source so we can
# reuse its report-generation helpers.
!pip install scikit_posthocs orange3
!git clone https://github.com/google/fuzzbench.git

import sys
sys.path.append("fuzzbench")
from analysis import data_utils

In [0]:
#@title Report data source
report_directory = "2020-05-11"  #@param ["2020-05-11", "2020-04-21", "2020-04-14", "2020-05-20-aflplusplus-2"] {allow-input: true}
data_url = f"https://www.fuzzbench.com/reports/{report_directory}/data.csv.gz"
df = pd.read_csv(data_url)

In [0]:
# Reduce the raw measurements to per-trial snapshot rows, then pivot into a
# benchmark x fuzzer table of median edge coverage.
exp_snapshot_df = data_utils.get_experiment_snapshots(df)
exp_pivot_df = data_utils.experiment_pivot_table(
    exp_snapshot_df, data_utils.benchmark_rank_by_median)
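
For intuition, the snapshot step can be approximated in plain pandas. This is a rough, hypothetical stand-in for get_experiment_snapshots, not its actual logic: it assumes the CSV has trial_id and time columns and simply keeps each trial's latest measurement (the real helper may choose a common snapshot time across trials instead).

In [0]:
# Hypothetical approximation: keep the latest measurement of every trial.
latest_rows = df.groupby(['benchmark', 'fuzzer', 'trial_id'])['time'].idxmax()
rough_snapshot_df = df.loc[latest_rows]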

Median Edge Coverage


In [16]:
exp_snapshot_df.pivot_table(index='benchmark', columns='fuzzer', values='edges_covered', aggfunc='median')


Out[16]:
fuzzer                         afl  aflfast  aflplusplus  aflsmart  eclipser  entropic  fairfuzz  fastcgs_lm  honggfuzz  lafintel  libfuzzer   mopt
benchmark
bloaty_fuzz_target            5486     5177         5205      5426      4220      4807      5023        5667       5511      4971       4466   5648
curl_curl_fuzzer_http         5421     5320         5370      5430      4424      5001      4846        5405       5409      5320       4777   5386
freetype2-2017                5377     5225         5148      5303      4510      5591      5348        5335       7167      5138       4395   5372
harfbuzz-1.3.2                4282     4182         4111      4262      3383      4235      3621        4268       4368      4099       4107   4254
jsoncpp_jsoncpp_fuzzer         634      634          632       634       588       635       634         634        635       630        635    634
lcms-2017-03-21               1109      910         1131      1167       492      1309      1159         851       1008      1119       1211    896
libjpeg-turbo-07-2017         1444     1437         1436      1442      1048      1453      1109        1438       1434      1419       1365   1439
libpcap_fuzz_both               21       21           21        18       880      1736        21          19       1953      1721       1592     21
libpng-1.2.56                  631      629          630       674       512       647       633         525        677       644        630    525
libxml2-v2.9.2                4665     4381         4169      4673      1680      4522      3445        3839       4564      4288       4287   3602
mbedtls_fuzz_dtlsclient       1674     1601         1657      1689      1413      1632      1663        1687       1680      1593       1605   1673
openssl_x509                  4076     4073         4077      4076      4056      4077      4054        4077       4070      4070       4069   4075
openthread-2019-12-23         1727     1690         1710      1734      1672      1539      1139        1715       1729      1524       1536   1713
php_php-fuzz-parser          11209    11118        11143     11229      9888     11069     10961       11332      11495     10979      10231  11207
proj4-2017-08-14              2153     2036         2149      2149       177      2355      2048        1964       3279      2102       2255   1997
re2-2014-12-09                2272     2269         2262      2272      1956      2296      2268        2249       2296      2257       2303   2252
sqlite3_ossfuzz              17246    16656        16652     17297      5620     11199      8805       17714      12617     11898       8629  17248
systemd_fuzz-link-parser       990      986          982       989       936       982       904         989       1002       981        968    989
vorbis-2017-12-11             1010      999          988      1014       892      1007      1001        1014        998       979        790   1014
woff2-2016-05-06              1046      968         1046      1044       835      1046       950        1034       1118      1022       1007   1077
zlib_zlib_uncompress_fuzzer    330      326          329       330       315       338       331         329        334       329        335    329

Current FuzzBench default report ranking

  • rank fuzzers within each benchmark by their mean edge coverage
  • rank each fuzzer across the experiment by its average rank over all benchmarks (a plain-pandas sketch follows)
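
To make these two steps concrete, here is a minimal plain-pandas sketch of the same scheme (data_utils.experiment_level_ranking below is the authoritative implementation; tie handling may differ):

In [0]:
# Step 1: per-benchmark mean edge coverage, one column per fuzzer.
mean_cov = exp_snapshot_df.pivot_table(
    index='benchmark', columns='fuzzer', values='edges_covered', aggfunc='mean')
# Step 2: rank fuzzers within each benchmark (1 = best), then average those
# ranks across benchmarks; lower is better.
sketch_rank = mean_cov.rank(axis=1, ascending=False).mean(axis=0).sort_values()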

In [5]:
default_report_rank = data_utils.experiment_level_ranking(
    exp_snapshot_df, 
    data_utils.benchmark_rank_by_mean, 
    data_utils.experiment_rank_by_average_rank)
default_report_rank


Out[5]:
fuzzer
honggfuzz       3.309524
afl             3.952381
aflsmart        4.119048
entropic        4.761905
fastcgs_lm      6.023810
mopt            6.047619
aflplusplus     6.833333
aflfast         7.452381
lafintel        7.904762
libfuzzer       7.928571
fairfuzz        8.476190
eclipser       11.190476
Name: average rank, dtype: float64

Other ranking measures

exp_pivot_df is the benchmark-by-fuzzer table of median edge coverage, i.e. the result of using median coverage as the benchmark-level scoring function. Alternative experiment-level measures (see the sketch after this list):

  • Number of firsts (times a fuzzer achieves the best median coverage on a benchmark)
  • Percent coverage (median coverage divided by the per-benchmark maximum median)
  • Average rank (simple mean of the per-benchmark ranks)
  • Statistical test wins (count only the cases where the coverage improvement is statistically significant, p-value < 0.05)
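
Rough plain-pandas equivalents of the first three measures, assuming exp_pivot_df is the median-coverage table shown earlier (data_utils may handle ties differently):

In [0]:
# Number of firsts: how often each fuzzer attains the best median coverage.
firsts_sketch = exp_pivot_df.eq(exp_pivot_df.max(axis=1), axis=0).sum()
# Percent coverage: median coverage relative to the per-benchmark maximum,
# averaged across benchmarks.
percent_sketch = exp_pivot_df.div(exp_pivot_df.max(axis=1), axis=0).mean().mul(100)
# Average rank: mean of the per-benchmark ranks (1 = best); lower is better.
avg_rank_sketch = exp_pivot_df.rank(axis=1, ascending=False).mean()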

In [6]:
firsts_ranked = data_utils.experiment_rank_by_num_firsts(exp_pivot_df)
firsts_ranked


Out[6]:
fuzzer
honggfuzz      8.0
aflsmart       4.0
entropic       3.0
fastcgs_lm     2.0
libfuzzer      1.0
mopt           0.0
lafintel       0.0
fairfuzz       0.0
eclipser       0.0
aflplusplus    0.0
aflfast        0.0
afl            0.0
Name: number of wins, dtype: float64

In [7]:
percent_coverage = data_utils.experiment_rank_by_average_normalized_score(exp_pivot_df)
percent_coverage


Out[7]:
fuzzer
honggfuzz      97.018742
entropic       92.400031
aflsmart       90.725944
lafintel       90.410083
afl            90.248494
aflplusplus    88.693044
fastcgs_lm     87.601417
mopt           87.521381
aflfast        87.497333
libfuzzer      87.071347
fairfuzz       81.420302
eclipser       71.055928
Name: average normalized score, dtype: float64

In [8]:
average_rank = data_utils.experiment_rank_by_average_rank(exp_pivot_df)
average_rank


Out[8]:
fuzzer
honggfuzz       3.285714
aflsmart        3.976190
afl             4.095238
entropic        4.666667
fastcgs_lm      5.785714
mopt            5.976190
aflplusplus     6.857143
aflfast         7.523810
libfuzzer       7.785714
lafintel        8.309524
fairfuzz        8.452381
eclipser       11.285714
Name: average rank, dtype: float64

In [9]:
stats_wins = data_utils.experiment_level_ranking(
    exp_snapshot_df,
    data_utils.benchmark_rank_by_stat_test_wins,
    data_utils.experiment_rank_by_average_rank
)
stats_wins


Out[9]:
fuzzer
honggfuzz       3.166667
afl             3.904762
aflsmart        4.357143
entropic        4.761905
fastcgs_lm      5.738095
mopt            5.904762
aflplusplus     6.904762
aflfast         7.642857
libfuzzer       7.738095
lafintel        8.166667
fairfuzz        8.452381
eclipser       11.261905
Name: average rank, dtype: float64
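
FuzzBench's statistical comparisons are based on Mann-Whitney U tests. As a simplified illustration of the stat-test-wins idea (benchmark_rank_by_stat_test_wins applies its own test configuration; the function name and alpha threshold below are ours), this counts pairwise significant wins on a single benchmark:

In [0]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def stat_wins_one_benchmark(bench_df, alpha=0.05):
  # Per-trial snapshot coverage samples for one benchmark, keyed by fuzzer.
  samples = bench_df.groupby('fuzzer')['edges_covered'].apply(list)
  wins = pd.Series(0, index=samples.index)
  for a, b in combinations(samples.index, 2):
    # One-sided test in each direction; count a win only when significant.
    if mannwhitneyu(samples[a], samples[b], alternative='greater').pvalue < alpha:
      wins[a] += 1
    elif mannwhitneyu(samples[b], samples[a], alternative='greater').pvalue < alpha:
      wins[b] += 1
  return wins.sort_values(ascending=False)

stat_wins_one_benchmark(exp_snapshot_df[exp_snapshot_df.benchmark == 'zlib_zlib_uncompress_fuzzer'])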

Ranking comparison chart


In [17]:
rankings = {
    "Default Ranking (mean coverage)": default_report_rank,
    "Stats Wins (p_value wins)": stats_wins,
    "% Coverage Ranking": percent_coverage,
    "Average Rank (median coverage)": average_rank
}

# One panel per ranking method; note that lower is better in the rank-based
# panels, while higher is better for % coverage.
fig, axes = plt.subplots(1, len(rankings), figsize=(30, 7))
for i, (title, ranking_series) in enumerate(rankings.items()):
  ax = sns.barplot(x=ranking_series.values, y=ranking_series.index, ax=axes[i])
  ax.set_title(title)
  ax.set_ylabel("")
fig.suptitle("Comparison of Ranking Methods")
plt.show()


