# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Do statistical tests on benchmark results
# This script requires the libraries rjson, R.utils, ggplot2 and data.table
# Install them prior to running

# To use the script, first get some benchmark results, for example via
# tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
#  --outdir=out/x64.release-on --outdir-no-patch=out/x64.release-off
#  --json-test-results=results-on.json
#  --json-test-results-no-patch=results-off.json
# then run this script
# Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
# to produce graphs (and get stdio output of statistical tests).


suppressMessages(library("rjson"))       # for fromJSON
suppressMessages(library("R.utils"))     # for printf
suppressMessages(library("ggplot2"))     # for plotting
suppressMessages(library("data.table"))  # less broken than data.frame

# NOTE: the workspace is deliberately not cleared here; every variable used
# below is assigned before it is read, and clearing would destroy the caller's
# objects when this file is source()d from an interactive session.

# Describe the outcome of a Shapiro-Wilk test for the log output.
#
# norm_test: the list returned by shapiro.test().
# Returns "not normally" when normality is rejected at the 5% level,
# otherwise "normally".
normality_label <- function(norm_test) {
  if (norm_test$p.value < 0.05) "not normally" else "normally"
}

# Save a 50-bin histogram of benchmark scores as a 7x7 inch image.
#
# scores:   vector of scores to plot.
# label:    x-axis label (the benchmark name).
# filename: path of the image file to write.
save_histogram <- function(scores, label, filename) {
  histogram <- ggplot(data = data.frame(x = scores), aes(x)) +
    theme_bw() +
    geom_histogram(bins = 50) +
    ylab("Points") +
    xlab(label)
  ggsave(filename = filename, plot = histogram, width = 7, height = 7)
}

args <- commandArgs(TRUE)
if (length(args) != 3) {
  printf(paste("usage: Rscript %%this_script patched-results.json",
               "unpatched-results.json output-directory\n"))
} else {
  patch <- fromJSON(file = args[1])
  nopatch <- fromJSON(file = args[2])
  outputPath <- args[3]
  # ggsave does not reliably create missing directories, so do it up front.
  if (!dir.exists(outputPath)) {
    dir.create(outputPath, recursive = TRUE)
  }

  # One row per benchmark: confidence interval bounds (L, R), estimated
  # effect size (E), Wilcoxon p-value, benchmark name (yL) and whether the
  # p-value is below 0.05.
  df <- data.table(L = numeric(), R = numeric(), E = numeric(),
                   p.value = numeric(), yL = character(),
                   p.value.sig = logical())

  # seq_along yields an empty range when there are no traces.
  # NOTE(review): traces are paired by position, which assumes both result
  # files list the benchmarks in the same order -- confirm against run_perf.
  for (i in seq_along(patch$traces)) {
    testName <- patch$traces[[i]]$graphs[[2]]
    printf("%s\n", testName)

    nopatch_res <- as.integer(nopatch$traces[[i]]$results)
    patch_res <- as.integer(patch$traces[[i]]$results)

    # shapiro.test() rejects samples with fewer than 3 values, so skip
    # benchmarks that do not have enough results on either side.
    if (length(patch_res) < 3 || length(nopatch_res) < 3) {
      printf(" Skipped: fewer than 3 results in one of the runs\n")
      next
    }

    patch_norm <- shapiro.test(patch_res)
    nopatch_norm <- shapiro.test(nopatch_res)

    # The Shapiro-Wilk test checks the null hypothesis that the sample was
    # drawn from a normal distribution. The smaller the p-value, the stronger
    # the evidence that the sample was not drawn from a normal distribution.
    # See [wikipedia:Shapiro-Wilk-Test].
    printf(" Patched scores look %s distributed (W=%.4f, p=%.4f)\n",
           normality_label(patch_norm),
           patch_norm$statistic, patch_norm$p.value)
    printf(" Unpatched scores look %s distributed (W=%.4f, p=%.4f)\n",
           normality_label(nopatch_norm),
           nopatch_norm$statistic, nopatch_norm$p.value)

    save_histogram(patch_res, testName,
                   file.path(outputPath, paste0(testName, ".svg")))
    save_histogram(nopatch_res, testName,
                   file.path(outputPath, paste0(testName, "-before.svg")))

    # The Wilcoxon rank-sum test
    mww <- wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact = TRUE)
    printf(paste(" Wilcoxon U-test W=%.4f, p=%.4f,",
                 "confidence interval [%.1f, %.1f],",
                 "est. effect size %.1f \n"),
           mww$statistic, mww$p.value,
           mww$conf.int[1], mww$conf.int[2], mww$estimate)
    df <- rbind(df, list(mww$conf.int[1], mww$conf.int[2],
                         unname(mww$estimate), unname(mww$p.value),
                         testName, mww$p.value < 0.05))

    # t-test
    welch <- t.test(patch_res, nopatch_res, paired = FALSE)
    printf(paste(" Welch t-test t=%.4f, df = %.2f, p=%.4f,",
                 "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
           welch$statistic, welch$parameter, welch$p.value,
           welch$conf.int[1], welch$conf.int[2],
           welch$estimate[1] - welch$estimate[2])
  }

  if (nrow(df) == 0) {
    printf("No benchmark had enough results; skipping summary plot\n")
  } else {
    # Benchmarks sorted by estimated effect size, numbered along the x axis.
    df2 <- cbind(x = seq_len(nrow(df)), df[order(E), ])
    # An explicit factor keeps the colour/label mapping fixed even when only
    # one of the two classes occurs in the data.
    df2$significance <- factor(df2$p.value.sig, levels = c(FALSE, TRUE),
                               labels = c("not significant", "significant"))
    speedup <- ggplot(df2, aes(x = x, y = E, colour = significance)) +
      # L is the lower and R the upper bound of the confidence interval.
      geom_errorbar(aes(ymin = L, ymax = R), colour = "black") +
      geom_point(size = 4) +
      # n is taken from the last benchmark read in the loop above.
      scale_x_discrete(limits = df2$yL,
                       name = paste("Benchmark, n=", length(patch_res))) +
      theme_bw() +
      geom_hline(yintercept = 0) +
      ylab("Est. Effect Size in Points") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
      scale_colour_manual(name = "Statistical Significance (MWW, p < 0.05)",
                          values = c("not significant" = "red",
                                     "significant" = "green"),
                          drop = FALSE) +
      theme(legend.justification = c(0, 1), legend.position = c(0, 1))
    print(speedup)
    ggsave(filename = file.path(outputPath, "speedup-estimates.svg"),
           plot = speedup, width = 7, height = 7)
  }
}