Home | History | Annotate | Download | only in perf
      1 # Copyright 2016 the V8 project authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 # Do statistical tests on benchmark results
      6 # This script requires the libraries rjson, R.utils, ggplot2 and data.table
      7 # Install them prior to running
      8 
      9 # To use the script, first get some benchmark results, for example via
     10 # tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
     11 #  --outdir=out/x64.release-on --outdir-no-patch=out/x64.release-off
     12 # --json-test-results=results-on.json
     13 # --json-test-results-no-patch=results-off.json
     14 # then run this script
     15 # Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
     16 # to produce graphs (and print the statistical test results to stdout).
     17 
     18 
     19 suppressMessages(library("rjson"))       # for fromJson
     20 suppressMessages(library("R.utils"))     # for printf
     21 suppressMessages(library("ggplot2"))     # for plotting
     22 suppressMessages(library("data.table"))  # less broken than data.frame
     23 
     24 # Clear all variables from environment
     25 rm(list=ls())
     26 
     27 args <- commandArgs(TRUE)
     28 if (length(args) != 3) {
     29   printf(paste("usage: Rscript %%this_script patched-results.json",
     30                "unpatched-results.json\n"))
     31 } else {
     32   patch <- fromJSON(file=args[1])
     33   nopatch <- fromJSON(file=args[2])
     34   outputPath <- args[3]
     35   df <- data.table(L = numeric(), R = numeric(), E = numeric(), 
     36                    p.value = numeric(), yL = character(), 
     37                    p.value.sig = logical())
     38   
     39   for (i in seq(1, length(patch$traces))) {
     40     testName <- patch$traces[[i]]$graphs[[2]]
     41     printf("%s\n", testName)
     42     
     43     nopatch_res <- as.integer(nopatch$traces[[i]]$results)
     44     patch_res <- as.integer(patch$traces[[i]]$results)
     45     if (length(nopatch_res) > 0) {
     46       patch_norm <- shapiro.test(patch_res);
     47       nopatch_norm <- shapiro.test(nopatch_res);
     48 
     49       # Shaprio-Wilk test indicates whether data is not likely to 
     50       # come from a normal distribution. The p-value is the probability
     51       # to obtain the sample from a normal distribution. This means, the
     52       # smaller p, the more likely the sample was not drawn from a normal
     53       # distribution. See [wikipedia:Shapiro-Wilk-Test].
     54       printf("  Patched scores look %s distributed (W=%.4f, p=%.4f)\n", 
     55              ifelse(patch_norm$p.value < 0.05, "not normally", "normally"), 
     56              patch_norm$statistic, patch_norm$p.value);
     57       printf("  Unpatched scores look %s distributed (W=%.4f, p=%.4f)\n", 
     58              ifelse(nopatch_norm$p.value < 0.05, "not normally", "normally"), 
     59              nopatch_norm$statistic, nopatch_norm$p.value);
     60       
     61       hist <- ggplot(data=data.frame(x=as.integer(patch_res)), aes(x)) +
     62         theme_bw() + 
     63         geom_histogram(bins=50) +
     64         ylab("Points") +
     65         xlab(patch$traces[[i]]$graphs[[2]])
     66       ggsave(filename=sprintf("%s/%s.svg", outputPath, testName), 
     67              plot=hist, width=7, height=7)
     68       
     69       hist <- ggplot(data=data.frame(x=as.integer(nopatch_res)), aes(x)) +
     70         theme_bw() + 
     71         geom_histogram(bins=50) +
     72         ylab("Points") +
     73         xlab(patch$traces[[i]]$graphs[[2]])
     74       ggsave(filename=sprintf("%s/%s-before.svg", outputPath, testName), 
     75              plot=hist, width=7, height=7)
     76       
     77       # The Wilcoxon rank-sum test 
     78       mww <- wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact=TRUE)
     79       printf(paste("  Wilcoxon U-test W=%.4f, p=%.4f,",
     80                    "confidence interval [%.1f, %.1f],",
     81                    "est. effect size %.1f \n"),
     82                    mww$statistic, mww$p.value,
     83                    mww$conf.int[1], mww$conf.int[2], mww$estimate);
     84       df <-rbind(df, list(mww$conf.int[1], mww$conf.int[2], 
     85                           unname(mww$estimate), unname(mww$p.value),
     86                           testName, ifelse(mww$p.value < 0.05, TRUE, FALSE)))
     87       # t-test
     88       t <- t.test(patch_res, nopatch_res, paired=FALSE)
     89       printf(paste("  Welch t-test t=%.4f, df = %.2f, p=%.4f,",
     90                    "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
     91              t$statistic, t$parameter, t$p.value, 
     92              t$conf.int[1], t$conf.int[2], t$estimate[1]-t$estimate[2]);
     93     }
     94   }
     95   df2 <- cbind(x=1:nrow(df), df[order(E),])
     96   speedup <- ggplot(df2, aes(x = x, y = E, colour=p.value.sig)) +
     97     geom_errorbar(aes(ymax = L, ymin = R), colour="black") +
     98     geom_point(size = 4) +
     99     scale_x_discrete(limits=df2$yL,
    100                        name=paste("Benchmark, n=", length(patch_res))) +
    101     theme_bw() +
    102     geom_hline(yintercept = 0) +
    103     ylab("Est. Effect Size in Points") +
    104     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) +
    105     theme(legend.position = "bottom") +
    106     scale_colour_manual(name="Statistical Significance (MWW, p < 0.05)",
    107                           values=c("red", "green"),
    108                           labels=c("not significant", "significant")) +
    109     theme(legend.justification=c(0,1), legend.position=c(0,1))
    110   print(speedup)
    111   ggsave(filename=sprintf("%s/speedup-estimates.svg", outputPath), 
    112          plot=speedup, width=7, height=7)
    113 }
    114