From 2e1e2a153a2d193b1de1d3fe23f5f821fe8c56a1 Mon Sep 17 00:00:00 2001
From: Samuel Fadel <samuelfadel@gmail.com>
Date: Mon, 22 Aug 2016 12:06:48 -0300
Subject: plot.R: added confidence interval plots.

---
 plot.R | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 util.R |  9 +++++++
 2 files changed, 86 insertions(+), 9 deletions(-)

diff --git a/plot.R b/plot.R
index 06f3edd..3004a27 100644
--- a/plot.R
+++ b/plot.R
@@ -4,6 +4,8 @@
 require(cowplot)
 require(gridExtra)
 require(logging)
+require(reshape2)
+require(scales)
 
 source("util.R")
 
@@ -167,19 +169,29 @@ plot.scatter.measure <- function(measure, datasets, techniques, output.dir, n.it
 
   min.max <- min(max(measure.df$x), max(measure.df$y))
   p <- ggplot(measure.df) +
-          background_grid(major="xy", minor="none") +
-          theme(legend.position="right") +
-          labs(x=paste(measure$name.pretty, "(before)", sep=" "),
-               y=paste(measure$name.pretty, "(after)", sep=" ")) +
-          geom_point(aes(x=x, y=y, color=tech, shape=dataset), alpha=0.8, size=3) +
-          scale_color_brewer(palette="Set1", guide=guide_legend(title="Technique")) +
-          scale_shape(guide=guide_legend(title="Dataset")) +
-          geom_abline(intercept=0, slope=1)
+         background_grid(major="xy", minor="none") +
+         theme(legend.position="right") +
+         labs(x=paste(measure$name.pretty, "(before)", sep=" "),
+              y=paste(measure$name.pretty, "(after)", sep=" ")) +
+         geom_point(aes(x=x, y=y, color=tech, shape=dataset), alpha=0.8, size=3) +
+         scale_color_brewer(palette="Set1", guide=guide_legend(title="Technique")) +
+         scale_shape(guide=guide_legend(title="Dataset")) +
+         geom_abline(intercept=0, slope=1)
 
   fname <- file.path(output.dir, "plots", paste(measure$name, "-scatter", ".pdf", sep=""))
   loginfo("Saving plot: %s", fname)
   save_plot(fname, p, base_aspect_ratio=1.5)
 
+  p <- p +
+    scale_x_log10(breaks=trans_breaks("log10", function(x) 10^x),
+                  labels=trans_format("log10", math_format(10^ .x))) +
+    scale_y_log10(breaks=trans_breaks("log10", function(x) 10^x),
+                  labels=trans_format("log10", math_format(10^ .x))) +
+    annotation_logticks()
+  fname <- file.path(output.dir, "plots", paste(measure$name, "-scatter-log", ".pdf", sep=""))
+  loginfo("Saving plot: %s", fname)
+  save_plot(fname, p, base_aspect_ratio=1.5)
+
   p
 }
 
@@ -192,13 +204,68 @@ plot.scatter <- function(datasets, techniques, measures, output.dir, n.iter=30)
   }
 }
 
+# Plot the mean of the differences Ym - Y of one measure, with the interval from
+# ci.fun, for each technique and dataset. Saves <measure$name>-ci.pdf and a log10
+# y-axis <measure$name>-ci-log.pdf in <output.dir>/plots; returns the log plot.
+plot.ci.measure <- function(measure, datasets, techniques, output.dir, n.iter=30) {
+  # First column (V1) of the table <dir>/<measure$name>-<suffix>
+  read.values <- function(dir, suffix) {
+    read.table(file.path(dir, paste(measure$name, suffix, sep="-")))$V1
+  }
+  # Log the target path, then save fig as <output.dir>/plots/<measure$name><suffix>.pdf
+  save.pdf <- function(fig, suffix) {
+    fname <- file.path(output.dir, "plots", paste(measure$name, suffix, ".pdf", sep=""))
+    loginfo("Saving plot: %s", fname)
+    save_plot(fname, fig, base_aspect_ratio=1.65)
+  }
+  diffs <- data.frame()
+  for (tech in techniques) {
+    for (ds in datasets) {
+      # Silhouette is skipped for datasets that have no labels file
+      if (!is.null(ds$labels.file) || measure$name != "silhouette") {
+        run.dir <- file.path(output.dir, ds$name, tech$name)
+        y.values <- read.values(run.dir, "Y.tbl")
+        ym.values <- read.values(run.dir, "Ym.tbl")
+        diffs <- rbind(diffs, data.frame(tech=tech$name.pretty,
+                                         dataset=ds$name.pretty,
+                                         y=ym.values - y.values))
+      }
+    }
+  }
+
+  p <- ggplot(diffs) +
+         background_grid(major="xy", minor="none") +
+         theme(legend.position="right") +
+         labs(x="", y=measure$name.pretty) +
+         stat_summary(aes(x=tech, y=y, color=tech, shape=dataset), fun.data=ci.fun, position=position_dodge(width=0.75)) +
+         scale_color_brewer(palette="Set1", guide=guide_legend(title="Technique")) +
+         scale_shape(guide=guide_legend(title="Dataset")) +
+         scale_x_discrete(expand=c(0, 0.01))
+  save.pdf(p, "-ci")
+  # NOTE(review): y is a difference; values <= 0 have no log10 -- confirm intent
+  log.axis <- scale_y_log10(breaks=trans_breaks("log10", function(x) 10^x),
+                            labels=trans_format("log10", math_format(10^ .x)))
+  p <- p + log.axis + annotation_logticks(sides="l")
+  save.pdf(p, "-ci-log")
+  p
+}
+
+# Save the confidence-interval plots (see plot.ci.measure) of every measure.
+plot.ci <- function(datasets, techniques, measures, output.dir, n.iter=30) {
+  plots.dir <- file.path(output.dir, "plots")
+  dir.create.safe(plots.dir)  # plot.ci.measure saves its PDFs in here
+  for (m in measures) {
+    plot.ci.measure(m, datasets, techniques, output.dir, n.iter)
+  }
+}
+
 # Experiment configuration
 # Defines: datasets, techniques, measures, output.dir
 source("config.R")
 
 args <- commandArgs(T)
 
-# Logging setup
+# logging setup
 basicConfig()
 addHandler(writeToFile,
            file=args[1],
@@ -208,3 +275,4 @@ plot.measures(datasets, techniques, measures, output.dir)
 plot.averages(datasets, techniques, measures, output.dir)
 plot.scatter(datasets, techniques, measures, output.dir)
 plot.ri(datasets, techniques, measures, output.dir)
+plot.ci(datasets, techniques, measures, output.dir)
diff --git a/util.R b/util.R
index 0a87079..3f1a9b8 100644
--- a/util.R
+++ b/util.R
@@ -13,3 +13,12 @@ dir.create.safe <- function(path, log=T) {
     dir.create(path)
   }
 }
+
+# stat_summary helper: mean of d (y) with its 95% t-interval (ymin, ymax).
+# t.test fails on < 2 values or constant data; the interval is then the mean.
+ci.fun <- function(d) {
+  m <- mean(d, na.rm=TRUE)
+  ci <- tryCatch(t.test(d)$conf.int, error=function(e) c(m, m))
+  if (any(is.na(ci))) ci <- c(m, m)
+  data.frame(ymin=ci[1], ymax=ci[2], y=m)
+}
-- 
cgit v1.2.3