From fbf8d82cdd3720c4bbf2a94035b6779e56d73448 Mon Sep 17 00:00:00 2001
From: Samuel Fadel <samuelfadel@gmail.com>
Date: Thu, 18 Aug 2016 23:15:39 -0300
Subject: Updated with all the code used to run experiments.

---
 measures.R | 224 +++++++++++++-------------
 plot.R     | 160 +++++++++++++++++++
 proj.R     | 103 ++++++++++++
 run.R      | 518 +++++++++++++++++++++++++++++++++++--------------------------
 4 files changed, 668 insertions(+), 337 deletions(-)
 create mode 100644 plot.R
 create mode 100644 proj.R

diff --git a/measures.R b/measures.R
index 99aa4df..057a114 100644
--- a/measures.R
+++ b/measures.R
@@ -1,30 +1,5 @@
-stress <- function(Dx, Dy) {
-  if (any(Dx != t(Dx)) || any(Dy != t(Dy))) {
-    stop("Dx and Dy must be symmetric")
-  }
-
-  Dx <- as.matrix(Dx)
-  Dy <- as.matrix(Dy)
-  if (nrow(Dx) != nrow(Dy)) {
-    stop("Dx and Dy must have the same number of elements")
-  }
-
-  n <- nrow(Dx)
-  s <- vector("numeric", n)
-  for (i in 1:n) {
-    s[i] <- 0
-    for (j in 1:n) {
-      if (i == j) {
-        next
-      }
-
-      s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j]
-    }
-    s[i] = s[i] / sum(D[i, ])
-  }
-
-  s
-}
+# Measures used as manipulation targets.
+# NOTE: This file is only a library, see run.R for more details.
 
 NP <- function(Dx, Dy, k = 9) {
   if (any(Dx != t(Dx)) || any(Dy != t(Dy))) {
@@ -53,112 +28,129 @@ NP <- function(Dx, Dy, k = 9) {
   preservation
 }
 
+#silhouette <- function(Dy, labels) {
+#  if (any(t(Dy) != Dy)) {
+#    stop("Dy must be symmetric")
+#  }
+#
+#  Dy <- as.matrix(Dy)
+#  n <- nrow(Dy)
+#  cohesion <- vector("numeric", n)
+#  separation <- vector("numeric", n)
+#
+#  for (i in 1:n) {
+#    label <- labels[i]
+#    separation[i] <- min(Dy[i, labels != label])
+#    cohesion[i] <- mean(Dy[i, labels[-i] == label])
+#  }
+#
+#  (separation - cohesion) / max(separation, cohesion)
+#}
 silhouette <- function(Dy, labels) {
-  if (any(t(Dy) != Dy)) {
-    stop("Dy must be symmetric")
-  }
-
-  Dy <- as.matrix(Dy)
   n <- nrow(Dy)
-  cohesion <- vector("numeric", n)
-  separation <- vector("numeric", n)
-
-  for (i in 1:n) {
-    label <- labels[i]
-    separation[i] <- min(Dy[i, labels != label])
-    cohesion[i] <- mean(Dy[i, labels[-i] == label])
+  if (n != length(labels)) {
+    stop("Number of labels doesn't match number of points")
   }
 
-  silh <- (separation - cohesion) / max(separation, cohesion)
-}
-
-d2p <- function(D, perplexity = 9, tol = 1e-5, max.tries = 50) {
-  if (any(D != t(D))) {
-    stop("D must be symmetric")
+  A_labels <- list()
+  B_labels <- list()
+  unique.labels <- unique(labels)
+  for (l in unique.labels) {
+    A_labels[[l]] <- labels == l
+    B_labels[[l]] <- labels != l
   }
 
-  D <- as.matrix(D)
-  P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D))
-  n <- nrow(D)
-  beta <- rep(1, n)
-  logU <- log(perplexity)
-
+  # This factor excludes self comparisons when computing cohesion
+  m.factor <- n / (n-1)
+  s <- vector("numeric", n)
   for (i in 1:n) {
-    #denom <- sum(exp(-D[i, ] / sigmas))
-    #P[i, ] <- exp(-D[i, ] / sigmas) / denom
-
-    betaMin <- -Inf
-    betaMax <-  Inf
-    Di <- D[i, -i]
-
-    tries <- 0
-    repeat {
-      Pi <- exp(-Di * beta[i])
-      sumPi <- sum(Pi)
-      H <- log(sumPi) + beta[i] * sum(Di * Pi) / sumPi
-      Pi <- Pi / sumPi
-      Hdiff <- H - logU
-
-      if (abs(Hdiff) < tol || tries > max.tries) {
-        break
-      }
-
-      if (Hdiff > 0) {
-        betaMin <- beta[i]
-        beta[i] <- if (is.finite(betaMax)) {
-                     (beta[i] + betaMax) / 2
-                   } else {
-                     beta[i] * 2
-                   }
-      } else {
-        betaMax <- beta[i]
-        beta[i] <- if (is.finite(betaMin)) {
-                     (beta[i] + betaMin) / 2
-                   } else {
-                     beta[i] / 2
-                   }
-      }
+    label_i <- labels[i]
 
-      tries <- tries + 1
+    a <- m.factor * mean(Dy[i, A_labels[[label_i]]]) # cohesion
+    b <- Inf
+    for (l in unique(labels)) {
+      b <- min(mean(Dy[i, B_labels[[l]]]), b) # separation
     }
 
-    P[i, -i] <- Pi
+    s[i] <- (b - a) / max(b, a)
   }
 
-  list(P = P, beta = beta) # sigmas = sqrt(1 / beta)
+  s
 }
 
-d2p.beta <- function(D, beta) {
-  if (any(D != t(D))) {
-    stop("D must be symmetric")
-  }
+#stress <- function(Dx, Dy) {
+#  if (any(Dx != t(Dx)) || any(Dy != t(Dy))) {
+#    stop("Dx and Dy must be symmetric")
+#  }
+#
+#  Dx <- as.matrix(Dx)
+#  Dy <- as.matrix(Dy)
+#  if (nrow(Dx) != nrow(Dy)) {
+#    stop("Dx and Dy must have the same number of elements")
+#  }
+#
+#  n <- nrow(Dx)
+#  s <- vector("numeric", n)
+#  for (i in 1:n) {
+#    s[i] <- 0
+#    for (j in 1:n) {
+#      if (i == j) {
+#        next
+#      }
+#
+#      s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j]
+#    }
+#    s[i] = s[i] / sum(D[i, ])
+#  }
+#
+#  s
+#}
+stress <- function(Dx, Dy) {
+  n <- nrow(Dx)
 
-  D <- as.matrix(D)
-  n <- nrow(D)
-  P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D))
-  for (i in 1:n) {
-    P[i, -i] <- exp(-D[i, -i] * beta[i])
-    P[i, -i] <- P[i, -i] / sum(P[i, -i])
+  C <- 0
+  D <- 0
+  for (i in 1:(n-1)) {
+    for (j in (i + 1):n) {
+      C <- C + Dx[i, j]
+      D <- D + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j]
+      if (is.nan(D)) {
+        loginfo("%d, %d", i, j)
+        loginfo("%f", Dx[i, j])
+        stop("NaN")
+      }
+    }
   }
 
-  P
+  D / C
 }
 
-klDivergence <- function(P, Q, eps = 1e-12) {
-  if (nrow(P) != ncol(P) || nrow(Q) != ncol(Q)) {
-    stop("P and Q must be square")
-  }
-  if (nrow(P) != nrow(Q)) {
-    stop("P and Q must have the same number of elements")
-  }
-
-  P[P < eps] <- eps
-  Q[Q < eps] <- eps
-  n <- nrow(P)
-  d <- vector("numeric", n)
-  for (i in 1:n) {
-    d[i] <- sum(P[i, -i] * log(P[i, -i] / Q[i, -i]))
-  }
-
-  d
+# NOTE: This function requires the 'klmeasure' binary from:
+# http://research.cs.aalto.fi/pml/software/dredviz/
+smoothed.pr <- function(Dx, Dy, k) {
+  # Create SOM_PAK file for Dx
+  Dx.fname <- tempfile()
+  Dx.f <- file(Dx.fname, "w")
+  cat(sprintf("%d\n", ncol(Dx)), file=Dx.f)
+  write.table(Dx, Dx.f, col.names=F, row.names=F)
+  close(Dx.f)
+
+  # Create SOM_PAK file for Dy
+  Dy.fname <- tempfile()
+  Dy.f <- file(Dy.fname, "w")
+  cat(sprintf("%d\n", ncol(Dy)), file=Dy.f)
+  write.table(Dy, Dy.f, col.names=F, row.names=F)
+  close(Dy.f)
+
+  output <- system2("./klmeasure",
+                    stdout=T,
+                    args=c("--datadist", Dx.fname,
+                           "--projdist", Dy.fname,
+                           "--neighbors", sprintf("%d", k)))
+  output <- strsplit(output[2], " | ", fixed=T)
+  output <- unlist(output)
+
+  file.remove(Dx.fname, Dy.fname)
+
+  list(s.precision=as.double(output[1]), s.recall=as.double(output[2]))
 }
diff --git a/plot.R b/plot.R
new file mode 100644
index 0000000..e98f2e2
--- /dev/null
+++ b/plot.R
@@ -0,0 +1,160 @@
+# Functions for plotting results from manipulation experiments.
+# NOTE: This script should only be used after results are generated with run.R
+
+require(cowplot)
+require(gridExtra)
+require(logging)
+
+# Plots results for experiments of all techniques for a given measure and
+# datasset.
+plot.measure <- function(ds, techniques, output.dir, measure, n.iter=30) {
+  measure.df <- data.frame()
+  scale.labels <- c()
+
+  for (tech in techniques) {
+    Y.measure  <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Y.tbl", sep="-")))
+    Ym.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Ym.tbl", sep="-")))
+
+    measure.df <- rbind(measure.df,
+                        data.frame(name=tech$name.pretty,
+                                   type=paste("Y",  tech$name, sep="."),
+                                   V1=Ym.measure - Y.measure)
+                       )
+
+    scale.labels <- c(scale.labels, tech$name.pretty)
+  }
+
+  p <- ggplot(measure.df) +
+         background_grid(major="xy", minor="none") +
+         theme(legend.position="none") +
+         labs(x="", y=measure$name.pretty) +
+         geom_boxplot(aes(type, V1, fill=name)) +
+         scale_fill_brewer(palette="Set1", guide=guide_legend(title="")) +
+         scale_y_continuous(limits=c(min(0, min(measure.df$V1)), max(0, max(measure.df$V1)))) +
+         scale_x_discrete(labels=scale.labels)
+
+  fname <- file.path(output.dir, "plots", ds$name, paste(measure$name, "pdf", sep="."))
+  loginfo("Saving plot: %s", fname)
+  save_plot(fname, p, base_aspect_ratio=1.3)
+}
+
+# Plot boxplots of techniques, one per measure diff per dataset.
+plot.measures <- function(datasets, techniques, measures, output.dir, n.iter=30) {
+  dir.create.safe(file.path(output.dir, "plots"))
+
+  for (ds in datasets) {
+    dir.create.safe(file.path(output.dir, "plots", ds$name))
+    for (measure in measures) {
+      if (is.null(ds$labels.file) && measure$name == "silhouette") {
+        next
+      }
+
+      plot.measure(ds, techniques, output.dir, measure, n.iter)
+    }
+  }
+}
+
+# Same as above, but averages over all datasets.
+plot.averages <- function(datasets, techniques, measures, output.dir, n.iter=30) {
+  dir.create.safe(file.path(output.dir, "plots"))
+
+  for (measure in measures) {
+    measure.df <- data.frame()
+    scale.labels <- c()
+    for (tech in techniques) {
+      measure.avg <- rep(0, n.iter)
+      scale.labels <- c(scale.labels, tech$name.pretty)
+
+      for (ds in datasets) {
+        if (is.null(ds$labels.file) && measure$name == "silhouette") {
+          next
+        }
+
+        Y.measure  <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Y.tbl", sep="-")))$V1
+        Ym.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Ym.tbl", sep="-")))$V1
+        measure.avg <- measure.avg + (Ym.measure - Y.measure)
+      }
+      measure.avg <- measure.avg / length(datasets)
+      measure.df <- rbind(measure.df, data.frame(tech=tech$name, V1=measure.avg))
+    }
+
+    p <- ggplot(measure.df) +
+           background_grid(major="xy", minor="none") +
+           theme(legend.position="none") +
+           labs(x="", y=measure$name.pretty) +
+           geom_boxplot(aes(tech, V1, fill=tech)) +
+           scale_fill_brewer(palette="Set1", guide=guide_legend(title="")) +
+           scale_y_continuous(limits=c(min(0, min(measure.df$V1)), max(0, max(measure.df$V1)))) +
+           scale_x_discrete(labels=scale.labels)
+
+    fname <- file.path(output.dir, "plots", paste(measure$name, "pdf", sep="."))
+    loginfo("Saving plot: %s", fname)
+    save_plot(fname, p, base_aspect_ratio=1.3)
+  }
+}
+
+# Plot a single scatterplot of techniques and datasets, where x axis is the
+# measure before manipulation and y axis is the measure after manipulation.
+# Also adds a y=x line so that visual inspection is easier.
+plot.scatter.measure <- function(measure, datasets, techniques, output.dir, n.iter=30) {
+  measure.df <- data.frame()
+  for (tech in techniques) {
+    for (ds in datasets) {
+      if (is.null(ds$labels.file) && measure$name == "silhouette") {
+        next
+      }
+
+      base.path <- file.path(output.dir, ds$name, tech$name)
+      fname <- file.path(base.path, paste(measure$name, "Y.tbl", sep="-"))
+      Y.measure  <- read.table(fname)$V1
+      fname <- file.path(base.path, paste(measure$name, "Ym.tbl", sep="-"))
+      Ym.measure <- read.table(fname)$V1
+      measure.df <- rbind(measure.df, data.frame(tech=tech$name.pretty,
+                                                 dataset=ds$name.pretty,
+                                                 x=mean(Y.measure),
+                                                 y=mean(Ym.measure)))
+    }
+  }
+
+  min.max <- min(max(measure.df$x), max(measure.df$y))
+  p <- ggplot(measure.df) +
+          background_grid(major="xy", minor="none") +
+          theme(legend.position="right") +
+          labs(x=paste(measure$name.pretty, "(before)", sep=" "),
+               y=paste(measure$name.pretty, "(after)", sep=" ")) +
+          geom_point(aes(x=x, y=y, color=tech, shape=dataset), alpha=0.8, size=3) +
+          scale_color_brewer(palette="Set1", guide=guide_legend(title="Technique")) +
+          scale_shape(guide=guide_legend(title="Dataset")) +
+          geom_abline(intercept=0, slope=1)
+
+  fname <- file.path(output.dir, "plots", paste(measure$name, "-scatter", ".pdf", sep=""))
+  loginfo("Saving plot: %s", fname)
+  save_plot(fname, p, base_aspect_ratio=1.5)
+
+  p
+}
+
+# This function runs the scatterplot function above for all measures
+plot.scatter <- function(datasets, techniques, measures, output.dir, n.iter=30) {
+  dir.create.safe(file.path(output.dir, "plots"))
+
+  for (measure in measures) {
+    p <- plot.scatter.measure(measure, datasets, techniques, output.dir, n.iter)
+  }
+}
+
+# Experiment configuration
+# Defines: datasets, techniques, measures, output.dir
+source("config.R")
+
+args <- commandArgs(T)
+
+# Logging setup
+basicConfig()
+addHandler(writeToFile,
+           file=args[1],
+           level="FINEST")
+
+plot.measures(datasets, techniques, measures, output.dir)
+plot.averages(datasets, techniques, measures, output.dir)
+plot.scatter(datasets, techniques, measures, output.dir)
diff --git a/proj.R b/proj.R
new file mode 100644
index 0000000..4ef8114
--- /dev/null
+++ b/proj.R
@@ -0,0 +1,103 @@
+# NOTE: This script should only be used after results are generated with run.R
+
+library(grid)
+library(gridExtra)
+library(ggplot2)
+library(mp)
+
+get_legend <- function(p) {
+  tmp <- ggplot_gtable(ggplot_build(p))
+  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
+  legend <- tmp$grobs[[leg]]
+  legend
+}
+
+args <- commandArgs(T)
+
+dataset <- args[1]
+tech    <- args[2]
+iter    <- args[3]
+measure <- args[4]
+out.file <- args[5]
+
+X.file <- file.path("datasets", paste(dataset, "tbl", sep="."))
+l.file <- file.path("datasets", paste(dataset, "labels", sep="."))
+Y.file <- file.path("results", dataset, tech, paste("Y-", iter, ".tbl", sep=""))
+sample.indices.file <- file.path("results", dataset, paste("sample-indices-", iter, ".tbl", sep=""))
+Ys.file <- file.path("results", dataset, paste("Ys-", iter, ".tbl", sep=""))
+Ysm.file <- file.path("results", dataset, paste("Ysm-", measure, "-", iter, ".tbl", sep=""))
+
+X <- read.table(X.file)
+Y <- read.table(Y.file)
+l <- as.factor(read.table(l.file)$V1)
+l <- l[!duplicated(X)]
+X <- scale(unique(X))
+
+sample.indices <- read.table(sample.indices.file)$V1
+ls <- l[sample.indices]
+Ys <- read.table(Ys.file)
+Ysm <- read.table(Ysm.file)
+Ym <- pekalska(dist(X), sample.indices, Ysm)
+
+dfYs <- data.frame(x=Ys[,1], y=Ys[,2], Labels=ls)
+pYs <- ggplot(dfYs) +
+         theme_minimal() +
+         theme(legend.position="none",
+               plot.background=element_rect(fill="#ffffff", color="#000000"),
+               axis.text=element_blank(),
+               axis.title=element_blank(),
+               panel.grid=element_blank()) +
+         geom_point(aes(x=x, y=y, color=Labels), size=0.5, alpha=0.8) +
+         scale_color_brewer(palette="Set1")
+pYs <- ggplotGrob(pYs)
+
+dfYsm <- data.frame(x=Ysm[,1], y=Ysm[,2], Labels=ls)
+pYsm <- ggplot(dfYsm) +
+          theme_minimal() +
+          theme(legend.position="none",
+                plot.background=element_rect(fill="#ffffff", color="#000000"),
+                axis.text=element_blank(),
+                axis.title=element_blank(),
+                panel.grid=element_blank()) +
+          geom_point(aes(x=x, y=y, color=Labels), size=0.5, alpha=0.8) +
+          scale_color_brewer(palette="Set1")
+pYsm <- ggplotGrob(pYsm)
+
+dfY <- data.frame(x=Y[,1], y=Y[,2], Labels=l)
+pY <- ggplot(dfY) +
+        theme_minimal() +
+        theme(legend.position="bottom",
+              legend.title=element_blank(),
+              legend.text=element_text(size=14),
+              legend.background=element_rect(fill="#ffffff", color="#000000"),
+              axis.text=element_blank(),
+              axis.title=element_blank(),
+              panel.grid=element_blank()) +
+        geom_point(aes(x=x, y=y, color=Labels), alpha=0.8) +
+        scale_color_brewer(palette="Set1") +
+        annotation_custom(grob = pYs,
+                          xmin=min(dfY$x), xmax=min(dfY$x) + (max(dfY$x) - min(dfY$x)) / 3,
+                          ymin=max(dfY$y) - (max(dfY$y) - min(dfY$y)) / 3, ymax=max(dfY$y))
+
+legend <- get_legend(pY)
+pY <- pY + theme(legend.position="none")
+
+dfYm <- data.frame(x=Ym[,1], y=Ym[,2], Labels=l)
+pYm <- ggplot(dfYm) +
+         theme_minimal() +
+         theme(legend.position="none",
+               axis.text=element_blank(),
+               axis.title=element_blank(),
+               panel.grid=element_blank()) +
+         geom_point(aes(x=x, y=y, color=Labels), alpha=0.8) +
+         scale_color_brewer(palette="Set1") +
+         annotation_custom(grob = pYsm,
+                           xmin=min(dfYm$x), xmax=min(dfYm$x) + (max(dfYm$x) - min(dfYm$x)) / 3,
+                           ymin=min(dfYm$y), ymax=min(dfYm$y) + (max(dfYm$y) - min(dfYm$y)) / 3)
+
+p <- grid.arrange(pY, pYm, legend,
+                  ncol=2, nrow=2,
+                  layout_matrix=rbind(c(3, 3), c(1, 2)),
+                  widths=c(5, 5), heights=c(0.5, 4.5))
+
+ggsave(out.file, p, width=10, height=5)
diff --git a/run.R b/run.R
index 4c073a0..cda6830 100644
--- a/run.R
+++ b/run.R
@@ -1,19 +1,16 @@
-require(ggplot2)
-require(gridExtra)
+# Main experiments script: performs sampling, manipulation and projections for
+# all techniques, datasets and measures.
+
+require(logging)
+require(MASS)
 require(mp)
+require(Rtsne)
 
 source("measures.R")
 
-#automated.m <- function(D, labels) {
-#  D.m <- D
-#  for (label in unique(labels)) {
-#    same.label <- labels == label
-#    D.m[same.label, same.label] <- D[same.label, same.label] * 0.1
-#  }
-#
-#  D.m
-#}
-automated.m <- function(Xs, labels) {
+# Performs automated silhouette improvement manipulation, using a method
+# inspired by Schaefer et al. (2013).
+automated.silh <- function(Xs, labels) {
   n <- nrow(Xs)
   p <- ncol(Xs)
   Xs <- cbind(Xs, matrix(data=0, nrow=n, ncol=p))
@@ -23,228 +20,307 @@ automated.m <- function(Xs, labels) {
     }
   }
 
-  dist(Xs)
+  Dx <- dist(Xs)
+  # Dx <- Dx / mean(Dx)
+  as.matrix(Dx)
 }
 
-color_scale.blue_orange <- function(name) {
-  scale_colour_gradient(name = name, high = "#376092", low = "#e46c0a", space = "Lab")
+# NOTE: This function requires the 'klmeasure' binary from:
+# http://research.cs.aalto.fi/pml/software/dredviz/
+nerv <- function(Dx, Y, lambda=0.1) {
+  # Create SOM_PAK file for Dx
+  Dx.fname <- tempfile()
+  Dx.f <- file(Dx.fname, "w")
+  cat(sprintf("%d\n", ncol(Dx)), file=Dx.f)
+  write.table(Dx, Dx.f, col.names=F, row.names=F)
+  close(Dx.f)
+
+  # Create SOM_PAK file for Y
+  Y.fname <- tempfile()
+  Y.f <- file(Y.fname, "w")
+  cat(sprintf("%d\n", ncol(Y)), file=Y.f)
+  write.table(Y, Y.f, col.names=F, row.names=F)
+  close(Y.f)
+
+  # Run NeRV
+  Ym.fname <- tempfile()
+  system2("./nerv",
+          stdout=F,
+          stderr=F,
+          args=c("--inputdist", Dx.fname,
+                 "--outputfile", Ym.fname,
+                 "--init", Y.fname,
+                 "--lambda", sprintf("%.2f", lambda)))
+
+  # Read results from generated file; remove file afterwards
+  Ym <- read.table(Ym.fname, skip=1)
+  file.remove(Dx.fname, Y.fname, Ym.fname)
+
+  Ym
 }
 
-color_scale.gradient2 <- function(name) {
-  scale_colour_gradient2(name = name, mid = "#dddddd", space = "Lab")
+# Wrapper so that we can 'do.call' pekalska as we do with other techniques
+pekalska.wrapper <- function(X, sample.indices, Ys) {
+  pekalska(dist(X), sample.indices, Ys)
 }
 
-test <- function(file, suffix, output.dir) {
-  message("Testing dataset: ", file)
-  dataset <- read.table(file)
+# Scales columns of projections so that all values are in [0, 1]
+scale.Ys <- function(Ys) {
+  for (j in 1:ncol(Ys)) {
+    min.j <- min(Ys[, j])
+    max.j <- max(Ys[, j])
+    Ys[, j] <- (Ys[, j] - min.j) / (max.j - min.j)
+  }
 
-  # Extract labels
-  labels <- dataset[, ncol(dataset)]
-  classes <- as.factor(labels)
-  X <- dataset[, -ncol(dataset)]
+  Ys
+}
 
+dir.create.safe <- function(path, log=T) {
+  if (!dir.exists(path)) {
+    if (log) {
+      loginfo("Creating directory: %s", path)
+    }
+
+    dir.create(path)
+  }
+}
+
+run.manipulation <- function(X, Dx, labels, k, ds, n.iter, output.dir) {
   n <- nrow(X)
 
-  # Calculate distances (X) and normalize
-  message("\tCalculating dist(X)")
-  Dx <- dist(X)
-  Dx <- Dx / mean(Dx)
-  Dx <- as.matrix(Dx)
-
-  # Sample dataset
-  sample.indices <- sample(n, 3*sqrt(n))
-  classes.s <- as.factor(labels[sample.indices])
-
-  # Automatic sample positioning
-  message("\tCalculating Ys")
-  Dx.s <- Dx[sample.indices, sample.indices]
-  Ys   <- forceScheme(Dx.s)
-
-  # LAMP
-  message("\tCalculating Y")
-  Y <- lamp(X, sample.indices, Ys)
-
-  # Calculate distances (Y) and normalize
-  message("\tCalculating dist(Y)")
-  Dy <- dist(Y)
-  Dy <- Dy / mean(Dy)
-  Dy <- as.matrix(Dy)
-
-  message("\tCalculating P and Q")
-  prob <- d2p(Dx^2)
-  P <- prob$P
-  Q <- d2p.beta(Dy^2, prob$beta)
-
-  # Calculate measures
-  message("\tCalculating measures")
-  np <- NP(Dx, Dy)
-  silh <- silhouette(Dy, classes)
-  precision <- klDivergence(Q, P)
-  recall <- klDivergence(P, Q)
-
-  measures <- rbind(data.frame(mean=mean(np), median=median(np), sd=sd(np)),
-                    data.frame(mean=mean(silh), median=median(silh), sd=sd(silh)),
-                    data.frame(mean=mean(precision), median=median(precision), sd=sd(precision)),
-                    data.frame(mean=mean(recall), median=median(recall), sd=sd(recall)))
-  write.table(measures, paste(output.dir, suffix, "-measures.csv", sep=""), row.names=F)
-
-  if (!(all(is.finite(np)) &&
-      all(is.finite(silh)) &&
-      all(is.finite(precision)) &&
-      all(is.finite(recall)))) {
-    stop("Non-finite measures found")
+  loginfo("Calculating all sample.indices and Ys")
+  for (iter in 1:n.iter) {
+    loginfo("Iteration: %02d", iter)
+    # Sample dataset
+    sample.indices <- sample(n, max(ncol(X), sqrt(n)*3))
+    fname <- paste("sample-indices-", iter, ".tbl", sep="")
+    write.table(sample.indices, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    # Initial sample positioning
+    loginfo("Calculating Ys")
+    Dx.s  <- Dx[sample.indices, sample.indices]
+    Ys    <- scale.Ys(cmdscale(Dx.s))
+    fname <- paste("Ys-", iter, ".tbl", sep="")
+    write.table(Ys, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    # Perform manipulation
+    loginfo("Running manipulation procedures")
+
+    loginfo("Ys.m: Silhouette")
+    Dx.m <- automated.silh(X[sample.indices, ], labels[sample.indices])
+    Ys.silhouette <- scale.Ys(cmdscale(Dx.m))
+    Ys.m <- Ys.silhouette
+    fname <- paste("Ysm-silhouette-", iter, ".tbl", sep="")
+    write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    loginfo("Ys.m: NP")
+    Ys.np <- scale.Ys(Rtsne(X[sample.indices, ], perplexity=k)$Y)
+    Ys.m <- Ys.np
+    fname <- paste("Ysm-np-", iter, ".tbl", sep="")
+    write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    loginfo("Ys.m: Stress")
+    Ys.stress <- scale.Ys(sammon(Dx.s, Ys, tol=1e-20)$points)
+    Ys.m <- Ys.stress
+    fname <- paste("Ysm-stress-", iter, ".tbl", sep="")
+    write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    loginfo("Ys.m: Precision")
+    Ys.precision <- scale.Ys(nerv(Dx.s, Ys, 0.01))
+    Ys.m <- Ys.precision
+    fname <- paste("Ysm-precision-", iter, ".tbl", sep="")
+    write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
+
+    loginfo("Ys.m: Recall")
+    Ys.recall <- scale.Ys(nerv(Dx.s, Ys, 0.99))
+    Ys.m <- Ys.recall
+    fname <- paste("Ysm-recall-", iter, ".tbl", sep="")
+    write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F)
   }
+}
 
-  # Plot results
-  message("\tPlotting results")
-  shape_scale <- scale_shape_manual(name = "Classe", values = 1:nlevels(classes))
-  Ys <- cbind(as.data.frame(Ys), classes.s)
-  Y  <- cbind(as.data.frame(Y), classes, np, silh, precision, recall)
-  p.s <- ggplot(Ys) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes.s, colour = classes.s)) +
-      shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes))
-  ggsave(paste(output.dir, "subsample-", suffix, ".pdf", sep=""), p.s, width=5, height=5)
-
-  p <- ggplot(Y) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = classes)) +
-      shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes))
-  ggsave(paste(output.dir, suffix, ".pdf", sep=""), p, width=5, height=5)
-
-  p.np <- ggplot(Y) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) +
-      shape_scale + color_scale.blue_orange("NP")
-  ggsave(paste(output.dir, "np-", suffix, ".pdf", sep=""), p.np, width=5, height=5)
-
-  p.silh <- ggplot(Y) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) +
-      shape_scale + color_scale.blue_orange("Silhueta")
-  ggsave(paste(output.dir, "silh-", suffix, ".pdf", sep=""), p.silh, width=5, height=5)
-
-  p.precision <- ggplot(Y) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = precision)) +
-      shape_scale + color_scale.blue_orange("Precisão")
-  ggsave(paste(output.dir, "precision-", suffix, ".pdf", sep=""), p.precision, width=5, height=5)
-
-  p.recall <- ggplot(Y) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) +
-      shape_scale + color_scale.blue_orange("Revocação")
-  ggsave(paste(output.dir, "recall-", suffix, ".pdf", sep=""), p.recall, width=5, height=5)
-
-  
-  pdf(paste(output.dir, "all-", suffix, ".pdf", sep=""), width = 12, height = 16)
-  grid.arrange(p.s, p, p.np, p.silh, p.precision, p.recall, ncol = 2)
-  dev.off()
-
-  # Perform manipulation
-  message("\tCalculating Ys.m")
-  Dx.m <- automated.m(X[sample.indices, ], labels[sample.indices])
-  Ys.m <- forceScheme(Dx.m, Ys[, 1:2])
-
-  # LAMP
-  message("\tCalculating Y.m")
-  Y.m <- lamp(X, sample.indices, Ys.m)
-
-  # Calculate distances (Y.m) and normalize
-  message("\tCalculating dist(Y.m)")
-  Dy <- dist(Y.m)
-  Dy <- Dy / mean(Dy)
-  Dy <- as.matrix(Dy)
-
-  message("\tCalculating Q")
-  Q <- d2p.beta(Dy^2, prob$beta)
-
-  # Calculate measures
-  message("\tCalculating measures")
-  np <- NP(Dx, Dy) - np
-  silh <- silhouette(Dy, classes) - silh
-  precision <- klDivergence(Q, P) - precision
-  recall <- klDivergence(P, Q) - recall
-
-  measures <- rbind(data.frame(mean=mean(np), median=median(np), sd=sd(np)),
-                    data.frame(mean=mean(silh), median=median(silh), sd=sd(silh)),
-                    data.frame(mean=mean(precision), median=median(precision), sd=sd(precision)),
-                    data.frame(mean=mean(recall), median=median(recall), sd=sd(recall)))
-  write.table(measures, paste(output.dir, suffix, "-manip-measures.csv", sep=""), row.names=F)
-
-  if (!(all(is.finite(np)) &&
-      all(is.finite(silh)) &&
-      all(is.finite(precision)) &&
-      all(is.finite(recall)))) {
-    stop("Non-finite measures found")
+run.technique <- function(X, Dx, labels, k, ds, n.iter, output.dir) {
+  loginfo("Technique: %s", tech$name)
+  dir.create.safe(file.path(output.dir, ds$name, tech$name))
+
+  silhouette.Y <- c()
+  np.Y         <- c()
+  stress.Y     <- c()
+  precision.Y  <- c()
+  recall.Y     <- c()
+
+  silhouette.Ym <- c()
+  np.Ym         <- c()
+  stress.Ym     <- c()
+  precision.Ym  <- c()
+  recall.Ym     <- c()
+
+  for (iter in 1:n.iter) {
+    loginfo("Iteration: %02d", iter)
+
+    # Load sample indices...
+    fname <- paste("sample-indices-", iter, ".tbl", sep="")
+    sample.indices <- read.table(file.path(output.dir, ds$name, fname))$V1
+    # ... and initial projection
+    fname <- paste("Ys-", iter, ".tbl", sep="")
+    Ys <- read.table(file.path(output.dir, ds$name, fname))
+
+    loginfo("Calculating Y")
+    Y     <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys), tech$args))
+    fname <- paste("Y-", iter, ".tbl", sep="")
+    write.table(Y, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    # Calculate distances (Y) and normalize
+    loginfo("Calculating distances")
+    Dy <- dist(Y)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+
+    # Calculate measures
+    loginfo("Calculating measures")
+    np.Y         <- c(np.Y,         mean(NP(Dx, Dy, k)))
+    silhouette.Y <- c(silhouette.Y, mean(silhouette(Dy, classes)))
+    stress.Y     <- c(stress.Y,     stress(Dx, Dy))
+    precision.Y  <- c(precision.Y,  smoothed.pr(Dx, Dy, k)$s.precision)
+    recall.Y     <- c(recall.Y,     smoothed.pr(Dx, Dy, k)$s.recall)
+
+    # Testing manipulations
+    loginfo("Projection using Ysm.silhouette")
+    fname <- paste("Ysm-silhouette-", iter, ".tbl", sep="")
+    Ys.m <- read.table(file.path(output.dir, ds$name, fname))
+    Y.m  <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args))
+    fname <- paste("Ym-silhouette-", iter, ".tbl", sep="")
+    write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    Dy <- dist(Y.m)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+    silhouette.Ym <- c(silhouette.Ym, mean(silhouette(Dy, classes)))
+
+
+    loginfo("Projection using Ysm.np")
+    fname <- paste("Ysm-np-", iter, ".tbl", sep="")
+    Ys.m <- read.table(file.path(output.dir, ds$name, fname))
+    Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args))
+    fname <- paste("Ym-np-", iter, ".tbl", sep="")
+    write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    Dy <- dist(Y.m)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+    np.Ym <- c(np.Ym, mean(NP(Dx, Dy, k)))
+
+
+    loginfo("Projection using Ysm.stress")
+    fname <- paste("Ysm-stress-", iter, ".tbl", sep="")
+    Ys.m <- read.table(file.path(output.dir, ds$name, fname))
+    Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args))
+    fname <- paste("Ym-stress-", iter, ".tbl", sep="")
+    write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    Dy <- dist(Y.m)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+    stress.Ym <- c(stress.Ym, stress(Dx, Dy))
+
+
+    loginfo("Projection using Ysm.precision")
+    fname <- paste("Ysm-precision-", iter, ".tbl", sep="")
+    Ys.m <- read.table(file.path(output.dir, ds$name, fname))
+    Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args))
+    fname <- paste("Ym-precision-", iter, ".tbl", sep="")
+    write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    Dy <- dist(Y.m)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+    precision.Ym <- c(precision.Ym, smoothed.pr(Dx, Dy, k)$s.precision)
+
+
+    loginfo("Projection using Ysm.recall")
+    fname <- paste("Ysm-recall-", iter, ".tbl", sep="")
+    Ys.m <- read.table(file.path(output.dir, ds$name, fname))
+    Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args))
+    fname <- paste("Ym-recall-", iter, ".tbl", sep="")
+    write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F)
+
+    Dy <- dist(Y.m)
+    Dy <- Dy / mean(Dy)
+    Dy <- as.matrix(Dy)
+    recall.Ym <- c(recall.Ym, smoothed.pr(Dx, Dy, k)$s.recall)
   }
 
-  # Plot results
-  message("\tPlotting results")
-  Ys.m <- cbind(as.data.frame(Ys.m), classes.s)
-  Y.m  <- cbind(as.data.frame(Y.m), classes, np, silh, precision, recall)
-  pm.s <- ggplot(Ys.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes.s, colour = classes.s)) +
-      shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes))
-  ggsave(paste(output.dir, "manip-subsample-", suffix, ".pdf", sep=""), pm.s, width=5, height=5)
-
-  pm <- ggplot(Y.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = classes)) +
-      shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes))
-  ggsave(paste(output.dir, "manip-", suffix, ".pdf", sep=""), pm, width=5, height=5)
-
-  p.np <- ggplot(Y.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) +
-      shape_scale + color_scale.gradient2("NP")
-  ggsave(paste(output.dir, "manip-np-", suffix, ".pdf", sep=""), p.np, width=5, height=5)
-
-  p.silh <- ggplot(Y.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) +
-      shape_scale + color_scale.gradient2("Silhueta")
-  ggsave(paste(output.dir, "manip-silh-", suffix, ".pdf", sep=""), p.silh, width=5, height=5)
-
-  p.precision <- ggplot(Y.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = precision)) +
-      shape_scale + color_scale.gradient2("Precisão")
-  ggsave(paste(output.dir, "manip-precision-", suffix, ".pdf", sep=""), p.precision, width=5, height=5)
-
-  p.recall <- ggplot(Y.m) +
-      theme_bw() +
-      labs(x = "", y = "") +
-      geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) +
-      shape_scale + color_scale.gradient2("Revocação")
-  ggsave(paste(output.dir, "manip-recall-", suffix, ".pdf", sep=""), p.recall, width=5, height=5)
-
-  pdf(paste(output.dir, "original-manip-", suffix, ".pdf", sep=""), width = 10, height = 8)
-  grid.arrange(p.s, p, pm.s, pm, ncol = 2)
-  dev.off()
-
-  pdf(paste(output.dir, "manip-measures-", suffix, ".pdf", sep=""), width = 10, height = 8)
-  grid.arrange(p.np, p.silh, p.precision, p.recall, ncol = 2)
-  dev.off()
-  
-  pdf(paste(output.dir, "manip-all-", suffix, ".pdf", sep=""), width = 12, height = 16)
-  grid.arrange(pm.s, pm, p.np, p.silh, p.precision, p.recall, ncol = 2)
-  dev.off()
+  write.table(silhouette.Y, file.path(output.dir, ds$name, tech$name, "silhouette-Y.tbl"), col.names=F, row.names=F)
+  write.table(np.Y,         file.path(output.dir, ds$name, tech$name, "np-Y.tbl"),         col.names=F, row.names=F)
+  write.table(stress.Y,     file.path(output.dir, ds$name, tech$name, "stress-Y.tbl"),     col.names=F, row.names=F)
+  write.table(precision.Y,  file.path(output.dir, ds$name, tech$name, "precision-Y.tbl"),  col.names=F, row.names=F)
+  write.table(recall.Y,     file.path(output.dir, ds$name, tech$name, "recall-Y.tbl"),     col.names=F, row.names=F)
+
+  write.table(silhouette.Ym, file.path(output.dir, ds$name, tech$name, "silhouette-Ym.tbl"), col.names=F, row.names=F)
+  write.table(np.Ym,         file.path(output.dir, ds$name, tech$name, "np-Ym.tbl"),         col.names=F, row.names=F)
+  write.table(stress.Ym,     file.path(output.dir, ds$name, tech$name, "stress-Ym.tbl"),     col.names=F, row.names=F)
+  write.table(precision.Ym,  file.path(output.dir, ds$name, tech$name, "precision-Ym.tbl"),  col.names=F, row.names=F)
+  write.table(recall.Ym,     file.path(output.dir, ds$name, tech$name, "recall-Ym.tbl"),     col.names=F, row.names=F)
 }
 
-#test(file = "datasets/iris.tbl", suffix = "iris", "tests/")
-test(file = "datasets/wdbc.tbl", suffix = "wdbc", "tests/")
-test(file = "datasets/segmentation.tbl", suffix = "segmentation", "tests/")
-test(file = "datasets/glass.tbl", suffix = "glass", "tests/")
+run <- function(datasets,
+                techniques,
+                output.dir,
+                kf=function(n) as.integer(min(sqrt(n), 0.05*n)),
+                n.iter=30,
+                intial.manipulation=T) {
+  dir.create.safe(output.dir)
+
+  for (ds in datasets) {
+    loginfo("Testing dataset: %s", ds$name)
+    dir.create.safe(file.path(output.dir, ds$name))
+
+    # Load and clean data by removing duplicates, center and scale
+    X <- read.table(ds$data.file)
+    if (!is.null(ds$labels.file)) {
+      labels <- read.table(ds$labels.file)$V1
+      labels <- labels[!duplicated(X)]
+      classes <- as.factor(labels)
+    }
+
+    X <- unique(X)
+    if (ds$scale) {
+      X <- scale(X)
+    }
+
+    n <- nrow(X)
+    k <- kf(n)
+
+    # Calculate distances (X) and normalize
+    loginfo("Calculating dist(X)")
+    Dx <- dist(X)
+    Dx <- Dx / mean(Dx)
+    Dx <- as.matrix(Dx)
+
+    # Generate samples, initial projections and all manipulations
+    if (intial.manipulation) {
+      run.manipulation(X, Dx, labels, k, ds, n.iter, output.dir)
+    }
+
+    # Test techniques
+    for (tech in techniques) {
+      run.technique(X, Dx, labels, k, ds, tech, n.iter, output.dir)
+    }
+  }
+}
+
+
+# Experiment configuration
+# Defines: datasets, techniques, output.dir
+source("config.R")
+
+args <- commandArgs(T)
+
+# Logging setup
+basicConfig()
+addHandler(writeToFile,
+           file=args[1],
+           level="FINEST")
+
+# The alpha and omega
+run(datasets, techniques, output.dir=output.dir, initial.manipulation=F)
-- 
cgit v1.2.3