Added silhouette, fixed bugs and organized code.

author: Samuel Fadel <samuelfadel@gmail.com> 2015-06-07 01:14:34 -0300
committer: Samuel Fadel <samuelfadel@gmail.com> 2015-06-07 01:14:34 -0300
commit: 945d4517d9a86c889ae5f887dd20d1ebedd862f7 (patch)
tree: 8ba01206d45ee5a9e00beb9910219e20befdc2b7
parent: ea663237d22ea58a7a016b65a6e7b92457fdf812 (diff)
2 files changed, 133 insertions, 31 deletions
diff --git a/measures.R b/measures.R
index c3f5021..1b9128f 100644
--- a/measures.R
+++ b/measures.R
@@ -53,24 +53,98 @@ NP <- function(Dx, Dy, k = 9) {
   preservation
 }
 
-d2p <- function(D, sigmas) {
+silhouette <- function(Dy, labels) {
+  if (any(t(Dy) != Dy)) {
+    stop("Dy must be symmetric")
+  }
+
+  Dy <- as.matrix(Dy)
+  n <- nrow(Dy)
+  cohesion <- vector("numeric", n)
+  separation <- vector("numeric", n)
+
+  for (i in 1:n) {
+    label <- labels[i]
+    cohesion[i] <- mean(Dy[i, labels[-i] == label])
+    separation[i] <- min(Dy[i, labels != label])
+  }
+
+  silh <- (cohesion - separation) / max(cohesion, separation)
+}
+
+d2p <- function(D, perplexity = 9, tol = 1e-5, max.tries = 50) {
   if (any(D != t(D))) {
     stop("D must be symmetric")
   }
 
   D <- as.matrix(D)
+  P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D))
   n <- nrow(D)
-  P <- matrix(data=NA, nrow=nrow(D), ncol=ncol(D))
+  beta <- rep(1, n)
+  logU <- log(perplexity)
+
+  for (i in 1:n) {
+    #denom <- sum(exp(-D[i, ] / sigmas))
+    #P[i, ] <- exp(-D[i, ] / sigmas) / denom
+
+    betaMin <- -Inf
+    betaMax <-  Inf
+    Di <- D[i, -i]
+
+    tries <- 0
+    repeat {
+      Pi <- exp(-Di * beta[i])
+      sumPi <- sum(Pi)
+      H <- log(sumPi) + beta[i] * sum(Di * Pi) / sumPi
+      Pi <- Pi / sumPi
+      Hdiff <- H - logU
+
+      if (abs(Hdiff) < tol || tries > max.tries) {
+        break
+      }
+
+      if (Hdiff > 0) {
+        betaMin <- beta[i]
+        beta[i] <- if (is.finite(betaMax)) {
+                     (beta[i] + betaMax) / 2
+                   } else {
+                     beta[i] * 2
+                   }
+      } else {
+        betaMax <- beta[i]
+        beta[i] <- if (is.finite(betaMin)) {
+                     (beta[i] + betaMin) / 2
+                   } else {
+                     beta[i] / 2
+                   }
+      }
+
+      tries <- tries + 1
+    }
+
+    P[i, -i] <- Pi
+  }
+
+  list(P = P, beta = beta) # sigmas = sqrt(1 / beta)
+}
 
+d2p.beta <- function(D, beta) {
+  if (any(D != t(D))) {
+    stop("D must be symmetric")
+  }
+
+  D <- as.matrix(D)
+  n <- nrow(D)
+  P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D))
   for (i in 1:n) {
-    denom <- sum(exp(-D[i, ] / sigmas))
-    P[i, ] <- exp(-D[i, ] / sigmas) / denom
+    P[i, -i] <- exp(-D[i, -i] * beta[i])
+    P[i, -i] <- P[i, -i] / sum(P[i, -i])
   }
 
   P
 }
 
-klDivergence <- function(P, Q) {
+klDivergence <- function(P, Q, eps = 1e-12) {
   if (nrow(P) != ncol(P) || nrow(Q) != ncol(Q)) {
     stop("P and Q must be square")
   }
@@ -78,10 +152,12 @@ klDivergence <- function(P, Q) {
     stop("P and Q must have the same number of elements")
   }
 
+  P[P < eps] <- eps
+  Q[Q < eps] <- eps
   n <- nrow(P)
   d <- vector("numeric", n)
   for (i in 1:n) {
-    d[i] <- sum(P[i, ] * log(P[i, ] / Q[i, ]))
+    d[i] <- sum(P[i, -i] * log(P[i, -i] / Q[i, -i]))
   }
 
   d
diff --git a/tests.R b/tests.R
index 65ed46d..f313b7b 100644
--- a/tests.R
+++ b/tests.R
@@ -15,7 +15,7 @@ automated.m <- function(D, labels) {
 }
 
 test <- function(file, suffix, output.dir) {
-  message("Testing dataset ", file)
+  message("Testing dataset: ", file)
   dataset <- read.table(file)
 
   # Extract labels
@@ -51,23 +51,30 @@ test <- function(file, suffix, output.dir) {
   Dy <- as.matrix(Dy)
 
   message("\tCalculating P and Q")
-  sigmas <- vector("numeric", n)
-  sigmas[] <- 1
-  P <- d2p(Dx, sigmas)
-  Q <- d2p(Dy, sigmas)
+  prob <- d2p(Dx^2)
+  P <- prob$P
+  Q <- d2p.beta(Dy^2, prob$beta)
 
   # Calculate measures
   message("\tCalculating measures")
   np <- NP(Dx, Dy)
+  silh <- silhouette(Dy, classes)
   precision <- klDivergence(Q, P)
   recall <- klDivergence(P, Q)
 
+  if (!(all(is.finite(np)) &&
+      all(is.finite(silh)) &&
+      all(is.finite(precision)) &&
+      all(is.finite(recall)))) {
+    stop("Non-finite measures found")
+  }
+
   # Plot results
-  #color_scale <- scale_colour_gradientn(colours = c("#e46c0a", "#dddddd", "#376092"))
-  color_scale <- scale_colour_gradient2(mid = "#dddddd", space = "Lab")
-  shape_scale <- scale_shape(solid = FALSE)
+  message("\tPlotting results")
+  color_scale <- scale_colour_gradient(high = "#376092", low = "#e46c0a", space = "Lab")
+  shape_scale <- scale_shape_manual(values = 1:nlevels(classes))
   Ys <- cbind(as.data.frame(Ys), classes.s)
-  Y  <- cbind(as.data.frame(Y), classes, np, precision, recall)
+  Y  <- cbind(as.data.frame(Y), classes, np, silh, precision, recall)
   p.s <- ggplot(Ys) +
       theme_bw() +
       labs(x = "", y = "") +
@@ -83,6 +90,11 @@ test <- function(file, suffix, output.dir) {
       labs(x = "", y = "", title = "NP (9)") +
       geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) +
       shape_scale + color_scale
+  p.silh <- ggplot(Y) +
+      theme_bw() +
+      labs(x = "", y = "", title = "Silhouette") +
+      geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) +
+      shape_scale + color_scale
   p.precision <- ggplot(Y) +
       theme_bw() +
       labs(x = "", y = "", title = "Precision") +
@@ -94,14 +106,14 @@ test <- function(file, suffix, output.dir) {
       geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) +
       shape_scale + color_scale
   
-  pdf(paste(output.dir, "original-", suffix, ".pdf", sep=""), width = 16, height = 8)
-  grid.arrange(p.s, p, p.np, p.precision, p.recall, ncol=3)
+  pdf(paste(output.dir, "original-", suffix, ".pdf", sep=""), width = 12, height = 16)
+  grid.arrange(p.s, p, p.np, p.silh, p.precision, p.recall, ncol = 2)
   dev.off()
 
   # Perform manipulation
   message("\tCalculating Ys.m")
   Dx.m <- automated.m(Dx.s, labels[sample.indices])
-  Ys.m <- forceScheme(Dx.m)
+  Ys.m <- forceScheme(Dx.m, Ys[, 1:2])
 
   # LAMP
   message("\tCalculating Y.m")
@@ -114,46 +126,60 @@ test <- function(file, suffix, output.dir) {
   Dy <- as.matrix(Dy)
 
   message("\tCalculating Q")
-  Q  <- d2p(Dy, sigmas)
+  Q <- d2p.beta(Dy^2, prob$beta)
 
   # Calculate measures
   message("\tCalculating measures")
-  np <- np - NP(Dx, Dy)
-  precision <- precision - klDivergence(Q, P)
-  recall <- recall - klDivergence(P, Q)
+  np <- NP(Dx, Dy) - np
+  silh <- silhouette(Dy, classes) - silh
+  precision <- klDivergence(Q, P) - precision
+  recall <- klDivergence(P, Q) - recall
+
+  if (!(all(is.finite(np)) &&
+      all(is.finite(silh)) &&
+      all(is.finite(precision)) &&
+      all(is.finite(recall)))) {
+    stop("Non-finite measures found")
+  }
 
   # Plot results
+  message("\tPlotting results")
+  color_scale <- scale_colour_gradient2(mid = "#dddddd", space = "Lab")
   Ys.m <- cbind(as.data.frame(Ys.m), classes.s)
-  Y.m  <- cbind(as.data.frame(Y.m), classes, np, precision, recall)
-  p.s <- ggplot(cbind(Ys.m, classes.s)) +
+  Y.m  <- cbind(as.data.frame(Y.m), classes, np, silh, precision, recall)
+  p.s <- ggplot(Ys.m) +
       theme_bw() +
       labs(x = "", y = "") +
       geom_point(aes(x = V1, y = V2, shape = classes.s, colour = classes.s)) +
-      scale_shape_identity() +
       shape_scale
-  p <- ggplot(cbind(Y.m, classes)) +
+  p <- ggplot(Y.m) +
       theme_bw() +
       labs(x = "", y = "") +
       geom_point(aes(x = V1, y = V2, shape = classes, colour = classes)) +
       shape_scale
-  p.np <- ggplot(cbind(Y.m, np)) +
+  p.np <- ggplot(Y.m) +
       theme_bw() +
       labs(x = "", y = "", title = "NP (9)") +
       geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) +
       shape_scale + color_scale
-  p.precision <- ggplot(cbind(Y.m, precision)) +
+  p.silh <- ggplot(Y.m) +
+      theme_bw() +
+      labs(x = "", y = "", title = "Silhouette") +
+      geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) +
+      shape_scale + color_scale
+  p.precision <- ggplot(Y.m) +
       theme_bw() +
       labs(x = "", y = "", title = "Precision") +
       geom_point(aes(x = V1, y = V2, shape = classes, colour = precision)) +
       shape_scale + color_scale
-  p.recall <- ggplot(cbind(Y.m, recall)) +
+  p.recall <- ggplot(Y.m) +
       theme_bw() +
       labs(x = "", y = "", title = "Recall") +
       geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) +
       shape_scale + color_scale
   
-  pdf(paste(output.dir, "manip-", suffix, ".pdf", sep=""), width = 16, height = 8)
-  grid.arrange(p.s, p, p.np, p.precision, p.recall, ncol=3)
+  pdf(paste(output.dir, "manip-", suffix, ".pdf", sep=""), width = 12, height = 16)
+  grid.arrange(p.s, p, p.np, p.silh, p.precision, p.recall, ncol = 2)
   dev.off()
 }
author	Samuel Fadel <samuelfadel@gmail.com>	2015-06-07 01:14:34 -0300
committer	Samuel Fadel <samuelfadel@gmail.com>	2015-06-07 01:14:34 -0300
commit	945d4517d9a86c889ae5f887dd20d1ebedd862f7 (patch)
tree	8ba01206d45ee5a9e00beb9910219e20befdc2b7
parent	ea663237d22ea58a7a016b65a6e7b92457fdf812 (diff)