aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamuel Fadel <samuelfadel@gmail.com>2015-06-03 12:33:37 -0300
committerSamuel Fadel <samuelfadel@gmail.com>2015-06-03 12:33:37 -0300
commit4580cffb564aa3018d4dbcfe57b365deedfac62b (patch)
treed9b5214bdd7754cf319e099560cf37710b4b75ed
Initial commit.
Initial testing suite along with some measures implemented.
-rw-r--r--measures.R88
-rw-r--r--tests.R152
2 files changed, 240 insertions, 0 deletions
diff --git a/measures.R b/measures.R
new file mode 100644
index 0000000..c3f5021
--- /dev/null
+++ b/measures.R
@@ -0,0 +1,88 @@
+stress <- function(Dx, Dy) {
+ if (any(Dx != t(Dx)) || any(Dy != t(Dy))) {
+ stop("Dx and Dy must be symmetric")
+ }
+
+ Dx <- as.matrix(Dx)
+ Dy <- as.matrix(Dy)
+ if (nrow(Dx) != nrow(Dy)) {
+ stop("Dx and Dy must have the same number of elements")
+ }
+
+ n <- nrow(Dx)
+ s <- vector("numeric", n)
+ for (i in 1:n) {
+ s[i] <- 0
+ for (j in 1:n) {
+ if (i == j) {
+ next
+ }
+
+ s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j]
+ }
+ s[i] = s[i] / sum(D[i, ])
+ }
+
+ s
+}
+
+NP <- function(Dx, Dy, k = 9) {
+ if (any(Dx != t(Dx)) || any(Dy != t(Dy))) {
+ stop("Dx and Dy must be symmetric")
+ }
+
+ Dx <- as.matrix(Dx)
+ Dy <- as.matrix(Dy)
+ if (nrow(Dx) != nrow(Dy)) {
+ stop("Dx and Dy must have the same number of elements")
+ }
+
+ n <- nrow(Dx)
+ if (k >= n) {
+ stop("k must be smaller than the number of elements")
+ }
+
+ preservation <- vector("numeric", n)
+ for (i in 1:n) {
+ nx <- order(Dx[i, ])[1 + 1:k]
+ ny <- order(Dy[i, ])[1 + 1:k]
+ diff <- setdiff(nx, ny)
+ preservation[i] <- (k - length(diff)) / k
+ }
+
+ preservation
+}
+
+d2p <- function(D, sigmas) {
+ if (any(D != t(D))) {
+ stop("D must be symmetric")
+ }
+
+ D <- as.matrix(D)
+ n <- nrow(D)
+ P <- matrix(data=NA, nrow=nrow(D), ncol=ncol(D))
+
+ for (i in 1:n) {
+ denom <- sum(exp(-D[i, ] / sigmas))
+ P[i, ] <- exp(-D[i, ] / sigmas) / denom
+ }
+
+ P
+}
+
+klDivergence <- function(P, Q) {
+ if (nrow(P) != ncol(P) || nrow(Q) != ncol(Q)) {
+ stop("P and Q must be square")
+ }
+ if (nrow(P) != nrow(Q)) {
+ stop("P and Q must have the same number of elements")
+ }
+
+ n <- nrow(P)
+ d <- vector("numeric", n)
+ for (i in 1:n) {
+ d[i] <- sum(P[i, ] * log(P[i, ] / Q[i, ]))
+ }
+
+ d
+}
diff --git a/tests.R b/tests.R
new file mode 100644
index 0000000..d56114c
--- /dev/null
+++ b/tests.R
@@ -0,0 +1,152 @@
+require(ggplot2)
+require(gridExtra)
+require(mp)
+
+source("measures.R")
+
+automated.m <- function(D, labels) {
+ D.m <- D
+ for (label in unique(labels)) {
+ same.label <- labels == label
+ D.m[same.label, same.label] <- D[same.label, same.label] * 0.1
+ #D.m[same.label, diff.label] <- D[same.label, diff.label] * 10
+ #D.m[diff.label, same.label] <- D.m[same.label, diff.label]
+ }
+
+ D.m
+}
+
+xy.df <- function(M) {
+ M <- as.data.frame(M)
+ names(M) <- c("x", "y")
+
+ M
+}
+
+test <- function(file, suffix, output.dir) {
+ cat("Testing dataset ", file, "...\n")
+ dataset <- read.table(file)
+
+ # Extract labels
+ labels <- dataset[, ncol(dataset)]
+
+ # Remove labels from dataset
+ X <- dataset[, -ncol(dataset)]
+
+ n <- nrow(X)
+
+ # Calculate distances (X) and normalize
+ Dx <- dist(X)
+ Dx <- Dx / mean(Dx)
+ Dx <- as.matrix(Dx)
+
+ sample.indices <- sample(n, 3*sqrt(n))
+ Dx.s <- Dx[sample.indices, sample.indices]
+ Ys <- forceScheme(Dx.s)
+ Ys <- xy.df(Ys)
+ Y <- lamp(X, sample.indices, Ys)
+ Y <- xy.df(Y)
+
+ # Plot mapping
+ classes <- as.factor(labels)
+ classes.s <- as.factor(labels[sample.indices])
+ p.s <- ggplot(cbind(Ys, classes.s), aes(x = x, y = y, colour = classes.s)) + geom_point()
+ p <- ggplot(cbind(Y, classes), aes(x = x, y = y, colour = classes)) + geom_point()
+ pdf(paste(output.dir, "original-", suffix, ".pdf", sep=""), width = 10, height = 5)
+ grid.arrange(p.s, p,
+ widths = unit(rep_len(3, 2), "null"),
+ heights = unit(rep_len(1, 2), "null"),
+ ncol=2)
+ dev.off()
+ png(paste(output.dir, "original-", suffix, ".png", sep=""), width = 1200, height = 600)
+ grid.arrange(p.s, p,
+ widths = unit(rep_len(3, 2), "null"),
+ heights = unit(rep_len(1, 2), "null"),
+ ncol=2)
+ dev.off()
+
+ # Calculate distances (Y) and normalize
+ Dy <- dist(Y)
+ Dy <- Dy / mean(Dy)
+ Dy <- as.matrix(Dy)
+
+ # Calculate measures and plot
+ sigmas <- vector("numeric", n)
+ sigmas[] <- 1
+ P <- d2p(Dx, sigmas)
+ Q <- d2p(Dy, sigmas)
+ np = NP(Dx, Dy)
+ #stress = stress(Dx, Dy),
+ precision <- klDivergence(Q, P)
+ recall <- klDivergence(P, Q)
+ p.np <- ggplot(cbind(Y, np), aes(x = x, y = y, colour = np)) + geom_point() + labs(title = "NP (9)")
+ p.precision <- ggplot(cbind(Y, precision), aes(x = x, y = y, colour = precision)) + geom_point() + labs(title = "Precision")
+ p.recall <- ggplot(cbind(Y, recall), aes(x = x, y = y, colour = recall)) + geom_point() + labs(title = "Recall")
+ pdf(paste(output.dir, "measures-original-", suffix, ".pdf", sep=""), width = 15, height = 5)
+ grid.arrange(p.np, p.precision, p.recall,
+ widths = unit(rep_len(3, 3), "null"),
+ heights = unit(rep_len(1, 3), "null"),
+ ncol=3)
+ dev.off()
+ png(paste(output.dir, "measures-original-", suffix, ".png", sep=""), width = 1800, height = 600)
+ grid.arrange(p.np, p.precision, p.recall,
+ widths = unit(rep_len(3, 3), "null"),
+ heights = unit(rep_len(1, 3), "null"),
+ ncol=3)
+ dev.off()
+
+ # Perform manipulation
+ Dx.m <- automated.m(Dx.s, labels[sample.indices])
+ Ys.m <- forceScheme(Dx.m)
+ Ys.m <- xy.df(Ys.m)
+ Y.m <- lamp(X, sample.indices, Ys.m)
+ Y.m <- xy.df(Y.m)
+
+ # Plot mapping
+ p.s <- ggplot(cbind(Ys.m, classes.s), aes(x = x, y = y, colour = classes.s)) + geom_point()
+ p <- ggplot(cbind(Y.m, classes), aes(x = x, y = y, colour = classes)) + geom_point()
+ pdf(paste(output.dir, "manip-", suffix, ".pdf", sep=""), width = 10, height = 5)
+ grid.arrange(p.s, p,
+ widths = unit(rep_len(3, 2), "null"),
+ heights = unit(rep_len(1, 2), "null"),
+ ncol=2)
+ dev.off()
+ png(paste(output.dir, "manip-", suffix, ".png", sep=""), width = 1200, height = 600)
+ grid.arrange(p.s, p,
+ widths = unit(rep_len(3, 2), "null"),
+ heights = unit(rep_len(1, 2), "null"),
+ ncol=2)
+ dev.off()
+
+ # Calculate distances (Y.m) and normalize
+ Dy <- dist(Y.m)
+ Dy <- Dy / mean(Dy)
+ Dy <- as.matrix(Dy)
+ Q <- d2p(Dy, sigmas)
+
+ # Calculate measures and plot
+ np = np - NP(Dx, Dy)
+ #stress = stress(Dx, Dy),
+ precision <- precision - klDivergence(Q, P)
+ recall <- recall - klDivergence(P, Q)
+ p.np <- ggplot(cbind(Y.m, np), aes(x = x, y = y, colour = np)) + geom_point() + labs(title = "NP (9)")
+ p.precision <- ggplot(cbind(Y.m, precision), aes(x = x, y = y, colour = precision)) + geom_point() + labs(title = "Precision")
+ p.recall <- ggplot(cbind(Y.m, recall), aes(x = x, y = y, colour = recall)) + geom_point() + labs(title = "Recall")
+ pdf(paste(output.dir, "measures-manip-", suffix, ".pdf", sep=""), width = 15, height = 5)
+ grid.arrange(p.np, p.precision, p.recall,
+ widths = unit(rep_len(3, 3), "null"),
+ heights = unit(rep_len(1, 3), "null"),
+ ncol=3)
+ dev.off()
+ png(paste(output.dir, "measures-manip-", suffix, ".png", sep=""), width = 1800, height = 600)
+ grid.arrange(p.np, p.precision, p.recall,
+ widths = unit(rep_len(3, 3), "null"),
+ heights = unit(rep_len(1, 3), "null"),
+ ncol=3)
+ dev.off()
+}
+
+test(file = "datasets/iris-std.tbl", suffix = "iris", "plots/")
+test(file = "datasets/wdbc.tbl", suffix = "wdbc", "plots/")
+test(file = "datasets/segmentation.tbl", suffix = "segmentation", "plots/")
+test(file = "datasets/images.tbl", suffix = "images", "plots/")