From 4580cffb564aa3018d4dbcfe57b365deedfac62b Mon Sep 17 00:00:00 2001 From: Samuel Fadel Date: Wed, 3 Jun 2015 12:33:37 -0300 Subject: Initial commit. Initial testing suite along with some measures implemented. --- measures.R | 88 +++++++++++++++++++++++++++++++++++ tests.R | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 measures.R create mode 100644 tests.R diff --git a/measures.R b/measures.R new file mode 100644 index 0000000..c3f5021 --- /dev/null +++ b/measures.R @@ -0,0 +1,88 @@ +stress <- function(Dx, Dy) { + if (any(Dx != t(Dx)) || any(Dy != t(Dy))) { + stop("Dx and Dy must be symmetric") + } + + Dx <- as.matrix(Dx) + Dy <- as.matrix(Dy) + if (nrow(Dx) != nrow(Dy)) { + stop("Dx and Dy must have the same number of elements") + } + + n <- nrow(Dx) + s <- vector("numeric", n) + for (i in 1:n) { + s[i] <- 0 + for (j in 1:n) { + if (i == j) { + next + } + + s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j] + } + s[i] = s[i] / sum(D[i, ]) + } + + s +} + +NP <- function(Dx, Dy, k = 9) { + if (any(Dx != t(Dx)) || any(Dy != t(Dy))) { + stop("Dx and Dy must be symmetric") + } + + Dx <- as.matrix(Dx) + Dy <- as.matrix(Dy) + if (nrow(Dx) != nrow(Dy)) { + stop("Dx and Dy must have the same number of elements") + } + + n <- nrow(Dx) + if (k >= n) { + stop("k must be smaller than the number of elements") + } + + preservation <- vector("numeric", n) + for (i in 1:n) { + nx <- order(Dx[i, ])[1 + 1:k] + ny <- order(Dy[i, ])[1 + 1:k] + diff <- setdiff(nx, ny) + preservation[i] <- (k - length(diff)) / k + } + + preservation +} + +d2p <- function(D, sigmas) { + if (any(D != t(D))) { + stop("D must be symmetric") + } + + D <- as.matrix(D) + n <- nrow(D) + P <- matrix(data=NA, nrow=nrow(D), ncol=ncol(D)) + + for (i in 1:n) { + denom <- sum(exp(-D[i, ] / sigmas)) + P[i, ] <- exp(-D[i, ] / sigmas) / denom + } + + P +} + +klDivergence <- function(P, Q) { + if (nrow(P) != ncol(P) || nrow(Q) != ncol(Q)) { + stop("P and Q must be square") + } + if (nrow(P) != nrow(Q)) { + stop("P and Q must have the same number of elements") + } + + n <- nrow(P) + d <- vector("numeric", n) + for (i in 1:n) { + d[i] <- sum(P[i, ] * log(P[i, ] / Q[i, ])) + } + + d +} diff --git a/tests.R b/tests.R new file mode 100644 index 0000000..d56114c --- /dev/null +++ b/tests.R @@ -0,0 +1,152 @@ +require(ggplot2) +require(gridExtra) +require(mp) + +source("measures.R") + +automated.m <- function(D, labels) { + D.m <- D + for (label in unique(labels)) { + same.label <- labels == label + D.m[same.label, same.label] <- D[same.label, same.label] * 0.1 + #D.m[same.label, diff.label] <- D[same.label, diff.label] * 10 + #D.m[diff.label, same.label] <- D.m[same.label, diff.label] + } + + D.m +} + +xy.df <- function(M) { + M <- as.data.frame(M) + names(M) <- c("x", "y") + + M +} + +test <- function(file, suffix, output.dir) { + cat("Testing dataset ", file, "...\n") + dataset <- read.table(file) + + # Extract labels + labels <- dataset[, ncol(dataset)] + + # Remove labels from dataset + X <- dataset[, -ncol(dataset)] + + n <- nrow(X) + + # Calculate distances (X) and normalize + Dx <- dist(X) + Dx <- Dx / mean(Dx) + Dx <- as.matrix(Dx) + + sample.indices <- sample(n, 3*sqrt(n)) + Dx.s <- Dx[sample.indices, sample.indices] + Ys <- forceScheme(Dx.s) + Ys <- xy.df(Ys) + Y <- lamp(X, sample.indices, Ys) + Y <- xy.df(Y) + + # Plot mapping + classes <- as.factor(labels) + classes.s <- as.factor(labels[sample.indices]) + p.s <- ggplot(cbind(Ys, classes.s), aes(x = x, y = y, colour = classes.s)) + geom_point() + p <- ggplot(cbind(Y, classes), aes(x = x, y = y, colour = classes)) + geom_point() + pdf(paste(output.dir, "original-", suffix, ".pdf", sep=""), width = 10, height = 5) + grid.arrange(p.s, p, + widths = unit(rep_len(3, 2), "null"), + heights = unit(rep_len(1, 2), "null"), + ncol=2) + dev.off() + png(paste(output.dir, "original-", suffix, ".png", sep=""), width = 1200, height = 600) + grid.arrange(p.s, p, + widths = unit(rep_len(3, 2), "null"), + heights = unit(rep_len(1, 2), "null"), + ncol=2) + dev.off() + + # Calculate distances (Y) and normalize + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measures and plot + sigmas <- vector("numeric", n) + sigmas[] <- 1 + P <- d2p(Dx, sigmas) + Q <- d2p(Dy, sigmas) + np = NP(Dx, Dy) + #stress = stress(Dx, Dy), + precision <- klDivergence(Q, P) + recall <- klDivergence(P, Q) + p.np <- ggplot(cbind(Y, np), aes(x = x, y = y, colour = np)) + geom_point() + labs(title = "NP (9)") + p.precision <- ggplot(cbind(Y, precision), aes(x = x, y = y, colour = precision)) + geom_point() + labs(title = "Precision") + p.recall <- ggplot(cbind(Y, recall), aes(x = x, y = y, colour = recall)) + geom_point() + labs(title = "Recall") + pdf(paste(output.dir, "measures-original-", suffix, ".pdf", sep=""), width = 15, height = 5) + grid.arrange(p.np, p.precision, p.recall, + widths = unit(rep_len(3, 3), "null"), + heights = unit(rep_len(1, 3), "null"), + ncol=3) + dev.off() + png(paste(output.dir, "measures-original-", suffix, ".png", sep=""), width = 1800, height = 600) + grid.arrange(p.np, p.precision, p.recall, + widths = unit(rep_len(3, 3), "null"), + heights = unit(rep_len(1, 3), "null"), + ncol=3) + dev.off() + + # Perform manipulation + Dx.m <- automated.m(Dx.s, labels[sample.indices]) + Ys.m <- forceScheme(Dx.m) + Ys.m <- xy.df(Ys.m) + Y.m <- lamp(X, sample.indices, Ys.m) + Y.m <- xy.df(Y.m) + + # Plot mapping + p.s <- ggplot(cbind(Ys.m, classes.s), aes(x = x, y = y, colour = classes.s)) + geom_point() + p <- ggplot(cbind(Y.m, classes), aes(x = x, y = y, colour = classes)) + geom_point() + pdf(paste(output.dir, "manip-", suffix, ".pdf", sep=""), width = 10, height = 5) + grid.arrange(p.s, p, + widths = unit(rep_len(3, 2), "null"), + heights = unit(rep_len(1, 2), "null"), + ncol=2) + dev.off() + png(paste(output.dir, "manip-", suffix, ".png", sep=""), width = 1200, height = 600) + grid.arrange(p.s, p, + widths = unit(rep_len(3, 2), "null"), + heights = unit(rep_len(1, 2), "null"), + ncol=2) + dev.off() + + # Calculate distances (Y.m) and normalize + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + Q <- d2p(Dy, sigmas) + + # Calculate measures and plot + np = np - NP(Dx, Dy) + #stress = stress(Dx, Dy), + precision <- precision - klDivergence(Q, P) + recall <- recall - klDivergence(P, Q) + p.np <- ggplot(cbind(Y.m, np), aes(x = x, y = y, colour = np)) + geom_point() + labs(title = "NP (9)") + p.precision <- ggplot(cbind(Y.m, precision), aes(x = x, y = y, colour = precision)) + geom_point() + labs(title = "Precision") + p.recall <- ggplot(cbind(Y.m, recall), aes(x = x, y = y, colour = recall)) + geom_point() + labs(title = "Recall") + pdf(paste(output.dir, "measures-manip-", suffix, ".pdf", sep=""), width = 15, height = 5) + grid.arrange(p.np, p.precision, p.recall, + widths = unit(rep_len(3, 3), "null"), + heights = unit(rep_len(1, 3), "null"), + ncol=3) + dev.off() + png(paste(output.dir, "measures-manip-", suffix, ".png", sep=""), width = 1800, height = 600) + grid.arrange(p.np, p.precision, p.recall, + widths = unit(rep_len(3, 3), "null"), + heights = unit(rep_len(1, 3), "null"), + ncol=3) + dev.off() +} + +test(file = "datasets/iris-std.tbl", suffix = "iris", "plots/") +test(file = "datasets/wdbc.tbl", suffix = "wdbc", "plots/") +test(file = "datasets/segmentation.tbl", suffix = "segmentation", "plots/") +test(file = "datasets/images.tbl", suffix = "images", "plots/") -- cgit v1.2.3