diff options
author | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-18 23:15:39 -0300 |
---|---|---|
committer | Samuel Fadel <samuelfadel@gmail.com> | 2016-08-18 23:15:39 -0300 |
commit | fbf8d82cdd3720c4bbf2a94035b6779e56d73448 (patch) | |
tree | a4eaf4a9728d5e66ed062dbce3fe731366ee73e8 | |
parent | 9b51886d702b22018db08ba8a30ab47d36f64aac (diff) |
Updated with all the code used to run experiments.
-rw-r--r-- | measures.R | 224 | ||||
-rw-r--r-- | plot.R | 160 | ||||
-rw-r--r-- | proj.R | 103 | ||||
-rw-r--r-- | run.R | 518 |
4 files changed, 668 insertions, 337 deletions
@@ -1,30 +1,5 @@ -stress <- function(Dx, Dy) { - if (any(Dx != t(Dx)) || any(Dy != t(Dy))) { - stop("Dx and Dy must be symmetric") - } - - Dx <- as.matrix(Dx) - Dy <- as.matrix(Dy) - if (nrow(Dx) != nrow(Dy)) { - stop("Dx and Dy must have the same number of elements") - } - - n <- nrow(Dx) - s <- vector("numeric", n) - for (i in 1:n) { - s[i] <- 0 - for (j in 1:n) { - if (i == j) { - next - } - - s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j] - } - s[i] = s[i] / sum(D[i, ]) - } - - s -} +# Measures used as manipulation targets. +# NOTE: This file is only a library, see run.R for more details. NP <- function(Dx, Dy, k = 9) { if (any(Dx != t(Dx)) || any(Dy != t(Dy))) { @@ -53,112 +28,129 @@ NP <- function(Dx, Dy, k = 9) { preservation } +#silhouette <- function(Dy, labels) { +# if (any(t(Dy) != Dy)) { +# stop("Dy must be symmetric") +# } +# +# Dy <- as.matrix(Dy) +# n <- nrow(Dy) +# cohesion <- vector("numeric", n) +# separation <- vector("numeric", n) +# +# for (i in 1:n) { +# label <- labels[i] +# separation[i] <- min(Dy[i, labels != label]) +# cohesion[i] <- mean(Dy[i, labels[-i] == label]) +# } +# +# (separation - cohesion) / max(separation, cohesion) +#} silhouette <- function(Dy, labels) { - if (any(t(Dy) != Dy)) { - stop("Dy must be symmetric") - } - - Dy <- as.matrix(Dy) n <- nrow(Dy) - cohesion <- vector("numeric", n) - separation <- vector("numeric", n) - - for (i in 1:n) { - label <- labels[i] - separation[i] <- min(Dy[i, labels != label]) - cohesion[i] <- mean(Dy[i, labels[-i] == label]) + if (n != length(labels)) { + stop("Number of labels doesn't match number of points") } - silh <- (separation - cohesion) / max(separation, cohesion) -} - -d2p <- function(D, perplexity = 9, tol = 1e-5, max.tries = 50) { - if (any(D != t(D))) { - stop("D must be symmetric") + A_labels <- list() + B_labels <- list() + unique.labels <- unique(labels) + for (l in unique.labels) { + A_labels[[l]] <- labels == l + B_labels[[l]] <- labels != l } - D <- as.matrix(D) - P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D)) - n <- nrow(D) - beta <- rep(1, n) - logU <- log(perplexity) - + # This factor excludes self comparisons when computing cohesion + m.factor <- n / (n-1) + s <- vector("numeric", n) for (i in 1:n) { - #denom <- sum(exp(-D[i, ] / sigmas)) - #P[i, ] <- exp(-D[i, ] / sigmas) / denom - - betaMin <- -Inf - betaMax <- Inf - Di <- D[i, -i] - - tries <- 0 - repeat { - Pi <- exp(-Di * beta[i]) - sumPi <- sum(Pi) - H <- log(sumPi) + beta[i] * sum(Di * Pi) / sumPi - Pi <- Pi / sumPi - Hdiff <- H - logU - - if (abs(Hdiff) < tol || tries > max.tries) { - break - } - - if (Hdiff > 0) { - betaMin <- beta[i] - beta[i] <- if (is.finite(betaMax)) { - (beta[i] + betaMax) / 2 - } else { - beta[i] * 2 - } - } else { - betaMax <- beta[i] - beta[i] <- if (is.finite(betaMin)) { - (beta[i] + betaMin) / 2 - } else { - beta[i] / 2 - } - } + label_i <- labels[i] - tries <- tries + 1 + a <- m.factor * mean(Dy[i, A_labels[[label_i]]]) # cohesion + b <- Inf + for (l in unique(labels)) { + b <- min(mean(Dy[i, B_labels[[l]]]), b) # separation } - P[i, -i] <- Pi + s[i] <- (b - a) / max(b, a) } - list(P = P, beta = beta) # sigmas = sqrt(1 / beta) + s } -d2p.beta <- function(D, beta) { - if (any(D != t(D))) { - stop("D must be symmetric") - } +#stress <- function(Dx, Dy) { +# if (any(Dx != t(Dx)) || any(Dy != t(Dy))) { +# stop("Dx and Dy must be symmetric") +# } +# +# Dx <- as.matrix(Dx) +# Dy <- as.matrix(Dy) +# if (nrow(Dx) != nrow(Dy)) { +# stop("Dx and Dy must have the same number of elements") +# } +# +# n <- nrow(Dx) +# s <- vector("numeric", n) +# for (i in 1:n) { +# s[i] <- 0 +# for (j in 1:n) { +# if (i == j) { +# next +# } +# +# s[i] = s[i] + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j] +# } +# s[i] = s[i] / sum(D[i, ]) +# } +# +# s +#} +stress <- function(Dx, Dy) { + n <- nrow(Dx) - D <- as.matrix(D) - n <- nrow(D) - P <- matrix(data=0, nrow=nrow(D), ncol=ncol(D)) - for (i in 1:n) { - P[i, -i] <- exp(-D[i, -i] * beta[i]) - P[i, -i] <- P[i, -i] / sum(P[i, -i]) + C <- 0 + D <- 0 + for (i in 1:(n-1)) { + for (j in (i + 1):n) { + C <- C + Dx[i, j] + D <- D + (Dx[i, j] - Dy[i, j])^2 / Dx[i, j] + if (is.nan(D)) { + loginfo("%d, %d", i, j) + loginfo("%f", Dx[i, j]) + stop("NaN") + } + } } - P + D / C } -klDivergence <- function(P, Q, eps = 1e-12) { - if (nrow(P) != ncol(P) || nrow(Q) != ncol(Q)) { - stop("P and Q must be square") - } - if (nrow(P) != nrow(Q)) { - stop("P and Q must have the same number of elements") - } - - P[P < eps] <- eps - Q[Q < eps] <- eps - n <- nrow(P) - d <- vector("numeric", n) - for (i in 1:n) { - d[i] <- sum(P[i, -i] * log(P[i, -i] / Q[i, -i])) - } - - d +# NOTE: This function requires the 'klmeasure' binary from: +# http://research.cs.aalto.fi/pml/software/dredviz/ +smoothed.pr <- function(Dx, Dy, k) { + # Create SOM_PAK file for Dx + Dx.fname <- tempfile() + Dx.f <- file(Dx.fname, "w") + cat(sprintf("%d\n", ncol(Dx)), file=Dx.f) + write.table(Dx, Dx.f, col.names=F, row.names=F) + close(Dx.f) + + # Create SOM_PAK file for Dy + Dy.fname <- tempfile() + Dy.f <- file(Dy.fname, "w") + cat(sprintf("%d\n", ncol(Dy)), file=Dy.f) + write.table(Dy, Dy.f, col.names=F, row.names=F) + close(Dy.f) + + output <- system2("./klmeasure", + stdout=T, + args=c("--datadist", Dx.fname, + "--projdist", Dy.fname, + "--neighbors", sprintf("%d", k))) + output <- strsplit(output[2], " | ", fixed=T) + output <- unlist(output) + + file.remove(Dx.fname, Dy.fname) + + list(s.precision=as.double(output[1]), s.recall=as.double(output[2])) } @@ -0,0 +1,160 @@ +# Functions for plotting results from manipulation experiments. +# NOTE: This script should only be used after results are generated with run.R + +require(cowplot) +require(gridExtra) +require(logging) + +# Plots results for experiments of all techniques for a given measure and +# datasset. +plot.measure <- function(ds, techniques, output.dir, measure, n.iter=30) { + measure.df <- data.frame() + scale.labels <- c() + + for (tech in techniques) { + Y.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Y.tbl", sep="-"))) + Ym.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Ym.tbl", sep="-"))) + + measure.df <- rbind(measure.df, + data.frame(name=tech$name.pretty, + type=paste("Y", tech$name, sep="."), + V1=Ym.measure - Y.measure) + ) + + scale.labels <- c(scale.labels, tech$name.pretty) + } + + p <- ggplot(measure.df) + + background_grid(major="xy", minor="none") + + theme(legend.position="none") + + labs(x="", y=measure$name.pretty) + + geom_boxplot(aes(type, V1, fill=name)) + + scale_fill_brewer(palette="Set1", guide=guide_legend(title="")) + + scale_y_continuous(limits=c(min(0, min(measure.df$V1)), max(0, max(measure.df$V1)))) + + scale_x_discrete(labels=scale.labels) + + fname <- file.path(output.dir, "plots", ds$name, paste(measure$name, "pdf", sep=".")) + loginfo("Saving plot: %s", fname) + save_plot(fname, p, base_aspect_ratio=1.3) +} + +# Plot boxplots of techniques, one per measure diff per dataset. +plot.measures <- function(datasets, techniques, measures, output.dir, n.iter=30) { + dir.create.safe(file.path(output.dir, "plots")) + + for (ds in datasets) { + dir.create.safe(file.path(output.dir, "plots", ds$name)) + for (measure in measures) { + if (is.null(ds$labels.file) && measure$name == "silhouette") { + next + } + + plot.measure(ds, techniques, output.dir, measure, n.iter) + } + } +} + +# Same as above, but averages over all datasets. +plot.averages <- function(datasets, techniques, measures, output.dir, n.iter=30) { + dir.create.safe(file.path(output.dir, "plots")) + + for (measure in measures) { + measure.df <- data.frame() + scale.labels <- c() + for (tech in techniques) { + measure.avg <- rep(0, n.iter) + scale.labels <- c(scale.labels, tech$name.pretty) + + for (ds in datasets) { + if (is.null(ds$labels.file) && measure$name == "silhouette") { + next + } + + Y.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Y.tbl", sep="-")))$V1 + Ym.measure <- read.table(file.path(output.dir, ds$name, tech$name, paste(measure$name, "Ym.tbl", sep="-")))$V1 + measure.avg <- measure.avg + (Ym.measure - Y.measure) + } + measure.avg <- measure.avg / length(datasets) + measure.df <- rbind(measure.df, data.frame(tech=tech$name, V1=measure.avg)) + } + + p <- ggplot(measure.df) + + background_grid(major="xy", minor="none") + + theme(legend.position="none") + + labs(x="", y=measure$name.pretty) + + geom_boxplot(aes(tech, V1, fill=tech)) + + scale_fill_brewer(palette="Set1", guide=guide_legend(title="")) + + scale_y_continuous(limits=c(min(0, min(measure.df$V1)), max(0, max(measure.df$V1)))) + + scale_x_discrete(labels=scale.labels) + + fname <- file.path(output.dir, "plots", paste(measure$name, "pdf", sep=".")) + loginfo("Saving plot: %s", fname) + save_plot(fname, p, base_aspect_ratio=1.3) + } +} + +# Plot a single scatterplot of techniques and datasets, where x axis is the +# measure before manipulation and y axis is the measure after manipulation. +# Also adds a y=x line so that visual inspection is easier. +plot.scatter.measure <- function(measure, datasets, techniques, output.dir, n.iter=30) { + measure.df <- data.frame() + for (tech in techniques) { + for (ds in datasets) { + if (is.null(ds$labels.file) && measure$name == "silhouette") { + next + } + + base.path <- file.path(output.dir, ds$name, tech$name) + fname <- file.path(base.path, paste(measure$name, "Y.tbl", sep="-")) + Y.measure <- read.table(fname)$V1 + fname <- file.path(base.path, paste(measure$name, "Ym.tbl", sep="-")) + Ym.measure <- read.table(fname)$V1 + measure.df <- rbind(measure.df, data.frame(tech=tech$name.pretty, + dataset=ds$name.pretty, + x=mean(Y.measure), + y=mean(Ym.measure))) + } + } + + min.max <- min(max(measure.df$x), max(measure.df$y)) + p <- ggplot(measure.df) + + background_grid(major="xy", minor="none") + + theme(legend.position="right") + + labs(x=paste(measure$name.pretty, "(before)", sep=" "), + y=paste(measure$name.pretty, "(after)", sep=" ")) + + geom_point(aes(x=x, y=y, color=tech, shape=dataset), alpha=0.8, size=3) + + scale_color_brewer(palette="Set1", guide=guide_legend(title="Technique")) + + scale_shape(guide=guide_legend(title="Dataset")) + + geom_abline(intercept=0, slope=1) + + fname <- file.path(output.dir, "plots", paste(measure$name, "-scatter", ".pdf", sep="")) + loginfo("Saving plot: %s", fname) + save_plot(fname, p, base_aspect_ratio=1.5) + + p +} + +# This function runs the scatterplot function above for all measures +plot.scatter <- function(datasets, techniques, measures, output.dir, n.iter=30) { + dir.create.safe(file.path(output.dir, "plots")) + + for (measure in measures) { + p <- plot.scatter.measure(measure, datasets, techniques, output.dir, n.iter) + } +} + +# Experiment configuration +# Defines: datasets, techniques, measures, output.dir +source("config.R") + +args <- commandArgs(T) + +# Logging setup +basicConfig() +addHandler(writeToFile, + file=args[1], + level="FINEST") + +plot.measures(datasets, techniques, measures, output.dir) +plot.averages(datasets, techniques, measures, output.dir) +plot.scatter(datasets, techniques, measures, output.dir) @@ -0,0 +1,103 @@ +# NOTE: This script should only be used after results are generated with run.R + +library(grid) +library(gridExtra) +library(ggplot2) +library(mp) + +get_legend <- function(p) { + tmp <- ggplot_gtable(ggplot_build(p)) + leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") + legend <- tmp$grobs[[leg]] + legend +} + +args <- commandArgs(T) + +dataset <- args[1] +tech <- args[2] +iter <- args[3] +measure <- args[4] +out.file <- args[5] + +X.file <- file.path("datasets", paste(dataset, "tbl", sep=".")) +l.file <- file.path("datasets", paste(dataset, "labels", sep=".")) +Y.file <- file.path("results", dataset, tech, paste("Y-", iter, ".tbl", sep="")) +sample.indices.file <- file.path("results", dataset, paste("sample-indices-", iter, ".tbl", sep="")) +Ys.file <- file.path("results", dataset, paste("Ys-", iter, ".tbl", sep="")) +Ysm.file <- file.path("results", dataset, paste("Ysm-", measure, "-", iter, ".tbl", sep="")) + +X <- read.table(X.file) +Y <- read.table(Y.file) +l <- as.factor(read.table(l.file)$V1) +l <- l[!duplicated(X)] +X <- scale(unique(X)) + +sample.indices <- read.table(sample.indices.file)$V1 +ls <- l[sample.indices] +Ys <- read.table(Ys.file) +Ysm <- read.table(Ysm.file) +Ym <- pekalska(dist(X), sample.indices, Ysm) + +dfYs <- data.frame(x=Ys[,1], y=Ys[,2], Labels=ls) +pYs <- ggplot(dfYs) + + theme_minimal() + + theme(legend.position="none", + plot.background=element_rect(fill="#ffffff", color="#000000"), + axis.text=element_blank(), + axis.title=element_blank(), + panel.grid=element_blank()) + + geom_point(aes(x=x, y=y, color=Labels), size=0.5, alpha=0.8) + + scale_color_brewer(palette="Set1") +pYs <- ggplotGrob(pYs) + +dfYsm <- data.frame(x=Ysm[,1], y=Ysm[,2], Labels=ls) +pYsm <- ggplot(dfYsm) + + theme_minimal() + + theme(legend.position="none", + plot.background=element_rect(fill="#ffffff", color="#000000"), + axis.text=element_blank(), + axis.title=element_blank(), + panel.grid=element_blank()) + + geom_point(aes(x=x, y=y, color=Labels), size=0.5, alpha=0.8) + + scale_color_brewer(palette="Set1") +pYsm <- ggplotGrob(pYsm) + +dfY <- data.frame(x=Y[,1], y=Y[,2], Labels=l) +pY <- ggplot(dfY) + + theme_minimal() + + theme(legend.position="bottom", + legend.title=element_blank(), + legend.text=element_text(size=14), + legend.background=element_rect(fill="#ffffff", color="#000000"), + axis.text=element_blank(), + axis.title=element_blank(), + panel.grid=element_blank()) + + geom_point(aes(x=x, y=y, color=Labels), alpha=0.8) + + scale_color_brewer(palette="Set1") + + annotation_custom(grob = pYs, + xmin=min(dfY$x), xmax=min(dfY$x) + (max(dfY$x) - min(dfY$x)) / 3, + ymin=max(dfY$y) - (max(dfY$y) - min(dfY$y)) / 3, ymax=max(dfY$y)) + +legend <- get_legend(pY) +pY <- pY + theme(legend.position="none") + +dfYm <- data.frame(x=Ym[,1], y=Ym[,2], Labels=l) +pYm <- ggplot(dfYm) + + theme_minimal() + + theme(legend.position="none", + axis.text=element_blank(), + axis.title=element_blank(), + panel.grid=element_blank()) + + geom_point(aes(x=x, y=y, color=Labels), alpha=0.8) + + scale_color_brewer(palette="Set1") + + annotation_custom(grob = pYsm, + xmin=min(dfYm$x), xmax=min(dfYm$x) + (max(dfYm$x) - min(dfYm$x)) / 3, + ymin=min(dfYm$y), ymax=min(dfYm$y) + (max(dfYm$y) - min(dfYm$y)) / 3) + +p <- grid.arrange(pY, pYm, legend, + ncol=2, nrow=2, + layout_matrix=rbind(c(3, 3), c(1, 2)), + widths=c(5, 5), heights=c(0.5, 4.5)) + +ggsave(out.file, p, width=10, height=5) @@ -1,19 +1,16 @@ -require(ggplot2) -require(gridExtra) +# Main experiments script: performs sampling, manipulation and projections for +# all techniques, datasets and measures. + +require(logging) +require(MASS) require(mp) +require(Rtsne) source("measures.R") -#automated.m <- function(D, labels) { -# D.m <- D -# for (label in unique(labels)) { -# same.label <- labels == label -# D.m[same.label, same.label] <- D[same.label, same.label] * 0.1 -# } -# -# D.m -#} -automated.m <- function(Xs, labels) { +# Performs automated silhouette improvement manipulation, using a method +# inspired by Schaefer et al. (2013). +automated.silh <- function(Xs, labels) { n <- nrow(Xs) p <- ncol(Xs) Xs <- cbind(Xs, matrix(data=0, nrow=n, ncol=p)) @@ -23,228 +20,307 @@ automated.m <- function(Xs, labels) { } } - dist(Xs) + Dx <- dist(Xs) + # Dx <- Dx / mean(Dx) + as.matrix(Dx) } -color_scale.blue_orange <- function(name) { - scale_colour_gradient(name = name, high = "#376092", low = "#e46c0a", space = "Lab") +# NOTE: This function requires the 'klmeasure' binary from: +# http://research.cs.aalto.fi/pml/software/dredviz/ +nerv <- function(Dx, Y, lambda=0.1) { + # Create SOM_PAK file for Dx + Dx.fname <- tempfile() + Dx.f <- file(Dx.fname, "w") + cat(sprintf("%d\n", ncol(Dx)), file=Dx.f) + write.table(Dx, Dx.f, col.names=F, row.names=F) + close(Dx.f) + + # Create SOM_PAK file for Y + Y.fname <- tempfile() + Y.f <- file(Y.fname, "w") + cat(sprintf("%d\n", ncol(Y)), file=Y.f) + write.table(Y, Y.f, col.names=F, row.names=F) + close(Y.f) + + # Run NeRV + Ym.fname <- tempfile() + system2("./nerv", + stdout=F, + stderr=F, + args=c("--inputdist", Dx.fname, + "--outputfile", Ym.fname, + "--init", Y.fname, + "--lambda", sprintf("%.2f", lambda))) + + # Read results from generated file; remove file afterwards + Ym <- read.table(Ym.fname, skip=1) + file.remove(Dx.fname, Y.fname, Ym.fname) + + Ym } -color_scale.gradient2 <- function(name) { - scale_colour_gradient2(name = name, mid = "#dddddd", space = "Lab") +# Wrapper so that we can 'do.call' pekalska as we do with other techniques +pekalska.wrapper <- function(X, sample.indices, Ys) { + pekalska(dist(X), sample.indices, Ys) } -test <- function(file, suffix, output.dir) { - message("Testing dataset: ", file) - dataset <- read.table(file) +# Scales columns of projections so that all values are in [0, 1] +scale.Ys <- function(Ys) { + for (j in 1:ncol(Ys)) { + min.j <- min(Ys[, j]) + max.j <- max(Ys[, j]) + Ys[, j] <- (Ys[, j] - min.j) / (max.j - min.j) + } - # Extract labels - labels <- dataset[, ncol(dataset)] - classes <- as.factor(labels) - X <- dataset[, -ncol(dataset)] + Ys +} +dir.create.safe <- function(path, log=T) { + if (!dir.exists(path)) { + if (log) { + loginfo("Creating directory: %s", path) + } + + dir.create(path) + } +} + +run.manipulation <- function(X, Dx, labels, k, ds, n.iter, output.dir) { n <- nrow(X) - # Calculate distances (X) and normalize - message("\tCalculating dist(X)") - Dx <- dist(X) - Dx <- Dx / mean(Dx) - Dx <- as.matrix(Dx) - - # Sample dataset - sample.indices <- sample(n, 3*sqrt(n)) - classes.s <- as.factor(labels[sample.indices]) - - # Automatic sample positioning - message("\tCalculating Ys") - Dx.s <- Dx[sample.indices, sample.indices] - Ys <- forceScheme(Dx.s) - - # LAMP - message("\tCalculating Y") - Y <- lamp(X, sample.indices, Ys) - - # Calculate distances (Y) and normalize - message("\tCalculating dist(Y)") - Dy <- dist(Y) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - - message("\tCalculating P and Q") - prob <- d2p(Dx^2) - P <- prob$P - Q <- d2p.beta(Dy^2, prob$beta) - - # Calculate measures - message("\tCalculating measures") - np <- NP(Dx, Dy) - silh <- silhouette(Dy, classes) - precision <- klDivergence(Q, P) - recall <- klDivergence(P, Q) - - measures <- rbind(data.frame(mean=mean(np), median=median(np), sd=sd(np)), - data.frame(mean=mean(silh), median=median(silh), sd=sd(silh)), - data.frame(mean=mean(precision), median=median(precision), sd=sd(precision)), - data.frame(mean=mean(recall), median=median(recall), sd=sd(recall))) - write.table(measures, paste(output.dir, suffix, "-measures.csv", sep=""), row.names=F) - - if (!(all(is.finite(np)) && - all(is.finite(silh)) && - all(is.finite(precision)) && - all(is.finite(recall)))) { - stop("Non-finite measures found") + loginfo("Calculating all sample.indices and Ys") + for (iter in 1:n.iter) { + loginfo("Iteration: %02d", iter) + # Sample dataset + sample.indices <- sample(n, max(ncol(X), sqrt(n)*3)) + fname <- paste("sample-indices-", iter, ".tbl", sep="") + write.table(sample.indices, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + # Initial sample positioning + loginfo("Calculating Ys") + Dx.s <- Dx[sample.indices, sample.indices] + Ys <- scale.Ys(cmdscale(Dx.s)) + fname <- paste("Ys-", iter, ".tbl", sep="") + write.table(Ys, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + # Perform manipulation + loginfo("Running manipulation procedures") + + loginfo("Ys.m: Silhouette") + Dx.m <- automated.silh(X[sample.indices, ], labels[sample.indices]) + Ys.silhouette <- scale.Ys(cmdscale(Dx.m)) + Ys.m <- Ys.silhouette + fname <- paste("Ysm-silhouette-", iter, ".tbl", sep="") + write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + loginfo("Ys.m: NP") + Ys.np <- scale.Ys(Rtsne(X[sample.indices, ], perplexity=k)$Y) + Ys.m <- Ys.np + fname <- paste("Ysm-np-", iter, ".tbl", sep="") + write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + loginfo("Ys.m: Stress") + Ys.stress <- scale.Ys(sammon(Dx.s, Ys, tol=1e-20)$points) + Ys.m <- Ys.stress + fname <- paste("Ysm-stress-", iter, ".tbl", sep="") + write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + loginfo("Ys.m: Precision") + Ys.precision <- scale.Ys(nerv(Dx.s, Ys, 0.01)) + Ys.m <- Ys.precision + fname <- paste("Ysm-precision-", iter, ".tbl", sep="") + write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) + + loginfo("Ys.m: Recall") + Ys.recall <- scale.Ys(nerv(Dx.s, Ys, 0.99)) + Ys.m <- Ys.recall + fname <- paste("Ysm-recall-", iter, ".tbl", sep="") + write.table(Ys.m, file.path(output.dir, ds$name, fname), row.names=F, col.names=F) } +} - # Plot results - message("\tPlotting results") - shape_scale <- scale_shape_manual(name = "Classe", values = 1:nlevels(classes)) - Ys <- cbind(as.data.frame(Ys), classes.s) - Y <- cbind(as.data.frame(Y), classes, np, silh, precision, recall) - p.s <- ggplot(Ys) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes.s, colour = classes.s)) + - shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes)) - ggsave(paste(output.dir, "subsample-", suffix, ".pdf", sep=""), p.s, width=5, height=5) - - p <- ggplot(Y) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = classes)) + - shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes)) - ggsave(paste(output.dir, suffix, ".pdf", sep=""), p, width=5, height=5) - - p.np <- ggplot(Y) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) + - shape_scale + color_scale.blue_orange("NP") - ggsave(paste(output.dir, "np-", suffix, ".pdf", sep=""), p.np, width=5, height=5) - - p.silh <- ggplot(Y) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) + - shape_scale + color_scale.blue_orange("Silhueta") - ggsave(paste(output.dir, "silh-", suffix, ".pdf", sep=""), p.silh, width=5, height=5) - - p.precision <- ggplot(Y) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = precision)) + - shape_scale + color_scale.blue_orange("Precisão") - ggsave(paste(output.dir, "precision-", suffix, ".pdf", sep=""), p.precision, width=5, height=5) - - p.recall <- ggplot(Y) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) + - shape_scale + color_scale.blue_orange("Revocação") - ggsave(paste(output.dir, "recall-", suffix, ".pdf", sep=""), p.recall, width=5, height=5) - - - pdf(paste(output.dir, "all-", suffix, ".pdf", sep=""), width = 12, height = 16) - grid.arrange(p.s, p, p.np, p.silh, p.precision, p.recall, ncol = 2) - dev.off() - - # Perform manipulation - message("\tCalculating Ys.m") - Dx.m <- automated.m(X[sample.indices, ], labels[sample.indices]) - Ys.m <- forceScheme(Dx.m, Ys[, 1:2]) - - # LAMP - message("\tCalculating Y.m") - Y.m <- lamp(X, sample.indices, Ys.m) - - # Calculate distances (Y.m) and normalize - message("\tCalculating dist(Y.m)") - Dy <- dist(Y.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - - message("\tCalculating Q") - Q <- d2p.beta(Dy^2, prob$beta) - - # Calculate measures - message("\tCalculating measures") - np <- NP(Dx, Dy) - np - silh <- silhouette(Dy, classes) - silh - precision <- klDivergence(Q, P) - precision - recall <- klDivergence(P, Q) - recall - - measures <- rbind(data.frame(mean=mean(np), median=median(np), sd=sd(np)), - data.frame(mean=mean(silh), median=median(silh), sd=sd(silh)), - data.frame(mean=mean(precision), median=median(precision), sd=sd(precision)), - data.frame(mean=mean(recall), median=median(recall), sd=sd(recall))) - write.table(measures, paste(output.dir, suffix, "-manip-measures.csv", sep=""), row.names=F) - - if (!(all(is.finite(np)) && - all(is.finite(silh)) && - all(is.finite(precision)) && - all(is.finite(recall)))) { - stop("Non-finite measures found") +run.technique <- function(X, Dx, labels, k, ds, n.iter, output.dir) { + loginfo("Technique: %s", tech$name) + dir.create.safe(file.path(output.dir, ds$name, tech$name)) + + silhouette.Y <- c() + np.Y <- c() + stress.Y <- c() + precision.Y <- c() + recall.Y <- c() + + silhouette.Ym <- c() + np.Ym <- c() + stress.Ym <- c() + precision.Ym <- c() + recall.Ym <- c() + + for (iter in 1:n.iter) { + loginfo("Iteration: %02d", iter) + + # Load sample indices... + fname <- paste("sample-indices-", iter, ".tbl", sep="") + sample.indices <- read.table(file.path(output.dir, ds$name, fname))$V1 + # ... and initial projection + fname <- paste("Ys-", iter, ".tbl", sep="") + Ys <- read.table(file.path(output.dir, ds$name, fname)) + + loginfo("Calculating Y") + Y <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys), tech$args)) + fname <- paste("Y-", iter, ".tbl", sep="") + write.table(Y, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measures + loginfo("Calculating measures") + np.Y <- c(np.Y, mean(NP(Dx, Dy, k))) + silhouette.Y <- c(silhouette.Y, mean(silhouette(Dy, classes))) + stress.Y <- c(stress.Y, stress(Dx, Dy)) + precision.Y <- c(precision.Y, smoothed.pr(Dx, Dy, k)$s.precision) + recall.Y <- c(recall.Y, smoothed.pr(Dx, Dy, k)$s.recall) + + # Testing manipulations + loginfo("Projection using Ysm.silhouette") + fname <- paste("Ysm-silhouette-", iter, ".tbl", sep="") + Ys.m <- read.table(file.path(output.dir, ds$name, fname)) + Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args)) + fname <- paste("Ym-silhouette-", iter, ".tbl", sep="") + write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + silhouette.Ym <- c(silhouette.Ym, mean(silhouette(Dy, classes))) + + + loginfo("Projection using Ysm.np") + fname <- paste("Ysm-np-", iter, ".tbl", sep="") + Ys.m <- read.table(file.path(output.dir, ds$name, fname)) + Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args)) + fname <- paste("Ym-np-", iter, ".tbl", sep="") + write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + np.Ym <- c(np.Ym, mean(NP(Dx, Dy, k))) + + + loginfo("Projection using Ysm.stress") + fname <- paste("Ysm-stress-", iter, ".tbl", sep="") + Ys.m <- read.table(file.path(output.dir, ds$name, fname)) + Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args)) + fname <- paste("Ym-stress-", iter, ".tbl", sep="") + write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + stress.Ym <- c(stress.Ym, stress(Dx, Dy)) + + + loginfo("Projection using Ysm.precision") + fname <- paste("Ysm-precision-", iter, ".tbl", sep="") + Ys.m <- read.table(file.path(output.dir, ds$name, fname)) + Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args)) + fname <- paste("Ym-precision-", iter, ".tbl", sep="") + write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + precision.Ym <- c(precision.Ym, smoothed.pr(Dx, Dy, k)$s.precision) + + + loginfo("Projection using Ysm.recall") + fname <- paste("Ysm-recall-", iter, ".tbl", sep="") + Ys.m <- read.table(file.path(output.dir, ds$name, fname)) + Y.m <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys.m), tech$args)) + fname <- paste("Ym-recall-", iter, ".tbl", sep="") + write.table(Y.m, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + + Dy <- dist(Y.m) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + recall.Ym <- c(recall.Ym, smoothed.pr(Dx, Dy, k)$s.recall) } - # Plot results - message("\tPlotting results") - Ys.m <- cbind(as.data.frame(Ys.m), classes.s) - Y.m <- cbind(as.data.frame(Y.m), classes, np, silh, precision, recall) - pm.s <- ggplot(Ys.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes.s, colour = classes.s)) + - shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes)) - ggsave(paste(output.dir, "manip-subsample-", suffix, ".pdf", sep=""), pm.s, width=5, height=5) - - pm <- ggplot(Y.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = classes)) + - shape_scale + scale_color_manual(name = "Classe", values = 1:nlevels(classes)) - ggsave(paste(output.dir, "manip-", suffix, ".pdf", sep=""), pm, width=5, height=5) - - p.np <- ggplot(Y.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = np)) + - shape_scale + color_scale.gradient2("NP") - ggsave(paste(output.dir, "manip-np-", suffix, ".pdf", sep=""), p.np, width=5, height=5) - - p.silh <- ggplot(Y.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = silh)) + - shape_scale + color_scale.gradient2("Silhueta") - ggsave(paste(output.dir, "manip-silh-", suffix, ".pdf", sep=""), p.silh, width=5, height=5) - - p.precision <- ggplot(Y.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = precision)) + - shape_scale + color_scale.gradient2("Precisão") - ggsave(paste(output.dir, "manip-precision-", suffix, ".pdf", sep=""), p.precision, width=5, height=5) - - p.recall <- ggplot(Y.m) + - theme_bw() + - labs(x = "", y = "") + - geom_point(aes(x = V1, y = V2, shape = classes, colour = recall)) + - shape_scale + color_scale.gradient2("Revocação") - ggsave(paste(output.dir, "manip-recall-", suffix, ".pdf", sep=""), p.recall, width=5, height=5) - - pdf(paste(output.dir, "original-manip-", suffix, ".pdf", sep=""), width = 10, height = 8) - grid.arrange(p.s, p, pm.s, pm, ncol = 2) - dev.off() - - pdf(paste(output.dir, "manip-measures-", suffix, ".pdf", sep=""), width = 10, height = 8) - grid.arrange(p.np, p.silh, p.precision, p.recall, ncol = 2) - dev.off() - - pdf(paste(output.dir, "manip-all-", suffix, ".pdf", sep=""), width = 12, height = 16) - grid.arrange(pm.s, pm, p.np, p.silh, p.precision, p.recall, ncol = 2) - dev.off() + write.table(silhouette.Y, file.path(output.dir, ds$name, tech$name, "silhouette-Y.tbl"), col.names=F, row.names=F) + write.table(np.Y, file.path(output.dir, ds$name, tech$name, "np-Y.tbl"), col.names=F, row.names=F) + write.table(stress.Y, file.path(output.dir, ds$name, tech$name, "stress-Y.tbl"), col.names=F, row.names=F) + write.table(precision.Y, file.path(output.dir, ds$name, tech$name, "precision-Y.tbl"), col.names=F, row.names=F) + write.table(recall.Y, file.path(output.dir, ds$name, tech$name, "recall-Y.tbl"), col.names=F, row.names=F) + + write.table(silhouette.Ym, file.path(output.dir, ds$name, tech$name, "silhouette-Ym.tbl"), col.names=F, row.names=F) + write.table(np.Ym, file.path(output.dir, ds$name, tech$name, "np-Ym.tbl"), col.names=F, row.names=F) + write.table(stress.Ym, file.path(output.dir, ds$name, tech$name, "stress-Ym.tbl"), col.names=F, row.names=F) + write.table(precision.Ym, file.path(output.dir, ds$name, tech$name, "precision-Ym.tbl"), col.names=F, row.names=F) + write.table(recall.Ym, file.path(output.dir, ds$name, tech$name, "recall-Ym.tbl"), col.names=F, row.names=F) } -#test(file = "datasets/iris.tbl", suffix = "iris", "tests/") -test(file = "datasets/wdbc.tbl", suffix = "wdbc", "tests/") -test(file = "datasets/segmentation.tbl", suffix = "segmentation", "tests/") -test(file = "datasets/glass.tbl", suffix = "glass", "tests/") +run <- function(datasets, + techniques, + output.dir, + kf=function(n) as.integer(min(sqrt(n), 0.05*n)), + n.iter=30, + intial.manipulation=T) { + dir.create.safe(output.dir) + + for (ds in datasets) { + loginfo("Testing dataset: %s", ds$name) + dir.create.safe(file.path(output.dir, ds$name)) + + # Load and clean data by removing duplicates, center and scale + X <- read.table(ds$data.file) + if (!is.null(ds$labels.file)) { + labels <- read.table(ds$labels.file)$V1 + labels <- labels[!duplicated(X)] + classes <- as.factor(labels) + } + + X <- unique(X) + if (ds$scale) { + X <- scale(X) + } + + n <- nrow(X) + k <- kf(n) + + # Calculate distances (X) and normalize + loginfo("Calculating dist(X)") + Dx <- dist(X) + Dx <- Dx / mean(Dx) + Dx <- as.matrix(Dx) + + # Generate samples, initial projections and all manipulations + if (intial.manipulation) { + run.manipulation(X, Dx, labels, k, ds, n.iter, output.dir) + } + + # Test techniques + for (tech in techniques) { + run.technique(X, Dx, labels, k, ds, tech, n.iter, output.dir) + } + } +} + + +# Experiment configuration +# Defines: datasets, techniques, output.dir +source("config.R") + +args <- commandArgs(T) + +# Logging setup +basicConfig() +addHandler(writeToFile, + file=args[1], + level="FINEST") + +# The alpha and omega +run(datasets, techniques, output.dir=output.dir, initial.manipulation=F) |