From 4b99b752fec3f67667a17a066ae3c1a1abaea181 Mon Sep 17 00:00:00 2001 From: Samuel Fadel Date: Thu, 25 Aug 2016 18:10:59 -0300 Subject: Added measure evolution and removed relative improvement experiments. --- run.R | 482 ++++++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 321 insertions(+), 161 deletions(-) diff --git a/run.R b/run.R index bc44d63..80234e1 100644 --- a/run.R +++ b/run.R @@ -2,6 +2,7 @@ # # Main experiments script. +library(cluster) library(logging) library(MASS) library(mp) @@ -66,6 +67,27 @@ pekalska.wrapper <- function(X, sample.indices, Ys) { pekalska(dist(X), sample.indices, Ys) } +# Computes a random projection of a data matrix +random.projection <- function(X, k=2, fixed.seed=T) { + if (fixed.seed) { + set.seed(12345) + } + + X <- as.matrix(X) + + # Not sure if factor is right for k > 2, but we use only k=2 for now + factor <- sqrt(3) / sqrt(2) + P <- matrix(sample(0:5, ncol(X)*k, replace=T), ncol=k) + i.zeros <- P == 0 + i.ones <- P == 1 + i.other <- P > 1 + P[i.zeros] <- factor + P[i.ones] <- -factor + P[i.other] <- 0 + + X %*% P +} + # Scales columns of projections so that all values are in [0, 1] scale.Ys <- function(Ys) { for (j in 1:ncol(Ys)) { @@ -77,6 +99,16 @@ scale.Ys <- function(Ys) { Ys } +# Extracts a "good" CP selection +extract.CPs <- function(Dx, k=-1) { + if (k <= 0) { + n <- nrow(Dx) + k <- as.integer(sqrt(n)*3) + } + + pam(Dx, k)$id.med +} + # Generates samples (one sample per iteration) and performs automated # manipulation for all measures on each sample. run.manipulation <- function(X, Dx, labels, k, ds, n.iter, output.dir) { @@ -137,6 +169,8 @@ run.technique <- function(X, Dx, labels, k, ds, n.iter, output.dir) { loginfo("Technique: %s", tech$name) dir.create.safe(file.path(output.dir, ds$name, tech$name)) + classes <- as.factor(labels) + silhouette.Y <- c() np.Y <- c() stress.Y <- c() @@ -257,6 +291,8 @@ run.technique <- function(X, Dx, labels, k, ds, n.iter, output.dir) { write.table(recall.Ym, file.path(output.dir, ds$name, tech$name, "recall-Ym.tbl"), col.names=F, row.names=F) } +# The control points improvement experiment; n.iter sets of control points per +# dataset. run <- function(datasets, techniques, output.dir, @@ -274,7 +310,6 @@ run <- function(datasets, if (!is.null(ds$labels.file)) { labels <- read.table(ds$labels.file)$V1 labels <- labels[!duplicated(X)] - classes <- as.factor(labels) } X <- unique(X) @@ -303,6 +338,290 @@ run <- function(datasets, } } +# Generates the base random CP projection and target manipulated projections for +# each measure. +run.manipulation.evo <- function(X, Dx, labels, sample.indices, k, ds, output.dir) { + Dx.s <- Dx[sample.indices, sample.indices] + + # Initial sample positioning + loginfo("Calculating Ys.i") + Ys.i <- random.projection(X[sample.indices,], fixed.seed=T) + Ys.i <- scale.Ys(Ys.i) + write.table(Ys.i, file.path(output.dir, ds$name, "Ysi.tbl"), row.names=F, col.names=F) + + # Perform manipulation + loginfo("Running manipulation procedures") + + loginfo("Ys.f: Silhouette") + Dx.m <- automated.silh(X[sample.indices, ], labels[sample.indices]) + Ys.silhouette <- scale.Ys(cmdscale(Dx.m)) + Ys.m <- Ys.silhouette + write.table(Ys.m, file.path(output.dir, ds$name, "Ysf-silhouette.tbl"), row.names=F, col.names=F) + + loginfo("Ys.f: NP") + Ys.np <- scale.Ys(Rtsne(X[sample.indices, ], perplexity=k)$Y) + Ys.m <- Ys.np + write.table(Ys.m, file.path(output.dir, ds$name, "Ysf-np.tbl"), row.names=F, col.names=F) + + loginfo("Ys.f: Stress") + Ys.stress <- scale.Ys(sammon(Dx.s, Ys.i, tol=1e-20)$points) + Ys.m <- Ys.stress + write.table(Ys.m, file.path(output.dir, ds$name, "Ysf-stress.tbl"), row.names=F, col.names=F) + + loginfo("Ys.f: Precision") + Ys.precision <- scale.Ys(nerv(Dx.s, Ys.i, 0.01)) + Ys.m <- Ys.precision + write.table(Ys.m, file.path(output.dir, ds$name, "Ysf-precision.tbl"), row.names=F, col.names=F) + + loginfo("Ys.f: Recall") + Ys.recall <- scale.Ys(nerv(Dx.s, Ys.i, 0.99)) + Ys.m <- Ys.recall + write.table(Ys.m, file.path(output.dir, ds$name, "Ysf-recall.tbl"), row.names=F, col.names=F) +} + +# Produces Y for each interpolation step using the given technique and dataset. +run.technique.evo <- function(X, Dx, labels, k, ds, tech, n.samples, output.dir) { + loginfo("Technique: %s", tech$name) + dir.create.safe(file.path(output.dir, ds$name, tech$name)) + + classes <- as.factor(labels) + + # Load sample indices... + sample.indices <- read.table(file.path(output.dir, ds$name, "sample-indices.tbl"))$V1 + Dx.s <- Dx[sample.indices, sample.indices] + # ... and initial projection + Ys.i <- read.table(file.path(output.dir, ds$name, "Ysi.tbl")) + + alphas <- 0:(n.samples - 1)/(n.samples - 1) + + loginfo("Computing targets for measures") + for (measure in measures) { + if (is.null(ds$labels.file) && measure$name == "silhouette") { + next + } + + loginfo("Measure: %s", measure$name.pretty) + fname <- paste("Ysf-", measure$name, ".tbl", sep="") + Ys.f <- read.table(file.path(output.dir, ds$name, fname)) + + for (iter in 1:n.samples) { + loginfo("Calculating Y (%02d of %02d)", iter, n.samples) + + alpha <- alphas[iter] + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + Y <- do.call(tech$fn, append(list(X=X, sample.indices=sample.indices, Ys=Ys), tech$args)) + fname <- paste("Y-evo-", measure$name, "-", iter, ".tbl", sep="") + write.table(Y, file.path(output.dir, ds$name, tech$name, fname), row.names=F, col.names=F) + } + } + + silhouette.Ys <- c() + np.Ys <- c() + stress.Ys <- c() + precision.Ys <- c() + recall.Ys <- c() + + silhouette.Y <- c() + np.Y <- c() + stress.Y <- c() + precision.Y <- c() + recall.Y <- c() + + loginfo("Computing measures") + for (iter in 1:n.samples) { + loginfo("Iteration: %02d", iter) + alpha <- alphas[iter] + + if (!is.null(ds$labels.file)) { + # Silhouette -------------------------------------------------------------- + Ys.f <- read.table(file.path(output.dir, ds$name, "Ysf-silhouette.tbl")) + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + + # Calculate distances (Ys) and normalize + loginfo("Calculating distances (Ys)") + Dy.s <- dist(Ys) + Dy.s <- Dy.s / mean(Dy.s) + Dy.s <- as.matrix(Dy.s) + + fname <- paste("Y-evo-silhouette-", iter, ".tbl", sep="") + Y <- read.table(file.path(output.dir, ds$name, tech$name, fname)) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances (Y)") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measure + loginfo("Calculating silhouette for Ys and Y") + silhouette.Ys <- c(silhouette.Ys, mean(silhouette(Dy.s, classes[sample.indices]))) + silhouette.Y <- c(silhouette.Y, mean(silhouette(Dy, classes))) + } + + # NP ---------------------------------------------------------------------- + Ys.f <- read.table(file.path(output.dir, ds$name, "Ysf-np.tbl")) + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + + # Calculate distances (Ys) and normalize + loginfo("Calculating distances (Ys)") + Dy.s <- dist(Ys) + Dy.s <- Dy.s / mean(Dy.s) + Dy.s <- as.matrix(Dy.s) + + fname <- paste("Y-evo-np-", iter, ".tbl", sep="") + Y <- read.table(file.path(output.dir, ds$name, tech$name, fname)) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances (Y)") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measure + loginfo("Calculating NP for Ys and Y") + np.Ys <- c(np.Ys, mean(NP(Dx.s, Dy.s, k))) + np.Y <- c(np.Y, mean(NP(Dx, Dy, k))) + + # Stress ------------------------------------------------------------------ + Ys.f <- read.table(file.path(output.dir, ds$name, "Ysf-stress.tbl")) + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + + # Calculate distances (Ys) and normalize + loginfo("Calculating distances (Ys)") + Dy.s <- dist(Ys) + Dy.s <- Dy.s / mean(Dy.s) + Dy.s <- as.matrix(Dy.s) + + fname <- paste("Y-evo-stress-", iter, ".tbl", sep="") + Y <- read.table(file.path(output.dir, ds$name, tech$name, fname)) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances (Y)") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measure + loginfo("Calculating stress for Ys and Y") + stress.Ys <- c(stress.Ys, stress(Dx.s, Dy.s)) + stress.Y <- c(stress.Y, stress(Dx, Dy)) + + # Precision --------------------------------------------------------------- + Ys.f <- read.table(file.path(output.dir, ds$name, "Ysf-precision.tbl")) + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + + # Calculate distances (Ys) and normalize + loginfo("Calculating distances (Ys)") + Dy.s <- dist(Ys) + Dy.s <- Dy.s / mean(Dy.s) + Dy.s <- as.matrix(Dy.s) + + fname <- paste("Y-evo-precision-", iter, ".tbl", sep="") + Y <- read.table(file.path(output.dir, ds$name, tech$name, fname)) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances (Y)") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measure + loginfo("Calculating smoothed precision for Ys and Y") + precision.Ys <- c(precision.Ys, smoothed.pr(Dx.s, Dy.s, k)$s.precision) + precision.Y <- c(precision.Y, smoothed.pr(Dx, Dy, k)$s.precision) + + # Recall ------------------------------------------------------------------ + Ys.f <- read.table(file.path(output.dir, ds$name, "Ysf-recall.tbl")) + Ys <- alpha * Ys.f + (1 - alpha) * Ys.i + + # Calculate distances (Ys) and normalize + loginfo("Calculating distances (Ys)") + Dy.s <- dist(Ys) + Dy.s <- Dy.s / mean(Dy.s) + Dy.s <- as.matrix(Dy.s) + + fname <- paste("Y-evo-recall-", iter, ".tbl", sep="") + Y <- read.table(file.path(output.dir, ds$name, tech$name, fname)) + + # Calculate distances (Y) and normalize + loginfo("Calculating distances (Y)") + Dy <- dist(Y) + Dy <- Dy / mean(Dy) + Dy <- as.matrix(Dy) + + # Calculate measure + loginfo("Calculating smoothed recall for Ys and Y") + recall.Ys <- c(recall.Ys, smoothed.pr(Dx.s, Dy.s, k)$s.recall) + recall.Y <- c(recall.Y, smoothed.pr(Dx, Dy, k)$s.recall) + } + + if (!is.null(ds$labels.file)) { + write.table(silhouette.Ys, file.path(output.dir, ds$name, tech$name, "silhouette-Ys-evo.tbl"), col.names=F, row.names=F) + } + write.table(np.Ys, file.path(output.dir, ds$name, tech$name, "np-Ys-evo.tbl"), col.names=F, row.names=F) + write.table(stress.Ys, file.path(output.dir, ds$name, tech$name, "stress-Ys-evo.tbl"), col.names=F, row.names=F) + write.table(precision.Ys, file.path(output.dir, ds$name, tech$name, "precision-Ys-evo.tbl"), col.names=F, row.names=F) + write.table(recall.Ys, file.path(output.dir, ds$name, tech$name, "recall-Ys-evo.tbl"), col.names=F, row.names=F) + + if (!is.null(ds$labels.file)) { + write.table(silhouette.Y, file.path(output.dir, ds$name, tech$name, "silhouette-Y-evo.tbl"), col.names=F, row.names=F) + } + write.table(np.Y, file.path(output.dir, ds$name, tech$name, "np-Y-evo.tbl"), col.names=F, row.names=F) + write.table(stress.Y, file.path(output.dir, ds$name, tech$name, "stress-Y-evo.tbl"), col.names=F, row.names=F) + write.table(precision.Y, file.path(output.dir, ds$name, tech$name, "precision-Y-evo.tbl"), col.names=F, row.names=F) + write.table(recall.Y, file.path(output.dir, ds$name, tech$name, "recall-Y-evo.tbl"), col.names=F, row.names=F) +} + +# The control points improvement evolution experiment. +run.evo <- function(datasets, + techniques, + output.dir, + n.samples=30, + intial.manipulation=T, + kf=function(n) as.integer(min(sqrt(n), 0.05*n))) { + dir.create.safe(output.dir) + + for (ds in datasets) { + loginfo("Testing dataset: %s", ds$name) + dir.create.safe(file.path(output.dir, ds$name)) + + # Load and clean data by removing duplicates, center and scale + X <- read.table(ds$data.file) + if (!is.null(ds$labels.file)) { + labels <- read.table(ds$labels.file)$V1 + labels <- labels[!duplicated(X)] + classes <- as.factor(labels) + } + + X <- unique(X) + if (ds$scale) { + X <- scale(X) + } + + n <- nrow(X) + k <- kf(n) + + # Calculate distances (X) and normalize + loginfo("Calculating dist(X)") + Dx <- dist(X) + Dx <- Dx / mean(Dx) + Dx <- as.matrix(Dx) + + loginfo("Extracting control points") + sample.indices <- extract.CPs(Dx) + write.table(sample.indices, file.path(output.dir, ds$name, "sample-indices.tbl"), row.names=F, col.names=F) + + # Computes each manipulation target + run.manipulation.evo(X, Dx, labels, sample.indices, k, ds, output.dir) + + # Test techniques + for (tech in techniques) { + run.technique.evo(X, Dx, labels, k, ds, tech, n.samples, output.dir) + } + } +} + + # Runs all techniques (and only the techniques) to generate all mappings from # the original and manipulated samples. run.Y <- function(datasets, @@ -379,163 +698,6 @@ run.Y <- function(datasets, } } -relative.improvements <- function(datasets, - techniques, - output.dir, - n.iter=30, - kf=function(n) as.integer(min(sqrt(n), 0.05*n))) { - for (ds in datasets) { - loginfo("Dataset: %s", ds$name) - - X <- read.table(ds$data.file) - if (!is.null(ds$labels.file)) { - labels <- read.table(ds$labels.file)$V1 - labels <- labels[!duplicated(X)] - classes <- as.factor(labels) - } - - X <- unique(X) - if (ds$scale) { - X <- scale(X) - } - - n <- nrow(X) - k <- kf(n) - - loginfo("Calculating dist(X)") - Dx <- dist(X) - Dx <- Dx / mean(Dx) - Dx <- as.matrix(Dx) - - # Relative improvements per measure (for all iterations) - rcp.silhouette <- c() - rcp.np <- c() - rcp.stress <- c() - rcp.precision <- c() - rcp.recall <- c() - - for (iter in 1:n.iter) { - loginfo("Iteration: %d", iter) - - fname <- paste("sample-indices-", iter, ".tbl", sep="") - sample.indices <- read.table(file.path(output.dir, ds$name, fname))$V1 - Dx.s <- Dx[sample.indices, sample.indices] - fname <- paste("Ys-", iter, ".tbl", sep="") - Ys <- read.table(file.path(output.dir, ds$name, fname)) - - loginfo("Calculating dist(Ys)") - Dy <- dist(Ys) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - - loginfo("Calculating measures for Ys") - if (!is.null(ds$labels.file)) { - silhouette.Ys <- mean(silhouette(Dy, classes[sample.indices])) - } - np.Ys <- mean(NP(Dx.s, Dy, k)) - stress.Ys <- stress(Dx.s, Dy) - s.pr <- smoothed.pr(Dx.s, Dy, k) - precision.Ys <- s.pr$s.precision - recall.Ys <- s.pr$s.recall - - if (!is.null(ds$labels.file)) { - fname <- paste("Ysm-silhouette-", iter, ".tbl", sep="") - Ys.m <- read.table(file.path(output.dir, ds$name, fname)) - Dy <- dist(Ys.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - silhouette.Ysm <- mean(silhouette(Dy, classes[sample.indices])) - rcp.silhouette <- c(rcp.silhouette, silhouette.Ysm / silhouette.Ys) - } - - fname <- paste("Ysm-np-", iter, ".tbl", sep="") - Ys.m <- read.table(file.path(output.dir, ds$name, fname)) - Dy <- dist(Ys.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - np.Ysm <- mean(NP(Dx.s, Dy, k)) - rcp.np <- c(rcp.np, np.Ysm / np.Ys) - - fname <- paste("Ysm-stress-", iter, ".tbl", sep="") - Ys.m <- read.table(file.path(output.dir, ds$name, fname)) - Dy <- dist(Ys.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - stress.Ysm <- stress(Dx.s, Dy) - rcp.stress <- c(rcp.stress, stress.Ysm / stress.Ys) - - fname <- paste("Ysm-precision-", iter, ".tbl", sep="") - Ys.m <- read.table(file.path(output.dir, ds$name, fname)) - Dy <- dist(Ys.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - precision.Ysm <- smoothed.pr(Dx.s, Dy, k)$s.precision - rcp.precision <- c(rcp.precision, precision.Ysm / precision.Ys) - - fname <- paste("Ysm-recall-", iter, ".tbl", sep="") - Ys.m <- read.table(file.path(output.dir, ds$name, fname)) - Dy <- dist(Ys.m) - Dy <- Dy / mean(Dy) - Dy <- as.matrix(Dy) - recall.Ysm <- smoothed.pr(Dx.s, Dy, k)$s.recall - rcp.recall <- c(rcp.recall, precision.Ysm / precision.Ys) - } - - write.table(rcp.silhouette, file.path(output.dir, ds$name, "r-cp-silhouette.tbl"), col.names=F, row.names=F) - write.table(rcp.np, file.path(output.dir, ds$name, "r-cp-np.tbl"), col.names=F, row.names=F) - write.table(rcp.stress, file.path(output.dir, ds$name, "r-cp-stress.tbl"), col.names=F, row.names=F) - write.table(rcp.precision, file.path(output.dir, ds$name, "r-cp-precision.tbl"), col.names=F, row.names=F) - write.table(rcp.recall, file.path(output.dir, ds$name, "r-cp-recall.tbl"), col.names=F, row.names=F) - - - for (tech in techniques) { - loginfo("Technique: %s", tech$name) - - r.silhouette <- c() - r.np <- c() - r.stress <- c() - r.precision <- c() - r.recall <- c() - - if (!is.null(ds$labels.file)) { - silhouette.Y <- read.table(file.path(output.dir, ds$name, tech$name, "silhouette-Y.tbl"))$V1 - } - np.Y <- read.table(file.path(output.dir, ds$name, tech$name, "np-Y.tbl"))$V1 - stress.Y <- read.table(file.path(output.dir, ds$name, tech$name, "stress-Y.tbl"))$V1 - precision.Y <- read.table(file.path(output.dir, ds$name, tech$name, "precision-Y.tbl"))$V1 - recall.Y <- read.table(file.path(output.dir, ds$name, tech$name, "recall-Y.tbl"))$V1 - - if (!is.null(ds$labels.file)) { - silhouette.Ym <- read.table(file.path(output.dir, ds$name, tech$name, "silhouette-Ym.tbl"))$V1 - } - np.Ym <- read.table(file.path(output.dir, ds$name, tech$name, "np-Ym.tbl"))$V1 - stress.Ym <- read.table(file.path(output.dir, ds$name, tech$name, "stress-Ym.tbl"))$V1 - precision.Ym <- read.table(file.path(output.dir, ds$name, tech$name, "precision-Ym.tbl"))$V1 - recall.Ym <- read.table(file.path(output.dir, ds$name, tech$name, "recall-Ym.tbl"))$V1 - - for (iter in 1:n.iter) { - loginfo("Iteration: %d", iter) - - if (!is.null(ds$labels.file)) { - r.silhouette <- c(r.silhouette, silhouette.Ym[iter] / silhouette.Y[iter]) - } - r.np <- c(r.np, np.Ym[iter] / np.Y[iter]) - r.stress <- c(r.stress, stress.Ym[iter] / stress.Y[iter]) - r.precision <- c(r.precision, precision.Ym[iter] / precision.Y[iter]) - r.recall <- c(r.recall, recall.Ym[iter] / recall.Y[iter]) - } - - if (!is.null(ds$labels.file)) { - write.table(r.silhouette, file.path(output.dir, ds$name, tech$name, "r-silhouette.tbl"), col.names=F, row.names=F) - } - write.table(r.np, file.path(output.dir, ds$name, tech$name, "r-np.tbl"), col.names=F, row.names=F) - write.table(r.stress, file.path(output.dir, ds$name, tech$name, "r-stress.tbl"), col.names=F, row.names=F) - write.table(r.precision, file.path(output.dir, ds$name, tech$name, "r-precision.tbl"), col.names=F, row.names=F) - write.table(r.recall, file.path(output.dir, ds$name, tech$name, "r-recall.tbl"), col.names=F, row.names=F) - } - } -} - # Computes confidence intervals for the difference in measures between # manipulated and original samples. confidence.intervals <- function(datasets, techniques, measures, output.dir, n.iter=30) { @@ -578,9 +740,7 @@ addHandler(writeToFile, # The alpha and omega run(datasets, techniques, output.dir=output.dir, initial.manipulation=F) - -# Compute relative improvements for all datasets and techniques (and samples) -relative.improvements(datasets, techniques, output.dir) +run.evo(datasets, techniques, output.dir=output.dir) # Compute all confidence intervals confidence.intervals(datasets, techniques, measures, output.dir) -- cgit v1.2.3