aboutsummaryrefslogtreecommitdiff
path: root/tests.R
blob: d56114c2da9793e816bcb48409f062d127a665e9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
require(ggplot2)
require(gridExtra)
require(mp)

source("measures.R")

# Shrinks the distances between items that share a label.
#
# Arguments:
#   D           - square distance matrix (one row and one column per item)
#   labels      - one label per row/column of D
#   intra.scale - multiplier applied to same-label distances (default 0.1)
#
# Returns a copy of D in which D[i, j] is multiplied by intra.scale whenever
# labels[i] == labels[j]. Distances between items with different labels are
# returned unchanged.
automated.m <- function(D, labels, intra.scale = 0.1) {
  # A shorter label vector would be silently recycled by logical indexing
  stopifnot(length(labels) == nrow(D), length(labels) == ncol(D))

  D.m <- D
  # Items with a missing label belong to no group and are left untouched
  for (label in unique(labels[!is.na(labels)])) {
    # which() drops NA comparisons, which are not allowed in assignments
    members <- which(labels == label)
    D.m[members, members] <- D[members, members] * intra.scale
  }

  D.m
}

# Converts M to a data frame and names its columns "x" and "y".
xy.df <- function(M) {
  setNames(as.data.frame(M), c("x", "y"))
}

# Runs the projection experiment for one dataset and writes the plots.
#
# The dataset is read with read.table(); its last column is taken as the
# labels and the remaining columns as the data X. A random subset of rows is
# mapped with forceScheme() and all rows are then mapped with lamp() using
# that subset. The same is repeated after the sample distances have been
# modified by automated.m(). For each run the mappings and the NP, precision
# and recall values are plotted; for the manipulated run the measure plots
# show the difference (original minus manipulated).
#
# Arguments:
#   file       - path of the dataset table
#   suffix     - tag inserted into every output file name
#   output.dir - prefix pasted verbatim in front of every output file name
#                (no separator is added, so it should end with "/")
#
# Writes <output.dir><kind><suffix>.pdf and .png for each kind in
# "original-", "measures-original-", "manip-" and "measures-manip-".
#
# Returns the value of the final dev.off() call.
test <- function(file, suffix, output.dir) {
  cat("Testing dataset ", file, "...\n")
  dataset <- read.table(file)

  # The last column holds the labels; the remaining columns are the data.
  # drop = FALSE keeps X a data frame even when a single column remains.
  label.col <- ncol(dataset)
  labels <- dataset[, label.col]
  X <- dataset[, -label.col, drop = FALSE]
  n <- nrow(X)

  # Pairwise distances between the rows of M, divided by their mean
  normalized.dist <- function(M) {
    D <- dist(M)
    as.matrix(D / mean(D))
  }

  # Draws the plots side by side on the current device, then closes it.
  # The on.exit handler closes the device if drawing fails part-way.
  draw.and.close <- function(plots) {
    k <- length(plots)
    closed <- FALSE
    on.exit(if (!closed) dev.off(), add = TRUE)
    do.call(grid.arrange,
            c(plots,
              list(widths = unit(rep_len(3, k), "null"),
                   heights = unit(rep_len(1, k), "null"),
                   ncol = k)))
    closed <- TRUE
    dev.off()
  }

  # Saves the plots as <output.dir><name><suffix>.pdf and .png; the canvas
  # grows with the number of plots (5in / 600px of width per plot).
  save.plots <- function(plots, name) {
    k <- length(plots)
    stem <- paste0(output.dir, name, suffix)
    pdf(paste0(stem, ".pdf"), width = 5 * k, height = 5)
    draw.and.close(plots)
    png(paste0(stem, ".png"), width = 600 * k, height = 600)
    draw.and.close(plots)
  }

  # Scatter plots of the sample mapping and the full mapping, by class
  mapping.plots <- function(Ys, Y) {
    list(
      ggplot(cbind(Ys, classes.s),
             aes(x = x, y = y, colour = classes.s)) +
        geom_point(),
      ggplot(cbind(Y, classes),
             aes(x = x, y = y, colour = classes)) +
        geom_point()
    )
  }

  # Scatter plots of the mapping Y coloured by each of the three measures
  measure.plots <- function(Y, np, precision, recall) {
    list(
      ggplot(cbind(Y, np), aes(x = x, y = y, colour = np)) +
        geom_point() +
        labs(title = "NP (9)"),
      ggplot(cbind(Y, precision), aes(x = x, y = y, colour = precision)) +
        geom_point() +
        labs(title = "Precision"),
      ggplot(cbind(Y, recall), aes(x = x, y = y, colour = recall)) +
        geom_point() +
        labs(title = "Recall")
    )
  }

  # Calculate distances (X) and normalize
  Dx <- normalized.dist(X)

  # Sample size is capped at n so that very small datasets do not request
  # more rows than exist
  sample.indices <- sample(n, min(n, 3 * sqrt(n)))
  Dx.s <- Dx[sample.indices, sample.indices]
  Ys <- xy.df(forceScheme(Dx.s))
  Y <- xy.df(lamp(X, sample.indices, Ys))

  # Plot mapping
  classes <- as.factor(labels)
  classes.s <- as.factor(labels[sample.indices])
  save.plots(mapping.plots(Ys, Y), "original-")

  # Calculate distances (Y) and normalize
  Dy <- normalized.dist(Y)

  # Calculate measures and plot
  sigmas <- rep(1, n)
  P <- d2p(Dx, sigmas)
  Q <- d2p(Dy, sigmas)
  np <- NP(Dx, Dy)
  precision <- klDivergence(Q, P)
  recall <- klDivergence(P, Q)
  save.plots(measure.plots(Y, np, precision, recall), "measures-original-")

  # Perform manipulation
  Dx.m <- automated.m(Dx.s, labels[sample.indices])
  Ys.m <- xy.df(forceScheme(Dx.m))
  Y.m <- xy.df(lamp(X, sample.indices, Ys.m))

  # Plot mapping
  save.plots(mapping.plots(Ys.m, Y.m), "manip-")

  # Calculate distances (Y.m) and normalize
  Dy.m <- normalized.dist(Y.m)
  Q.m <- d2p(Dy.m, sigmas)

  # Measures are plotted as the change relative to the original mapping
  np.diff <- np - NP(Dx, Dy.m)
  precision.diff <- precision - klDivergence(Q.m, P)
  recall.diff <- recall - klDivergence(P, Q.m)
  save.plots(measure.plots(Y.m, np.diff, precision.diff, recall.diff),
             "measures-manip-")
}

# Run the experiment on each dataset; output files are prefixed with "plots/"
test(file = "datasets/iris-std.tbl",
     suffix = "iris",
     output.dir = "plots/")
test(file = "datasets/wdbc.tbl",
     suffix = "wdbc",
     output.dir = "plots/")
test(file = "datasets/segmentation.tbl",
     suffix = "segmentation",
     output.dir = "plots/")
test(file = "datasets/images.tbl",
     suffix = "images",
     output.dir = "plots/")