1 files changed, 468 insertions, 0 deletions
diff --git a/skelftkernel.h b/skelftkernel.h
new file mode 100644
index 0000000..1182181
--- /dev/null
+++ b/skelftkernel.h
@@ -0,0 +1,468 @@
+// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion
+// and Dilation, and Flood Filling Toolkit
+//
+//(c) Alexandru Telea, Univ. of Groningen, 2011
+//
+// This code is adapted from "Parallel Banding Algorithm to compute exact
+// distance transforms with the GPU",
+// T. Cao, K. Tang, A. Mohamed, T. Tan, Proc. ACM Symo. on Interactive 3D
+// Graphics and Games (I3D), 2010, 83-90
+//====================================================================================================================
+
+// MARKER is used to mark blank pixels in the texture. Any uncolored pixels will
+// have x = MARKER.
+// Input texture should have x = MARKER for all pixels other than sites
+#define MARKER -32768
+
+#define TOID(x, y, size) (__mul24((y), (size)) + (x))
+
+// Transpose a square matrix
+__global__ void kernelTranspose(short2 *data, int size) {
+    __shared__ short2 block1[TILE_DIM][TILE_DIM + 1];
+    __shared__ short2 block2[TILE_DIM][TILE_DIM + 1];
+
+    int blockIdx_y = blockIdx.x;
+    int blockIdx_x = blockIdx.x + blockIdx.y;
+
+    if (blockIdx_x >= gridDim.x)
+        return;
+
+    int blkX, blkY, x, y, id1, id2;
+    short2 pixel;
+
+    blkX = __mul24(blockIdx_x, TILE_DIM);
+    blkY = __mul24(blockIdx_y, TILE_DIM);
+
+    x = blkX + threadIdx.x;
+    y = blkY + threadIdx.y;
+    id1 = __mul24(y, size) + x;
+
+    x = blkY + threadIdx.x;
+    y = blkX + threadIdx.y;
+    id2 = __mul24(y, size) + x;
+
+    // read the matrix tile into shared memory
+    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+        int idx = __mul24(i, size);
+        block1[threadIdx.y + i][threadIdx.x] =
+            tex1Dfetch(pbaTexColor, id1 + idx);
+        block2[threadIdx.y + i][threadIdx.x] =
+            tex1Dfetch(pbaTexColor, id2 + idx);
+    }
+
+    __syncthreads();
+
+    // write the transposed matrix tile to global memory
+    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+        int idx = __mul24(i, size);
+        pixel = block1[threadIdx.x][threadIdx.y + i];
+        data[id2 + idx] = make_short2(pixel.y, pixel.x);
+        pixel = block2[threadIdx.x][threadIdx.y + i];
+        data[id1 + idx] = make_short2(pixel.y, pixel.x);
+    }
+}
+
+__global__ void kernelFloodDown(short2 *output, int size, int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+    int id = TOID(tx, ty, size);
+
+    short2 pixel1, pixel2;
+
+    pixel1 = make_short2(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; ++i, id += size) {
+        pixel2 = tex1Dfetch(pbaTexColor, id);
+
+        if (pixel2.x != MARKER)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelFloodUp(short2 *output, int size, int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = (blockIdx.y + 1) * bandSize - 1;
+    int id = TOID(tx, ty, size);
+
+    short2 pixel1, pixel2;
+    int dist1, dist2;
+
+    pixel1 = make_short2(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; i++, id -= size) {
+        dist1 = abs(pixel1.y - ty + i);
+
+        pixel2 = tex1Dfetch(pbaTexColor, id);
+        dist2 = abs(pixel2.y - ty + i);
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelPropagateInterband(short2 *output, int size,
+                                         int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int inc = __mul24(bandSize, size);
+    int ny, nid, nDist;
+    short2 pixel;
+
+    // Top row, look backward
+    int ty = __mul24(blockIdx.y, bandSize);
+    int topId = TOID(tx, ty, size);
+    int bottomId = TOID(tx, ty + bandSize - 1, size);
+
+    pixel = tex1Dfetch(pbaTexColor, topId);
+    int myDist = abs(pixel.y - ty);
+
+    for (nid = bottomId - inc; nid >= 0; nid -= inc) {
+        pixel = tex1Dfetch(pbaTexColor, nid);
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+            if (nDist < myDist)
+                output[topId] = pixel;
+            break;
+        }
+    }
+
+    // Last row, look downward
+    ty = ty + bandSize - 1;
+    pixel = tex1Dfetch(pbaTexColor, bottomId);
+    myDist = abs(pixel.y - ty);
+
+    for (ny = ty + 1, nid = topId + inc; ny < size;
+         ny += bandSize, nid += inc) {
+        pixel = tex1Dfetch(pbaTexColor, nid);
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+            if (nDist < myDist)
+                output[bottomId] = pixel;
+            break;
+        }
+    }
+}
+
+__global__ void kernelUpdateVertical(short2 *output, int size, int band,
+                                     int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+
+    short2 top = tex1Dfetch(pbaTexLinks, TOID(tx, ty, size));
+    short2 bottom = tex1Dfetch(pbaTexLinks, TOID(tx, ty + bandSize - 1, size));
+    short2 pixel;
+
+    int dist, myDist;
+
+    int id = TOID(tx, ty, size);
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        pixel = tex1Dfetch(pbaTexColor, id);
+        myDist = abs(pixel.y - (ty + i));
+
+        dist = abs(top.y - (ty + i));
+        if (dist < myDist) {
+            myDist = dist;
+            pixel = top;
+        }
+
+        dist = abs(bottom.y - (ty + i));
+        if (dist < myDist)
+            pixel = bottom;
+
+        output[id] = pixel;
+    }
+}
+
+// Input: y1 < y2
+__device__ float interpoint(int x1, int y1, int x2, int y2, int x0) {
+    float xM = float(x1 + x2) / 2.0f;
+    float yM = float(y1 + y2) / 2.0f;
+    float nx = x2 - x1;
+    float ny = y2 - y1;
+
+    return yM + nx * (xM - x0) / ny;
+}
+
+__global__ void kernelProximatePoints(short2 *stack, int size, int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y, bandSize);
+    int id = TOID(tx, ty, size);
+    int lasty = -1;
+    short2 last1, last2, current;
+    float i1, i2;
+
+    last1.y = -1;
+    last2.y = -1;
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        current = tex1Dfetch(pbaTexColor, id);
+
+        if (current.x != MARKER) {
+            while (last2.y >= 0) {
+                i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
+                i2 = interpoint(last2.x, lasty, current.x, current.y, tx);
+
+                if (i1 < i2)
+                    break;
+
+                lasty = last2.y;
+                last2 = last1;
+
+                if (last1.y >= 0)
+                    last1 = stack[TOID(tx, last1.y, size)];
+            }
+
+            last1 = last2;
+            last2 = make_short2(current.x, lasty);
+            lasty = current.y;
+
+            stack[id] = last2;
+        }
+    }
+
+    // Store the pointer to the tail at the last pixel of this band
+    if (lasty != ty + bandSize - 1)
+        stack[TOID(tx, ty + bandSize - 1, size)] = make_short2(MARKER, lasty);
+}
+
+__global__ void kernelCreateForwardPointers(short2 *output, int size,
+                                            int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y + 1, bandSize) - 1;
+    int id = TOID(tx, ty, size);
+    int lasty = -1, nexty;
+    short2 current;
+
+    // Get the tail pointer
+    current = tex1Dfetch(pbaTexLinks, id);
+
+    if (current.x == MARKER)
+        nexty = current.y;
+    else
+        nexty = ty;
+
+    for (int i = 0; i < bandSize; i++, id -= size)
+        if (ty - i == nexty) {
+            current = make_short2(lasty, tex1Dfetch(pbaTexLinks, id).y);
+            output[id] = current;
+
+            lasty = nexty;
+            nexty = current.y;
+        }
+
+    // Store the pointer to the head at the first pixel of this band
+    if (lasty != ty - bandSize + 1)
+        output[id + size] = make_short2(lasty, MARKER);
+}
+
+__global__ void kernelMergeBands(short2 *output, int size, int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int band1 = blockIdx.y * 2;
+    int band2 = band1 + 1;
+    int firsty, lasty;
+    short2 last1, last2, current;
+    // last1 and last2: x component store the x coordinate of the site,
+    // y component store the backward pointer
+    // current: y component store the x coordinate of the site,
+    // x component store the forward pointer
+
+    // Get the two last items of the first list
+    lasty = __mul24(band2, bandSize) - 1;
+    last2 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
+                        tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);
+
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+
+        if (lasty >= 0)
+            last2 =
+                make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
+                            tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);
+        else
+            last2 = make_short2(MARKER, MARKER);
+    }
+
+    if (last2.y >= 0) {
+        // Second item at the top of the stack
+        last1 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, last2.y, size)).x,
+                            tex1Dfetch(pbaTexLinks, TOID(tx, last2.y, size)).y);
+    }
+
+    // Get the first item of the second band
+    firsty = __mul24(band2, bandSize);
+    current = make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                          tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+
+    if (current.y == MARKER) {
+        firsty = current.x;
+
+        if (firsty >= 0)
+            current =
+                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+        else
+            current = make_short2(MARKER, MARKER);
+    }
+
+    float i1, i2;
+
+    // Count the number of item in the second band that survive so far.
+    // Once it reaches 2, we can stop.
+    int top = 0;
+
+    while (top < 2 && current.y >= 0) {
+        // While there's still something on the left
+        while (last2.y >= 0) {
+            i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
+            i2 = interpoint(last2.x, lasty, current.y, firsty, tx);
+
+            if (i1 < i2)
+                break;
+
+            lasty = last2.y;
+            last2 = last1;
+            --top;
+
+            if (last1.y >= 0)
+                last1 = make_short2(
+                    tex1Dfetch(pbaTexColor, TOID(tx, last1.y, size)).x,
+                    output[TOID(tx, last1.y, size)].y);
+        }
+
+        // Update the current pointer
+        output[TOID(tx, firsty, size)] = make_short2(current.x, lasty);
+
+        if (lasty >= 0)
+            output[TOID(tx, lasty, size)] = make_short2(firsty, last2.y);
+
+        last1 = last2;
+        last2 = make_short2(current.y, lasty);
+        lasty = firsty;
+        firsty = current.x;
+
+        top = max(1, top + 1);
+
+        // Advance the current pointer to the next one
+        if (firsty >= 0)
+            current =
+                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+        else
+            current = make_short2(MARKER, MARKER);
+    }
+
+    // Update the head and tail pointer.
+    firsty = __mul24(band1, bandSize);
+    lasty = __mul24(band2, bandSize);
+    current = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));
+
+    if (current.y == MARKER && current.x < 0) { // No head?
+        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));
+
+        if (last1.y == MARKER)
+            current.x = last1.x;
+        else
+            current.x = lasty;
+
+        output[TOID(tx, firsty, size)] = current;
+    }
+
+    firsty = __mul24(band1, bandSize) + bandSize - 1;
+    lasty = __mul24(band2, bandSize) + bandSize - 1;
+    current = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));
+
+    if (current.x == MARKER && current.y < 0) { // No tail?
+        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));
+
+        if (last1.x == MARKER)
+            current.y = last1.y;
+        else
+            current.y = firsty;
+
+        output[TOID(tx, lasty, size)] = current;
+    }
+}
+
+__global__ void kernelDoubleToSingleList(short2 *output, int size) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = blockIdx.y;
+    int id = TOID(tx, ty, size);
+
+    output[id] = make_short2(tex1Dfetch(pbaTexColor, id).x,
+                             tex1Dfetch(pbaTexLinks, id).y);
+}
+
+__global__ void kernelColor(short2 *output, int size) {
+    __shared__ short2 s_last1[BLOCKSIZE], s_last2[BLOCKSIZE];
+    __shared__ int s_lasty[BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = __mul24(blockIdx.x, blockDim.x) + col;
+    int dx, dy, lasty;
+    unsigned int best, dist;
+    short2 last1, last2;
+
+    if (tid == blockDim.y - 1) {
+        lasty = size - 1;
+
+        last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);
+
+        if (last2.x == MARKER) {
+            lasty = last2.y;
+            last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);
+        }
+
+        if (last2.y >= 0)
+            last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);
+
+        s_last1[col] = last1;
+        s_last2[col] = last2;
+        s_lasty[col] = lasty;
+    }
+
+    __syncthreads();
+
+    for (int ty = size - 1 - tid; ty >= 0; ty -= blockDim.y) {
+        last1 = s_last1[col];
+        last2 = s_last2[col];
+        lasty = s_lasty[col];
+
+        dx = last2.x - tx;
+        dy = lasty - ty;
+        best = dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+        while (last2.y >= 0) {
+            dx = last1.x - tx;
+            dy = last2.y - ty;
+            dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+            if (dist > best)
+                break;
+
+            best = dist;
+            lasty = last2.y;
+            last2 = last1;
+
+            if (last2.y >= 0)
+                last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);
+        }
+
+        __syncthreads();
+
+        output[TOID(tx, ty, size)] = make_short2(last2.x, lasty);
+
+        if (tid == blockDim.y - 1) {
+            s_last1[col] = last1;
+            s_last2[col] = last2;
+            s_lasty[col] = lasty;
+        }
+
+        __syncthreads();
+    }
+}
+\ No newline at end of file