// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion
// and Dilation, and Flood Filling Toolkit
//
//(c) Alexandru Telea, Univ. of Groningen, 2011
//
// This code is adapted from "Parallel Banding Algorithm to compute exact
// distance transforms with the GPU",
// T. Cao, K. Tang, A. Mohamed, T. Tan, Proc. ACM Symo. on Interactive 3D
// Graphics and Games (I3D), 2010, 83-90
//====================================================================================================================

// MARKER is used to mark blank pixels in the texture. Any uncolored pixels will
// have x = MARKER.
// Input texture should have x = MARKER for all pixels other than sites
#define MARKER -32768

#define TOID(x, y, size) (__mul24((y), (size)) + (x))

// Transpose a square matrix
__global__ void kernelTranspose(short2 *data, int size) {
    __shared__ short2 block1[TILE_DIM][TILE_DIM + 1];
    __shared__ short2 block2[TILE_DIM][TILE_DIM + 1];

    int blockIdx_y = blockIdx.x;
    int blockIdx_x = blockIdx.x + blockIdx.y;

    if (blockIdx_x >= gridDim.x)
        return;

    int blkX, blkY, x, y, id1, id2;
    short2 pixel;

    blkX = __mul24(blockIdx_x, TILE_DIM);
    blkY = __mul24(blockIdx_y, TILE_DIM);

    x = blkX + threadIdx.x;
    y = blkY + threadIdx.y;
    id1 = __mul24(y, size) + x;

    x = blkY + threadIdx.x;
    y = blkX + threadIdx.y;
    id2 = __mul24(y, size) + x;

    // read the matrix tile into shared memory
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        int idx = __mul24(i, size);
        block1[threadIdx.y + i][threadIdx.x] =
            tex1Dfetch(pbaTexColor, id1 + idx);
        block2[threadIdx.y + i][threadIdx.x] =
            tex1Dfetch(pbaTexColor, id2 + idx);
    }

    __syncthreads();

    // write the transposed matrix tile to global memory
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        int idx = __mul24(i, size);
        pixel = block1[threadIdx.x][threadIdx.y + i];
        data[id2 + idx] = make_short2(pixel.y, pixel.x);
        pixel = block2[threadIdx.x][threadIdx.y + i];
        data[id1 + idx] = make_short2(pixel.y, pixel.x);
    }
}

__global__ void kernelFloodDown(short2 *output, int size, int bandSize) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * bandSize;
    int id = TOID(tx, ty, size);

    short2 pixel1, pixel2;

    pixel1 = make_short2(MARKER, MARKER);

    for (int i = 0; i < bandSize; ++i, id += size) {
        pixel2 = tex1Dfetch(pbaTexColor, id);

        if (pixel2.x != MARKER)
            pixel1 = pixel2;

        output[id] = pixel1;
    }
}

__global__ void kernelFloodUp(short2 *output, int size, int bandSize) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = (blockIdx.y + 1) * bandSize - 1;
    int id = TOID(tx, ty, size);

    short2 pixel1, pixel2;
    int dist1, dist2;

    pixel1 = make_short2(MARKER, MARKER);

    for (int i = 0; i < bandSize; i++, id -= size) {
        dist1 = abs(pixel1.y - ty + i);

        pixel2 = tex1Dfetch(pbaTexColor, id);
        dist2 = abs(pixel2.y - ty + i);

        if (dist2 < dist1)
            pixel1 = pixel2;

        output[id] = pixel1;
    }
}

__global__ void kernelPropagateInterband(short2 *output, int size,
                                         int bandSize) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int inc = __mul24(bandSize, size);
    int ny, nid, nDist;
    short2 pixel;

    // Top row, look backward
    int ty = __mul24(blockIdx.y, bandSize);
    int topId = TOID(tx, ty, size);
    int bottomId = TOID(tx, ty + bandSize - 1, size);

    pixel = tex1Dfetch(pbaTexColor, topId);
    int myDist = abs(pixel.y - ty);

    for (nid = bottomId - inc; nid >= 0; nid -= inc) {
        pixel = tex1Dfetch(pbaTexColor, nid);
        if (pixel.x != MARKER) {
            nDist = abs(pixel.y - ty);
            if (nDist < myDist)
                output[topId] = pixel;
            break;
        }
    }

    // Last row, look downward
    ty = ty + bandSize - 1;
    pixel = tex1Dfetch(pbaTexColor, bottomId);
    myDist = abs(pixel.y - ty);

    for (ny = ty + 1, nid = topId + inc; ny < size;
         ny += bandSize, nid += inc) {
        pixel = tex1Dfetch(pbaTexColor, nid);

        if (pixel.x != MARKER) {
            nDist = abs(pixel.y - ty);
            if (nDist < myDist)
                output[bottomId] = pixel;
            break;
        }
    }
}

__global__ void kernelUpdateVertical(short2 *output, int size, int band,
                                     int bandSize) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * bandSize;

    short2 top = tex1Dfetch(pbaTexLinks, TOID(tx, ty, size));
    short2 bottom = tex1Dfetch(pbaTexLinks, TOID(tx, ty + bandSize - 1, size));
    short2 pixel;

    int dist, myDist;

    int id = TOID(tx, ty, size);

    for (int i = 0; i < bandSize; i++, id += size) {
        pixel = tex1Dfetch(pbaTexColor, id);
        myDist = abs(pixel.y - (ty + i));

        dist = abs(top.y - (ty + i));
        if (dist < myDist) {
            myDist = dist;
            pixel = top;
        }

        dist = abs(bottom.y - (ty + i));
        if (dist < myDist)
            pixel = bottom;

        output[id] = pixel;
    }
}

// Input: y1 < y2
__device__ float interpoint(int x1, int y1, int x2, int y2, int x0) {
    float xM = float(x1 + x2) / 2.0f;
    float yM = float(y1 + y2) / 2.0f;
    float nx = x2 - x1;
    float ny = y2 - y1;

    return yM + nx * (xM - x0) / ny;
}

__global__ void kernelProximatePoints(short2 *stack, int size, int bandSize) {
    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int ty = __mul24(blockIdx.y, bandSize);
    int id = TOID(tx, ty, size);
    int lasty = -1;
    short2 last1, last2, current;
    float i1, i2;

    last1.y = -1;
    last2.y = -1;

    for (int i = 0; i < bandSize; i++, id += size) {
        current = tex1Dfetch(pbaTexColor, id);

        if (current.x != MARKER) {
            while (last2.y >= 0) {
                i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
                i2 = interpoint(last2.x, lasty, current.x, current.y, tx);

                if (i1 < i2)
                    break;

                lasty = last2.y;
                last2 = last1;

                if (last1.y >= 0)
                    last1 = stack[TOID(tx, last1.y, size)];
            }

            last1 = last2;
            last2 = make_short2(current.x, lasty);
            lasty = current.y;

            stack[id] = last2;
        }
    }

    // Store the pointer to the tail at the last pixel of this band
    if (lasty != ty + bandSize - 1)
        stack[TOID(tx, ty + bandSize - 1, size)] = make_short2(MARKER, lasty);
}

__global__ void kernelCreateForwardPointers(short2 *output, int size,
                                            int bandSize) {
    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int ty = __mul24(blockIdx.y + 1, bandSize) - 1;
    int id = TOID(tx, ty, size);
    int lasty = -1, nexty;
    short2 current;

    // Get the tail pointer
    current = tex1Dfetch(pbaTexLinks, id);

    if (current.x == MARKER)
        nexty = current.y;
    else
        nexty = ty;

    for (int i = 0; i < bandSize; i++, id -= size)
        if (ty - i == nexty) {
            current = make_short2(lasty, tex1Dfetch(pbaTexLinks, id).y);
            output[id] = current;

            lasty = nexty;
            nexty = current.y;
        }

    // Store the pointer to the head at the first pixel of this band
    if (lasty != ty - bandSize + 1)
        output[id + size] = make_short2(lasty, MARKER);
}

__global__ void kernelMergeBands(short2 *output, int size, int bandSize) {
    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int band1 = blockIdx.y * 2;
    int band2 = band1 + 1;
    int firsty, lasty;
    short2 last1, last2, current;
    // last1 and last2: x component store the x coordinate of the site,
    // y component store the backward pointer
    // current: y component store the x coordinate of the site,
    // x component store the forward pointer

    // Get the two last items of the first list
    lasty = __mul24(band2, bandSize) - 1;
    last2 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
                        tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);

    if (last2.x == MARKER) {
        lasty = last2.y;

        if (lasty >= 0)
            last2 =
                make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
                            tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);
        else
            last2 = make_short2(MARKER, MARKER);
    }

    if (last2.y >= 0) {
        // Second item at the top of the stack
        last1 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, last2.y, size)).x,
                            tex1Dfetch(pbaTexLinks, TOID(tx, last2.y, size)).y);
    }

    // Get the first item of the second band
    firsty = __mul24(band2, bandSize);
    current = make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
                          tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);

    if (current.y == MARKER) {
        firsty = current.x;

        if (firsty >= 0)
            current =
                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
        else
            current = make_short2(MARKER, MARKER);
    }

    float i1, i2;

    // Count the number of item in the second band that survive so far.
    // Once it reaches 2, we can stop.
    int top = 0;

    while (top < 2 && current.y >= 0) {
        // While there's still something on the left
        while (last2.y >= 0) {
            i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
            i2 = interpoint(last2.x, lasty, current.y, firsty, tx);

            if (i1 < i2)
                break;

            lasty = last2.y;
            last2 = last1;
            --top;

            if (last1.y >= 0)
                last1 = make_short2(
                    tex1Dfetch(pbaTexColor, TOID(tx, last1.y, size)).x,
                    output[TOID(tx, last1.y, size)].y);
        }

        // Update the current pointer
        output[TOID(tx, firsty, size)] = make_short2(current.x, lasty);

        if (lasty >= 0)
            output[TOID(tx, lasty, size)] = make_short2(firsty, last2.y);

        last1 = last2;
        last2 = make_short2(current.y, lasty);
        lasty = firsty;
        firsty = current.x;

        top = max(1, top + 1);

        // Advance the current pointer to the next one
        if (firsty >= 0)
            current =
                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
        else
            current = make_short2(MARKER, MARKER);
    }

    // Update the head and tail pointer.
    firsty = __mul24(band1, bandSize);
    lasty = __mul24(band2, bandSize);
    current = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));

    if (current.y == MARKER && current.x < 0) { // No head?
        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));

        if (last1.y == MARKER)
            current.x = last1.x;
        else
            current.x = lasty;

        output[TOID(tx, firsty, size)] = current;
    }

    firsty = __mul24(band1, bandSize) + bandSize - 1;
    lasty = __mul24(band2, bandSize) + bandSize - 1;
    current = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));

    if (current.x == MARKER && current.y < 0) { // No tail?
        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));

        if (last1.x == MARKER)
            current.y = last1.y;
        else
            current.y = firsty;

        output[TOID(tx, lasty, size)] = current;
    }
}

__global__ void kernelDoubleToSingleList(short2 *output, int size) {
    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int ty = blockIdx.y;
    int id = TOID(tx, ty, size);

    output[id] = make_short2(tex1Dfetch(pbaTexColor, id).x,
                             tex1Dfetch(pbaTexLinks, id).y);
}

__global__ void kernelColor(short2 *output, int size) {
    __shared__ short2 s_last1[BLOCKSIZE], s_last2[BLOCKSIZE];
    __shared__ int s_lasty[BLOCKSIZE];

    int col = threadIdx.x;
    int tid = threadIdx.y;
    int tx = __mul24(blockIdx.x, blockDim.x) + col;
    int dx, dy, lasty;
    unsigned int best, dist;
    short2 last1, last2;

    if (tid == blockDim.y - 1) {
        lasty = size - 1;

        last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);

        if (last2.x == MARKER) {
            lasty = last2.y;
            last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);
        }

        if (last2.y >= 0)
            last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);

        s_last1[col] = last1;
        s_last2[col] = last2;
        s_lasty[col] = lasty;
    }

    __syncthreads();

    for (int ty = size - 1 - tid; ty >= 0; ty -= blockDim.y) {
        last1 = s_last1[col];
        last2 = s_last2[col];
        lasty = s_lasty[col];

        dx = last2.x - tx;
        dy = lasty - ty;
        best = dist = __mul24(dx, dx) + __mul24(dy, dy);

        while (last2.y >= 0) {
            dx = last1.x - tx;
            dy = last2.y - ty;
            dist = __mul24(dx, dx) + __mul24(dy, dy);

            if (dist > best)
                break;

            best = dist;
            lasty = last2.y;
            last2 = last1;

            if (last2.y >= 0)
                last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);
        }

        __syncthreads();

        output[TOID(tx, ty, size)] = make_short2(last2.x, lasty);

        if (tid == blockDim.y - 1) {
            s_last1[col] = last1;
            s_last2[col] = last2;
            s_lasty[col] = lasty;
        }

        __syncthreads();
    }
}