From d7dd95cd9eaa223ff8b86e84e6b1b488ff79bcd5 Mon Sep 17 00:00:00 2001
From: Samuel Fadel <samuelfadel@gmail.com>
Date: Sat, 19 Dec 2015 12:29:33 +0100
Subject: New rendering (VoronoiSplat) -- incomplete.

* Added voronoi-like splatting to points: the same technique from
  Messias et al., (2014)
* It is now possible to change the projection technique during
  runtime (possible, but still requires some work)
---
 main.cpp         |  25 +-
 main.h           |  18 +-
 main_view.qml    |  88 +++++-
 pm.pro           |  34 +++
 skelft.cu        | 869 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 skelft.h         | 106 +++++++
 skelft_core.cpp  | 199 +++++++++++++
 skelftkernel.h   | 468 ++++++++++++++++++++++++++++++
 voronoisplat.cpp | 407 ++++++++++++++++++++++++++
 voronoisplat.h   | 103 +++++++
 10 files changed, 2307 insertions(+), 10 deletions(-)
 create mode 100644 skelft.cu
 create mode 100644 skelft.h
 create mode 100644 skelft_core.cpp
 create mode 100644 skelftkernel.h
 create mode 100644 voronoisplat.cpp
 create mode 100644 voronoisplat.h
diff --git a/main.cpp b/main.cpp
index 81498a7..64560d3 100644
--- a/main.cpp
+++ b/main.cpp
@@ -12,12 +12,14 @@
 #include "mp.h"
 #include "continuouscolorscale.h"
 #include "scatterplot.h"
+#include "voronoisplat.h"
 #include "historygraph.h"
 #include "interactionhandler.h"
 #include "selectionhandler.h"
 #include "effectivenessobserver.h"
 #include "distortionobserver.h"
 #include "npdistortion.h"
+#include "skelft.h"
 
 static QObject *mainProvider(QQmlEngine *engine, QJSEngine *scriptEngine)
 {
@@ -155,12 +157,16 @@ int main(int argc, char **argv)
     m->setSubsample(Ys);
 
     qmlRegisterType<Scatterplot>("PM", 1, 0, "Scatterplot");
+    qmlRegisterType<VoronoiSplat>("PM", 1, 0, "VoronoiSplat");
     qmlRegisterType<HistoryGraph>("PM", 1, 0, "HistoryGraph");
+    qmlRegisterType<InteractionHandler>("PM", 1, 0, "InteractionHandler");
     qmlRegisterSingletonType<Main>("PM", 1, 0, "Main", mainProvider);
 
     // Set up multisampling
     QSurfaceFormat fmt;
     fmt.setSamples(16);
+    fmt.setRenderableType(QSurfaceFormat::OpenGL);
+    fmt.setVersion(4, 5);
     QSurfaceFormat::setDefaultFormat(fmt);
     QQmlApplicationEngine engine(QUrl("qrc:///main_view.qml"));
 
@@ -184,6 +190,8 @@ int main(int argc, char **argv)
     // subsamplePlot->setColorData(arma::zeros<arma::vec>(subsampleSize));
     subsamplePlot->setColorScale(&colorScale);
     Scatterplot *plot = engine.rootObjects()[0]->findChild<Scatterplot *>("plot");
+    VoronoiSplat *splat = engine.rootObjects()[0]->findChild<VoronoiSplat *>("splat");
+    skelft2DInitialization(splat->width());
 
     // Keep track of the current subsample (in order to save them later, if requested)
     QObject::connect(subsamplePlot, SIGNAL(xyChanged(const arma::mat &)),
@@ -191,15 +199,22 @@ int main(int argc, char **argv)
     QObject::connect(subsamplePlot, SIGNAL(xyInteractivelyChanged(const arma::mat &)),
             m, SLOT(setSubsample(const arma::mat &)));
 
-    // Update LAMP projection as the subsample is modified
+    // Update projection as the subsample is modified
     InteractionHandler interactionHandler(X, sampleIndices);
-    interactionHandler.setTechnique(InteractionHandler::TECHNIQUE_LAMP);
+    m->setInteractionHandler(&interactionHandler);
     QObject::connect(subsamplePlot, SIGNAL(xyChanged(const arma::mat &)),
             &interactionHandler, SLOT(setSubsample(const arma::mat &)));
     QObject::connect(subsamplePlot, SIGNAL(xyInteractivelyChanged(const arma::mat &)),
             &interactionHandler, SLOT(setSubsample(const arma::mat &)));
     QObject::connect(&interactionHandler, SIGNAL(subsampleChanged(const arma::mat &)),
             plot, SLOT(setXY(const arma::mat &)));
+    m->setTechnique(InteractionHandler::TECHNIQUE_LAMP);
+
+    // Update splat whenever the main plot is also updated
+    QObject::connect(plot, SIGNAL(xyChanged(const arma::mat &)),
+            splat, SLOT(setPoints(const arma::mat &)));
+    QObject::connect(plot, SIGNAL(colorDataChanged(const arma::vec &)),
+            splat, SLOT(setValues(const arma::vec &)));
 
     // Linking between selections in subsample plot and full dataset plot
     SelectionHandler selectionHandler(sampleIndices);
@@ -237,7 +252,11 @@ int main(int argc, char **argv)
     subsamplePlot->setXY(Ys);
     subsamplePlot->setColorData(labels(sampleIndices));
     plot->setColorScale(&colorScale);
+    splat->setColorScale(&colorScale);
     plot->setColorData(labels);
 
-    return app.exec();
+    auto ret = app.exec();
+
+    skelft2DDeinitialization();
+    return ret;
 }
diff --git a/main.h b/main.h
index 01d62f7..474c7c2 100644
--- a/main.h
+++ b/main.h
@@ -4,6 +4,8 @@
 #include <QObject>
 #include <armadillo>
 
+#include "interactionhandler.h"
+
 class Main : public QObject
 {
     Q_OBJECT
@@ -38,6 +40,16 @@ public:
     Q_INVOKABLE void setSubsampleSavePath(const std::string &path) { m_subsampleSavePath = path; }
     Q_INVOKABLE void setSubsampleSavePath(const QString &path)     { setSubsampleSavePath(path.toStdString()); }
 
+    void setInteractionHandler(InteractionHandler *interactionHandler) {
+        m_interactionHandler = interactionHandler;
+    }
+
+    Q_INVOKABLE void setTechnique(int technique) {
+        if (m_interactionHandler) {
+            m_interactionHandler->setTechnique((InteractionHandler::Technique) technique);
+        }
+    }
+
     arma::mat X() const { return m_dataset.cols(0, m_dataset.n_cols - 2); }
     arma::vec labels() const { return m_dataset.col(m_dataset.n_cols - 1); }
 
@@ -53,13 +65,17 @@ public slots:
     }
 
 private:
-    Main(QObject *parent = 0) : QObject(parent) {}
+    Main(QObject *parent = 0)
+        : QObject(parent)
+        , m_interactionHandler(0)
+    {}
     ~Main() {}
     Q_DISABLE_COPY(Main)
 
     arma::mat m_dataset, m_subsample;
     arma::uvec m_subsampleIndices;
     std::string m_indicesSavePath, m_subsampleSavePath;
+    InteractionHandler *m_interactionHandler;
 };
 
 #endif // MAIN_H
diff --git a/main_view.qml b/main_view.qml
index 0cd29b5..62c3a67 100644
--- a/main_view.qml
+++ b/main_view.qml
@@ -6,10 +6,13 @@ import QtQuick.Layouts 1.2
 import PM 1.0
 
 ApplicationWindow {
+    id: mainWindow
     title: "Projection Manipulation"
     visible: true
-    width: 900
-    height: 600
+    minimumWidth: 900
+    minimumHeight: 600
+    maximumWidth: minimumWidth
+    maximumHeight: minimumHeight
 
     menuBar: MenuBar {
         Menu {
@@ -18,6 +21,26 @@ ApplicationWindow {
             MenuItem { action: quitAction }
         }
 
+        Menu {
+            title: "Technique"
+            MenuItem {
+                action: lampTechniqueAction
+                exclusiveGroup: techniqueGroup
+            }
+            MenuItem {
+                action: lspTechniqueAction
+                exclusiveGroup: techniqueGroup
+            }
+            MenuItem {
+                action: plmpTechniqueAction
+                exclusiveGroup: techniqueGroup
+            }
+            MenuItem {
+                action: pekalskaTechniqueAction
+                exclusiveGroup: techniqueGroup
+            }
+        }
+
         Menu {
             title: "View"
             MenuItem {
@@ -68,10 +91,18 @@ ApplicationWindow {
                 border.width: 1
                 border.color: "#cccccc"
 
+                VoronoiSplat {
+                    id: splat
+                    objectName: "splat"
+                    width: 256
+                    height: 256
+                }
+
                 Scatterplot {
                     id: plot
                     objectName: "plot"
                     anchors.fill: parent
+                    visible: false
                 }
             }
         }
@@ -107,6 +138,51 @@ ApplicationWindow {
         }
     }
 
+    ExclusiveGroup {
+        id: techniqueGroup
+
+        Action {
+            id: lampTechniqueAction
+            text: "LAMP"
+            shortcut: "Ctrl+1"
+            checked: true
+            checkable: true
+            onTriggered: {
+                Main.setTechnique(InteractionHandler.TECHNIQUE_LAMP)
+            }
+        }
+
+        Action {
+            id: lspTechniqueAction
+            text: "LSP"
+            shortcut: "Ctrl+2"
+            checkable: true
+            onTriggered: {
+                Main.setTechnique(InteractionHandler.TECHNIQUE_LSP)
+            }
+        }
+
+        Action {
+            id: plmpTechniqueAction
+            text: "PLMP"
+            shortcut: "Ctrl+3"
+            checkable: true
+            onTriggered: {
+                Main.setTechnique(InteractionHandler.TECHNIQUE_PLMP)
+            }
+        }
+
+        Action {
+            id: pekalskaTechniqueAction
+            text: "Pekalska"
+            shortcut: "Ctrl+4"
+            checkable: true
+            onTriggered: {
+                Main.setTechnique(InteractionHandler.TECHNIQUE_PEKALSKA)
+            }
+        }
+    }
+
     ExclusiveGroup {
         id: coloringGroup
 
@@ -116,7 +192,7 @@ ApplicationWindow {
             shortcut: "Shift+L"
             checked: true
             checkable: true
-            onTriggered: console.log("Labels")
+            onTriggered: console.log("stub: Labels")
         }
 
         Action {
@@ -124,7 +200,7 @@ ApplicationWindow {
             text: "Neighborhood Preservation"
             shortcut: "Shift+N"
             checkable: true
-            onTriggered: console.log("NP")
+            onTriggered: console.log("stub: NP")
         }
 
         Action {
@@ -132,7 +208,7 @@ ApplicationWindow {
             text: "Stress"
             shortcut: "Shift+S"
             checkable: true
-            onTriggered: console.log("Stress")
+            onTriggered: console.log("stub: Stress")
         }
 
         Action {
@@ -140,7 +216,7 @@ ApplicationWindow {
             text: "Silhouette"
             shortcut: "Shift+T"
             checkable: true
-            onTriggered: console.log("Silhouette")
+            onTriggered: console.log("stub: Silhouette")
         }
     }
 }
diff --git a/pm.pro b/pm.pro
index 88a4b61..7cc6bdc 100644
--- a/pm.pro
+++ b/pm.pro
@@ -1,5 +1,7 @@
 QT += qml quick widgets
 
+CONFIG += qt debug
+
 QMAKE_CXXFLAGS += -std=c++11 -fopenmp
 QMAKE_LIBS += -larmadillo -fopenmp
 HEADERS += main.h \
@@ -8,6 +10,7 @@ HEADERS += main.h \
     geometry.h \
     scale.h \
     scatterplot.h \
+    voronoisplat.h \
     historygraph.h \
     interactionhandler.h \
     selectionhandler.h \
@@ -15,18 +18,22 @@ HEADERS += main.h \
     distortionobserver.h \
     distortionmeasure.h \
     npdistortion.h \
+    skelft.h \
+    skelftkernel.h
     mp.h
 SOURCES += main.cpp \
     colorscale.cpp \
     continuouscolorscale.cpp \
     geometry.cpp \
     scatterplot.cpp \
+    voronoisplat.cpp \
     historygraph.cpp \
     interactionhandler.cpp \
     selectionhandler.cpp \
     effectivenessobserver.cpp \
     distortionobserver.cpp \
     npdistortion.cpp \
+    skelft_core.cpp \
     lamp.cpp \
     plmp.cpp \
     knn.cpp \
@@ -34,6 +41,33 @@ SOURCES += main.cpp \
     tsne.cpp \
     measures.cpp \
     dist.cpp
+
+OTHER_FILES += skelft.cu
+
+# Cuda settings
+CUDA_SOURCES += skelft.cu
+CUDA_DIR = "/opt/cuda"
+CUDA_ARCH = sm_30
+NVCC_OPTIONS += --use_fast_math
+SYSTEM_TYPE = 64  # Either 64 or empty
+INCLUDEPATH += $$CUDA_DIR/include
+QMAKE_LIBDIR += $$CUDA_DIR/lib$$SYSTEM_TYPE
+LIBS += -lcuda -lcudart
+
+CONFIG(debug, debug|release) {
+    cuda_dbg.input = CUDA_SOURCES
+    cuda_dbg.output = ${QMAKE_FILE_BASE}_cuda.o
+    cuda_dbg.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG -g $$NVCC_OPTIONS -I$$INCLUDEPATH $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+    cuda_dbg.dependency_type = TYPE_C
+    QMAKE_EXTRA_COMPILERS += cuda_dbg
+} else {
+    cuda.input = CUDA_SOURCES
+    cuda.output = ${QMAKE_FILE_BASE}_cuda.o
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS -I$$INCLUDEPATH $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+    cuda.dependency_type = TYPE_C
+    QMAKE_EXTRA_COMPILERS += cuda
+}
+
 RESOURCES += pm.qrc
 
 target.path = .
diff --git a/skelft.cu b/skelft.cu
new file mode 100644
index 0000000..be7662c
--- /dev/null
+++ b/skelft.cu
@@ -0,0 +1,869 @@
+#include <device_functions.h>
+#include "skelft.h"
+#include <cstdio>
+#include <iostream>
+
+
+#include <thrust/host_vector.h> 
+#include <thrust/device_vector.h> 
+#include <thrust/sort.h>
+
+
+
+using namespace std;
+
+
+// Parameters for CUDA kernel executions; more or less optimized for a 1024x1024 image.
+#define BLOCKX     16
+#define BLOCKY     16
+#define BLOCKSIZE  64
+#define TILE_DIM   32
+#define BLOCK_ROWS 16
+
+
+
+/****** Global Variables *******/
+const int NB = 7;                        // Nr buffers we use and store in the entire framework
+short2 **pbaTextures;                    // Work buffers used to compute and store resident images
+// 0: work buffer
+// 1: FT
+// 2: thresholded DT or exact floating-point DT
+// 3: thresholded skeleton
+// 4: topology analysis
+// 5: work buffer for topology
+// 6: skeleton FT
+//
+
+float*  pbaTexSiteParam;    // Stores boundary parameterization
+float2* pbaTexAdvectSites;
+float*  pbaTexDensityData;  // The density map of the sites
+float2* pbaTexGradientData; // The gradient of the density map
+
+int     pbaTexSize;     // Texture size (squared) actually used in all computations
+int     floodBand  = 4, // Various FT computation parameters; defaults are good for an 1024x1024 image.    
+        maurerBand = 4,
+        colorBand  = 4;        
+
+texture<short2> pbaTexColor;  // 2D textures (bound to various buffers defined above as needed)
+texture<short2> pbaTexColor2;
+texture<short2> pbaTexLinks;
+texture<float>  pbaTexParam;  // 1D site parameterization texture (bound to pbaTexSiteParam)
+texture<unsigned char>
+                pbaTexGray;   // 2D texture of unsigned char values, e.g. the binary skeleton
+texture<float2> pbaTexAdvect;
+texture<float,cudaTextureType2D,cudaReadModeElementType>  pbaTexDensity;
+texture<float2,cudaTextureType2D,cudaReadModeElementType> pbaTexGradient;
+texture<float,cudaTextureType2D,cudaReadModeElementType>  pbaTexDT;
+
+
+__device__  bool fill_gc; //Indicates if a fill-sweep did fill anything or not
+
+
+#if __CUDA_ARCH__ < 110   // We cannot use atomic intrinsics on SM10 or below. Thus, we define these as nop.
+#define atomicInc(a,b) 0  // The default will be that some code e.g. endpoint detection will thus not do anything.
+#endif
+
+
+
+/********* Kernels ********/
+#include "skelftkernel.h"
+
+
+
+// Initialize necessary memory (CPU/GPU sides)
+// - textureSize: The max size of any image we will process until re-initialization
+void skelft2DInitialization(int maxTexSize)
+{
+    int pbaMemSize = maxTexSize * maxTexSize * sizeof(short2); // A buffer has 2 shorts / pixel
+    pbaTextures  = (short2**) malloc(NB * sizeof(short2*));    // We will use NB buffers
+
+    for(int i=0;i<NB;++i)
+       cudaMalloc((void **) &pbaTextures[i], pbaMemSize); // Allocate work buffer 'i' (on GPU)
+
+    cudaMalloc((void**) &pbaTexSiteParam, maxTexSize * maxTexSize * sizeof(float));    // Sites texture (for FT)
+    cudaMalloc((void**) &pbaTexAdvectSites, maxTexSize * maxTexSize * sizeof(float2)); // Sites 2D-coords
+    cudaMalloc((void**) &pbaTexDensityData, maxTexSize * maxTexSize * sizeof(float));
+    cudaMalloc((void**) &pbaTexGradientData, maxTexSize * maxTexSize * sizeof(float2));
+}
+
+
+
+
+// Deallocate all allocated memory
+void skelft2DDeinitialization()
+{
+    for(int i=0;i<NB;++i) cudaFree(pbaTextures[i]); 
+    cudaFree(pbaTexSiteParam);
+    cudaFree(pbaTexAdvectSites);
+    cudaFree(pbaTexDensityData);
+    cudaFree(pbaTexGradientData);
+    free(pbaTextures);
+}
+
+
+// Initialize the Voronoi textures from the sites' encoding texture (parameterization)
+// REMARK: we interpret 'inputVoro' as a 2D texture, as it's much easier/faster like this
+__global__ void kernelSiteParamInit(short2* inputVoro, int size)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (tx<size && ty<size) // Careful not to go outside the image..
+    {
+        int i = TOID(tx,ty,size);
+        // The sites-param has non-zero (parameter) values precisely on boundary points
+        float param = tex1Dfetch(pbaTexParam,i); 
+
+        short2& v = inputVoro[i];
+        // Non-boundary points are marked as 0 in the parameterization. Here we will compute the FT.
+        v.x = v.y = MARKER;
+        // These are points which define the 'sites' to compute the FT/skeleton (thus, have FT==identity)
+        // We could use an if-then-else here, but it's faster with an if-then
+        if (param>0) 
+        {            
+            v.x = tx;
+            v.y = ty;
+        }
+    }
+}
+
+
+
+void skelft2DInitializeInput(float* sites, int size)                                        // Copy input sites from CPU to GPU; Also set up site param initialization in pbaTextures[0]
+{
+    pbaTexSize = size;                                                                        // Size of the actual texture being used in this run; can be smaller than the max-tex-size
+                                                                                            // which was used in skelft2DInitialization()
+
+    cudaMemcpy(pbaTexSiteParam, sites, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyHostToDevice);
+                                                                                            // Pass sites parameterization to CUDA.  Must be done before calling the initialization
+                                                                                            // kernel, since we use the sites-param as a texture in that kernel
+    cudaBindTexture(0, pbaTexParam, pbaTexSiteParam);                                        // Bind the sites-param as a 1D texture so we can quickly index it next
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+    kernelSiteParamInit<<<grid,block>>>(pbaTextures[0],pbaTexSize);                            // Do the site param initialization. This sets up pbaTextures[0]
+    cudaUnbindTexture(pbaTexParam);
+}
+
+
+
+
+
+// In-place transpose a squared texture. 
+// Block orders are modified to optimize memory access. 
+// Point coordinates are also swapped. 
+void pba2DTranspose(short2 *texture)
+{
+    dim3 block(TILE_DIM, BLOCK_ROWS); 
+    dim3 grid(pbaTexSize / TILE_DIM, pbaTexSize / TILE_DIM); 
+
+    cudaBindTexture(0, pbaTexColor, texture); 
+    kernelTranspose<<< grid, block >>>(texture, pbaTexSize); 
+    cudaUnbindTexture(pbaTexColor); 
+}
+
+// Phase 1 of PBA. m1 must divides texture size
+void pba2DPhase1(int m1, short xm, short ym, short xM, short yM) 
+{
+    dim3 block = dim3(BLOCKSIZE);   
+    dim3 grid = dim3(pbaTexSize / block.x, m1); 
+
+    // Flood vertically in their own bands
+    cudaBindTexture(0, pbaTexColor, pbaTextures[0]); 
+    kernelFloodDown<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); 
+    cudaUnbindTexture(pbaTexColor); 
+
+    cudaBindTexture(0, pbaTexColor, pbaTextures[1]); 
+    kernelFloodUp<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); 
+
+    // Passing information between bands
+    grid = dim3(pbaTexSize / block.x, m1); 
+    kernelPropagateInterband<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m1); 
+
+    cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); 
+    kernelUpdateVertical<<< grid, block >>>(pbaTextures[1], pbaTexSize, m1, pbaTexSize / m1); 
+    cudaUnbindTexture(pbaTexLinks); 
+    cudaUnbindTexture(pbaTexColor); 
+}
+
+// Phase 2 of PBA. m2 must divides texture size
+void pba2DPhase2(int m2) 
+{
+    // Compute proximate points locally in each band
+    dim3 block = dim3(BLOCKSIZE);   
+    dim3 grid = dim3(pbaTexSize / block.x, m2); 
+    cudaBindTexture(0, pbaTexColor, pbaTextures[1]); 
+    kernelProximatePoints<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); 
+
+    cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); 
+    kernelCreateForwardPointers<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); 
+
+    // Repeatly merging two bands into one
+    for (int noBand = m2; noBand > 1; noBand /= 2) {
+        grid = dim3(pbaTexSize / block.x, noBand / 2); 
+        kernelMergeBands<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / noBand); 
+    }
+
+    // Replace the forward link with the X coordinate of the seed to remove
+    // the need of looking at the other texture. We need it for coloring.
+    grid = dim3(pbaTexSize / block.x, pbaTexSize); 
+    kernelDoubleToSingleList<<< grid, block >>>(pbaTextures[0], pbaTexSize); 
+    cudaUnbindTexture(pbaTexLinks); 
+    cudaUnbindTexture(pbaTexColor); 
+}
+
+// Phase 3 of PBA. m3 must divides texture size
+void pba2DPhase3(int m3) 
+{
+    dim3 block = dim3(BLOCKSIZE / m3, m3); 
+    dim3 grid = dim3(pbaTexSize / block.x); 
+    cudaBindTexture(0, pbaTexColor, pbaTextures[0]); 
+    kernelColor<<< grid, block >>>(pbaTextures[1], pbaTexSize); 
+    cudaUnbindTexture(pbaTexColor); 
+}
+
+
+
+void skel2DFTCompute(short xm, short ym, short xM, short yM, int floodBand, int maurerBand, int colorBand)
+{
+    pba2DPhase1(floodBand,xm,ym,xM,yM);                                        //Vertical sweep
+
+    pba2DTranspose(pbaTextures[1]);                                            //
+
+    pba2DPhase2(maurerBand);                                                //Horizontal coloring
+
+    pba2DPhase3(colorBand);                                                    //Row coloring
+
+    pba2DTranspose(pbaTextures[1]); 
+}
+
+
+
+
+
+__global__ void kernelThresholdDT(unsigned char* output, int size, float threshold2, short xm, short ym, short xM, short yM)
+//Input:    pbaTexColor: closest-site-ids per pixel, i.e. FT
+//Output:   output: thresholded DT
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (tx>xm && ty>ym && tx<xM && ty<yM)                                    //careful not to index outside the image..
+    {    
+        int    id     = TOID(tx, ty, size);
+      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
+      float  d2     = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y);
+      output[id]    = (d2<=threshold2);                                        //threshold DT into binary image
+    }
+}
+
+
+
+__global__ void kernelDT(float* output, int size, short xm, short ym, short xM, short yM)
+//Input:    pbaTexColor: closest-site-ids per pixel, i.e. FT
+//Output:   output: DT
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+        
+    if (tx>xm && ty>ym && tx<xM && ty<yM)                                    //careful not to index outside the image..
+    {    
+        int    id     = TOID(tx, ty, size);
+      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
+      float  d2     = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y);
+      output[id]    = sqrtf(d2);                                            //save the Euclidean DT
+    }
+}
+
+
+__global__ void kernelSkel(unsigned char* output, short xm, short ym, 
+                           short xM, short yM, short size, float threshold, float length)    
+                                                                            //Input:    pbaTexColor: closest-site-ids per pixel
+                                                                            //            pbaTexParam: labels for sites (only valid at site locations)
+{                                                                            //Output:    output: binary thresholded skeleton
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (tx>xm && ty>ym && tx<xM && ty<yM) 
+    {
+        int    id     = TOID(tx, ty, size);
+      int    Id     = id;
+      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
+      int    id2    = TOID(voroid.x,voroid.y,size);                            //convert the site's coord to an index into pbaTexParam[], the site-label-texture
+      float  imp    = tex1Dfetch(pbaTexParam,id2);                            //get the site's label
+
+             ++id;                                                            //TOID(tx+1,ty,size)
+             voroid = tex1Dfetch(pbaTexColor,id);                            //
+             id2    = TOID(voroid.x,voroid.y,size);                            //
+      float  imp_r  = tex1Dfetch(pbaTexParam,id2);                            //
+
+             id     += size-1;                                                //TOID(tx,ty+1,size)
+             voroid = tex1Dfetch(pbaTexColor,id);                            //
+             id2    = TOID(voroid.x,voroid.y,size);                            //
+      float  imp_u  = tex1Dfetch(pbaTexParam,id2);                            //
+
+      float imp_dx  = fabsf(imp_r-imp);
+      float imp_dy  = fabsf(imp_u-imp);
+      float Imp     = max(imp_dx,imp_dy);
+      Imp = min(Imp,fabsf(length-Imp));
+      if (Imp>=threshold) output[Id] = 1;                                    //By filling only 1-values, we reduce memory access somehow (writing to output[] is expensive)
+    } 
+    
+    //WARNING: this kernel may sometimes creates 2-pixel-thick branches.. Study the AFMM original code to see if this is correct.
+}
+
+
+
+#define X 1
+
+__constant__ const                                                            //REMARK: put following constants (for kernelTopology) in CUDA constant-memory, as this gives a huge speed difference
+unsigned char topo_patterns[][9] =        { {0,0,0,                            //These are the 3x3 templates that we use to detect skeleton endpoints
+                                           0,X,0,                            //(with four 90-degree rotations for each)
+                                           0,X,0},
+                                          {0,0,0,
+                                           0,X,0,
+                                           0,0,X},
+                                          {0,0,0,
+                                           0,X,0,
+                                           0,X,X},
+                                          {0,0,0,
+                                           0,X,0,
+                                           X,X,0} 
+                                        };
+                                        
+#define topo_NPATTERNS  4                                                        //Number of patterns we try to match (for kernelTopology)
+                                                                                //REMARK: #define faster than __constant__
+
+__constant__ const unsigned char topo_rot[][9] = { {0,1,2,3,4,5,6,7,8}, {2,5,8,1,4,7,0,3,6}, {8,7,6,5,4,3,2,1,0}, {6,3,0,7,4,1,8,5,2} };
+                                                                                //These encode the four 90-degree rotations of the patterns (for kernelTopology);
+
+__device__ unsigned int topo_gc            = 0;
+__device__ unsigned int topo_gc_last    = 0;
+
+
+__global__ void kernelTopology(unsigned char* output, short2* output_set, short xm, short ym, short xM, short yM, short size, int maxpts)    
+{
+    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ty = blockIdx.y * blockDim.y + threadIdx.y;
+        
+    unsigned char t[9];
+    
+    if (tx>xm && ty>ym && tx<xM-1 && ty<yM-1)                                    //careful not to index outside the image; take into account the template size too
+    {    
+       int    id = TOID(tx, ty, size);     
+       unsigned char  p  = tex1Dfetch(pbaTexGray,id);                            //get the skeleton pixel at tx,ty
+       if (p)                                                                    //if the pixel isn't skeleton, nothing to do
+       {
+         unsigned char idx=0;
+         for(int j=ty-1;j<=ty+1;++j)                                            //read the template into t[] for easier use
+         {
+           int id = TOID(tx-1, j, size);
+           for(int i=0;i<=2;++i,++id,++idx)
+              t[idx] = tex1Dfetch(pbaTexGray,id);                                //get the 3x3 template centered at the skel point tx,ty
+         }
+          
+         for(unsigned char r=0;r<4;++r)                                            //try to match all rotations of a pattern:
+         {
+           const unsigned char* rr = topo_rot[r];
+           for(unsigned char p=0;p<topo_NPATTERNS;++p)                            //try to match all patterns:
+           {
+             const unsigned char* pat = topo_patterns[p];
+             unsigned char j = (p==0)? 0 : 7;                                    //Speedup: for all patterns except 1st, check only last 3 entries, the first 6 are identical for all patterns
+             for(;j<9;++j)                                                        //try to match rotated pattern vs actual pattern
+               if (pat[j]!=t[rr[j]]) break;                                        //this rotation failed
+             if (j<6) break;                                                    //Speedup: if we have a mismatch on the 1st 6 pattern entries, then none of the patterns can match
+                                                                                //           since all templates have the same first 6 entries.
+
+             if (j==9)                                                            //this rotation succeeded: mark the pixel as a topology event and we're done
+             {    
+                int crt_gc = atomicInc(&topo_gc,maxpts);                        //REMARK: this serializes (compacts) all detected endpoints in one array.            
+                output_set[crt_gc] = make_short2(tx,ty);                        //To do this, we use an atomic read-increment-return on a global counter, 
+                                                                                //which is guaranteed to give all threads unique consecutive indexes in the array.
+                output[id] = 1;                                                    //Also create the topology image
+                return;
+             }
+           }
+         }
+       }
+    }
+    else                                                                        //Last thread: add zero-marker to the output point-set, so the reader knows how many points are really in there
+    if (tx==xM-1 && ty==yM-1)                                                    //Also reset the global vector counter topo_gc, for the next parallel-run of this function
+    { topo_gc_last = topo_gc; topo_gc = 0; }                                    //We do this in the last thread so that no one modifies topo_gc from now on. 
+                                                                                //REMARK: this seems to be the only way I can read a __device__ variable back to the CPU
+}
+
+
+
+
+void skelft2DParams(int floodBand_, int maurerBand_, int colorBand_)        //Set up some params of the FT algorithm    
+{
+  floodBand   = floodBand_;
+  maurerBand  = maurerBand_;
+  colorBand   = colorBand_;
+}
+
+
+
+
+
+// Compute 2D FT / Voronoi diagram of a set of sites
+// siteParam:   Site parameterization. 0 = non-site points; >0 = site parameter value.
+// output:        FT. The (x,y) at (i,j) are the coords of the closest site to (i,j)
+// size:        Texture size (pow 2)
+void skelft2DFT(short* output, float* siteParam, short xm, short ym, short xM, short yM, int size) 
+{
+    skelft2DInitializeInput(siteParam,size);                                    // Initialization of already-allocated data structures
+
+    skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand);            // Compute FT
+    
+    // Copy FT to CPU, if required
+    if (output) cudaMemcpy(output, pbaTextures[1], size*size*sizeof(short2), cudaMemcpyDeviceToHost);
+}
+
+
+
+
+
+
+void skelft2DDT(float* outputDT, short xm, short ym, short xM, short yM)        //Compute exact DT from resident FT (in pbaTextures[1])
+{
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+
+    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the FT from
+
+    xm = ym = 0; xM = yM = pbaTexSize-1;
+    kernelDT <<< grid, block >>>((float*)pbaTextures[2], pbaTexSize, xm-1, ym-1, xM+1, yM+1);
+    cudaUnbindTexture(pbaTexColor);
+      
+    //Copy DT to CPU
+    if (outputDT) cudaMemcpy(outputDT, pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
+}
+
+
+
+void skelft2DDT(short* outputDT, float threshold,                                //Compute (thresholded) DT (into pbaTextures[2]) from resident FT (in pbaTextures[1])    
+                      short xm, short ym, short xM, short yM)
+{
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+
+    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the FT from
+
+    if (threshold>=0)
+    {
+      xm -= (short)threshold; if (xm<0) xm=0;
+      ym -= (short)threshold; if (ym<0) ym=0;
+      xM += (short)threshold; if (xM>pbaTexSize-1) xM=pbaTexSize-1;
+      yM += (short)threshold; if (yM>pbaTexSize-1) yM=pbaTexSize-1;
+    
+      kernelThresholdDT<<< grid, block >>>((unsigned char*)pbaTextures[2], pbaTexSize, threshold*threshold, xm-1, ym-1, xM+1, yM+1);    
+      cudaUnbindTexture(pbaTexColor);
+    
+      //Copy thresholded image to CPU
+      if (outputDT) cudaMemcpy(outputDT, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+    }
+}
+
+
+
+
+void skelft2DSkeleton(unsigned char* outputSkel, float length, float threshold,    //Compute thresholded skeleton (into pbaTextures[3]) from resident FT (in pbaTextures[1])
+                      short xm,short ym,short xM,short yM)                        
+{                                                                                //length:     boundary length
+    dim3 block = dim3(BLOCKX,BLOCKY);                                            //threshold:  skeleton importance min-value (below this, we ignore branches)
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the resident FT
+    cudaBindTexture(0, pbaTexParam, pbaTexSiteParam);                            //Used to read the resident boundary parameterization    
+    cudaMemset(pbaTextures[3],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize);    //Faster to zero result and then fill only 1-values (see kernel)
+    
+    kernelSkel<<< grid, block >>>((unsigned char*)pbaTextures[3], xm, ym, xM-1, yM-1, pbaTexSize, threshold, length);
+    
+    cudaUnbindTexture(pbaTexColor);
+    cudaUnbindTexture(pbaTexParam);
+    
+    //Copy skeleton to CPU
+    if (outputSkel) cudaMemcpy(outputSkel, pbaTextures[3], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+}
+
+
+
+
+void skelft2DTopology(unsigned char* outputTopo, int* npts, short* outputPoints, //Compute topology-points of the resident skeleton (in pbaTextures[3])
+                      short xm,short ym,short xM,short yM)                    
+{
+    int maxpts = (npts)? *npts : pbaTexSize*pbaTexSize;                            //This is the max # topo-points we are going to return in outputPoints[]
+
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+    cudaBindTexture(0, pbaTexGray, pbaTextures[3]);                                //Used to read the resident skeleton
+    cudaMemset(pbaTextures[4],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize);    //Faster to zero result and then fill only 1-values (see kernel)
+
+    unsigned int zero = 0;
+    cudaMemcpyToSymbol(topo_gc,&zero,sizeof(unsigned int),0,cudaMemcpyHostToDevice);        //Set topo_gc to 0
+
+    kernelTopology<<< grid, block >>>((unsigned char*)pbaTextures[4], pbaTextures[5], xm, ym, xM, yM, pbaTexSize, maxpts+1);
+    
+    cudaUnbindTexture(pbaTexGray);
+
+    if (outputPoints && maxpts)                                                    //If output-point vector desired, copy the end-points, put in pbaTexture[5] as a vector of short2's, 
+    {                                                                            //into caller space. We copy only 'maxpts' elements, as the user instructed us.
+        unsigned int num_pts;
+        cudaMemcpyFromSymbol(&num_pts,topo_gc_last,sizeof(unsigned int),0,cudaMemcpyDeviceToHost);        //Get #topo-points we have detected from the device-var from CUDA
+        if (npts && num_pts)                                                                            //Copy the topo-points to caller        
+           cudaMemcpy(outputPoints,pbaTextures[5],num_pts*sizeof(short2),cudaMemcpyDeviceToHost);
+        if (npts) *npts = num_pts;                                                //Return #detected topo-points to caller                                
+    }
+        
+    if (outputTopo)                                                                //If topology image desired, copy it into user space
+        cudaMemcpy(outputTopo,pbaTextures[4],pbaTexSize*pbaTexSize*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+}
+
+
+
+
+__global__ void kernelSiteFromSkeleton(short2* outputSites, int size)                        //Initialize the Voronoi textures from the sites' encoding texture (parameterization)
+{                                                                                            //REMARK: we interpret 'inputVoro' as a 2D texture, as it's much easier/faster like this
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (tx<size && ty<size)                                                                    //Careful not to go outside the image..
+    {
+      int i = TOID(tx,ty,size);
+      unsigned char param = tex1Dfetch(pbaTexGray,i);                                        //The sites-param has non-zero (parameter) values precisely on non-boundary points
+
+      short2& v = outputSites[i];
+      v.x = v.y = MARKER;                                                                    //Non-boundary points are marked as 0 in the parameterization. Here we will compute the FT.
+      if (param)                                                                            //These are points which define the 'sites' to compute the FT/skeleton (thus, have FT==identity)
+      {                                                                                        //We could use an if-then-else here, but it's faster with an if-then
+         v.x = tx; v.y = ty;
+      }
+    }
+}
+
+
+
+
+__global__ void kernelSkelInterpolate(float* output, int size)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (tx<size && ty<size)                                                                    //Careful not to go outside the image..
+    {
+        int    id     = TOID(tx, ty, size);
+      short2 vid    = tex1Dfetch(pbaTexColor,id);                            
+      float  T      = sqrtf((tx-vid.x)*(tx-vid.x)+(ty-vid.y)*(ty-vid.y));
+      short2 vid2   = tex1Dfetch(pbaTexColor2,id);                            
+      float  D      = sqrtf((tx-vid2.x)*(tx-vid2.x)+(ty-vid2.y)*(ty-vid2.y));
+      float  B      = ((D)? min(T/2/D,0.5f):0.5) + 0.5*((T)? max(1-D/T,0.0f):0);
+      output[id]    = pow(B,0.2f);
+    }
+}
+
+
+
+
+void skel2DSkeletonDT(float* outputSkelDT,short xm,short ym,short xM,short yM)
+{
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+
+    cudaBindTexture(0,pbaTexGray,pbaTextures[3]);                            //Used to read the resident binary skeleton
+    kernelSiteFromSkeleton<<<grid,block>>>(pbaTextures[0],pbaTexSize);        //1. Init pbaTextures[0] with sites on skeleton i.e. from pbaTexGray
+    cudaUnbindTexture(pbaTexGray);
+        
+    //Must first save pbaTextures[1] since we may need it later..
+    cudaMemcpy(pbaTextures[5],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);
+    skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand);        //2. Compute FT of the skeleton into pbaTextures[6]
+    cudaMemcpy(pbaTextures[6],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);
+    cudaMemcpy(pbaTextures[1],pbaTextures[5],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);
+    
+    //Compute interpolation        
+    cudaBindTexture(0,pbaTexColor,pbaTextures[1]);                            // FT of boundary
+    cudaBindTexture(0,pbaTexColor2,pbaTextures[6]);                            // FT of skeleton
+    kernelSkelInterpolate<<<grid,block>>>((float*)pbaTextures[0],pbaTexSize);
+    cudaUnbindTexture(pbaTexColor);
+    cudaUnbindTexture(pbaTexColor2);
+    if (outputSkelDT) cudaMemcpy(outputSkelDT, pbaTextures[0], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
+}
+
+
+
+
+
+__global__ void kernelFill(unsigned char* output, int size, unsigned char bg, unsigned char fg, short xm, short ym, short xM, short yM, bool ne)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+        
+    if (tx>xm && ty>ym && tx<xM && ty<yM)                                        //careful not to index outside the image..
+    {    
+      int    id0 = TOID(tx, ty, size);
+      unsigned char val = tex1Dfetch(pbaTexGray,id0);                            //
+      if (val==fg)                                                                //do we have a filled pixel? Then fill all to left/top/up/bottom of it which is background
+      {
+        bool fill = false;
+        int id = id0;
+        if (ne)                                                                    //fill in north-east direction:
+        {
+            for(short x=tx+1;x<xM;++x)                                            //REMARK: here and below, the interesting thing is that it's faster, by about 10-15%, to fill a whole
+            {                                                                    //        scanline rather than oly until the current block's borders (+1). The reason is that filling a whole
+                                                                                //          scanline decreases the total #sweeps, which seems to be the limiting speed factor
+              if (tex1Dfetch(pbaTexGray,++id)!=bg) break;
+              output[id] = fg; fill = true;
+            }
+
+            id = id0;
+            for(short y=ty-1;y>ym;--y)
+            {
+              if (tex1Dfetch(pbaTexGray,id-=size)!=bg) break;
+              output[id] = fg; fill = true;
+            }
+        }
+        else                                                                    //fill in south-west direction:
+        {
+            for(short x=tx-1;x>xm;--x)
+            {
+              if (tex1Dfetch(pbaTexGray,--id)!=bg) break;
+              output[id] = fg; fill = true;
+            }
+
+            id = id0;
+            for(short y=ty+1;y<yM;++y)
+            {
+              if (tex1Dfetch(pbaTexGray,id+=size)!=bg) break;
+              output[id] = fg; fill = true;
+            }
+        }
+        
+        if (fill) fill_gc = true;                                                //if we filled anything, inform caller; we 'gather' this info from a local var into the
+                                                                                //global var here, since it's faster than writing the global var in the for loops
+      }      
+    }
+}
+
+
+
+
+__global__ void kernelFillHoles(unsigned char* output, int size, unsigned char bg, unsigned char fg, unsigned char fill_fg)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (tx>=0 && ty>=0 && tx<size && ty<size)                                    //careful not to index outside the image..
+    {    
+        int            id = TOID(tx, ty, size);
+      unsigned char val = tex1Dfetch(pbaTexGray,id);                            //
+      if (val==fill_fg)
+         output[id] = bg;
+      else if (val==bg)
+         output[id] = fg;     
+    }
+}
+
+
+int skelft2DFill(unsigned char* outputFill, short sx, short sy, short xm, short ym, short xM, short yM, unsigned char fill_value)
+{
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+
+    unsigned char background;
+    int id = sy * pbaTexSize + sx;
+    cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value we have to fill from (sx,sy)
+    
+    cudaMemset(((unsigned char*)pbaTextures[2])+id,fill_value,sizeof(unsigned char));                    //Fill the seed (x,y) on the GPU    
+
+    cudaBindTexture(0, pbaTexGray, pbaTextures[2]);                                                        //Used to read the thresholded DT
+
+    int iter=0;
+    bool xy = true;                                                                                        //Direction of filling for current sweep: either north-east or south-west
+                                                                                                        //This kind of balances the memory-accesses nicely over kernel calls
+    for(;;++iter,xy=!xy)                                                                                //Keep filling a sweep at a time until we have no background pixels anymore
+    {    
+       bool filled = false;                                                                                //Initialize flag: we didn't fill anything in this sweep
+       cudaMemcpyToSymbol(fill_gc,&filled,sizeof(bool),0,cudaMemcpyHostToDevice);                        //Pass flag to CUDA
+       kernelFill<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,fill_value,xm,ym,xM,yM,xy);    
+                                                                                                        //One fill sweep       
+       cudaMemcpyFromSymbol(&filled,fill_gc,sizeof(bool),0,cudaMemcpyDeviceToHost);                        //See if we filled anything in this sweep
+       if (!filled) break;                                                                                //Nothing filled? Then we're done, the image didn't change
+    }
+    cudaUnbindTexture(pbaTexGray);
+        
+    if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+    
+    return iter;                                                                                        //Return #iterations done for the fill - useful as a performance measure for caller
+}
+
+
+
+int skelft2DFillHoles(unsigned char* outputFill, short sx, short sy, unsigned char foreground)
+{
+    unsigned char background;
+    unsigned char fill_value = 128;
+    int id = sy * pbaTexSize + sx;
+    cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value at (sx,sy)
+
+    int iter = skelft2DFill(0,sx,sy,0,0,pbaTexSize,pbaTexSize,fill_value);                                //First, fill the background surrounding the image with some special value
+        
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+
+    cudaBindTexture(0, pbaTexGray, pbaTextures[2]);                                                        //Used to read the thresholded DT
+
+    kernelFillHoles<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,foreground,fill_value);
+
+    cudaUnbindTexture(pbaTexGray);
+    
+    if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
+    
+    return iter;
+}
+
+
+
+//--------  Advection-related code  ----------------------------------------------------------------------
+
+
+void advect2DSetSites(float* sites, int nsites)
+{
+    cudaMemcpy(pbaTexAdvectSites, sites, nsites * sizeof(float2), cudaMemcpyHostToDevice);                //Copy side-coords from CPU->GPU
+}
+
+//#define KERNEL(radius,dist) expf(-10*dist/radius)
+#define KERNEL(radius,dist) ((radius-dist)/radius)
+
+
+
+__global__ void kernelSplat(float* output, int size, int numpts, float d)
+//REMARK: This could be done tens of times faster using OpenGL (draw point-sprites with a 2D distance-kernel texture)
+{
+    int offs = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (offs < numpts)                                                        //careful not to index outside the site-vector..
+    {    
+      float2        p  = tex1Dfetch(pbaTexAdvect,offs);                        //splat the site whose coords are at location offs in pbaTexAdvect[]
+
+      float d2   = d*d;
+      short tx   = p.x, ty = p.y;                                            //REMARK: if I make these float, then something gets ccrewed up - why?
+      short jmin = ty-d;
+      short jmax = ty+d;
+
+      for(short j=jmin;j<=jmax;++j)                                            //bounding-box of the splat at 'skel'
+        {
+           float dy = (j-ty)*(j-ty);                                        //precalc this for speed
+           short imin = tx-d;
+           short imax = tx+d;
+           
+           int offs = __mul24(int(j),size);
+           
+           for(short i=imin;i<=imax;++i)                                    //REMARK: this could be made ~1.5x faster by computing the x-scanline intersection (min,max) with the d2-circle..
+             {
+                float r2 = dy+(i-tx)*(i-tx);
+                if (r2<=d2)                                                    //check we're really inside the splat
+                {
+                  int id = int(i)+offs; 
+                  float val = KERNEL(d2,r2);
+                  output[id] += val;                                        //Accumulate density (WARNING: hope this is thread-safe!!)
+                                                                            //If not, I need atomicAdd() on floats which requires CUDA 2.0    
+                }
+             }
+        }       
+    }
+}
+
+
+inline __device__ float2 computeGradient(const float2& p)                    //Compute -gradient of density at p
+{
+    float v_d = tex2D(pbaTexDensity,p.x,p.y-1);
+    float v_l = tex2D(pbaTexDensity,p.x-1,p.y);
+    float v_r = tex2D(pbaTexDensity,p.x+1,p.y);
+    float v_t = tex2D(pbaTexDensity,p.x,p.y+1);
+    
+    return make_float2((v_l-v_r)/2,(v_d-v_t)/2);
+}
+
+
+inline __device__ float2 computeGradientExplicit(float* density, const float2& p, int size)        //Compute gradient of density at p
+{
+    int x = p.x, y = p.y;
+    float v_d = density[TOID(x,y-1,size)];
+    float v_l = density[TOID(x-1,y,size)];
+    float v_r = density[TOID(x+1,y,size)];
+    float v_t = density[TOID(x,y+1,size)];
+    
+    return make_float2((v_r-v_l)/2,(v_t-v_d)/2);
+}
+
+
+__global__ void kernelGradient(float2* grad, int size)                            
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const float eps = 0.0001;
+
+    if (tx<size && ty<size)                                                    //Careful not to go outside the image..
+    {
+      float2 p = make_float2(tx,ty);
+      float2 g = computeGradient(p);
+      
+      float gn = sqrtf(g.x*g.x+g.y*g.y);                                    //robustly normalize the gradient
+      g.x /= gn+eps; g.y /= gn+eps;
+            
+      grad[TOID(tx,ty,size)] = g;    
+    }
+}
+
+
+void advect2DSplat(float* output, int num_pts, float radius)
+{
+    dim3 block = dim3(BLOCKX*BLOCKY);                                            //Prepare the splatting kernel: this operates on a vector of 2D sites
+    int numpts_b = (num_pts/block.x+1)*block.x;                                    //Find higher multiple of blocksize than # sites
+    dim3 grid  = dim3(numpts_b/block.x);
+    
+    cudaMemset(pbaTexDensityData,0,sizeof(float)*pbaTexSize*pbaTexSize);        //Zero the density texture
+    cudaBindTexture(0,pbaTexAdvect,pbaTexAdvectSites);                            //Bind sites to a texture
+
+    kernelSplat<<< grid, block >>>(pbaTexDensityData, pbaTexSize, num_pts, radius);
+                                                                                //Splat kernel    
+    cudaThreadSynchronize();    
+                                                                                
+    cudaUnbindTexture(pbaTexAdvect);                                            //Done with the sites
+                                                                                //Copy splat-map to CPU (if desired)
+    if (output) cudaMemcpy(output, pbaTexDensityData, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
+}
+
+
+
+void advect2DGetSites(float* output, int num_pts)                                //Get sites CUDA->CPU
+{
+    cudaMemcpy(output,pbaTexAdvectSites,num_pts*sizeof(float2),cudaMemcpyDeviceToHost);
+}
+
+
+void advect2DGradient()                                                            //Compute and save gradient of density map
+{
+    dim3 block = dim3(BLOCKX,BLOCKY);
+    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
+    
+    cudaMemset(pbaTexGradientData,0,sizeof(float2)*pbaTexSize*pbaTexSize);        //Zero the gradient texture
+
+    pbaTexDensity.filterMode = cudaFilterModeLinear;                            //The floating-point density map
+    pbaTexDensity.normalized = false;
+    cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
+    cudaBindTexture2D(0,pbaTexDensity,pbaTexDensityData,channelDesc1,pbaTexSize,pbaTexSize,4*pbaTexSize);                            
+        
+    kernelGradient<<< grid, block >>>(pbaTexGradientData, pbaTexSize);            //Gradient kernel    
+                                                                                
+    cudaUnbindTexture(pbaTexDensity);
+}
+
+
+
+
+
diff --git a/skelft.h b/skelft.h
new file mode 100644
index 0000000..e426053
--- /dev/null
+++ b/skelft.h
@@ -0,0 +1,106 @@
+// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion
+// and Dilation, and Flood Filling Toolkit
+//
+//(c) Alexandru Telea, Univ. of Groningen, 2011
+//====================================================================================================================
+#pragma once
+
+// Given an image of size nx x ny that we want to process, CUDA may need to use
+// a larger image (e.g. pow 2) internally
+// to handle our image. This returns the size of this larger image. Since we get
+// back data from CUDA at this size,
+// we need to know about the size to allocate all our app-side buffers.
+int skelft2DSize(int nx, int ny);
+
+// Initialize CUDA and allocate memory
+// textureSize is 2^k with k >= 6
+void skelft2DInitialization(int textureSize);
+
+// Deallocate all memory on GPU
+void skelft2DDeinitialization();
+
+// Set various FT computation params. Can be called before each FT computation
+void skelft2DParams(int phase1Band, int phase2Band, int phase3Band);
+
+// Compute 2D feature transform (or Voronoi diagram)
+// siteParam: 2D texture site parameterization. 0 = non-site pixel; >0 = site
+// parameter at current pixel.
+// output:    2D texture FT. Gives coords (i,j) of closest site to each pixel.
+//            If output==0, the FT is still computed and stored on CUDA, but not
+//            passed back to CPU.
+// size:
+void skelft2DFT(short *output, float *siteParam, short xm, short ym, short xM,
+                short yM, int size);
+
+// Compute thresholded skeleton of in-CUDA-memory FT.
+// length:    max value of site parameter (needed for normalization)
+// threshold: threshold for the skeleton importance, like in the AFMM algorithm
+// output:    binary thresholded skeleton (0=background,1=skeleton)
+void skelft2DSkeleton(unsigned char *output, float length, float threshold,
+                      short xm, short ym, short xM, short yM);
+
+// Compute thresholded DT of in-CUDA-memory FT.
+// threshold: upper value for DT
+// output:    binary thresholded DT (1=larger than threshold,0=otherwise)
+void skelft2DDT(short *output, float threshold, short xm, short ym, short xM,
+                short yM);
+
+// Compute exact DT of in-CUDA-memory FT.
+// output:    exact floating-point DT
+void skelft2DDT(float *outputDT, short xm, short ym, short xM, short yM);
+
+// Make an arc-length parameterization of a binary shape, used for
+// skeletonization input
+// input:	  binary image whose boundary we parameterize
+// dx,dy:
+// param:     arc-length parameterization image
+// size:
+// return:	  the boundary length
+float skelft2DMakeBoundary(const unsigned char *input, int xm, int ym, int xM,
+                           int yM, float *param, int size, short iso = 1,
+                           bool thr_upper = true);
+float skelft2DMakeBoundary(const float *input, int xm, int ym, int xM, int yM,
+                           float *param, int size, float iso = 1,
+                           bool thr_upper = true);
+
+// Compute topology events (skeleton endpoints) for an in-CUDA-memory
+// thresholded skeleton
+// topo:	  binary image (1=skel endpoints,0=otherwise), optional. If not
+// supplied, not returned
+// npts:	  on input, this gives the max #points we will return; on output,
+// set to the #points detected and returned
+// points:    array of (x,y) pairs of the detected points
+extern "C" void skelft2DTopology(unsigned char *topo, int *npts, short *points,
+                                 short xm, short ym, short xM, short yM);
+
+// Utility: save given image to pgm file
+void skelft2DSave(short *outputFT, int dx, int dy, const char *f);
+
+// Compute DT of in-CUDA-memory skeleton
+void skel2DSkeletonDT(float *outputSkelDT, short xm, short ym, short xM,
+                      short yM);
+
+// Fills all same-value 4-connected pixels from (seedx,seedy) with value
+// 'fill_val' in the in-CUDA-memory thresholded DT
+// outputFill: filled image
+// <return>:   #iterations done for the fill, useful as a performance measure
+// (lower=better)
+int skelft2DFill(unsigned char *outputFill, short seedx, short seedy, short xm,
+                 short ym, short xM, short yM, unsigned char foreground);
+
+int skelft2DFillHoles(unsigned char *outputFill, short x, short y,
+                      unsigned char fill_val);
+
+//-----------------------
+
+void advect2DSetSites(float *sites, int nsites, int inside_sites);
+
+void advectSetImage(int *image, int nx, int ny);
+
+void advect2DGetSites(float *sites, int nsites);
+
+void advect2DSplat(float *density, int num_pts, float radius);
+
+void advectAttract(int num_pts, float step);
+
+void advectRelax(int num_pts, float map_avg, float radius);
diff --git a/skelft_core.cpp b/skelft_core.cpp
new file mode 100644
index 0000000..e4cec20
--- /dev/null
+++ b/skelft_core.cpp
@@ -0,0 +1,199 @@
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <stack>
+#include <vector>
+#include <iostream>
+
+const float MYINFINITY = 1.0e7f;
+
+inline float SQR(float x) { return x * x; }
+inline int SQR(int x) { return x * x; }
+inline float INTERP(float a, float b, float t) { return a * (1 - t) + b * t; }
+
+struct Coord {
+    int i, j;
+    Coord(int i_, int j_)
+        : i(i_)
+        , j(j_){};
+    Coord() {}
+    int operator==(const Coord &c) const { return i == c.i && j == c.j; }
+};
+
+using namespace std;
+
+const float ALIVE = -10, NARROW_BAND = ALIVE + 1,
+            FAR_AWAY = ALIVE + 2; // Classification values for pixels in an
+                                  // image. The values are arbitrary.
+
+int skelft2DSize(int nx, int ny) {
+    int dx = (int) floor(
+        pow(2.0f, int(log(float(nx)) / log(2.0f) +
+                      1))); // Find minimal pow of 2 which fits the input image
+    int dy = (int) floor(pow(2.0f, int(log(float(ny)) / log(2.0f) + 1)));
+    int fboSize = max(dx, dy); // DT/FT/skeleton image size (pow of 2, should be
+                               // able to contain the input image)
+
+    return fboSize;
+}
+
+// Classify 'f' into inside, outside, and boundary pixels according to isovalue
+// 'iso'
+// Inside pixels are those with value > iso (if thr_upper==true), else those
+// with value < iso.
+template <class T>
+void classify(vector<Coord> &s, const T *f, int xm, int ym, int xM, int yM,
+              int size, T iso, bool thr_upper, float *siteParam,
+              bool scan_bot_to_top) {
+    for (int i = 0; i < size * size; ++i)
+        siteParam[i] = ALIVE; // First: all points are ALIVE, i.e., sites
+
+    for (int j = ym; j < yM; ++j)
+        for (int i = xm; i < xM; ++i) {
+            T fptr = f[i + j * size]; // Mark subset of points which are
+                                      // FAR_AWAY, i.e. NOT sites
+            if (((thr_upper && fptr >= iso) || (!thr_upper && fptr <= iso)))
+                siteParam[i + size * j] = FAR_AWAY;
+        }
+
+    int js, je, jd;
+    if (scan_bot_to_top) {
+        js = ym;
+        je = yM;
+        jd = 1;
+    } else {
+        js = yM - 1;
+        je = ym - 1;
+        jd = -1;
+    }
+
+    for (int j = js; j != je; j += jd)
+        for (int i = xm; i < xM; ++i) {
+            float &cv = siteParam[i + size * j];
+            if (cv == FAR_AWAY)
+                if (((i - 1 + size * j) > 0 &&
+                     siteParam[i - 1 + size * j] == ALIVE) ||
+                    siteParam[i + 1 + size * j] == ALIVE ||
+                    ((i + size * (j - 1)) > 0 &&
+                     siteParam[i + size * (j - 1)] == ALIVE) ||
+                    siteParam[i + size * (j + 1)] == ALIVE) {
+                    cv = NARROW_BAND;
+                    s.push_back(Coord(i, j));
+                }
+        }
+}
+
+// Encode input image (classified as inside/outside/boundary into a
+// boundary-length parameterization image, needed for skeleton computations.
+//
+float encodeBoundary(vector<Coord> &s, int xm, int ym, int xM, int yM,
+                     float *siteParam, int size) {
+    const float LARGE_BOUNDARY_VAL =
+        1000; // Value that the boundary-param will jump with between disjoint,
+              // self-connected, boundary pixel chains
+
+    float length = -LARGE_BOUNDARY_VAL + 1; // Compute the boundary length
+
+    while (s.size()) // Process all narrowband points, add their
+                     // arc-length-parameterization in siteParam
+    {
+        Coord pt = s[s.size() - 1];
+        s.pop_back(); // Pick narrowband-point, test if already numbered
+        if (siteParam[pt.i + size * pt.j] > 0)
+            continue;
+
+        int ci, cj;
+        float cc = 0; // Point pt not numbered
+        for (int dj = -1; dj < 2; ++dj) // Find min-numbered-neighbour of pt
+            for (int di = -1; di < 2; ++di) {
+                int ii = pt.i + di, jj = pt.j + dj;
+                if (ii < xm || ii >= xM || jj < ym || jj >= yM)
+                    continue;
+                float val = siteParam[ii + size * jj];
+                if (val < 0)
+                    continue;
+                if (cc && val > cc)
+                    continue;
+                ci = ii;
+                cj = jj;
+                cc = val;
+            }
+
+        float c = (!cc) ? length + LARGE_BOUNDARY_VAL
+                        : cc + sqrt(float((pt.i - ci) * (pt.i - ci) +
+                                          (pt.j - cj) * (pt.j - cj)));
+
+        siteParam[pt.i + size * pt.j] =
+            c; // Also store the sites' parameterization in siteParam[] for CUDA
+
+        if (c > length)
+            length = c; // Determine length of boundary, used for
+                        // wraparound-distance-computations
+
+        for (int dj = -1; dj < 2; ++dj)
+            for (int di = -1; di < 2; ++di) {
+                if (di == 0 && dj == 0)
+                    continue;
+                int ii = pt.i + di, jj = pt.j + dj;
+                if (ii >= xm && ii < xM && jj >= ym && jj < yM &&
+                    siteParam[ii + size * jj] == NARROW_BAND)
+                    s.push_back(Coord(ii, jj));
+            }
+    }
+
+    for (int i = 0, iend = size * size; i < iend;
+         ++i) // All ALIVE points are marked now as non-sites:
+        if (siteParam[i] < 0)
+            siteParam[i] = 0;
+
+    return length;
+}
+
+float skelft2DMakeBoundary(const float *f, int xm, int ym, int xM, int yM,
+                           float *siteParam, int size, float iso,
+                           bool thr_upper) {
+    bool scan_bot_to_top = false;
+
+    vector<Coord> s;
+    s.reserve((xM - xm) * (yM - ym));
+
+    classify(s, f, xm, ym, xM, yM, size, iso, thr_upper, siteParam,
+             scan_bot_to_top);
+    return encodeBoundary(s, xm, ym, xM, yM, siteParam, size);
+}
+
+float skelft2DMakeBoundary(const unsigned char *f, int xm, int ym, int xM,
+                           int yM, float *siteParam, int size, short iso,
+                           bool thr_upper) {
+    bool scan_bot_to_top = false;
+
+    vector<Coord> s;
+    s.reserve((xM - xm) * (yM - ym));
+
+    classify(s, f, xm, ym, xM, yM, size, (unsigned char) iso, thr_upper,
+             siteParam, scan_bot_to_top);
+    return encodeBoundary(s, xm, ym, xM, yM, siteParam, size);
+}
+
+void skelft2DSave(short *outputFT, int dx, int dy, const char *f) {
+    FILE *fp = fopen(f, "wb");
+    fprintf(fp, "P5 %d %d 255\n", dx, dy);
+
+    const int SIZE = 3000;
+    unsigned char buf[SIZE];
+
+    int size = dx * dy;
+    int bb = 0;
+    float range = max(dx, dy) / 255.0f;
+    for (short *v = outputFT, *vend = outputFT + size; v < vend; ++v) {
+        short val = *v;
+        buf[bb++] = (unsigned char) (val / range);
+        if (bb == SIZE) {
+            fwrite(buf, sizeof(unsigned char), SIZE, fp);
+            bb = 0;
+        }
+    }
+    if (bb)
+        fwrite(buf, sizeof(unsigned char), bb, fp);
+    fclose(fp);
+}
diff --git a/skelftkernel.h b/skelftkernel.h
new file mode 100644
index 0000000..1182181
--- /dev/null
+++ b/skelftkernel.h
@@ -0,0 +1,468 @@
+// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion
+// and Dilation, and Flood Filling Toolkit
+//
+//(c) Alexandru Telea, Univ. of Groningen, 2011
+//
+// This code is adapted from "Parallel Banding Algorithm to compute exact
+// distance transforms with the GPU",
+// T. Cao, K. Tang, A. Mohamed, T. Tan, Proc. ACM Symo. on Interactive 3D
+// Graphics and Games (I3D), 2010, 83-90
+//====================================================================================================================
+
+// MARKER is used to mark blank pixels in the texture. Any uncolored pixels will
+// have x = MARKER.
+// Input texture should have x = MARKER for all pixels other than sites
+#define MARKER -32768
+
+#define TOID(x, y, size) (__mul24((y), (size)) + (x))
+
+// Transpose a square matrix
+__global__ void kernelTranspose(short2 *data, int size) {
+    __shared__ short2 block1[TILE_DIM][TILE_DIM + 1];
+    __shared__ short2 block2[TILE_DIM][TILE_DIM + 1];
+
+    int blockIdx_y = blockIdx.x;
+    int blockIdx_x = blockIdx.x + blockIdx.y;
+
+    if (blockIdx_x >= gridDim.x)
+        return;
+
+    int blkX, blkY, x, y, id1, id2;
+    short2 pixel;
+
+    blkX = __mul24(blockIdx_x, TILE_DIM);
+    blkY = __mul24(blockIdx_y, TILE_DIM);
+
+    x = blkX + threadIdx.x;
+    y = blkY + threadIdx.y;
+    id1 = __mul24(y, size) + x;
+
+    x = blkY + threadIdx.x;
+    y = blkX + threadIdx.y;
+    id2 = __mul24(y, size) + x;
+
+    // read the matrix tile into shared memory
+    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+        int idx = __mul24(i, size);
+        block1[threadIdx.y + i][threadIdx.x] =
+            tex1Dfetch(pbaTexColor, id1 + idx);
+        block2[threadIdx.y + i][threadIdx.x] =
+            tex1Dfetch(pbaTexColor, id2 + idx);
+    }
+
+    __syncthreads();
+
+    // write the transposed matrix tile to global memory
+    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+        int idx = __mul24(i, size);
+        pixel = block1[threadIdx.x][threadIdx.y + i];
+        data[id2 + idx] = make_short2(pixel.y, pixel.x);
+        pixel = block2[threadIdx.x][threadIdx.y + i];
+        data[id1 + idx] = make_short2(pixel.y, pixel.x);
+    }
+}
+
+__global__ void kernelFloodDown(short2 *output, int size, int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+    int id = TOID(tx, ty, size);
+
+    short2 pixel1, pixel2;
+
+    pixel1 = make_short2(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; ++i, id += size) {
+        pixel2 = tex1Dfetch(pbaTexColor, id);
+
+        if (pixel2.x != MARKER)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelFloodUp(short2 *output, int size, int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = (blockIdx.y + 1) * bandSize - 1;
+    int id = TOID(tx, ty, size);
+
+    short2 pixel1, pixel2;
+    int dist1, dist2;
+
+    pixel1 = make_short2(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; i++, id -= size) {
+        dist1 = abs(pixel1.y - ty + i);
+
+        pixel2 = tex1Dfetch(pbaTexColor, id);
+        dist2 = abs(pixel2.y - ty + i);
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelPropagateInterband(short2 *output, int size,
+                                         int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int inc = __mul24(bandSize, size);
+    int ny, nid, nDist;
+    short2 pixel;
+
+    // Top row, look backward
+    int ty = __mul24(blockIdx.y, bandSize);
+    int topId = TOID(tx, ty, size);
+    int bottomId = TOID(tx, ty + bandSize - 1, size);
+
+    pixel = tex1Dfetch(pbaTexColor, topId);
+    int myDist = abs(pixel.y - ty);
+
+    for (nid = bottomId - inc; nid >= 0; nid -= inc) {
+        pixel = tex1Dfetch(pbaTexColor, nid);
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+            if (nDist < myDist)
+                output[topId] = pixel;
+            break;
+        }
+    }
+
+    // Last row, look downward
+    ty = ty + bandSize - 1;
+    pixel = tex1Dfetch(pbaTexColor, bottomId);
+    myDist = abs(pixel.y - ty);
+
+    for (ny = ty + 1, nid = topId + inc; ny < size;
+         ny += bandSize, nid += inc) {
+        pixel = tex1Dfetch(pbaTexColor, nid);
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+            if (nDist < myDist)
+                output[bottomId] = pixel;
+            break;
+        }
+    }
+}
+
+__global__ void kernelUpdateVertical(short2 *output, int size, int band,
+                                     int bandSize) {
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+
+    short2 top = tex1Dfetch(pbaTexLinks, TOID(tx, ty, size));
+    short2 bottom = tex1Dfetch(pbaTexLinks, TOID(tx, ty + bandSize - 1, size));
+    short2 pixel;
+
+    int dist, myDist;
+
+    int id = TOID(tx, ty, size);
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        pixel = tex1Dfetch(pbaTexColor, id);
+        myDist = abs(pixel.y - (ty + i));
+
+        dist = abs(top.y - (ty + i));
+        if (dist < myDist) {
+            myDist = dist;
+            pixel = top;
+        }
+
+        dist = abs(bottom.y - (ty + i));
+        if (dist < myDist)
+            pixel = bottom;
+
+        output[id] = pixel;
+    }
+}
+
+// Input: y1 < y2
+__device__ float interpoint(int x1, int y1, int x2, int y2, int x0) {
+    float xM = float(x1 + x2) / 2.0f;
+    float yM = float(y1 + y2) / 2.0f;
+    float nx = x2 - x1;
+    float ny = y2 - y1;
+
+    return yM + nx * (xM - x0) / ny;
+}
+
+__global__ void kernelProximatePoints(short2 *stack, int size, int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y, bandSize);
+    int id = TOID(tx, ty, size);
+    int lasty = -1;
+    short2 last1, last2, current;
+    float i1, i2;
+
+    last1.y = -1;
+    last2.y = -1;
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        current = tex1Dfetch(pbaTexColor, id);
+
+        if (current.x != MARKER) {
+            while (last2.y >= 0) {
+                i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
+                i2 = interpoint(last2.x, lasty, current.x, current.y, tx);
+
+                if (i1 < i2)
+                    break;
+
+                lasty = last2.y;
+                last2 = last1;
+
+                if (last1.y >= 0)
+                    last1 = stack[TOID(tx, last1.y, size)];
+            }
+
+            last1 = last2;
+            last2 = make_short2(current.x, lasty);
+            lasty = current.y;
+
+            stack[id] = last2;
+        }
+    }
+
+    // Store the pointer to the tail at the last pixel of this band
+    if (lasty != ty + bandSize - 1)
+        stack[TOID(tx, ty + bandSize - 1, size)] = make_short2(MARKER, lasty);
+}
+
+__global__ void kernelCreateForwardPointers(short2 *output, int size,
+                                            int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y + 1, bandSize) - 1;
+    int id = TOID(tx, ty, size);
+    int lasty = -1, nexty;
+    short2 current;
+
+    // Get the tail pointer
+    current = tex1Dfetch(pbaTexLinks, id);
+
+    if (current.x == MARKER)
+        nexty = current.y;
+    else
+        nexty = ty;
+
+    for (int i = 0; i < bandSize; i++, id -= size)
+        if (ty - i == nexty) {
+            current = make_short2(lasty, tex1Dfetch(pbaTexLinks, id).y);
+            output[id] = current;
+
+            lasty = nexty;
+            nexty = current.y;
+        }
+
+    // Store the pointer to the head at the first pixel of this band
+    if (lasty != ty - bandSize + 1)
+        output[id + size] = make_short2(lasty, MARKER);
+}
+
+__global__ void kernelMergeBands(short2 *output, int size, int bandSize) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int band1 = blockIdx.y * 2;
+    int band2 = band1 + 1;
+    int firsty, lasty;
+    short2 last1, last2, current;
+    // last1 and last2: x component store the x coordinate of the site,
+    // y component store the backward pointer
+    // current: y component store the x coordinate of the site,
+    // x component store the forward pointer
+
+    // Get the two last items of the first list
+    lasty = __mul24(band2, bandSize) - 1;
+    last2 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
+                        tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);
+
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+
+        if (lasty >= 0)
+            last2 =
+                make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x,
+                            tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y);
+        else
+            last2 = make_short2(MARKER, MARKER);
+    }
+
+    if (last2.y >= 0) {
+        // Second item at the top of the stack
+        last1 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, last2.y, size)).x,
+                            tex1Dfetch(pbaTexLinks, TOID(tx, last2.y, size)).y);
+    }
+
+    // Get the first item of the second band
+    firsty = __mul24(band2, bandSize);
+    current = make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                          tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+
+    if (current.y == MARKER) {
+        firsty = current.x;
+
+        if (firsty >= 0)
+            current =
+                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+        else
+            current = make_short2(MARKER, MARKER);
+    }
+
+    float i1, i2;
+
+    // Count the number of item in the second band that survive so far.
+    // Once it reaches 2, we can stop.
+    int top = 0;
+
+    while (top < 2 && current.y >= 0) {
+        // While there's still something on the left
+        while (last2.y >= 0) {
+            i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx);
+            i2 = interpoint(last2.x, lasty, current.y, firsty, tx);
+
+            if (i1 < i2)
+                break;
+
+            lasty = last2.y;
+            last2 = last1;
+            --top;
+
+            if (last1.y >= 0)
+                last1 = make_short2(
+                    tex1Dfetch(pbaTexColor, TOID(tx, last1.y, size)).x,
+                    output[TOID(tx, last1.y, size)].y);
+        }
+
+        // Update the current pointer
+        output[TOID(tx, firsty, size)] = make_short2(current.x, lasty);
+
+        if (lasty >= 0)
+            output[TOID(tx, lasty, size)] = make_short2(firsty, last2.y);
+
+        last1 = last2;
+        last2 = make_short2(current.y, lasty);
+        lasty = firsty;
+        firsty = current.x;
+
+        top = max(1, top + 1);
+
+        // Advance the current pointer to the next one
+        if (firsty >= 0)
+            current =
+                make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x,
+                            tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x);
+        else
+            current = make_short2(MARKER, MARKER);
+    }
+
+    // Update the head and tail pointer.
+    firsty = __mul24(band1, bandSize);
+    lasty = __mul24(band2, bandSize);
+    current = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));
+
+    if (current.y == MARKER && current.x < 0) { // No head?
+        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));
+
+        if (last1.y == MARKER)
+            current.x = last1.x;
+        else
+            current.x = lasty;
+
+        output[TOID(tx, firsty, size)] = current;
+    }
+
+    firsty = __mul24(band1, bandSize) + bandSize - 1;
+    lasty = __mul24(band2, bandSize) + bandSize - 1;
+    current = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size));
+
+    if (current.x == MARKER && current.y < 0) { // No tail?
+        last1 = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size));
+
+        if (last1.x == MARKER)
+            current.y = last1.y;
+        else
+            current.y = firsty;
+
+        output[TOID(tx, lasty, size)] = current;
+    }
+}
+
+__global__ void kernelDoubleToSingleList(short2 *output, int size) {
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = blockIdx.y;
+    int id = TOID(tx, ty, size);
+
+    output[id] = make_short2(tex1Dfetch(pbaTexColor, id).x,
+                             tex1Dfetch(pbaTexLinks, id).y);
+}
+
+__global__ void kernelColor(short2 *output, int size) {
+    __shared__ short2 s_last1[BLOCKSIZE], s_last2[BLOCKSIZE];
+    __shared__ int s_lasty[BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = __mul24(blockIdx.x, blockDim.x) + col;
+    int dx, dy, lasty;
+    unsigned int best, dist;
+    short2 last1, last2;
+
+    if (tid == blockDim.y - 1) {
+        lasty = size - 1;
+
+        last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);
+
+        if (last2.x == MARKER) {
+            lasty = last2.y;
+            last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx);
+        }
+
+        if (last2.y >= 0)
+            last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);
+
+        s_last1[col] = last1;
+        s_last2[col] = last2;
+        s_lasty[col] = lasty;
+    }
+
+    __syncthreads();
+
+    for (int ty = size - 1 - tid; ty >= 0; ty -= blockDim.y) {
+        last1 = s_last1[col];
+        last2 = s_last2[col];
+        lasty = s_lasty[col];
+
+        dx = last2.x - tx;
+        dy = lasty - ty;
+        best = dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+        while (last2.y >= 0) {
+            dx = last1.x - tx;
+            dy = last2.y - ty;
+            dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+            if (dist > best)
+                break;
+
+            best = dist;
+            lasty = last2.y;
+            last2 = last1;
+
+            if (last2.y >= 0)
+                last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx);
+        }
+
+        __syncthreads();
+
+        output[TOID(tx, ty, size)] = make_short2(last2.x, lasty);
+
+        if (tid == blockDim.y - 1) {
+            s_last1[col] = last1;
+            s_last2[col] = last2;
+            s_lasty[col] = lasty;
+        }
+
+        __syncthreads();
+    }
+}
\ No newline at end of file
diff --git a/voronoisplat.cpp b/voronoisplat.cpp
new file mode 100644
index 0000000..b8ab426
--- /dev/null
+++ b/voronoisplat.cpp
@@ -0,0 +1,407 @@
+#include "voronoisplat.h"
+
+#include <algorithm>
+
+#include <QQuickWindow>
+#include <QOpenGLFramebufferObject>
+
+#include "scale.h"
+#include "skelft.h"
+
+static const float RAD_BLUR = 5.0f;
+static const int COLORMAP_SAMPLES = 128;
+
+VoronoiSplatRenderer::VoronoiSplatRenderer(const QSize &size)
+    : m_item(0)
+    , gl(QOpenGLContext::currentContext())
+    , m_size(size)
+{
+    setupShaders();
+
+    // sitesVAO: VBOs 0 & 1 are for sites & their values (init'd later)
+    m_sitesVAO.create();
+    gl.glGenBuffers(3, m_VBOs);
+
+    // 2ndPassVAO: VBO 2 is a quad mapping the final texture to the framebuffer
+    m_2ndPassVAO.create();
+    m_2ndPassVAO.bind();
+    GLfloat verts[] = {-1.0f, -1.0f, -1.0f, 1.0f,
+                        1.0f, -1.0f,  1.0f, 1.0f};
+    gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[2]);
+    gl.glBufferData(GL_ARRAY_BUFFER, sizeof(verts), verts, GL_STATIC_DRAW);
+
+    int vertAttrib = m_program2->attributeLocation("vert");
+    gl.glVertexAttribPointer(vertAttrib, 2, GL_FLOAT, GL_FALSE, 0, 0);
+    gl.glEnableVertexAttribArray(vertAttrib);
+    m_2ndPassVAO.release();
+
+    setupTextures();
+}
+
+void VoronoiSplatRenderer::setupShaders()
+{
+    m_program1 = new QOpenGLShaderProgram;
+    m_program1->addShaderFromSourceCode(QOpenGLShader::Vertex,
+R"EOF(#version 440
+
+uniform float rad_blur;
+uniform float rad_max;
+uniform float fbSize;
+
+in vec2 vert;
+in float scalar;
+
+out float value;
+
+void main() {
+  gl_PointSize = (rad_max + rad_blur) * 2.0;
+  gl_Position = vec4(2.0 * (vert / fbSize) - 1.0, 0, 1);
+  value = scalar;
+}
+)EOF");
+    m_program1->addShaderFromSourceCode(QOpenGLShader::Fragment,
+R"EOF(#version 440
+
+uniform sampler2D siteDT;
+uniform float rad_blur;
+uniform float rad_max;
+
+in float value;
+
+layout (location = 0) out vec4 fragColor;
+
+void main() {
+  float dt = texelFetch(siteDT, ivec2(gl_FragCoord.xy), 0).r;
+  if (dt > rad_max)
+    discard;
+  else {
+    float r = distance(gl_PointCoord, vec2(0.5, 0.5)) * (rad_max + rad_blur) * 2.0;
+    float r2 = r * r;
+    float rad = dt + rad_blur;
+    float rad2 = rad * rad;
+    if (r2 > rad2)
+      discard;
+    else {
+      float w = exp(-5.0 * r2 / rad2);
+      fragColor = vec4(w * value, w, 0.0, 0.0);
+    }
+  }
+}
+)EOF");
+    m_program1->link();
+
+    m_program2 = new QOpenGLShaderProgram;
+    m_program2->addShaderFromSourceCode(QOpenGLShader::Vertex,
+R"EOF(#version 440
+
+in vec2 vert;
+
+void main() {
+  gl_Position = vec4(vert, 0, 1);
+}
+)EOF");
+
+    m_program2->addShaderFromSourceCode(QOpenGLShader::Fragment,
+R"EOF(
+#version 440
+
+layout (location = 0) out vec4 fragColor;
+
+uniform sampler2D siteDT;
+uniform sampler2D accumTex;
+uniform sampler2D colormap;
+uniform float rad_max;
+
+vec3 getRGB(float value) {
+  return texture2D(colormap, vec2(0, 1)).rgb;
+}
+
+void main() {
+  float dt = texelFetch(siteDT, ivec2(gl_FragCoord.xy), 0).r;
+  if (dt > rad_max)
+    discard;
+  else {
+    vec4 accum = texelFetch(accumTex, ivec2(gl_FragCoord.xy), 0);
+    // 1.0 is extra-accumulated because of white background
+    float value = 0.0f;
+    if (accum.g > 1.0)
+      value = (accum.r - 1.0) / (accum.g - 1.0);
+    else
+      value = accum.r / accum.g;
+    fragColor.rgb = getRGB(value);
+    fragColor.a = 1.0 - dt / rad_max;
+  }
+}
+)EOF");
+    m_program2->link();
+}
+
+void VoronoiSplatRenderer::setupTextures()
+{
+    gl.glGenTextures(2, m_textures);
+
+    // textures[0] stores the DT values for each pixel
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]);
+    gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, m_size.width(),
+            m_size.height(), 0, GL_RG, GL_FLOAT, 0);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+    // textures[1] is the result of the first pass
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[1]);
+    gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, m_size.width(),
+            m_size.height(), 0, GL_RGBA, GL_FLOAT, 0);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+    // Used for colormap lookup in the frag shader
+    gl.glGenTextures(1, &m_colorMapTex);
+    gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex);
+    gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, COLORMAP_SAMPLES, 1, 0, GL_RGB,
+            GL_FLOAT, 0);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+}
+
+VoronoiSplatRenderer::~VoronoiSplatRenderer()
+{
+    gl.glDeleteBuffers(2, m_VBOs);
+    delete m_program1;
+}
+
+void VoronoiSplatRenderer::setSites(const arma::mat &points)
+{
+    if (points.n_rows < 1 || points.n_cols != 2
+        || (m_values.size() != 0 && points.n_rows != m_values.size())) {
+        return;
+    }
+
+    // Copy 'points' to internal data structure(s)
+    copyPoints(points);
+
+    // Update VBO with the new data
+    m_sitesVAO.bind();
+    gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[0]);
+    gl.glBufferData(GL_ARRAY_BUFFER, m_sites.size() * sizeof(float),
+            m_sites.data(), GL_STATIC_DRAW);
+
+    int vertAttrib = m_program1->attributeLocation("vert");
+    gl.glVertexAttribPointer(vertAttrib, 2, GL_FLOAT, GL_FALSE, 0, 0);
+    gl.glEnableVertexAttribArray(vertAttrib);
+    m_sitesVAO.release();
+
+    // Compute DT values for the new positions
+    computeDT();
+}
+
+void VoronoiSplatRenderer::copyPoints(const arma::mat &points)
+{
+    double minX = points.col(0).min();
+    double maxX = points.col(0).max();
+    double minY = points.col(1).min();
+    double maxY = points.col(1).max();
+
+    // Coords are tighly packed into 'm_sites' as [ (x1, y1), (x2, y2), ... ]
+    m_sites.resize(points.n_rows * 2);
+    LinearScale<float> sx(minX, maxX, 1, m_size.width() - 1);
+    const double *col = points.colptr(0);
+    for (unsigned i = 0; i < points.n_rows; i++) {
+        m_sites[2*i] = sx(col[i]);
+    }
+
+    col = points.colptr(1);
+    LinearScale<float> sy(minY, maxY, m_size.height() - 1, 1);
+    for (unsigned i = 0; i < points.n_rows; i++) {
+        m_sites[2*i + 1] = sy(col[i]);
+    }
+}
+
+void VoronoiSplatRenderer::computeDT()
+{
+    int w = m_size.width(), h = m_size.height();
+
+    std::vector<float> buf(w * h, 0.0f);
+    for (unsigned i = 0; i < m_sites.size(); i += 2) {
+        buf[int(m_sites[i + 1]) * h + int(m_sites[i])] = (float) i / 2 + 1;
+    }
+
+    // Compute FT of the sites
+    skelft2DFT(0, buf.data(), 0, 0, w, h, w);
+    // Compute DT of the sites (from the resident FT)
+    skelft2DDT(buf.data(), 0, 0, w, h);
+
+    std::vector<GLfloat> dtTexData(w * h * 2, 0.0f);
+    for (unsigned i = 0; i < buf.size(); i++) {
+        dtTexData[2*i] = buf[i];
+    }
+
+    // Upload result to lookup texture
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]);
+    gl.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.width(), m_size.height(),
+            GL_RG, GL_FLOAT, dtTexData.data());
+}
+
+void VoronoiSplatRenderer::setValues(const arma::vec &values)
+{
+    if (values.n_elem == 0
+        || (m_sites.size() != 0 && values.n_elem != m_sites.size() / 2)) {
+        return;
+    }
+
+    m_values.resize(values.n_elem);
+    std::copy(values.begin(), values.end(), m_values.begin());
+
+    // Update VBO with the new data
+    m_sitesVAO.bind();
+    gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[1]);
+    gl.glBufferData(GL_ARRAY_BUFFER, m_values.size() * sizeof(float),
+            m_values.data(), GL_DYNAMIC_DRAW);
+
+    int valueAttrib = m_program1->attributeLocation("scalar");
+    gl.glVertexAttribPointer(valueAttrib, 1, GL_FLOAT, GL_FALSE, 0, 0);
+    gl.glEnableVertexAttribArray(valueAttrib);
+    m_sitesVAO.release();
+}
+
+void VoronoiSplatRenderer::setColorMap(const ColorScale *scale)
+{
+    if (!scale) {
+        return;
+    }
+
+    float t = scale->min();
+    float step = (scale->max() - scale->min()) / COLORMAP_SAMPLES;
+    qreal r, g, b;
+    std::vector<float> cmap(COLORMAP_SAMPLES * 3); // R,G,B
+    for (int i = 0; i < COLORMAP_SAMPLES * 3; i += 3) {
+        scale->color(t).getRgbF(&r, &g, &b);
+        cmap[i + 0] = b;
+        cmap[i + 1] = g;
+        cmap[i + 2] = r;
+
+        t += step;
+    }
+
+    // Update texture data
+    gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex);
+    gl.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, COLORMAP_SAMPLES, 1, GL_RGB,
+            GL_FLOAT, cmap.data());
+}
+
+QOpenGLFramebufferObject *VoronoiSplatRenderer::createFramebufferObject(const QSize &size)
+{
+    return new QOpenGLFramebufferObject(size);
+}
+
+void VoronoiSplatRenderer::synchronize(QQuickFramebufferObject *item)
+{
+    m_item = static_cast<VoronoiSplat *>(item);
+
+    if (m_item->colorScaleUpdated()) {
+        setColorMap(m_item->colorScale());
+    }
+    if (m_item->pointsUpdated()) {
+        setSites(m_item->points());
+    }
+    if (m_item->valuesUpdated()) {
+        setValues(m_item->values());
+    }
+
+    m_item->setUpdated(false);
+}
+
+void VoronoiSplatRenderer::render()
+{
+    // Store the final texture to render to, as we use a two-pass rendering
+    // which first renders to an intermediate texture
+    GLint targetTexture;
+    gl.glGetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER,
+            GL_COLOR_ATTACHMENT0, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME,
+            &targetTexture);
+
+    // The first pass
+    m_program1->bind();
+    m_program1->setUniformValue("rad_max", 20.0f);
+    m_program1->setUniformValue("rad_blur", RAD_BLUR);
+    m_program1->setUniformValue("fbSize", float(m_size.width()));
+
+    gl.glActiveTexture(GL_TEXTURE0);
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]);
+    m_program1->setUniformValue("siteDT", 0);
+
+    gl.glEnable(GL_POINT_SPRITE);
+    gl.glEnable(GL_PROGRAM_POINT_SIZE);
+    gl.glEnable(GL_BLEND);
+    gl.glBlendFunc(GL_ONE, GL_ONE);
+
+    // In the first pass we draw to an intermediate texture, which is used as
+    // input for the next pass
+    gl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+            GL_TEXTURE_2D, m_textures[1], 0);
+
+    gl.glClearColor(1, 1, 1, 1);
+    gl.glClear(GL_COLOR_BUFFER_BIT);
+
+    m_sitesVAO.bind();
+    gl.glDrawArrays(GL_POINTS, 0, m_values.size());
+    m_sitesVAO.release();
+
+    m_program1->release();
+
+    // Second pass
+    m_program2->bind();
+    m_program2->setUniformValue("rad_max", 20.0f);
+
+    gl.glActiveTexture(GL_TEXTURE0);
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]);
+    m_program2->setUniformValue("siteDT", 0);
+    gl.glActiveTexture(GL_TEXTURE1);
+    gl.glBindTexture(GL_TEXTURE_2D, m_textures[1]);
+    m_program2->setUniformValue("accumTex", 1);
+    gl.glActiveTexture(GL_TEXTURE2);
+    gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex);
+    m_program2->setUniformValue("colormap", 2);
+
+    gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+
+    // Restore the original framebuffer texture
+    gl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+            GL_TEXTURE_2D, targetTexture, 0);
+
+    glClearColor(1, 1, 1, 1);
+    glClear(GL_COLOR_BUFFER_BIT);
+
+    m_2ndPassVAO.bind();
+    gl.glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+    m_2ndPassVAO.release();
+
+    m_program2->release();
+
+    m_item->window()->resetOpenGLState();
+}
+
+static int nextPow2(int n)
+{
+    // TODO: check for overflows
+    n--;
+    for (int shift = 1; ((n + 1) & n); shift <<= 1) {
+        n |= n >> shift;
+    }
+    return n + 1;
+}
+
+VoronoiSplat::VoronoiSplat(QQuickItem *parent)
+    : QQuickFramebufferObject(parent)
+    , m_colorScaleUpdated(false)
+    , m_pointsUpdated(false)
+    , m_valuesUpdated(false)
+    , m_colorScale(0)
+{
+    setTextureFollowsItemSize(false);
+}
+
+QQuickFramebufferObject::Renderer *VoronoiSplat::createRenderer() const
+{
+    int baseSize = nextPow2(std::min(width(), height()));
+    return new VoronoiSplatRenderer(QSize(baseSize, baseSize));
+}
diff --git a/voronoisplat.h b/voronoisplat.h
new file mode 100644
index 0000000..ce84e74
--- /dev/null
+++ b/voronoisplat.h
@@ -0,0 +1,103 @@
+#ifndef VORONOISPLAT_H
+#define VORONOISPLAT_H
+
+#include <QQuickFramebufferObject>
+#include <QOpenGLFunctions>
+#include <QOpenGLBuffer>
+#include <QOpenGLShaderProgram>
+#include <QOpenGLVertexArrayObject>
+#include <armadillo>
+
+#include "colorscale.h"
+
+class VoronoiSplat; // defined after this class
+
+class VoronoiSplatRenderer
+    : public QQuickFramebufferObject::Renderer
+{
+public:
+    // 'size' must be square (and power of 2); item is the QQuickFBO that
+    // creates this
+    VoronoiSplatRenderer(const QSize &size);
+    ~VoronoiSplatRenderer();
+
+    void synchronize(QQuickFramebufferObject *item);
+    // 'points' should be a 2D points matrix (each point in a row)
+    void setSites(const arma::mat &points);
+    void setValues(const arma::vec &values);
+    // Set colormap data based on the given color scale;
+    void setColorMap(const ColorScale *scale);
+
+    QOpenGLFramebufferObject *createFramebufferObject(const QSize & size);    
+    void render();
+
+private:
+    void setupShaders();
+    void setupTextures();
+    void copyPoints(const arma::mat &points);
+    void computeDT();
+
+    VoronoiSplat *m_item;
+    QOpenGLFunctions gl;
+    QOpenGLShaderProgram *m_program1, *m_program2;
+    GLuint m_VBOs[3];
+    GLuint m_textures[2], m_colorMapTex;
+    QOpenGLVertexArrayObject m_sitesVAO, m_2ndPassVAO;
+
+    std::vector<float> m_sites, m_values;
+    QSize m_size;
+};
+
+class VoronoiSplat
+    : public QQuickFramebufferObject
+{
+    Q_OBJECT
+public:
+    VoronoiSplat(QQuickItem *parent = 0);
+    Renderer *createRenderer() const;
+
+    const arma::mat &points() const { return m_points; }
+    const arma::vec &values() const { return m_values; }
+    const ColorScale *colorScale() const { return m_colorScale; }
+
+    bool colorScaleUpdated() const { return m_colorScaleUpdated; }
+    bool pointsUpdated() const { return m_pointsUpdated; }
+    bool valuesUpdated() const { return m_valuesUpdated; }
+
+public slots:
+    void setColorScale(const ColorScale *scale)
+    {
+        m_colorScale = scale;
+        m_colorScaleUpdated = true;
+        update();
+    }
+
+    void setPoints(const arma::mat &points)
+    {
+        m_points = points;
+        m_pointsUpdated = true;
+        update();
+    }
+
+    void setValues(const arma::vec &values)
+    {
+        m_values = values;
+        m_valuesUpdated = true;
+        update();
+    }
+
+    void setUpdated(bool updated)
+    {
+        m_colorScaleUpdated = updated;
+        m_pointsUpdated     = updated;
+        m_valuesUpdated     = updated;
+    }
+
+private:
+    bool m_colorScaleUpdated, m_pointsUpdated, m_valuesUpdated;
+    const ColorScale *m_colorScale;
+    arma::mat m_points;
+    arma::vec m_values;
+};
+
+#endif // VORONOISPLAT_H
-- 
cgit v1.2.3