diff options
-rw-r--r-- | main.cpp | 25 | ||||
-rw-r--r-- | main.h | 18 | ||||
-rw-r--r-- | main_view.qml | 88 | ||||
-rw-r--r-- | pm.pro | 34 | ||||
-rw-r--r-- | skelft.cu | 869 | ||||
-rw-r--r-- | skelft.h | 106 | ||||
-rw-r--r-- | skelft_core.cpp | 199 | ||||
-rw-r--r-- | skelftkernel.h | 468 | ||||
-rw-r--r-- | voronoisplat.cpp | 407 | ||||
-rw-r--r-- | voronoisplat.h | 103 |
10 files changed, 2307 insertions, 10 deletions
@@ -12,12 +12,14 @@ #include "mp.h" #include "continuouscolorscale.h" #include "scatterplot.h" +#include "voronoisplat.h" #include "historygraph.h" #include "interactionhandler.h" #include "selectionhandler.h" #include "effectivenessobserver.h" #include "distortionobserver.h" #include "npdistortion.h" +#include "skelft.h" static QObject *mainProvider(QQmlEngine *engine, QJSEngine *scriptEngine) { @@ -155,12 +157,16 @@ int main(int argc, char **argv) m->setSubsample(Ys); qmlRegisterType<Scatterplot>("PM", 1, 0, "Scatterplot"); + qmlRegisterType<VoronoiSplat>("PM", 1, 0, "VoronoiSplat"); qmlRegisterType<HistoryGraph>("PM", 1, 0, "HistoryGraph"); + qmlRegisterType<InteractionHandler>("PM", 1, 0, "InteractionHandler"); qmlRegisterSingletonType<Main>("PM", 1, 0, "Main", mainProvider); // Set up multisampling QSurfaceFormat fmt; fmt.setSamples(16); + fmt.setRenderableType(QSurfaceFormat::OpenGL); + fmt.setVersion(4, 5); QSurfaceFormat::setDefaultFormat(fmt); QQmlApplicationEngine engine(QUrl("qrc:///main_view.qml")); @@ -184,6 +190,8 @@ int main(int argc, char **argv) // subsamplePlot->setColorData(arma::zeros<arma::vec>(subsampleSize)); subsamplePlot->setColorScale(&colorScale); Scatterplot *plot = engine.rootObjects()[0]->findChild<Scatterplot *>("plot"); + VoronoiSplat *splat = engine.rootObjects()[0]->findChild<VoronoiSplat *>("splat"); + skelft2DInitialization(splat->width()); // Keep track of the current subsample (in order to save them later, if requested) QObject::connect(subsamplePlot, SIGNAL(xyChanged(const arma::mat &)), @@ -191,15 +199,22 @@ int main(int argc, char **argv) QObject::connect(subsamplePlot, SIGNAL(xyInteractivelyChanged(const arma::mat &)), m, SLOT(setSubsample(const arma::mat &))); - // Update LAMP projection as the subsample is modified + // Update projection as the subsample is modified InteractionHandler interactionHandler(X, sampleIndices); - interactionHandler.setTechnique(InteractionHandler::TECHNIQUE_LAMP); + m->setInteractionHandler(&interactionHandler); QObject::connect(subsamplePlot, SIGNAL(xyChanged(const arma::mat &)), &interactionHandler, SLOT(setSubsample(const arma::mat &))); QObject::connect(subsamplePlot, SIGNAL(xyInteractivelyChanged(const arma::mat &)), &interactionHandler, SLOT(setSubsample(const arma::mat &))); QObject::connect(&interactionHandler, SIGNAL(subsampleChanged(const arma::mat &)), plot, SLOT(setXY(const arma::mat &))); + m->setTechnique(InteractionHandler::TECHNIQUE_LAMP); + + // Update splat whenever the main plot is also updated + QObject::connect(plot, SIGNAL(xyChanged(const arma::mat &)), + splat, SLOT(setPoints(const arma::mat &))); + QObject::connect(plot, SIGNAL(colorDataChanged(const arma::vec &)), + splat, SLOT(setValues(const arma::vec &))); // Linking between selections in subsample plot and full dataset plot SelectionHandler selectionHandler(sampleIndices); @@ -237,7 +252,11 @@ int main(int argc, char **argv) subsamplePlot->setXY(Ys); subsamplePlot->setColorData(labels(sampleIndices)); plot->setColorScale(&colorScale); + splat->setColorScale(&colorScale); plot->setColorData(labels); - return app.exec(); + auto ret = app.exec(); + + skelft2DDeinitialization(); + return ret; } @@ -4,6 +4,8 @@ #include <QObject> #include <armadillo> +#include "interactionhandler.h" + class Main : public QObject { Q_OBJECT @@ -38,6 +40,16 @@ public: Q_INVOKABLE void setSubsampleSavePath(const std::string &path) { m_subsampleSavePath = path; } Q_INVOKABLE void setSubsampleSavePath(const QString &path) { setSubsampleSavePath(path.toStdString()); } + void setInteractionHandler(InteractionHandler *interactionHandler) { + m_interactionHandler = interactionHandler; + } + + Q_INVOKABLE void setTechnique(int technique) { + if (m_interactionHandler) { + m_interactionHandler->setTechnique((InteractionHandler::Technique) technique); + } + } + arma::mat X() const { return m_dataset.cols(0, m_dataset.n_cols - 2); } arma::vec labels() const { return m_dataset.col(m_dataset.n_cols - 1); } @@ -53,13 +65,17 @@ public slots: } private: - Main(QObject *parent = 0) : QObject(parent) {} + Main(QObject *parent = 0) + : QObject(parent) + , m_interactionHandler(0) + {} ~Main() {} Q_DISABLE_COPY(Main) arma::mat m_dataset, m_subsample; arma::uvec m_subsampleIndices; std::string m_indicesSavePath, m_subsampleSavePath; + InteractionHandler *m_interactionHandler; }; #endif // MAIN_H diff --git a/main_view.qml b/main_view.qml index 0cd29b5..62c3a67 100644 --- a/main_view.qml +++ b/main_view.qml @@ -6,10 +6,13 @@ import QtQuick.Layouts 1.2 import PM 1.0 ApplicationWindow { + id: mainWindow title: "Projection Manipulation" visible: true - width: 900 - height: 600 + minimumWidth: 900 + minimumHeight: 600 + maximumWidth: minimumWidth + maximumHeight: minimumHeight menuBar: MenuBar { Menu { @@ -19,6 +22,26 @@ ApplicationWindow { } Menu { + title: "Technique" + MenuItem { + action: lampTechniqueAction + exclusiveGroup: techniqueGroup + } + MenuItem { + action: lspTechniqueAction + exclusiveGroup: techniqueGroup + } + MenuItem { + action: plmpTechniqueAction + exclusiveGroup: techniqueGroup + } + MenuItem { + action: pekalskaTechniqueAction + exclusiveGroup: techniqueGroup + } + } + + Menu { title: "View" MenuItem { action: labelColorAction @@ -68,10 +91,18 @@ ApplicationWindow { border.width: 1 border.color: "#cccccc" + VoronoiSplat { + id: splat + objectName: "splat" + width: 256 + height: 256 + } + Scatterplot { id: plot objectName: "plot" anchors.fill: parent + visible: false } } } @@ -108,6 +139,51 @@ ApplicationWindow { } ExclusiveGroup { + id: techniqueGroup + + Action { + id: lampTechniqueAction + text: "LAMP" + shortcut: "Ctrl+1" + checked: true + checkable: true + onTriggered: { + Main.setTechnique(InteractionHandler.TECHNIQUE_LAMP) + } + } + + Action { + id: lspTechniqueAction + text: "LSP" + shortcut: "Ctrl+2" + checkable: true + onTriggered: { + Main.setTechnique(InteractionHandler.TECHNIQUE_LSP) + } + } + + Action { + id: plmpTechniqueAction + text: "PLMP" + shortcut: "Ctrl+3" + checkable: true + onTriggered: { + Main.setTechnique(InteractionHandler.TECHNIQUE_PLMP) + } + } + + Action { + id: pekalskaTechniqueAction + text: "Pekalska" + shortcut: "Ctrl+4" + checkable: true + onTriggered: { + Main.setTechnique(InteractionHandler.TECHNIQUE_PEKALSKA) + } + } + } + + ExclusiveGroup { id: coloringGroup Action { @@ -116,7 +192,7 @@ ApplicationWindow { shortcut: "Shift+L" checked: true checkable: true - onTriggered: console.log("Labels") + onTriggered: console.log("stub: Labels") } Action { @@ -124,7 +200,7 @@ ApplicationWindow { text: "Neighborhood Preservation" shortcut: "Shift+N" checkable: true - onTriggered: console.log("NP") + onTriggered: console.log("stub: NP") } Action { @@ -132,7 +208,7 @@ ApplicationWindow { text: "Stress" shortcut: "Shift+S" checkable: true - onTriggered: console.log("Stress") + onTriggered: console.log("stub: Stress") } Action { @@ -140,7 +216,7 @@ ApplicationWindow { text: "Silhouette" shortcut: "Shift+T" checkable: true - onTriggered: console.log("Silhouette") + onTriggered: console.log("stub: Silhouette") } } } @@ -1,5 +1,7 @@ QT += qml quick widgets +CONFIG += qt debug + QMAKE_CXXFLAGS += -std=c++11 -fopenmp QMAKE_LIBS += -larmadillo -fopenmp HEADERS += main.h \ @@ -8,6 +10,7 @@ HEADERS += main.h \ geometry.h \ scale.h \ scatterplot.h \ + voronoisplat.h \ historygraph.h \ interactionhandler.h \ selectionhandler.h \ @@ -15,18 +18,22 @@ HEADERS += main.h \ distortionobserver.h \ distortionmeasure.h \ npdistortion.h \ + skelft.h \ + skelftkernel.h mp.h SOURCES += main.cpp \ colorscale.cpp \ continuouscolorscale.cpp \ geometry.cpp \ scatterplot.cpp \ + voronoisplat.cpp \ historygraph.cpp \ interactionhandler.cpp \ selectionhandler.cpp \ effectivenessobserver.cpp \ distortionobserver.cpp \ npdistortion.cpp \ + skelft_core.cpp \ lamp.cpp \ plmp.cpp \ knn.cpp \ @@ -34,6 +41,33 @@ SOURCES += main.cpp \ tsne.cpp \ measures.cpp \ dist.cpp + +OTHER_FILES += skelft.cu + +# Cuda settings +CUDA_SOURCES += skelft.cu +CUDA_DIR = "/opt/cuda" +CUDA_ARCH = sm_30 +NVCC_OPTIONS += --use_fast_math +SYSTEM_TYPE = 64 # Either 64 or empty +INCLUDEPATH += $$CUDA_DIR/include +QMAKE_LIBDIR += $$CUDA_DIR/lib$$SYSTEM_TYPE +LIBS += -lcuda -lcudart + +CONFIG(debug, debug|release) { + cuda_dbg.input = CUDA_SOURCES + cuda_dbg.output = ${QMAKE_FILE_BASE}_cuda.o + cuda_dbg.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG -g $$NVCC_OPTIONS -I$$INCLUDEPATH $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} + cuda_dbg.dependency_type = TYPE_C + QMAKE_EXTRA_COMPILERS += cuda_dbg +} else { + cuda.input = CUDA_SOURCES + cuda.output = ${QMAKE_FILE_BASE}_cuda.o + cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS -I$$INCLUDEPATH $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} + cuda.dependency_type = TYPE_C + QMAKE_EXTRA_COMPILERS += cuda +} + RESOURCES += pm.qrc target.path = . diff --git a/skelft.cu b/skelft.cu new file mode 100644 index 0000000..be7662c --- /dev/null +++ b/skelft.cu @@ -0,0 +1,869 @@ +#include <device_functions.h> +#include "skelft.h" +#include <cstdio> +#include <iostream> + + +#include <thrust/host_vector.h> +#include <thrust/device_vector.h> +#include <thrust/sort.h> + + + +using namespace std; + + +// Parameters for CUDA kernel executions; more or less optimized for a 1024x1024 image. +#define BLOCKX 16 +#define BLOCKY 16 +#define BLOCKSIZE 64 +#define TILE_DIM 32 +#define BLOCK_ROWS 16 + + + +/****** Global Variables *******/ +const int NB = 7; // Nr buffers we use and store in the entire framework +short2 **pbaTextures; // Work buffers used to compute and store resident images +// 0: work buffer +// 1: FT +// 2: thresholded DT or exact floating-point DT +// 3: thresholded skeleton +// 4: topology analysis +// 5: work buffer for topology +// 6: skeleton FT +// + +float* pbaTexSiteParam; // Stores boundary parameterization +float2* pbaTexAdvectSites; +float* pbaTexDensityData; // The density map of the sites +float2* pbaTexGradientData; // The gradient of the density map + +int pbaTexSize; // Texture size (squared) actually used in all computations +int floodBand = 4, // Various FT computation parameters; defaults are good for an 1024x1024 image. + maurerBand = 4, + colorBand = 4; + +texture<short2> pbaTexColor; // 2D textures (bound to various buffers defined above as needed) +texture<short2> pbaTexColor2; +texture<short2> pbaTexLinks; +texture<float> pbaTexParam; // 1D site parameterization texture (bound to pbaTexSiteParam) +texture<unsigned char> + pbaTexGray; // 2D texture of unsigned char values, e.g. the binary skeleton +texture<float2> pbaTexAdvect; +texture<float,cudaTextureType2D,cudaReadModeElementType> pbaTexDensity; +texture<float2,cudaTextureType2D,cudaReadModeElementType> pbaTexGradient; +texture<float,cudaTextureType2D,cudaReadModeElementType> pbaTexDT; + + +__device__ bool fill_gc; //Indicates if a fill-sweep did fill anything or not + + +#if __CUDA_ARCH__ < 110 // We cannot use atomic intrinsics on SM10 or below. Thus, we define these as nop. +#define atomicInc(a,b) 0 // The default will be that some code e.g. endpoint detection will thus not do anything. +#endif + + + +/********* Kernels ********/ +#include "skelftkernel.h" + + + +// Initialize necessary memory (CPU/GPU sides) +// - textureSize: The max size of any image we will process until re-initialization +void skelft2DInitialization(int maxTexSize) +{ + int pbaMemSize = maxTexSize * maxTexSize * sizeof(short2); // A buffer has 2 shorts / pixel + pbaTextures = (short2**) malloc(NB * sizeof(short2*)); // We will use NB buffers + + for(int i=0;i<NB;++i) + cudaMalloc((void **) &pbaTextures[i], pbaMemSize); // Allocate work buffer 'i' (on GPU) + + cudaMalloc((void**) &pbaTexSiteParam, maxTexSize * maxTexSize * sizeof(float)); // Sites texture (for FT) + cudaMalloc((void**) &pbaTexAdvectSites, maxTexSize * maxTexSize * sizeof(float2)); // Sites 2D-coords + cudaMalloc((void**) &pbaTexDensityData, maxTexSize * maxTexSize * sizeof(float)); + cudaMalloc((void**) &pbaTexGradientData, maxTexSize * maxTexSize * sizeof(float2)); +} + + + + +// Deallocate all allocated memory +void skelft2DDeinitialization() +{ + for(int i=0;i<NB;++i) cudaFree(pbaTextures[i]); + cudaFree(pbaTexSiteParam); + cudaFree(pbaTexAdvectSites); + cudaFree(pbaTexDensityData); + cudaFree(pbaTexGradientData); + free(pbaTextures); +} + + +// Initialize the Voronoi textures from the sites' encoding texture (parameterization) +// REMARK: we interpret 'inputVoro' as a 2D texture, as it's much easier/faster like this +__global__ void kernelSiteParamInit(short2* inputVoro, int size) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx<size && ty<size) // Careful not to go outside the image.. + { + int i = TOID(tx,ty,size); + // The sites-param has non-zero (parameter) values precisely on boundary points + float param = tex1Dfetch(pbaTexParam,i); + + short2& v = inputVoro[i]; + // Non-boundary points are marked as 0 in the parameterization. Here we will compute the FT. + v.x = v.y = MARKER; + // These are points which define the 'sites' to compute the FT/skeleton (thus, have FT==identity) + // We could use an if-then-else here, but it's faster with an if-then + if (param>0) + { + v.x = tx; + v.y = ty; + } + } +} + + + +void skelft2DInitializeInput(float* sites, int size) // Copy input sites from CPU to GPU; Also set up site param initialization in pbaTextures[0] +{ + pbaTexSize = size; // Size of the actual texture being used in this run; can be smaller than the max-tex-size + // which was used in skelft2DInitialization() + + cudaMemcpy(pbaTexSiteParam, sites, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyHostToDevice); + // Pass sites parameterization to CUDA. Must be done before calling the initialization + // kernel, since we use the sites-param as a texture in that kernel + cudaBindTexture(0, pbaTexParam, pbaTexSiteParam); // Bind the sites-param as a 1D texture so we can quickly index it next + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + kernelSiteParamInit<<<grid,block>>>(pbaTextures[0],pbaTexSize); // Do the site param initialization. This sets up pbaTextures[0] + cudaUnbindTexture(pbaTexParam); +} + + + + + +// In-place transpose a squared texture. +// Block orders are modified to optimize memory access. +// Point coordinates are also swapped. +void pba2DTranspose(short2 *texture) +{ + dim3 block(TILE_DIM, BLOCK_ROWS); + dim3 grid(pbaTexSize / TILE_DIM, pbaTexSize / TILE_DIM); + + cudaBindTexture(0, pbaTexColor, texture); + kernelTranspose<<< grid, block >>>(texture, pbaTexSize); + cudaUnbindTexture(pbaTexColor); +} + +// Phase 1 of PBA. m1 must divides texture size +void pba2DPhase1(int m1, short xm, short ym, short xM, short yM) +{ + dim3 block = dim3(BLOCKSIZE); + dim3 grid = dim3(pbaTexSize / block.x, m1); + + // Flood vertically in their own bands + cudaBindTexture(0, pbaTexColor, pbaTextures[0]); + kernelFloodDown<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); + cudaUnbindTexture(pbaTexColor); + + cudaBindTexture(0, pbaTexColor, pbaTextures[1]); + kernelFloodUp<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); + + // Passing information between bands + grid = dim3(pbaTexSize / block.x, m1); + kernelPropagateInterband<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m1); + + cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); + kernelUpdateVertical<<< grid, block >>>(pbaTextures[1], pbaTexSize, m1, pbaTexSize / m1); + cudaUnbindTexture(pbaTexLinks); + cudaUnbindTexture(pbaTexColor); +} + +// Phase 2 of PBA. m2 must divides texture size +void pba2DPhase2(int m2) +{ + // Compute proximate points locally in each band + dim3 block = dim3(BLOCKSIZE); + dim3 grid = dim3(pbaTexSize / block.x, m2); + cudaBindTexture(0, pbaTexColor, pbaTextures[1]); + kernelProximatePoints<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); + + cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); + kernelCreateForwardPointers<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); + + // Repeatly merging two bands into one + for (int noBand = m2; noBand > 1; noBand /= 2) { + grid = dim3(pbaTexSize / block.x, noBand / 2); + kernelMergeBands<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / noBand); + } + + // Replace the forward link with the X coordinate of the seed to remove + // the need of looking at the other texture. We need it for coloring. + grid = dim3(pbaTexSize / block.x, pbaTexSize); + kernelDoubleToSingleList<<< grid, block >>>(pbaTextures[0], pbaTexSize); + cudaUnbindTexture(pbaTexLinks); + cudaUnbindTexture(pbaTexColor); +} + +// Phase 3 of PBA. m3 must divides texture size +void pba2DPhase3(int m3) +{ + dim3 block = dim3(BLOCKSIZE / m3, m3); + dim3 grid = dim3(pbaTexSize / block.x); + cudaBindTexture(0, pbaTexColor, pbaTextures[0]); + kernelColor<<< grid, block >>>(pbaTextures[1], pbaTexSize); + cudaUnbindTexture(pbaTexColor); +} + + + +void skel2DFTCompute(short xm, short ym, short xM, short yM, int floodBand, int maurerBand, int colorBand) +{ + pba2DPhase1(floodBand,xm,ym,xM,yM); //Vertical sweep + + pba2DTranspose(pbaTextures[1]); // + + pba2DPhase2(maurerBand); //Horizontal coloring + + pba2DPhase3(colorBand); //Row coloring + + pba2DTranspose(pbaTextures[1]); +} + + + + + +__global__ void kernelThresholdDT(unsigned char* output, int size, float threshold2, short xm, short ym, short xM, short yM) +//Input: pbaTexColor: closest-site-ids per pixel, i.e. FT +//Output: output: thresholded DT +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx>xm && ty>ym && tx<xM && ty<yM) //careful not to index outside the image.. + { + int id = TOID(tx, ty, size); + short2 voroid = tex1Dfetch(pbaTexColor,id); //get the closest-site to tx,ty into voroid.x,.y + float d2 = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y); + output[id] = (d2<=threshold2); //threshold DT into binary image + } +} + + + +__global__ void kernelDT(float* output, int size, short xm, short ym, short xM, short yM) +//Input: pbaTexColor: closest-site-ids per pixel, i.e. FT +//Output: output: DT +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx>xm && ty>ym && tx<xM && ty<yM) //careful not to index outside the image.. + { + int id = TOID(tx, ty, size); + short2 voroid = tex1Dfetch(pbaTexColor,id); //get the closest-site to tx,ty into voroid.x,.y + float d2 = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y); + output[id] = sqrtf(d2); //save the Euclidean DT + } +} + + +__global__ void kernelSkel(unsigned char* output, short xm, short ym, + short xM, short yM, short size, float threshold, float length) + //Input: pbaTexColor: closest-site-ids per pixel + // pbaTexParam: labels for sites (only valid at site locations) +{ //Output: output: binary thresholded skeleton + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx>xm && ty>ym && tx<xM && ty<yM) + { + int id = TOID(tx, ty, size); + int Id = id; + short2 voroid = tex1Dfetch(pbaTexColor,id); //get the closest-site to tx,ty into voroid.x,.y + int id2 = TOID(voroid.x,voroid.y,size); //convert the site's coord to an index into pbaTexParam[], the site-label-texture + float imp = tex1Dfetch(pbaTexParam,id2); //get the site's label + + ++id; //TOID(tx+1,ty,size) + voroid = tex1Dfetch(pbaTexColor,id); // + id2 = TOID(voroid.x,voroid.y,size); // + float imp_r = tex1Dfetch(pbaTexParam,id2); // + + id += size-1; //TOID(tx,ty+1,size) + voroid = tex1Dfetch(pbaTexColor,id); // + id2 = TOID(voroid.x,voroid.y,size); // + float imp_u = tex1Dfetch(pbaTexParam,id2); // + + float imp_dx = fabsf(imp_r-imp); + float imp_dy = fabsf(imp_u-imp); + float Imp = max(imp_dx,imp_dy); + Imp = min(Imp,fabsf(length-Imp)); + if (Imp>=threshold) output[Id] = 1; //By filling only 1-values, we reduce memory access somehow (writing to output[] is expensive) + } + + //WARNING: this kernel may sometimes creates 2-pixel-thick branches.. Study the AFMM original code to see if this is correct. +} + + + +#define X 1 + +__constant__ const //REMARK: put following constants (for kernelTopology) in CUDA constant-memory, as this gives a huge speed difference +unsigned char topo_patterns[][9] = { {0,0,0, //These are the 3x3 templates that we use to detect skeleton endpoints + 0,X,0, //(with four 90-degree rotations for each) + 0,X,0}, + {0,0,0, + 0,X,0, + 0,0,X}, + {0,0,0, + 0,X,0, + 0,X,X}, + {0,0,0, + 0,X,0, + X,X,0} + }; + +#define topo_NPATTERNS 4 //Number of patterns we try to match (for kernelTopology) + //REMARK: #define faster than __constant__ + +__constant__ const unsigned char topo_rot[][9] = { {0,1,2,3,4,5,6,7,8}, {2,5,8,1,4,7,0,3,6}, {8,7,6,5,4,3,2,1,0}, {6,3,0,7,4,1,8,5,2} }; + //These encode the four 90-degree rotations of the patterns (for kernelTopology); + +__device__ unsigned int topo_gc = 0; +__device__ unsigned int topo_gc_last = 0; + + +__global__ void kernelTopology(unsigned char* output, short2* output_set, short xm, short ym, short xM, short yM, short size, int maxpts) +{ + const int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int ty = blockIdx.y * blockDim.y + threadIdx.y; + + unsigned char t[9]; + + if (tx>xm && ty>ym && tx<xM-1 && ty<yM-1) //careful not to index outside the image; take into account the template size too + { + int id = TOID(tx, ty, size); + unsigned char p = tex1Dfetch(pbaTexGray,id); //get the skeleton pixel at tx,ty + if (p) //if the pixel isn't skeleton, nothing to do + { + unsigned char idx=0; + for(int j=ty-1;j<=ty+1;++j) //read the template into t[] for easier use + { + int id = TOID(tx-1, j, size); + for(int i=0;i<=2;++i,++id,++idx) + t[idx] = tex1Dfetch(pbaTexGray,id); //get the 3x3 template centered at the skel point tx,ty + } + + for(unsigned char r=0;r<4;++r) //try to match all rotations of a pattern: + { + const unsigned char* rr = topo_rot[r]; + for(unsigned char p=0;p<topo_NPATTERNS;++p) //try to match all patterns: + { + const unsigned char* pat = topo_patterns[p]; + unsigned char j = (p==0)? 0 : 7; //Speedup: for all patterns except 1st, check only last 3 entries, the first 6 are identical for all patterns + for(;j<9;++j) //try to match rotated pattern vs actual pattern + if (pat[j]!=t[rr[j]]) break; //this rotation failed + if (j<6) break; //Speedup: if we have a mismatch on the 1st 6 pattern entries, then none of the patterns can match + // since all templates have the same first 6 entries. + + if (j==9) //this rotation succeeded: mark the pixel as a topology event and we're done + { + int crt_gc = atomicInc(&topo_gc,maxpts); //REMARK: this serializes (compacts) all detected endpoints in one array. + output_set[crt_gc] = make_short2(tx,ty); //To do this, we use an atomic read-increment-return on a global counter, + //which is guaranteed to give all threads unique consecutive indexes in the array. + output[id] = 1; //Also create the topology image + return; + } + } + } + } + } + else //Last thread: add zero-marker to the output point-set, so the reader knows how many points are really in there + if (tx==xM-1 && ty==yM-1) //Also reset the global vector counter topo_gc, for the next parallel-run of this function + { topo_gc_last = topo_gc; topo_gc = 0; } //We do this in the last thread so that no one modifies topo_gc from now on. + //REMARK: this seems to be the only way I can read a __device__ variable back to the CPU +} + + + + +void skelft2DParams(int floodBand_, int maurerBand_, int colorBand_) //Set up some params of the FT algorithm +{ + floodBand = floodBand_; + maurerBand = maurerBand_; + colorBand = colorBand_; +} + + + + + +// Compute 2D FT / Voronoi diagram of a set of sites +// siteParam: Site parameterization. 0 = non-site points; >0 = site parameter value. +// output: FT. The (x,y) at (i,j) are the coords of the closest site to (i,j) +// size: Texture size (pow 2) +void skelft2DFT(short* output, float* siteParam, short xm, short ym, short xM, short yM, int size) +{ + skelft2DInitializeInput(siteParam,size); // Initialization of already-allocated data structures + + skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand); // Compute FT + + // Copy FT to CPU, if required + if (output) cudaMemcpy(output, pbaTextures[1], size*size*sizeof(short2), cudaMemcpyDeviceToHost); +} + + + + + + +void skelft2DDT(float* outputDT, short xm, short ym, short xM, short yM) //Compute exact DT from resident FT (in pbaTextures[1]) +{ + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + + cudaBindTexture(0, pbaTexColor, pbaTextures[1]); //Used to read the FT from + + xm = ym = 0; xM = yM = pbaTexSize-1; + kernelDT <<< grid, block >>>((float*)pbaTextures[2], pbaTexSize, xm-1, ym-1, xM+1, yM+1); + cudaUnbindTexture(pbaTexColor); + + //Copy DT to CPU + if (outputDT) cudaMemcpy(outputDT, pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost); +} + + + +void skelft2DDT(short* outputDT, float threshold, //Compute (thresholded) DT (into pbaTextures[2]) from resident FT (in pbaTextures[1]) + short xm, short ym, short xM, short yM) +{ + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + cudaBindTexture(0, pbaTexColor, pbaTextures[1]); //Used to read the FT from + + if (threshold>=0) + { + xm -= (short)threshold; if (xm<0) xm=0; + ym -= (short)threshold; if (ym<0) ym=0; + xM += (short)threshold; if (xM>pbaTexSize-1) xM=pbaTexSize-1; + yM += (short)threshold; if (yM>pbaTexSize-1) yM=pbaTexSize-1; + + kernelThresholdDT<<< grid, block >>>((unsigned char*)pbaTextures[2], pbaTexSize, threshold*threshold, xm-1, ym-1, xM+1, yM+1); + cudaUnbindTexture(pbaTexColor); + + //Copy thresholded image to CPU + if (outputDT) cudaMemcpy(outputDT, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); + } +} + + + + +void skelft2DSkeleton(unsigned char* outputSkel, float length, float threshold, //Compute thresholded skeleton (into pbaTextures[3]) from resident FT (in pbaTextures[1]) + short xm,short ym,short xM,short yM) +{ //length: boundary length + dim3 block = dim3(BLOCKX,BLOCKY); //threshold: skeleton importance min-value (below this, we ignore branches) + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + cudaBindTexture(0, pbaTexColor, pbaTextures[1]); //Used to read the resident FT + cudaBindTexture(0, pbaTexParam, pbaTexSiteParam); //Used to read the resident boundary parameterization + cudaMemset(pbaTextures[3],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize); //Faster to zero result and then fill only 1-values (see kernel) + + kernelSkel<<< grid, block >>>((unsigned char*)pbaTextures[3], xm, ym, xM-1, yM-1, pbaTexSize, threshold, length); + + cudaUnbindTexture(pbaTexColor); + cudaUnbindTexture(pbaTexParam); + + //Copy skeleton to CPU + if (outputSkel) cudaMemcpy(outputSkel, pbaTextures[3], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); +} + + + + +void skelft2DTopology(unsigned char* outputTopo, int* npts, short* outputPoints, //Compute topology-points of the resident skeleton (in pbaTextures[3]) + short xm,short ym,short xM,short yM) +{ + int maxpts = (npts)? *npts : pbaTexSize*pbaTexSize; //This is the max # topo-points we are going to return in outputPoints[] + + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + cudaBindTexture(0, pbaTexGray, pbaTextures[3]); //Used to read the resident skeleton + cudaMemset(pbaTextures[4],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize); //Faster to zero result and then fill only 1-values (see kernel) + + unsigned int zero = 0; + cudaMemcpyToSymbol(topo_gc,&zero,sizeof(unsigned int),0,cudaMemcpyHostToDevice); //Set topo_gc to 0 + + kernelTopology<<< grid, block >>>((unsigned char*)pbaTextures[4], pbaTextures[5], xm, ym, xM, yM, pbaTexSize, maxpts+1); + + cudaUnbindTexture(pbaTexGray); + + if (outputPoints && maxpts) //If output-point vector desired, copy the end-points, put in pbaTexture[5] as a vector of short2's, + { //into caller space. We copy only 'maxpts' elements, as the user instructed us. + unsigned int num_pts; + cudaMemcpyFromSymbol(&num_pts,topo_gc_last,sizeof(unsigned int),0,cudaMemcpyDeviceToHost); //Get #topo-points we have detected from the device-var from CUDA + if (npts && num_pts) //Copy the topo-points to caller + cudaMemcpy(outputPoints,pbaTextures[5],num_pts*sizeof(short2),cudaMemcpyDeviceToHost); + if (npts) *npts = num_pts; //Return #detected topo-points to caller + } + + if (outputTopo) //If topology image desired, copy it into user space + cudaMemcpy(outputTopo,pbaTextures[4],pbaTexSize*pbaTexSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); +} + + + + +__global__ void kernelSiteFromSkeleton(short2* outputSites, int size) //Initialize the Voronoi textures from the sites' encoding texture (parameterization) +{ //REMARK: we interpret 'inputVoro' as a 2D texture, as it's much easier/faster like this + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx<size && ty<size) //Careful not to go outside the image.. + { + int i = TOID(tx,ty,size); + unsigned char param = tex1Dfetch(pbaTexGray,i); //The sites-param has non-zero (parameter) values precisely on non-boundary points + + short2& v = outputSites[i]; + v.x = v.y = MARKER; //Non-boundary points are marked as 0 in the parameterization. Here we will compute the FT. + if (param) //These are points which define the 'sites' to compute the FT/skeleton (thus, have FT==identity) + { //We could use an if-then-else here, but it's faster with an if-then + v.x = tx; v.y = ty; + } + } +} + + + + +__global__ void kernelSkelInterpolate(float* output, int size) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx<size && ty<size) //Careful not to go outside the image.. + { + int id = TOID(tx, ty, size); + short2 vid = tex1Dfetch(pbaTexColor,id); + float T = sqrtf((tx-vid.x)*(tx-vid.x)+(ty-vid.y)*(ty-vid.y)); + short2 vid2 = tex1Dfetch(pbaTexColor2,id); + float D = sqrtf((tx-vid2.x)*(tx-vid2.x)+(ty-vid2.y)*(ty-vid2.y)); + float B = ((D)? min(T/2/D,0.5f):0.5) + 0.5*((T)? max(1-D/T,0.0f):0); + output[id] = pow(B,0.2f); + } +} + + + + +void skel2DSkeletonDT(float* outputSkelDT,short xm,short ym,short xM,short yM) +{ + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + cudaBindTexture(0,pbaTexGray,pbaTextures[3]); //Used to read the resident binary skeleton + kernelSiteFromSkeleton<<<grid,block>>>(pbaTextures[0],pbaTexSize); //1. Init pbaTextures[0] with sites on skeleton i.e. from pbaTexGray + cudaUnbindTexture(pbaTexGray); + + //Must first save pbaTextures[1] since we may need it later.. + cudaMemcpy(pbaTextures[5],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice); + skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand); //2. Compute FT of the skeleton into pbaTextures[6] + cudaMemcpy(pbaTextures[6],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice); + cudaMemcpy(pbaTextures[1],pbaTextures[5],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice); + + //Compute interpolation + cudaBindTexture(0,pbaTexColor,pbaTextures[1]); // FT of boundary + cudaBindTexture(0,pbaTexColor2,pbaTextures[6]); // FT of skeleton + kernelSkelInterpolate<<<grid,block>>>((float*)pbaTextures[0],pbaTexSize); + cudaUnbindTexture(pbaTexColor); + cudaUnbindTexture(pbaTexColor2); + if (outputSkelDT) cudaMemcpy(outputSkelDT, pbaTextures[0], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost); +} + + + + + +__global__ void kernelFill(unsigned char* output, int size, unsigned char bg, unsigned char fg, short xm, short ym, short xM, short yM, bool ne) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx>xm && ty>ym && tx<xM && ty<yM) //careful not to index outside the image.. + { + int id0 = TOID(tx, ty, size); + unsigned char val = tex1Dfetch(pbaTexGray,id0); // + if (val==fg) //do we have a filled pixel? Then fill all to left/top/up/bottom of it which is background + { + bool fill = false; + int id = id0; + if (ne) //fill in north-east direction: + { + for(short x=tx+1;x<xM;++x) //REMARK: here and below, the interesting thing is that it's faster, by about 10-15%, to fill a whole + { // scanline rather than oly until the current block's borders (+1). The reason is that filling a whole + // scanline decreases the total #sweeps, which seems to be the limiting speed factor + if (tex1Dfetch(pbaTexGray,++id)!=bg) break; + output[id] = fg; fill = true; + } + + id = id0; + for(short y=ty-1;y>ym;--y) + { + if (tex1Dfetch(pbaTexGray,id-=size)!=bg) break; + output[id] = fg; fill = true; + } + } + else //fill in south-west direction: + { + for(short x=tx-1;x>xm;--x) + { + if (tex1Dfetch(pbaTexGray,--id)!=bg) break; + output[id] = fg; fill = true; + } + + id = id0; + for(short y=ty+1;y<yM;++y) + { + if (tex1Dfetch(pbaTexGray,id+=size)!=bg) break; + output[id] = fg; fill = true; + } + } + + if (fill) fill_gc = true; //if we filled anything, inform caller; we 'gather' this info from a local var into the + //global var here, since it's faster than writing the global var in the for loops + } + } +} + + + + +__global__ void kernelFillHoles(unsigned char* output, int size, unsigned char bg, unsigned char fg, unsigned char fill_fg) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + if (tx>=0 && ty>=0 && tx<size && ty<size) //careful not to index outside the image.. + { + int id = TOID(tx, ty, size); + unsigned char val = tex1Dfetch(pbaTexGray,id); // + if (val==fill_fg) + output[id] = bg; + else if (val==bg) + output[id] = fg; + } +} + + +int skelft2DFill(unsigned char* outputFill, short sx, short sy, short xm, short ym, short xM, short yM, unsigned char fill_value) +{ + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + unsigned char background; + int id = sy * pbaTexSize + sx; + cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value we have to fill from (sx,sy) + + cudaMemset(((unsigned char*)pbaTextures[2])+id,fill_value,sizeof(unsigned char)); //Fill the seed (x,y) on the GPU + + cudaBindTexture(0, pbaTexGray, pbaTextures[2]); //Used to read the thresholded DT + + int iter=0; + bool xy = true; //Direction of filling for current sweep: either north-east or south-west + //This kind of balances the memory-accesses nicely over kernel calls + for(;;++iter,xy=!xy) //Keep filling a sweep at a time until we have no background pixels anymore + { + bool filled = false; //Initialize flag: we didn't fill anything in this sweep + cudaMemcpyToSymbol(fill_gc,&filled,sizeof(bool),0,cudaMemcpyHostToDevice); //Pass flag to CUDA + kernelFill<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,fill_value,xm,ym,xM,yM,xy); + //One fill sweep + cudaMemcpyFromSymbol(&filled,fill_gc,sizeof(bool),0,cudaMemcpyDeviceToHost); //See if we filled anything in this sweep + if (!filled) break; //Nothing filled? Then we're done, the image didn't change + } + cudaUnbindTexture(pbaTexGray); + + if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); + + return iter; //Return #iterations done for the fill - useful as a performance measure for caller +} + + + +int skelft2DFillHoles(unsigned char* outputFill, short sx, short sy, unsigned char foreground) +{ + unsigned char background; + unsigned char fill_value = 128; + int id = sy * pbaTexSize + sx; + cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value at (sx,sy) + + int iter = skelft2DFill(0,sx,sy,0,0,pbaTexSize,pbaTexSize,fill_value); //First, fill the background surrounding the image with some special value + + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + + cudaBindTexture(0, pbaTexGray, pbaTextures[2]); //Used to read the thresholded DT + + kernelFillHoles<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,foreground,fill_value); + + cudaUnbindTexture(pbaTexGray); + + if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost); + + return iter; +} + + + +//-------- Advection-related code ---------------------------------------------------------------------- + + +void advect2DSetSites(float* sites, int nsites) +{ + cudaMemcpy(pbaTexAdvectSites, sites, nsites * sizeof(float2), cudaMemcpyHostToDevice); //Copy side-coords from CPU->GPU +} + +//#define KERNEL(radius,dist) expf(-10*dist/radius) +#define KERNEL(radius,dist) ((radius-dist)/radius) + + + +__global__ void kernelSplat(float* output, int size, int numpts, float d) +//REMARK: This could be done tens of times faster using OpenGL (draw point-sprites with a 2D distance-kernel texture) +{ + int offs = blockIdx.x * blockDim.x + threadIdx.x; + + if (offs < numpts) //careful not to index outside the site-vector.. + { + float2 p = tex1Dfetch(pbaTexAdvect,offs); //splat the site whose coords are at location offs in pbaTexAdvect[] + + float d2 = d*d; + short tx = p.x, ty = p.y; //REMARK: if I make these float, then something gets ccrewed up - why? + short jmin = ty-d; + short jmax = ty+d; + + for(short j=jmin;j<=jmax;++j) //bounding-box of the splat at 'skel' + { + float dy = (j-ty)*(j-ty); //precalc this for speed + short imin = tx-d; + short imax = tx+d; + + int offs = __mul24(int(j),size); + + for(short i=imin;i<=imax;++i) //REMARK: this could be made ~1.5x faster by computing the x-scanline intersection (min,max) with the d2-circle.. + { + float r2 = dy+(i-tx)*(i-tx); + if (r2<=d2) //check we're really inside the splat + { + int id = int(i)+offs; + float val = KERNEL(d2,r2); + output[id] += val; //Accumulate density (WARNING: hope this is thread-safe!!) + //If not, I need atomicAdd() on floats which requires CUDA 2.0 + } + } + } + } +} + + +inline __device__ float2 computeGradient(const float2& p) //Compute -gradient of density at p +{ + float v_d = tex2D(pbaTexDensity,p.x,p.y-1); + float v_l = tex2D(pbaTexDensity,p.x-1,p.y); + float v_r = tex2D(pbaTexDensity,p.x+1,p.y); + float v_t = tex2D(pbaTexDensity,p.x,p.y+1); + + return make_float2((v_l-v_r)/2,(v_d-v_t)/2); +} + + +inline __device__ float2 computeGradientExplicit(float* density, const float2& p, int size) //Compute gradient of density at p +{ + int x = p.x, y = p.y; + float v_d = density[TOID(x,y-1,size)]; + float v_l = density[TOID(x-1,y,size)]; + float v_r = density[TOID(x+1,y,size)]; + float v_t = density[TOID(x,y+1,size)]; + + return make_float2((v_r-v_l)/2,(v_t-v_d)/2); +} + + +__global__ void kernelGradient(float2* grad, int size) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + + const float eps = 0.0001; + + if (tx<size && ty<size) //Careful not to go outside the image.. + { + float2 p = make_float2(tx,ty); + float2 g = computeGradient(p); + + float gn = sqrtf(g.x*g.x+g.y*g.y); //robustly normalize the gradient + g.x /= gn+eps; g.y /= gn+eps; + + grad[TOID(tx,ty,size)] = g; + } +} + + +void advect2DSplat(float* output, int num_pts, float radius) +{ + dim3 block = dim3(BLOCKX*BLOCKY); //Prepare the splatting kernel: this operates on a vector of 2D sites + int numpts_b = (num_pts/block.x+1)*block.x; //Find higher multiple of blocksize than # sites + dim3 grid = dim3(numpts_b/block.x); + + cudaMemset(pbaTexDensityData,0,sizeof(float)*pbaTexSize*pbaTexSize); //Zero the density texture + cudaBindTexture(0,pbaTexAdvect,pbaTexAdvectSites); //Bind sites to a texture + + kernelSplat<<< grid, block >>>(pbaTexDensityData, pbaTexSize, num_pts, radius); + //Splat kernel + cudaThreadSynchronize(); + + cudaUnbindTexture(pbaTexAdvect); //Done with the sites + //Copy splat-map to CPU (if desired) + if (output) cudaMemcpy(output, pbaTexDensityData, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost); +} + + + +void advect2DGetSites(float* output, int num_pts) //Get sites CUDA->CPU +{ + cudaMemcpy(output,pbaTexAdvectSites,num_pts*sizeof(float2),cudaMemcpyDeviceToHost); +} + + +void advect2DGradient() //Compute and save gradient of density map +{ + dim3 block = dim3(BLOCKX,BLOCKY); + dim3 grid = dim3(pbaTexSize/block.x,pbaTexSize/block.y); + + cudaMemset(pbaTexGradientData,0,sizeof(float2)*pbaTexSize*pbaTexSize); //Zero the gradient texture + + pbaTexDensity.filterMode = cudaFilterModeLinear; //The floating-point density map + pbaTexDensity.normalized = false; + cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat); + cudaBindTexture2D(0,pbaTexDensity,pbaTexDensityData,channelDesc1,pbaTexSize,pbaTexSize,4*pbaTexSize); + + kernelGradient<<< grid, block >>>(pbaTexGradientData, pbaTexSize); //Gradient kernel + + cudaUnbindTexture(pbaTexDensity); +} + + + + + diff --git a/skelft.h b/skelft.h new file mode 100644 index 0000000..e426053 --- /dev/null +++ b/skelft.h @@ -0,0 +1,106 @@ +// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion +// and Dilation, and Flood Filling Toolkit +// +//(c) Alexandru Telea, Univ. of Groningen, 2011 +//==================================================================================================================== +#pragma once + +// Given an image of size nx x ny that we want to process, CUDA may need to use +// a larger image (e.g. pow 2) internally +// to handle our image. This returns the size of this larger image. Since we get +// back data from CUDA at this size, +// we need to know about the size to allocate all our app-side buffers. +int skelft2DSize(int nx, int ny); + +// Initialize CUDA and allocate memory +// textureSize is 2^k with k >= 6 +void skelft2DInitialization(int textureSize); + +// Deallocate all memory on GPU +void skelft2DDeinitialization(); + +// Set various FT computation params. Can be called before each FT computation +void skelft2DParams(int phase1Band, int phase2Band, int phase3Band); + +// Compute 2D feature transform (or Voronoi diagram) +// siteParam: 2D texture site parameterization. 0 = non-site pixel; >0 = site +// parameter at current pixel. +// output: 2D texture FT. Gives coords (i,j) of closest site to each pixel. +// If output==0, the FT is still computed and stored on CUDA, but not +// passed back to CPU. +// size: +void skelft2DFT(short *output, float *siteParam, short xm, short ym, short xM, + short yM, int size); + +// Compute thresholded skeleton of in-CUDA-memory FT. +// length: max value of site parameter (needed for normalization) +// threshold: threshold for the skeleton importance, like in the AFMM algorithm +// output: binary thresholded skeleton (0=background,1=skeleton) +void skelft2DSkeleton(unsigned char *output, float length, float threshold, + short xm, short ym, short xM, short yM); + +// Compute thresholded DT of in-CUDA-memory FT. +// threshold: upper value for DT +// output: binary thresholded DT (1=larger than threshold,0=otherwise) +void skelft2DDT(short *output, float threshold, short xm, short ym, short xM, + short yM); + +// Compute exact DT of in-CUDA-memory FT. +// output: exact floating-point DT +void skelft2DDT(float *outputDT, short xm, short ym, short xM, short yM); + +// Make an arc-length parameterization of a binary shape, used for +// skeletonization input +// input: binary image whose boundary we parameterize +// dx,dy: +// param: arc-length parameterization image +// size: +// return: the boundary length +float skelft2DMakeBoundary(const unsigned char *input, int xm, int ym, int xM, + int yM, float *param, int size, short iso = 1, + bool thr_upper = true); +float skelft2DMakeBoundary(const float *input, int xm, int ym, int xM, int yM, + float *param, int size, float iso = 1, + bool thr_upper = true); + +// Compute topology events (skeleton endpoints) for an in-CUDA-memory +// thresholded skeleton +// topo: binary image (1=skel endpoints,0=otherwise), optional. If not +// supplied, not returned +// npts: on input, this gives the max #points we will return; on output, +// set to the #points detected and returned +// points: array of (x,y) pairs of the detected points +extern "C" void skelft2DTopology(unsigned char *topo, int *npts, short *points, + short xm, short ym, short xM, short yM); + +// Utility: save given image to pgm file +void skelft2DSave(short *outputFT, int dx, int dy, const char *f); + +// Compute DT of in-CUDA-memory skeleton +void skel2DSkeletonDT(float *outputSkelDT, short xm, short ym, short xM, + short yM); + +// Fills all same-value 4-connected pixels from (seedx,seedy) with value +// 'fill_val' in the in-CUDA-memory thresholded DT +// outputFill: filled image +// <return>: #iterations done for the fill, useful as a performance measure +// (lower=better) +int skelft2DFill(unsigned char *outputFill, short seedx, short seedy, short xm, + short ym, short xM, short yM, unsigned char foreground); + +int skelft2DFillHoles(unsigned char *outputFill, short x, short y, + unsigned char fill_val); + +//----------------------- + +void advect2DSetSites(float *sites, int nsites, int inside_sites); + +void advectSetImage(int *image, int nx, int ny); + +void advect2DGetSites(float *sites, int nsites); + +void advect2DSplat(float *density, int num_pts, float radius); + +void advectAttract(int num_pts, float step); + +void advectRelax(int num_pts, float map_avg, float radius); diff --git a/skelft_core.cpp b/skelft_core.cpp new file mode 100644 index 0000000..e4cec20 --- /dev/null +++ b/skelft_core.cpp @@ -0,0 +1,199 @@ +#include <cstdlib> +#include <cmath> +#include <cstring> +#include <stack> +#include <vector> +#include <iostream> + +const float MYINFINITY = 1.0e7f; + +inline float SQR(float x) { return x * x; } +inline int SQR(int x) { return x * x; } +inline float INTERP(float a, float b, float t) { return a * (1 - t) + b * t; } + +struct Coord { + int i, j; + Coord(int i_, int j_) + : i(i_) + , j(j_){}; + Coord() {} + int operator==(const Coord &c) const { return i == c.i && j == c.j; } +}; + +using namespace std; + +const float ALIVE = -10, NARROW_BAND = ALIVE + 1, + FAR_AWAY = ALIVE + 2; // Classification values for pixels in an + // image. The values are arbitrary. + +int skelft2DSize(int nx, int ny) { + int dx = (int) floor( + pow(2.0f, int(log(float(nx)) / log(2.0f) + + 1))); // Find minimal pow of 2 which fits the input image + int dy = (int) floor(pow(2.0f, int(log(float(ny)) / log(2.0f) + 1))); + int fboSize = max(dx, dy); // DT/FT/skeleton image size (pow of 2, should be + // able to contain the input image) + + return fboSize; +} + +// Classify 'f' into inside, outside, and boundary pixels according to isovalue +// 'iso' +// Inside pixels are those with value > iso (if thr_upper==true), else those +// with value < iso. +template <class T> +void classify(vector<Coord> &s, const T *f, int xm, int ym, int xM, int yM, + int size, T iso, bool thr_upper, float *siteParam, + bool scan_bot_to_top) { + for (int i = 0; i < size * size; ++i) + siteParam[i] = ALIVE; // First: all points are ALIVE, i.e., sites + + for (int j = ym; j < yM; ++j) + for (int i = xm; i < xM; ++i) { + T fptr = f[i + j * size]; // Mark subset of points which are + // FAR_AWAY, i.e. NOT sites + if (((thr_upper && fptr >= iso) || (!thr_upper && fptr <= iso))) + siteParam[i + size * j] = FAR_AWAY; + } + + int js, je, jd; + if (scan_bot_to_top) { + js = ym; + je = yM; + jd = 1; + } else { + js = yM - 1; + je = ym - 1; + jd = -1; + } + + for (int j = js; j != je; j += jd) + for (int i = xm; i < xM; ++i) { + float &cv = siteParam[i + size * j]; + if (cv == FAR_AWAY) + if (((i - 1 + size * j) > 0 && + siteParam[i - 1 + size * j] == ALIVE) || + siteParam[i + 1 + size * j] == ALIVE || + ((i + size * (j - 1)) > 0 && + siteParam[i + size * (j - 1)] == ALIVE) || + siteParam[i + size * (j + 1)] == ALIVE) { + cv = NARROW_BAND; + s.push_back(Coord(i, j)); + } + } +} + +// Encode input image (classified as inside/outside/boundary into a +// boundary-length parameterization image, needed for skeleton computations. +// +float encodeBoundary(vector<Coord> &s, int xm, int ym, int xM, int yM, + float *siteParam, int size) { + const float LARGE_BOUNDARY_VAL = + 1000; // Value that the boundary-param will jump with between disjoint, + // self-connected, boundary pixel chains + + float length = -LARGE_BOUNDARY_VAL + 1; // Compute the boundary length + + while (s.size()) // Process all narrowband points, add their + // arc-length-parameterization in siteParam + { + Coord pt = s[s.size() - 1]; + s.pop_back(); // Pick narrowband-point, test if already numbered + if (siteParam[pt.i + size * pt.j] > 0) + continue; + + int ci, cj; + float cc = 0; // Point pt not numbered + for (int dj = -1; dj < 2; ++dj) // Find min-numbered-neighbour of pt + for (int di = -1; di < 2; ++di) { + int ii = pt.i + di, jj = pt.j + dj; + if (ii < xm || ii >= xM || jj < ym || jj >= yM) + continue; + float val = siteParam[ii + size * jj]; + if (val < 0) + continue; + if (cc && val > cc) + continue; + ci = ii; + cj = jj; + cc = val; + } + + float c = (!cc) ? length + LARGE_BOUNDARY_VAL + : cc + sqrt(float((pt.i - ci) * (pt.i - ci) + + (pt.j - cj) * (pt.j - cj))); + + siteParam[pt.i + size * pt.j] = + c; // Also store the sites' parameterization in siteParam[] for CUDA + + if (c > length) + length = c; // Determine length of boundary, used for + // wraparound-distance-computations + + for (int dj = -1; dj < 2; ++dj) + for (int di = -1; di < 2; ++di) { + if (di == 0 && dj == 0) + continue; + int ii = pt.i + di, jj = pt.j + dj; + if (ii >= xm && ii < xM && jj >= ym && jj < yM && + siteParam[ii + size * jj] == NARROW_BAND) + s.push_back(Coord(ii, jj)); + } + } + + for (int i = 0, iend = size * size; i < iend; + ++i) // All ALIVE points are marked now as non-sites: + if (siteParam[i] < 0) + siteParam[i] = 0; + + return length; +} + +float skelft2DMakeBoundary(const float *f, int xm, int ym, int xM, int yM, + float *siteParam, int size, float iso, + bool thr_upper) { + bool scan_bot_to_top = false; + + vector<Coord> s; + s.reserve((xM - xm) * (yM - ym)); + + classify(s, f, xm, ym, xM, yM, size, iso, thr_upper, siteParam, + scan_bot_to_top); + return encodeBoundary(s, xm, ym, xM, yM, siteParam, size); +} + +float skelft2DMakeBoundary(const unsigned char *f, int xm, int ym, int xM, + int yM, float *siteParam, int size, short iso, + bool thr_upper) { + bool scan_bot_to_top = false; + + vector<Coord> s; + s.reserve((xM - xm) * (yM - ym)); + + classify(s, f, xm, ym, xM, yM, size, (unsigned char) iso, thr_upper, + siteParam, scan_bot_to_top); + return encodeBoundary(s, xm, ym, xM, yM, siteParam, size); +} + +void skelft2DSave(short *outputFT, int dx, int dy, const char *f) { + FILE *fp = fopen(f, "wb"); + fprintf(fp, "P5 %d %d 255\n", dx, dy); + + const int SIZE = 3000; + unsigned char buf[SIZE]; + + int size = dx * dy; + int bb = 0; + float range = max(dx, dy) / 255.0f; + for (short *v = outputFT, *vend = outputFT + size; v < vend; ++v) { + short val = *v; + buf[bb++] = (unsigned char) (val / range); + if (bb == SIZE) { + fwrite(buf, sizeof(unsigned char), SIZE, fp); + bb = 0; + } + } + if (bb) + fwrite(buf, sizeof(unsigned char), bb, fp); + fclose(fp); +} diff --git a/skelftkernel.h b/skelftkernel.h new file mode 100644 index 0000000..1182181 --- /dev/null +++ b/skelftkernel.h @@ -0,0 +1,468 @@ +// CUDA-based Skeletonization, Distance Transforms, Feature Transforms, Erosion +// and Dilation, and Flood Filling Toolkit +// +//(c) Alexandru Telea, Univ. of Groningen, 2011 +// +// This code is adapted from "Parallel Banding Algorithm to compute exact +// distance transforms with the GPU", +// T. Cao, K. Tang, A. Mohamed, T. Tan, Proc. ACM Symo. on Interactive 3D +// Graphics and Games (I3D), 2010, 83-90 +//==================================================================================================================== + +// MARKER is used to mark blank pixels in the texture. Any uncolored pixels will +// have x = MARKER. +// Input texture should have x = MARKER for all pixels other than sites +#define MARKER -32768 + +#define TOID(x, y, size) (__mul24((y), (size)) + (x)) + +// Transpose a square matrix +__global__ void kernelTranspose(short2 *data, int size) { + __shared__ short2 block1[TILE_DIM][TILE_DIM + 1]; + __shared__ short2 block2[TILE_DIM][TILE_DIM + 1]; + + int blockIdx_y = blockIdx.x; + int blockIdx_x = blockIdx.x + blockIdx.y; + + if (blockIdx_x >= gridDim.x) + return; + + int blkX, blkY, x, y, id1, id2; + short2 pixel; + + blkX = __mul24(blockIdx_x, TILE_DIM); + blkY = __mul24(blockIdx_y, TILE_DIM); + + x = blkX + threadIdx.x; + y = blkY + threadIdx.y; + id1 = __mul24(y, size) + x; + + x = blkY + threadIdx.x; + y = blkX + threadIdx.y; + id2 = __mul24(y, size) + x; + + // read the matrix tile into shared memory + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + int idx = __mul24(i, size); + block1[threadIdx.y + i][threadIdx.x] = + tex1Dfetch(pbaTexColor, id1 + idx); + block2[threadIdx.y + i][threadIdx.x] = + tex1Dfetch(pbaTexColor, id2 + idx); + } + + __syncthreads(); + + // write the transposed matrix tile to global memory + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + int idx = __mul24(i, size); + pixel = block1[threadIdx.x][threadIdx.y + i]; + data[id2 + idx] = make_short2(pixel.y, pixel.x); + pixel = block2[threadIdx.x][threadIdx.y + i]; + data[id1 + idx] = make_short2(pixel.y, pixel.x); + } +} + +__global__ void kernelFloodDown(short2 *output, int size, int bandSize) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * bandSize; + int id = TOID(tx, ty, size); + + short2 pixel1, pixel2; + + pixel1 = make_short2(MARKER, MARKER); + + for (int i = 0; i < bandSize; ++i, id += size) { + pixel2 = tex1Dfetch(pbaTexColor, id); + + if (pixel2.x != MARKER) + pixel1 = pixel2; + + output[id] = pixel1; + } +} + +__global__ void kernelFloodUp(short2 *output, int size, int bandSize) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = (blockIdx.y + 1) * bandSize - 1; + int id = TOID(tx, ty, size); + + short2 pixel1, pixel2; + int dist1, dist2; + + pixel1 = make_short2(MARKER, MARKER); + + for (int i = 0; i < bandSize; i++, id -= size) { + dist1 = abs(pixel1.y - ty + i); + + pixel2 = tex1Dfetch(pbaTexColor, id); + dist2 = abs(pixel2.y - ty + i); + + if (dist2 < dist1) + pixel1 = pixel2; + + output[id] = pixel1; + } +} + +__global__ void kernelPropagateInterband(short2 *output, int size, + int bandSize) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int inc = __mul24(bandSize, size); + int ny, nid, nDist; + short2 pixel; + + // Top row, look backward + int ty = __mul24(blockIdx.y, bandSize); + int topId = TOID(tx, ty, size); + int bottomId = TOID(tx, ty + bandSize - 1, size); + + pixel = tex1Dfetch(pbaTexColor, topId); + int myDist = abs(pixel.y - ty); + + for (nid = bottomId - inc; nid >= 0; nid -= inc) { + pixel = tex1Dfetch(pbaTexColor, nid); + if (pixel.x != MARKER) { + nDist = abs(pixel.y - ty); + if (nDist < myDist) + output[topId] = pixel; + break; + } + } + + // Last row, look downward + ty = ty + bandSize - 1; + pixel = tex1Dfetch(pbaTexColor, bottomId); + myDist = abs(pixel.y - ty); + + for (ny = ty + 1, nid = topId + inc; ny < size; + ny += bandSize, nid += inc) { + pixel = tex1Dfetch(pbaTexColor, nid); + + if (pixel.x != MARKER) { + nDist = abs(pixel.y - ty); + if (nDist < myDist) + output[bottomId] = pixel; + break; + } + } +} + +__global__ void kernelUpdateVertical(short2 *output, int size, int band, + int bandSize) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * bandSize; + + short2 top = tex1Dfetch(pbaTexLinks, TOID(tx, ty, size)); + short2 bottom = tex1Dfetch(pbaTexLinks, TOID(tx, ty + bandSize - 1, size)); + short2 pixel; + + int dist, myDist; + + int id = TOID(tx, ty, size); + + for (int i = 0; i < bandSize; i++, id += size) { + pixel = tex1Dfetch(pbaTexColor, id); + myDist = abs(pixel.y - (ty + i)); + + dist = abs(top.y - (ty + i)); + if (dist < myDist) { + myDist = dist; + pixel = top; + } + + dist = abs(bottom.y - (ty + i)); + if (dist < myDist) + pixel = bottom; + + output[id] = pixel; + } +} + +// Input: y1 < y2 +__device__ float interpoint(int x1, int y1, int x2, int y2, int x0) { + float xM = float(x1 + x2) / 2.0f; + float yM = float(y1 + y2) / 2.0f; + float nx = x2 - x1; + float ny = y2 - y1; + + return yM + nx * (xM - x0) / ny; +} + +__global__ void kernelProximatePoints(short2 *stack, int size, int bandSize) { + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = __mul24(blockIdx.y, bandSize); + int id = TOID(tx, ty, size); + int lasty = -1; + short2 last1, last2, current; + float i1, i2; + + last1.y = -1; + last2.y = -1; + + for (int i = 0; i < bandSize; i++, id += size) { + current = tex1Dfetch(pbaTexColor, id); + + if (current.x != MARKER) { + while (last2.y >= 0) { + i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx); + i2 = interpoint(last2.x, lasty, current.x, current.y, tx); + + if (i1 < i2) + break; + + lasty = last2.y; + last2 = last1; + + if (last1.y >= 0) + last1 = stack[TOID(tx, last1.y, size)]; + } + + last1 = last2; + last2 = make_short2(current.x, lasty); + lasty = current.y; + + stack[id] = last2; + } + } + + // Store the pointer to the tail at the last pixel of this band + if (lasty != ty + bandSize - 1) + stack[TOID(tx, ty + bandSize - 1, size)] = make_short2(MARKER, lasty); +} + +__global__ void kernelCreateForwardPointers(short2 *output, int size, + int bandSize) { + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = __mul24(blockIdx.y + 1, bandSize) - 1; + int id = TOID(tx, ty, size); + int lasty = -1, nexty; + short2 current; + + // Get the tail pointer + current = tex1Dfetch(pbaTexLinks, id); + + if (current.x == MARKER) + nexty = current.y; + else + nexty = ty; + + for (int i = 0; i < bandSize; i++, id -= size) + if (ty - i == nexty) { + current = make_short2(lasty, tex1Dfetch(pbaTexLinks, id).y); + output[id] = current; + + lasty = nexty; + nexty = current.y; + } + + // Store the pointer to the head at the first pixel of this band + if (lasty != ty - bandSize + 1) + output[id + size] = make_short2(lasty, MARKER); +} + +__global__ void kernelMergeBands(short2 *output, int size, int bandSize) { + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int band1 = blockIdx.y * 2; + int band2 = band1 + 1; + int firsty, lasty; + short2 last1, last2, current; + // last1 and last2: x component store the x coordinate of the site, + // y component store the backward pointer + // current: y component store the x coordinate of the site, + // x component store the forward pointer + + // Get the two last items of the first list + lasty = __mul24(band2, bandSize) - 1; + last2 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x, + tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y); + + if (last2.x == MARKER) { + lasty = last2.y; + + if (lasty >= 0) + last2 = + make_short2(tex1Dfetch(pbaTexColor, TOID(tx, lasty, size)).x, + tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)).y); + else + last2 = make_short2(MARKER, MARKER); + } + + if (last2.y >= 0) { + // Second item at the top of the stack + last1 = make_short2(tex1Dfetch(pbaTexColor, TOID(tx, last2.y, size)).x, + tex1Dfetch(pbaTexLinks, TOID(tx, last2.y, size)).y); + } + + // Get the first item of the second band + firsty = __mul24(band2, bandSize); + current = make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x, + tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x); + + if (current.y == MARKER) { + firsty = current.x; + + if (firsty >= 0) + current = + make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x, + tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x); + else + current = make_short2(MARKER, MARKER); + } + + float i1, i2; + + // Count the number of item in the second band that survive so far. + // Once it reaches 2, we can stop. + int top = 0; + + while (top < 2 && current.y >= 0) { + // While there's still something on the left + while (last2.y >= 0) { + i1 = interpoint(last1.x, last2.y, last2.x, lasty, tx); + i2 = interpoint(last2.x, lasty, current.y, firsty, tx); + + if (i1 < i2) + break; + + lasty = last2.y; + last2 = last1; + --top; + + if (last1.y >= 0) + last1 = make_short2( + tex1Dfetch(pbaTexColor, TOID(tx, last1.y, size)).x, + output[TOID(tx, last1.y, size)].y); + } + + // Update the current pointer + output[TOID(tx, firsty, size)] = make_short2(current.x, lasty); + + if (lasty >= 0) + output[TOID(tx, lasty, size)] = make_short2(firsty, last2.y); + + last1 = last2; + last2 = make_short2(current.y, lasty); + lasty = firsty; + firsty = current.x; + + top = max(1, top + 1); + + // Advance the current pointer to the next one + if (firsty >= 0) + current = + make_short2(tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)).x, + tex1Dfetch(pbaTexColor, TOID(tx, firsty, size)).x); + else + current = make_short2(MARKER, MARKER); + } + + // Update the head and tail pointer. + firsty = __mul24(band1, bandSize); + lasty = __mul24(band2, bandSize); + current = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)); + + if (current.y == MARKER && current.x < 0) { // No head? + last1 = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)); + + if (last1.y == MARKER) + current.x = last1.x; + else + current.x = lasty; + + output[TOID(tx, firsty, size)] = current; + } + + firsty = __mul24(band1, bandSize) + bandSize - 1; + lasty = __mul24(band2, bandSize) + bandSize - 1; + current = tex1Dfetch(pbaTexLinks, TOID(tx, lasty, size)); + + if (current.x == MARKER && current.y < 0) { // No tail? + last1 = tex1Dfetch(pbaTexLinks, TOID(tx, firsty, size)); + + if (last1.x == MARKER) + current.y = last1.y; + else + current.y = firsty; + + output[TOID(tx, lasty, size)] = current; + } +} + +__global__ void kernelDoubleToSingleList(short2 *output, int size) { + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = blockIdx.y; + int id = TOID(tx, ty, size); + + output[id] = make_short2(tex1Dfetch(pbaTexColor, id).x, + tex1Dfetch(pbaTexLinks, id).y); +} + +__global__ void kernelColor(short2 *output, int size) { + __shared__ short2 s_last1[BLOCKSIZE], s_last2[BLOCKSIZE]; + __shared__ int s_lasty[BLOCKSIZE]; + + int col = threadIdx.x; + int tid = threadIdx.y; + int tx = __mul24(blockIdx.x, blockDim.x) + col; + int dx, dy, lasty; + unsigned int best, dist; + short2 last1, last2; + + if (tid == blockDim.y - 1) { + lasty = size - 1; + + last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx); + + if (last2.x == MARKER) { + lasty = last2.y; + last2 = tex1Dfetch(pbaTexColor, __mul24(lasty, size) + tx); + } + + if (last2.y >= 0) + last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx); + + s_last1[col] = last1; + s_last2[col] = last2; + s_lasty[col] = lasty; + } + + __syncthreads(); + + for (int ty = size - 1 - tid; ty >= 0; ty -= blockDim.y) { + last1 = s_last1[col]; + last2 = s_last2[col]; + lasty = s_lasty[col]; + + dx = last2.x - tx; + dy = lasty - ty; + best = dist = __mul24(dx, dx) + __mul24(dy, dy); + + while (last2.y >= 0) { + dx = last1.x - tx; + dy = last2.y - ty; + dist = __mul24(dx, dx) + __mul24(dy, dy); + + if (dist > best) + break; + + best = dist; + lasty = last2.y; + last2 = last1; + + if (last2.y >= 0) + last1 = tex1Dfetch(pbaTexColor, __mul24(last2.y, size) + tx); + } + + __syncthreads(); + + output[TOID(tx, ty, size)] = make_short2(last2.x, lasty); + + if (tid == blockDim.y - 1) { + s_last1[col] = last1; + s_last2[col] = last2; + s_lasty[col] = lasty; + } + + __syncthreads(); + } +}
\ No newline at end of file diff --git a/voronoisplat.cpp b/voronoisplat.cpp new file mode 100644 index 0000000..b8ab426 --- /dev/null +++ b/voronoisplat.cpp @@ -0,0 +1,407 @@ +#include "voronoisplat.h" + +#include <algorithm> + +#include <QQuickWindow> +#include <QOpenGLFramebufferObject> + +#include "scale.h" +#include "skelft.h" + +static const float RAD_BLUR = 5.0f; +static const int COLORMAP_SAMPLES = 128; + +VoronoiSplatRenderer::VoronoiSplatRenderer(const QSize &size) + : m_item(0) + , gl(QOpenGLContext::currentContext()) + , m_size(size) +{ + setupShaders(); + + // sitesVAO: VBOs 0 & 1 are for sites & their values (init'd later) + m_sitesVAO.create(); + gl.glGenBuffers(3, m_VBOs); + + // 2ndPassVAO: VBO 2 is a quad mapping the final texture to the framebuffer + m_2ndPassVAO.create(); + m_2ndPassVAO.bind(); + GLfloat verts[] = {-1.0f, -1.0f, -1.0f, 1.0f, + 1.0f, -1.0f, 1.0f, 1.0f}; + gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[2]); + gl.glBufferData(GL_ARRAY_BUFFER, sizeof(verts), verts, GL_STATIC_DRAW); + + int vertAttrib = m_program2->attributeLocation("vert"); + gl.glVertexAttribPointer(vertAttrib, 2, GL_FLOAT, GL_FALSE, 0, 0); + gl.glEnableVertexAttribArray(vertAttrib); + m_2ndPassVAO.release(); + + setupTextures(); +} + +void VoronoiSplatRenderer::setupShaders() +{ + m_program1 = new QOpenGLShaderProgram; + m_program1->addShaderFromSourceCode(QOpenGLShader::Vertex, +R"EOF(#version 440 + +uniform float rad_blur; +uniform float rad_max; +uniform float fbSize; + +in vec2 vert; +in float scalar; + +out float value; + +void main() { + gl_PointSize = (rad_max + rad_blur) * 2.0; + gl_Position = vec4(2.0 * (vert / fbSize) - 1.0, 0, 1); + value = scalar; +} +)EOF"); + m_program1->addShaderFromSourceCode(QOpenGLShader::Fragment, +R"EOF(#version 440 + +uniform sampler2D siteDT; +uniform float rad_blur; +uniform float rad_max; + +in float value; + +layout (location = 0) out vec4 fragColor; + +void main() { + float dt = texelFetch(siteDT, ivec2(gl_FragCoord.xy), 0).r; + if (dt > rad_max) + discard; + else { + float r = distance(gl_PointCoord, vec2(0.5, 0.5)) * (rad_max + rad_blur) * 2.0; + float r2 = r * r; + float rad = dt + rad_blur; + float rad2 = rad * rad; + if (r2 > rad2) + discard; + else { + float w = exp(-5.0 * r2 / rad2); + fragColor = vec4(w * value, w, 0.0, 0.0); + } + } +} +)EOF"); + m_program1->link(); + + m_program2 = new QOpenGLShaderProgram; + m_program2->addShaderFromSourceCode(QOpenGLShader::Vertex, +R"EOF(#version 440 + +in vec2 vert; + +void main() { + gl_Position = vec4(vert, 0, 1); +} +)EOF"); + + m_program2->addShaderFromSourceCode(QOpenGLShader::Fragment, +R"EOF( +#version 440 + +layout (location = 0) out vec4 fragColor; + +uniform sampler2D siteDT; +uniform sampler2D accumTex; +uniform sampler2D colormap; +uniform float rad_max; + +vec3 getRGB(float value) { + return texture2D(colormap, vec2(0, 1)).rgb; +} + +void main() { + float dt = texelFetch(siteDT, ivec2(gl_FragCoord.xy), 0).r; + if (dt > rad_max) + discard; + else { + vec4 accum = texelFetch(accumTex, ivec2(gl_FragCoord.xy), 0); + // 1.0 is extra-accumulated because of white background + float value = 0.0f; + if (accum.g > 1.0) + value = (accum.r - 1.0) / (accum.g - 1.0); + else + value = accum.r / accum.g; + fragColor.rgb = getRGB(value); + fragColor.a = 1.0 - dt / rad_max; + } +} +)EOF"); + m_program2->link(); +} + +void VoronoiSplatRenderer::setupTextures() +{ + gl.glGenTextures(2, m_textures); + + // textures[0] stores the DT values for each pixel + gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]); + gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, m_size.width(), + m_size.height(), 0, GL_RG, GL_FLOAT, 0); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + // textures[1] is the result of the first pass + gl.glBindTexture(GL_TEXTURE_2D, m_textures[1]); + gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, m_size.width(), + m_size.height(), 0, GL_RGBA, GL_FLOAT, 0); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + // Used for colormap lookup in the frag shader + gl.glGenTextures(1, &m_colorMapTex); + gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex); + gl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, COLORMAP_SAMPLES, 1, 0, GL_RGB, + GL_FLOAT, 0); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + gl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); +} + +VoronoiSplatRenderer::~VoronoiSplatRenderer() +{ + gl.glDeleteBuffers(2, m_VBOs); + delete m_program1; +} + +void VoronoiSplatRenderer::setSites(const arma::mat &points) +{ + if (points.n_rows < 1 || points.n_cols != 2 + || (m_values.size() != 0 && points.n_rows != m_values.size())) { + return; + } + + // Copy 'points' to internal data structure(s) + copyPoints(points); + + // Update VBO with the new data + m_sitesVAO.bind(); + gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[0]); + gl.glBufferData(GL_ARRAY_BUFFER, m_sites.size() * sizeof(float), + m_sites.data(), GL_STATIC_DRAW); + + int vertAttrib = m_program1->attributeLocation("vert"); + gl.glVertexAttribPointer(vertAttrib, 2, GL_FLOAT, GL_FALSE, 0, 0); + gl.glEnableVertexAttribArray(vertAttrib); + m_sitesVAO.release(); + + // Compute DT values for the new positions + computeDT(); +} + +void VoronoiSplatRenderer::copyPoints(const arma::mat &points) +{ + double minX = points.col(0).min(); + double maxX = points.col(0).max(); + double minY = points.col(1).min(); + double maxY = points.col(1).max(); + + // Coords are tighly packed into 'm_sites' as [ (x1, y1), (x2, y2), ... ] + m_sites.resize(points.n_rows * 2); + LinearScale<float> sx(minX, maxX, 1, m_size.width() - 1); + const double *col = points.colptr(0); + for (unsigned i = 0; i < points.n_rows; i++) { + m_sites[2*i] = sx(col[i]); + } + + col = points.colptr(1); + LinearScale<float> sy(minY, maxY, m_size.height() - 1, 1); + for (unsigned i = 0; i < points.n_rows; i++) { + m_sites[2*i + 1] = sy(col[i]); + } +} + +void VoronoiSplatRenderer::computeDT() +{ + int w = m_size.width(), h = m_size.height(); + + std::vector<float> buf(w * h, 0.0f); + for (unsigned i = 0; i < m_sites.size(); i += 2) { + buf[int(m_sites[i + 1]) * h + int(m_sites[i])] = (float) i / 2 + 1; + } + + // Compute FT of the sites + skelft2DFT(0, buf.data(), 0, 0, w, h, w); + // Compute DT of the sites (from the resident FT) + skelft2DDT(buf.data(), 0, 0, w, h); + + std::vector<GLfloat> dtTexData(w * h * 2, 0.0f); + for (unsigned i = 0; i < buf.size(); i++) { + dtTexData[2*i] = buf[i]; + } + + // Upload result to lookup texture + gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]); + gl.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, m_size.width(), m_size.height(), + GL_RG, GL_FLOAT, dtTexData.data()); +} + +void VoronoiSplatRenderer::setValues(const arma::vec &values) +{ + if (values.n_elem == 0 + || (m_sites.size() != 0 && values.n_elem != m_sites.size() / 2)) { + return; + } + + m_values.resize(values.n_elem); + std::copy(values.begin(), values.end(), m_values.begin()); + + // Update VBO with the new data + m_sitesVAO.bind(); + gl.glBindBuffer(GL_ARRAY_BUFFER, m_VBOs[1]); + gl.glBufferData(GL_ARRAY_BUFFER, m_values.size() * sizeof(float), + m_values.data(), GL_DYNAMIC_DRAW); + + int valueAttrib = m_program1->attributeLocation("scalar"); + gl.glVertexAttribPointer(valueAttrib, 1, GL_FLOAT, GL_FALSE, 0, 0); + gl.glEnableVertexAttribArray(valueAttrib); + m_sitesVAO.release(); +} + +void VoronoiSplatRenderer::setColorMap(const ColorScale *scale) +{ + if (!scale) { + return; + } + + float t = scale->min(); + float step = (scale->max() - scale->min()) / COLORMAP_SAMPLES; + qreal r, g, b; + std::vector<float> cmap(COLORMAP_SAMPLES * 3); // R,G,B + for (int i = 0; i < COLORMAP_SAMPLES * 3; i += 3) { + scale->color(t).getRgbF(&r, &g, &b); + cmap[i + 0] = b; + cmap[i + 1] = g; + cmap[i + 2] = r; + + t += step; + } + + // Update texture data + gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex); + gl.glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, COLORMAP_SAMPLES, 1, GL_RGB, + GL_FLOAT, cmap.data()); +} + +QOpenGLFramebufferObject *VoronoiSplatRenderer::createFramebufferObject(const QSize &size) +{ + return new QOpenGLFramebufferObject(size); +} + +void VoronoiSplatRenderer::synchronize(QQuickFramebufferObject *item) +{ + m_item = static_cast<VoronoiSplat *>(item); + + if (m_item->colorScaleUpdated()) { + setColorMap(m_item->colorScale()); + } + if (m_item->pointsUpdated()) { + setSites(m_item->points()); + } + if (m_item->valuesUpdated()) { + setValues(m_item->values()); + } + + m_item->setUpdated(false); +} + +void VoronoiSplatRenderer::render() +{ + // Store the final texture to render to, as we use a two-pass rendering + // which first renders to an intermediate texture + GLint targetTexture; + gl.glGetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME, + &targetTexture); + + // The first pass + m_program1->bind(); + m_program1->setUniformValue("rad_max", 20.0f); + m_program1->setUniformValue("rad_blur", RAD_BLUR); + m_program1->setUniformValue("fbSize", float(m_size.width())); + + gl.glActiveTexture(GL_TEXTURE0); + gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]); + m_program1->setUniformValue("siteDT", 0); + + gl.glEnable(GL_POINT_SPRITE); + gl.glEnable(GL_PROGRAM_POINT_SIZE); + gl.glEnable(GL_BLEND); + gl.glBlendFunc(GL_ONE, GL_ONE); + + // In the first pass we draw to an intermediate texture, which is used as + // input for the next pass + gl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, m_textures[1], 0); + + gl.glClearColor(1, 1, 1, 1); + gl.glClear(GL_COLOR_BUFFER_BIT); + + m_sitesVAO.bind(); + gl.glDrawArrays(GL_POINTS, 0, m_values.size()); + m_sitesVAO.release(); + + m_program1->release(); + + // Second pass + m_program2->bind(); + m_program2->setUniformValue("rad_max", 20.0f); + + gl.glActiveTexture(GL_TEXTURE0); + gl.glBindTexture(GL_TEXTURE_2D, m_textures[0]); + m_program2->setUniformValue("siteDT", 0); + gl.glActiveTexture(GL_TEXTURE1); + gl.glBindTexture(GL_TEXTURE_2D, m_textures[1]); + m_program2->setUniformValue("accumTex", 1); + gl.glActiveTexture(GL_TEXTURE2); + gl.glBindTexture(GL_TEXTURE_2D, m_colorMapTex); + m_program2->setUniformValue("colormap", 2); + + gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + + // Restore the original framebuffer texture + gl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, targetTexture, 0); + + glClearColor(1, 1, 1, 1); + glClear(GL_COLOR_BUFFER_BIT); + + m_2ndPassVAO.bind(); + gl.glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + m_2ndPassVAO.release(); + + m_program2->release(); + + m_item->window()->resetOpenGLState(); +} + +static int nextPow2(int n) +{ + // TODO: check for overflows + n--; + for (int shift = 1; ((n + 1) & n); shift <<= 1) { + n |= n >> shift; + } + return n + 1; +} + +VoronoiSplat::VoronoiSplat(QQuickItem *parent) + : QQuickFramebufferObject(parent) + , m_colorScaleUpdated(false) + , m_pointsUpdated(false) + , m_valuesUpdated(false) + , m_colorScale(0) +{ + setTextureFollowsItemSize(false); +} + +QQuickFramebufferObject::Renderer *VoronoiSplat::createRenderer() const +{ + int baseSize = nextPow2(std::min(width(), height())); + return new VoronoiSplatRenderer(QSize(baseSize, baseSize)); +} diff --git a/voronoisplat.h b/voronoisplat.h new file mode 100644 index 0000000..ce84e74 --- /dev/null +++ b/voronoisplat.h @@ -0,0 +1,103 @@ +#ifndef VORONOISPLAT_H +#define VORONOISPLAT_H + +#include <QQuickFramebufferObject> +#include <QOpenGLFunctions> +#include <QOpenGLBuffer> +#include <QOpenGLShaderProgram> +#include <QOpenGLVertexArrayObject> +#include <armadillo> + +#include "colorscale.h" + +class VoronoiSplat; // defined after this class + +class VoronoiSplatRenderer + : public QQuickFramebufferObject::Renderer +{ +public: + // 'size' must be square (and power of 2); item is the QQuickFBO that + // creates this + VoronoiSplatRenderer(const QSize &size); + ~VoronoiSplatRenderer(); + + void synchronize(QQuickFramebufferObject *item); + // 'points' should be a 2D points matrix (each point in a row) + void setSites(const arma::mat &points); + void setValues(const arma::vec &values); + // Set colormap data based on the given color scale; + void setColorMap(const ColorScale *scale); + + QOpenGLFramebufferObject *createFramebufferObject(const QSize & size); + void render(); + +private: + void setupShaders(); + void setupTextures(); + void copyPoints(const arma::mat &points); + void computeDT(); + + VoronoiSplat *m_item; + QOpenGLFunctions gl; + QOpenGLShaderProgram *m_program1, *m_program2; + GLuint m_VBOs[3]; + GLuint m_textures[2], m_colorMapTex; + QOpenGLVertexArrayObject m_sitesVAO, m_2ndPassVAO; + + std::vector<float> m_sites, m_values; + QSize m_size; +}; + +class VoronoiSplat + : public QQuickFramebufferObject +{ + Q_OBJECT +public: + VoronoiSplat(QQuickItem *parent = 0); + Renderer *createRenderer() const; + + const arma::mat &points() const { return m_points; } + const arma::vec &values() const { return m_values; } + const ColorScale *colorScale() const { return m_colorScale; } + + bool colorScaleUpdated() const { return m_colorScaleUpdated; } + bool pointsUpdated() const { return m_pointsUpdated; } + bool valuesUpdated() const { return m_valuesUpdated; } + +public slots: + void setColorScale(const ColorScale *scale) + { + m_colorScale = scale; + m_colorScaleUpdated = true; + update(); + } + + void setPoints(const arma::mat &points) + { + m_points = points; + m_pointsUpdated = true; + update(); + } + + void setValues(const arma::vec &values) + { + m_values = values; + m_valuesUpdated = true; + update(); + } + + void setUpdated(bool updated) + { + m_colorScaleUpdated = updated; + m_pointsUpdated = updated; + m_valuesUpdated = updated; + } + +private: + bool m_colorScaleUpdated, m_pointsUpdated, m_valuesUpdated; + const ColorScale *m_colorScale; + arma::mat m_points; + arma::vec m_values; +}; + +#endif // VORONOISPLAT_H |