#include <device_functions.h>
#include "skelft.h"
#include <cstdio>
#include <iostream>


#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/sort.h>



using namespace std;


// Parameters for CUDA kernel executions; more or less optimized for a 1024x1024 image.
#define BLOCKX     16
#define BLOCKY     16
#define BLOCKSIZE  64
#define TILE_DIM   32
#define BLOCK_ROWS 16



/****** Global Variables *******/
const int NB = 7;                        // Number of buffers we allocate and use throughout the framework
short2 **pbaTextures;                    // Work buffers used to compute and store resident images
// 0: work buffer
// 1: FT
// 2: thresholded DT or exact floating-point DT
// 3: thresholded skeleton
// 4: topology analysis
// 5: work buffer for topology
// 6: skeleton FT
//

float*  pbaTexSiteParam;    // Stores boundary parameterization
float2* pbaTexAdvectSites;
float*  pbaTexDensityData;  // The density map of the sites
float2* pbaTexGradientData; // The gradient of the density map

int     pbaTexSize;     // Texture size (squared) actually used in all computations
int     floodBand  = 4, // Various FT computation parameters; defaults are good for a 1024x1024 image.
        maurerBand = 4,
        colorBand  = 4;        

texture<short2> pbaTexColor;  // 2D textures (bound to various buffers defined above as needed)
texture<short2> pbaTexColor2;
texture<short2> pbaTexLinks;
texture<float>  pbaTexParam;  // 1D site parameterization texture (bound to pbaTexSiteParam)
texture<unsigned char>
                pbaTexGray;   // 2D texture of unsigned char values, e.g. the binary skeleton
texture<float2> pbaTexAdvect;
texture<float,cudaTextureType2D,cudaReadModeElementType>  pbaTexDensity;
texture<float2,cudaTextureType2D,cudaReadModeElementType> pbaTexGradient;
texture<float,cudaTextureType2D,cudaReadModeElementType>  pbaTexDT;


__device__  bool fill_gc; //Indicates if a fill-sweep did fill anything or not


#if __CUDA_ARCH__ < 110   // We cannot use atomic intrinsics on SM 1.0. Thus, we define these as no-ops.
#define atomicInc(a,b) 0  // As a consequence, code that relies on them, e.g. endpoint detection, will not do anything on such devices.
#endif



/********* Kernels ********/
#include "skelftkernel.h"



// Initialize necessary memory (CPU/GPU sides)
// - maxTexSize: The max size of any image we will process until re-initialization
void skelft2DInitialization(int maxTexSize)
{
    int pbaMemSize = maxTexSize * maxTexSize * sizeof(short2); // A buffer has 2 shorts / pixel
    pbaTextures  = (short2**) malloc(NB * sizeof(short2*));    // We will use NB buffers

    for(int i=0;i<NB;++i)
       cudaMalloc((void **) &pbaTextures[i], pbaMemSize); // Allocate work buffer 'i' (on GPU)

    cudaMalloc((void**) &pbaTexSiteParam, maxTexSize * maxTexSize * sizeof(float));    // Sites texture (for FT)
    cudaMalloc((void**) &pbaTexAdvectSites, maxTexSize * maxTexSize * sizeof(float2)); // Sites 2D-coords
    cudaMalloc((void**) &pbaTexDensityData, maxTexSize * maxTexSize * sizeof(float));
    cudaMalloc((void**) &pbaTexGradientData, maxTexSize * maxTexSize * sizeof(float2));
}




// Deallocate all allocated memory
void skelft2DDeinitialization()
{
    for(int i=0;i<NB;++i) cudaFree(pbaTextures[i]); 
    cudaFree(pbaTexSiteParam);
    cudaFree(pbaTexAdvectSites);
    cudaFree(pbaTexDensityData);
    cudaFree(pbaTexGradientData);
    free(pbaTextures);
}


// Initialize the Voronoi textures from the sites' encoding texture (parameterization)
// REMARK: we interpret 'inputVoro' as a 2D texture, as it's much easier/faster like this
__global__ void kernelSiteParamInit(short2* inputVoro, int size)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    if (tx<size && ty<size) // Careful not to go outside the image..
    {
        int i = TOID(tx,ty,size);
        // The sites-param has non-zero (parameter) values precisely on boundary points
        float param = tex1Dfetch(pbaTexParam,i); 

        short2& v = inputVoro[i];
        // Non-boundary points are marked as 0 in the parameterization. Here we will compute the FT.
        v.x = v.y = MARKER;
        // These are points which define the 'sites' to compute the FT/skeleton (thus, have FT==identity)
        // We could use an if-then-else here, but it's faster with an if-then
        if (param>0) 
        {            
            v.x = tx;
            v.y = ty;
        }
    }
}



void skelft2DInitializeInput(float* sites, int size)                                        // Copy input sites from CPU to GPU; Also set up site param initialization in pbaTextures[0]
{
    pbaTexSize = size;                                                                        // Size of the actual texture being used in this run; can be smaller than the max-tex-size
                                                                                            // which was used in skelft2DInitialization()

    cudaMemcpy(pbaTexSiteParam, sites, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyHostToDevice);
                                                                                            // Pass sites parameterization to CUDA.  Must be done before calling the initialization
                                                                                            // kernel, since we use the sites-param as a texture in that kernel
    cudaBindTexture(0, pbaTexParam, pbaTexSiteParam);                                        // Bind the sites-param as a 1D texture so we can quickly index it next
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    
    kernelSiteParamInit<<<grid,block>>>(pbaTextures[0],pbaTexSize);                            // Do the site param initialization. This sets up pbaTextures[0]
    cudaUnbindTexture(pbaTexParam);
}





// In-place transpose of a square texture.
// Block orders are modified to optimize memory access. 
// Point coordinates are also swapped. 
void pba2DTranspose(short2 *texture)
{
    dim3 block(TILE_DIM, BLOCK_ROWS); 
    dim3 grid(pbaTexSize / TILE_DIM, pbaTexSize / TILE_DIM); 

    cudaBindTexture(0, pbaTexColor, texture); 
    kernelTranspose<<< grid, block >>>(texture, pbaTexSize); 
    cudaUnbindTexture(pbaTexColor); 
}

// Phase 1 of PBA. m1 must divide the texture size
void pba2DPhase1(int m1, short xm, short ym, short xM, short yM) 
{
    dim3 block = dim3(BLOCKSIZE);   
    dim3 grid = dim3(pbaTexSize / block.x, m1); 

    // Flood vertically in their own bands
    cudaBindTexture(0, pbaTexColor, pbaTextures[0]); 
    kernelFloodDown<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); 
    cudaUnbindTexture(pbaTexColor); 

    cudaBindTexture(0, pbaTexColor, pbaTextures[1]); 
    kernelFloodUp<<< grid, block >>>(pbaTextures[1], pbaTexSize, pbaTexSize / m1); 

    // Passing information between bands
    grid = dim3(pbaTexSize / block.x, m1); 
    kernelPropagateInterband<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m1); 

    cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); 
    kernelUpdateVertical<<< grid, block >>>(pbaTextures[1], pbaTexSize, m1, pbaTexSize / m1); 
    cudaUnbindTexture(pbaTexLinks); 
    cudaUnbindTexture(pbaTexColor); 
}

// Phase 2 of PBA. m2 must divide the texture size
void pba2DPhase2(int m2) 
{
    // Compute proximate points locally in each band
    dim3 block = dim3(BLOCKSIZE);   
    dim3 grid = dim3(pbaTexSize / block.x, m2); 
    cudaBindTexture(0, pbaTexColor, pbaTextures[1]); 
    kernelProximatePoints<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); 

    cudaBindTexture(0, pbaTexLinks, pbaTextures[0]); 
    kernelCreateForwardPointers<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / m2); 

    // Repeatedly merge two bands into one
    for (int noBand = m2; noBand > 1; noBand /= 2) {
        grid = dim3(pbaTexSize / block.x, noBand / 2); 
        kernelMergeBands<<< grid, block >>>(pbaTextures[0], pbaTexSize, pbaTexSize / noBand); 
    }

    // Replace the forward link with the X coordinate of the seed to remove
    // the need to look at the other texture. We need it for coloring.
    grid = dim3(pbaTexSize / block.x, pbaTexSize); 
    kernelDoubleToSingleList<<< grid, block >>>(pbaTextures[0], pbaTexSize); 
    cudaUnbindTexture(pbaTexLinks); 
    cudaUnbindTexture(pbaTexColor); 
}

// Phase 3 of PBA. m3 must divide the texture size
void pba2DPhase3(int m3) 
{
    dim3 block = dim3(BLOCKSIZE / m3, m3); 
    dim3 grid = dim3(pbaTexSize / block.x); 
    cudaBindTexture(0, pbaTexColor, pbaTextures[0]); 
    kernelColor<<< grid, block >>>(pbaTextures[1], pbaTexSize); 
    cudaUnbindTexture(pbaTexColor); 
}



void skel2DFTCompute(short xm, short ym, short xM, short yM, int floodBand, int maurerBand, int colorBand)
{
    pba2DPhase1(floodBand,xm,ym,xM,yM);                                        //Phase 1: vertical flooding within bands

    pba2DTranspose(pbaTextures[1]);                                            //Transpose so the next phases work on rows

    pba2DPhase2(maurerBand);                                                   //Phase 2: proximate points / band merging

    pba2DPhase3(colorBand);                                                    //Phase 3: color each row from its proximate points

    pba2DTranspose(pbaTextures[1]);                                            //Transpose back to the original orientation
}





__global__ void kernelThresholdDT(unsigned char* output, int size, float threshold2, short xm, short ym, short xM, short yM)
//Input:    pbaTexColor: closest-site-ids per pixel, i.e. FT
//Output:   output: thresholded DT
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (tx>xm && ty>ym && tx<xM && ty<yM)                                    //careful not to index outside the image..
    {    
        int    id     = TOID(tx, ty, size);
      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
      float  d2     = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y);
      output[id]    = (d2<=threshold2);                                        //threshold DT into binary image
    }
}



__global__ void kernelDT(float* output, int size, short xm, short ym, short xM, short yM)
//Input:    pbaTexColor: closest-site-ids per pixel, i.e. FT
//Output:   output: DT
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
        
    if (tx>xm && ty>ym && tx<xM && ty<yM)                                    //careful not to index outside the image..
    {    
        int    id     = TOID(tx, ty, size);
      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
      float  d2     = (tx-voroid.x)*(tx-voroid.x)+(ty-voroid.y)*(ty-voroid.y);
      output[id]    = sqrtf(d2);                                            //save the Euclidean DT
    }
}


__global__ void kernelSkel(unsigned char* output, short xm, short ym, 
                           short xM, short yM, short size, float threshold, float length)    
                                                                            //Input:    pbaTexColor: closest-site-ids per pixel
                                                                            //            pbaTexParam: labels for sites (only valid at site locations)
{                                                                            //Output:    output: binary thresholded skeleton
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (tx>xm && ty>ym && tx<xM && ty<yM) 
    {
        int    id     = TOID(tx, ty, size);
      int    Id     = id;
      short2 voroid = tex1Dfetch(pbaTexColor,id);                            //get the closest-site to tx,ty into voroid.x,.y
      int    id2    = TOID(voroid.x,voroid.y,size);                            //convert the site's coord to an index into pbaTexParam[], the site-label-texture
      float  imp    = tex1Dfetch(pbaTexParam,id2);                            //get the site's label

             ++id;                                                            //TOID(tx+1,ty,size)
             voroid = tex1Dfetch(pbaTexColor,id);                            //
             id2    = TOID(voroid.x,voroid.y,size);                            //
      float  imp_r  = tex1Dfetch(pbaTexParam,id2);                            //

             id     += size-1;                                                //TOID(tx,ty+1,size)
             voroid = tex1Dfetch(pbaTexColor,id);                            //
             id2    = TOID(voroid.x,voroid.y,size);                            //
      float  imp_u  = tex1Dfetch(pbaTexParam,id2);                            //

      float imp_dx  = fabsf(imp_r-imp);
      float imp_dy  = fabsf(imp_u-imp);
      float Imp     = max(imp_dx,imp_dy);
      Imp = min(Imp,fabsf(length-Imp));
      if (Imp>=threshold) output[Id] = 1;                                    //By writing only the 1-values (the output was zeroed beforehand), we reduce memory traffic (writing to output[] is expensive)
    } 
    
    //WARNING: this kernel may sometimes create 2-pixel-thick branches. Study the original AFMM code to see whether this is correct.
}



#define X 1

__constant__ const                                                            //REMARK: put following constants (for kernelTopology) in CUDA constant-memory, as this gives a huge speed difference
unsigned char topo_patterns[][9] =        { {0,0,0,                            //These are the 3x3 templates that we use to detect skeleton endpoints
                                           0,X,0,                            //(with four 90-degree rotations for each)
                                           0,X,0},
                                          {0,0,0,
                                           0,X,0,
                                           0,0,X},
                                          {0,0,0,
                                           0,X,0,
                                           0,X,X},
                                          {0,0,0,
                                           0,X,0,
                                           X,X,0} 
                                        };
                                        
#define topo_NPATTERNS  4                                                        //Number of patterns we try to match (for kernelTopology)
                                                                                //REMARK: #define faster than __constant__

__constant__ const unsigned char topo_rot[][9] = { {0,1,2,3,4,5,6,7,8}, {2,5,8,1,4,7,0,3,6}, {8,7,6,5,4,3,2,1,0}, {6,3,0,7,4,1,8,5,2} };
                                                                                //These encode the four 90-degree rotations of the patterns (for kernelTopology);

__device__ unsigned int topo_gc            = 0;
__device__ unsigned int topo_gc_last    = 0;


__global__ void kernelTopology(unsigned char* output, short2* output_set, short xm, short ym, short xM, short yM, short size, int maxpts)    
{
    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ty = blockIdx.y * blockDim.y + threadIdx.y;
        
    unsigned char t[9];
    
    if (tx>xm && ty>ym && tx<xM-1 && ty<yM-1)                                    //careful not to index outside the image; take into account the template size too
    {    
       int    id = TOID(tx, ty, size);     
       unsigned char  p  = tex1Dfetch(pbaTexGray,id);                            //get the skeleton pixel at tx,ty
       if (p)                                                                    //if the pixel isn't skeleton, nothing to do
       {
         unsigned char idx=0;
         for(int j=ty-1;j<=ty+1;++j)                                            //read the template into t[] for easier use
         {
           int id = TOID(tx-1, j, size);
           for(int i=0;i<=2;++i,++id,++idx)
              t[idx] = tex1Dfetch(pbaTexGray,id);                                //get the 3x3 template centered at the skel point tx,ty
         }
          
         for(unsigned char r=0;r<4;++r)                                            //try to match all rotations of a pattern:
         {
           const unsigned char* rr = topo_rot[r];
           for(unsigned char p=0;p<topo_NPATTERNS;++p)                            //try to match all patterns:
           {
             const unsigned char* pat = topo_patterns[p];
             unsigned char j = (p==0)? 0 : 7;                                    //Speedup: for all patterns except 1st, check only last 3 entries, the first 6 are identical for all patterns
             for(;j<9;++j)                                                        //try to match rotated pattern vs actual pattern
               if (pat[j]!=t[rr[j]]) break;                                        //this rotation failed
             if (j<6) break;                                                    //Speedup: if we have a mismatch on the 1st 6 pattern entries, then none of the patterns can match
                                                                                //           since all templates have the same first 6 entries.

             if (j==9)                                                            //this rotation succeeded: mark the pixel as a topology event and we're done
             {    
                int crt_gc = atomicInc(&topo_gc,maxpts);                        //REMARK: this serializes (compacts) all detected endpoints in one array.            
                output_set[crt_gc] = make_short2(tx,ty);                        //To do this, we use an atomic read-increment-return on a global counter, 
                                                                                //which is guaranteed to give all threads unique consecutive indexes in the array.
                output[id] = 1;                                                    //Also create the topology image
                return;
             }
           }
         }
       }
    }
    else                                                                        //Last thread: save the number of detected points into topo_gc_last, so the caller can read how many points are really in there
    if (tx==xM-1 && ty==yM-1)                                                    //Also reset the global counter topo_gc, for the next parallel run of this function
    { topo_gc_last = topo_gc; topo_gc = 0; }                                    //We do this in the last thread so that no one modifies topo_gc from now on.
                                                                                //REMARK: copying into topo_gc_last seems to be the only way to read this __device__ variable back on the CPU
}




void skelft2DParams(int floodBand_, int maurerBand_, int colorBand_)        //Set up some params of the FT algorithm    
{
  floodBand   = floodBand_;
  maurerBand  = maurerBand_;
  colorBand   = colorBand_;
}





// Compute 2D FT / Voronoi diagram of a set of sites
// siteParam:   Site parameterization. 0 = non-site points; >0 = site parameter value.
// output:        FT. The (x,y) at (i,j) are the coords of the closest site to (i,j)
// size:        Texture size (pow 2)
void skelft2DFT(short* output, float* siteParam, short xm, short ym, short xM, short yM, int size) 
{
    skelft2DInitializeInput(siteParam,size);                                    // Initialization of already-allocated data structures

    skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand);            // Compute FT
    
    // Copy FT to CPU, if required
    if (output) cudaMemcpy(output, pbaTextures[1], size*size*sizeof(short2), cudaMemcpyDeviceToHost);
}
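
// Usage sketch (illustrative only; 'sites' and 'ft' are hypothetical caller-side buffers, and this
// helper is not called anywhere in this file): the typical allocate -> compute -> release sequence
// for one FT computation. 'sites' holds the boundary parameterization (>0 exactly at site pixels),
// 'ft' receives 2 shorts per pixel (the x,y of the closest site).
void exampleFT(float* sites, short* ft, int size)
{
    skelft2DInitialization(size);                         // allocate GPU buffers (once per max size)
    skelft2DFT(ft, sites, 0, 0, size-1, size-1, size);    // compute the FT and copy it back into 'ft'
    skelft2DDeinitialization();                           // release the GPU buffers
}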






void skelft2DDT(float* outputDT, short xm, short ym, short xM, short yM)        //Compute exact DT from resident FT (in pbaTextures[1])
{
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    

    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the FT from

    xm = ym = 0; xM = yM = pbaTexSize-1;
    kernelDT <<< grid, block >>>((float*)pbaTextures[2], pbaTexSize, xm-1, ym-1, xM+1, yM+1);
    cudaUnbindTexture(pbaTexColor);
      
    //Copy DT to CPU
    if (outputDT) cudaMemcpy(outputDT, pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
}
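
// Usage sketch (illustrative; 'sites' and 'dt' are hypothetical caller-side buffers of size*size
// floats, and this helper is not called anywhere in this file): the exact DT is derived from the
// resident FT, so skelft2DFT() must run first.
void exampleExactDT(float* sites, float* dt, int size)
{
    skelft2DFT(0, sites, 0, 0, size-1, size-1, size);     // compute the FT; it stays resident on the GPU
    skelft2DDT(dt, 0, 0, size-1, size-1);                 // exact Euclidean DT, copied back into 'dt'
}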



void skelft2DDT(short* outputDT, float threshold,                                //Compute (thresholded) DT (into pbaTextures[2]) from resident FT (in pbaTextures[1])    
                      short xm, short ym, short xM, short yM)
{
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);

    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the FT from

    if (threshold>=0)
    {
      xm -= (short)threshold; if (xm<0) xm=0;
      ym -= (short)threshold; if (ym<0) ym=0;
      xM += (short)threshold; if (xM>pbaTexSize-1) xM=pbaTexSize-1;
      yM += (short)threshold; if (yM>pbaTexSize-1) yM=pbaTexSize-1;
    
      kernelThresholdDT<<< grid, block >>>((unsigned char*)pbaTextures[2], pbaTexSize, threshold*threshold, xm-1, ym-1, xM+1, yM+1);    
      cudaUnbindTexture(pbaTexColor);
    
      //Copy thresholded image to CPU
      if (outputDT) cudaMemcpy(outputDT, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    }
}




void skelft2DSkeleton(unsigned char* outputSkel, float length, float threshold,    //Compute thresholded skeleton (into pbaTextures[3]) from resident FT (in pbaTextures[1])
                      short xm,short ym,short xM,short yM)                        
{                                                                                //length:     boundary length
    dim3 block = dim3(BLOCKX,BLOCKY);                                            //threshold:  skeleton importance min-value (below this, we ignore branches)
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    
    cudaBindTexture(0, pbaTexColor, pbaTextures[1]);                            //Used to read the resident FT
    cudaBindTexture(0, pbaTexParam, pbaTexSiteParam);                            //Used to read the resident boundary parameterization    
    cudaMemset(pbaTextures[3],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize);    //Faster to zero result and then fill only 1-values (see kernel)
    
    kernelSkel<<< grid, block >>>((unsigned char*)pbaTextures[3], xm, ym, xM-1, yM-1, pbaTexSize, threshold, length);
    
    cudaUnbindTexture(pbaTexColor);
    cudaUnbindTexture(pbaTexParam);
    
    //Copy skeleton to CPU
    if (outputSkel) cudaMemcpy(outputSkel, pbaTextures[3], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
}
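
// Usage sketch (illustrative; 'sites', 'boundaryLength', 'skel', and the threshold value are
// hypothetical caller-side data, and this helper is not called anywhere in this file): the skeleton
// is computed from the resident FT and site parameterization, so skelft2DFT() must have been called
// for the same sites first.
void exampleSkeleton(float* sites, float boundaryLength, unsigned char* skel, int size)
{
    skelft2DFT(0, sites, 0, 0, size-1, size-1, size);              // FT stays resident on the GPU (no CPU copy)
    skelft2DSkeleton(skel, boundaryLength, 10.0f, 0, 0, size-1, size-1);
                                                                   // importance threshold 10: drop branches whose
                                                                   // subtended boundary length is below 10 pixels
}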




void skelft2DTopology(unsigned char* outputTopo, int* npts, short* outputPoints, //Compute topology-points of the resident skeleton (in pbaTextures[3])
                      short xm,short ym,short xM,short yM)                    
{
    int maxpts = (npts)? *npts : pbaTexSize*pbaTexSize;                            //This is the max # topo-points we are going to return in outputPoints[]

    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    
    cudaBindTexture(0, pbaTexGray, pbaTextures[3]);                                //Used to read the resident skeleton
    cudaMemset(pbaTextures[4],0,sizeof(unsigned char)*pbaTexSize*pbaTexSize);    //Faster to zero result and then fill only 1-values (see kernel)

    unsigned int zero = 0;
    cudaMemcpyToSymbol(topo_gc,&zero,sizeof(unsigned int),0,cudaMemcpyHostToDevice);        //Set topo_gc to 0

    kernelTopology<<< grid, block >>>((unsigned char*)pbaTextures[4], pbaTextures[5], xm, ym, xM, yM, pbaTexSize, maxpts+1);
    
    cudaUnbindTexture(pbaTexGray);

    if (outputPoints && maxpts)                                                    //If an output-point vector is desired, copy the endpoints, stored in pbaTextures[5] as a vector of short2's,
    {                                                                            //into caller space. We copy at most 'maxpts' elements, as the caller instructed us.
        unsigned int num_pts;
        cudaMemcpyFromSymbol(&num_pts,topo_gc_last,sizeof(unsigned int),0,cudaMemcpyDeviceToHost);        //Get the number of detected topo-points from the device variable topo_gc_last
        if (npts && num_pts)                                                                            //Copy the topo-points to caller        
           cudaMemcpy(outputPoints,pbaTextures[5],num_pts*sizeof(short2),cudaMemcpyDeviceToHost);
        if (npts) *npts = num_pts;                                                //Return #detected topo-points to caller                                
    }
        
    if (outputTopo)                                                                //If topology image desired, copy it into user space
        cudaMemcpy(outputTopo,pbaTextures[4],pbaTexSize*pbaTexSize*sizeof(unsigned char), cudaMemcpyDeviceToHost);
}
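
// Usage sketch (illustrative; 'endpoints' is a hypothetical caller-side buffer of at least
// 2*maxPoints shorts, and this helper is not called anywhere in this file): retrieving the skeleton
// endpoints detected on the resident skeleton (skelft2DSkeleton() must have run first). On input,
// *npts caps how many points we accept; on output it holds how many were actually found.
void exampleTopology(short* endpoints, int maxPoints)
{
    int npts = maxPoints;                                          // in: capacity of 'endpoints' (in points)
    skelft2DTopology(0, &npts, endpoints, 0, 0, pbaTexSize-1, pbaTexSize-1);
    // out: npts endpoints, stored as (x,y) short pairs in endpoints[0..2*npts-1]
}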




__global__ void kernelSiteFromSkeleton(short2* outputSites, int size)                        //Initialize the Voronoi texture from the resident binary skeleton (read via pbaTexGray):
{                                                                                            //skeleton pixels become the sites. REMARK: we interpret 'outputSites' as a 2D texture, as it's much easier/faster like this
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    if (tx<size && ty<size)                                                                    //Careful not to go outside the image..
    {
      int i = TOID(tx,ty,size);
      unsigned char param = tex1Dfetch(pbaTexGray,i);                                        //The skeleton image is non-zero precisely on skeleton pixels

      short2& v = outputSites[i];
      v.x = v.y = MARKER;                                                                    //Non-skeleton points are marked with MARKER; the FT will be computed for them
      if (param)                                                                            //Skeleton points define the 'sites' for this FT (thus, have FT==identity)
      {                                                                                        //We could use an if-then-else here, but it's faster with an if-then
         v.x = tx; v.y = ty;
      }
    }
}




__global__ void kernelSkelInterpolate(float* output, int size)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    if (tx<size && ty<size)                                                                    //Careful not to go outside the image..
    {
        int    id     = TOID(tx, ty, size);
      short2 vid    = tex1Dfetch(pbaTexColor,id);                            
      float  T      = sqrtf((tx-vid.x)*(tx-vid.x)+(ty-vid.y)*(ty-vid.y));
      short2 vid2   = tex1Dfetch(pbaTexColor2,id);                            
      float  D      = sqrtf((tx-vid2.x)*(tx-vid2.x)+(ty-vid2.y)*(ty-vid2.y));
      float  B      = ((D)? min(T/2/D,0.5f):0.5) + 0.5*((T)? max(1-D/T,0.0f):0);
      output[id]    = pow(B,0.2f);
    }
}




void skel2DSkeletonDT(float* outputSkelDT,short xm,short ym,short xM,short yM)
{
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);

    cudaBindTexture(0,pbaTexGray,pbaTextures[3]);                            //Used to read the resident binary skeleton
    kernelSiteFromSkeleton<<<grid,block>>>(pbaTextures[0],pbaTexSize);        //1. Init pbaTextures[0] with sites on skeleton i.e. from pbaTexGray
    cudaUnbindTexture(pbaTexGray);
        
    //Must first save pbaTextures[1] since we may need it later..
    cudaMemcpy(pbaTextures[5],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);
    skel2DFTCompute(xm, ym, xM, yM, floodBand, maurerBand, colorBand);        //2. Compute FT of the skeleton (result lands in pbaTextures[1])
    cudaMemcpy(pbaTextures[6],pbaTextures[1],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);    //   ..save the skeleton FT into pbaTextures[6]
    cudaMemcpy(pbaTextures[1],pbaTextures[5],pbaTexSize*pbaTexSize*sizeof(short2),cudaMemcpyDeviceToDevice);    //   ..and restore the boundary FT into pbaTextures[1]
    
    //Compute interpolation        
    cudaBindTexture(0,pbaTexColor,pbaTextures[1]);                            // FT of boundary
    cudaBindTexture(0,pbaTexColor2,pbaTextures[6]);                            // FT of skeleton
    kernelSkelInterpolate<<<grid,block>>>((float*)pbaTextures[0],pbaTexSize);
    cudaUnbindTexture(pbaTexColor);
    cudaUnbindTexture(pbaTexColor2);
    if (outputSkelDT) cudaMemcpy(outputSkelDT, pbaTextures[0], pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
}





__global__ void kernelFill(unsigned char* output, int size, unsigned char bg, unsigned char fg, short xm, short ym, short xM, short yM, bool ne)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
        
    if (tx>xm && ty>ym && tx<xM && ty<yM)                                        //careful not to index outside the image..
    {    
      int    id0 = TOID(tx, ty, size);
      unsigned char val = tex1Dfetch(pbaTexGray,id0);                            //
      if (val==fg)                                                                //do we have a foreground pixel? Then fill the background run next to it along the current sweep directions
      {
        bool fill = false;
        int id = id0;
        if (ne)                                                                    //fill in north-east direction:
        {
            for(short x=tx+1;x<xM;++x)                                            //REMARK: here and below, the interesting thing is that it's faster, by about 10-15%, to fill a whole
            {                                                                    //        scanline rather than only until the current block's borders (+1). The reason is that filling a whole
                                                                                //        scanline decreases the total #sweeps, which seems to be the limiting speed factor
              if (tex1Dfetch(pbaTexGray,++id)!=bg) break;
              output[id] = fg; fill = true;
            }

            id = id0;
            for(short y=ty-1;y>ym;--y)
            {
              if (tex1Dfetch(pbaTexGray,id-=size)!=bg) break;
              output[id] = fg; fill = true;
            }
        }
        else                                                                    //fill in south-west direction:
        {
            for(short x=tx-1;x>xm;--x)
            {
              if (tex1Dfetch(pbaTexGray,--id)!=bg) break;
              output[id] = fg; fill = true;
            }

            id = id0;
            for(short y=ty+1;y<yM;++y)
            {
              if (tex1Dfetch(pbaTexGray,id+=size)!=bg) break;
              output[id] = fg; fill = true;
            }
        }
        
        if (fill) fill_gc = true;                                                //if we filled anything, inform caller; we 'gather' this info from a local var into the
                                                                                //global var here, since it's faster than writing the global var in the for loops
      }      
    }
}




__global__ void kernelFillHoles(unsigned char* output, int size, unsigned char bg, unsigned char fg, unsigned char fill_fg)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (tx>=0 && ty>=0 && tx<size && ty<size)                                    //careful not to index outside the image..
    {    
        int            id = TOID(tx, ty, size);
      unsigned char val = tex1Dfetch(pbaTexGray,id);                            //
      if (val==fill_fg)
         output[id] = bg;
      else if (val==bg)
         output[id] = fg;     
    }
}


int skelft2DFill(unsigned char* outputFill, short sx, short sy, short xm, short ym, short xM, short yM, unsigned char fill_value)
{
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);

    unsigned char background;
    int id = sy * pbaTexSize + sx;
    cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value we have to fill from (sx,sy)
    
    cudaMemset(((unsigned char*)pbaTextures[2])+id,fill_value,sizeof(unsigned char));                    //Fill the seed pixel (sx,sy) on the GPU

    cudaBindTexture(0, pbaTexGray, pbaTextures[2]);                                                        //Used to read the thresholded DT

    int iter=0;
    bool xy = true;                                                                                        //Direction of filling for the current sweep: either north-east or south-west
                                                                                                        //Alternating directions balances the memory accesses nicely over kernel calls
    for(;;++iter,xy=!xy)                                                                                //Keep filling, one sweep at a time, until a sweep no longer fills anything
    {    
       bool filled = false;                                                                                //Initialize flag: we didn't fill anything in this sweep
       cudaMemcpyToSymbol(fill_gc,&filled,sizeof(bool),0,cudaMemcpyHostToDevice);                        //Pass flag to CUDA
       kernelFill<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,fill_value,xm,ym,xM,yM,xy);    
                                                                                                        //One fill sweep       
       cudaMemcpyFromSymbol(&filled,fill_gc,sizeof(bool),0,cudaMemcpyDeviceToHost);                        //See if we filled anything in this sweep
       if (!filled) break;                                                                                //Nothing filled? Then we're done, the image didn't change
    }
    cudaUnbindTexture(pbaTexGray);
        
    if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    
    return iter;                                                                                        //Return #iterations done for the fill - useful as a performance measure for caller
}



int skelft2DFillHoles(unsigned char* outputFill, short sx, short sy, unsigned char foreground)
{
    unsigned char background;
    unsigned char fill_value = 128;
    int id = sy * pbaTexSize + sx;
    cudaMemcpy(&background,(unsigned char*)pbaTextures[2]+id,sizeof(unsigned char),cudaMemcpyDeviceToHost); //See which is the value at (sx,sy)

    int iter = skelft2DFill(0,sx,sy,0,0,pbaTexSize,pbaTexSize,fill_value);                                //First, fill the background surrounding the image with some special value
        
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    

    cudaBindTexture(0, pbaTexGray, pbaTextures[2]);                                                        //Used to read the thresholded DT

    kernelFillHoles<<<grid, block>>>((unsigned char*)pbaTextures[2],pbaTexSize,background,foreground,fill_value);

    cudaUnbindTexture(pbaTexGray);
    
    if (outputFill) cudaMemcpy(outputFill, (unsigned char*)pbaTextures[2], pbaTexSize * pbaTexSize * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    
    return iter;
}
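
// Usage sketch (illustrative; 'result' is a hypothetical caller-side buffer of pbaTexSize*pbaTexSize
// bytes, and this helper is not called anywhere in this file): filling the holes of the thresholded
// shape that skelft2DDT(short*,float,...) left resident in pbaTextures[2]. The seed pixel is assumed
// to lie in the outside background of the shape.
void exampleFillHoles(unsigned char* result)
{
    int sweeps = skelft2DFillHoles(result, 1, 1, 255);   // seed at (1,1); holes become foreground (255)
    printf("hole filling used %d fill sweeps\n", sweeps);
}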



//--------  Advection-related code  ----------------------------------------------------------------------


void advect2DSetSites(float* sites, int nsites)
{
    cudaMemcpy(pbaTexAdvectSites, sites, nsites * sizeof(float2), cudaMemcpyHostToDevice);                //Copy site coords from CPU->GPU
}

//#define KERNEL(radius,dist) expf(-10*dist/radius)
#define KERNEL(radius,dist) ((radius-dist)/radius)



__global__ void kernelSplat(float* output, int size, int numpts, float d)
//REMARK: This could be done tens of times faster using OpenGL (draw point-sprites with a 2D distance-kernel texture)
{
    int offs = blockIdx.x * blockDim.x + threadIdx.x;

    if (offs < numpts)                                                        //careful not to index outside the site-vector..
    {    
      float2        p  = tex1Dfetch(pbaTexAdvect,offs);                        //splat the site whose coords are at location offs in pbaTexAdvect[]

      float d2   = d*d;
      short tx   = p.x, ty = p.y;                                            //REMARK: if I make these float, then something gets screwed up - why?
      short jmin = ty-d;
      short jmax = ty+d;

      for(short j=jmin;j<=jmax;++j)                                            //bounding-box of the splat centered at p
        {
           float dy = (j-ty)*(j-ty);                                        //precalc this for speed
           short imin = tx-d;
           short imax = tx+d;
           
           int offs = __mul24(int(j),size);
           
           for(short i=imin;i<=imax;++i)                                    //REMARK: this could be made ~1.5x faster by computing the x-scanline intersection (min,max) with the d2-circle..
             {
                float r2 = dy+(i-tx)*(i-tx);
                if (r2<=d2)                                                    //check we're really inside the splat
                {
                  int id = int(i)+offs; 
                  float val = KERNEL(d2,r2);
                  output[id] += val;                                        //Accumulate density (WARNING: this read-modify-write is not thread-safe when splats overlap;
                                                                            //a race-free version needs atomicAdd() on floats, which requires compute capability >= 2.0)
                }
             }
        }       
    }
}
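
// Hedged alternative (illustrative; not called by the host code in this file): the same splat with
// the per-pixel accumulation done through atomicAdd, which exists for floats on devices of compute
// capability >= 2.0 and removes the potential data race flagged in kernelSplat above.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
__global__ void kernelSplatAtomic(float* output, int size, int numpts, float d)
{
    int offs = blockIdx.x * blockDim.x + threadIdx.x;

    if (offs < numpts)                                                      //careful not to index outside the site-vector..
    {
      float2 p  = tex1Dfetch(pbaTexAdvect,offs);                            //site to splat
      float  d2 = d*d;
      short  tx = p.x, ty = p.y;

      for(short j=ty-d;j<=ty+d;++j)                                         //bounding-box of the splat centered at p
      {
         float dy  = (j-ty)*(j-ty);
         int   row = __mul24(int(j),size);
         for(short i=tx-d;i<=tx+d;++i)
         {
            float r2 = dy+(i-tx)*(i-tx);
            if (r2<=d2)                                                     //inside the splat radius?
               atomicAdd(&output[int(i)+row], KERNEL(d2,r2));               //race-free density accumulation
         }
      }
    }
}
#endif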


inline __device__ float2 computeGradient(const float2& p)                    //Compute -gradient of density at p
{
    float v_d = tex2D(pbaTexDensity,p.x,p.y-1);
    float v_l = tex2D(pbaTexDensity,p.x-1,p.y);
    float v_r = tex2D(pbaTexDensity,p.x+1,p.y);
    float v_t = tex2D(pbaTexDensity,p.x,p.y+1);
    
    return make_float2((v_l-v_r)/2,(v_d-v_t)/2);
}


inline __device__ float2 computeGradientExplicit(float* density, const float2& p, int size)        //Compute gradient of density at p
{
    int x = p.x, y = p.y;
    float v_d = density[TOID(x,y-1,size)];
    float v_l = density[TOID(x-1,y,size)];
    float v_r = density[TOID(x+1,y,size)];
    float v_t = density[TOID(x,y+1,size)];
    
    return make_float2((v_r-v_l)/2,(v_t-v_d)/2);
}


__global__ void kernelGradient(float2* grad, int size)                            
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    const float eps = 0.0001;

    if (tx<size && ty<size)                                                    //Careful not to go outside the image..
    {
      float2 p = make_float2(tx,ty);
      float2 g = computeGradient(p);
      
      float gn = sqrtf(g.x*g.x+g.y*g.y);                                    //robustly normalize the gradient
      g.x /= gn+eps; g.y /= gn+eps;
            
      grad[TOID(tx,ty,size)] = g;    
    }
}


void advect2DSplat(float* output, int num_pts, float radius)
{
    dim3 block = dim3(BLOCKX*BLOCKY);                                            //Prepare the splatting kernel: this operates on a vector of 2D sites
    int numpts_b = (num_pts/block.x+1)*block.x;                                    //Round #sites up to the next multiple of the block size
    dim3 grid  = dim3(numpts_b/block.x);
    
    cudaMemset(pbaTexDensityData,0,sizeof(float)*pbaTexSize*pbaTexSize);        //Zero the density texture
    cudaBindTexture(0,pbaTexAdvect,pbaTexAdvectSites);                            //Bind sites to a texture

    kernelSplat<<< grid, block >>>(pbaTexDensityData, pbaTexSize, num_pts, radius);
                                                                                //Splat kernel    
    cudaThreadSynchronize();    
                                                                                
    cudaUnbindTexture(pbaTexAdvect);                                            //Done with the sites
                                                                                //Copy splat-map to CPU (if desired)
    if (output) cudaMemcpy(output, pbaTexDensityData, pbaTexSize * pbaTexSize * sizeof(float), cudaMemcpyDeviceToHost);
}



void advect2DGetSites(float* output, int num_pts)                                //Get sites CUDA->CPU
{
    cudaMemcpy(output,pbaTexAdvectSites,num_pts*sizeof(float2),cudaMemcpyDeviceToHost);
}


void advect2DGradient()                                                            //Compute and save gradient of density map
{
    dim3 block = dim3(BLOCKX,BLOCKY);
    dim3 grid  = dim3(pbaTexSize/block.x,pbaTexSize/block.y);
    
    cudaMemset(pbaTexGradientData,0,sizeof(float2)*pbaTexSize*pbaTexSize);        //Zero the gradient texture

    pbaTexDensity.filterMode = cudaFilterModeLinear;                            //The floating-point density map
    pbaTexDensity.normalized = false;
    cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
    cudaBindTexture2D(0,pbaTexDensity,pbaTexDensityData,channelDesc1,pbaTexSize,pbaTexSize,4*pbaTexSize);                            
        
    kernelGradient<<< grid, block >>>(pbaTexGradientData, pbaTexSize);            //Gradient kernel    
                                                                                
    cudaUnbindTexture(pbaTexDensity);
}
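
// Usage sketch (illustrative; 'pts' is a hypothetical caller-side array of 2*numPts floats holding
// interleaved x,y site coordinates, and this helper is not called anywhere in this file): building
// the density map and its normalized gradient for the advection step.
void exampleDensityPipeline(float* pts, int numPts, float radius, float* densityOut)
{
    advect2DSetSites(pts, numPts);                    // upload the site coordinates to the GPU
    advect2DSplat(densityOut, numPts, radius);        // splat a radial kernel of the given radius per site
    advect2DGradient();                               // compute the normalized density gradient (stays on the GPU)
}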