I am try to implement an algorithm for process image with more than 256 bins.
The main issue for process histogram in such case come from the impossibility to allocate more than 32 Kb as local tab in the GPU.
All the algorithm I found for 8 bits per pixel images use a fix size tab localy. The histogram is first process in that tab then a barrier is up and at last the bins are addition with the output memory.
I am working with IR image which have more than 32K bins of dynamic. So I cannot allocate a fix size tab inside the GPU.
My algorithm consist to use an atomic_add in order to create directly the output histogram.
I am interfacing with OpenCV so in order to manage the possible case of saturation my bins use floating points. Depending on the ability to the GPU to manage single or double precision. OpenCV doesn't manage unsigned int, long, and unsigned long data type as matrix type.
I have an error... I do think this error is a kind of segmentation fault. After sevral days I still have no idea what can be wrong.
Here is my code :
histogram.cl :
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
static void Atomic_Add_f64(__global double *val, double delta)
{
union {
double f;
ulong i;
} old;
union {
double f;
ulong i;
} new;
do {
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
static void Atomic_Add_f32(__global float *val, double delta)
{
union
{
float f;
uint i;
} old;
union
{
float f;
uint i;
} new;
do
{
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);
}
__kernel void khist(
__global const uchar* _src,
const int src_steps,
const int src_offset,
const int rows,
const int cols,
__global uchar* _dst,
const int dst_steps,
const int dst_offset)
{
const int gid = get_global_id(0);
// printf("This message has been printed from the OpenCL kernel %d \n",gid);
if(gid < rows)
{
__global const _Sty* src = (__global const _Sty*)_src;
__global _Dty* dst = (__global _Dty*) _dst;
const int src_step1 = src_steps/sizeof(_Sty);
const int dst_step1 = dst_steps/sizeof(_Dty);
src += mad24(gid,src_step1,src_offset);
dst += mad24(gid,dst_step1,dst_offset);
_Dty one = (_Dty)1;
for(int c=0;c<cols;c++)
{
const _Rty idx = (_Rty)(*(src+c+src_offset));
ATOMIC_FUN(dst+idx+dst_offset,one);
}
}
}
The function Atomic_Add_f64 directly come from here and there
main.cpp
Aucun commentaire:
Enregistrer un commentaire