Puzzling behavior when running GPUDemonsRegistrationFilter.

Hi,

I am trying to use the GPUDemonsRegistrationFilter to speed up the registration. Here is how I implemented it.
First, I used the following iteration observer.

/**
 * Observer that reports the progress of a GPU Demons registration.
 *
 * On every itk::IterationEvent coming from a GPUDemonsRegistrationFilter it
 * sets the filter's maximum RMS error from MaxRmsE[RmsCounter] and then
 * prints the current metric, the RMS change and the tolerance in effect.
 * Events of any other kind, and callers of any other type, are ignored.
 */
class CommandIterationUpdate : public itk::Command
{
public:
  using Self = CommandIterationUpdate;
  using Superclass = itk::Command;
  // SmartPointer is a class template; it must be instantiated on Self.
  using Pointer = itk::SmartPointer<Self>;
  itkNewMacro(Self);

protected:
  CommandIterationUpdate() = default;

  // define ITK short-hand types
  using InternalPixelType = float;
  using GPUImageType = itk::GPUImage<InternalPixelType, 3>;
  using VectorPixelType = itk::Vector<float, 3>;
  using GPUDisplacementFieldType = itk::GPUImage<VectorPixelType, 3>;
  using GPURegistrationFilterType =
    itk::GPUDemonsRegistrationFilter<GPUImageType, GPUImageType, GPUDisplacementFieldType>;

public:
  /** Const overload: the observer needs a mutable filter, so this only warns. */
  void
  Execute(const itk::Object *, const itk::EventObject &) override
  {
    std::cout << "Warning: The const Execute method shouldn't be called"
              << std::endl;
  }

  /**
   * Handles an event raised by the observed object.
   *
   * @param caller object that raised the event; expected to be a
   *               GPURegistrationFilterType, otherwise the call is a no-op.
   * @param event  the raised event; only itk::IterationEvent is processed.
   */
  void
  Execute(itk::Object * caller, const itk::EventObject & event) override
  {
    // Filter on the event type first so unrelated events cost nothing.
    if (!(itk::IterationEvent().CheckEvent(&event)))
    {
      return;
    }

    // Checked downcast: a caller of a different type yields nullptr instead
    // of an invalid pointer that would be dereferenced below.
    auto * filter = dynamic_cast<GPURegistrationFilterType *>(caller);
    if (filter == nullptr)
    {
      return;
    }

    // MaxRmsE and RmsCounter are not declared in this class; they are
    // presumably defined elsewhere in the file -- TODO confirm.
    // NOTE(review): the finite-difference solver appears to stop as soon as
    // the RMS change falls below this tolerance, so a value larger than the
    // first reported RMS change would end the run after one iteration --
    // confirm against FiniteDifferenceImageFilter::Halt().
    filter->SetMaximumRMSError(MaxRmsE[RmsCounter]);

    std::cout << filter->GetMetric()
              << " RMS Change: " << filter->GetRMSChange() << std::endl;
    std::cout << "Level Tolerance= " << filter->GetMaximumRMSError()
              << std::endl;
  }
};

Then I set up the registration just as I would for the CPU Demons filter, simply replacing the image and filter types with their GPU versions.

// One caster per input: the fixed image comes from the target reader,
// the moving image from the histogram matcher.
auto fixedImageCaster = ImageCasterType::New();
auto movingImageCaster = ImageCasterType::New();

fixedImageCaster->SetInput(targetReader->GetOutput());
movingImageCaster->SetInput(matcher->GetOutput());

// Hold the caster outputs as GPU images for the registration filter.
GPUImageType::Pointer GPUFixedImage = fixedImageCaster->GetOutput();
GPUImageType::Pointer GPUMovingImage = movingImageCaster->GetOutput();

The matcher is a HistogramMatchingImageFilter that I ran on CPU images. Next, I set up the GPU filter.

// Displacement-field types: CPU and GPU images of float vectors.
using VectorPixelType = itk::Vector<float, Dimension>;
using DisplacementFieldType = itk::Image<VectorPixelType, Dimension>;
using GPUDisplacementFieldType = itk::GPUImage<VectorPixelType, Dimension>;
using GPURegistrationFilterType =
  itk::GPUDemonsRegistrationFilter<GPUImageType, GPUImageType, GPUDisplacementFieldType>;

// Create the GPU Demons filter and connect its two inputs.
auto GPUfilter = GPURegistrationFilterType::New();
GPUfilter->SetFixedImage(GPUFixedImage);
GPUfilter->SetMovingImage(GPUMovingImage);

// Registration parameters.
GPUfilter->SetNumberOfIterations(40);
GPUfilter->SetStandardDeviations(1.0);

// Report progress on every iteration event.
auto observer = CommandIterationUpdate::New();
GPUfilter->AddObserver(itk::IterationEvent(), observer);

// Run the registration; print the ITK exception and fail if one is thrown.
try
{
  GPUfilter->Update();
}
catch (const itk::ExceptionObject & excp)
{
  std::cerr << excp << std::endl;
  return EXIT_FAILURE;
}

The code compiled without errors, and when I ran it on an example, I got the following output.

Platform : NVIDIA CUDA
NVIDIA GeForce GTX 1060 6GB
Maximum Work Item Sizes : { 1024, 1024, 64 }
Maximum Work Group Size : 1024
Alignment in bits of the base address : 4096
Smallest alignment in bytes for any data type : 128
cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
5334.71 RMS Change: 0.267018
Level Tolerance= 0.8

It seems that CUDA is working. However, I set the number of iterations to 40, while the observer printed only once. There should be 40 printouts. Does anyone know what is going on?
Thanks a lot!