OpenCL kernels

This is the source code for llg3d/solvers/opencl/kernels.cl ↗.

4 main kernels are programmed:

Kernel Name

Description

predict

Predictor step of the Heun method.

correct_and_normalize

Corrector step of the Heun method and normalization of the magnetization.

sum_m1_weighted

Compute the partial weighted average of the magnetization.

reduce_partial_sums

Reduce the partial sums computed by sum_m1_weighted into the final average.

2 additional kernels are provided for debugging and testing purposes:

  • check_finite is a utility kernel to check for NaN or Inf values in the magnetization array, which can be useful for debugging and ensuring numerical stability.

  • test_laplacian is used in the test suite to verify the correctness of the laplacian computation on the OpenCL solver.

predict

__kernel void predict(__global const real *m_n,
                      __global real *m_np1,
                      __global const real *R_random,
                      __global real *s_pre,
                      const real inv_dx2,
                      const real coeff_1,
                      const real coeff_2,
                      const real coeff_3,
                      const real coeff_4,
                      const real lambda_G,
                      const real dt) {
  int gid = get_global_id(0);
  if (gid >= NTOT) return;

  int id1 = gid;
  int id2 = id1 + NTOT;
  int id3 = id2 + NTOT;

  // Compute slope using helper
  real s_1, s_2, s_3;
  real laplacian_1, laplacian_2, laplacian_3;
  real H_aniso_1, H_aniso_2, H_aniso_3;
  real R_1, R_2, R_3;

  compute_slope(m_n, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
                coeff_4, lambda_G, &s_1, &s_2, &s_3,
                &laplacian_1, &laplacian_2, &laplacian_3,
                &H_aniso_1, &H_aniso_2, &H_aniso_3,
                &R_1, &R_2, &R_3);

  // Store the slopes for correction phase
  s_pre[id1] = s_1;
  s_pre[id2] = s_2;
  s_pre[id3] = s_3;

  // Apply Euler update
  m_np1[id1] = m_n[id1] + dt * s_1;
  m_np1[id2] = m_n[id2] + dt * s_2;
  m_np1[id3] = m_n[id3] + dt * s_3;
}

correct_and_normalize

__kernel void correct_and_normalize(__global const real *m_n,
                                    __global real *m_np1,
                                    __global const real *R_random,
                                    __global const real *s_pre,
                                    const real inv_dx2,
                                    const real coeff_1, const real coeff_2,
                                    const real coeff_3, const real coeff_4,
                                    const real lambda_G, const real dt,
                                    __global int *error_codes) {
  int gid = get_global_id(0);
  if (gid >= NTOT) return;

  int id1 = gid;
  int id2 = id1 + NTOT;
  int id3 = id2 + NTOT;

  // Compute slope using helper (from m_np1 which contains m^{n+1})
  real s_1, s_2, s_3;
  real laplacian_1, laplacian_2, laplacian_3;
  real H_aniso_1, H_aniso_2, H_aniso_3;
  real R_1, R_2, R_3;

  compute_slope(m_np1, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
                coeff_4, lambda_G, &s_1, &s_2, &s_3,
                &laplacian_1, &laplacian_2, &laplacian_3,
                &H_aniso_1, &H_aniso_2, &H_aniso_3,
                &R_1, &R_2, &R_3);

  // Midpoint update (Heun's method correction step)
  real m_1 = m_n[id1] + dt * 0.5 * (s_pre[id1] + s_1);
  real m_2 = m_n[id2] + dt * 0.5 * (s_pre[id2] + s_2);
  real m_3 = m_n[id3] + dt * 0.5 * (s_pre[id3] + s_3);

  // Normalize
  real norm = sqrt(m_1 * m_1 + m_2 * m_2 + m_3 * m_3);
  m_np1[id1] = m_1 / norm;
  m_np1[id2] = m_2 / norm;
  m_np1[id3] = m_3 / norm;
  
#ifdef ENABLE_ERROR_CHECK
  // Check for non-finite values - each work-item writes to its own slot (thread-safe)
  int is_non_finite = (!isfinite(m_np1[id1]) || !isfinite(m_np1[id2]) || !isfinite(m_np1[id3]));
  error_codes[gid] = is_non_finite ? 1 : 0;
#endif
}

sum_m1_weighted

__kernel void sum_m1_weighted(__global const real *m_1, 
                              __global real *partial_sums,
                              __local real *local_sum) {
  int gid = get_global_id(0);
  int lid = get_local_id(0);
  int group_id = get_group_id(0);
  int lsize = get_local_size(0);
  
  // Each work item processes one element
  real my_sum = 0.0;
  if (gid < NTOT) {
    // Decompose linear ID into i, j, k
    int k = gid % NZ;
    int j = (gid / NZ) % NY;
    int i = gid / (NY * NZ);
    
    // Get m1 value
    real value = m_1[gid];
    
    // Apply midpoint method weights
    real weight = 1.0;
    if (i == 0 || i == NX-1) weight *= 0.5;
    if (j == 0 || j == NY-1) weight *= 0.5;
    if (k == 0 || k == NZ-1) weight *= 0.5;
    
    my_sum = weight * value;
  } else {
    // Out-of-range threads contribute zero
    my_sum = 0.0;
  }
  
  local_sum[lid] = my_sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  
  // Work-group level reduction
  for (int offset = lsize / 2; offset > 0; offset /= 2) {
    if (lid < offset) {
      local_sum[lid] += local_sum[lid + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  
  // Write partial sum for this work-group
  if (lid == 0) {
    partial_sums[group_id] = local_sum[0];
  }
}

reduce_partial_sums

__kernel void reduce_partial_sums(__global const real *partial_sums,
                                  __global real *final_sum,
                                  __local real *local_sum,
                                  const int num_groups) {
  int lid = get_local_id(0);
  int lsize = get_local_size(0);
  
  // Accumulate partial sums from global memory
  real my_sum = 0.0;
  for (int i = lid; i < num_groups; i += lsize) {
    my_sum += partial_sums[i];
  }
  
  local_sum[lid] = my_sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  
  // Work-group level reduction
  for (int offset = lsize / 2; offset > 0; offset /= 2) {
    if (lid < offset) {
      local_sum[lid] += local_sum[lid + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  
  // Write final sum (only one work-group needed)
  if (lid == 0) {
    final_sum[0] = local_sum[0];
  }
}

Full kernel code

#ifdef USE_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif

#ifndef NX
#define NX 11
#endif

#ifndef NY
#define NY 11
#endif

#ifndef NZ
#define NZ 11
#endif

#define NTOT (NX * NY * NZ)

// Helper function to compute the Laplacian with Neumann boundary conditions
// IMPORTANT: Assumes uniform grid (inv_dx2 = inv_dy2 = inv_dz2)
// For uniform grids, accumulate differences before multiplying by inv_dx2
// to reduce truncation error in single precision
inline void compute_laplacian(const real __global *m_i,
                              const real inv_dx2,
                              real *laplacian) {
  
  int gid = get_global_id(0);
  // Decompose linear ID into i, j, k
  int k = gid % NZ;
  int j = (gid / NZ) % NY;
  int i = gid / (NY * NZ);
  int id_left = (i - 1) * NY * NZ + j * NZ + k;
  int id_right = (i + 1) * NY * NZ + j * NZ + k;
  int id_bottom = i * NY * NZ + (j - 1) * NZ + k;
  int id_top = i * NY * NZ + (j + 1) * NZ + k;
  int id_back = i * NY * NZ + j * NZ + (k - 1);
  int id_front = i * NY * NZ + j * NZ + (k + 1);

  // Neumann boundary conditions
  if (i == 0)
    id_left = id_right;
  if (i == NX - 1)
    id_right = id_left;
  if (j == 0)
    id_bottom = id_top;
  if (j == NY - 1)
    id_top = id_bottom;
  if (k == 0)
    id_back = id_front;
  if (k == NZ - 1)
    id_front = id_back;

  real center = m_i[gid];
  
  // Uniform grid case: accumulate all differences before multiplying by inv_dx2
  // This reduces truncation error for uniform grids
  *laplacian = (m_i[id_left] + m_i[id_right] +
                m_i[id_top] + m_i[id_bottom] +
                m_i[id_front] + m_i[id_back] - 6.0 * center) * inv_dx2;
}

// Helper function to compute the slope (ds/dt = cross(m, R))
inline void compute_slope(__global const real *m,
                          __global const real *R_random,
                          const real inv_dx2,
                          const real coeff_1,
                          const real coeff_2,
                          const real coeff_3,
                          const real coeff_4,
                          const real lambda_G,
                          real *s_1, real *s_2, real *s_3,
                          real *laplacian_1_out, real *laplacian_2_out, real *laplacian_3_out,
                          real *H_aniso_1_out, real *H_aniso_2_out, real *H_aniso_3_out,
                          real *R_1_out, real *R_2_out, real *R_3_out) {

  int gid = get_global_id(0);
  int id1 = gid;
  int id2 = id1 + NTOT;
  int id3 = id2 + NTOT;

  real laplacian_1;
  compute_laplacian(m, inv_dx2, &laplacian_1);
  real laplacian_2;
  compute_laplacian(m + NTOT, inv_dx2, &laplacian_2);
  real laplacian_3;
  compute_laplacian(m + 2 * NTOT, inv_dx2, &laplacian_3);

  *laplacian_1_out = laplacian_1;
  *laplacian_2_out = laplacian_2;
  *laplacian_3_out = laplacian_3;

  real m_1 = m[id1];
  real m_2 = m[id2];
  real m_3 = m[id3];

  real m1m1 = m_1 * m_1;
  real m2m2 = m_2 * m_2;
  real m3m3 = m_3 * m_3;

  // Branch based on anisotropy type
  real H_aniso_1, H_aniso_2, H_aniso_3;

#ifdef ANISOTROPY_UNIAXIAL
  // Uniaxial anisotropy
  H_aniso_1 = coeff_2 * m_1;
  H_aniso_2 = 0.f;
  H_aniso_3 = 0.f;
#else  // Default: ANISOTROPY_CUBIC
  // Cubic anisotropy
  H_aniso_1 = -(1 - m1m1 + m2m2 * m3m3) * m_1;
  H_aniso_2 = -(1 - m2m2 + m1m1 * m3m3) * m_2;
  H_aniso_3 = -(1 - m3m3 + m1m1 * m2m2) * m_3;
#endif

  *H_aniso_1_out = H_aniso_1;
  *H_aniso_2_out = H_aniso_2;
  *H_aniso_3_out = H_aniso_3;

  real R_1 = coeff_1 * laplacian_1 + coeff_4 * R_random[id1] + H_aniso_1 + coeff_3;
  real R_2 = coeff_1 * laplacian_2 + coeff_4 * R_random[id2] + H_aniso_2;
  real R_3 = coeff_1 * laplacian_3 + coeff_4 * R_random[id3] + H_aniso_3;

  *R_1_out = R_1;
  *R_2_out = R_2;
  *R_3_out = R_3;

  real G_m1m2 = lambda_G * m_1 * m_2;
  real G_m1m3 = lambda_G * m_1 * m_3;
  real G_m2m3 = lambda_G * m_2 * m_3;


  *s_1 = (-m_2 - G_m1m3) * R_3 + (m_3 - G_m1m2) * R_2 +
           lambda_G * (m2m2 + m3m3) * R_1;
  *s_2 = (-m_3 - G_m1m2) * R_1 + (m_1 - G_m2m3) * R_3 +
           lambda_G * (m1m1 + m3m3) * R_2;
  *s_3 = (-m_1 - G_m2m3) * R_2 + (m_2 - G_m1m3) * R_1 +
           lambda_G * (m1m1 + m2m2) * R_3;
}

// Prediction step with slope storage: compute slope, store it, and apply first Euler update
__kernel void predict(__global const real *m_n,
                      __global real *m_np1,
                      __global const real *R_random,
                      __global real *s_pre,
                      const real inv_dx2,
                      const real coeff_1,
                      const real coeff_2,
                      const real coeff_3,
                      const real coeff_4,
                      const real lambda_G,
                      const real dt) {
  int gid = get_global_id(0);
  if (gid >= NTOT) return;

  int id1 = gid;
  int id2 = id1 + NTOT;
  int id3 = id2 + NTOT;

  // Compute slope using helper
  real s_1, s_2, s_3;
  real laplacian_1, laplacian_2, laplacian_3;
  real H_aniso_1, H_aniso_2, H_aniso_3;
  real R_1, R_2, R_3;

  compute_slope(m_n, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
                coeff_4, lambda_G, &s_1, &s_2, &s_3,
                &laplacian_1, &laplacian_2, &laplacian_3,
                &H_aniso_1, &H_aniso_2, &H_aniso_3,
                &R_1, &R_2, &R_3);

  // Store the slopes for correction phase
  s_pre[id1] = s_1;
  s_pre[id2] = s_2;
  s_pre[id3] = s_3;

  // Apply Euler update
  m_np1[id1] = m_n[id1] + dt * s_1;
  m_np1[id2] = m_n[id2] + dt * s_2;
  m_np1[id3] = m_n[id3] + dt * s_3;
}

// Correction step: compute slope, apply midpoint update, and normalize
__kernel void correct_and_normalize(__global const real *m_n,
                                    __global real *m_np1,
                                    __global const real *R_random,
                                    __global const real *s_pre,
                                    const real inv_dx2,
                                    const real coeff_1, const real coeff_2,
                                    const real coeff_3, const real coeff_4,
                                    const real lambda_G, const real dt,
                                    __global int *error_codes) {
  int gid = get_global_id(0);
  if (gid >= NTOT) return;

  int id1 = gid;
  int id2 = id1 + NTOT;
  int id3 = id2 + NTOT;

  // Compute slope using helper (from m_np1 which contains m^{n+1})
  real s_1, s_2, s_3;
  real laplacian_1, laplacian_2, laplacian_3;
  real H_aniso_1, H_aniso_2, H_aniso_3;
  real R_1, R_2, R_3;

  compute_slope(m_np1, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
                coeff_4, lambda_G, &s_1, &s_2, &s_3,
                &laplacian_1, &laplacian_2, &laplacian_3,
                &H_aniso_1, &H_aniso_2, &H_aniso_3,
                &R_1, &R_2, &R_3);

  // Midpoint update (Heun's method correction step)
  real m_1 = m_n[id1] + dt * 0.5 * (s_pre[id1] + s_1);
  real m_2 = m_n[id2] + dt * 0.5 * (s_pre[id2] + s_2);
  real m_3 = m_n[id3] + dt * 0.5 * (s_pre[id3] + s_3);

  // Normalize
  real norm = sqrt(m_1 * m_1 + m_2 * m_2 + m_3 * m_3);
  m_np1[id1] = m_1 / norm;
  m_np1[id2] = m_2 / norm;
  m_np1[id3] = m_3 / norm;
  
#ifdef ENABLE_ERROR_CHECK
  // Check for non-finite values - each work-item writes to its own slot (thread-safe)
  int is_non_finite = (!isfinite(m_np1[id1]) || !isfinite(m_np1[id2]) || !isfinite(m_np1[id3]));
  error_codes[gid] = is_non_finite ? 1 : 0;
#endif
}

// Kernel to detect non-finite values in m (returns 1 if any NaN/Inf, else 0)
__kernel void check_finite(__global const real *m, __global int *flag) {
  int gid = get_global_id(0);
  int lid = get_local_id(0);

  // Use local reduction to minimize atomics
  __local int lflag;
  if (lid == 0) lflag = 0;
  barrier(CLK_LOCAL_MEM_FENCE);

  if (gid < 3 * NTOT) {
    real v = m[gid];
    if (!isfinite(v)) {
      lflag = 1;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  if (lid == 0 && lflag) {
    flag[0] = 1;
  }
}

// A kernel to test the laplacian function, used only in tests
__kernel void test_laplacian(__global const real *m_i,
                             __global real *lap_m_i,
                             const real inv_dx2) {

  int gid = get_global_id(0);
  if (gid >= NTOT) return;

  // Decompose linear ID into i, j, k
  int k = gid % NZ;
  int j = (gid / NZ) % NY;
  int i = gid / (NY * NZ);

  real laplacian;
  compute_laplacian(m_i, inv_dx2, &laplacian);
  lap_m_i[gid] = laplacian;
}

// Reduction kernel: sum m1 with midpoint weights in a single pass
// Uses local memory for work-group reduction
__kernel void sum_m1_weighted(__global const real *m_1, 
                              __global real *partial_sums,
                              __local real *local_sum) {
  int gid = get_global_id(0);
  int lid = get_local_id(0);
  int group_id = get_group_id(0);
  int lsize = get_local_size(0);
  
  // Each work item processes one element
  real my_sum = 0.0;
  if (gid < NTOT) {
    // Decompose linear ID into i, j, k
    int k = gid % NZ;
    int j = (gid / NZ) % NY;
    int i = gid / (NY * NZ);
    
    // Get m1 value
    real value = m_1[gid];
    
    // Apply midpoint method weights
    real weight = 1.0;
    if (i == 0 || i == NX-1) weight *= 0.5;
    if (j == 0 || j == NY-1) weight *= 0.5;
    if (k == 0 || k == NZ-1) weight *= 0.5;
    
    my_sum = weight * value;
  } else {
    // Out-of-range threads contribute zero
    my_sum = 0.0;
  }
  
  local_sum[lid] = my_sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  
  // Work-group level reduction
  for (int offset = lsize / 2; offset > 0; offset /= 2) {
    if (lid < offset) {
      local_sum[lid] += local_sum[lid + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  
  // Write partial sum for this work-group
  if (lid == 0) {
    partial_sums[group_id] = local_sum[0];
  }
}

// Second-level reduction: reduce partial sums to a single value
// Handles cases where num_groups > local_size by looping
__kernel void reduce_partial_sums(__global const real *partial_sums,
                                  __global real *final_sum,
                                  __local real *local_sum,
                                  const int num_groups) {
  int lid = get_local_id(0);
  int lsize = get_local_size(0);
  
  // Accumulate partial sums from global memory
  real my_sum = 0.0;
  for (int i = lid; i < num_groups; i += lsize) {
    my_sum += partial_sums[i];
  }
  
  local_sum[lid] = my_sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  
  // Work-group level reduction
  for (int offset = lsize / 2; offset > 0; offset /= 2) {
    if (lid < offset) {
      local_sum[lid] += local_sum[lid + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  
  // Write final sum (only one work-group needed)
  if (lid == 0) {
    final_sum[0] = local_sum[0];
  }
}