OpenCL kernels¶
This is the source code for llg3d/solvers/opencl/kernels.cl ↗.
4 main kernels are programmed:
Kernel Name |
Description |
|---|---|
Predictor step of the Heun method. |
|
Corrector step of the Heun method and normalization of the magnetization. |
|
Compute the partial weighted average of the magnetization. |
|
Reduce the partial sums computed by |
2 additional kernels are provided for debugging and testing purposes:
check_finiteis a utility kernel to check for NaN or Inf values in the magnetization array, which can be useful for debugging and ensuring numerical stability.test_laplacianis used in the test suite to verify the correctness of the laplacian computation on the OpenCL solver.
predict¶
__kernel void predict(__global const real *m_n,
__global real *m_np1,
__global const real *R_random,
__global real *s_pre,
const real inv_dx2,
const real coeff_1,
const real coeff_2,
const real coeff_3,
const real coeff_4,
const real lambda_G,
const real dt) {
int gid = get_global_id(0);
if (gid >= NTOT) return;
int id1 = gid;
int id2 = id1 + NTOT;
int id3 = id2 + NTOT;
// Compute slope using helper
real s_1, s_2, s_3;
real laplacian_1, laplacian_2, laplacian_3;
real H_aniso_1, H_aniso_2, H_aniso_3;
real R_1, R_2, R_3;
compute_slope(m_n, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
coeff_4, lambda_G, &s_1, &s_2, &s_3,
&laplacian_1, &laplacian_2, &laplacian_3,
&H_aniso_1, &H_aniso_2, &H_aniso_3,
&R_1, &R_2, &R_3);
// Store the slopes for correction phase
s_pre[id1] = s_1;
s_pre[id2] = s_2;
s_pre[id3] = s_3;
// Apply Euler update
m_np1[id1] = m_n[id1] + dt * s_1;
m_np1[id2] = m_n[id2] + dt * s_2;
m_np1[id3] = m_n[id3] + dt * s_3;
}
correct_and_normalize¶
__kernel void correct_and_normalize(__global const real *m_n,
__global real *m_np1,
__global const real *R_random,
__global const real *s_pre,
const real inv_dx2,
const real coeff_1, const real coeff_2,
const real coeff_3, const real coeff_4,
const real lambda_G, const real dt,
__global int *error_codes) {
int gid = get_global_id(0);
if (gid >= NTOT) return;
int id1 = gid;
int id2 = id1 + NTOT;
int id3 = id2 + NTOT;
// Compute slope using helper (from m_np1 which contains m^{n+1})
real s_1, s_2, s_3;
real laplacian_1, laplacian_2, laplacian_3;
real H_aniso_1, H_aniso_2, H_aniso_3;
real R_1, R_2, R_3;
compute_slope(m_np1, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
coeff_4, lambda_G, &s_1, &s_2, &s_3,
&laplacian_1, &laplacian_2, &laplacian_3,
&H_aniso_1, &H_aniso_2, &H_aniso_3,
&R_1, &R_2, &R_3);
// Midpoint update (Heun's method correction step)
real m_1 = m_n[id1] + dt * 0.5 * (s_pre[id1] + s_1);
real m_2 = m_n[id2] + dt * 0.5 * (s_pre[id2] + s_2);
real m_3 = m_n[id3] + dt * 0.5 * (s_pre[id3] + s_3);
// Normalize
real norm = sqrt(m_1 * m_1 + m_2 * m_2 + m_3 * m_3);
m_np1[id1] = m_1 / norm;
m_np1[id2] = m_2 / norm;
m_np1[id3] = m_3 / norm;
#ifdef ENABLE_ERROR_CHECK
// Check for non-finite values - each work-item writes to its own slot (thread-safe)
int is_non_finite = (!isfinite(m_np1[id1]) || !isfinite(m_np1[id2]) || !isfinite(m_np1[id3]));
error_codes[gid] = is_non_finite ? 1 : 0;
#endif
}
sum_m1_weighted¶
__kernel void sum_m1_weighted(__global const real *m_1,
__global real *partial_sums,
__local real *local_sum) {
int gid = get_global_id(0);
int lid = get_local_id(0);
int group_id = get_group_id(0);
int lsize = get_local_size(0);
// Each work item processes one element
real my_sum = 0.0;
if (gid < NTOT) {
// Decompose linear ID into i, j, k
int k = gid % NZ;
int j = (gid / NZ) % NY;
int i = gid / (NY * NZ);
// Get m1 value
real value = m_1[gid];
// Apply midpoint method weights
real weight = 1.0;
if (i == 0 || i == NX-1) weight *= 0.5;
if (j == 0 || j == NY-1) weight *= 0.5;
if (k == 0 || k == NZ-1) weight *= 0.5;
my_sum = weight * value;
} else {
// Out-of-range threads contribute zero
my_sum = 0.0;
}
local_sum[lid] = my_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group level reduction
for (int offset = lsize / 2; offset > 0; offset /= 2) {
if (lid < offset) {
local_sum[lid] += local_sum[lid + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write partial sum for this work-group
if (lid == 0) {
partial_sums[group_id] = local_sum[0];
}
}
reduce_partial_sums¶
__kernel void reduce_partial_sums(__global const real *partial_sums,
__global real *final_sum,
__local real *local_sum,
const int num_groups) {
int lid = get_local_id(0);
int lsize = get_local_size(0);
// Accumulate partial sums from global memory
real my_sum = 0.0;
for (int i = lid; i < num_groups; i += lsize) {
my_sum += partial_sums[i];
}
local_sum[lid] = my_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group level reduction
for (int offset = lsize / 2; offset > 0; offset /= 2) {
if (lid < offset) {
local_sum[lid] += local_sum[lid + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write final sum (only one work-group needed)
if (lid == 0) {
final_sum[0] = local_sum[0];
}
}
Full kernel code¶
#ifdef USE_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif
#ifndef NX
#define NX 11
#endif
#ifndef NY
#define NY 11
#endif
#ifndef NZ
#define NZ 11
#endif
#define NTOT (NX * NY * NZ)
// Helper function to compute the Laplacian with Neumann boundary conditions
// IMPORTANT: Assumes uniform grid (inv_dx2 = inv_dy2 = inv_dz2)
// For uniform grids, accumulate differences before multiplying by inv_dx2
// to reduce truncation error in single precision
inline void compute_laplacian(const real __global *m_i,
const real inv_dx2,
real *laplacian) {
int gid = get_global_id(0);
// Decompose linear ID into i, j, k
int k = gid % NZ;
int j = (gid / NZ) % NY;
int i = gid / (NY * NZ);
int id_left = (i - 1) * NY * NZ + j * NZ + k;
int id_right = (i + 1) * NY * NZ + j * NZ + k;
int id_bottom = i * NY * NZ + (j - 1) * NZ + k;
int id_top = i * NY * NZ + (j + 1) * NZ + k;
int id_back = i * NY * NZ + j * NZ + (k - 1);
int id_front = i * NY * NZ + j * NZ + (k + 1);
// Neumann boundary conditions
if (i == 0)
id_left = id_right;
if (i == NX - 1)
id_right = id_left;
if (j == 0)
id_bottom = id_top;
if (j == NY - 1)
id_top = id_bottom;
if (k == 0)
id_back = id_front;
if (k == NZ - 1)
id_front = id_back;
real center = m_i[gid];
// Uniform grid case: accumulate all differences before multiplying by inv_dx2
// This reduces truncation error for uniform grids
*laplacian = (m_i[id_left] + m_i[id_right] +
m_i[id_top] + m_i[id_bottom] +
m_i[id_front] + m_i[id_back] - 6.0 * center) * inv_dx2;
}
// Helper function to compute the slope (ds/dt = cross(m, R))
inline void compute_slope(__global const real *m,
__global const real *R_random,
const real inv_dx2,
const real coeff_1,
const real coeff_2,
const real coeff_3,
const real coeff_4,
const real lambda_G,
real *s_1, real *s_2, real *s_3,
real *laplacian_1_out, real *laplacian_2_out, real *laplacian_3_out,
real *H_aniso_1_out, real *H_aniso_2_out, real *H_aniso_3_out,
real *R_1_out, real *R_2_out, real *R_3_out) {
int gid = get_global_id(0);
int id1 = gid;
int id2 = id1 + NTOT;
int id3 = id2 + NTOT;
real laplacian_1;
compute_laplacian(m, inv_dx2, &laplacian_1);
real laplacian_2;
compute_laplacian(m + NTOT, inv_dx2, &laplacian_2);
real laplacian_3;
compute_laplacian(m + 2 * NTOT, inv_dx2, &laplacian_3);
*laplacian_1_out = laplacian_1;
*laplacian_2_out = laplacian_2;
*laplacian_3_out = laplacian_3;
real m_1 = m[id1];
real m_2 = m[id2];
real m_3 = m[id3];
real m1m1 = m_1 * m_1;
real m2m2 = m_2 * m_2;
real m3m3 = m_3 * m_3;
// Branch based on anisotropy type
real H_aniso_1, H_aniso_2, H_aniso_3;
#ifdef ANISOTROPY_UNIAXIAL
// Uniaxial anisotropy
H_aniso_1 = coeff_2 * m_1;
H_aniso_2 = 0.f;
H_aniso_3 = 0.f;
#else // Default: ANISOTROPY_CUBIC
// Cubic anisotropy
H_aniso_1 = -(1 - m1m1 + m2m2 * m3m3) * m_1;
H_aniso_2 = -(1 - m2m2 + m1m1 * m3m3) * m_2;
H_aniso_3 = -(1 - m3m3 + m1m1 * m2m2) * m_3;
#endif
*H_aniso_1_out = H_aniso_1;
*H_aniso_2_out = H_aniso_2;
*H_aniso_3_out = H_aniso_3;
real R_1 = coeff_1 * laplacian_1 + coeff_4 * R_random[id1] + H_aniso_1 + coeff_3;
real R_2 = coeff_1 * laplacian_2 + coeff_4 * R_random[id2] + H_aniso_2;
real R_3 = coeff_1 * laplacian_3 + coeff_4 * R_random[id3] + H_aniso_3;
*R_1_out = R_1;
*R_2_out = R_2;
*R_3_out = R_3;
real G_m1m2 = lambda_G * m_1 * m_2;
real G_m1m3 = lambda_G * m_1 * m_3;
real G_m2m3 = lambda_G * m_2 * m_3;
*s_1 = (-m_2 - G_m1m3) * R_3 + (m_3 - G_m1m2) * R_2 +
lambda_G * (m2m2 + m3m3) * R_1;
*s_2 = (-m_3 - G_m1m2) * R_1 + (m_1 - G_m2m3) * R_3 +
lambda_G * (m1m1 + m3m3) * R_2;
*s_3 = (-m_1 - G_m2m3) * R_2 + (m_2 - G_m1m3) * R_1 +
lambda_G * (m1m1 + m2m2) * R_3;
}
// Prediction step with slope storage: compute slope, store it, and apply first Euler update
__kernel void predict(__global const real *m_n,
__global real *m_np1,
__global const real *R_random,
__global real *s_pre,
const real inv_dx2,
const real coeff_1,
const real coeff_2,
const real coeff_3,
const real coeff_4,
const real lambda_G,
const real dt) {
int gid = get_global_id(0);
if (gid >= NTOT) return;
int id1 = gid;
int id2 = id1 + NTOT;
int id3 = id2 + NTOT;
// Compute slope using helper
real s_1, s_2, s_3;
real laplacian_1, laplacian_2, laplacian_3;
real H_aniso_1, H_aniso_2, H_aniso_3;
real R_1, R_2, R_3;
compute_slope(m_n, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
coeff_4, lambda_G, &s_1, &s_2, &s_3,
&laplacian_1, &laplacian_2, &laplacian_3,
&H_aniso_1, &H_aniso_2, &H_aniso_3,
&R_1, &R_2, &R_3);
// Store the slopes for correction phase
s_pre[id1] = s_1;
s_pre[id2] = s_2;
s_pre[id3] = s_3;
// Apply Euler update
m_np1[id1] = m_n[id1] + dt * s_1;
m_np1[id2] = m_n[id2] + dt * s_2;
m_np1[id3] = m_n[id3] + dt * s_3;
}
// Correction step: compute slope, apply midpoint update, and normalize
__kernel void correct_and_normalize(__global const real *m_n,
__global real *m_np1,
__global const real *R_random,
__global const real *s_pre,
const real inv_dx2,
const real coeff_1, const real coeff_2,
const real coeff_3, const real coeff_4,
const real lambda_G, const real dt,
__global int *error_codes) {
int gid = get_global_id(0);
if (gid >= NTOT) return;
int id1 = gid;
int id2 = id1 + NTOT;
int id3 = id2 + NTOT;
// Compute slope using helper (from m_np1 which contains m^{n+1})
real s_1, s_2, s_3;
real laplacian_1, laplacian_2, laplacian_3;
real H_aniso_1, H_aniso_2, H_aniso_3;
real R_1, R_2, R_3;
compute_slope(m_np1, R_random, inv_dx2, coeff_1, coeff_2, coeff_3,
coeff_4, lambda_G, &s_1, &s_2, &s_3,
&laplacian_1, &laplacian_2, &laplacian_3,
&H_aniso_1, &H_aniso_2, &H_aniso_3,
&R_1, &R_2, &R_3);
// Midpoint update (Heun's method correction step)
real m_1 = m_n[id1] + dt * 0.5 * (s_pre[id1] + s_1);
real m_2 = m_n[id2] + dt * 0.5 * (s_pre[id2] + s_2);
real m_3 = m_n[id3] + dt * 0.5 * (s_pre[id3] + s_3);
// Normalize
real norm = sqrt(m_1 * m_1 + m_2 * m_2 + m_3 * m_3);
m_np1[id1] = m_1 / norm;
m_np1[id2] = m_2 / norm;
m_np1[id3] = m_3 / norm;
#ifdef ENABLE_ERROR_CHECK
// Check for non-finite values - each work-item writes to its own slot (thread-safe)
int is_non_finite = (!isfinite(m_np1[id1]) || !isfinite(m_np1[id2]) || !isfinite(m_np1[id3]));
error_codes[gid] = is_non_finite ? 1 : 0;
#endif
}
// Kernel to detect non-finite values in m (returns 1 if any NaN/Inf, else 0)
__kernel void check_finite(__global const real *m, __global int *flag) {
int gid = get_global_id(0);
int lid = get_local_id(0);
// Use local reduction to minimize atomics
__local int lflag;
if (lid == 0) lflag = 0;
barrier(CLK_LOCAL_MEM_FENCE);
if (gid < 3 * NTOT) {
real v = m[gid];
if (!isfinite(v)) {
lflag = 1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid == 0 && lflag) {
flag[0] = 1;
}
}
// A kernel to test the laplacian function, used only in tests
__kernel void test_laplacian(__global const real *m_i,
__global real *lap_m_i,
const real inv_dx2) {
int gid = get_global_id(0);
if (gid >= NTOT) return;
// Decompose linear ID into i, j, k
int k = gid % NZ;
int j = (gid / NZ) % NY;
int i = gid / (NY * NZ);
real laplacian;
compute_laplacian(m_i, inv_dx2, &laplacian);
lap_m_i[gid] = laplacian;
}
// Reduction kernel: sum m1 with midpoint weights in a single pass
// Uses local memory for work-group reduction
__kernel void sum_m1_weighted(__global const real *m_1,
__global real *partial_sums,
__local real *local_sum) {
int gid = get_global_id(0);
int lid = get_local_id(0);
int group_id = get_group_id(0);
int lsize = get_local_size(0);
// Each work item processes one element
real my_sum = 0.0;
if (gid < NTOT) {
// Decompose linear ID into i, j, k
int k = gid % NZ;
int j = (gid / NZ) % NY;
int i = gid / (NY * NZ);
// Get m1 value
real value = m_1[gid];
// Apply midpoint method weights
real weight = 1.0;
if (i == 0 || i == NX-1) weight *= 0.5;
if (j == 0 || j == NY-1) weight *= 0.5;
if (k == 0 || k == NZ-1) weight *= 0.5;
my_sum = weight * value;
} else {
// Out-of-range threads contribute zero
my_sum = 0.0;
}
local_sum[lid] = my_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group level reduction
for (int offset = lsize / 2; offset > 0; offset /= 2) {
if (lid < offset) {
local_sum[lid] += local_sum[lid + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write partial sum for this work-group
if (lid == 0) {
partial_sums[group_id] = local_sum[0];
}
}
// Second-level reduction: reduce partial sums to a single value
// Handles cases where num_groups > local_size by looping
__kernel void reduce_partial_sums(__global const real *partial_sums,
__global real *final_sum,
__local real *local_sum,
const int num_groups) {
int lid = get_local_id(0);
int lsize = get_local_size(0);
// Accumulate partial sums from global memory
real my_sum = 0.0;
for (int i = lid; i < num_groups; i += lsize) {
my_sum += partial_sums[i];
}
local_sum[lid] = my_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group level reduction
for (int offset = lsize / 2; offset > 0; offset /= 2) {
if (lid < offset) {
local_sum[lid] += local_sum[lid + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write final sum (only one work-group needed)
if (lid == 0) {
final_sum[0] = local_sum[0];
}
}