// chain simulation: AVX2 SoA XPBD particle-chain solver
#include <immintrin.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <float.h>
#define MAX_MOTION_RADIUS 1000000.0f
enum {
MAX_CHNS = 128,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping; // [0, 1]
float stiffness; // [0, 1]
float friction; // [0, 1]
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
float bend_stiffness; // [0, 1], bending resistance
};
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
// chain global forces (world space gravity and wind)
float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
// chain transformations (world space)
float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
// particle positions (model space, transposed: 0-7 even indices, 8-15 odd indices)
float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // Transposed: even cons 0-7, odd 8-15 (last dummy)
// XPBD lambdas for distance constraints (transposed similarly)
float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
float ptc_bend_rest[MAX_REST_LEN] __attribute__((aligned(32)));
float ptc_bend_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
// sphere positions (model space)
float sph_x[MAX_SPH] __attribute__((aligned(32)));
float sph_y[MAX_SPH] __attribute__((aligned(32)));
float sph_z[MAX_SPH] __attribute__((aligned(32)));
float sph_r[MAX_SPH] __attribute__((aligned(32)));
// rest positions (model space, transposed)
float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
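// Layout note: particle state is SoA and transposed per chain so one
// 16-particle chain maps onto two 8-lane AVX blocks: lanes p_base+0..7 hold
// particles 0,2,4,...,14 (even) and lanes p_base+8..15 hold 1,3,...,15 (odd).
// Distance constraint k then couples even lane k with odd lane k (particles
// 2k and 2k+1), and odd constraint k couples odd lane k with even lane k+1,
// so the solver below runs on full vectors with a single lane rotation
// instead of gather/scatter.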
static int
chn_sim__val(struct chn_sim *cns) {
assert(cns);
assert(cns->free_idx_cnt <= MAX_CHNS);
for (int i = 0; i < cns->free_idx_cnt; ++i) {
assert(i < MAX_CHNS);
assert(cns->free_idx[i] < MAX_CHNS);
}
for (int i = 0; i < MAX_CHNS; ++i) {
const struct chn_cfg *cfg = &cns->chn_cfg[i];
assert(cfg->damping >= 0.0f);
assert(cfg->damping <= 1.0f);
assert(cfg->stiffness >= 0.0f);
assert(cfg->stiffness <= 1.0f);
assert(cfg->friction >= 0.0f);
assert(cfg->friction <= 1.0f);
assert(cfg->linear_inertia >= 0.0f);
assert(cfg->linear_inertia <= 1.0f);
assert(cfg->angular_inertia >= 0.0f);
assert(cfg->angular_inertia <= 1.0f);
assert(cfg->centrifugal_inertia >= 0.0f);
assert(cfg->centrifugal_inertia <= 1.0f);
assert(cfg->bend_stiffness >= 0.0f);
assert(cfg->bend_stiffness <= 1.0f);
assert(isfinite(cns->chn_grav_x[i]));
assert(isfinite(cns->chn_grav_y[i]));
assert(isfinite(cns->chn_grav_z[i]));
assert(isfinite(cns->chn_wnd_x[i]));
assert(isfinite(cns->chn_wnd_y[i]));
assert(isfinite(cns->chn_wnd_z[i]));
assert(isfinite(cns->chn_pos_x[i]));
assert(isfinite(cns->chn_pos_y[i]));
assert(isfinite(cns->chn_pos_z[i]));
assert(isfinite(cns->chn_quat_x[i]));
assert(isfinite(cns->chn_quat_y[i]));
assert(isfinite(cns->chn_quat_z[i]));
assert(isfinite(cns->chn_quat_w[i]));
// Quaternion normalization invariant
float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] +
cns->chn_quat_y[i]*cns->chn_quat_y[i] +
cns->chn_quat_z[i]*cns->chn_quat_z[i] +
cns->chn_quat_w[i]*cns->chn_quat_w[i];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_prev_pos_x[i]));
assert(isfinite(cns->chn_prev_pos_y[i]));
assert(isfinite(cns->chn_prev_pos_z[i]));
assert(isfinite(cns->chn_prev_quat_x[i]));
assert(isfinite(cns->chn_prev_quat_y[i]));
assert(isfinite(cns->chn_prev_quat_z[i]));
assert(isfinite(cns->chn_prev_quat_w[i]));
// Prev quat norm
qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] +
cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] +
cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] +
cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
}
for (int i = 0; i < MAX_PTC; ++i) {
assert(i < MAX_PTC);
assert(cns->ptc_rest_len[i] >= 0.0f);
assert(cns->motion_radius[i] >= 0.0f);
assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS);
assert(isfinite(cns->ptc_pos_x[i]));
assert(isfinite(cns->ptc_pos_y[i]));
assert(isfinite(cns->ptc_pos_z[i]));
assert(isfinite(cns->ptc_inv_mass[i]));
assert(cns->ptc_inv_mass[i] >= 0.0f);
assert(isfinite(cns->ptc_prev_pos_x[i]));
assert(isfinite(cns->ptc_prev_pos_y[i]));
assert(isfinite(cns->ptc_prev_pos_z[i]));
assert(isfinite(cns->rest_pos_x[i]));
assert(isfinite(cns->rest_pos_y[i]));
assert(isfinite(cns->rest_pos_z[i]));
assert(isfinite(cns->motion_radius[i]));
// Motion constraint invariant: every particle stays within its motion radius of its rest position
float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
float dist = sqrtf(dx*dx + dy*dy + dz*dz);
assert(dist <= cns->motion_radius[i] + 1e-6f);
}
for (int i = 0; i < MAX_SPH; ++i) {
assert(i < MAX_SPH);
assert(isfinite(cns->sph_x[i]));
assert(isfinite(cns->sph_y[i]));
assert(isfinite(cns->sph_z[i]));
assert(isfinite(cns->sph_r[i]));
assert(cns->sph_r[i] >= 0.0f);
}
for (int i = 0; i < MAX_REST_LEN; ++i) {
assert(i < MAX_REST_LEN);
assert(cns->ptc_rest_len[i] >= 0.0f);
assert(isfinite(cns->ptc_rest_len[i]));
assert(isfinite(cns->ptc_lambda[i]));
assert(cns->ptc_lambda[i] >= -1e6f);
assert(cns->ptc_lambda[i] <= 1e6f);
assert(cns->ptc_bend_rest[i] >= 0.0f);
assert(isfinite(cns->ptc_bend_rest[i]));
assert(isfinite(cns->ptc_bend_lambda[i]));
assert(cns->ptc_bend_lambda[i] >= -1e6f);
assert(cns->ptc_bend_lambda[i] <= 1e6f);
}
return 1;
}
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
memset(cns, 0, sizeof(*cns));
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
}
for (int i = 0; i < MAX_CHNS; i += 8) {
_mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
_mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
}
for (int i = 0; i < MAX_PTC; i += 8) {
_mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
}
assert(chn_sim__val(cns));
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
assert(cns);
assert(cfg);
assert(rest_pos);
assert(cns->free_idx_cnt > 0);
assert(cnt <= PTC_PER_CHN);
assert(cnt > 1);
assert(chn_sim__val(cns));
assert(cfg->damping >= 0.0f);
assert(cfg->damping <= 1.0f);
assert(cfg->stiffness >= 0.0f);
assert(cfg->stiffness <= 1.0f);
assert(cfg->friction >= 0.0f);
assert(cfg->friction <= 1.0f);
assert(cfg->linear_inertia >= 0.0f);
assert(cfg->linear_inertia <= 1.0f);
assert(cfg->angular_inertia >= 0.0f);
assert(cfg->angular_inertia <= 1.0f);
assert(cfg->centrifugal_inertia >= 0.0f);
assert(cfg->centrifugal_inertia <= 1.0f);
assert(cfg->bend_stiffness >= 0.0f);
assert(cfg->bend_stiffness <= 1.0f);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int r_base = (chn * CON_PER_CHN);
assert(r_base + 15 < MAX_REST_LEN);
int even_base = p_base + 0; // Even indices 0-7
int odd_base = p_base + 8; // Odd indices 8-15
// Transpose particles: even indices to 0-7, odd to 8-15
int even_cnt = (cnt + 1) / 2; // Number of even particles
int odd_cnt = cnt / 2; // Number of odd particles
assert(even_cnt + odd_cnt == cnt);
for (int i = 0; i < even_cnt; ++i) {
int orig_i = 2 * i;
assert(orig_i * 4 + 3 < cnt * 4);
cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0];
cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1];
cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2];
cns->ptc_inv_mass[even_base + i] = rest_pos[orig_i*4+3];
cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0];
cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1];
cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2];
// Set previous positions to match initial positions for zero initial velocity
cns->ptc_prev_pos_x[even_base + i] = cns->ptc_pos_x[even_base + i];
cns->ptc_prev_pos_y[even_base + i] = cns->ptc_pos_y[even_base + i];
cns->ptc_prev_pos_z[even_base + i] = cns->ptc_pos_z[even_base + i];
}
for (int i = even_cnt; i < 8; ++i) {
cns->ptc_pos_x[even_base + i] = 0.0f;
cns->ptc_pos_y[even_base + i] = 0.0f;
cns->ptc_pos_z[even_base + i] = 0.0f;
cns->ptc_inv_mass[even_base + i] = 0.0f;
cns->rest_pos_x[even_base + i] = 0.0f;
cns->rest_pos_y[even_base + i] = 0.0f;
cns->rest_pos_z[even_base + i] = 0.0f;
// Previous positions for pads remain 0.0f
cns->ptc_prev_pos_x[even_base + i] = 0.0f;
cns->ptc_prev_pos_y[even_base + i] = 0.0f;
cns->ptc_prev_pos_z[even_base + i] = 0.0f;
}
for (int i = 0; i < odd_cnt; ++i) {
int orig_i = 2 * i + 1;
assert(orig_i * 4 + 3 < cnt * 4);
cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
cns->ptc_inv_mass[odd_base + i] = rest_pos[orig_i*4+3];
cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
// Set previous positions to match initial positions for zero initial velocity
cns->ptc_prev_pos_x[odd_base + i] = cns->ptc_pos_x[odd_base + i];
cns->ptc_prev_pos_y[odd_base + i] = cns->ptc_pos_y[odd_base + i];
cns->ptc_prev_pos_z[odd_base + i] = cns->ptc_pos_z[odd_base + i];
}
for (int i = odd_cnt; i < 8; ++i) {
cns->ptc_pos_x[odd_base + i] = 0.0f;
cns->ptc_pos_y[odd_base + i] = 0.0f;
cns->ptc_pos_z[odd_base + i] = 0.0f;
cns->ptc_inv_mass[odd_base + i] = 0.0f;
cns->rest_pos_x[odd_base + i] = 0.0f;
cns->rest_pos_y[odd_base + i] = 0.0f;
cns->rest_pos_z[odd_base + i] = 0.0f;
// Previous positions for pads remain 0.0f
cns->ptc_prev_pos_x[odd_base + i] = 0.0f;
cns->ptc_prev_pos_y[odd_base + i] = 0.0f;
cns->ptc_prev_pos_z[odd_base + i] = 0.0f;
}
// Set motion radius to MAX_MOTION_RADIUS by default (effectively unconstrained)
for (int i = 0; i < PTC_PER_CHN; i += 8) {
_mm256_store_ps(&cns->motion_radius[p_base + i], _mm256_set1_ps(MAX_MOTION_RADIUS));
}
// Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15
for (int i = 1; i < cnt; ++i) {
float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
float len = sqrtf(dx*dx + dy*dy + dz*dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
int con_idx = i - 1;
if ((con_idx & 0x01) == 0) { // Even constraint
cns->ptc_rest_len[r_base + (con_idx / 2)] = len;
cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
} else { // Odd constraint
cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = len;
cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
}
}
for (int i = cnt - 1; i < CON_PER_CHN - 1; ++i) { // Pad dummies
int con_idx = i;
if ((con_idx & 0x01) == 0) {
cns->ptc_rest_len[r_base + (con_idx / 2)] = 0.0f;
cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
} else {
cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
}
}
// Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15
int even_bend_cnt = even_cnt - 1;
int odd_bend_cnt = odd_cnt - 1;
assert(even_bend_cnt >= 0);
assert(odd_bend_cnt >= 0);
for (int i = 0; i < even_bend_cnt; ++i) {
float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1];
float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1];
float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1];
float len = sqrtf(dx * dx + dy * dy + dz * dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
cns->ptc_bend_rest[r_base + i] = len;
cns->ptc_bend_lambda[r_base + i] = 0.0f;
}
for (int i = even_bend_cnt; i < 8; ++i) {
cns->ptc_bend_rest[r_base + i] = 0.0f;
cns->ptc_bend_lambda[r_base + i] = 0.0f;
}
for (int i = 0; i < odd_bend_cnt; ++i) {
float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1];
float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1];
float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1];
float len = sqrtf(dx * dx + dy * dy + dz * dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
cns->ptc_bend_rest[r_base + 8 + i] = len;
cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
}
for (int i = odd_bend_cnt; i < 8; ++i) {
cns->ptc_bend_rest[r_base + 8 + i] = 0.0f;
cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
}
assert(chn_sim__val(cns));
return chn;
}
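// Usage sketch (illustrative, not part of the original gist): one 4-particle
// chain pinned at the root, settling under gravity. rest_pos packs four
// floats per particle: x, y, z, inverse mass (0 = pinned). The config values
// are assumptions, not tuned numbers. Forward declarations cover functions
// defined further below in this file.
extern void chn_sim_del(struct chn_sim *cns, int chn);
extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3);
extern void chn_sim_run(struct chn_sim *cns, float dt);
static __attribute__((unused)) void
chn_sim_example(void) {
struct chn_cfg cfg = {
.damping = 0.98f, .stiffness = 0.9f, .friction = 0.3f,
.linear_inertia = 1.0f, .angular_inertia = 1.0f,
.centrifugal_inertia = 1.0f, .bend_stiffness = 0.5f,
};
// x, y, z, inv_mass per particle; root pinned with inv_mass 0
static const float rest_pos[4 * 4] = {
0.0f, 0.0f, 0.0f, 0.0f,
0.0f, -0.1f, 0.0f, 1.0f,
0.0f, -0.2f, 0.0f, 1.0f,
0.0f, -0.3f, 0.0f, 1.0f,
};
static struct chn_sim sim; // large aligned arrays; keep off the stack
chn_sim_init(&sim);
int chn = chn_sim_add(&sim, &cfg, rest_pos, 4);
const float grav[3] = {0.0f, -9.81f, 0.0f};
chn_sim_grav(&sim, chn, grav);
for (int i = 0; i < 60; ++i) {
chn_sim_run(&sim, 1.0f / 60.0f); // fixed 60 Hz step
}
chn_sim_del(&sim, chn);
}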
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(chn_sim__val(cns));
assert(chn >= 0);
assert(chn < MAX_CHNS);
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int s_base = (chn * SPH_PER_CHN);
assert(s_base + 7 < MAX_SPH);
int r_base = (chn * CON_PER_CHN);
assert(r_base + 15 < MAX_REST_LEN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_grav_x[chn] = 0.0f;
cns->chn_grav_y[chn] = 0.0f;
cns->chn_grav_z[chn] = 0.0f;
cns->chn_wnd_x[chn] = 0.0f;
cns->chn_wnd_y[chn] = 0.0f;
cns->chn_wnd_z[chn] = 0.0f;
cns->chn_pos_x[chn] = 0.0f;
cns->chn_pos_y[chn] = 0.0f;
cns->chn_pos_z[chn] = 0.0f;
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_pos_x[chn] = 0.0f;
cns->chn_prev_pos_y[chn] = 0.0f;
cns->chn_prev_pos_z[chn] = 0.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS));
_mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS));
_mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_rest[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f));
assert(chn_sim__val(cns));
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3,
const float *restrict rot4) {
assert(chn_sim__val(cns));
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_quat_x[chn]));
assert(isfinite(cns->chn_quat_y[chn]));
assert(isfinite(cns->chn_quat_z[chn]));
assert(isfinite(cns->chn_quat_w[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3,
const float *restrict rot4) {
assert(chn_sim__val(cns));
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_quat_x[chn]));
assert(isfinite(cns->chn_quat_y[chn]));
assert(isfinite(cns->chn_quat_z[chn]));
assert(isfinite(cns->chn_quat_w[chn]));
assert(isfinite(cns->chn_prev_quat_x[chn]));
assert(isfinite(cns->chn_prev_quat_y[chn]));
assert(isfinite(cns->chn_prev_quat_z[chn]));
assert(isfinite(cns->chn_prev_quat_w[chn]));
assert(chn_sim__val(cns));
}
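// Note: chn_sim_mov updates only the current transform, so the next
// chn_sim_run derives a velocity from (current - previous) and applies
// inertia; chn_sim_tel also overwrites the previous transform, teleporting
// the chain with zero inferred velocity.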
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
assert(chn_sim__val(cns));
assert(grav3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
assert(isfinite(cns->chn_grav_x[chn]));
assert(isfinite(cns->chn_grav_y[chn]));
assert(isfinite(cns->chn_grav_z[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
assert(chn_sim__val(cns));
assert(wind3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_wnd_x[chn] = wind3[0];
cns->chn_wnd_y[chn] = wind3[1];
cns->chn_wnd_z[chn] = wind3[2];
assert(isfinite(cns->chn_wnd_x[chn]));
assert(isfinite(cns->chn_wnd_y[chn]));
assert(isfinite(cns->chn_wnd_z[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int cnt) {
assert(chn_sim__val(cns));
assert(spheres);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt <= SPH_PER_CHN);
assert(cnt >= 0);
int s_base = (chn * SPH_PER_CHN);
assert(s_base + 7 < MAX_SPH);
for (int i = 0; i < cnt; i++) {
assert(i * 4 + 3 < cnt * 4);
cns->sph_x[s_base + i] = spheres[i*4+0];
cns->sph_y[s_base + i] = spheres[i*4+1];
cns->sph_z[s_base + i] = spheres[i*4+2];
cns->sph_r[s_base + i] = spheres[i*4+3];
assert(isfinite(cns->sph_x[s_base + i]));
assert(isfinite(cns->sph_y[s_base + i]));
assert(isfinite(cns->sph_z[s_base + i]));
assert(isfinite(cns->sph_r[s_base + i]));
assert(cns->sph_r[s_base + i] >= 0.0f);
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_x[s_base + i] = 0;
cns->sph_y[s_base + i] = 0;
cns->sph_z[s_base + i] = 0;
cns->sph_r[s_base + i] = 0;
}
assert(chn_sim__val(cns));
}
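// Illustrative collider setup (assumption: spheres live in chain-model space
// and pack x, y, z, radius per entry, matching the 4-float stride above;
// sim/chn as in the usage sketch earlier):
//   const float sph[2 * 4] = {0.0f, -0.2f, 0.0f, 0.05f,
//                             0.0f, -0.4f, 0.0f, 0.05f};
//   chn_sim_set_sph(&sim, chn, sph, 2);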
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *restrict radius, int cnt) {
assert(chn_sim__val(cns));
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt >= 0);
assert(cnt <= PTC_PER_CHN);
assert(radius);
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int even_base = p_base + 0;
int odd_base = p_base + 8;
int even_cnt = (cnt + 1) / 2;
int odd_cnt = cnt / 2;
for (int i = 0; i < even_cnt; ++i) {
assert(2 * i < cnt);
cns->motion_radius[even_base + i] = radius[2 * i];
assert(cns->motion_radius[even_base + i] >= 0.0f);
assert(cns->motion_radius[even_base + i] <= MAX_MOTION_RADIUS);
}
for (int i = even_cnt; i < 8; ++i) {
cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS;
}
for (int i = 0; i < odd_cnt; ++i) {
assert(2 * i + 1 < cnt);
cns->motion_radius[odd_base + i] = radius[2 * i + 1];
assert(cns->motion_radius[odd_base + i] >= 0.0f);
assert(cns->motion_radius[odd_base + i] <= MAX_MOTION_RADIUS);
}
for (int i = odd_cnt; i < 8; ++i) {
cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS;
}
assert(chn_sim__val(cns));
}
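// Illustrative motion constraint (one radius per particle, in chain order):
// pin the root hard and allow progressively more travel down the chain.
//   const float radii[4] = {0.0f, 0.1f, 0.2f, 0.3f};
//   chn_sim_con_motion(&sim, chn, radii, 4);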
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
assert(chn_sim__val(cns));
assert(dt > 0.0f);
assert(dt < 1.0f);
// SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 lambda_min_vec = _mm256_set1_ps(-1e6f);
const __m256 lambda_max_vec = _mm256_set1_ps(1e6f);
const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 neg_rest_vec = _mm256_set1_ps(-1.0f); // For FMAs
// Stack arrays for precomputed data
float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float vel_x[MAX_CHNS] __attribute__((aligned(32)));
float vel_y[MAX_CHNS] __attribute__((aligned(32)));
float vel_z[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
// Step 0: Precomputation
for (int c = 0; c < MAX_CHNS; c += 8) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
__builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 0, 3);
__builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 0, 3);
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);
// Step 1: t = q_conj * v_world_as_quat
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: res_vec = (t * q)_vec
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
_mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
_mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
assert(isfinite(_mm256_cvtss_f32(res_x)));
assert(isfinite(_mm256_cvtss_f32(res_y)));
assert(isfinite(_mm256_cvtss_f32(res_z)));
}
// Compute local-space gravity (identical to wind)
{
__m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
tx = _mm256_fnmadd_ps(cqz, vy, tx);
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
ty = _mm256_fnmadd_ps(cqx, vz, ty);
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
tz = _mm256_fnmadd_ps(cqy, vx, tz);
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
tw = _mm256_fnmadd_ps(cqy, vy, tw);
tw = _mm256_fnmadd_ps(cqz, vz, tw);
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_grav_lcl_x[c], res_x);
_mm256_store_ps(&chn_grav_lcl_y[c], res_y);
_mm256_store_ps(&chn_grav_lcl_z[c], res_z);
assert(isfinite(_mm256_cvtss_f32(res_x)));
assert(isfinite(_mm256_cvtss_f32(res_y)));
assert(isfinite(_mm256_cvtss_f32(res_z)));
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
assert(isfinite(_mm256_cvtss_f32(vel_x_vec)));
assert(isfinite(_mm256_cvtss_f32(vel_y_vec)));
assert(isfinite(_mm256_cvtss_f32(vel_z_vec)));
}
// Compute angular velocity
{
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
__m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);
// Step 1: Compute delta quaternion (to - from)
__m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
__m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
__m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
__m256 dt_qw = _mm256_sub_ps(qw, prev_qw);
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
__m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
__m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
__m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
__m256 conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
__m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));
__m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
orient_qx = _mm256_add_ps(orient_qx, cross_x);
__m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
orient_qy = _mm256_add_ps(orient_qy, cross_y);
__m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
orient_qz = _mm256_add_ps(orient_qz, cross_z);
// Step 4: Compute dot product (to, from) for shortest arc check
__m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
dot = _mm256_fmadd_ps(qy, prev_qy, dot);
dot = _mm256_fmadd_ps(qz, prev_qz, dot);
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
__m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
__m256 negate_sign = _mm256_blendv_ps(one_vec, neg_one_vec, negate_mask);
orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
// Norm check and scale if >1+eps
__m256 norm_sq = _mm256_fmadd_ps(orient_qz, orient_qz, _mm256_fmadd_ps(orient_qy, orient_qy, _mm256_fmadd_ps(orient_qx, orient_qx, _mm256_mul_ps(orient_qw, orient_qw))));
__m256 scale_mask = _mm256_cmp_ps(norm_sq, _mm256_set1_ps((1.0f + 1e-6f) * (1.0f + 1e-6f)), _CMP_GT_OQ);
__m256 inv_norm = _mm256_rsqrt_ps(norm_sq);
__m256 scale = _mm256_blendv_ps(one_vec, inv_norm, scale_mask);
orient_qx = _mm256_mul_ps(orient_qx, scale);
orient_qy = _mm256_mul_ps(orient_qy, scale);
orient_qz = _mm256_mul_ps(orient_qz, scale);
orient_qw = _mm256_mul_ps(orient_qw, scale);
// Step 6: Angular velocity from the quaternion delta: w = 2 * dq * conj(q_prev) / dt,
// so the vector part is scaled by 2 / dt
__m256 scale_ang = _mm256_mul_ps(two_vec, inv_dt_vec);
__m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale_ang);
__m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale_ang);
__m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale_ang);
// Store results
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
assert(isfinite(_mm256_cvtss_f32(ang_vel_x_vec)));
assert(isfinite(_mm256_cvtss_f32(ang_vel_y_vec)));
assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec)));
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3);
__builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3);
__builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 1, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt);
__m256 delta_y_vec = _mm256_set1_ps(vel_y[c] * dt);
__m256 delta_z_vec = _mm256_set1_ps(vel_z[c] * dt);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
__m256 lin_lag_factor = _mm256_sub_ps(one_vec, linear_inertia_vec);
__m256 lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
__m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
__m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
// Centrifugal zero-check: ang_vel mag (shared)
__m256 ang_mag_sq = _mm256_fmadd_ps(ang_vel_z_vec, ang_vel_z_vec, _mm256_fmadd_ps(ang_vel_y_vec, ang_vel_y_vec, _mm256_mul_ps(ang_vel_x_vec, ang_vel_x_vec)));
__m256 ang_zero_mask = _mm256_cmp_ps(ang_mag_sq, eps_vec, _CMP_LT_OQ);
// Even block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
__m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
__m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
__m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
// Apply centrifugal zero-check
cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
// Blend masks for all dt
lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_add_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
}
// Odd block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
__m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
__m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
__m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
// Apply centrifugal zero-check
cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_add_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z));
}
// Update previous transformation
cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
}
// Step 2: Verlet Integration - Duplicated for even/odd
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
__m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
__m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);
__m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
__m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
__m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
__m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
__m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
__m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
// Even block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x);
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
// Odd block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x);
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y);
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
}
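// (Step 2 recap) The integration above is damped position Verlet:
//   x_new = x + damping * (x - x_prev) + (force * inv_mass) * dt^2
// with gravity and wind already rotated into chain-local space during Step 0.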
// Step 3: XPBD Constraints (distance + bending) - lane permutes stay within each chain's even/odd registers
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
int r_base = c * CON_PER_CHN;
assert(r_base + 15 < MAX_REST_LEN);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
__m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0
__m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
__m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(bend_stiff_vec, eps_vec));
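// XPBD-style solve per constraint C(x) = dist - rest:
//   dlambda = -(C + a * lambda) / (w_A + w_B + a),  x_i += +/- w_i * dlambda * unit
// where a = compliance_factor above. (Canonical XPBD uses a = 1 / (k * dt^2);
// this code maps stiffness to a = dt^2 / k, which softens as dt grows.)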
// Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
__m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 even_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 even_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 odd_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 odd_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 odd_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 odd_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
// Load transposed rest lengths: even cons in r_base+0..7, odd in r_base+8..15
__m256 rest_even = _mm256_load_ps(&cns->ptc_rest_len[r_base + 0]);
__m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
__m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 0]);
__m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 8]);
// XPBD: accumulated Lagrange multipliers restart at zero every step
__m256 lambda_even = zero_vec;
__m256 lambda_odd = zero_vec;
__m256 bend_lambda_even = zero_vec;
__m256 bend_lambda_odd = zero_vec;
for (int iter = 0; iter < MAX_ITR; ++iter) {
// Even distance constraints: p0 = even (A), p1 = odd (B)
{
__m256 dx = _mm256_sub_ps(even_x, odd_x); // A - B
__m256 dy = _mm256_sub_ps(even_y, odd_y);
__m256 dz = _mm256_sub_ps(even_z, odd_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(even_m, odd_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_even));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda
__m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda)
__m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * even_m
even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, even_m), one_vec, even_x);
even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, even_m), one_vec, even_y);
even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, even_m), one_vec, even_z);
// dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m
odd_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
odd_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
odd_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);
lambda_even = new_lambda;
}
// Odd distance constraints: p1 = odd (A), p2 = even shifted (B)
{
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
__m256 p2_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
__m256 p2_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
__m256 p2_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
__m256 p2_m = _mm256_permutevar8x32_ps(even_m, shift_idx);
__m256 dx = _mm256_sub_ps(odd_x, p2_x);
__m256 dy = _mm256_sub_ps(odd_y, p2_y);
__m256 dz = _mm256_sub_ps(odd_z, p2_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(odd_m, p2_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_odd));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * odd_m
odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);
// dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m, scattered back to
// even lanes with the inverse permutation (p2 lane k is even lane k+1)
__m256i unshift_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
__m256 delta_p2_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, p2_m), one_vec, zero_vec);
even_x = _mm256_add_ps(even_x, _mm256_permutevar8x32_ps(delta_p2_x, unshift_idx));
__m256 delta_p2_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, p2_m), one_vec, zero_vec);
even_y = _mm256_add_ps(even_y, _mm256_permutevar8x32_ps(delta_p2_y, unshift_idx));
__m256 delta_p2_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, p2_m), one_vec, zero_vec);
even_z = _mm256_add_ps(even_z, _mm256_permutevar8x32_ps(delta_p2_z, unshift_idx));
lambda_odd = new_lambda;
}
// Even bending constraints: between even[k] (A) and even[k+1] (B)
{
__m256 a_x = even_x;
__m256 a_y = even_y;
__m256 a_z = even_z;
__m256 a_m = even_m;
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
__m256 b_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
__m256 b_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
__m256 b_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
__m256 b_m = _mm256_permutevar8x32_ps(even_m, shift_idx);
__m256 dx = _mm256_sub_ps(a_x, b_x);
__m256 dy = _mm256_sub_ps(a_y, b_y);
__m256 dz = _mm256_sub_ps(a_z, b_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(a_m, b_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_even));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * a_m
even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, even_x);
even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, even_y);
even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, even_z);
// dpos_B = - corr * b_m, scattered back to even lanes with the inverse
// permutation (b lane k is even lane k+1)
__m256i unshift_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
__m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec);
even_x = _mm256_add_ps(even_x, _mm256_permutevar8x32_ps(delta_b_x, unshift_idx));
__m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec);
even_y = _mm256_add_ps(even_y, _mm256_permutevar8x32_ps(delta_b_y, unshift_idx));
__m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec);
even_z = _mm256_add_ps(even_z, _mm256_permutevar8x32_ps(delta_b_z, unshift_idx));
bend_lambda_even = new_lambda;
}
// Odd bending constraints: odd[k] (A) to odd[k+1] (B), i.e. chain particles 2k+1 and 2k+3
{
__m256 a_x = odd_x;
__m256 a_y = odd_y;
__m256 a_z = odd_z;
__m256 a_m = odd_m;
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); // gather: lane k reads lane k+1
__m256i scatter_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); // scatter: lane k writes lane k+1
__m256 b_x = _mm256_permutevar8x32_ps(odd_x, shift_idx);
__m256 b_y = _mm256_permutevar8x32_ps(odd_y, shift_idx);
__m256 b_z = _mm256_permutevar8x32_ps(odd_z, shift_idx);
__m256 b_m = _mm256_permutevar8x32_ps(odd_m, shift_idx);
__m256 dx = _mm256_sub_ps(a_x, b_x);
__m256 dy = _mm256_sub_ps(a_y, b_y);
__m256 dz = _mm256_sub_ps(a_z, b_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(a_m, b_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_sub_ps(dist, bend_rest_odd);
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * a_m
odd_x = _mm256_fmadd_ps(corr_x, a_m, odd_x);
odd_y = _mm256_fmadd_ps(corr_y, a_m, odd_y);
odd_z = _mm256_fmadd_ps(corr_z, a_m, odd_z);
// dpos_B = -corr * b_m, scattered so lane k's correction lands on odd[k+1]
__m256 delta_b_x = _mm256_fnmadd_ps(corr_x, b_m, zero_vec);
odd_x = _mm256_add_ps(odd_x, _mm256_permutevar8x32_ps(delta_b_x, scatter_idx));
__m256 delta_b_y = _mm256_fnmadd_ps(corr_y, b_m, zero_vec);
odd_y = _mm256_add_ps(odd_y, _mm256_permutevar8x32_ps(delta_b_y, scatter_idx));
__m256 delta_b_z = _mm256_fnmadd_ps(corr_z, b_m, zero_vec);
odd_z = _mm256_add_ps(odd_z, _mm256_permutevar8x32_ps(delta_b_z, scatter_idx));
bend_lambda_odd = new_lambda;
}
}
// Store back
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z);
_mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even);
_mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even);
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd);
assert(isfinite(_mm256_cvtss_f32(even_x)));
assert(isfinite(_mm256_cvtss_f32(odd_x)));
}
// Step 4: Sphere Collisions with Friction - Duplicated for even/odd
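// Penetrating particles are pushed back to the sphere surface along the
// contact normal; tangential motion is damped by the friction term below.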
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
int s_base = c * SPH_PER_CHN;
assert(s_base + 7 < MAX_SPH);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 fric_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
__m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
__m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
__m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
__m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);
// Even block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 min_inv_dist = _mm256_set1_ps(1e-6f);
inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(sph_r_vec, dist);
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 norm_x = _mm256_mul_ps(dx, inv_dist);
__m256 norm_y = _mm256_mul_ps(dy, inv_dist);
__m256 norm_z = _mm256_mul_ps(dz, inv_dist);
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
__m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
__m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
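// Positional friction: tangential correction proportional to penetration
// depth, scaled by cfg->friction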
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
// Odd block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 min_inv_dist = _mm256_set1_ps(1e-6f);
inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(sph_r_vec, dist);
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 norm_x = _mm256_mul_ps(dx, inv_dist);
__m256 norm_y = _mm256_mul_ps(dy, inv_dist);
__m256 norm_z = _mm256_mul_ps(dz, inv_dist);
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
__m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
__m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
}
}
// Step 5: Motion Constraints - Duplicated for even/odd
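// Clamp each particle to a sphere of motion_radius around its rest position;
// MAX_MOTION_RADIUS effectively leaves a particle unconstrained.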
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
// Even block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]);
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]);
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]);
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(dist, r_vec);
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 dt_x = _mm256_mul_ps(dx, corr_factor);
__m256 dt_y = _mm256_mul_ps(dy, corr_factor);
__m256 dt_z = _mm256_mul_ps(dz, corr_factor);
dt_x = _mm256_mul_ps(dt_x, pm);
dt_y = _mm256_mul_ps(dt_y, pm);
dt_z = _mm256_mul_ps(dt_z, pm);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
}
// Odd block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 8]);
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 8]);
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 8]);
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 8]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(dist, r_vec);
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 dt_x = _mm256_mul_ps(dx, corr_factor);
__m256 dt_y = _mm256_mul_ps(dy, corr_factor);
__m256 dt_z = _mm256_mul_ps(dz, corr_factor);
dt_x = _mm256_mul_ps(dt_x, pm);
dt_y = _mm256_mul_ps(dt_y, pm);
dt_z = _mm256_mul_ps(dt_z, pm);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
}
}
assert(chn_sim__val(cns));
}
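// A minimal optional sketch (not used above): if the ~12-bit _mm256_rsqrt_ps
// approximation ever causes visible drift on long chains, one Newton-Raphson
// step refines it to near full single precision: y' = 0.5*y*(3 - x*y*y).
static inline __m256
mm256_rsqrt_nr(__m256 x) {
__m256 y = _mm256_rsqrt_ps(x);
__m256 xyy = _mm256_mul_ps(_mm256_mul_ps(x, y), y);
return _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), y),
_mm256_sub_ps(_mm256_set1_ps(3.0f), xyy));
}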
#include <SDL2/SDL.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "chain.c"
#define WINDOW_WIDTH 800
#define WINDOW_HEIGHT 600
#define CHAIN_LENGTH 10
#define PARTICLE_SPACING 10.0f
static void
project(float x, float y, float z, int *screen_x, int *screen_y) {
(void)z; // simple orthographic projection: z is ignored
*screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
*screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
}
// Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
float cx, float cy, float cz, // chain pos
float qx, float qy, float qz, float qw, // chain quat
float *mx, float *my, float *mz) { // out model pos
// Translate: world - chain
float tx = wx - cx, ty = wy - cy, tz = wz - cz;
// t = conj(q) * (0, tx,ty,tz), with conj(q) = (qw, -qx, -qy, -qz)
float tw = qx*tx + qy*ty + qz*tz;
float t_x = qw*tx - qy*tz + qz*ty;
float t_y = qw*ty - qz*tx + qx*tz;
float t_z = qw*tz - qx*ty + qy*tx;
// model = (t * q).vec
*mx = tw*qx + qw*t_x + t_y*qz - t_z*qy;
*my = tw*qy + qw*t_y + t_z*qx - t_x*qz;
*mz = tw*qz + qw*t_z + t_x*qy - t_y*qx;
}
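// A minimal sketch of the forward transform (model -> world), assuming the
// same q*v*conj(q) rotation convention as above; the renderer below skips it
// because the demo drives the chain with an identity quaternion:
static void quat_rotate_translate(float mx, float my, float mz, // model pos
float cx, float cy, float cz, // chain pos
float qx, float qy, float qz, float qw, // chain quat
float *wx, float *wy, float *wz) { // out world pos
// t = q * (0, mx,my,mz)
float tw = -(qx*mx + qy*my + qz*mz);
float t_x = qw*mx + qy*mz - qz*my;
float t_y = qw*my + qz*mx - qx*mz;
float t_z = qw*mz + qx*my - qy*mx;
// world = (t * conj(q)).vec + chain pos
*wx = -tw*qx + qw*t_x - t_y*qz + t_z*qy + cx;
*wy = -tw*qy + qw*t_y - t_z*qx + t_x*qz + cy;
*wz = -tw*qz + qw*t_z - t_x*qy + t_y*qx + cz;
}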
extern int
main(int argc, char *argv[]) {
if (SDL_Init(SDL_INIT_VIDEO) < 0) {
printf("SDL_Init failed: %s\n", SDL_GetError());
return 1;
}
SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
if (!window || !renderer) {
printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
SDL_Quit();
return 1;
}
struct chn_sim sim;
chn_sim_init(&sim);
struct chn_cfg cfg = {
.damping = 0.5f,
.stiffness = 0.9f,
.friction = 0.5f,
.linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
.angular_inertia = 0.9f,
.centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
.bend_stiffness = 0.8f,
};
float rest_pos[CHAIN_LENGTH * 4];
for (int i = 0; i < CHAIN_LENGTH; i++) {
rest_pos[i*4 + 0] = 0.0f;
rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
rest_pos[i*4 + 2] = 0.0f;
rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
}
int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge
// Fixed world spheres (triangle outside circle)
float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
-30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
-30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
};
// Circular motion params
float time = 0.0f;
float circle_radius = 50.0f;
float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
float dt = 0.016f;
int running = 1;
SDL_Event event;
// Base particle index for this chain's slots in the transposed layout
int p_base = chn * PTC_PER_CHN;
while (running) {
while (SDL_PollEvent(&event)) {
if (event.type == SDL_QUIT) {
running = 0;
}
}
time += dt;
// Circular anchor motion
float anchor_x = circle_radius * cosf(angular_speed * time);
float anchor_y = circle_radius * sinf(angular_speed * time);
chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});
// Transform fixed world spheres to current model space
float model_spheres[12];
for (int s = 0; s < 3; ++s) {
float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
float r = world_spheres[s*4 + 3];
float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
float mx, my, mz;
quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
model_spheres[s*4 + 0] = mx;
model_spheres[s*4 + 1] = my;
model_spheres[s*4 + 2] = mz;
model_spheres[s*4 + 3] = r;
}
chn_sim_set_sph(&sim, chn, model_spheres, 3);
chn_sim_run(&sim, dt);
SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
SDL_RenderClear(renderer);
SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres
float root_x = sim.chn_pos_x[chn]; // Current root world x/y
float root_y = sim.chn_pos_y[chn];
// Draw chain segments
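// Chain-order index i maps into the transposed layout: even indices sit in
// slots 0-7, odd indices in slots 8-15. With the identity chain quaternion
// used above, world space is just model space translated by the chain root.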
for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);
float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
float world_x0 = model_x0 + root_x;
float world_y0 = model_y0 + root_y;
float world_x1 = model_x1 + root_x;
float world_y1 = model_y1 + root_y;
int sx0, sy0, sx1, sy1;
project(world_x0, world_y0, 0, &sx0, &sy0);
project(world_x1, world_y1, 0, &sx1, &sy1);
SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
}
// Draw spheres (approx circle outline with 32 points)
int s_base = chn * SPH_PER_CHN;
for (int s = 0; s < 3; ++s) { // Only draw active spheres
float sph_model_x = sim.sph_x[s_base + s];
float sph_model_y = sim.sph_y[s_base + s];
float sph_r = sim.sph_r[s_base + s];
if (sph_r <= 0.0f) continue;
float sph_world_x = sph_model_x + root_x;
float sph_world_y = sph_model_y + root_y;
int sph_sx, sph_sy;
project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);
// Parametric circle: 32 points
const int num_points = 32;
for (int pt = 0; pt < num_points; ++pt) {
float angle0 = 2.0f * M_PI * pt / num_points;
float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);
int px0, py0, px1, py1;
project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
}
}
SDL_RenderPresent(renderer);
SDL_Delay(16);
}
SDL_DestroyRenderer(renderer);
SDL_DestroyWindow(window);
SDL_Quit();
return 0;
}