// chain simulation: AVX2 SoA XPBD particle-chain solver
#include <immintrin.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <float.h>
#define MAX_MOTION_RADIUS 1000000.0f
enum {
MAX_CHNS = 128,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping; // [0, 1]
float stiffness; // [0, 1]
float friction; // [0, 1]
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
float bend_stiffness; // [0, 1], bending resistance
};
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
// chain global forces (world space gravity and wind)
float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
// chain transformations (world space)
float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
// particle positions (model space, transposed: 0-7 even indices, 8-15 odd indices)
float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // Transposed: even cons 0-7, odd 8-15 (last dummy)
// XPBD lambdas for distance constraints (transposed similarly)
float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
float ptc_bend_rest[MAX_REST_LEN] __attribute__((aligned(32)));
float ptc_bend_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
// sphere positions (model space)
float sph_x[MAX_SPH] __attribute__((aligned(32)));
float sph_y[MAX_SPH] __attribute__((aligned(32)));
float sph_z[MAX_SPH] __attribute__((aligned(32)));
float sph_r[MAX_SPH] __attribute__((aligned(32)));
// rest positions (model space, transposed)
float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
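// Layout note: particle state is SoA and transposed per chain so one
// 16-particle chain maps onto two 8-lane AVX blocks: lanes p_base+0..7 hold
// particles 0,2,4,...,14 (even) and lanes p_base+8..15 hold 1,3,...,15 (odd).
// Distance constraint k then couples even lane k with odd lane k (particles
// 2k and 2k+1), and odd constraint k couples odd lane k with even lane k+1,
// so the solver below runs on full vectors with a single lane rotation
// instead of gather/scatter.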
static int
chn_sim__val(struct chn_sim *cns) {
assert(cns);
assert(cns->free_idx_cnt <= MAX_CHNS);
for (int i = 0; i < cns->free_idx_cnt; ++i) {
assert(i < MAX_CHNS);
assert(cns->free_idx[i] < MAX_CHNS);
}
for (int i = 0; i < MAX_CHNS; ++i) {
const struct chn_cfg *cfg = &cns->chn_cfg[i];
assert(cfg->damping >= 0.0f);
assert(cfg->damping <= 1.0f);
assert(cfg->stiffness >= 0.0f);
assert(cfg->stiffness <= 1.0f);
assert(cfg->friction >= 0.0f);
assert(cfg->friction <= 1.0f);
assert(cfg->linear_inertia >= 0.0f);
assert(cfg->linear_inertia <= 1.0f);
assert(cfg->angular_inertia >= 0.0f);
assert(cfg->angular_inertia <= 1.0f);
assert(cfg->centrifugal_inertia >= 0.0f);
assert(cfg->centrifugal_inertia <= 1.0f);
assert(cfg->bend_stiffness >= 0.0f);
assert(cfg->bend_stiffness <= 1.0f);
assert(isfinite(cns->chn_grav_x[i]));
assert(isfinite(cns->chn_grav_y[i]));
assert(isfinite(cns->chn_grav_z[i]));
assert(isfinite(cns->chn_wnd_x[i]));
assert(isfinite(cns->chn_wnd_y[i]));
assert(isfinite(cns->chn_wnd_z[i]));
assert(isfinite(cns->chn_pos_x[i]));
assert(isfinite(cns->chn_pos_y[i]));
assert(isfinite(cns->chn_pos_z[i]));
assert(isfinite(cns->chn_quat_x[i]));
assert(isfinite(cns->chn_quat_y[i]));
assert(isfinite(cns->chn_quat_z[i]));
assert(isfinite(cns->chn_quat_w[i]));
// Quaternion normalization invariant
float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] +
cns->chn_quat_y[i]*cns->chn_quat_y[i] +
cns->chn_quat_z[i]*cns->chn_quat_z[i] +
cns->chn_quat_w[i]*cns->chn_quat_w[i];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_prev_pos_x[i]));
assert(isfinite(cns->chn_prev_pos_y[i]));
assert(isfinite(cns->chn_prev_pos_z[i]));
assert(isfinite(cns->chn_prev_quat_x[i]));
assert(isfinite(cns->chn_prev_quat_y[i]));
assert(isfinite(cns->chn_prev_quat_z[i]));
assert(isfinite(cns->chn_prev_quat_w[i]));
// Prev quat norm
qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] +
cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] +
cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] +
cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
}
for (int i = 0; i < MAX_PTC; ++i) {
assert(i < MAX_PTC);
assert(cns->ptc_rest_len[i] >= 0.0f);
assert(cns->motion_radius[i] >= 0.0f);
assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS);
assert(isfinite(cns->ptc_pos_x[i]));
assert(isfinite(cns->ptc_pos_y[i]));
assert(isfinite(cns->ptc_pos_z[i]));
assert(isfinite(cns->ptc_inv_mass[i]));
assert(cns->ptc_inv_mass[i] >= 0.0f);
assert(isfinite(cns->ptc_prev_pos_x[i]));
assert(isfinite(cns->ptc_prev_pos_y[i]));
assert(isfinite(cns->ptc_prev_pos_z[i]));
assert(isfinite(cns->rest_pos_x[i]));
assert(isfinite(cns->rest_pos_y[i]));
assert(isfinite(cns->rest_pos_z[i]));
assert(isfinite(cns->motion_radius[i]));
// Motion constraint invariant: every particle stays within its motion radius of its rest position
float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
float dist = sqrtf(dx*dx + dy*dy + dz*dz);
assert(dist <= cns->motion_radius[i] + 1e-6f);
}
for (int i = 0; i < MAX_SPH; ++i) {
assert(i < MAX_SPH);
assert(isfinite(cns->sph_x[i]));
assert(isfinite(cns->sph_y[i]));
assert(isfinite(cns->sph_z[i]));
assert(isfinite(cns->sph_r[i]));
assert(cns->sph_r[i] >= 0.0f);
}
for (int i = 0; i < MAX_REST_LEN; ++i) {
assert(i < MAX_REST_LEN);
assert(cns->ptc_rest_len[i] >= 0.0f);
assert(isfinite(cns->ptc_rest_len[i]));
assert(isfinite(cns->ptc_lambda[i]));
assert(cns->ptc_lambda[i] >= -1e6f);
assert(cns->ptc_lambda[i] <= 1e6f);
assert(cns->ptc_bend_rest[i] >= 0.0f);
assert(isfinite(cns->ptc_bend_rest[i]));
assert(isfinite(cns->ptc_bend_lambda[i]));
assert(cns->ptc_bend_lambda[i] >= -1e6f);
assert(cns->ptc_bend_lambda[i] <= 1e6f);
}
return 1;
}
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
memset(cns, 0, sizeof(*cns));
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
}
for (int i = 0; i < MAX_CHNS; i += 8) {
_mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
_mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
}
for (int i = 0; i < MAX_PTC; i += 8) {
_mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
}
assert(chn_sim__val(cns));
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
assert(cns);
assert(cfg);
assert(rest_pos);
assert(cns->free_idx_cnt > 0);
assert(cnt <= PTC_PER_CHN);
assert(cnt > 1);
assert(chn_sim__val(cns));
assert(cfg->damping >= 0.0f);
assert(cfg->damping <= 1.0f);
assert(cfg->stiffness >= 0.0f);
assert(cfg->stiffness <= 1.0f);
assert(cfg->friction >= 0.0f);
assert(cfg->friction <= 1.0f);
assert(cfg->linear_inertia >= 0.0f);
assert(cfg->linear_inertia <= 1.0f);
assert(cfg->angular_inertia >= 0.0f);
assert(cfg->angular_inertia <= 1.0f);
assert(cfg->centrifugal_inertia >= 0.0f);
assert(cfg->centrifugal_inertia <= 1.0f);
assert(cfg->bend_stiffness >= 0.0f);
assert(cfg->bend_stiffness <= 1.0f);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int r_base = (chn * CON_PER_CHN);
assert(r_base + 15 < MAX_REST_LEN);
int even_base = p_base + 0; // Even indices 0-7
int odd_base = p_base + 8; // Odd indices 8-15
// Transpose particles: even indices to 0-7, odd to 8-15
int even_cnt = (cnt + 1) / 2; // Number of even particles
int odd_cnt = cnt / 2; // Number of odd particles
assert(even_cnt + odd_cnt == cnt);
for (int i = 0; i < even_cnt; ++i) {
int orig_i = 2 * i;
assert(orig_i * 4 + 3 < cnt * 4);
cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0];
cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1];
cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2];
cns->ptc_inv_mass[even_base + i] = rest_pos[orig_i*4+3];
cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0];
cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1];
cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2];
// Set previous positions to match initial positions for zero initial velocity
cns->ptc_prev_pos_x[even_base + i] = cns->ptc_pos_x[even_base + i];
cns->ptc_prev_pos_y[even_base + i] = cns->ptc_pos_y[even_base + i];
cns->ptc_prev_pos_z[even_base + i] = cns->ptc_pos_z[even_base + i];
}
for (int i = even_cnt; i < 8; ++i) {
cns->ptc_pos_x[even_base + i] = 0.0f;
cns->ptc_pos_y[even_base + i] = 0.0f;
cns->ptc_pos_z[even_base + i] = 0.0f;
cns->ptc_inv_mass[even_base + i] = 0.0f;
cns->rest_pos_x[even_base + i] = 0.0f;
cns->rest_pos_y[even_base + i] = 0.0f;
cns->rest_pos_z[even_base + i] = 0.0f;
// Previous positions for pads remain 0.0f
cns->ptc_prev_pos_x[even_base + i] = 0.0f;
cns->ptc_prev_pos_y[even_base + i] = 0.0f;
cns->ptc_prev_pos_z[even_base + i] = 0.0f;
}
for (int i = 0; i < odd_cnt; ++i) {
int orig_i = 2 * i + 1;
assert(orig_i * 4 + 3 < cnt * 4);
cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
cns->ptc_inv_mass[odd_base + i] = rest_pos[orig_i*4+3];
cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
// Set previous positions to match initial positions for zero initial velocity
cns->ptc_prev_pos_x[odd_base + i] = cns->ptc_pos_x[odd_base + i];
cns->ptc_prev_pos_y[odd_base + i] = cns->ptc_pos_y[odd_base + i];
cns->ptc_prev_pos_z[odd_base + i] = cns->ptc_pos_z[odd_base + i];
}
for (int i = odd_cnt; i < 8; ++i) {
cns->ptc_pos_x[odd_base + i] = 0.0f;
cns->ptc_pos_y[odd_base + i] = 0.0f;
cns->ptc_pos_z[odd_base + i] = 0.0f;
cns->ptc_inv_mass[odd_base + i] = 0.0f;
cns->rest_pos_x[odd_base + i] = 0.0f;
cns->rest_pos_y[odd_base + i] = 0.0f;
cns->rest_pos_z[odd_base + i] = 0.0f;
// Previous positions for pads remain 0.0f
cns->ptc_prev_pos_x[odd_base + i] = 0.0f;
cns->ptc_prev_pos_y[odd_base + i] = 0.0f;
cns->ptc_prev_pos_z[odd_base + i] = 0.0f;
}
// Set motion radius to MAX_MOTION_RADIUS by default (effectively unconstrained)
for (int i = 0; i < PTC_PER_CHN; i += 8) {
_mm256_store_ps(&cns->motion_radius[p_base + i], _mm256_set1_ps(MAX_MOTION_RADIUS));
}
// Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15
for (int i = 1; i < cnt; ++i) {
float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
float len = sqrtf(dx*dx + dy*dy + dz*dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
int con_idx = i - 1;
if ((con_idx & 0x01) == 0) { // Even constraint
cns->ptc_rest_len[r_base + (con_idx / 2)] = len;
cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
} else { // Odd constraint
cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = len;
cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
}
}
for (int i = cnt - 1; i < CON_PER_CHN - 1; ++i) { // Pad dummies
int con_idx = i;
if ((con_idx & 0x01) == 0) {
cns->ptc_rest_len[r_base + (con_idx / 2)] = 0.0f;
cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
} else {
cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
}
}
// Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15
int even_bend_cnt = even_cnt - 1;
int odd_bend_cnt = odd_cnt - 1;
assert(even_bend_cnt >= 0);
assert(odd_bend_cnt >= 0);
for (int i = 0; i < even_bend_cnt; ++i) {
float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1];
float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1];
float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1];
float len = sqrtf(dx * dx + dy * dy + dz * dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
cns->ptc_bend_rest[r_base + i] = len;
cns->ptc_bend_lambda[r_base + i] = 0.0f;
}
for (int i = even_bend_cnt; i < 8; ++i) {
cns->ptc_bend_rest[r_base + i] = 0.0f;
cns->ptc_bend_lambda[r_base + i] = 0.0f;
}
for (int i = 0; i < odd_bend_cnt; ++i) {
float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1];
float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1];
float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1];
float len = sqrtf(dx * dx + dy * dy + dz * dz);
assert(isfinite(len));
len = fmaxf(len, 1e-6f);
cns->ptc_bend_rest[r_base + 8 + i] = len;
cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
}
for (int i = odd_bend_cnt; i < 8; ++i) {
cns->ptc_bend_rest[r_base + 8 + i] = 0.0f;
cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
}
assert(chn_sim__val(cns));
return chn;
}
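// Usage sketch (illustrative, not part of the original gist): one 4-particle
// chain pinned at the root, settling under gravity. rest_pos packs four
// floats per particle: x, y, z, inverse mass (0 = pinned). The config values
// are assumptions, not tuned numbers. Forward declarations cover functions
// defined further below in this file.
extern void chn_sim_del(struct chn_sim *cns, int chn);
extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3);
extern void chn_sim_run(struct chn_sim *cns, float dt);
static __attribute__((unused)) void
chn_sim_example(void) {
struct chn_cfg cfg = {
.damping = 0.98f, .stiffness = 0.9f, .friction = 0.3f,
.linear_inertia = 1.0f, .angular_inertia = 1.0f,
.centrifugal_inertia = 1.0f, .bend_stiffness = 0.5f,
};
// x, y, z, inv_mass per particle; root pinned with inv_mass 0
static const float rest_pos[4 * 4] = {
0.0f, 0.0f, 0.0f, 0.0f,
0.0f, -0.1f, 0.0f, 1.0f,
0.0f, -0.2f, 0.0f, 1.0f,
0.0f, -0.3f, 0.0f, 1.0f,
};
static struct chn_sim sim; // large aligned arrays; keep off the stack
chn_sim_init(&sim);
int chn = chn_sim_add(&sim, &cfg, rest_pos, 4);
const float grav[3] = {0.0f, -9.81f, 0.0f};
chn_sim_grav(&sim, chn, grav);
for (int i = 0; i < 60; ++i) {
chn_sim_run(&sim, 1.0f / 60.0f); // fixed 60 Hz step
}
chn_sim_del(&sim, chn);
}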
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(chn_sim__val(cns));
assert(chn >= 0);
assert(chn < MAX_CHNS);
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int s_base = (chn * SPH_PER_CHN);
assert(s_base + 7 < MAX_SPH);
int r_base = (chn * CON_PER_CHN);
assert(r_base + 15 < MAX_REST_LEN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_grav_x[chn] = 0.0f;
cns->chn_grav_y[chn] = 0.0f;
cns->chn_grav_z[chn] = 0.0f;
cns->chn_wnd_x[chn] = 0.0f;
cns->chn_wnd_y[chn] = 0.0f;
cns->chn_wnd_z[chn] = 0.0f;
cns->chn_pos_x[chn] = 0.0f;
cns->chn_pos_y[chn] = 0.0f;
cns->chn_pos_z[chn] = 0.0f;
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_pos_x[chn] = 0.0f;
cns->chn_prev_pos_y[chn] = 0.0f;
cns->chn_prev_pos_z[chn] = 0.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS));
_mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS));
_mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_rest[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f));
assert(chn_sim__val(cns));
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3,
const float *restrict rot4) {
assert(chn_sim__val(cns));
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_quat_x[chn]));
assert(isfinite(cns->chn_quat_y[chn]));
assert(isfinite(cns->chn_quat_z[chn]));
assert(isfinite(cns->chn_quat_w[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3,
const float *restrict rot4) {
assert(chn_sim__val(cns));
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
assert(isfinite(cns->chn_quat_x[chn]));
assert(isfinite(cns->chn_quat_y[chn]));
assert(isfinite(cns->chn_quat_z[chn]));
assert(isfinite(cns->chn_quat_w[chn]));
assert(isfinite(cns->chn_prev_quat_x[chn]));
assert(isfinite(cns->chn_prev_quat_y[chn]));
assert(isfinite(cns->chn_prev_quat_z[chn]));
assert(isfinite(cns->chn_prev_quat_w[chn]));
assert(chn_sim__val(cns));
}
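// Note: chn_sim_mov updates only the current transform, so the next
// chn_sim_run derives a velocity from (current - previous) and applies
// inertia; chn_sim_tel also overwrites the previous transform, teleporting
// the chain with zero inferred velocity.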
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
assert(chn_sim__val(cns));
assert(grav3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
assert(isfinite(cns->chn_grav_x[chn]));
assert(isfinite(cns->chn_grav_y[chn]));
assert(isfinite(cns->chn_grav_z[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
assert(chn_sim__val(cns));
assert(wind3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_wnd_x[chn] = wind3[0];
cns->chn_wnd_y[chn] = wind3[1];
cns->chn_wnd_z[chn] = wind3[2];
assert(isfinite(cns->chn_wnd_x[chn]));
assert(isfinite(cns->chn_wnd_y[chn]));
assert(isfinite(cns->chn_wnd_z[chn]));
assert(chn_sim__val(cns));
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int cnt) {
assert(chn_sim__val(cns));
assert(spheres);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt <= SPH_PER_CHN);
assert(cnt >= 0);
int s_base = (chn * SPH_PER_CHN);
assert(s_base + 7 < MAX_SPH);
for (int i = 0; i < cnt; i++) {
assert(i * 4 + 3 < cnt * 4);
cns->sph_x[s_base + i] = spheres[i*4+0];
cns->sph_y[s_base + i] = spheres[i*4+1];
cns->sph_z[s_base + i] = spheres[i*4+2];
cns->sph_r[s_base + i] = spheres[i*4+3];
assert(isfinite(cns->sph_x[s_base + i]));
assert(isfinite(cns->sph_y[s_base + i]));
assert(isfinite(cns->sph_z[s_base + i]));
assert(isfinite(cns->sph_r[s_base + i]));
assert(cns->sph_r[s_base + i] >= 0.0f);
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_x[s_base + i] = 0;
cns->sph_y[s_base + i] = 0;
cns->sph_z[s_base + i] = 0;
cns->sph_r[s_base + i] = 0;
}
assert(chn_sim__val(cns));
}
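// Illustrative collider setup (assumption: spheres live in chain-model space
// and pack x, y, z, radius per entry, matching the 4-float stride above;
// sim/chn as in the usage sketch earlier):
//   const float sph[2 * 4] = {0.0f, -0.2f, 0.0f, 0.05f,
//                             0.0f, -0.4f, 0.0f, 0.05f};
//   chn_sim_set_sph(&sim, chn, sph, 2);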
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *restrict radius, int cnt) {
assert(chn_sim__val(cns));
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt >= 0);
assert(cnt <= PTC_PER_CHN);
assert(radius);
int p_base = (chn * PTC_PER_CHN);
assert(p_base + 15 < MAX_PTC);
int even_base = p_base + 0;
int odd_base = p_base + 8;
int even_cnt = (cnt + 1) / 2;
int odd_cnt = cnt / 2;
for (int i = 0; i < even_cnt; ++i) {
assert(2 * i < cnt);
cns->motion_radius[even_base + i] = radius[2 * i];
assert(cns->motion_radius[even_base + i] >= 0.0f);
assert(cns->motion_radius[even_base + i] <= MAX_MOTION_RADIUS);
}
for (int i = even_cnt; i < 8; ++i) {
cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS;
}
for (int i = 0; i < odd_cnt; ++i) {
assert(2 * i + 1 < cnt);
cns->motion_radius[odd_base + i] = radius[2 * i + 1];
assert(cns->motion_radius[odd_base + i] >= 0.0f);
assert(cns->motion_radius[odd_base + i] <= MAX_MOTION_RADIUS);
}
for (int i = odd_cnt; i < 8; ++i) {
cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS;
}
assert(chn_sim__val(cns));
}
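// Illustrative motion constraint (one radius per particle, in chain order):
// pin the root hard and allow progressively more travel down the chain.
//   const float radii[4] = {0.0f, 0.1f, 0.2f, 0.3f};
//   chn_sim_con_motion(&sim, chn, radii, 4);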
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
assert(chn_sim__val(cns));
assert(dt > 0.0f);
assert(dt < 1.0f);
// SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 lambda_min_vec = _mm256_set1_ps(-1e6f);
const __m256 lambda_max_vec = _mm256_set1_ps(1e6f);
const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 neg_rest_vec = _mm256_set1_ps(-1.0f); // For FMAs
// Stack arrays for precomputed data
float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float vel_x[MAX_CHNS] __attribute__((aligned(32)));
float vel_y[MAX_CHNS] __attribute__((aligned(32)));
float vel_z[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
// Step 0: Precomputation
for (int c = 0; c < MAX_CHNS; c += 8) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
__builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 0, 3);
__builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 0, 3);
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);
// Step 1: t = q_conj * v_world_as_quat
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: res_vec = (t * q)_vec
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
_mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
_mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
assert(isfinite(_mm256_cvtss_f32(res_x)));
assert(isfinite(_mm256_cvtss_f32(res_y)));
assert(isfinite(_mm256_cvtss_f32(res_z)));
}
// Compute local-space gravity (identical to wind)
{
__m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
tx = _mm256_fnmadd_ps(cqz, vy, tx);
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
ty = _mm256_fnmadd_ps(cqx, vz, ty);
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
tz = _mm256_fnmadd_ps(cqy, vx, tz);
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
tw = _mm256_fnmadd_ps(cqy, vy, tw);
tw = _mm256_fnmadd_ps(cqz, vz, tw);
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_grav_lcl_x[c], res_x);
_mm256_store_ps(&chn_grav_lcl_y[c], res_y);
_mm256_store_ps(&chn_grav_lcl_z[c], res_z);
assert(isfinite(_mm256_cvtss_f32(res_x)));
assert(isfinite(_mm256_cvtss_f32(res_y)));
assert(isfinite(_mm256_cvtss_f32(res_z)));
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
assert(isfinite(_mm256_cvtss_f32(vel_x_vec)));
assert(isfinite(_mm256_cvtss_f32(vel_y_vec)));
assert(isfinite(_mm256_cvtss_f32(vel_z_vec)));
}
// Compute angular velocity
{
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
__m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);
// Step 1: Compute delta quaternion (to - from)
__m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
__m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
__m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
__m256 dt_qw = _mm256_sub_ps(qw, prev_qw);
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
__m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
__m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
__m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
__m256 conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
__m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));
__m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
orient_qx = _mm256_add_ps(orient_qx, cross_x);
__m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
orient_qy = _mm256_add_ps(orient_qy, cross_y);
__m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
orient_qz = _mm256_add_ps(orient_qz, cross_z);
// Step 4: Compute dot product (to, from) for shortest arc check
__m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
dot = _mm256_fmadd_ps(qy, prev_qy, dot);
dot = _mm256_fmadd_ps(qz, prev_qz, dot);
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
__m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
__m256 negate_sign = _mm256_blendv_ps(one_vec, neg_one_vec, negate_mask);
orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
// Norm check and scale if >1+eps
__m256 norm_sq = _mm256_fmadd_ps(orient_qz, orient_qz, _mm256_fmadd_ps(orient_qy, orient_qy, _mm256_fmadd_ps(orient_qx, orient_qx, _mm256_mul_ps(orient_qw, orient_qw))));
__m256 scale_mask = _mm256_cmp_ps(norm_sq, _mm256_set1_ps((1.0f + 1e-6f) * (1.0f + 1e-6f)), _CMP_GT_OQ);
__m256 inv_norm = _mm256_rsqrt_ps(norm_sq);
__m256 scale = _mm256_blendv_ps(one_vec, inv_norm, scale_mask);
orient_qx = _mm256_mul_ps(orient_qx, scale);
orient_qy = _mm256_mul_ps(orient_qy, scale);
orient_qz = _mm256_mul_ps(orient_qz, scale);
orient_qw = _mm256_mul_ps(orient_qw, scale);
// Step 6: Angular velocity from the quaternion delta: w = 2 * dq * conj(q_prev) / dt,
// so the vector part is scaled by 2 / dt
__m256 scale_ang = _mm256_mul_ps(two_vec, inv_dt_vec);
__m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale_ang);
__m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale_ang);
__m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale_ang);
// Store results
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
assert(isfinite(_mm256_cvtss_f32(ang_vel_x_vec)));
assert(isfinite(_mm256_cvtss_f32(ang_vel_y_vec)));
assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec)));
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3);
__builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3);
__builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 1, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt);
__m256 delta_y_vec = _mm256_set1_ps(vel_y[c] * dt);
__m256 delta_z_vec = _mm256_set1_ps(vel_z[c] * dt);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
__m256 lin_lag_factor = _mm256_sub_ps(one_vec, linear_inertia_vec);
__m256 lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
__m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
__m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
// Centrifugal zero-check: ang_vel mag (shared)
__m256 ang_mag_sq = _mm256_fmadd_ps(ang_vel_z_vec, ang_vel_z_vec, _mm256_fmadd_ps(ang_vel_y_vec, ang_vel_y_vec, _mm256_mul_ps(ang_vel_x_vec, ang_vel_x_vec)));
__m256 ang_zero_mask = _mm256_cmp_ps(ang_mag_sq, eps_vec, _CMP_LT_OQ);
// Even block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
__m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
__m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
__m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
// Apply centrifugal zero-check
cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
// Blend masks for all dt
lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_add_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
}
// Odd block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
__m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
__m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
__m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
// Apply centrifugal zero-check
cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_add_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z));
}
// Update previous transformation
cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
}
// Step 2: Verlet Integration - Duplicated for even/odd
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
__m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
__m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);
__m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
__m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
__m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
__m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
__m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
__m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
// Even block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x);
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
// Odd block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
_mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x);
_mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y);
_mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
}
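// (Step 2 recap) The integration above is damped position Verlet:
//   x_new = x + damping * (x - x_prev) + (force * inv_mass) * dt^2
// with gravity and wind already rotated into chain-local space during Step 0.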
// Step 3: XPBD Constraints (distance + bending) - lane permutes stay within each chain's even/odd registers
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
int r_base = c * CON_PER_CHN;
assert(r_base + 15 < MAX_REST_LEN);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
__m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0
__m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
__m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(bend_stiff_vec, eps_vec));
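// XPBD-style solve per constraint C(x) = dist - rest:
//   dlambda = -(C + a * lambda) / (w_A + w_B + a),  x_i += +/- w_i * dlambda * unit
// where a = compliance_factor above. (Canonical XPBD uses a = 1 / (k * dt^2);
// this code maps stiffness to a = dt^2 / k, which softens as dt grows.)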
// Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
__m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 even_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 even_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 odd_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 odd_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 odd_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 odd_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
// Load transposed rest lengths: even cons in r_base+0..7, odd in r_base+8..15
__m256 rest_even = _mm256_load_ps(&cns->ptc_rest_len[r_base + 0]);
__m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
__m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 0]);
__m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 8]);
// XPBD: accumulated Lagrange multipliers restart at zero every step
__m256 lambda_even = zero_vec;
__m256 lambda_odd = zero_vec;
__m256 bend_lambda_even = zero_vec;
__m256 bend_lambda_odd = zero_vec;
for (int iter = 0; iter < MAX_ITR; ++iter) {
// Even distance constraints: p0 = even (A), p1 = odd (B)
{
__m256 dx = _mm256_sub_ps(even_x, odd_x); // A - B
__m256 dy = _mm256_sub_ps(even_y, odd_y);
__m256 dz = _mm256_sub_ps(even_z, odd_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(even_m, odd_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_even));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda
__m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda)
__m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * even_m
even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, even_m), one_vec, even_x);
even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, even_m), one_vec, even_y);
even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, even_m), one_vec, even_z);
// dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m
odd_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
odd_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
odd_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);
lambda_even = new_lambda;
}
// Odd distance constraints: p1 = odd (A), p2 = even shifted (B)
{
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
__m256 p2_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
__m256 p2_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
__m256 p2_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
__m256 p2_m = _mm256_permutevar8x32_ps(even_m, shift_idx);
__m256 dx = _mm256_sub_ps(odd_x, p2_x);
__m256 dy = _mm256_sub_ps(odd_y, p2_y);
__m256 dz = _mm256_sub_ps(odd_z, p2_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(odd_m, p2_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_odd));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * odd_m
odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);
// dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m, scattered back to
// even lanes with the inverse permutation (p2 lane k is even lane k+1)
__m256i unshift_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
__m256 delta_p2_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, p2_m), one_vec, zero_vec);
even_x = _mm256_add_ps(even_x, _mm256_permutevar8x32_ps(delta_p2_x, unshift_idx));
__m256 delta_p2_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, p2_m), one_vec, zero_vec);
even_y = _mm256_add_ps(even_y, _mm256_permutevar8x32_ps(delta_p2_y, unshift_idx));
__m256 delta_p2_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, p2_m), one_vec, zero_vec);
even_z = _mm256_add_ps(even_z, _mm256_permutevar8x32_ps(delta_p2_z, unshift_idx));
lambda_odd = new_lambda;
}
// Even bending constraints: between even[k] (A) and even[k+1] (B)
{
__m256 a_x = even_x;
__m256 a_y = even_y;
__m256 a_z = even_z;
__m256 a_m = even_m;
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
__m256 b_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
__m256 b_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
__m256 b_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
__m256 b_m = _mm256_permutevar8x32_ps(even_m, shift_idx);
__m256 dx = _mm256_sub_ps(a_x, b_x);
__m256 dy = _mm256_sub_ps(a_y, b_y);
__m256 dz = _mm256_sub_ps(a_z, b_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(a_m, b_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_even));
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * a_m
even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, even_x);
even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, even_y);
even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, even_z);
// dpos_B = - corr * b_m, scattered back to even lanes with the inverse
// permutation (b lane k is even lane k+1)
__m256i unshift_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
__m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec);
even_x = _mm256_add_ps(even_x, _mm256_permutevar8x32_ps(delta_b_x, unshift_idx));
__m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec);
even_y = _mm256_add_ps(even_y, _mm256_permutevar8x32_ps(delta_b_y, unshift_idx));
__m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec);
even_z = _mm256_add_ps(even_z, _mm256_permutevar8x32_ps(delta_b_z, unshift_idx));
bend_lambda_even = new_lambda;
}
// Odd bending constraints: odd[k] (A) to odd[k+1] (B), i.e. chain particles 2k+1 and 2k+3
{
__m256 a_x = odd_x;
__m256 a_y = odd_y;
__m256 a_z = odd_z;
__m256 a_m = odd_m;
__m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); // gather: lane k reads lane k+1
__m256i scatter_idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); // scatter: lane k writes lane k+1
__m256 b_x = _mm256_permutevar8x32_ps(odd_x, shift_idx);
__m256 b_y = _mm256_permutevar8x32_ps(odd_y, shift_idx);
__m256 b_z = _mm256_permutevar8x32_ps(odd_z, shift_idx);
__m256 b_m = _mm256_permutevar8x32_ps(odd_m, shift_idx);
__m256 dx = _mm256_sub_ps(a_x, b_x);
__m256 dy = _mm256_sub_ps(a_y, b_y);
__m256 dz = _mm256_sub_ps(a_z, b_z);
__m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 w_sum = _mm256_add_ps(a_m, b_m);
__m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
__m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);
__m256 C = _mm256_sub_ps(dist, bend_rest_odd);
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd));
__m256 numerator = _mm256_sub_ps(zero_vec, temp);
__m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
__m256 delta_lambda = _mm256_div_ps(numerator, denom);
__m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);
new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));
__m256 unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
__m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
__m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);
__m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
corr_x = _mm256_and_ps(corr_x, corr_mask);
corr_y = _mm256_and_ps(corr_y, corr_mask);
corr_z = _mm256_and_ps(corr_z, corr_mask);
// dpos_A = delta_lambda * unit * a_m
odd_x = _mm256_fmadd_ps(corr_x, a_m, odd_x);
odd_y = _mm256_fmadd_ps(corr_y, a_m, odd_y);
odd_z = _mm256_fmadd_ps(corr_z, a_m, odd_z);
// dpos_B = -corr * b_m, scattered so lane k's correction lands on odd[k+1]
__m256 delta_b_x = _mm256_fnmadd_ps(corr_x, b_m, zero_vec);
odd_x = _mm256_add_ps(odd_x, _mm256_permutevar8x32_ps(delta_b_x, scatter_idx));
__m256 delta_b_y = _mm256_fnmadd_ps(corr_y, b_m, zero_vec);
odd_y = _mm256_add_ps(odd_y, _mm256_permutevar8x32_ps(delta_b_y, scatter_idx));
__m256 delta_b_z = _mm256_fnmadd_ps(corr_z, b_m, zero_vec);
odd_z = _mm256_add_ps(odd_z, _mm256_permutevar8x32_ps(delta_b_z, scatter_idx));
bend_lambda_odd = new_lambda;
}
}
// Store back
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z);
_mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even);
_mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even);
_mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd);
assert(isfinite(_mm256_cvtss_f32(even_x)));
assert(isfinite(_mm256_cvtss_f32(odd_x)));
}
// Step 4: Sphere Collisions with Friction - Duplicated for even/odd
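// Penetrating particles are pushed back to the sphere surface along the
// contact normal; tangential motion is damped by the friction term below.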
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
int s_base = c * SPH_PER_CHN;
assert(s_base + 7 < MAX_SPH);
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 fric_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
__m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
__m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
__m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
__m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);
// Even block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 min_inv_dist = _mm256_set1_ps(1e-6f);
inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(sph_r_vec, dist);
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 norm_x = _mm256_mul_ps(dx, inv_dist);
__m256 norm_y = _mm256_mul_ps(dy, inv_dist);
__m256 norm_z = _mm256_mul_ps(dz, inv_dist);
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
__m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
__m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
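// Positional friction: tangential correction proportional to penetration
// depth, scaled by cfg->friction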
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
// Odd block
{
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 min_inv_dist = _mm256_set1_ps(1e-6f);
inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(sph_r_vec, dist);
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 norm_x = _mm256_mul_ps(dx, inv_dist);
__m256 norm_y = _mm256_mul_ps(dy, inv_dist);
__m256 norm_z = _mm256_mul_ps(dz, inv_dist);
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
__m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
__m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
assert(isfinite(_mm256_cvtss_f32(new_p_x)));
assert(isfinite(_mm256_cvtss_f32(new_p_y)));
assert(isfinite(_mm256_cvtss_f32(new_p_z)));
}
}
}
// Step 5: Motion Constraints - Duplicated for even/odd
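// Clamp each particle to a sphere of motion_radius around its rest position;
// MAX_MOTION_RADIUS effectively leaves a particle unconstrained.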
for (int c = 0; c < MAX_CHNS; ++c) {
__builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
int p_base = c * PTC_PER_CHN;
assert(p_base + 15 < MAX_PTC);
// Even block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]);
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]);
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]);
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(dist, r_vec);
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 dt_x = _mm256_mul_ps(dx, corr_factor);
__m256 dt_y = _mm256_mul_ps(dy, corr_factor);
__m256 dt_z = _mm256_mul_ps(dz, corr_factor);
dt_x = _mm256_mul_ps(dt_x, pm);
dt_y = _mm256_mul_ps(dt_y, pm);
dt_z = _mm256_mul_ps(dt_z, pm);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
}
// Odd block
{
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 8]);
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 8]);
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 8]);
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 8]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
assert(isfinite(_mm256_cvtss_f32(dist)));
__m256 pen = _mm256_sub_ps(dist, r_vec);
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 dt_x = _mm256_mul_ps(dx, corr_factor);
__m256 dt_y = _mm256_mul_ps(dy, corr_factor);
__m256 dt_z = _mm256_mul_ps(dz, corr_factor);
dt_x = _mm256_mul_ps(dt_x, pm);
dt_y = _mm256_mul_ps(dt_y, pm);
dt_z = _mm256_mul_ps(dt_z, pm);
_mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
}
}
assert(chn_sim__val(cns));
}
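// A minimal optional sketch (not used above): if the ~12-bit _mm256_rsqrt_ps
// approximation ever causes visible drift on long chains, one Newton-Raphson
// step refines it to near full single precision: y' = 0.5*y*(3 - x*y*y).
static inline __m256
mm256_rsqrt_nr(__m256 x) {
__m256 y = _mm256_rsqrt_ps(x);
__m256 xyy = _mm256_mul_ps(_mm256_mul_ps(x, y), y);
return _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), y),
_mm256_sub_ps(_mm256_set1_ps(3.0f), xyy));
}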
#include <SDL2/SDL.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "chain.c"
#define WINDOW_WIDTH 800
#define WINDOW_HEIGHT 600
#define CHAIN_LENGTH 10
#define PARTICLE_SPACING 10.0f
static void
project(float x, float y, float z, int *screen_x, int *screen_y) {
(void)z; // simple orthographic projection: z is ignored
*screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
*screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
}
// Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
float cx, float cy, float cz, // chain pos
float qx, float qy, float qz, float qw, // chain quat
float *mx, float *my, float *mz) { // out model pos
// Translate: world - chain
float tx = wx - cx, ty = wy - cy, tz = wz - cz;
// t = conj(q) * (0, tx,ty,tz), with conj(q) = (qw, -qx, -qy, -qz)
float tw = qx*tx + qy*ty + qz*tz;
float t_x = qw*tx - qy*tz + qz*ty;
float t_y = qw*ty - qz*tx + qx*tz;
float t_z = qw*tz - qx*ty + qy*tx;
// model = (t * q).vec
*mx = tw*qx + qw*t_x + t_y*qz - t_z*qy;
*my = tw*qy + qw*t_y + t_z*qx - t_x*qz;
*mz = tw*qz + qw*t_z + t_x*qy - t_y*qx;
}
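// A minimal sketch of the forward transform (model -> world), assuming the
// same q*v*conj(q) rotation convention as above; the renderer below skips it
// because the demo drives the chain with an identity quaternion:
static void quat_rotate_translate(float mx, float my, float mz, // model pos
float cx, float cy, float cz, // chain pos
float qx, float qy, float qz, float qw, // chain quat
float *wx, float *wy, float *wz) { // out world pos
// t = q * (0, mx,my,mz)
float tw = -(qx*mx + qy*my + qz*mz);
float t_x = qw*mx + qy*mz - qz*my;
float t_y = qw*my + qz*mx - qx*mz;
float t_z = qw*mz + qx*my - qy*mx;
// world = (t * conj(q)).vec + chain pos
*wx = -tw*qx + qw*t_x - t_y*qz + t_z*qy + cx;
*wy = -tw*qy + qw*t_y - t_z*qx + t_x*qz + cy;
*wz = -tw*qz + qw*t_z - t_x*qy + t_y*qx + cz;
}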
extern int
main(int argc, char *argv[]) {
if (SDL_Init(SDL_INIT_VIDEO) < 0) {
printf("SDL_Init failed: %s\n", SDL_GetError());
return 1;
}
SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
if (!window || !renderer) {
printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
SDL_Quit();
return 1;
}
struct chn_sim sim;
chn_sim_init(&sim);
struct chn_cfg cfg = {
.damping = 0.5f,
.stiffness = 0.9f,
.friction = 0.5f,
.linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
.angular_inertia = 0.9f,
.centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
.bend_stiffness = 0.8f,
};
float rest_pos[CHAIN_LENGTH * 4];
for (int i = 0; i < CHAIN_LENGTH; i++) {
rest_pos[i*4 + 0] = 0.0f;
rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
rest_pos[i*4 + 2] = 0.0f;
rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
}
int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge
// Fixed world spheres (triangle outside circle)
float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
-30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
-30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
};
// Circular motion params
float time = 0.0f;
float circle_radius = 50.0f;
float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
float dt = 0.016f;
int running = 1;
SDL_Event event;
// Base particle index for this chain's slots in the transposed layout
int p_base = chn * PTC_PER_CHN;
while (running) {
while (SDL_PollEvent(&event)) {
if (event.type == SDL_QUIT) {
running = 0;
}
}
time += dt;
// Circular anchor motion
float anchor_x = circle_radius * cosf(angular_speed * time);
float anchor_y = circle_radius * sinf(angular_speed * time);
chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});
// Transform fixed world spheres to current model space
float model_spheres[12];
for (int s = 0; s < 3; ++s) {
float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
float r = world_spheres[s*4 + 3];
float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
float mx, my, mz;
quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
model_spheres[s*4 + 0] = mx;
model_spheres[s*4 + 1] = my;
model_spheres[s*4 + 2] = mz;
model_spheres[s*4 + 3] = r;
}
chn_sim_set_sph(&sim, chn, model_spheres, 3);
chn_sim_run(&sim, dt);
SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
SDL_RenderClear(renderer);
SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres
float root_x = sim.chn_pos_x[chn]; // Current root world x/y
float root_y = sim.chn_pos_y[chn];
// Draw chain segments
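// Chain-order index i maps into the transposed layout: even indices sit in
// slots 0-7, odd indices in slots 8-15. With the identity chain quaternion
// used above, world space is just model space translated by the chain root.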
for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);
float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
float world_x0 = model_x0 + root_x;
float world_y0 = model_y0 + root_y;
float world_x1 = model_x1 + root_x;
float world_y1 = model_y1 + root_y;
int sx0, sy0, sx1, sy1;
project(world_x0, world_y0, 0, &sx0, &sy0);
project(world_x1, world_y1, 0, &sx1, &sy1);
SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
}
// Draw spheres (approx circle outline with 32 points)
int s_base = chn * SPH_PER_CHN;
for (int s = 0; s < 3; ++s) { // Only draw active spheres
float sph_model_x = sim.sph_x[s_base + s];
float sph_model_y = sim.sph_y[s_base + s];
float sph_r = sim.sph_r[s_base + s];
if (sph_r <= 0.0f) continue;
float sph_world_x = sph_model_x + root_x;
float sph_world_y = sph_model_y + root_y;
int sph_sx, sph_sy;
project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);
// Parametric circle: 32 points
const int num_points = 32;
for (int pt = 0; pt < num_points; ++pt) {
float angle0 = 2.0f * M_PI * pt / num_points;
float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);
int px0, py0, px1, py1;
project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
}
}
SDL_RenderPresent(renderer);
SDL_Delay(16);
}
SDL_DestroyRenderer(renderer);
SDL_DestroyWindow(window);
SDL_Quit();
return 0;
}