
@vurtun
Last active October 28, 2025 21:55

Revisions

  1. vurtun revised this gist Oct 28, 2025. 1 changed file with 14 additions and 68 deletions.
    82 changes: 14 additions & 68 deletions chain.c
    @@ -117,64 +117,44 @@ chn_sim__val(struct chn_sim *cns) {
    assert(cfg->bend_stiffness >= 0.0f);
    assert(cfg->bend_stiffness <= 1.0f);

    assert(!isnan(cns->chn_grav_x[i]));
    assert(!isnan(cns->chn_grav_y[i]));
    assert(!isnan(cns->chn_grav_z[i]));

    assert(isfinite(cns->chn_grav_x[i]));
    assert(isfinite(cns->chn_grav_y[i]));
    assert(isfinite(cns->chn_grav_z[i]));

    assert(!isnan(cns->chn_wnd_x[i]));
    assert(!isnan(cns->chn_wnd_y[i]));
    assert(!isnan(cns->chn_wnd_z[i]));

    assert(isfinite(cns->chn_wnd_x[i]));
    assert(isfinite(cns->chn_wnd_y[i]));
    assert(isfinite(cns->chn_wnd_z[i]));

    assert(!isnan(cns->chn_pos_x[i]));
    assert(!isnan(cns->chn_pos_y[i]));
    assert(!isnan(cns->chn_pos_z[i]));

    assert(isfinite(cns->chn_pos_x[i]));
    assert(isfinite(cns->chn_pos_y[i]));
    assert(isfinite(cns->chn_pos_z[i]));

    assert(!isnan(cns->chn_quat_x[i]));
    assert(!isnan(cns->chn_quat_y[i]));
    assert(!isnan(cns->chn_quat_z[i]));
    assert(!isnan(cns->chn_quat_w[i]));

    assert(isfinite(cns->chn_quat_x[i]));
    assert(isfinite(cns->chn_quat_y[i]));
    assert(isfinite(cns->chn_quat_z[i]));
    assert(isfinite(cns->chn_quat_w[i]));

    // Quaternion normalization invariant
    float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + cns->chn_quat_y[i]*cns->chn_quat_y[i] + cns->chn_quat_z[i]*cns->chn_quat_z[i] + cns->chn_quat_w[i]*cns->chn_quat_w[i];
    float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] +
    cns->chn_quat_y[i]*cns->chn_quat_y[i] +
    cns->chn_quat_z[i]*cns->chn_quat_z[i] +
    cns->chn_quat_w[i]*cns->chn_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);

    assert(!isnan(cns->chn_prev_pos_x[i]));
    assert(!isnan(cns->chn_prev_pos_y[i]));
    assert(!isnan(cns->chn_prev_pos_z[i]));

    assert(isfinite(cns->chn_prev_pos_x[i]));
    assert(isfinite(cns->chn_prev_pos_y[i]));
    assert(isfinite(cns->chn_prev_pos_z[i]));

    assert(!isnan(cns->chn_prev_quat_x[i]));
    assert(!isnan(cns->chn_prev_quat_y[i]));
    assert(!isnan(cns->chn_prev_quat_z[i]));
    assert(!isnan(cns->chn_prev_quat_w[i]));

    assert(isfinite(cns->chn_prev_quat_x[i]));
    assert(isfinite(cns->chn_prev_quat_y[i]));
    assert(isfinite(cns->chn_prev_quat_z[i]));
    assert(isfinite(cns->chn_prev_quat_w[i]));

    // Prev quat norm
    qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
    qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] +
    cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] +
    cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] +
    cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
    }
    for (int i = 0; i < MAX_PTC; ++i) {
    @@ -183,74 +163,48 @@ chn_sim__val(struct chn_sim *cns) {
    assert(cns->motion_radius[i] >= 0.0f);
    assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS);

    assert(!isnan(cns->ptc_pos_x[i]));
    assert(!isnan(cns->ptc_pos_y[i]));
    assert(!isnan(cns->ptc_pos_z[i]));
    assert(!isnan(cns->ptc_inv_mass[i]));

    assert(isfinite(cns->ptc_pos_x[i]));
    assert(isfinite(cns->ptc_pos_y[i]));
    assert(isfinite(cns->ptc_pos_z[i]));
    assert(isfinite(cns->ptc_inv_mass[i]));
    assert(cns->ptc_inv_mass[i] >= 0.0f);

    assert(!isnan(cns->ptc_prev_pos_x[i]));
    assert(!isnan(cns->ptc_prev_pos_y[i]));
    assert(!isnan(cns->ptc_prev_pos_z[i]));

    assert(isfinite(cns->ptc_prev_pos_x[i]));
    assert(isfinite(cns->ptc_prev_pos_y[i]));
    assert(isfinite(cns->ptc_prev_pos_z[i]));

    assert(!isnan(cns->rest_pos_x[i]));
    assert(!isnan(cns->rest_pos_y[i]));
    assert(!isnan(cns->rest_pos_z[i]));
    assert(!isnan(cns->motion_radius[i]));

    assert(isfinite(cns->rest_pos_x[i]));
    assert(isfinite(cns->rest_pos_y[i]));
    assert(isfinite(cns->rest_pos_z[i]));
    assert(isfinite(cns->motion_radius[i]));

    // Spot-check motion constraint post-any update (sample every 8th for perf)
    if (i % 8 == 0) {
    float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
    float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
    float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
    float dist = sqrtf(dx*dx + dy*dy + dz*dz);
    assert(dist <= cns->motion_radius[i] + 1e-6f);
    }
    float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
    float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
    float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
    float dist = sqrtf(dx*dx + dy*dy + dz*dz);
    assert(dist <= cns->motion_radius[i] + 1e-6f);
    }
    for (int i = 0; i < MAX_SPH; ++i) {
    assert(i < MAX_SPH);
    assert(isfinite(cns->sph_x[i]));
    assert(isfinite(cns->sph_y[i]));
    assert(isfinite(cns->sph_z[i]));
    assert(isfinite(cns->sph_r[i]));

    assert(!isnan(cns->sph_x[i]));
    assert(!isnan(cns->sph_y[i]));
    assert(!isnan(cns->sph_z[i]));
    assert(!isnan(cns->sph_r[i]));

    assert(cns->sph_r[i] >= 0.0f);
    }
    for (int i = 0; i < MAX_REST_LEN; ++i) {
    assert(i < MAX_REST_LEN);
    assert(cns->ptc_rest_len[i] >= 0.0f);
    assert(!isnan(cns->ptc_rest_len[i]));
    assert(isfinite(cns->ptc_rest_len[i]));

    assert(!isnan(cns->ptc_lambda[i]));
    assert(isfinite(cns->ptc_lambda[i]));
    assert(cns->ptc_lambda[i] >= -1e6f);
    assert(cns->ptc_lambda[i] <= 1e6f);

    assert(cns->ptc_bend_rest[i] >= 0.0f);
    assert(!isnan(cns->ptc_bend_rest[i]));
    assert(isfinite(cns->ptc_bend_rest[i]));

    assert(!isnan(cns->ptc_bend_lambda[i]));
    assert(isfinite(cns->ptc_bend_lambda[i]));
    assert(cns->ptc_bend_lambda[i] >= -1e6f);
    assert(cns->ptc_bend_lambda[i] <= 1e6f);
    @@ -392,7 +346,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric

    float len = _mm256_cvtss_f32(sqrt_vec);
    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    int con_idx = i - 1;
    @@ -429,7 +382,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric

    float len = _mm256_cvtss_f32(sqrt_vec);
    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    cns->ptc_bend_rest[r_base + i] = len;
    @@ -449,7 +401,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float len = _mm256_cvtss_f32(sqrt_vec);

    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    cns->ptc_bend_rest[r_base + 8 + i] = len;
    @@ -660,11 +611,6 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int
    assert(isfinite(cns->sph_y[s_base + i]));
    assert(isfinite(cns->sph_z[s_base + i]));
    assert(isfinite(cns->sph_r[s_base + i]));

    assert(!isnan(cns->sph_x[s_base + i]));
    assert(!isnan(cns->sph_y[s_base + i]));
    assert(!isnan(cns->sph_z[s_base + i]));
    assert(!isnan(cns->sph_r[s_base + i]));
    assert(cns->sph_r[s_base + i] >= 0.0f);
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    @@ -939,7 +885,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec)));
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd (reverted for correctness)
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3);
    __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3);
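A note on what this revision trims: per C99 math.h, isfinite(x) is already false for NaN as well as ±Inf, so the paired !isnan asserts it deletes were redundant. The motion-radius spot check compares the Euclidean distance (not the squared distance) against the radius plus a small tolerance. A minimal scalar sketch of the sampled variant that appears in this diff, assuming the SoA field names above (the helper itself is hypothetical):

    #include <assert.h>
    #include <math.h>

    /* Hypothetical scalar version of the sampled motion-constraint check. */
    static void
    val_motion_sketch(const float *px, const float *py, const float *pz,
                      const float *rx, const float *ry, const float *rz,
                      const float *radius, int max_ptc) {
        for (int i = 0; i < max_ptc; i += 8) { /* every 8th particle for perf */
            assert(isfinite(px[i]) && isfinite(py[i]) && isfinite(pz[i]));
            float dx = px[i] - rx[i];
            float dy = py[i] - ry[i];
            float dz = pz[i] - rz[i];
            float dist = sqrtf(dx*dx + dy*dy + dz*dz);
            assert(dist <= radius[i] + 1e-6f); /* slack for float rounding */
        }
    }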
  2. vurtun revised this gist Oct 25, 2025. 1 changed file with 143 additions and 164 deletions.
    307 changes: 143 additions & 164 deletions chain.c
    @@ -152,10 +152,7 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->chn_quat_w[i]));

    // Quaternion normalization invariant
    float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] +
    cns->chn_quat_y[i]*cns->chn_quat_y[i] +
    cns->chn_quat_z[i]*cns->chn_quat_z[i] +
    cns->chn_quat_w[i]*cns->chn_quat_w[i];
    float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + cns->chn_quat_y[i]*cns->chn_quat_y[i] + cns->chn_quat_z[i]*cns->chn_quat_z[i] + cns->chn_quat_w[i]*cns->chn_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);

    assert(!isnan(cns->chn_prev_pos_x[i]));
    @@ -177,10 +174,7 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->chn_prev_quat_w[i]));

    // Prev quat norm
    qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] +
    cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] +
    cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] +
    cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
    qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
    }
    for (int i = 0; i < MAX_PTC; ++i) {
    @@ -218,11 +212,14 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->rest_pos_z[i]));
    assert(isfinite(cns->motion_radius[i]));

    float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
    float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
    float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
    float dist = dx*dx + dy*dy + dz*dz;
    assert(dist <= (cns->motion_radius[i] + 1e-6f * cns->motion_radius[i] + 1e-6f));
    // Spot-check motion constraint post-any update (sample every 8th for perf)
    if (i % 8 == 0) {
    float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
    float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
    float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
    float dist = sqrtf(dx*dx + dy*dy + dz*dz);
    assert(dist <= cns->motion_radius[i] + 1e-6f);
    }
    }
    for (int i = 0; i < MAX_SPH; ++i) {
    assert(i < MAX_SPH);
    @@ -235,6 +232,7 @@ chn_sim__val(struct chn_sim *cns) {
    assert(!isnan(cns->sph_y[i]));
    assert(!isnan(cns->sph_z[i]));
    assert(!isnan(cns->sph_r[i]));

    assert(cns->sph_r[i] >= 0.0f);
    }
    for (int i = 0; i < MAX_REST_LEN; ++i) {
    @@ -1223,6 +1221,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    assert(p_base + 15 < MAX_PTC);
    int r_base = c * CON_PER_CHN;
    assert(r_base + 15 < MAX_REST_LEN);
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0
    __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
    @@ -1530,195 +1529,175 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    assert(s_base + 7 < MAX_SPH);

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);
    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);
    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);

    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);
    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);

    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);

    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    }
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Step 5: Motion Constraints - Duplicated for even/odd
    for (int c = 0; c < MAX_CHNS; ++c) {
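The sphere pass being reshaped in this revision reduces, per particle and sphere, to the scalar logic below: if the particle penetrates the sphere, push it out along the contact normal by the penetration depth and subtract a tangential friction term scaled by that depth; the AVX blocks above do this eight particles at a time, clamping the squared distance before rsqrt. A sketch with hypothetical plain-float parameters, not the gist's SoA layout:

    #include <math.h>

    /* Hypothetical scalar reference for one particle vs. one sphere. */
    static void
    collide_sphere_sketch(float p[3], const float prev[3],
                          const float c[3], float r,
                          float friction, float inv_mass) {
        float dx = p[0] - c[0], dy = p[1] - c[1], dz = p[2] - c[2];
        float dist = sqrtf(fmaxf(dx*dx + dy*dy + dz*dz, 1e-6f)); /* clamped */
        float pen = r - dist;
        if (pen <= 0.0f) return;               /* col_mask in the AVX version */
        float nx = dx/dist, ny = dy/dist, nz = dz/dist;
        /* Verlet velocity, split into normal and tangential parts */
        float vx = p[0] - prev[0], vy = p[1] - prev[1], vz = p[2] - prev[2];
        float vn = vx*nx + vy*ny + vz*nz;
        float tx = vx - vn*nx, ty = vy - vn*ny, tz = vz - vn*nz;
        float tmag = sqrtf(fmaxf(tx*tx + ty*ty + tz*tz, 1e-6f));
        float fmag = pen * friction;           /* friction scaled by depth */
        p[0] += inv_mass * (nx*pen - fmag*tx/tmag);
        p[1] += inv_mass * (ny*pen - fmag*ty/tmag);
        p[2] += inv_mass * (nz*pen - fmag*tz/tmag);
    }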
  3. vurtun revised this gist Oct 25, 2025. 1 changed file with 464 additions and 253 deletions.
    717 changes: 464 additions & 253 deletions chain.c
    @@ -151,6 +151,13 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->chn_quat_z[i]));
    assert(isfinite(cns->chn_quat_w[i]));

    // Quaternion normalization invariant
    float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] +
    cns->chn_quat_y[i]*cns->chn_quat_y[i] +
    cns->chn_quat_z[i]*cns->chn_quat_z[i] +
    cns->chn_quat_w[i]*cns->chn_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);

    assert(!isnan(cns->chn_prev_pos_x[i]));
    assert(!isnan(cns->chn_prev_pos_y[i]));
    assert(!isnan(cns->chn_prev_pos_z[i]));
    @@ -168,10 +175,19 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->chn_prev_quat_y[i]));
    assert(isfinite(cns->chn_prev_quat_z[i]));
    assert(isfinite(cns->chn_prev_quat_w[i]));

    // Prev quat norm
    qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] +
    cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] +
    cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] +
    cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);
    }
    for (int i = 0; i < MAX_PTC; ++i) {
    assert(i < MAX_PTC);
    assert(cns->ptc_rest_len[i] >= 0.0f);
    assert(cns->motion_radius[i] >= 0.0f);
    assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS);

    assert(!isnan(cns->ptc_pos_x[i]));
    assert(!isnan(cns->ptc_pos_y[i]));
    @@ -182,6 +198,7 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->ptc_pos_y[i]));
    assert(isfinite(cns->ptc_pos_z[i]));
    assert(isfinite(cns->ptc_inv_mass[i]));
    assert(cns->ptc_inv_mass[i] >= 0.0f);

    assert(!isnan(cns->ptc_prev_pos_x[i]));
    assert(!isnan(cns->ptc_prev_pos_y[i]));
    @@ -200,18 +217,15 @@ chn_sim__val(struct chn_sim *cns) {
    assert(isfinite(cns->rest_pos_y[i]));
    assert(isfinite(cns->rest_pos_z[i]));
    assert(isfinite(cns->motion_radius[i]));

    float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i];
    float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i];
    float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i];
    float dist = dx*dx + dy*dy + dz*dz;
    assert(dist <= (cns->motion_radius[i] + 1e-6f * cns->motion_radius[i] + 1e-6f));
    }
    for (int i = 0; i < MAX_SPH; ++i) {
    assert(isfinite(cns->sph_x[i]));
    assert(isfinite(cns->sph_y[i]));
    assert(isfinite(cns->sph_z[i]));
    assert(isfinite(cns->sph_r[i]));

    assert(!isnan(cns->sph_x[i]));
    assert(!isnan(cns->sph_y[i]));
    assert(!isnan(cns->sph_z[i]));
    assert(!isnan(cns->sph_r[i]));

    assert(i < MAX_SPH);
    assert(isfinite(cns->sph_x[i]));
    assert(isfinite(cns->sph_y[i]));
    assert(isfinite(cns->sph_z[i]));
    @@ -223,6 +237,26 @@ chn_sim__val(struct chn_sim *cns) {
    assert(!isnan(cns->sph_r[i]));
    assert(cns->sph_r[i] >= 0.0f);
    }
    for (int i = 0; i < MAX_REST_LEN; ++i) {
    assert(i < MAX_REST_LEN);
    assert(cns->ptc_rest_len[i] >= 0.0f);
    assert(!isnan(cns->ptc_rest_len[i]));
    assert(isfinite(cns->ptc_rest_len[i]));

    assert(!isnan(cns->ptc_lambda[i]));
    assert(isfinite(cns->ptc_lambda[i]));
    assert(cns->ptc_lambda[i] >= -1e6f);
    assert(cns->ptc_lambda[i] <= 1e6f);

    assert(cns->ptc_bend_rest[i] >= 0.0f);
    assert(!isnan(cns->ptc_bend_rest[i]));
    assert(isfinite(cns->ptc_bend_rest[i]));

    assert(!isnan(cns->ptc_bend_lambda[i]));
    assert(isfinite(cns->ptc_bend_lambda[i]));
    assert(cns->ptc_bend_lambda[i] >= -1e6f);
    assert(cns->ptc_bend_lambda[i] <= 1e6f);
    }
    return 1;
    }
    extern void
    @@ -240,6 +274,7 @@ chn_sim_init(struct chn_sim *cns) {
    for (int i = 0; i < MAX_PTC; i += 8) {
    _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
    }
    assert(chn_sim__val(cns));
    }
    extern int
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    @@ -269,15 +304,20 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->chn_cfg[chn] = *cfg;

    int p_base = (chn * PTC_PER_CHN);
    assert(p_base + 15 < MAX_PTC);
    int r_base = (chn * CON_PER_CHN);
    assert(r_base + 15 < MAX_REST_LEN);
    int even_base = p_base + 0; // Even indices 0-7
    int odd_base = p_base + 8; // Odd indices 8-15

    // Transpose particles: even indices to 0-7, odd to 8-15
    int even_cnt = (cnt + 1) / 2; // Number of even particles
    int odd_cnt = cnt / 2; // Number of odd particles
    assert(even_cnt + odd_cnt == cnt);
    for (int i = 0; i < even_cnt; ++i) {
    int orig_i = 2 * i;
    assert(orig_i * 4 + 3 < cnt * 4);

    cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0];
    cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1];
    cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2];
    @@ -309,6 +349,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    }
    for (int i = 0; i < odd_cnt; ++i) {
    int orig_i = 2 * i + 1;
    assert(orig_i * 4 + 3 < cnt * 4);
    cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
    cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
    cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
    @@ -347,7 +388,14 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    float len = sqrtf(dx*dx + dy*dy + dz*dz);

    __m256 dist_sq_vec = _mm256_set1_ps(dx*dx + dy*dy + dz*dz);
    __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec);

    float len = _mm256_cvtss_f32(sqrt_vec);
    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    int con_idx = i - 1;
    if ((con_idx & 0x01) == 0) { // Even constraint
    @@ -371,11 +419,20 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    // Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15 (new)
    int even_bend_cnt = even_cnt - 1;
    int odd_bend_cnt = odd_cnt - 1;
    assert(even_bend_cnt == even_cnt - 1);
    assert(odd_bend_cnt == odd_cnt - 1);
    for (int i = 0; i < even_bend_cnt; ++i) {
    float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1];
    float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1];
    float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);

    __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz);
    __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec);

    float len = _mm256_cvtss_f32(sqrt_vec);
    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    cns->ptc_bend_rest[r_base + i] = len;
    cns->ptc_bend_lambda[r_base + i] = 0.0f;
    @@ -388,7 +445,14 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1];
    float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1];
    float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);

    __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz);
    __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec);
    float len = _mm256_cvtss_f32(sqrt_vec);

    assert(isfinite(len));
    assert(!isnan(len));
    len = fmaxf(len, 1e-6f);

    cns->ptc_bend_rest[r_base + 8 + i] = len;
    cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
    @@ -407,8 +471,11 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    assert(chn < MAX_CHNS);

    int p_base = (chn * PTC_PER_CHN);
    assert(p_base + 15 < MAX_PTC);
    int s_base = (chn * SPH_PER_CHN);
    assert(s_base + 7 < MAX_SPH);
    int r_base = (chn * CON_PER_CHN);
    assert(r_base + 15 < MAX_REST_LEN);

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
    @@ -500,6 +567,14 @@ chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3,
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];

    float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);

    assert(isfinite(cns->chn_quat_x[chn]));
    assert(isfinite(cns->chn_quat_y[chn]));
    assert(isfinite(cns->chn_quat_z[chn]));
    assert(isfinite(cns->chn_quat_w[chn]));
    assert(chn_sim__val(cns));
    }
    extern void
    @@ -521,6 +596,18 @@ chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3,
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];

    float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3];
    assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f);

    assert(isfinite(cns->chn_quat_x[chn]));
    assert(isfinite(cns->chn_quat_y[chn]));
    assert(isfinite(cns->chn_quat_z[chn]));
    assert(isfinite(cns->chn_quat_w[chn]));

    assert(isfinite(cns->chn_prev_quat_x[chn]));
    assert(isfinite(cns->chn_prev_quat_y[chn]));
    assert(isfinite(cns->chn_prev_quat_z[chn]));
    assert(isfinite(cns->chn_prev_quat_w[chn]));
    assert(chn_sim__val(cns));
    }
    extern void
    @@ -534,6 +621,9 @@ chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];

    assert(isfinite(cns->chn_grav_x[chn]));
    assert(isfinite(cns->chn_grav_y[chn]));
    assert(isfinite(cns->chn_grav_z[chn]));
    assert(chn_sim__val(cns));
    }
    extern void
    @@ -547,20 +637,37 @@ chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];

    assert(isfinite(cns->chn_wnd_x[chn]));
    assert(isfinite(cns->chn_wnd_y[chn]));
    assert(isfinite(cns->chn_wnd_z[chn]));
    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int cnt) {
    assert(chn_sim__val(cns));
    assert(spheres);
    assert(cnt <= SPH_PER_CHN);
    assert(cnt >= 0);

    int s_base = (chn * SPH_PER_CHN);
    assert(s_base + 7 < MAX_SPH);
    for (int i = 0; i < cnt; i++) {
    assert(i * 4 + 3 < cnt * 4);
    cns->sph_x[s_base + i] = spheres[i*4+0];
    cns->sph_y[s_base + i] = spheres[i*4+1];
    cns->sph_z[s_base + i] = spheres[i*4+2];
    cns->sph_r[s_base + i] = spheres[i*4+3];

    assert(isfinite(cns->sph_x[s_base + i]));
    assert(isfinite(cns->sph_y[s_base + i]));
    assert(isfinite(cns->sph_z[s_base + i]));
    assert(isfinite(cns->sph_r[s_base + i]));

    assert(!isnan(cns->sph_x[s_base + i]));
    assert(!isnan(cns->sph_y[s_base + i]));
    assert(!isnan(cns->sph_z[s_base + i]));
    assert(!isnan(cns->sph_r[s_base + i]));
    assert(cns->sph_r[s_base + i] >= 0.0f);
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_base + i] = 0;
    @@ -571,27 +678,35 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *restrict radius, int cnt) {
    assert(chn_sim__val(cns));
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt < PTC_PER_CHN);
    assert(radius);

    int p_base = (chn * PTC_PER_CHN);
    assert(p_base + 15 < MAX_PTC);
    int even_base = p_base + 0;
    int odd_base = p_base + 8;
    int even_cnt = (cnt + 1) / 2;
    int odd_cnt = cnt / 2;

    for (int i = 0; i < even_cnt; ++i) {
    assert(2 * i < cnt);
    cns->motion_radius[even_base + i] = radius[2 * i];
    assert(cns->motion_radius[even_base + i] >= 0.0f);
    assert(cns->motion_radius[even_base + i] <= MAX_MOTION_RADIUS);
    }
    for (int i = even_cnt; i < 8; ++i) {
    cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS;
    }
    for (int i = 0; i < odd_cnt; ++i) {
    assert(2 * i + 1 < cnt);
    cns->motion_radius[odd_base + i] = radius[2 * i + 1];
    assert(cns->motion_radius[odd_base + i] >= 0.0f);
    assert(cns->motion_radius[odd_base + i] <= MAX_MOTION_RADIUS);
    }
    for (int i = odd_cnt; i < 8; ++i) {
    cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS;
    @@ -601,20 +716,25 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    extern void
    chn_sim_run(struct chn_sim *cns, float dt) {
    assert(chn_sim__val(cns));
    assert(dt > 0.0f);
    assert(dt < 1.0f);

    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    const __m256 one_vec = _mm256_set1_ps(1.0f);
    const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
    const __m256 eps_vec = _mm256_set1_ps(1e-6f);
    const __m256 lambda_min_vec = _mm256_set1_ps(-1e6f);
    const __m256 lambda_max_vec = _mm256_set1_ps(1e6f);
    const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 neg_rest_vec = _mm256_set1_ps(-1.0f); // For FMAs

    // Stack arrays for precomputed data
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    @@ -635,6 +755,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    // Step 0: Precomputation
    for (int c = 0; c < MAX_CHNS; c += 8) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 0, 3);
    __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 0, 3);

    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
    @@ -677,6 +801,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
    _mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
    _mm256_store_ps(&chn_wnd_lcl_z[c], res_z);

    assert(isfinite(_mm256_cvtss_f32(res_x)));
    assert(isfinite(_mm256_cvtss_f32(res_y)));
    assert(isfinite(_mm256_cvtss_f32(res_z)));
    }
    // Compute local-space gravity (identical to wind)
    {
    @@ -707,6 +835,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&chn_grav_lcl_x[c], res_x);
    _mm256_store_ps(&chn_grav_lcl_y[c], res_y);
    _mm256_store_ps(&chn_grav_lcl_z[c], res_z);

    assert(isfinite(_mm256_cvtss_f32(res_x)));
    assert(isfinite(_mm256_cvtss_f32(res_y)));
    assert(isfinite(_mm256_cvtss_f32(res_z)));
    }
    // Compute linear velocity
    {
    @@ -725,6 +857,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&vel_x[c], vel_x_vec);
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);

    assert(isfinite(_mm256_cvtss_f32(vel_x_vec)));
    assert(isfinite(_mm256_cvtss_f32(vel_y_vec)));
    assert(isfinite(_mm256_cvtss_f32(vel_z_vec)));
    }
    // Compute angular velocity
    {
    @@ -773,28 +909,46 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));
    __m256 negate_sign = _mm256_blendv_ps(one_vec, neg_one_vec, negate_mask);

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
    orient_qz = _mm256_mul_ps(orient_qz, negate_sign);

    // Norm check and scale if >1+eps
    __m256 norm_sq = _mm256_fmadd_ps(orient_qz, orient_qz, _mm256_fmadd_ps(orient_qy, orient_qy, _mm256_fmadd_ps(orient_qx, orient_qx, _mm256_mul_ps(orient_qw, orient_qw))));
    __m256 scale_mask = _mm256_cmp_ps(norm_sq, _mm256_set1_ps((1.0f + 1e-6f) * (1.0f + 1e-6f)), _CMP_GT_OQ);
    __m256 inv_norm = _mm256_rsqrt_ps(norm_sq);
    __m256 scale = _mm256_blendv_ps(one_vec, inv_norm, scale_mask);
    orient_qx = _mm256_mul_ps(orient_qx, scale);
    orient_qy = _mm256_mul_ps(orient_qy, scale);
    orient_qz = _mm256_mul_ps(orient_qz, scale);
    orient_qw = _mm256_mul_ps(orient_qw, scale);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);
    __m256 scale_ang = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale_ang);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale_ang);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale_ang);

    // Store results
    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);

    assert(isfinite(_mm256_cvtss_f32(ang_vel_x_vec)));
    assert(isfinite(_mm256_cvtss_f32(ang_vel_y_vec)));
    assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec)));
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd (reverted for correctness)
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3);
    __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3);
    __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 1, 3);

    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt);
    @@ -818,14 +972,18 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);

    // Even block (0-7)
    // Centrifugal zero-check: ang_vel mag (shared)
    __m256 ang_mag_sq = _mm256_fmadd_ps(ang_vel_z_vec, ang_vel_z_vec, _mm256_fmadd_ps(ang_vel_y_vec, ang_vel_y_vec, _mm256_mul_ps(ang_vel_x_vec, ang_vel_x_vec)));
    __m256 ang_zero_mask = _mm256_cmp_ps(ang_mag_sq, eps_vec, _CMP_LT_OQ);

    // Even block
    {
    // In Step 1, for both even and odd blocks, after loading ptc_inv_mass and clamping to pm
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);

    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    @@ -851,18 +1009,23 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);

    // Apply mask to all dt components before adding
    lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask);
    lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask);
    lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask);
    // Apply centrifugal zero-check
    cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
    cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
    cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);

    // Blend masks for all dt
    lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
    lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
    lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);

    ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask);
    ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask);
    ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask);
    ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
    ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
    ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);

    cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask);
    cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask);
    cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask);
    cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
    cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
    cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);

    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    @@ -872,13 +1035,14 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
    }
    // Odd block (8-15)
    // Odd block
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);

    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    @@ -904,21 +1068,26 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);

    // Apply centrifugal zero-check
    cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
    cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
    cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);

    lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
    lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
    lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);

    lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask);
    lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask);
    lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask);
    lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
    lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
    lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);

    ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask);
    ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask);
    ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask);
    ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
    ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
    ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);

    cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask);
    cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask);
    cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask);
    cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
    cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
    cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);

    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    @@ -938,9 +1107,11 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
    // Step 2: Verlet Integration - duplicated for even/odd blocks
    // Step 2: Verlet Integration - Duplicated for even/odd
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
    @@ -995,6 +1166,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Odd block
    {
    @@ -1035,19 +1210,23 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
    }

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    }
    // Step 3: XPBD Constraints (Distance only)
    // Step 3: XPBD Constraints (Distance only) - Permutes intra-vector only, no cross-block mixing
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    int r_base = c * CON_PER_CHN;

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    assert(r_base + 15 < MAX_REST_LEN);
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec);

    __m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0
    __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
    __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec);
    __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(bend_stiff_vec, eps_vec));

    // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
    __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    @@ -1084,21 +1263,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 w_sum = _mm256_add_ps(even_m, odd_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, rest_even);
    __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_even));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda
    __m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda)
    __m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda);
    new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    @@ -1114,14 +1293,14 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * even_m
    even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, even_m));
    even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, even_m));
    even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, even_m));
    even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, even_m), one_vec, even_x);
    even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, even_m), one_vec, even_y);
    even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, even_m), one_vec, even_z);

    // dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m
    odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));
    odd_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
    odd_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
    odd_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);

    lambda_even = new_lambda;
    }
    @@ -1139,21 +1318,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 w_sum = _mm256_add_ps(odd_m, p2_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, rest_odd);
    __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_odd));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);
    new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    @@ -1169,27 +1348,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * odd_m
    odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));

    // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m
    __m256 delta_p2_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, p2_m));
    __m256i delta_ix = _mm256_castps_si256(delta_p2_x);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix);
    odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x);
    odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y);
    odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z);

    // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m (permuted add via shift)
    __m256 delta_p2_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, p2_m), one_vec, zero_vec);
    __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_p2_x, shift_idx);
    even_x = _mm256_add_ps(even_x, delta_even_x);

    __m256 delta_p2_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, p2_m));
    __m256i delta_iy = _mm256_castps_si256(delta_p2_y);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy);
    __m256 delta_p2_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, p2_m), one_vec, zero_vec);
    __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_p2_y, shift_idx);
    even_y = _mm256_add_ps(even_y, delta_even_y);

    __m256 delta_p2_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, p2_m));
    __m256i delta_iz = _mm256_castps_si256(delta_p2_z);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz);
    __m256 delta_p2_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, p2_m), one_vec, zero_vec);
    __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_p2_z, shift_idx);
    even_z = _mm256_add_ps(even_z, delta_even_z);

    lambda_odd = new_lambda;
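    // Why the permute: the odd-family constraint (p[2i+1], p[2i+2]) pairs odd
    // lane i with even lane i+1, so the correction computed in lane i must
    // move up one lane before being added to the even vector.
    // _mm256_slli_si256 (the old code) shifts only within each 128-bit half
    // and silently drops the lane 3 -> lane 4 carry; _mm256_permutevar8x32_ps
    // crosses lanes. It takes the raw __m256i index directly, and shift_idx
    // is assumed to be defined earlier (not in this hunk) as a shift-by-one
    // index with lane 0 zeroed or masked, since even lane 0 receives no
    // contribution from this family.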
    @@ -1213,21 +1386,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 w_sum = _mm256_add_ps(a_m, b_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, bend_rest_even);
    __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_even));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);
    new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    @@ -1243,27 +1416,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * a_m
    even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, a_m));
    even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, a_m));
    even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, a_m));

    // dpos_B = - corr * b_m (shifted add)
    __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m));
    __m256i delta_ix = _mm256_castps_si256(delta_b_x);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix);
    even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, even_x);
    even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, even_y);
    even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, even_z);

    // dpos_B = - corr * b_m (permuted add)
    __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec);
    __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_b_x, shift_idx);
    even_x = _mm256_add_ps(even_x, delta_even_x);

    __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m));
    __m256i delta_iy = _mm256_castps_si256(delta_b_y);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy);
    __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec);
    __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_b_y, shift_idx);
    even_y = _mm256_add_ps(even_y, delta_even_y);

    __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m));
    __m256i delta_iz = _mm256_castps_si256(delta_b_z);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz);
    __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec);
    __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_b_z, shift_idx);
    even_z = _mm256_add_ps(even_z, delta_even_z);

    bend_lambda_even = new_lambda;
    @@ -1287,21 +1454,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 w_sum = _mm256_add_ps(a_m, b_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, bend_rest_odd);
    __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_odd));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);
    new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec));

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    @@ -1317,27 +1484,21 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * a_m
    odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, a_m));
    odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, a_m));
    odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, a_m));

    // dpos_B = - corr * b_m (shifted add)
    __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m));
    __m256i delta_ix = _mm256_castps_si256(delta_b_x);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_ix);
    odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, odd_x);
    odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, odd_y);
    odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, odd_z);

    // dpos_B = - corr * b_m (permuted add)
    __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec);
    __m256 delta_odd_x = _mm256_permutevar8x32_ps(delta_b_x, shift_idx);
    odd_x = _mm256_add_ps(odd_x, delta_odd_x);

    __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m));
    __m256i delta_iy = _mm256_castps_si256(delta_b_y);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_iy);
    __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec);
    __m256 delta_odd_y = _mm256_permutevar8x32_ps(delta_b_y, shift_idx);
    odd_y = _mm256_add_ps(odd_y, delta_odd_y);

    __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m));
    __m256i delta_iz = _mm256_castps_si256(delta_b_z);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_iz);
    __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec);
    __m256 delta_odd_z = _mm256_permutevar8x32_ps(delta_b_z, shift_idx);
    odd_z = _mm256_add_ps(odd_z, delta_odd_z);

    bend_lambda_odd = new_lambda;
    @@ -1356,182 +1517,222 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even);
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd);

    assert(isfinite(_mm256_cvtss_f32(even_x)));
    assert(isfinite(_mm256_cvtss_f32(odd_x)));
    }
    // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks
    // Step 4: Sphere Collisions with Friction - Duplicated for even/odd
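    // Scalar sketch of the contact response below (one particle, one sphere):
    //   d    = p - c;  dist = |d|;  pen = r - dist;
    //   if (pen > 0) {
    //     n    = d / dist;                   // push-out direction
    //     v    = p - p_prev;                 // implicit Verlet velocity
    //     vt   = v - dot(v, n) * n;          // tangential component
    //     corr = pen * n - friction * pen * normalize(vt);
    //     p   += inv_mass * corr;
    //   }
    // Friction is purely positional: it cancels tangential motion in
    // proportion to penetration depth, the usual PBD-style approximation
    // rather than a true Coulomb cone.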
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    int s_base = c * SPH_PER_CHN;
    assert(s_base + 7 < MAX_SPH);

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);
    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);
    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));
    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
    }
    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);
    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));
    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt
    __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq);

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));
    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);

    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
    }
    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);

    assert(isfinite(_mm256_cvtss_f32(new_p_x)));
    assert(isfinite(_mm256_cvtss_f32(new_p_y)));
    assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Step 5: Motion Constraints - duplicated for even/odd blocks
    // Step 5: Motion Constraints - Duplicated for even/odd
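    // Scalar sketch: each particle is confined to a sphere of radius
    // motion_radius[i] around its rest pose:
    //   d = p - rest;  dist = |d|;  pen = dist - radius;
    //   if (pen > 0) p -= pm * pen * (d / dist);   // pm = clamped inv mass
    // Since chn_sim_init() fills motion_radius with MAX_MOTION_RADIUS, this
    // pass is a no-op until chn_sim_con_motion() installs tighter radii.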
    for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    // Even block
    {
    // Even block
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]);

    __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]);
    __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]);
    __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]);
    __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]);
    @@ -1546,6 +1747,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 pen = _mm256_sub_ps(dist, r_vec);
    __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    @@ -1564,9 +1766,13 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z));

    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
    }
    // Odd block
    {
    // Odd block
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    @@ -1588,6 +1794,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);
    assert(isfinite(_mm256_cvtss_f32(dist)));

    __m256 pen = _mm256_sub_ps(dist, r_vec);
    __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    @@ -1606,6 +1813,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));

    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x))));
    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y))));
    assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z))));
    }
    }
    assert(chn_sim__val(cns));
  4. vurtun revised this gist Oct 23, 2025. 1 changed file with 194 additions and 19 deletions.
    213 changes: 194 additions & 19 deletions chain.c
    @@ -4,6 +4,8 @@
    #include <string.h>
    #include <float.h>

    #define MAX_MOTION_RADIUS 1000000.0f

    enum {
    MAX_CHNS = 128,
    PTC_PER_CHN = 16,
    @@ -14,8 +16,6 @@ enum {
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 16,
    };
    #define MAX_MOTION_RADIUS 1000000.0f

    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    @@ -86,6 +86,145 @@ struct chn_sim {
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };
    static int
    chn_sim__val(struct chn_sim *cns) {
    assert(cns);
    assert(cns->free_idx_cnt <= MAX_CHNS);
    for (int i = 0; i < cns->free_idx_cnt; ++i) {
    assert(i < MAX_CHNS);
    assert(cns->free_idx[i] < MAX_CHNS);
    }
    for (int i = 0; i < MAX_CHNS; ++i) {
    const struct chn_cfg *cfg = &cns->chn_cfg[i];
    assert(cfg->damping >= 0.0f);
    assert(cfg->damping <= 1.0f);

    assert(cfg->stiffness >= 0.0f);
    assert(cfg->stiffness <= 1.0f);

    assert(cfg->friction >= 0.0f);
    assert(cfg->friction <= 1.0f);

    assert(cfg->linear_inertia >= 0.0f);
    assert(cfg->linear_inertia <= 1.0f);

    assert(cfg->angular_inertia >= 0.0f);
    assert(cfg->angular_inertia <= 1.0f);

    assert(cfg->centrifugal_inertia >= 0.0f);
    assert(cfg->centrifugal_inertia <= 1.0f);

    assert(cfg->bend_stiffness >= 0.0f);
    assert(cfg->bend_stiffness <= 1.0f);

    assert(!isnan(cns->chn_grav_x[i]));
    assert(!isnan(cns->chn_grav_y[i]));
    assert(!isnan(cns->chn_grav_z[i]));

    assert(isfinite(cns->chn_grav_x[i]));
    assert(isfinite(cns->chn_grav_y[i]));
    assert(isfinite(cns->chn_grav_z[i]));

    assert(!isnan(cns->chn_wnd_x[i]));
    assert(!isnan(cns->chn_wnd_y[i]));
    assert(!isnan(cns->chn_wnd_z[i]));

    assert(isfinite(cns->chn_wnd_x[i]));
    assert(isfinite(cns->chn_wnd_y[i]));
    assert(isfinite(cns->chn_wnd_z[i]));

    assert(!isnan(cns->chn_pos_x[i]));
    assert(!isnan(cns->chn_pos_y[i]));
    assert(!isnan(cns->chn_pos_z[i]));

    assert(isfinite(cns->chn_pos_x[i]));
    assert(isfinite(cns->chn_pos_y[i]));
    assert(isfinite(cns->chn_pos_z[i]));

    assert(!isnan(cns->chn_quat_x[i]));
    assert(!isnan(cns->chn_quat_y[i]));
    assert(!isnan(cns->chn_quat_z[i]));
    assert(!isnan(cns->chn_quat_w[i]));

    assert(isfinite(cns->chn_quat_x[i]));
    assert(isfinite(cns->chn_quat_y[i]));
    assert(isfinite(cns->chn_quat_z[i]));
    assert(isfinite(cns->chn_quat_w[i]));

    assert(!isnan(cns->chn_prev_pos_x[i]));
    assert(!isnan(cns->chn_prev_pos_y[i]));
    assert(!isnan(cns->chn_prev_pos_z[i]));

    assert(isfinite(cns->chn_prev_pos_x[i]));
    assert(isfinite(cns->chn_prev_pos_y[i]));
    assert(isfinite(cns->chn_prev_pos_z[i]));

    assert(!isnan(cns->chn_prev_quat_x[i]));
    assert(!isnan(cns->chn_prev_quat_y[i]));
    assert(!isnan(cns->chn_prev_quat_z[i]));
    assert(!isnan(cns->chn_prev_quat_w[i]));

    assert(isfinite(cns->chn_prev_quat_x[i]));
    assert(isfinite(cns->chn_prev_quat_y[i]));
    assert(isfinite(cns->chn_prev_quat_z[i]));
    assert(isfinite(cns->chn_prev_quat_w[i]));
    }
    for (int i = 0; i < MAX_PTC; ++i) {
    assert(cns->ptc_rest_len[i] >= 0.0f);
    assert(cns->motion_radius[i] >= 0.0f);

    assert(!isnan(cns->ptc_pos_x[i]));
    assert(!isnan(cns->ptc_pos_y[i]));
    assert(!isnan(cns->ptc_pos_z[i]));
    assert(!isnan(cns->ptc_inv_mass[i]));

    assert(isfinite(cns->ptc_pos_x[i]));
    assert(isfinite(cns->ptc_pos_y[i]));
    assert(isfinite(cns->ptc_pos_z[i]));
    assert(isfinite(cns->ptc_inv_mass[i]));

    assert(!isnan(cns->ptc_prev_pos_x[i]));
    assert(!isnan(cns->ptc_prev_pos_y[i]));
    assert(!isnan(cns->ptc_prev_pos_z[i]));

    assert(isfinite(cns->ptc_prev_pos_x[i]));
    assert(isfinite(cns->ptc_prev_pos_y[i]));
    assert(isfinite(cns->ptc_prev_pos_z[i]));

    assert(!isnan(cns->rest_pos_x[i]));
    assert(!isnan(cns->rest_pos_y[i]));
    assert(!isnan(cns->rest_pos_z[i]));
    assert(!isnan(cns->motion_radius[i]));

    assert(isfinite(cns->rest_pos_x[i]));
    assert(isfinite(cns->rest_pos_y[i]));
    assert(isfinite(cns->rest_pos_z[i]));
    assert(isfinite(cns->motion_radius[i]));
    }
    for (int i = 0; i < MAX_SPH; ++i) {
    assert(isfinite(cns->sph_x[i]));
    assert(isfinite(cns->sph_y[i]));
    assert(isfinite(cns->sph_z[i]));
    assert(isfinite(cns->sph_r[i]));

    assert(!isnan(cns->sph_x[i]));
    assert(!isnan(cns->sph_y[i]));
    assert(!isnan(cns->sph_z[i]));
    assert(!isnan(cns->sph_r[i]));

    assert(cns->sph_r[i] >= 0.0f);
    }
    return 1;
    }
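    // Returning 1 lets the validator sit inside assert(chn_sim__val(cns)); in
    // release builds (NDEBUG) the call and every check inside it compile away,
    // so validation costs nothing outside of debug.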
    extern void
    chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    @@ -98,7 +237,6 @@ chn_sim_init(struct chn_sim *cns) {
    _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
    // Set motion_radius to MAX_MOTION_RADIUS
    for (int i = 0; i < MAX_PTC; i += 8) {
    _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
    }
    @@ -110,13 +248,28 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);
    assert(chn_sim__val(cns));

    assert(cfg->damping >= 0.0f);
    assert(cfg->damping <= 1.0f);
    assert(cfg->stiffness >= 0.0f);
    assert(cfg->stiffness <= 1.0f);
    assert(cfg->friction >= 0.0f);
    assert(cfg->friction <= 1.0f);
    assert(cfg->linear_inertia >= 0.0f);
    assert(cfg->linear_inertia <= 1.0f);
    assert(cfg->angular_inertia >= 0.0f);
    assert(cfg->angular_inertia <= 1.0f);
    assert(cfg->centrifugal_inertia >= 0.0f);
    assert(cfg->centrifugal_inertia <= 1.0f);
    assert(cfg->bend_stiffness >= 0.0f);
    assert(cfg->bend_stiffness <= 1.0f);

    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_base = (chn * PTC_PER_CHN);
    int r_base = (chn * CON_PER_CHN);
    int b_base = (chn * (CON_PER_CHN - 1));
    int even_base = p_base + 0; // Even indices 0-7
    int odd_base = p_base + 8; // Odd indices 8-15

    @@ -144,6 +297,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->ptc_pos_y[even_base + i] = 0.0f;
    cns->ptc_pos_z[even_base + i] = 0.0f;
    cns->ptc_inv_mass[even_base + i] = 0.0f;

    cns->rest_pos_x[even_base + i] = 0.0f;
    cns->rest_pos_y[even_base + i] = 0.0f;
    cns->rest_pos_z[even_base + i] = 0.0f;
    @@ -174,6 +328,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->ptc_pos_y[odd_base + i] = 0.0f;
    cns->ptc_pos_z[odd_base + i] = 0.0f;
    cns->ptc_inv_mass[odd_base + i] = 0.0f;

    cns->rest_pos_x[odd_base + i] = 0.0f;
    cns->rest_pos_y[odd_base + i] = 0.0f;
    cns->rest_pos_z[odd_base + i] = 0.0f;
    @@ -193,6 +348,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    float len = sqrtf(dx*dx + dy*dy + dz*dz);

    int con_idx = i - 1;
    if ((con_idx & 0x01) == 0) { // Even constraint
    cns->ptc_rest_len[r_base + (con_idx / 2)] = len;
    @@ -220,6 +376,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1];
    float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);

    cns->ptc_bend_rest[r_base + i] = len;
    cns->ptc_bend_lambda[r_base + i] = 0.0f;
    }
    @@ -232,25 +389,26 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1];
    float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);

    cns->ptc_bend_rest[r_base + 8 + i] = len;
    cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
    }
    for (int i = odd_bend_cnt; i < 8; ++i) {
    cns->ptc_bend_rest[r_base + 8 + i] = 0.0f;
    cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
    }
    assert(chn_sim__val(cns));
    return chn;
    }
    extern void
    chn_sim_del(struct chn_sim *cns, int chn) {
    assert(cns);
    assert(chn_sim__val(cns));
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_base = (chn * PTC_PER_CHN);
    int s_base = (chn * SPH_PER_CHN);
    int r_base = (chn * CON_PER_CHN);
    int b_base = (chn * (CON_PER_CHN - 1));

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
    @@ -281,7 +439,6 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    // Zero transposed particle blocks
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
    @@ -314,7 +471,6 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    _mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f));

    // Zero transposed constraints
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    @@ -323,10 +479,14 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f));

    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3,
    const float *restrict rot4) {

    assert(chn_sim__val(cns));
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    @@ -340,10 +500,13 @@ chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const floa
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];
    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3,
    const float *restrict rot4) {

    assert(chn_sim__val(cns));
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    @@ -357,32 +520,38 @@ chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const floa
    cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];

    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    assert(cns);
    assert(chn_sim__val(cns));
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_grav_x[chn] = grav3[0];
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];

    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    assert(cns);
    assert(chn_sim__val(cns));
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_wnd_x[chn] = wind3[0];
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];

    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(chn_sim__val(cns));
    assert(spheres);
    assert(cnt <= SPH_PER_CHN);

    @@ -399,10 +568,11 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    cns->sph_z[s_base + i] = 0;
    cns->sph_r[s_base + i] = 0;
    }
    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cns);
    assert(chn_sim__val(cns));
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    @@ -413,6 +583,7 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    int odd_base = p_base + 8;
    int even_cnt = (cnt + 1) / 2;
    int odd_cnt = cnt / 2;

    for (int i = 0; i < even_cnt; ++i) {
    cns->motion_radius[even_base + i] = radius[2 * i];
    }
    @@ -425,9 +596,12 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    for (int i = odd_cnt; i < 8; ++i) {
    cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS;
    }
    assert(chn_sim__val(cns));
    }
    extern void
    chn_sim_run(struct chn_sim *cns, float dt) {
    assert(chn_sim__val(cns));

    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    @@ -600,7 +774,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));
    _mm256_andnot_ps(negate_mask, one_vec));

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
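    // Branchless sign select: negate_sign is -1.0f where dot < 0 and +1.0f
    // elsewhere, built as (mask & -1) | (~mask & +1). Since q and -q encode
    // the same rotation, flipping the delta quaternion's sign when the dot
    // is negative keeps the blend on the shortest arc.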
    @@ -981,7 +1155,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    @@ -1434,4 +1608,5 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
    assert(chn_sim__val(cns));
    }
  5. vurtun revised this gist Oct 11, 2025. 2 changed files with 4 additions and 22 deletions.
    4 changes: 1 addition & 3 deletions chain.c
    @@ -1434,6 +1434,4 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
    }
    }
    }


    }
    22 changes: 3 additions & 19 deletions main.c
    @@ -3,6 +3,8 @@
    #include <stdlib.h>
    #include <math.h>

    #include "chain.c"

    #define WINDOW_WIDTH 800
    #define WINDOW_HEIGHT 600
    #define CHAIN_LENGTH 10
    @@ -84,13 +86,7 @@ main(int argc, char *argv[]) {
    SDL_Event event;

    // Debug: Check gravity and masses
    printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]);
    int p_base = chn * PTC_PER_CHN;
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2);
    printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]);
    }

    while (running) {
    while (SDL_PollEvent(&event)) {
    if (event.type == SDL_QUIT) {
    @@ -173,23 +169,11 @@ main(int argc, char *argv[]) {
    SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
    }
    }

    // Debug printf (every 5 steps ~0.08s)
    if ((int)(time / dt) % 5 == 0) {
    printf("t=%.2f: ", time);
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    float model_y = sim.ptc_pos_y[p_base + idx];
    float world_y = model_y + root_y;
    printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y);
    }
    printf("\n");
    }
    SDL_RenderPresent(renderer);
    SDL_Delay(16);
    }
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return 0;
    }
    }
  6. vurtun revised this gist Oct 11, 2025. 2 changed files with 195 additions and 194 deletions.
    194 changes: 0 additions & 194 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -1436,198 +1436,4 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    }
    }

    #include <SDL2/SDL.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    #define WINDOW_WIDTH 800
    #define WINDOW_HEIGHT 600
    #define CHAIN_LENGTH 10
    #define PARTICLE_SPACING 10.0f

    static void
    project(float x, float y, float z, int *screen_x, int *screen_y) {
    *screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
    *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
    }
    // Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
    static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
    float cx, float cy, float cz, // chain pos
    float qx, float qy, float qz, float qw, // chain quat
    float *mx, float *my, float *mz) { // out model pos
    // Translate: world - chain
    float tx = wx - cx, ty = wy - cy, tz = wz - cz;
    // Quat conj: (qw, -qx, -qy, -qz)
    float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz;
    // t = conj * (0, tx,ty,tz)
    float tw = cqx*tx + cqy*ty + cqz*tz;
    float t_x = cqw*tx + cqy*tz - cqz*ty;
    float t_y = cqw*ty + cqz*tx - cqx*tz;
    float t_z = cqw*tz + cqx*ty - cqy*tx;
    // model = t * quat (vec part)
    *mx = qw*t_x + qy*t_z - qz*t_y + tw*qx;
    *my = qw*t_y + qz*t_x - qx*t_z + tw*qy;
    *mz = qw*t_z + qx*t_y - qy*t_x + tw*qz;
    }
    extern int
    main(int argc, char *argv[]) {
    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
    printf("SDL_Init failed: %s\n", SDL_GetError());
    return 1;
    }
    SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
    SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
    if (!window || !renderer) {
    printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
    SDL_Quit();
    return 1;
    }
    struct chn_sim sim;
    chn_sim_init(&sim);

    struct chn_cfg cfg = {
    .damping = 0.5f,
    .stiffness = 0.9f,
    .friction = 0.5f,
    .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
    .angular_inertia = 0.9f,
    .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
    .bend_stiffness = 0.8f,
    };
    float rest_pos[CHAIN_LENGTH * 4];
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    rest_pos[i*4 + 0] = 0.0f;
    rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
    rest_pos[i*4 + 2] = 0.0f;
    rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
    }
    int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
    chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
    chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge

    // Fixed world spheres (triangle outside circle)
    float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
    60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
    -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
    -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
    };

    // Circular motion params
    float time = 0.0f;
    float circle_radius = 50.0f;
    float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
    float dt = 0.016f;
    int running = 1;
    SDL_Event event;

    // Debug: Check gravity and masses
    printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]);
    int p_base = chn * PTC_PER_CHN;
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2);
    printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]);
    }

    while (running) {
    while (SDL_PollEvent(&event)) {
    if (event.type == SDL_QUIT) {
    running = 0;
    }
    }
    time += dt;

    // Circular anchor motion
    float anchor_x = circle_radius * cosf(angular_speed * time);
    float anchor_y = circle_radius * sinf(angular_speed * time);
    chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});

    // Transform fixed world spheres to current model space
    float model_spheres[12];
    for (int s = 0; s < 3; ++s) {
    float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
    float r = world_spheres[s*4 + 3];
    float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
    float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
    float mx, my, mz;
    quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
    model_spheres[s*4 + 0] = mx;
    model_spheres[s*4 + 1] = my;
    model_spheres[s*4 + 2] = mz;
    model_spheres[s*4 + 3] = r;
    }
    chn_sim_set_sph(&sim, chn, model_spheres, 3);
    chn_sim_run(&sim, dt);

    SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
    SDL_RenderClear(renderer);
    SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres

    float root_x = sim.chn_pos_x[chn]; // Current root world x/y
    float root_y = sim.chn_pos_y[chn];

    // Draw chain segments
    for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
    int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);

    float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
    float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
    float world_x0 = model_x0 + root_x;
    float world_y0 = model_y0 + root_y;
    float world_x1 = model_x1 + root_x;
    float world_y1 = model_y1 + root_y;

    int sx0, sy0, sx1, sy1;
    project(world_x0, world_y0, 0, &sx0, &sy0);
    project(world_x1, world_y1, 0, &sx1, &sy1);
    SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
    }
    // Draw spheres (approx circle outline with 32 points)
    int p_base = chn * PTC_PER_CHN;
    int s_base = chn * SPH_PER_CHN;
    for (int s = 0; s < 3; ++s) { // Only draw active spheres
    float sph_model_x = sim.sph_x[s_base + s];
    float sph_model_y = sim.sph_y[s_base + s];
    float sph_r = sim.sph_r[s_base + s];
    if (sph_r <= 0.0f) continue;

    float sph_world_x = sph_model_x + root_x;
    float sph_world_y = sph_model_y + root_y;
    int sph_sx, sph_sy;
    project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);

    // Parametric circle: 32 points
    const int num_points = 32;
    for (int pt = 0; pt < num_points; ++pt) {
    float angle0 = 2.0f * M_PI * pt / num_points;
    float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
    float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
    float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);

    int px0, py0, px1, py1;
    project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
    project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
    SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
    }
    }

    // Debug printf (every 5 steps ~0.08s)
    if ((int)(time / dt) % 5 == 0) {
    printf("t=%.2f: ", time);
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    float model_y = sim.ptc_pos_y[p_base + idx];
    float world_y = model_y + root_y;
    printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y);
    }
    printf("\n");
    }
    SDL_RenderPresent(renderer);
    SDL_Delay(16);
    }
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return 0;
    }
    195 changes: 195 additions & 0 deletions main.c
    @@ -0,0 +1,195 @@
    #include <SDL2/SDL.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    #define WINDOW_WIDTH 800
    #define WINDOW_HEIGHT 600
    #define CHAIN_LENGTH 10
    #define PARTICLE_SPACING 10.0f

    static void
    project(float x, float y, float z, int *screen_x, int *screen_y) {
    *screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
    *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
    }
    // Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
    static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
    float cx, float cy, float cz, // chain pos
    float qx, float qy, float qz, float qw, // chain quat
    float *mx, float *my, float *mz) { // out model pos
    // Translate: world - chain
    float tx = wx - cx, ty = wy - cy, tz = wz - cz;
    // Quat conj: (qw, -qx, -qy, -qz)
    float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz;
    // t = conj * (0, tx,ty,tz)
    float tw = cqx*tx + cqy*ty + cqz*tz;
    float t_x = cqw*tx + cqy*tz - cqz*ty;
    float t_y = cqw*ty + cqz*tx - cqx*tz;
    float t_z = cqw*tz + cqx*ty - cqy*tx;
    // model = vector part of conj(q) * (0,t) * q; tw above holds the
    // negated scalar part, hence the subtracted tw terms
    *mx = qw*t_x + qz*t_y - qy*t_z - tw*qx;
    *my = qw*t_y + qx*t_z - qz*t_x - tw*qy;
    *mz = qw*t_z + qy*t_x - qx*t_y - tw*qz;
    }
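    /* Minimal sanity check for the helper above (hypothetical test code, not
     * part of the simulator): with the chain at the origin, the inverse of a
     * +90 degree rotation about z must map world (1,0,0) to model (0,-1,0). */
    static int
    quat_rotate_inv_translate_ok(void) {
    const float s = 0.70710678f; // cos(45 deg) = sin(45 deg): q = (0,0,s,s)
    float mx, my, mz;
    quat_rotate_inv_translate(1.0f, 0.0f, 0.0f,  // world position
    0.0f, 0.0f, 0.0f,  // chain position
    0.0f, 0.0f, s, s,  // chain quaternion
    &mx, &my, &mz);
    return fabsf(mx) < 1e-5f && fabsf(my + 1.0f) < 1e-5f && fabsf(mz) < 1e-5f;
    }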
    extern int
    main(int argc, char *argv[]) {
    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
    printf("SDL_Init failed: %s\n", SDL_GetError());
    return 1;
    }
    SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
    SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
    if (!window || !renderer) {
    printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
    SDL_Quit();
    return 1;
    }
    struct chn_sim sim;
    chn_sim_init(&sim);

    struct chn_cfg cfg = {
    .damping = 0.5f,
    .stiffness = 0.9f,
    .friction = 0.5f,
    .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
    .angular_inertia = 0.9f,
    .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
    .bend_stiffness = 0.8f,
    };
    float rest_pos[CHAIN_LENGTH * 4];
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    rest_pos[i*4 + 0] = 0.0f;
    rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
    rest_pos[i*4 + 2] = 0.0f;
    rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
    }
    int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
    chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
    chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge

    // Fixed world spheres (triangle outside circle)
    float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
    60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
    -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
    -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
    };

    // Circular motion params
    float time = 0.0f;
    float circle_radius = 50.0f;
    float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
    float dt = 0.016f;
    int running = 1;
    SDL_Event event;

    // Debug: Check gravity and masses
    printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]);
    int p_base = chn * PTC_PER_CHN;
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2);
    printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]);
    }

    while (running) {
    while (SDL_PollEvent(&event)) {
    if (event.type == SDL_QUIT) {
    running = 0;
    }
    }
    time += dt;

    // Circular anchor motion
    float anchor_x = circle_radius * cosf(angular_speed * time);
    float anchor_y = circle_radius * sinf(angular_speed * time);
    chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});
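    // Tangential anchor speed is R*w = 50 * (2*pi/5) ~= 62.8 units/s; the
    // solver picks this up from the chn_pos/chn_prev_pos deltas in chn_sim_run.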

    // Transform fixed world spheres to current model space
    float model_spheres[12];
    for (int s = 0; s < 3; ++s) {
    float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
    float r = world_spheres[s*4 + 3];
    float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
    float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
    float mx, my, mz;
    quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
    model_spheres[s*4 + 0] = mx;
    model_spheres[s*4 + 1] = my;
    model_spheres[s*4 + 2] = mz;
    model_spheres[s*4 + 3] = r;
    }
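    // chn_sim_set_sph expects sphere centers in chain-model space, hence the
    // inverse transform above rather than passing world coordinates.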
    chn_sim_set_sph(&sim, chn, model_spheres, 3);
    chn_sim_run(&sim, dt);

    SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
    SDL_RenderClear(renderer);
    SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres

    float root_x = sim.chn_pos_x[chn]; // Current root world x/y
    float root_y = sim.chn_pos_y[chn];

    // Draw chain segments
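    // Particle i sits in lane i/2 of the even block (offsets 0-7) when i is
    // even, otherwise in lane 8 + (i-1)/2 of the odd block (offsets 8-15).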
    for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
    int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);

    float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
    float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
    float world_x0 = model_x0 + root_x;
    float world_y0 = model_y0 + root_y;
    float world_x1 = model_x1 + root_x;
    float world_y1 = model_y1 + root_y;

    int sx0, sy0, sx1, sy1;
    project(world_x0, world_y0, 0, &sx0, &sy0);
    project(world_x1, world_y1, 0, &sx1, &sy1);
    SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
    }
    // Draw spheres (approx circle outline with 32 points)
    int p_base = chn * PTC_PER_CHN;
    int s_base = chn * SPH_PER_CHN;
    for (int s = 0; s < 3; ++s) { // Only draw active spheres
    float sph_model_x = sim.sph_x[s_base + s];
    float sph_model_y = sim.sph_y[s_base + s];
    float sph_r = sim.sph_r[s_base + s];
    if (sph_r <= 0.0f) continue;

    float sph_world_x = sph_model_x + root_x;
    float sph_world_y = sph_model_y + root_y;
    int sph_sx, sph_sy;
    project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);

    // Parametric circle: 32 points
    const int num_points = 32;
    for (int pt = 0; pt < num_points; ++pt) {
    float angle0 = 2.0f * M_PI * pt / num_points;
    float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
    float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
    float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);

    int px0, py0, px1, py1;
    project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
    project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
    SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
    }
    }

    // Debug printf (every 5 steps ~0.08s)
    if ((int)(time / dt) % 5 == 0) {
    printf("t=%.2f: ", time);
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    float model_y = sim.ptc_pos_y[p_base + idx];
    float world_y = model_y + root_y;
    printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y);
    }
    printf("\n");
    }
    SDL_RenderPresent(renderer);
    SDL_Delay(16);
    }
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return 0;
    }
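    Build sketch for the two files above (assumptions: SDL2 development
    headers installed, an AVX2/FMA-capable x86-64 CPU, sources saved as
    chain.c and main.c):

    cc -O2 -mavx2 -mfma chain.c main.c -lSDL2 -lm -o chain_demo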
  7. vurtun revised this gist Oct 11, 2025. 1 changed file with 200 additions and 28 deletions.
    228 changes: 200 additions & 28 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -23,6 +23,7 @@ struct chn_cfg {
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    float bend_stiffness; // [0, 1], bending resistance
    };
    struct chn_sim {
    unsigned char free_idx_cnt;
    @@ -70,6 +71,8 @@ struct chn_sim {

    // XPBD lambdas for distance constraints (transposed similarly)
    float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
    float ptc_bend_rest[MAX_REST_LEN] __attribute__((aligned(32)));
    float ptc_bend_lambda[MAX_REST_LEN] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    @@ -209,6 +212,33 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
    }
    }
    // Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15 (new)
    int even_bend_cnt = even_cnt - 1;
    int odd_bend_cnt = odd_cnt - 1;
    for (int i = 0; i < even_bend_cnt; ++i) {
    float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1];
    float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1];
    float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);
    cns->ptc_bend_rest[r_base + i] = len;
    cns->ptc_bend_lambda[r_base + i] = 0.0f;
    }
    for (int i = even_bend_cnt; i < 8; ++i) {
    cns->ptc_bend_rest[r_base + i] = 0.0f;
    cns->ptc_bend_lambda[r_base + i] = 0.0f;
    }
    for (int i = 0; i < odd_bend_cnt; ++i) {
    float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1];
    float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1];
    float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1];
    float len = sqrtf(dx * dx + dy * dy + dz * dz);
    cns->ptc_bend_rest[r_base + 8 + i] = len;
    cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
    }
    for (int i = odd_bend_cnt; i < 8; ++i) {
    cns->ptc_bend_rest[r_base + 8 + i] = 0.0f;
    cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f;
    }
    return chn;
    }
    extern void
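    A scalar reading of the transposed setup above (editor sketch, not gist
    code): every bend entry is simply the rest distance between second
    neighbors (particles i and i+2), which sit in adjacent lanes of the even
    or odd block, so the solver can treat bending as one more batch of
    distance constraints:

    static void
    bend_rest_scalar(const float *rest_pos, int cnt, float *bend_rest) {
    // linear indexing for clarity; the gist splits these across the
    // even/odd lane layout instead
    for (int i = 0; i + 2 < cnt; ++i) {
    float dx = rest_pos[i*4 + 0] - rest_pos[(i+2)*4 + 0];
    float dy = rest_pos[i*4 + 1] - rest_pos[(i+2)*4 + 1];
    float dz = rest_pos[i*4 + 2] - rest_pos[(i+2)*4 + 2];
    bend_rest[i] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
    }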
    @@ -289,6 +319,10 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_rest[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    @@ -838,6 +872,9 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec);

    __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
    __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec);

    // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
    __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    @@ -856,6 +893,14 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
    __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]);

    __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 0]);
    __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 0]);
    __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 8]);
    __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 8]);

    _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f));

    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Even distance constraints: p0 = even (A), p1 = odd (B)
    {
    @@ -975,6 +1020,154 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    lambda_odd = new_lambda;
    }
    // Even bending constraints: between even[k] (A) and even[k+1] (B)
    {
    __m256 a_x = even_x;
    __m256 a_y = even_y;
    __m256 a_z = even_z;
    __m256 a_m = even_m;

    __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
    // _mm256_permutevar8x32_ps takes the index vector as __m256i directly
    __m256 b_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
    __m256 b_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
    __m256 b_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
    __m256 b_m = _mm256_permutevar8x32_ps(even_m, shift_idx);

    __m256 dx = _mm256_sub_ps(a_x, b_x);
    __m256 dy = _mm256_sub_ps(a_y, b_y);
    __m256 dz = _mm256_sub_ps(a_z, b_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(a_m, b_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, bend_rest_even);
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * a_m
    even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, a_m));
    even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, a_m));
    even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, a_m));

    // dpos_B = - corr * b_m (shifted add)
    __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m));
    __m256i delta_ix = _mm256_castps_si256(delta_b_x);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix);
    even_x = _mm256_add_ps(even_x, delta_even_x);

    __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m));
    __m256i delta_iy = _mm256_castps_si256(delta_b_y);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy);
    even_y = _mm256_add_ps(even_y, delta_even_y);

    __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m));
    __m256i delta_iz = _mm256_castps_si256(delta_b_z);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz);
    even_z = _mm256_add_ps(even_z, delta_even_z);

    bend_lambda_even = new_lambda;
    }
    // Odd bending constraints: between odd[k] (A) and odd[k+1] (B)
    {
    __m256 a_x = odd_x;
    __m256 a_y = odd_y;
    __m256 a_z = odd_z;
    __m256 a_m = odd_m;

    __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
    __m256 b_x = _mm256_permutevar8x32_ps(odd_x, shift_idx);
    __m256 b_y = _mm256_permutevar8x32_ps(odd_y, shift_idx);
    __m256 b_z = _mm256_permutevar8x32_ps(odd_z, shift_idx);
    __m256 b_m = _mm256_permutevar8x32_ps(odd_m, shift_idx);

    __m256 dx = _mm256_sub_ps(a_x, b_x);
    __m256 dy = _mm256_sub_ps(a_y, b_y);
    __m256 dz = _mm256_sub_ps(a_z, b_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(a_m, b_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);

    __m256 C = _mm256_sub_ps(dist, bend_rest_odd);
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // dpos_A = delta_lambda * unit * a_m
    odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, a_m));
    odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, a_m));
    odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, a_m));

    // dpos_B = - corr * b_m (shifted add)
    __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m));
    __m256i delta_ix = _mm256_castps_si256(delta_b_x);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_ix);
    odd_x = _mm256_add_ps(odd_x, delta_odd_x);

    __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m));
    __m256i delta_iy = _mm256_castps_si256(delta_b_y);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_iy);
    odd_y = _mm256_add_ps(odd_y, delta_odd_y);

    __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m));
    __m256i delta_iz = _mm256_castps_si256(delta_b_z);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_iz);
    odd_z = _mm256_add_ps(odd_z, delta_odd_z);

    bend_lambda_odd = new_lambda;
    }
    }
    // Store back
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x);
    @@ -987,6 +1180,8 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even);
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even);
    _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd);
    }
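    For reference, each lane of the two vectorized solves above performs the
    following scalar XPBD update (editor sketch, not gist code; compliance
    here is dt*dt / stiffness, matching compliance_factor above):

    static void
    xpbd_distance_scalar(float pa[3], float pb[3], float wa, float wb,
    float rest, float compliance, float *lambda) {
    float dx = pa[0] - pb[0], dy = pa[1] - pb[1], dz = pa[2] - pb[2];
    float dist = sqrtf(dx*dx + dy*dy + dz*dz);
    if (dist <= 1e-8f || wa + wb <= 1e-8f || rest <= 1e-8f)
    return; // mirrors mask_dist, mask_w and mask_rest
    float C = dist - rest;
    float dl = -(C + compliance * *lambda) / (wa + wb + compliance);
    *lambda += dl;
    float ux = dx / dist, uy = dy / dist, uz = dz / dist;
    pa[0] += dl * wa * ux; pa[1] += dl * wa * uy; pa[2] += dl * wa * uz;
    pb[0] -= dl * wb * ux; pb[1] -= dl * wb * uy; pb[2] -= dl * wb * uz;
    }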
    // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    @@ -1293,11 +1488,12 @@ main(int argc, char *argv[]) {

    struct chn_cfg cfg = {
    .damping = 0.5f,
    .stiffness = 0.99f,
    .stiffness = 0.9f,
    .friction = 0.5f,
    .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
    .angular_inertia = 0.9f,
    .centrifugal_inertia = 0.9f // Tune 0.5f for more outward bow
    .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
    .bend_stiffness = 0.8f,
    };
    float rest_pos[CHAIN_LENGTH * 4];
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    @@ -1361,34 +1557,8 @@ main(int argc, char *argv[]) {
    model_spheres[s*4 + 3] = r;
    }
    chn_sim_set_sph(&sim, chn, model_spheres, 3);

    chn_sim_run(&sim, dt);

    // Collision depth test: Print max pen across particles/spheres
    float max_pen = 0.0f;
    int max_p = -1, max_s = -1;
    int p_base = chn * PTC_PER_CHN;
    int s_base = chn * SPH_PER_CHN;
    for (int p = 1; p < CHAIN_LENGTH; ++p) { // Skip fixed p0
    int idx = (p % 2 == 0 ? p / 2 : 8 + (p - 1) / 2);
    float px = sim.ptc_pos_x[p_base + idx], py = sim.ptc_pos_y[p_base + idx], pz = sim.ptc_pos_z[p_base + idx];
    for (int s = 0; s < 3; ++s) {
    float sx = sim.sph_x[s_base + s], sy = sim.sph_y[s_base + s], sz = sim.sph_z[s_base + s];
    float sr = sim.sph_r[s_base + s];
    float dx = px - sx, dy = py - sy, dz = pz - sz;
    float dist_sq = dx*dx + dy*dy + dz*dz;
    float dist = sqrtf(dist_sq);
    float pen = sr - dist;
    if (pen > max_pen) {
    max_pen = pen;
    max_p = p; max_s = s;
    }
    }
    }
    if (max_pen > 0.0f && ((int)(time / dt) % 5 == 0)) {
    printf("Collision: p%d/s%d pen=%.2f\n", max_p, max_s, max_pen);
    }

    SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
    SDL_RenderClear(renderer);
    SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres
    @@ -1414,6 +1584,8 @@ main(int argc, char *argv[]) {
    SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
    }
    // Draw spheres (approx circle outline with 32 points)
    int p_base = chn * PTC_PER_CHN;
    int s_base = chn * SPH_PER_CHN;
    for (int s = 0; s < 3; ++s) { // Only draw active spheres
    float sph_model_x = sim.sph_x[s_base + s];
    float sph_model_y = sim.sph_y[s_base + s];
  8. vurtun revised this gist Oct 11, 2025. 1 changed file with 364 additions and 418 deletions.
    782 changes: 364 additions & 418 deletions chain.c
    @@ -12,20 +12,17 @@ enum {
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 12,
    MAX_ITR = 16,
    };
    #define MAX_MOTION_RADIUS 10.0f
    #define MAX_MOTION_RADIUS 1000000.0f

    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    float stretch_factor;
    float max_strain;
    float friction; // [0, 1]
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    float bend_stiffness; // [0, 1] for bending constraints
    };
    struct chn_sim {
    unsigned char free_idx_cnt;
    @@ -45,16 +42,16 @@ struct chn_sim {
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    @@ -65,7 +62,7 @@ struct chn_sim {
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));

    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    @@ -74,10 +71,6 @@ struct chn_sim {
    // XPBD lambdas for distance constraints (transposed similarly)
    float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));

    // Bending rest angles and lambdas (one less than constraints, transposed even/odd bends)
    float ptc_bend_rest_ang[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32)));
    float ptc_bend_lambda[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    @@ -93,34 +86,19 @@ struct chn_sim {
    extern void
    chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    memset(cns, 0, sizeof(*cns));
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 8) {
    _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
    // Initialize lambdas to zero
    memset(cns->ptc_lambda, 0, sizeof(cns->ptc_lambda));
    memset(cns->ptc_bend_lambda, 0, sizeof(cns->ptc_bend_lambda));
    // Zero all transposed particle arrays
    memset(cns->ptc_pos_x, 0, sizeof(cns->ptc_pos_x));
    memset(cns->ptc_pos_y, 0, sizeof(cns->ptc_pos_y));
    memset(cns->ptc_pos_z, 0, sizeof(cns->ptc_pos_z));
    memset(cns->ptc_inv_mass, 0, sizeof(cns->ptc_inv_mass));
    memset(cns->ptc_prev_pos_x, 0, sizeof(cns->ptc_prev_pos_x));
    memset(cns->ptc_prev_pos_y, 0, sizeof(cns->ptc_prev_pos_y));
    memset(cns->ptc_prev_pos_z, 0, sizeof(cns->ptc_prev_pos_z));
    memset(cns->rest_pos_x, 0, sizeof(cns->rest_pos_x));
    memset(cns->rest_pos_y, 0, sizeof(cns->rest_pos_y));
    memset(cns->rest_pos_z, 0, sizeof(cns->rest_pos_z));
    // Set motion_radius to FLT_MAX
    // Set motion_radius to MAX_MOTION_RADIUS
    for (int i = 0; i < MAX_PTC; i += 8) {
    _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
    }
    memset(cns->ptc_rest_len, 0, sizeof(cns->ptc_rest_len));
    memset(cns->ptc_bend_rest_ang, 0, sizeof(cns->ptc_bend_rest_ang));
    }
    extern int
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    @@ -152,6 +130,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0];
    cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1];
    cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2];

    // Set previous positions to match initial positions for zero initial velocity
    cns->ptc_prev_pos_x[even_base + i] = cns->ptc_pos_x[even_base + i];
    cns->ptc_prev_pos_y[even_base + i] = cns->ptc_pos_y[even_base + i];
    cns->ptc_prev_pos_z[even_base + i] = cns->ptc_pos_z[even_base + i];
    }
    for (int i = even_cnt; i < 8; ++i) {
    cns->ptc_pos_x[even_base + i] = 0.0f;
    @@ -161,6 +144,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->rest_pos_x[even_base + i] = 0.0f;
    cns->rest_pos_y[even_base + i] = 0.0f;
    cns->rest_pos_z[even_base + i] = 0.0f;

    // Previous positions for pads remain 0.0f
    cns->ptc_prev_pos_x[even_base + i] = 0.0f;
    cns->ptc_prev_pos_y[even_base + i] = 0.0f;
    cns->ptc_prev_pos_z[even_base + i] = 0.0f;
    }
    for (int i = 0; i < odd_cnt; ++i) {
    int orig_i = 2 * i + 1;
    @@ -172,6 +160,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
    cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
    cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2];

    // Set previous positions to match initial positions for zero initial velocity
    cns->ptc_prev_pos_x[odd_base + i] = cns->ptc_pos_x[odd_base + i];
    cns->ptc_prev_pos_y[odd_base + i] = cns->ptc_pos_y[odd_base + i];
    cns->ptc_prev_pos_z[odd_base + i] = cns->ptc_pos_z[odd_base + i];
    }
    for (int i = odd_cnt; i < 8; ++i) {
    cns->ptc_pos_x[odd_base + i] = 0.0f;
    @@ -181,12 +174,16 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->rest_pos_x[odd_base + i] = 0.0f;
    cns->rest_pos_y[odd_base + i] = 0.0f;
    cns->rest_pos_z[odd_base + i] = 0.0f;
    }

    // Set motion radius to MAX_MOTION_RADIUS by default (no constraint)
    _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS));
    _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS));

    // Previous positions for pads remain 0.0f
    cns->ptc_prev_pos_x[odd_base + i] = 0.0f;
    cns->ptc_prev_pos_y[odd_base + i] = 0.0f;
    cns->ptc_prev_pos_z[odd_base + i] = 0.0f;
    }
    // Set motion radius to FLT_MAX by default (no constraint)
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    _mm256_store_ps(&cns->motion_radius[p_base + i], _mm256_set1_ps(MAX_MOTION_RADIUS));
    }
    // Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    @@ -212,47 +209,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
    }
    }

    // Compute rest bending angles for triples, transposed (even bends to b_base+0-6, odd to b_base+8-13)
    for (int i = 2; i < cnt; ++i) {
    float dx0 = rest_pos[(i-1)*4+0] - rest_pos[(i-2)*4+0];
    float dy0 = rest_pos[(i-1)*4+1] - rest_pos[(i-2)*4+1];
    float dz0 = rest_pos[(i-1)*4+2] - rest_pos[(i-2)*4+2];
    float len0 = sqrtf(dx0*dx0 + dy0*dy0 + dz0*dz0);
    if (len0 > 1e-6f) {
    dx0 /= len0; dy0 /= len0; dz0 /= len0;
    }

    float dx1 = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy1 = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz1 = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    float len1 = sqrtf(dx1*dx1 + dy1*dy1 + dz1*dz1);
    if (len1 > 1e-6f) {
    dx1 /= len1; dy1 /= len1; dz1 /= len1;
    }

    float dot = dx0*dx1 + dy0*dy1 + dz0*dz1;
    dot = fminf(1.0f, fmaxf(-1.0f, dot));
    int bend_idx = i - 2;
    if ((bend_idx & 0x01) == 0) { // Even bend
    cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = dot;
    cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f;
    } else { // Odd bend
    cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = dot;
    cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    }
    }
    // Pad dummies
    for (int i = cnt - 2; i < CON_PER_CHN - 2; ++i) {
    int bend_idx = i;
    if ((bend_idx & 0x01) == 0) {
    cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = 0.0f;
    cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f;
    } else {
    cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    }
    }
    return chn;
    }
    extern void
    @@ -320,8 +276,8 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    _mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(FLT_MAX));
    _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(FLT_MAX));
    _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS));
    _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS));

    _mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f));
    @@ -333,11 +289,6 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    @@ -432,13 +383,13 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    cns->motion_radius[even_base + i] = radius[2 * i];
    }
    for (int i = even_cnt; i < 8; ++i) {
    cns->motion_radius[even_base + i] = FLT_MAX;
    cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS;
    }
    for (int i = 0; i < odd_cnt; ++i) {
    cns->motion_radius[odd_base + i] = radius[2 * i + 1];
    }
    for (int i = odd_cnt; i < 8; ++i) {
    cns->motion_radius[odd_base + i] = FLT_MAX;
    cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS;
    }
    }
    extern void
    @@ -474,7 +425,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    // Step 0: Precomputation
    for (int c = 0; c < MAX_CHNS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
    @@ -614,7 +565,7 @@ chn_sim_run(struct chn_sim *cns, float dt) {

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    @@ -633,14 +584,14 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - duplicated for even/odd blocks
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
    __m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt);
    __m256 delta_y_vec = _mm256_set1_ps(vel_y[c] * dt);
    __m256 delta_z_vec = _mm256_set1_ps(vel_z[c] * dt);

    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    @@ -654,15 +605,19 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    __m256 lin_lag_factor = _mm256_sub_ps(one_vec, linear_inertia_vec);
    __m256 lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);

    // Even block (0-7)
    {
    // In Step 1, for both even and odd blocks, after loading ptc_inv_mass and clamping to pm
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0

    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    @@ -684,9 +639,22 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);

    __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);
    __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);

    // Apply mask to all dt components before adding
    lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask);
    lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask);
    lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask);

    ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask);
    ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask);
    ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask);

    cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask);
    cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask);
    cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask);

    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    @@ -696,16 +664,13 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
    }

    // Odd block (8-15)
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0

    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    @@ -727,9 +692,25 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);

    __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);
    __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);

    lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
    lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
    lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);

    lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask);
    lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask);
    lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask);

    ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask);
    ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask);
    ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask);

    cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask);
    cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask);
    cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask);

    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    @@ -739,7 +720,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z));
    }

    // Update previous transformation
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    @@ -750,7 +730,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }

    // Step 2: Verlet Integration - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    @@ -809,7 +788,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);
    }

    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    @@ -851,18 +829,14 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
    }
    }

    // Step 3: XPBD Constraints (Distance + Bending) - Direct loads for transposed even/odd
    // Step 3: XPBD Constraints (Distance only)
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;
    int b_base = c * (CON_PER_CHN - 1);

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
    __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec);
    __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec);

    // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
    __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    @@ -882,370 +856,127 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
    __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]);

    // For bending even: bend_rest_even in b_base+0 (7 lanes + dummy), odd bends in b_base+8 if implemented
    __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 0]);
    __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 0]);

    __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 8]);
    __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 8]);

    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Even distance constraints: p0 = even, p1 = odd
    // Even distance constraints: p0 = even (A), p1 = odd (B)
    {
    __m256 dx = _mm256_sub_ps(odd_x, even_x);
    __m256 dy = _mm256_sub_ps(odd_y, even_y);
    __m256 dz = _mm256_sub_ps(odd_z, even_z);
    __m256 dx = _mm256_sub_ps(even_x, odd_x); // A - B
    __m256 dy = _mm256_sub_ps(even_y, odd_y);
    __m256 dz = _mm256_sub_ps(even_z, odd_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(even_m, odd_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 C = _mm256_sub_ps(dist, rest_even);
    __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda
    __m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda)
    __m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda);
    __m256 dlambda = delta_lambda;

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, dlambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, dlambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, dlambda);
    __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // Apply directly
    // dpos_A = delta_lambda * unit * even_m
    even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, even_m));
    even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, even_m));
    even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, even_m));

    // dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m
    odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));

    lambda_even = new_lambda;
    }
    // Odd distance constraints: p1 = odd, p2 = even shifted left (slli)
    // Odd distance constraints: p1 = odd (A), p2 = even shifted (B)
    {
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);
    __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
    // _mm256_permutevar8x32_ps takes the index vector as __m256i directly
    __m256 p2_x = _mm256_permutevar8x32_ps(even_x, shift_idx);
    __m256 p2_y = _mm256_permutevar8x32_ps(even_y, shift_idx);
    __m256 p2_z = _mm256_permutevar8x32_ps(even_z, shift_idx);
    __m256 p2_m = _mm256_permutevar8x32_ps(even_m, shift_idx);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);

    __m256 dx = _mm256_sub_ps(p2_x, odd_x);
    __m256 dy = _mm256_sub_ps(p2_y, odd_y);
    __m256 dz = _mm256_sub_ps(p2_z, odd_z);
    __m256 dx = _mm256_sub_ps(odd_x, p2_x);
    __m256 dy = _mm256_sub_ps(odd_y, p2_y);
    __m256 dz = _mm256_sub_ps(odd_z, p2_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(odd_m, p2_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 C = _mm256_sub_ps(dist, rest_odd);
    __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom));
    __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd));
    __m256 numerator = _mm256_sub_ps(zero_vec, temp);
    __m256 denom = _mm256_add_ps(w_sum, compliance_factor);
    __m256 delta_lambda = _mm256_div_ps(numerator, denom);
    __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);
    __m256 dlambda = delta_lambda;

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, dlambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, dlambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, dlambda);
    __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // Apply: p1=odd += corr * odd_m
    // dpos_A = delta_lambda * unit * odd_m
    odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));

    // Propagate p2 correction back to even: delta_p2 = -corr * p2_m, then right-shift (srli) to align with even lanes
    __m256 corr_shifted_x = _mm256_mul_ps(corr_x, p2_m);
    __m256 delta_p2_x = _mm256_sub_ps(zero_vec, corr_shifted_x);
    // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m
    __m256 delta_p2_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, p2_m));
    __m256i delta_ix = _mm256_castps_si256(delta_p2_x);
    __m256i right_shifted_ix = _mm256_srli_si256(delta_ix, 4); // Right shift, pad 0 on MSB
    __m256 delta_even_x = _mm256_castsi256_ps(right_shifted_ix);
    __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix);
    even_x = _mm256_add_ps(even_x, delta_even_x);

    __m256 corr_shifted_y = _mm256_mul_ps(corr_y, p2_m);
    __m256 delta_p2_y = _mm256_sub_ps(zero_vec, corr_shifted_y);
    __m256 delta_p2_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, p2_m));
    __m256i delta_iy = _mm256_castps_si256(delta_p2_y);
    __m256i right_shifted_iy = _mm256_srli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(right_shifted_iy);
    __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy);
    even_y = _mm256_add_ps(even_y, delta_even_y);

    __m256 corr_shifted_z = _mm256_mul_ps(corr_z, p2_m);
    __m256 delta_p2_z = _mm256_sub_ps(zero_vec, corr_shifted_z);
    __m256 delta_p2_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, p2_m));
    __m256i delta_iz = _mm256_castps_si256(delta_p2_z);
    __m256i right_shifted_iz = _mm256_srli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(right_shifted_iz);
    __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz);
    even_z = _mm256_add_ps(even_z, delta_even_z);

    lambda_odd = new_lambda;
    }
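    Editor note (sketch, not gist code): _mm256_slli_si256 shifts within each
    128-bit lane, so the B-side delta computed in lane 3 never reaches lane 4
    of the even block; the pre-revision _mm256_srli_si256 variant has the
    mirrored limitation. If a full cross-lane shift is intended, AVX2 allows:

    static inline __m256
    mm256_shift_up_one(__m256 v) { // out[i] = v[i-1], out[0] = 0
    __m256i idx = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
    __m256 rot = _mm256_permutevar8x32_ps(v, idx);
    return _mm256_blend_ps(_mm256_setzero_ps(), rot, 0xFE);
    }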
    // Even bending constraints: p0-p1-p2
    {
    // p2 = even shifted left (slli)
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);

    // Vectors v0 = p1 - p0, v1 = p2 - p1
    __m256 v0x = _mm256_sub_ps(odd_x, even_x);
    __m256 v0y = _mm256_sub_ps(odd_y, even_y);
    __m256 v0z = _mm256_sub_ps(odd_z, even_z);
    __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x)));
    __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq);
    __m256 n0x = _mm256_mul_ps(v0x, inv_len0);
    __m256 n0y = _mm256_mul_ps(v0y, inv_len0);
    __m256 n0z = _mm256_mul_ps(v0z, inv_len0);

    __m256 v1x = _mm256_sub_ps(p2_x, odd_x);
    __m256 v1y = _mm256_sub_ps(p2_y, odd_y);
    __m256 v1z = _mm256_sub_ps(p2_z, odd_z);
    __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x)));
    __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq);
    __m256 n1x = _mm256_mul_ps(v1x, inv_len1);
    __m256 n1y = _mm256_mul_ps(v1y, inv_len1);
    __m256 n1z = _mm256_mul_ps(v1z, inv_len1);

    __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x)));
    cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta));
    __m256 C = _mm256_sub_ps(bend_rest_even, cos_theta);

    __m256 w_sum = _mm256_add_ps(_mm256_add_ps(even_m, odd_m), p2_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);
    __m256 dlambda = delta_lambda;

    // Direction: cross = n0 × n1 normalized
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x));
    __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x)));
    __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_cross = _mm256_rsqrt_ps(cross_sq);
    cross_x = _mm256_mul_ps(cross_x, inv_cross);
    cross_y = _mm256_mul_ps(cross_y, inv_cross);
    cross_z = _mm256_mul_ps(cross_z, inv_cross);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross);
    __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask);
    __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask);
    __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask);

    // Apply: p0 -= dt * even_m
    even_x = _mm256_sub_ps(even_x, _mm256_mul_ps(dt_x, even_m));
    even_y = _mm256_sub_ps(even_y, _mm256_mul_ps(dt_y, even_m));
    even_z = _mm256_sub_ps(even_z, _mm256_mul_ps(dt_z, even_m));

    // Propagate p2 correction back to even: delta_p2 = +dt * p2_m, then right-shift (srli)
    __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m);
    __m256 delta_p2_x = corr_p2_x;
    __m256i delta_p2_ix = _mm256_castps_si256(delta_p2_x);
    __m256i right_shifted_p2_ix = _mm256_srli_si256(delta_p2_ix, 4);
    __m256 delta_even_from_p2_x = _mm256_castsi256_ps(right_shifted_p2_ix);
    even_x = _mm256_add_ps(even_x, delta_even_from_p2_x);

    __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m);
    __m256 delta_p2_y = corr_p2_y;
    __m256i delta_p2_iy = _mm256_castps_si256(delta_p2_y);
    __m256i right_shifted_p2_iy = _mm256_srli_si256(delta_p2_iy, 4);
    __m256 delta_even_from_p2_y = _mm256_castsi256_ps(right_shifted_p2_iy);
    even_y = _mm256_add_ps(even_y, delta_even_from_p2_y);

    __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m);
    __m256 delta_p2_z = corr_p2_z;
    __m256i delta_p2_iz = _mm256_castps_si256(delta_p2_z);
    __m256i right_shifted_p2_iz = _mm256_srli_si256(delta_p2_iz, 4);
    __m256 delta_even_from_p2_z = _mm256_castsi256_ps(right_shifted_p2_iz);
    even_z = _mm256_add_ps(even_z, delta_even_from_p2_z);

    bend_lambda_even = new_lambda;
    }
    // Odd bending constraints: p1=odd[0], p2=even[1], p3=odd[1]
    {
    // p2 = even shifted right (srli to align even[1] to lane 0)
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);

    // p3 = odd shifted right (srli to align odd[1] to lane 0)
    __m256i odd_ix = _mm256_castps_si256(odd_x);
    __m256i shifted_odd_ix = _mm256_srli_si256(odd_ix, 4);
    __m256 p3_x = _mm256_castsi256_ps(shifted_odd_ix);

    __m256i odd_iy = _mm256_castps_si256(odd_y);
    __m256i shifted_odd_iy = _mm256_srli_si256(odd_iy, 4);
    __m256 p3_y = _mm256_castsi256_ps(shifted_odd_iy);

    __m256i odd_iz = _mm256_castps_si256(odd_z);
    __m256i shifted_odd_iz = _mm256_srli_si256(odd_iz, 4);
    __m256 p3_z = _mm256_castsi256_ps(shifted_odd_iz);

    __m256i odd_im = _mm256_castps_si256(odd_m);
    __m256i shifted_odd_im = _mm256_srli_si256(odd_im, 4);
    __m256 p3_m = _mm256_castsi256_ps(shifted_odd_im);

    // Vectors v0 = p2 - p1, v1 = p3 - p2
    __m256 v0x = _mm256_sub_ps(p2_x, odd_x);
    __m256 v0y = _mm256_sub_ps(p2_y, odd_y);
    __m256 v0z = _mm256_sub_ps(p2_z, odd_z);
    __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x)));
    __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq);
    __m256 n0x = _mm256_mul_ps(v0x, inv_len0);
    __m256 n0y = _mm256_mul_ps(v0y, inv_len0);
    __m256 n0z = _mm256_mul_ps(v0z, inv_len0);

    __m256 v1x = _mm256_sub_ps(p3_x, p2_x);
    __m256 v1y = _mm256_sub_ps(p3_y, p2_y);
    __m256 v1z = _mm256_sub_ps(p3_z, p2_z);
    __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x)));
    __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq);
    __m256 n1x = _mm256_mul_ps(v1x, inv_len1);
    __m256 n1y = _mm256_mul_ps(v1y, inv_len1);
    __m256 n1z = _mm256_mul_ps(v1z, inv_len1);

    // Cosine of current angle
    __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x)));
    cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta));
    __m256 C = _mm256_sub_ps(bend_rest_odd, cos_theta);

    // Sum of inverse masses
    __m256 w_sum = _mm256_add_ps(_mm256_add_ps(odd_m, p2_m), p3_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    // XPBD-style lambda update (simplified): dl = (dt^2 / k) * C / w_sum;
    // lambda is accumulated across iterations but not fed back into dl.
    __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);
    __m256 dlambda = delta_lambda;

    // Correction direction: cross = n0 × n1 normalized
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x));
    __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x)));
    __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_cross = _mm256_rsqrt_ps(cross_sq);
    cross_x = _mm256_mul_ps(cross_x, inv_cross);
    cross_y = _mm256_mul_ps(cross_y, inv_cross);
    cross_z = _mm256_mul_ps(cross_z, inv_cross);

    // Combine masks
    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross);
    __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask);
    __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask);
    __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask);

    // Apply: p1=odd[0] -= dt * odd_m
    odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(dt_x, odd_m));
    odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(dt_y, odd_m));
    odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(dt_z, odd_m));

    // Apply: p3=odd[1] += dt * p3_m
    __m256 corr_p3_x = _mm256_mul_ps(dt_x, p3_m);
    __m256 corr_p3_y = _mm256_mul_ps(dt_y, p3_m);
    __m256 corr_p3_z = _mm256_mul_ps(dt_z, p3_m);
    __m256i corr_p3_ix = _mm256_castps_si256(corr_p3_x);
    __m256i corr_p3_iy = _mm256_castps_si256(corr_p3_y);
    __m256i corr_p3_iz = _mm256_castps_si256(corr_p3_z);
    __m256i left_shifted_p3_ix = _mm256_slli_si256(corr_p3_ix, 4);
    __m256i left_shifted_p3_iy = _mm256_slli_si256(corr_p3_iy, 4);
    __m256i left_shifted_p3_iz = _mm256_slli_si256(corr_p3_iz, 4);
    __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_p3_ix);
    __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_p3_iy);
    __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_p3_iz);
    odd_x = _mm256_add_ps(odd_x, delta_odd_x);
    odd_y = _mm256_add_ps(odd_y, delta_odd_y);
    odd_z = _mm256_add_ps(odd_z, delta_odd_z);

    // Propagate p2 correction to even: delta_p2 = +dt * p2_m, then left-shift (slli)
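    // (_mm256_slli_si256 is likewise per-128-bit-lane: result elements 0 and 4
    // are zero-filled, so nothing moves across the lane boundary.)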
    __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m);
    __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m);
    __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m);
    __m256i corr_p2_ix = _mm256_castps_si256(corr_p2_x);
    __m256i corr_p2_iy = _mm256_castps_si256(corr_p2_y);
    __m256i corr_p2_iz = _mm256_castps_si256(corr_p2_z);
    __m256i left_shifted_p2_ix = _mm256_slli_si256(corr_p2_ix, 4);
    __m256i left_shifted_p2_iy = _mm256_slli_si256(corr_p2_iy, 4);
    __m256i left_shifted_p2_iz = _mm256_slli_si256(corr_p2_iz, 4);
    __m256 delta_even_from_p2_x = _mm256_castsi256_ps(left_shifted_p2_ix);
    __m256 delta_even_from_p2_y = _mm256_castsi256_ps(left_shifted_p2_iy);
    __m256 delta_even_from_p2_z = _mm256_castsi256_ps(left_shifted_p2_iz);
    even_x = _mm256_add_ps(even_x, delta_even_from_p2_x);
    even_y = _mm256_add_ps(even_y, delta_even_from_p2_y);
    even_z = _mm256_add_ps(even_z, delta_even_from_p2_z);

    bend_lambda_odd = new_lambda;
    }
    }

    // Store back positions directly (transposed layout, no re-encode needed)
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z);

    // Store lambdas direct (transposed)
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even);
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], bend_lambda_even);
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], bend_lambda_odd);
    }

    // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
    }

    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 min_inv_dist = _mm256_set1_ps(1e-6f);
    inv_dist = _mm256_max_ps(inv_dist, min_inv_dist);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    }
    }
    }

    // Step 5: Motion Constraints - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;

    // Even block
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z));
    }

    // Odd block
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
    }
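
    // Scalar reference for one XPBD-style bending update, mirroring the SIMD
    // path above. Sketch only: this helper is illustrative and never called by
    // the solver; it applies the same degenerate-case masks and the simplified
    // lambda update. alpha corresponds to dt*dt / bend_stiffness.
    static void __attribute__((unused))
    bend_constraint_ref(float p1[3], float p2[3], float p3[3],
                        float w1, float w2, float w3,
                        float rest_cos, float alpha, float *lambda) {
        float v0[3] = {p2[0]-p1[0], p2[1]-p1[1], p2[2]-p1[2]};
        float v1[3] = {p3[0]-p2[0], p3[1]-p2[1], p3[2]-p2[2]};
        float l0 = sqrtf(v0[0]*v0[0] + v0[1]*v0[1] + v0[2]*v0[2]);
        float l1 = sqrtf(v1[0]*v1[0] + v1[1]*v1[1] + v1[2]*v1[2]);
        float w_sum = w1 + w2 + w3;
        if (l0 < 1e-6f || l1 < 1e-6f || w_sum < 1e-6f || rest_cos < 1e-6f) {
            return; // same masks as mask_len0/mask_len1/mask_w/mask_rest
        }
        for (int k = 0; k < 3; ++k) { v0[k] /= l0; v1[k] /= l1; }
        float cos_theta = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
        cos_theta = fminf(1.0f, fmaxf(-1.0f, cos_theta));
        float C = rest_cos - cos_theta;
        float dl = alpha * C / w_sum; // simplified lambda update (see above)
        // Correction direction: normalized n0 x n1
        float cr[3] = {v0[1]*v1[2] - v0[2]*v1[1],
                       v0[2]*v1[0] - v0[0]*v1[2],
                       v0[0]*v1[1] - v0[1]*v1[0]};
        float cl = sqrtf(cr[0]*cr[0] + cr[1]*cr[1] + cr[2]*cr[2]);
        if (cl < 1e-6f) {
            return; // mask_cross
        }
        for (int k = 0; k < 3; ++k) {
            float d = (cr[k] / cl) * dl;
            p1[k] -= d * w1; // matches: p0/p1 -= dt * m
            p2[k] += d * w2; // matches: p2 += dt * p2_m
            p3[k] += d * w3; // matches: p3 += dt * p3_m
        }
        *lambda += dl;
    }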

    #include <SDL2/SDL.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    #define WINDOW_WIDTH 800
    #define WINDOW_HEIGHT 600
    #define CHAIN_LENGTH 10
    #define PARTICLE_SPACING 10.0f

    static void
    project(float x, float y, float z, int *screen_x, int *screen_y) {
    (void)z; // simple orthographic projection onto the XY plane
    *screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
    *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
    }
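    // The simulation stores particles transposed: even chain indices in lanes
    // 0-7, odd indices in lanes 8-15. Hypothetical helper shown for clarity;
    // the loops below inline the same expression.
    static int __attribute__((unused))
    ptc_idx(int i) {
    return (i % 2 == 0) ? (i / 2) : (8 + (i - 1) / 2);
    }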
    // Helper: transform a world-space point into chain model space:
    // translate by -chain_pos, then rotate by conj(chain quat) (scalar for simplicity)
    static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
    float cx, float cy, float cz, // chain pos
    float qx, float qy, float qz, float qw, // chain quat
    float *mx, float *my, float *mz) { // out model pos
    // Translate: world - chain
    float tx = wx - cx, ty = wy - cy, tz = wz - cz;
    // t = conj(q) * (0, tx,ty,tz), with conj(q) = (qw, -qx, -qy, -qz)
    // tw = qx*tx + qy*ty + qz*tz
    float tw = qx*tx + qy*ty + qz*tz;
    float t_x = qw*tx - qy*tz + qz*ty;
    float t_y = qw*ty - qz*tx + qx*tz;
    float t_z = qw*tz - qx*ty + qy*tx;
    // model = (t * q)_vec
    *mx = tw*qx + t_x*qw + t_y*qz - t_z*qy;
    *my = tw*qy + t_y*qw + t_z*qx - t_x*qz;
    *mz = tw*qz + t_z*qw + t_x*qy - t_y*qx;
    }
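    // Illustrative inverse of the helper above (model -> world), shown for
    // symmetry. Hypothetical and unused below, since the demo keeps the chain
    // quaternion at identity.
    static void __attribute__((unused))
    quat_rotate_translate(float mx, float my, float mz, // model pos
    float cx, float cy, float cz, // chain pos
    float qx, float qy, float qz, float qw, // chain quat
    float *wx, float *wy, float *wz) { // out world pos
    // t = q * (0, mx,my,mz)
    float tw = -(qx*mx + qy*my + qz*mz);
    float tx = qw*mx + qy*mz - qz*my;
    float ty = qw*my + qz*mx - qx*mz;
    float tz = qw*mz + qx*my - qy*mx;
    // world = (t * conj(q))_vec + chain_pos
    *wx = -tw*qx + tx*qw - (ty*qz - tz*qy) + cx;
    *wy = -tw*qy + ty*qw - (tz*qx - tx*qz) + cy;
    *wz = -tw*qz + tz*qw - (tx*qy - ty*qx) + cz;
    }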
    extern int
    main(int argc, char *argv[]) {
    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
    printf("SDL_Init failed: %s\n", SDL_GetError());
    return 1;
    }
    SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
    SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
    if (!window || !renderer) {
    printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
    SDL_Quit();
    return 1;
    }
    struct chn_sim sim;
    chn_sim_init(&sim);

    struct chn_cfg cfg = {
    .damping = 0.5f,
    .stiffness = 0.99f,
    .friction = 0.5f,
    .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
    .angular_inertia = 0.9f,
    .centrifugal_inertia = 0.9f // Tune 0.5f for more outward bow
    };
    float rest_pos[CHAIN_LENGTH * 4];
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    rest_pos[i*4 + 0] = 0.0f;
    rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
    rest_pos[i*4 + 2] = 0.0f;
    rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
    }
    int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
    chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
    chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge

    // Fixed world spheres (triangle outside circle)
    float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
    60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
    -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
    -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
    };

    // Circular motion params
    float time = 0.0f;
    float circle_radius = 50.0f;
    float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
    float dt = 0.016f;
    int running = 1;
    SDL_Event event;

    // Debug: Check gravity and masses
    printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]);
    int p_base = chn * PTC_PER_CHN;
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2);
    printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]);
    }

    while (running) {
    while (SDL_PollEvent(&event)) {
    if (event.type == SDL_QUIT) {
    running = 0;
    }
    }
    time += dt;

    // Circular anchor motion
    float anchor_x = circle_radius * cosf(angular_speed * time);
    float anchor_y = circle_radius * sinf(angular_speed * time);
    chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});

    // Transform fixed world spheres to current model space
    float model_spheres[12];
    for (int s = 0; s < 3; ++s) {
    float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
    float r = world_spheres[s*4 + 3];
    float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
    float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
    float mx, my, mz;
    quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
    model_spheres[s*4 + 0] = mx;
    model_spheres[s*4 + 1] = my;
    model_spheres[s*4 + 2] = mz;
    model_spheres[s*4 + 3] = r;
    }
    chn_sim_set_sph(&sim, chn, model_spheres, 3);

    chn_sim_run(&sim, dt);

    // Collision depth test: Print max pen across particles/spheres
    float max_pen = 0.0f;
    int max_p = -1, max_s = -1;
    int s_base = chn * SPH_PER_CHN; // p_base from the setup above is reused here
    for (int p = 1; p < CHAIN_LENGTH; ++p) { // Skip fixed p0
    int idx = (p % 2 == 0 ? p / 2 : 8 + (p - 1) / 2);
    float px = sim.ptc_pos_x[p_base + idx], py = sim.ptc_pos_y[p_base + idx], pz = sim.ptc_pos_z[p_base + idx];
    for (int s = 0; s < 3; ++s) {
    float sx = sim.sph_x[s_base + s], sy = sim.sph_y[s_base + s], sz = sim.sph_z[s_base + s];
    float sr = sim.sph_r[s_base + s];
    float dx = px - sx, dy = py - sy, dz = pz - sz;
    float dist_sq = dx*dx + dy*dy + dz*dz;
    float dist = sqrtf(dist_sq);
    float pen = sr - dist;
    if (pen > max_pen) {
    max_pen = pen;
    max_p = p; max_s = s;
    }
    }
    }
    if (max_pen > 0.0f && ((int)(time / dt) % 5 == 0)) {
    printf("Collision: p%d/s%d pen=%.2f\n", max_p, max_s, max_pen);
    }

    SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
    SDL_RenderClear(renderer);
    SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres

    float root_x = sim.chn_pos_x[chn]; // Current root world x/y
    float root_y = sim.chn_pos_y[chn];

    // Draw chain segments
    for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
    int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);

    float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
    float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
    float world_x0 = model_x0 + root_x;
    float world_y0 = model_y0 + root_y;
    float world_x1 = model_x1 + root_x;
    float world_y1 = model_y1 + root_y;

    int sx0, sy0, sx1, sy1;
    project(world_x0, world_y0, 0, &sx0, &sy0);
    project(world_x1, world_y1, 0, &sx1, &sy1);
    SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
    }
    // Draw spheres (approx circle outline with 32 points)
    for (int s = 0; s < 3; ++s) { // Only draw active spheres
    float sph_model_x = sim.sph_x[s_base + s];
    float sph_model_y = sim.sph_y[s_base + s];
    float sph_r = sim.sph_r[s_base + s];
    if (sph_r <= 0.0f) continue;

    float sph_world_x = sph_model_x + root_x;
    float sph_world_y = sph_model_y + root_y;
    int sph_sx, sph_sy;
    project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);

    // Parametric circle: 32 points
    const int num_points = 32;
    for (int pt = 0; pt < num_points; ++pt) {
    float angle0 = 2.0f * M_PI * pt / num_points;
    float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
    float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
    float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);

    int px0, py0, px1, py1;
    project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
    project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
    SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
    }
    }

    // Debug printf (every 5 steps ~0.08s)
    if ((int)(time / dt) % 5 == 0) {
    printf("t=%.2f: ", time);
    for (int i = 0; i < CHAIN_LENGTH; i++) {
    int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
    float model_y = sim.ptc_pos_y[p_base + idx];
    float world_y = model_y + root_y;
    printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y);
    }
    printf("\n");
    }
    SDL_RenderPresent(renderer);
    SDL_Delay(16);
    }
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return 0;
    }
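
    // Build sketch (assumed toolchain; flags are illustrative, not from the gist):
    //   cc -O2 -mavx2 -mfma chain.c -o chain_demo -lSDL2 -lm
    // The intrinsics above require AVX2 + FMA; SDL2 supplies the window/renderer.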
  9. vurtun revised this gist Oct 10, 2025. 1 changed file with 884 additions and 388 deletions.
    1,272 changes: 884 additions & 388 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    #include <float.h>

    enum {
    MAX_CHNS = 64,
    MAX_CHNS = 128,
    PTC_PER_CHN = 16,
    SPH_PER_CHN = 8,
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
    @@ -14,6 +14,8 @@ enum {
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 12,
    };
    #define MAX_MOTION_RADIUS 10.0f

    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    @@ -23,6 +25,7 @@ struct chn_cfg {
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    float bend_stiffness; // [0, 1] for bending constraints
    };
    struct chn_sim {
    unsigned char free_idx_cnt;
    @@ -57,7 +60,7 @@ struct chn_sim {
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    // particle positions (model space)
    // particle positions (model space, transposed: 0-7 even indices, 8-15 odd indices)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    @@ -66,15 +69,22 @@ struct chn_sim {
    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // Transposed: even cons 0-7, odd 8-15 (last dummy)

    // XPBD lambdas for distance constraints (transposed similarly)
    float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));

    // Bending rest angles and lambdas (one less than constraints, transposed even/odd bends)
    float ptc_bend_rest_ang[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32)));
    float ptc_bend_lambda[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));

    // rest positions (model space)
    // rest positions (model space, transposed)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    @@ -91,6 +101,26 @@ chn_sim_init(struct chn_sim *cns) {
    _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
    // Initialize lambdas to zero
    memset(cns->ptc_lambda, 0, sizeof(cns->ptc_lambda));
    memset(cns->ptc_bend_lambda, 0, sizeof(cns->ptc_bend_lambda));
    // Zero all transposed particle arrays
    memset(cns->ptc_pos_x, 0, sizeof(cns->ptc_pos_x));
    memset(cns->ptc_pos_y, 0, sizeof(cns->ptc_pos_y));
    memset(cns->ptc_pos_z, 0, sizeof(cns->ptc_pos_z));
    memset(cns->ptc_inv_mass, 0, sizeof(cns->ptc_inv_mass));
    memset(cns->ptc_prev_pos_x, 0, sizeof(cns->ptc_prev_pos_x));
    memset(cns->ptc_prev_pos_y, 0, sizeof(cns->ptc_prev_pos_y));
    memset(cns->ptc_prev_pos_z, 0, sizeof(cns->ptc_prev_pos_z));
    memset(cns->rest_pos_x, 0, sizeof(cns->rest_pos_x));
    memset(cns->rest_pos_y, 0, sizeof(cns->rest_pos_y));
    memset(cns->rest_pos_z, 0, sizeof(cns->rest_pos_z));
    // Set motion_radius to FLT_MAX
    for (int i = 0; i < MAX_PTC; i += 8) {
    _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS));
    }
    memset(cns->ptc_rest_len, 0, sizeof(cns->ptc_rest_len));
    memset(cns->ptc_bend_rest_ang, 0, sizeof(cns->ptc_bend_rest_ang));
    }
    extern int
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    @@ -103,33 +133,125 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric
    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
    cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];

    cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
    int p_base = (chn * PTC_PER_CHN);
    int r_base = (chn * CON_PER_CHN);
    int b_base = (chn * (CON_PER_CHN - 1));
    int even_base = p_base + 0; // Even indices 0-7
    int odd_base = p_base + 8; // Odd indices 8-15

    // Transpose particles: even indices to 0-7, odd to 8-15
    int even_cnt = (cnt + 1) / 2; // Number of even particles
    int odd_cnt = cnt / 2; // Number of odd particles
    for (int i = 0; i < even_cnt; ++i) {
    int orig_i = 2 * i;
    cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0];
    cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1];
    cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2];
    cns->ptc_inv_mass[even_base + i] = rest_pos[orig_i*4+3];

    cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0];
    cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1];
    cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->ptc_pos_x[p_idx + i] = 0.0f;
    cns->ptc_pos_y[p_idx + i] = 0.0f;
    cns->ptc_pos_z[p_idx + i] = 0.0f;
    for (int i = even_cnt; i < 8; ++i) {
    cns->ptc_pos_x[even_base + i] = 0.0f;
    cns->ptc_pos_y[even_base + i] = 0.0f;
    cns->ptc_pos_z[even_base + i] = 0.0f;
    cns->ptc_inv_mass[even_base + i] = 0.0f;
    cns->rest_pos_x[even_base + i] = 0.0f;
    cns->rest_pos_y[even_base + i] = 0.0f;
    cns->rest_pos_z[even_base + i] = 0.0f;
    }
    for (int i = 0; i < odd_cnt; ++i) {
    int orig_i = 2 * i + 1;
    cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
    cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
    cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
    cns->ptc_inv_mass[odd_base + i] = rest_pos[orig_i*4+3];

    cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0];
    cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1];
    cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2];
    }
    for (int i = odd_cnt; i < 8; ++i) {
    cns->ptc_pos_x[odd_base + i] = 0.0f;
    cns->ptc_pos_y[odd_base + i] = 0.0f;
    cns->ptc_pos_z[odd_base + i] = 0.0f;
    cns->ptc_inv_mass[odd_base + i] = 0.0f;
    cns->rest_pos_x[odd_base + i] = 0.0f;
    cns->rest_pos_y[odd_base + i] = 0.0f;
    cns->rest_pos_z[odd_base + i] = 0.0f;
    }

    // Set motion radius to MAX_MOTION_RADIUS by default (no constraint)
    _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS));
    _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS));

    // Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    float len = sqrtf(dx*dx + dy*dy + dz*dz);
    int con_idx = i - 1;
    if ((con_idx & 0x01) == 0) { // Even constraint
    cns->ptc_rest_len[r_base + (con_idx / 2)] = len;
    cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
    } else { // Odd constraint
    cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = len;
    cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
    }
    }
    for (int i = cnt - 1; i < CON_PER_CHN - 1; ++i) { // Pad dummies
    int con_idx = i;
    if ((con_idx & 0x01) == 0) {
    cns->ptc_rest_len[r_base + (con_idx / 2)] = 0.0f;
    cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f;
    } else {
    cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
    cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f;
    }
    }

    // Compute rest bending angles for triples, transposed (even bends to b_base+0-6, odd to b_base+8-13)
    for (int i = 2; i < cnt; ++i) {
    float dx0 = rest_pos[(i-1)*4+0] - rest_pos[(i-2)*4+0];
    float dy0 = rest_pos[(i-1)*4+1] - rest_pos[(i-2)*4+1];
    float dz0 = rest_pos[(i-1)*4+2] - rest_pos[(i-2)*4+2];
    float len0 = sqrtf(dx0*dx0 + dy0*dy0 + dz0*dz0);
    if (len0 > 1e-6f) {
    dx0 /= len0; dy0 /= len0; dz0 /= len0;
    }

    float dx1 = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy1 = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz1 = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    float len1 = sqrtf(dx1*dx1 + dy1*dy1 + dz1*dz1);
    if (len1 > 1e-6f) {
    dx1 /= len1; dy1 /= len1; dz1 /= len1;
    }

    float dot = dx0*dx1 + dy0*dy1 + dz0*dz1;
    dot = fminf(1.0f, fmaxf(-1.0f, dot));
    int bend_idx = i - 2;
    if ((bend_idx & 0x01) == 0) { // Even bend
    cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = dot;
    cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f;
    } else { // Odd bend
    cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = dot;
    cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    }
    }
    for (int i = cnt; i < CON_PER_CHN; ++i) {
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    // Pad dummies
    for (int i = cnt - 2; i < CON_PER_CHN - 2; ++i) {
    int bend_idx = i;
    if ((bend_idx & 0x01) == 0) {
    cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = 0.0f;
    cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f;
    } else {
    cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f;
    }
    }
    return chn;
    }
    @@ -139,9 +261,10 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_idx = (chn * PTC_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    int c_idx = (chn * CON_PER_CHN);
    int p_base = (chn * PTC_PER_CHN);
    int s_base = (chn * SPH_PER_CHN);
    int r_base = (chn * CON_PER_CHN);
    int b_base = (chn * (CON_PER_CHN - 1));

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
    @@ -172,40 +295,49 @@ chn_sim_del(struct chn_sim *cns, int chn) {
    cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
    // Zero transposed particle blocks
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_inv_mass[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_inv_mass[p_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->rest_pos_x[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_x[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(FLT_MAX));
    _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(FLT_MAX));

    _mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f));

    // Zero transposed constraints
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    @@ -267,20 +399,20 @@ extern void
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(spheres);
    assert(cnt < MAX_SPH);
    assert(cnt <= SPH_PER_CHN);

    int s_idx = (chn * SPH_PER_CHN);
    int s_base = (chn * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
    cns->sph_x[s_idx + i] = spheres[i*4+0];
    cns->sph_y[s_idx + i] = spheres[i*4+1];
    cns->sph_z[s_idx + i] = spheres[i*4+2];
    cns->sph_r[s_idx + i] = spheres[i*4+3];
    cns->sph_x[s_base + i] = spheres[i*4+0];
    cns->sph_y[s_base + i] = spheres[i*4+1];
    cns->sph_z[s_base + i] = spheres[i*4+2];
    cns->sph_r[s_base + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_idx + i] = 0;
    cns->sph_y[s_idx + i] = 0;
    cns->sph_z[s_idx + i] = 0;
    cns->sph_r[s_idx + i] = 0;
    cns->sph_x[s_base + i] = 0;
    cns->sph_y[s_base + i] = 0;
    cns->sph_z[s_base + i] = 0;
    cns->sph_r[s_base + i] = 0;
    }
    }
    extern void
    @@ -291,12 +423,22 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cnt >= 0);
    assert(cnt < PTC_PER_CHN);

    int p_idx = (chn * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->motion_radius[p_idx + i] = radius[i];
    int p_base = (chn * PTC_PER_CHN);
    int even_base = p_base + 0;
    int odd_base = p_base + 8;
    int even_cnt = (cnt + 1) / 2;
    int odd_cnt = cnt / 2;
    for (int i = 0; i < even_cnt; ++i) {
    cns->motion_radius[even_base + i] = radius[2 * i];
    }
    for (int i = even_cnt; i < 8; ++i) {
    cns->motion_radius[even_base + i] = FLT_MAX;
    }
    for (int i = 0; i < odd_cnt; ++i) {
    cns->motion_radius[odd_base + i] = radius[2 * i + 1];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->motion_radius[p_idx + i] = FLT_MAX;
    for (int i = odd_cnt; i < 8; ++i) {
    cns->motion_radius[odd_base + i] = FLT_MAX;
    }
    }
    extern void
    @@ -314,26 +456,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256i idx_p0_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
    const __m256i idx_p1_even = _mm256_set_epi32(15, 13, 11, 9, 7, 5, 3, 1);
    const __m256i idx_rest_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
    const __m256i idx_p0_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1); // Use -1 or 0 for dummy lane, masked anyway
    const __m256i idx_p1_odd = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, -1); // Adjusted for consistency (dummy last)
    const __m256i idx_rest_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1);
    const __m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)); // Or load from array as before

    // Local arrays for PBD solver
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_lcl[PTC_PER_CHN] = 0.0f;
    py_lcl[PTC_PER_CHN] = 0.0f;
    pz_lcl[PTC_PER_CHN] = 0.0f;
    pm_lcl[PTC_PER_CHN] = 0.0f;

    // Stack arrays for precomputed data
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    @@ -372,11 +494,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    @@ -391,10 +508,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
    // res_z = tw*qz + ty*qw + tx*qy - ty*qx
    __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
    __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    @@ -406,37 +519,24 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
    _mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
    }
    // Compute local-space gravity
    // Compute local-space gravity (identical to wind)
    {
    __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
    __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
    __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
    tx = _mm256_fnmadd_ps(cqz, vy, tx);

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
    ty = _mm256_fnmadd_ps(cqx, vz, ty);

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
    tz = _mm256_fnmadd_ps(cqy, vx, tz);

    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
    // res_z = tw*qz + ty*qw + tx*qy - ty*qx
    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
    tw = _mm256_fnmadd_ps(cqy, vy, tw);
    tw = _mm256_fnmadd_ps(cqz, vz, tw);

    __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
    @@ -492,8 +592,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
    @@ -535,12 +633,11 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    int p_base = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
    @@ -553,24 +650,63 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
    __m256 p_ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
    p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    // Even block (0-7)
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
    ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
    ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);

    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
    __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);

    __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);

    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
    }

    // Odd block (8-15)
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    // Linear inertia: v * dt * linear_inertia
    __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
    @@ -579,7 +715,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
    ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
    @@ -588,26 +723,23 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    // Calculate displacement: (ω × (ω × r)) * dt^2
    __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);

    // Total delta
    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);

    // Update positions
    _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z));
    }

    // Update previous transformation
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    @@ -618,9 +750,10 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
    // Step 2: Verlet Integration

    // Step 2: Verlet Integration - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    int p_base = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
    @@ -632,20 +765,62 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
    __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
    __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
    __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);

    __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
    __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
    __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
    __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);

    __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
    __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
    __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);

    __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
    __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
    __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);
    }

    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);

    __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
    @@ -667,220 +842,426 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);

    _mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
    }
    }
    // Step 3: Distance Constraints

    // Step 3: XPBD Constraints (Distance + Bending) - Direct loads for transposed even/odd
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;
    int b_base = c * (CON_PER_CHN - 1);

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
    __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
    __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);
    __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness);
    __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec);
    __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i]));
    _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i]));
    _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i]));
    _mm256_store_ps(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Color 0: even constraints (0-1, 2-3, ..., 14-15)
    {
    __m256 p0x = _mm256_i32gather_ps(px_lcl, idx_p0_even, 4);
    __m256 p0y = _mm256_i32gather_ps(py_lcl, idx_p0_even, 4);
    __m256 p0z = _mm256_i32gather_ps(pz_lcl, idx_p0_even, 4);
    __m256 p0m = _mm256_i32gather_ps(pm_lcl, idx_p0_even, 4);
    // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15)
    __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 even_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 even_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 p1x = _mm256_i32gather_ps(px_lcl, idx_p1_even, 4);
    __m256 p1y = _mm256_i32gather_ps(py_lcl, idx_p1_even, 4);
    __m256 p1z = _mm256_i32gather_ps(pz_lcl, idx_p1_even, 4);
    __m256 p1m = _mm256_i32gather_ps(pm_lcl, idx_p1_even, 4);
    __m256 odd_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 odd_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 odd_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    __m256 odd_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 rest_len_vec = _mm256_i32gather_ps(&cns->ptc_rest_len[r_base], idx_rest_even, 4);
    // Direct load transposed rests/lambdas: even cons [0,2,...] in r_base+0, odd in r_base+8
    __m256 rest_even = _mm256_load_ps(&cns->ptc_rest_len[r_base + 0]);
    __m256 lambda_even = _mm256_load_ps(&cns->ptc_lambda[r_base + 0]);

    __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
    __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]);

    // Bending rest angles and lambdas, transposed: even bends in b_base+0 (7 real lanes + dummy), odd bends in b_base+8
    __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 0]);
    __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 0]);

    __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 8]);
    __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 8]);
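    // Transposed SoA layout per chain: even particles (0, 2, ..., 14) live in
    // [p_base+0..7], odd particles (1, 3, ..., 15) in [p_base+8..15]; rest
    // lengths, lambdas and bend data follow the same even/odd split. Note that
    // b_base uses a stride of CON_PER_CHN - 1 (15), so the odd-bend load at
    // b_base + 8 touches offset 15, which overlaps the next chain's first slot
    // unless the bend arrays are padded accordingly.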

    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Even distance constraints: p0 = even, p1 = odd
    {
    __m256 dx = _mm256_sub_ps(odd_x, even_x);
    __m256 dy = _mm256_sub_ps(odd_y, even_y);
    __m256 dz = _mm256_sub_ps(odd_z, even_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
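    // dist = dist_sq * rsqrt(dist_sq) = sqrt(dist_sq) via the ~12-bit rsqrt
    // estimate; accurate enough for PBD corrections and cheaper than sqrt+div.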
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(even_m, odd_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 C = _mm256_sub_ps(dist, rest_even);
    __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda);
    __m256 dlambda = delta_lambda;

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, dlambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, dlambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, dlambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // Apply directly
    even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, even_m));
    even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, even_m));
    even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, even_m));

    odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));

    lambda_even = new_lambda;
    }
    // Odd distance constraints: p1 = odd[i], p2 = even[i+1] (even lanes shifted down one element, srli)
    {
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);
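    // Caveat: _mm256_srli_si256 shifts each 128-bit half independently, so lane 3
    // receives zero instead of even[4] (and lane 7 is zeroed). The constraint that
    // crosses the 128-bit boundary is therefore dropped; a true cross-lane element
    // shift would need _mm256_permutevar8x32_ps. The same applies to every
    // srli/slli lane shift in the blocks below.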

    __m256 dx = _mm256_sub_ps(p2_x, odd_x);
    __m256 dy = _mm256_sub_ps(p2_y, odd_y);
    __m256 dz = _mm256_sub_ps(p2_z, odd_z);

    __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_dist = _mm256_rsqrt_ps(dist_sq);
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 w_sum = _mm256_add_ps(odd_m, p2_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 C = _mm256_sub_ps(dist, rest_odd);
    __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda);
    __m256 dlambda = delta_lambda;

    __m256 unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 corr_x = _mm256_mul_ps(unit_x, dlambda);
    __m256 corr_y = _mm256_mul_ps(unit_y, dlambda);
    __m256 corr_z = _mm256_mul_ps(unit_z, dlambda);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest);
    corr_x = _mm256_and_ps(corr_x, corr_mask);
    corr_y = _mm256_and_ps(corr_y, corr_mask);
    corr_z = _mm256_and_ps(corr_z, corr_mask);

    // Apply: p1=odd += corr * odd_m
    odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, odd_m));
    odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, odd_m));
    odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, odd_m));

    // Propagate p2 correction back to even: delta_p2 = -corr * p2_m sits in lane i
    // but targets even[i+1], so shift the deltas up one element (slli) before adding
    __m256 corr_shifted_x = _mm256_mul_ps(corr_x, p2_m);
    __m256 delta_p2_x = _mm256_sub_ps(zero_vec, corr_shifted_x);
    __m256i delta_ix = _mm256_castps_si256(delta_p2_x);
    __m256i up_shifted_ix = _mm256_slli_si256(delta_ix, 4);
    __m256 delta_even_x = _mm256_castsi256_ps(up_shifted_ix);
    even_x = _mm256_add_ps(even_x, delta_even_x);

    __m256 corr_shifted_y = _mm256_mul_ps(corr_y, p2_m);
    __m256 delta_p2_y = _mm256_sub_ps(zero_vec, corr_shifted_y);
    __m256i delta_iy = _mm256_castps_si256(delta_p2_y);
    __m256i up_shifted_iy = _mm256_slli_si256(delta_iy, 4);
    __m256 delta_even_y = _mm256_castsi256_ps(up_shifted_iy);
    even_y = _mm256_add_ps(even_y, delta_even_y);

    __m256 corr_shifted_z = _mm256_mul_ps(corr_z, p2_m);
    __m256 delta_p2_z = _mm256_sub_ps(zero_vec, corr_shifted_z);
    __m256i delta_iz = _mm256_castps_si256(delta_p2_z);
    __m256i up_shifted_iz = _mm256_slli_si256(delta_iz, 4);
    __m256 delta_even_z = _mm256_castsi256_ps(up_shifted_iz);
    even_z = _mm256_add_ps(even_z, delta_even_z);

    lambda_odd = new_lambda;
    }
    // Even bending constraints: p0-p1-p2
    {
    // p2 = even[i+1]: shift even lanes down one element (srli within 128-bit halves)
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);

    // Vectors v0 = p1 - p0, v1 = p2 - p1
    __m256 v0x = _mm256_sub_ps(odd_x, even_x);
    __m256 v0y = _mm256_sub_ps(odd_y, even_y);
    __m256 v0z = _mm256_sub_ps(odd_z, even_z);
    __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x)));
    __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq);
    __m256 n0x = _mm256_mul_ps(v0x, inv_len0);
    __m256 n0y = _mm256_mul_ps(v0y, inv_len0);
    __m256 n0z = _mm256_mul_ps(v0z, inv_len0);

    __m256 v1x = _mm256_sub_ps(p2_x, odd_x);
    __m256 v1y = _mm256_sub_ps(p2_y, odd_y);
    __m256 v1z = _mm256_sub_ps(p2_z, odd_z);
    __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x)));
    __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq);
    __m256 n1x = _mm256_mul_ps(v1x, inv_len1);
    __m256 n1y = _mm256_mul_ps(v1y, inv_len1);
    __m256 n1z = _mm256_mul_ps(v1z, inv_len1);

    __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x)));
    cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta));
    __m256 C = _mm256_sub_ps(bend_rest_even, cos_theta);
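    // bend_rest_* stores the cosine of the rest angle, so C is a cosine
    // deviation; the normalized n0 x n1 below acts as a shared correction
    // direction, a cheap stand-in for the exact dihedral gradient.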

    __m256 w_sum = _mm256_add_ps(_mm256_add_ps(even_m, odd_m), p2_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda);
    __m256 dlambda = delta_lambda;

    // Direction: cross = n0 × n1 normalized
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x));
    __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x)));
    __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_cross = _mm256_rsqrt_ps(cross_sq);
    cross_x = _mm256_mul_ps(cross_x, inv_cross);
    cross_y = _mm256_mul_ps(cross_y, inv_cross);
    cross_z = _mm256_mul_ps(cross_z, inv_cross);

    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross);
    __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask);
    __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask);
    __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask);

    // Apply: p0 -= dt * even_m
    even_x = _mm256_sub_ps(even_x, _mm256_mul_ps(dt_x, even_m));
    even_y = _mm256_sub_ps(even_y, _mm256_mul_ps(dt_y, even_m));
    even_z = _mm256_sub_ps(even_z, _mm256_mul_ps(dt_z, even_m));

    // Propagate p2 correction back to even: delta_p2 = +dt * p2_m sits in lane i
    // but targets even[i+1], so shift the deltas up one element (slli) before adding
    __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m);
    __m256i delta_p2_ix = _mm256_castps_si256(corr_p2_x);
    __m256i up_shifted_p2_ix = _mm256_slli_si256(delta_p2_ix, 4);
    __m256 delta_even_from_p2_x = _mm256_castsi256_ps(up_shifted_p2_ix);
    even_x = _mm256_add_ps(even_x, delta_even_from_p2_x);

    __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m);
    __m256i delta_p2_iy = _mm256_castps_si256(corr_p2_y);
    __m256i up_shifted_p2_iy = _mm256_slli_si256(delta_p2_iy, 4);
    __m256 delta_even_from_p2_y = _mm256_castsi256_ps(up_shifted_p2_iy);
    even_y = _mm256_add_ps(even_y, delta_even_from_p2_y);

    __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m);
    __m256i delta_p2_iz = _mm256_castps_si256(corr_p2_z);
    __m256i up_shifted_p2_iz = _mm256_slli_si256(delta_p2_iz, 4);
    __m256 delta_even_from_p2_z = _mm256_castsi256_ps(up_shifted_p2_iz);
    even_z = _mm256_add_ps(even_z, delta_even_from_p2_z);

    bend_lambda_even = new_lambda;
    }
    // Odd bending constraints: p1=odd[0], p2=even[1], p3=odd[1]
    {
    // p2 = even shifted right (srli to align even[1] to lane 0)
    __m256i even_ix = _mm256_castps_si256(even_x);
    __m256i shifted_ix = _mm256_srli_si256(even_ix, 4);
    __m256 p2_x = _mm256_castsi256_ps(shifted_ix);

    __m256i even_iy = _mm256_castps_si256(even_y);
    __m256i shifted_iy = _mm256_srli_si256(even_iy, 4);
    __m256 p2_y = _mm256_castsi256_ps(shifted_iy);

    __m256i even_iz = _mm256_castps_si256(even_z);
    __m256i shifted_iz = _mm256_srli_si256(even_iz, 4);
    __m256 p2_z = _mm256_castsi256_ps(shifted_iz);

    __m256i even_im = _mm256_castps_si256(even_m);
    __m256i shifted_im = _mm256_srli_si256(even_im, 4);
    __m256 p2_m = _mm256_castsi256_ps(shifted_im);

    // p3 = odd shifted right (srli to align odd[1] to lane 0)
    __m256i odd_ix = _mm256_castps_si256(odd_x);
    __m256i shifted_odd_ix = _mm256_srli_si256(odd_ix, 4);
    __m256 p3_x = _mm256_castsi256_ps(shifted_odd_ix);

    __m256i odd_iy = _mm256_castps_si256(odd_y);
    __m256i shifted_odd_iy = _mm256_srli_si256(odd_iy, 4);
    __m256 p3_y = _mm256_castsi256_ps(shifted_odd_iy);

    __m256i odd_iz = _mm256_castps_si256(odd_z);
    __m256i shifted_odd_iz = _mm256_srli_si256(odd_iz, 4);
    __m256 p3_z = _mm256_castsi256_ps(shifted_odd_iz);

    __m256i odd_im = _mm256_castps_si256(odd_m);
    __m256i shifted_odd_im = _mm256_srli_si256(odd_im, 4);
    __m256 p3_m = _mm256_castsi256_ps(shifted_odd_im);

    // Vectors v0 = p2 - p1, v1 = p3 - p2
    __m256 v0x = _mm256_sub_ps(p2_x, odd_x);
    __m256 v0y = _mm256_sub_ps(p2_y, odd_y);
    __m256 v0z = _mm256_sub_ps(p2_z, odd_z);
    __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x)));
    __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq);
    __m256 n0x = _mm256_mul_ps(v0x, inv_len0);
    __m256 n0y = _mm256_mul_ps(v0y, inv_len0);
    __m256 n0z = _mm256_mul_ps(v0z, inv_len0);

    __m256 v1x = _mm256_sub_ps(p3_x, p2_x);
    __m256 v1y = _mm256_sub_ps(p3_y, p2_y);
    __m256 v1z = _mm256_sub_ps(p3_z, p2_z);
    __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x)));
    __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq);
    __m256 n1x = _mm256_mul_ps(v1x, inv_len1);
    __m256 n1y = _mm256_mul_ps(v1y, inv_len1);
    __m256 n1z = _mm256_mul_ps(v1z, inv_len1);

    // Cosine of current angle
    __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x)));
    cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta));
    __m256 C = _mm256_sub_ps(bend_rest_odd, cos_theta);

    // Sum of inverse masses
    __m256 w_sum = _mm256_add_ps(_mm256_add_ps(odd_m, p2_m), p3_m);
    __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ);
    __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ);
    __m256 denom = _mm256_div_ps(one_vec, w_sum);

    // XPBD lambda update
    __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom));
    __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda);
    __m256 dlambda = delta_lambda;

    // Correction direction: cross = n0 × n1 normalized
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x));
    __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x)));
    __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ);
    __m256 inv_cross = _mm256_rsqrt_ps(cross_sq);
    cross_x = _mm256_mul_ps(cross_x, inv_cross);
    cross_y = _mm256_mul_ps(cross_y, inv_cross);
    cross_z = _mm256_mul_ps(cross_z, inv_cross);

    // Combine masks
    __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross);
    __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask);
    __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask);
    __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask);

    // Apply: p1=odd[0] -= dt * odd_m
    odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(dt_x, odd_m));
    odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(dt_y, odd_m));
    odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(dt_z, odd_m));

    // Apply: p3=odd[1] += dt * p3_m
    __m256 corr_p3_x = _mm256_mul_ps(dt_x, p3_m);
    __m256 corr_p3_y = _mm256_mul_ps(dt_y, p3_m);
    __m256 corr_p3_z = _mm256_mul_ps(dt_z, p3_m);
    __m256i corr_p3_ix = _mm256_castps_si256(corr_p3_x);
    __m256i corr_p3_iy = _mm256_castps_si256(corr_p3_y);
    __m256i corr_p3_iz = _mm256_castps_si256(corr_p3_z);
    __m256i left_shifted_p3_ix = _mm256_slli_si256(corr_p3_ix, 4);
    __m256i left_shifted_p3_iy = _mm256_slli_si256(corr_p3_iy, 4);
    __m256i left_shifted_p3_iz = _mm256_slli_si256(corr_p3_iz, 4);
    __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_p3_ix);
    __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_p3_iy);
    __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_p3_iz);
    odd_x = _mm256_add_ps(odd_x, delta_odd_x);
    odd_y = _mm256_add_ps(odd_y, delta_odd_y);
    odd_z = _mm256_add_ps(odd_z, delta_odd_z);

    // Propagate p2 correction to even: delta_p2 = +dt * p2_m, then left-shift (slli)
    __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m);
    __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m);
    __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m);
    __m256i corr_p2_ix = _mm256_castps_si256(corr_p2_x);
    __m256i corr_p2_iy = _mm256_castps_si256(corr_p2_y);
    __m256i corr_p2_iz = _mm256_castps_si256(corr_p2_z);
    __m256i left_shifted_p2_ix = _mm256_slli_si256(corr_p2_ix, 4);
    __m256i left_shifted_p2_iy = _mm256_slli_si256(corr_p2_iy, 4);
    __m256i left_shifted_p2_iz = _mm256_slli_si256(corr_p2_iz, 4);
    __m256 delta_even_from_p2_x = _mm256_castsi256_ps(left_shifted_p2_ix);
    __m256 delta_even_from_p2_y = _mm256_castsi256_ps(left_shifted_p2_iy);
    __m256 delta_even_from_p2_z = _mm256_castsi256_ps(left_shifted_p2_iz);
    even_x = _mm256_add_ps(even_x, delta_even_from_p2_x);
    even_y = _mm256_add_ps(even_y, delta_even_from_p2_y);
    even_z = _mm256_add_ps(even_z, delta_even_from_p2_z);

    bend_lambda_odd = new_lambda;
    }
    }

    // Store back direct (no encode, since transposed)
    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z);
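    // Positions stayed in registers for the entire iteration loop; each chain
    // is written back once instead of per-iteration load/store traffic.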

    // Store lambdas direct (transposed)
    _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even);
    _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd);
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], bend_lambda_even);
    _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], bend_lambda_odd);
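    // Note: canonical XPBD zeroes the accumulated lambdas at the start of each
    // substep; persisting them across frames, as stored here, acts as a warm
    // start. Also note new_lambda is accumulated without the correction mask,
    // so degenerate lanes (zero length or zero w_sum) can accumulate inf/NaN.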
    }
    // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    __m256 fric_vec = _mm256_set1_ps(cns->chn_cfg[c].friction);
    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);

    // Even block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));

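    // Positional friction: magnitude scales with penetration depth (pen * friction)
    // and opposes the tangential part of this frame's displacement.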
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
    }

    // Odd block
    {
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);

    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x)));

    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x)));
    __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));

    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
    }
    }
    }
    // Step 5: Motion Constraints - duplicated for even/odd blocks
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;

    // Even block
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]);
    __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]);
    __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]);
    __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]);
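    // Motion constraint: keep each particle inside a sphere of motion_radius
    // around its rest position; only the overshoot (pen > 0) is pulled back.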

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    __m256 dz = _mm256_sub_ps(pz, rz);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(dist, r_vec);
    __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
    corr_factor = _mm256_and_ps(corr_factor, mask);

    __m256 dt_x = _mm256_mul_ps(dx, corr_factor);
    __m256 dt_y = _mm256_mul_ps(dy, corr_factor);
    __m256 dt_z = _mm256_mul_ps(dz, corr_factor);

    dt_x = _mm256_mul_ps(dt_x, pm);
    dt_y = _mm256_mul_ps(dt_y, pm);
    dt_z = _mm256_mul_ps(dt_z, pm);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z));
    }

    // Odd block
    {
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 8]);
    __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 8]);
    __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 8]);
    __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 8]);

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    __m256 dz = _mm256_sub_ps(pz, rz);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_mul_ps(dist_sq, inv_dist);

    __m256 pen = _mm256_sub_ps(dist, r_vec);
    __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
    __m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
    corr_factor = _mm256_and_ps(corr_factor, mask);

    __m256 dt_x = _mm256_mul_ps(dx, corr_factor);
    __m256 dt_y = _mm256_mul_ps(dy, corr_factor);
    __m256 dt_z = _mm256_mul_ps(dz, corr_factor);

    dt_x = _mm256_mul_ps(dt_x, pm);
    dt_y = _mm256_mul_ps(dt_y, pm);
    dt_z = _mm256_mul_ps(dt_z, pm);

    _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
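#if 0
/* Minimal usage sketch (illustrative, not part of the original file): the
 * 4-particle chain and all field values are assumptions; the calls match the
 * API declared above. A pinned root uses inv_mass = 0 in the w component. */
static void chn_sim_example(void) {
    static struct chn_sim sim;
    chn_sim_init(&sim);
    const float rest[4 * 4] = {     /* x, y, z, inv_mass per particle */
        0.0f,  0.0f, 0.0f, 0.0f,   /* root, pinned (inv_mass = 0) */
        0.0f, -1.0f, 0.0f, 1.0f,
        0.0f, -2.0f, 0.0f, 1.0f,
        0.0f, -3.0f, 0.0f, 1.0f,
    };
    struct chn_cfg cfg = {
        .damping = 0.1f, .stiffness = 0.9f, .bend_stiffness = 0.5f,
        .stretch_factor = 1.0f, .max_strain = 0.1f, .friction = 0.3f,
        .linear_inertia = 1.0f, .angular_inertia = 1.0f,
        .centrifugal_inertia = 1.0f,
    };
    int chn = chn_sim_add(&sim, &cfg, rest, 4);
    chn_sim_grav(&sim, chn, (const float[]){0.0f, -9.81f, 0.0f});
    chn_sim_run(&sim, 1.0f / 60.0f);    /* one 60 Hz step */
}
#endif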
  10. vurtun revised this gist Oct 9, 2025. 3 changed files with 97 additions and 1620 deletions.
    136 changes: 97 additions & 39 deletions chain_avx2.c → chain.c
    @@ -12,7 +12,7 @@ enum {
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 16,
    MAX_ITR = 12,
    };
    struct chn_cfg {
    float damping; // [0, 1]
    @@ -315,6 +315,13 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256i idx_p0_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
    const __m256i idx_p1_even = _mm256_set_epi32(15, 13, 11, 9, 7, 5, 3, 1);
    const __m256i idx_rest_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
    const __m256i idx_p0_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1); // Use -1 or 0 for dummy lane, masked anyway
    const __m256i idx_p1_odd = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, -1); // Adjusted for consistency (dummy last)
    const __m256i idx_rest_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1);
    const __m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)); // Or load from array as before
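    // Two-color Gauss-Seidel: even-indexed constraints touch disjoint particle
    // pairs, as do odd-indexed ones, so each color can be solved across SIMD
    // lanes without write conflicts within a batch. Note: a gather with index -1
    // still reads out of bounds even if the lane is masked afterwards; index 0
    // or _mm256_mask_i32gather_ps avoids that.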

    // Local arrays for PBD solver
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    @@ -689,18 +696,19 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // First block (i=0 to 7)
    // Color 0: even constraints (0-1, 2-3, ..., 14-15)
    {
    __m256 p0x = _mm256_load_ps(&px_lcl[0]);
    __m256 p0y = _mm256_load_ps(&py_lcl[0]);
    __m256 p0z = _mm256_load_ps(&pz_lcl[0]);
    __m256 p0m = _mm256_load_ps(&pm_lcl[0]);
    __m256 p0x = _mm256_i32gather_ps(px_lcl, idx_p0_even, 4);
    __m256 p0y = _mm256_i32gather_ps(py_lcl, idx_p0_even, 4);
    __m256 p0z = _mm256_i32gather_ps(pz_lcl, idx_p0_even, 4);
    __m256 p0m = _mm256_i32gather_ps(pm_lcl, idx_p0_even, 4);

    __m256 p1x = _mm256_loadu_ps(&px_lcl[1]);
    __m256 p1y = _mm256_loadu_ps(&py_lcl[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]);
    __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]);
    __m256 p1x = _mm256_i32gather_ps(px_lcl, idx_p1_even, 4);
    __m256 p1y = _mm256_i32gather_ps(py_lcl, idx_p1_even, 4);
    __m256 p1z = _mm256_i32gather_ps(pz_lcl, idx_p1_even, 4);
    __m256 p1m = _mm256_i32gather_ps(pm_lcl, idx_p1_even, 4);

    __m256 rest_len_vec = _mm256_i32gather_ps(&cns->ptc_rest_len[r_base], idx_rest_even, 4);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -742,26 +750,53 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
    dt_z = _mm256_and_ps(dt_z, apply_corr_mask);

    _mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x));
    _mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y));
    _mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z));

    _mm256_store_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x));
    _mm256_store_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y));
    _mm256_store_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z));
    __m256 new_p0x = _mm256_fmadd_ps(dt_x, p0m, p0x);
    __m256 new_p0y = _mm256_fmadd_ps(dt_y, p0m, p0y);
    __m256 new_p0z = _mm256_fmadd_ps(dt_z, p0m, p0z);

    __m256 new_p1x = _mm256_fnmadd_ps(dt_x, p1m, p1x);
    __m256 new_p1y = _mm256_fnmadd_ps(dt_y, p1m, p1y);
    __m256 new_p1z = _mm256_fnmadd_ps(dt_z, p1m, p1z);

    float temp_p0x[8] __attribute__((aligned(32)));
    float temp_p0y[8] __attribute__((aligned(32)));
    float temp_p0z[8] __attribute__((aligned(32)));
    float temp_p1x[8] __attribute__((aligned(32)));
    float temp_p1y[8] __attribute__((aligned(32)));
    float temp_p1z[8] __attribute__((aligned(32)));

    _mm256_store_ps(temp_p0x, new_p0x);
    _mm256_store_ps(temp_p0y, new_p0y);
    _mm256_store_ps(temp_p0z, new_p0z);
    _mm256_store_ps(temp_p1x, new_p1x);
    _mm256_store_ps(temp_p1y, new_p1y);
    _mm256_store_ps(temp_p1z, new_p1z);

    for (int l = 0; l < 8; ++l) {
    int idx_p0 = l * 2;
    px_lcl[idx_p0] = temp_p0x[l];
    py_lcl[idx_p0] = temp_p0y[l];
    pz_lcl[idx_p0] = temp_p0z[l];

    int idx_p1 = l * 2 + 1;
    px_lcl[idx_p1] = temp_p1x[l];
    py_lcl[idx_p1] = temp_p1y[l];
    pz_lcl[idx_p1] = temp_p1z[l];
    }
    }
    // Second block (i=8 to 14)
    // Color 1: odd constraints (1-2, 3-4, ..., 13-14)
    {
    __m256 p0x = _mm256_load_ps(&px_lcl[8]);
    __m256 p0y = _mm256_load_ps(&py_lcl[8]);
    __m256 p0z = _mm256_load_ps(&pz_lcl[8]);
    __m256 p0m = _mm256_load_ps(&pm_lcl[8]);
    __m256 p0x = _mm256_i32gather_ps(px_lcl, idx_p0_odd, 4);
    __m256 p0y = _mm256_i32gather_ps(py_lcl, idx_p0_odd, 4);
    __m256 p0z = _mm256_i32gather_ps(pz_lcl, idx_p0_odd, 4);
    __m256 p0m = _mm256_i32gather_ps(pm_lcl, idx_p0_odd, 4);

    __m256 p1x = _mm256_i32gather_ps(px_lcl, idx_p1_odd, 4);
    __m256 p1y = _mm256_i32gather_ps(py_lcl, idx_p1_odd, 4);
    __m256 p1z = _mm256_i32gather_ps(pz_lcl, idx_p1_odd, 4);
    __m256 p1m = _mm256_i32gather_ps(pm_lcl, idx_p1_odd, 4);

    __m256 p1x = _mm256_loadu_ps(&px_lcl[9]);
    __m256 p1y = _mm256_loadu_ps(&py_lcl[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]);
    __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
    __m256 rest_len_vec = _mm256_i32gather_ps(&cns->ptc_rest_len[r_base], idx_rest_odd, 4);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -790,9 +825,6 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    __attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f };
    __m256 valid_mask = _mm256_load_ps(valid_mask_array);
    apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);

    __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
    @@ -807,13 +839,39 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
    dt_z = _mm256_and_ps(dt_z, apply_corr_mask);

    _mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x));
    _mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y));
    _mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z));

    _mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x));
    _mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y));
    _mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z));
    __m256 new_p0x = _mm256_fmadd_ps(dt_x, p0m, p0x);
    __m256 new_p0y = _mm256_fmadd_ps(dt_y, p0m, p0y);
    __m256 new_p0z = _mm256_fmadd_ps(dt_z, p0m, p0z);

    __m256 new_p1x = _mm256_fnmadd_ps(dt_x, p1m, p1x);
    __m256 new_p1y = _mm256_fnmadd_ps(dt_y, p1m, p1y);
    __m256 new_p1z = _mm256_fnmadd_ps(dt_z, p1m, p1z);

    float temp_p0x[8] __attribute__((aligned(32)));
    float temp_p0y[8] __attribute__((aligned(32)));
    float temp_p0z[8] __attribute__((aligned(32)));
    float temp_p1x[8] __attribute__((aligned(32)));
    float temp_p1y[8] __attribute__((aligned(32)));
    float temp_p1z[8] __attribute__((aligned(32)));

    _mm256_store_ps(temp_p0x, new_p0x);
    _mm256_store_ps(temp_p0y, new_p0y);
    _mm256_store_ps(temp_p0z, new_p0z);
    _mm256_store_ps(temp_p1x, new_p1x);
    _mm256_store_ps(temp_p1y, new_p1y);
    _mm256_store_ps(temp_p1z, new_p1z);

    for (int l = 1; l < 8; ++l) {
    int idx_p0 = (l - 1) * 2 + 1;
    px_lcl[idx_p0] = temp_p0x[l];
    py_lcl[idx_p0] = temp_p0y[l];
    pz_lcl[idx_p0] = temp_p0z[l];

    int idx_p1 = (l - 1) * 2 + 2;
    px_lcl[idx_p1] = temp_p1x[l];
    py_lcl[idx_p1] = temp_p1y[l];
    pz_lcl[idx_p1] = temp_p1z[l];
    }
    }
    }
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    @@ -958,4 +1016,4 @@ chn_sim_run(struct chn_sim *cns, float dt) {
    _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
    }
    944 changes: 0 additions & 944 deletions chain_arm.c
    @@ -1,944 +0,0 @@
    #include <arm_neon.h>
    #include <math.h>
    #include <assert.h>
    #include <string.h>
    #include <float.h>

    enum {
    MAX_CHNS = 64,
    PTC_PER_CHN = 16,
    SPH_PER_CHN = 8,
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 16,
    };
    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    float stretch_factor;
    float max_strain;
    float friction; // [0, 1]
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    };
    struct chn_sim {
    unsigned char free_idx_cnt;
    unsigned char free_idx[MAX_CHNS];
    struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));

    // chain global forces (world space gravity and wind)
    float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));

    // chain transformations (world space)
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    // particle positions (model space)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));

    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));

    // rest positions (model space)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };
    extern void
    chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 4) {
    vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f));
    vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f));
    }
    }
    extern int
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cns);
    assert(cfg);
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);

    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
    cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];

    cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->ptc_pos_x[p_idx + i] = 0.0f;
    cns->ptc_pos_y[p_idx + i] = 0.0f;
    cns->ptc_pos_z[p_idx + i] = 0.0f;
    }
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
    for (int i = cnt; i < CON_PER_CHN; ++i) {
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    }
    return chn;
    }
    extern void
    chn_sim_del(struct chn_sim *cns, int chn) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_idx = (chn * PTC_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    int c_idx = (chn * CON_PER_CHN);

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

    cns->chn_grav_x[chn] = 0.0f;
    cns->chn_grav_y[chn] = 0.0f;
    cns->chn_grav_z[chn] = 0.0f;

    cns->chn_wnd_x[chn] = 0.0f;
    cns->chn_wnd_y[chn] = 0.0f;
    cns->chn_wnd_z[chn] = 0.0f;

    cns->chn_pos_x[chn] = 0.0f;
    cns->chn_pos_y[chn] = 0.0f;
    cns->chn_pos_z[chn] = 0.0f;

    cns->chn_quat_x[chn] = 0.0f;
    cns->chn_quat_y[chn] = 0.0f;
    cns->chn_quat_z[chn] = 0.0f;
    cns->chn_quat_w[chn] = 1.0f;

    cns->chn_prev_pos_x[chn] = 0.0f;
    cns->chn_prev_pos_y[chn] = 0.0f;
    cns->chn_prev_pos_z[chn] = 0.0f;

    cns->chn_prev_quat_x[chn] = 0.0f;
    cns->chn_prev_quat_y[chn] = 0.0f;
    cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f));
    }
    // Spheres
    vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f));

    // Loop by 4 for NEON
    for (int i = 0; i < CON_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f));
    }
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = pos3[2];

    cns->chn_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];

    cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    assert(cns);
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_grav_x[chn] = grav3[0];
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];
    }
    extern void
    chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    assert(cns);
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_wnd_x[chn] = wind3[0];
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];
    }
    extern void
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(spheres);
    assert(cnt < MAX_SPH);

    int s_idx = (chn * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
    cns->sph_x[s_idx + i] = spheres[i*4+0];
    cns->sph_y[s_idx + i] = spheres[i*4+1];
    cns->sph_z[s_idx + i] = spheres[i*4+2];
    cns->sph_r[s_idx + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_idx + i] = 0;
    cns->sph_y[s_idx + i] = 0;
    cns->sph_z[s_idx + i] = 0;
    cns->sph_r[s_idx + i] = 0;
    }
    }
    extern void
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt < PTC_PER_CHN);

    int p_idx = (chn * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->motion_radius[p_idx + i] = radius[i];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->motion_radius[p_idx + i] = FLT_MAX;
    }
    }
    extern void
    chn_sim_run(struct chn_sim *cns, float dt) {
    // NEON constants
    const float32x4_t dt_vec = vdupq_n_f32(dt);
    const float32x4_t dt2_vec = vdupq_n_f32(dt * dt);
    const float32x4_t one_vec = vdupq_n_f32(1.0f);
    const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f);
    const float32x4_t eps_vec = vdupq_n_f32(1e-6f);
    const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f);
    const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f);
    const float32x4_t zero_vec = vdupq_n_f32(0.0f);
    const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt);
    const float32x4_t two_vec = vdupq_n_f32(2.0f);
    const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f);
    const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f);
    // NEON doesn't have a direct equivalent to _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))
    // for absolute mask. We'll use `vabsq_f32` where needed or reconstruct the logic.

    // Local arrays for PBD solver
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_lcl[PTC_PER_CHN] = 0.0f;
    py_lcl[PTC_PER_CHN] = 0.0f;
    pz_lcl[PTC_PER_CHN] = 0.0f;
    pm_lcl[PTC_PER_CHN] = 0.0f;

    // Stack arrays for precomputed data
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float vel_z[MAX_CHNS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));

    // Step 0: Precomputation
    for (int c = 0; c < MAX_CHNS; c += 4) {
    // Load chain quaternions
    float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
    float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
    float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
    float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    float32x4_t cqx = vnegq_f32(qx); // -qx
    float32x4_t cqy = vnegq_f32(qy); // -qy
    float32x4_t cqz = vnegq_f32(qz); // -qz

    // Compute local-space wind
    {
    float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]);
    float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]);
    float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
    float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz
    tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx
    ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy
    tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy
    tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
    float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
    res_x = vmlsq_f32(res_x, tz, qy);
res_x = vmlaq_f32(res_x, ty, qz); // res_x = tw*qx + tx*qw + ty*qz - tz*qy

    float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
    res_y = vmlsq_f32(res_y, tx, qz);
res_y = vmlaq_f32(res_y, tz, qx); // res_y = tw*qy + ty*qw + tz*qx - tx*qz

    float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
    res_z = vmlsq_f32(res_z, ty, qx);
res_z = vmlaq_f32(res_z, tx, qy); // res_z = tw*qz + tz*qw + tx*qy - ty*qx

    vst1q_f32(&chn_wnd_lcl_x[c], res_x);
    vst1q_f32(&chn_wnd_lcl_y[c], res_y);
    vst1q_f32(&chn_wnd_lcl_z[c], res_z);
    }
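// Scalar reference for the rotation above (one chain, unit quaternion q):
//   t  = conj(q) * (0, v)   -- Hamilton product with the pure quaternion (0, v)
//   v' = vec(t * q)         -- world-space v expressed in chain-local space
// i.e. v' = q^-1 * v * q.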
    // Compute local-space gravity
    {
    float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]);
    float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]);
    float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz);
    tx = vmlsq_f32(tx, cqz, vy);

    float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx);
    ty = vmlsq_f32(ty, cqx, vz);

    float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy);
    tz = vmlsq_f32(tz, cqy, vx);

    float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy));
    tw = vmlsq_f32(tw, cqz, vz);

    // Step 2: res_vec = (t * q)_vec
    float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
    res_x = vmlsq_f32(res_x, tz, qy);
    res_x = vmlaq_f32(res_x, ty, qz);

    float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
    res_y = vmlsq_f32(res_y, tx, qz);
    res_y = vmlaq_f32(res_y, tz, qx);

    float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
    res_z = vmlsq_f32(res_z, ty, qx);
    res_z = vmlaq_f32(res_z, tx, qy);

    vst1q_f32(&chn_grav_lcl_x[c], res_x);
    vst1q_f32(&chn_grav_lcl_y[c], res_y);
    vst1q_f32(&chn_grav_lcl_z[c], res_z);
    }
    // Compute linear velocity
    {
    float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]);
    float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]);
    float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]);

    float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]);
    float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]);
    float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]);

    float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec);
    float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec);
    float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec);

    vst1q_f32(&vel_x[c], vel_x_vec);
    vst1q_f32(&vel_y[c], vel_y_vec);
    vst1q_f32(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    {
// Chain quaternions (qx, qy, qz, qw) are still live from the loads above.

    float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]);
    float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]);
    float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]);
    float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]);

    // Step 1: Compute delta quaternion (to - from)
    float32x4_t dt_qx = vsubq_f32(qx, prev_qx);
    float32x4_t dt_qy = vsubq_f32(qy, prev_qy);
    float32x4_t dt_qz = vsubq_f32(qz, prev_qz);
    float32x4_t dt_qw = vsubq_f32(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    float32x4_t conj_qx = vnegq_f32(prev_qx);
    float32x4_t conj_qy = vnegq_f32(prev_qy);
    float32x4_t conj_qz = vnegq_f32(prev_qz);
    float32x4_t conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx);
    orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy);
    orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz);

    float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy);
    float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz);
    float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx);

    float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw);
    orient_qx = vaddq_f32(orient_qx, cross_x);
    float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw);
    orient_qy = vaddq_f32(orient_qy, cross_y);
    float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw);
    orient_qz = vaddq_f32(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx);
    dot = vmlaq_f32(dot, qy, prev_qy);
    dot = vmlaq_f32(dot, qz, prev_qz);

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // 1 if dot < 0, 0 otherwise
    float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false

    orient_qx = vmulq_f32(orient_qx, sign_selector);
    orient_qy = vmulq_f32(orient_qy, sign_selector);
    orient_qz = vmulq_f32(orient_qz, sign_selector);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
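// NOTE: the textbook finite-difference identity w = 2 * vec(dq * conj(q_prev)) / dt
// would give a 2/dt factor; the softer 1/(2*dt) used here (4x smaller) is
// assumed intentional, with the tunable angular_inertia gain absorbing scale.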
    // Inlined vrecip_f32_high_precision
    float32x4_t rcp_two_dt;
    rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec));
    rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
    rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));

    float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt);
    float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt);
    float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt);

    // Store results
    vst1q_f32(&ang_vel_x[c], ang_vel_x_vec);
    vst1q_f32(&ang_vel_y[c], ang_vel_y_vec);
    vst1q_f32(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    // Load precomputed velocities and inertia parameters
    float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]);
    float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]);
    float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]);

    float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]);
    float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]);
    float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]);

    float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia);
    float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia);
    float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
    float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
    float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
(void)p_ptc_inv_mass; // NOTE: loaded but not applied; the inertia deltas below move every particle, including pinned (inv_mass == 0) ones.

    // Linear inertia: v * dt * linear_inertia
    float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec);
    float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec);
    float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
    float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
    float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);

    ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec);
    ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec);
    ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
    float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
    float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);

    float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y);
    float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z);
    float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x);

    // Calculate displacement: (ω × (ω × r)) * dt^2
    float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec);
    float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec);
    float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec);
    float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec);
    float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec);

    // Total delta
    float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x);
    float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y);
    float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z);

    // Update positions
    vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x));
    vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y));
    vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z));
    }
    // Update previous transformation
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];

    cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
    cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
    // Step 2: Verlet Integration
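// Position-based Verlet step:
//   x_new = x + (x - x_prev) * damping + (F * inv_mass) * dt^2
// Velocity is implicit in the position history, so damping acts on (x - x_prev).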
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]);
    float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]);
    float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]);

    float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]);
    float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]);
    float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]);
    float32x4_t damping_vec = vdupq_n_f32(cfg->damping);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
    float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
    float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
    float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
    ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]);
    float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]);
    float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]);

    float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec);
    float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec);
    float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec);

    float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass);
    float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass);
    float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass);

    float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
    float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
    float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);

    float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec);
    float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec);
    float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec);

    float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec);
    float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec);
    float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec);

    float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x));
    float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y));
    float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z));

    vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x);
    vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y);
    vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z);

    vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
    vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
    vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
    }
    }
    // Step 3: Distance Constraints
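// Classic PBD distance projection per segment: for particles (p0, p1) with
// rest length L and inverse masses (w0, w1),
//   C  = |p1 - p0| - L,  n = (p1 - p0) / |p1 - p0|
//   dp = (C / (w0 + w1)) * k * n,  p0 += w0 * dp,  p1 -= w1 * dp
// where k is scaled down as strain grows (dyn_stiff below) to tame overshoot.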
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness);
    float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor);
    float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain);
    float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec);

    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i]));
    vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i]));
    vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i]));
    vst1q_f32(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Loop by 4 for NEON
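// NOTE: lanes i..i+3 solve constraints (i,i+1)..(i+3,i+4) at once, so the
// store to [i+1] below overlaps the store to [i] and the p1-side write wins
// for shared particles within a pass; the MAX_ITR outer iterations smooth
// this Jacobi-style overlap out. The final lane pairs the last particle
// with the zero-mass dummy slot; correctness there depends on how the last
// ptc_rest_len entry is initialized (the ISPC port masks that lane explicitly).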
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p0x = vld1q_f32(&px_lcl[i]);
    float32x4_t p0y = vld1q_f32(&py_lcl[i]);
    float32x4_t p0z = vld1q_f32(&pz_lcl[i]);
    float32x4_t p0m = vld1q_f32(&pm_lcl[i]);

    float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]);
    float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]);
    float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]);
    float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]);
    float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]);

    float32x4_t dx = vsubq_f32(p1x, p0x);
    float32x4_t dy = vsubq_f32(p1y, p0y);
    float32x4_t dz = vsubq_f32(p1z, p0z);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist;
    float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec);
    inv_dist = vrsqrteq_f32(x_rsqrt);
    inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
    inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));

    float32x4_t w_sum = vaddq_f32(p0m, p1m);
    float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec);

    float32x4_t rcp_w_sum;
    rcp_w_sum = vrecpeq_f32(w_sum_clamped);
    rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
    rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));


    float32x4_t dist;
    dist = vrecpeq_f32(inv_dist);
    dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
    dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));

    float32x4_t diff = vsubq_f32(dist, rest_len_vec);
    float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec);

    float32x4_t rcp_max_rest_len_eps;
    rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps);
    rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
    rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
    float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps);
    float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs));

    float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped));
    float32x4_t rcp_dyn_stiff_denom;
    rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom);
    rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
    rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
    float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom);

    float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec);
    corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum);
    float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part);

    uint32x4_t apply_corr_mask = vcgtq_f32(dist_sq, eps_vec); // 1 if dist_sq > eps_vec, 0 otherwise

    float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist);
    float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist);
    float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist);

    float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude);
    float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude);
    float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude);

    dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0
    dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec);
    dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec);

    vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m));
    vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m));
    vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m));

    vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m));
    vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m));
    vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m));
    }
    }
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i]));
    vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i]));
    vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i]));
    }
    }
    // Step 4: Sphere Collisions with Friction
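// Each sphere pushes penetrating particles back to the surface along the
// contact normal and removes a friction-scaled share of this frame's
// tangential motion (Coulomb-like, proportional to penetration depth).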
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t fric_vec = vdupq_n_f32(cfg->friction);

    for (int s = 0; s < SPH_PER_CHN; ++s) {
    float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]);
    float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]);
    float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]);
    float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
    float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
    float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]);

    float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]);
    float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]);
    float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]);
    float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec);
    float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec);
    float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist_sphere;
    float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec);
    inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere);
    inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
    inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
    float32x4_t inv_dist = inv_dist_sphere;

    float32x4_t dist_sphere;
    dist_sphere = vrecpeq_f32(inv_dist);
    dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
    dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
    float32x4_t dist = dist_sphere;

    float32x4_t pen = vsubq_f32(sph_r_vec, dist);
    uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise

    float32x4_t norm_x = vmulq_f32(dx, inv_dist);
    float32x4_t norm_y = vmulq_f32(dy, inv_dist);
    float32x4_t norm_z = vmulq_f32(dz, inv_dist);

    float32x4_t norm_corr_x = vmulq_f32(norm_x, pen);
    float32x4_t norm_corr_y = vmulq_f32(norm_y, pen);
    float32x4_t norm_corr_z = vmulq_f32(norm_z, pen);

    float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
    float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
    float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);

    float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x);
    vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y);
    vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z);

    float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x);
    float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y);
    float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z);

    float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x);
    tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y);
    tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z);

    float32x4_t inv_tan_mag_sphere;
    float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec);
    inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag);
    inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
    inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
    float32x4_t inv_tan_mag = inv_tan_mag_sphere;

    float32x4_t fric_mag = vmulq_f32(pen, fric_vec);
    float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag);
    float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag);
    float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag);

    float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x);
    float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y);
    float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z);

    total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec);
    total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec);
    total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec);

    total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass);
    total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass);
    total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass);

    float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x);
    float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y);
    float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z);

    vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x);
    vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y);
    vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z);
    }
    }
    }
    // Step 5: Motion Constraints
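// Clamps each particle to a sphere of motion_radius around its rest pose;
// FLT_MAX radii (see chn_sim_con_motion) leave particles effectively free.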
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
    float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
    float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
    float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]);
    float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]);
    float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]);
    float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]);

    float32x4_t dx = vsubq_f32(px, rx);
    float32x4_t dy = vsubq_f32(py, ry);
    float32x4_t dz = vsubq_f32(pz, rz);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist_motion;
    float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec);
    inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion);
    inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
    inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
    float32x4_t inv_dist = inv_dist_motion;

    float32x4_t dist_motion;
    dist_motion = vrecpeq_f32(inv_dist);
    dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
    dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
    float32x4_t dist = dist_motion;

    float32x4_t pen = vsubq_f32(dist, r_vec);
    uint32x4_t mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise

    float32x4_t corr_factor = vmulq_f32(pen, inv_dist);
    corr_factor = vbslq_f32(mask, corr_factor, zero_vec);

    float32x4_t dt_x = vmulq_f32(dx, corr_factor);
    float32x4_t dt_y = vmulq_f32(dy, corr_factor);
    float32x4_t dt_z = vmulq_f32(dz, corr_factor);

    dt_x = vmulq_f32(dt_x, pm);
    dt_y = vmulq_f32(dt_y, pm);
    dt_z = vmulq_f32(dt_z, pm);

    vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x));
    vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y));
    vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z));
    }
    }
    }
    637 changes: 0 additions & 637 deletions chain_ispc.ispc
    @@ -1,637 +0,0 @@
    #define MAX_CHAINS 64
    #define PARTICLES_PER_CHAIN 16
    #define SPHERES_PER_CHAIN 8
    #define MAX_PARTICLES (MAX_CHAINS * PARTICLES_PER_CHAIN)
    #define MAX_SPHERES (MAX_CHAINS * SPHERES_PER_CHAIN)
    #define CONSTRAINTS_PER_CHAIN PARTICLES_PER_CHAIN
    #define MAX_REST_LENGTHS (MAX_CHAINS * CONSTRAINTS_PER_CHAIN)
    #define MAX_ITERATIONS 8

    // Define chain configuration structure
    struct chain_cfg {
    uniform float damping;
    uniform float stiffness;
    uniform float stretch_factor;
    uniform float max_strain;
    uniform float friction;
    uniform float collision_radius;
    uniform float linear_inertia; // [0, 1]
    uniform float angular_inertia; // [0, 1]
    uniform float centrifugal_inertia;// [0, 1]
    };

    // Define chain simulation structure
    struct chain_sim {
    uint8 free_idx_cnt;
    uint8 free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS];

    // Chain global forces (world space gravity and wind)
    uniform float gravity_x[MAX_CHAINS];
    uniform float gravity_y[MAX_CHAINS];
    uniform float gravity_z[MAX_CHAINS];

    uniform float wind_x[MAX_CHAINS];
    uniform float wind_y[MAX_CHAINS];
    uniform float wind_z[MAX_CHAINS];

    // Chain transformations (world space)
    uniform float chain_pos_x[MAX_CHAINS];
    uniform float chain_pos_y[MAX_CHAINS];
    uniform float chain_pos_z[MAX_CHAINS];

    uniform float chain_quat_x[MAX_CHAINS];
    uniform float chain_quat_y[MAX_CHAINS];
    uniform float chain_quat_z[MAX_CHAINS];
    uniform float chain_quat_w[MAX_CHAINS];

    uniform float prev_chain_pos_x[MAX_CHAINS];
    uniform float prev_chain_pos_y[MAX_CHAINS];
    uniform float prev_chain_pos_z[MAX_CHAINS];

    uniform float prev_chain_quat_x[MAX_CHAINS];
    uniform float prev_chain_quat_y[MAX_CHAINS];
    uniform float prev_chain_quat_z[MAX_CHAINS];
    uniform float prev_chain_quat_w[MAX_CHAINS];

    // Particle positions (model space)
    uniform float p_pos_x[MAX_PARTICLES];
    uniform float p_pos_y[MAX_PARTICLES];
    uniform float p_pos_z[MAX_PARTICLES];

    uniform float prev_p_pos_x[MAX_PARTICLES];
    uniform float prev_p_pos_y[MAX_PARTICLES];
    uniform float prev_p_pos_z[MAX_PARTICLES];

    uniform float inv_mass[MAX_PARTICLES];
    uniform float rest_lengths[MAX_REST_LENGTHS];

    // Sphere positions (model space)
    uniform float sphere_x[MAX_SPHERES];
    uniform float sphere_y[MAX_SPHERES];
    uniform float sphere_z[MAX_SPHERES];
    uniform float sphere_radius[MAX_SPHERES];

    // Rest positions (model space)
    uniform float rest_pos_x[MAX_PARTICLES];
    uniform float rest_pos_y[MAX_PARTICLES];
    uniform float rest_pos_z[MAX_PARTICLES];
    uniform float motion_radius[MAX_PARTICLES];
    };

    // ISPC task for chain simulation
    export void simulate_chains(
    __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr,
    __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs
    __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr,
    uniform float dt) {
    // Common SIMD constants (varying for vector operations across lanes)
    varying float dt_vec = dt;
    varying float dt2_vec = dt * dt;
    varying float one_vec = 1.0;
    varying float neg_one_vec = -1.0;
    varying float eps_vec = 1e-6;
    varying float inv_mass_clamp_min = 0.0;
    varying float inv_mass_clamp_max = 1e6;
    varying float zero_vec = 0.0;
    varying float inv_dt_vec = 1.0 / dt;
    varying float two_vec = 2.0;
    varying float inertia_clamp_min = 0.0;
    varying float inertia_clamp_max = 1.0;

    assume(MAX_CHAINS % programCount == 0);
    assume(PARTICLES_PER_CHAIN % programCount == 0);
    assume(SPHERES_PER_CHAIN % programCount == 0);
    assume(MAX_SPHERES % programCount == 0);
    assume(CONSTRAINTS_PER_CHAIN % programCount == 0);
    assume(MAX_REST_LENGTHS % programCount == 0);

    // Local arrays for PBD solver
    __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1];

    // All lanes can safely write to these uniform locations.
    px_local[PARTICLES_PER_CHAIN] = 0.0;
    py_local[PARTICLES_PER_CHAIN] = 0.0;
    pz_local[PARTICLES_PER_CHAIN] = 0.0;
    pm_local[PARTICLES_PER_CHAIN] = 0.0;
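// The extra [PARTICLES_PER_CHAIN] slot is a zero-mass sentinel so the
// [i + programIndex + 1] accesses in the distance-constraint loop stay in
// bounds on the last lane.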

    // Stack arrays for precomputed data
    __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];

    // Step 0: Precomputation (SIMD, process chains in groups of programCount)
    for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
    // Load chain quaternions
    varying float qx = chain_quat_x_ptr[c + programIndex];
    varying float qy = chain_quat_y_ptr[c + programIndex];
    varying float qz = chain_quat_z_ptr[c + programIndex];
    varying float qw = chain_quat_w_ptr[c + programIndex];

    // Compute local-space wind
    {
    varying float vx = wind_x_ptr[c + programIndex];
    varying float vy = wind_y_ptr[c + programIndex];
    varying float vz = wind_z_ptr[c + programIndex];

    // Create q_conjugate components
    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    // Step 1: t = q_conj * v_world_as_quat
    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Step 2: result_vec = (t * q)_vec
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    wind_local_x[c + programIndex] = result_x;
    wind_local_y[c + programIndex] = result_y;
    wind_local_z[c + programIndex] = result_z;
    }

    // Compute local-space gravity
    {
    varying float vx = gravity_x_ptr[c + programIndex];
varying float vy = gravity_y_ptr[c + programIndex];
    varying float vz = gravity_z_ptr[c + programIndex];

    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

varying float tx = qw * vx + cqy * vz - cqz * vy;
varying float ty = qw * vy + cqz * vx - cqx * vz;
varying float tz = qw * vz + cqx * vy - cqy * vx;
varying float tw = qx * vx + qy * vy + qz * vz;

// Step 2: result_vec = (t * q)_vec
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    gravity_local_x[c + programIndex] = result_x;
    gravity_local_y[c + programIndex] = result_y;
    gravity_local_z[c + programIndex] = result_z;
    }

    // Compute linear velocity
    {
    varying float curr_x = chain_pos_x_ptr[c + programIndex];
    varying float curr_y = chain_pos_y_ptr[c + programIndex];
    varying float curr_z = chain_pos_z_ptr[c + programIndex];
    varying float prev_x = prev_chain_pos_x_ptr[c + programIndex];
    varying float prev_y = prev_chain_pos_y_ptr[c + programIndex];
    varying float prev_z = prev_chain_pos_z_ptr[c + programIndex];

    varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
    varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
    varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;
    vel_x[c + programIndex] = vel_x_vec;
    vel_y[c + programIndex] = vel_y_vec;
    vel_z[c + programIndex] = vel_z_vec;
    }

    // Compute angular velocity
    {
    varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex];
    varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex];
    varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex];
    varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex];

    // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation
    varying float delta_qx = qx - prev_qx;
    varying float delta_qy = qy - prev_qy;
    varying float delta_qz = qz - prev_qz;
    varying float delta_qw = qw - prev_qw;

    // Step 2: Compute conjugate of previous quaternion
    varying float conj_qx = -prev_qx;
    varying float conj_qy = -prev_qy;
    varying float conj_qz = -prev_qz;
    varying float conj_qw = prev_qw;
    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;
    varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
    varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
    varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;
    varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
    varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
    varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;
    // Step 4: Compute dot product (to, from) for shortest arc check
    varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;
    // Step 5: Negate orientation delta if dot < 0
    varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
    orient_qx *= negate_sign;
    orient_qy *= negate_sign;
    orient_qz *= negate_sign;

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    varying float scale = one_vec / (two_vec * dt_vec);
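// NOTE: as in the NEON path, the textbook finite-difference identity
// w = 2 * vec(dq * conj(q_prev)) / dt would give a 2/dt factor here; the
// softer 1/(2*dt) is assumed intentional and folded into angular_inertia.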
    varying float ang_vel_x_vec = orient_qx * scale;
    varying float ang_vel_y_vec = orient_qy * scale;
    varying float ang_vel_z_vec = orient_qz * scale;

    ang_vel_x[c + programIndex] = ang_vel_x_vec;
    ang_vel_y[c + programIndex] = ang_vel_y_vec;
    ang_vel_z[c + programIndex] = ang_vel_z_vec;
    }
    }

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load velocities and inertia (scalar, broadcast to lanes)
    varying float vel_x_vec = vel_x[c];
    varying float vel_y_vec = vel_y[c];
    varying float vel_z_vec = vel_z[c];

    varying float ang_vel_x_vec = ang_vel_x[c];
    varying float ang_vel_y_vec = ang_vel_y[c];
    varying float ang_vel_z_vec = ang_vel_z[c];

    varying float linear_inertia_vec = cfg->linear_inertia;
    varying float angular_inertia_vec = cfg->angular_inertia;
    varying float centrifugal_inertia_vec = cfg->centrifugal_inertia;

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max);

    // Process particles in groups
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[base_idx + i + programIndex];
    varying float py = p_pos_y_ptr[base_idx + i + programIndex];
    varying float pz = p_pos_z_ptr[base_idx + i + programIndex];
    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    // Linear inertia: v * dt * linear_inertia
    varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec;

    // Angular inertia: (ω × r) * dt * angular_inertia
    varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px;

    ang_delta_x *= dt_vec * angular_inertia_vec;
    ang_delta_y *= dt_vec * angular_inertia_vec;
    ang_delta_z *= dt_vec * angular_inertia_vec;

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px;
    varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y;
    varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z;
    varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x;

    varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec;

    // Total delta
    varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x;
    varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y;
    varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z;

    // Update positions
    p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x;
    p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y;
    p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z;
    }

    // Update previous transformation (scalar)
    prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c];
    prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c];
    prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c];

    prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c];
    prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c];
    prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c];
    prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c];
    }

    // Step 2: Verlet Integration
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load forces and damping (scalar, broadcast to lanes)
    varying float gravity_x_vec = gravity_local_x[c];
    varying float gravity_y_vec = gravity_local_y[c];
    varying float gravity_z_vec = gravity_local_z[c];

    varying float wind_x_vec = wind_local_x[c];
    varying float wind_y_vec = wind_local_y[c];
    varying float wind_z_vec = wind_local_z[c];

    varying float damping_vec = cfg->damping;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex];

    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex];

    // Compute forces and accelerations
    varying float force_x = gravity_x_vec + wind_x_vec;
    varying float force_y = gravity_y_vec + wind_y_vec;
    varying float force_z = gravity_z_vec + wind_z_vec;

    varying float acc_x = force_x * p_inv_mass;
    varying float acc_y = force_y * p_inv_mass;
    varying float acc_z = force_z * p_inv_mass;

    // Compute velocity with damping
    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;

    varying float damped_vel_x = v_x * damping_vec;
    varying float damped_vel_y = v_y * damping_vec;
    varying float damped_vel_z = v_z * damping_vec;

    // Update positions: p_curr + damped_vel + acc * dt^2
    varying float term_accel_x = acc_x * dt2_vec;
    varying float term_accel_y = acc_y * dt2_vec;
    varying float term_accel_z = acc_z * dt2_vec;

    p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
    p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
    p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;

    // Update previous positions
    prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x;
    prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y;
    prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z;
    }
    }

    // Step 3: Distance Constraints
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    varying float stiffness_vec = cfg->stiffness;
    varying float stretch_factor_vec = cfg->stretch_factor;
    varying float max_strain_vec_abs = cfg->max_strain;
    varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec;

    // Copy particles to local arrays with uniform indexing
    // This loop now performs a vectorized load into the uniform px_local array.
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from global memory into varying registers
    varying float current_px = p_pos_x_ptr[p_base + i + programIndex];
    varying float current_py = p_pos_y_ptr[p_base + i + programIndex];
    varying float current_pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    // Store these varying values into the uniform local array
    // The compiler can now see this as a series of vector stores
    // because px_local is uniform.
    foreach_active (idx) {
    px_local[i + programIndex] = current_px;
    py_local[i + programIndex] = current_py;
    pz_local[i + programIndex] = current_pz;
    pm_local[i + programIndex] = current_pm;
    }
    }

    // Iterate over constraints
    for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2)
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) {
    // Now, load directly from uniform px_local using varying programIndex
    // The compiler can optimize these into vector loads.
    varying float p0x = px_local[i + programIndex];
    varying float p0y = py_local[i + programIndex];
    varying float p0z = pz_local[i + programIndex];
    varying float p0m = pm_local[i + programIndex];

// For p1 the access is shifted by one lane; on the last lane of the last
// group it reads index PARTICLES_PER_CHAIN, i.e. the zero-mass dummy slot,
// and the mask below zeroes that lane's correction.
    varying float p1x = px_local[i + programIndex + 1];
    varying float p1y = py_local[i + programIndex + 1];
    varying float p1z = pz_local[i + programIndex + 1];
    varying float p1m = pm_local[i + programIndex + 1];

    varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);
    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));
    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);
    varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;
    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);
    // Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle)
    apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0);

    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

    // Apply corrections, writing back to uniform px_local
    // These are now vector stores.
    px_local[i + programIndex] -= delta_x * p0m;
    py_local[i + programIndex] -= delta_y * p0m;
    pz_local[i + programIndex] -= delta_z * p0m;

    px_local[i + programIndex + 1] += delta_x * p1m;
    py_local[i + programIndex + 1] += delta_y * p1m;
    pz_local[i + programIndex + 1] += delta_z * p1m;
    }
    }
    // Write back to global arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from uniform local array into varying registers
    varying float updated_px = px_local[i + programIndex];
    varying float updated_py = py_local[i + programIndex];
    varying float updated_pz = pz_local[i + programIndex];

    // Store these varying values into the global memory
    foreach_active (idx) {
    p_pos_x_ptr[p_base + i + programIndex] = updated_px;
    p_pos_y_ptr[p_base + i + programIndex] = updated_py;
    p_pos_z_ptr[p_base + i + programIndex] = updated_pz;
    }
    }
    }
    // Step 4: Sphere Collisions with Friction
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int s_base = c * SPHERES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
    varying float friction_vec = cfg->friction;
    for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) {
    varying float sphere_x_vec = sphere_x_ptr[s_base + s];
    varying float sphere_y_vec = sphere_y_ptr[s_base + s];
    varying float sphere_z_vec = sphere_z_ptr[s_base + s];
    varying float sphere_r_vec = sphere_radius_ptr[s_base + s];

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex];

    varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex];
    varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float dx = p_curr_x - sphere_x_vec;
    varying float dy = p_curr_y - sphere_y_vec;
    varying float dz = p_curr_z - sphere_z_vec;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = sphere_r_vec - dist;
    varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float normal_x = dx * inv_dist;
    varying float normal_y = dy * inv_dist;
    varying float normal_z = dz * inv_dist;

    varying float normal_corr_x = normal_x * penetration;
    varying float normal_corr_y = normal_y * penetration;
    varying float normal_corr_z = normal_z * penetration;

    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;

    varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z;
    varying float tangent_x = v_x - vel_dot_normal * normal_x;
    varying float tangent_y = v_y - vel_dot_normal * normal_y;
    varying float tangent_z = v_z - vel_dot_normal * normal_z;

    varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z;
    varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec));

    varying float friction_mag = penetration * friction_vec;
    varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag;
    varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag;
    varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag;

    varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass;
    varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass;
    varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass;

    p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x;
    p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y;
    p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z;
    }
    }
    }
    // Step 5: Motion Constraints
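    // Each particle is tethered to its rest pose: if it drifts further
    // than motion_radius from rest_pos, it is projected back onto the
    // sphere of that radius, scaled by its clamped inverse mass.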
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[p_base + i + programIndex];
    varying float py = p_pos_y_ptr[p_base + i + programIndex];
    varying float pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float rx = rest_pos_x_ptr[p_base + i + programIndex];
    varying float ry = rest_pos_y_ptr[p_base + i + programIndex];
    varying float rz = rest_pos_z_ptr[p_base + i + programIndex];
    varying float radius_vec = motion_radius_ptr[p_base + i + programIndex];

    varying float dx = px - rx;
    varying float dy = py - ry;
    varying float dz = pz - rz;
    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = dist - radius_vec;
    varying float mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float corr_factor = penetration * inv_dist * mask;
    varying float delta_x = dx * corr_factor * pm;
    varying float delta_y = dy * corr_factor * pm;
    varying float delta_z = dz * corr_factor * pm;

    p_pos_x_ptr[p_base + i + programIndex] = px - delta_x;
    p_pos_y_ptr[p_base + i + programIndex] = py - delta_y;
    p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z;
    }
    }
    }
  11. vurtun revised this gist Jul 13, 2025. 1 changed file with 0 additions and 1064 deletions.
    1,064 changes: 0 additions & 1,064 deletions chain_avbd.c
    @@ -1,1064 +0,0 @@
    #include <immintrin.h>
#include <math.h>
#include <float.h> // FLT_MAX, used to seed running-max reductions

    enum {
    MAX_CHAINS = 64,
    PARTICLES_PER_CHAIN = 16,
    SPHERES_PER_CHAIN = 8,
    MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
    MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
    CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
    MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
MAX_ITERATIONS = 4,
VEC_FLOAT_COUNT = 8, // floats per AVX2 register
    };
    struct chain_cfg {
    float damping;
    float friction;
    float collision_radius;
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force

    float avbd_alpha; // Regularization parameter (e.g., 0.95f)
    float avbd_beta; // Stiffness ramping control (e.g., 10.0f)
    float avbd_gamma; // Warm starting scaling (e.g., 0.99f)
    float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f)
    float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f)
    } __attribute__((aligned(32)));
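/* A minimal sketch (not part of the original gist) of filling a
 * chain_cfg with the reference AVBD values suggested in the field
 * comments above; the helper name and the non-AVBD values are
 * assumptions, shown only to illustrate the expected ranges. */
static inline void chain_cfg_defaults(struct chain_cfg *cfg) {
    cfg->damping = 0.98f; // [0, 1]
    cfg->friction = 0.3f; // [0, 1]
    cfg->collision_radius = 0.05f; // assumed, scene-dependent
    cfg->linear_inertia = 1.0f; // physically correct linear motion
    cfg->angular_inertia = 1.0f; // physically correct rotation
    cfg->centrifugal_inertia = 1.0f; // physically correct centrifugal force
    cfg->avbd_alpha = 0.95f; // regularization
    cfg->avbd_beta = 10.0f; // stiffness ramping
    cfg->avbd_gamma = 0.99f; // warm-starting decay
    cfg->avbd_k_start = 100.0f; // initial stiffness
    cfg->avbd_k_max = 1e6f; // stiffness ceiling
}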

    struct chain_sim {
    unsigned short free_idx_cnt;
    unsigned short free_idx[MAX_CHAINS];
    struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain global forces (world space gravity and wind)
    float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));

    float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain transformations (world space)
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    // Particle positions (model space)
// +1 padding: the final-error pass loads 8 floats starting at
// p_base + i + 1, which reads one element past the last particle of
// the last chain.
float p_pos_x[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float p_pos_y[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float p_pos_z[MAX_PARTICLES + 1] __attribute__((aligned(32)));
    float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));

    float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));

    float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32)));

    // Sphere positions (model space)
    float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));

    float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32)));
    float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32)));
    float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32)));

    // Rest positions (model space)
    float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
    };
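/* Layout note: the state is a structure-of-arrays, each array 32-byte
 * aligned so a block of 8 chains or 8 particles maps directly onto one
 * AVX2 register without gather or shuffle work. */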

    void simulate_chains(struct chain_sim *cs, float dt) {
    // Common SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    const __m256 one_vec = _mm256_set1_ps(1.0f);
    const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
    const __m256 eps_vec = _mm256_set1_ps(1e-6f);
    const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 neg_flt_max_vec = _mm256_set1_ps(-FLT_MAX); // seed for running-max reductions

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0f;
    py_local[PARTICLES_PER_CHAIN] = 0.0f;
    pz_local[PARTICLES_PER_CHAIN] = 0.0f;
    pm_local[PARTICLES_PER_CHAIN] = 0.0f;

    // Stack arrays for precomputed data
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
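    // World-space gravity and wind are rotated into each chain's local
    // frame via the conjugate rotation v' = q^-1 * v * q (conjugate ==
    // inverse for unit quaternions), and per-chain linear/angular
    // velocities are estimated by finite-differencing the current and
    // previous frame transforms.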
    for (int c = 0; c < MAX_CHAINS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    // Compute local-space wind
    {
    __m256 vx = _mm256_load_ps(&cs->wind_x[c]);
    __m256 vy = _mm256_load_ps(&cs->wind_y[c]);
    __m256 vz = _mm256_load_ps(&cs->wind_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&wind_local_x[c], result_x);
    _mm256_store_ps(&wind_local_y[c], result_y);
    _mm256_store_ps(&wind_local_z[c], result_z);
    }
    // Compute local-space gravity
    {
    __m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
    __m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
    __m256 vz = _mm256_load_ps(&cs->gravity_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&gravity_local_x[c], result_x);
    _mm256_store_ps(&gravity_local_y[c], result_y);
    _mm256_store_ps(&gravity_local_z[c], result_z);
    }
    // Compute linear velocity
    {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);

    __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
    __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
    __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);

    _mm256_store_ps(&vel_x[c], vel_x_vec);
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);

    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);

    // Step 1: Compute delta quaternion (to - from)
    __m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 delta_qw = _mm256_sub_ps(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
    __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
    __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
    __m256 conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);

    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));

    __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
    orient_qx = _mm256_add_ps(orient_qx, cross_x);
    __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
    orient_qy = _mm256_add_ps(orient_qy, cross_y);
    __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
    orient_qz = _mm256_add_ps(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
    dot = _mm256_fmadd_ps(qy, prev_qy, dot);
    dot = _mm256_fmadd_ps(qz, prev_qz, dot);

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
    orient_qz = _mm256_mul_ps(orient_qz, negate_sign);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);

    // Store results
    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
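    // Per particle, the position delta is
    //   v*dt*linear + (w x r)*dt*angular + (w x (w x r))*dt^2*centrifugal,
    // with each term scaled by its clamped [0, 1] inertia factor, so a
    // chain can be tuned anywhere between "glued to its root transform"
    // and fully physical.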
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];

    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);

    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);

    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);

    // Linear inertia: v * dt * linear_inertia
    __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
    ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
    ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
    __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    // Calculate displacement: (ω × (ω × r)) * dt^2
    __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
    __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
    __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);

    // Total delta
    __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
    __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
    __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);

    // Update positions
    _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
    }
    // Update previous transformation
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    }
    // Step 2: Verlet Integration
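    // Position Verlet: x' = x + (x - x_prev)*damping + (F*inv_mass)*dt^2,
    // where F is the chain's local-space gravity plus wind.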
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];

    __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
    __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
    __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);

    __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
    __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
    __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);

    __m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
    __m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
    __m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);

    __m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
    __m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
    __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
    __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);

    __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
    __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
    __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);

    _mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
    }
    }

    // --- AVBD Warm-starting of lambda and k_current for all distance constraints.
    for (int c = 0; c < MAX_CHAINS; ++c) {
    struct chain_cfg* cfg = &cs->chain_configs[c];
    __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
    __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);

    // Loop through each block of 8 constraints for warm-starting
    int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints
    for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) {
    // Load current lambda, k_current for 8 constraints simultaneously
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]);
    // Apply warm-starting (Eq. 19 from paper)
    // lambda_new = lambda_prev_frame * alpha * gamma
    current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
    // k_current_new = max(K_START, k_current_prev_frame * gamma)
    current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
    // Store updated warm-started values back
    _mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec);
    }
    }
    // Warm-starting for Collision Constraints ---
    for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
    struct chain_cfg* cfg = &cs->chain_configs[c_idx];
    __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
    __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);

    int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
    __m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
    __m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);

    current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
    current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));

    _mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec);
    _mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec);
    }
    }
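    // Step 3: AVBD Distance Constraints (primal + dual update per iteration)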
    for (int c = 0; c < MAX_CHAINS; c++) {
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta);
    __m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max);

    int p_base = c * PARTICLES_PER_CHAIN;
    int r_base = c * CONSTRAINTS_PER_CHAIN;
    // Load all current particle positions into local buffer (once per chain per iteration)
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    _mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i]));
    _mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i]));
    _mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i]));
    _mm256_store_ps(&pm_local[i], _mm256_load_ps(&cs->inv_mass[p_base + i]));
    _mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&pm_local[i]), inv_mass_clamp_max), inv_mass_clamp_min));
    }
    for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // First block (i=0 to 7 of particles, processing constraints 0 to 7)
    {
    // p0 is aligned from px_local[0]
    __m256 p0x = _mm256_load_ps(&px_local[0]);
    __m256 p0y = _mm256_load_ps(&py_local[0]);
    __m256 p0z = _mm256_load_ps(&pz_local[0]);
    __m256 p0m = _mm256_load_ps(&pm_local[0]);

    // p1 is unaligned from px_local[1]
    __m256 p1x = _mm256_loadu_ps(&px_local[1]);
    __m256 p1y = _mm256_loadu_ps(&py_local[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[1]);

    // This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7]
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);


    // Calculate current vector and distance between particles
    __m256 dx_vec = _mm256_sub_ps(p1x, p0x);
    __m256 dy_vec = _mm256_sub_ps(p1y, p0y);
    __m256 dz_vec = _mm256_sub_ps(p1z, p0z);

    __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
    dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist
    __m256 dist = _mm256_rcp_ps(inv_dist); // Current distance

    // Primal Update for Distance Constraints (Block 1) ---
    // Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7)
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]);
    __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]);

    // Original constraint error C_j*(x) = current_dist - target_length
    __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);

    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
    __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));

    // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
    // The effective displacement is proportional to this force divided by effective stiffness (k) and mass.
    __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);

    // Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
    // This is simplified, representing the desired displacement due to the augmented force
    __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor

    // Normalize difference vector
    __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);

    // Calculate the actual displacement vectors
    // Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1
    // The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction.
    // So, we just multiply by the unit direction and inverse mass.
    __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);

    // Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0)
    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);

    // Update positions in local buffer
    // p0 moves along -delta (subtract), p1 moves along +delta (add)
    _mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
    _mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
    _mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));

    _mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
    _mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
    _mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));

    // Dual Update for Distance Constraints (Block 1) ---
    // Recalculate constraint error *after* position updates for accurate lambda update
    // Reload updated positions from local buffer to get the 'post-primal' state
    p0x = _mm256_load_ps(&px_local[0]);
    p0y = _mm256_load_ps(&py_local[0]);
    p0z = _mm256_load_ps(&pz_local[0]);
    p1x = _mm256_loadu_ps(&px_local[1]);
    p1y = _mm256_loadu_ps(&py_local[1]);
    p1z = _mm256_loadu_ps(&pz_local[1]);

    __m256 updated_dx = _mm256_sub_ps(p1x, p0x);
    __m256 updated_dy = _mm256_sub_ps(p1y, p0y);
    __m256 updated_dz = _mm256_sub_ps(p1z, p0z);

    __m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
    updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
    updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);

    __m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
    __m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);

    __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update

    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
    _mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
    __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
    __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back
    }
// Second block (particles 8 to 15, processing constraints 8 to 15; the last lane is masked out below)
    {
    // p0 is aligned from px_local[8]
    __m256 p0x = _mm256_load_ps(&px_local[8]);
    __m256 p0y = _mm256_load_ps(&py_local[8]);
    __m256 p0z = _mm256_load_ps(&pz_local[8]);
    __m256 p0m = _mm256_load_ps(&pm_local[8]);

    // p1 is unaligned from px_local[9], *including* the dummy element at px_local[16]
    // when i=15, p_base+i+1 becomes p_base+16
    __m256 p1x = _mm256_loadu_ps(&px_local[9]);
    __m256 p1y = _mm256_loadu_ps(&py_local[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[9]);

    // This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15]
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);

    __m256 dx_vec = _mm256_sub_ps(p1x, p0x);
    __m256 dy_vec = _mm256_sub_ps(p1y, p0y);
    __m256 dz_vec = _mm256_sub_ps(p1z, p0z);

    __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
    dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));


    __m256 dist = _mm256_rcp_ps(inv_dist);

    // Primal Update for Distance Constraints (Block 2) ---
    // Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15)
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]);
    __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]);

    // Original constraint error C_j*(x) = current_dist - target_length
    __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);

    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
    __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));

    // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
    __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);

    // Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
    __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    // This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle
    // for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero.
    // For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15.
    // The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy).
    // The mask effectively disables the constraint correction for the dummy particle.
// All bits set in the first 7 lanes, zero in the dummy lane. A float
// constant of -1.0f would only set the sign and exponent bits and
// corrupt the bitwise AND below, so build the mask from integer lanes.
__m256 valid_mask = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
    apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);

    __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);

    __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);

    // Update positions in local buffer
    _mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
    _mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
    _mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));

    _mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
_mm256_storeu_ps(&py_local[9], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
_mm256_storeu_ps(&pz_local[9], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));

    // Recalculate constraint error *after* position updates for accurate lambda update
    // Reload updated positions from local buffer
    p0x = _mm256_load_ps(&px_local[8]);
    p0y = _mm256_load_ps(&py_local[8]);
    p0z = _mm256_load_ps(&pz_local[8]);
    p1x = _mm256_loadu_ps(&px_local[9]);
    p1y = _mm256_loadu_ps(&py_local[9]);
    p1z = _mm256_loadu_ps(&pz_local[9]);

// These names are scoped to the first block, so declare them again here.
__m256 updated_dx = _mm256_sub_ps(p1x, p0x);
__m256 updated_dy = _mm256_sub_ps(p1y, p0y);
__m256 updated_dz = _mm256_sub_ps(p1z, p0z);

__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);

__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);

    __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
    _mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
    __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
    __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back
    } // End of second block
    }
    // After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i]));
    _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i]));
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i]));
    }
    }
    // Step 4: Sphere Collisions with Friction
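    // Collision is itself an AVBD constraint: the penetration error is
    // regularized against last frame's error, converted into a normal
    // push-out via the augmented force (k * C + lambda), followed by a
    // tangential friction pass and a dual update of lambda and k.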
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int s_base = c * SPHERES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 friction_vec = _mm256_set1_ps(cfg->friction);

    for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
    __m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
    __m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
    __m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
    __m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);

    __m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta);
    __m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);

    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);

    __m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
    __m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);

    __m256 normal_x = _mm256_mul_ps(dx, inv_dist);
    __m256 normal_y = _mm256_mul_ps(dy, inv_dist);
    __m256 normal_z = _mm256_mul_ps(dz, inv_dist);

    // Load AVBD state for this particle block
    __m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
    __m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
    __m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]);

    // AVBD Primal Update:
    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame)
    __m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec));

    // Apply collision_mask to regularized error, so only active constraints contribute.
    // This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0.
    C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask);

    // Calculate force term: (k * C_reg + lambda)
    __m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec);

    // Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current
    // For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))).
    // H_j_inv for a single particle collision is its inverse mass (p_inv_mass).
    // So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current
    // The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration).
    __m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec);

    // Apply the correction along the normal direction.
    // The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term')
    // means positive 'correction_magnitude_scalar' should push OUT of collision.
    // So we subtract from particle position.
    __m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar);
    __m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar);
    __m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar);

    // Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration)
    p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x);
    p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y);
    p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);

    // Apply friction (post-stabilization step)
    // Recalculate velocity based on updated positions
    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    // Project velocity onto normal to get normal component
    __m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
    vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
    vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);

    // Get tangential velocity component
    __m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
    __m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
    __m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));

    __m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);


    // Calculate friction force magnitude limit (based on normal force proxy)
    __m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec);

    // Actual tangential velocity length
__m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));

    // Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down.
    __m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec));
    friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction)

    // Apply friction as a reduction of tangential velocity, converting to position change
    // The idea is to reduce the tangential displacement that happened *this* iteration.
    // The original `vel_x/y/z` are already displacements from `p_prev` to `p_curr`.
    // So, the tangential displacement is `tangent_x/y/z`.
// We remove a `friction_scale` fraction of it below.
    __m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale);
    __m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale);
    __m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale);

    // Only apply if actively colliding
    friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask);
    friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask);
    friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask);

    // Apply friction by moving the particle along the tangential direction.
    // If friction_scale is 0 (no friction), no change. If 1 (full friction), moves back to p_prev.
    p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr);
    p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr);
    p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);

    // Dual Update for Collision Constraints ---
    // Recalculate collision error *after* position updates (normal & friction)
    // Reload updated positions
    p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
    dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
    dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);

    dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    dist = _mm256_rcp_ps(inv_dist);

    __m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update

    // Apply the collision mask again for updates, so only active constraints update their AVBD state
    C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask);

    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec);
    _mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec);

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value
    __m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec);
    __m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec);
    _mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec);
    // --- END AVBD ADDITION ---
    }
    }
    }
    // Step 5: Motion Constraints
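    // Keeps every particle within motion_radius of its model-space rest
    // position; any overshoot is projected back along the offset
    // direction, scaled by the particle's clamped inverse mass.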
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
    __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
    __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
    __m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    __m256 dz = _mm256_sub_ps(pz, rz);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);

    __m256 penetration = _mm256_sub_ps(dist, radius_vec);
    __m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);

    __m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
    corr_factor = _mm256_and_ps(corr_factor, mask);

    __m256 delta_x = _mm256_mul_ps(dx, corr_factor);
    __m256 delta_y = _mm256_mul_ps(dy, corr_factor);
    __m256 delta_z = _mm256_mul_ps(dz, corr_factor);

    delta_x = _mm256_mul_ps(delta_x, pm);
    delta_y = _mm256_mul_ps(delta_y, pm);
    delta_z = _mm256_mul_ps(delta_z, pm);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
    }
    }
    // Store Final Errors for Next Frame's Regularization ---
    // This happens once at the very end of simulate_chains, after all iterations are complete for the frame.
    for (int c = 0; c < MAX_CHAINS; ++c) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int r_base = c * CONSTRAINTS_PER_CHAIN;
    // Loop through each block of 8 constraints (distance constraints are N-1 for N particles)
    for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) {
    // Calculate the current error for this distance constraint block
    // (Similar to C_x_original calculation from Step 3, but using final positions)
    // Load final positions for p0 and p1 (from global arrays now, as local buffer is stale)
    __m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]);
    __m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]);
    __m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]);

__m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]); // +1 offset is not 32-byte aligned, so use loadu
__m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]);
__m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]);

    __m256 diff_x = _mm256_sub_ps(px1, px0);
    __m256 diff_y = _mm256_sub_ps(py1, py0);
    __m256 diff_z = _mm256_sub_ps(pz1, pz0);

    __m256 dist_sq = _mm256_mul_ps(diff_x, diff_x);
    dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq);
    dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]);
    __m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec);
    _mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization
    }
    }
    for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
    int p_base = c_idx * PARTICLES_PER_CHAIN;
    int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 deepest_penetration_vec = neg_flt_max_vec; // seed running max with the lowest float
    // Loop through all spheres for this chain to find the max penetration for each particle in the block
    for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) {
    __m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]);
    __m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]);
    __m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]);
    __m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]);

    __m256 dx_s = _mm256_sub_ps(p_curr_x, s_x);
    __m256 dy_s = _mm256_sub_ps(p_curr_y, s_y);
    __m256 dz_s = _mm256_sub_ps(p_curr_z, s_z);

    __m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s);
    dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s);
    dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s);

    __m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec));
    __m256 dist_s = _mm256_rcp_ps(inv_dist_s);

    __m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance

    // We only care about positive penetration (i.e., actually colliding)
    current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s);
    deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s);
    }
    _mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization
    }
    }
    }
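/* Minimal driver sketch (assumed, not part of the gist): step the
 * solver with a fixed timestep once per frame. Initializing particles,
 * rest lengths, spheres, and configs is the caller's responsibility. */
static struct chain_sim g_chain_sim; // zero-initialized; large, so keep it off the stack
void chains_tick(void) {
    simulate_chains(&g_chain_sim, 1.0f / 60.0f); // fixed 60 Hz step
}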
  12. vurtun renamed this gist Jul 12, 2025. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  13. vurtun revised this gist Jul 12, 2025. 2 changed files with 944 additions and 0 deletions.
    944 changes: 944 additions & 0 deletions chain_arm.c
    @@ -0,0 +1,944 @@
    #include <arm_neon.h>
    #include <math.h>
    #include <assert.h>
    #include <string.h>
    #include <float.h>

    enum {
    MAX_CHNS = 64,
    PTC_PER_CHN = 16,
    SPH_PER_CHN = 8,
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 16,
    };
    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    float stretch_factor;
    float max_strain;
    float friction; // [0, 1]
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    };
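// For orientation, a configuration that respects the commented ranges might
// look like this (every value here is illustrative, not from the source):
//
//   struct chn_cfg cfg = {
//       .damping = 0.98f, .stiffness = 0.9f, .stretch_factor = 1.0f,
//       .max_strain = 0.5f, .friction = 0.3f, .linear_inertia = 1.0f,
//       .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f,
//   };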
    struct chn_sim {
    unsigned char free_idx_cnt;
    unsigned char free_idx[MAX_CHNS];
    struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));

    // chain global forces (world space gravity and wind)
    float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));

    // chain transformations (world space)
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    // particle positions (model space)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));

    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));

    // rest positions (model space)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };
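// Every solver loop below walks these arrays in 4-wide NEON blocks. A few
// compile-time guards one might add to make that assumption explicit:
_Static_assert(MAX_CHNS % 4 == 0, "chain precompute loops step 4 chains at a time");
_Static_assert(PTC_PER_CHN % 4 == 0, "particle loops step 4 particles at a time");
_Static_assert(SPH_PER_CHN % 4 == 0, "sphere resets use 4-wide stores");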
    extern void
    chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 4) {
    vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f));
    vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f));
    }
    }
    extern int
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cns);
    assert(cfg);
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);

    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
for (int i = 0; i < cnt; ++i) {
cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];

// Seed the previous positions with the rest pose so the first Verlet
// step starts from zero velocity.
cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2];

cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
}
for (int i = cnt; i < PTC_PER_CHN; ++i) {
cns->ptc_pos_x[p_idx + i] = 0.0f;
cns->ptc_pos_y[p_idx + i] = 0.0f;
cns->ptc_pos_z[p_idx + i] = 0.0f;
cns->ptc_inv_mass[p_idx + i] = 0.0f; // zero weight: padding lanes never move
}
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
for (int i = cnt; i < CON_PER_CHN; ++i) {
cns->ptc_rest_len[r_idx + i] = 0.0f;
}
    return chn;
    }
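// Usage sketch: rest_pos packs one particle per four floats {x, y, z,
// inv_mass}; inv_mass == 0 pins a particle, since the solver scales every
// correction by inverse mass. For example:
//
//   float rest[3*4] = {
//       0.0f,  0.0f, 0.0f, 0.0f,   // pinned root
//       0.0f, -0.5f, 0.0f, 1.0f,
//       0.0f, -1.0f, 0.0f, 1.0f,
//   };
//   int chn = chn_sim_add(&sim, &cfg, rest, 3);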
    extern void
    chn_sim_del(struct chn_sim *cns, int chn) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_idx = (chn * PTC_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    int c_idx = (chn * CON_PER_CHN);

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

    cns->chn_grav_x[chn] = 0.0f;
    cns->chn_grav_y[chn] = 0.0f;
    cns->chn_grav_z[chn] = 0.0f;

    cns->chn_wnd_x[chn] = 0.0f;
    cns->chn_wnd_y[chn] = 0.0f;
    cns->chn_wnd_z[chn] = 0.0f;

    cns->chn_pos_x[chn] = 0.0f;
    cns->chn_pos_y[chn] = 0.0f;
    cns->chn_pos_z[chn] = 0.0f;

    cns->chn_quat_x[chn] = 0.0f;
    cns->chn_quat_y[chn] = 0.0f;
    cns->chn_quat_z[chn] = 0.0f;
    cns->chn_quat_w[chn] = 1.0f;

    cns->chn_prev_pos_x[chn] = 0.0f;
    cns->chn_prev_pos_y[chn] = 0.0f;
    cns->chn_prev_pos_z[chn] = 0.0f;

    cns->chn_prev_quat_x[chn] = 0.0f;
    cns->chn_prev_quat_y[chn] = 0.0f;
    cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f));
    }
    // Spheres
    vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f));

    vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f));

    // Loop by 4 for NEON
    for (int i = 0; i < CON_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f));
    }
    }
    extern void
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = pos3[2];

    cns->chn_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];

    cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
    }
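// chn_sim_mov() updates only the current transform, so the next chn_sim_run()
// turns the offset from the previous transform into velocity and applies
// inertia; chn_sim_tel() overwrites current and previous alike, which reads
// as a teleport with no velocity spike.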
    extern void
    chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    assert(cns);
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_grav_x[chn] = grav3[0];
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];
    }
    extern void
    chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    assert(cns);
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_wnd_x[chn] = wind3[0];
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];
    }
    extern void
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(spheres);
assert(cnt >= 0);
assert(cnt <= SPH_PER_CHN); // spheres are stored per chain, not in the global pool

    int s_idx = (chn * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
    cns->sph_x[s_idx + i] = spheres[i*4+0];
    cns->sph_y[s_idx + i] = spheres[i*4+1];
    cns->sph_z[s_idx + i] = spheres[i*4+2];
    cns->sph_r[s_idx + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_idx + i] = 0;
    cns->sph_y[s_idx + i] = 0;
    cns->sph_z[s_idx + i] = 0;
    cns->sph_r[s_idx + i] = 0;
    }
    }
    extern void
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
assert(cnt <= PTC_PER_CHN);

    int p_idx = (chn * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->motion_radius[p_idx + i] = radius[i];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->motion_radius[p_idx + i] = FLT_MAX;
    }
    }
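// radius[i] bounds how far particle i may stray from its rest position in
// the motion-constraint pass of chn_sim_run(); the FLT_MAX fill for the
// unspecified tail effectively leaves those particles unconstrained.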
    extern void
    chn_sim_run(struct chn_sim *cns, float dt) {
    // NEON constants
    const float32x4_t dt_vec = vdupq_n_f32(dt);
    const float32x4_t dt2_vec = vdupq_n_f32(dt * dt);
    const float32x4_t one_vec = vdupq_n_f32(1.0f);
    const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f);
    const float32x4_t eps_vec = vdupq_n_f32(1e-6f);
    const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f);
    const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f);
    const float32x4_t zero_vec = vdupq_n_f32(0.0f);
    const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt);
    const float32x4_t two_vec = vdupq_n_f32(2.0f);
    const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f);
    const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f);
// NEON has no direct equivalent of the AVX absolute-value mask
// _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); vabsq_f32 covers
// those cases below instead.

    // Local arrays for PBD solver
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_lcl[PTC_PER_CHN] = 0.0f;
    py_lcl[PTC_PER_CHN] = 0.0f;
    pz_lcl[PTC_PER_CHN] = 0.0f;
    pm_lcl[PTC_PER_CHN] = 0.0f;

    // Stack arrays for precomputed data
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float vel_z[MAX_CHNS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));

    // Step 0: Precomputation
    for (int c = 0; c < MAX_CHNS; c += 4) {
    // Load chain quaternions
    float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
    float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
    float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
    float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    float32x4_t cqx = vnegq_f32(qx); // -qx
    float32x4_t cqy = vnegq_f32(qy); // -qy
    float32x4_t cqz = vnegq_f32(qz); // -qz

    // Compute local-space wind
    {
    float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]);
    float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]);
    float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
    float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz
    tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx
    ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy
    tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy
    tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
    float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
    res_x = vmlsq_f32(res_x, tz, qy);
    res_x = vmlaq_f32(res_x, ty, qz); // Corrected: tw*qx + tx*qw + ty*qz - tz*qy

    float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
    res_y = vmlsq_f32(res_y, tx, qz);
    res_y = vmlaq_f32(res_y, tz, qx); // Corrected: tw*qy + ty*qw + tz*qx - tx*qz

    float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
    res_z = vmlsq_f32(res_z, ty, qx);
    res_z = vmlaq_f32(res_z, tx, qy); // Corrected: tw*qz + tz*qw + tx*qy - ty*qx

    vst1q_f32(&chn_wnd_lcl_x[c], res_x);
    vst1q_f32(&chn_wnd_lcl_y[c], res_y);
    vst1q_f32(&chn_wnd_lcl_z[c], res_z);
    }
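// Scalar reference for the world->local rotation above, i.e. conjugation by
// the inverse orientation: v_lcl = q^-1 * (0, v) * q for unit q. With
// hypothetical quat helpers:
//
//   quat t = quat_mul(quat_conj(q), (quat){0.0f, v.x, v.y, v.z});
//   quat r = quat_mul(t, q);   // r.x, r.y, r.z is the rotated vector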
    // Compute local-space gravity
    {
    float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]);
    float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]);
    float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz);
    tx = vmlsq_f32(tx, cqz, vy);

    float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx);
    ty = vmlsq_f32(ty, cqx, vz);

    float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy);
    tz = vmlsq_f32(tz, cqy, vx);

    float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy));
    tw = vmlsq_f32(tw, cqz, vz);

    // Step 2: res_vec = (t * q)_vec
    float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
    res_x = vmlsq_f32(res_x, tz, qy);
    res_x = vmlaq_f32(res_x, ty, qz);

    float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
    res_y = vmlsq_f32(res_y, tx, qz);
    res_y = vmlaq_f32(res_y, tz, qx);

    float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
    res_z = vmlsq_f32(res_z, ty, qx);
    res_z = vmlaq_f32(res_z, tx, qy);

    vst1q_f32(&chn_grav_lcl_x[c], res_x);
    vst1q_f32(&chn_grav_lcl_y[c], res_y);
    vst1q_f32(&chn_grav_lcl_z[c], res_z);
    }
    // Compute linear velocity
    {
    float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]);
    float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]);
    float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]);

    float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]);
    float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]);
    float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]);

    float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec);
    float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec);
    float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec);

    vst1q_f32(&vel_x[c], vel_x_vec);
    vst1q_f32(&vel_y[c], vel_y_vec);
    vst1q_f32(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    {
    float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
    float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
    float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
    float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);

    float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]);
    float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]);
    float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]);
    float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]);

    // Step 1: Compute delta quaternion (to - from)
    float32x4_t dt_qx = vsubq_f32(qx, prev_qx);
    float32x4_t dt_qy = vsubq_f32(qy, prev_qy);
    float32x4_t dt_qz = vsubq_f32(qz, prev_qz);
    float32x4_t dt_qw = vsubq_f32(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    float32x4_t conj_qx = vnegq_f32(prev_qx);
    float32x4_t conj_qy = vnegq_f32(prev_qy);
    float32x4_t conj_qz = vnegq_f32(prev_qz);
    float32x4_t conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx);
    orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy);
    orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz);

    float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy);
    float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz);
    float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx);

    float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw);
    orient_qx = vaddq_f32(orient_qx, cross_x);
    float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw);
    orient_qy = vaddq_f32(orient_qy, cross_y);
    float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw);
    orient_qz = vaddq_f32(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx);
    dot = vmlaq_f32(dot, qy, prev_qy);
    dot = vmlaq_f32(dot, qz, prev_qz);

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // lane is all-ones if dot < 0, zero otherwise
    float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false

    orient_qx = vmulq_f32(orient_qx, sign_selector);
    orient_qy = vmulq_f32(orient_qy, sign_selector);
    orient_qz = vmulq_f32(orient_qz, sign_selector);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    // Inlined vrecip_f32_high_precision
    float32x4_t rcp_two_dt;
    rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec));
    rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
    rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
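// vrecpeq_f32 only gives a ~8-bit estimate; each vrecpsq_f32 step above is
// one Newton-Raphson refinement, x' = x * (2 - d*x), so two steps bring
// 1/(2*dt) close to full single precision.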

    float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt);
    float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt);
    float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt);

    // Store results
    vst1q_f32(&ang_vel_x[c], ang_vel_x_vec);
    vst1q_f32(&ang_vel_y[c], ang_vel_y_vec);
    vst1q_f32(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
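// Scalar shape of the angular-velocity estimate above (quat helpers are
// hypothetical):
//
//   quat dq = quat_sub(q, q_prev);                      // component-wise
//   quat od = quat_mul(dq, quat_conj(q_prev));          // orientation delta
//   if (quat_dot(q, q_prev) < 0.0f) od = quat_neg(od);  // shortest arc
//   vec3 w  = vec3_scale(od.xyz, 1.0f / (2.0f * dt));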
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    // Load precomputed velocities and inertia parameters
    float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]);
    float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]);
    float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]);

    float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]);
    float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]);
    float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]);

    float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia);
    float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia);
    float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
    float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
    float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
    float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
    p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    // Linear inertia: v * dt * linear_inertia
    float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec);
    float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec);
    float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
    float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
    float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);

    ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec);
    ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec);
    ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
    float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
    float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);

    float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y);
    float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z);
    float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x);

    // Calculate displacement: (ω × (ω × r)) * dt^2
    float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec);
    float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec);
    float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec);
    float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec);
    float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec);

    // Total delta
    float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x);
    float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y);
    float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z);

    // Update positions
    vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x));
    vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y));
    vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z));
    }
    // Update previous transformation
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];

    cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
    cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]);
    float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]);
    float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]);

    float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]);
    float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]);
    float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]);
    float32x4_t damping_vec = vdupq_n_f32(cfg->damping);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
    float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
    float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
    float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
    ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]);
    float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]);
    float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]);

    float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec);
    float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec);
    float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec);

    float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass);
    float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass);
    float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass);

    float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
    float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
    float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);

    float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec);
    float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec);
    float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec);

    float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec);
    float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec);
    float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec);

    float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x));
    float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y));
    float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z));

    vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x);
    vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y);
    vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z);

    vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
    vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
    vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
    }
    }
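// Per particle, the update above is plain damped Verlet integration:
//
//   a      = (grav_lcl + wind_lcl) * inv_mass;
//   x_new  = x + (x - x_prev) * damping + a * dt * dt;
//   x_prev = x;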
    // Step 3: Distance Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;

    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness);
    float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor);
    float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain);
    float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec);

    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i]));
    vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i]));
    vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i]));
    vst1q_f32(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p0x = vld1q_f32(&px_lcl[i]);
    float32x4_t p0y = vld1q_f32(&py_lcl[i]);
    float32x4_t p0z = vld1q_f32(&pz_lcl[i]);
    float32x4_t p0m = vld1q_f32(&pm_lcl[i]);

    float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]);
    float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]);
    float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]);
    float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]);
    float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]);

    float32x4_t dx = vsubq_f32(p1x, p0x);
    float32x4_t dy = vsubq_f32(p1y, p0y);
    float32x4_t dz = vsubq_f32(p1z, p0z);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist;
    float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec);
    inv_dist = vrsqrteq_f32(x_rsqrt);
    inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
    inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));

    float32x4_t w_sum = vaddq_f32(p0m, p1m);
    float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec);

    float32x4_t rcp_w_sum;
    rcp_w_sum = vrecpeq_f32(w_sum_clamped);
    rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
    rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));


    float32x4_t dist;
    dist = vrecpeq_f32(inv_dist);
    dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
    dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));

    float32x4_t diff = vsubq_f32(dist, rest_len_vec);
    float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec);

    float32x4_t rcp_max_rest_len_eps;
    rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps);
    rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
    rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
    float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps);
    float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs));

    float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped));
    float32x4_t rcp_dyn_stiff_denom;
    rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom);
    rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
    rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
    float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom);

    float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec);
    corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum);
    float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part);

uint32x4_t apply_corr_mask = vcgtq_f32(dist_sq, eps_vec); // lane is all-ones if dist_sq > eps, zero otherwise

    float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist);
    float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist);
    float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist);

    float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude);
    float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude);
    float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude);

    dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0
    dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec);
    dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec);

    vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m));
    vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m));
    vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m));

    vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m));
    vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m));
    vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m));
    }
    }
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i]));
    vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i]));
    vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i]));
    }
    }
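// Scalar shape of one constraint projection above, for the pair (p0, p1)
// with clamped inverse masses w0, w1:
//
//   d      = p1 - p0;  len = |d|;  diff = len - rest_len;
//   strain = clamp(diff / max(rest_len, eps), -max_strain, +max_strain);
//   k      = stiffness / (1 + |strain|);        // softens as strain grows
//   corr   = (d / len) * diff * k * stretch_factor / (w0 + w1);
//   p0 += corr * w0;  p1 -= corr * w1;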
    // Step 4: Sphere Collisions with Friction
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t fric_vec = vdupq_n_f32(cfg->friction);

    for (int s = 0; s < SPH_PER_CHN; ++s) {
    float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]);
    float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]);
    float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]);
    float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]);

    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
    float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
    float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]);

    float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]);
    float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]);
    float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]);
    float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec);
    float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec);
    float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist_sphere;
    float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec);
    inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere);
    inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
    inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
    float32x4_t inv_dist = inv_dist_sphere;

    float32x4_t dist_sphere;
    dist_sphere = vrecpeq_f32(inv_dist);
    dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
    dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
    float32x4_t dist = dist_sphere;

    float32x4_t pen = vsubq_f32(sph_r_vec, dist);
uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // lane is all-ones if pen > 0, zero otherwise

    float32x4_t norm_x = vmulq_f32(dx, inv_dist);
    float32x4_t norm_y = vmulq_f32(dy, inv_dist);
    float32x4_t norm_z = vmulq_f32(dz, inv_dist);

    float32x4_t norm_corr_x = vmulq_f32(norm_x, pen);
    float32x4_t norm_corr_y = vmulq_f32(norm_y, pen);
    float32x4_t norm_corr_z = vmulq_f32(norm_z, pen);

    float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
    float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
    float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);

    float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x);
    vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y);
    vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z);

    float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x);
    float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y);
    float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z);

    float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x);
    tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y);
    tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z);

    float32x4_t inv_tan_mag_sphere;
    float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec);
    inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag);
    inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
    inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
    float32x4_t inv_tan_mag = inv_tan_mag_sphere;

    float32x4_t fric_mag = vmulq_f32(pen, fric_vec);
    float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag);
    float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag);
    float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag);

    float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x);
    float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y);
    float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z);

    total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec);
    total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec);
    total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec);

    total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass);
    total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass);
    total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass);

    float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x);
    float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y);
    float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z);

    vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x);
    vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y);
    vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z);
    }
    }
    }
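// Scalar shape of the collision response above, per particle/sphere pair:
//
//   pen = radius - |p - c|;                        // > 0 while overlapping
//   n   = (p - c) / |p - c|;
//   v   = p - p_prev;  v_t = v - dot(v, n) * n;    // tangential slip
//   p  += inv_mass * (n * pen - normalize(v_t) * pen * friction);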
    // Step 5: Motion Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
    float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
    float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
    float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
    float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
    pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]);
    float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]);
    float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]);
    float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]);

    float32x4_t dx = vsubq_f32(px, rx);
    float32x4_t dy = vsubq_f32(py, ry);
    float32x4_t dz = vsubq_f32(pz, rz);

    float32x4_t dist_sq = vmulq_f32(dx, dx);
    dist_sq = vmlaq_f32(dist_sq, dy, dy);
    dist_sq = vmlaq_f32(dist_sq, dz, dz);

    float32x4_t inv_dist_motion;
    float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec);
    inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion);
    inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
    inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
    float32x4_t inv_dist = inv_dist_motion;

    float32x4_t dist_motion;
    dist_motion = vrecpeq_f32(inv_dist);
    dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
    dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
    float32x4_t dist = dist_motion;

    float32x4_t pen = vsubq_f32(dist, r_vec);
uint32x4_t mask = vcgtq_f32(pen, zero_vec); // lane is all-ones if pen > 0, zero otherwise

    float32x4_t corr_factor = vmulq_f32(pen, inv_dist);
    corr_factor = vbslq_f32(mask, corr_factor, zero_vec);

    float32x4_t dt_x = vmulq_f32(dx, corr_factor);
    float32x4_t dt_y = vmulq_f32(dy, corr_factor);
    float32x4_t dt_z = vmulq_f32(dz, corr_factor);

    dt_x = vmulq_f32(dt_x, pm);
    dt_y = vmulq_f32(dt_y, pm);
    dt_z = vmulq_f32(dt_z, pm);

    vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x));
    vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y));
    vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z));
    }
    }
    }
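// Taken together, a minimal driver for this API might look like the sketch
// below; the timestep and every field value are illustrative, not from the
// source.
static struct chn_sim g_sim; // file scope, so the arrays start zeroed

static void
chn_sim_demo(void) {
    chn_sim_init(&g_sim);

    struct chn_cfg cfg = {
        .damping = 0.98f, .stiffness = 0.9f, .stretch_factor = 1.0f,
        .max_strain = 0.5f, .friction = 0.3f, .linear_inertia = 1.0f,
        .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f,
    };
    float rest[4*4] = {
        0.0f,  0.0f, 0.0f, 0.0f, // pinned root (inv_mass = 0)
        0.0f, -0.3f, 0.0f, 1.0f,
        0.0f, -0.6f, 0.0f, 1.0f,
        0.0f, -0.9f, 0.0f, 1.0f,
    };
    int chn = chn_sim_add(&g_sim, &cfg, rest, 4);

    float grav[3] = {0.0f, -9.81f, 0.0f};
    chn_sim_grav(&g_sim, chn, grav);

    for (int frame = 0; frame < 600; ++frame) { // ~10 s at 60 Hz
        chn_sim_run(&g_sim, 1.0f / 60.0f);
        // render from g_sim.ptc_pos_x/y/z[chn * PTC_PER_CHN + i]
    }
}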
    File renamed without changes.
  14. vurtun revised this gist Jul 11, 2025. 1 changed file with 30 additions and 29 deletions.
    59 changes: 30 additions & 29 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -81,8 +81,8 @@ struct chn_sim {
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };
    extern void
    chn_sim_init(struct chn_sim *chs) {
    assert(chs);
    chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    @@ -93,8 +93,8 @@ chn_sim_init(struct chn_sim *chs) {
    }
    }
    extern int
    chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(chs);
    chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cns);
    assert(cfg);
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    @@ -106,23 +106,24 @@ chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restric
    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*3+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*3+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*3+2];

    cns->rest_pos_x[p_idx + i] = rest_pos[i*3+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*3+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*3+2];
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
    cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];

    cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->ptc_pos_x[p_idx + i] = 0.0f;
    cns->ptc_pos_y[p_idx + i] = 0.0f;
    cns->ptc_pos_z[p_idx + i] = 0.0f;
    }
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*3+0] - rest_pos[(i-1)*3+0];
    float dy = rest_pos[i*3+1] - rest_pos[(i-1)*3+1];
    float dz = rest_pos[i*3+2] - rest_pos[(i-1)*3+2];
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
    for (int i = cnt; i < CON_PER_CHN; ++i) {
    @@ -133,8 +134,8 @@ chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restric
    return chn;
    }
    extern void
    chn_sim_del(struct chn_sim *chs, int chn) {
    assert(chs);
    chn_sim_del(struct chn_sim *cns, int chn) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    @@ -207,8 +208,8 @@ chn_sim_del(struct chn_sim *chs, int chn) {
    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *chs, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(chs);
    chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    @@ -224,8 +225,8 @@ chn_sim_mov(struct chn_sim *chs, int chn, const float *restrict pos3, const floa
    cns->chn_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_tel(struct chn_sim *chs, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(chs);
    chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    @@ -241,8 +242,8 @@ chn_sim_tel(struct chn_sim *chs, int chn, const float *restrict pos3, const floa
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_grav(struct chn_sim *chs, int chn, const float *restrict grav3) {
    assert(chs);
    chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    assert(cns);
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    @@ -252,8 +253,8 @@ chn_sim_grav(struct chn_sim *chs, int chn, const float *restrict grav3) {
    cns->chn_grav_z[chn] = grav3[2];
    }
    extern void
    chn_sim_wind(struct chn_sim *chs, int chn, const float *restrict wind3) {
    assert(chs);
    chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    assert(cns);
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    @@ -263,8 +264,8 @@ chn_sim_wind(struct chn_sim *chs, int chn, const float *restrict wind3) {
    cns->chn_wnd_z[chn] = wind3[2];
    }
    extern void
    chn_sim_set_sph(struct chn_sim *chs, int chn, const float *spheres, int cnt) {
    assert(chs);
    chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(spheres);
    assert(cnt < MAX_SPH);

    @@ -283,8 +284,8 @@ chn_sim_set_sph(struct chn_sim *chs, int chn, const float *spheres, int cnt) {
    }
    }
    extern void
    chn_sim_con_motion(struct chn_sim *chs, int chn, const float *radius, int cnt) {
    assert(chs);
    chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    @@ -299,7 +300,7 @@ chn_sim_con_motion(struct chn_sim *chs, int chn, const float *radius, int cnt) {
    }
    }
    extern void
    chn_sim_run(struct chn_sim *chs, float dt) {
    chn_sim_run(struct chn_sim *cns, float dt) {
    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
  15. vurtun revised this gist Jul 11, 2025. 1 changed file with 249 additions and 249 deletions.
    498 changes: 249 additions & 249 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -12,7 +12,7 @@ enum {
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 8,
    MAX_ITR = 16,
    };
    struct chn_cfg {
    float damping; // [0, 1]
    @@ -81,225 +81,225 @@ struct chn_sim {
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };
    extern void
    chn_sim_init(struct chn_sim *cs) {
    assert(cs);
    cs->free_idx_cnt = MAX_CHNS;
    chn_sim_init(struct chn_sim *chs) {
    assert(chs);
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cs->free_idx[i] = MAX_CHNS - i - 1;
    cns->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 8) {
    _mm256_store_ps(&cs->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cs->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
    }
    extern int
    chn_sim_add(struct chn_sim *cs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cs);
    chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(chs);
    assert(cfg);
    assert(cs->free_idx_cnt > 0);
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);

int c = cs->free_idx[--cs->free_idx_cnt];
    cs->chn_cfg[c] = *cfg;
    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_idx = (c * PTC_PER_CHN);
    int r_idx = (c * CON_PER_CHN);
    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cs->ptc_pos_x[p_idx + i] = rest_pos[i*3+0];
    cs->ptc_pos_y[p_idx + i] = rest_pos[i*3+1];
    cs->ptc_pos_z[p_idx + i] = rest_pos[i*3+2];
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*3+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*3+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*3+2];

    cs->rest_pos_x[p_idx + i] = rest_pos[i*3+0];
    cs->rest_pos_y[p_idx + i] = rest_pos[i*3+1];
    cs->rest_pos_z[p_idx + i] = rest_pos[i*3+2];
    cns->rest_pos_x[p_idx + i] = rest_pos[i*3+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*3+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*3+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cs->ptc_pos_x[p_idx + i] = 0.0f;
    cs->ptc_pos_y[p_idx + i] = 0.0f;
    cs->ptc_pos_z[p_idx + i] = 0.0f;
    cns->ptc_pos_x[p_idx + i] = 0.0f;
    cns->ptc_pos_y[p_idx + i] = 0.0f;
    cns->ptc_pos_z[p_idx + i] = 0.0f;
    }
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*3+0] - rest_pos[(i-1)*3+0];
    float dy = rest_pos[i*3+1] - rest_pos[(i-1)*3+1];
    float dz = rest_pos[i*3+2] - rest_pos[(i-1)*3+2];
    cs->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
    for (int i = cnt; i < CON_PER_CHN; ++i) {
    cs->ptc_rest_len[r_idx + i] = 0.0f;
    cs->ptc_rest_len[r_idx + i] = 0.0f;
    cs->ptc_rest_len[r_idx + i] = 0.0f;
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    cns->ptc_rest_len[r_idx + i] = 0.0f;
    }
    return c;
    return chn;
    }
    extern void
    chn_sim_del(struct chn_sim *cs, int c) {
    assert(cs);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    int p_idx = (c * PTC_PER_CHN);
    int s_idx = (c * SPH_PER_CHN);
    int c_idx = (c * CON_PER_CHN);

    cs->free_idx[cs->free_idx_cnt++] = c;
    memset(&cs->chn_cfg[c], 0, sizeof(cs->chn_cfg[0]));

    cs->chn_grav_x[c] = 0.0f;
    cs->chn_grav_y[c] = 0.0f;
    cs->chn_grav_z[c] = 0.0f;

    cs->chn_wnd_x[c] = 0.0f;
    cs->chn_wnd_y[c] = 0.0f;
    cs->chn_wnd_z[c] = 0.0f;

    cs->chn_pos_x[c] = 0.0f;
    cs->chn_pos_y[c] = 0.0f;
    cs->chn_pos_z[c] = 0.0f;

    cs->chn_quat_x[c] = 0.0f;
    cs->chn_quat_y[c] = 0.0f;
    cs->chn_quat_z[c] = 0.0f;
    cs->chn_quat_w[c] = 1.0f;

    cs->chn_prev_pos_x[c] = 0.0f;
    cs->chn_prev_pos_y[c] = 0.0f;
    cs->chn_prev_pos_z[c] = 0.0f;

    cs->chn_prev_quat_x[c] = 0.0f;
    cs->chn_prev_quat_y[c] = 0.0f;
    cs->chn_prev_quat_z[c] = 0.0f;
    cs->chn_prev_quat_w[c] = 1.0f;

    _mm256_store_ps(&cs->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->sph_x[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_y[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_z[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_r[s_idx], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
    chn_sim_del(struct chn_sim *chs, int chn) {
    assert(chs);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_idx = (chn * PTC_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    int c_idx = (chn * CON_PER_CHN);

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

    cns->chn_grav_x[chn] = 0.0f;
    cns->chn_grav_y[chn] = 0.0f;
    cns->chn_grav_z[chn] = 0.0f;

    cns->chn_wnd_x[chn] = 0.0f;
    cns->chn_wnd_y[chn] = 0.0f;
    cns->chn_wnd_z[chn] = 0.0f;

    cns->chn_pos_x[chn] = 0.0f;
    cns->chn_pos_y[chn] = 0.0f;
    cns->chn_pos_z[chn] = 0.0f;

    cns->chn_quat_x[chn] = 0.0f;
    cns->chn_quat_y[chn] = 0.0f;
    cns->chn_quat_z[chn] = 0.0f;
    cns->chn_quat_w[chn] = 1.0f;

    cns->chn_prev_pos_x[chn] = 0.0f;
    cns->chn_prev_pos_y[chn] = 0.0f;
    cns->chn_prev_pos_z[chn] = 0.0f;

    cns->chn_prev_quat_x[chn] = 0.0f;
    cns->chn_prev_quat_y[chn] = 0.0f;
    cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) {
    assert(cs);
    chn_sim_mov(struct chn_sim *chs, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(chs);
    assert(pos3);
    assert(rot4);
    assert(c >= 0);
    assert(c < MAX_CHNS);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cs->chn_pos_x[c] = pos3[0];
    cs->chn_pos_y[c] = pos3[1];
    cs->chn_pos_z[c] = pos3[2];
    cns->chn_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = pos3[2];

    cs->chn_quat_x[c] = rot4[0];
    cs->chn_quat_y[c] = rot4[1];
    cs->chn_quat_z[c] = rot4[2];
    cs->chn_quat_w[c] = rot4[3];
    cns->chn_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];
    }
    extern void
    chn_sim_tel(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) {
    assert(cs);
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
assert(cns);
    assert(pos3);
    assert(rot4);
    assert(c >= 0);
    assert(c < MAX_CHNS);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cs->chn_pos_x[c] = cs->chn_prev_pos_x[c] = pos3[0];
    cs->chn_pos_y[c] = cs->chn_prev_pos_y[c] = pos3[1];
    cs->chn_pos_z[c] = cs->chn_prev_pos_z[c] = pos3[2];
    cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];

    cs->chn_quat_x[c] = cs->chn_prev_quat_x[c] = rot4[0];
    cs->chn_quat_y[c] = cs->chn_prev_quat_y[c] = rot4[1];
    cs->chn_quat_z[c] = cs->chn_prev_quat_z[c] = rot4[2];
    cs->chn_quat_w[c] = cs->chn_prev_quat_w[c] = rot4[3];
    cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
    }
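/* Editor sketch (not part of the gist): chn_sim_mov keeps the previous
   transform so chn_sim_run can derive linear/angular velocity from the frame
   delta, while chn_sim_tel overwrites previous and current alike, zeroing the
   derived velocity. A hypothetical per-frame driver: */
static void
drive_chain(struct chn_sim *cns, int chn, const float *pos3, const float *rot4, int teleported) {
    if (teleported) {
        chn_sim_tel(cns, chn, pos3, rot4); /* no inertia kick across the jump */
    } else {
        chn_sim_mov(cns, chn, pos3, rot4); /* inertia follows the transform delta */
    }
}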
    extern void
    chn_sim_grav(struct chn_sim *cs, int c, const float *restrict g3) {
    assert(cs);
    assert(g3);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_grav_x[c] = g3[0];
    cs->chn_grav_y[c] = g3[1];
    cs->chn_grav_z[c] = g3[2];
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
assert(cns);
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_grav_x[chn] = grav3[0];
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];
    }
    extern void
    chn_sim_wind(struct chn_sim *cs, int c, const float *restrict w3) {
    assert(cs);
    assert(w3);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_wnd_x[c] = w3[0];
    cs->chn_wnd_y[c] = w3[1];
    cs->chn_wnd_z[c] = w3[2];
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
assert(cns);
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    cns->chn_wnd_x[chn] = wind3[0];
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];
    }
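/* Editor sketch: gravity and wind are stored per chain in world space and
   rotated into model space once per chn_sim_run step, so they can be set once
   and left alone. Hypothetical values for a downward pull plus a light gust: */
static void
setup_forces(struct chn_sim *cns, int chn) {
    const float grav3[3] = {0.0f, -9.81f, 0.0f};
    const float wind3[3] = {2.0f, 0.0f, 0.5f};
    chn_sim_grav(cns, chn, grav3);
    chn_sim_wind(cns, chn, wind3);
}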
    extern void
    chn_sim_set_sph(struct chn_sim *cs, int c, const float *spheres, int cnt) {
    assert(cs);
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
assert(cns);
    assert(spheres);
assert(cnt <= SPH_PER_CHN);

    int s_idx = (c * SPH_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
    cs->sph_x[s_idx + i] = spheres[i*4+0];
    cs->sph_y[s_idx + i] = spheres[i*4+1];
    cs->sph_z[s_idx + i] = spheres[i*4+2];
    cs->sph_r[s_idx + i] = spheres[i*4+3];
    cns->sph_x[s_idx + i] = spheres[i*4+0];
    cns->sph_y[s_idx + i] = spheres[i*4+1];
    cns->sph_z[s_idx + i] = spheres[i*4+2];
    cns->sph_r[s_idx + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cs->sph_x[s_idx + i] = 0;
    cs->sph_y[s_idx + i] = 0;
    cs->sph_z[s_idx + i] = 0;
    cs->sph_r[s_idx + i] = 0;
    cns->sph_x[s_idx + i] = 0;
    cns->sph_y[s_idx + i] = 0;
    cns->sph_z[s_idx + i] = 0;
    cns->sph_r[s_idx + i] = 0;
    }
    }
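/* Editor sketch: colliders arrive packed as xyzr quadruplets in model space;
   the zeroed tail slots have radius 0 and thus never generate contacts.
   Hypothetical two-sphere capsule approximation: */
static void
setup_colliders(struct chn_sim *cns, int chn) {
    const float spheres[2 * 4] = {
        0.0f, 0.0f, 0.0f, 0.15f, /* x, y, z, radius */
        0.0f, 0.3f, 0.0f, 0.12f,
    };
    chn_sim_set_sph(cns, chn, spheres, 2);
}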
    extern void
    chn_sim_con_motion(struct chn_sim *cs, int c, const float *radius, int cnt) {
    assert(cs);
    assert(c >= 0);
    assert(c < MAX_CHNS);
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt < PTC_PER_CHN);

    int p_idx = (c * PTC_PER_CHN);
    int p_idx = (chn * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cs->motion_radius[p_idx + i] = radius[i];
    cns->motion_radius[p_idx + i] = radius[i];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cs->motion_radius[p_idx + i] = FLT_MAX;
    cns->motion_radius[p_idx + i] = FLT_MAX;
    }
    }
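/* Editor sketch: motion radii bound each particle to a sphere around its rest
   position; entries past cnt default to FLT_MAX, i.e. unconstrained, while a
   small radius keeps a particle near its rest pose. Hypothetical taper: */
static void
setup_motion_cone(struct chn_sim *cns, int chn) {
    float radius[PTC_PER_CHN - 1];
    for (int i = 0; i < PTC_PER_CHN - 1; ++i) {
        radius[i] = 0.05f * (float)i; /* root held tight, swing grows toward tip */
    }
    chn_sim_con_motion(cns, chn, radius, PTC_PER_CHN - 1);
}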
    extern void
    chn_sim_run(struct chn_sim *cs, float dt) {
chn_sim_run(struct chn_sim *cns, float dt) {
    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    @@ -347,10 +347,10 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHNS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]);
    __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    @@ -359,9 +359,9 @@ chn_sim_run(struct chn_sim *cs, float dt) {

    // Compute local-space wind
    {
    __m256 vx = _mm256_load_ps(&cs->chn_wnd_x[c]);
    __m256 vy = _mm256_load_ps(&cs->chn_wnd_y[c]);
    __m256 vz = _mm256_load_ps(&cs->chn_wnd_z[c]);
    __m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]);
    __m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]);
    __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    @@ -400,9 +400,9 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    }
    // Compute local-space gravity
    {
    __m256 vx = _mm256_load_ps(&cs->chn_grav_x[c]);
    __m256 vy = _mm256_load_ps(&cs->chn_grav_y[c]);
    __m256 vz = _mm256_load_ps(&cs->chn_grav_z[c]);
    __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
    __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
    __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    @@ -443,13 +443,13 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    }
    // Compute linear velocity
    {
    __m256 curr_x = _mm256_load_ps(&cs->chn_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chn_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chn_pos_z[c]);
    __m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->chn_prev_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->chn_prev_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->chn_prev_pos_z[c]);
    __m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]);

    __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
    __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
    @@ -461,15 +461,15 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    }
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]);
    __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);

    __m256 prev_qx = _mm256_load_ps(&cs->chn_prev_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->chn_prev_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->chn_prev_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->chn_prev_quat_w[c]);
    __m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);

    // Step 1: Compute delta quaternion (to - from)
    __m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
    @@ -530,7 +530,7 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    @@ -551,10 +551,10 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]);
    __m256 p_ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]);
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
    __m256 p_ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
    p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    // Linear inertia: v * dt * linear_inertia
    @@ -596,24 +596,24 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);

    // Update positions
    _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
    _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
    }
    // Update previous transformation
    cs->chn_prev_pos_x[c] = cs->chn_pos_x[c];
    cs->chn_prev_pos_y[c] = cs->chn_pos_y[c];
    cs->chn_prev_pos_z[c] = cs->chn_pos_z[c];

    cs->chn_prev_quat_x[c] = cs->chn_quat_x[c];
    cs->chn_prev_quat_y[c] = cs->chn_quat_y[c];
    cs->chn_prev_quat_z[c] = cs->chn_quat_z[c];
    cs->chn_prev_quat_w[c] = cs->chn_quat_w[c];
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];

    cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
    cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
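/* Editor note, scalar form of the inertia displacement applied above to each
   particle p (model space):
       p += v*dt*linear_inertia
          + (w x p)*dt*angular_inertia
          + (w x (w x p))*dt*dt*centrifugal_inertia
   with v, w the chain velocities estimated in Step 0; each factor is clamped,
   1.0f giving the physically correct term and 0.0f disabling it. */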
    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];
    struct chn_cfg *cfg = &cns->chn_cfg[c];

    __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
    __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
    @@ -625,15 +625,15 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]);
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[base_idx + i]);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]);

    __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
    __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
    @@ -659,32 +659,32 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], new_p_z);
    _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z);

    _mm256_store_ps(&cs->ptc_prev_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cs->ptc_prev_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cs->ptc_prev_pos_z[base_idx + i], p_curr_z);
    _mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
    }
    }
    // Step 3: Distance Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;

    struct chn_cfg *cfg = &cs->chn_cfg[c];
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
    __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
    __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cs->ptc_pos_x[p_base + i]));
    _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cs->ptc_pos_y[p_base + i]));
    _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cs->ptc_pos_z[p_base + i]));
    _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i]));
    _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i]));
    _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i]));
    _mm256_store_ps(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
    @@ -699,7 +699,7 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 p1y = _mm256_loadu_ps(&py_lcl[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base]);
    __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -760,7 +760,7 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 p1y = _mm256_loadu_ps(&py_lcl[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base + 8]);
    __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -816,33 +816,33 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    }
    }
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i]));
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i]));
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i]));
    _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i]));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i]));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i]));
    }
    }
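/* Editor note, scalar form of one distance-constraint projection as set up
   above: for a segment (p0, p1) with rest length r and inverse masses w0, w1,
       d = p1 - p0,  len = |d|,  C = len - r,
       corr = stiffness * C / (len * (w0 + w1)) * d,
       p0 += w0 * corr,  p1 -= w1 * corr,
   iterated MAX_ITR times over the chain; the max_strain constants loaded
   above suggest C is additionally clamped to +/- max_strain in the elided
   arithmetic. */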
    // Step 4: Sphere Collisions with Friction
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);

    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cs->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cs->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cs->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cs->sph_r[s_base + s]);
    __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]);
    __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[p_base + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    @@ -906,26 +906,26 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], new_p_x);
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], new_p_y);
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], new_p_z);
    _mm256_store_ps(&cns->ptc_pos_x[p_base + i], new_p_x);
    _mm256_store_ps(&cns->ptc_pos_y[p_base + i], new_p_y);
    _mm256_store_ps(&cns->ptc_pos_z[p_base + i], new_p_z);
    }
    }
    }
    // Step 5: Motion Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
    __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
    __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
    __m256 r_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);
    __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + i]);
    __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + i]);
    __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + i]);
    __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + i]);

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    @@ -952,9 +952,9 @@ chn_sim_run(struct chn_sim *cs, float dt) {
    dt_y = _mm256_mul_ps(dt_y, pm);
    dt_z = _mm256_mul_ps(dt_z, pm);

    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
    _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
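/* Editor sketch: scalar reference for the world->model rotation used in
   Step 0 above, v_local = q_conj * (0, v) * q with q = (x, y, z, w) the chain
   orientation; handy for validating one SIMD lane at a time: */
static void
rot_world_to_local(const float q[4], const float v[3], float out[3]) {
    /* t = q_conj * (0, v) */
    float tw = q[0]*v[0] + q[1]*v[1] + q[2]*v[2];
    float tx = q[3]*v[0] - q[1]*v[2] + q[2]*v[1];
    float ty = q[3]*v[1] - q[2]*v[0] + q[0]*v[2];
    float tz = q[3]*v[2] - q[0]*v[1] + q[1]*v[0];
    /* out = vector part of t * q */
    out[0] = tw*q[0] + tx*q[3] + ty*q[2] - tz*q[1];
    out[1] = tw*q[1] + ty*q[3] + tz*q[0] - tx*q[2];
    out[2] = tw*q[2] + tz*q[3] + tx*q[1] - ty*q[0];
}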
  16. vurtun revised this gist Jul 11, 2025. 1 changed file with 611 additions and 409 deletions.
    1,020 changes: 611 additions & 409 deletions chain.c
    @@ -1,96 +1,313 @@
    #include <immintrin.h>
    #include <math.h>
    #include <assert.h>
    #include <string.h>
    #include <float.h>

    enum {
    MAX_CHAINS = 64,
    PARTICLES_PER_CHAIN = 16,
    SPHERES_PER_CHAIN = 8,
    MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
    MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
    CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
    MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
    MAX_ITERATIONS = 4,
    MAX_CHNS = 64,
    PTC_PER_CHN = 16,
    SPH_PER_CHN = 8,
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 8,
    };

    struct chain_cfg {
    float damping;
    float stiffness;
    struct chn_cfg {
    float damping; // [0, 1]
    float stiffness; // [0, 1]
    float stretch_factor;
    float max_strain;
    float friction;
    float collision_radius;
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    } __attribute__((aligned(32)));

    struct chain_sim {
    unsigned short free_idx_cnt;
    unsigned short free_idx[MAX_CHAINS];
    struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain global forces (world space gravity and wind)
    float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));

    float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain transformations (world space)
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
    float friction; // [0, 1]
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    };
    struct chn_sim {
    unsigned char free_idx_cnt;
    unsigned char free_idx[MAX_CHNS];
    struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));

    // chain global forces (world space gravity and wind)
    float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));

    float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));

    // chain transformations (world space)
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));

    float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    // Particle positions (model space)
    float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));

    // particle positions (model space)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));

    float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));

    float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));

    // Sphere positions (model space)
    float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));

    // Rest positions (model space)
    float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));

    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));

    // rest positions (model space)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
    };

    void simulate_chains(struct chain_sim *cs, float dt) {
    // Common SIMD constants
    extern void
    chn_sim_init(struct chn_sim *cs) {
    assert(cs);
    cs->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
    cs->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 8) {
    _mm256_store_ps(&cs->chn_quat_w[i], _mm256_set1_ps(1.0f));
    _mm256_store_ps(&cs->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
    }
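/* Editor note: every array here is 32-byte aligned and accessed with aligned
   AVX loads/stores, so heap instances need matching alignment. Minimal sketch,
   assuming C11 aligned_alloc from <stdlib.h> and a hypothetical helper name: */
static struct chn_sim *
chn_sim_new(void) {
    struct chn_sim *cs = aligned_alloc(32, sizeof(struct chn_sim));
    if (cs) {
        memset(cs, 0, sizeof(*cs));
        chn_sim_init(cs);
    }
    return cs;
}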
    extern int
    chn_sim_add(struct chn_sim *cs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cs);
    assert(cfg);
    assert(cs->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);

int c = cs->free_idx[--cs->free_idx_cnt];
    cs->chn_cfg[c] = *cfg;

    int p_idx = (c * PTC_PER_CHN);
    int r_idx = (c * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cs->ptc_pos_x[p_idx + i] = rest_pos[i*3+0];
    cs->ptc_pos_y[p_idx + i] = rest_pos[i*3+1];
    cs->ptc_pos_z[p_idx + i] = rest_pos[i*3+2];

    cs->rest_pos_x[p_idx + i] = rest_pos[i*3+0];
    cs->rest_pos_y[p_idx + i] = rest_pos[i*3+1];
    cs->rest_pos_z[p_idx + i] = rest_pos[i*3+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cs->ptc_pos_x[p_idx + i] = 0.0f;
    cs->ptc_pos_y[p_idx + i] = 0.0f;
    cs->ptc_pos_z[p_idx + i] = 0.0f;
    }
    for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*3+0] - rest_pos[(i-1)*3+0];
    float dy = rest_pos[i*3+1] - rest_pos[(i-1)*3+1];
    float dz = rest_pos[i*3+2] - rest_pos[(i-1)*3+2];
    cs->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
for (int i = cnt - 1; i < CON_PER_CHN; ++i) { /* cnt particles -> cnt-1 constraints */
cs->ptc_rest_len[r_idx + i] = 0.0f;
}
    return c;
    }
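/* Editor sketch: building a chain from rest positions packed as xyz triplets.
   Config values are illustrative only; note that chn_sim_add leaves
   ptc_inv_mass zeroed (infinite mass), so assigning particle masses is up to
   the caller in this revision. */
static int
make_rope(struct chn_sim *cs) {
    struct chn_cfg cfg = {
        .damping = 0.1f, .stiffness = 1.0f, .stretch_factor = 1.0f,
        .max_strain = 0.1f, .friction = 0.3f, .linear_inertia = 1.0f,
        .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f,
    };
    float rest[4 * 3];
    for (int i = 0; i < 4; ++i) { /* four particles hanging down -y */
        rest[i*3+0] = 0.0f;
        rest[i*3+1] = -0.25f * (float)i;
        rest[i*3+2] = 0.0f;
    }
    return chn_sim_add(cs, &cfg, rest, 4);
}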
    extern void
    chn_sim_del(struct chn_sim *cs, int c) {
    assert(cs);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    int p_idx = (c * PTC_PER_CHN);
    int s_idx = (c * SPH_PER_CHN);
    int c_idx = (c * CON_PER_CHN);

    cs->free_idx[cs->free_idx_cnt++] = c;
    memset(&cs->chn_cfg[c], 0, sizeof(cs->chn_cfg[0]));

    cs->chn_grav_x[c] = 0.0f;
    cs->chn_grav_y[c] = 0.0f;
    cs->chn_grav_z[c] = 0.0f;

    cs->chn_wnd_x[c] = 0.0f;
    cs->chn_wnd_y[c] = 0.0f;
    cs->chn_wnd_z[c] = 0.0f;

    cs->chn_pos_x[c] = 0.0f;
    cs->chn_pos_y[c] = 0.0f;
    cs->chn_pos_z[c] = 0.0f;

    cs->chn_quat_x[c] = 0.0f;
    cs->chn_quat_y[c] = 0.0f;
    cs->chn_quat_z[c] = 0.0f;
    cs->chn_quat_w[c] = 1.0f;

    cs->chn_prev_pos_x[c] = 0.0f;
    cs->chn_prev_pos_y[c] = 0.0f;
    cs->chn_prev_pos_z[c] = 0.0f;

    cs->chn_prev_quat_x[c] = 0.0f;
    cs->chn_prev_quat_y[c] = 0.0f;
    cs->chn_prev_quat_z[c] = 0.0f;
    cs->chn_prev_quat_w[c] = 1.0f;

    _mm256_store_ps(&cs->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->sph_x[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_y[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_z[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->sph_r[s_idx], _mm256_set1_ps(0.0f));

    _mm256_store_ps(&cs->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cs->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
    }
    extern void
    chn_sim_mov(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) {
    assert(cs);
    assert(pos3);
    assert(rot4);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_pos_x[c] = pos3[0];
    cs->chn_pos_y[c] = pos3[1];
    cs->chn_pos_z[c] = pos3[2];

    cs->chn_quat_x[c] = rot4[0];
    cs->chn_quat_y[c] = rot4[1];
    cs->chn_quat_z[c] = rot4[2];
    cs->chn_quat_w[c] = rot4[3];
    }
    extern void
    chn_sim_tel(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) {
    assert(cs);
    assert(pos3);
    assert(rot4);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_pos_x[c] = cs->chn_prev_pos_x[c] = pos3[0];
    cs->chn_pos_y[c] = cs->chn_prev_pos_y[c] = pos3[1];
    cs->chn_pos_z[c] = cs->chn_prev_pos_z[c] = pos3[2];

    cs->chn_quat_x[c] = cs->chn_prev_quat_x[c] = rot4[0];
    cs->chn_quat_y[c] = cs->chn_prev_quat_y[c] = rot4[1];
    cs->chn_quat_z[c] = cs->chn_prev_quat_z[c] = rot4[2];
    cs->chn_quat_w[c] = cs->chn_prev_quat_w[c] = rot4[3];
    }
    extern void
    chn_sim_grav(struct chn_sim *cs, int c, const float *restrict g3) {
    assert(cs);
    assert(g3);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_grav_x[c] = g3[0];
    cs->chn_grav_y[c] = g3[1];
    cs->chn_grav_z[c] = g3[2];
    }
    extern void
    chn_sim_wind(struct chn_sim *cs, int c, const float *restrict w3) {
    assert(cs);
    assert(w3);
    assert(c >= 0);
    assert(c < MAX_CHNS);

    cs->chn_wnd_x[c] = w3[0];
    cs->chn_wnd_y[c] = w3[1];
    cs->chn_wnd_z[c] = w3[2];
    }
    extern void
    chn_sim_set_sph(struct chn_sim *cs, int c, const float *spheres, int cnt) {
    assert(cs);
    assert(spheres);
assert(cnt <= SPH_PER_CHN);

    int s_idx = (c * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
    cs->sph_x[s_idx + i] = spheres[i*4+0];
    cs->sph_y[s_idx + i] = spheres[i*4+1];
    cs->sph_z[s_idx + i] = spheres[i*4+2];
    cs->sph_r[s_idx + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cs->sph_x[s_idx + i] = 0;
    cs->sph_y[s_idx + i] = 0;
    cs->sph_z[s_idx + i] = 0;
    cs->sph_r[s_idx + i] = 0;
    }
    }
    extern void
    chn_sim_con_motion(struct chn_sim *cs, int c, const float *radius, int cnt) {
    assert(cs);
    assert(c >= 0);
    assert(c < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt < PTC_PER_CHN);

    int p_idx = (c * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
    cs->motion_radius[p_idx + i] = radius[i];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cs->motion_radius[p_idx + i] = FLT_MAX;
    }
    }
    extern void
    chn_sim_run(struct chn_sim *cs, float dt) {
    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    const __m256 one_vec = _mm256_set1_ps(1.0f);
    const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
    const __m256 eps_vec = _mm256_set1_ps(1e-6f);
    const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    @@ -99,59 +316,59 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0f;
    py_local[PARTICLES_PER_CHAIN] = 0.0f;
    pz_local[PARTICLES_PER_CHAIN] = 0.0f;
    pm_local[PARTICLES_PER_CHAIN] = 0.0f;
    px_lcl[PTC_PER_CHN] = 0.0f;
    py_lcl[PTC_PER_CHN] = 0.0f;
    pz_lcl[PTC_PER_CHN] = 0.0f;
    pm_lcl[PTC_PER_CHN] = 0.0f;

    // Stack arrays for precomputed data
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float vel_z[MAX_CHNS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHAINS; c += 8) {
    for (int c = 0; c < MAX_CHNS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Compute local-space wind
    {
    __m256 vx = _mm256_load_ps(&cs->wind_x[c]);
    __m256 vy = _mm256_load_ps(&cs->wind_y[c]);
    __m256 vz = _mm256_load_ps(&cs->wind_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
    __m256 vx = _mm256_load_ps(&cs->chn_wnd_x[c]);
    __m256 vy = _mm256_load_ps(&cs->chn_wnd_y[c]);
    __m256 vz = _mm256_load_ps(&cs->chn_wnd_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    @@ -165,33 +382,27 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
    // result_z = tw*qz + ty*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&wind_local_x[c], result_x);
    _mm256_store_ps(&wind_local_y[c], result_y);
    _mm256_store_ps(&wind_local_z[c], result_z);
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
    __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
    __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
    __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));

    _mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
    _mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
    _mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
    }
    // Compute local-space gravity
    {
    __m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
    __m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
    __m256 vz = _mm256_load_ps(&cs->gravity_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
    __m256 vx = _mm256_load_ps(&cs->chn_grav_x[c]);
    __m256 vy = _mm256_load_ps(&cs->chn_grav_y[c]);
    __m256 vz = _mm256_load_ps(&cs->chn_grav_z[c]);

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    @@ -213,32 +424,32 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // Step 2: res_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
    // result_z = tw*qz + ty*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&gravity_local_x[c], result_x);
    _mm256_store_ps(&gravity_local_y[c], result_y);
    _mm256_store_ps(&gravity_local_z[c], result_z);
    // res_x = tw*qx + tx*qw + ty*qz - tz*qy
    // res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx

    __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
    __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
    __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));

    _mm256_store_ps(&chn_grav_lcl_x[c], res_x);
    _mm256_store_ps(&chn_grav_lcl_y[c], res_y);
    _mm256_store_ps(&chn_grav_lcl_z[c], res_z);
    }
    // Compute linear velocity
    {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);
    __m256 curr_x = _mm256_load_ps(&cs->chn_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chn_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chn_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
    __m256 prev_x = _mm256_load_ps(&cs->chn_prev_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->chn_prev_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->chn_prev_pos_z[c]);

    __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
    __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
    @@ -250,21 +461,21 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    }
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]);

    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);
    __m256 prev_qx = _mm256_load_ps(&cs->chn_prev_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->chn_prev_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->chn_prev_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->chn_prev_quat_w[c]);

    // Step 1: Compute delta quaternion (to - from)
    __m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 delta_qw = _mm256_sub_ps(qw, prev_qw);
    __m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 dt_qw = _mm256_sub_ps(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
    @@ -274,20 +485,20 @@ void simulate_chains(struct chain_sim *cs, float dt) {

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);
    // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);

    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));

    __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
    __m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
    orient_qx = _mm256_add_ps(orient_qx, cross_x);
    __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
    __m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
    orient_qy = _mm256_add_ps(orient_qy, cross_y);
    __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
    __m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
    orient_qz = _mm256_add_ps(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    @@ -317,9 +528,9 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];

    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    @@ -339,26 +550,26 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]);
    __m256 p_ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]);
    p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    // Linear inertia: v * dt * linear_inertia
    __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
    __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
    ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
    ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
    ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
    ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
    ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    @@ -370,67 +581,67 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    // Calculate displacement: (ω × (ω × r)) * dt^2
    __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);
    __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
    __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
    __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);
    __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
    __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
    __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);

    // Total delta
    __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
    __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
    __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);
    __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
    __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
    __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);

    // Update positions
    _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
    _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
    _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
    _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
    }
    // Update previous transformation
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    cs->chn_prev_pos_x[c] = cs->chn_pos_x[c];
    cs->chn_prev_pos_y[c] = cs->chn_pos_y[c];
    cs->chn_prev_pos_z[c] = cs->chn_pos_z[c];

    cs->chn_prev_quat_x[c] = cs->chn_quat_x[c];
    cs->chn_prev_quat_y[c] = cs->chn_quat_y[c];
    cs->chn_prev_quat_z[c] = cs->chn_quat_z[c];
    cs->chn_prev_quat_w[c] = cs->chn_quat_w[c];
    }
    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];

    __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
    __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
    __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);
    __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
    __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
    __m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);

    __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
    __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
    __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
    __m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
    __m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
    __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
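    // Annotation: inverse mass is re-clamped on every load rather than once up
    // front; with a lower bound of zero (as in the ISPC port) pinned particles
    // stay immovable, while the upper bound caps how far a near-massless
    // particle can be thrown by a single correction.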

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);
    __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[base_idx + i]);

    __m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
    __m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
    __m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);
    __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
    __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
    __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);

    __m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
    __m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);
    __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
    __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    @@ -448,49 +659,47 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);
    _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], new_p_z);

    _mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
    _mm256_store_ps(&cs->ptc_prev_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cs->ptc_prev_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cs->ptc_prev_pos_z[base_idx + i], p_curr_z);
    }
    }
    // Step 3: Distance Constraints
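    // Annotation: each link is a classic PBD distance projection,
    //   C = |p1 - p0| - rest_len,  dp_i = +/- (w_i / (w0 + w1)) * k * s * C * n
    // with n the unit link direction, s = stretch_factor, and one twist: k is
    // softened to stiffness / (1 + |strain|), strain = C / rest_len clamped to
    // +/- max_strain, so heavily stretched links correct more gently.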
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int r_base = c * CONSTRAINTS_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness);
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;

    struct chn_cfg *cfg = &cs->chn_cfg[c];
    __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
    __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
    __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
    __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);

    _mm256_store_ps(&px_local[i], px);
    _mm256_store_ps(&py_local[i], py);
    _mm256_store_ps(&pz_local[i], pz);
    _mm256_store_ps(&pm_local[i], pm);
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
    _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cs->ptc_pos_x[p_base + i]));
    _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cs->ptc_pos_y[p_base + i]));
    _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cs->ptc_pos_z[p_base + i]));
    _mm256_store_ps(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
    for (int iter = 0; iter < MAX_ITR; ++iter) {
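    // Annotation: the 15 links of a 16-particle chain are relaxed in two
    // batches per sweep: lanes 0..7 solve links 0..7, then links 8..14 run with
    // the unused 8th lane masked out. Link j and link j+1 share a particle, so
    // within a batch the overlapping p0/p1 stores mean the later write wins;
    // the MAX_ITR sweeps are what reconcile the shared particles. Note p1 sits
    // at offset +1, so its loads and stores must use the unaligned variants.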
    // First block (i=0 to 7)
    {
    __m256 p0x = _mm256_load_ps(&px_local[0]);
    __m256 p0y = _mm256_load_ps(&py_local[0]);
    __m256 p0z = _mm256_load_ps(&pz_local[0]);
    __m256 p0m = _mm256_load_ps(&pm_local[0]);
    __m256 p1x = _mm256_loadu_ps(&px_local[1]);
    __m256 p1y = _mm256_loadu_ps(&py_local[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[1]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);
    __m256 p0x = _mm256_load_ps(&px_lcl[0]);
    __m256 p0y = _mm256_load_ps(&py_lcl[0]);
    __m256 p0z = _mm256_load_ps(&pz_lcl[0]);
    __m256 p0m = _mm256_load_ps(&pm_lcl[0]);

    __m256 p1x = _mm256_loadu_ps(&px_lcl[1]);
    __m256 p1y = _mm256_loadu_ps(&py_lcl[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base]);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -499,7 +708,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));

    __m256 w_sum = _mm256_add_ps(p0m, p1m);
    @@ -512,47 +720,47 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
    __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));

    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));
    __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));

    __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
    corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum);
    __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part);
    __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
    corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
    __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    __m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist);
    __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude);
    __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
    __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
    __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
    dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
    dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
    dt_z = _mm256_and_ps(dt_z, apply_corr_mask);

    _mm256_store_ps(&px_local[0], _mm256_fmadd_ps(delta_x, p0m, p0x));
    _mm256_store_ps(&py_local[0], _mm256_fmadd_ps(delta_y, p0m, p0y));
    _mm256_store_ps(&pz_local[0], _mm256_fmadd_ps(delta_z, p0m, p0z));
    _mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x));
    _mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y));
    _mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z));

    _mm256_storeu_ps(&px_local[1], _mm256_fnmadd_ps(delta_x, p1m, p1x));
    _mm256_storeu_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y));
    _mm256_storeu_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z));
    _mm256_storeu_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x));
    _mm256_storeu_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y));
    _mm256_storeu_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z));
    }
    // Second block (i=8 to 14)
    {
    __m256 p0x = _mm256_load_ps(&px_local[8]);
    __m256 p0y = _mm256_load_ps(&py_local[8]);
    __m256 p0z = _mm256_load_ps(&pz_local[8]);
    __m256 p0m = _mm256_load_ps(&pm_local[8]);
    __m256 p0x = _mm256_load_ps(&px_lcl[8]);
    __m256 p0y = _mm256_load_ps(&py_lcl[8]);
    __m256 p0z = _mm256_load_ps(&pz_lcl[8]);
    __m256 p0m = _mm256_load_ps(&pm_lcl[8]);

    __m256 p1x = _mm256_loadu_ps(&px_local[9]);
    __m256 p1y = _mm256_loadu_ps(&py_local[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[9]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);
    __m256 p1x = _mm256_loadu_ps(&px_lcl[9]);
    __m256 p1y = _mm256_loadu_ps(&py_lcl[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]);
    __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base + 8]);

    __m256 dx = _mm256_sub_ps(p1x, p0x);
    __m256 dy = _mm256_sub_ps(p1y, p0y);
    @@ -561,7 +769,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));

    __m256 w_sum = _mm256_add_ps(p0m, p1m);
    @@ -574,77 +781,73 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
    __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));

    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));
    __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));

    __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
    corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum);
    __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part);
    __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
    corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
    __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    // Lane mask for the 7 valid constraints: build all-ones bit patterns via
    // integers (-1.0f has bit pattern 0xBF800000 and would corrupt the deltas
    // when ANDed in, rather than passing them through).
    __m256 valid_mask = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
    apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);

    __m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist);
    __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
    __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
    __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);

    __m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude);
    __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
    __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
    __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
    dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
    dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
    dt_z = _mm256_and_ps(dt_z, apply_corr_mask);

    _mm256_store_ps(&px_local[8], _mm256_fmadd_ps(delta_x, p0m, p0x));
    _mm256_store_ps(&py_local[8], _mm256_fmadd_ps(delta_y, p0m, p0y));
    _mm256_store_ps(&pz_local[8], _mm256_fmadd_ps(delta_z, p0m, p0z));
    _mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x));
    _mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y));
    _mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z));

    _mm256_storeu_ps(&px_local[9], _mm256_fnmadd_ps(delta_x, p1m, p1x));
    _mm256_storeu_ps(&py_local[9], _mm256_fnmadd_ps(delta_y, p1m, p1y));
    _mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z));
    _mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x));
    _mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y));
    _mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z));
    }
    }
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&px_local[i]);
    __m256 py = _mm256_load_ps(&py_local[i]);
    __m256 pz = _mm256_load_ps(&pz_local[i]);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], px);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], py);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], pz);
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i]));
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i]));
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i]));
    }
    }
    // Step 4: Sphere Collisions with Friction
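    // Annotation: each sphere is resolved positionally. A particle at depth
    // pen = radius - dist > 0 is pushed out along the contact normal, and a
    // friction correction proportional to pen * cfg->friction is subtracted
    // along the tangential part of the Verlet velocity (pos - prev_pos),
    // damping sliding on the surface; the whole correction is scaled by the
    // clamped inverse mass.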
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int s_base = c * SPHERES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 friction_vec = _mm256_set1_ps(cfg->friction);

    for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
    __m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
    __m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
    __m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
    __m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    struct chn_cfg *cfg = &cs->chn_cfg[c];
    __m256 fric_vec = _mm256_set1_ps(cfg->friction);

    for (int s = 0; s < SPH_PER_CHN; ++s) {
    __m256 sph_x_vec = _mm256_set1_ps(cs->sph_x[s_base + s]);
    __m256 sph_y_vec = _mm256_set1_ps(cs->sph_y[s_base + s]);
    __m256 sph_z_vec = _mm256_set1_ps(cs->sph_z[s_base + s]);
    __m256 sph_r_vec = _mm256_set1_ps(cs->sph_r[s_base + s]);

    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
    __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[p_base + i]);
    __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
    __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    @@ -653,77 +856,76 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);
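    // Annotation: rsqrt/rcp are ~12-bit approximations, so dist carries a
    // small relative error. That is acceptable here because pen only gates and
    // scales a positional correction, but a Newton-Raphson refinement step
    // would be the usual fix if contacts ever jitter.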

    __m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
    __m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
    __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
    __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 normal_x = _mm256_mul_ps(dx, inv_dist);
    __m256 normal_y = _mm256_mul_ps(dy, inv_dist);
    __m256 normal_z = _mm256_mul_ps(dz, inv_dist);
    __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
    __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
    __m256 norm_z = _mm256_mul_ps(dz, inv_dist);

    __m256 normal_corr_x = _mm256_mul_ps(normal_x, penetration);
    __m256 normal_corr_y = _mm256_mul_ps(normal_y, penetration);
    __m256 normal_corr_z = _mm256_mul_ps(normal_z, penetration);
    __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
    __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
    __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
    vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
    vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);
    __m256 vel_dot_norm = _mm256_mul_ps(vel_x, norm_x);
    vel_dot_norm = _mm256_fmadd_ps(vel_y, norm_y, vel_dot_norm);
    vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, vel_dot_norm);

    __m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
    __m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
    __m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));
    __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
    __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
    __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));

    __m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);
    __m256 tan_mag_sq = _mm256_mul_ps(tan_x, tan_x);
    tan_mag_sq = _mm256_fmadd_ps(tan_y, tan_y, tan_mag_sq);
    tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, tan_mag_sq);

    __m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));
    __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));

    __m256 friction_mag = _mm256_mul_ps(penetration, friction_vec);
    __m256 friction_x = _mm256_mul_ps(_mm256_mul_ps(tangent_x, inv_tangent_mag), friction_mag);
    __m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag);
    __m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag);
    __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
    __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
    __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
    __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);

    __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x);
    __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y);
    __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z);
    __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
    __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
    __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);

    total_corr_x = _mm256_and_ps(total_corr_x, collision_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, collision_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, collision_mask);
    total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
    total_corr_z = _mm256_and_ps(total_corr_z, col_mask);

    total_corr_x = _mm256_mul_ps(total_corr_x, p_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, p_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, p_inv_mass);
    total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
    total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
    total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
    __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
    __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], new_p_x);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], new_p_y);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], new_p_z);
    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], new_p_x);
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], new_p_y);
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], new_p_z);
    }
    }
    }

    // Step 5: Motion Constraints
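    // Annotation: the motion constraint clamps each particle into a sphere of
    // motion_radius[i] around its rest position: when pen = dist - radius > 0
    // the particle is pulled back along the offset by pen, scaled by its
    // clamped inverse mass, which caps how far the simulation may drift from
    // the authored pose.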
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);
    for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    for (int i = 0; i < PTC_PER_CHN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
    __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
    __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
    __m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);
    __m256 r_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    @@ -736,23 +938,23 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);

    __m256 penetration = _mm256_sub_ps(dist, radius_vec);
    __m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
    __m256 pen = _mm256_sub_ps(dist, r_vec);
    __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);

    __m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
    __m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
    corr_factor = _mm256_and_ps(corr_factor, mask);

    __m256 delta_x = _mm256_mul_ps(dx, corr_factor);
    __m256 delta_y = _mm256_mul_ps(dy, corr_factor);
    __m256 delta_z = _mm256_mul_ps(dz, corr_factor);
    __m256 dt_x = _mm256_mul_ps(dx, corr_factor);
    __m256 dt_y = _mm256_mul_ps(dy, corr_factor);
    __m256 dt_z = _mm256_mul_ps(dz, corr_factor);

    delta_x = _mm256_mul_ps(delta_x, pm);
    delta_y = _mm256_mul_ps(delta_y, pm);
    delta_z = _mm256_mul_ps(delta_z, pm);
    dt_x = _mm256_mul_ps(dt_x, pm);
    dt_y = _mm256_mul_ps(dt_y, pm);
    dt_z = _mm256_mul_ps(dt_z, pm);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
    _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x));
    _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y));
    _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
    }
    }
    }
  17. vurtun revised this gist Jul 4, 2025. 1 changed file with 0 additions and 7 deletions.
    7 changes: 0 additions & 7 deletions chain.ispc
    @@ -141,7 +141,6 @@ export void simulate_chains(
    assume(MAX_REST_LENGTHS % programCount == 0);

    // Local arrays for PBD solver
    // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices
    __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
    @@ -171,7 +170,6 @@ export void simulate_chains(
    __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];

    // Step 0: Precomputation (SIMD, process chains in groups of programCount)
    // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon)
    for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
    // Load chain quaternions
    varying float qx = chain_quat_x_ptr[c + programIndex];
    @@ -295,7 +293,6 @@ export void simulate_chains(
    }

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Process one chain at a time, particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
    @@ -375,7 +372,6 @@ export void simulate_chains(
    }

    // Step 2: Verlet Integration
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
    @@ -437,7 +433,6 @@ export void simulate_chains(
    }

    // Step 3: Distance Constraints
    // Process one chain at a time, constraints in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
    @@ -538,7 +533,6 @@ export void simulate_chains(
    }
    }
    // Step 4: Sphere Collisions with Friction
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int s_base = c * SPHERES_PER_CHAIN;
    @@ -607,7 +601,6 @@ export void simulate_chains(
    }
    }
    // Step 5: Motion Constraints
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
  18. vurtun revised this gist Jul 3, 2025. 1 changed file with 0 additions and 29 deletions.
    29 changes: 0 additions & 29 deletions chain.c
    @@ -127,35 +127,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0f;
    py_local[PARTICLES_PER_CHAIN] = 0.0f;
    pz_local[PARTICLES_PER_CHAIN] = 0.0f;
    pm_local[PARTICLES_PER_CHAIN] = 0.0f;

    // Stack arrays for precomputed data
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHAINS; c += 8) {
    // Load chain quaternions
  19. vurtun revised this gist Jul 2, 2025. 1 changed file with 612 additions and 612 deletions.
    1,224 changes: 612 additions & 612 deletions chain.ispc
    @@ -9,636 +9,636 @@

    // Define chain configuration structure
    struct chain_cfg {
    uniform float damping;
    uniform float stiffness;
    uniform float stretch_factor;
    uniform float max_strain;
    uniform float friction;
    uniform float collision_radius;
    uniform float linear_inertia; // [0, 1]
    uniform float angular_inertia; // [0, 1]
    uniform float centrifugal_inertia;// [0, 1]
    uniform float damping;
    uniform float stiffness;
    uniform float stretch_factor;
    uniform float max_strain;
    uniform float friction;
    uniform float collision_radius;
    uniform float linear_inertia; // [0, 1]
    uniform float angular_inertia; // [0, 1]
    uniform float centrifugal_inertia;// [0, 1]
    };

    // Define chain simulation structure
    struct chain_sim {
    uint8 free_idx_cnt;
    uint8 free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS];

    // Chain global forces (world space gravity and wind)
    uniform float gravity_x[MAX_CHAINS];
    uniform float gravity_y[MAX_CHAINS];
    uniform float gravity_z[MAX_CHAINS];

    uniform float wind_x[MAX_CHAINS];
    uniform float wind_y[MAX_CHAINS];
    uniform float wind_z[MAX_CHAINS];

    // Chain transformations (world space)
    uniform float chain_pos_x[MAX_CHAINS];
    uniform float chain_pos_y[MAX_CHAINS];
    uniform float chain_pos_z[MAX_CHAINS];

    uniform float chain_quat_x[MAX_CHAINS];
    uniform float chain_quat_y[MAX_CHAINS];
    uniform float chain_quat_z[MAX_CHAINS];
    uniform float chain_quat_w[MAX_CHAINS];

    uniform float prev_chain_pos_x[MAX_CHAINS];
    uniform float prev_chain_pos_y[MAX_CHAINS];
    uniform float prev_chain_pos_z[MAX_CHAINS];

    uniform float prev_chain_quat_x[MAX_CHAINS];
    uniform float prev_chain_quat_y[MAX_CHAINS];
    uniform float prev_chain_quat_z[MAX_CHAINS];
    uniform float prev_chain_quat_w[MAX_CHAINS];

    // Particle positions (model space)
    uniform float p_pos_x[MAX_PARTICLES];
    uniform float p_pos_y[MAX_PARTICLES];
    uniform float p_pos_z[MAX_PARTICLES];
    uniform float prev_p_pos_x[MAX_PARTICLES];
    uniform float prev_p_pos_y[MAX_PARTICLES];
    uniform float prev_p_pos_z[MAX_PARTICLES];

    uniform float inv_mass[MAX_PARTICLES];
    uniform float rest_lengths[MAX_REST_LENGTHS];

    // Sphere positions (model space)
    uniform float sphere_x[MAX_SPHERES];
    uniform float sphere_y[MAX_SPHERES];
    uniform float sphere_z[MAX_SPHERES];
    uniform float sphere_radius[MAX_SPHERES];

    // Rest positions (model space)
    uniform float rest_pos_x[MAX_PARTICLES];
    uniform float rest_pos_y[MAX_PARTICLES];
    uniform float rest_pos_z[MAX_PARTICLES];
    uniform float motion_radius[MAX_PARTICLES];
    uint8 free_idx_cnt;
    uint8 free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS];

    // Chain global forces (world space gravity and wind)
    uniform float gravity_x[MAX_CHAINS];
    uniform float gravity_y[MAX_CHAINS];
    uniform float gravity_z[MAX_CHAINS];

    uniform float wind_x[MAX_CHAINS];
    uniform float wind_y[MAX_CHAINS];
    uniform float wind_z[MAX_CHAINS];

    // Chain transformations (world space)
    uniform float chain_pos_x[MAX_CHAINS];
    uniform float chain_pos_y[MAX_CHAINS];
    uniform float chain_pos_z[MAX_CHAINS];

    uniform float chain_quat_x[MAX_CHAINS];
    uniform float chain_quat_y[MAX_CHAINS];
    uniform float chain_quat_z[MAX_CHAINS];
    uniform float chain_quat_w[MAX_CHAINS];

    uniform float prev_chain_pos_x[MAX_CHAINS];
    uniform float prev_chain_pos_y[MAX_CHAINS];
    uniform float prev_chain_pos_z[MAX_CHAINS];

    uniform float prev_chain_quat_x[MAX_CHAINS];
    uniform float prev_chain_quat_y[MAX_CHAINS];
    uniform float prev_chain_quat_z[MAX_CHAINS];
    uniform float prev_chain_quat_w[MAX_CHAINS];

    // Particle positions (model space)
    uniform float p_pos_x[MAX_PARTICLES];
    uniform float p_pos_y[MAX_PARTICLES];
    uniform float p_pos_z[MAX_PARTICLES];

    uniform float prev_p_pos_x[MAX_PARTICLES];
    uniform float prev_p_pos_y[MAX_PARTICLES];
    uniform float prev_p_pos_z[MAX_PARTICLES];

    uniform float inv_mass[MAX_PARTICLES];
    uniform float rest_lengths[MAX_REST_LENGTHS];

    // Sphere positions (model space)
    uniform float sphere_x[MAX_SPHERES];
    uniform float sphere_y[MAX_SPHERES];
    uniform float sphere_z[MAX_SPHERES];
    uniform float sphere_radius[MAX_SPHERES];

    // Rest positions (model space)
    uniform float rest_pos_x[MAX_PARTICLES];
    uniform float rest_pos_y[MAX_PARTICLES];
    uniform float rest_pos_z[MAX_PARTICLES];
    uniform float motion_radius[MAX_PARTICLES];
    };

    // ISPC task for chain simulation
    export void simulate_chains(
    __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr,
    __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs
    __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr,
    uniform float dt) {
    // Common SIMD constants (varying for vector operations across lanes)
    varying float dt_vec = dt;
    varying float dt2_vec = dt * dt;
    varying float one_vec = 1.0;
    varying float neg_one_vec = -1.0;
    varying float eps_vec = 1e-6;
    varying float inv_mass_clamp_min = 0.0;
    varying float inv_mass_clamp_max = 1e6;
    varying float zero_vec = 0.0;
    varying float inv_dt_vec = 1.0 / dt;
    varying float two_vec = 2.0;
    varying float inertia_clamp_min = 0.0;
    varying float inertia_clamp_max = 1.0;

    assume(MAX_CHAINS % programCount == 0);
    assume(PARTICLES_PER_CHAIN % programCount == 0);
    assume(SPHERES_PER_CHAIN % programCount == 0);
    assume(MAX_SPHERES % programCount == 0);
    assume(CONSTRAINTS_PER_CHAIN % programCount == 0);
    assume(MAX_REST_LENGTHS % programCount == 0);

    // Local arrays for PBD solver
    // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices
    __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1];

    // All lanes can safely write to these uniform locations.
    px_local[PARTICLES_PER_CHAIN] = 0.0;
    py_local[PARTICLES_PER_CHAIN] = 0.0;
    pz_local[PARTICLES_PER_CHAIN] = 0.0;
    pm_local[PARTICLES_PER_CHAIN] = 0.0;

    // Stack arrays for precomputed data
    __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];

    // Step 0: Precomputation (SIMD, process chains in groups of programCount)
    // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon)
    for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
    // Load chain quaternions
    varying float qx = chain_quat_x_ptr[c + programIndex];
    varying float qy = chain_quat_y_ptr[c + programIndex];
    varying float qz = chain_quat_z_ptr[c + programIndex];
    varying float qw = chain_quat_w_ptr[c + programIndex];

    // Compute local-space wind
    {
    varying float vx = wind_x_ptr[c + programIndex];
    varying float vy = wind_y_ptr[c + programIndex];
    varying float vz = wind_z_ptr[c + programIndex];

    // Create q_conjugate components
    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    // Step 1: t = q_conj * v_world_as_quat
    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Step 2: result_vec = (t * q)_vec
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    wind_local_x[c + programIndex] = result_x;
    wind_local_y[c + programIndex] = result_y;
    wind_local_z[c + programIndex] = result_z;
    }

    // Compute local-space gravity
    {
    varying float vx = gravity_x_ptr[c + programIndex];
    varying float vy = gravity_y_ptr[c + programIndex];
    varying float vz = gravity_z_ptr[c + programIndex];

    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Corrected quaternion multiplication for gravity_local
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    gravity_local_x[c + programIndex] = result_x;
    gravity_local_y[c + programIndex] = result_y;
    gravity_local_z[c + programIndex] = result_z;
    }

    // Compute linear velocity
    {
    varying float curr_x = chain_pos_x_ptr[c + programIndex];
    varying float curr_y = chain_pos_y_ptr[c + programIndex];
    varying float curr_z = chain_pos_z_ptr[c + programIndex];
    varying float prev_x = prev_chain_pos_x_ptr[c + programIndex];
    varying float prev_y = prev_chain_pos_y_ptr[c + programIndex];
    varying float prev_z = prev_chain_pos_z_ptr[c + programIndex];

    varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
    varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
    varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;
    vel_x[c + programIndex] = vel_x_vec;
    vel_y[c + programIndex] = vel_y_vec;
    vel_z[c + programIndex] = vel_z_vec;
    }

    // Compute angular velocity
    {
    varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex];
    varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex];
    varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex];
    varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex];

    // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation
    varying float delta_qx = qx - prev_qx;
    varying float delta_qy = qy - prev_qy;
    varying float delta_qz = qz - prev_qz;
    varying float delta_qw = qw - prev_qw;

    // Step 2: Compute conjugate of previous quaternion
    varying float conj_qx = -prev_qx;
    varying float conj_qy = -prev_qy;
    varying float conj_qz = -prev_qz;
    varying float conj_qw = prev_qw;
    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;
    varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
    varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
    varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;
    varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
    varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
    varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;
    // Step 4: Compute dot product (to, from) for shortest arc check
    varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;
    // Step 5: Negate orientation delta if dot < 0
    varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
    orient_qx *= negate_sign;
    orient_qy *= negate_sign;
    orient_qz *= negate_sign;

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    varying float scale = one_vec / (two_vec * dt_vec);
    varying float ang_vel_x_vec = orient_qx * scale;
    varying float ang_vel_y_vec = orient_qy * scale;
    varying float ang_vel_z_vec = orient_qz * scale;

    ang_vel_x[c + programIndex] = ang_vel_x_vec;
    ang_vel_y[c + programIndex] = ang_vel_y_vec;
    ang_vel_z[c + programIndex] = ang_vel_z_vec;
    }
    __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr,
    __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs
    __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr,
    uniform float dt) {
    // Common SIMD constants (varying for vector operations across lanes)
    varying float dt_vec = dt;
    varying float dt2_vec = dt * dt;
    varying float one_vec = 1.0;
    varying float neg_one_vec = -1.0;
    varying float eps_vec = 1e-6;
    varying float inv_mass_clamp_min = 0.0;
    varying float inv_mass_clamp_max = 1e6;
    varying float zero_vec = 0.0;
    varying float inv_dt_vec = 1.0 / dt;
    varying float two_vec = 2.0;
    varying float inertia_clamp_min = 0.0;
    varying float inertia_clamp_max = 1.0;

    assume(MAX_CHAINS % programCount == 0);
    assume(PARTICLES_PER_CHAIN % programCount == 0);
    assume(SPHERES_PER_CHAIN % programCount == 0);
    assume(MAX_SPHERES % programCount == 0);
    assume(CONSTRAINTS_PER_CHAIN % programCount == 0);
    assume(MAX_REST_LENGTHS % programCount == 0);

    // Local arrays for PBD solver
    // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices
    __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1];

    // All lanes can safely write to these uniform locations.
    px_local[PARTICLES_PER_CHAIN] = 0.0;
    py_local[PARTICLES_PER_CHAIN] = 0.0;
    pz_local[PARTICLES_PER_CHAIN] = 0.0;
    pm_local[PARTICLES_PER_CHAIN] = 0.0;

    // Stack arrays for precomputed data
    __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];

    // Step 0: Precomputation (SIMD, process chains in groups of programCount)
    // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon)
    for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
    // Load chain quaternions
    varying float qx = chain_quat_x_ptr[c + programIndex];
    varying float qy = chain_quat_y_ptr[c + programIndex];
    varying float qz = chain_quat_z_ptr[c + programIndex];
    varying float qw = chain_quat_w_ptr[c + programIndex];

    // Compute local-space wind
    {
    varying float vx = wind_x_ptr[c + programIndex];
    varying float vy = wind_y_ptr[c + programIndex];
    varying float vz = wind_z_ptr[c + programIndex];

    // Create q_conjugate components
    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    // Step 1: t = q_conj * v_world_as_quat
    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Step 2: result_vec = (t * q)_vec
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    wind_local_x[c + programIndex] = result_x;
    wind_local_y[c + programIndex] = result_y;
    wind_local_z[c + programIndex] = result_z;
    }
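    // Annotation: the two steps above rotate the world-space vector into
    // chain-local space as v' = conj(q) * v * q, expanded into scalar
    // quaternion products so every lane works on an independent chain; tw is
    // the scalar part of t = conj(q) * (0, v) and is consumed when forming
    // the vector part of t * q.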

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Process one chain at a time, particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load velocities and inertia (scalar, broadcast to lanes)
    varying float vel_x_vec = vel_x[c];
    varying float vel_y_vec = vel_y[c];
    varying float vel_z_vec = vel_z[c];
    // Compute local-space gravity
    {
    varying float vx = gravity_x_ptr[c + programIndex];
    varying float vy = gravity_y_ptr[c + programIndex];
    varying float vz = gravity_z_ptr[c + programIndex];

    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Corrected quaternion multiplication for gravity_local
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    gravity_local_x[c + programIndex] = result_x;
    gravity_local_y[c + programIndex] = result_y;
    gravity_local_z[c + programIndex] = result_z;
    }

    varying float ang_vel_x_vec = ang_vel_x[c];
    varying float ang_vel_y_vec = ang_vel_y[c];
    varying float ang_vel_z_vec = ang_vel_z[c];
    // Compute linear velocity
    {
    varying float curr_x = chain_pos_x_ptr[c + programIndex];
    varying float curr_y = chain_pos_y_ptr[c + programIndex];
    varying float curr_z = chain_pos_z_ptr[c + programIndex];
    varying float prev_x = prev_chain_pos_x_ptr[c + programIndex];
    varying float prev_y = prev_chain_pos_y_ptr[c + programIndex];
    varying float prev_z = prev_chain_pos_z_ptr[c + programIndex];

    varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
    varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
    varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;
    vel_x[c + programIndex] = vel_x_vec;
    vel_y[c + programIndex] = vel_y_vec;
    vel_z[c + programIndex] = vel_z_vec;
    }

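    // The block below recovers angular velocity from the frame-to-frame
    // orientation change: with q_dot ~= (q - q_prev) / dt and the identity
    // w = 2 * vec(q_dot * conj(q_prev)), small per-frame rotations give
    //   w ~= 2 * vec((q - q_prev) * conj(q_prev)) / dt.
    // The dot-product test flips the delta onto the shortest arc so a q/-q
    // double-cover flip is not read as a huge spin. NOTE: the code scales by
    // 1/(2*dt), 4x smaller than the identity above suggests, which acts as
    // built-in damping on inherited spin; flagged here in case it is
    // unintentional.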
    // Compute angular velocity
    {
    varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex];
    varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex];
    varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex];
    varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex];

    // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation
    varying float delta_qx = qx - prev_qx;
    varying float delta_qy = qy - prev_qy;
    varying float delta_qz = qz - prev_qz;
    varying float delta_qw = qw - prev_qw;

    // Step 2: Compute conjugate of previous quaternion
    varying float conj_qx = -prev_qx;
    varying float conj_qy = -prev_qy;
    varying float conj_qz = -prev_qz;
    varying float conj_qw = prev_qw;
    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;
    varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
    varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
    varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;
    varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
    varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
    varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;
    // Step 4: Compute dot product (to, from) for shortest arc check
    varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;
    // Step 5: Negate orientation delta if dot < 0
    varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
    orient_qx *= negate_sign;
    orient_qy *= negate_sign;
    orient_qz *= negate_sign;

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    varying float scale = one_vec / (two_vec * dt_vec);
    varying float ang_vel_x_vec = orient_qx * scale;
    varying float ang_vel_y_vec = orient_qy * scale;
    varying float ang_vel_z_vec = orient_qz * scale;

    ang_vel_x[c + programIndex] = ang_vel_x_vec;
    ang_vel_y[c + programIndex] = ang_vel_y_vec;
    ang_vel_z[c + programIndex] = ang_vel_z_vec;
    }
    }

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Process one chain at a time, particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load velocities and inertia (scalar, broadcast to lanes)
    varying float vel_x_vec = vel_x[c];
    varying float vel_y_vec = vel_y[c];
    varying float vel_z_vec = vel_z[c];

    varying float ang_vel_x_vec = ang_vel_x[c];
    varying float ang_vel_y_vec = ang_vel_y[c];
    varying float ang_vel_z_vec = ang_vel_z[c];

    varying float linear_inertia_vec = cfg->linear_inertia;
    varying float angular_inertia_vec = cfg->angular_inertia;
    varying float centrifugal_inertia_vec = cfg->centrifugal_inertia;

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max);
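    // Per particle, the emitter's motion is injected as a position offset:
    //   dx = v*dt*k_lin + cross(w, r)*dt*k_ang + cross(w, cross(w, r))*dt^2*k_cent
    // with r the particle's model-space position. Note that p_inv_mass is
    // loaded and clamped below but never folded into these deltas: every
    // particle inherits the same fraction of the emitter's motion.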

    // Process particles in groups
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[base_idx + i + programIndex];
    varying float py = p_pos_y_ptr[base_idx + i + programIndex];
    varying float pz = p_pos_z_ptr[base_idx + i + programIndex];
    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    // Linear inertia: v * dt * linear_inertia
    varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec;

    // Angular inertia: (ω × r) * dt * angular_inertia
    varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px;

    ang_delta_x *= dt_vec * angular_inertia_vec;
    ang_delta_y *= dt_vec * angular_inertia_vec;
    ang_delta_z *= dt_vec * angular_inertia_vec;

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px;
    varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y;
    varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z;
    varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x;

    varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec;

    // Total delta
    varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x;
    varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y;
    varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z;

    // Update positions
    p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x;
    p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y;
    p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z;
    }

    // Update previous transformation (scalar)
    prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c];
    prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c];
    prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c];

    prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c];
    prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c];
    prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c];
    prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c];
    }

    // Step 2: Verlet Integration
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load forces and damping (scalar, broadcast to lanes)
    varying float gravity_x_vec = gravity_local_x[c];
    varying float gravity_y_vec = gravity_local_y[c];
    varying float gravity_z_vec = gravity_local_z[c];

    varying float wind_x_vec = wind_local_x[c];
    varying float wind_y_vec = wind_local_y[c];
    varying float wind_z_vec = wind_local_z[c];

    varying float damping_vec = cfg->damping;
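    // Position-based Verlet step:
    //   x_{t+1} = x_t + (x_t - x_{t-1}) * damping + a * dt^2
    // Velocity lives implicitly in the (x_t, x_{t-1}) pair; damping < 1
    // bleeds energy, and gravity/wind arrive here already rotated into
    // chain-local space by Step 0.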
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex];

    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex];

    // Compute forces and accelerations
    varying float force_x = gravity_x_vec + wind_x_vec;
    varying float force_y = gravity_y_vec + wind_y_vec;
    varying float force_z = gravity_z_vec + wind_z_vec;

    varying float acc_x = force_x * p_inv_mass;
    varying float acc_y = force_y * p_inv_mass;
    varying float acc_z = force_z * p_inv_mass;

    // Compute velocity with damping
    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;

    varying float damped_vel_x = v_x * damping_vec;
    varying float damped_vel_y = v_y * damping_vec;
    varying float damped_vel_z = v_z * damping_vec;

    // Update positions: p_curr + damped_vel + acc * dt^2
    varying float term_accel_x = acc_x * dt2_vec;
    varying float term_accel_y = acc_y * dt2_vec;
    varying float term_accel_z = acc_z * dt2_vec;

    p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
    p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
    p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;

    // Update previous positions
    prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x;
    prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y;
    prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z;
    }
    }

    // Step 3: Distance Constraints
    // Process one chain at a time, constraints in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    varying float stiffness_vec = cfg->stiffness;
    varying float stretch_factor_vec = cfg->stretch_factor;
    varying float max_strain_vec_abs = cfg->max_strain;
    varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec;
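    // Each rod corrects toward its rest length with a strain-aware gain:
    //   strain = (dist - rest_len) / rest_len, clamped to +/-max_strain
    //   k_eff  = stiffness / (1 + |strain|)
    // so heavily stretched links respond proportionally less and the solver
    // does not overshoot. The step is scaled by stretch_factor and split by
    // inverse mass, so pinned particles (inv_mass = 0) never move.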

    // Copy particles to local arrays with uniform indexing
    // This loop now performs a vectorized load into the uniform px_local array.
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from global memory into varying registers
    varying float current_px = p_pos_x_ptr[p_base + i + programIndex];
    varying float current_py = p_pos_y_ptr[p_base + i + programIndex];
    varying float current_pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    // Store these varying values into the uniform local array
    // The compiler can now see this as a series of vector stores
    // because px_local is uniform.
    foreach_active (idx) {
    px_local[i + programIndex] = current_px;
    py_local[i + programIndex] = current_py;
    pz_local[i + programIndex] = current_pz;
    pm_local[i + programIndex] = current_pm;
    }
    }
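    // Keeping the whole chain in these small uniform scratch arrays for the
    // solver sweeps avoids re-touching the global SoA buffers every
    // iteration; the extra padded element makes the i+1 neighbour loads
    // below safe on the top lane.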

    // Iterate over constraints
    for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2)
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) {
    // Now, load directly from uniform px_local using varying programIndex
    // The compiler can optimize these into vector loads.
    varying float p0x = px_local[i + programIndex];
    varying float p0y = py_local[i + programIndex];
    varying float p0z = pz_local[i + programIndex];
    varying float p0m = pm_local[i + programIndex];

    // p1 is the i+1 neighbour. On the top lane this load can reach index
    // PARTICLES_PER_CHAIN, which is exactly why the local arrays carry one
    // zeroed dummy element; apply_corr_mask below zeroes that lane's
    // correction so the dummy never influences a real particle.
    varying float p1x = px_local[i + programIndex + 1];
    varying float p1y = py_local[i + programIndex + 1];
    varying float p1z = pz_local[i + programIndex + 1];
    varying float p1m = pm_local[i + programIndex + 1];

    varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);
    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));
    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);
    varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;
    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);
    // Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle)
    apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0);

    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

    // Apply corrections, writing back to uniform px_local
    // These are now vector stores.
    px_local[i + programIndex] -= delta_x * p0m;
    py_local[i + programIndex] -= delta_y * p0m;
    pz_local[i + programIndex] -= delta_z * p0m;

    px_local[i + programIndex + 1] += delta_x * p1m;
    py_local[i + programIndex + 1] += delta_y * p1m;
    pz_local[i + programIndex + 1] += delta_z * p1m;
    }
    }
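    // Caveat: programCount adjacent constraints are relaxed in one gang and
    // neighbouring constraints share a particle, so the overlapping
    // read-modify-writes into p*_local can drop one lane's correction on the
    // shared index (Jacobi-like inside a gang, Gauss-Seidel across gangs).
    // The MAX_ITERATIONS sweeps above are what let this settle in practice.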
    // Write back to global arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from uniform local array into varying registers
    varying float updated_px = px_local[i + programIndex];
    varying float updated_py = py_local[i + programIndex];
    varying float updated_pz = pz_local[i + programIndex];

    // Store these varying values into the global memory
    foreach_active (idx) {
    p_pos_x_ptr[p_base + i + programIndex] = updated_px;
    p_pos_y_ptr[p_base + i + programIndex] = updated_py;
    p_pos_z_ptr[p_base + i + programIndex] = updated_pz;
    }
    }
    }
    // Step 4: Sphere Collisions with Friction
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int s_base = c * SPHERES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
    varying float friction_vec = cfg->friction;
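    // Position-level collision response: push a penetrating particle out
    // along the sphere normal by the penetration depth, then subtract a
    // tangential term of magnitude penetration * friction opposing this
    // frame's tangential motion, a cheap Coulomb-style position friction.
    // Both are masked by penetration > 0 and scaled by clamped inverse mass.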
    for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) {
    varying float sphere_x_vec = sphere_x_ptr[s_base + s];
    varying float sphere_y_vec = sphere_y_ptr[s_base + s];
    varying float sphere_z_vec = sphere_z_ptr[s_base + s];
    varying float sphere_r_vec = sphere_radius_ptr[s_base + s];

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex];

    varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex];
    varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float dx = p_curr_x - sphere_x_vec;
    varying float dy = p_curr_y - sphere_y_vec;
    varying float dz = p_curr_z - sphere_z_vec;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = sphere_r_vec - dist;
    varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float normal_x = dx * inv_dist;
    varying float normal_y = dy * inv_dist;
    varying float normal_z = dz * inv_dist;

    varying float normal_corr_x = normal_x * penetration;
    varying float normal_corr_y = normal_y * penetration;
    varying float normal_corr_z = normal_z * penetration;

    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;
    varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z;
    varying float tangent_x = v_x - vel_dot_normal * normal_x;
    varying float tangent_y = v_y - vel_dot_normal * normal_y;
    varying float tangent_z = v_z - vel_dot_normal * normal_z;

    varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z;
    varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec));

    varying float friction_mag = penetration * friction_vec;
    varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag;
    varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag;
    varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag;

    varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass;
    varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass;
    varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass;

    p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x;
    p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y;
    p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z;
    }
    }
    }
    // Step 5: Motion Constraints
    // Process particles in groups of programCount
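    // Each particle is tethered to its model-space rest pose: when it
    // drifts beyond motion_radius it is pulled back onto the sphere's
    // surface (scaled by clamped inverse mass), bounding how far the
    // simulation can stray from the authored shape regardless of forces.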
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[p_base + i + programIndex];
    varying float py = p_pos_y_ptr[p_base + i + programIndex];
    varying float pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float rx = rest_pos_x_ptr[p_base + i + programIndex];
    varying float ry = rest_pos_y_ptr[p_base + i + programIndex];
    varying float rz = rest_pos_z_ptr[p_base + i + programIndex];
    varying float radius_vec = motion_radius_ptr[p_base + i + programIndex];

    varying float dx = px - rx;
    varying float dy = py - ry;
    varying float dz = pz - rz;
    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = dist - radius_vec;
    varying float mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float corr_factor = penetration * inv_dist * mask;
    varying float delta_x = dx * corr_factor * pm;
    varying float delta_y = dy * corr_factor * pm;
    varying float delta_z = dz * corr_factor * pm;

    p_pos_x_ptr[p_base + i + programIndex] = px - delta_x;
    p_pos_y_ptr[p_base + i + programIndex] = py - delta_y;
    p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z;
    }
    }
    }
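
// ---------------------------------------------------------------------------
// Illustrative host-side call (a minimal sketch; the chain_sim container and
// its field names are assumptions carried over from the pre-revision struct,
// and the generated header name depends on how ispc is invoked):
//
//   #include "chain_ispc.h"   /* header emitted by: ispc chain.ispc -h ... */
//
//   void tick(struct chain_sim *cs, float dt) {
//       simulate_chains(cs->chain_quat_x, cs->chain_quat_y, cs->chain_quat_z,
//                       cs->chain_quat_w,
//                       cs->wind_x, cs->wind_y, cs->wind_z,
//                       cs->gravity_x, cs->gravity_y, cs->gravity_z,
//                       cs->chain_pos_x, cs->chain_pos_y, cs->chain_pos_z,
//                       cs->prev_chain_pos_x, cs->prev_chain_pos_y,
//                       cs->prev_chain_pos_z,
//                       cs->prev_chain_quat_x, cs->prev_chain_quat_y,
//                       cs->prev_chain_quat_z, cs->prev_chain_quat_w,
//                       cs->p_pos_x, cs->p_pos_y, cs->p_pos_z, cs->inv_mass,
//                       cs->prev_p_pos_x, cs->prev_p_pos_y, cs->prev_p_pos_z,
//                       cs->rest_lengths, cs->chain_configs,
//                       cs->sphere_x, cs->sphere_y, cs->sphere_z,
//                       cs->sphere_radius,
//                       cs->rest_pos_x, cs->rest_pos_y, cs->rest_pos_z,
//                       cs->motion_radius, dt);
//   }
//
// All arrays are expected 32-byte aligned, SoA, sized by the constants above.
// ---------------------------------------------------------------------------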
  20. vurtun revised this gist Jul 2, 2025. 1 changed file with 304 additions and 310 deletions.
    614 changes: 304 additions & 310 deletions chain.ispc
    Original file line number Diff line number Diff line change
    @@ -1,24 +1,11 @@
    // Inline math functions to replace AVX2 intrinsics
    inline varying float rsqrt(varying float x) {
    return 1.0 / sqrt(x);
    }
    inline varying float rcp(varying float x) {
    return 1.0 / x;
    }
    inline varying float clamp(varying float x, varying float min, varying float max) {
    return min(max(x, min), max);
    }

    enum {
    MAX_CHAINS = 64,
    PARTICLES_PER_CHAIN = 16,
    SPHERES_PER_CHAIN = 8,
    MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
    MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
    CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
    MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
    MAX_ITERATIONS = 4,
    };
    #define MAX_CHAINS 64
    #define PARTICLES_PER_CHAIN 16
    #define SPHERES_PER_CHAIN 8
    #define MAX_PARTICLES (MAX_CHAINS * PARTICLES_PER_CHAIN)
    #define MAX_SPHERES (MAX_CHAINS * SPHERES_PER_CHAIN)
    #define CONSTRAINTS_PER_CHAIN PARTICLES_PER_CHAIN
    #define MAX_REST_LENGTHS (MAX_CHAINS * CONSTRAINTS_PER_CHAIN)
    #define MAX_ITERATIONS 8

    // Define chain configuration structure
    struct chain_cfg {
    @@ -28,73 +15,111 @@ struct chain_cfg {
    uniform float max_strain;
    uniform float friction;
    uniform float collision_radius;
    uniform float linear_inertia; // [0, 1]
    uniform float linear_inertia; // [0, 1]
    uniform float angular_inertia; // [0, 1]
    uniform float centrifugal_inertia; // [0, 1]
    } __attribute__((aligned(32)));
    uniform float centrifugal_inertia;// [0, 1]
    };

    // Define chain simulation structure
    struct chain_sim {
    uniform unsigned short free_idx_cnt;
    uniform unsigned short free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
    uint8 free_idx_cnt;
    uint8 free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS];

    // Chain global forces (world space gravity and wind)
    uniform float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_x[MAX_CHAINS];
    uniform float gravity_y[MAX_CHAINS];
    uniform float gravity_z[MAX_CHAINS];

    uniform float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_x[MAX_CHAINS];
    uniform float wind_y[MAX_CHAINS];
    uniform float wind_z[MAX_CHAINS];

    // Chain transformations (world space)
    uniform float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_pos_x[MAX_CHAINS];
    uniform float chain_pos_y[MAX_CHAINS];
    uniform float chain_pos_z[MAX_CHAINS];

    uniform float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_x[MAX_CHAINS];
    uniform float chain_quat_y[MAX_CHAINS];
    uniform float chain_quat_z[MAX_CHAINS];
    uniform float chain_quat_w[MAX_CHAINS];

    uniform float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_pos_x[MAX_CHAINS];
    uniform float prev_chain_pos_y[MAX_CHAINS];
    uniform float prev_chain_pos_z[MAX_CHAINS];

    uniform float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_x[MAX_CHAINS];
    uniform float prev_chain_quat_y[MAX_CHAINS];
    uniform float prev_chain_quat_z[MAX_CHAINS];
    uniform float prev_chain_quat_w[MAX_CHAINS];

    // Particle positions (model space)
    uniform float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));

    uniform float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float p_pos_x[MAX_PARTICLES];
    uniform float p_pos_y[MAX_PARTICLES];
    uniform float p_pos_z[MAX_PARTICLES];

    uniform float prev_p_pos_x[MAX_PARTICLES];
    uniform float prev_p_pos_y[MAX_PARTICLES];
    uniform float prev_p_pos_z[MAX_PARTICLES];

    uniform float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    uniform float inv_mass[MAX_PARTICLES];
    uniform float rest_lengths[MAX_REST_LENGTHS];

    // Sphere positions (model space)
    uniform float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_x[MAX_SPHERES];
    uniform float sphere_y[MAX_SPHERES];
    uniform float sphere_z[MAX_SPHERES];
    uniform float sphere_radius[MAX_SPHERES];

    // Rest positions (model space)
    uniform float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float rest_pos_x[MAX_PARTICLES];
    uniform float rest_pos_y[MAX_PARTICLES];
    uniform float rest_pos_z[MAX_PARTICLES];
    uniform float motion_radius[MAX_PARTICLES];
    };

    // ISPC task for chain simulation
    export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    // Common SIMD constants (varying for vector operations across 8 lanes)
    export void simulate_chains(
    __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform wind_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr,
    __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs
    __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr,
    __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr,
    __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr,
    uniform float dt) {
    // Common SIMD constants (varying for vector operations across lanes)
    varying float dt_vec = dt;
    varying float dt2_vec = dt * dt;
    varying float one_vec = 1.0;
    @@ -107,52 +132,58 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float two_vec = 2.0;
    varying float inertia_clamp_min = 0.0;
    varying float inertia_clamp_max = 1.0;
    varying float abs_mask_ps = reinterpret<float>(0x7FFFFFFF);

    assume(MAX_CHAINS % programCount == 0);
    assume(PARTICLES_PER_CHAIN % programCount == 0);
    assume(SPHERES_PER_CHAIN % programCount == 0);
    assume(MAX_SPHERES % programCount == 0);
    assume(CONSTRAINTS_PER_CHAIN % programCount == 0);
    assume(MAX_REST_LENGTHS % programCount == 0);

    // Local arrays for PBD solver
    uniform float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices
    __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
    __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1];

    // Initialize dummy elements
    // All lanes can safely write to these uniform locations.
    px_local[PARTICLES_PER_CHAIN] = 0.0;
    py_local[PARTICLES_PER_CHAIN] = 0.0;
    pz_local[PARTICLES_PER_CHAIN] = 0.0;
    pm_local[PARTICLES_PER_CHAIN] = 0.0;

    // Stack arrays for precomputed data
    uniform float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    // Assumption: MAX_CHAINS (64) is divisible by 8, so no leftovers
    for (uniform int c = 0; c < MAX_CHAINS; c += 8) {
    // Note: No index checks; c + programIndex is always valid (0 to 63)
    // Load chain quaternions (8 lanes)
    varying float qx = cs->chain_quat_x[c + programIndex];
    varying float qy = cs->chain_quat_y[c + programIndex];
    varying float qz = cs->chain_quat_z[c + programIndex];
    varying float qw = cs->chain_quat_w[c + programIndex];
    __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS];

    __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS];
    __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];

    // Step 0: Precomputation (SIMD, process chains in groups of programCount)
    // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon)
    for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
    // Load chain quaternions
    varying float qx = chain_quat_x_ptr[c + programIndex];
    varying float qy = chain_quat_y_ptr[c + programIndex];
    varying float qz = chain_quat_z_ptr[c + programIndex];
    varying float qw = chain_quat_w_ptr[c + programIndex];

    // Compute local-space wind
    {
    varying float vx = cs->wind_x[c + programIndex];
    varying float vy = cs->wind_y[c + programIndex];
    varying float vz = cs->wind_z[c + programIndex];
    varying float vx = wind_x_ptr[c + programIndex];
    varying float vy = wind_y_ptr[c + programIndex];
    varying float vz = wind_z_ptr[c + programIndex];

    // Create q_conjugate components
    varying float cqx = -qx;
    @@ -177,60 +208,54 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {

    // Compute local-space gravity
    {
    varying float vx = cs->gravity_x[c + programIndex];
    varying float vy = cs->gravity_y[c + programIndex];
    varying float vz = cs->gravity_z[c + programIndex];
    varying float vx = gravity_x_ptr[c + programIndex];
    varying float float_vy = gravity_y_ptr[c + programIndex];
    varying float vz = gravity_z_ptr[c + programIndex];

    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;
    varying float tx = qw * vx + cqy * vz - cqz * float_vy;
    varying float ty = qw * float_vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * float_vy - cqy * vx;
    varying float tw = qx * vx + qy * float_vy + qz * vz;

    // Corrected quaternion multiplication for gravity_local
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    gravity_local_x[c + programIndex] = result_x;
    gravity_local_y[c + programIndex] = result_y;
    gravity_local_z[c + programIndex] = result_z;
    }

    // Compute linear velocity
    {
    varying float curr_x = cs->chain_pos_x[c + programIndex];
    varying float curr_y = cs->chain_pos_y[c + programIndex];
    varying float curr_z = cs->chain_pos_z[c + programIndex];

    varying float prev_x = cs->prev_chain_pos_x[c + programIndex];
    varying float prev_y = cs->prev_chain_pos_y[c + programIndex];
    varying float prev_z = cs->prev_chain_pos_z[c + programIndex];
    varying float curr_x = chain_pos_x_ptr[c + programIndex];
    varying float curr_y = chain_pos_y_ptr[c + programIndex];
    varying float curr_z = chain_pos_z_ptr[c + programIndex];
    varying float prev_x = prev_chain_pos_x_ptr[c + programIndex];
    varying float prev_y = prev_chain_pos_y_ptr[c + programIndex];
    varying float prev_z = prev_chain_pos_z_ptr[c + programIndex];

    varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
    varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
    varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;

    vel_x[c + programIndex] = vel_x_vec;
    vel_y[c + programIndex] = vel_y_vec;
    vel_z[c + programIndex] = vel_z_vec;
    }

    // Compute angular velocity
    {
    varying float qx = cs->chain_quat_x[c + programIndex];
    varying float qy = cs->chain_quat_y[c + programIndex];
    varying float qz = cs->chain_quat_z[c + programIndex];
    varying float qw = cs->chain_quat_w[c + programIndex];

    varying float prev_qx = cs->prev_chain_quat_x[c + programIndex];
    varying float prev_qy = cs->prev_chain_quat_y[c + programIndex];
    varying float prev_qz = cs->prev_chain_quat_z[c + programIndex];
    varying float prev_qw = cs->prev_chain_quat_w[c + programIndex];
    varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex];
    varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex];
    varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex];
    varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex];

    // Step 1: Compute delta quaternion (to - from)
    // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation
    varying float delta_qx = qx - prev_qx;
    varying float delta_qy = qy - prev_qy;
    varying float delta_qz = qz - prev_qz;
    @@ -241,21 +266,16 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float conj_qy = -prev_qy;
    varying float conj_qz = -prev_qz;
    varying float conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;

    varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
    varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
    varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;

    varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
    varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
    varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;

    // Step 4: Compute dot product (to, from) for shortest arc check
    varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;

    // Step 5: Negate orientation delta if dot < 0
    varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
    orient_qx *= negate_sign;
    @@ -275,12 +295,12 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    }

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    // Assumption: MAX_CHAINS (64) is divisible by 8, PARTICLES_PER_CHAIN (16) is divisible by 8
    // Process one chain at a time, particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load velocities and inertia (scalar, broadcast to 8 lanes)
    // Load velocities and inertia (scalar, broadcast to lanes)
    varying float vel_x_vec = vel_x[c];
    varying float vel_y_vec = vel_y[c];
    varying float vel_z_vec = vel_z[c];
    @@ -292,19 +312,18 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float linear_inertia_vec = cfg->linear_inertia;
    varying float angular_inertia_vec = cfg->angular_inertia;
    varying float centrifugal_inertia_vec = cfg->centrifugal_inertia;

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max);

    // Process 8 particles at a time
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    // Note: No index checks; base_idx + i + programIndex is always valid
    varying float px = cs->p_pos_x[base_idx + i + programIndex];
    varying float py = cs->p_pos_y[base_idx + i + programIndex];
    varying float pz = cs->p_pos_z[base_idx + i + programIndex];
    varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex];

    // Process particles in groups
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[base_idx + i + programIndex];
    varying float py = p_pos_y_ptr[base_idx + i + programIndex];
    varying float pz = p_pos_z_ptr[base_idx + i + programIndex];
    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    // Linear inertia: v * dt * linear_inertia
    @@ -320,68 +339,68 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    ang_delta_x *= dt_vec * angular_inertia_vec;
    ang_delta_y *= dt_vec * angular_inertia_vec;
    ang_delta_z *= dt_vec * angular_inertia_vec;

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px;

    varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y;
    varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z;
    varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x;

    varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec;

    // Total delta
    varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x;
    varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y;
    varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z;

    // Update positions
    cs->p_pos_x[base_idx + i + programIndex] = px + delta_x;
    cs->p_pos_y[base_idx + i + programIndex] = py + delta_y;
    cs->p_pos_z[base_idx + i + programIndex] = pz + delta_z;
    p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x;
    p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y;
    p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z;
    }

    // Update previous transformation (scalar)
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c];
    prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c];
    prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c];

    prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c];
    prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c];
    prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c];
    prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c];
    }

    // Step 2: Verlet Integration
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    // Load forces and damping (scalar, broadcast to 8 lanes)
    // Load forces and damping (scalar, broadcast to lanes)
    varying float gravity_x_vec = gravity_local_x[c];
    varying float gravity_y_vec = gravity_local_y[c];
    varying float gravity_z_vec = gravity_local_z[c];

    varying float wind_x_vec = wind_local_x[c];
    varying float wind_y_vec = wind_local_y[c];
    varying float wind_z_vec = wind_local_z[c];

    varying float damping_vec = cfg->damping;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex];

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float p_curr_x = cs->p_pos_x[base_idx + i + programIndex];
    varying float p_curr_y = cs->p_pos_y[base_idx + i + programIndex];
    varying float p_curr_z = cs->p_pos_z[base_idx + i + programIndex];
    varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex];
    varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    varying float p_prev_x = cs->prev_p_pos_x[base_idx + i + programIndex];
    varying float p_prev_y = cs->prev_p_pos_y[base_idx + i + programIndex];
    varying float p_prev_z = cs->prev_p_pos_z[base_idx + i + programIndex];
    varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex];

    // Compute forces and accelerations
    varying float force_x = gravity_x_vec + wind_x_vec;
    @@ -393,176 +412,153 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float acc_z = force_z * p_inv_mass;

    // Compute velocity with damping
    varying float vel_x = p_curr_x - p_prev_x;
    varying float vel_y = p_curr_y - p_prev_y;
    varying float vel_z = p_curr_z - p_prev_z;
    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;

    varying float damped_vel_x = vel_x * damping_vec;
    varying float damped_vel_y = vel_y * damping_vec;
    varying float damped_vel_z = vel_z * damping_vec;
    varying float damped_vel_x = v_x * damping_vec;
    varying float damped_vel_y = v_y * damping_vec;
    varying float damped_vel_z = v_z * damping_vec;

    // Update positions: p_curr + damped_vel + acc * dt^2
    varying float term_accel_x = acc_x * dt2_vec;
    varying float term_accel_y = acc_y * dt2_vec;
    varying float term_accel_z = acc_z * dt2_vec;

    cs->p_pos_x[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
    cs->p_pos_y[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
    cs->p_pos_z[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;

    p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
    p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
    p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;
    // Update previous positions
    cs->prev_p_pos_x[base_idx + i + programIndex] = p_curr_x;
    cs->prev_p_pos_y[base_idx + i + programIndex] = p_curr_y;
    cs->prev_p_pos_z[base_idx + i + programIndex] = p_curr_z;
    prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x;
    prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y;
    prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z;
    }
    }

    // Step 3: Distance Constraints
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    // Process one chain at a time, constraints in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];

    varying float stiffness_vec = cfg->stiffness;
    varying float stretch_factor_vec = cfg->stretch_factor;
    varying float max_strain_vec_abs = cfg->max_strain;
    varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec;

    // Copy particles to local arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    px_local[i + programIndex] = cs->p_pos_x[p_base + i + programIndex];
    py_local[i + programIndex] = cs->p_pos_y[p_base + i + programIndex];
    pz_local[i + programIndex] = cs->p_pos_z[p_base + i + programIndex];
    pm_local[i + programIndex] = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    // Copy particles to local arrays with uniform indexing
    // This loop now performs a vectorized load into the uniform px_local array.
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from global memory into varying registers
    varying float current_px = p_pos_x_ptr[p_base + i + programIndex];
    varying float current_py = p_pos_y_ptr[p_base + i + programIndex];
    varying float current_pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    // Store these varying values into the uniform local array
    // The compiler can now see this as a series of vector stores
    // because px_local is uniform.
    foreach_active (idx) {
    px_local[i + programIndex] = current_px;
    py_local[i + programIndex] = current_py;
    pz_local[i + programIndex] = current_pz;
    pm_local[i + programIndex] = current_pm;
    }
    }

    // Iterate over constraints
    for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // First block (i=0 to 7)
    {
    varying float p0x = px_local[0];
    varying float p0y = py_local[0];
    varying float p0z = pz_local[0];
    varying float p0m = pm_local[0];
    varying float p1x = px_local[1]; // Unaligned access
    varying float p1y = py_local[1];
    varying float p1z = pz_local[1];
    varying float p1m = pm_local[1];
    varying float rest_len_vec = cs->rest_lengths[r_base];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));

    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);

    varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + (strain_clamped & abs_mask_ps));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;

    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);

    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

    px_local[0] = p0x + delta_x * p0m;
    py_local[0] = p0y + delta_y * p0m;
    pz_local[0] = p0z + delta_z * p0m;

    px_local[1] = p1x - delta_x * p1m;
    py_local[1] = p1y - delta_y * p1m;
    pz_local[1] = p1z - delta_z * p1m;
    }

    // Second block (i=8 to 14, mask last lane)
    {
    varying float p0x = px_local[8];
    varying float p0y = py_local[8];
    varying float p0z = pz_local[8];
    varying float p0m = pm_local[8];
    varying float p1x = px_local[9]; // Unaligned access
    varying float p1y = py_local[9];
    varying float p1z = pz_local[9];
    varying float p1m = pm_local[9];
    varying float rest_len_vec = cs->rest_lengths[r_base + 8];

    // Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2)
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) {
    // Now, load directly from uniform px_local using varying programIndex
    // The compiler can optimize these into vector loads.
    varying float p0x = px_local[i + programIndex];
    varying float p0y = py_local[i + programIndex];
    varying float p0z = pz_local[i + programIndex];
    varying float p0m = pm_local[i + programIndex];

// For p1 the index runs one element ahead of p0. At i = 8 the last lane
// reads one slot past the final particle (the padded dummy element); the
// mask below zeroes that lane's correction, so it is read but never moved.
    varying float p1x = px_local[i + programIndex + 1];
    varying float p1y = py_local[i + programIndex + 1];
    varying float p1z = pz_local[i + programIndex + 1];
    varying float p1m = pm_local[i + programIndex + 1];

    varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));

    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);

    varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + (strain_clamped & abs_mask_ps));
    varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;

    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);
    // Mask last lane (i=15) to disable constraint
    apply_corr_mask = select(programIndex < 7, apply_corr_mask, 0.0);

    // Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle)
    apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0);
    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

    px_local[8] = p0x + delta_x * p0m;
    py_local[8] = p0y + delta_y * p0m;
    pz_local[8] = p0z + delta_z * p0m;
    // Apply corrections, writing back to uniform px_local
    // These are now vector stores.
px_local[i + programIndex] += delta_x * p0m;
py_local[i + programIndex] += delta_y * p0m;
pz_local[i + programIndex] += delta_z * p0m;

    px_local[9] = p1x - delta_x * p1m;
    py_local[9] = p1y - delta_y * p1m;
    pz_local[9] = p1z - delta_z * p1m;
px_local[i + programIndex + 1] -= delta_x * p1m;
py_local[i + programIndex + 1] -= delta_y * p1m;
pz_local[i + programIndex + 1] -= delta_z * p1m;
    }
    }

    // Write back to global arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    cs->p_pos_x[p_base + i + programIndex] = px_local[i + programIndex];
    cs->p_pos_y[p_base + i + programIndex] = py_local[i + programIndex];
    cs->p_pos_z[p_base + i + programIndex] = pz_local[i + programIndex];
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    // Load a vector of particles from uniform local array into varying registers
    varying float updated_px = px_local[i + programIndex];
    varying float updated_py = py_local[i + programIndex];
    varying float updated_pz = pz_local[i + programIndex];

    // Store these varying values into the global memory
    foreach_active (idx) {
    p_pos_x_ptr[p_base + i + programIndex] = updated_px;
    p_pos_y_ptr[p_base + i + programIndex] = updated_py;
    p_pos_z_ptr[p_base + i + programIndex] = updated_pz;
    }
    }
    }

    // Step 4: Sphere Collisions with Friction
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int s_base = c * SPHERES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];
    uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
    varying float friction_vec = cfg->friction;

    for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) {
    varying float sphere_x_vec = cs->sphere_x[s_base + s];
    varying float sphere_y_vec = cs->sphere_y[s_base + s];
    varying float sphere_z_vec = cs->sphere_z[s_base + s];
    varying float sphere_r_vec = cs->sphere_radius[s_base + s];
    varying float sphere_x_vec = sphere_x_ptr[s_base + s];
    varying float sphere_y_vec = sphere_y_ptr[s_base + s];
    varying float sphere_z_vec = sphere_z_ptr[s_base + s];
    varying float sphere_r_vec = sphere_radius_ptr[s_base + s];

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float p_curr_x = cs->p_pos_x[p_base + i + programIndex];
    varying float p_curr_y = cs->p_pos_y[p_base + i + programIndex];
    varying float p_curr_z = cs->p_pos_z[p_base + i + programIndex];
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex];
    varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex];
    varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex];

    varying float p_prev_x = cs->prev_p_pos_x[p_base + i + programIndex];
    varying float p_prev_y = cs->prev_p_pos_y[p_base + i + programIndex];
    varying float p_prev_z = cs->prev_p_pos_z[p_base + i + programIndex];
    varying float p_inv_mass = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
    varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex];
    varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex];
    varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex];
    varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float dx = p_curr_x - sphere_x_vec;
    varying float dy = p_curr_y - sphere_y_vec;
    @@ -583,14 +579,14 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float normal_corr_y = normal_y * penetration;
    varying float normal_corr_z = normal_z * penetration;

    varying float vel_x = p_curr_x - p_prev_x;
    varying float vel_y = p_curr_y - p_prev_y;
    varying float vel_z = p_curr_z - p_prev_z;

    varying float vel_dot_normal = vel_x * normal_x + vel_y * normal_y + vel_z * normal_z;
    varying float tangent_x = vel_x - vel_dot_normal * normal_x;
    varying float tangent_y = vel_y - vel_dot_normal * normal_y;
    varying float tangent_z = vel_z - vel_dot_normal * normal_z;
    varying float v_x = p_curr_x - p_prev_x;
    varying float v_y = p_curr_y - p_prev_y;
    varying float v_z = p_curr_z - p_prev_z;
    varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z;
    varying float tangent_x = v_x - vel_dot_normal * normal_x;
    varying float tangent_y = v_y - vel_dot_normal * normal_y;
    varying float tangent_z = v_z - vel_dot_normal * normal_z;

    varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z;
    varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec));
    @@ -604,32 +600,30 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass;
    varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass;

    cs->p_pos_x[p_base + i + programIndex] = p_curr_x + total_corr_x;
    cs->p_pos_y[p_base + i + programIndex] = p_curr_y + total_corr_y;
    cs->p_pos_z[p_base + i + programIndex] = p_curr_z + total_corr_z;
    p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x;
    p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y;
    p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z;
    }
    }
    }

    // Step 5: Motion Constraints
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    // Process particles in groups of programCount
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float px = cs->p_pos_x[p_base + i + programIndex];
    varying float py = cs->p_pos_y[p_base + i + programIndex];
    varying float pz = cs->p_pos_z[p_base + i + programIndex];
    varying float pm = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
    varying float px = p_pos_x_ptr[p_base + i + programIndex];
    varying float py = p_pos_y_ptr[p_base + i + programIndex];
    varying float pz = p_pos_z_ptr[p_base + i + programIndex];
    varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float rx = cs->rest_pos_x[p_base + i + programIndex];
    varying float ry = cs->rest_pos_y[p_base + i + programIndex];
    varying float rz = cs->rest_pos_z[p_base + i + programIndex];
    varying float radius_vec = cs->motion_radius[p_base + i + programIndex];
    varying float rx = rest_pos_x_ptr[p_base + i + programIndex];
    varying float ry = rest_pos_y_ptr[p_base + i + programIndex];
    varying float rz = rest_pos_z_ptr[p_base + i + programIndex];
    varying float radius_vec = motion_radius_ptr[p_base + i + programIndex];

    varying float dx = px - rx;
    varying float dy = py - ry;
    varying float dz = pz - rz;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);
    @@ -642,9 +636,9 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    varying float delta_y = dy * corr_factor * pm;
    varying float delta_z = dz * corr_factor * pm;

    cs->p_pos_x[p_base + i + programIndex] = px - delta_x;
    cs->p_pos_y[p_base + i + programIndex] = py - delta_y;
    cs->p_pos_z[p_base + i + programIndex] = pz - delta_z;
    p_pos_x_ptr[p_base + i + programIndex] = px - delta_x;
    p_pos_y_ptr[p_base + i + programIndex] = py - delta_y;
    p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z;
    }
    }
    }
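Sanity check for the new tail masking (a sketch, assuming programCount == 8 and the padded dummy particle slot): at i = 8 the lanes cover constraints (8,9) through (15,16); only the last pair is out of range, and select(i + programIndex < PARTICLES_PER_CHAIN - 1, ...) zeroes exactly that lane, so the dummy element is read but never corrected.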
  21. vurtun revised this gist Jul 2, 2025. 1 changed file with 650 additions and 0 deletions.
    650 changes: 650 additions & 0 deletions chain.ispc
    @@ -0,0 +1,650 @@
// Math helpers: ISPC's standard library already provides rsqrt(), rcp(),
// clamp(), min() and max() for varying floats, so no AVX2-style
// replacements are defined here. (Redefining them would collide with the
// builtins, and naming parameters "min"/"max" would shadow those functions.)

    enum {
    MAX_CHAINS = 64,
    PARTICLES_PER_CHAIN = 16,
    SPHERES_PER_CHAIN = 8,
    MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
    MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
    CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
    MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
    MAX_ITERATIONS = 4,
    };

    // Define chain configuration structure
    struct chain_cfg {
    uniform float damping;
    uniform float stiffness;
    uniform float stretch_factor;
    uniform float max_strain;
    uniform float friction;
    uniform float collision_radius;
    uniform float linear_inertia; // [0, 1]
    uniform float angular_inertia; // [0, 1]
    uniform float centrifugal_inertia; // [0, 1]
    } __attribute__((aligned(32)));

    // Define chain simulation structure
    struct chain_sim {
    uniform unsigned short free_idx_cnt;
    uniform unsigned short free_idx[MAX_CHAINS];
    uniform struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain global forces (world space gravity and wind)
    uniform float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain transformations (world space)
    uniform float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    // Particle positions (model space)
    uniform float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));

    uniform float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));

    uniform float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));

    // Sphere positions (model space)
    uniform float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
    uniform float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));

    // Rest positions (model space)
    uniform float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    uniform float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
    };
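
// Layout note: the state above is structure-of-arrays so each SIMD lane can
// own one chain (or one particle) per field with contiguous, aligned loads.
// Indexing convention used throughout (shown here for reference):
//
//   particle i of chain c   -> c * PARTICLES_PER_CHAIN + i
//   sphere s of chain c     -> c * SPHERES_PER_CHAIN + s
//   constraint k of chain c -> c * CONSTRAINTS_PER_CHAIN + k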

    // ISPC task for chain simulation
    export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) {
    // Common SIMD constants (varying for vector operations across 8 lanes)
    varying float dt_vec = dt;
    varying float dt2_vec = dt * dt;
    varying float one_vec = 1.0;
    varying float neg_one_vec = -1.0;
    varying float eps_vec = 1e-6;
    varying float inv_mass_clamp_min = 0.0;
    varying float inv_mass_clamp_max = 1e6;
    varying float zero_vec = 0.0;
    varying float inv_dt_vec = 1.0 / dt;
    varying float two_vec = 2.0;
    varying float inertia_clamp_min = 0.0;
    varying float inertia_clamp_max = 1.0;

    // Local arrays for PBD solver
    uniform float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    uniform float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0;
    py_local[PARTICLES_PER_CHAIN] = 0.0;
    pz_local[PARTICLES_PER_CHAIN] = 0.0;
    pm_local[PARTICLES_PER_CHAIN] = 0.0;

    // Stack arrays for precomputed data
    uniform float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    uniform float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    uniform float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
// Assumption: MAX_CHAINS (64) is divisible by 8 and the target runs 8 lanes (programCount == 8), so no leftovers
    for (uniform int c = 0; c < MAX_CHAINS; c += 8) {
    // Note: No index checks; c + programIndex is always valid (0 to 63)
    // Load chain quaternions (8 lanes)
    varying float qx = cs->chain_quat_x[c + programIndex];
    varying float qy = cs->chain_quat_y[c + programIndex];
    varying float qz = cs->chain_quat_z[c + programIndex];
    varying float qw = cs->chain_quat_w[c + programIndex];

    // Compute local-space wind
    {
    varying float vx = cs->wind_x[c + programIndex];
    varying float vy = cs->wind_y[c + programIndex];
    varying float vz = cs->wind_z[c + programIndex];

    // Create q_conjugate components
    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    // Step 1: t = q_conj * v_world_as_quat
    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    // Step 2: result_vec = (t * q)_vec
    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    wind_local_x[c + programIndex] = result_x;
    wind_local_y[c + programIndex] = result_y;
    wind_local_z[c + programIndex] = result_z;
    }
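
// The block above computes v_local = conj(q) * (v, 0) * q, i.e. the world
// vector rotated by the inverse chain orientation. With t = conj(q) * (v, 0)
// held in (tx, ty, tz) and its scalar part in tw, a quaternion product
// (av, aw) * (bv, bw) has vector part aw*bv + bw*av + av x bv and scalar
// part aw*bw - av.bv; substituting av = -qv reproduces each line exactly.
// The gravity block below applies the same rotation to the gravity vector.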

    // Compute local-space gravity
    {
    varying float vx = cs->gravity_x[c + programIndex];
    varying float vy = cs->gravity_y[c + programIndex];
    varying float vz = cs->gravity_z[c + programIndex];

    varying float cqx = -qx;
    varying float cqy = -qy;
    varying float cqz = -qz;

    varying float tx = qw * vx + cqy * vz - cqz * vy;
    varying float ty = qw * vy + cqz * vx - cqx * vz;
    varying float tz = qw * vz + cqx * vy - cqy * vx;
    varying float tw = qx * vx + qy * vy + qz * vz;

    varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
    varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
    varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;

    gravity_local_x[c + programIndex] = result_x;
    gravity_local_y[c + programIndex] = result_y;
    gravity_local_z[c + programIndex] = result_z;
    }

    // Compute linear velocity
    {
    varying float curr_x = cs->chain_pos_x[c + programIndex];
    varying float curr_y = cs->chain_pos_y[c + programIndex];
    varying float curr_z = cs->chain_pos_z[c + programIndex];

    varying float prev_x = cs->prev_chain_pos_x[c + programIndex];
    varying float prev_y = cs->prev_chain_pos_y[c + programIndex];
    varying float prev_z = cs->prev_chain_pos_z[c + programIndex];

    varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
    varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
    varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;

    vel_x[c + programIndex] = vel_x_vec;
    vel_y[c + programIndex] = vel_y_vec;
    vel_z[c + programIndex] = vel_z_vec;
    }

    // Compute angular velocity
    {
    varying float qx = cs->chain_quat_x[c + programIndex];
    varying float qy = cs->chain_quat_y[c + programIndex];
    varying float qz = cs->chain_quat_z[c + programIndex];
    varying float qw = cs->chain_quat_w[c + programIndex];

    varying float prev_qx = cs->prev_chain_quat_x[c + programIndex];
    varying float prev_qy = cs->prev_chain_quat_y[c + programIndex];
    varying float prev_qz = cs->prev_chain_quat_z[c + programIndex];
    varying float prev_qw = cs->prev_chain_quat_w[c + programIndex];

    // Step 1: Compute delta quaternion (to - from)
    varying float delta_qx = qx - prev_qx;
    varying float delta_qy = qy - prev_qy;
    varying float delta_qz = qz - prev_qz;
    varying float delta_qw = qw - prev_qw;

    // Step 2: Compute conjugate of previous quaternion
    varying float conj_qx = -prev_qx;
    varying float conj_qy = -prev_qy;
    varying float conj_qz = -prev_qz;
    varying float conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;

    varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
    varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
    varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;

    varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
    varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
    varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;

    // Step 4: Compute dot product (to, from) for shortest arc check
    varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;

    // Step 5: Negate orientation delta if dot < 0
    varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
    orient_qx *= negate_sign;
    orient_qy *= negate_sign;
    orient_qz *= negate_sign;

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    varying float scale = one_vec / (two_vec * dt_vec);
    varying float ang_vel_x_vec = orient_qx * scale;
    varying float ang_vel_y_vec = orient_qy * scale;
    varying float ang_vel_z_vec = orient_qz * scale;

    ang_vel_x[c + programIndex] = ang_vel_x_vec;
    ang_vel_y[c + programIndex] = ang_vel_y_vec;
    ang_vel_z[c + programIndex] = ang_vel_z_vec;
    }
    }
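
// Note on the angular-velocity estimate: for a small per-step rotation,
// (q - q_prev) * conj(q_prev) ~= (0.5 * dt * omega, 0), so exact small-angle
// recovery would scale the vector part by 2 / dt. The 1 / (2 * dt) used
// above yields a quarter of that value, i.e. a damped estimate, which
// Step 1 then scales again by the per-chain inertia weights.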

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
// Assumption: PARTICLES_PER_CHAIN (16) is divisible by the lane count (programCount == 8)
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];

    // Load velocities and inertia (scalar, broadcast to 8 lanes)
    varying float vel_x_vec = vel_x[c];
    varying float vel_y_vec = vel_y[c];
    varying float vel_z_vec = vel_z[c];

    varying float ang_vel_x_vec = ang_vel_x[c];
    varying float ang_vel_y_vec = ang_vel_y[c];
    varying float ang_vel_z_vec = ang_vel_z[c];

    varying float linear_inertia_vec = cfg->linear_inertia;
    varying float angular_inertia_vec = cfg->angular_inertia;
    varying float centrifugal_inertia_vec = cfg->centrifugal_inertia;

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max);
    centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max);

    // Process 8 particles at a time
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    // Note: No index checks; base_idx + i + programIndex is always valid
    varying float px = cs->p_pos_x[base_idx + i + programIndex];
    varying float py = cs->p_pos_y[base_idx + i + programIndex];
    varying float pz = cs->p_pos_z[base_idx + i + programIndex];
    varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    // Linear inertia: v * dt * linear_inertia
    varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec;
    varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec;

    // Angular inertia: (ω × r) * dt * angular_inertia
    varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px;

    ang_delta_x *= dt_vec * angular_inertia_vec;
    ang_delta_y *= dt_vec * angular_inertia_vec;
    ang_delta_z *= dt_vec * angular_inertia_vec;

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
    varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
    varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px;

    varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y;
    varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z;
    varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x;

    varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec;
    varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec;

    // Total delta
    varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x;
    varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y;
    varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z;

    // Update positions
    cs->p_pos_x[base_idx + i + programIndex] = px + delta_x;
    cs->p_pos_y[base_idx + i + programIndex] = py + delta_y;
    cs->p_pos_z[base_idx + i + programIndex] = pz + delta_z;
    }

    // Update previous transformation (scalar)
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    }
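
// Step 1 as one formula, per particle with model-space position r:
//   dx = v*dt*k_lin + (omega x r)*dt*k_ang + (omega x (omega x r))*dt^2*k_cent
// where k_* are the clamped [0, 1] chain inertia weights. All weights at 1
// make particles follow the chain transform rigidly for a step; at 0 the
// particles are fully decoupled from chain motion.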

    // Step 2: Verlet Integration
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int base_idx = c * PARTICLES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];

    // Load forces and damping (scalar, broadcast to 8 lanes)
    varying float gravity_x_vec = gravity_local_x[c];
    varying float gravity_y_vec = gravity_local_y[c];
    varying float gravity_z_vec = gravity_local_z[c];

    varying float wind_x_vec = wind_local_x[c];
    varying float wind_y_vec = wind_local_y[c];
    varying float wind_z_vec = wind_local_z[c];
    varying float damping_vec = cfg->damping;

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float p_curr_x = cs->p_pos_x[base_idx + i + programIndex];
    varying float p_curr_y = cs->p_pos_y[base_idx + i + programIndex];
    varying float p_curr_z = cs->p_pos_z[base_idx + i + programIndex];
    varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex];
    p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);

    varying float p_prev_x = cs->prev_p_pos_x[base_idx + i + programIndex];
    varying float p_prev_y = cs->prev_p_pos_y[base_idx + i + programIndex];
    varying float p_prev_z = cs->prev_p_pos_z[base_idx + i + programIndex];

    // Compute forces and accelerations
    varying float force_x = gravity_x_vec + wind_x_vec;
    varying float force_y = gravity_y_vec + wind_y_vec;
    varying float force_z = gravity_z_vec + wind_z_vec;

    varying float acc_x = force_x * p_inv_mass;
    varying float acc_y = force_y * p_inv_mass;
    varying float acc_z = force_z * p_inv_mass;

    // Compute velocity with damping
    varying float vel_x = p_curr_x - p_prev_x;
    varying float vel_y = p_curr_y - p_prev_y;
    varying float vel_z = p_curr_z - p_prev_z;

    varying float damped_vel_x = vel_x * damping_vec;
    varying float damped_vel_y = vel_y * damping_vec;
    varying float damped_vel_z = vel_z * damping_vec;

    // Update positions: p_curr + damped_vel + acc * dt^2
    varying float term_accel_x = acc_x * dt2_vec;
    varying float term_accel_y = acc_y * dt2_vec;
    varying float term_accel_z = acc_z * dt2_vec;

    cs->p_pos_x[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
    cs->p_pos_y[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
    cs->p_pos_z[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;

    // Update previous positions
    cs->prev_p_pos_x[base_idx + i + programIndex] = p_curr_x;
    cs->prev_p_pos_y[base_idx + i + programIndex] = p_curr_y;
    cs->prev_p_pos_z[base_idx + i + programIndex] = p_curr_z;
    }
    }
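
// Step 2 is the classic position-based Verlet update:
//   x' = x + (x - x_prev) * damping + a * dt^2,  a = (gravity + wind) * inv_mass
// e.g. with damping = 0.98, a particle that moved 1 unit last step carries
// 0.98 units of that motion into this step before forces are added.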

    // Step 3: Distance Constraints
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];

    varying float stiffness_vec = cfg->stiffness;
    varying float stretch_factor_vec = cfg->stretch_factor;
    varying float max_strain_vec_abs = cfg->max_strain;
    varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec;

    // Copy particles to local arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    px_local[i + programIndex] = cs->p_pos_x[p_base + i + programIndex];
    py_local[i + programIndex] = cs->p_pos_y[p_base + i + programIndex];
    pz_local[i + programIndex] = cs->p_pos_z[p_base + i + programIndex];
    pm_local[i + programIndex] = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
    }

    // Iterate over constraints
    for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) {
// First block (constraints 0 to 7)
    {
varying float p0x = px_local[programIndex];
varying float p0y = py_local[programIndex];
varying float p0z = pz_local[programIndex];
varying float p0m = pm_local[programIndex];
varying float p1x = px_local[programIndex + 1]; // unaligned gather of elements 1..8
varying float p1y = py_local[programIndex + 1];
varying float p1z = pz_local[programIndex + 1];
varying float p1m = pm_local[programIndex + 1];
varying float rest_len_vec = cs->rest_lengths[r_base + programIndex];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));

    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);

varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;

    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);

    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

px_local[programIndex] = p0x + delta_x * p0m;
py_local[programIndex] = p0y + delta_y * p0m;
pz_local[programIndex] = p0z + delta_z * p0m;

px_local[programIndex + 1] = p1x - delta_x * p1m;
py_local[programIndex + 1] = p1y - delta_y * p1m;
pz_local[programIndex + 1] = p1z - delta_z * p1m;
    }

// Second block (constraints 8 to 14; the last lane is masked since constraint 15 does not exist)
    {
varying float p0x = px_local[8 + programIndex];
varying float p0y = py_local[8 + programIndex];
varying float p0z = pz_local[8 + programIndex];
varying float p0m = pm_local[8 + programIndex];
varying float p1x = px_local[9 + programIndex]; // last lane reads the dummy element 16
varying float p1y = py_local[9 + programIndex];
varying float p1z = pz_local[9 + programIndex];
varying float p1m = pm_local[9 + programIndex];
varying float rest_len_vec = cs->rest_lengths[r_base + 8 + programIndex];

    varying float dx = p1x - p0x;
    varying float dy = p1y - p0y;
    varying float dz = p1z - p0z;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float w_sum = p0m + p1m;
    varying float rcp_w_sum = rcp(max(w_sum, eps_vec));

    varying float diff = dist - rest_len_vec;
    varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
    varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);

varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
    varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;

    varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);
    // Mask last lane (i=15) to disable constraint
    apply_corr_mask = select(programIndex < 7, apply_corr_mask, 0.0);

    varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
    varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;

px_local[8 + programIndex] = p0x + delta_x * p0m;
py_local[8 + programIndex] = p0y + delta_y * p0m;
pz_local[8 + programIndex] = p0z + delta_z * p0m;

px_local[9 + programIndex] = p1x - delta_x * p1m;
py_local[9 + programIndex] = p1y - delta_y * p1m;
pz_local[9 + programIndex] = p1z - delta_z * p1m;
    }
    }

    // Write back to global arrays
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    cs->p_pos_x[p_base + i + programIndex] = px_local[i + programIndex];
    cs->p_pos_y[p_base + i + programIndex] = py_local[i + programIndex];
    cs->p_pos_z[p_base + i + programIndex] = pz_local[i + programIndex];
    }
    }
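
// Distance-constraint example (one lane): rest = 1, dist = 1.2 and equal
// weights p0m = p1m = 1 give diff = 0.2, strain = 0.2 and
// dynamic_stiffness = stiffness / 1.2; each endpoint then moves
// 0.2 * dynamic_stiffness * stretch_factor / 2 along the segment, in
// opposite directions. The 1 / (1 + |strain|) factor softens the response
// as stretch grows, which keeps hard yanks from overshooting.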

    // Step 4: Sphere Collisions with Friction
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    uniform int s_base = c * SPHERES_PER_CHAIN;
    uniform struct chain_cfg *cfg = &cs->chain_configs[c];
    varying float friction_vec = cfg->friction;

    for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) {
    varying float sphere_x_vec = cs->sphere_x[s_base + s];
    varying float sphere_y_vec = cs->sphere_y[s_base + s];
    varying float sphere_z_vec = cs->sphere_z[s_base + s];
    varying float sphere_r_vec = cs->sphere_radius[s_base + s];

    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float p_curr_x = cs->p_pos_x[p_base + i + programIndex];
    varying float p_curr_y = cs->p_pos_y[p_base + i + programIndex];
    varying float p_curr_z = cs->p_pos_z[p_base + i + programIndex];

    varying float p_prev_x = cs->prev_p_pos_x[p_base + i + programIndex];
    varying float p_prev_y = cs->prev_p_pos_y[p_base + i + programIndex];
    varying float p_prev_z = cs->prev_p_pos_z[p_base + i + programIndex];
    varying float p_inv_mass = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float dx = p_curr_x - sphere_x_vec;
    varying float dy = p_curr_y - sphere_y_vec;
    varying float dz = p_curr_z - sphere_z_vec;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = sphere_r_vec - dist;
    varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float normal_x = dx * inv_dist;
    varying float normal_y = dy * inv_dist;
    varying float normal_z = dz * inv_dist;

    varying float normal_corr_x = normal_x * penetration;
    varying float normal_corr_y = normal_y * penetration;
    varying float normal_corr_z = normal_z * penetration;

    varying float vel_x = p_curr_x - p_prev_x;
    varying float vel_y = p_curr_y - p_prev_y;
    varying float vel_z = p_curr_z - p_prev_z;

    varying float vel_dot_normal = vel_x * normal_x + vel_y * normal_y + vel_z * normal_z;
    varying float tangent_x = vel_x - vel_dot_normal * normal_x;
    varying float tangent_y = vel_y - vel_dot_normal * normal_y;
    varying float tangent_z = vel_z - vel_dot_normal * normal_z;

    varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z;
    varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec));

    varying float friction_mag = penetration * friction_vec;
    varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag;
    varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag;
    varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag;

    varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass;
    varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass;
    varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass;

    cs->p_pos_x[p_base + i + programIndex] = p_curr_x + total_corr_x;
    cs->p_pos_y[p_base + i + programIndex] = p_curr_y + total_corr_y;
    cs->p_pos_z[p_base + i + programIndex] = p_curr_z + total_corr_z;
    }
    }
    }
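
// Collision note: corrections are purely positional (PBD style). The normal
// term pushes a penetrating particle back to the sphere surface, while the
// friction term removes tangential motion proportional to
// penetration * cfg->friction, so deeper contacts brake harder. Both are
// scaled by inv_mass, so pinned particles (inv_mass = 0) never move.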

    // Step 5: Motion Constraints
    // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8
    for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
    uniform int p_base = c * PARTICLES_PER_CHAIN;
    for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    varying float px = cs->p_pos_x[p_base + i + programIndex];
    varying float py = cs->p_pos_y[p_base + i + programIndex];
    varying float pz = cs->p_pos_z[p_base + i + programIndex];
    varying float pm = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);

    varying float rx = cs->rest_pos_x[p_base + i + programIndex];
    varying float ry = cs->rest_pos_y[p_base + i + programIndex];
    varying float rz = cs->rest_pos_z[p_base + i + programIndex];
    varying float radius_vec = cs->motion_radius[p_base + i + programIndex];

    varying float dx = px - rx;
    varying float dy = py - ry;
    varying float dz = pz - rz;

    varying float dist_sq = dx * dx + dy * dy + dz * dz;
    varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
    varying float dist = rcp(inv_dist);

    varying float penetration = dist - radius_vec;
    varying float mask = select(penetration > zero_vec, 1.0, 0.0);

    varying float corr_factor = penetration * inv_dist * mask;
    varying float delta_x = dx * corr_factor * pm;
    varying float delta_y = dy * corr_factor * pm;
    varying float delta_z = dz * corr_factor * pm;

    cs->p_pos_x[p_base + i + programIndex] = px - delta_x;
    cs->p_pos_y[p_base + i + programIndex] = py - delta_y;
    cs->p_pos_z[p_base + i + programIndex] = pz - delta_z;
    }
    }
    }
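
For reference, a minimal host-side driver for the exported entry point. This is a sketch: it assumes the file is compiled with something like "ispc chain.ispc -o chain.o -h chain_ispc.h --target=avx2-i32x8", and the header name and the tick_physics wrapper are illustrative, not part of the gist:

#include "chain_ispc.h"   /* generated by ispc -h; declares chain_sim and simulate_chains */

static struct chain_sim g_sim;  /* zero-initialized; chains/particles filled by game code */

void tick_physics(float dt) {
    /* a fixed, small dt keeps the Verlet step and the PBD iterations stable */
    simulate_chains(&g_sim, dt);
}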
  22. vurtun revised this gist Jun 29, 2025. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion chain_avbd.c
    @@ -257,7 +257,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
  23. vurtun revised this gist Jun 29, 2025. 2 changed files with 143 additions and 122 deletions.
    147 changes: 86 additions & 61 deletions chain.c
    @@ -94,13 +94,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 half_vec = _mm256_set1_ps(0.5f);
    const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
    const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
    const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
    const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    @@ -134,6 +127,35 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0f;
    py_local[PARTICLES_PER_CHAIN] = 0.0f;
    pz_local[PARTICLES_PER_CHAIN] = 0.0f;
    pm_local[PARTICLES_PER_CHAIN] = 0.0f;

    // Stack arrays for precomputed data
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHAINS; c += 8) {
    // Load chain quaternions
    @@ -255,66 +277,69 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity (SIMD with polynomial approximations)
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);

    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);

    // Quaternion inverse
    __m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy));
    norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq);
    norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq);
    __m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec));

    __m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq);
    __m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq);
    __m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq);
    __m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq);

    // Delta quaternion: q_current * q_prev^-1
    __m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx));
    delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx));
    __m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy));
    delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy));
    __m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz));
    delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz));
    __m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw));
    delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw);
    delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw);

    // Angular velocity with polynomial approximations
    __m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec);

    // Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)))
    __m256 x = qw_clamped;
    __m256 x2 = _mm256_mul_ps(x, x);
    __m256 x4 = _mm256_mul_ps(x2, x2);
    __m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec);
    poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec);
    poly = _mm256_fmadd_ps(x2, poly, one_vec);
    poly = _mm256_mul_ps(x, poly);
    __m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly));

    // Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2))
    __m256 half_angle = _mm256_mul_ps(angle, half_vec);
    __m256 ha2 = _mm256_mul_ps(half_angle, half_angle);
    __m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec);
    sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec);
    __m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly);
    __m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec));

    // Sign: Use qw > 0 instead of angle < π
    __m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ);
    sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec));

    // Angular velocity: (sign * qx * inv_sin_half_angle / dt)
    __m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale);

    // Step 1: Compute delta quaternion (to - from)
    __m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 delta_qw = _mm256_sub_ps(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
    __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
    __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
    __m256 conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);

    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));

    __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
    orient_qx = _mm256_add_ps(orient_qx, cross_x);
    __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
    orient_qy = _mm256_add_ps(orient_qy, cross_y);
    __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
    orient_qz = _mm256_add_ps(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
    dot = _mm256_fmadd_ps(qy, prev_qy, dot);
    dot = _mm256_fmadd_ps(qz, prev_qz, dot);

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
    orient_qz = _mm256_mul_ps(orient_qz, negate_sign);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);

    // Store results
    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
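    // One-lane reference of the replacement path above (a sketch, not part of
    // the gist). It mirrors the SIMD steps exactly, including the gist's
    // 1/(2*dt) scale; note the textbook small-angle identity
    // omega ≈ 2*vec(dq*conj(q_from))/dt uses a different constant convention.
    // Quaternions are stored as (x, y, z, w):
    static void quat_ang_vel_ref(const float to[4], const float from[4],
                                 float dt, float w[3]) {
        float dq[4] = { to[0]-from[0], to[1]-from[1], to[2]-from[2], to[3]-from[3] };
        float cx = -from[0], cy = -from[1], cz = -from[2], cw = from[3]; // conj(from)
        // vector part of dq * conj(from): w1*v2 + w2*v1 + v1 x v2
        float ox = dq[3]*cx + cw*dq[0] + dq[1]*cz - dq[2]*cy;
        float oy = dq[3]*cy + cw*dq[1] + dq[2]*cx - dq[0]*cz;
        float oz = dq[3]*cz + cw*dq[2] + dq[0]*cy - dq[1]*cx;
        // shortest-arc flip when the quaternions sit in opposite hemispheres
        float dot = to[0]*from[0] + to[1]*from[1] + to[2]*from[2] + to[3]*from[3];
        float s = (dot < 0.0f) ? -1.0f : 1.0f;
        float k = s / (2.0f * dt);
        w[0] = ox * k; w[1] = oy * k; w[2] = oz * k;
    }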
    118 changes: 57 additions & 61 deletions chain_avbd.c
    @@ -103,13 +103,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 half_vec = _mm256_set1_ps(0.5f);
    const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
    const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
    const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
    const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    @@ -264,66 +257,69 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity (SIMD with polynomial approximations)
    // Compute angular velocity
    {
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);

    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);

    // Quaternion inverse
    __m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy));
    norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq);
    norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq);
    __m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec));

    __m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq);
    __m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq);
    __m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq);
    __m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq);

    // Delta quaternion: q_current * q_prev^-1
    __m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx));
    delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx));
    __m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy));
    delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy));
    __m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz));
    delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz));
    __m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw));
    delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw);
    delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw);

    // Angular velocity with polynomial approximations
    __m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec);

    // Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)))
    __m256 x = qw_clamped;
    __m256 x2 = _mm256_mul_ps(x, x);
    __m256 x4 = _mm256_mul_ps(x2, x2);
    __m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec);
    poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec);
    poly = _mm256_fmadd_ps(x2, poly, one_vec);
    poly = _mm256_mul_ps(x, poly);
    __m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly));

    // Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2))
    __m256 half_angle = _mm256_mul_ps(angle, half_vec);
    __m256 ha2 = _mm256_mul_ps(half_angle, half_angle);
    __m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec);
    sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec);
    __m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly);
    __m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec));

    // Sign: Use qw > 0 instead of angle < π
    __m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ);
    sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec));

    // Angular velocity: (sign * qx * inv_sin_half_angle / dt)
    __m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale);

    // Step 1: Compute delta quaternion (to - from)
    __m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 delta_qw = _mm256_sub_ps(qw, prev_qw);

    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
    __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
    __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
    __m256 conj_qw = prev_qw;

    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
    // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
    __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);

    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));

    __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
    orient_qx = _mm256_add_ps(orient_qx, cross_x);
    __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
    orient_qy = _mm256_add_ps(orient_qy, cross_y);
    __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
    orient_qz = _mm256_add_ps(orient_qz, cross_z);

    // Step 4: Compute dot product (to, from) for shortest arc check
    __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
    dot = _mm256_fmadd_ps(qy, prev_qy, dot);
    dot = _mm256_fmadd_ps(qz, prev_qz, dot);

    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
    _mm256_andnot_ps(negate_mask, one_vec));

    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
    orient_qz = _mm256_mul_ps(orient_qz, negate_sign);

    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);

    // Store results
    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
  24. vurtun revised this gist Jun 23, 2025. 1 changed file with 1068 additions and 0 deletions.
    1,068 changes: 1,068 additions & 0 deletions chain_avbd.c
    @@ -0,0 +1,1068 @@
    #include <immintrin.h>
    #include <math.h>

    enum {
    MAX_CHAINS = 64,
    PARTICLES_PER_CHAIN = 16,
    SPHERES_PER_CHAIN = 8,
    MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
    MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
    CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
    MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
    MAX_ITERATIONS = 4,
    };
    struct chain_cfg {
    float damping;
    float friction;
    float collision_radius;
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force

    float avbd_alpha; // Regularization parameter (e.g., 0.95f)
    float avbd_beta; // Stiffness ramping control (e.g., 10.0f)
    float avbd_gamma; // Warm starting scaling (e.g., 0.99f)
    float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f)
    float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f)
    } __attribute__((aligned(32)));
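/* Illustrative defaults only: the avbd_* values come from the parameter
 * comments above; damping, friction, radius, and the inertia weights are
 * assumptions. The gist itself ships no initializer, so treat this as a
 * sketch of a plausible configuration: */
static const struct chain_cfg chain_cfg_default = {
    .damping = 0.98f,           /* assumed; any value in (0, 1] */
    .friction = 0.3f,           /* assumed */
    .collision_radius = 0.05f,  /* assumed */
    .linear_inertia = 1.0f,
    .angular_inertia = 1.0f,
    .centrifugal_inertia = 1.0f,
    .avbd_alpha = 0.95f,
    .avbd_beta = 10.0f,
    .avbd_gamma = 0.99f,
    .avbd_k_start = 100.0f,
    .avbd_k_max = 1e6f,
};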

    struct chain_sim {
    unsigned short free_idx_cnt;
    unsigned short free_idx[MAX_CHAINS];
    struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain global forces (world space gravity and wind)
    float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));

    float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain transformations (world space)
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    // Particle positions (model space)
    float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));

    float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));

    float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32)));
    float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32)));

    // Sphere positions (model space)
    float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
    float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));

    float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32)));
    float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32)));
    float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32)));

    // Rest positions (model space)
    float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
    };
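
/* Scalar reference (a sketch, not part of the original gist) for the
 * conj(q) * v * q rotation used in Step 0 below to bring world-space wind and
 * gravity into chain-local space. One lane of the SIMD math, with quaternions
 * stored as (x, y, z, w): */
static void rotate_world_to_local_ref(const float q[4], const float v[3], float out[3]) {
    /* t = conj(q) * (0, v), with conj(q) = (-qx, -qy, -qz, qw) */
    float tw = q[0]*v[0] + q[1]*v[1] + q[2]*v[2];
    float tx = q[3]*v[0] - q[1]*v[2] + q[2]*v[1];
    float ty = q[3]*v[1] - q[2]*v[0] + q[0]*v[2];
    float tz = q[3]*v[2] - q[0]*v[1] + q[1]*v[0];
    /* out = vector part of t * q */
    out[0] = tw*q[0] + tx*q[3] + ty*q[2] - tz*q[1];
    out[1] = tw*q[1] + ty*q[3] + tz*q[0] - tx*q[2];
    out[2] = tw*q[2] + tz*q[3] + tx*q[1] - ty*q[0];
}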

    void simulate_chains(struct chain_sim *cs, float dt) {
    // Common SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    const __m256 one_vec = _mm256_set1_ps(1.0f);
    const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
    const __m256 eps_vec = _mm256_set1_ps(1e-6f);
    const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 half_vec = _mm256_set1_ps(0.5f);
    const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
    const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
    const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
    const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));

// Local scratch arrays for the AVBD constraint solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));

    // Initialize dummy elements
    px_local[PARTICLES_PER_CHAIN] = 0.0f;
    py_local[PARTICLES_PER_CHAIN] = 0.0f;
    pz_local[PARTICLES_PER_CHAIN] = 0.0f;
    pm_local[PARTICLES_PER_CHAIN] = 0.0f;

    // Stack arrays for precomputed data
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHAINS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    // Compute local-space wind
    {
    __m256 vx = _mm256_load_ps(&cs->wind_x[c]);
    __m256 vy = _mm256_load_ps(&cs->wind_y[c]);
    __m256 vz = _mm256_load_ps(&cs->wind_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&wind_local_x[c], result_x);
    _mm256_store_ps(&wind_local_y[c], result_y);
    _mm256_store_ps(&wind_local_z[c], result_z);
    }
    // Compute local-space gravity
    {
    __m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
    __m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
    __m256 vz = _mm256_load_ps(&cs->gravity_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&gravity_local_x[c], result_x);
    _mm256_store_ps(&gravity_local_y[c], result_y);
    _mm256_store_ps(&gravity_local_z[c], result_z);
    }
    // Compute linear velocity
    {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);

    __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
    __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
    __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);

    _mm256_store_ps(&vel_x[c], vel_x_vec);
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity (SIMD with polynomial approximations)
    {
    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);

    // Quaternion inverse
    __m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy));
    norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq);
    norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq);
    __m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec));

    __m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq);
    __m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq);
    __m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq);
    __m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq);

    // Delta quaternion: q_current * q_prev^-1
    __m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx));
    delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx));
    __m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy));
    delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy));
    __m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz));
    delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz));
    __m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw));
    delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw);
    delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw);

    // Angular velocity with polynomial approximations
    __m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec);

    // Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)))
    __m256 x = qw_clamped;
    __m256 x2 = _mm256_mul_ps(x, x);
    __m256 x4 = _mm256_mul_ps(x2, x2);
    __m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec);
    poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec);
    poly = _mm256_fmadd_ps(x2, poly, one_vec);
    poly = _mm256_mul_ps(x, poly);
    __m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly));

    // Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2))
    __m256 half_angle = _mm256_mul_ps(angle, half_vec);
    __m256 ha2 = _mm256_mul_ps(half_angle, half_angle);
    __m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec);
    sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec);
    __m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly);
    __m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec));

    // Sign: Use qw > 0 instead of angle < π
    __m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ);
    sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec));

    // Angular velocity: (sign * qx * inv_sin_half_angle / dt)
    __m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale);
    __m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale);
    __m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale);

    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];

    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);

    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);

    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);

    // Linear inertia: v * dt * linear_inertia
    __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
    ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
    ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
    __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    // Calculate displacement: (ω × (ω × r)) * dt^2
    __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);

    // Apply the centrifugal_inertia factor
    __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
    __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
    __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);

    // Total delta
    __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
    __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
    __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);

    // Update positions
    _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
    }
    // Update previous transformation
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    }
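    // Worked form of the Step-1 displacement just applied (annotation, not gist code):
    //   d = v*dt*k_lin + (w x r)*dt*k_ang + (w x (w x r))*dt^2*k_cent
    // where r is the particle's model-space position, v and w are the chain's
    // linear and angular velocity from Step 0, and the k factors are the clamped
    // [0,1] inertia weights from chain_cfg. With all weights at 1 the particles
    // inherit the full rigid-body motion of the chain root; at 0 they ignore it.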
    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];

    __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
    __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
    __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);

    __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
    __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
    __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);

    __m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
    __m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
    __m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);

    __m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
    __m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
    __m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);

    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
    __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
    __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);

    __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
    __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
    __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);

    __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
    __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
    __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));

    _mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);

    _mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
    _mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
    _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
    }
    }
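    // Damped Verlet update above, in scalar form (annotation):
    //   x_new  = x + damping*(x - x_prev) + (gravity + wind)*inv_mass*dt^2
    //   x_prev = x
    // damping near 1 preserves momentum; smaller values bleed velocity per step.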

    // --- AVBD Warm-starting of lambda and k_current for all distance constraints.
    for (int c = 0; c < MAX_CHAINS; ++c) {
    struct chain_cfg* cfg = &cs->chain_configs[c];
    __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
    __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);

    // Loop through each block of 8 constraints for warm-starting
    int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints
    for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) {
    // Load current lambda, k_current for 8 constraints simultaneously
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]);
    // Apply warm-starting (Eq. 19 from paper)
    // lambda_new = lambda_prev_frame * alpha * gamma
    current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
    // k_current_new = max(K_START, k_current_prev_frame * gamma)
    current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
    // Store updated warm-started values back
    _mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec);
    }
    }
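    // Warm-start rule above in scalar form (annotation; Eq. 19 as cited):
    //   lambda <- lambda * alpha * gamma
    //   k      <- max(k_start, k * gamma)
    // The loop below applies the same rule to the collision-constraint state.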
    // Warm-starting for Collision Constraints ---
    for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
    struct chain_cfg* cfg = &cs->chain_configs[c_idx];
    __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
    __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);

    int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { // 8 lanes per __m256 (the original used an undefined VEC_FLOAT_COUNT)
    __m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
    __m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);

    current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
    current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));

    _mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec);
    _mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec);
    }
    }
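    // Step 3: AVBD distance-constraint solver (per iteration: primal position
    // update, then dual lambda/stiffness update for each constraint block)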
    for (int c = 0; c < MAX_CHAINS; c++) {
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta);
    __m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max);

    int p_base = c * PARTICLES_PER_CHAIN;
    int r_base = c * CONSTRAINTS_PER_CHAIN;
    // Load all current particle positions into local buffer (once per chain per iteration)
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    _mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i]));
    _mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i]));
    _mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i]));
    _mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cs->inv_mass[p_base + i]), inv_mass_clamp_max), inv_mass_clamp_min));
    }
    for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // First block (i=0 to 7 of particles, processing constraints 0 to 7)
    {
    // p0 is aligned from px_local[0]
    __m256 p0x = _mm256_load_ps(&px_local[0]);
    __m256 p0y = _mm256_load_ps(&py_local[0]);
    __m256 p0z = _mm256_load_ps(&pz_local[0]);
    __m256 p0m = _mm256_load_ps(&pm_local[0]);

    // p1 is unaligned from px_local[1]
    __m256 p1x = _mm256_loadu_ps(&px_local[1]);
    __m256 p1y = _mm256_loadu_ps(&py_local[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[1]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[1]);

    // This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7]
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);

    // Calculate current vector and distance between particles
    __m256 dx_vec = _mm256_sub_ps(p1x, p0x);
    __m256 dy_vec = _mm256_sub_ps(p1y, p0y);
    __m256 dz_vec = _mm256_sub_ps(p1z, p0z);

    __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
    dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist
    __m256 dist = _mm256_rcp_ps(inv_dist); // Current distance

    // Primal Update for Distance Constraints (Block 1) ---
    // Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7)
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]);
    __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]);

    // Original constraint error C_j*(x) = current_dist - target_length
    __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);

    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
    __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));

    // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
    // The effective displacement is proportional to this force divided by effective stiffness (k) and mass.
    __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);

    // Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
    // This is simplified, representing the desired displacement due to the augmented force
    __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor

    // Normalize difference vector
    __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);

    // Calculate the actual displacement vectors
    // Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1
    // The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction.
    // So, we just multiply by the unit direction and inverse mass.
    __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);

    // Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0)
    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);

    // Update positions in local buffer
    // p0 moves along -delta (subtract), p1 moves along +delta (add)
    _mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
    _mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
    _mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));

    _mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
    _mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
    _mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));

    // Dual Update for Distance Constraints (Block 1) ---
    // Recalculate constraint error *after* position updates for accurate lambda update
    // Reload updated positions from local buffer to get the 'post-primal' state
    p0x = _mm256_load_ps(&px_local[0]);
    p0y = _mm256_load_ps(&py_local[0]);
    p0z = _mm256_load_ps(&pz_local[0]);
    p1x = _mm256_loadu_ps(&px_local[1]);
    p1y = _mm256_loadu_ps(&py_local[1]);
    p1z = _mm256_loadu_ps(&pz_local[1]);

    __m256 updated_dx = _mm256_sub_ps(p1x, p0x);
    __m256 updated_dy = _mm256_sub_ps(p1y, p0y);
    __m256 updated_dz = _mm256_sub_ps(p1z, p0z);

    __m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
    updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
    updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);

    __m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
    __m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);

    __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update

    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
    _mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
    __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
    __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back
    }
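    // Dual update above in scalar form (annotation; Eq. 11/12 as cited):
    //   lambda <- lambda + k * C(x)              after the primal position update
    //   k      <- min(k_max, k + beta * |C(x)|)  stiffness ramping
    // with C(x) = |p1 - p0| - rest_length for a distance constraint.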
    // Second block (i=8 to 14 of particles, processing constraints 8 to 14)
    {
    // p0 is aligned from px_local[8]
    __m256 p0x = _mm256_load_ps(&px_local[8]);
    __m256 p0y = _mm256_load_ps(&py_local[8]);
    __m256 p0z = _mm256_load_ps(&pz_local[8]);
    __m256 p0m = _mm256_load_ps(&pm_local[8]);

    // p1 is unaligned from px_local[9], *including* the dummy element at px_local[16]:
    // the last lane pairs particle 15 with the zero-mass dummy at index 16
    __m256 p1x = _mm256_loadu_ps(&px_local[9]);
    __m256 p1y = _mm256_loadu_ps(&py_local[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[9]);
    __m256 p1m = _mm256_loadu_ps(&pm_local[9]);

    // This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15]
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);

    __m256 dx_vec = _mm256_sub_ps(p1x, p0x);
    __m256 dy_vec = _mm256_sub_ps(p1y, p0y);
    __m256 dz_vec = _mm256_sub_ps(p1z, p0z);

    __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
    dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));

    __m256 dist = _mm256_rcp_ps(inv_dist);

    // Primal Update for Distance Constraints (Block 2) ---
    // Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15)
    __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]);
    __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]);
    __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]);

    // Original constraint error C_j*(x) = current_dist - target_length
    __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);

    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
    __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));

    // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
    __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);

    // Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
    __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    // This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle
    // for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero.
    // For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15.
    // The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy).
    // The mask effectively disables the constraint correction for the dummy particle.
    // Note: a float -1.0f is not an all-ones bit pattern, so build the lane
    // mask from integer -1s to make the AND below a true pass-through.
    __m256 valid_mask = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
    apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);

    __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
    __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
    __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);

    __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
    __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
    __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);

    delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
    delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
    delta_z = _mm256_and_ps(delta_z, apply_corr_mask);

    // Update positions in local buffer
    _mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
    _mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
    _mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));

    _mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
    _mm256_storeu_ps(&py_local[9], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
    _mm256_storeu_ps(&pz_local[9], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));

    // Recalculate constraint error *after* position updates for accurate lambda update
    // Reload updated positions from local buffer
    p0x = _mm256_load_ps(&px_local[8]);
    p0y = _mm256_load_ps(&py_local[8]);
    p0z = _mm256_load_ps(&pz_local[8]);
    p1x = _mm256_loadu_ps(&px_local[9]);
    p1y = _mm256_loadu_ps(&py_local[9]);
    p1z = _mm256_loadu_ps(&pz_local[9]);

    __m256 updated_dx = _mm256_sub_ps(p1x, p0x);
    __m256 updated_dy = _mm256_sub_ps(p1y, p0y);
    __m256 updated_dz = _mm256_sub_ps(p1z, p0z);

    __m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
    updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
    updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);

    __m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
    __m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);

    __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
    _mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
    __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
    __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
    _mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back
    } // End of second block
    }
    // After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i]));
    _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i]));
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i]));
    }
    }
    // Step 4: Sphere Collisions with Friction
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int s_base = c * SPHERES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    __m256 friction_vec = _mm256_set1_ps(cfg->friction);

    for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
    __m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
    __m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
    __m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
    __m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);

    __m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha);
    __m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta);
    __m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);

    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
    __m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
    __m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);

    __m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
    __m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);

    __m256 normal_x = _mm256_mul_ps(dx, inv_dist);
    __m256 normal_y = _mm256_mul_ps(dy, inv_dist);
    __m256 normal_z = _mm256_mul_ps(dz, inv_dist);

    // Load AVBD state for this particle block
    __m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
    __m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
    __m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]);

    // AVBD Primal Update:
    // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame)
    __m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec));

    // Apply collision_mask to regularized error, so only active constraints contribute.
    // This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0.
    C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask);

    // Calculate force term: (k * C_reg + lambda)
    __m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec);

    // Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current
    // For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))).
    // H_j_inv for a single particle collision is its inverse mass (p_inv_mass).
    // So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current
    // The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration).
    __m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec);

    // Apply the correction along the normal direction.
    // The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term')
    // means positive 'correction_magnitude_scalar' should push OUT of collision.
    // So we subtract from particle position.
    __m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar);
    __m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar);
    __m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar);

    // Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration)
    p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x);
    p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y);
    p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);

    // Apply friction (post-stabilization step)
    // Recalculate velocity based on updated positions
    __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
    __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
    __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);

    // Project velocity onto normal to get normal component
    __m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
    vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
    vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);

    // Get tangential velocity component
    __m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
    __m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
    __m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));

    __m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
    tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);

    // Calculate friction force magnitude limit (based on normal force proxy)
    __m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec);

    // Actual tangential velocity length
    __m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));

    // Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down.
    __m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec));
    friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction)

    // Apply friction as a reduction of tangential displacement (position-level).
    // `vel_x/y/z` are the displacements from `p_prev` to `p_curr`, so the
    // tangential slide this step is `tangent_x/y/z`. We remove
    // `friction_scale * tangent`: full stick when the limit exceeds the slide
    // (scale clamped to 1), a Coulomb-limited slip otherwise.
    __m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale);
    __m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale);
    __m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale);

    // Only apply if actively colliding
    friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask);
    friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask);
    friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask);

    // Apply friction by moving the particle against the tangential direction.
    // friction_scale = 0 (no friction) leaves the motion unchanged; friction_scale = 1
    // (full friction) removes the tangential displacement entirely.
    p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr);
    p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr);
    p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);
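    // Net effect per lane (scalar sketch): p -= t * min(1, friction*pen / max(|t|, eps)),
    // where t is this iteration's tangential displacement and pen the penetration depth.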

    // --- Dual Update for Collision Constraints ---
    // Recalculate collision error *after* position updates (normal & friction)
    // Reload updated positions
    p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
    dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
    dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);

    dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    dist = _mm256_rcp_ps(inv_dist);
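    // Note: dist = rcp(rsqrt(x)) ~= sqrt(x); rsqrt and rcp are each accurate to only
    // about 12 bits, the speed-over-precision trade-off used throughout this kernel.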

    __m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update

    // Apply the collision mask again for updates, so only active constraints update their AVBD state
    C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask);

    // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
    __m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec);
    _mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec);

    // Update k_current (Stiffness ramping, Eq. 12)
    // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
    __m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value
    __m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec);
    __m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec);
    _mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec);
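    // Scalar reference for the dual update above (one lane, names as in this file):
    //   C         = radius - dist;                        // post-projection error
    //   lambda   += k_current * C;                        // Eq. 11
    //   k_current = min(K_MAX, k_current + BETA * |C|);   // Eq. 12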
    // --- END AVBD ADDITION ---
    }
    }
    }
    // Step 5: Motion Constraints
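    // Each particle is tethered to its rest position: once it drifts beyond
    // motion_radius it is pulled back along the offset direction, scaled by its
    // clamped inverse mass (a soft, mass-weighted tether rather than a hard clamp).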
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    __m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
    pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);

    __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
    __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
    __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
    __m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);

    __m256 dx = _mm256_sub_ps(px, rx);
    __m256 dy = _mm256_sub_ps(py, ry);
    __m256 dz = _mm256_sub_ps(pz, rz);

    __m256 dist_sq = _mm256_mul_ps(dx, dx);
    dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
    dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);

    __m256 penetration = _mm256_sub_ps(dist, radius_vec);
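    // Here 'penetration' is the overshoot beyond the motion radius
    // (positive when the particle is outside its allowed sphere).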
    __m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);

    __m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
    corr_factor = _mm256_and_ps(corr_factor, mask);

    __m256 delta_x = _mm256_mul_ps(dx, corr_factor);
    __m256 delta_y = _mm256_mul_ps(dy, corr_factor);
    __m256 delta_z = _mm256_mul_ps(dz, corr_factor);

    delta_x = _mm256_mul_ps(delta_x, pm);
    delta_y = _mm256_mul_ps(delta_y, pm);
    delta_z = _mm256_mul_ps(delta_z, pm);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
    _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
    }
    }
    // --- Store Final Errors for Next Frame's Regularization ---
    // This happens once at the very end of simulate_chains, after all iterations are complete for the frame.
    for (int c = 0; c < MAX_CHAINS; ++c) {
    int p_base = c * PARTICLES_PER_CHAIN;
    int r_base = c * CONSTRAINTS_PER_CHAIN;
    // Loop through each block of 8 constraints (distance constraints are N-1 for N particles)
    for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) {
    // Calculate the current error for this distance constraint block
    // (Similar to C_x_original calculation from Step 3, but using final positions)
    // Load final positions for p0 and p1 (from global arrays now, as local buffer is stale)
    __m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]);
    __m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]);
    __m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]);

    __m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]); // +1 offset breaks 32-byte alignment, so the unaligned load is required
    __m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]);
    __m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]);

    __m256 diff_x = _mm256_sub_ps(px1, px0);
    __m256 diff_y = _mm256_sub_ps(py1, py0);
    __m256 diff_z = _mm256_sub_ps(pz1, pz0);

    __m256 dist_sq = _mm256_mul_ps(diff_x, diff_x);
    dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq);
    dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq);

    __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
    __m256 dist = _mm256_rcp_ps(inv_dist);
    __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]);
    __m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec);
    _mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization
    }
    }
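    // Store the deepest sphere-collision error per particle, mirroring the
    // distance-constraint pass above, so collision constraints can also be
    // regularized next frame.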
    for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
    int p_base = c_idx * PARTICLES_PER_CHAIN;
    int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
    __m256 deepest_penetration_vec = neg_flt_max_vec; // Initialize with -FLT_MAX (raised to >= 0 by the max() below)
    // Loop through all spheres for this chain to find the max penetration for each particle in the block
    for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) {
    __m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]);
    __m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]);
    __m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]);
    __m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]);

    __m256 dx_s = _mm256_sub_ps(p_curr_x, s_x);
    __m256 dy_s = _mm256_sub_ps(p_curr_y, s_y);
    __m256 dz_s = _mm256_sub_ps(p_curr_z, s_z);

    __m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s);
    dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s);
    dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s);

    __m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec));
    __m256 dist_s = _mm256_rcp_ps(inv_dist_s);

    __m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance

    // We only care about positive penetration (i.e., actually colliding)
    current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s);
    deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s);
    }
    _mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization
    }
    }
    }
  25. vurtun revised this gist Jun 6, 2025. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -691,9 +691,9 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag);
    __m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag);

    __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x);
    __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y);
    __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z);
    __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x);
    __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y);
    __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z);

    total_corr_x = _mm256_and_ps(total_corr_x, collision_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, collision_mask);
  26. vurtun revised this gist Jun 3, 2025. 1 changed file with 107 additions and 55 deletions.
    162 changes: 107 additions & 55 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -13,8 +13,6 @@ enum {
    };

    struct chain_cfg {
    float gravity_x, gravity_y, gravity_z; // World space
    float wind_x, wind_y, wind_z; // World space
    float damping;
    float stiffness;
    float stretch_factor;
    @@ -31,6 +29,15 @@ struct chain_sim {
    unsigned short free_idx[MAX_CHAINS];
    struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain global forces (world space gravity and wind)
    float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));

    float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_z[MAX_CHAINS] __attribute__((aligned(32)));

    // Chain transformations (world space)
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    @@ -96,6 +103,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    @@ -135,53 +143,95 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    // Compute local-space wind
    {
    __m256 vx = _mm256_set1_ps(cs->chain_configs[c].wind_x);
    __m256 vy = _mm256_set1_ps(cs->chain_configs[c].wind_y);
    __m256 vz = _mm256_set1_ps(cs->chain_configs[c].wind_z);

    __m256 tx = _mm256_fmadd_ps(qy, vz, _mm256_mul_ps(qw, vx));
    tx = _mm256_fnmadd_ps(qz, vy, tx);
    __m256 ty = _mm256_fmadd_ps(qz, vx, _mm256_mul_ps(qw, vy));
    ty = _mm256_fnmadd_ps(qx, vz, ty);
    __m256 tz = _mm256_fmadd_ps(qx, vy, _mm256_mul_ps(qw, vz));
    tz = _mm256_fnmadd_ps(qy, vx, tz);
    __m256 tw = _mm256_fnmadd_ps(qx, vx, zero_vec);
    tw = _mm256_fnmadd_ps(qy, vy, tw);
    tw = _mm256_fnmadd_ps(qz, vz, tw);

    __m256 result_x = _mm256_fmadd_ps(ty, _mm256_sub_ps(zero_vec, qz), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qx)));
    result_x = _mm256_fmadd_ps(tz, _mm256_sub_ps(zero_vec, qy), _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fnmadd_ps(tx, _mm256_sub_ps(zero_vec, qz), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qy)));
    result_y = _mm256_fmadd_ps(tz, _mm256_sub_ps(zero_vec, qx), _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, _mm256_sub_ps(zero_vec, qy), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qz)));
    result_z = _mm256_fmadd_ps(ty, _mm256_sub_ps(zero_vec, qx), _mm256_fmadd_ps(tz, qw, result_z));
    __m256 vx = _mm256_load_ps(&cs->wind_x[c]);
    __m256 vy = _mm256_load_ps(&cs->wind_y[c]);
    __m256 vz = _mm256_load_ps(&cs->wind_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
    // result_z = tw*qz + ty*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&wind_local_x[c], result_x);
    _mm256_store_ps(&wind_local_y[c], result_y);
    _mm256_store_ps(&wind_local_z[c], result_z);
    }
    // Compute local-space gravity
    {
    __m256 vx = _mm256_set1_ps(cs->chain_configs[c].gravity_x);
    __m256 vy = _mm256_set1_ps(cs->chain_configs[c].gravity_y);
    __m256 vz = _mm256_set1_ps(cs->chain_configs[c].gravity_z);

    __m256 tx = _mm256_fmadd_ps(qy, vz, _mm256_mul_ps(qw, vx));
    tx = _mm256_fnmadd_ps(qz, vy, tx);
    __m256 ty = _mm256_fmadd_ps(qz, vx, _mm256_mul_ps(qw, vy));
    ty = _mm256_fnmadd_ps(qx, vz, ty);
    __m256 tz = _mm256_fmadd_ps(qx, vy, _mm256_mul_ps(qw, vz));
    tz = _mm256_fnmadd_ps(qy, vx, tz);
    __m256 tw = _mm256_fnmadd_ps(qx, vx, zero_vec);
    tw = _mm256_fnmadd_ps(qy, vy, tw);
    tw = _mm256_fnmadd_ps(qz, vz, tw);

    __m256 result_x = _mm256_fmadd_ps(ty, _mm256_sub_ps(zero_vec, qz), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qx)));
    result_x = _mm256_fmadd_ps(tz, _mm256_sub_ps(zero_vec, qy), _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fnmadd_ps(tx, _mm256_sub_ps(zero_vec, qz), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qy)));
    result_y = _mm256_fmadd_ps(tz, _mm256_sub_ps(zero_vec, qx), _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, _mm256_sub_ps(zero_vec, qy), _mm256_mul_ps(tw, _mm256_sub_ps(zero_vec, qz)));
    result_z = _mm256_fmadd_ps(ty, _mm256_sub_ps(zero_vec, qx), _mm256_fmadd_ps(tz, qw, result_z));
    __m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
    __m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
    __m256 vz = _mm256_load_ps(&cs->gravity_z[c]);

    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
    __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
    __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz

    // Step 1: t = q_conj * v_world_as_quat
    // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
    // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
    // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
    // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
    // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
    tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy

    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
    ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz

    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
    tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx

    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
    tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
    tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz

    // Step 2: result_vec = (t * q)_vec
    // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
    // result_x = tw*qx + tx*qw + ty*qz - tz*qy
    // result_y = tw*qy + ty*qw + tz*qx - tx*qz
    // result_z = tw*qz + ty*qw + tx*qy - ty*qx

    __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
    __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
    __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));

    _mm256_store_ps(&gravity_local_x[c], result_x);
    _mm256_store_ps(&gravity_local_y[c], result_y);
    @@ -323,13 +373,15 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
    __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));

    __m256 cent_delta_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), p_inv_mass);
    __m256 cent_delta_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), p_inv_mass);
    __m256 cent_delta_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), p_inv_mass);
    // Calculate displacement: (ω × (ω × r)) * dt^2
    __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
    __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
    __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);

    cent_delta_x = _mm256_mul_ps(cent_delta_x, centrifugal_inertia_vec);
    cent_delta_y = _mm256_mul_ps(cent_delta_y, centrifugal_inertia_vec);
    cent_delta_z = _mm256_mul_ps(cent_delta_z, centrifugal_inertia_vec);
    // Apply the centrifugal_inertia factor
    __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
    __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
    __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);

    // Total delta
    __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
    @@ -464,7 +516,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
    __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));

    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_abs_ps(strain_clamped));
    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));

    __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
    @@ -526,7 +578,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
    __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));

    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_abs_ps(strain_clamped));
    __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
    __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));

    __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
    @@ -535,7 +587,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {

    __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);

    alignas(32) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f };
    __attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f };
    __m256 valid_mask = _mm256_load_ps(valid_mask_array);
    apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);

    @@ -639,9 +691,9 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag);
    __m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag);

    __m256 total_corr_x = _mm256_add_ps(normal_corr_x, friction_x);
    __m256 total_corr_y = _mm256_add_ps(normal_corr_y, friction_y);
    __m256 total_corr_z = _mm256_add_ps(normal_corr_z, friction_z);
    __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x);
    __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y);
    __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z);

    total_corr_x = _mm256_and_ps(total_corr_x, collision_mask);
    total_corr_y = _mm256_and_ps(total_corr_y, collision_mask);
    @@ -707,4 +759,4 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
    }
    }
    }
    }
  27. vurtun revised this gist May 20, 2025. 1 changed file with 4 additions and 12 deletions.
    16 changes: 4 additions & 12 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -35,13 +35,16 @@ struct chain_sim {
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    @@ -52,6 +55,7 @@ struct chain_sim {
    float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));

    float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
    float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
    @@ -129,7 +133,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);

    // Compute local-space wind
    {
    __m256 vx = _mm256_set1_ps(cs->chain_configs[c].wind_x);
    @@ -157,7 +160,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&wind_local_y[c], result_y);
    _mm256_store_ps(&wind_local_z[c], result_z);
    }

    // Compute local-space gravity
    {
    __m256 vx = _mm256_set1_ps(cs->chain_configs[c].gravity_x);
    @@ -185,7 +187,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&gravity_local_y[c], result_y);
    _mm256_store_ps(&gravity_local_z[c], result_z);
    }

    // Compute linear velocity
    {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    @@ -204,7 +205,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&vel_y[c], vel_y_vec);
    _mm256_store_ps(&vel_z[c], vel_z_vec);
    }

    // Compute angular velocity (SIMD with polynomial approximations)
    {
    __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
    @@ -270,7 +270,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
    }

    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    @@ -342,7 +341,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
    _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
    }

    // Update previous transformation
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    @@ -353,7 +351,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
    }

    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    @@ -412,7 +409,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
    }
    }

    // Step 3: Distance Constraints
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
    @@ -435,7 +431,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&pz_local[i], pz);
    _mm256_store_ps(&pm_local[i], pm);
    }

    for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
    // First block (i=0 to 7)
    {
    @@ -498,7 +493,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y));
    _mm256_store_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z));
    }

    // Second block (i=8 to 14)
    {
    __m256 p0x = _mm256_load_ps(&px_local[8]);
    @@ -566,7 +560,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z));
    }
    }

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&px_local[i]);
    __m256 py = _mm256_load_ps(&py_local[i]);
    @@ -577,7 +570,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    _mm256_store_ps(&cs->p_pos_z[p_base + i], pz);
    }
    }

    // Step 4: Sphere Collisions with Friction
    for (int c = 0; c < MAX_CHAINS; c++) {
    int p_base = c * PARTICLES_PER_CHAIN;
  28. vurtun revised this gist May 20, 2025. 1 changed file with 13 additions and 0 deletions.
    13 changes: 13 additions & 0 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -109,12 +109,15 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
    @@ -188,6 +191,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
    @@ -276,9 +280,11 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);

    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);

    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
    @@ -304,6 +310,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));

    ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
    ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
    ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
    @@ -320,6 +327,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 cent_delta_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), p_inv_mass);
    __m256 cent_delta_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), p_inv_mass);
    __m256 cent_delta_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), p_inv_mass);

    cent_delta_x = _mm256_mul_ps(cent_delta_x, centrifugal_inertia_vec);
    cent_delta_y = _mm256_mul_ps(cent_delta_y, centrifugal_inertia_vec);
    cent_delta_z = _mm256_mul_ps(cent_delta_z, centrifugal_inertia_vec);
    @@ -339,6 +347,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    @@ -353,6 +362,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
    __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
    __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);

    __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
    __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
    __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
    @@ -495,6 +505,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p0y = _mm256_load_ps(&py_local[8]);
    __m256 p0z = _mm256_load_ps(&pz_local[8]);
    __m256 p0m = _mm256_load_ps(&pm_local[8]);

    __m256 p1x = _mm256_loadu_ps(&px_local[9]);
    __m256 p1y = _mm256_loadu_ps(&py_local[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[9]);
    @@ -560,6 +571,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 px = _mm256_load_ps(&px_local[i]);
    __m256 py = _mm256_load_ps(&py_local[i]);
    __m256 pz = _mm256_load_ps(&pz_local[i]);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], px);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], py);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], pz);
    @@ -583,6 +595,7 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
  29. vurtun revised this gist May 20, 2025. No changes.
  30. vurtun revised this gist May 20, 2025. 1 changed file with 40 additions and 31 deletions.
    71 changes: 40 additions & 31 deletions chain.c
    Original file line number Diff line number Diff line change
    @@ -21,6 +21,9 @@ struct chain_cfg {
    float max_strain;
    float friction;
    float collision_radius;
    float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    } __attribute__((aligned(32)));

    struct chain_sim {
    @@ -32,16 +35,13 @@ struct chain_sim {
    float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));

    float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
    float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
    @@ -85,11 +85,13 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 half_vec = _mm256_set1_ps(0.5f);
    const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
    const __m256 acos_c1_vec = _mm256_set1_ps(-0.16666666666666666f);
    const __m256 acos_c2_vec = _mm256_set1_ps(0.07500000000000001f);
    const __m256 acos_c3_vec = _mm256_set1_ps(-0.04464285714285714f);
    const __m256 sin_c1_vec = _mm256_set1_ps(-0.16666666666666666f);
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333333333f);
    const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
    const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
    const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
    const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);

    // Local arrays for PBD solver
    float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
    @@ -107,15 +109,12 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
    float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));

    float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float vel_z[MAX_CHAINS] __attribute__((aligned(32)));

    float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
    @@ -130,12 +129,10 @@ void simulate_chains(struct chain_sim *cs, float dt) {

    // Compute local-space wind
    {
    // Broadcast wind (same for all chains in cfg)
    __m256 vx = _mm256_set1_ps(cs->chain_configs[c].wind_x);
    __m256 vy = _mm256_set1_ps(cs->chain_configs[c].wind_y);
    __m256 vz = _mm256_set1_ps(cs->chain_configs[c].wind_z);

    // Quaternion rotation: q * v * q^-1
    __m256 tx = _mm256_fmadd_ps(qy, vz, _mm256_mul_ps(qw, vx));
    tx = _mm256_fnmadd_ps(qz, vy, tx);
    __m256 ty = _mm256_fmadd_ps(qz, vx, _mm256_mul_ps(qw, vy));
    @@ -191,7 +188,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
    __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
    __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);

    __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
    __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
    __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
    @@ -271,18 +267,26 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    }
    }

    // Step 1: Apply Chain Inertia (Linear and Centrifugal)
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];

    // Use precomputed velocities
    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);

    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);

    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);

    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
    __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
    @@ -291,12 +295,20 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
    p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);

    // Linear inertia: v * dt
    __m256 lin_delta_x = _mm256_mul_ps(vel_x_vec, dt_vec);
    __m256 lin_delta_y = _mm256_mul_ps(vel_y_vec, dt_vec);
    __m256 lin_delta_z = _mm256_mul_ps(vel_z_vec, dt_vec);
    // Linear inertia: v * dt * linear_inertia
    __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
    __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);

    // Angular inertia: (ω × r) * dt * angular_inertia
    __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
    ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
    ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
    ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);

    // Centrifugal inertia: (ω × (ω × r)) * dt^2
    // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
    __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
    __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
    __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
    @@ -308,11 +320,14 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 cent_delta_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), p_inv_mass);
    __m256 cent_delta_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), p_inv_mass);
    __m256 cent_delta_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), p_inv_mass);
    cent_delta_x = _mm256_mul_ps(cent_delta_x, centrifugal_inertia_vec);
    cent_delta_y = _mm256_mul_ps(cent_delta_y, centrifugal_inertia_vec);
    cent_delta_z = _mm256_mul_ps(cent_delta_z, centrifugal_inertia_vec);

    // Total delta
    __m256 delta_x = _mm256_add_ps(lin_delta_x, cent_delta_x);
    __m256 delta_y = _mm256_add_ps(lin_delta_y, cent_delta_y);
    __m256 delta_z = _mm256_add_ps(lin_delta_z, cent_delta_z);
    __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
    __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
    __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);

    // Update positions
    _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
    @@ -324,7 +339,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
    cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
    cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];

    cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
    cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
    cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
    @@ -339,7 +353,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
    __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
    __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);

    __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
    __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
    __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
    @@ -420,7 +433,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p0y = _mm256_load_ps(&py_local[0]);
    __m256 p0z = _mm256_load_ps(&pz_local[0]);
    __m256 p0m = _mm256_load_ps(&pm_local[0]);

    __m256 p1x = _mm256_loadu_ps(&px_local[1]);
    __m256 p1y = _mm256_loadu_ps(&py_local[1]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[1]);
    @@ -483,7 +495,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p0y = _mm256_load_ps(&py_local[8]);
    __m256 p0z = _mm256_load_ps(&pz_local[8]);
    __m256 p0m = _mm256_load_ps(&pm_local[8]);

    __m256 p1x = _mm256_loadu_ps(&px_local[9]);
    __m256 p1y = _mm256_loadu_ps(&py_local[9]);
    __m256 p1z = _mm256_loadu_ps(&pz_local[9]);
    @@ -549,7 +560,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 px = _mm256_load_ps(&px_local[i]);
    __m256 py = _mm256_load_ps(&py_local[i]);
    __m256 pz = _mm256_load_ps(&pz_local[i]);

    _mm256_store_ps(&cs->p_pos_x[p_base + i], px);
    _mm256_store_ps(&cs->p_pos_y[p_base + i], py);
    _mm256_store_ps(&cs->p_pos_z[p_base + i], pz);
    @@ -573,7 +583,6 @@ void simulate_chains(struct chain_sim *cs, float dt) {
    __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
    __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
    __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);

    __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
    __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
    __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);