Last active
October 28, 2025 21:55
-
-
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
Revisions
-
vurtun revised this gist
Oct 28, 2025 . 1 changed file with 14 additions and 68 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -117,64 +117,44 @@ chn_sim__val(struct chn_sim *cns) { assert(cfg->bend_stiffness >= 0.0f); assert(cfg->bend_stiffness <= 1.0f); assert(isfinite(cns->chn_grav_x[i])); assert(isfinite(cns->chn_grav_y[i])); assert(isfinite(cns->chn_grav_z[i])); assert(isfinite(cns->chn_wnd_x[i])); assert(isfinite(cns->chn_wnd_y[i])); assert(isfinite(cns->chn_wnd_z[i])); assert(isfinite(cns->chn_pos_x[i])); assert(isfinite(cns->chn_pos_y[i])); assert(isfinite(cns->chn_pos_z[i])); assert(isfinite(cns->chn_quat_x[i])); assert(isfinite(cns->chn_quat_y[i])); assert(isfinite(cns->chn_quat_z[i])); assert(isfinite(cns->chn_quat_w[i])); // Quaternion normalization invariant float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + cns->chn_quat_y[i]*cns->chn_quat_y[i] + cns->chn_quat_z[i]*cns->chn_quat_z[i] + cns->chn_quat_w[i]*cns->chn_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); assert(isfinite(cns->chn_prev_pos_x[i])); assert(isfinite(cns->chn_prev_pos_y[i])); assert(isfinite(cns->chn_prev_pos_z[i])); assert(isfinite(cns->chn_prev_quat_x[i])); assert(isfinite(cns->chn_prev_quat_y[i])); assert(isfinite(cns->chn_prev_quat_z[i])); assert(isfinite(cns->chn_prev_quat_w[i])); // Prev quat norm qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); } for (int i = 0; i < MAX_PTC; ++i) { @@ -183,74 +163,48 @@ chn_sim__val(struct chn_sim *cns) { assert(cns->motion_radius[i] >= 0.0f); assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS); assert(isfinite(cns->ptc_pos_x[i])); assert(isfinite(cns->ptc_pos_y[i])); assert(isfinite(cns->ptc_pos_z[i])); assert(isfinite(cns->ptc_inv_mass[i])); assert(cns->ptc_inv_mass[i] >= 0.0f); assert(isfinite(cns->ptc_prev_pos_x[i])); assert(isfinite(cns->ptc_prev_pos_y[i])); assert(isfinite(cns->ptc_prev_pos_z[i])); assert(isfinite(cns->rest_pos_x[i])); assert(isfinite(cns->rest_pos_y[i])); assert(isfinite(cns->rest_pos_z[i])); assert(isfinite(cns->motion_radius[i])); // Spot-check motion constraint post-any update (sample every 8th for perf) float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i]; float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i]; float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i]; float dist = sqrtf(dx*dx + dy*dy + dz*dz); assert(dist <= cns->motion_radius[i] + 1e-6f); } for (int i = 0; i < MAX_SPH; ++i) { assert(i < MAX_SPH); assert(isfinite(cns->sph_x[i])); assert(isfinite(cns->sph_y[i])); assert(isfinite(cns->sph_z[i])); assert(isfinite(cns->sph_r[i])); assert(cns->sph_r[i] >= 0.0f); } for (int i = 0; i < MAX_REST_LEN; ++i) { assert(i < MAX_REST_LEN); assert(cns->ptc_rest_len[i] >= 0.0f); assert(isfinite(cns->ptc_rest_len[i])); assert(isfinite(cns->ptc_lambda[i])); assert(cns->ptc_lambda[i] >= -1e6f); assert(cns->ptc_lambda[i] <= 1e6f); assert(cns->ptc_bend_rest[i] >= 0.0f); assert(isfinite(cns->ptc_bend_rest[i])); assert(isfinite(cns->ptc_bend_lambda[i])); assert(cns->ptc_bend_lambda[i] >= -1e6f); assert(cns->ptc_bend_lambda[i] <= 1e6f); @@ -392,7 +346,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); len = fmaxf(len, 1e-6f); int con_idx = i - 1; @@ -429,7 +382,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); len = fmaxf(len, 1e-6f); cns->ptc_bend_rest[r_base + i] = len; @@ -449,7 +401,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); len = fmaxf(len, 1e-6f); cns->ptc_bend_rest[r_base + 8 + i] = len; @@ -660,11 +611,6 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int assert(isfinite(cns->sph_y[s_base + i])); assert(isfinite(cns->sph_z[s_base + i])); assert(isfinite(cns->sph_r[s_base + i])); assert(cns->sph_r[s_base + i] >= 0.0f); } for (int i = cnt; i < SPH_PER_CHN; ++i) { @@ -939,7 +885,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec))); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3); __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3); -
vurtun revised this gist
Oct 25, 2025 . 1 changed file with 143 additions and 164 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -152,10 +152,7 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->chn_quat_w[i])); // Quaternion normalization invariant float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + cns->chn_quat_y[i]*cns->chn_quat_y[i] + cns->chn_quat_z[i]*cns->chn_quat_z[i] + cns->chn_quat_w[i]*cns->chn_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); assert(!isnan(cns->chn_prev_pos_x[i])); @@ -177,10 +174,7 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->chn_prev_quat_w[i])); // Prev quat norm qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); } for (int i = 0; i < MAX_PTC; ++i) { @@ -218,11 +212,14 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->rest_pos_z[i])); assert(isfinite(cns->motion_radius[i])); // Spot-check motion constraint post-any update (sample every 8th for perf) if (i % 8 == 0) { float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i]; float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i]; float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i]; float dist = sqrtf(dx*dx + dy*dy + dz*dz); assert(dist <= cns->motion_radius[i] + 1e-6f); } } for (int i = 0; i < MAX_SPH; ++i) { assert(i < MAX_SPH); @@ -235,6 +232,7 @@ chn_sim__val(struct chn_sim *cns) { assert(!isnan(cns->sph_y[i])); assert(!isnan(cns->sph_z[i])); assert(!isnan(cns->sph_r[i])); assert(cns->sph_r[i] >= 0.0f); } for (int i = 0; i < MAX_REST_LEN; ++i) { @@ -1223,6 +1221,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { assert(p_base + 15 < MAX_PTC); int r_base = c * CON_PER_CHN; assert(r_base + 15 < MAX_REST_LEN);struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0 __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness); @@ -1530,195 +1529,175 @@ chn_sim_run(struct chn_sim *cns, float dt) { assert(s_base + 7 < MAX_SPH); struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 fric_vec = _mm256_set1_ps(cfg->friction); for (int s = 0; s < SPH_PER_CHN; ++s) { __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]); __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]); __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); // Even block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } } } // Step 5: Motion Constraints - Duplicated for even/odd for (int c = 0; c < MAX_CHNS; ++c) { -
vurtun revised this gist
Oct 25, 2025 . 1 changed file with 464 additions and 253 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -151,6 +151,13 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->chn_quat_z[i])); assert(isfinite(cns->chn_quat_w[i])); // Quaternion normalization invariant float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + cns->chn_quat_y[i]*cns->chn_quat_y[i] + cns->chn_quat_z[i]*cns->chn_quat_z[i] + cns->chn_quat_w[i]*cns->chn_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); assert(!isnan(cns->chn_prev_pos_x[i])); assert(!isnan(cns->chn_prev_pos_y[i])); assert(!isnan(cns->chn_prev_pos_z[i])); @@ -168,10 +175,19 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->chn_prev_quat_y[i])); assert(isfinite(cns->chn_prev_quat_z[i])); assert(isfinite(cns->chn_prev_quat_w[i])); // Prev quat norm qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); } for (int i = 0; i < MAX_PTC; ++i) { assert(i < MAX_PTC); assert(cns->ptc_rest_len[i] >= 0.0f); assert(cns->motion_radius[i] >= 0.0f); assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS); assert(!isnan(cns->ptc_pos_x[i])); assert(!isnan(cns->ptc_pos_y[i])); @@ -182,6 +198,7 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->ptc_pos_y[i])); assert(isfinite(cns->ptc_pos_z[i])); assert(isfinite(cns->ptc_inv_mass[i])); assert(cns->ptc_inv_mass[i] >= 0.0f); assert(!isnan(cns->ptc_prev_pos_x[i])); assert(!isnan(cns->ptc_prev_pos_y[i])); @@ -200,18 +217,15 @@ chn_sim__val(struct chn_sim *cns) { assert(isfinite(cns->rest_pos_y[i])); assert(isfinite(cns->rest_pos_z[i])); assert(isfinite(cns->motion_radius[i])); float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i]; float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i]; float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i]; float dist = dx*dx + dy*dy + dz*dz; assert(dist <= (cns->motion_radius[i] + 1e-6f * cns->motion_radius[i] + 1e-6f)); } for (int i = 0; i < MAX_SPH; ++i) { assert(i < MAX_SPH); assert(isfinite(cns->sph_x[i])); assert(isfinite(cns->sph_y[i])); assert(isfinite(cns->sph_z[i])); @@ -223,6 +237,26 @@ chn_sim__val(struct chn_sim *cns) { assert(!isnan(cns->sph_r[i])); assert(cns->sph_r[i] >= 0.0f); } for (int i = 0; i < MAX_REST_LEN; ++i) { assert(i < MAX_REST_LEN); assert(cns->ptc_rest_len[i] >= 0.0f); assert(!isnan(cns->ptc_rest_len[i])); assert(isfinite(cns->ptc_rest_len[i])); assert(!isnan(cns->ptc_lambda[i])); assert(isfinite(cns->ptc_lambda[i])); assert(cns->ptc_lambda[i] >= -1e6f); assert(cns->ptc_lambda[i] <= 1e6f); assert(cns->ptc_bend_rest[i] >= 0.0f); assert(!isnan(cns->ptc_bend_rest[i])); assert(isfinite(cns->ptc_bend_rest[i])); assert(!isnan(cns->ptc_bend_lambda[i])); assert(isfinite(cns->ptc_bend_lambda[i])); assert(cns->ptc_bend_lambda[i] >= -1e6f); assert(cns->ptc_bend_lambda[i] <= 1e6f); } return 1; } extern void @@ -240,6 +274,7 @@ chn_sim_init(struct chn_sim *cns) { for (int i = 0; i < MAX_PTC; i += 8) { _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS)); } assert(chn_sim__val(cns)); } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { @@ -269,15 +304,20 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->chn_cfg[chn] = *cfg; int p_base = (chn * PTC_PER_CHN); assert(p_base + 15 < MAX_PTC); int r_base = (chn * CON_PER_CHN); assert(r_base + 15 < MAX_REST_LEN); int even_base = p_base + 0; // Even indices 0-7 int odd_base = p_base + 8; // Odd indices 8-15 // Transpose particles: even indices to 0-7, odd to 8-15 int even_cnt = (cnt + 1) / 2; // Number of even particles int odd_cnt = cnt / 2; // Number of odd particles assert(even_cnt + odd_cnt == cnt); for (int i = 0; i < even_cnt; ++i) { int orig_i = 2 * i; assert(orig_i * 4 + 3 < cnt * 4); cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0]; cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1]; cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2]; @@ -309,6 +349,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric } for (int i = 0; i < odd_cnt; ++i) { int orig_i = 2 * i + 1; assert(orig_i * 4 + 3 < cnt * 4); cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; @@ -347,7 +388,14 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; __m256 dist_sq_vec = _mm256_set1_ps(dx*dx + dy*dy + dz*dz); __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); assert(!isnan(len)); len = fmaxf(len, 1e-6f); int con_idx = i - 1; if ((con_idx & 0x01) == 0) { // Even constraint @@ -371,11 +419,20 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric // Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15 (new) int even_bend_cnt = even_cnt - 1; int odd_bend_cnt = odd_cnt - 1; assert(even_bend_cnt == even_cnt - 1); assert(odd_bend_cnt == odd_cnt - 1); for (int i = 0; i < even_bend_cnt; ++i) { float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1]; float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1]; float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1]; __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz); __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); assert(!isnan(len)); len = fmaxf(len, 1e-6f); cns->ptc_bend_rest[r_base + i] = len; cns->ptc_bend_lambda[r_base + i] = 0.0f; @@ -388,7 +445,14 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1]; float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1]; float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1]; __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz); __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); float len = _mm256_cvtss_f32(sqrt_vec); assert(isfinite(len)); assert(!isnan(len)); len = fmaxf(len, 1e-6f); cns->ptc_bend_rest[r_base + 8 + i] = len; cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; @@ -407,8 +471,11 @@ chn_sim_del(struct chn_sim *cns, int chn) { assert(chn < MAX_CHNS); int p_base = (chn * PTC_PER_CHN); assert(p_base + 15 < MAX_PTC); int s_base = (chn * SPH_PER_CHN); assert(s_base + 7 < MAX_SPH); int r_base = (chn * CON_PER_CHN); assert(r_base + 15 < MAX_REST_LEN); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); @@ -500,6 +567,14 @@ chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, cns->chn_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = rot4[3]; float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); assert(isfinite(cns->chn_quat_x[chn])); assert(isfinite(cns->chn_quat_y[chn])); assert(isfinite(cns->chn_quat_z[chn])); assert(isfinite(cns->chn_quat_w[chn])); assert(chn_sim__val(cns)); } extern void @@ -521,6 +596,18 @@ chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3]; assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); assert(isfinite(cns->chn_quat_x[chn])); assert(isfinite(cns->chn_quat_y[chn])); assert(isfinite(cns->chn_quat_z[chn])); assert(isfinite(cns->chn_quat_w[chn])); assert(isfinite(cns->chn_prev_quat_x[chn])); assert(isfinite(cns->chn_prev_quat_y[chn])); assert(isfinite(cns->chn_prev_quat_z[chn])); assert(isfinite(cns->chn_prev_quat_w[chn])); assert(chn_sim__val(cns)); } extern void @@ -534,6 +621,9 @@ chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { cns->chn_grav_y[chn] = grav3[1]; cns->chn_grav_z[chn] = grav3[2]; assert(isfinite(cns->chn_grav_x[chn])); assert(isfinite(cns->chn_grav_y[chn])); assert(isfinite(cns->chn_grav_z[chn])); assert(chn_sim__val(cns)); } extern void @@ -547,20 +637,37 @@ chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { cns->chn_wnd_y[chn] = wind3[1]; cns->chn_wnd_z[chn] = wind3[2]; assert(isfinite(cns->chn_wnd_x[chn])); assert(isfinite(cns->chn_wnd_y[chn])); assert(isfinite(cns->chn_wnd_z[chn])); assert(chn_sim__val(cns)); } extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int cnt) { assert(chn_sim__val(cns)); assert(spheres); assert(cnt <= SPH_PER_CHN); assert(cnt >= 0); int s_base = (chn * SPH_PER_CHN); assert(s_base + 7 < MAX_SPH); for (int i = 0; i < cnt; i++) { assert(i * 4 + 3 < cnt * 4); cns->sph_x[s_base + i] = spheres[i*4+0]; cns->sph_y[s_base + i] = spheres[i*4+1]; cns->sph_z[s_base + i] = spheres[i*4+2]; cns->sph_r[s_base + i] = spheres[i*4+3]; assert(isfinite(cns->sph_x[s_base + i])); assert(isfinite(cns->sph_y[s_base + i])); assert(isfinite(cns->sph_z[s_base + i])); assert(isfinite(cns->sph_r[s_base + i])); assert(!isnan(cns->sph_x[s_base + i])); assert(!isnan(cns->sph_y[s_base + i])); assert(!isnan(cns->sph_z[s_base + i])); assert(!isnan(cns->sph_r[s_base + i])); assert(cns->sph_r[s_base + i] >= 0.0f); } for (int i = cnt; i < SPH_PER_CHN; ++i) { cns->sph_x[s_base + i] = 0; @@ -571,27 +678,35 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(chn_sim__val(cns)); } extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *restrict radius, int cnt) { assert(chn_sim__val(cns)); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); assert(cnt < PTC_PER_CHN); assert(radius); int p_base = (chn * PTC_PER_CHN); assert(p_base + 15 < MAX_PTC); int even_base = p_base + 0; int odd_base = p_base + 8; int even_cnt = (cnt + 1) / 2; int odd_cnt = cnt / 2; for (int i = 0; i < even_cnt; ++i) { assert(2 * i < cnt); cns->motion_radius[even_base + i] = radius[2 * i]; assert(cns->motion_radius[even_base + i] >= 0.0f); assert(cns->motion_radius[even_base + i] <= MAX_MOTION_RADIUS); } for (int i = even_cnt; i < 8; ++i) { cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS; } for (int i = 0; i < odd_cnt; ++i) { assert(2 * i + 1 < cnt); cns->motion_radius[odd_base + i] = radius[2 * i + 1]; assert(cns->motion_radius[odd_base + i] >= 0.0f); assert(cns->motion_radius[odd_base + i] <= MAX_MOTION_RADIUS); } for (int i = odd_cnt; i < 8; ++i) { cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS; @@ -601,20 +716,25 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { extern void chn_sim_run(struct chn_sim *cns, float dt) { assert(chn_sim__val(cns)); assert(dt > 0.0f); assert(dt < 1.0f); // SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); const __m256 one_vec = _mm256_set1_ps(1.0f); const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); const __m256 eps_vec = _mm256_set1_ps(1e-6f); const __m256 lambda_min_vec = _mm256_set1_ps(-1e6f); const __m256 lambda_max_vec = _mm256_set1_ps(1e6f); const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f); const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f); const __m256 zero_vec = _mm256_setzero_ps(); const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 neg_rest_vec = _mm256_set1_ps(-1.0f); // For FMAs // Stack arrays for precomputed data float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); @@ -635,6 +755,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { // Step 0: Precomputation for (int c = 0; c < MAX_CHNS; c += 8) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 0, 3); __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 0, 3); // Load chain quaternions __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); @@ -677,6 +801,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&chn_wnd_lcl_x[c], res_x); _mm256_store_ps(&chn_wnd_lcl_y[c], res_y); _mm256_store_ps(&chn_wnd_lcl_z[c], res_z); assert(isfinite(_mm256_cvtss_f32(res_x))); assert(isfinite(_mm256_cvtss_f32(res_y))); assert(isfinite(_mm256_cvtss_f32(res_z))); } // Compute local-space gravity (identical to wind) { @@ -707,6 +835,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&chn_grav_lcl_x[c], res_x); _mm256_store_ps(&chn_grav_lcl_y[c], res_y); _mm256_store_ps(&chn_grav_lcl_z[c], res_z); assert(isfinite(_mm256_cvtss_f32(res_x))); assert(isfinite(_mm256_cvtss_f32(res_y))); assert(isfinite(_mm256_cvtss_f32(res_z))); } // Compute linear velocity { @@ -725,6 +857,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&vel_x[c], vel_x_vec); _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); assert(isfinite(_mm256_cvtss_f32(vel_x_vec))); assert(isfinite(_mm256_cvtss_f32(vel_y_vec))); assert(isfinite(_mm256_cvtss_f32(vel_z_vec))); } // Compute angular velocity { @@ -773,28 +909,46 @@ chn_sim_run(struct chn_sim *cns, float dt) { // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); __m256 negate_sign = _mm256_blendv_ps(one_vec, neg_one_vec, negate_mask); orient_qx = _mm256_mul_ps(orient_qx, negate_sign); orient_qy = _mm256_mul_ps(orient_qy, negate_sign); orient_qz = _mm256_mul_ps(orient_qz, negate_sign); // Norm check and scale if >1+eps __m256 norm_sq = _mm256_fmadd_ps(orient_qz, orient_qz, _mm256_fmadd_ps(orient_qy, orient_qy, _mm256_fmadd_ps(orient_qx, orient_qx, _mm256_mul_ps(orient_qw, orient_qw)))); __m256 scale_mask = _mm256_cmp_ps(norm_sq, _mm256_set1_ps((1.0f + 1e-6f) * (1.0f + 1e-6f)), _CMP_GT_OQ); __m256 inv_norm = _mm256_rsqrt_ps(norm_sq); __m256 scale = _mm256_blendv_ps(one_vec, inv_norm, scale_mask); orient_qx = _mm256_mul_ps(orient_qx, scale); orient_qy = _mm256_mul_ps(orient_qy, scale); orient_qz = _mm256_mul_ps(orient_qz, scale); orient_qw = _mm256_mul_ps(orient_qw, scale); // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) __m256 scale_ang = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec)); __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale_ang); __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale_ang); __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale_ang); // Store results _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); assert(isfinite(_mm256_cvtss_f32(ang_vel_x_vec))); assert(isfinite(_mm256_cvtss_f32(ang_vel_y_vec))); assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec))); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd (reverted for correctness) for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3); __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3); __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 1, 3); int p_base = c * PTC_PER_CHN; assert(p_base + 15 < MAX_PTC); struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt); @@ -818,14 +972,18 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor); __m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor); // Centrifugal zero-check: ang_vel mag (shared) __m256 ang_mag_sq = _mm256_fmadd_ps(ang_vel_z_vec, ang_vel_z_vec, _mm256_fmadd_ps(ang_vel_y_vec, ang_vel_y_vec, _mm256_mul_ps(ang_vel_x_vec, ang_vel_x_vec))); __m256 ang_zero_mask = _mm256_cmp_ps(ang_mag_sq, eps_vec, _CMP_LT_OQ); // Even block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); @@ -851,18 +1009,23 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec); // Apply centrifugal zero-check cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask); cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask); cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask); // Blend masks for all dt lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask); lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask); lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask); ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask); ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask); ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask); cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask); cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask); cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); @@ -872,13 +1035,14 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z)); } // Odd block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); @@ -904,21 +1068,26 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec); // Apply centrifugal zero-check cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask); cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask); cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask); lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor); lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor); lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor); lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask); lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask); lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask); ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask); ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask); ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask); cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask); cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask); cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); @@ -938,9 +1107,11 @@ chn_sim_run(struct chn_sim *cns, float dt) { cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration - Duplicated for even/odd for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); int p_base = c * PTC_PER_CHN; assert(p_base + 15 < MAX_PTC); struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]); @@ -995,6 +1166,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } // Odd block { @@ -1035,19 +1210,23 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } } // Step 3: XPBD Constraints (Distance only) - Permutes intra-vector only, no cross-block mixing for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); int p_base = c * PTC_PER_CHN; assert(p_base + 15 < MAX_PTC); int r_base = c * CON_PER_CHN; assert(r_base + 15 < MAX_REST_LEN);struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0 __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness); __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(bend_stiff_vec, eps_vec)); // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15) __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); @@ -1084,21 +1263,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 w_sum = _mm256_add_ps(even_m, odd_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_even)); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda __m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda) __m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda); new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit __m256 unit_y = _mm256_mul_ps(dy, inv_dist); @@ -1114,14 +1293,14 @@ chn_sim_run(struct chn_sim *cns, float dt) { corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * even_m even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, even_m), one_vec, even_x); even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, even_m), one_vec, even_y); even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, even_m), one_vec, even_z); // dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m odd_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x); odd_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y); odd_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z); lambda_even = new_lambda; } @@ -1139,21 +1318,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 w_sum = _mm256_add_ps(odd_m, p2_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_odd)); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda); new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); @@ -1169,27 +1348,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * odd_m odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x); odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y); odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z); // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m (permuted add via shift) __m256 delta_p2_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, p2_m), one_vec, zero_vec); __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_p2_x, _mm256_castsi256_ps(shift_idx)); even_x = _mm256_add_ps(even_x, delta_even_x); __m256 delta_p2_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, p2_m), one_vec, zero_vec); __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_p2_y, _mm256_castsi256_ps(shift_idx)); even_y = _mm256_add_ps(even_y, delta_even_y); __m256 delta_p2_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, p2_m), one_vec, zero_vec); __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_p2_z, _mm256_castsi256_ps(shift_idx)); even_z = _mm256_add_ps(even_z, delta_even_z); lambda_odd = new_lambda; @@ -1213,21 +1386,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 w_sum = _mm256_add_ps(a_m, b_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_even)); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda); new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); @@ -1243,27 +1416,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * a_m even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, even_x); even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, even_y); even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, even_z); // dpos_B = - corr * b_m (permuted add) __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec); __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_b_x, _mm256_castsi256_ps(shift_idx)); even_x = _mm256_add_ps(even_x, delta_even_x); __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec); __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_b_y, _mm256_castsi256_ps(shift_idx)); even_y = _mm256_add_ps(even_y, delta_even_y); __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec); __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_b_z, _mm256_castsi256_ps(shift_idx)); even_z = _mm256_add_ps(even_z, delta_even_z); bend_lambda_even = new_lambda; @@ -1287,21 +1454,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 w_sum = _mm256_add_ps(a_m, b_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_odd)); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda); new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); @@ -1317,27 +1484,21 @@ chn_sim_run(struct chn_sim *cns, float dt) { corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * a_m odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, odd_x); odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, odd_y); odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, odd_z); // dpos_B = - corr * b_m (permuted add) __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec); __m256 delta_odd_x = _mm256_permutevar8x32_ps(delta_b_x, _mm256_castsi256_ps(shift_idx)); odd_x = _mm256_add_ps(odd_x, delta_odd_x); __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec); __m256 delta_odd_y = _mm256_permutevar8x32_ps(delta_b_y, _mm256_castsi256_ps(shift_idx)); odd_y = _mm256_add_ps(odd_y, delta_odd_y); __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec); __m256 delta_odd_z = _mm256_permutevar8x32_ps(delta_b_z, _mm256_castsi256_ps(shift_idx)); odd_z = _mm256_add_ps(odd_z, delta_odd_z); bend_lambda_odd = new_lambda; @@ -1356,182 +1517,222 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd); assert(isfinite(_mm256_cvtss_f32(even_x))); assert(isfinite(_mm256_cvtss_f32(odd_x))); } // Step 4: Sphere Collisions with Friction - Duplicated for even/odd for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); int p_base = c * PTC_PER_CHN; assert(p_base + 15 < MAX_PTC); int s_base = c * SPH_PER_CHN; assert(s_base + 7 < MAX_SPH); struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 fric_vec = _mm256_set1_ps(cfg->friction);for (int s = 0; s < SPH_PER_CHN; ++s) { __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]); __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]); __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); // Even block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } _m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z); assert(isfinite(_mm256_cvtss_f32(new_p_x))); assert(isfinite(_mm256_cvtss_f32(new_p_y))); assert(isfinite(_mm256_cvtss_f32(new_p_z))); } // Step 5: Motion Constraints - Duplicated for even/odd for (int c = 0; c < MAX_CHNS; ++c) { __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); int p_base = c * PTC_PER_CHN; assert(p_base + 15 < MAX_PTC); // Even block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]); __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]); __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]); __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]); @@ -1546,6 +1747,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(dist, r_vec); __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); @@ -1564,9 +1766,13 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z)); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x)))); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y)))); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z)))); } // Odd block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); @@ -1588,6 +1794,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); assert(isfinite(_mm256_cvtss_f32(dist))); __m256 pen = _mm256_sub_ps(dist, r_vec); __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); @@ -1606,6 +1813,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x)))); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y)))); assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z)))); } } assert(chn_sim__val(cns)); -
vurtun revised this gist
Oct 23, 2025 . 1 changed file with 194 additions and 19 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,6 +4,8 @@ #include <string.h> #include <float.h> #define MAX_MOTION_RADIUS 1000000.0f enum { MAX_CHNS = 128, PTC_PER_CHN = 16, @@ -14,8 +16,6 @@ enum { MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 16, }; struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] @@ -86,6 +86,145 @@ struct chn_sim { float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; static int chn_sim__val(struct chn_sim *cns) { assert(cns); assert(cns->free_idx_cnt <= MAX_CHNS); for (int i = 0; i < cns->free_idx_cnt; ++i) { assert(i < MAX_CHNS); assert(cns->free_idx[i] < MAX_CHNS); } for (int i = 0; i < MAX_CHNS; ++i) { const struct chn_cfg *cfg = &cns->chn_cfg[i]; assert(cfg->damping >= 0.0f); assert(cfg->damping <= 1.0f); assert(cfg->stiffness >= 0.0f); assert(cfg->stiffness <= 1.0f); assert(cfg->friction >= 0.0f); assert(cfg->friction <= 1.0f); assert(cfg->linear_inertia >= 0.0f); assert(cfg->linear_inertia <= 1.0f); assert(cfg->angular_inertia >= 0.0f); assert(cfg->angular_inertia <= 1.0f); assert(cfg->centrifugal_inertia >= 0.0f); assert(cfg->centrifugal_inertia <= 1.0f); assert(cfg->bend_stiffness >= 0.0f); assert(cfg->bend_stiffness <= 1.0f); assert(!isnan(cns->chn_grav_x[i])); assert(!isnan(cns->chn_grav_y[i])); assert(!isnan(cns->chn_grav_z[i])); assert(isfinite(cns->chn_grav_x[i])); assert(isfinite(cns->chn_grav_y[i])); assert(isfinite(cns->chn_grav_z[i])); assert(!isnan(cns->chn_wnd_x[i])); assert(!isnan(cns->chn_wnd_y[i])); assert(!isnan(cns->chn_wnd_z[i])); assert(isfinite(cns->chn_wnd_x[i])); assert(isfinite(cns->chn_wnd_y[i])); assert(isfinite(cns->chn_wnd_z[i])); assert(!isnan(cns->chn_pos_x[i])); assert(!isnan(cns->chn_pos_y[i])); assert(!isnan(cns->chn_pos_z[i])); assert(isfinite(cns->chn_pos_x[i])); assert(isfinite(cns->chn_pos_y[i])); assert(isfinite(cns->chn_pos_z[i])); assert(!isnan(cns->chn_quat_x[i])); assert(!isnan(cns->chn_quat_y[i])); assert(!isnan(cns->chn_quat_z[i])); assert(!isnan(cns->chn_quat_w[i])); assert(isfinite(cns->chn_quat_x[i])); assert(isfinite(cns->chn_quat_y[i])); assert(isfinite(cns->chn_quat_z[i])); assert(isfinite(cns->chn_quat_w[i])); assert(!isnan(cns->chn_prev_pos_x[i])); assert(!isnan(cns->chn_prev_pos_y[i])); assert(!isnan(cns->chn_prev_pos_z[i])); assert(isfinite(cns->chn_prev_pos_x[i])); assert(isfinite(cns->chn_prev_pos_y[i])); assert(isfinite(cns->chn_prev_pos_z[i])); assert(!isnan(cns->chn_prev_quat_x[i])); assert(!isnan(cns->chn_prev_quat_y[i])); assert(!isnan(cns->chn_prev_quat_z[i])); assert(!isnan(cns->chn_prev_quat_w[i])); assert(isfinite(cns->chn_prev_quat_x[i])); assert(isfinite(cns->chn_prev_quat_y[i])); assert(isfinite(cns->chn_prev_quat_z[i])); assert(isfinite(cns->chn_prev_quat_w[i])); } for (int i = 0; i < MAX_PTC; ++i) { assert(cns->ptc_rest_len[i] >= 0.0f); assert(cns->motion_radius[i] >= 0.0f); assert(!isnan(cns->ptc_pos_x[i])); assert(!isnan(cns->ptc_pos_y[i])); assert(!isnan(cns->ptc_pos_z[i])); assert(!isnan(cns->ptc_inv_mass[i])); assert(isfinite(cns->ptc_pos_x[i])); assert(isfinite(cns->ptc_pos_y[i])); assert(isfinite(cns->ptc_pos_z[i])); assert(isfinite(cns->ptc_inv_mass[i])); assert(!isnan(cns->ptc_prev_pos_x[i])); assert(!isnan(cns->ptc_prev_pos_y[i])); assert(!isnan(cns->ptc_prev_pos_z[i])); assert(isfinite(cns->ptc_prev_pos_x[i])); assert(isfinite(cns->ptc_prev_pos_y[i])); assert(isfinite(cns->ptc_prev_pos_z[i])); assert(!isnan(cns->rest_pos_x[i])); assert(!isnan(cns->rest_pos_y[i])); assert(!isnan(cns->rest_pos_z[i])); assert(!isnan(cns->motion_radius[i])); assert(isfinite(cns->rest_pos_x[i])); assert(isfinite(cns->rest_pos_y[i])); assert(isfinite(cns->rest_pos_z[i])); assert(isfinite(cns->motion_radius[i])); } for (int i = 0; i < MAX_SPH; ++i) { assert(isfinite(cns->sph_x[i])); assert(isfinite(cns->sph_y[i])); assert(isfinite(cns->sph_z[i])); assert(isfinite(cns->sph_r[i])); assert(!isnan(cns->sph_x[i])); assert(!isnan(cns->sph_y[i])); assert(!isnan(cns->sph_z[i])); assert(!isnan(cns->sph_r[i])); assert(isfinite(cns->sph_x[i])); assert(isfinite(cns->sph_y[i])); assert(isfinite(cns->sph_z[i])); assert(isfinite(cns->sph_r[i])); assert(!isnan(cns->sph_x[i])); assert(!isnan(cns->sph_y[i])); assert(!isnan(cns->sph_z[i])); assert(!isnan(cns->sph_r[i])); assert(cns->sph_r[i] >= 0.0f); } return 1; } extern void chn_sim_init(struct chn_sim *cns) { assert(cns); @@ -98,7 +237,6 @@ chn_sim_init(struct chn_sim *cns) { _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); } for (int i = 0; i < MAX_PTC; i += 8) { _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS)); } @@ -110,13 +248,28 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric assert(cns->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); assert(cnt > 1); assert(chn_sim__val(cns)); assert(cfg->damping >= 0.0f); assert(cfg->damping <= 1.0f); assert(cfg->stiffness >= 0.0f); assert(cfg->stiffness <= 1.0f); assert(cfg->friction >= 0.0f); assert(cfg->friction <= 1.0f); assert(cfg->linear_inertia >= 0.0f); assert(cfg->linear_inertia <= 1.0f); assert(cfg->angular_inertia >= 0.0f); assert(cfg->angular_inertia <= 1.0f); assert(cfg->centrifugal_inertia >= 0.0f); assert(cfg->centrifugal_inertia <= 1.0f); assert(cfg->bend_stiffness >= 0.0f); assert(cfg->bend_stiffness <= 1.0f); int chn = cns->free_idx[--cns->free_idx_cnt]; cns->chn_cfg[chn] = *cfg; int p_base = (chn * PTC_PER_CHN); int r_base = (chn * CON_PER_CHN); int even_base = p_base + 0; // Even indices 0-7 int odd_base = p_base + 8; // Odd indices 8-15 @@ -144,6 +297,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->ptc_pos_y[even_base + i] = 0.0f; cns->ptc_pos_z[even_base + i] = 0.0f; cns->ptc_inv_mass[even_base + i] = 0.0f; cns->rest_pos_x[even_base + i] = 0.0f; cns->rest_pos_y[even_base + i] = 0.0f; cns->rest_pos_z[even_base + i] = 0.0f; @@ -174,6 +328,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->ptc_pos_y[odd_base + i] = 0.0f; cns->ptc_pos_z[odd_base + i] = 0.0f; cns->ptc_inv_mass[odd_base + i] = 0.0f; cns->rest_pos_x[odd_base + i] = 0.0f; cns->rest_pos_y[odd_base + i] = 0.0f; cns->rest_pos_z[odd_base + i] = 0.0f; @@ -193,6 +348,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; float len = sqrtf(dx*dx + dy*dy + dz*dz); int con_idx = i - 1; if ((con_idx & 0x01) == 0) { // Even constraint cns->ptc_rest_len[r_base + (con_idx / 2)] = len; @@ -220,6 +376,7 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1]; float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1]; float len = sqrtf(dx * dx + dy * dy + dz * dz); cns->ptc_bend_rest[r_base + i] = len; cns->ptc_bend_lambda[r_base + i] = 0.0f; } @@ -232,25 +389,26 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1]; float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1]; float len = sqrtf(dx * dx + dy * dy + dz * dz); cns->ptc_bend_rest[r_base + 8 + i] = len; cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; } for (int i = odd_bend_cnt; i < 8; ++i) { cns->ptc_bend_rest[r_base + 8 + i] = 0.0f; cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; } assert(chn_sim__val(cns)); return chn; } extern void chn_sim_del(struct chn_sim *cns, int chn) { assert(chn_sim__val(cns)); assert(chn >= 0); assert(chn < MAX_CHNS); int p_base = (chn * PTC_PER_CHN); int s_base = (chn * SPH_PER_CHN); int r_base = (chn * CON_PER_CHN); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); @@ -281,7 +439,6 @@ chn_sim_del(struct chn_sim *cns, int chn) { cns->chn_prev_quat_z[chn] = 0.0f; cns->chn_prev_quat_w[chn] = 1.0f; _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); @@ -314,7 +471,6 @@ chn_sim_del(struct chn_sim *cns, int chn) { _mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f)); @@ -323,10 +479,14 @@ chn_sim_del(struct chn_sim *cns, int chn) { _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f)); assert(chn_sim__val(cns)); } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(chn_sim__val(cns)); assert(pos3); assert(rot4); assert(chn >= 0); @@ -340,10 +500,13 @@ chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const floa cns->chn_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = rot4[3]; assert(chn_sim__val(cns)); } extern void chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(chn_sim__val(cns)); assert(pos3); assert(rot4); assert(chn >= 0); @@ -357,32 +520,38 @@ chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const floa cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; assert(chn_sim__val(cns)); } extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { assert(chn_sim__val(cns)); assert(grav3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_grav_x[chn] = grav3[0]; cns->chn_grav_y[chn] = grav3[1]; cns->chn_grav_z[chn] = grav3[2]; assert(chn_sim__val(cns)); } extern void chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { assert(chn_sim__val(cns)); assert(wind3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_wnd_x[chn] = wind3[0]; cns->chn_wnd_y[chn] = wind3[1]; cns->chn_wnd_z[chn] = wind3[2]; assert(chn_sim__val(cns)); } extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(chn_sim__val(cns)); assert(spheres); assert(cnt <= SPH_PER_CHN); @@ -399,10 +568,11 @@ chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { cns->sph_z[s_base + i] = 0; cns->sph_r[s_base + i] = 0; } assert(chn_sim__val(cns)); } extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { assert(chn_sim__val(cns)); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); @@ -413,6 +583,7 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { int odd_base = p_base + 8; int even_cnt = (cnt + 1) / 2; int odd_cnt = cnt / 2; for (int i = 0; i < even_cnt; ++i) { cns->motion_radius[even_base + i] = radius[2 * i]; } @@ -425,9 +596,12 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { for (int i = odd_cnt; i < 8; ++i) { cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS; } assert(chn_sim__val(cns)); } extern void chn_sim_run(struct chn_sim *cns, float dt) { assert(chn_sim__val(cns)); // SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); @@ -600,7 +774,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), _mm256_andnot_ps(negate_mask, one_vec)); orient_qx = _mm256_mul_ps(orient_qx, negate_sign); orient_qy = _mm256_mul_ps(orient_qy, negate_sign); @@ -981,7 +1155,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); @@ -1434,4 +1608,5 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); } } assert(chn_sim__val(cns)); } -
vurtun revised this gist
Oct 11, 2025 . 2 changed files with 4 additions and 22 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1434,6 +1434,4 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); } } } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,6 +3,8 @@ #include <stdlib.h> #include <math.h> #include "chain.c" #define WINDOW_WIDTH 800 #define WINDOW_HEIGHT 600 #define CHAIN_LENGTH 10 @@ -84,13 +86,7 @@ main(int argc, char *argv[]) { SDL_Event event; // Debug: Check gravity and masses int p_base = chn * PTC_PER_CHN; while (running) { while (SDL_PollEvent(&event)) { if (event.type == SDL_QUIT) { @@ -173,23 +169,11 @@ main(int argc, char *argv[]) { SDL_RenderDrawLine(renderer, px0, py0, px1, py1); } } SDL_RenderPresent(renderer); SDL_Delay(16); } SDL_DestroyRenderer(renderer); SDL_DestroyWindow(window); SDL_Quit(); return 0; } -
vurtun revised this gist
Oct 11, 2025 . 2 changed files with 195 additions and 194 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1436,198 +1436,4 @@ chn_sim_run(struct chn_sim *cns, float dt) { } } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,195 @@ #include <SDL2/SDL.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #define WINDOW_WIDTH 800 #define WINDOW_HEIGHT 600 #define CHAIN_LENGTH 10 #define PARTICLE_SPACING 10.0f static void project(float x, float y, float z, int *screen_x, int *screen_y) { *screen_x = (int)(x + WINDOW_WIDTH / 2.0f); *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f); } // Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity) static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos float cx, float cy, float cz, // chain pos float qx, float qy, float qz, float qw, // chain quat float *mx, float *my, float *mz) { // out model pos // Translate: world - chain float tx = wx - cx, ty = wy - cy, tz = wz - cz; // Quat conj: (qw, -qx, -qy, -qz) float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz; // t = conj * (0, tx,ty,tz) float tw = cqx*tx + cqy*ty + cqz*tz; float t_x = cqw*tx + cqy*tz - cqz*ty; float t_y = cqw*ty + cqz*tx - cqx*tz; float t_z = cqw*tz + cqx*ty - cqy*tx; // model = t * quat (vec part) *mx = qw*t_x + qy*t_z - qz*t_y + tw*qx; *my = qw*t_y + qz*t_x - qx*t_z + tw*qy; *mz = qw*t_z + qx*t_y - qy*t_x + tw*qz; } extern int main(int argc, char *argv[]) { if (SDL_Init(SDL_INIT_VIDEO) < 0) { printf("SDL_Init failed: %s\n", SDL_GetError()); return 1; } SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0); SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); if (!window || !renderer) { printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError()); SDL_Quit(); return 1; } struct chn_sim sim; chn_sim_init(&sim); struct chn_cfg cfg = { .damping = 0.5f, .stiffness = 0.9f, .friction = 0.5f, .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag .angular_inertia = 0.9f, .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow .bend_stiffness = 0.8f, }; float rest_pos[CHAIN_LENGTH * 4]; for (int i = 0; i < CHAIN_LENGTH; i++) { rest_pos[i*4 + 0] = 0.0f; rest_pos[i*4 + 1] = -i * PARTICLE_SPACING; rest_pos[i*4 + 2] = 0.0f; rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f; } int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH); chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge // Fixed world spheres (triangle outside circle) float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r) 60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left }; // Circular motion params float time = 0.0f; float circle_radius = 50.0f; float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s float dt = 0.016f; int running = 1; SDL_Event event; // Debug: Check gravity and masses printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]); int p_base = chn * PTC_PER_CHN; for (int i = 0; i < CHAIN_LENGTH; i++) { int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2); printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]); } while (running) { while (SDL_PollEvent(&event)) { if (event.type == SDL_QUIT) { running = 0; } } time += dt; // Circular anchor motion float anchor_x = circle_radius * cosf(angular_speed * time); float anchor_y = circle_radius * sinf(angular_speed * time); chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Transform fixed world spheres to current model space float model_spheres[12]; for (int s = 0; s < 3; ++s) { float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2]; float r = world_spheres[s*4 + 3]; float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn]; float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn]; float mx, my, mz; quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz); model_spheres[s*4 + 0] = mx; model_spheres[s*4 + 1] = my; model_spheres[s*4 + 2] = mz; model_spheres[s*4 + 3] = r; } chn_sim_set_sph(&sim, chn, model_spheres, 3); chn_sim_run(&sim, dt); SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255); SDL_RenderClear(renderer); SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres float root_x = sim.chn_pos_x[chn]; // Current root world x/y float root_y = sim.chn_pos_y[chn]; // Draw chain segments for (int i = 0; i < CHAIN_LENGTH - 1; i++) { int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2); int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2); float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0]; float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1]; float world_x0 = model_x0 + root_x; float world_y0 = model_y0 + root_y; float world_x1 = model_x1 + root_x; float world_y1 = model_y1 + root_y; int sx0, sy0, sx1, sy1; project(world_x0, world_y0, 0, &sx0, &sy0); project(world_x1, world_y1, 0, &sx1, &sy1); SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1); } // Draw spheres (approx circle outline with 32 points) int p_base = chn * PTC_PER_CHN; int s_base = chn * SPH_PER_CHN; for (int s = 0; s < 3; ++s) { // Only draw active spheres float sph_model_x = sim.sph_x[s_base + s]; float sph_model_y = sim.sph_y[s_base + s]; float sph_r = sim.sph_r[s_base + s]; if (sph_r <= 0.0f) continue; float sph_world_x = sph_model_x + root_x; float sph_world_y = sph_model_y + root_y; int sph_sx, sph_sy; project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy); // Parametric circle: 32 points const int num_points = 32; for (int pt = 0; pt < num_points; ++pt) { float angle0 = 2.0f * M_PI * pt / num_points; float angle1 = 2.0f * M_PI * (pt + 1) / num_points; float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0); float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1); int px0, py0, px1, py1; project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0); project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1); SDL_RenderDrawLine(renderer, px0, py0, px1, py1); } } // Debug printf (every 5 steps ~0.08s) if ((int)(time / dt) % 5 == 0) { printf("t=%.2f: ", time); for (int i = 0; i < CHAIN_LENGTH; i++) { int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2); float model_y = sim.ptc_pos_y[p_base + idx]; float world_y = model_y + root_y; printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y); } printf("\n"); } SDL_RenderPresent(renderer); SDL_Delay(16); } SDL_DestroyRenderer(renderer); SDL_DestroyWindow(window); SDL_Quit(); return 0; } -
vurtun revised this gist
Oct 11, 2025 . 1 changed file with 200 additions and 28 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -23,6 +23,7 @@ struct chn_cfg { float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force float bend_stiffness; // [0, 1], bending resistance }; struct chn_sim { unsigned char free_idx_cnt; @@ -70,6 +71,8 @@ struct chn_sim { // XPBD lambdas for distance constraints (transposed similarly) float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32))); float ptc_bend_rest[MAX_REST_LEN] __attribute__((aligned(32))); float ptc_bend_lambda[MAX_REST_LEN] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); @@ -209,6 +212,33 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; } } // Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15 (new) int even_bend_cnt = even_cnt - 1; int odd_bend_cnt = odd_cnt - 1; for (int i = 0; i < even_bend_cnt; ++i) { float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1]; float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1]; float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1]; float len = sqrtf(dx * dx + dy * dy + dz * dz); cns->ptc_bend_rest[r_base + i] = len; cns->ptc_bend_lambda[r_base + i] = 0.0f; } for (int i = even_bend_cnt; i < 8; ++i) { cns->ptc_bend_rest[r_base + i] = 0.0f; cns->ptc_bend_lambda[r_base + i] = 0.0f; } for (int i = 0; i < odd_bend_cnt; ++i) { float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1]; float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1]; float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1]; float len = sqrtf(dx * dx + dy * dy + dz * dz); cns->ptc_bend_rest[r_base + 8 + i] = len; cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; } for (int i = odd_bend_cnt; i < 8; ++i) { cns->ptc_bend_rest[r_base + 8 + i] = 0.0f; cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; } return chn; } extern void @@ -289,6 +319,10 @@ chn_sim_del(struct chn_sim *cns, int chn) { _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_rest[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { @@ -838,6 +872,9 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec); __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness); __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec); // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15) __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); @@ -856,6 +893,14 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]); __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 0]); __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 0]); __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 8]); __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 8]); _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f)); for (int iter = 0; iter < MAX_ITR; ++iter) { // Even distance constraints: p0 = even (A), p1 = odd (B) { @@ -975,6 +1020,154 @@ chn_sim_run(struct chn_sim *cns, float dt) { lambda_odd = new_lambda; } // Even bending constraints: between even[k] (A) and even[k+1] (B) { __m256 a_x = even_x; __m256 a_y = even_y; __m256 a_z = even_z; __m256 a_m = even_m; __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); __m256 b_x = _mm256_permutevar8x32_ps(even_x, _mm256_castsi256_ps(shift_idx)); __m256 b_y = _mm256_permutevar8x32_ps(even_y, _mm256_castsi256_ps(shift_idx)); __m256 b_z = _mm256_permutevar8x32_ps(even_z, _mm256_castsi256_ps(shift_idx)); __m256 b_m = _mm256_permutevar8x32_ps(even_m, _mm256_castsi256_ps(shift_idx)); __m256 dx = _mm256_sub_ps(a_x, b_x); __m256 dy = _mm256_sub_ps(a_y, b_y); __m256 dz = _mm256_sub_ps(a_z, b_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(a_m, b_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_sub_ps(dist, bend_rest_even); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * a_m even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, a_m)); even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, a_m)); even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, a_m)); // dpos_B = - corr * b_m (shifted add) __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m)); __m256i delta_ix = _mm256_castps_si256(delta_b_x); __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4); __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix); even_x = _mm256_add_ps(even_x, delta_even_x); __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m)); __m256i delta_iy = _mm256_castps_si256(delta_b_y); __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4); __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy); even_y = _mm256_add_ps(even_y, delta_even_y); __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m)); __m256i delta_iz = _mm256_castps_si256(delta_b_z); __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4); __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz); even_z = _mm256_add_ps(even_z, delta_even_z); bend_lambda_even = new_lambda; } // Odd bending constraints: between odd[k] (A) and odd[k+1] (B) { __m256 a_x = odd_x; __m256 a_y = odd_y; __m256 a_z = odd_z; __m256 a_m = odd_m; __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); __m256 b_x = _mm256_permutevar8x32_ps(odd_x, _mm256_castsi256_ps(shift_idx)); __m256 b_y = _mm256_permutevar8x32_ps(odd_y, _mm256_castsi256_ps(shift_idx)); __m256 b_z = _mm256_permutevar8x32_ps(odd_z, _mm256_castsi256_ps(shift_idx)); __m256 b_m = _mm256_permutevar8x32_ps(odd_m, _mm256_castsi256_ps(shift_idx)); __m256 dx = _mm256_sub_ps(a_x, b_x); __m256 dy = _mm256_sub_ps(a_y, b_y); __m256 dz = _mm256_sub_ps(a_z, b_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(a_m, b_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_sub_ps(dist, bend_rest_odd); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * a_m odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, a_m)); odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, a_m)); odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, a_m)); // dpos_B = - corr * b_m (shifted add) __m256 delta_b_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, b_m)); __m256i delta_ix = _mm256_castps_si256(delta_b_x); __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4); __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_ix); odd_x = _mm256_add_ps(odd_x, delta_odd_x); __m256 delta_b_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, b_m)); __m256i delta_iy = _mm256_castps_si256(delta_b_y); __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4); __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_iy); odd_y = _mm256_add_ps(odd_y, delta_odd_y); __m256 delta_b_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, b_m)); __m256i delta_iz = _mm256_castps_si256(delta_b_z); __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4); __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_iz); odd_z = _mm256_add_ps(odd_z, delta_odd_z); bend_lambda_odd = new_lambda; } } // Store back _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x); @@ -987,6 +1180,8 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even); _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd); } // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { @@ -1293,11 +1488,12 @@ main(int argc, char *argv[]) { struct chn_cfg cfg = { .damping = 0.5f, .stiffness = 0.9f, .friction = 0.5f, .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag .angular_inertia = 0.9f, .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow .bend_stiffness = 0.8f, }; float rest_pos[CHAIN_LENGTH * 4]; for (int i = 0; i < CHAIN_LENGTH; i++) { @@ -1361,34 +1557,8 @@ main(int argc, char *argv[]) { model_spheres[s*4 + 3] = r; } chn_sim_set_sph(&sim, chn, model_spheres, 3); chn_sim_run(&sim, dt); SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255); SDL_RenderClear(renderer); SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres @@ -1414,6 +1584,8 @@ main(int argc, char *argv[]) { SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1); } // Draw spheres (approx circle outline with 32 points) int p_base = chn * PTC_PER_CHN; int s_base = chn * SPH_PER_CHN; for (int s = 0; s < 3; ++s) { // Only draw active spheres float sph_model_x = sim.sph_x[s_base + s]; float sph_model_y = sim.sph_y[s_base + s]; -
vurtun revised this gist
Oct 11, 2025 . 1 changed file with 364 additions and 418 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,20 +12,17 @@ enum { MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 16, }; #define MAX_MOTION_RADIUS 1000000.0f struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] float friction; // [0, 1] float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force }; struct chn_sim { unsigned char free_idx_cnt; @@ -45,16 +42,16 @@ struct chn_sim { float chn_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_w[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); @@ -65,7 +62,7 @@ struct chn_sim { float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); @@ -74,10 +71,6 @@ struct chn_sim { // XPBD lambdas for distance constraints (transposed similarly) float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); float sph_y[MAX_SPH] __attribute__((aligned(32))); @@ -93,34 +86,19 @@ struct chn_sim { extern void chn_sim_init(struct chn_sim *cns) { assert(cns); memset(cns, 0, sizeof(*cns)); cns->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cns->free_idx[i] = MAX_CHNS - i - 1; } for (int i = 0; i < MAX_CHNS; i += 8) { _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); } // Set motion_radius to MAX_MOTION_RADIUS for (int i = 0; i < MAX_PTC; i += 8) { _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS)); } } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { @@ -152,6 +130,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0]; cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1]; cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2]; // Set previous positions to match initial positions for zero initial velocity cns->ptc_prev_pos_x[even_base + i] = cns->ptc_pos_x[even_base + i]; cns->ptc_prev_pos_y[even_base + i] = cns->ptc_pos_y[even_base + i]; cns->ptc_prev_pos_z[even_base + i] = cns->ptc_pos_z[even_base + i]; } for (int i = even_cnt; i < 8; ++i) { cns->ptc_pos_x[even_base + i] = 0.0f; @@ -161,6 +144,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->rest_pos_x[even_base + i] = 0.0f; cns->rest_pos_y[even_base + i] = 0.0f; cns->rest_pos_z[even_base + i] = 0.0f; // Previous positions for pads remain 0.0f cns->ptc_prev_pos_x[even_base + i] = 0.0f; cns->ptc_prev_pos_y[even_base + i] = 0.0f; cns->ptc_prev_pos_z[even_base + i] = 0.0f; } for (int i = 0; i < odd_cnt; ++i) { int orig_i = 2 * i + 1; @@ -172,6 +160,11 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; // Set previous positions to match initial positions for zero initial velocity cns->ptc_prev_pos_x[odd_base + i] = cns->ptc_pos_x[odd_base + i]; cns->ptc_prev_pos_y[odd_base + i] = cns->ptc_pos_y[odd_base + i]; cns->ptc_prev_pos_z[odd_base + i] = cns->ptc_pos_z[odd_base + i]; } for (int i = odd_cnt; i < 8; ++i) { cns->ptc_pos_x[odd_base + i] = 0.0f; @@ -181,12 +174,16 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->rest_pos_x[odd_base + i] = 0.0f; cns->rest_pos_y[odd_base + i] = 0.0f; cns->rest_pos_z[odd_base + i] = 0.0f; // Previous positions for pads remain 0.0f cns->ptc_prev_pos_x[odd_base + i] = 0.0f; cns->ptc_prev_pos_y[odd_base + i] = 0.0f; cns->ptc_prev_pos_z[odd_base + i] = 0.0f; } // Set motion radius to FLT_MAX by default (no constraint) for (int i = 0; i < PTC_PER_CHN; i += 8) { _mm256_store_ps(&cns->motion_radius[p_base + i], _mm256_set1_ps(MAX_MOTION_RADIUS)); } // Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15 for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; @@ -212,47 +209,6 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; } } return chn; } extern void @@ -320,8 +276,8 @@ chn_sim_del(struct chn_sim *cns, int chn) { _mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS)); _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS)); _mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f)); @@ -333,11 +289,6 @@ chn_sim_del(struct chn_sim *cns, int chn) { _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { @@ -432,13 +383,13 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { cns->motion_radius[even_base + i] = radius[2 * i]; } for (int i = even_cnt; i < 8; ++i) { cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS; } for (int i = 0; i < odd_cnt; ++i) { cns->motion_radius[odd_base + i] = radius[2 * i + 1]; } for (int i = odd_cnt; i < 8; ++i) { cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS; } } extern void @@ -474,7 +425,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); // Step 0: Precomputation for (int c = 0; c < MAX_CHNS; c += 8) { // Load chain quaternions __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); @@ -614,7 +565,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), _mm256_andnot_ps(negate_mask, one_vec)); orient_qx = _mm256_mul_ps(orient_qx, negate_sign); @@ -633,14 +584,14 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt); __m256 delta_y_vec = _mm256_set1_ps(vel_y[c] * dt); __m256 delta_z_vec = _mm256_set1_ps(vel_z[c] * dt); __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); @@ -654,15 +605,19 @@ chn_sim_run(struct chn_sim *cns, float dt) { angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); __m256 lin_lag_factor = _mm256_sub_ps(one_vec, linear_inertia_vec); __m256 lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor); __m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor); __m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor); // Even block (0-7) { // In Step 1, for both even and odd blocks, after loading ptc_inv_mass and clamping to pm __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0 __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); @@ -684,9 +639,22 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec); __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec); // Apply mask to all dt components before adding lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask); lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask); lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask); ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask); ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask); ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask); cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask); cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask); cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); @@ -696,16 +664,13 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z)); } // Odd block (8-15) { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ); // Mask lanes with inv_mass > 0 __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); @@ -727,9 +692,25 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec); __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec); lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor); lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor); lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor); lin_dt_x = _mm256_and_ps(lin_dt_x, valid_mask); lin_dt_y = _mm256_and_ps(lin_dt_y, valid_mask); lin_dt_z = _mm256_and_ps(lin_dt_z, valid_mask); ang_dt_x = _mm256_and_ps(ang_dt_x, valid_mask); ang_dt_y = _mm256_and_ps(ang_dt_y, valid_mask); ang_dt_z = _mm256_and_ps(ang_dt_z, valid_mask); cent_dt_x = _mm256_and_ps(cent_dt_x, valid_mask); cent_dt_y = _mm256_and_ps(cent_dt_y, valid_mask); cent_dt_z = _mm256_and_ps(cent_dt_z, valid_mask); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); @@ -739,7 +720,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z)); } // Update previous transformation cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; @@ -750,7 +730,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; @@ -809,7 +788,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); @@ -851,18 +829,14 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z); } } // Step 3: XPBD Constraints (Distance only) for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec); // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15) __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); @@ -882,370 +856,127 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]); for (int iter = 0; iter < MAX_ITR; ++iter) { // Even distance constraints: p0 = even (A), p1 = odd (B) { __m256 dx = _mm256_sub_ps(even_x, odd_x); // A - B __m256 dy = _mm256_sub_ps(even_y, odd_y); __m256 dz = _mm256_sub_ps(even_z, odd_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(even_m, odd_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_sub_ps(dist, rest_even); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda __m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda) __m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * even_m even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, even_m)); even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, even_m)); even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, even_m)); // dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(corr_x, odd_m)); odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(corr_y, odd_m)); odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(corr_z, odd_m)); lambda_even = new_lambda; } // Odd distance constraints: p1 = odd (A), p2 = even shifted (B) { __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); __m256 p2_x = _mm256_permutevar8x32_ps(even_x, _mm256_castsi256_ps(shift_idx)); __m256 p2_y = _mm256_permutevar8x32_ps(even_y, _mm256_castsi256_ps(shift_idx)); __m256 p2_z = _mm256_permutevar8x32_ps(even_z, _mm256_castsi256_ps(shift_idx)); __m256 p2_m = _mm256_permutevar8x32_ps(even_m, _mm256_castsi256_ps(shift_idx)); __m256 dx = _mm256_sub_ps(odd_x, p2_x); __m256 dy = _mm256_sub_ps(odd_y, p2_y); __m256 dz = _mm256_sub_ps(odd_z, p2_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(odd_m, p2_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ); __m256 C = _mm256_sub_ps(dist, rest_odd); __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd)); __m256 numerator = _mm256_sub_ps(zero_vec, temp); __m256 denom = _mm256_add_ps(w_sum, compliance_factor); __m256 delta_lambda = _mm256_div_ps(numerator, denom); __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda); __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // dpos_A = delta_lambda * unit * odd_m odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, odd_m)); odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, odd_m)); odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, odd_m)); // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m __m256 delta_p2_x = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_x, p2_m)); __m256i delta_ix = _mm256_castps_si256(delta_p2_x); __m256i left_shifted_ix = _mm256_slli_si256(delta_ix, 4); __m256 delta_even_x = _mm256_castsi256_ps(left_shifted_ix); even_x = _mm256_add_ps(even_x, delta_even_x); __m256 delta_p2_y = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_y, p2_m)); __m256i delta_iy = _mm256_castps_si256(delta_p2_y); __m256i left_shifted_iy = _mm256_slli_si256(delta_iy, 4); __m256 delta_even_y = _mm256_castsi256_ps(left_shifted_iy); even_y = _mm256_add_ps(even_y, delta_even_y); __m256 delta_p2_z = _mm256_sub_ps(zero_vec, _mm256_mul_ps(corr_z, p2_m)); __m256i delta_iz = _mm256_castps_si256(delta_p2_z); __m256i left_shifted_iz = _mm256_slli_si256(delta_iz, 4); __m256 delta_even_z = _mm256_castsi256_ps(left_shifted_iz); even_z = _mm256_add_ps(even_z, delta_even_z); lambda_odd = new_lambda; } } // Store back _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z); @@ -1254,13 +985,9 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z); _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd); } // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; @@ -1279,7 +1006,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); @@ -1295,6 +1022,8 @@ chn_sim_run(struct chn_sim *cns, float dt) { dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); @@ -1346,13 +1075,12 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); @@ -1368,6 +1096,8 @@ chn_sim_run(struct chn_sim *cns, float dt) { dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 min_inv_dist = _mm256_set1_ps(1e-6f); inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); @@ -1421,13 +1151,11 @@ chn_sim_run(struct chn_sim *cns, float dt) { } } } // Step 5: Motion Constraints - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; { // Even block __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); @@ -1468,9 +1196,8 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z)); } { // Odd block __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); @@ -1512,4 +1239,223 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); } } } #include <SDL2/SDL.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #define WINDOW_WIDTH 800 #define WINDOW_HEIGHT 600 #define CHAIN_LENGTH 10 #define PARTICLE_SPACING 10.0f static void project(float x, float y, float z, int *screen_x, int *screen_y) { *screen_x = (int)(x + WINDOW_WIDTH / 2.0f); *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f); } // Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity) static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos float cx, float cy, float cz, // chain pos float qx, float qy, float qz, float qw, // chain quat float *mx, float *my, float *mz) { // out model pos // Translate: world - chain float tx = wx - cx, ty = wy - cy, tz = wz - cz; // Quat conj: (qw, -qx, -qy, -qz) float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz; // t = conj * (0, tx,ty,tz) float tw = cqx*tx + cqy*ty + cqz*tz; float t_x = cqw*tx + cqy*tz - cqz*ty; float t_y = cqw*ty + cqz*tx - cqx*tz; float t_z = cqw*tz + cqx*ty - cqy*tx; // model = t * quat (vec part) *mx = qw*t_x + qy*t_z - qz*t_y + tw*qx; *my = qw*t_y + qz*t_x - qx*t_z + tw*qy; *mz = qw*t_z + qx*t_y - qy*t_x + tw*qz; } extern int main(int argc, char *argv[]) { if (SDL_Init(SDL_INIT_VIDEO) < 0) { printf("SDL_Init failed: %s\n", SDL_GetError()); return 1; } SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0); SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); if (!window || !renderer) { printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError()); SDL_Quit(); return 1; } struct chn_sim sim; chn_sim_init(&sim); struct chn_cfg cfg = { .damping = 0.5f, .stiffness = 0.99f, .friction = 0.5f, .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag .angular_inertia = 0.9f, .centrifugal_inertia = 0.9f // Tune 0.5f for more outward bow }; float rest_pos[CHAIN_LENGTH * 4]; for (int i = 0; i < CHAIN_LENGTH; i++) { rest_pos[i*4 + 0] = 0.0f; rest_pos[i*4 + 1] = -i * PARTICLE_SPACING; rest_pos[i*4 + 2] = 0.0f; rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f; } int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH); chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge // Fixed world spheres (triangle outside circle) float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r) 60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left }; // Circular motion params float time = 0.0f; float circle_radius = 50.0f; float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s float dt = 0.016f; int running = 1; SDL_Event event; // Debug: Check gravity and masses printf("Gravity y: %.2f\n", sim.chn_grav_y[chn]); int p_base = chn * PTC_PER_CHN; for (int i = 0; i < CHAIN_LENGTH; i++) { int idx = p_base + (i % 2 == 0 ? i/2 : 8 + (i-1)/2); printf("p%d inv_mass: %.2f\n", i, sim.ptc_inv_mass[idx]); } while (running) { while (SDL_PollEvent(&event)) { if (event.type == SDL_QUIT) { running = 0; } } time += dt; // Circular anchor motion float anchor_x = circle_radius * cosf(angular_speed * time); float anchor_y = circle_radius * sinf(angular_speed * time); chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Transform fixed world spheres to current model space float model_spheres[12]; for (int s = 0; s < 3; ++s) { float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2]; float r = world_spheres[s*4 + 3]; float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn]; float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn]; float mx, my, mz; quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz); model_spheres[s*4 + 0] = mx; model_spheres[s*4 + 1] = my; model_spheres[s*4 + 2] = mz; model_spheres[s*4 + 3] = r; } chn_sim_set_sph(&sim, chn, model_spheres, 3); chn_sim_run(&sim, dt); // Collision depth test: Print max pen across particles/spheres float max_pen = 0.0f; int max_p = -1, max_s = -1; int p_base = chn * PTC_PER_CHN; int s_base = chn * SPH_PER_CHN; for (int p = 1; p < CHAIN_LENGTH; ++p) { // Skip fixed p0 int idx = (p % 2 == 0 ? p / 2 : 8 + (p - 1) / 2); float px = sim.ptc_pos_x[p_base + idx], py = sim.ptc_pos_y[p_base + idx], pz = sim.ptc_pos_z[p_base + idx]; for (int s = 0; s < 3; ++s) { float sx = sim.sph_x[s_base + s], sy = sim.sph_y[s_base + s], sz = sim.sph_z[s_base + s]; float sr = sim.sph_r[s_base + s]; float dx = px - sx, dy = py - sy, dz = pz - sz; float dist_sq = dx*dx + dy*dy + dz*dz; float dist = sqrtf(dist_sq); float pen = sr - dist; if (pen > max_pen) { max_pen = pen; max_p = p; max_s = s; } } } if (max_pen > 0.0f && ((int)(time / dt) % 5 == 0)) { printf("Collision: p%d/s%d pen=%.2f\n", max_p, max_s, max_pen); } SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255); SDL_RenderClear(renderer); SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres float root_x = sim.chn_pos_x[chn]; // Current root world x/y float root_y = sim.chn_pos_y[chn]; // Draw chain segments for (int i = 0; i < CHAIN_LENGTH - 1; i++) { int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2); int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2); float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0]; float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1]; float world_x0 = model_x0 + root_x; float world_y0 = model_y0 + root_y; float world_x1 = model_x1 + root_x; float world_y1 = model_y1 + root_y; int sx0, sy0, sx1, sy1; project(world_x0, world_y0, 0, &sx0, &sy0); project(world_x1, world_y1, 0, &sx1, &sy1); SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1); } // Draw spheres (approx circle outline with 32 points) for (int s = 0; s < 3; ++s) { // Only draw active spheres float sph_model_x = sim.sph_x[s_base + s]; float sph_model_y = sim.sph_y[s_base + s]; float sph_r = sim.sph_r[s_base + s]; if (sph_r <= 0.0f) continue; float sph_world_x = sph_model_x + root_x; float sph_world_y = sph_model_y + root_y; int sph_sx, sph_sy; project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy); // Parametric circle: 32 points const int num_points = 32; for (int pt = 0; pt < num_points; ++pt) { float angle0 = 2.0f * M_PI * pt / num_points; float angle1 = 2.0f * M_PI * (pt + 1) / num_points; float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0); float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1); int px0, py0, px1, py1; project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0); project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1); SDL_RenderDrawLine(renderer, px0, py0, px1, py1); } } // Debug printf (every 5 steps ~0.08s) if ((int)(time / dt) % 5 == 0) { printf("t=%.2f: ", time); for (int i = 0; i < CHAIN_LENGTH; i++) { int idx = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2); float model_y = sim.ptc_pos_y[p_base + idx]; float world_y = model_y + root_y; printf("p%d world_y=%.2f (model=%.2f) ", i, world_y, model_y); } printf("\n"); } SDL_RenderPresent(renderer); SDL_Delay(16); } SDL_DestroyRenderer(renderer); SDL_DestroyWindow(window); SDL_Quit(); return 0; } -
vurtun revised this gist
Oct 10, 2025 . 1 changed file with 884 additions and 388 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,7 +5,7 @@ #include <float.h> enum { MAX_CHNS = 128, PTC_PER_CHN = 16, SPH_PER_CHN = 8, MAX_PTC = (MAX_CHNS * PTC_PER_CHN), @@ -14,6 +14,8 @@ enum { MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 12, }; #define MAX_MOTION_RADIUS 10.0f struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] @@ -23,6 +25,7 @@ struct chn_cfg { float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force float bend_stiffness; // [0, 1] for bending constraints }; struct chn_sim { unsigned char free_idx_cnt; @@ -57,7 +60,7 @@ struct chn_sim { float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32))); // particle positions (model space, transposed: 0-7 even indices, 8-15 odd indices) float ptc_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); @@ -66,15 +69,22 @@ struct chn_sim { float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // Transposed: even cons 0-7, odd 8-15 (last dummy) // XPBD lambdas for distance constraints (transposed similarly) float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32))); // Bending rest angles and lambdas (one less than constraints, transposed even/odd bends) float ptc_bend_rest_ang[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32))); float ptc_bend_lambda[MAX_REST_LEN - MAX_CHNS] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); float sph_y[MAX_SPH] __attribute__((aligned(32))); float sph_z[MAX_SPH] __attribute__((aligned(32))); float sph_r[MAX_SPH] __attribute__((aligned(32))); // rest positions (model space, transposed) float rest_pos_x[MAX_PTC] __attribute__((aligned(32))); float rest_pos_y[MAX_PTC] __attribute__((aligned(32))); float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); @@ -91,6 +101,26 @@ chn_sim_init(struct chn_sim *cns) { _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); } // Initialize lambdas to zero memset(cns->ptc_lambda, 0, sizeof(cns->ptc_lambda)); memset(cns->ptc_bend_lambda, 0, sizeof(cns->ptc_bend_lambda)); // Zero all transposed particle arrays memset(cns->ptc_pos_x, 0, sizeof(cns->ptc_pos_x)); memset(cns->ptc_pos_y, 0, sizeof(cns->ptc_pos_y)); memset(cns->ptc_pos_z, 0, sizeof(cns->ptc_pos_z)); memset(cns->ptc_inv_mass, 0, sizeof(cns->ptc_inv_mass)); memset(cns->ptc_prev_pos_x, 0, sizeof(cns->ptc_prev_pos_x)); memset(cns->ptc_prev_pos_y, 0, sizeof(cns->ptc_prev_pos_y)); memset(cns->ptc_prev_pos_z, 0, sizeof(cns->ptc_prev_pos_z)); memset(cns->rest_pos_x, 0, sizeof(cns->rest_pos_x)); memset(cns->rest_pos_y, 0, sizeof(cns->rest_pos_y)); memset(cns->rest_pos_z, 0, sizeof(cns->rest_pos_z)); // Set motion_radius to FLT_MAX for (int i = 0; i < MAX_PTC; i += 8) { _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS)); } memset(cns->ptc_rest_len, 0, sizeof(cns->ptc_rest_len)); memset(cns->ptc_bend_rest_ang, 0, sizeof(cns->ptc_bend_rest_ang)); } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { @@ -103,33 +133,125 @@ chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restric int chn = cns->free_idx[--cns->free_idx_cnt]; cns->chn_cfg[chn] = *cfg; int p_base = (chn * PTC_PER_CHN); int r_base = (chn * CON_PER_CHN); int b_base = (chn * (CON_PER_CHN - 1)); int even_base = p_base + 0; // Even indices 0-7 int odd_base = p_base + 8; // Odd indices 8-15 // Transpose particles: even indices to 0-7, odd to 8-15 int even_cnt = (cnt + 1) / 2; // Number of even particles int odd_cnt = cnt / 2; // Number of odd particles for (int i = 0; i < even_cnt; ++i) { int orig_i = 2 * i; cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0]; cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1]; cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2]; cns->ptc_inv_mass[even_base + i] = rest_pos[orig_i*4+3]; cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0]; cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1]; cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2]; } for (int i = even_cnt; i < 8; ++i) { cns->ptc_pos_x[even_base + i] = 0.0f; cns->ptc_pos_y[even_base + i] = 0.0f; cns->ptc_pos_z[even_base + i] = 0.0f; cns->ptc_inv_mass[even_base + i] = 0.0f; cns->rest_pos_x[even_base + i] = 0.0f; cns->rest_pos_y[even_base + i] = 0.0f; cns->rest_pos_z[even_base + i] = 0.0f; } for (int i = 0; i < odd_cnt; ++i) { int orig_i = 2 * i + 1; cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; cns->ptc_inv_mass[odd_base + i] = rest_pos[orig_i*4+3]; cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; } for (int i = odd_cnt; i < 8; ++i) { cns->ptc_pos_x[odd_base + i] = 0.0f; cns->ptc_pos_y[odd_base + i] = 0.0f; cns->ptc_pos_z[odd_base + i] = 0.0f; cns->ptc_inv_mass[odd_base + i] = 0.0f; cns->rest_pos_x[odd_base + i] = 0.0f; cns->rest_pos_y[odd_base + i] = 0.0f; cns->rest_pos_z[odd_base + i] = 0.0f; } // Set motion radius to MAX_MOTION_RADIUS by default (no constraint) _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS)); _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS)); // Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15 for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; float len = sqrtf(dx*dx + dy*dy + dz*dz); int con_idx = i - 1; if ((con_idx & 0x01) == 0) { // Even constraint cns->ptc_rest_len[r_base + (con_idx / 2)] = len; cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f; } else { // Odd constraint cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = len; cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; } } for (int i = cnt - 1; i < CON_PER_CHN - 1; ++i) { // Pad dummies int con_idx = i; if ((con_idx & 0x01) == 0) { cns->ptc_rest_len[r_base + (con_idx / 2)] = 0.0f; cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f; } else { cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; } } // Compute rest bending angles for triples, transposed (even bends to b_base+0-6, odd to b_base+8-13) for (int i = 2; i < cnt; ++i) { float dx0 = rest_pos[(i-1)*4+0] - rest_pos[(i-2)*4+0]; float dy0 = rest_pos[(i-1)*4+1] - rest_pos[(i-2)*4+1]; float dz0 = rest_pos[(i-1)*4+2] - rest_pos[(i-2)*4+2]; float len0 = sqrtf(dx0*dx0 + dy0*dy0 + dz0*dz0); if (len0 > 1e-6f) { dx0 /= len0; dy0 /= len0; dz0 /= len0; } float dx1 = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy1 = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz1 = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; float len1 = sqrtf(dx1*dx1 + dy1*dy1 + dz1*dz1); if (len1 > 1e-6f) { dx1 /= len1; dy1 /= len1; dz1 /= len1; } float dot = dx0*dx1 + dy0*dy1 + dz0*dz1; dot = fminf(1.0f, fmaxf(-1.0f, dot)); int bend_idx = i - 2; if ((bend_idx & 0x01) == 0) { // Even bend cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = dot; cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f; } else { // Odd bend cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = dot; cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f; } } // Pad dummies for (int i = cnt - 2; i < CON_PER_CHN - 2; ++i) { int bend_idx = i; if ((bend_idx & 0x01) == 0) { cns->ptc_bend_rest_ang[b_base + (bend_idx / 2)] = 0.0f; cns->ptc_bend_lambda[b_base + (bend_idx / 2)] = 0.0f; } else { cns->ptc_bend_rest_ang[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f; cns->ptc_bend_lambda[b_base + 8 + ((bend_idx - 1) / 2)] = 0.0f; } } return chn; } @@ -139,9 +261,10 @@ chn_sim_del(struct chn_sim *cns, int chn) { assert(chn >= 0); assert(chn < MAX_CHNS); int p_base = (chn * PTC_PER_CHN); int s_base = (chn * SPH_PER_CHN); int r_base = (chn * CON_PER_CHN); int b_base = (chn * (CON_PER_CHN - 1)); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); @@ -172,40 +295,49 @@ chn_sim_del(struct chn_sim *cns, int chn) { cns->chn_prev_quat_z[chn] = 0.0f; cns->chn_prev_quat_w[chn] = 1.0f; // Zero transposed particle blocks _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_inv_mass[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_inv_mass[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(FLT_MAX)); _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(FLT_MAX)); _mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f)); // Zero transposed constraints _mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_rest_ang[b_base + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { @@ -267,20 +399,20 @@ extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(cns); assert(spheres); assert(cnt <= SPH_PER_CHN); int s_base = (chn * SPH_PER_CHN); for (int i = 0; i < cnt; i++) { cns->sph_x[s_base + i] = spheres[i*4+0]; cns->sph_y[s_base + i] = spheres[i*4+1]; cns->sph_z[s_base + i] = spheres[i*4+2]; cns->sph_r[s_base + i] = spheres[i*4+3]; } for (int i = cnt; i < SPH_PER_CHN; ++i) { cns->sph_x[s_base + i] = 0; cns->sph_y[s_base + i] = 0; cns->sph_z[s_base + i] = 0; cns->sph_r[s_base + i] = 0; } } extern void @@ -291,12 +423,22 @@ chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { assert(cnt >= 0); assert(cnt < PTC_PER_CHN); int p_base = (chn * PTC_PER_CHN); int even_base = p_base + 0; int odd_base = p_base + 8; int even_cnt = (cnt + 1) / 2; int odd_cnt = cnt / 2; for (int i = 0; i < even_cnt; ++i) { cns->motion_radius[even_base + i] = radius[2 * i]; } for (int i = even_cnt; i < 8; ++i) { cns->motion_radius[even_base + i] = FLT_MAX; } for (int i = 0; i < odd_cnt; ++i) { cns->motion_radius[odd_base + i] = radius[2 * i + 1]; } for (int i = odd_cnt; i < 8; ++i) { cns->motion_radius[odd_base + i] = FLT_MAX; } } extern void @@ -314,26 +456,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); // Stack arrays for precomputed data float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); @@ -372,11 +494,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]); // Step 1: t = q_conj * v_world_as_quat __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy @@ -391,10 +508,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: res_vec = (t * q)_vec __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); @@ -406,37 +519,24 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&chn_wnd_lcl_y[c], res_y); _mm256_store_ps(&chn_wnd_lcl_z[c], res_z); } // Compute local-space gravity (identical to wind) { __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]); __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]); __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]); __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); tx = _mm256_fnmadd_ps(cqz, vy, tx); __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); ty = _mm256_fnmadd_ps(cqx, vz, ty); __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); tz = _mm256_fnmadd_ps(cqy, vx, tz); __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); tw = _mm256_fnmadd_ps(cqy, vy, tw); tw = _mm256_fnmadd_ps(cqz, vz, tw); __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); @@ -492,8 +592,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw)); orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw); orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw); @@ -535,12 +633,11 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); @@ -553,24 +650,63 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); // Even block (0-7) { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec); ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec); ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec); __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec); __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec); __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_add_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z)); } // Odd block (8-15) { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); @@ -579,7 +715,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec); ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec); __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); @@ -588,26 +723,23 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec); __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec); __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec); __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_add_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z)); } // Update previous transformation cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; @@ -618,9 +750,10 @@ chn_sim_run(struct chn_sim *cns, float dt) { cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]); @@ -632,20 +765,62 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]); __m256 damping_vec = _mm256_set1_ps(cfg->damping); __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec); __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec); __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec); // Even block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass); __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass); __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass); __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass); @@ -667,220 +842,426 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z); _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x); _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z); } } // Step 3: XPBD Constraints (Distance + Bending) - Direct loads for transposed even/odd for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; int b_base = c * (CON_PER_CHN - 1); struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness); __m256 compliance_factor = _mm256_div_ps(dt2_vec, stiff_vec); __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, bend_stiff_vec); // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15) __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 even_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 even_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 odd_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 odd_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 odd_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 odd_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); // Direct load transposed rests/lambdas: even cons [0,2,...] in r_base+0, odd in r_base+8 __m256 rest_even = _mm256_load_ps(&cns->ptc_rest_len[r_base + 0]); __m256 lambda_even = _mm256_load_ps(&cns->ptc_lambda[r_base + 0]); __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]); // For bending even: bend_rest_even in b_base+0 (7 lanes + dummy), odd bends in b_base+8 if implemented __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 0]); __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 0]); __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest_ang[b_base + 8]); __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[b_base + 8]); for (int iter = 0; iter < MAX_ITR; ++iter) { // Even distance constraints: p0 = even, p1 = odd { __m256 dx = _mm256_sub_ps(odd_x, even_x); __m256 dy = _mm256_sub_ps(odd_y, even_y); __m256 dz = _mm256_sub_ps(odd_z, even_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(even_m, odd_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ); __m256 denom = _mm256_div_ps(one_vec, w_sum); __m256 C = _mm256_sub_ps(dist, rest_even); __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom)); __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda); __m256 dlambda = delta_lambda; __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, dlambda); __m256 corr_y = _mm256_mul_ps(unit_y, dlambda); __m256 corr_z = _mm256_mul_ps(unit_z, dlambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // Apply directly even_x = _mm256_add_ps(even_x, _mm256_mul_ps(corr_x, even_m)); even_y = _mm256_add_ps(even_y, _mm256_mul_ps(corr_y, even_m)); even_z = _mm256_add_ps(even_z, _mm256_mul_ps(corr_z, even_m)); odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(corr_x, odd_m)); odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(corr_y, odd_m)); odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(corr_z, odd_m)); lambda_even = new_lambda; } // Odd distance constraints: p1 = odd, p2 = even shifted left (slli) { __m256i even_ix = _mm256_castps_si256(even_x); __m256i shifted_ix = _mm256_srli_si256(even_ix, 4); __m256 p2_x = _mm256_castsi256_ps(shifted_ix); __m256i even_iy = _mm256_castps_si256(even_y); __m256i shifted_iy = _mm256_srli_si256(even_iy, 4); __m256 p2_y = _mm256_castsi256_ps(shifted_iy); __m256i even_iz = _mm256_castps_si256(even_z); __m256i shifted_iz = _mm256_srli_si256(even_iz, 4); __m256 p2_z = _mm256_castsi256_ps(shifted_iz); __m256i even_im = _mm256_castps_si256(even_m); __m256i shifted_im = _mm256_srli_si256(even_im, 4); __m256 p2_m = _mm256_castsi256_ps(shifted_im); __m256 dx = _mm256_sub_ps(p2_x, odd_x); __m256 dy = _mm256_sub_ps(p2_y, odd_y); __m256 dz = _mm256_sub_ps(p2_z, odd_z); __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 inv_dist = _mm256_rsqrt_ps(dist_sq); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 w_sum = _mm256_add_ps(odd_m, p2_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ); __m256 denom = _mm256_div_ps(one_vec, w_sum); __m256 C = _mm256_sub_ps(dist, rest_odd); __m256 delta_lambda = _mm256_mul_ps(compliance_factor, _mm256_mul_ps(C, denom)); __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda); __m256 dlambda = delta_lambda; __m256 unit_x = _mm256_mul_ps(dx, inv_dist); __m256 unit_y = _mm256_mul_ps(dy, inv_dist); __m256 unit_z = _mm256_mul_ps(dz, inv_dist); __m256 corr_x = _mm256_mul_ps(unit_x, dlambda); __m256 corr_y = _mm256_mul_ps(unit_y, dlambda); __m256 corr_z = _mm256_mul_ps(unit_z, dlambda); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); corr_x = _mm256_and_ps(corr_x, corr_mask); corr_y = _mm256_and_ps(corr_y, corr_mask); corr_z = _mm256_and_ps(corr_z, corr_mask); // Apply: p1=odd += corr * odd_m odd_x = _mm256_add_ps(odd_x, _mm256_mul_ps(corr_x, odd_m)); odd_y = _mm256_add_ps(odd_y, _mm256_mul_ps(corr_y, odd_m)); odd_z = _mm256_add_ps(odd_z, _mm256_mul_ps(corr_z, odd_m)); // Propagate p2 correction back to even: delta_p2 = -corr * p2_m, then right-shift (srli) to align with even lanes __m256 corr_shifted_x = _mm256_mul_ps(corr_x, p2_m); __m256 delta_p2_x = _mm256_sub_ps(zero_vec, corr_shifted_x); __m256i delta_ix = _mm256_castps_si256(delta_p2_x); __m256i right_shifted_ix = _mm256_srli_si256(delta_ix, 4); // Right shift, pad 0 on MSB __m256 delta_even_x = _mm256_castsi256_ps(right_shifted_ix); even_x = _mm256_add_ps(even_x, delta_even_x); __m256 corr_shifted_y = _mm256_mul_ps(corr_y, p2_m); __m256 delta_p2_y = _mm256_sub_ps(zero_vec, corr_shifted_y); __m256i delta_iy = _mm256_castps_si256(delta_p2_y); __m256i right_shifted_iy = _mm256_srli_si256(delta_iy, 4); __m256 delta_even_y = _mm256_castsi256_ps(right_shifted_iy); even_y = _mm256_add_ps(even_y, delta_even_y); __m256 corr_shifted_z = _mm256_mul_ps(corr_z, p2_m); __m256 delta_p2_z = _mm256_sub_ps(zero_vec, corr_shifted_z); __m256i delta_iz = _mm256_castps_si256(delta_p2_z); __m256i right_shifted_iz = _mm256_srli_si256(delta_iz, 4); __m256 delta_even_z = _mm256_castsi256_ps(right_shifted_iz); even_z = _mm256_add_ps(even_z, delta_even_z); lambda_odd = new_lambda; } // Even bending constraints: p0-p1-p2 { // p2 = even shifted left (slli) __m256i even_ix = _mm256_castps_si256(even_x); __m256i shifted_ix = _mm256_srli_si256(even_ix, 4); __m256 p2_x = _mm256_castsi256_ps(shifted_ix); __m256i even_iy = _mm256_castps_si256(even_y); __m256i shifted_iy = _mm256_srli_si256(even_iy, 4); __m256 p2_y = _mm256_castsi256_ps(shifted_iy); __m256i even_iz = _mm256_castps_si256(even_z); __m256i shifted_iz = _mm256_srli_si256(even_iz, 4); __m256 p2_z = _mm256_castsi256_ps(shifted_iz); __m256i even_im = _mm256_castps_si256(even_m); __m256i shifted_im = _mm256_srli_si256(even_im, 4); __m256 p2_m = _mm256_castsi256_ps(shifted_im); // Vectors v0 = p1 - p0, v1 = p2 - p1 __m256 v0x = _mm256_sub_ps(odd_x, even_x); __m256 v0y = _mm256_sub_ps(odd_y, even_y); __m256 v0z = _mm256_sub_ps(odd_z, even_z); __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x))); __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ); __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq); __m256 n0x = _mm256_mul_ps(v0x, inv_len0); __m256 n0y = _mm256_mul_ps(v0y, inv_len0); __m256 n0z = _mm256_mul_ps(v0z, inv_len0); __m256 v1x = _mm256_sub_ps(p2_x, odd_x); __m256 v1y = _mm256_sub_ps(p2_y, odd_y); __m256 v1z = _mm256_sub_ps(p2_z, odd_z); __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x))); __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ); __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq); __m256 n1x = _mm256_mul_ps(v1x, inv_len1); __m256 n1y = _mm256_mul_ps(v1y, inv_len1); __m256 n1z = _mm256_mul_ps(v1z, inv_len1); __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x))); cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta)); __m256 C = _mm256_sub_ps(bend_rest_even, cos_theta); __m256 w_sum = _mm256_add_ps(_mm256_add_ps(even_m, odd_m), p2_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ); __m256 denom = _mm256_div_ps(one_vec, w_sum); __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom)); __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda); __m256 dlambda = delta_lambda; // Direction: cross = n0 × n1 normalized __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y)); __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z)); __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x)); __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x))); __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ); __m256 inv_cross = _mm256_rsqrt_ps(cross_sq); cross_x = _mm256_mul_ps(cross_x, inv_cross); cross_y = _mm256_mul_ps(cross_y, inv_cross); cross_z = _mm256_mul_ps(cross_z, inv_cross); __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross); __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask); __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask); __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask); // Apply: p0 -= dt * even_m even_x = _mm256_sub_ps(even_x, _mm256_mul_ps(dt_x, even_m)); even_y = _mm256_sub_ps(even_y, _mm256_mul_ps(dt_y, even_m)); even_z = _mm256_sub_ps(even_z, _mm256_mul_ps(dt_z, even_m)); // Propagate p2 correction back to even: delta_p2 = +dt * p2_m, then right-shift (srli) __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m); __m256 delta_p2_x = corr_p2_x; __m256i delta_p2_ix = _mm256_castps_si256(delta_p2_x); __m256i right_shifted_p2_ix = _mm256_srli_si256(delta_p2_ix, 4); __m256 delta_even_from_p2_x = _mm256_castsi256_ps(right_shifted_p2_ix); even_x = _mm256_add_ps(even_x, delta_even_from_p2_x); __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m); __m256 delta_p2_y = corr_p2_y; __m256i delta_p2_iy = _mm256_castps_si256(delta_p2_y); __m256i right_shifted_p2_iy = _mm256_srli_si256(delta_p2_iy, 4); __m256 delta_even_from_p2_y = _mm256_castsi256_ps(right_shifted_p2_iy); even_y = _mm256_add_ps(even_y, delta_even_from_p2_y); __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m); __m256 delta_p2_z = corr_p2_z; __m256i delta_p2_iz = _mm256_castps_si256(delta_p2_z); __m256i right_shifted_p2_iz = _mm256_srli_si256(delta_p2_iz, 4); __m256 delta_even_from_p2_z = _mm256_castsi256_ps(right_shifted_p2_iz); even_z = _mm256_add_ps(even_z, delta_even_from_p2_z); bend_lambda_even = new_lambda; } // Odd bending constraints: p1=odd[0], p2=even[1], p3=odd[1] { // p2 = even shifted right (srli to align even[1] to lane 0) __m256i even_ix = _mm256_castps_si256(even_x); __m256i shifted_ix = _mm256_srli_si256(even_ix, 4); __m256 p2_x = _mm256_castsi256_ps(shifted_ix); __m256i even_iy = _mm256_castps_si256(even_y); __m256i shifted_iy = _mm256_srli_si256(even_iy, 4); __m256 p2_y = _mm256_castsi256_ps(shifted_iy); __m256i even_iz = _mm256_castps_si256(even_z); __m256i shifted_iz = _mm256_srli_si256(even_iz, 4); __m256 p2_z = _mm256_castsi256_ps(shifted_iz); __m256i even_im = _mm256_castps_si256(even_m); __m256i shifted_im = _mm256_srli_si256(even_im, 4); __m256 p2_m = _mm256_castsi256_ps(shifted_im); // p3 = odd shifted right (srli to align odd[1] to lane 0) __m256i odd_ix = _mm256_castps_si256(odd_x); __m256i shifted_odd_ix = _mm256_srli_si256(odd_ix, 4); __m256 p3_x = _mm256_castsi256_ps(shifted_odd_ix); __m256i odd_iy = _mm256_castps_si256(odd_y); __m256i shifted_odd_iy = _mm256_srli_si256(odd_iy, 4); __m256 p3_y = _mm256_castsi256_ps(shifted_odd_iy); __m256i odd_iz = _mm256_castps_si256(odd_z); __m256i shifted_odd_iz = _mm256_srli_si256(odd_iz, 4); __m256 p3_z = _mm256_castsi256_ps(shifted_odd_iz); __m256i odd_im = _mm256_castps_si256(odd_m); __m256i shifted_odd_im = _mm256_srli_si256(odd_im, 4); __m256 p3_m = _mm256_castsi256_ps(shifted_odd_im); // Vectors v0 = p2 - p1, v1 = p3 - p2 __m256 v0x = _mm256_sub_ps(p2_x, odd_x); __m256 v0y = _mm256_sub_ps(p2_y, odd_y); __m256 v0z = _mm256_sub_ps(p2_z, odd_z); __m256 len0_sq = _mm256_fmadd_ps(v0z, v0z, _mm256_fmadd_ps(v0y, v0y, _mm256_mul_ps(v0x, v0x))); __m256 mask_len0 = _mm256_cmp_ps(len0_sq, eps_vec, _CMP_GT_OQ); __m256 inv_len0 = _mm256_rsqrt_ps(len0_sq); __m256 n0x = _mm256_mul_ps(v0x, inv_len0); __m256 n0y = _mm256_mul_ps(v0y, inv_len0); __m256 n0z = _mm256_mul_ps(v0z, inv_len0); __m256 v1x = _mm256_sub_ps(p3_x, p2_x); __m256 v1y = _mm256_sub_ps(p3_y, p2_y); __m256 v1z = _mm256_sub_ps(p3_z, p2_z); __m256 len1_sq = _mm256_fmadd_ps(v1z, v1z, _mm256_fmadd_ps(v1y, v1y, _mm256_mul_ps(v1x, v1x))); __m256 mask_len1 = _mm256_cmp_ps(len1_sq, eps_vec, _CMP_GT_OQ); __m256 inv_len1 = _mm256_rsqrt_ps(len1_sq); __m256 n1x = _mm256_mul_ps(v1x, inv_len1); __m256 n1y = _mm256_mul_ps(v1y, inv_len1); __m256 n1z = _mm256_mul_ps(v1z, inv_len1); // Cosine of current angle __m256 cos_theta = _mm256_fmadd_ps(n0z, n1z, _mm256_fmadd_ps(n0y, n1y, _mm256_mul_ps(n0x, n1x))); cos_theta = _mm256_min_ps(one_vec, _mm256_max_ps(neg_one_vec, cos_theta)); __m256 C = _mm256_sub_ps(bend_rest_odd, cos_theta); // Sum of inverse masses __m256 w_sum = _mm256_add_ps(_mm256_add_ps(odd_m, p2_m), p3_m); __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ); __m256 denom = _mm256_div_ps(one_vec, w_sum); // XPBD lambda update __m256 delta_lambda = _mm256_mul_ps(bend_compliance_factor, _mm256_mul_ps(C, denom)); __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda); __m256 dlambda = delta_lambda; // Correction direction: cross = n0 × n1 normalized __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(n0y, n1z), _mm256_mul_ps(n0z, n1y)); __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(n0z, n1x), _mm256_mul_ps(n0x, n1z)); __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(n0x, n1y), _mm256_mul_ps(n0y, n1x)); __m256 cross_sq = _mm256_fmadd_ps(cross_z, cross_z, _mm256_fmadd_ps(cross_y, cross_y, _mm256_mul_ps(cross_x, cross_x))); __m256 mask_cross = _mm256_cmp_ps(cross_sq, eps_vec, _CMP_GT_OQ); __m256 inv_cross = _mm256_rsqrt_ps(cross_sq); cross_x = _mm256_mul_ps(cross_x, inv_cross); cross_y = _mm256_mul_ps(cross_y, inv_cross); cross_z = _mm256_mul_ps(cross_z, inv_cross); // Combine masks __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_len0, mask_len1), mask_w), mask_rest), mask_cross); __m256 dt_x = _mm256_and_ps(_mm256_mul_ps(cross_x, dlambda), corr_mask); __m256 dt_y = _mm256_and_ps(_mm256_mul_ps(cross_y, dlambda), corr_mask); __m256 dt_z = _mm256_and_ps(_mm256_mul_ps(cross_z, dlambda), corr_mask); // Apply: p1=odd[0] -= dt * odd_m odd_x = _mm256_sub_ps(odd_x, _mm256_mul_ps(dt_x, odd_m)); odd_y = _mm256_sub_ps(odd_y, _mm256_mul_ps(dt_y, odd_m)); odd_z = _mm256_sub_ps(odd_z, _mm256_mul_ps(dt_z, odd_m)); // Apply: p3=odd[1] += dt * p3_m __m256 corr_p3_x = _mm256_mul_ps(dt_x, p3_m); __m256 corr_p3_y = _mm256_mul_ps(dt_y, p3_m); __m256 corr_p3_z = _mm256_mul_ps(dt_z, p3_m); __m256i corr_p3_ix = _mm256_castps_si256(corr_p3_x); __m256i corr_p3_iy = _mm256_castps_si256(corr_p3_y); __m256i corr_p3_iz = _mm256_castps_si256(corr_p3_z); __m256i left_shifted_p3_ix = _mm256_slli_si256(corr_p3_ix, 4); __m256i left_shifted_p3_iy = _mm256_slli_si256(corr_p3_iy, 4); __m256i left_shifted_p3_iz = _mm256_slli_si256(corr_p3_iz, 4); __m256 delta_odd_x = _mm256_castsi256_ps(left_shifted_p3_ix); __m256 delta_odd_y = _mm256_castsi256_ps(left_shifted_p3_iy); __m256 delta_odd_z = _mm256_castsi256_ps(left_shifted_p3_iz); odd_x = _mm256_add_ps(odd_x, delta_odd_x); odd_y = _mm256_add_ps(odd_y, delta_odd_y); odd_z = _mm256_add_ps(odd_z, delta_odd_z); // Propagate p2 correction to even: delta_p2 = +dt * p2_m, then left-shift (slli) __m256 corr_p2_x = _mm256_mul_ps(dt_x, p2_m); __m256 corr_p2_y = _mm256_mul_ps(dt_y, p2_m); __m256 corr_p2_z = _mm256_mul_ps(dt_z, p2_m); __m256i corr_p2_ix = _mm256_castps_si256(corr_p2_x); __m256i corr_p2_iy = _mm256_castps_si256(corr_p2_y); __m256i corr_p2_iz = _mm256_castps_si256(corr_p2_z); __m256i left_shifted_p2_ix = _mm256_slli_si256(corr_p2_ix, 4); __m256i left_shifted_p2_iy = _mm256_slli_si256(corr_p2_iy, 4); __m256i left_shifted_p2_iz = _mm256_slli_si256(corr_p2_iz, 4); __m256 delta_even_from_p2_x = _mm256_castsi256_ps(left_shifted_p2_ix); __m256 delta_even_from_p2_y = _mm256_castsi256_ps(left_shifted_p2_iy); __m256 delta_even_from_p2_z = _mm256_castsi256_ps(left_shifted_p2_iz); even_x = _mm256_add_ps(even_x, delta_even_from_p2_x); even_y = _mm256_add_ps(even_y, delta_even_from_p2_y); even_z = _mm256_add_ps(even_z, delta_even_from_p2_z); bend_lambda_odd = new_lambda; } } // Store back direct (no encode, since transposed) _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z); // Store lambdas direct (transposed) _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even); _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd); _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 0], bend_lambda_even); _mm256_store_ps(&cns->ptc_bend_lambda[b_base + 8], bend_lambda_odd); } // Step 4: Sphere Collisions with Friction - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int s_base = c * SPH_PER_CHN; @@ -893,15 +1274,16 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); // Even block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); @@ -913,7 +1295,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); @@ -930,18 +1312,86 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec)); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); } // Odd block { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec)); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); @@ -965,26 +1415,72 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z); } } } // Step 5: Motion Constraints - duplicated for even/odd blocks for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; // Even block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]); __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]); __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]); __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]); __m256 dx = _mm256_sub_ps(px, rx); __m256 dy = _mm256_sub_ps(py, ry); __m256 dz = _mm256_sub_ps(pz, rz); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(dist, r_vec); __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 corr_factor = _mm256_mul_ps(pen, inv_dist); corr_factor = _mm256_and_ps(corr_factor, mask); __m256 dt_x = _mm256_mul_ps(dx, corr_factor); __m256 dt_y = _mm256_mul_ps(dy, corr_factor); __m256 dt_z = _mm256_mul_ps(dz, corr_factor); dt_x = _mm256_mul_ps(dt_x, pm); dt_y = _mm256_mul_ps(dt_y, pm); dt_z = _mm256_mul_ps(dt_z, pm); _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z)); } // Odd block { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 8]); __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 8]); __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 8]); __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 8]); __m256 dx = _mm256_sub_ps(px, rx); __m256 dy = _mm256_sub_ps(py, ry); @@ -995,7 +1491,7 @@ chn_sim_run(struct chn_sim *cns, float dt) { dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); __m256 pen = _mm256_sub_ps(dist, r_vec); __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); @@ -1011,9 +1507,9 @@ chn_sim_run(struct chn_sim *cns, float dt) { dt_y = _mm256_mul_ps(dt_y, pm); dt_z = _mm256_mul_ps(dt_z, pm); _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); } } } -
vurtun revised this gist
Oct 9, 2025 . 3 changed files with 97 additions and 1620 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,7 +12,7 @@ enum { MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 12, }; struct chn_cfg { float damping; // [0, 1] @@ -315,6 +315,13 @@ chn_sim_run(struct chn_sim *cns, float dt) { const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256i idx_p0_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0); const __m256i idx_p1_even = _mm256_set_epi32(15, 13, 11, 9, 7, 5, 3, 1); const __m256i idx_rest_even = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0); const __m256i idx_p0_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1); // Use -1 or 0 for dummy lane, masked anyway const __m256i idx_p1_odd = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, -1); // Adjusted for consistency (dummy last) const __m256i idx_rest_odd = _mm256_set_epi32(13, 11, 9, 7, 5, 3, 1, -1); const __m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)); // Or load from array as before // Local arrays for PBD solver float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); @@ -689,18 +696,19 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&pm_lcl[i], pm); } for (int iter = 0; iter < MAX_ITR; ++iter) { // Color 0: even constraints (0-1, 2-3, ..., 14-15) { __m256 p0x = _mm256_i32gather_ps(px_lcl, idx_p0_even, 4); __m256 p0y = _mm256_i32gather_ps(py_lcl, idx_p0_even, 4); __m256 p0z = _mm256_i32gather_ps(pz_lcl, idx_p0_even, 4); __m256 p0m = _mm256_i32gather_ps(pm_lcl, idx_p0_even, 4); __m256 p1x = _mm256_i32gather_ps(px_lcl, idx_p1_even, 4); __m256 p1y = _mm256_i32gather_ps(py_lcl, idx_p1_even, 4); __m256 p1z = _mm256_i32gather_ps(pz_lcl, idx_p1_even, 4); __m256 p1m = _mm256_i32gather_ps(pm_lcl, idx_p1_even, 4); __m256 rest_len_vec = _mm256_i32gather_ps(&cns->ptc_rest_len[r_base], idx_rest_even, 4); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -742,26 +750,53 @@ chn_sim_run(struct chn_sim *cns, float dt) { dt_y = _mm256_and_ps(dt_y, apply_corr_mask); dt_z = _mm256_and_ps(dt_z, apply_corr_mask); __m256 new_p0x = _mm256_fmadd_ps(dt_x, p0m, p0x); __m256 new_p0y = _mm256_fmadd_ps(dt_y, p0m, p0y); __m256 new_p0z = _mm256_fmadd_ps(dt_z, p0m, p0z); __m256 new_p1x = _mm256_fnmadd_ps(dt_x, p1m, p1x); __m256 new_p1y = _mm256_fnmadd_ps(dt_y, p1m, p1y); __m256 new_p1z = _mm256_fnmadd_ps(dt_z, p1m, p1z); float temp_p0x[8] __attribute__((aligned(32))); float temp_p0y[8] __attribute__((aligned(32))); float temp_p0z[8] __attribute__((aligned(32))); float temp_p1x[8] __attribute__((aligned(32))); float temp_p1y[8] __attribute__((aligned(32))); float temp_p1z[8] __attribute__((aligned(32))); _mm256_store_ps(temp_p0x, new_p0x); _mm256_store_ps(temp_p0y, new_p0y); _mm256_store_ps(temp_p0z, new_p0z); _mm256_store_ps(temp_p1x, new_p1x); _mm256_store_ps(temp_p1y, new_p1y); _mm256_store_ps(temp_p1z, new_p1z); for (int l = 0; l < 8; ++l) { int idx_p0 = l * 2; px_lcl[idx_p0] = temp_p0x[l]; py_lcl[idx_p0] = temp_p0y[l]; pz_lcl[idx_p0] = temp_p0z[l]; int idx_p1 = l * 2 + 1; px_lcl[idx_p1] = temp_p1x[l]; py_lcl[idx_p1] = temp_p1y[l]; pz_lcl[idx_p1] = temp_p1z[l]; } } // Color 1: odd constraints (1-2, 3-4, ..., 13-14) { __m256 p0x = _mm256_i32gather_ps(px_lcl, idx_p0_odd, 4); __m256 p0y = _mm256_i32gather_ps(py_lcl, idx_p0_odd, 4); __m256 p0z = _mm256_i32gather_ps(pz_lcl, idx_p0_odd, 4); __m256 p0m = _mm256_i32gather_ps(pm_lcl, idx_p0_odd, 4); __m256 p1x = _mm256_i32gather_ps(px_lcl, idx_p1_odd, 4); __m256 p1y = _mm256_i32gather_ps(py_lcl, idx_p1_odd, 4); __m256 p1z = _mm256_i32gather_ps(pz_lcl, idx_p1_odd, 4); __m256 p1m = _mm256_i32gather_ps(pm_lcl, idx_p1_odd, 4); __m256 rest_len_vec = _mm256_i32gather_ps(&cns->ptc_rest_len[r_base], idx_rest_odd, 4); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -790,9 +825,6 @@ chn_sim_run(struct chn_sim *cns, float dt) { __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part); __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist); @@ -807,13 +839,39 @@ chn_sim_run(struct chn_sim *cns, float dt) { dt_y = _mm256_and_ps(dt_y, apply_corr_mask); dt_z = _mm256_and_ps(dt_z, apply_corr_mask); __m256 new_p0x = _mm256_fmadd_ps(dt_x, p0m, p0x); __m256 new_p0y = _mm256_fmadd_ps(dt_y, p0m, p0y); __m256 new_p0z = _mm256_fmadd_ps(dt_z, p0m, p0z); __m256 new_p1x = _mm256_fnmadd_ps(dt_x, p1m, p1x); __m256 new_p1y = _mm256_fnmadd_ps(dt_y, p1m, p1y); __m256 new_p1z = _mm256_fnmadd_ps(dt_z, p1m, p1z); float temp_p0x[8] __attribute__((aligned(32))); float temp_p0y[8] __attribute__((aligned(32))); float temp_p0z[8] __attribute__((aligned(32))); float temp_p1x[8] __attribute__((aligned(32))); float temp_p1y[8] __attribute__((aligned(32))); float temp_p1z[8] __attribute__((aligned(32))); _mm256_store_ps(temp_p0x, new_p0x); _mm256_store_ps(temp_p0y, new_p0y); _mm256_store_ps(temp_p0z, new_p0z); _mm256_store_ps(temp_p1x, new_p1x); _mm256_store_ps(temp_p1y, new_p1y); _mm256_store_ps(temp_p1z, new_p1z); for (int l = 1; l < 8; ++l) { int idx_p0 = (l - 1) * 2 + 1; px_lcl[idx_p0] = temp_p0x[l]; py_lcl[idx_p0] = temp_p0y[l]; pz_lcl[idx_p0] = temp_p0z[l]; int idx_p1 = (l - 1) * 2 + 2; px_lcl[idx_p1] = temp_p1x[l]; py_lcl[idx_p1] = temp_p1y[l]; pz_lcl[idx_p1] = temp_p1z[l]; } } } for (int i = 0; i < PTC_PER_CHN; i += 8) { @@ -958,4 +1016,4 @@ chn_sim_run(struct chn_sim *cns, float dt) { _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z)); } } } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,944 +0,0 @@ This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,637 +0,0 @@ -
vurtun revised this gist
Jul 13, 2025 . 1 changed file with 0 additions and 1064 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,1064 +0,0 @@ -
vurtun renamed this gist
Jul 12, 2025 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
vurtun revised this gist
Jul 12, 2025 . 2 changed files with 944 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,944 @@ #include <arm_neon.h> #include <math.h> #include <assert.h> #include <string.h> #include <float.h> enum { MAX_CHNS = 64, PTC_PER_CHN = 16, SPH_PER_CHN = 8, MAX_PTC = (MAX_CHNS * PTC_PER_CHN), MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 16, }; struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] float stretch_factor; float max_strain; float friction; // [0, 1] float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force }; struct chn_sim { unsigned char free_idx_cnt; unsigned char free_idx[MAX_CHNS]; struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32))); // chain global forces (world space gravity and wind) float chn_grav_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_z[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32))); // chain transformations (world space) float chn_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_w[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32))); // particle positions (model space) float ptc_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); float sph_y[MAX_SPH] __attribute__((aligned(32))); float sph_z[MAX_SPH] __attribute__((aligned(32))); float sph_r[MAX_SPH] __attribute__((aligned(32))); // rest positions (model space) float rest_pos_x[MAX_PTC] __attribute__((aligned(32))); float rest_pos_y[MAX_PTC] __attribute__((aligned(32))); float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; extern void chn_sim_init(struct chn_sim *cns) { assert(cns); cns->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cns->free_idx[i] = MAX_CHNS - i - 1; } for (int i = 0; i < MAX_CHNS; i += 4) { vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f)); vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f)); } } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { assert(cns); assert(cfg); assert(cns->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); assert(cnt > 1); int chn = cns->free_idx[--cns->free_idx_cnt]; cns->chn_cfg[chn] = *cfg; int p_idx = (chn * PTC_PER_CHN); int r_idx = (chn * CON_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2]; cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3]; cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->ptc_pos_x[p_idx + i] = 0.0f; cns->ptc_pos_y[p_idx + i] = 0.0f; cns->ptc_pos_z[p_idx + i] = 0.0f; } for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); } for (int i = cnt; i < CON_PER_CHN; ++i) { cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; } return chn; } extern void chn_sim_del(struct chn_sim *cns, int chn) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); int p_idx = (chn * PTC_PER_CHN); int s_idx = (chn * SPH_PER_CHN); int c_idx = (chn * CON_PER_CHN); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); cns->chn_grav_x[chn] = 0.0f; cns->chn_grav_y[chn] = 0.0f; cns->chn_grav_z[chn] = 0.0f; cns->chn_wnd_x[chn] = 0.0f; cns->chn_wnd_y[chn] = 0.0f; cns->chn_wnd_z[chn] = 0.0f; cns->chn_pos_x[chn] = 0.0f; cns->chn_pos_y[chn] = 0.0f; cns->chn_pos_z[chn] = 0.0f; cns->chn_quat_x[chn] = 0.0f; cns->chn_quat_y[chn] = 0.0f; cns->chn_quat_z[chn] = 0.0f; cns->chn_quat_w[chn] = 1.0f; cns->chn_prev_pos_x[chn] = 0.0f; cns->chn_prev_pos_y[chn] = 0.0f; cns->chn_prev_pos_z[chn] = 0.0f; cns->chn_prev_quat_x[chn] = 0.0f; cns->chn_prev_quat_y[chn] = 0.0f; cns->chn_prev_quat_z[chn] = 0.0f; cns->chn_prev_quat_w[chn] = 1.0f; for (int i = 0; i < PTC_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f)); } // Spheres vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f)); // Loop by 4 for NEON for (int i = 0; i < CON_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f)); } } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = rot4[3]; } extern void chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; } extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { assert(cns); assert(grav3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_grav_x[chn] = grav3[0]; cns->chn_grav_y[chn] = grav3[1]; cns->chn_grav_z[chn] = grav3[2]; } extern void chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { assert(cns); assert(wind3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_wnd_x[chn] = wind3[0]; cns->chn_wnd_y[chn] = wind3[1]; cns->chn_wnd_z[chn] = wind3[2]; } extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(cns); assert(spheres); assert(cnt < MAX_SPH); int s_idx = (chn * SPH_PER_CHN); for (int i = 0; i < cnt; i++) { cns->sph_x[s_idx + i] = spheres[i*4+0]; cns->sph_y[s_idx + i] = spheres[i*4+1]; cns->sph_z[s_idx + i] = spheres[i*4+2]; cns->sph_r[s_idx + i] = spheres[i*4+3]; } for (int i = cnt; i < SPH_PER_CHN; ++i) { cns->sph_x[s_idx + i] = 0; cns->sph_y[s_idx + i] = 0; cns->sph_z[s_idx + i] = 0; cns->sph_r[s_idx + i] = 0; } } extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); assert(cnt < PTC_PER_CHN); int p_idx = (chn * PTC_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->motion_radius[p_idx + i] = radius[i]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->motion_radius[p_idx + i] = FLT_MAX; } } extern void chn_sim_run(struct chn_sim *cns, float dt) { // NEON constants const float32x4_t dt_vec = vdupq_n_f32(dt); const float32x4_t dt2_vec = vdupq_n_f32(dt * dt); const float32x4_t one_vec = vdupq_n_f32(1.0f); const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f); const float32x4_t eps_vec = vdupq_n_f32(1e-6f); const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f); const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f); const float32x4_t zero_vec = vdupq_n_f32(0.0f); const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt); const float32x4_t two_vec = vdupq_n_f32(2.0f); const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f); const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f); // NEON doesn't have a direct equivalent to _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)) // for absolute mask. We'll use `vabsq_f32` where needed or reconstruct the logic. // Local arrays for PBD solver float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_lcl[PTC_PER_CHN] = 0.0f; py_lcl[PTC_PER_CHN] = 0.0f; pz_lcl[PTC_PER_CHN] = 0.0f; pm_lcl[PTC_PER_CHN] = 0.0f; // Stack arrays for precomputed data float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float vel_x[MAX_CHNS] __attribute__((aligned(32))); float vel_y[MAX_CHNS] __attribute__((aligned(32))); float vel_z[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); // Step 0: Precomputation for (int c = 0; c < MAX_CHNS; c += 4) { // Load chain quaternions float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]); float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]); float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]); float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) float32x4_t cqx = vnegq_f32(qx); // -qx float32x4_t cqy = vnegq_f32(qy); // -qy float32x4_t cqz = vnegq_f32(qz); // -qz // Compute local-space wind { float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]); float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]); float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: res_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // res_x = tw*qx + tx*qw + ty*qz - tz*qy // res_y = tw*qy + ty*qw + tz*qx - tx*qz // res_z = tw*qz + ty*qw + tx*qy - ty*qx float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw); res_x = vmlsq_f32(res_x, tz, qy); res_x = vmlaq_f32(res_x, ty, qz); // Corrected: tw*qx + tx*qw + ty*qz - tz*qy float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw); res_y = vmlsq_f32(res_y, tx, qz); res_y = vmlaq_f32(res_y, tz, qx); // Corrected: tw*qy + ty*qw + tz*qx - tx*qz float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw); res_z = vmlsq_f32(res_z, ty, qx); res_z = vmlaq_f32(res_z, tx, qy); // Corrected: tw*qz + tz*qw + tx*qy - ty*qx vst1q_f32(&chn_wnd_lcl_x[c], res_x); vst1q_f32(&chn_wnd_lcl_y[c], res_y); vst1q_f32(&chn_wnd_lcl_z[c], res_z); } // Compute local-space gravity { float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]); float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]); float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]); // Step 1: t = q_conj * v_world_as_quat float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); tx = vmlsq_f32(tx, cqz, vy); float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); ty = vmlsq_f32(ty, cqx, vz); float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); tz = vmlsq_f32(tz, cqy, vx); float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); tw = vmlsq_f32(tw, cqz, vz); // Step 2: res_vec = (t * q)_vec float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw); res_x = vmlsq_f32(res_x, tz, qy); res_x = vmlaq_f32(res_x, ty, qz); float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw); res_y = vmlsq_f32(res_y, tx, qz); res_y = vmlaq_f32(res_y, tz, qx); float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw); res_z = vmlsq_f32(res_z, ty, qx); res_z = vmlaq_f32(res_z, tx, qy); vst1q_f32(&chn_grav_lcl_x[c], res_x); vst1q_f32(&chn_grav_lcl_y[c], res_y); vst1q_f32(&chn_grav_lcl_z[c], res_z); } // Compute linear velocity { float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]); float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]); float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]); float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]); float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]); float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]); float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec); float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec); float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec); vst1q_f32(&vel_x[c], vel_x_vec); vst1q_f32(&vel_y[c], vel_y_vec); vst1q_f32(&vel_z[c], vel_z_vec); } // Compute angular velocity { float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]); float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]); float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]); float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]); float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]); float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]); float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]); float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]); // Step 1: Compute delta quaternion (to - from) float32x4_t dt_qx = vsubq_f32(qx, prev_qx); float32x4_t dt_qy = vsubq_f32(qy, prev_qy); float32x4_t dt_qz = vsubq_f32(qz, prev_qz); float32x4_t dt_qw = vsubq_f32(qw, prev_qw); // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) float32x4_t conj_qx = vnegq_f32(prev_qx); float32x4_t conj_qy = vnegq_f32(prev_qy); float32x4_t conj_qz = vnegq_f32(prev_qz); float32x4_t conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx); orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy); orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz); float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy); float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz); float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx); float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw); orient_qx = vaddq_f32(orient_qx, cross_x); float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw); orient_qy = vaddq_f32(orient_qy, cross_y); float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw); orient_qz = vaddq_f32(orient_qz, cross_z); // Step 4: Compute dot product (to, from) for shortest arc check float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx); dot = vmlaq_f32(dot, qy, prev_qy); dot = vmlaq_f32(dot, qz, prev_qz); // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // 1 if dot < 0, 0 otherwise float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false orient_qx = vmulq_f32(orient_qx, sign_selector); orient_qy = vmulq_f32(orient_qy, sign_selector); orient_qz = vmulq_f32(orient_qz, sign_selector); // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) // Inlined vrecip_f32_high_precision float32x4_t rcp_two_dt; rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec)); rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt)); rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt)); float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt); float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt); float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt); // Store results vst1q_f32(&ang_vel_x[c], ang_vel_x_vec); vst1q_f32(&ang_vel_y[c], ang_vel_y_vec); vst1q_f32(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; // Load precomputed velocities and inertia parameters float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]); float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]); float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]); float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]); float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]); float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]); float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia); float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia); float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia); // Clamp inertia parameters to [0, 1] linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]); float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]); float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]); float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]); p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec); float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec); float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec); // Angular inertia: (ω × r) * dt * angular_inertia float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py); float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz); float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px); ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec); ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec); ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec); // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py); float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz); float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px); float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y); float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z); float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x); // Calculate displacement: (ω × (ω × r)) * dt^2 float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec); float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec); float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec); // Apply the centrifugal_inertia factor float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec); float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec); float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec); // Total delta float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x); float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y); float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z); // Update positions vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x)); vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y)); vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z)); } // Update previous transformation cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; cns->chn_prev_pos_z[c] = cns->chn_pos_z[c]; cns->chn_prev_quat_x[c] = cns->chn_quat_x[c]; cns->chn_prev_quat_y[c] = cns->chn_quat_y[c]; cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]); float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]); float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]); float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]); float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]); float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]); float32x4_t damping_vec = vdupq_n_f32(cfg->damping); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]); float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]); float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]); float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]); ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]); float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]); float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]); float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec); float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec); float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec); float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass); float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass); float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass); float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x); float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y); float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z); float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec); float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec); float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec); float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec); float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec); float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec); float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x)); float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y)); float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z)); vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x); vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y); vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z); vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x); vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y); vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z); } } // Step 3: Distance Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness); float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor); float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain); float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec); // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i])); vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i])); vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i])); vst1q_f32(&pm_lcl[i], pm); } for (int iter = 0; iter < MAX_ITR; ++iter) { // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p0x = vld1q_f32(&px_lcl[i]); float32x4_t p0y = vld1q_f32(&py_lcl[i]); float32x4_t p0z = vld1q_f32(&pz_lcl[i]); float32x4_t p0m = vld1q_f32(&pm_lcl[i]); float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]); float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]); float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]); float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]); float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]); float32x4_t dx = vsubq_f32(p1x, p0x); float32x4_t dy = vsubq_f32(p1y, p0y); float32x4_t dz = vsubq_f32(p1z, p0z); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist; float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec); inv_dist = vrsqrteq_f32(x_rsqrt); inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist))); inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist))); float32x4_t w_sum = vaddq_f32(p0m, p1m); float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec); float32x4_t rcp_w_sum; rcp_w_sum = vrecpeq_f32(w_sum_clamped); rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum)); rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum)); float32x4_t dist; dist = vrecpeq_f32(inv_dist); dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist)); dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist)); float32x4_t diff = vsubq_f32(dist, rest_len_vec); float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec); float32x4_t rcp_max_rest_len_eps; rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps); rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps)); rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps)); float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps); float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs)); float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped)); float32x4_t rcp_dyn_stiff_denom; rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom); rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom)); rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom)); float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom); float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec); corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum); float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part); uint32x4_t apply_corr_mask = vcgtq_f32(dist_sq, eps_vec); // 1 if dist_sq > eps_vec, 0 otherwise float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist); float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist); float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist); float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude); float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude); float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude); dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0 dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec); dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec); vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m)); vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m)); vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m)); vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m)); vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m)); vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m)); } } for (int i = 0; i < PTC_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i])); vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i])); vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i])); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int s_base = c * SPH_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t fric_vec = vdupq_n_f32(cfg->friction); for (int s = 0; s < SPH_PER_CHN; ++s) { float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]); float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]); float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]); float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]); float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]); float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]); float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]); float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]); float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]); float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec); float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec); float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist_sphere; float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec); inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere); inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere))); inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere))); float32x4_t inv_dist = inv_dist_sphere; float32x4_t dist_sphere; dist_sphere = vrecpeq_f32(inv_dist); dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere)); dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere)); float32x4_t dist = dist_sphere; float32x4_t pen = vsubq_f32(sph_r_vec, dist); uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise float32x4_t norm_x = vmulq_f32(dx, inv_dist); float32x4_t norm_y = vmulq_f32(dy, inv_dist); float32x4_t norm_z = vmulq_f32(dz, inv_dist); float32x4_t norm_corr_x = vmulq_f32(norm_x, pen); float32x4_t norm_corr_y = vmulq_f32(norm_y, pen); float32x4_t norm_corr_z = vmulq_f32(norm_z, pen); float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x); float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y); float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z); float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x); vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y); vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z); float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x); float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y); float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z); float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x); tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y); tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z); float32x4_t inv_tan_mag_sphere; float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec); inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag); inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere))); inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere))); float32x4_t inv_tan_mag = inv_tan_mag_sphere; float32x4_t fric_mag = vmulq_f32(pen, fric_vec); float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag); float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag); float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag); float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x); float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y); float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z); total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec); total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec); total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec); total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass); total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass); total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass); float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x); float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y); float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z); vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x); vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y); vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z); } } } // Step 5: Motion Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]); float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]); float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]); float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]); float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]); float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]); float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]); float32x4_t dx = vsubq_f32(px, rx); float32x4_t dy = vsubq_f32(py, ry); float32x4_t dz = vsubq_f32(pz, rz); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist_motion; float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec); inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion); inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion))); inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion))); float32x4_t inv_dist = inv_dist_motion; float32x4_t dist_motion; dist_motion = vrecpeq_f32(inv_dist); dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion)); dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion)); float32x4_t dist = dist_motion; float32x4_t pen = vsubq_f32(dist, r_vec); uint32x4_t mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise float32x4_t corr_factor = vmulq_f32(pen, inv_dist); corr_factor = vbslq_f32(mask, corr_factor, zero_vec); float32x4_t dt_x = vmulq_f32(dx, corr_factor); float32x4_t dt_y = vmulq_f32(dy, corr_factor); float32x4_t dt_z = vmulq_f32(dz, corr_factor); dt_x = vmulq_f32(dt_x, pm); dt_y = vmulq_f32(dt_y, pm); dt_z = vmulq_f32(dt_z, pm); vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x)); vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y)); vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z)); } } } File renamed without changes. -
vurtun revised this gist
Jul 11, 2025 . 1 changed file with 30 additions and 29 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -81,8 +81,8 @@ struct chn_sim { float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; extern void chn_sim_init(struct chn_sim *cns) { assert(cns); cns->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cns->free_idx[i] = MAX_CHNS - i - 1; @@ -93,8 +93,8 @@ chn_sim_init(struct chn_sim *chs) { } } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { assert(cns); assert(cfg); assert(cns->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); @@ -106,23 +106,24 @@ chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restric int p_idx = (chn * PTC_PER_CHN); int r_idx = (chn * CON_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2]; cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3]; cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->ptc_pos_x[p_idx + i] = 0.0f; cns->ptc_pos_y[p_idx + i] = 0.0f; cns->ptc_pos_z[p_idx + i] = 0.0f; } for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); } for (int i = cnt; i < CON_PER_CHN; ++i) { @@ -133,8 +134,8 @@ chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restric return chn; } extern void chn_sim_del(struct chn_sim *cns, int chn) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); @@ -207,8 +208,8 @@ chn_sim_del(struct chn_sim *chs, int chn) { _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); @@ -224,8 +225,8 @@ chn_sim_mov(struct chn_sim *chs, int chn, const float *restrict pos3, const floa cns->chn_quat_w[chn] = rot4[3]; } extern void chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); @@ -241,8 +242,8 @@ chn_sim_tel(struct chn_sim *chs, int chn, const float *restrict pos3, const floa cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; } extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { assert(cns); assert(grav3); assert(chn >= 0); assert(chn < MAX_CHNS); @@ -252,8 +253,8 @@ chn_sim_grav(struct chn_sim *chs, int chn, const float *restrict grav3) { cns->chn_grav_z[chn] = grav3[2]; } extern void chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { assert(cns); assert(wind3); assert(chn >= 0); assert(chn < MAX_CHNS); @@ -263,8 +264,8 @@ chn_sim_wind(struct chn_sim *chs, int chn, const float *restrict wind3) { cns->chn_wnd_z[chn] = wind3[2]; } extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(cns); assert(spheres); assert(cnt < MAX_SPH); @@ -283,8 +284,8 @@ chn_sim_set_sph(struct chn_sim *chs, int chn, const float *spheres, int cnt) { } } extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); @@ -299,7 +300,7 @@ chn_sim_con_motion(struct chn_sim *chs, int chn, const float *radius, int cnt) { } } extern void chn_sim_run(struct chn_sim *cns, float dt) { // SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); -
vurtun revised this gist
Jul 11, 2025 . 1 changed file with 249 additions and 249 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,7 +12,7 @@ enum { MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 16, }; struct chn_cfg { float damping; // [0, 1] @@ -81,225 +81,225 @@ struct chn_sim { float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; extern void chn_sim_init(struct chn_sim *chs) { assert(chs); cns->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cns->free_idx[i] = MAX_CHNS - i - 1; } for (int i = 0; i < MAX_CHNS; i += 8) { _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); } } extern int chn_sim_add(struct chn_sim *chs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { assert(chs); assert(cfg); assert(cns->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); assert(cnt > 1); int chn = cns->free_idx[--cns->free_idx_cnt]; cns->chn_cfg[chn] = *cfg; int p_idx = (chn * PTC_PER_CHN); int r_idx = (chn * CON_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->ptc_pos_x[p_idx + i] = rest_pos[i*3+0]; cns->ptc_pos_y[p_idx + i] = rest_pos[i*3+1]; cns->ptc_pos_z[p_idx + i] = rest_pos[i*3+2]; cns->rest_pos_x[p_idx + i] = rest_pos[i*3+0]; cns->rest_pos_y[p_idx + i] = rest_pos[i*3+1]; cns->rest_pos_z[p_idx + i] = rest_pos[i*3+2]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->ptc_pos_x[p_idx + i] = 0.0f; cns->ptc_pos_y[p_idx + i] = 0.0f; cns->ptc_pos_z[p_idx + i] = 0.0f; } for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*3+0] - rest_pos[(i-1)*3+0]; float dy = rest_pos[i*3+1] - rest_pos[(i-1)*3+1]; float dz = rest_pos[i*3+2] - rest_pos[(i-1)*3+2]; cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); } for (int i = cnt; i < CON_PER_CHN; ++i) { cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; } return chn; } extern void chn_sim_del(struct chn_sim *chs, int chn) { assert(chs); assert(chn >= 0); assert(chn < MAX_CHNS); int p_idx = (chn * PTC_PER_CHN); int s_idx = (chn * SPH_PER_CHN); int c_idx = (chn * CON_PER_CHN); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); cns->chn_grav_x[chn] = 0.0f; cns->chn_grav_y[chn] = 0.0f; cns->chn_grav_z[chn] = 0.0f; cns->chn_wnd_x[chn] = 0.0f; cns->chn_wnd_y[chn] = 0.0f; cns->chn_wnd_z[chn] = 0.0f; cns->chn_pos_x[chn] = 0.0f; cns->chn_pos_y[chn] = 0.0f; cns->chn_pos_z[chn] = 0.0f; cns->chn_quat_x[chn] = 0.0f; cns->chn_quat_y[chn] = 0.0f; cns->chn_quat_z[chn] = 0.0f; cns->chn_quat_w[chn] = 1.0f; cns->chn_prev_pos_x[chn] = 0.0f; cns->chn_prev_pos_y[chn] = 0.0f; cns->chn_prev_pos_z[chn] = 0.0f; cns->chn_prev_quat_x[chn] = 0.0f; cns->chn_prev_quat_y[chn] = 0.0f; cns->chn_prev_quat_z[chn] = 0.0f; cns->chn_prev_quat_w[chn] = 1.0f; _mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *chs, int chn, const float *restrict pos3, const float *restrict rot4) { assert(chs); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = rot4[3]; } extern void chn_sim_tel(struct chn_sim *chs, int chn, const float *restrict pos3, const float *restrict rot4) { assert(chs); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; } extern void chn_sim_grav(struct chn_sim *chs, int chn, const float *restrict grav3) { assert(chs); assert(grav3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_grav_x[chn] = grav3[0]; cns->chn_grav_y[chn] = grav3[1]; cns->chn_grav_z[chn] = grav3[2]; } extern void chn_sim_wind(struct chn_sim *chs, int chn, const float *restrict wind3) { assert(chs); assert(wind3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_wnd_x[chn] = wind3[0]; cns->chn_wnd_y[chn] = wind3[1]; cns->chn_wnd_z[chn] = wind3[2]; } extern void chn_sim_set_sph(struct chn_sim *chs, int chn, const float *spheres, int cnt) { assert(chs); assert(spheres); assert(cnt < MAX_SPH); int s_idx = (chn * SPH_PER_CHN); for (int i = 0; i < cnt; i++) { cns->sph_x[s_idx + i] = spheres[i*4+0]; cns->sph_y[s_idx + i] = spheres[i*4+1]; cns->sph_z[s_idx + i] = spheres[i*4+2]; cns->sph_r[s_idx + i] = spheres[i*4+3]; } for (int i = cnt; i < SPH_PER_CHN; ++i) { cns->sph_x[s_idx + i] = 0; cns->sph_y[s_idx + i] = 0; cns->sph_z[s_idx + i] = 0; cns->sph_r[s_idx + i] = 0; } } extern void chn_sim_con_motion(struct chn_sim *chs, int chn, const float *radius, int cnt) { assert(chs); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); assert(cnt < PTC_PER_CHN); int p_idx = (chn * PTC_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->motion_radius[p_idx + i] = radius[i]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->motion_radius[p_idx + i] = FLT_MAX; } } extern void chn_sim_run(struct chn_sim *chs, float dt) { // SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); @@ -347,10 +347,10 @@ chn_sim_run(struct chn_sim *cs, float dt) { // Step 0: Precomputation (SIMD, 8 chains at once) for (int c = 0; c < MAX_CHNS; c += 8) { // Load chain quaternions __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]); __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx @@ -359,9 +359,9 @@ chn_sim_run(struct chn_sim *cs, float dt) { // Compute local-space wind { __m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]); __m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]); __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) @@ -400,9 +400,9 @@ chn_sim_run(struct chn_sim *cs, float dt) { } // Compute local-space gravity { __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]); __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]); __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) @@ -443,13 +443,13 @@ chn_sim_run(struct chn_sim *cs, float dt) { } // Compute linear velocity { __m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]); __m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]); __m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]); __m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]); __m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]); __m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]); __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); @@ -461,15 +461,15 @@ chn_sim_run(struct chn_sim *cs, float dt) { } // Compute angular velocity { __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]); __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]); __m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]); __m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]); __m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]); __m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]); // Step 1: Compute delta quaternion (to - from) __m256 dt_qx = _mm256_sub_ps(qx, prev_qx); @@ -530,7 +530,7 @@ chn_sim_run(struct chn_sim *cs, float dt) { // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; // Load precomputed velocities and inertia parameters __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); @@ -551,10 +551,10 @@ chn_sim_run(struct chn_sim *cs, float dt) { centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]); __m256 p_ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]); p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia @@ -596,24 +596,24 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z); // Update positions _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z)); } // Update previous transformation cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; cns->chn_prev_pos_z[c] = cns->chn_pos_z[c]; cns->chn_prev_quat_x[c] = cns->chn_quat_x[c]; cns->chn_prev_quat_y[c] = cns->chn_quat_y[c]; cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]); __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]); @@ -625,15 +625,15 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 damping_vec = _mm256_set1_ps(cfg->damping); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]); __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec); __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec); @@ -659,32 +659,32 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z); _mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x); _mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y); _mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z); } } // Step 3: Distance Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor); __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain); __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i])); _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i])); _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i])); _mm256_store_ps(&pm_lcl[i], pm); } for (int iter = 0; iter < MAX_ITR; ++iter) { @@ -699,7 +699,7 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 p1y = _mm256_loadu_ps(&py_lcl[1]); __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]); __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]); __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -760,7 +760,7 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 p1y = _mm256_loadu_ps(&py_lcl[9]); __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]); __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]); __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -816,33 +816,33 @@ chn_sim_run(struct chn_sim *cs, float dt) { } } for (int i = 0; i < PTC_PER_CHN; i += 8) { _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i])); _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i])); _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i])); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int s_base = c * SPH_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; __m256 fric_vec = _mm256_set1_ps(cfg->friction); for (int s = 0; s < SPH_PER_CHN; ++s) { __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]); __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]); __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]); __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + i]); __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + i]); __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + i]); __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); @@ -906,26 +906,26 @@ chn_sim_run(struct chn_sim *cs, float dt) { __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cns->ptc_pos_x[p_base + i], new_p_x); _mm256_store_ps(&cns->ptc_pos_y[p_base + i], new_p_y); _mm256_store_ps(&cns->ptc_pos_z[p_base + i], new_p_z); } } } // Step 5: Motion Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]); __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]); __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]); __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + i]); __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + i]); __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + i]); __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + i]); __m256 dx = _mm256_sub_ps(px, rx); __m256 dy = _mm256_sub_ps(py, ry); @@ -952,9 +952,9 @@ chn_sim_run(struct chn_sim *cs, float dt) { dt_y = _mm256_mul_ps(dt_y, pm); dt_z = _mm256_mul_ps(dt_z, pm); _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z)); } } } -
vurtun revised this gist
Jul 11, 2025 . 1 changed file with 611 additions and 409 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,96 +1,313 @@ #include <immintrin.h> #include <math.h> #include <assert.h> #include <string.h> #include <float.h> enum { MAX_CHNS = 64, PTC_PER_CHN = 16, SPH_PER_CHN = 8, MAX_PTC = (MAX_CHNS * PTC_PER_CHN), MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 8, }; struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] float stretch_factor; float max_strain; float friction; // [0, 1] float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force }; struct chn_sim { unsigned char free_idx_cnt; unsigned char free_idx[MAX_CHNS]; struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32))); // chain global forces (world space gravity and wind) float chn_grav_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_z[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32))); // chain transformations (world space) float chn_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_w[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32))); // particle positions (model space) float ptc_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); float sph_y[MAX_SPH] __attribute__((aligned(32))); float sph_z[MAX_SPH] __attribute__((aligned(32))); float sph_r[MAX_SPH] __attribute__((aligned(32))); // rest positions (model space) float rest_pos_x[MAX_PTC] __attribute__((aligned(32))); float rest_pos_y[MAX_PTC] __attribute__((aligned(32))); float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; extern void chn_sim_init(struct chn_sim *cs) { assert(cs); cs->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cs->free_idx[i] = MAX_CHNS - i - 1; } for (int i = 0; i < MAX_CHNS; i += 8) { _mm256_store_ps(&cs->chn_quat_w[i], _mm256_set1_ps(1.0f)); _mm256_store_ps(&cs->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); } } extern int chn_sim_add(struct chn_sim *cs, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { assert(cs); assert(cfg); assert(cs->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); assert(cnt > 1); int c = cs->free_idx[-cs->free_idx_cnt]; cs->chn_cfg[c] = *cfg; int p_idx = (c * PTC_PER_CHN); int r_idx = (c * CON_PER_CHN); for (int i = 0; i < cnt; ++i) { cs->ptc_pos_x[p_idx + i] = rest_pos[i*3+0]; cs->ptc_pos_y[p_idx + i] = rest_pos[i*3+1]; cs->ptc_pos_z[p_idx + i] = rest_pos[i*3+2]; cs->rest_pos_x[p_idx + i] = rest_pos[i*3+0]; cs->rest_pos_y[p_idx + i] = rest_pos[i*3+1]; cs->rest_pos_z[p_idx + i] = rest_pos[i*3+2]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cs->ptc_pos_x[p_idx + i] = 0.0f; cs->ptc_pos_y[p_idx + i] = 0.0f; cs->ptc_pos_z[p_idx + i] = 0.0f; } for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*3+0] - rest_pos[(i-1)*3+0]; float dy = rest_pos[i*3+1] - rest_pos[(i-1)*3+1]; float dz = rest_pos[i*3+2] - rest_pos[(i-1)*3+2]; cs->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); } for (int i = cnt; i < CON_PER_CHN; ++i) { cs->ptc_rest_len[r_idx + i] = 0.0f; cs->ptc_rest_len[r_idx + i] = 0.0f; cs->ptc_rest_len[r_idx + i] = 0.0f; } return c; } extern void chn_sim_del(struct chn_sim *cs, int c) { assert(cs); assert(c >= 0); assert(c < MAX_CHNS); int p_idx = (c * PTC_PER_CHN); int s_idx = (c * SPH_PER_CHN); int c_idx = (c * CON_PER_CHN); cs->free_idx[cs->free_idx_cnt++] = c; memset(&cs->chn_cfg[c], 0, sizeof(cs->chn_cfg[0])); cs->chn_grav_x[c] = 0.0f; cs->chn_grav_y[c] = 0.0f; cs->chn_grav_z[c] = 0.0f; cs->chn_wnd_x[c] = 0.0f; cs->chn_wnd_y[c] = 0.0f; cs->chn_wnd_z[c] = 0.0f; cs->chn_pos_x[c] = 0.0f; cs->chn_pos_y[c] = 0.0f; cs->chn_pos_z[c] = 0.0f; cs->chn_quat_x[c] = 0.0f; cs->chn_quat_y[c] = 0.0f; cs->chn_quat_z[c] = 0.0f; cs->chn_quat_w[c] = 1.0f; cs->chn_prev_pos_x[c] = 0.0f; cs->chn_prev_pos_y[c] = 0.0f; cs->chn_prev_pos_z[c] = 0.0f; cs->chn_prev_quat_x[c] = 0.0f; cs->chn_prev_quat_y[c] = 0.0f; cs->chn_prev_quat_z[c] = 0.0f; cs->chn_prev_quat_w[c] = 1.0f; _mm256_store_ps(&cs->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->sph_x[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->sph_y[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->sph_z[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->sph_r[s_idx], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f)); _mm256_store_ps(&cs->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f)); } extern void chn_sim_mov(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) { assert(cs); assert(pos3); assert(rot4); assert(c >= 0); assert(c < MAX_CHNS); cs->chn_pos_x[c] = pos3[0]; cs->chn_pos_y[c] = pos3[1]; cs->chn_pos_z[c] = pos3[2]; cs->chn_quat_x[c] = rot4[0]; cs->chn_quat_y[c] = rot4[1]; cs->chn_quat_z[c] = rot4[2]; cs->chn_quat_w[c] = rot4[3]; } extern void chn_sim_tel(struct chn_sim *cs, int c, const float *restrict pos3, const float *restrict rot4) { assert(cs); assert(pos3); assert(rot4); assert(c >= 0); assert(c < MAX_CHNS); cs->chn_pos_x[c] = cs->chn_prev_pos_x[c] = pos3[0]; cs->chn_pos_y[c] = cs->chn_prev_pos_y[c] = pos3[1]; cs->chn_pos_z[c] = cs->chn_prev_pos_z[c] = pos3[2]; cs->chn_quat_x[c] = cs->chn_prev_quat_x[c] = rot4[0]; cs->chn_quat_y[c] = cs->chn_prev_quat_y[c] = rot4[1]; cs->chn_quat_z[c] = cs->chn_prev_quat_z[c] = rot4[2]; cs->chn_quat_w[c] = cs->chn_prev_quat_w[c] = rot4[3]; } extern void chn_sim_grav(struct chn_sim *cs, int c, const float *restrict g3) { assert(cs); assert(g3); assert(c >= 0); assert(c < MAX_CHNS); cs->chn_grav_x[c] = g3[0]; cs->chn_grav_y[c] = g3[1]; cs->chn_grav_z[c] = g3[2]; } extern void chn_sim_wind(struct chn_sim *cs, int c, const float *restrict w3) { assert(cs); assert(w3); assert(c >= 0); assert(c < MAX_CHNS); cs->chn_wnd_x[c] = w3[0]; cs->chn_wnd_y[c] = w3[1]; cs->chn_wnd_z[c] = w3[2]; } extern void chn_sim_set_sph(struct chn_sim *cs, int c, const float *spheres, int cnt) { assert(cs); assert(spheres); assert(cnt < MAX_SPH); int s_idx = (c * SPH_PER_CHN); for (int i = 0; i < cnt; i++) { cs->sph_x[s_idx + i] = spheres[i*4+0]; cs->sph_y[s_idx + i] = spheres[i*4+1]; cs->sph_z[s_idx + i] = spheres[i*4+2]; cs->sph_r[s_idx + i] = spheres[i*4+3]; } for (int i = cnt; i < SPH_PER_CHN; ++i) { cs->sph_x[s_idx + i] = 0; cs->sph_y[s_idx + i] = 0; cs->sph_z[s_idx + i] = 0; cs->sph_r[s_idx + i] = 0; } } extern void chn_sim_con_motion(struct chn_sim *cs, int c, const float *radius, int cnt) { assert(cs); assert(c >= 0); assert(c < MAX_CHNS); assert(cnt >= 0); assert(cnt < PTC_PER_CHN); int p_idx = (c * PTC_PER_CHN); for (int i = 0; i < cnt; ++i) { cs->motion_radius[p_idx + i] = radius[i]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cs->motion_radius[p_idx + i] = FLT_MAX; } } extern void chn_sim_run(struct chn_sim *cs, float dt) { // SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); const __m256 one_vec = _mm256_set1_ps(1.0f); const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); const __m256 eps_vec = _mm256_set1_ps(1e-6f); const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f); const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f); const __m256 zero_vec = _mm256_setzero_ps(); const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); const __m256 two_vec = _mm256_set1_ps(2.0f); @@ -99,59 +316,59 @@ void simulate_chains(struct chain_sim *cs, float dt) { const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // Local arrays for PBD solver float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_lcl[PTC_PER_CHN] = 0.0f; py_lcl[PTC_PER_CHN] = 0.0f; pz_lcl[PTC_PER_CHN] = 0.0f; pm_lcl[PTC_PER_CHN] = 0.0f; // Stack arrays for precomputed data float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float vel_x[MAX_CHNS] __attribute__((aligned(32))); float vel_y[MAX_CHNS] __attribute__((aligned(32))); float vel_z[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); // Step 0: Precomputation (SIMD, 8 chains at once) for (int c = 0; c < MAX_CHNS; c += 8) { // Load chain quaternions __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz // Compute local-space wind { __m256 vx = _mm256_load_ps(&cs->chn_wnd_x[c]); __m256 vy = _mm256_load_ps(&cs->chn_wnd_y[c]); __m256 vz = _mm256_load_ps(&cs->chn_wnd_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy @@ -165,33 +382,27 @@ void simulate_chains(struct chain_sim *cs, float dt) { tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: res_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // res_x = tw*qx + tx*qw + ty*qz - tz*qy // res_y = tw*qy + ty*qw + tz*qx - tx*qz // res_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y)); __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z)); _mm256_store_ps(&chn_wnd_lcl_x[c], res_x); _mm256_store_ps(&chn_wnd_lcl_y[c], res_y); _mm256_store_ps(&chn_wnd_lcl_z[c], res_z); } // Compute local-space gravity { __m256 vx = _mm256_load_ps(&cs->chn_grav_x[c]); __m256 vy = _mm256_load_ps(&cs->chn_grav_y[c]); __m256 vz = _mm256_load_ps(&cs->chn_grav_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) @@ -213,32 +424,32 @@ void simulate_chains(struct chain_sim *cs, float dt) { tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: res_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // res_x = tw*qx + tx*qw + ty*qz - tz*qy // res_y = tw*qy + ty*qw + tz*qx - tx*qz // res_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y)); __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z)); _mm256_store_ps(&chn_grav_lcl_x[c], res_x); _mm256_store_ps(&chn_grav_lcl_y[c], res_y); _mm256_store_ps(&chn_grav_lcl_z[c], res_z); } // Compute linear velocity { __m256 curr_x = _mm256_load_ps(&cs->chn_pos_x[c]); __m256 curr_y = _mm256_load_ps(&cs->chn_pos_y[c]); __m256 curr_z = _mm256_load_ps(&cs->chn_pos_z[c]); __m256 prev_x = _mm256_load_ps(&cs->chn_prev_pos_x[c]); __m256 prev_y = _mm256_load_ps(&cs->chn_prev_pos_y[c]); __m256 prev_z = _mm256_load_ps(&cs->chn_prev_pos_z[c]); __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); @@ -250,21 +461,21 @@ void simulate_chains(struct chain_sim *cs, float dt) { } // Compute angular velocity { __m256 qx = _mm256_load_ps(&cs->chn_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chn_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chn_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chn_quat_w[c]); __m256 prev_qx = _mm256_load_ps(&cs->chn_prev_quat_x[c]); __m256 prev_qy = _mm256_load_ps(&cs->chn_prev_quat_y[c]); __m256 prev_qz = _mm256_load_ps(&cs->chn_prev_quat_z[c]); __m256 prev_qw = _mm256_load_ps(&cs->chn_prev_quat_w[c]); // Step 1: Compute delta quaternion (to - from) __m256 dt_qx = _mm256_sub_ps(qx, prev_qx); __m256 dt_qy = _mm256_sub_ps(qy, prev_qy); __m256 dt_qz = _mm256_sub_ps(qz, prev_qz); __m256 dt_qw = _mm256_sub_ps(qw, prev_qw); // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx); @@ -274,20 +485,20 @@ void simulate_chains(struct chain_sim *cs, float dt) { // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw)); orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw); orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw); __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy)); __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz)); __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx)); __m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw)); orient_qx = _mm256_add_ps(orient_qx, cross_x); __m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw)); orient_qy = _mm256_add_ps(orient_qy, cross_y); __m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw)); orient_qz = _mm256_add_ps(orient_qz, cross_z); // Step 4: Compute dot product (to, from) for shortest arc check @@ -317,9 +528,9 @@ void simulate_chains(struct chain_sim *cs, float dt) { } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cs->chn_cfg[c]; // Load precomputed velocities and inertia parameters __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); @@ -339,26 +550,26 @@ void simulate_chains(struct chain_sim *cs, float dt) { angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 px = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]); __m256 py = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]); __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]); __m256 p_ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]); p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); // Angular inertia: (ω × r) * dt * angular_inertia __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec); ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec); ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec); // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); @@ -370,67 +581,67 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); // Calculate displacement: (ω × (ω × r)) * dt^2 __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec); __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); // Apply the centrifugal_inertia factor __m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec); __m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec); __m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec); // Total delta __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z); // Update positions _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x)); _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y)); _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z)); } // Update previous transformation cs->chn_prev_pos_x[c] = cs->chn_pos_x[c]; cs->chn_prev_pos_y[c] = cs->chn_pos_y[c]; cs->chn_prev_pos_z[c] = cs->chn_pos_z[c]; cs->chn_prev_quat_x[c] = cs->chn_quat_x[c]; cs->chn_prev_quat_y[c] = cs->chn_quat_y[c]; cs->chn_prev_quat_z[c] = cs->chn_quat_z[c]; cs->chn_prev_quat_w[c] = cs->chn_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cs->chn_cfg[c]; __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]); __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]); __m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]); __m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]); __m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]); __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]); __m256 damping_vec = _mm256_set1_ps(cfg->damping); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[base_idx + i]); __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[base_idx + i]); __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[base_idx + i]); __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[base_idx + i]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[base_idx + i]); __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[base_idx + i]); __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[base_idx + i]); __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec); __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec); __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec); __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass); __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass); __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); @@ -448,49 +659,47 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); _mm256_store_ps(&cs->ptc_pos_x[base_idx + i], new_p_x); _mm256_store_ps(&cs->ptc_pos_y[base_idx + i], new_p_y); _mm256_store_ps(&cs->ptc_pos_z[base_idx + i], new_p_z); _mm256_store_ps(&cs->ptc_prev_pos_x[base_idx + i], p_curr_x); _mm256_store_ps(&cs->ptc_prev_pos_y[base_idx + i], p_curr_y); _mm256_store_ps(&cs->ptc_prev_pos_z[base_idx + i], p_curr_z); } } // Step 3: Distance Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; struct chn_cfg *cfg = &cs->chn_cfg[c]; __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor); __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain); __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cs->ptc_pos_x[p_base + i])); _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cs->ptc_pos_y[p_base + i])); _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cs->ptc_pos_z[p_base + i])); _mm256_store_ps(&pm_lcl[i], pm); } for (int iter = 0; iter < MAX_ITR; ++iter) { // First block (i=0 to 7) { __m256 p0x = _mm256_load_ps(&px_lcl[0]); __m256 p0y = _mm256_load_ps(&py_lcl[0]); __m256 p0z = _mm256_load_ps(&pz_lcl[0]); __m256 p0m = _mm256_load_ps(&pm_lcl[0]); __m256 p1x = _mm256_loadu_ps(&px_lcl[1]); __m256 p1y = _mm256_loadu_ps(&py_lcl[1]); __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]); __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]); __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base]); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -499,7 +708,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 w_sum = _mm256_add_ps(p0m, p1m); @@ -512,47 +720,47 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom)); __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec); corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum); __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part); __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist); __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist); __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist); __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude); __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude); __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude); dt_x = _mm256_and_ps(dt_x, apply_corr_mask); dt_y = _mm256_and_ps(dt_y, apply_corr_mask); dt_z = _mm256_and_ps(dt_z, apply_corr_mask); _mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x)); _mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y)); _mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z)); _mm256_store_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x)); _mm256_store_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y)); _mm256_store_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z)); } // Second block (i=8 to 14) { __m256 p0x = _mm256_load_ps(&px_lcl[8]); __m256 p0y = _mm256_load_ps(&py_lcl[8]); __m256 p0z = _mm256_load_ps(&pz_lcl[8]); __m256 p0m = _mm256_load_ps(&pm_lcl[8]); __m256 p1x = _mm256_loadu_ps(&px_lcl[9]); __m256 p1y = _mm256_loadu_ps(&py_lcl[9]); __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]); __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]); __m256 rest_len_vec = _mm256_load_ps(&cs->ptc_rest_len[r_base + 8]); __m256 dx = _mm256_sub_ps(p1x, p0x); __m256 dy = _mm256_sub_ps(p1y, p0y); @@ -561,7 +769,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 w_sum = _mm256_add_ps(p0m, p1m); @@ -574,77 +781,73 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom)); __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec); corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum); __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part); __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f }; __m256 valid_mask = _mm256_load_ps(valid_mask_array); apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist); __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist); __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist); __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude); __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude); __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude); dt_x = _mm256_and_ps(dt_x, apply_corr_mask); dt_y = _mm256_and_ps(dt_y, apply_corr_mask); dt_z = _mm256_and_ps(dt_z, apply_corr_mask); _mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x)); _mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y)); _mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z)); _mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x)); _mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y)); _mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z)); } } for (int i = 0; i < PTC_PER_CHN; i += 8) { _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i])); _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i])); _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i])); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int s_base = c * SPH_PER_CHN; struct chn_cfg *cfg = &cs->chn_cfg[c]; __m256 fric_vec = _mm256_set1_ps(cfg->friction); for (int s = 0; s < SPH_PER_CHN; ++s) { __m256 sph_x_vec = _mm256_set1_ps(cs->sph_x[s_base + s]); __m256 sph_y_vec = _mm256_set1_ps(cs->sph_y[s_base + s]); __m256 sph_z_vec = _mm256_set1_ps(cs->sph_z[s_base + s]); __m256 sph_r_vec = _mm256_set1_ps(cs->sph_r[s_base + s]); for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]); __m256 p_prev_x = _mm256_load_ps(&cs->ptc_prev_pos_x[p_base + i]); __m256 p_prev_y = _mm256_load_ps(&cs->ptc_prev_pos_y[p_base + i]); __m256 p_prev_z = _mm256_load_ps(&cs->ptc_prev_pos_z[p_base + i]); __m256 ptc_inv_mass = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]); ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); @@ -653,77 +856,76 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_rcp_ps(inv_dist); __m256 pen = _mm256_sub_ps(sph_r_vec, dist); __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 norm_x = _mm256_mul_ps(dx, inv_dist); __m256 norm_y = _mm256_mul_ps(dy, inv_dist); __m256 norm_z = _mm256_mul_ps(dz, inv_dist); __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 vel_dot_norm = _mm256_mul_ps(vel_x, norm_x); vel_dot_norm = _mm256_fmadd_ps(vel_y, norm_y, vel_dot_norm); vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, vel_dot_norm); __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); __m256 tan_mag_sq = _mm256_mul_ps(tan_x, tan_x); tan_mag_sq = _mm256_fmadd_ps(tan_y, tan_y, tan_mag_sq); tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, tan_mag_sq); __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec)); __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); total_corr_x = _mm256_and_ps(total_corr_x, col_mask); total_corr_y = _mm256_and_ps(total_corr_y, col_mask); total_corr_z = _mm256_and_ps(total_corr_z, col_mask); total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); _mm256_store_ps(&cs->ptc_pos_x[p_base + i], new_p_x); _mm256_store_ps(&cs->ptc_pos_y[p_base + i], new_p_y); _mm256_store_ps(&cs->ptc_pos_z[p_base + i], new_p_z); } } } // Step 5: Motion Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; for (int i = 0; i < PTC_PER_CHN; i += 8) { __m256 px = _mm256_load_ps(&cs->ptc_pos_x[p_base + i]); __m256 py = _mm256_load_ps(&cs->ptc_pos_y[p_base + i]); __m256 pz = _mm256_load_ps(&cs->ptc_pos_z[p_base + i]); __m256 pm = _mm256_load_ps(&cs->ptc_inv_mass[p_base + i]); pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); __m256 r_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); __m256 dx = _mm256_sub_ps(px, rx); __m256 dy = _mm256_sub_ps(py, ry); @@ -736,23 +938,23 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_rcp_ps(inv_dist); __m256 pen = _mm256_sub_ps(dist, r_vec); __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); __m256 corr_factor = _mm256_mul_ps(pen, inv_dist); corr_factor = _mm256_and_ps(corr_factor, mask); __m256 dt_x = _mm256_mul_ps(dx, corr_factor); __m256 dt_y = _mm256_mul_ps(dy, corr_factor); __m256 dt_z = _mm256_mul_ps(dz, corr_factor); dt_x = _mm256_mul_ps(dt_x, pm); dt_y = _mm256_mul_ps(dt_y, pm); dt_z = _mm256_mul_ps(dt_z, pm); _mm256_store_ps(&cs->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x)); _mm256_store_ps(&cs->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y)); _mm256_store_ps(&cs->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z)); } } } -
vurtun revised this gist
Jul 4, 2025 . 1 changed file with 0 additions and 7 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -141,7 +141,6 @@ export void simulate_chains( assume(MAX_REST_LENGTHS % programCount == 0); // Local arrays for PBD solver __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1]; @@ -171,7 +170,6 @@ export void simulate_chains( __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS]; // Step 0: Precomputation (SIMD, process chains in groups of programCount) for (uniform int c = 0; c < MAX_CHAINS; c += programCount) { // Load chain quaternions varying float qx = chain_quat_x_ptr[c + programIndex]; @@ -295,7 +293,6 @@ export void simulate_chains( } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; @@ -375,7 +372,6 @@ export void simulate_chains( } // Step 2: Verlet Integration for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; @@ -437,7 +433,6 @@ export void simulate_chains( } // Step 3: Distance Constraints for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int r_base = c * CONSTRAINTS_PER_CHAIN; @@ -538,7 +533,6 @@ export void simulate_chains( } } // Step 4: Sphere Collisions with Friction for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int s_base = c * SPHERES_PER_CHAIN; @@ -607,7 +601,6 @@ export void simulate_chains( } } // Step 5: Motion Constraints for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { -
vurtun revised this gist
Jul 3, 2025 . 1 changed file with 0 additions and 29 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -127,35 +127,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); // Step 0: Precomputation (SIMD, 8 chains at once) for (int c = 0; c < MAX_CHAINS; c += 8) { // Load chain quaternions -
vurtun revised this gist
Jul 2, 2025 . 1 changed file with 612 additions and 612 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,636 +9,636 @@ // Define chain configuration structure struct chain_cfg { uniform float damping; uniform float stiffness; uniform float stretch_factor; uniform float max_strain; uniform float friction; uniform float collision_radius; uniform float linear_inertia; // [0, 1] uniform float angular_inertia; // [0, 1] uniform float centrifugal_inertia;// [0, 1] }; // Define chain simulation structure struct chain_sim { uint8 free_idx_cnt; uint8 free_idx[MAX_CHAINS]; uniform struct chain_cfg chain_configs[MAX_CHAINS]; // Chain global forces (world space gravity and wind) uniform float gravity_x[MAX_CHAINS]; uniform float gravity_y[MAX_CHAINS]; uniform float gravity_z[MAX_CHAINS]; uniform float wind_x[MAX_CHAINS]; uniform float wind_y[MAX_CHAINS]; uniform float wind_z[MAX_CHAINS]; // Chain transformations (world space) uniform float chain_pos_x[MAX_CHAINS]; uniform float chain_pos_y[MAX_CHAINS]; uniform float chain_pos_z[MAX_CHAINS]; uniform float chain_quat_x[MAX_CHAINS]; uniform float chain_quat_y[MAX_CHAINS]; uniform float chain_quat_z[MAX_CHAINS]; uniform float chain_quat_w[MAX_CHAINS]; uniform float prev_chain_pos_x[MAX_CHAINS]; uniform float prev_chain_pos_y[MAX_CHAINS]; uniform float prev_chain_pos_z[MAX_CHAINS]; uniform float prev_chain_quat_x[MAX_CHAINS]; uniform float prev_chain_quat_y[MAX_CHAINS]; uniform float prev_chain_quat_z[MAX_CHAINS]; uniform float prev_chain_quat_w[MAX_CHAINS]; // Particle positions (model space) uniform float p_pos_x[MAX_PARTICLES]; uniform float p_pos_y[MAX_PARTICLES]; uniform float p_pos_z[MAX_PARTICLES]; uniform float prev_p_pos_x[MAX_PARTICLES]; uniform float prev_p_pos_y[MAX_PARTICLES]; uniform float prev_p_pos_z[MAX_PARTICLES]; uniform float inv_mass[MAX_PARTICLES]; uniform float rest_lengths[MAX_REST_LENGTHS]; // Sphere positions (model space) uniform float sphere_x[MAX_SPHERES]; uniform float sphere_y[MAX_SPHERES]; uniform float sphere_z[MAX_SPHERES]; uniform float sphere_radius[MAX_SPHERES]; // Rest positions (model space) uniform float rest_pos_x[MAX_PARTICLES]; uniform float rest_pos_y[MAX_PARTICLES]; uniform float rest_pos_z[MAX_PARTICLES]; uniform float motion_radius[MAX_PARTICLES]; }; // ISPC task for chain simulation export void simulate_chains( __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr, __attribute__((aligned(32))) uniform float *uniform wind_x_ptr, __attribute__((aligned(32))) uniform float *uniform wind_y_ptr, __attribute__((aligned(32))) uniform float *uniform wind_z_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr, __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr, uniform float dt) { // Common SIMD constants (varying for vector operations across lanes) varying float dt_vec = dt; varying float dt2_vec = dt * dt; varying float one_vec = 1.0; varying float neg_one_vec = -1.0; varying float eps_vec = 1e-6; varying float inv_mass_clamp_min = 0.0; varying float inv_mass_clamp_max = 1e6; varying float zero_vec = 0.0; varying float inv_dt_vec = 1.0 / dt; varying float two_vec = 2.0; varying float inertia_clamp_min = 0.0; varying float inertia_clamp_max = 1.0; assume(MAX_CHAINS % programCount == 0); assume(PARTICLES_PER_CHAIN % programCount == 0); assume(SPHERES_PER_CHAIN % programCount == 0); assume(MAX_SPHERES % programCount == 0); assume(CONSTRAINTS_PER_CHAIN % programCount == 0); assume(MAX_REST_LENGTHS % programCount == 0); // Local arrays for PBD solver // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1]; // All lanes can safely write to these uniform locations. px_local[PARTICLES_PER_CHAIN] = 0.0; py_local[PARTICLES_PER_CHAIN] = 0.0; pz_local[PARTICLES_PER_CHAIN] = 0.0; pm_local[PARTICLES_PER_CHAIN] = 0.0; // Stack arrays for precomputed data __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS]; // Step 0: Precomputation (SIMD, process chains in groups of programCount) // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon) for (uniform int c = 0; c < MAX_CHAINS; c += programCount) { // Load chain quaternions varying float qx = chain_quat_x_ptr[c + programIndex]; varying float qy = chain_quat_y_ptr[c + programIndex]; varying float qz = chain_quat_z_ptr[c + programIndex]; varying float qw = chain_quat_w_ptr[c + programIndex]; // Compute local-space wind { varying float vx = wind_x_ptr[c + programIndex]; varying float vy = wind_y_ptr[c + programIndex]; varying float vz = wind_z_ptr[c + programIndex]; // Create q_conjugate components varying float cqx = -qx; varying float cqy = -qy; varying float cqz = -qz; // Step 1: t = q_conj * v_world_as_quat varying float tx = qw * vx + cqy * vz - cqz * vy; varying float ty = qw * vy + cqz * vx - cqx * vz; varying float tz = qw * vz + cqx * vy - cqy * vx; varying float tw = qx * vx + qy * vy + qz * vz; // Step 2: result_vec = (t * q)_vec varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; wind_local_x[c + programIndex] = result_x; wind_local_y[c + programIndex] = result_y; wind_local_z[c + programIndex] = result_z; } // Compute local-space gravity { varying float vx = gravity_x_ptr[c + programIndex]; varying float float_vy = gravity_y_ptr[c + programIndex]; varying float vz = gravity_z_ptr[c + programIndex]; varying float cqx = -qx; varying float cqy = -qy; varying float cqz = -qz; varying float tx = qw * vx + cqy * vz - cqz * float_vy; varying float ty = qw * float_vy + cqz * vx - cqx * vz; varying float tz = qw * vz + cqx * float_vy - cqy * vx; varying float tw = qx * vx + qy * float_vy + qz * vz; // Corrected quaternion multiplication for gravity_local varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; gravity_local_x[c + programIndex] = result_x; gravity_local_y[c + programIndex] = result_y; gravity_local_z[c + programIndex] = result_z; } // Compute linear velocity { varying float curr_x = chain_pos_x_ptr[c + programIndex]; varying float curr_y = chain_pos_y_ptr[c + programIndex]; varying float curr_z = chain_pos_z_ptr[c + programIndex]; varying float prev_x = prev_chain_pos_x_ptr[c + programIndex]; varying float prev_y = prev_chain_pos_y_ptr[c + programIndex]; varying float prev_z = prev_chain_pos_z_ptr[c + programIndex]; varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec; varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec; varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec; vel_x[c + programIndex] = vel_x_vec; vel_y[c + programIndex] = vel_y_vec; vel_z[c + programIndex] = vel_z_vec; } // Compute angular velocity { varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex]; varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex]; varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex]; varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex]; // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation varying float delta_qx = qx - prev_qx; varying float delta_qy = qy - prev_qy; varying float delta_qz = qz - prev_qz; varying float delta_qw = qw - prev_qw; // Step 2: Compute conjugate of previous quaternion varying float conj_qx = -prev_qx; varying float conj_qy = -prev_qy; varying float conj_qz = -prev_qz; varying float conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz; varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy; varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz; varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx; varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x; varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y; varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z; // Step 4: Compute dot product (to, from) for shortest arc check varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz; // Step 5: Negate orientation delta if dot < 0 varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec); orient_qx *= negate_sign; orient_qy *= negate_sign; orient_qz *= negate_sign; // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) varying float scale = one_vec / (two_vec * dt_vec); varying float ang_vel_x_vec = orient_qx * scale; varying float ang_vel_y_vec = orient_qy * scale; varying float ang_vel_z_vec = orient_qz * scale; ang_vel_x[c + programIndex] = ang_vel_x_vec; ang_vel_y[c + programIndex] = ang_vel_y_vec; ang_vel_z[c + programIndex] = ang_vel_z_vec; } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) // Process one chain at a time, particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; // Load velocities and inertia (scalar, broadcast to lanes) varying float vel_x_vec = vel_x[c]; varying float vel_y_vec = vel_y[c]; varying float vel_z_vec = vel_z[c]; varying float ang_vel_x_vec = ang_vel_x[c]; varying float ang_vel_y_vec = ang_vel_y[c]; varying float ang_vel_z_vec = ang_vel_z[c]; varying float linear_inertia_vec = cfg->linear_inertia; varying float angular_inertia_vec = cfg->angular_inertia; varying float centrifugal_inertia_vec = cfg->centrifugal_inertia; // Clamp inertia parameters to [0, 1] linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max); angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max); centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max); // Process particles in groups for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float px = p_pos_x_ptr[base_idx + i + programIndex]; varying float py = p_pos_y_ptr[base_idx + i + programIndex]; varying float pz = p_pos_z_ptr[base_idx + i + programIndex]; varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); // Linear inertia: v * dt * linear_inertia varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec; varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec; varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec; // Angular inertia: (ω × r) * dt * angular_inertia varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px; ang_delta_x *= dt_vec * angular_inertia_vec; ang_delta_y *= dt_vec * angular_inertia_vec; ang_delta_z *= dt_vec * angular_inertia_vec; // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px; varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y; varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z; varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x; varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec; // Total delta varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x; varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y; varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z; // Update positions p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x; p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y; p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z; } // Update previous transformation (scalar) prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c]; prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c]; prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c]; prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c]; prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c]; prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c]; prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c]; } // Step 2: Verlet Integration // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; // Load forces and damping (scalar, broadcast to lanes) varying float gravity_x_vec = gravity_local_x[c]; varying float gravity_y_vec = gravity_local_y[c]; varying float gravity_z_vec = gravity_local_z[c]; varying float wind_x_vec = wind_local_x[c]; varying float wind_y_vec = wind_local_y[c]; varying float wind_z_vec = wind_local_z[c]; varying float damping_vec = cfg->damping; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex]; varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex]; varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex]; varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex]; varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex]; varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex]; // Compute forces and accelerations varying float force_x = gravity_x_vec + wind_x_vec; varying float force_y = gravity_y_vec + wind_y_vec; varying float force_z = gravity_z_vec + wind_z_vec; varying float acc_x = force_x * p_inv_mass; varying float acc_y = force_y * p_inv_mass; varying float acc_z = force_z * p_inv_mass; // Compute velocity with damping varying float v_x = p_curr_x - p_prev_x; varying float v_y = p_curr_y - p_prev_y; varying float v_z = p_curr_z - p_prev_z; varying float damped_vel_x = v_x * damping_vec; varying float damped_vel_y = v_y * damping_vec; varying float damped_vel_z = v_z * damping_vec; // Update positions: p_curr + damped_vel + acc * dt^2 varying float term_accel_x = acc_x * dt2_vec; varying float term_accel_y = acc_y * dt2_vec; varying float term_accel_z = acc_z * dt2_vec; p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x; p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y; p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z; // Update previous positions prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x; prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y; prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z; } } // Step 3: Distance Constraints // Process one chain at a time, constraints in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int r_base = c * CONSTRAINTS_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; varying float stiffness_vec = cfg->stiffness; varying float stretch_factor_vec = cfg->stretch_factor; varying float max_strain_vec_abs = cfg->max_strain; varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec; // Copy particles to local arrays with uniform indexing // This loop now performs a vectorized load into the uniform px_local array. for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { // Load a vector of particles from global memory into varying registers varying float current_px = p_pos_x_ptr[p_base + i + programIndex]; varying float current_py = p_pos_y_ptr[p_base + i + programIndex]; varying float current_pz = p_pos_z_ptr[p_base + i + programIndex]; varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); // Store these varying values into the uniform local array // The compiler can now see this as a series of vector stores // because px_local is uniform. foreach_active (idx) { px_local[i + programIndex] = current_px; py_local[i + programIndex] = current_py; pz_local[i + programIndex] = current_pz; pm_local[i + programIndex] = current_pm; } } // Iterate over constraints for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) { // Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2) for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) { // Now, load directly from uniform px_local using varying programIndex // The compiler can optimize these into vector loads. varying float p0x = px_local[i + programIndex]; varying float p0y = py_local[i + programIndex]; varying float p0z = pz_local[i + programIndex]; varying float p0m = pm_local[i + programIndex]; // For p1, we need to be careful with the indexing relative to programIndex. // The loop condition 'i < PARTICLES_PER_CHAIN - 1' ensures that 'i + programIndex + 1' // will not exceed 'PARTICLES_PER_CHAIN - 1' for any active lane. varying float p1x = px_local[i + programIndex + 1]; varying float p1y = py_local[i + programIndex + 1]; varying float p1z = pz_local[i + programIndex + 1]; varying float p1m = pm_local[i + programIndex + 1]; varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex]; varying float dx = p1x - p0x; varying float dy = p1y - p0y; varying float dz = p1z - p0z; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float w_sum = p0m + p1m; varying float rcp_w_sum = rcp(max(w_sum, eps_vec)); varying float diff = dist - rest_len_vec; varying float strain = diff * rcp(max(rest_len_vec, eps_vec)); varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs); varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped)); varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum; varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0); // Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle) apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0); varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask; // Apply corrections, writing back to uniform px_local // These are now vector stores. px_local[i + programIndex] -= delta_x * p0m; py_local[i + programIndex] -= delta_y * p0m; pz_local[i + programIndex] -= delta_z * p0m; px_local[i + programIndex + 1] += delta_x * p1m; py_local[i + programIndex + 1] += delta_y * p1m; pz_local[i + programIndex + 1] += delta_z * p1m; } } // Write back to global arrays for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { // Load a vector of particles from uniform local array into varying registers varying float updated_px = px_local[i + programIndex]; varying float updated_py = py_local[i + programIndex]; varying float updated_pz = pz_local[i + programIndex]; // Store these varying values into the global memory foreach_active (idx) { p_pos_x_ptr[p_base + i + programIndex] = updated_px; p_pos_y_ptr[p_base + i + programIndex] = updated_py; p_pos_z_ptr[p_base + i + programIndex] = updated_pz; } } } // Step 4: Sphere Collisions with Friction // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int s_base = c * SPHERES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; varying float friction_vec = cfg->friction; for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) { varying float sphere_x_vec = sphere_x_ptr[s_base + s]; varying float sphere_y_vec = sphere_y_ptr[s_base + s]; varying float sphere_z_vec = sphere_z_ptr[s_base + s]; varying float sphere_r_vec = sphere_radius_ptr[s_base + s]; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex]; varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex]; varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex]; varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex]; varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex]; varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex]; varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float dx = p_curr_x - sphere_x_vec; varying float dy = p_curr_y - sphere_y_vec; varying float dz = p_curr_z - sphere_z_vec; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float penetration = sphere_r_vec - dist; varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0); varying float normal_x = dx * inv_dist; varying float normal_y = dy * inv_dist; varying float normal_z = dz * inv_dist; varying float normal_corr_x = normal_x * penetration; varying float normal_corr_y = normal_y * penetration; varying float normal_corr_z = normal_z * penetration; varying float v_x = p_curr_x - p_prev_x; varying float v_y = p_curr_y - p_prev_y; varying float v_z = p_curr_z - p_prev_z; varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z; varying float tangent_x = v_x - vel_dot_normal * normal_x; varying float tangent_y = v_y - vel_dot_normal * normal_y; varying float tangent_z = v_z - vel_dot_normal * normal_z; varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z; varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec)); varying float friction_mag = penetration * friction_vec; varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag; varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag; varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag; varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass; varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass; varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass; p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x; p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y; p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z; } } } // Step 5: Motion Constraints // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float px = p_pos_x_ptr[p_base + i + programIndex]; varying float py = p_pos_y_ptr[p_base + i + programIndex]; varying float pz = p_pos_z_ptr[p_base + i + programIndex]; varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float rx = rest_pos_x_ptr[p_base + i + programIndex]; varying float ry = rest_pos_y_ptr[p_base + i + programIndex]; varying float rz = rest_pos_z_ptr[p_base + i + programIndex]; varying float radius_vec = motion_radius_ptr[p_base + i + programIndex]; varying float dx = px - rx; varying float dy = py - ry; varying float dz = pz - rz; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float penetration = dist - radius_vec; varying float mask = select(penetration > zero_vec, 1.0, 0.0); varying float corr_factor = penetration * inv_dist * mask; varying float delta_x = dx * corr_factor * pm; varying float delta_y = dy * corr_factor * pm; varying float delta_z = dz * corr_factor * pm; p_pos_x_ptr[p_base + i + programIndex] = px - delta_x; p_pos_y_ptr[p_base + i + programIndex] = py - delta_y; p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z; } } } -
vurtun revised this gist
Jul 2, 2025 . 1 changed file with 304 additions and 310 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,24 +1,11 @@ #define MAX_CHAINS 64 #define PARTICLES_PER_CHAIN 16 #define SPHERES_PER_CHAIN 8 #define MAX_PARTICLES (MAX_CHAINS * PARTICLES_PER_CHAIN) #define MAX_SPHERES (MAX_CHAINS * SPHERES_PER_CHAIN) #define CONSTRAINTS_PER_CHAIN PARTICLES_PER_CHAIN #define MAX_REST_LENGTHS (MAX_CHAINS * CONSTRAINTS_PER_CHAIN) #define MAX_ITERATIONS 8 // Define chain configuration structure struct chain_cfg { @@ -28,73 +15,111 @@ struct chain_cfg { uniform float max_strain; uniform float friction; uniform float collision_radius; uniform float linear_inertia; // [0, 1] uniform float angular_inertia; // [0, 1] uniform float centrifugal_inertia;// [0, 1] }; // Define chain simulation structure struct chain_sim { uint8 free_idx_cnt; uint8 free_idx[MAX_CHAINS]; uniform struct chain_cfg chain_configs[MAX_CHAINS]; // Chain global forces (world space gravity and wind) uniform float gravity_x[MAX_CHAINS]; uniform float gravity_y[MAX_CHAINS]; uniform float gravity_z[MAX_CHAINS]; uniform float wind_x[MAX_CHAINS]; uniform float wind_y[MAX_CHAINS]; uniform float wind_z[MAX_CHAINS]; // Chain transformations (world space) uniform float chain_pos_x[MAX_CHAINS]; uniform float chain_pos_y[MAX_CHAINS]; uniform float chain_pos_z[MAX_CHAINS]; uniform float chain_quat_x[MAX_CHAINS]; uniform float chain_quat_y[MAX_CHAINS]; uniform float chain_quat_z[MAX_CHAINS]; uniform float chain_quat_w[MAX_CHAINS]; uniform float prev_chain_pos_x[MAX_CHAINS]; uniform float prev_chain_pos_y[MAX_CHAINS]; uniform float prev_chain_pos_z[MAX_CHAINS]; uniform float prev_chain_quat_x[MAX_CHAINS]; uniform float prev_chain_quat_y[MAX_CHAINS]; uniform float prev_chain_quat_z[MAX_CHAINS]; uniform float prev_chain_quat_w[MAX_CHAINS]; // Particle positions (model space) uniform float p_pos_x[MAX_PARTICLES]; uniform float p_pos_y[MAX_PARTICLES]; uniform float p_pos_z[MAX_PARTICLES]; uniform float prev_p_pos_x[MAX_PARTICLES]; uniform float prev_p_pos_y[MAX_PARTICLES]; uniform float prev_p_pos_z[MAX_PARTICLES]; uniform float inv_mass[MAX_PARTICLES]; uniform float rest_lengths[MAX_REST_LENGTHS]; // Sphere positions (model space) uniform float sphere_x[MAX_SPHERES]; uniform float sphere_y[MAX_SPHERES]; uniform float sphere_z[MAX_SPHERES]; uniform float sphere_radius[MAX_SPHERES]; // Rest positions (model space) uniform float rest_pos_x[MAX_PARTICLES]; uniform float rest_pos_y[MAX_PARTICLES]; uniform float rest_pos_z[MAX_PARTICLES]; uniform float motion_radius[MAX_PARTICLES]; }; // ISPC task for chain simulation export void simulate_chains( __attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr, __attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr, __attribute__((aligned(32))) uniform float *uniform wind_x_ptr, __attribute__((aligned(32))) uniform float *uniform wind_y_ptr, __attribute__((aligned(32))) uniform float *uniform wind_z_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_x_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_y_ptr, __attribute__((aligned(32))) uniform float *uniform gravity_z_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr, __attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform inv_mass_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr, __attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs __attribute__((aligned(32))) uniform float *uniform sphere_x_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_y_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_z_ptr, __attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr, __attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr, __attribute__((aligned(32))) uniform float *uniform motion_radius_ptr, uniform float dt) { // Common SIMD constants (varying for vector operations across lanes) varying float dt_vec = dt; varying float dt2_vec = dt * dt; varying float one_vec = 1.0; @@ -107,52 +132,58 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float two_vec = 2.0; varying float inertia_clamp_min = 0.0; varying float inertia_clamp_max = 1.0; assume(MAX_CHAINS % programCount == 0); assume(PARTICLES_PER_CHAIN % programCount == 0); assume(SPHERES_PER_CHAIN % programCount == 0); assume(MAX_SPHERES % programCount == 0); assume(CONSTRAINTS_PER_CHAIN % programCount == 0); assume(MAX_REST_LENGTHS % programCount == 0); // Local arrays for PBD solver // Declared as uniform to avoid scatter/gather warnings when accessing with varying indices __attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1]; __attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1]; // All lanes can safely write to these uniform locations. px_local[PARTICLES_PER_CHAIN] = 0.0; py_local[PARTICLES_PER_CHAIN] = 0.0; pz_local[PARTICLES_PER_CHAIN] = 0.0; pm_local[PARTICLES_PER_CHAIN] = 0.0; // Stack arrays for precomputed data __attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS]; __attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS]; // Step 0: Precomputation (SIMD, process chains in groups of programCount) // Assumption: MAX_CHAINS (64) is divisible by programCount (8 for AVX2, 4 for Neon) for (uniform int c = 0; c < MAX_CHAINS; c += programCount) { // Load chain quaternions varying float qx = chain_quat_x_ptr[c + programIndex]; varying float qy = chain_quat_y_ptr[c + programIndex]; varying float qz = chain_quat_z_ptr[c + programIndex]; varying float qw = chain_quat_w_ptr[c + programIndex]; // Compute local-space wind { varying float vx = wind_x_ptr[c + programIndex]; varying float vy = wind_y_ptr[c + programIndex]; varying float vz = wind_z_ptr[c + programIndex]; // Create q_conjugate components varying float cqx = -qx; @@ -177,60 +208,54 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { // Compute local-space gravity { varying float vx = gravity_x_ptr[c + programIndex]; varying float float_vy = gravity_y_ptr[c + programIndex]; varying float vz = gravity_z_ptr[c + programIndex]; varying float cqx = -qx; varying float cqy = -qy; varying float cqz = -qz; varying float tx = qw * vx + cqy * vz - cqz * float_vy; varying float ty = qw * float_vy + cqz * vx - cqx * vz; varying float tz = qw * vz + cqx * float_vy - cqy * vx; varying float tw = qx * vx + qy * float_vy + qz * vz; // Corrected quaternion multiplication for gravity_local varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; gravity_local_x[c + programIndex] = result_x; gravity_local_y[c + programIndex] = result_y; gravity_local_z[c + programIndex] = result_z; } // Compute linear velocity { varying float curr_x = chain_pos_x_ptr[c + programIndex]; varying float curr_y = chain_pos_y_ptr[c + programIndex]; varying float curr_z = chain_pos_z_ptr[c + programIndex]; varying float prev_x = prev_chain_pos_x_ptr[c + programIndex]; varying float prev_y = prev_chain_pos_y_ptr[c + programIndex]; varying float prev_z = prev_chain_pos_z_ptr[c + programIndex]; varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec; varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec; varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec; vel_x[c + programIndex] = vel_x_vec; vel_y[c + programIndex] = vel_y_vec; vel_z[c + programIndex] = vel_z_vec; } // Compute angular velocity { varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex]; varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex]; varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex]; varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex]; // Step 1: Compute delta quaternion (to - from) - Note: This is an approximation varying float delta_qx = qx - prev_qx; varying float delta_qy = qy - prev_qy; varying float delta_qz = qz - prev_qz; @@ -241,21 +266,16 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float conj_qy = -prev_qy; varying float conj_qz = -prev_qz; varying float conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz; varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy; varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz; varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx; varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x; varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y; varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z; // Step 4: Compute dot product (to, from) for shortest arc check varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz; // Step 5: Negate orientation delta if dot < 0 varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec); orient_qx *= negate_sign; @@ -275,12 +295,12 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) // Process one chain at a time, particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; // Load velocities and inertia (scalar, broadcast to lanes) varying float vel_x_vec = vel_x[c]; varying float vel_y_vec = vel_y[c]; varying float vel_z_vec = vel_z[c]; @@ -292,19 +312,18 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float linear_inertia_vec = cfg->linear_inertia; varying float angular_inertia_vec = cfg->angular_inertia; varying float centrifugal_inertia_vec = cfg->centrifugal_inertia; // Clamp inertia parameters to [0, 1] linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max); angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max); centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max); // Process particles in groups for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float px = p_pos_x_ptr[base_idx + i + programIndex]; varying float py = p_pos_y_ptr[base_idx + i + programIndex]; varying float pz = p_pos_z_ptr[base_idx + i + programIndex]; varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); // Linear inertia: v * dt * linear_inertia @@ -320,68 +339,68 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { ang_delta_x *= dt_vec * angular_inertia_vec; ang_delta_y *= dt_vec * angular_inertia_vec; ang_delta_z *= dt_vec * angular_inertia_vec; // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px; varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y; varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z; varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x; varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec; // Total delta varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x; varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y; varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z; // Update positions p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x; p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y; p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z; } // Update previous transformation (scalar) prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c]; prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c]; prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c]; prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c]; prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c]; prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c]; prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c]; } // Step 2: Verlet Integration // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; // Load forces and damping (scalar, broadcast to lanes) varying float gravity_x_vec = gravity_local_x[c]; varying float gravity_y_vec = gravity_local_y[c]; varying float gravity_z_vec = gravity_local_z[c]; varying float wind_x_vec = wind_local_x[c]; varying float wind_y_vec = wind_local_y[c]; varying float wind_z_vec = wind_local_z[c]; varying float damping_vec = cfg->damping; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex]; varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex]; varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex]; varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex]; varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex]; varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex]; // Compute forces and accelerations varying float force_x = gravity_x_vec + wind_x_vec; @@ -393,176 +412,153 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float acc_z = force_z * p_inv_mass; // Compute velocity with damping varying float v_x = p_curr_x - p_prev_x; varying float v_y = p_curr_y - p_prev_y; varying float v_z = p_curr_z - p_prev_z; varying float damped_vel_x = v_x * damping_vec; varying float damped_vel_y = v_y * damping_vec; varying float damped_vel_z = v_z * damping_vec; // Update positions: p_curr + damped_vel + acc * dt^2 varying float term_accel_x = acc_x * dt2_vec; varying float term_accel_y = acc_y * dt2_vec; varying float term_accel_z = acc_z * dt2_vec; p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x; p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y; p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z; // Update previous positions prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x; prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y; prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z; } } // Step 3: Distance Constraints // Process one chain at a time, constraints in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int r_base = c * CONSTRAINTS_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; varying float stiffness_vec = cfg->stiffness; varying float stretch_factor_vec = cfg->stretch_factor; varying float max_strain_vec_abs = cfg->max_strain; varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec; // Copy particles to local arrays with uniform indexing // This loop now performs a vectorized load into the uniform px_local array. for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { // Load a vector of particles from global memory into varying registers varying float current_px = p_pos_x_ptr[p_base + i + programIndex]; varying float current_py = p_pos_y_ptr[p_base + i + programIndex]; varying float current_pz = p_pos_z_ptr[p_base + i + programIndex]; varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); // Store these varying values into the uniform local array // The compiler can now see this as a series of vector stores // because px_local is uniform. foreach_active (idx) { px_local[i + programIndex] = current_px; py_local[i + programIndex] = current_py; pz_local[i + programIndex] = current_pz; pm_local[i + programIndex] = current_pm; } } // Iterate over constraints for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) { // Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2) for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) { // Now, load directly from uniform px_local using varying programIndex // The compiler can optimize these into vector loads. varying float p0x = px_local[i + programIndex]; varying float p0y = py_local[i + programIndex]; varying float p0z = pz_local[i + programIndex]; varying float p0m = pm_local[i + programIndex]; // For p1, we need to be careful with the indexing relative to programIndex. // The loop condition 'i < PARTICLES_PER_CHAIN - 1' ensures that 'i + programIndex + 1' // will not exceed 'PARTICLES_PER_CHAIN - 1' for any active lane. varying float p1x = px_local[i + programIndex + 1]; varying float p1y = py_local[i + programIndex + 1]; varying float p1z = pz_local[i + programIndex + 1]; varying float p1m = pm_local[i + programIndex + 1]; varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex]; varying float dx = p1x - p0x; varying float dy = p1y - p0y; varying float dz = p1z - p0z; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float w_sum = p0m + p1m; varying float rcp_w_sum = rcp(max(w_sum, eps_vec)); varying float diff = dist - rest_len_vec; varying float strain = diff * rcp(max(rest_len_vec, eps_vec)); varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs); varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped)); varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum; varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0); // Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle) apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0); varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask; // Apply corrections, writing back to uniform px_local // These are now vector stores. px_local[i + programIndex] -= delta_x * p0m; py_local[i + programIndex] -= delta_y * p0m; pz_local[i + programIndex] -= delta_z * p0m; px_local[i + programIndex + 1] += delta_x * p1m; py_local[i + programIndex + 1] += delta_y * p1m; pz_local[i + programIndex + 1] += delta_z * p1m; } } // Write back to global arrays for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { // Load a vector of particles from uniform local array into varying registers varying float updated_px = px_local[i + programIndex]; varying float updated_py = py_local[i + programIndex]; varying float updated_pz = pz_local[i + programIndex]; // Store these varying values into the global memory foreach_active (idx) { p_pos_x_ptr[p_base + i + programIndex] = updated_px; p_pos_y_ptr[p_base + i + programIndex] = updated_py; p_pos_z_ptr[p_base + i + programIndex] = updated_pz; } } } // Step 4: Sphere Collisions with Friction // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int s_base = c * SPHERES_PER_CHAIN; uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; varying float friction_vec = cfg->friction; for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) { varying float sphere_x_vec = sphere_x_ptr[s_base + s]; varying float sphere_y_vec = sphere_y_ptr[s_base + s]; varying float sphere_z_vec = sphere_z_ptr[s_base + s]; varying float sphere_r_vec = sphere_radius_ptr[s_base + s]; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex]; varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex]; varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex]; varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex]; varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex]; varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex]; varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float dx = p_curr_x - sphere_x_vec; varying float dy = p_curr_y - sphere_y_vec; @@ -583,14 +579,14 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float normal_corr_y = normal_y * penetration; varying float normal_corr_z = normal_z * penetration; varying float v_x = p_curr_x - p_prev_x; varying float v_y = p_curr_y - p_prev_y; varying float v_z = p_curr_z - p_prev_z; varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z; varying float tangent_x = v_x - vel_dot_normal * normal_x; varying float tangent_y = v_y - vel_dot_normal * normal_y; varying float tangent_z = v_z - vel_dot_normal * normal_z; varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z; varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec)); @@ -604,32 +600,30 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass; varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass; p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x; p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y; p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z; } } } // Step 5: Motion Constraints // Process particles in groups of programCount for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { varying float px = p_pos_x_ptr[p_base + i + programIndex]; varying float py = p_pos_y_ptr[p_base + i + programIndex]; varying float pz = p_pos_z_ptr[p_base + i + programIndex]; varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float rx = rest_pos_x_ptr[p_base + i + programIndex]; varying float ry = rest_pos_y_ptr[p_base + i + programIndex]; varying float rz = rest_pos_z_ptr[p_base + i + programIndex]; varying float radius_vec = motion_radius_ptr[p_base + i + programIndex]; varying float dx = px - rx; varying float dy = py - ry; varying float dz = pz - rz; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); @@ -642,9 +636,9 @@ export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { varying float delta_y = dy * corr_factor * pm; varying float delta_z = dz * corr_factor * pm; p_pos_x_ptr[p_base + i + programIndex] = px - delta_x; p_pos_y_ptr[p_base + i + programIndex] = py - delta_y; p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z; } } } -
vurtun revised this gist
Jul 2, 2025 . 1 changed file with 650 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,650 @@ // Inline math functions to replace AVX2 intrinsics inline varying float rsqrt(varying float x) { return 1.0 / sqrt(x); } inline varying float rcp(varying float x) { return 1.0 / x; } inline varying float clamp(varying float x, varying float min, varying float max) { return min(max(x, min), max); } enum { MAX_CHAINS = 64, PARTICLES_PER_CHAIN = 16, SPHERES_PER_CHAIN = 8, MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN), CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN, MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN), MAX_ITERATIONS = 4, }; // Define chain configuration structure struct chain_cfg { uniform float damping; uniform float stiffness; uniform float stretch_factor; uniform float max_strain; uniform float friction; uniform float collision_radius; uniform float linear_inertia; // [0, 1] uniform float angular_inertia; // [0, 1] uniform float centrifugal_inertia; // [0, 1] } __attribute__((aligned(32))); // Define chain simulation structure struct chain_sim { uniform unsigned short free_idx_cnt; uniform unsigned short free_idx[MAX_CHAINS]; uniform struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); // Chain global forces (world space gravity and wind) uniform float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float wind_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float wind_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float wind_z[MAX_CHAINS] __attribute__((aligned(32))); // Chain transformations (world space) uniform float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); // Particle positions (model space) uniform float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); uniform float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); uniform float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); uniform float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); uniform float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); uniform float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); uniform float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); uniform float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); // Sphere positions (model space) uniform float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); uniform float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); uniform float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); uniform float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); // Rest positions (model space) uniform float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); uniform float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); uniform float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); uniform float motion_radius[MAX_PARTICLES] __attribute__((aligned(32))); }; // ISPC task for chain simulation export void simulate_chains(uniform struct chain_sim *cs, uniform float dt) { // Common SIMD constants (varying for vector operations across 8 lanes) varying float dt_vec = dt; varying float dt2_vec = dt * dt; varying float one_vec = 1.0; varying float neg_one_vec = -1.0; varying float eps_vec = 1e-6; varying float inv_mass_clamp_min = 0.0; varying float inv_mass_clamp_max = 1e6; varying float zero_vec = 0.0; varying float inv_dt_vec = 1.0 / dt; varying float two_vec = 2.0; varying float inertia_clamp_min = 0.0; varying float inertia_clamp_max = 1.0; varying float abs_mask_ps = reinterpret<float>(0x7FFFFFFF); // Local arrays for PBD solver uniform float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); uniform float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); uniform float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); uniform float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_local[PARTICLES_PER_CHAIN] = 0.0; py_local[PARTICLES_PER_CHAIN] = 0.0; pz_local[PARTICLES_PER_CHAIN] = 0.0; pm_local[PARTICLES_PER_CHAIN] = 0.0; // Stack arrays for precomputed data uniform float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float vel_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float vel_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float vel_z[MAX_CHAINS] __attribute__((aligned(32))); uniform float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); uniform float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); uniform float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); // Step 0: Precomputation (SIMD, 8 chains at once) // Assumption: MAX_CHAINS (64) is divisible by 8, so no leftovers for (uniform int c = 0; c < MAX_CHAINS; c += 8) { // Note: No index checks; c + programIndex is always valid (0 to 63) // Load chain quaternions (8 lanes) varying float qx = cs->chain_quat_x[c + programIndex]; varying float qy = cs->chain_quat_y[c + programIndex]; varying float qz = cs->chain_quat_z[c + programIndex]; varying float qw = cs->chain_quat_w[c + programIndex]; // Compute local-space wind { varying float vx = cs->wind_x[c + programIndex]; varying float vy = cs->wind_y[c + programIndex]; varying float vz = cs->wind_z[c + programIndex]; // Create q_conjugate components varying float cqx = -qx; varying float cqy = -qy; varying float cqz = -qz; // Step 1: t = q_conj * v_world_as_quat varying float tx = qw * vx + cqy * vz - cqz * vy; varying float ty = qw * vy + cqz * vx - cqx * vz; varying float tz = qw * vz + cqx * vy - cqy * vx; varying float tw = qx * vx + qy * vy + qz * vz; // Step 2: result_vec = (t * q)_vec varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; wind_local_x[c + programIndex] = result_x; wind_local_y[c + programIndex] = result_y; wind_local_z[c + programIndex] = result_z; } // Compute local-space gravity { varying float vx = cs->gravity_x[c + programIndex]; varying float vy = cs->gravity_y[c + programIndex]; varying float vz = cs->gravity_z[c + programIndex]; varying float cqx = -qx; varying float cqy = -qy; varying float cqz = -qz; varying float tx = qw * vx + cqy * vz - cqz * vy; varying float ty = qw * vy + cqz * vx - cqx * vz; varying float tz = qw * vz + cqx * vy - cqy * vx; varying float tw = qx * vx + qy * vy + qz * vz; varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; gravity_local_x[c + programIndex] = result_x; gravity_local_y[c + programIndex] = result_y; gravity_local_z[c + programIndex] = result_z; } // Compute linear velocity { varying float curr_x = cs->chain_pos_x[c + programIndex]; varying float curr_y = cs->chain_pos_y[c + programIndex]; varying float curr_z = cs->chain_pos_z[c + programIndex]; varying float prev_x = cs->prev_chain_pos_x[c + programIndex]; varying float prev_y = cs->prev_chain_pos_y[c + programIndex]; varying float prev_z = cs->prev_chain_pos_z[c + programIndex]; varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec; varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec; varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec; vel_x[c + programIndex] = vel_x_vec; vel_y[c + programIndex] = vel_y_vec; vel_z[c + programIndex] = vel_z_vec; } // Compute angular velocity { varying float qx = cs->chain_quat_x[c + programIndex]; varying float qy = cs->chain_quat_y[c + programIndex]; varying float qz = cs->chain_quat_z[c + programIndex]; varying float qw = cs->chain_quat_w[c + programIndex]; varying float prev_qx = cs->prev_chain_quat_x[c + programIndex]; varying float prev_qy = cs->prev_chain_quat_y[c + programIndex]; varying float prev_qz = cs->prev_chain_quat_z[c + programIndex]; varying float prev_qw = cs->prev_chain_quat_w[c + programIndex]; // Step 1: Compute delta quaternion (to - from) varying float delta_qx = qx - prev_qx; varying float delta_qy = qy - prev_qy; varying float delta_qz = qz - prev_qz; varying float delta_qw = qw - prev_qw; // Step 2: Compute conjugate of previous quaternion varying float conj_qx = -prev_qx; varying float conj_qy = -prev_qy; varying float conj_qz = -prev_qz; varying float conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz; varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy; varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz; varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx; varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x; varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y; varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z; // Step 4: Compute dot product (to, from) for shortest arc check varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz; // Step 5: Negate orientation delta if dot < 0 varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec); orient_qx *= negate_sign; orient_qy *= negate_sign; orient_qz *= negate_sign; // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) varying float scale = one_vec / (two_vec * dt_vec); varying float ang_vel_x_vec = orient_qx * scale; varying float ang_vel_y_vec = orient_qy * scale; varying float ang_vel_z_vec = orient_qz * scale; ang_vel_x[c + programIndex] = ang_vel_x_vec; ang_vel_y[c + programIndex] = ang_vel_y_vec; ang_vel_z[c + programIndex] = ang_vel_z_vec; } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) // Assumption: MAX_CHAINS (64) is divisible by 8, PARTICLES_PER_CHAIN (16) is divisible by 8 for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &cs->chain_configs[c]; // Load velocities and inertia (scalar, broadcast to 8 lanes) varying float vel_x_vec = vel_x[c]; varying float vel_y_vec = vel_y[c]; varying float vel_z_vec = vel_z[c]; varying float ang_vel_x_vec = ang_vel_x[c]; varying float ang_vel_y_vec = ang_vel_y[c]; varying float ang_vel_z_vec = ang_vel_z[c]; varying float linear_inertia_vec = cfg->linear_inertia; varying float angular_inertia_vec = cfg->angular_inertia; varying float centrifugal_inertia_vec = cfg->centrifugal_inertia; // Clamp inertia parameters to [0, 1] linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max); angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max); centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max); // Process 8 particles at a time for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { // Note: No index checks; base_idx + i + programIndex is always valid varying float px = cs->p_pos_x[base_idx + i + programIndex]; varying float py = cs->p_pos_y[base_idx + i + programIndex]; varying float pz = cs->p_pos_z[base_idx + i + programIndex]; varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); // Linear inertia: v * dt * linear_inertia varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec; varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec; varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec; // Angular inertia: (ω × r) * dt * angular_inertia varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px; ang_delta_x *= dt_vec * angular_inertia_vec; ang_delta_y *= dt_vec * angular_inertia_vec; ang_delta_z *= dt_vec * angular_inertia_vec; // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px; varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y; varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z; varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x; varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec; varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec; // Total delta varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x; varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y; varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z; // Update positions cs->p_pos_x[base_idx + i + programIndex] = px + delta_x; cs->p_pos_y[base_idx + i + programIndex] = py + delta_y; cs->p_pos_z[base_idx + i + programIndex] = pz + delta_z; } // Update previous transformation (scalar) cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; } // Step 2: Verlet Integration // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8 for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int base_idx = c * PARTICLES_PER_CHAIN; uniform struct chain_cfg *cfg = &cs->chain_configs[c]; // Load forces and damping (scalar, broadcast to 8 lanes) varying float gravity_x_vec = gravity_local_x[c]; varying float gravity_y_vec = gravity_local_y[c]; varying float gravity_z_vec = gravity_local_z[c]; varying float wind_x_vec = wind_local_x[c]; varying float wind_y_vec = wind_local_y[c]; varying float wind_z_vec = wind_local_z[c]; varying float damping_vec = cfg->damping; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { varying float p_curr_x = cs->p_pos_x[base_idx + i + programIndex]; varying float p_curr_y = cs->p_pos_y[base_idx + i + programIndex]; varying float p_curr_z = cs->p_pos_z[base_idx + i + programIndex]; varying float p_inv_mass = cs->inv_mass[base_idx + i + programIndex]; p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); varying float p_prev_x = cs->prev_p_pos_x[base_idx + i + programIndex]; varying float p_prev_y = cs->prev_p_pos_y[base_idx + i + programIndex]; varying float p_prev_z = cs->prev_p_pos_z[base_idx + i + programIndex]; // Compute forces and accelerations varying float force_x = gravity_x_vec + wind_x_vec; varying float force_y = gravity_y_vec + wind_y_vec; varying float force_z = gravity_z_vec + wind_z_vec; varying float acc_x = force_x * p_inv_mass; varying float acc_y = force_y * p_inv_mass; varying float acc_z = force_z * p_inv_mass; // Compute velocity with damping varying float vel_x = p_curr_x - p_prev_x; varying float vel_y = p_curr_y - p_prev_y; varying float vel_z = p_curr_z - p_prev_z; varying float damped_vel_x = vel_x * damping_vec; varying float damped_vel_y = vel_y * damping_vec; varying float damped_vel_z = vel_z * damping_vec; // Update positions: p_curr + damped_vel + acc * dt^2 varying float term_accel_x = acc_x * dt2_vec; varying float term_accel_y = acc_y * dt2_vec; varying float term_accel_z = acc_z * dt2_vec; cs->p_pos_x[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x; cs->p_pos_y[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y; cs->p_pos_z[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z; // Update previous positions cs->prev_p_pos_x[base_idx + i + programIndex] = p_curr_x; cs->prev_p_pos_y[base_idx + i + programIndex] = p_curr_y; cs->prev_p_pos_z[base_idx + i + programIndex] = p_curr_z; } } // Step 3: Distance Constraints // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8 for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int r_base = c * CONSTRAINTS_PER_CHAIN; uniform struct chain_cfg *cfg = &cs->chain_configs[c]; varying float stiffness_vec = cfg->stiffness; varying float stretch_factor_vec = cfg->stretch_factor; varying float max_strain_vec_abs = cfg->max_strain; varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec; // Copy particles to local arrays for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { px_local[i + programIndex] = cs->p_pos_x[p_base + i + programIndex]; py_local[i + programIndex] = cs->p_pos_y[p_base + i + programIndex]; pz_local[i + programIndex] = cs->p_pos_z[p_base + i + programIndex]; pm_local[i + programIndex] = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); } // Iterate over constraints for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) { // First block (i=0 to 7) { varying float p0x = px_local[0]; varying float p0y = py_local[0]; varying float p0z = pz_local[0]; varying float p0m = pm_local[0]; varying float p1x = px_local[1]; // Unaligned access varying float p1y = py_local[1]; varying float p1z = pz_local[1]; varying float p1m = pm_local[1]; varying float rest_len_vec = cs->rest_lengths[r_base]; varying float dx = p1x - p0x; varying float dy = p1y - p0y; varying float dz = p1z - p0z; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float w_sum = p0m + p1m; varying float rcp_w_sum = rcp(max(w_sum, eps_vec)); varying float diff = dist - rest_len_vec; varying float strain = diff * rcp(max(rest_len_vec, eps_vec)); varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs); varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + (strain_clamped & abs_mask_ps)); varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum; varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0); varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask; px_local[0] = p0x + delta_x * p0m; py_local[0] = p0y + delta_y * p0m; pz_local[0] = p0z + delta_z * p0m; px_local[1] = p1x - delta_x * p1m; py_local[1] = p1y - delta_y * p1m; pz_local[1] = p1z - delta_z * p1m; } // Second block (i=8 to 14, mask last lane) { varying float p0x = px_local[8]; varying float p0y = py_local[8]; varying float p0z = pz_local[8]; varying float p0m = pm_local[8]; varying float p1x = px_local[9]; // Unaligned access varying float p1y = py_local[9]; varying float p1z = pz_local[9]; varying float p1m = pm_local[9]; varying float rest_len_vec = cs->rest_lengths[r_base + 8]; varying float dx = p1x - p0x; varying float dy = p1y - p0y; varying float dz = p1z - p0z; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float w_sum = p0m + p1m; varying float rcp_w_sum = rcp(max(w_sum, eps_vec)); varying float diff = dist - rest_len_vec; varying float strain = diff * rcp(max(rest_len_vec, eps_vec)); varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs); varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + (strain_clamped & abs_mask_ps)); varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum; varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0); // Mask last lane (i=15) to disable constraint apply_corr_mask = select(programIndex < 7, apply_corr_mask, 0.0); varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask; varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask; px_local[8] = p0x + delta_x * p0m; py_local[8] = p0y + delta_y * p0m; pz_local[8] = p0z + delta_z * p0m; px_local[9] = p1x - delta_x * p1m; py_local[9] = p1y - delta_y * p1m; pz_local[9] = p1z - delta_z * p1m; } } // Write back to global arrays for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { cs->p_pos_x[p_base + i + programIndex] = px_local[i + programIndex]; cs->p_pos_y[p_base + i + programIndex] = py_local[i + programIndex]; cs->p_pos_z[p_base + i + programIndex] = pz_local[i + programIndex]; } } // Step 4: Sphere Collisions with Friction // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8 for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; uniform int s_base = c * SPHERES_PER_CHAIN; uniform struct chain_cfg *cfg = &cs->chain_configs[c]; varying float friction_vec = cfg->friction; for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) { varying float sphere_x_vec = cs->sphere_x[s_base + s]; varying float sphere_y_vec = cs->sphere_y[s_base + s]; varying float sphere_z_vec = cs->sphere_z[s_base + s]; varying float sphere_r_vec = cs->sphere_radius[s_base + s]; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { varying float p_curr_x = cs->p_pos_x[p_base + i + programIndex]; varying float p_curr_y = cs->p_pos_y[p_base + i + programIndex]; varying float p_curr_z = cs->p_pos_z[p_base + i + programIndex]; varying float p_prev_x = cs->prev_p_pos_x[p_base + i + programIndex]; varying float p_prev_y = cs->prev_p_pos_y[p_base + i + programIndex]; varying float p_prev_z = cs->prev_p_pos_z[p_base + i + programIndex]; varying float p_inv_mass = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float dx = p_curr_x - sphere_x_vec; varying float dy = p_curr_y - sphere_y_vec; varying float dz = p_curr_z - sphere_z_vec; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float penetration = sphere_r_vec - dist; varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0); varying float normal_x = dx * inv_dist; varying float normal_y = dy * inv_dist; varying float normal_z = dz * inv_dist; varying float normal_corr_x = normal_x * penetration; varying float normal_corr_y = normal_y * penetration; varying float normal_corr_z = normal_z * penetration; varying float vel_x = p_curr_x - p_prev_x; varying float vel_y = p_curr_y - p_prev_y; varying float vel_z = p_curr_z - p_prev_z; varying float vel_dot_normal = vel_x * normal_x + vel_y * normal_y + vel_z * normal_z; varying float tangent_x = vel_x - vel_dot_normal * normal_x; varying float tangent_y = vel_y - vel_dot_normal * normal_y; varying float tangent_z = vel_z - vel_dot_normal * normal_z; varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z; varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec)); varying float friction_mag = penetration * friction_vec; varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag; varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag; varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag; varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass; varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass; varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass; cs->p_pos_x[p_base + i + programIndex] = p_curr_x + total_corr_x; cs->p_pos_y[p_base + i + programIndex] = p_curr_y + total_corr_y; cs->p_pos_z[p_base + i + programIndex] = p_curr_z + total_corr_z; } } } // Step 5: Motion Constraints // Assumption: PARTICLES_PER_CHAIN (16) is divisible by 8 for (uniform int c = 0; c < MAX_CHAINS; c += 1) { uniform int p_base = c * PARTICLES_PER_CHAIN; for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { varying float px = cs->p_pos_x[p_base + i + programIndex]; varying float py = cs->p_pos_y[p_base + i + programIndex]; varying float pz = cs->p_pos_z[p_base + i + programIndex]; varying float pm = clamp(cs->inv_mass[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); varying float rx = cs->rest_pos_x[p_base + i + programIndex]; varying float ry = cs->rest_pos_y[p_base + i + programIndex]; varying float rz = cs->rest_pos_z[p_base + i + programIndex]; varying float radius_vec = cs->motion_radius[p_base + i + programIndex]; varying float dx = px - rx; varying float dy = py - ry; varying float dz = pz - rz; varying float dist_sq = dx * dx + dy * dy + dz * dz; varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); varying float dist = rcp(inv_dist); varying float penetration = dist - radius_vec; varying float mask = select(penetration > zero_vec, 1.0, 0.0); varying float corr_factor = penetration * inv_dist * mask; varying float delta_x = dx * corr_factor * pm; varying float delta_y = dy * corr_factor * pm; varying float delta_z = dz * corr_factor * pm; cs->p_pos_x[p_base + i + programIndex] = px - delta_x; cs->p_pos_y[p_base + i + programIndex] = py - delta_y; cs->p_pos_z[p_base + i + programIndex] = pz - delta_z; } } } -
vurtun revised this gist
Jun 29, 2025 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -257,7 +257,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); } // Compute angular velocity { __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); -
vurtun revised this gist
Jun 29, 2025 . 2 changed files with 143 additions and 122 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -94,13 +94,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { const __m256 zero_vec = _mm256_setzero_ps(); const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); @@ -134,6 +127,35 @@ void simulate_chains(struct chain_sim *cs, float dt) { float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); // Local arrays for PBD solver float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_local[PARTICLES_PER_CHAIN] = 0.0f; py_local[PARTICLES_PER_CHAIN] = 0.0f; pz_local[PARTICLES_PER_CHAIN] = 0.0f; pm_local[PARTICLES_PER_CHAIN] = 0.0f; // Stack arrays for precomputed data float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); float vel_x[MAX_CHAINS] __attribute__((aligned(32))); float vel_y[MAX_CHAINS] __attribute__((aligned(32))); float vel_z[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); // Step 0: Precomputation (SIMD, 8 chains at once) for (int c = 0; c < MAX_CHAINS; c += 8) { // Load chain quaternions @@ -255,66 +277,69 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); } // Compute angular velocity { __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); // Step 1: Compute delta quaternion (to - from) __m256 delta_qx = _mm256_sub_ps(qx, prev_qx); __m256 delta_qy = _mm256_sub_ps(qy, prev_qy); __m256 delta_qz = _mm256_sub_ps(qz, prev_qz); __m256 delta_qw = _mm256_sub_ps(qw, prev_qw); // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx); __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy); __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz); __m256 conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw)); orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw); orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw); __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy)); __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz)); __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx)); __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw)); orient_qx = _mm256_add_ps(orient_qx, cross_x); __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw)); orient_qy = _mm256_add_ps(orient_qy, cross_y); __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw)); orient_qz = _mm256_add_ps(orient_qz, cross_z); // Step 4: Compute dot product (to, from) for shortest arc check __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw)); dot = _mm256_fmadd_ps(qy, prev_qy, dot); dot = _mm256_fmadd_ps(qz, prev_qz, dot); // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), _mm256_andnot_ps(negate_mask, one_vec)); orient_qx = _mm256_mul_ps(orient_qx, negate_sign); orient_qy = _mm256_mul_ps(orient_qy, negate_sign); orient_qz = _mm256_mul_ps(orient_qz, negate_sign); // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec)); __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale); __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale); __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale); // Store results _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -103,13 +103,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { const __m256 zero_vec = _mm256_setzero_ps(); const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); @@ -264,66 +257,69 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); } // Compute angular velocity { __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); // Step 1: Compute delta quaternion (to - from) __m256 delta_qx = _mm256_sub_ps(qx, prev_qx); __m256 delta_qy = _mm256_sub_ps(qy, prev_qy); __m256 delta_qz = _mm256_sub_ps(qz, prev_qz); __m256 delta_qw = _mm256_sub_ps(qw, prev_qw); // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx); __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy); __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz); __m256 conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw)); orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw); orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw); __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy)); __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz)); __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx)); __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw)); orient_qx = _mm256_add_ps(orient_qx, cross_x); __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw)); orient_qy = _mm256_add_ps(orient_qy, cross_y); __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw)); orient_qz = _mm256_add_ps(orient_qz, cross_z); // Step 4: Compute dot product (to, from) for shortest arc check __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw)); dot = _mm256_fmadd_ps(qy, prev_qy, dot); dot = _mm256_fmadd_ps(qz, prev_qz, dot); // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), _mm256_andnot_ps(negate_mask, one_vec)); orient_qx = _mm256_mul_ps(orient_qx, negate_sign); orient_qy = _mm256_mul_ps(orient_qy, negate_sign); orient_qz = _mm256_mul_ps(orient_qz, negate_sign); // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec)); __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale); __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale); __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale); // Store results _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); -
vurtun revised this gist
Jun 23, 2025 . 1 changed file with 1068 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,1068 @@ #include <immintrin.h> #include <math.h> enum { MAX_CHAINS = 64, PARTICLES_PER_CHAIN = 16, SPHERES_PER_CHAIN = 8, MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN), CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN, MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN), MAX_ITERATIONS = 4, }; struct chain_cfg { float damping; float friction; float collision_radius; float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force float avbd_alpha; // Regularization parameter (e.g., 0.95f) float avbd_beta; // Stiffness ramping control (e.g., 10.0f) float avbd_gamma; // Warm starting scaling (e.g., 0.99f) float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f) float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f) } __attribute__((aligned(32))); struct chain_sim { unsigned short free_idx_cnt; unsigned short free_idx[MAX_CHAINS]; struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); // Chain global forces (world space gravity and wind) float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); float wind_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_z[MAX_CHAINS] __attribute__((aligned(32))); // Chain transformations (world space) float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); // Particle positions (model space) float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32))); float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32))); float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32))); // Sphere positions (model space) float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32))); float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32))); float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32))); // Rest positions (model space) float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); float motion_radius[MAX_PARTICLES] __attribute__((aligned(32))); }; void simulate_chains(struct chain_sim *cs, float dt) { // Common SIMD constants const __m256 dt_vec = _mm256_set1_ps(dt); const __m256 dt2_vec = _mm256_set1_ps(dt * dt); const __m256 one_vec = _mm256_set1_ps(1.0f); const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); const __m256 eps_vec = _mm256_set1_ps(1e-6f); const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f); const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f); const __m256 zero_vec = _mm256_setzero_ps(); const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 half_vec = _mm256_set1_ps(0.5f); const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f); const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f); const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f); const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f); const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f); const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // Local arrays for PBD solver float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_local[PARTICLES_PER_CHAIN] = 0.0f; py_local[PARTICLES_PER_CHAIN] = 0.0f; pz_local[PARTICLES_PER_CHAIN] = 0.0f; pm_local[PARTICLES_PER_CHAIN] = 0.0f; // Stack arrays for precomputed data float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); float vel_x[MAX_CHAINS] __attribute__((aligned(32))); float vel_y[MAX_CHAINS] __attribute__((aligned(32))); float vel_z[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); // Step 0: Precomputation (SIMD, 8 chains at once) for (int c = 0; c < MAX_CHAINS; c += 8) { // Load chain quaternions __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); // Compute local-space wind { __m256 vx = _mm256_load_ps(&cs->wind_x[c]); __m256 vy = _mm256_load_ps(&cs->wind_y[c]); __m256 vz = _mm256_load_ps(&cs->wind_z[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: result_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // result_x = tw*qx + tx*qw + ty*qz - tz*qy // result_y = tw*qy + ty*qw + tz*qx - tx*qz // result_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); _mm256_store_ps(&wind_local_x[c], result_x); _mm256_store_ps(&wind_local_y[c], result_y); _mm256_store_ps(&wind_local_z[c], result_z); } // Compute local-space gravity { __m256 vx = _mm256_load_ps(&cs->gravity_x[c]); __m256 vy = _mm256_load_ps(&cs->gravity_y[c]); __m256 vz = _mm256_load_ps(&cs->gravity_z[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: result_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // result_x = tw*qx + tx*qw + ty*qz - tz*qy // result_y = tw*qy + ty*qw + tz*qx - tx*qz // result_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); _mm256_store_ps(&gravity_local_x[c], result_x); _mm256_store_ps(&gravity_local_y[c], result_y); _mm256_store_ps(&gravity_local_z[c], result_z); } // Compute linear velocity { __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); _mm256_store_ps(&vel_x[c], vel_x_vec); _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); } // Compute angular velocity (SIMD with polynomial approximations) { __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); // Quaternion inverse __m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy)); norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq); norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq); __m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec)); __m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq); __m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq); __m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq); __m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq); // Delta quaternion: q_current * q_prev^-1 __m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx)); delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx)); __m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy)); delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy)); __m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz)); delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz)); __m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw)); delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw); delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw); // Angular velocity with polynomial approximations __m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec); // Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))) __m256 x = qw_clamped; __m256 x2 = _mm256_mul_ps(x, x); __m256 x4 = _mm256_mul_ps(x2, x2); __m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec); poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec); poly = _mm256_fmadd_ps(x2, poly, one_vec); poly = _mm256_mul_ps(x, poly); __m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly)); // Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2)) __m256 half_angle = _mm256_mul_ps(angle, half_vec); __m256 ha2 = _mm256_mul_ps(half_angle, half_angle); __m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec); sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec); __m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly); __m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec)); // Sign: Use qw > 0 instead of angle < π __m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ); sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec)); // Angular velocity: (sign * qx * inv_sin_half_angle / dt) __m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)); __m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale); __m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale); __m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale); _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHAINS; c++) { int base_idx = c * PARTICLES_PER_CHAIN; struct chain_cfg *cfg = &cs->chain_configs[c]; // Load precomputed velocities and inertia parameters __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); // Clamp inertia parameters to [0, 1] linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); __m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); __m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); // Angular inertia: (ω × r) * dt * angular_inertia __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); // Calculate displacement: (ω × (ω × r)) * dt^2 __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec); __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec); // Apply the centrifugal_inertia factor __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec); __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); // Total delta __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); // Update positions _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); } // Update previous transformation cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHAINS; c++) { int base_idx = c * PARTICLES_PER_CHAIN; struct chain_cfg *cfg = &cs->chain_configs[c]; __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); __m256 damping_vec = _mm256_set1_ps(cfg->damping); for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]); __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]); __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]); __m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec); __m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec); __m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec); __m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass); __m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass); __m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass); __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); _mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x); _mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y); _mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z); _mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x); _mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y); _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); } } // --- AVBD Warm-starting of lambda and k_current for all distance constraints. for (int c = 0; c < MAX_CHAINS; ++c) { struct chain_cfg* cfg = &cs->chain_configs[c]; __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); // Loop through each block of 8 constraints for warm-starting int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) { // Load current lambda, k_current for 8 constraints simultaneously __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]); __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]); // Apply warm-starting (Eq. 19 from paper) // lambda_new = lambda_prev_frame * alpha * gamma current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); // k_current_new = max(K_START, k_current_prev_frame * gamma) current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); // Store updated warm-started values back _mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec); _mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec); } } // Warm-starting for Collision Constraints --- for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { struct chain_cfg* cfg = &cs->chain_configs[c_idx]; __m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); __m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); __m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { __m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); __m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); _mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec); _mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec); } } for (int c = 0; c < MAX_CHAINS; c++) { struct chain_cfg *cfg = &cs->chain_configs[c]; __m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha); __m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta); __m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max); int p_base = c * PARTICLES_PER_CHAIN; int r_base = c * CONSTRAINTS_PER_CHAIN; // Load all current particle positions into local buffer (once per chain per iteration) for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { _mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i])); _mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i])); _mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i])); _mm256_store_ps(&pm_local[i], _mm256_load_ps(&cs->inv_mass[p_base + i])); _mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&pm_local[i]), inv_mass_clamp_max), inv_mass_clamp_min)); } for (int iter = 0; iter < MAX_ITERATIONS; iter++) { // First block (i=0 to 7 of particles, processing constraints 0 to 7) { // p0 is aligned from px_local[0] __m256 p0x = _mm256_load_ps(&px_local[0]); __m256 p0y = _mm256_load_ps(&py_local[0]); __m256 p0z = _mm256_load_ps(&pz_local[0]); __m256 p0m = _mm256_load_ps(&pm_local[0]); // p1 is unaligned from px_local[1] __m256 p1x = _mm256_loadu_ps(&px_local[1]); __m256 p1y = _mm256_loadu_ps(&py_local[1]); __m256 p1z = _mm256_loadu_ps(&pz_local[1]); __m256 p1m = _mm256_loadu_ps(&pm_local[1]); // This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7] __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]); // Calculate inverse sum of masses (rcp_w_sum) __m256 w_sum = _mm256_add_ps(p0m, p1m); __m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); __m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); // Calculate current vector and distance between particles __m256 dx_vec = _mm256_sub_ps(p1x, p0x); __m256 dy_vec = _mm256_sub_ps(p1y, p0y); __m256 dz_vec = _mm256_sub_ps(p1z, p0z); __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist __m256 dist = _mm256_rcp_ps(inv_dist); // Current distance // Primal Update for Distance Constraints (Block 1) --- // Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7) __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]); __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]); __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]); // Original constraint error C_j*(x) = current_dist - target_length __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) // The effective displacement is proportional to this force divided by effective stiffness (k) and mass. __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); // Calculate the scalar part of the correction, similar to original 'corr_scalar_part' // This is simplified, representing the desired displacement due to the augmented force __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor // Normalize difference vector __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); // Calculate the actual displacement vectors // Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1 // The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction. // So, we just multiply by the unit direction and inverse mass. __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); // Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0) __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); delta_x = _mm256_and_ps(delta_x, apply_corr_mask); delta_y = _mm256_and_ps(delta_y, apply_corr_mask); delta_z = _mm256_and_ps(delta_z, apply_corr_mask); // Update positions in local buffer // p0 moves along -delta (subtract), p1 moves along +delta (add) _mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); _mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); _mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); _mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); _mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m))); _mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m))); // Dual Update for Distance Constraints (Block 1) --- // Recalculate constraint error *after* position updates for accurate lambda update // Reload updated positions from local buffer to get the 'post-primal' state p0x = _mm256_load_ps(&px_local[0]); p0y = _mm256_load_ps(&py_local[0]); p0z = _mm256_load_ps(&pz_local[0]); p1x = _mm256_loadu_ps(&px_local[1]); p1y = _mm256_loadu_ps(&py_local[1]); p1z = _mm256_loadu_ps(&pz_local[1]); __m256 updated_dx = _mm256_sub_ps(p1x, p0x); __m256 updated_dy = _mm256_sub_ps(p1y, p0y); __m256 updated_dz = _mm256_sub_ps(p1z, p0z); __m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); __m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); __m256 updated_dist = _mm256_rcp_ps(updated_inv_dist); __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); _mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back // Update k_current (Stiffness ramping, Eq. 12) // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); _mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back } // Second block (i=8 to 14 of particles, processing constraints 8 to 14) { // p0 is aligned from px_local[8] __m256 p0x = _mm256_load_ps(&px_local[8]); __m256 p0y = _mm256_load_ps(&py_local[8]); __m256 p0z = _mm256_load_ps(&pz_local[8]); __m256 p0m = _mm256_load_ps(&pm_local[8]); // p1 is unaligned from px_local[9], *including* the dummy element at px_local[16] // when i=15, p_base+i+1 becomes p_base+16 __m256 p1x = _mm256_loadu_ps(&px_local[9]); __m256 p1y = _mm256_loadu_ps(&py_local[9]); __m256 p1z = _mm256_loadu_ps(&pz_local[9]); __m256 p1m = _mm256_loadu_ps(&pm_local[9]); // This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15] __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); __m256 dx_vec = _mm256_sub_ps(p1x, p0x); __m256 dy_vec = _mm256_sub_ps(p1y, p0y); __m256 dz_vec = _mm256_sub_ps(p1z, p0z); __m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 w_sum = _mm256_add_ps(p0m, p1m); __m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); __m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); __m256 dist = _mm256_rcp_ps(inv_dist); // Primal Update for Distance Constraints (Block 2) --- // Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15) __m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]); __m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]); __m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]); // Original constraint error C_j*(x) = current_dist - target_length __m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) __m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); // Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) __m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); // Calculate the scalar part of the correction, similar to original 'corr_scalar_part' __m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); // This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle // for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero. // For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15. // The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy). // The mask effectively disables the constraint correction for the dummy particle. __attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f }; __m256 valid_mask = _mm256_load_ps(valid_mask_array); apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); __m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); __m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); __m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); __m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); __m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); __m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); delta_x = _mm256_and_ps(delta_x, apply_corr_mask); delta_y = _mm256_and_ps(delta_y, apply_corr_mask); delta_z = _mm256_and_ps(delta_z, apply_corr_mask); // Update positions in local buffer _mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); _mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); _mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); _mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); _mm256_storeu_ps(&py_local[9], _mm256_add_ps(py1, _mm256_mul_ps(delta_y, p1m))); _mm256_storeu_ps(&pz_local[9], _mm256_add_ps(pz1, _mm256_mul_ps(delta_z, p1m))); // Recalculate constraint error *after* position updates for accurate lambda update // Reload updated positions from local buffer p0x = _mm256_load_ps(&px_local[8]); p0y = _mm256_load_ps(&py_local[8]); p0z = _mm256_load_ps(&pz_local[8]); p1x = _mm256_loadu_ps(&px_local[9]); p1y = _mm256_loadu_ps(&py_local[9]); p1z = _mm256_loadu_ps(&pz_local[9]); updated_dx = _mm256_sub_ps(p1x, p0x); updated_dy = _mm256_sub_ps(p1y, p0y); updated_dz = _mm256_sub_ps(p1z, p0z); updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); updated_dist = _mm256_rcp_ps(updated_inv_dist); __m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) __m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); _mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back // Update k_current (Stiffness ramping, Eq. 12) // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) __m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask __m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); __m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); _mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back } // End of second block } // After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i])); _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i])); _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i])); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHAINS; c++) { int p_base = c * PARTICLES_PER_CHAIN; int s_base = c * SPHERES_PER_CHAIN; struct chain_cfg *cfg = &cs->chain_configs[c]; __m256 friction_vec = _mm256_set1_ps(cfg->friction); for (int s = 0; s < SPHERES_PER_CHAIN; s++) { __m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]); __m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]); __m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]); __m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]); __m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha); __m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta); __m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max); for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]); p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); __m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); __m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); __m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_rcp_ps(inv_dist); __m256 penetration = _mm256_sub_ps(sphere_r_vec, dist); __m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); __m256 normal_x = _mm256_mul_ps(dx, inv_dist); __m256 normal_y = _mm256_mul_ps(dy, inv_dist); __m256 normal_z = _mm256_mul_ps(dz, inv_dist); // Load AVBD state for this particle block __m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); __m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); __m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]); // AVBD Primal Update: // Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) __m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec)); // Apply collision_mask to regularized error, so only active constraints contribute. // This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0. C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask); // Calculate force term: (k * C_reg + lambda) __m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec); // Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current // For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))). // H_j_inv for a single particle collision is its inverse mass (p_inv_mass). // So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current // The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration). __m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec); // Apply the correction along the normal direction. // The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term') // means positive 'correction_magnitude_scalar' should push OUT of collision. // So we subtract from particle position. __m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar); __m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar); __m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar); // Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration) p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x); p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y); p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z); _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); // Apply friction (post-stabilization step) // Recalculate velocity based on updated positions __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); // Project velocity onto normal to get normal component __m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x); vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal); vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal); // Get tangential velocity component __m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x)); __m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y)); __m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z)); __m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x); tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq); tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq); __m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); // Calculate friction force magnitude limit (based on normal force proxy) __m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec); // Actual tangential velocity length __m256 actual_tangent_mag = _mm256_mul_ps(_mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)), one_vec); // sqrt(tangent_mag_sq) is needed here // Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down. __m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec)); friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction) // Apply friction as a reduction of tangential velocity, converting to position change // The idea is to reduce the tangential displacement that happened *this* iteration. // The original `vel_x/y/z` are already displacements from `p_prev` to `p_curr`. // So, the tangential displacement is `tangent_x/y/z`. // We want to reduce this by `(1 - friction_scale)`. __m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale); __m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale); __m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale); // Only apply if actively colliding friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask); friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask); friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask); // Apply friction by moving the particle along the tangential direction. // If friction_scale is 0 (no friction), no change. If 1 (full friction), moves back to p_prev. p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr); p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr); p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr); _mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); _mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); _mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); // Dual Update for Collision Constraints --- // Recalculate collision error *after* position updates (normal & friction) // Reload updated positions p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); dist = _mm256_rcp_ps(inv_dist); __m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update // Apply the collision mask again for updates, so only active constraints update their AVBD state C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask); // Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) __m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec); _mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec); // Update k_current (Stiffness ramping, Eq. 12) // k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) __m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value __m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec); __m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec); _mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec); // --- END AVBD ADDITION --- } } } // Step 5: Motion Constraints for (int c = 0; c < MAX_CHAINS; c++) { int p_base = c * PARTICLES_PER_CHAIN; for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); __m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); __m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); __m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); __m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); __m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); __m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); __m256 dx = _mm256_sub_ps(px, rx); __m256 dy = _mm256_sub_ps(py, ry); __m256 dz = _mm256_sub_ps(pz, rz); __m256 dist_sq = _mm256_mul_ps(dx, dx); dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_rcp_ps(inv_dist); __m256 penetration = _mm256_sub_ps(dist, radius_vec); __m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); __m256 corr_factor = _mm256_mul_ps(penetration, inv_dist); corr_factor = _mm256_and_ps(corr_factor, mask); __m256 delta_x = _mm256_mul_ps(dx, corr_factor); __m256 delta_y = _mm256_mul_ps(dy, corr_factor); __m256 delta_z = _mm256_mul_ps(dz, corr_factor); delta_x = _mm256_mul_ps(delta_x, pm); delta_y = _mm256_mul_ps(delta_y, pm); delta_z = _mm256_mul_ps(delta_z, pm); _mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x)); _mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y)); _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); } } // Store Final Errors for Next Frame's Regularization --- // This happens once at the very end of simulate_chains, after all iterations are complete for the frame. for (int c = 0; c < MAX_CHAINS; ++c) { int p_base = c * PARTICLES_PER_CHAIN; int r_base = c * CONSTRAINTS_PER_CHAIN; // Loop through each block of 8 constraints (distance constraints are N-1 for N particles) for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) { // Calculate the current error for this distance constraint block // (Similar to C_x_original calculation from Step 3, but using final positions) // Load final positions for p0 and p1 (from global arrays now, as local buffer is stale) __m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]); __m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]); __m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]); __m256 px1 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 1]); // Use _mm256_load_ps as this is from global __m256 py1 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 1]); __m256 pz1 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 1]); __m256 diff_x = _mm256_sub_ps(px1, px0); __m256 diff_y = _mm256_sub_ps(py1, py0); __m256 diff_z = _mm256_sub_ps(pz1, pz0); __m256 dist_sq = _mm256_mul_ps(diff_x, diff_x); dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq); dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq); __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); __m256 dist = _mm256_rcp_ps(inv_dist); __m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]); __m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec); _mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization } } for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { int p_base = c_idx * PARTICLES_PER_CHAIN; int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); __m256 deepest_penetration_vec = neg_flt_max_vec; // Initialize with negative infinity // Loop through all spheres for this chain to find the max penetration for each particle in the block for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) { __m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]); __m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]); __m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]); __m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]); __m256 dx_s = _mm256_sub_ps(p_curr_x, s_x); __m256 dy_s = _mm256_sub_ps(p_curr_y, s_y); __m256 dz_s = _mm256_sub_ps(p_curr_z, s_z); __m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s); dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s); dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s); __m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec)); __m256 dist_s = _mm256_rcp_ps(inv_dist_s); __m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance // We only care about positive penetration (i.e., actually colliding) current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s); deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s); } _mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization } } } -
vurtun revised this gist
Jun 6, 2025 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -691,9 +691,9 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag); __m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag); __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x); __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y); __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z); total_corr_x = _mm256_and_ps(total_corr_x, collision_mask); total_corr_y = _mm256_and_ps(total_corr_y, collision_mask); -
vurtun revised this gist
Jun 3, 2025 . 1 changed file with 107 additions and 55 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,8 +13,6 @@ enum { }; struct chain_cfg { float damping; float stiffness; float stretch_factor; @@ -31,6 +29,15 @@ struct chain_sim { unsigned short free_idx[MAX_CHAINS]; struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); // Chain global forces (world space gravity and wind) float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); float wind_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_z[MAX_CHAINS] __attribute__((aligned(32))); // Chain transformations (world space) float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); @@ -96,6 +103,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // Local arrays for PBD solver float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); @@ -135,53 +143,95 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); // Compute local-space wind { __m256 vx = _mm256_load_ps(&cs->wind_x[c]); __m256 vy = _mm256_load_ps(&cs->wind_y[c]); __m256 vz = _mm256_load_ps(&cs->wind_z[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: result_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // result_x = tw*qx + tx*qw + ty*qz - tz*qy // result_y = tw*qy + ty*qw + tz*qx - tx*qz // result_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); _mm256_store_ps(&wind_local_x[c], result_x); _mm256_store_ps(&wind_local_y[c], result_y); _mm256_store_ps(&wind_local_z[c], result_z); } // Compute local-space gravity { __m256 vx = _mm256_load_ps(&cs->gravity_x[c]); __m256 vy = _mm256_load_ps(&cs->gravity_y[c]); __m256 vz = _mm256_load_ps(&cs->gravity_z[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: result_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // result_x = tw*qx + tx*qw + ty*qz - tz*qy // result_y = tw*qy + ty*qw + tz*qx - tx*qz // result_z = tw*qz + ty*qw + tx*qy - ty*qx __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); _mm256_store_ps(&gravity_local_x[c], result_x); _mm256_store_ps(&gravity_local_y[c], result_y); @@ -323,13 +373,15 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); // Calculate displacement: (ω × (ω × r)) * dt^2 __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec); __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec); __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec); // Apply the centrifugal_inertia factor __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec); __m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); __m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); // Total delta __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); @@ -464,7 +516,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); @@ -526,7 +578,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); __m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); __m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); __m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); @@ -535,7 +587,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); __attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f }; __m256 valid_mask = _mm256_load_ps(valid_mask_array); apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); @@ -639,9 +691,9 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag); __m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag); __m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x); __m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y); __m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z); total_corr_x = _mm256_and_ps(total_corr_x, collision_mask); total_corr_y = _mm256_and_ps(total_corr_y, collision_mask); @@ -707,4 +759,4 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); } } } -
vurtun revised this gist
May 20, 2025 . 1 changed file with 4 additions and 12 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -35,13 +35,16 @@ struct chain_sim { float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); @@ -52,6 +55,7 @@ struct chain_sim { float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); @@ -129,7 +133,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); // Compute local-space wind { __m256 vx = _mm256_set1_ps(cs->chain_configs[c].wind_x); @@ -157,7 +160,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&wind_local_y[c], result_y); _mm256_store_ps(&wind_local_z[c], result_z); } // Compute local-space gravity { __m256 vx = _mm256_set1_ps(cs->chain_configs[c].gravity_x); @@ -185,7 +187,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&gravity_local_y[c], result_y); _mm256_store_ps(&gravity_local_z[c], result_z); } // Compute linear velocity { __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); @@ -204,7 +205,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&vel_y[c], vel_y_vec); _mm256_store_ps(&vel_z[c], vel_z_vec); } // Compute angular velocity (SIMD with polynomial approximations) { __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); @@ -270,7 +270,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHAINS; c++) { int base_idx = c * PARTICLES_PER_CHAIN; @@ -342,7 +341,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); _mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); } // Update previous transformation cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; @@ -353,7 +351,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHAINS; c++) { int base_idx = c * PARTICLES_PER_CHAIN; @@ -412,7 +409,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); } } // Step 3: Distance Constraints for (int c = 0; c < MAX_CHAINS; c++) { int p_base = c * PARTICLES_PER_CHAIN; @@ -435,7 +431,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&pz_local[i], pz); _mm256_store_ps(&pm_local[i], pm); } for (int iter = 0; iter < MAX_ITERATIONS; iter++) { // First block (i=0 to 7) { @@ -498,7 +493,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y)); _mm256_store_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z)); } // Second block (i=8 to 14) { __m256 p0x = _mm256_load_ps(&px_local[8]); @@ -566,7 +560,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z)); } } for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 px = _mm256_load_ps(&px_local[i]); __m256 py = _mm256_load_ps(&py_local[i]); @@ -577,7 +570,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { _mm256_store_ps(&cs->p_pos_z[p_base + i], pz); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHAINS; c++) { int p_base = c * PARTICLES_PER_CHAIN; -
vurtun revised this gist
May 20, 2025 . 1 changed file with 13 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -109,12 +109,15 @@ void simulate_chains(struct chain_sim *cs, float dt) { float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); float vel_x[MAX_CHAINS] __attribute__((aligned(32))); float vel_y[MAX_CHAINS] __attribute__((aligned(32))); float vel_z[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); @@ -188,6 +191,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); @@ -276,9 +280,11 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); @@ -304,6 +310,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); @@ -320,6 +327,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 cent_delta_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), p_inv_mass); __m256 cent_delta_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), p_inv_mass); __m256 cent_delta_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), p_inv_mass); cent_delta_x = _mm256_mul_ps(cent_delta_x, centrifugal_inertia_vec); cent_delta_y = _mm256_mul_ps(cent_delta_y, centrifugal_inertia_vec); cent_delta_z = _mm256_mul_ps(cent_delta_z, centrifugal_inertia_vec); @@ -339,6 +347,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; @@ -353,6 +362,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); @@ -495,6 +505,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p0y = _mm256_load_ps(&py_local[8]); __m256 p0z = _mm256_load_ps(&pz_local[8]); __m256 p0m = _mm256_load_ps(&pm_local[8]); __m256 p1x = _mm256_loadu_ps(&px_local[9]); __m256 p1y = _mm256_loadu_ps(&py_local[9]); __m256 p1z = _mm256_loadu_ps(&pz_local[9]); @@ -560,6 +571,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 px = _mm256_load_ps(&px_local[i]); __m256 py = _mm256_load_ps(&py_local[i]); __m256 pz = _mm256_load_ps(&pz_local[i]); _mm256_store_ps(&cs->p_pos_x[p_base + i], px); _mm256_store_ps(&cs->p_pos_y[p_base + i], py); _mm256_store_ps(&cs->p_pos_z[p_base + i], pz); @@ -583,6 +595,7 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); -
vurtun revised this gist
May 20, 2025 . No changes.There are no files selected for viewing
-
vurtun revised this gist
May 20, 2025 . 1 changed file with 40 additions and 31 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -21,6 +21,9 @@ struct chain_cfg { float max_strain; float friction; float collision_radius; float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force } __attribute__((aligned(32))); struct chain_sim { @@ -32,16 +35,13 @@ struct chain_sim { float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); @@ -85,11 +85,13 @@ void simulate_chains(struct chain_sim *cs, float dt) { const __m256 two_vec = _mm256_set1_ps(2.0f); const __m256 half_vec = _mm256_set1_ps(0.5f); const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f); const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f); const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f); const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f); const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f); const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); // Local arrays for PBD solver float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); @@ -107,15 +109,12 @@ void simulate_chains(struct chain_sim *cs, float dt) { float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); float vel_x[MAX_CHAINS] __attribute__((aligned(32))); float vel_y[MAX_CHAINS] __attribute__((aligned(32))); float vel_z[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); @@ -130,12 +129,10 @@ void simulate_chains(struct chain_sim *cs, float dt) { // Compute local-space wind { __m256 vx = _mm256_set1_ps(cs->chain_configs[c].wind_x); __m256 vy = _mm256_set1_ps(cs->chain_configs[c].wind_y); __m256 vz = _mm256_set1_ps(cs->chain_configs[c].wind_z); __m256 tx = _mm256_fmadd_ps(qy, vz, _mm256_mul_ps(qw, vx)); tx = _mm256_fnmadd_ps(qz, vy, tx); __m256 ty = _mm256_fmadd_ps(qz, vx, _mm256_mul_ps(qw, vy)); @@ -191,7 +188,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); @@ -271,18 +267,26 @@ void simulate_chains(struct chain_sim *cs, float dt) { } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHAINS; c++) { int base_idx = c * PARTICLES_PER_CHAIN; struct chain_cfg *cfg = &cs->chain_configs[c]; // Load precomputed velocities and inertia parameters __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); // Clamp inertia parameters to [0, 1] linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); @@ -291,12 +295,20 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); // Angular inertia: (ω × r) * dt * angular_inertia __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); @@ -308,11 +320,14 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 cent_delta_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), p_inv_mass); __m256 cent_delta_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), p_inv_mass); __m256 cent_delta_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), p_inv_mass); cent_delta_x = _mm256_mul_ps(cent_delta_x, centrifugal_inertia_vec); cent_delta_y = _mm256_mul_ps(cent_delta_y, centrifugal_inertia_vec); cent_delta_z = _mm256_mul_ps(cent_delta_z, centrifugal_inertia_vec); // Total delta __m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); __m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); __m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); // Update positions _mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); @@ -324,7 +339,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; @@ -339,7 +353,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); __m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); __m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); __m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); __m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); __m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); @@ -420,7 +433,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p0y = _mm256_load_ps(&py_local[0]); __m256 p0z = _mm256_load_ps(&pz_local[0]); __m256 p0m = _mm256_load_ps(&pm_local[0]); __m256 p1x = _mm256_loadu_ps(&px_local[1]); __m256 p1y = _mm256_loadu_ps(&py_local[1]); __m256 p1z = _mm256_loadu_ps(&pz_local[1]); @@ -483,7 +495,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p0y = _mm256_load_ps(&py_local[8]); __m256 p0z = _mm256_load_ps(&pz_local[8]); __m256 p0m = _mm256_load_ps(&pm_local[8]); __m256 p1x = _mm256_loadu_ps(&px_local[9]); __m256 p1y = _mm256_loadu_ps(&py_local[9]); __m256 p1z = _mm256_loadu_ps(&pz_local[9]); @@ -549,7 +560,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 px = _mm256_load_ps(&px_local[i]); __m256 py = _mm256_load_ps(&py_local[i]); __m256 pz = _mm256_load_ps(&pz_local[i]); _mm256_store_ps(&cs->p_pos_x[p_base + i], px); _mm256_store_ps(&cs->p_pos_y[p_base + i], py); _mm256_store_ps(&cs->p_pos_z[p_base + i], pz); @@ -573,7 +583,6 @@ void simulate_chains(struct chain_sim *cs, float dt) { __m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); __m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); __m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); __m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); __m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); __m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
NewerOlder