Last active
October 28, 2025 21:55
-
-
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
chain simulation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <immintrin.h> | |
| #include <math.h> | |
| #include <assert.h> | |
| #include <string.h> | |
| #include <float.h> | |
| #define MAX_MOTION_RADIUS 1000000.0f | |
enum {
    MAX_CHNS = 128,                          // maximum number of simultaneously active chains
    PTC_PER_CHN = 16,                        // particles per chain (fixed, SIMD-friendly: 2 x 8 lanes)
    SPH_PER_CHN = 8,                         // collision spheres per chain (one 8-wide SIMD register)
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),      // total particle slots across all chains
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),      // total sphere slots across all chains
    CON_PER_CHN = PTC_PER_CHN,               // distance-constraint slots per chain (last slot is a dummy)
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), // total rest-length/lambda slots
    MAX_ITR = 16,                            // presumably the solver iteration cap — not used in visible code, confirm in chn_sim_run
};
// Per-chain simulation tuning parameters. All values are normalized to [0, 1];
// the inertia factors blend between "no effect" (0) and physically correct (1).
struct chn_cfg {
    float damping;             // [0, 1] velocity damping applied during integration
    float stiffness;           // [0, 1] distance-constraint stiffness
    float friction;            // [0, 1] sphere-contact friction
    float linear_inertia;      // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia;     // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
    float bend_stiffness;      // [0, 1], bending resistance
};
// Whole-simulator state in structure-of-arrays layout. Every float array is
// 32-byte aligned so it can be read/written with aligned AVX loads/stores.
// Chain c owns particle slots [c*PTC_PER_CHN, c*PTC_PER_CHN+15], sphere slots
// [c*SPH_PER_CHN, c*SPH_PER_CHN+7] and constraint slots
// [c*CON_PER_CHN, c*CON_PER_CHN+15].
struct chn_sim {
    unsigned char free_idx_cnt;          // number of valid entries in free_idx (free chain slots)
    unsigned char free_idx[MAX_CHNS];    // stack of free chain indices; popped from the top on add
    struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
    // chain global forces (world space gravity and wind)
    float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
    // chain transformations (world space); prev_* hold last step's transform
    // so the solver can derive linear/angular velocity by finite difference
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
    // particle positions (model space, transposed: 0-7 even indices, 8-15 odd indices)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));  // 0 pins a particle
    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // Transposed: even cons 0-7, odd 8-15 (last dummy)
    // XPBD lambdas for distance constraints (transposed similarly)
    float ptc_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
    float ptc_bend_rest[MAX_REST_LEN] __attribute__((aligned(32)));   // rest distance between particles two apart
    float ptc_bend_lambda[MAX_REST_LEN] __attribute__((aligned(32)));
    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));
    // rest positions (model space, transposed)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32))); // max allowed distance from rest pose; MAX_MOTION_RADIUS = unconstrained
};
| static int | |
| chn_sim__val(struct chn_sim *cns) { | |
| assert(cns); | |
| assert(cns->free_idx_cnt <= MAX_CHNS); | |
| for (int i = 0; i < cns->free_idx_cnt; ++i) { | |
| assert(i < MAX_CHNS); | |
| assert(cns->free_idx[i] < MAX_CHNS); | |
| } | |
| for (int i = 0; i < MAX_CHNS; ++i) { | |
| const struct chn_cfg *cfg = &cns->chn_cfg[i]; | |
| assert(cfg->damping >= 0.0f); | |
| assert(cfg->damping <= 1.0f); | |
| assert(cfg->stiffness >= 0.0f); | |
| assert(cfg->stiffness <= 1.0f); | |
| assert(cfg->friction >= 0.0f); | |
| assert(cfg->friction <= 1.0f); | |
| assert(cfg->linear_inertia >= 0.0f); | |
| assert(cfg->linear_inertia <= 1.0f); | |
| assert(cfg->angular_inertia >= 0.0f); | |
| assert(cfg->angular_inertia <= 1.0f); | |
| assert(cfg->centrifugal_inertia >= 0.0f); | |
| assert(cfg->centrifugal_inertia <= 1.0f); | |
| assert(cfg->bend_stiffness >= 0.0f); | |
| assert(cfg->bend_stiffness <= 1.0f); | |
| assert(isfinite(cns->chn_grav_x[i])); | |
| assert(isfinite(cns->chn_grav_y[i])); | |
| assert(isfinite(cns->chn_grav_z[i])); | |
| assert(isfinite(cns->chn_wnd_x[i])); | |
| assert(isfinite(cns->chn_wnd_y[i])); | |
| assert(isfinite(cns->chn_wnd_z[i])); | |
| assert(isfinite(cns->chn_pos_x[i])); | |
| assert(isfinite(cns->chn_pos_y[i])); | |
| assert(isfinite(cns->chn_pos_z[i])); | |
| assert(isfinite(cns->chn_quat_x[i])); | |
| assert(isfinite(cns->chn_quat_y[i])); | |
| assert(isfinite(cns->chn_quat_z[i])); | |
| assert(isfinite(cns->chn_quat_w[i])); | |
| // Quaternion normalization invariant | |
| float qdot = cns->chn_quat_x[i]*cns->chn_quat_x[i] + | |
| cns->chn_quat_y[i]*cns->chn_quat_y[i] + | |
| cns->chn_quat_z[i]*cns->chn_quat_z[i] + | |
| cns->chn_quat_w[i]*cns->chn_quat_w[i]; | |
| assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); | |
| assert(isfinite(cns->chn_prev_pos_x[i])); | |
| assert(isfinite(cns->chn_prev_pos_y[i])); | |
| assert(isfinite(cns->chn_prev_pos_z[i])); | |
| assert(isfinite(cns->chn_prev_quat_x[i])); | |
| assert(isfinite(cns->chn_prev_quat_y[i])); | |
| assert(isfinite(cns->chn_prev_quat_z[i])); | |
| assert(isfinite(cns->chn_prev_quat_w[i])); | |
| // Prev quat norm | |
| qdot = cns->chn_prev_quat_x[i]*cns->chn_prev_quat_x[i] + | |
| cns->chn_prev_quat_y[i]*cns->chn_prev_quat_y[i] + | |
| cns->chn_prev_quat_z[i]*cns->chn_prev_quat_z[i] + | |
| cns->chn_prev_quat_w[i]*cns->chn_prev_quat_w[i]; | |
| assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); | |
| } | |
| for (int i = 0; i < MAX_PTC; ++i) { | |
| assert(i < MAX_PTC); | |
| assert(cns->ptc_rest_len[i] >= 0.0f); | |
| assert(cns->motion_radius[i] >= 0.0f); | |
| assert(cns->motion_radius[i] <= MAX_MOTION_RADIUS); | |
| assert(isfinite(cns->ptc_pos_x[i])); | |
| assert(isfinite(cns->ptc_pos_y[i])); | |
| assert(isfinite(cns->ptc_pos_z[i])); | |
| assert(isfinite(cns->ptc_inv_mass[i])); | |
| assert(cns->ptc_inv_mass[i] >= 0.0f); | |
| assert(isfinite(cns->ptc_prev_pos_x[i])); | |
| assert(isfinite(cns->ptc_prev_pos_y[i])); | |
| assert(isfinite(cns->ptc_prev_pos_z[i])); | |
| assert(isfinite(cns->rest_pos_x[i])); | |
| assert(isfinite(cns->rest_pos_y[i])); | |
| assert(isfinite(cns->rest_pos_z[i])); | |
| assert(isfinite(cns->motion_radius[i])); | |
| // Spot-check motion constraint post-any update (sample every 8th for perf) | |
| float dx = cns->ptc_pos_x[i] - cns->rest_pos_x[i]; | |
| float dy = cns->ptc_pos_y[i] - cns->rest_pos_y[i]; | |
| float dz = cns->ptc_pos_z[i] - cns->rest_pos_z[i]; | |
| float dist = sqrtf(dx*dx + dy*dy + dz*dz); | |
| assert(dist <= cns->motion_radius[i] + 1e-6f); | |
| } | |
| for (int i = 0; i < MAX_SPH; ++i) { | |
| assert(i < MAX_SPH); | |
| assert(isfinite(cns->sph_x[i])); | |
| assert(isfinite(cns->sph_y[i])); | |
| assert(isfinite(cns->sph_z[i])); | |
| assert(isfinite(cns->sph_r[i])); | |
| assert(cns->sph_r[i] >= 0.0f); | |
| } | |
| for (int i = 0; i < MAX_REST_LEN; ++i) { | |
| assert(i < MAX_REST_LEN); | |
| assert(cns->ptc_rest_len[i] >= 0.0f); | |
| assert(isfinite(cns->ptc_rest_len[i])); | |
| assert(isfinite(cns->ptc_lambda[i])); | |
| assert(cns->ptc_lambda[i] >= -1e6f); | |
| assert(cns->ptc_lambda[i] <= 1e6f); | |
| assert(cns->ptc_bend_rest[i] >= 0.0f); | |
| assert(isfinite(cns->ptc_bend_rest[i])); | |
| assert(isfinite(cns->ptc_bend_lambda[i])); | |
| assert(cns->ptc_bend_lambda[i] >= -1e6f); | |
| assert(cns->ptc_bend_lambda[i] <= 1e6f); | |
| } | |
| return 1; | |
| } | |
| extern void | |
| chn_sim_init(struct chn_sim *cns) { | |
| assert(cns); | |
| memset(cns, 0, sizeof(*cns)); | |
| cns->free_idx_cnt = MAX_CHNS; | |
| for (int i = 0; i < MAX_CHNS; ++i) { | |
| cns->free_idx[i] = MAX_CHNS - i - 1; | |
| } | |
| for (int i = 0; i < MAX_CHNS; i += 8) { | |
| _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); | |
| _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); | |
| } | |
| for (int i = 0; i < MAX_PTC; i += 8) { | |
| _mm256_store_ps(&cns->motion_radius[i], _mm256_set1_ps(MAX_MOTION_RADIUS)); | |
| } | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern int | |
| chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { | |
| assert(cns); | |
| assert(cfg); | |
| assert(cns->free_idx_cnt > 0); | |
| assert(cnt <= PTC_PER_CHN); | |
| assert(cnt > 1); | |
| assert(chn_sim__val(cns)); | |
| assert(cfg->damping >= 0.0f); | |
| assert(cfg->damping <= 1.0f); | |
| assert(cfg->stiffness >= 0.0f); | |
| assert(cfg->stiffness <= 1.0f); | |
| assert(cfg->friction >= 0.0f); | |
| assert(cfg->friction <= 1.0f); | |
| assert(cfg->linear_inertia >= 0.0f); | |
| assert(cfg->linear_inertia <= 1.0f); | |
| assert(cfg->angular_inertia >= 0.0f); | |
| assert(cfg->angular_inertia <= 1.0f); | |
| assert(cfg->centrifugal_inertia >= 0.0f); | |
| assert(cfg->centrifugal_inertia <= 1.0f); | |
| assert(cfg->bend_stiffness >= 0.0f); | |
| assert(cfg->bend_stiffness <= 1.0f); | |
| int chn = cns->free_idx[--cns->free_idx_cnt]; | |
| cns->chn_cfg[chn] = *cfg; | |
| int p_base = (chn * PTC_PER_CHN); | |
| assert(p_base + 15 < MAX_PTC); | |
| int r_base = (chn * CON_PER_CHN); | |
| assert(r_base + 15 < MAX_REST_LEN); | |
| int even_base = p_base + 0; // Even indices 0-7 | |
| int odd_base = p_base + 8; // Odd indices 8-15 | |
| // Transpose particles: even indices to 0-7, odd to 8-15 | |
| int even_cnt = (cnt + 1) / 2; // Number of even particles | |
| int odd_cnt = cnt / 2; // Number of odd particles | |
| assert(even_cnt + odd_cnt == cnt); | |
| for (int i = 0; i < even_cnt; ++i) { | |
| int orig_i = 2 * i; | |
| assert(orig_i * 4 + 3 < cnt * 4); | |
| cns->ptc_pos_x[even_base + i] = rest_pos[orig_i*4+0]; | |
| cns->ptc_pos_y[even_base + i] = rest_pos[orig_i*4+1]; | |
| cns->ptc_pos_z[even_base + i] = rest_pos[orig_i*4+2]; | |
| cns->ptc_inv_mass[even_base + i] = rest_pos[orig_i*4+3]; | |
| cns->rest_pos_x[even_base + i] = rest_pos[orig_i*4+0]; | |
| cns->rest_pos_y[even_base + i] = rest_pos[orig_i*4+1]; | |
| cns->rest_pos_z[even_base + i] = rest_pos[orig_i*4+2]; | |
| // Set previous positions to match initial positions for zero initial velocity | |
| cns->ptc_prev_pos_x[even_base + i] = cns->ptc_pos_x[even_base + i]; | |
| cns->ptc_prev_pos_y[even_base + i] = cns->ptc_pos_y[even_base + i]; | |
| cns->ptc_prev_pos_z[even_base + i] = cns->ptc_pos_z[even_base + i]; | |
| } | |
| for (int i = even_cnt; i < 8; ++i) { | |
| cns->ptc_pos_x[even_base + i] = 0.0f; | |
| cns->ptc_pos_y[even_base + i] = 0.0f; | |
| cns->ptc_pos_z[even_base + i] = 0.0f; | |
| cns->ptc_inv_mass[even_base + i] = 0.0f; | |
| cns->rest_pos_x[even_base + i] = 0.0f; | |
| cns->rest_pos_y[even_base + i] = 0.0f; | |
| cns->rest_pos_z[even_base + i] = 0.0f; | |
| // Previous positions for pads remain 0.0f | |
| cns->ptc_prev_pos_x[even_base + i] = 0.0f; | |
| cns->ptc_prev_pos_y[even_base + i] = 0.0f; | |
| cns->ptc_prev_pos_z[even_base + i] = 0.0f; | |
| } | |
| for (int i = 0; i < odd_cnt; ++i) { | |
| int orig_i = 2 * i + 1; | |
| assert(orig_i * 4 + 3 < cnt * 4); | |
| cns->ptc_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; | |
| cns->ptc_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; | |
| cns->ptc_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; | |
| cns->ptc_inv_mass[odd_base + i] = rest_pos[orig_i*4+3]; | |
| cns->rest_pos_x[odd_base + i] = rest_pos[orig_i*4+0]; | |
| cns->rest_pos_y[odd_base + i] = rest_pos[orig_i*4+1]; | |
| cns->rest_pos_z[odd_base + i] = rest_pos[orig_i*4+2]; | |
| // Set previous positions to match initial positions for zero initial velocity | |
| cns->ptc_prev_pos_x[odd_base + i] = cns->ptc_pos_x[odd_base + i]; | |
| cns->ptc_prev_pos_y[odd_base + i] = cns->ptc_pos_y[odd_base + i]; | |
| cns->ptc_prev_pos_z[odd_base + i] = cns->ptc_pos_z[odd_base + i]; | |
| } | |
| for (int i = odd_cnt; i < 8; ++i) { | |
| cns->ptc_pos_x[odd_base + i] = 0.0f; | |
| cns->ptc_pos_y[odd_base + i] = 0.0f; | |
| cns->ptc_pos_z[odd_base + i] = 0.0f; | |
| cns->ptc_inv_mass[odd_base + i] = 0.0f; | |
| cns->rest_pos_x[odd_base + i] = 0.0f; | |
| cns->rest_pos_y[odd_base + i] = 0.0f; | |
| cns->rest_pos_z[odd_base + i] = 0.0f; | |
| // Previous positions for pads remain 0.0f | |
| cns->ptc_prev_pos_x[odd_base + i] = 0.0f; | |
| cns->ptc_prev_pos_y[odd_base + i] = 0.0f; | |
| cns->ptc_prev_pos_z[odd_base + i] = 0.0f; | |
| } | |
| // Set motion radius to FLT_MAX by default (no constraint) | |
| for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
| _mm256_store_ps(&cns->motion_radius[p_base + i], _mm256_set1_ps(MAX_MOTION_RADIUS)); | |
| } | |
| // Transpose constraints: even cons (0,2,4,...) to r_base+0-7, odd (1,3,5,...) to r_base+8-15 | |
| for (int i = 1; i < cnt; ++i) { | |
| float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; | |
| float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; | |
| float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; | |
| __m256 dist_sq_vec = _mm256_set1_ps(dx*dx + dy*dy + dz*dz); | |
| __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); | |
| float len = _mm256_cvtss_f32(sqrt_vec); | |
| assert(isfinite(len)); | |
| len = fmaxf(len, 1e-6f); | |
| int con_idx = i - 1; | |
| if ((con_idx & 0x01) == 0) { // Even constraint | |
| cns->ptc_rest_len[r_base + (con_idx / 2)] = len; | |
| cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f; | |
| } else { // Odd constraint | |
| cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = len; | |
| cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; | |
| } | |
| } | |
| for (int i = cnt - 1; i < CON_PER_CHN - 1; ++i) { // Pad dummies | |
| int con_idx = i; | |
| if ((con_idx & 0x01) == 0) { | |
| cns->ptc_rest_len[r_base + (con_idx / 2)] = 0.0f; | |
| cns->ptc_lambda[r_base + (con_idx / 2)] = 0.0f; | |
| } else { | |
| cns->ptc_rest_len[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; | |
| cns->ptc_lambda[r_base + 8 + ((con_idx - 1) / 2)] = 0.0f; | |
| } | |
| } | |
| // Transpose bending constraints: even bends (0-2,2-4,...) to r_base+0-7, odd (1-3,3-5,...) to r_base+8-15 (new) | |
| int even_bend_cnt = even_cnt - 1; | |
| int odd_bend_cnt = odd_cnt - 1; | |
| assert(even_bend_cnt == even_cnt - 1); | |
| assert(odd_bend_cnt == odd_cnt - 1); | |
| for (int i = 0; i < even_bend_cnt; ++i) { | |
| float dx = cns->rest_pos_x[even_base + i] - cns->rest_pos_x[even_base + i + 1]; | |
| float dy = cns->rest_pos_y[even_base + i] - cns->rest_pos_y[even_base + i + 1]; | |
| float dz = cns->rest_pos_z[even_base + i] - cns->rest_pos_z[even_base + i + 1]; | |
| __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz); | |
| __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); | |
| float len = _mm256_cvtss_f32(sqrt_vec); | |
| assert(isfinite(len)); | |
| len = fmaxf(len, 1e-6f); | |
| cns->ptc_bend_rest[r_base + i] = len; | |
| cns->ptc_bend_lambda[r_base + i] = 0.0f; | |
| } | |
| for (int i = even_bend_cnt; i < 8; ++i) { | |
| cns->ptc_bend_rest[r_base + i] = 0.0f; | |
| cns->ptc_bend_lambda[r_base + i] = 0.0f; | |
| } | |
| for (int i = 0; i < odd_bend_cnt; ++i) { | |
| float dx = cns->rest_pos_x[odd_base + i] - cns->rest_pos_x[odd_base + i + 1]; | |
| float dy = cns->rest_pos_y[odd_base + i] - cns->rest_pos_y[odd_base + i + 1]; | |
| float dz = cns->rest_pos_z[odd_base + i] - cns->rest_pos_z[odd_base + i + 1]; | |
| __m256 dist_sq_vec = _mm256_set1_ps(dx * dx + dy * dy + dz * dz); | |
| __m256 sqrt_vec = _mm256_sqrt_ps(dist_sq_vec); | |
| float len = _mm256_cvtss_f32(sqrt_vec); | |
| assert(isfinite(len)); | |
| len = fmaxf(len, 1e-6f); | |
| cns->ptc_bend_rest[r_base + 8 + i] = len; | |
| cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; | |
| } | |
| for (int i = odd_bend_cnt; i < 8; ++i) { | |
| cns->ptc_bend_rest[r_base + 8 + i] = 0.0f; | |
| cns->ptc_bend_lambda[r_base + 8 + i] = 0.0f; | |
| } | |
| assert(chn_sim__val(cns)); | |
| return chn; | |
| } | |
| extern void | |
| chn_sim_del(struct chn_sim *cns, int chn) { | |
| assert(chn_sim__val(cns)); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| int p_base = (chn * PTC_PER_CHN); | |
| assert(p_base + 15 < MAX_PTC); | |
| int s_base = (chn * SPH_PER_CHN); | |
| assert(s_base + 7 < MAX_SPH); | |
| int r_base = (chn * CON_PER_CHN); | |
| assert(r_base + 15 < MAX_REST_LEN); | |
| cns->free_idx[cns->free_idx_cnt++] = chn; | |
| memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); | |
| cns->chn_grav_x[chn] = 0.0f; | |
| cns->chn_grav_y[chn] = 0.0f; | |
| cns->chn_grav_z[chn] = 0.0f; | |
| cns->chn_wnd_x[chn] = 0.0f; | |
| cns->chn_wnd_y[chn] = 0.0f; | |
| cns->chn_wnd_z[chn] = 0.0f; | |
| cns->chn_pos_x[chn] = 0.0f; | |
| cns->chn_pos_y[chn] = 0.0f; | |
| cns->chn_pos_z[chn] = 0.0f; | |
| cns->chn_quat_x[chn] = 0.0f; | |
| cns->chn_quat_y[chn] = 0.0f; | |
| cns->chn_quat_z[chn] = 0.0f; | |
| cns->chn_quat_w[chn] = 1.0f; | |
| cns->chn_prev_pos_x[chn] = 0.0f; | |
| cns->chn_prev_pos_y[chn] = 0.0f; | |
| cns->chn_prev_pos_z[chn] = 0.0f; | |
| cns->chn_prev_quat_x[chn] = 0.0f; | |
| cns->chn_prev_quat_y[chn] = 0.0f; | |
| cns->chn_prev_quat_z[chn] = 0.0f; | |
| cns->chn_prev_quat_w[chn] = 1.0f; | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_inv_mass[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_inv_mass[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_x[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_x[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_y[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_y[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_z[p_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->rest_pos_z[p_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->motion_radius[p_base + 0], _mm256_set1_ps(MAX_MOTION_RADIUS)); | |
| _mm256_store_ps(&cns->motion_radius[p_base + 8], _mm256_set1_ps(MAX_MOTION_RADIUS)); | |
| _mm256_store_ps(&cns->sph_x[s_base], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->sph_y[s_base], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->sph_z[s_base], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->sph_r[s_base], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_rest_len[r_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_rest_len[r_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_lambda[r_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_lambda[r_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_bend_rest[r_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], _mm256_set1_ps(0.0f)); | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, | |
| const float *restrict rot4) { | |
| assert(chn_sim__val(cns)); | |
| assert(pos3); | |
| assert(rot4); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| cns->chn_pos_x[chn] = pos3[0]; | |
| cns->chn_pos_y[chn] = pos3[1]; | |
| cns->chn_pos_z[chn] = pos3[2]; | |
| cns->chn_quat_x[chn] = rot4[0]; | |
| cns->chn_quat_y[chn] = rot4[1]; | |
| cns->chn_quat_z[chn] = rot4[2]; | |
| cns->chn_quat_w[chn] = rot4[3]; | |
| float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3]; | |
| assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); | |
| assert(isfinite(cns->chn_quat_x[chn])); | |
| assert(isfinite(cns->chn_quat_y[chn])); | |
| assert(isfinite(cns->chn_quat_z[chn])); | |
| assert(isfinite(cns->chn_quat_w[chn])); | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, | |
| const float *restrict rot4) { | |
| assert(chn_sim__val(cns)); | |
| assert(pos3); | |
| assert(rot4); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0]; | |
| cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1]; | |
| cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2]; | |
| cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0]; | |
| cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; | |
| cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; | |
| cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; | |
| float qdot = rot4[0]*rot4[0] + rot4[1]*rot4[1] + rot4[2]*rot4[2] + rot4[3]*rot4[3]; | |
| assert(fabsf(sqrtf(qdot) - 1.0f) < 1e-6f); | |
| assert(isfinite(cns->chn_quat_x[chn])); | |
| assert(isfinite(cns->chn_quat_y[chn])); | |
| assert(isfinite(cns->chn_quat_z[chn])); | |
| assert(isfinite(cns->chn_quat_w[chn])); | |
| assert(isfinite(cns->chn_prev_quat_x[chn])); | |
| assert(isfinite(cns->chn_prev_quat_y[chn])); | |
| assert(isfinite(cns->chn_prev_quat_z[chn])); | |
| assert(isfinite(cns->chn_prev_quat_w[chn])); | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { | |
| assert(chn_sim__val(cns)); | |
| assert(grav3); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| cns->chn_grav_x[chn] = grav3[0]; | |
| cns->chn_grav_y[chn] = grav3[1]; | |
| cns->chn_grav_z[chn] = grav3[2]; | |
| assert(isfinite(cns->chn_grav_x[chn])); | |
| assert(isfinite(cns->chn_grav_y[chn])); | |
| assert(isfinite(cns->chn_grav_z[chn])); | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { | |
| assert(chn_sim__val(cns)); | |
| assert(wind3); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| cns->chn_wnd_x[chn] = wind3[0]; | |
| cns->chn_wnd_y[chn] = wind3[1]; | |
| cns->chn_wnd_z[chn] = wind3[2]; | |
| assert(isfinite(cns->chn_wnd_x[chn])); | |
| assert(isfinite(cns->chn_wnd_y[chn])); | |
| assert(isfinite(cns->chn_wnd_z[chn])); | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_set_sph(struct chn_sim *cns, int chn, const float *restrict spheres, int cnt) { | |
| assert(chn_sim__val(cns)); | |
| assert(spheres); | |
| assert(cnt <= SPH_PER_CHN); | |
| assert(cnt >= 0); | |
| int s_base = (chn * SPH_PER_CHN); | |
| assert(s_base + 7 < MAX_SPH); | |
| for (int i = 0; i < cnt; i++) { | |
| assert(i * 4 + 3 < cnt * 4); | |
| cns->sph_x[s_base + i] = spheres[i*4+0]; | |
| cns->sph_y[s_base + i] = spheres[i*4+1]; | |
| cns->sph_z[s_base + i] = spheres[i*4+2]; | |
| cns->sph_r[s_base + i] = spheres[i*4+3]; | |
| assert(isfinite(cns->sph_x[s_base + i])); | |
| assert(isfinite(cns->sph_y[s_base + i])); | |
| assert(isfinite(cns->sph_z[s_base + i])); | |
| assert(isfinite(cns->sph_r[s_base + i])); | |
| assert(cns->sph_r[s_base + i] >= 0.0f); | |
| } | |
| for (int i = cnt; i < SPH_PER_CHN; ++i) { | |
| cns->sph_x[s_base + i] = 0; | |
| cns->sph_y[s_base + i] = 0; | |
| cns->sph_z[s_base + i] = 0; | |
| cns->sph_r[s_base + i] = 0; | |
| } | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_con_motion(struct chn_sim *cns, int chn, const float *restrict radius, int cnt) { | |
| assert(chn_sim__val(cns)); | |
| assert(chn >= 0); | |
| assert(chn < MAX_CHNS); | |
| assert(cnt >= 0); | |
| assert(cnt < PTC_PER_CHN); | |
| assert(radius); | |
| int p_base = (chn * PTC_PER_CHN); | |
| assert(p_base + 15 < MAX_PTC); | |
| int even_base = p_base + 0; | |
| int odd_base = p_base + 8; | |
| int even_cnt = (cnt + 1) / 2; | |
| int odd_cnt = cnt / 2; | |
| for (int i = 0; i < even_cnt; ++i) { | |
| assert(2 * i < cnt); | |
| cns->motion_radius[even_base + i] = radius[2 * i]; | |
| assert(cns->motion_radius[even_base + i] >= 0.0f); | |
| assert(cns->motion_radius[even_base + i] <= MAX_MOTION_RADIUS); | |
| } | |
| for (int i = even_cnt; i < 8; ++i) { | |
| cns->motion_radius[even_base + i] = MAX_MOTION_RADIUS; | |
| } | |
| for (int i = 0; i < odd_cnt; ++i) { | |
| assert(2 * i + 1 < cnt); | |
| cns->motion_radius[odd_base + i] = radius[2 * i + 1]; | |
| assert(cns->motion_radius[odd_base + i] >= 0.0f); | |
| assert(cns->motion_radius[odd_base + i] <= MAX_MOTION_RADIUS); | |
| } | |
| for (int i = odd_cnt; i < 8; ++i) { | |
| cns->motion_radius[odd_base + i] = MAX_MOTION_RADIUS; | |
| } | |
| assert(chn_sim__val(cns)); | |
| } | |
| extern void | |
| chn_sim_run(struct chn_sim *cns, float dt) { | |
| assert(chn_sim__val(cns)); | |
| assert(dt > 0.0f); | |
| assert(dt < 1.0f); | |
| // SIMD constants | |
| const __m256 dt_vec = _mm256_set1_ps(dt); | |
| const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
| const __m256 one_vec = _mm256_set1_ps(1.0f); | |
| const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); | |
| const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
| const __m256 lambda_min_vec = _mm256_set1_ps(-1e6f); | |
| const __m256 lambda_max_vec = _mm256_set1_ps(1e6f); | |
| const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f); | |
| const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f); | |
| const __m256 zero_vec = _mm256_setzero_ps(); | |
| const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); | |
| const __m256 two_vec = _mm256_set1_ps(2.0f); | |
| const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); | |
| const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); | |
| const __m256 neg_rest_vec = _mm256_set1_ps(-1.0f); // For FMAs | |
| // Stack arrays for precomputed data | |
| float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); | |
| float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32))); | |
| float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32))); | |
| float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32))); | |
| float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32))); | |
| float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32))); | |
| float vel_x[MAX_CHNS] __attribute__((aligned(32))); | |
| float vel_y[MAX_CHNS] __attribute__((aligned(32))); | |
| float vel_z[MAX_CHNS] __attribute__((aligned(32))); | |
| float ang_vel_x[MAX_CHNS] __attribute__((aligned(32))); | |
| float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); | |
| float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); | |
| // Step 0: Precomputation | |
| for (int c = 0; c < MAX_CHNS; c += 8) { | |
| __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); | |
| __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 0, 3); | |
| __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 0, 3); | |
| // Load chain quaternions | |
| __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); | |
| __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); | |
| __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]); | |
| __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]); | |
| // Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
| __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
| __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
| __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
| // Compute local-space wind | |
| { | |
| __m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]); | |
| __m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]); | |
| __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]); | |
| // Step 1: t = q_conj * v_world_as_quat | |
| __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
| tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
| __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
| ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
| __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
| tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
| __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
| tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
| tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
| // Step 2: res_vec = (t * q)_vec | |
| __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
| res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); | |
| __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
| res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y)); | |
| __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
| res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z)); | |
| _mm256_store_ps(&chn_wnd_lcl_x[c], res_x); | |
| _mm256_store_ps(&chn_wnd_lcl_y[c], res_y); | |
| _mm256_store_ps(&chn_wnd_lcl_z[c], res_z); | |
| assert(isfinite(_mm256_cvtss_f32(res_x))); | |
| assert(isfinite(_mm256_cvtss_f32(res_y))); | |
| assert(isfinite(_mm256_cvtss_f32(res_z))); | |
| } | |
// Compute local-space gravity (identical to wind)
{
    // Rotates the world-space gravity vector into chain-local space with the
    // sandwich product v' = conj(q) * (0, v) * q. cq{x,y,z} hold the negated
    // quaternion vector components (conjugate) and q{w,x,y,z} the chain
    // orientation; both were loaded earlier in this loop body.
    __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
    __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
    __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);
    // Step 1: t = conj(q) * (0, v) — vector part of the quaternion product.
    __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
    tx = _mm256_fnmadd_ps(cqz, vy, tx);
    __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
    ty = _mm256_fnmadd_ps(cqx, vz, ty);
    __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
    tz = _mm256_fnmadd_ps(cqy, vx, tz);
    // t.w = -(cqx*vx + cqy*vy + cqz*vz); since cq = -q.xyz this equals dot(q.xyz, v).
    __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
    tw = _mm256_fnmadd_ps(cqy, vy, tw);
    tw = _mm256_fnmadd_ps(cqz, vz, tw);
    // Step 2: res = (t * q) — only the vector part is needed; w is discarded.
    __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
    res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
    __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
    res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
    __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
    res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
    _mm256_store_ps(&chn_grav_lcl_x[c], res_x);
    _mm256_store_ps(&chn_grav_lcl_y[c], res_y);
    _mm256_store_ps(&chn_grav_lcl_z[c], res_z);
    // Debug sanity check on lane 0 only (cheap; other lanes unchecked).
    assert(isfinite(_mm256_cvtss_f32(res_x)));
    assert(isfinite(_mm256_cvtss_f32(res_y)));
    assert(isfinite(_mm256_cvtss_f32(res_z)));
}
| // Compute linear velocity | |
| { | |
| __m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]); | |
| __m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]); | |
| __m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]); | |
| __m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]); | |
| __m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]); | |
| __m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]); | |
| __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); | |
| __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); | |
| __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); | |
| _mm256_store_ps(&vel_x[c], vel_x_vec); | |
| _mm256_store_ps(&vel_y[c], vel_y_vec); | |
| _mm256_store_ps(&vel_z[c], vel_z_vec); | |
| assert(isfinite(_mm256_cvtss_f32(vel_x_vec))); | |
| assert(isfinite(_mm256_cvtss_f32(vel_y_vec))); | |
| assert(isfinite(_mm256_cvtss_f32(vel_z_vec))); | |
| } | |
// Compute angular velocity
{
    // Re-loads (shadows) the orientation registers for this scope.
    __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
    __m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
    __m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
    __m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
    __m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);
    // Step 1: Compute delta quaternion (to - from)
    __m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
    __m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
    __m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
    __m256 dt_qw = _mm256_sub_ps(qw, prev_qw);
    // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
    __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
    __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
    __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
    __m256 conj_qw = prev_qw;
    // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
    // (Hamilton product, expanded; cross_* is the vector cross term.)
    __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
    orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
    orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
    __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
    __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
    __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));
    __m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
    orient_qx = _mm256_add_ps(orient_qx, cross_x);
    __m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
    orient_qy = _mm256_add_ps(orient_qy, cross_y);
    __m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
    orient_qz = _mm256_add_ps(orient_qz, cross_z);
    // Step 4: Compute dot product (to, from) for shortest arc check
    __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
    dot = _mm256_fmadd_ps(qy, prev_qy, dot);
    dot = _mm256_fmadd_ps(qz, prev_qz, dot);
    // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
    __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
    __m256 negate_sign = _mm256_blendv_ps(one_vec, neg_one_vec, negate_mask);
    orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
    orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
    orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
    // Norm check and scale if >1+eps. NOTE: _mm256_rsqrt_ps is an ~12-bit
    // approximation, so the rescale is approximate — acceptable here since it
    // only guards against blow-up, not exact normalization.
    __m256 norm_sq = _mm256_fmadd_ps(orient_qz, orient_qz, _mm256_fmadd_ps(orient_qy, orient_qy, _mm256_fmadd_ps(orient_qx, orient_qx, _mm256_mul_ps(orient_qw, orient_qw))));
    __m256 scale_mask = _mm256_cmp_ps(norm_sq, _mm256_set1_ps((1.0f + 1e-6f) * (1.0f + 1e-6f)), _CMP_GT_OQ);
    __m256 inv_norm = _mm256_rsqrt_ps(norm_sq);
    __m256 scale = _mm256_blendv_ps(one_vec, inv_norm, scale_mask);
    orient_qx = _mm256_mul_ps(orient_qx, scale);
    orient_qy = _mm256_mul_ps(orient_qy, scale);
    orient_qz = _mm256_mul_ps(orient_qz, scale);
    orient_qw = _mm256_mul_ps(orient_qw, scale);
    // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
    // NOTE(review): the small-angle identity dq = 0.5 * (0, w) * q_prev * dt
    // gives w = 2 * delta.xyz / dt, i.e. a factor *2/dt*, not 1/(2*dt).
    // Confirm whether the 1/(2*dt) scale is deliberately tuned (it under-
    // estimates |w| by 4x) before changing it.
    __m256 scale_ang = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
    __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale_ang);
    __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale_ang);
    __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale_ang);
    // Store results
    _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
    _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
    _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    assert(isfinite(_mm256_cvtss_f32(ang_vel_x_vec)));
    assert(isfinite(_mm256_cvtss_f32(ang_vel_y_vec)));
    assert(isfinite(_mm256_cvtss_f32(ang_vel_z_vec)));
}
| } | |
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) - Duplicated even/odd
// Moves particles to partially follow the rig's motion this frame. An inertia
// weight of 1.0 means physically-correct following (no artificial drag); lower
// values lag the particles behind the rig for a stylized look.
for (int c = 0; c < MAX_CHNS; ++c) {
    // Prefetch two chains ahead (PTC_PER_CHN == 16, so +32 == chain c+2) for write.
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 1, 3);
    __builtin_prefetch(&cns->ptc_pos_y[c * PTC_PER_CHN + 32], 1, 3);
    __builtin_prefetch(&cns->ptc_pos_z[c * PTC_PER_CHN + 32], 1, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    // Broadcast this chain's per-frame rig motion to all 8 lanes.
    __m256 delta_x_vec = _mm256_set1_ps(vel_x[c] * dt);
    __m256 delta_y_vec = _mm256_set1_ps(vel_y[c] * dt);
    __m256 delta_z_vec = _mm256_set1_ps(vel_z[c] * dt);
    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
    // Clamp inertia weights (defends against out-of-range config data).
    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    // linear_inertia == 1 -> particles keep up with the rig; < 1 -> they are
    // dragged back by (1 - inertia) of the rig displacement.
    __m256 lin_lag_factor = _mm256_sub_ps(one_vec, linear_inertia_vec);
    __m256 lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
    __m256 lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
    __m256 lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
    // Centrifugal zero-check: ang_vel mag (shared)
    __m256 ang_mag_sq = _mm256_fmadd_ps(ang_vel_z_vec, ang_vel_z_vec, _mm256_fmadd_ps(ang_vel_y_vec, ang_vel_y_vec, _mm256_mul_ps(ang_vel_x_vec, ang_vel_x_vec)));
    __m256 ang_zero_mask = _mm256_cmp_ps(ang_mag_sq, eps_vec, _CMP_LT_OQ);
    // Even block (particle slots 0..7)
    {
        __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
        __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
        __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
        __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
        pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
        // Pinned particles (inv_mass <= 0 after clamping) receive no inertia motion.
        __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
        // Angular drag: w x p, scaled by dt and the angular inertia weight.
        __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
        __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
        __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
        ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
        ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
        ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
        // Centrifugal term: w x (w x p) * dt^2.
        // NOTE(review): cross1_* recomputes the same cross product as the
        // unscaled ang_dt_* above — it could be computed once and reused.
        __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
        __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
        __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
        __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
        __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
        __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
        __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
        __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
        __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
        __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
        __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
        __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
        // Apply centrifugal zero-check
        cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
        cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
        cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
        // Blend masks for all dt. Note: lin_dt_* are overwritten here with the
        // masked value; the odd block below recomputes them from scratch since
        // its valid_mask differs.
        lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
        lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
        lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
        ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
        ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
        ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
        cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
        cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
        cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
        __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
        __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
        __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
        _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_add_ps(px, dt_x));
        _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_add_ps(py, dt_y));
        _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_add_ps(pz, dt_z));
    }
    // Odd block (particle slots 8..15; identical math to the even block)
    {
        __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
        __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
        __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
        __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
        pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
        __m256 valid_mask = _mm256_cmp_ps(pm, zero_vec, _CMP_GT_OQ);
        __m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
        __m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
        __m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
        ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
        ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
        ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
        __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
        __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
        __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
        __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
        __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
        __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
        __m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
        __m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
        __m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
        __m256 cent_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_x), centrifugal_inertia_vec);
        __m256 cent_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_y), centrifugal_inertia_vec);
        __m256 cent_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, base_cent_dt_z), centrifugal_inertia_vec);
        // Apply centrifugal zero-check
        cent_dt_x = _mm256_blendv_ps(cent_dt_x, zero_vec, ang_zero_mask);
        cent_dt_y = _mm256_blendv_ps(cent_dt_y, zero_vec, ang_zero_mask);
        cent_dt_z = _mm256_blendv_ps(cent_dt_z, zero_vec, ang_zero_mask);
        // Recompute the linear lag delta (the even block clobbered lin_dt_*).
        lin_dt_x = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_x_vec), lin_lag_factor);
        lin_dt_y = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_y_vec), lin_lag_factor);
        lin_dt_z = _mm256_mul_ps(_mm256_sub_ps(zero_vec, delta_z_vec), lin_lag_factor);
        lin_dt_x = _mm256_blendv_ps(zero_vec, lin_dt_x, valid_mask);
        lin_dt_y = _mm256_blendv_ps(zero_vec, lin_dt_y, valid_mask);
        lin_dt_z = _mm256_blendv_ps(zero_vec, lin_dt_z, valid_mask);
        ang_dt_x = _mm256_blendv_ps(zero_vec, ang_dt_x, valid_mask);
        ang_dt_y = _mm256_blendv_ps(zero_vec, ang_dt_y, valid_mask);
        ang_dt_z = _mm256_blendv_ps(zero_vec, ang_dt_z, valid_mask);
        cent_dt_x = _mm256_blendv_ps(zero_vec, cent_dt_x, valid_mask);
        cent_dt_y = _mm256_blendv_ps(zero_vec, cent_dt_y, valid_mask);
        cent_dt_z = _mm256_blendv_ps(zero_vec, cent_dt_z, valid_mask);
        __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
        __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
        __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
        _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_add_ps(px, dt_x));
        _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_add_ps(py, dt_y));
        _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_add_ps(pz, dt_z));
    }
    // Update previous transformation (scalar; one chain per iteration).
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
    cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
    cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
}
// Step 2: Verlet Integration - Duplicated for even/odd
// Position Verlet: x' = x + damping * (x - x_prev) + a * dt^2, with
// a = (gravity + wind) * inv_mass, all in chain-local space.
for (int c = 0; c < MAX_CHNS; ++c) {
    __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3);
    int p_base = c * PTC_PER_CHN;
    assert(p_base + 15 < MAX_PTC);
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    // Gravity and wind were rotated into chain-local space earlier this step.
    __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
    __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
    __m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);
    __m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
    __m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
    __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
    // NOTE(review): damping multiplies the retained velocity, so 1.0 keeps all
    // velocity (undamped) and 0.0 kills it — the inverse of a "damping
    // amount"; confirm against the chn_cfg documentation.
    __m256 damping_vec = _mm256_set1_ps(cfg->damping);
    __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
    __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
    __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
    // Even block (particle slots 0..7)
    {
        __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]);
        __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]);
        __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]);
        __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]);
        ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
        __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]);
        __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]);
        __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]);
        // Acceleration from external force; pinned particles (inv_mass 0) get none.
        __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
        __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
        __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
        // Implicit velocity (per-step displacement), then damped.
        __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
        __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
        __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
        __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
        __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
        __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
        __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
        __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
        __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
        __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
        __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
        __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
        _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x);
        _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y);
        _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z);
        // Current position becomes the previous position for the next step.
        _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 0], p_curr_x);
        _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 0], p_curr_y);
        _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 0], p_curr_z);
        assert(isfinite(_mm256_cvtss_f32(new_p_x)));
        assert(isfinite(_mm256_cvtss_f32(new_p_y)));
        assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
    // Odd block (particle slots 8..15; identical math)
    {
        __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]);
        __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]);
        __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]);
        __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]);
        ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
        __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]);
        __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]);
        __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]);
        __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
        __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
        __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
        __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
        __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
        __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
        __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
        __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
        __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
        __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
        __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
        __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
        __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
        __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
        __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
        _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], new_p_x);
        _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], new_p_y);
        _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], new_p_z);
        _mm256_store_ps(&cns->ptc_prev_pos_x[p_base + 8], p_curr_x);
        _mm256_store_ps(&cns->ptc_prev_pos_y[p_base + 8], p_curr_y);
        _mm256_store_ps(&cns->ptc_prev_pos_z[p_base + 8], p_curr_z);
        assert(isfinite(_mm256_cvtss_f32(new_p_x)));
        assert(isfinite(_mm256_cvtss_f32(new_p_y)));
        assert(isfinite(_mm256_cvtss_f32(new_p_z)));
    }
}
| // Step 3: XPBD Constraints (Distance only) - Permutes intra-vector only, no cross-block mixing | |
| for (int c = 0; c < MAX_CHNS; ++c) { | |
| __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); | |
| int p_base = c * PTC_PER_CHN; | |
| assert(p_base + 15 < MAX_PTC); | |
| int r_base = c * CON_PER_CHN; | |
| assert(r_base + 15 < MAX_REST_LEN);struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
| __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); | |
| __m256 compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(stiff_vec, eps_vec)); // Fallback if 0 | |
| __m256 bend_stiff_vec = _mm256_set1_ps(cfg->bend_stiffness); | |
| __m256 bend_compliance_factor = _mm256_div_ps(dt2_vec, _mm256_max_ps(bend_stiff_vec, eps_vec)); | |
| // Direct load even (p0, p2, ..., p14) and odd (p1, p3, ..., p15) | |
| __m256 even_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); | |
| __m256 even_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); | |
| __m256 even_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); | |
| __m256 even_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
| __m256 odd_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); | |
| __m256 odd_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); | |
| __m256 odd_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); | |
| __m256 odd_m = _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]), ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
| // Direct load transposed rests/lambdas: even cons [0,2,...] in r_base+0, odd in r_base+8 | |
| __m256 rest_even = _mm256_load_ps(&cns->ptc_rest_len[r_base + 0]); | |
| __m256 lambda_even = _mm256_load_ps(&cns->ptc_lambda[r_base + 0]); | |
| __m256 rest_odd = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); | |
| __m256 lambda_odd = _mm256_load_ps(&cns->ptc_lambda[r_base + 8]); | |
| __m256 bend_rest_even = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 0]); | |
| __m256 bend_lambda_even = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 0]); | |
| __m256 bend_rest_odd = _mm256_load_ps(&cns->ptc_bend_rest[r_base + 8]); | |
| __m256 bend_lambda_odd = _mm256_load_ps(&cns->ptc_bend_lambda[r_base + 8]); | |
| _mm256_store_ps(&cns->ptc_bend_rest[r_base + 8], _mm256_set1_ps(0.0f)); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], _mm256_set1_ps(0.0f)); | |
| for (int iter = 0; iter < MAX_ITR; ++iter) { | |
| // Even distance constraints: p0 = even (A), p1 = odd (B) | |
| { | |
| __m256 dx = _mm256_sub_ps(even_x, odd_x); // A - B | |
| __m256 dy = _mm256_sub_ps(even_y, odd_y); | |
| __m256 dz = _mm256_sub_ps(even_z, odd_z); | |
| __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); | |
| __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 w_sum = _mm256_add_ps(even_m, odd_m); | |
| __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); | |
| __m256 mask_rest = _mm256_cmp_ps(rest_even, eps_vec, _CMP_GT_OQ); | |
| __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_even)); | |
| __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); // C + compliance * lambda | |
| __m256 numerator = _mm256_sub_ps(zero_vec, temp); // - (C + compliance * lambda) | |
| __m256 denom = _mm256_add_ps(w_sum, compliance_factor); // w_sum + compliance | |
| __m256 delta_lambda = _mm256_div_ps(numerator, denom); | |
| __m256 new_lambda = _mm256_add_ps(lambda_even, delta_lambda); | |
| new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); | |
| __m256 unit_x = _mm256_mul_ps(dx, inv_dist); // grad_A = unit | |
| __m256 unit_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 unit_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); | |
| __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); | |
| __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); | |
| __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); | |
| corr_x = _mm256_and_ps(corr_x, corr_mask); | |
| corr_y = _mm256_and_ps(corr_y, corr_mask); | |
| corr_z = _mm256_and_ps(corr_z, corr_mask); | |
| // dpos_A = delta_lambda * unit * even_m | |
| even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, even_m), one_vec, even_x); | |
| even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, even_m), one_vec, even_y); | |
| even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, even_m), one_vec, even_z); | |
| // dpos_B = delta_lambda * (-unit) * odd_m = - corr * odd_m | |
| odd_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x); | |
| odd_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y); | |
| odd_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z); | |
| lambda_even = new_lambda; | |
| } | |
| // Odd distance constraints: p1 = odd (A), p2 = even shifted (B) | |
| { | |
| __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); | |
| __m256 p2_x = _mm256_permutevar8x32_ps(even_x, _mm256_castsi256_ps(shift_idx)); | |
| __m256 p2_y = _mm256_permutevar8x32_ps(even_y, _mm256_castsi256_ps(shift_idx)); | |
| __m256 p2_z = _mm256_permutevar8x32_ps(even_z, _mm256_castsi256_ps(shift_idx)); | |
| __m256 p2_m = _mm256_permutevar8x32_ps(even_m, _mm256_castsi256_ps(shift_idx)); | |
| __m256 dx = _mm256_sub_ps(odd_x, p2_x); | |
| __m256 dy = _mm256_sub_ps(odd_y, p2_y); | |
| __m256 dz = _mm256_sub_ps(odd_z, p2_z); | |
| __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); | |
| __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 w_sum = _mm256_add_ps(odd_m, p2_m); | |
| __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); | |
| __m256 mask_rest = _mm256_cmp_ps(rest_odd, eps_vec, _CMP_GT_OQ); | |
| __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, rest_odd)); | |
| __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_odd)); | |
| __m256 numerator = _mm256_sub_ps(zero_vec, temp); | |
| __m256 denom = _mm256_add_ps(w_sum, compliance_factor); | |
| __m256 delta_lambda = _mm256_div_ps(numerator, denom); | |
| __m256 new_lambda = _mm256_add_ps(lambda_odd, delta_lambda); | |
| new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); | |
| __m256 unit_x = _mm256_mul_ps(dx, inv_dist); | |
| __m256 unit_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 unit_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); | |
| __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); | |
| __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); | |
| __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); | |
| corr_x = _mm256_and_ps(corr_x, corr_mask); | |
| corr_y = _mm256_and_ps(corr_y, corr_mask); | |
| corr_z = _mm256_and_ps(corr_z, corr_mask); | |
| // dpos_A = delta_lambda * unit * odd_m | |
| odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, odd_m), one_vec, odd_x); | |
| odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, odd_m), one_vec, odd_y); | |
| odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, odd_m), one_vec, odd_z); | |
| // dpos_B = delta_lambda * (-unit) * p2_m = - corr * p2_m (permuted add via shift) | |
| __m256 delta_p2_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, p2_m), one_vec, zero_vec); | |
| __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_p2_x, _mm256_castsi256_ps(shift_idx)); | |
| even_x = _mm256_add_ps(even_x, delta_even_x); | |
| __m256 delta_p2_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, p2_m), one_vec, zero_vec); | |
| __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_p2_y, _mm256_castsi256_ps(shift_idx)); | |
| even_y = _mm256_add_ps(even_y, delta_even_y); | |
| __m256 delta_p2_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, p2_m), one_vec, zero_vec); | |
| __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_p2_z, _mm256_castsi256_ps(shift_idx)); | |
| even_z = _mm256_add_ps(even_z, delta_even_z); | |
| lambda_odd = new_lambda; | |
| } | |
| // Even bending constraints: between even[k] (A) and even[k+1] (B) | |
| { | |
| __m256 a_x = even_x; | |
| __m256 a_y = even_y; | |
| __m256 a_z = even_z; | |
| __m256 a_m = even_m; | |
| __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); | |
| __m256 b_x = _mm256_permutevar8x32_ps(even_x, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_y = _mm256_permutevar8x32_ps(even_y, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_z = _mm256_permutevar8x32_ps(even_z, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_m = _mm256_permutevar8x32_ps(even_m, _mm256_castsi256_ps(shift_idx)); | |
| __m256 dx = _mm256_sub_ps(a_x, b_x); | |
| __m256 dy = _mm256_sub_ps(a_y, b_y); | |
| __m256 dz = _mm256_sub_ps(a_z, b_z); | |
| __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); | |
| __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 w_sum = _mm256_add_ps(a_m, b_m); | |
| __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); | |
| __m256 mask_rest = _mm256_cmp_ps(bend_rest_even, eps_vec, _CMP_GT_OQ); | |
| __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_even)); | |
| __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_even)); | |
| __m256 numerator = _mm256_sub_ps(zero_vec, temp); | |
| __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); | |
| __m256 delta_lambda = _mm256_div_ps(numerator, denom); | |
| __m256 new_lambda = _mm256_add_ps(bend_lambda_even, delta_lambda); | |
| new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); | |
| __m256 unit_x = _mm256_mul_ps(dx, inv_dist); | |
| __m256 unit_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 unit_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); | |
| __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); | |
| __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); | |
| __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); | |
| corr_x = _mm256_and_ps(corr_x, corr_mask); | |
| corr_y = _mm256_and_ps(corr_y, corr_mask); | |
| corr_z = _mm256_and_ps(corr_z, corr_mask); | |
| // dpos_A = delta_lambda * unit * a_m | |
| even_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, even_x); | |
| even_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, even_y); | |
| even_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, even_z); | |
| // dpos_B = - corr * b_m (permuted add) | |
| __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec); | |
| __m256 delta_even_x = _mm256_permutevar8x32_ps(delta_b_x, _mm256_castsi256_ps(shift_idx)); | |
| even_x = _mm256_add_ps(even_x, delta_even_x); | |
| __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec); | |
| __m256 delta_even_y = _mm256_permutevar8x32_ps(delta_b_y, _mm256_castsi256_ps(shift_idx)); | |
| even_y = _mm256_add_ps(even_y, delta_even_y); | |
| __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec); | |
| __m256 delta_even_z = _mm256_permutevar8x32_ps(delta_b_z, _mm256_castsi256_ps(shift_idx)); | |
| even_z = _mm256_add_ps(even_z, delta_even_z); | |
| bend_lambda_even = new_lambda; | |
| } | |
| // Odd bending constraints: between odd[k] (A) and odd[k+1] (B) | |
| { | |
| __m256 a_x = odd_x; | |
| __m256 a_y = odd_y; | |
| __m256 a_z = odd_z; | |
| __m256 a_m = odd_m; | |
| __m256i shift_idx = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); | |
| __m256 b_x = _mm256_permutevar8x32_ps(odd_x, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_y = _mm256_permutevar8x32_ps(odd_y, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_z = _mm256_permutevar8x32_ps(odd_z, _mm256_castsi256_ps(shift_idx)); | |
| __m256 b_m = _mm256_permutevar8x32_ps(odd_m, _mm256_castsi256_ps(shift_idx)); | |
| __m256 dx = _mm256_sub_ps(a_x, b_x); | |
| __m256 dy = _mm256_sub_ps(a_y, b_y); | |
| __m256 dz = _mm256_sub_ps(a_z, b_z); | |
| __m256 dist_sq = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx))); | |
| __m256 mask_dist = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 w_sum = _mm256_add_ps(a_m, b_m); | |
| __m256 mask_w = _mm256_cmp_ps(w_sum, eps_vec, _CMP_GT_OQ); | |
| __m256 mask_rest = _mm256_cmp_ps(bend_rest_odd, eps_vec, _CMP_GT_OQ); | |
| __m256 C = _mm256_fmadd_ps(dist, one_vec, _mm256_mul_ps(neg_rest_vec, bend_rest_odd)); | |
| __m256 temp = _mm256_add_ps(C, _mm256_mul_ps(bend_compliance_factor, bend_lambda_odd)); | |
| __m256 numerator = _mm256_sub_ps(zero_vec, temp); | |
| __m256 denom = _mm256_add_ps(w_sum, bend_compliance_factor); | |
| __m256 delta_lambda = _mm256_div_ps(numerator, denom); | |
| __m256 new_lambda = _mm256_add_ps(bend_lambda_odd, delta_lambda); | |
| new_lambda = _mm256_max_ps(lambda_min_vec, _mm256_min_ps(new_lambda, lambda_max_vec)); | |
| __m256 unit_x = _mm256_mul_ps(dx, inv_dist); | |
| __m256 unit_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 unit_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 corr_x = _mm256_mul_ps(unit_x, delta_lambda); | |
| __m256 corr_y = _mm256_mul_ps(unit_y, delta_lambda); | |
| __m256 corr_z = _mm256_mul_ps(unit_z, delta_lambda); | |
| __m256 corr_mask = _mm256_and_ps(_mm256_and_ps(mask_dist, mask_w), mask_rest); | |
| corr_x = _mm256_and_ps(corr_x, corr_mask); | |
| corr_y = _mm256_and_ps(corr_y, corr_mask); | |
| corr_z = _mm256_and_ps(corr_z, corr_mask); | |
| // dpos_A = delta_lambda * unit * a_m | |
| odd_x = _mm256_fmadd_ps(_mm256_mul_ps(corr_x, a_m), one_vec, odd_x); | |
| odd_y = _mm256_fmadd_ps(_mm256_mul_ps(corr_y, a_m), one_vec, odd_y); | |
| odd_z = _mm256_fmadd_ps(_mm256_mul_ps(corr_z, a_m), one_vec, odd_z); | |
| // dpos_B = - corr * b_m (permuted add) | |
| __m256 delta_b_x = _mm256_fnmadd_ps(_mm256_mul_ps(corr_x, b_m), one_vec, zero_vec); | |
| __m256 delta_odd_x = _mm256_permutevar8x32_ps(delta_b_x, _mm256_castsi256_ps(shift_idx)); | |
| odd_x = _mm256_add_ps(odd_x, delta_odd_x); | |
| __m256 delta_b_y = _mm256_fnmadd_ps(_mm256_mul_ps(corr_y, b_m), one_vec, zero_vec); | |
| __m256 delta_odd_y = _mm256_permutevar8x32_ps(delta_b_y, _mm256_castsi256_ps(shift_idx)); | |
| odd_y = _mm256_add_ps(odd_y, delta_odd_y); | |
| __m256 delta_b_z = _mm256_fnmadd_ps(_mm256_mul_ps(corr_z, b_m), one_vec, zero_vec); | |
| __m256 delta_odd_z = _mm256_permutevar8x32_ps(delta_b_z, _mm256_castsi256_ps(shift_idx)); | |
| odd_z = _mm256_add_ps(odd_z, delta_odd_z); | |
| bend_lambda_odd = new_lambda; | |
| } | |
| } | |
| // Store back | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], even_x); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], even_y); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], even_z); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], odd_x); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], odd_y); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], odd_z); | |
| _mm256_store_ps(&cns->ptc_lambda[r_base + 0], lambda_even); | |
| _mm256_store_ps(&cns->ptc_lambda[r_base + 8], lambda_odd); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 0], bend_lambda_even); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_base + 8], bend_lambda_odd); | |
| assert(isfinite(_mm256_cvtss_f32(even_x))); | |
| assert(isfinite(_mm256_cvtss_f32(odd_x))); | |
| } | |
| // Step 4: Sphere Collisions with Friction - Duplicated for even/odd | |
| for (int c = 0; c < MAX_CHNS; ++c) { | |
| __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); | |
| int p_base = c * PTC_PER_CHN; | |
| assert(p_base + 15 < MAX_PTC); | |
| int s_base = c * SPH_PER_CHN; | |
| assert(s_base + 7 < MAX_SPH); | |
| struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
| __m256 fric_vec = _mm256_set1_ps(cfg->friction); | |
| for (int s = 0; s < SPH_PER_CHN; ++s) { | |
| __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]); | |
| __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]); | |
| __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); | |
| __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); | |
| // Even block | |
| { | |
| __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); | |
| __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); | |
| __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); | |
| __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 0]); | |
| __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 0]); | |
| __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 0]); | |
| __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); | |
| ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
| __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); | |
| __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); | |
| __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); | |
| __m256 dist_sq = _mm256_mul_ps(dx, dx); | |
| dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
| dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 min_inv_dist = _mm256_set1_ps(1e-6f); | |
| inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 pen = _mm256_sub_ps(sph_r_vec, dist); | |
| __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
| __m256 norm_x = _mm256_mul_ps(dx, inv_dist); | |
| __m256 norm_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 norm_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); | |
| __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); | |
| __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); | |
| __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
| __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
| __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
| __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); | |
| __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); | |
| __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); | |
| __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); | |
| __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); | |
| tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt | |
| __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); | |
| __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); | |
| __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); | |
| __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); | |
| __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); | |
| __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); | |
| __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); | |
| __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); | |
| total_corr_x = _mm256_and_ps(total_corr_x, col_mask); | |
| total_corr_y = _mm256_and_ps(total_corr_y, col_mask); | |
| total_corr_z = _mm256_and_ps(total_corr_z, col_mask); | |
| total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); | |
| total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); | |
| total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); | |
| __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); | |
| __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); | |
| __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_x))); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_y))); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_z))); | |
| } | |
| // Odd block | |
| { | |
| __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); | |
| __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); | |
| __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); | |
| __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + 8]); | |
| __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + 8]); | |
| __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + 8]); | |
| __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); | |
| ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
| __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); | |
| __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); | |
| __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); | |
| __m256 dist_sq = _mm256_mul_ps(dx, dx); | |
| dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
| dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 min_inv_dist = _mm256_set1_ps(1e-6f); | |
| inv_dist = _mm256_max_ps(inv_dist, min_inv_dist); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 pen = _mm256_sub_ps(sph_r_vec, dist); | |
| __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
| __m256 norm_x = _mm256_mul_ps(dx, inv_dist); | |
| __m256 norm_y = _mm256_mul_ps(dy, inv_dist); | |
| __m256 norm_z = _mm256_mul_ps(dz, inv_dist); | |
| __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); | |
| __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); | |
| __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); | |
| __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
| __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
| __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
| __m256 vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, _mm256_fmadd_ps(vel_y, norm_y, _mm256_mul_ps(vel_x, norm_x))); | |
| __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); | |
| __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); | |
| __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); | |
| __m256 tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, _mm256_fmadd_ps(tan_y, tan_y, _mm256_mul_ps(tan_x, tan_x))); | |
| tan_mag_sq = _mm256_max_ps(tan_mag_sq, eps_vec); // Clamp before rsqrt | |
| __m256 inv_tan_mag = _mm256_rsqrt_ps(tan_mag_sq); | |
| __m256 fric_mag = _mm256_mul_ps(pen, fric_vec); | |
| __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); | |
| __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); | |
| __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); | |
| __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); | |
| __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); | |
| __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); | |
| total_corr_x = _mm256_and_ps(total_corr_x, col_mask); | |
| total_corr_y = _mm256_and_ps(total_corr_y, col_mask); | |
| total_corr_z = _mm256_and_ps(total_corr_z, col_mask); | |
| total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); | |
| total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); | |
| total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); | |
| __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); | |
| __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); | |
| __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], new_p_x); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], new_p_y); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], new_p_z); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_x))); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_y))); | |
| assert(isfinite(_mm256_cvtss_f32(new_p_z))); | |
| } | |
| } | |
| } | |
| // Step 5: Motion Constraints - Duplicated for even/odd | |
| for (int c = 0; c < MAX_CHNS; ++c) { | |
| __builtin_prefetch(&cns->ptc_pos_x[c * PTC_PER_CHN + 32], 0, 3); | |
| int p_base = c * PTC_PER_CHN; | |
| assert(p_base + 15 < MAX_PTC); | |
| // Even block | |
| { | |
| __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 0]); | |
| __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 0]); | |
| __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 0]); | |
| __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 0]); | |
| pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 0]); | |
| __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 0]); | |
| __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 0]); | |
| __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 0]); | |
| __m256 dx = _mm256_sub_ps(px, rx); | |
| __m256 dy = _mm256_sub_ps(py, ry); | |
| __m256 dz = _mm256_sub_ps(pz, rz); | |
| __m256 dist_sq = _mm256_mul_ps(dx, dx); | |
| dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
| dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 pen = _mm256_sub_ps(dist, r_vec); | |
| __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
| __m256 corr_factor = _mm256_mul_ps(pen, inv_dist); | |
| corr_factor = _mm256_and_ps(corr_factor, mask); | |
| __m256 dt_x = _mm256_mul_ps(dx, corr_factor); | |
| __m256 dt_y = _mm256_mul_ps(dy, corr_factor); | |
| __m256 dt_z = _mm256_mul_ps(dz, corr_factor); | |
| dt_x = _mm256_mul_ps(dt_x, pm); | |
| dt_y = _mm256_mul_ps(dt_y, pm); | |
| dt_z = _mm256_mul_ps(dt_z, pm); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 0], _mm256_sub_ps(px, dt_x)); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 0], _mm256_sub_ps(py, dt_y)); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 0], _mm256_sub_ps(pz, dt_z)); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x)))); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y)))); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z)))); | |
| } | |
| // Odd block | |
| { | |
| __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + 8]); | |
| __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + 8]); | |
| __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + 8]); | |
| __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + 8]); | |
| pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
| __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + 8]); | |
| __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + 8]); | |
| __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + 8]); | |
| __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + 8]); | |
| __m256 dx = _mm256_sub_ps(px, rx); | |
| __m256 dy = _mm256_sub_ps(py, ry); | |
| __m256 dz = _mm256_sub_ps(pz, rz); | |
| __m256 dist_sq = _mm256_mul_ps(dx, dx); | |
| dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
| dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
| __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
| __m256 dist = _mm256_mul_ps(dist_sq, inv_dist); | |
| assert(isfinite(_mm256_cvtss_f32(dist))); | |
| __m256 pen = _mm256_sub_ps(dist, r_vec); | |
| __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
| __m256 corr_factor = _mm256_mul_ps(pen, inv_dist); | |
| corr_factor = _mm256_and_ps(corr_factor, mask); | |
| __m256 dt_x = _mm256_mul_ps(dx, corr_factor); | |
| __m256 dt_y = _mm256_mul_ps(dy, corr_factor); | |
| __m256 dt_z = _mm256_mul_ps(dz, corr_factor); | |
| dt_x = _mm256_mul_ps(dt_x, pm); | |
| dt_y = _mm256_mul_ps(dt_y, pm); | |
| dt_z = _mm256_mul_ps(dt_z, pm); | |
| _mm256_store_ps(&cns->ptc_pos_x[p_base + 8], _mm256_sub_ps(px, dt_x)); | |
| _mm256_store_ps(&cns->ptc_pos_y[p_base + 8], _mm256_sub_ps(py, dt_y)); | |
| _mm256_store_ps(&cns->ptc_pos_z[p_base + 8], _mm256_sub_ps(pz, dt_z)); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(px, dt_x)))); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(py, dt_y)))); | |
| assert(isfinite(_mm256_cvtss_f32(_mm256_sub_ps(pz, dt_z)))); | |
| } | |
| } | |
| assert(chn_sim__val(cns)); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <SDL2/SDL.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <math.h> | |
| #include "chain.c" | |
| #define WINDOW_WIDTH 800 | |
| #define WINDOW_HEIGHT 600 | |
| #define CHAIN_LENGTH 10 | |
| #define PARTICLE_SPACING 10.0f | |
| static void | |
| project(float x, float y, float z, int *screen_x, int *screen_y) { | |
| *screen_x = (int)(x + WINDOW_WIDTH / 2.0f); | |
| *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f); | |
| } | |
// Helper: transform a world-space point into chain model space:
//   m = q^-1 * (w - c) * q   (q assumed unit-length)
// BUG FIX: the original dropped a sign on the real part of conj(q)*(0,t)
// (computed -q.t instead of q.t) and multiplied the second Hamilton product
// in the wrong order (flipping the cross term). E.g. for a 90-degree
// z-rotation it mapped (1,0,0) to itself instead of (0,-1,0), and shrank
// (0,0,1) to (0,0,0). The demo only passes identity quaternions, so the
// defect was latent.
static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
                                      float cx, float cy, float cz, // chain pos
                                      float qx, float qy, float qz, float qw, // chain quat
                                      float *mx, float *my, float *mz) { // out model pos
    // Translate: world - chain
    float tx = wx - cx, ty = wy - cy, tz = wz - cz;
    // u = conj(q) * (0, t): real part uw = q.t, vector part u = qw*t - q x t
    float uw = qx*tx + qy*ty + qz*tz;
    float ux = qw*tx - (qy*tz - qz*ty);
    float uy = qw*ty - (qz*tx - qx*tz);
    float uz = qw*tz - (qx*ty - qy*tx);
    // m = vector part of u * q = uw*qv + qw*u + u x qv
    *mx = uw*qx + qw*ux + (uy*qz - uz*qy);
    *my = uw*qy + qw*uy + (uz*qx - ux*qz);
    *mz = uw*qz + qw*uz + (ux*qy - uy*qx);
}
| extern int | |
| main(int argc, char *argv[]) { | |
| if (SDL_Init(SDL_INIT_VIDEO) < 0) { | |
| printf("SDL_Init failed: %s\n", SDL_GetError()); | |
| return 1; | |
| } | |
| SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0); | |
| SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); | |
| if (!window || !renderer) { | |
| printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError()); | |
| SDL_Quit(); | |
| return 1; | |
| } | |
| struct chn_sim sim; | |
| chn_sim_init(&sim); | |
| struct chn_cfg cfg = { | |
| .damping = 0.5f, | |
| .stiffness = 0.9f, | |
| .friction = 0.5f, | |
| .linear_inertia = 0.9f, // Tune 0.5f for more tangential lag | |
| .angular_inertia = 0.9f, | |
| .centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow | |
| .bend_stiffness = 0.8f, | |
| }; | |
| float rest_pos[CHAIN_LENGTH * 4]; | |
| for (int i = 0; i < CHAIN_LENGTH; i++) { | |
| rest_pos[i*4 + 0] = 0.0f; | |
| rest_pos[i*4 + 1] = -i * PARTICLE_SPACING; | |
| rest_pos[i*4 + 2] = 0.0f; | |
| rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f; | |
| } | |
| int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH); | |
| chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test | |
| chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge | |
| // Fixed world spheres (triangle outside circle) | |
| float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r) | |
| 60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right | |
| -30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left | |
| -30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left | |
| }; | |
| // Circular motion params | |
| float time = 0.0f; | |
| float circle_radius = 50.0f; | |
| float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s | |
| float dt = 0.016f; | |
| int running = 1; | |
| SDL_Event event; | |
| // Debug: Check gravity and masses | |
| int p_base = chn * PTC_PER_CHN; | |
| while (running) { | |
| while (SDL_PollEvent(&event)) { | |
| if (event.type == SDL_QUIT) { | |
| running = 0; | |
| } | |
| } | |
| time += dt; | |
| // Circular anchor motion | |
| float anchor_x = circle_radius * cosf(angular_speed * time); | |
| float anchor_y = circle_radius * sinf(angular_speed * time); | |
| chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); | |
| // Transform fixed world spheres to current model space | |
| float model_spheres[12]; | |
| for (int s = 0; s < 3; ++s) { | |
| float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2]; | |
| float r = world_spheres[s*4 + 3]; | |
| float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn]; | |
| float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn]; | |
| float mx, my, mz; | |
| quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz); | |
| model_spheres[s*4 + 0] = mx; | |
| model_spheres[s*4 + 1] = my; | |
| model_spheres[s*4 + 2] = mz; | |
| model_spheres[s*4 + 3] = r; | |
| } | |
| chn_sim_set_sph(&sim, chn, model_spheres, 3); | |
| chn_sim_run(&sim, dt); | |
| SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255); | |
| SDL_RenderClear(renderer); | |
| SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres | |
| float root_x = sim.chn_pos_x[chn]; // Current root world x/y | |
| float root_y = sim.chn_pos_y[chn]; | |
| // Draw chain segments | |
| for (int i = 0; i < CHAIN_LENGTH - 1; i++) { | |
| int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2); | |
| int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2); | |
| float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0]; | |
| float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1]; | |
| float world_x0 = model_x0 + root_x; | |
| float world_y0 = model_y0 + root_y; | |
| float world_x1 = model_x1 + root_x; | |
| float world_y1 = model_y1 + root_y; | |
| int sx0, sy0, sx1, sy1; | |
| project(world_x0, world_y0, 0, &sx0, &sy0); | |
| project(world_x1, world_y1, 0, &sx1, &sy1); | |
| SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1); | |
| } | |
| // Draw spheres (approx circle outline with 32 points) | |
| int p_base = chn * PTC_PER_CHN; | |
| int s_base = chn * SPH_PER_CHN; | |
| for (int s = 0; s < 3; ++s) { // Only draw active spheres | |
| float sph_model_x = sim.sph_x[s_base + s]; | |
| float sph_model_y = sim.sph_y[s_base + s]; | |
| float sph_r = sim.sph_r[s_base + s]; | |
| if (sph_r <= 0.0f) continue; | |
| float sph_world_x = sph_model_x + root_x; | |
| float sph_world_y = sph_model_y + root_y; | |
| int sph_sx, sph_sy; | |
| project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy); | |
| // Parametric circle: 32 points | |
| const int num_points = 32; | |
| for (int pt = 0; pt < num_points; ++pt) { | |
| float angle0 = 2.0f * M_PI * pt / num_points; | |
| float angle1 = 2.0f * M_PI * (pt + 1) / num_points; | |
| float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0); | |
| float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1); | |
| int px0, py0, px1, py1; | |
| project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0); | |
| project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1); | |
| SDL_RenderDrawLine(renderer, px0, py0, px1, py1); | |
| } | |
| } | |
| SDL_RenderPresent(renderer); | |
| SDL_Delay(16); | |
| } | |
| SDL_DestroyRenderer(renderer); | |
| SDL_DestroyWindow(window); | |
| SDL_Quit(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment