#include <assert.h>
#include <float.h>
#include <math.h>
#include <string.h>
#include <immintrin.h>

enum {
    MAX_CHNS = 64,
    PTC_PER_CHN = 16,
    SPH_PER_CHN = 8,
    MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
    MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
    CON_PER_CHN = PTC_PER_CHN,
    MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
    MAX_ITR = 16,
};
struct chn_cfg {
    float damping;             // [0, 1] velocity kept per step (1.0f = undamped)
    float stiffness;           // [0, 1]
    float stretch_factor;
    float max_strain;
    float friction;            // [0, 1]
    float linear_inertia;      // [0, 1], 1.0f = physically correct linear motion
    float angular_inertia;     // [0, 1], 1.0f = physically correct angular rotation
    float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
};
struct chn_sim {
    unsigned char free_idx_cnt;
    unsigned char free_idx[MAX_CHNS];
    struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
    // chain global forces (world space gravity and wind)
    float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
    // chain transformations (world space)
    float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
    // particle positions (model space)
    float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));
    // sphere positions (model space)
    float sph_x[MAX_SPH] __attribute__((aligned(32)));
    float sph_y[MAX_SPH] __attribute__((aligned(32)));
    float sph_z[MAX_SPH] __attribute__((aligned(32)));
    float sph_r[MAX_SPH] __attribute__((aligned(32)));
    // rest positions (model space)
    float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
    float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
    float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
extern void chn_sim_init(struct chn_sim *cns) {
    assert(cns);
    memset(cns, 0, sizeof(*cns)); // chn_sim_run walks every chain slot, so start from a clean state
    cns->free_idx_cnt = MAX_CHNS;
    for (int i = 0; i < MAX_CHNS; ++i) {
        cns->free_idx[i] = MAX_CHNS - i - 1;
    }
    for (int i = 0; i < MAX_CHNS; i += 8) {
        _mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
        _mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
    }
}
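/* Allocates a chain slot and uploads its rest pose. rest_pos holds cnt
 * xyzw-packed entries: xyz is the particle's rest position in model space and
 * w its inverse mass (0 = infinitely heavy). Segment rest lengths for the
 * distance constraints are measured from consecutive rest positions. */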
extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
    assert(cns);
    assert(cfg);
    assert(cns->free_idx_cnt > 0);
    assert(cnt <= PTC_PER_CHN);
    assert(cnt > 1);

    int chn = cns->free_idx[--cns->free_idx_cnt];
    cns->chn_cfg[chn] = *cfg;

    int p_idx = (chn * PTC_PER_CHN);
    int r_idx = (chn * CON_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
        cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
        cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
        cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
        cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];
        cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
        cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
        cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
        // seed previous positions so the first Verlet step starts at rest
        cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0];
        cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1];
        cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
        cns->ptc_pos_x[p_idx + i] = 0.0f;
        cns->ptc_pos_y[p_idx + i] = 0.0f;
        cns->ptc_pos_z[p_idx + i] = 0.0f;
        cns->ptc_inv_mass[p_idx + i] = 0.0f; // padding particles are pinned
        cns->ptc_prev_pos_x[p_idx + i] = 0.0f;
        cns->ptc_prev_pos_y[p_idx + i] = 0.0f;
        cns->ptc_prev_pos_z[p_idx + i] = 0.0f;
    }
    for (int i = 0; i < PTC_PER_CHN; ++i) {
        cns->motion_radius[p_idx + i] = FLT_MAX; // unconstrained until chn_sim_con_motion is called
    }
    for (int i = 1; i < cnt; ++i) {
        float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
        float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
        float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
        cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
    }
    // cnt particles form cnt-1 constraints; zero the remainder so the solver
    // can mask them out (a zero rest length marks a padded constraint)
    for (int i = cnt - 1; i < CON_PER_CHN; ++i) {
        cns->ptc_rest_len[r_idx + i] = 0.0f;
    }
    return chn;
}
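/* Returns a chain's slot to the free list and scrubs every per-chain and
 * per-particle array so the slot stays inert until reused; chn_sim_run always
 * processes all MAX_CHNS slots, so freed state must remain zeroed. */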
extern void chn_sim_del(struct chn_sim *cns, int chn) {
    assert(cns);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);

    int p_idx = (chn * PTC_PER_CHN);
    int s_idx = (chn * SPH_PER_CHN);
    int c_idx = (chn * CON_PER_CHN);

    cns->free_idx[cns->free_idx_cnt++] = chn;
    memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

    cns->chn_grav_x[chn] = 0.0f; cns->chn_grav_y[chn] = 0.0f; cns->chn_grav_z[chn] = 0.0f;
    cns->chn_wnd_x[chn] = 0.0f; cns->chn_wnd_y[chn] = 0.0f; cns->chn_wnd_z[chn] = 0.0f;
    cns->chn_pos_x[chn] = 0.0f; cns->chn_pos_y[chn] = 0.0f; cns->chn_pos_z[chn] = 0.0f;
    cns->chn_quat_x[chn] = 0.0f; cns->chn_quat_y[chn] = 0.0f; cns->chn_quat_z[chn] = 0.0f;
    cns->chn_quat_w[chn] = 1.0f;
    cns->chn_prev_pos_x[chn] = 0.0f; cns->chn_prev_pos_y[chn] = 0.0f; cns->chn_prev_pos_z[chn] = 0.0f;
    cns->chn_prev_quat_x[chn] = 0.0f; cns->chn_prev_quat_y[chn] = 0.0f; cns->chn_prev_quat_z[chn] = 0.0f;
    cns->chn_prev_quat_w[chn] = 1.0f;

    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
    _mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
}
extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    cns->chn_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = pos3[2];
    cns->chn_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = rot4[3];
}
extern void chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
    assert(cns);
    assert(pos3);
    assert(rot4);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    // teleport: write current and previous alike so no velocity is induced
    cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
    cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
    cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
    cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
    cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
    cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
    cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
    assert(cns);
    assert(grav3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    cns->chn_grav_x[chn] = grav3[0];
    cns->chn_grav_y[chn] = grav3[1];
    cns->chn_grav_z[chn] = grav3[2];
}
extern void chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
    assert(cns);
    assert(wind3);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    cns->chn_wnd_x[chn] = wind3[0];
    cns->chn_wnd_y[chn] = wind3[1];
    cns->chn_wnd_z[chn] = wind3[2];
}
extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
    assert(cns);
    assert(spheres);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt <= SPH_PER_CHN); // spheres are a per-chain resource, not global
    int s_idx = (chn * SPH_PER_CHN);
    for (int i = 0; i < cnt; i++) {
        cns->sph_x[s_idx + i] = spheres[i*4+0];
        cns->sph_y[s_idx + i] = spheres[i*4+1];
        cns->sph_z[s_idx + i] = spheres[i*4+2];
        cns->sph_r[s_idx + i] = spheres[i*4+3];
    }
    for (int i = cnt; i < SPH_PER_CHN; ++i) {
        cns->sph_x[s_idx + i] = 0;
        cns->sph_y[s_idx + i] = 0;
        cns->sph_z[s_idx + i] = 0;
        cns->sph_r[s_idx + i] = 0;
    }
}
extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
    assert(cns);
    assert(radius);
    assert(chn >= 0);
    assert(chn < MAX_CHNS);
    assert(cnt >= 0);
    assert(cnt <= PTC_PER_CHN);
    int p_idx = (chn * PTC_PER_CHN);
    for (int i = 0; i < cnt; ++i) {
        cns->motion_radius[p_idx + i] = radius[i];
    }
    for (int i = cnt; i < PTC_PER_CHN; ++i) {
        cns->motion_radius[p_idx + i] = FLT_MAX; // unconstrained
    }
}
extern void chn_sim_run(struct chn_sim *cns, float dt) {
    assert(cns);
    assert(dt > 0.0f);
    // SIMD constants
    const __m256 dt_vec = _mm256_set1_ps(dt);
    const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
    const __m256 one_vec = _mm256_set1_ps(1.0f);
    const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
    const __m256 eps_vec = _mm256_set1_ps(1e-6f);
    const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
    const __m256 zero_vec = _mm256_setzero_ps();
    const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
    const __m256 two_vec = _mm256_set1_ps(2.0f);
    const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
    const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
    const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
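    // The solver below leans on _mm256_rcp_ps/_mm256_rsqrt_ps, which are fast
    // approximations (roughly 12 bits of precision); the iterative constraint
    // passes absorb the small error they introduce.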
    // Local arrays for the PBD solver; one dummy tail element keeps the
    // unaligned 8-wide accesses at offsets +1 and +9 inside the arrays
    float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
    // Initialize dummy elements
    px_lcl[PTC_PER_CHN] = 0.0f;
    py_lcl[PTC_PER_CHN] = 0.0f;
    pz_lcl[PTC_PER_CHN] = 0.0f;
    pm_lcl[PTC_PER_CHN] = 0.0f;
    // Stack arrays for precomputed data
    float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
    float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
    float vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float vel_z[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
    float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
    // Step 0: Precomputation (SIMD, 8 chains at once)
    for (int c = 0; c < MAX_CHNS; c += 8) {
        // Load chain quaternions
        __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
        __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
        __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
        __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
        // Conjugate components (qw stays, the vector part is negated)
        __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
        __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
        __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
        // Compute local-space wind: v_lcl = q_conj * (0, v) * q
        {
            __m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]);
            __m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]);
            __m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);
            // Step 1: t = q_conj * (0, v)
            //   tw = qx*vx + qy*vy + qz*vz
            //   tx = qw*vx - qy*vz + qz*vy
            //   ty = qw*vy - qz*vx + qx*vz
            //   tz = qw*vz - qx*vy + qy*vx
            __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
            tx = _mm256_fnmadd_ps(cqz, vy, tx);
            __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
            ty = _mm256_fnmadd_ps(cqx, vz, ty);
            __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
            tz = _mm256_fnmadd_ps(cqy, vx, tz);
            __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
            tw = _mm256_fnmadd_ps(cqy, vy, tw);
            tw = _mm256_fnmadd_ps(cqz, vz, tw);
            // Step 2: keep the vector part of t * q
            //   res_x = tw*qx + tx*qw + ty*qz - tz*qy
            //   res_y = tw*qy + ty*qw + tz*qx - tx*qz
            //   res_z = tw*qz + tz*qw + tx*qy - ty*qx
            __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
            res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
            __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
            res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
            __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
            res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
            _mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
            _mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
            _mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
        }
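        // Rotating with the conjugate sandwich q_conj * (0, v) * q maps a
        // world-space vector into the chain's model space, where all particle
        // data lives; gravity below takes the same path as the wind above.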
        // Compute local-space gravity (same rotation as the wind)
        {
            __m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
            __m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
            __m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);
            // Step 1: t = q_conj * (0, v)
            __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx));
            tx = _mm256_fnmadd_ps(cqz, vy, tx);
            __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy));
            ty = _mm256_fnmadd_ps(cqx, vz, ty);
            __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz));
            tz = _mm256_fnmadd_ps(cqy, vx, tz);
            __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec);
            tw = _mm256_fnmadd_ps(cqy, vy, tw);
            tw = _mm256_fnmadd_ps(cqz, vz, tw);
            // Step 2: keep the vector part of t * q
            //   res_x = tw*qx + tx*qw + ty*qz - tz*qy
            //   res_y = tw*qy + ty*qw + tz*qx - tx*qz
            //   res_z = tw*qz + tz*qw + tx*qy - ty*qx
            __m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
            res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
            __m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
            res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
            __m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
            res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
            _mm256_store_ps(&chn_grav_lcl_x[c], res_x);
            _mm256_store_ps(&chn_grav_lcl_y[c], res_y);
            _mm256_store_ps(&chn_grav_lcl_z[c], res_z);
        }
        // Compute linear velocity of the chain transform
        {
            __m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]);
            __m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]);
            __m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]);
            __m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]);
            __m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]);
            __m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]);
            __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
            __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
            __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
            _mm256_store_ps(&vel_x[c], vel_x_vec);
            _mm256_store_ps(&vel_y[c], vel_y_vec);
            _mm256_store_ps(&vel_z[c], vel_z_vec);
        }
        // Compute angular velocity
        {
            __m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
            __m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
            __m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
            __m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);
            // Step 1: delta quaternion (to - from); qx..qw still hold the
            // current rotation loaded at the top of the loop
            __m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
            __m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
            __m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
            __m256 dt_qw = _mm256_sub_ps(qw, prev_qw);
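            // Steps 2-6 recover an angular velocity estimate from the
            // quaternion finite difference:
            //   w ~= vec((q - q_prev) * conj(q_prev)) / (2 * dt)
            // with a sign flip when dot(q, q_prev) < 0 so the delta always
            // takes the shorter arc.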
            // Step 2: conjugate of the previous quaternion (qw, -qx, -qy, -qz)
            __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
            __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
            __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
            __m256 conj_qw = prev_qw;
            // Step 3: orientation delta = deltaQuat * conj(prev_quat)
            // quaternion product: (w1*w2 - v1.v2, w1*v2 + w2*v1 + v1 x v2)
            __m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
            orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
            orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
            __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
            __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
            __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));
            __m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
            orient_qx = _mm256_add_ps(orient_qx, cross_x);
            __m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
            orient_qy = _mm256_add_ps(orient_qy, cross_y);
            __m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
            orient_qz = _mm256_add_ps(orient_qz, cross_z);
            // Step 4: dot(to, from) for the shortest-arc check
            __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
            dot = _mm256_fmadd_ps(qy, prev_qy, dot);
            dot = _mm256_fmadd_ps(qz, prev_qz, dot);
            // Step 5: negate the delta if dot < 0 (shortest-arc adjustment)
            __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
            __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), _mm256_andnot_ps(negate_mask, one_vec));
            orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
            orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
            orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
            // Step 6: angular velocity = vector part * (1 / (2 * dt))
            __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
            __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
            __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
            __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);
            // Store results
            _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
            _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
            _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
        }
    }
    // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
    for (int c = 0; c < MAX_CHNS; ++c) {
        int base_idx = c * PTC_PER_CHN;
        const struct chn_cfg *cfg = &cns->chn_cfg[c];
        // Broadcast precomputed velocities and inertia parameters
        __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
        __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
        __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
        __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
        __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
        __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
        __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
        __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
        __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
        // Clamp inertia parameters to [0, 1]
        linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
        angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
        centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
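        // Three rigid-motion terms approximate how the chain transform drags
        // its particles: linear (v * dt), angular ((w x r) * dt), and
        // centrifugal ((w x (w x r)) * dt^2), each faded by its cfg factor.
        // The deltas apply to every particle regardless of mass.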
        for (int i = 0; i < PTC_PER_CHN; i += 8) {
            __m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
            __m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
            __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
            // Linear inertia: v * dt * linear_inertia
            __m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
            __m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
            __m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
            // Angular inertia: (w x r) * dt * angular_inertia
            // (cross1_* keeps the unscaled w x r for the centrifugal term)
            __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
            __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
            __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
            __m256 ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(cross1_x, dt_vec), angular_inertia_vec);
            __m256 ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(cross1_y, dt_vec), angular_inertia_vec);
            __m256 ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(cross1_z, dt_vec), angular_inertia_vec);
            // Centrifugal inertia: (w x (w x r)) * dt^2 * centrifugal_inertia
            __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
            __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
            __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
            __m256 cent_dt_x = _mm256_mul_ps(_mm256_mul_ps(cross2_x, dt2_vec), centrifugal_inertia_vec);
            __m256 cent_dt_y = _mm256_mul_ps(_mm256_mul_ps(cross2_y, dt2_vec), centrifugal_inertia_vec);
            __m256 cent_dt_z = _mm256_mul_ps(_mm256_mul_ps(cross2_z, dt2_vec), centrifugal_inertia_vec);
            // Total delta
            __m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
            __m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
            __m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
            // Update positions
            _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
            _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
            _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
        }
        // Commit the current transform as the previous one for the next step
        cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
        cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
        cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
        cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
        cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
        cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
        cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
    }
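    // Step 2 integrates with damped position-based Verlet:
    //   x' = x + (x - x_prev) * damping + (gravity + wind) * w * dt^2
    // using the chain-local force vectors from Step 0 and the clamped
    // per-particle inverse mass w.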
    // Step 2: Verlet Integration
    for (int c = 0; c < MAX_CHNS; ++c) {
        int base_idx = c * PTC_PER_CHN;
        const struct chn_cfg *cfg = &cns->chn_cfg[c];
        __m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
        __m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
        __m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);
        __m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
        __m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
        __m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
        __m256 damping_vec = _mm256_set1_ps(cfg->damping);
        for (int i = 0; i < PTC_PER_CHN; i += 8) {
            __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
            __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
            __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
            __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
            ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
            __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]);
            __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]);
            __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]);
            // acceleration = (gravity + wind) * inverse mass
            __m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
            __m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
            __m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
            __m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
            __m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
            __m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
            // implicit velocity from the previous step, damped
            __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
            __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
            __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
            __m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
            __m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
            __m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
            __m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
            __m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
            __m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
            __m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
            __m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
            __m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
            _mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x);
            _mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y);
            _mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z);
            _mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
            _mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
            _mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
        }
    }
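    // Step 3 relaxes the chain's distance constraints over MAX_ITR passes.
    // For a segment (p0, p1) with rest length r and inverse masses (w0, w1):
    //   d = p1 - p0, strain = (|d| - r) / r (clamped to +/-max_strain),
    //   k = stiffness / (1 + |strain|),
    //   corr = (|d| - r) * k * stretch_factor / (w0 + w1),
    // then p0 moves by +corr*w0 and p1 by -corr*w1 along the unit d.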
    // Step 3: Distance Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
        int p_base = c * PTC_PER_CHN;
        int r_base = c * CON_PER_CHN;
        const struct chn_cfg *cfg = &cns->chn_cfg[c];
        __m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
        __m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
        __m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
        __m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);
        // Copy the chain into the local scratch buffers
        for (int i = 0; i < PTC_PER_CHN; i += 8) {
            __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
            pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
            _mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i]));
            _mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i]));
            _mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i]));
            _mm256_store_ps(&pm_lcl[i], pm);
        }
        for (int iter = 0; iter < MAX_ITR; ++iter) {
            // First block (constraints 0 to 7)
            {
                __m256 p0x = _mm256_load_ps(&px_lcl[0]);
                __m256 p0y = _mm256_load_ps(&py_lcl[0]);
                __m256 p0z = _mm256_load_ps(&pz_lcl[0]);
                __m256 p0m = _mm256_load_ps(&pm_lcl[0]);
                __m256 p1x = _mm256_loadu_ps(&px_lcl[1]);
                __m256 p1y = _mm256_loadu_ps(&py_lcl[1]);
                __m256 p1z = _mm256_loadu_ps(&pz_lcl[1]);
                __m256 p1m = _mm256_loadu_ps(&pm_lcl[1]);
                __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]);
                __m256 dx = _mm256_sub_ps(p1x, p0x);
                __m256 dy = _mm256_sub_ps(p1y, p0y);
                __m256 dz = _mm256_sub_ps(p1z, p0z);
                __m256 dist_sq = _mm256_mul_ps(dx, dx);
                dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
                dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
                __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
                __m256 w_sum = _mm256_add_ps(p0m, p1m);
                __m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
                __m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
                __m256 dist = _mm256_rcp_ps(inv_dist);
                __m256 diff = _mm256_sub_ps(dist, rest_len_vec);
                __m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
                __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
                __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
                __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
                __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));
                __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
                corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
                __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);
                // skip degenerate pairs and padded constraints (rest length 0)
                __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
                apply_corr_mask = _mm256_and_ps(apply_corr_mask, _mm256_cmp_ps(rest_len_vec, eps_vec, _CMP_GT_OQ));
                __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
                __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
                __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);
                __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
                __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
                __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);
                dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
                dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
                dt_z = _mm256_and_ps(dt_z, apply_corr_mask);
                _mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x));
                _mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y));
                _mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z));
                // the +1 offset is not 32-byte aligned, so storeu is required
                _mm256_storeu_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x));
                _mm256_storeu_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y));
                _mm256_storeu_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z));
            }
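            // The second block covers constraints 8..14 so that, together
            // with the block above, all 15 segments of a full 16-particle
            // chain are relaxed once per iteration.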
            // Second block (constraints 8 to 14)
            {
                __m256 p0x = _mm256_load_ps(&px_lcl[8]);
                __m256 p0y = _mm256_load_ps(&py_lcl[8]);
                __m256 p0z = _mm256_load_ps(&pz_lcl[8]);
                __m256 p0m = _mm256_load_ps(&pm_lcl[8]);
                __m256 p1x = _mm256_loadu_ps(&px_lcl[9]);
                __m256 p1y = _mm256_loadu_ps(&py_lcl[9]);
                __m256 p1z = _mm256_loadu_ps(&pz_lcl[9]);
                __m256 p1m = _mm256_loadu_ps(&pm_lcl[9]);
                __m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
                __m256 dx = _mm256_sub_ps(p1x, p0x);
                __m256 dy = _mm256_sub_ps(p1y, p0y);
                __m256 dz = _mm256_sub_ps(p1z, p0z);
                __m256 dist_sq = _mm256_mul_ps(dx, dx);
                dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
                dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
                __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
                __m256 w_sum = _mm256_add_ps(p0m, p1m);
                __m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
                __m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
                __m256 dist = _mm256_rcp_ps(inv_dist);
                __m256 diff = _mm256_sub_ps(dist, rest_len_vec);
                __m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
                __m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
                __m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
                __m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
                __m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));
                __m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
                corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
                __m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);
                // skip degenerate pairs and padded constraints (rest length 0)
                __m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
                apply_corr_mask = _mm256_and_ps(apply_corr_mask, _mm256_cmp_ps(rest_len_vec, eps_vec, _CMP_GT_OQ));
                // lane 7 would be constraint 15, which does not exist; the
                // lane mask needs all-ones bits (-1.0f bit patterns would
                // corrupt the corrections they are ANDed with)
                const __m256 valid_mask = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
                apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);
                __m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
                __m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
                __m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);
                __m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
                __m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
                __m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);
                dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
                dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
                dt_z = _mm256_and_ps(dt_z, apply_corr_mask);
                _mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x));
                _mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y));
                _mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z));
                _mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x));
                _mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y));
                _mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z));
            }
        }
        // Write the relaxed positions back
        for (int i = 0; i < PTC_PER_CHN; i += 8) {
            _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i]));
            _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i]));
            _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i]));
        }
    }
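    // Step 4 resolves sphere collisions positionally: penetrating particles
    // are pushed to the sphere surface along the contact normal, and a
    // friction correction proportional to penetration depth opposes their
    // tangential slide, all scaled by the clamped inverse mass.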
    // Step 4: Sphere Collisions with Friction
    for (int c = 0; c < MAX_CHNS; ++c) {
        int p_base = c * PTC_PER_CHN;
        int s_base = c * SPH_PER_CHN;
        const struct chn_cfg *cfg = &cns->chn_cfg[c];
        __m256 fric_vec = _mm256_set1_ps(cfg->friction);
        for (int s = 0; s < SPH_PER_CHN; ++s) {
            __m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
            __m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
            __m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
            __m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);
            for (int i = 0; i < PTC_PER_CHN; i += 8) {
                __m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
                __m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
                __m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);
                __m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + i]);
                __m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + i]);
                __m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + i]);
                __m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
                ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
                // penetration depth against this sphere
                __m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
                __m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
                __m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
                __m256 dist_sq = _mm256_mul_ps(dx, dx);
                dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
                dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
                __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
                __m256 dist = _mm256_rcp_ps(inv_dist);
                __m256 pen = _mm256_sub_ps(sph_r_vec, dist);
                __m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
                // contact normal (sphere center -> particle)
                __m256 norm_x = _mm256_mul_ps(dx, inv_dist);
                __m256 norm_y = _mm256_mul_ps(dy, inv_dist);
                __m256 norm_z = _mm256_mul_ps(dz, inv_dist);
                __m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
                __m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
                __m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
                // tangential part of the implicit velocity
                __m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
                __m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
                __m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
                __m256 vel_dot_norm = _mm256_mul_ps(vel_x, norm_x);
                vel_dot_norm = _mm256_fmadd_ps(vel_y, norm_y, vel_dot_norm);
                vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, vel_dot_norm);
                __m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
                __m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
                __m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
                __m256 tan_mag_sq = _mm256_mul_ps(tan_x, tan_x);
                tan_mag_sq = _mm256_fmadd_ps(tan_y, tan_y, tan_mag_sq);
                tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, tan_mag_sq);
                __m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));
                // friction scales with penetration depth
                __m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
                __m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
                __m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
                __m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
                __m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
                __m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
                __m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
                total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
                total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
                total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
                total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
                total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
                total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
                __m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
                __m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
                __m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
                _mm256_store_ps(&cns->ptc_pos_x[p_base + i], new_p_x);
                _mm256_store_ps(&cns->ptc_pos_y[p_base + i], new_p_y);
                _mm256_store_ps(&cns->ptc_pos_z[p_base + i], new_p_z);
            }
        }
    }
    // Step 5: Motion Constraints
    for (int c = 0; c < MAX_CHNS; ++c) {
        int p_base = c * PTC_PER_CHN;
        for (int i = 0; i < PTC_PER_CHN; i += 8) {
            __m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
            __m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
            __m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);
            __m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
            pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
            __m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + i]);
            __m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + i]);
            __m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + i]);
            __m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + i]);
            // pull particles back onto their motion sphere around the rest pose
            __m256 dx = _mm256_sub_ps(px, rx);
            __m256 dy = _mm256_sub_ps(py, ry);
            __m256 dz = _mm256_sub_ps(pz, rz);
            __m256 dist_sq = _mm256_mul_ps(dx, dx);
            dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
            dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
            __m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
            __m256 dist = _mm256_rcp_ps(inv_dist);
            __m256 pen = _mm256_sub_ps(dist, r_vec);
            __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
            __m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
            corr_factor = _mm256_and_ps(corr_factor, mask);
            __m256 dt_x = _mm256_mul_ps(dx, corr_factor);
            __m256 dt_y = _mm256_mul_ps(dy, corr_factor);
            __m256 dt_z = _mm256_mul_ps(dz, corr_factor);
            dt_x = _mm256_mul_ps(dt_x, pm);
            dt_y = _mm256_mul_ps(dt_y, pm);
            dt_z = _mm256_mul_ps(dt_z, pm);
            _mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x));
            _mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y));
            _mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
        }
    }
}
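/* Usage sketch (illustrative only; CHN_SIM_EXAMPLE is just this demo's guard,
 * not part of the API): a four-particle chain under gravity, root held by a
 * zero motion radius, stepped at 60 Hz. Build with something like
 *   cc -DCHN_SIM_EXAMPLE -O2 -mavx2 -mfma chn_sim.c
 * The sim struct lives in static storage because it is large (tens of KB)
 * and must keep its 32-byte alignment. */
#ifdef CHN_SIM_EXAMPLE
int main(void) {
    static struct chn_sim sim;
    chn_sim_init(&sim);
    // xyzw-packed rest pose: particles hang along -y, w is inverse mass
    const float rest[4 * 4] = {
        0.0f,  0.0f, 0.0f, 1.0f,
        0.0f, -0.5f, 0.0f, 1.0f,
        0.0f, -1.0f, 0.0f, 1.0f,
        0.0f, -1.5f, 0.0f, 1.0f,
    };
    const struct chn_cfg cfg = {
        .damping = 0.98f, .stiffness = 1.0f, .stretch_factor = 1.0f,
        .max_strain = 0.1f, .friction = 0.5f, .linear_inertia = 1.0f,
        .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f,
    };
    int chn = chn_sim_add(&sim, &cfg, rest, 4);
    chn_sim_grav(&sim, chn, (const float[3]){0.0f, -9.81f, 0.0f});
    // radius 0 holds the root at its rest position; FLT_MAX leaves the rest free
    const float radius[4] = {0.0f, FLT_MAX, FLT_MAX, FLT_MAX};
    chn_sim_con_motion(&sim, chn, radius, 4);
    for (int i = 0; i < 60; ++i) {
        chn_sim_run(&sim, 1.0f / 60.0f);
    }
    chn_sim_del(&sim, chn);
    return 0;
}
#endif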