#include #include #include #include #include enum { MAX_CHNS = 64, PTC_PER_CHN = 16, SPH_PER_CHN = 8, MAX_PTC = (MAX_CHNS * PTC_PER_CHN), MAX_SPH = (MAX_CHNS * SPH_PER_CHN), CON_PER_CHN = PTC_PER_CHN, MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), MAX_ITR = 16, }; struct chn_cfg { float damping; // [0, 1] float stiffness; // [0, 1] float stretch_factor; float max_strain; float friction; // [0, 1] float linear_inertia; // [0, 1], 1.0f = physically correct linear motion float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force }; struct chn_sim { unsigned char free_idx_cnt; unsigned char free_idx[MAX_CHNS]; struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32))); // chain global forces (world space gravity and wind) float chn_grav_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_z[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32))); // chain transformations (world space) float chn_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_quat_w[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32))); // particle positions (model space) float ptc_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); // sphere positions (model space) float sph_x[MAX_SPH] __attribute__((aligned(32))); float sph_y[MAX_SPH] __attribute__((aligned(32))); float sph_z[MAX_SPH] __attribute__((aligned(32))); float sph_r[MAX_SPH] __attribute__((aligned(32))); // rest positions (model space) float rest_pos_x[MAX_PTC] __attribute__((aligned(32))); float rest_pos_y[MAX_PTC] __attribute__((aligned(32))); float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); float motion_radius[MAX_PTC] __attribute__((aligned(32))); }; extern void chn_sim_init(struct chn_sim *cns) { assert(cns); cns->free_idx_cnt = MAX_CHNS; for (int i = 0; i < MAX_CHNS; ++i) { cns->free_idx[i] = MAX_CHNS - i - 1; } for (int i = 0; i < MAX_CHNS; i += 4) { vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f)); vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f)); } } extern int chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { assert(cns); assert(cfg); assert(cns->free_idx_cnt > 0); assert(cnt <= PTC_PER_CHN); assert(cnt > 1); int chn = cns->free_idx[--cns->free_idx_cnt]; cns->chn_cfg[chn] = *cfg; int p_idx = (chn * PTC_PER_CHN); int r_idx = (chn * CON_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2]; cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3]; cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0]; cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1]; cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->ptc_pos_x[p_idx + i] = 0.0f; cns->ptc_pos_y[p_idx + i] = 0.0f; cns->ptc_pos_z[p_idx + i] = 0.0f; } for (int i = 1; i < cnt; ++i) { float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); } for (int i = cnt; i < CON_PER_CHN; ++i) { cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; cns->ptc_rest_len[r_idx + i] = 0.0f; } return chn; } extern void chn_sim_del(struct chn_sim *cns, int chn) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); int p_idx = (chn * PTC_PER_CHN); int s_idx = (chn * SPH_PER_CHN); int c_idx = (chn * CON_PER_CHN); cns->free_idx[cns->free_idx_cnt++] = chn; memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); cns->chn_grav_x[chn] = 0.0f; cns->chn_grav_y[chn] = 0.0f; cns->chn_grav_z[chn] = 0.0f; cns->chn_wnd_x[chn] = 0.0f; cns->chn_wnd_y[chn] = 0.0f; cns->chn_wnd_z[chn] = 0.0f; cns->chn_pos_x[chn] = 0.0f; cns->chn_pos_y[chn] = 0.0f; cns->chn_pos_z[chn] = 0.0f; cns->chn_quat_x[chn] = 0.0f; cns->chn_quat_y[chn] = 0.0f; cns->chn_quat_z[chn] = 0.0f; cns->chn_quat_w[chn] = 1.0f; cns->chn_prev_pos_x[chn] = 0.0f; cns->chn_prev_pos_y[chn] = 0.0f; cns->chn_prev_pos_z[chn] = 0.0f; cns->chn_prev_quat_x[chn] = 0.0f; cns->chn_prev_quat_y[chn] = 0.0f; cns->chn_prev_quat_z[chn] = 0.0f; cns->chn_prev_quat_w[chn] = 1.0f; for (int i = 0; i < PTC_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f)); vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f)); } // Spheres vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f)); vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f)); // Loop by 4 for NEON for (int i = 0; i < CON_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f)); } } extern void chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = rot4[3]; } extern void chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { assert(cns); assert(pos3); assert(rot4); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0]; cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1]; cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2]; cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0]; cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; } extern void chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { assert(cns); assert(grav3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_grav_x[chn] = grav3[0]; cns->chn_grav_y[chn] = grav3[1]; cns->chn_grav_z[chn] = grav3[2]; } extern void chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { assert(cns); assert(wind3); assert(chn >= 0); assert(chn < MAX_CHNS); cns->chn_wnd_x[chn] = wind3[0]; cns->chn_wnd_y[chn] = wind3[1]; cns->chn_wnd_z[chn] = wind3[2]; } extern void chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { assert(cns); assert(spheres); assert(cnt < MAX_SPH); int s_idx = (chn * SPH_PER_CHN); for (int i = 0; i < cnt; i++) { cns->sph_x[s_idx + i] = spheres[i*4+0]; cns->sph_y[s_idx + i] = spheres[i*4+1]; cns->sph_z[s_idx + i] = spheres[i*4+2]; cns->sph_r[s_idx + i] = spheres[i*4+3]; } for (int i = cnt; i < SPH_PER_CHN; ++i) { cns->sph_x[s_idx + i] = 0; cns->sph_y[s_idx + i] = 0; cns->sph_z[s_idx + i] = 0; cns->sph_r[s_idx + i] = 0; } } extern void chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { assert(cns); assert(chn >= 0); assert(chn < MAX_CHNS); assert(cnt >= 0); assert(cnt < PTC_PER_CHN); int p_idx = (chn * PTC_PER_CHN); for (int i = 0; i < cnt; ++i) { cns->motion_radius[p_idx + i] = radius[i]; } for (int i = cnt; i < PTC_PER_CHN; ++i) { cns->motion_radius[p_idx + i] = FLT_MAX; } } extern void chn_sim_run(struct chn_sim *cns, float dt) { // NEON constants const float32x4_t dt_vec = vdupq_n_f32(dt); const float32x4_t dt2_vec = vdupq_n_f32(dt * dt); const float32x4_t one_vec = vdupq_n_f32(1.0f); const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f); const float32x4_t eps_vec = vdupq_n_f32(1e-6f); const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f); const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f); const float32x4_t zero_vec = vdupq_n_f32(0.0f); const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt); const float32x4_t two_vec = vdupq_n_f32(2.0f); const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f); const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f); // NEON doesn't have a direct equivalent to _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)) // for absolute mask. We'll use `vabsq_f32` where needed or reconstruct the logic. // Local arrays for PBD solver float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); // Initialize dummy elements px_lcl[PTC_PER_CHN] = 0.0f; py_lcl[PTC_PER_CHN] = 0.0f; pz_lcl[PTC_PER_CHN] = 0.0f; pm_lcl[PTC_PER_CHN] = 0.0f; // Stack arrays for precomputed data float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32))); float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32))); float vel_x[MAX_CHNS] __attribute__((aligned(32))); float vel_y[MAX_CHNS] __attribute__((aligned(32))); float vel_z[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_x[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); // Step 0: Precomputation for (int c = 0; c < MAX_CHNS; c += 4) { // Load chain quaternions float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]); float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]); float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]); float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]); // Create q_conjugate components (qw remains, qx, qy, qz are negated) float32x4_t cqx = vnegq_f32(qx); // -qx float32x4_t cqy = vnegq_f32(qy); // -qy float32x4_t cqz = vnegq_f32(qz); // -qz // Compute local-space wind { float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]); float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]); float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]); // Step 1: t = q_conj * v_world_as_quat // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz // Step 2: res_vec = (t * q)_vec // result = (tw, tx, ty, tz) * (qw, qx, qy, qz) // res_x = tw*qx + tx*qw + ty*qz - tz*qy // res_y = tw*qy + ty*qw + tz*qx - tx*qz // res_z = tw*qz + ty*qw + tx*qy - ty*qx float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw); res_x = vmlsq_f32(res_x, tz, qy); res_x = vmlaq_f32(res_x, ty, qz); // Corrected: tw*qx + tx*qw + ty*qz - tz*qy float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw); res_y = vmlsq_f32(res_y, tx, qz); res_y = vmlaq_f32(res_y, tz, qx); // Corrected: tw*qy + ty*qw + tz*qx - tx*qz float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw); res_z = vmlsq_f32(res_z, ty, qx); res_z = vmlaq_f32(res_z, tx, qy); // Corrected: tw*qz + tz*qw + tx*qy - ty*qx vst1q_f32(&chn_wnd_lcl_x[c], res_x); vst1q_f32(&chn_wnd_lcl_y[c], res_y); vst1q_f32(&chn_wnd_lcl_z[c], res_z); } // Compute local-space gravity { float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]); float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]); float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]); // Step 1: t = q_conj * v_world_as_quat float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); tx = vmlsq_f32(tx, cqz, vy); float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); ty = vmlsq_f32(ty, cqx, vz); float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); tz = vmlsq_f32(tz, cqy, vx); float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); tw = vmlsq_f32(tw, cqz, vz); // Step 2: res_vec = (t * q)_vec float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw); res_x = vmlsq_f32(res_x, tz, qy); res_x = vmlaq_f32(res_x, ty, qz); float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw); res_y = vmlsq_f32(res_y, tx, qz); res_y = vmlaq_f32(res_y, tz, qx); float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw); res_z = vmlsq_f32(res_z, ty, qx); res_z = vmlaq_f32(res_z, tx, qy); vst1q_f32(&chn_grav_lcl_x[c], res_x); vst1q_f32(&chn_grav_lcl_y[c], res_y); vst1q_f32(&chn_grav_lcl_z[c], res_z); } // Compute linear velocity { float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]); float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]); float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]); float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]); float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]); float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]); float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec); float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec); float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec); vst1q_f32(&vel_x[c], vel_x_vec); vst1q_f32(&vel_y[c], vel_y_vec); vst1q_f32(&vel_z[c], vel_z_vec); } // Compute angular velocity { float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]); float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]); float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]); float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]); float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]); float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]); float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]); float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]); // Step 1: Compute delta quaternion (to - from) float32x4_t dt_qx = vsubq_f32(qx, prev_qx); float32x4_t dt_qy = vsubq_f32(qy, prev_qy); float32x4_t dt_qz = vsubq_f32(qz, prev_qz); float32x4_t dt_qw = vsubq_f32(qw, prev_qw); // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) float32x4_t conj_qx = vnegq_f32(prev_qx); float32x4_t conj_qy = vnegq_f32(prev_qy); float32x4_t conj_qz = vnegq_f32(prev_qz); float32x4_t conj_qw = prev_qw; // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx); orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy); orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz); float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy); float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz); float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx); float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw); orient_qx = vaddq_f32(orient_qx, cross_x); float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw); orient_qy = vaddq_f32(orient_qy, cross_y); float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw); orient_qz = vaddq_f32(orient_qz, cross_z); // Step 4: Compute dot product (to, from) for shortest arc check float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx); dot = vmlaq_f32(dot, qy, prev_qy); dot = vmlaq_f32(dot, qz, prev_qz); // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // 1 if dot < 0, 0 otherwise float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false orient_qx = vmulq_f32(orient_qx, sign_selector); orient_qy = vmulq_f32(orient_qy, sign_selector); orient_qz = vmulq_f32(orient_qz, sign_selector); // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) // Inlined vrecip_f32_high_precision float32x4_t rcp_two_dt; rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec)); rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt)); rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt)); float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt); float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt); float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt); // Store results vst1q_f32(&ang_vel_x[c], ang_vel_x_vec); vst1q_f32(&ang_vel_y[c], ang_vel_y_vec); vst1q_f32(&ang_vel_z[c], ang_vel_z_vec); } } // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; // Load precomputed velocities and inertia parameters float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]); float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]); float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]); float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]); float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]); float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]); float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia); float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia); float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia); // Clamp inertia parameters to [0, 1] linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]); float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]); float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]); float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]); p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); // Linear inertia: v * dt * linear_inertia float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec); float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec); float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec); // Angular inertia: (ω × r) * dt * angular_inertia float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py); float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz); float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px); ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec); ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec); ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec); // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py); float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz); float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px); float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y); float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z); float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x); // Calculate displacement: (ω × (ω × r)) * dt^2 float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec); float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec); float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec); // Apply the centrifugal_inertia factor float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec); float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec); float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec); // Total delta float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x); float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y); float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z); // Update positions vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x)); vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y)); vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z)); } // Update previous transformation cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; cns->chn_prev_pos_z[c] = cns->chn_pos_z[c]; cns->chn_prev_quat_x[c] = cns->chn_quat_x[c]; cns->chn_prev_quat_y[c] = cns->chn_quat_y[c]; cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; } // Step 2: Verlet Integration for (int c = 0; c < MAX_CHNS; ++c) { int base_idx = c * PTC_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]); float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]); float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]); float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]); float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]); float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]); float32x4_t damping_vec = vdupq_n_f32(cfg->damping); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]); float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]); float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]); float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]); ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]); float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]); float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]); float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec); float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec); float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec); float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass); float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass); float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass); float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x); float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y); float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z); float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec); float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec); float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec); float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec); float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec); float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec); float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x)); float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y)); float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z)); vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x); vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y); vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z); vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x); vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y); vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z); } } // Step 3: Distance Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int r_base = c * CON_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness); float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor); float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain); float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec); // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i])); vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i])); vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i])); vst1q_f32(&pm_lcl[i], pm); } for (int iter = 0; iter < MAX_ITR; ++iter) { // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p0x = vld1q_f32(&px_lcl[i]); float32x4_t p0y = vld1q_f32(&py_lcl[i]); float32x4_t p0z = vld1q_f32(&pz_lcl[i]); float32x4_t p0m = vld1q_f32(&pm_lcl[i]); float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]); float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]); float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]); float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]); float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]); float32x4_t dx = vsubq_f32(p1x, p0x); float32x4_t dy = vsubq_f32(p1y, p0y); float32x4_t dz = vsubq_f32(p1z, p0z); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist; float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec); inv_dist = vrsqrteq_f32(x_rsqrt); inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist))); inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist))); float32x4_t w_sum = vaddq_f32(p0m, p1m); float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec); float32x4_t rcp_w_sum; rcp_w_sum = vrecpeq_f32(w_sum_clamped); rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum)); rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum)); float32x4_t dist; dist = vrecpeq_f32(inv_dist); dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist)); dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist)); float32x4_t diff = vsubq_f32(dist, rest_len_vec); float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec); float32x4_t rcp_max_rest_len_eps; rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps); rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps)); rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps)); float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps); float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs)); float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped)); float32x4_t rcp_dyn_stiff_denom; rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom); rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom)); rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom)); float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom); float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec); corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum); float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part); uint32x4_t apply_corr_mask = vcgtq_f32(dist_sq, eps_vec); // 1 if dist_sq > eps_vec, 0 otherwise float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist); float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist); float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist); float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude); float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude); float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude); dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0 dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec); dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec); vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m)); vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m)); vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m)); vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m)); vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m)); vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m)); } } for (int i = 0; i < PTC_PER_CHN; i += 4) { vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i])); vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i])); vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i])); } } // Step 4: Sphere Collisions with Friction for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; int s_base = c * SPH_PER_CHN; struct chn_cfg *cfg = &cns->chn_cfg[c]; float32x4_t fric_vec = vdupq_n_f32(cfg->friction); for (int s = 0; s < SPH_PER_CHN; ++s) { float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]); float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]); float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]); float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]); for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]); float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]); float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]); float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]); float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]); float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]); float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec); float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec); float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist_sphere; float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec); inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere); inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere))); inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere))); float32x4_t inv_dist = inv_dist_sphere; float32x4_t dist_sphere; dist_sphere = vrecpeq_f32(inv_dist); dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere)); dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere)); float32x4_t dist = dist_sphere; float32x4_t pen = vsubq_f32(sph_r_vec, dist); uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise float32x4_t norm_x = vmulq_f32(dx, inv_dist); float32x4_t norm_y = vmulq_f32(dy, inv_dist); float32x4_t norm_z = vmulq_f32(dz, inv_dist); float32x4_t norm_corr_x = vmulq_f32(norm_x, pen); float32x4_t norm_corr_y = vmulq_f32(norm_y, pen); float32x4_t norm_corr_z = vmulq_f32(norm_z, pen); float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x); float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y); float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z); float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x); vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y); vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z); float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x); float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y); float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z); float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x); tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y); tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z); float32x4_t inv_tan_mag_sphere; float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec); inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag); inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere))); inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere))); float32x4_t inv_tan_mag = inv_tan_mag_sphere; float32x4_t fric_mag = vmulq_f32(pen, fric_vec); float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag); float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag); float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag); float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x); float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y); float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z); total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec); total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec); total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec); total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass); total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass); total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass); float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x); float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y); float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z); vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x); vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y); vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z); } } } // Step 5: Motion Constraints for (int c = 0; c < MAX_CHNS; ++c) { int p_base = c * PTC_PER_CHN; // Loop by 4 for NEON for (int i = 0; i < PTC_PER_CHN; i += 4) { float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]); float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]); float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]); float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]); pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]); float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]); float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]); float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]); float32x4_t dx = vsubq_f32(px, rx); float32x4_t dy = vsubq_f32(py, ry); float32x4_t dz = vsubq_f32(pz, rz); float32x4_t dist_sq = vmulq_f32(dx, dx); dist_sq = vmlaq_f32(dist_sq, dy, dy); dist_sq = vmlaq_f32(dist_sq, dz, dz); float32x4_t inv_dist_motion; float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec); inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion); inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion))); inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion))); float32x4_t inv_dist = inv_dist_motion; float32x4_t dist_motion; dist_motion = vrecpeq_f32(inv_dist); dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion)); dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion)); float32x4_t dist = dist_motion; float32x4_t pen = vsubq_f32(dist, r_vec); uint32x4_t mask = vcgtq_f32(pen, zero_vec); // 1 if pen > 0, 0 otherwise float32x4_t corr_factor = vmulq_f32(pen, inv_dist); corr_factor = vbslq_f32(mask, corr_factor, zero_vec); float32x4_t dt_x = vmulq_f32(dx, corr_factor); float32x4_t dt_y = vmulq_f32(dy, corr_factor); float32x4_t dt_z = vmulq_f32(dz, corr_factor); dt_x = vmulq_f32(dt_x, pm); dt_y = vmulq_f32(dt_y, pm); dt_z = vmulq_f32(dt_z, pm); vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x)); vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y)); vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z)); } } }