aosf44.h (9775B)
1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr) 2 * 3 * The RSIMD library is free software: you can redistribute it and/or modify 4 * it under the terms of the GNU General Public License as published 5 * by the Free Software Foundation, either version 3 of the License, or 6 * (at your option) any later version. 7 * 8 * The RSIMD library is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ 15 16 #ifndef AOSF44_H 17 #define AOSF44_H 18 19 #include "aosf33.h" 20 #include "rsimd.h" 21 22 /* 23 * Functions on column major AoS float44 matrices. A 4x4 matrix is a set of 4 24 * 4-wide SIMD float vectors, each representing a matrix column. 25 */ 26 27 /******************************************************************************* 28 * Set operations 29 ******************************************************************************/ 30 static FINLINE float* 31 aosf44_store(float dst[16], const v4f_T m[4]) 32 { 33 ASSERT(m && dst); 34 35 if(IS_ALIGNED(dst, 16)) { 36 v4f_store(dst + 0, m[0]); 37 v4f_store(dst + 4, m[1]); 38 v4f_store(dst + 8, m[2]); 39 v4f_store(dst + 12, m[3]); 40 } else { 41 ALIGN(16) float tmp[4]; 42 int i; 43 FOR_EACH(i, 0, 4) { 44 v4f_store(tmp, m[i]); 45 dst[i*4 + 0] = tmp[0]; 46 dst[i*4 + 1] = tmp[1]; 47 dst[i*4 + 2] = tmp[2]; 48 dst[i*4 + 3] = tmp[3]; 49 } 50 } 51 return dst; 52 } 53 54 static FINLINE v4f_T* 55 aosf44_load(v4f_T m[4], const float src[16]) 56 { 57 ASSERT(m && src); 58 if(IS_ALIGNED(src, 16)) { 59 m[0] = v4f_load(src + 0); 60 m[1] = v4f_load(src + 4); 61 m[2] = v4f_load(src + 8); 62 m[3] = v4f_load(src + 12); 63 } else { 64 int i; 65 FOR_EACH(i, 0, 4) 66 m[i] = v4f_set(src[i*3+0], src[i*3+1], src[i*4+2], src[i*4+3]); 67 } 68 return m; 69 } 70 71 static FINLINE v4f_T* 72 aosf44_set 73 (v4f_T m[4], const v4f_T c0, const v4f_T c1, const v4f_T c2, const v4f_T c3) 74 { 75 ASSERT(m); 76 m[0] = c0, m[1] = c1, m[2] = c2, m[3] = c3; 77 return m; 78 } 79 80 static FINLINE v4f_T* 81 aosf44_identity(v4f_T m[4]) 82 { 83 ASSERT(m); 84 m[0] = v4f_set(1.f, 0.f, 0.f, 0.f); 85 m[1] = v4f_set(0.f, 1.f, 0.f, 0.f); 86 m[2] = v4f_set(0.f, 0.f, 1.f, 0.f); 87 m[3] = v4f_set(0.f, 0.f, 0.f, 1.f); 88 return m; 89 } 90 91 static FINLINE v4f_T* 92 aosf44_zero(v4f_T m[4]) 93 { 94 ASSERT(m); 95 m[0] = v4f_zero(); 96 m[1] = v4f_zero(); 97 m[2] = v4f_zero(); 98 m[3] = v4f_zero(); 99 return m; 100 } 101 102 static FINLINE v4f_T* 103 aosf44_set_row0(v4f_T m[4], const v4f_T v) 104 { 105 const v4f_T xyzw = v; 106 const v4f_T yyww = v4f_yyww(v); 107 const v4f_T zwzw = v4f_zwzw(v); 108 const v4f_T wwww = v4f_yyww(zwzw); 109 ASSERT(m); 110 m[0] = v4f_ayzw(m[0], xyzw); 111 m[1] = v4f_ayzw(m[1], yyww); 112 m[2] = v4f_ayzw(m[2], zwzw); 113 m[3] = v4f_ayzw(m[3], wwww); 114 return m; 115 } 116 117 static FINLINE v4f_T* 118 aosf44_set_row1(v4f_T m[4], const v4f_T v) 119 { 120 ASSERT(m); 121 m[0] = v4f_xbzw(m[0], v4f_xxyy(v)); 122 m[1] = v4f_xbzw(m[1], v); 123 m[2] = v4f_xbzw(m[2], v4f_zzww(v)); 124 m[3] = v4f_xbzw(m[3], v4f_zwzw(v)); 125 return m; 126 } 127 128 static FINLINE v4f_T* 129 aosf44_set_row2(v4f_T m[4], const v4f_T v) 130 { 131 ASSERT(m); 132 m[0] = v4f_xycw(m[0], v4f_xyxy(v)); 133 m[1] = v4f_xycw(m[1], v4f_xxyy(v)); 134 m[2] = v4f_xycw(m[2], v); 135 m[3] = v4f_xycw(m[3], v4f_zzww(v)); 136 return m; 137 } 138 139 static FINLINE v4f_T* 140 aosf44_set_row3(v4f_T m[4], const v4f_T v) 141 { 142 ASSERT(m); 143 m[0] = v4f_xyzd(m[0], v4f_xxxx(v)); 144 m[1] = v4f_xyzd(m[1], v4f_xxyy(v)); 145 m[2] = v4f_xyzd(m[2], v4f_xxzz(v)); 146 m[3] = v4f_xyzd(m[3], v); 147 return m; 148 } 149 150 static FINLINE v4f_T* 151 aosf44_set_row(v4f_T m[4], const v4f_T v, const int id) 152 { 153 const v4f_T mask = v4f_mask(-(id==0), -(id==1), -(id==2), -(id==3)); 154 ASSERT(m && id >= 0 && id <= 3); 155 m[0] = v4f_sel(m[0], v4f_xxxx(v), mask); 156 m[1] = v4f_sel(m[1], v4f_yyyy(v), mask); 157 m[2] = v4f_sel(m[2], v4f_zzzz(v), mask); 158 m[3] = v4f_sel(m[3], v4f_wwww(v), mask); 159 return m; 160 } 161 162 static FINLINE v4f_T* 163 aosf44_set_col(v4f_T m[4], const v4f_T v, const int id) 164 { 165 ASSERT(m && id >= 0 && id <= 3); 166 m[id] = v; 167 return m; 168 } 169 170 /******************************************************************************* 171 * Get operations 172 ******************************************************************************/ 173 static FINLINE v4f_T 174 aosf44_row0(const v4f_T m[4]) 175 { 176 ASSERT(m); 177 return v4f_048C 178 (v4f_xxxx(m[0]), v4f_xxxx(m[1]), v4f_xxxx(m[2]), v4f_xxxx(m[3])); 179 } 180 181 static FINLINE v4f_T 182 aosf44_row1(const v4f_T m[4]) 183 { 184 ASSERT(m); 185 return v4f_048C 186 (v4f_yyyy(m[0]), v4f_yyyy(m[1]), v4f_yyyy(m[2]), v4f_yyyy(m[3])); 187 } 188 189 static FINLINE v4f_T 190 aosf44_row2(const v4f_T m[4]) 191 { 192 ASSERT(m); 193 return v4f_048C 194 (v4f_zzzz(m[0]), v4f_zzzz(m[1]), v4f_zzzz(m[2]), v4f_zzzz(m[3])); 195 } 196 197 static FINLINE v4f_T 198 aosf44_row3(const v4f_T m[4]) 199 { 200 ASSERT(m); 201 return v4f_048C 202 (v4f_wwww(m[0]), v4f_wwww(m[1]), v4f_wwww(m[2]), v4f_wwww(m[3])); 203 } 204 205 static FINLINE v4f_T 206 aosf44_row(const v4f_T m[4], const int id) 207 { 208 ASSERT(m && id >= 0 && id <= 3); 209 if(id == 0) { 210 return aosf44_row0(m); 211 } else if(id == 1) { 212 return aosf44_row1(m); 213 } else if(id == 2) { 214 return aosf44_row2(m); 215 } else { 216 return aosf44_row3(m); 217 } 218 } 219 220 static FINLINE v4f_T 221 aosf44_col(const v4f_T m[4], const int id) 222 { 223 ASSERT(m && id >= 0 && id <= 3); 224 return m[id]; 225 } 226 227 /******************************************************************************* 228 * Arithmetic operations 229 ******************************************************************************/ 230 static FINLINE v4f_T* 231 aosf44_add(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) 232 { 233 ASSERT(res && m0 && m1); 234 res[0] = v4f_add(m0[0], m1[0]); 235 res[1] = v4f_add(m0[1], m1[1]); 236 res[2] = v4f_add(m0[2], m1[2]); 237 res[3] = v4f_add(m0[3], m1[3]); 238 return res; 239 } 240 241 static FINLINE v4f_T* 242 aosf44_sub(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) 243 { 244 ASSERT(res && m0 && m1); 245 res[0] = v4f_sub(m0[0], m1[0]); 246 res[1] = v4f_sub(m0[1], m1[1]); 247 res[2] = v4f_sub(m0[2], m1[2]); 248 res[3] = v4f_sub(m0[3], m1[3]); 249 return res; 250 } 251 252 static FINLINE v4f_T* 253 aosf44_minus(v4f_T res[4], const v4f_T m[4]) 254 { 255 ASSERT(res && m); 256 res[0] = v4f_minus(m[0]); 257 res[1] = v4f_minus(m[1]); 258 res[2] = v4f_minus(m[2]); 259 res[3] = v4f_minus(m[3]); 260 return res; 261 } 262 263 static FINLINE v4f_T* 264 aosf44_abs(v4f_T res[4], const v4f_T m[4]) 265 { 266 ASSERT(res && m); 267 res[0] = v4f_abs(m[0]); 268 res[1] = v4f_abs(m[1]); 269 res[2] = v4f_abs(m[2]); 270 res[3] = v4f_abs(m[3]); 271 return res; 272 } 273 274 static FINLINE v4f_T* 275 aosf44_mul(v4f_T res[4], const v4f_T m[4], const v4f_T v) 276 { 277 ASSERT(res && m); 278 res[0] = v4f_mul(m[0], v); 279 res[1] = v4f_mul(m[1], v); 280 res[2] = v4f_mul(m[2], v); 281 res[3] = v4f_mul(m[3], v); 282 return res; 283 } 284 285 static FINLINE v4f_T 286 aosf44_mulf4(const v4f_T m[4], const v4f_T v) 287 { 288 v4f_T r0, r1, r2; 289 ASSERT(m); 290 r0 = v4f_mul(m[0], v4f_xxxx(v)); 291 r1 = v4f_madd(m[1], v4f_yyyy(v), r0); 292 r2 = v4f_madd(m[2], v4f_zzzz(v), r1); 293 return v4f_madd(m[3], v4f_wwww(v), r2); 294 } 295 296 static FINLINE v4f_T 297 aosf4_mulf44(v4f_T v, const v4f_T m[4]) 298 { 299 v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw; 300 ASSERT(m); 301 xxxx = v4f_dot(v, m[0]); 302 yyyy = v4f_dot(v, m[1]); 303 zzzz = v4f_dot(v, m[2]); 304 wwww = v4f_dot(v, m[3]); 305 xyxy = v4f_xayb(xxxx, yyyy); 306 zwzw = v4f_xayb(zzzz, wwww); 307 return v4f_xyab(xyxy, zwzw); 308 } 309 310 static FINLINE v4f_T* 311 aosf44_mulf44 312 (v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) 313 { 314 v4f_T c0, c1, c2, c3; 315 ASSERT(res && m0 && m1); 316 c0 = aosf44_mulf4(m0, m1[0]); 317 c1 = aosf44_mulf4(m0, m1[1]); 318 c2 = aosf44_mulf4(m0, m1[2]); 319 c3 = aosf44_mulf4(m0, m1[3]); 320 res[0] = c0; 321 res[1] = c1; 322 res[2] = c2; 323 res[3] = c3; 324 return res; 325 } 326 327 static FINLINE v4f_T* 328 aosf44_transpose(v4f_T res[4], const v4f_T m[4]) 329 { 330 v4f_T in_c0, in_c1, in_c2, in_c3; 331 v4f_T x0x2y0y2, x1x3y1y3, z0z2w0w2, z1z3w1w3; 332 ASSERT(res && m); 333 in_c0 = m[0]; 334 in_c1 = m[1]; 335 in_c2 = m[2]; 336 in_c3 = m[3]; 337 x0x2y0y2 = v4f_xayb(in_c0, in_c2); 338 x1x3y1y3 = v4f_xayb(in_c1, in_c3); 339 z0z2w0w2 = v4f_zcwd(in_c0, in_c2); 340 z1z3w1w3 = v4f_zcwd(in_c1, in_c3); 341 res[0] = v4f_xayb(x0x2y0y2, x1x3y1y3); 342 res[1] = v4f_zcwd(x0x2y0y2, x1x3y1y3); 343 res[2] = v4f_xayb(z0z2w0w2, z1z3w1w3); 344 res[3] = v4f_zcwd(z0z2w0w2, z1z3w1w3); 345 return res; 346 } 347 348 static FINLINE v4f_T 349 aosf44_det(const v4f_T m[4]) 350 { 351 v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw, xyzw; 352 v4f_T f33_012_012[3], f33_012_013[3], f33_012_023[3], f33_012_123[3]; 353 ASSERT(m); 354 aosf33_set(f33_012_012, m[0], m[1], m[2]); 355 aosf33_set(f33_012_013, m[0], m[1], m[3]); 356 aosf33_set(f33_012_023, m[0], m[2], m[3]); 357 aosf33_set(f33_012_123, m[1], m[2], m[3]); 358 xxxx = v4f_minus(aosf33_det(f33_012_123)); 359 yyyy = aosf33_det(f33_012_023); 360 zzzz = v4f_minus(aosf33_det(f33_012_013)); 361 wwww = aosf33_det(f33_012_012); 362 xyxy = v4f_xayb(xxxx, yyyy); 363 zwzw = v4f_xayb(zzzz, wwww); 364 xyzw = v4f_xyab(xyxy, zwzw); 365 return v4f_dot(xyzw, aosf44_row3(m)); 366 } 367 368 RSIMD_API v4f_T /* Return the determinant */ 369 aosf44_inverse(v4f_T out[4], const v4f_T in[4]); 370 371 static FINLINE v4f_T /* Return the determinant */ 372 aosf44_invtrans(v4f_T out[4], const v4f_T a[4]) 373 { 374 v4f_T det; 375 ASSERT(out && a); 376 det = aosf44_inverse(out, a); 377 aosf44_transpose(out, out); 378 return det; 379 } 380 381 static FINLINE v4f_T 382 aosf44_eq(const v4f_T a[4], const v4f_T b[4]) 383 { 384 ASSERT(a && b); 385 if(a == b) { 386 return v4f_true(); 387 } else { 388 const v4f_T eq_c0 = v4f_eq(a[0], b[0]); 389 const v4f_T eq_c1 = v4f_eq(a[1], b[1]); 390 const v4f_T eq_c2 = v4f_eq(a[2], b[2]); 391 const v4f_T eq_c3 = v4f_eq(a[3], b[3]); 392 const v4f_T eq = v4f_and(v4f_and(eq_c0, eq_c1), v4f_and(eq_c2, eq_c3)); 393 const v4f_T tmp = v4f_and(v4f_xzxz(eq), v4f_ywyw(eq)); 394 const v4f_T ret = v4f_and(tmp, v4f_yxwz(tmp)); 395 return ret; 396 } 397 } 398 399 #endif /* AOSF44_H */ 400