avxf.h (7306B)
1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr) 2 * 3 * The RSIMD library is free software: you can redistribute it and/or modify 4 * it under the terms of the GNU General Public License as published 5 * by the Free Software Foundation, either version 3 of the License, or 6 * (at your option) any later version. 7 * 8 * The RSIMD library is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ 15 16 #ifndef RSIMD_AVXF_H 17 #define RSIMD_AVXF_H 18 19 /* 20 * 8 packed single precision floating-point values 21 */ 22 23 #include "avx.h" 24 25 #include <rsys/math.h> 26 #include <immintrin.h> 27 28 typedef __m256 v8f_T; 29 30 /******************************************************************************* 31 * Set operations 32 ******************************************************************************/ 33 static FINLINE float* 34 v8f_store(float dst[8], v8f_T v) 35 { 36 ASSERT(dst && IS_ALIGNED(dst, 32)); 37 _mm256_store_ps(dst, v); 38 return dst; 39 } 40 41 static FINLINE v8f_T 42 v8f_load(const float src[8]) 43 { 44 ASSERT(src && IS_ALIGNED(src, 32)); 45 return _mm256_load_ps(src); 46 } 47 48 static FINLINE v8f_T 49 v8f_loadu(const float f[8]) 50 { 51 ASSERT(f); 52 return _mm256_set_ps(f[7], f[6], f[5], f[4], f[3],f[2], f[1], f[0]); 53 } 54 55 static FINLINE v8f_T 56 v8f_set1(const float x) 57 { 58 return _mm256_set1_ps(x); 59 } 60 61 static FINLINE v8f_T 62 v8f_set 63 (const float a, const float b, const float c, const float d, 64 const float e, const float f, const float g, const float h) 65 { 66 return _mm256_set_ps(h, g, f, e, d, c, b, a); 67 } 68 69 static FINLINE v8f_T 70 v8f_zero(void) 71 { 72 return _mm256_setzero_ps(); 73 } 74 75 static FINLINE v8f_T 76 v8f_mask 77 (const int32_t a, const int32_t b, const int32_t c, const int32_t d, 78 const int32_t e, const int32_t f, const int32_t g, const int32_t h) 79 { 80 return _mm256_castsi256_ps(_mm256_set_epi32(h, g, f, e, d, c, b, a)); 81 } 82 83 static FINLINE v8f_T 84 v8f_mask1(const int32_t x) 85 { 86 return _mm256_castsi256_ps(_mm256_set1_epi32(x)); 87 } 88 89 static FINLINE v8f_T 90 v8f_true(void) 91 { 92 return _mm256_castsi256_ps(_mm256_set1_epi32(~0)); 93 } 94 95 static FINLINE v8f_T 96 v8f_false(void) 97 { 98 return v8f_zero(); 99 } 100 101 /******************************************************************************* 102 * Extract components 103 ******************************************************************************/ 104 static FINLINE v4f_T 105 v8f_abcd(const v8f_T v) 106 { 107 return _mm256_extractf128_ps(v, 0); 108 } 109 110 static FINLINE v4f_T 111 v8f_efgh(const v8f_T v) 112 { 113 return _mm256_extractf128_ps(v, 1); 114 } 115 116 static FINLINE int 117 v8f_movemask(const v8f_T v) 118 { 119 return _mm256_movemask_ps(v); 120 } 121 122 /******************************************************************************* 123 * Bitwise operations 124 ******************************************************************************/ 125 static FINLINE v8f_T 126 v8f_or(const v8f_T v0, const v8f_T v1) 127 { 128 return _mm256_or_ps(v0, v1); 129 } 130 131 static FINLINE v8f_T 132 v8f_and(const v8f_T v0, const v8f_T v1) 133 { 134 return _mm256_and_ps(v0, v1); 135 } 136 137 static FINLINE v8f_T 138 v8f_andnot(const v8f_T v0, const v8f_T v1) 139 { 140 return _mm256_andnot_ps(v0, v1); 141 } 142 143 static FINLINE v8f_T 144 v8f_xor(const v8f_T v0, const v8f_T v1) 145 { 146 return _mm256_xor_ps(v0, v1); 147 } 148 149 static FINLINE v8f_T 150 v8f_sel(const v8f_T vfalse, const v8f_T vtrue, const v8f_T vcond) 151 { 152 return _mm256_blendv_ps(vfalse, vtrue, vcond); 153 } 154 155 /******************************************************************************* 156 * Arithmetic operations 157 ******************************************************************************/ 158 static FINLINE v8f_T 159 v8f_minus(const v8f_T v) 160 { 161 return v8f_xor(v8f_set1(-0.f), v); 162 } 163 164 static FINLINE v8f_T 165 v8f_add(const v8f_T v0, const v8f_T v1) 166 { 167 return _mm256_add_ps(v0, v1); 168 } 169 170 static FINLINE v8f_T 171 v8f_sub(const v8f_T v0, const v8f_T v1) 172 { 173 return _mm256_sub_ps(v0, v1); 174 } 175 176 static FINLINE v8f_T 177 v8f_mul(const v8f_T v0, const v8f_T v1) 178 { 179 return _mm256_mul_ps(v0, v1); 180 } 181 182 static FINLINE v8f_T 183 v8f_div(const v8f_T v0, const v8f_T v1) 184 { 185 return _mm256_div_ps(v0, v1); 186 } 187 188 static FINLINE v8f_T 189 v8f_madd(const v8f_T v0, const v8f_T v1, const v8f_T v2) 190 { 191 return _mm256_add_ps(_mm256_mul_ps(v0, v1), v2); 192 } 193 194 static FINLINE v8f_T 195 v8f_abs(const v8f_T v) 196 { 197 const union { int32_t i; float f; } mask = { 0x7fffffff }; 198 return v8f_and(v, v8f_set1(mask.f)); 199 } 200 201 static FINLINE v8f_T 202 v8f_sqrt(const v8f_T v) 203 { 204 return _mm256_sqrt_ps(v); 205 } 206 207 static FINLINE v8f_T 208 v8f_rsqrte(const v8f_T v) 209 { 210 return _mm256_rsqrt_ps(v); 211 } 212 213 static FINLINE v8f_T 214 v8f_rsqrt(const v8f_T v) 215 { 216 const v8f_T y = v8f_rsqrte(v); 217 const v8f_T yyv = v8f_mul(v8f_mul(y, y), v); 218 const v8f_T tmp = v8f_sub(v8f_set1(1.5f), v8f_mul(yyv, v8f_set1(0.5f))); 219 return v8f_mul(tmp, y); 220 } 221 222 static FINLINE v8f_T 223 v8f_rcpe(const v8f_T v) 224 { 225 return _mm256_rcp_ps(v); 226 } 227 228 static FINLINE v8f_T 229 v8f_rcp(const v8f_T v) 230 { 231 const v8f_T y = v8f_rcpe(v); 232 const v8f_T tmp = v8f_sub(v8f_set1(2.f), v8f_mul(y, v)); 233 return v8f_mul(tmp, y); 234 } 235 236 static FINLINE v8f_T 237 v8f_lerp(const v8f_T from, const v8f_T to, const v8f_T param) 238 { 239 return v8f_madd(v8f_sub(to, from), param, from); 240 } 241 242 /******************************************************************************* 243 * Comparators 244 ******************************************************************************/ 245 static FINLINE v8f_T 246 v8f_eq(const v8f_T v0, const v8f_T v1) 247 { 248 return _mm256_cmp_ps(v0, v1, _CMP_EQ_OS); 249 } 250 251 static FINLINE v8f_T 252 v8f_neq(const v8f_T v0, const v8f_T v1) 253 { 254 return _mm256_cmp_ps(v0, v1, _CMP_NEQ_OS); 255 } 256 257 static FINLINE v8f_T 258 v8f_ge(const v8f_T v0, const v8f_T v1) 259 { 260 return _mm256_cmp_ps(v0, v1, _CMP_GE_OS); 261 } 262 263 static FINLINE v8f_T 264 v8f_le(const v8f_T v0, const v8f_T v1) 265 { 266 return _mm256_cmp_ps(v0, v1, _CMP_LE_OS); 267 } 268 269 static FINLINE v8f_T 270 v8f_gt(const v8f_T v0, const v8f_T v1) 271 { 272 return _mm256_cmp_ps(v0, v1, _CMP_GT_OS); 273 } 274 275 static FINLINE v8f_T 276 v8f_lt(const v8f_T v0, const v8f_T v1) 277 { 278 return _mm256_cmp_ps(v0, v1, _CMP_LT_OS); 279 } 280 281 static FINLINE v8f_T 282 v8f_eq_eps(const v8f_T v0, const v8f_T v1, const v8f_T eps) 283 { 284 return v8f_le(v8f_abs(v8f_sub(v0, v1)), eps); 285 } 286 287 static FINLINE v8f_T 288 v8f_min(const v8f_T v0, const v8f_T v1) 289 { 290 return _mm256_min_ps(v0, v1); 291 } 292 293 static FINLINE v8f_T 294 v8f_max(const v8f_T v0, const v8f_T v1) 295 { 296 return _mm256_max_ps(v0, v1); 297 } 298 299 static FINLINE float 300 v8f_reduce_min(const v8f_T v0) 301 { 302 ALIGN(32) float tmp[8]; 303 const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); 304 const v8f_T v2 = _mm256_min_ps(v0, v1); 305 const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); 306 const v8f_T v4 = _mm256_min_ps(v2, v3); 307 _mm256_store_ps(tmp, v4); 308 return MMIN(tmp[0], tmp[4]); 309 } 310 311 static FINLINE float 312 v8f_reduce_max(const v8f_T v0) 313 { 314 ALIGN(32) float tmp[8]; 315 const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); 316 const v8f_T v2 = _mm256_max_ps(v0, v1); 317 const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); 318 const v8f_T v4 = _mm256_max_ps(v2, v3); 319 _mm256_store_ps(tmp, v4); 320 return MMAX(tmp[0], tmp[4]); 321 } 322 323 static FINLINE v8f_T 324 v8f_clamp(const v8f_T v, const v8f_T vmin, const v8f_T vmax) 325 { 326 return v8f_min(v8f_max(v, vmin), vmax); 327 } 328 329 #endif /* RSIMD_AVX_H */ 330