/* avxi.h */
1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr) 2 * 3 * The RSIMD library is free software: you can redistribute it and/or modify 4 * it under the terms of the GNU General Public License as published 5 * by the Free Software Foundation, either version 3 of the License, or 6 * (at your option) any later version. 7 * 8 * The RSIMD library is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ 15 16 #ifndef RSIMD_AVXI_H 17 #define RSIMD_AVXI_H 18 19 /* 20 * 8 packed signed integers 21 */ 22 23 #include <rsys/math.h> 24 #include <immintrin.h> 25 26 typedef __m256i v8i_T; 27 28 /******************************************************************************* 29 * Set operations 30 ******************************************************************************/ 31 static FINLINE int32_t* 32 v8i_store(int32_t dst[8], v8i_T v) 33 { 34 ASSERT(dst && IS_ALIGNED(dst, 32)); 35 _mm256_store_si256((v8i_T*)dst, v); 36 return dst; 37 } 38 39 static FINLINE v8i_T 40 v8i_load(const int32_t src[8]) 41 { 42 ASSERT(src && IS_ALIGNED(src, 32)); 43 return _mm256_load_si256((const v8i_T*)src); 44 } 45 46 static FINLINE v8i_T 47 v8i_set1(const int32_t i) 48 { 49 return _mm256_set1_epi32(i); 50 } 51 52 static FINLINE v8i_T 53 v8i_set 54 (const int32_t a, const int32_t b, const int32_t c, const int32_t d, 55 const int32_t e, const int32_t f, const int32_t g, const int32_t h) 56 { 57 return _mm256_set_epi32(h, g, f, e, d, c, b, a); 58 } 59 60 static FINLINE v8i_T 61 v8i_zero(void) 62 { 63 return _mm256_setzero_si256(); 64 } 65 66 static FINLINE v8i_T 67 v8i_set_v4i(const v4i_T abcd, const v4i_T efgh) 68 { 69 v8i_T tmp = v8i_zero(); 70 
tmp = _mm256_insertf128_si256(tmp, abcd, 0); 71 tmp = _mm256_insertf128_si256(tmp, efgh, 1); 72 return tmp; 73 } 74 75 /******************************************************************************* 76 * Extract components 77 ******************************************************************************/ 78 static FINLINE v4i_T 79 v8i_abcd(const v8i_T v) 80 { 81 return _mm256_extractf128_si256(v, 0); 82 } 83 84 static FINLINE v4i_T 85 v8i_efgh(const v8i_T v) 86 { 87 return _mm256_extractf128_si256(v, 1); 88 } 89 90 /******************************************************************************* 91 * Bitwise operators 92 ******************************************************************************/ 93 static FINLINE v8i_T 94 v8i_or(const v8i_T v0, const v8i_T v1) 95 { 96 const v8f_T a = _mm256_castsi256_ps(v0); 97 const v8f_T b = _mm256_castsi256_ps(v1); 98 const v8f_T c = _mm256_or_ps(a, b); 99 return _mm256_castps_si256(c); 100 } 101 102 static FINLINE v8i_T 103 v8i_and(const v8i_T v0, const v8i_T v1) 104 { 105 const v8f_T a = _mm256_castsi256_ps(v0); 106 const v8f_T b = _mm256_castsi256_ps(v1); 107 const v8f_T c = _mm256_and_ps(a, b); 108 return _mm256_castps_si256(c); 109 } 110 111 static FINLINE v8i_T 112 v8i_andnot(const v8i_T v0, const v8i_T v1) 113 { 114 const v8f_T a = _mm256_castsi256_ps(v0); 115 const v8f_T b = _mm256_castsi256_ps(v1); 116 const v8f_T c = _mm256_andnot_ps(a, b); 117 return _mm256_castps_si256(c); 118 } 119 120 static FINLINE v8i_T 121 v8i_xor(const v8i_T v0, const v8i_T v1) 122 { 123 const v8f_T a = _mm256_castsi256_ps(v0); 124 const v8f_T b = _mm256_castsi256_ps(v1); 125 const v8f_T c = _mm256_xor_ps(a, b); 126 return _mm256_castps_si256(c); 127 } 128 129 /******************************************************************************* 130 * Comparators 131 ******************************************************************************/ 132 static FINLINE v8i_T 133 v8i_eq(const v8i_T v0, const v8i_T v1) 134 { 135 const v4i_T v0_abcd = 
v8i_abcd(v0); 136 const v4i_T v0_efgh = v8i_efgh(v0); 137 const v4i_T v1_abcd = v8i_abcd(v1); 138 const v4i_T v1_efgh = v8i_efgh(v1); 139 const v4i_T abcd = v4i_eq(v0_abcd, v1_abcd); 140 const v4i_T efgh = v4i_eq(v0_efgh, v1_efgh); 141 return v8i_set_v4i(abcd, efgh); 142 } 143 144 static FINLINE v8i_T 145 v8i_neq(const v8i_T v0, const v8i_T v1) 146 { 147 const v4i_T v0_abcd = v8i_abcd(v0); 148 const v4i_T v0_efgh = v8i_efgh(v0); 149 const v4i_T v1_abcd = v8i_abcd(v1); 150 const v4i_T v1_efgh = v8i_efgh(v1); 151 const v4i_T abcd = v4i_neq(v0_abcd, v1_abcd); 152 const v4i_T efgh = v4i_neq(v0_efgh, v1_efgh); 153 return v8i_set_v4i(abcd, efgh); 154 } 155 156 static FINLINE v8i_T 157 v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond) 158 { 159 const v8f_T a = _mm256_castsi256_ps(vfalse); 160 const v8f_T b = _mm256_castsi256_ps(vtrue); 161 const v8f_T c = _mm256_castsi256_ps(vcond); 162 return _mm256_castps_si256(_mm256_blendv_ps(a, b, c)); 163 } 164 165 static FINLINE v8i_T 166 v8i_min(const v8i_T v0, const v8i_T v1) 167 { 168 const v4i_T v0_abcd = v8i_abcd(v0); 169 const v4i_T v0_efgh = v8i_efgh(v0); 170 const v4i_T v1_abcd = v8i_abcd(v1); 171 const v4i_T v1_efgh = v8i_efgh(v1); 172 const v4i_T abcd = v4i_min(v0_abcd, v1_abcd); 173 const v4i_T efgh = v4i_min(v0_efgh, v1_efgh); 174 return v8i_set_v4i(abcd, efgh); 175 } 176 177 static FINLINE v8i_T 178 v8i_max(const v8i_T v0, const v8i_T v1) 179 { 180 const v4i_T v0_abcd = v8i_abcd(v0); 181 const v4i_T v0_efgh = v8i_efgh(v0); 182 const v4i_T v1_abcd = v8i_abcd(v1); 183 const v4i_T v1_efgh = v8i_efgh(v1); 184 const v4i_T abcd = v4i_max(v0_abcd, v1_abcd); 185 const v4i_T efgh = v4i_max(v0_efgh, v1_efgh); 186 return v8i_set_v4i(abcd, efgh); 187 } 188 189 static FINLINE int32_t 190 v8i_reduce_min_i32(const v8i_T v) 191 { 192 const v4i_T tmp = v4i_min(v8i_abcd(v), v8i_efgh(v)); 193 return v4i_x(v4i_reduce_min(tmp)); 194 } 195 196 static FINLINE int32_t 197 v8i_reduce_max_i32(const v8i_T v) 198 { 199 const v4i_T 
tmp = v4i_max(v8i_abcd(v), v8i_efgh(v)); 200 return v4i_x(v4i_reduce_max(tmp)); 201 } 202 203 #endif /* RSIMD_AVXI_H */ 204