rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

avxi.h (5598B)


      1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr)
      2  *
      3  * The RSIMD library is free software: you can redistribute it and/or modify
      4  * it under the terms of the GNU General Public License as published
      5  * by the Free Software Foundation, either version 3 of the License, or
      6  * (at your option) any later version.
      7  *
      8  * The RSIMD library is distributed in the hope that it will be useful,
      9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     11  * GNU General Public License for more details.
     12  *
     13  * You should have received a copy of the GNU General Public License
     14  * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
     15 
     16 #ifndef RSIMD_AVXI_H
     17 #define RSIMD_AVXI_H
     18 
     19 /*
     20  * 8 packed signed integers
     21  */
     22 
     23 #include <rsys/math.h>
     24 #include <immintrin.h>
     25 
     26 typedef __m256i v8i_T;
     27 
     28 /*******************************************************************************
     29  * Set operations
     30  ******************************************************************************/
     31 static FINLINE int32_t*
     32 v8i_store(int32_t dst[8], v8i_T v)
     33 {
     34   ASSERT(dst && IS_ALIGNED(dst, 32));
     35   _mm256_store_si256((v8i_T*)dst, v);
     36   return dst;
     37 }
     38 
     39 static FINLINE v8i_T
     40 v8i_load(const int32_t src[8])
     41 {
     42   ASSERT(src && IS_ALIGNED(src, 32));
     43   return _mm256_load_si256((const v8i_T*)src);
     44 }
     45 
     46 static FINLINE v8i_T
     47 v8i_set1(const int32_t i)
     48 {
     49   return _mm256_set1_epi32(i);
     50 }
     51 
     52 static FINLINE v8i_T
     53 v8i_set
     54   (const int32_t a, const int32_t b, const int32_t c, const int32_t d,
     55    const int32_t e, const int32_t f, const int32_t g, const int32_t h)
     56 {
     57   return _mm256_set_epi32(h, g, f, e, d, c, b, a);
     58 }
     59 
     60 static FINLINE v8i_T
     61 v8i_zero(void)
     62 {
     63   return _mm256_setzero_si256();
     64 }
     65 
     66 static FINLINE v8i_T
     67 v8i_set_v4i(const v4i_T abcd, const v4i_T efgh)
     68 {
     69   v8i_T tmp = v8i_zero();
     70   tmp = _mm256_insertf128_si256(tmp, abcd, 0);
     71   tmp = _mm256_insertf128_si256(tmp, efgh, 1);
     72   return tmp;
     73 }
     74 
     75 /*******************************************************************************
     76  * Extract components
     77  ******************************************************************************/
     78 static FINLINE v4i_T
     79 v8i_abcd(const v8i_T v)
     80 {
     81   return _mm256_extractf128_si256(v, 0);
     82 }
     83 
     84 static FINLINE v4i_T
     85 v8i_efgh(const v8i_T v)
     86 {
     87   return _mm256_extractf128_si256(v, 1);
     88 }
     89 
     90 /*******************************************************************************
     91  * Bitwise operators
     92  ******************************************************************************/
     93 static FINLINE v8i_T
     94 v8i_or(const v8i_T v0, const v8i_T v1)
     95 {
     96   const v8f_T a = _mm256_castsi256_ps(v0);
     97   const v8f_T b = _mm256_castsi256_ps(v1);
     98   const v8f_T c = _mm256_or_ps(a, b);
     99   return _mm256_castps_si256(c);
    100 }
    101 
    102 static FINLINE v8i_T
    103 v8i_and(const v8i_T v0, const v8i_T v1)
    104 {
    105   const v8f_T a = _mm256_castsi256_ps(v0);
    106   const v8f_T b = _mm256_castsi256_ps(v1);
    107   const v8f_T c = _mm256_and_ps(a, b);
    108   return _mm256_castps_si256(c);
    109 }
    110 
    111 static FINLINE v8i_T
    112 v8i_andnot(const v8i_T v0, const v8i_T v1)
    113 {
    114   const v8f_T a = _mm256_castsi256_ps(v0);
    115   const v8f_T b = _mm256_castsi256_ps(v1);
    116   const v8f_T c = _mm256_andnot_ps(a, b);
    117   return _mm256_castps_si256(c);
    118 }
    119 
    120 static FINLINE v8i_T
    121 v8i_xor(const v8i_T v0, const v8i_T v1)
    122 {
    123   const v8f_T a = _mm256_castsi256_ps(v0);
    124   const v8f_T b = _mm256_castsi256_ps(v1);
    125   const v8f_T c = _mm256_xor_ps(a, b);
    126   return _mm256_castps_si256(c);
    127 }
    128 
    129 /*******************************************************************************
    130  * Comparators
    131  ******************************************************************************/
    132 static FINLINE v8i_T
    133 v8i_eq(const v8i_T v0, const v8i_T v1)
    134 {
    135   const v4i_T v0_abcd = v8i_abcd(v0);
    136   const v4i_T v0_efgh = v8i_efgh(v0);
    137   const v4i_T v1_abcd = v8i_abcd(v1);
    138   const v4i_T v1_efgh = v8i_efgh(v1);
    139   const v4i_T abcd = v4i_eq(v0_abcd, v1_abcd);
    140   const v4i_T efgh = v4i_eq(v0_efgh, v1_efgh);
    141   return v8i_set_v4i(abcd, efgh);
    142 }
    143 
    144 static FINLINE v8i_T
    145 v8i_neq(const v8i_T v0, const v8i_T v1)
    146 {
    147   const v4i_T v0_abcd = v8i_abcd(v0);
    148   const v4i_T v0_efgh = v8i_efgh(v0);
    149   const v4i_T v1_abcd = v8i_abcd(v1);
    150   const v4i_T v1_efgh = v8i_efgh(v1);
    151   const v4i_T abcd = v4i_neq(v0_abcd, v1_abcd);
    152   const v4i_T efgh = v4i_neq(v0_efgh, v1_efgh);
    153   return v8i_set_v4i(abcd, efgh);
    154 }
    155 
    156 static FINLINE v8i_T
    157 v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond)
    158 {
    159   const v8f_T a = _mm256_castsi256_ps(vfalse);
    160   const v8f_T b = _mm256_castsi256_ps(vtrue);
    161   const v8f_T c = _mm256_castsi256_ps(vcond);
    162   return _mm256_castps_si256(_mm256_blendv_ps(a, b, c));
    163 }
    164 
    165 static FINLINE v8i_T
    166 v8i_min(const v8i_T v0, const v8i_T v1)
    167 {
    168   const v4i_T v0_abcd = v8i_abcd(v0);
    169   const v4i_T v0_efgh = v8i_efgh(v0);
    170   const v4i_T v1_abcd = v8i_abcd(v1);
    171   const v4i_T v1_efgh = v8i_efgh(v1);
    172   const v4i_T abcd = v4i_min(v0_abcd, v1_abcd);
    173   const v4i_T efgh = v4i_min(v0_efgh, v1_efgh);
    174   return v8i_set_v4i(abcd, efgh);
    175 }
    176 
    177 static FINLINE v8i_T
    178 v8i_max(const v8i_T v0, const v8i_T v1)
    179 {
    180   const v4i_T v0_abcd = v8i_abcd(v0);
    181   const v4i_T v0_efgh = v8i_efgh(v0);
    182   const v4i_T v1_abcd = v8i_abcd(v1);
    183   const v4i_T v1_efgh = v8i_efgh(v1);
    184   const v4i_T abcd = v4i_max(v0_abcd, v1_abcd);
    185   const v4i_T efgh = v4i_max(v0_efgh, v1_efgh);
    186   return v8i_set_v4i(abcd, efgh);
    187 }
    188 
    189 static FINLINE int32_t
    190 v8i_reduce_min_i32(const v8i_T v)
    191 {
    192   const v4i_T tmp = v4i_min(v8i_abcd(v), v8i_efgh(v));
    193   return v4i_x(v4i_reduce_min(tmp));
    194 }
    195 
    196 static FINLINE int32_t
    197 v8i_reduce_max_i32(const v8i_T v)
    198 {
    199   const v4i_T tmp = v4i_max(v8i_abcd(v), v8i_efgh(v));
    200   return v4i_x(v4i_reduce_max(tmp));
    201 }
    202 
    203 #endif /* RSIMD_AVXI_H */
    204