rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 8338c772b70a024a2d39bbe56077a6048962f158
parent 2e0a013190c43fb24fea0a048c4955cf8cb1be83
Author: vaplv <vaplv@free.fr>
Date:   Sun, 13 May 2018 18:01:04 +0200

Add and test the min/max/reduce_min/reduce_max v4i functions

Diffstat:
M src/sse/ssei.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/test_v4i.c | 38 ++++++++++++++++++++++++++++++++++++++
2 files changed, 100 insertions(+), 0 deletions(-)

diff --git a/src/sse/ssei.h b/src/sse/ssei.h @@ -204,5 +204,67 @@ v4i_sel(const v4i_T vfalse, const v4i_T vtrue, const v4i_T vcond) #endif } +/* Lane-wise minimum of the four 32-bit signed integers of v0 and v1. */ static FINLINE v4i_T +v4i_min(const v4i_T v0, const v4i_T v1) +{ +#ifdef SIMD_SSE4_1 + return _mm_min_epi32(v0, v1); +#else + /* Without SIMD_SSE4_1: store both vectors into int32_t arrays and take MMIN lane by lane. */ ALIGN(16) int32_t a[4]; + ALIGN(16) int32_t b[4]; + v4i_store(a, v0); + v4i_store(b, v1); + return v4i_set + (MMIN(a[0], b[0]), + MMIN(a[1], b[1]), + MMIN(a[2], b[2]), + MMIN(a[3], b[3])); +#endif +} + /* Lane-wise maximum of the four 32-bit signed integers of v0 and v1. */ +static FINLINE v4i_T +v4i_max(const v4i_T v0, const v4i_T v1) +{ +#ifdef SIMD_SSE4_1 + return _mm_max_epi32(v0, v1); +#else + /* Without SIMD_SSE4_1: store both vectors into int32_t arrays and take MMAX lane by lane. */ ALIGN(16) int32_t a[4]; + ALIGN(16) int32_t b[4]; + v4i_store(a, v0); + v4i_store(b, v1); + return v4i_set + (MMAX(a[0], b[0]), + MMAX(a[1], b[1]), + MMAX(a[2], b[2]), + MMAX(a[3], b[3])); +#endif +} + /* Smallest of the four lanes of v; the accompanying test expects that value in all four lanes of the result. */ +static FINLINE v4i_T +v4i_reduce_min(const v4i_T v) +{ +#ifdef SIMD_SSE4_1 + /* Two v4i_min steps against reordered copies of the vector (v4i_yxwz, then v4i_zwxy; presumably lane swizzles - confirm against their definitions). */ const v4i_T tmp = v4i_min(v4i_yxwz(v), v); + return v4i_min(v4i_zwxy(tmp), tmp); +#else + ALIGN(16) int32_t a[4]; + v4i_store(a, v); + return v4i_set1(MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]))); +#endif +} + /* Largest of the four lanes of v; the accompanying test expects that value in all four lanes of the result. */ +static FINLINE v4i_T +v4i_reduce_max(const v4i_T v) +{ +#ifdef SIMD_SSE4_1 + /* Same two-step scheme as v4i_reduce_min, using v4i_max. */ const v4i_T tmp = v4i_max(v4i_yxwz(v), v); + return v4i_max(v4i_zwxy(tmp), tmp); +#else + ALIGN(16) int32_t a[4]; + v4i_store(a, v); + return v4i_set1(MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]))); +#endif +} + #endif /* RSIMD_SSEI_H */ diff --git a/src/test_v4i.c b/src/test_v4i.c @@ -198,6 +198,44 @@ main(int argc, char** argv) CHK(v4i_z(k) == 2); CHK(v4i_w(k) == 3); + /* Lane-wise min of two vectors mixing positive, negative and equal lanes. */ i = v4i_set(1, 2, 3, 4); + j = v4i_set(-2, -4, 3, 6); + k = v4i_min(i, j); + CHK(v4i_x(k) == -2); + CHK(v4i_y(k) == -4); + CHK(v4i_z(k) == 3); + CHK(v4i_w(k) == 4); + /* Lane-wise max of the same operands. */ + k = v4i_max(i, j); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 2); + CHK(v4i_z(k) == 3); + CHK(v4i_w(k) == 6); + /* Reductions: every lane of the result must hold the reduced value. */ + k = v4i_reduce_min(i); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 1); + CHK(v4i_z(k) == 1); + CHK(v4i_w(k) == 1); + + k = v4i_reduce_min(j); + CHK(v4i_x(k) == -4); + CHK(v4i_y(k) == -4); + 
CHK(v4i_z(k) == -4); + CHK(v4i_w(k) == -4); + + k = v4i_reduce_max(i); + CHK(v4i_x(k) == 4); + CHK(v4i_y(k) == 4); + CHK(v4i_z(k) == 4); + CHK(v4i_w(k) == 4); + + k = v4i_reduce_max(j); + CHK(v4i_x(k) == 6); + CHK(v4i_y(k) == 6); + CHK(v4i_z(k) == 6); + CHK(v4i_w(k) == 6); + return 0; }