commit 8338c772b70a024a2d39bbe56077a6048962f158
parent 2e0a013190c43fb24fea0a048c4955cf8cb1be83
Author: vaplv <vaplv@free.fr>
Date: Sun, 13 May 2018 18:01:04 +0200
Add and test the min/max/reduce_min/reduce_max v4i functions
Diffstat:
| M | src/sse/ssei.h | | | 62 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | src/test_v4i.c | | | 38 | ++++++++++++++++++++++++++++++++++++++ |
2 files changed, 100 insertions(+), 0 deletions(-)
diff --git a/src/sse/ssei.h b/src/sse/ssei.h
@@ -204,5 +204,67 @@ v4i_sel(const v4i_T vfalse, const v4i_T vtrue, const v4i_T vcond)
#endif
}
+static FINLINE v4i_T /* Per-lane minimum of two vectors of four signed 32-bit integers */
+v4i_min(const v4i_T v0, const v4i_T v1)
+{
+#ifdef SIMD_SSE4_1 /* Packed intrinsic path, compiled only when SIMD_SSE4_1 is defined */
+ return _mm_min_epi32(v0, v1);
+#else /* Fallback: compare the lanes one by one as int32_t scalars */
+ ALIGN(16) int32_t a[4]; /* scratch copy of v0; ALIGN(16) presumably requests 16-byte alignment */
+ ALIGN(16) int32_t b[4]; /* scratch copy of v1 */
+ v4i_store(a, v0); /* presumably writes the four lanes of v0 into a (helper defined elsewhere) */
+ v4i_store(b, v1);
+ return v4i_set /* rebuild a vector from the four scalar minima, in lane order 0..3 */
+ (MMIN(a[0], b[0]),
+ MMIN(a[1], b[1]),
+ MMIN(a[2], b[2]),
+ MMIN(a[3], b[3]));
+#endif
+}
+
+static FINLINE v4i_T /* Per-lane maximum of two vectors of four signed 32-bit integers */
+v4i_max(const v4i_T v0, const v4i_T v1)
+{
+#ifdef SIMD_SSE4_1 /* Packed intrinsic path, compiled only when SIMD_SSE4_1 is defined */
+ return _mm_max_epi32(v0, v1);
+#else /* Fallback: compare the lanes one by one as int32_t scalars */
+ ALIGN(16) int32_t a[4]; /* scratch copy of v0; ALIGN(16) presumably requests 16-byte alignment */
+ ALIGN(16) int32_t b[4]; /* scratch copy of v1 */
+ v4i_store(a, v0); /* presumably writes the four lanes of v0 into a (helper defined elsewhere) */
+ v4i_store(b, v1);
+ return v4i_set /* rebuild a vector from the four scalar maxima, in lane order 0..3 */
+ (MMAX(a[0], b[0]),
+ MMAX(a[1], b[1]),
+ MMAX(a[2], b[2]),
+ MMAX(a[3], b[3]));
+#endif
+}
+
+static FINLINE v4i_T /* Smallest of the four signed 32-bit lanes of v; this commit's test expects it in every lane */
+v4i_reduce_min(const v4i_T v)
+{
+#ifdef SIMD_SSE4_1 /* Vector path: two swizzle + v4i_min steps instead of a round trip through memory */
+ const v4i_T tmp = v4i_min(v4i_yxwz(v), v); /* presumably pairs x with y and z with w (swizzle defined elsewhere) */
+ return v4i_min(v4i_zwxy(tmp), tmp); /* presumably merges the two pair results so all lanes agree */
+#else /* Fallback: spill the lanes and reduce them as scalars */
+ ALIGN(16) int32_t a[4]; /* scratch copy of v */
+ v4i_store(a, v); /* presumably writes the four lanes of v into a */
+ return v4i_set1(MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]))); /* v4i_set1 presumably broadcasts the scalar */
+#endif
+}
+
+static FINLINE v4i_T /* Largest of the four signed 32-bit lanes of v; this commit's test expects it in every lane */
+v4i_reduce_max(const v4i_T v)
+{
+#ifdef SIMD_SSE4_1 /* Vector path: two swizzle + v4i_max steps instead of a round trip through memory */
+ const v4i_T tmp = v4i_max(v4i_yxwz(v), v); /* presumably pairs x with y and z with w (swizzle defined elsewhere) */
+ return v4i_max(v4i_zwxy(tmp), tmp); /* presumably merges the two pair results so all lanes agree */
+#else /* Fallback: spill the lanes and reduce them as scalars */
+ ALIGN(16) int32_t a[4]; /* scratch copy of v */
+ v4i_store(a, v); /* presumably writes the four lanes of v into a */
+ return v4i_set1(MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]))); /* v4i_set1 presumably broadcasts the scalar */
+#endif
+}
+
#endif /* RSIMD_SSEI_H */
diff --git a/src/test_v4i.c b/src/test_v4i.c
@@ -198,6 +198,44 @@ main(int argc, char** argv)
CHK(v4i_z(k) == 2);
CHK(v4i_w(k) == 3);
+ i = v4i_set(1, 2, 3, 4);
+ j = v4i_set(-2, -4, 3, 6);
+ k = v4i_min(i, j);
+ CHK(v4i_x(k) == -2);
+ CHK(v4i_y(k) == -4);
+ CHK(v4i_z(k) == 3);
+ CHK(v4i_w(k) == 4);
+
+ k = v4i_max(i, j);
+ CHK(v4i_x(k) == 1);
+ CHK(v4i_y(k) == 2);
+ CHK(v4i_z(k) == 3);
+ CHK(v4i_w(k) == 6);
+
+ k = v4i_reduce_min(i);
+ CHK(v4i_x(k) == 1);
+ CHK(v4i_y(k) == 1);
+ CHK(v4i_z(k) == 1);
+ CHK(v4i_w(k) == 1);
+
+ k = v4i_reduce_min(j);
+ CHK(v4i_x(k) == -4);
+ CHK(v4i_y(k) == -4);
+ CHK(v4i_z(k) == -4);
+ CHK(v4i_w(k) == -4);
+
+ k = v4i_reduce_max(i);
+ CHK(v4i_x(k) == 4);
+ CHK(v4i_y(k) == 4);
+ CHK(v4i_z(k) == 4);
+ CHK(v4i_w(k) == 4);
+
+ k = v4i_reduce_max(j);
+ CHK(v4i_x(k) == 6);
+ CHK(v4i_y(k) == 6);
+ CHK(v4i_z(k) == 6);
+ CHK(v4i_w(k) == 6);
+
return 0;
}