commit 07f25d2d6f1905054d968f9d7aefd8eff73b4f02
parent 9870c031f34427d5d4a01e34fd5f72a42a8e0f28
Author: vaplv <vaplv@free.fr>
Date: Sat, 2 Jun 2018 15:08:47 +0200
Add and test the v8i_T API
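
The new avxi.h header provides 8 packed 32-bit signed integers (v8i_T) on top
of AVX. Since AVX without AVX2 has no 256-bit integer compare, min/max or
bitwise instructions, the bitwise operators go through __m256 casts and the
comparators, min/max and reductions fall back to scalar code on a stored
array; the AVX2 forms (_mm256_cmpeq_epi32, _mm256_min_epi32, ...) could
replace them later. The SSE backend gains the matching v4i_reduce_min_i32
and v4i_reduce_max_i32 helpers.

A minimal usage sketch of the new API, built like the test with -mavx on GCC
and assuming only the rsimd.h umbrella header plus the ALIGN macro already
used by the existing tests:

    #include "rsimd.h" /* illustrative: same include as src/test_v8i.c */
    #include <stdio.h>

    int main(void)
    {
      /* Two vectors of 8 packed 32-bit signed integers. */
      const v8i_T a = v8i_set(1, 2, 3, 4, 5, 6, 7, 8);
      const v8i_T b = v8i_set1(5);

      /* Lane-wise minimum, reduced to a single scalar. */
      printf("min = %d\n", v8i_reduce_min_i32(v8i_min(a, b))); /* 1 */

      /* Zero every lane of a that differs from b: only the lane
       * holding 5 survives. */
      ALIGN(32) int32_t out[8];
      v8i_store(out, v8i_sel(v8i_zero(), a, v8i_eq(a, b)));
      printf("out[0]=%d out[4]=%d\n", out[0], out[4]); /* 0 5 */
      return 0;
    }
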
Diffstat:
6 files changed, 409 insertions(+), 2 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -115,6 +115,7 @@ if(NOT NO_TEST)
if(AVX AND CMAKE_COMPILER_IS_GNUCC)
new_test(test_v8f "-mavx")
+ new_test(test_v8i "-mavx")
endif(AVX AND CMAKE_COMPILER_IS_GNUCC)
endif(NOT NO_TEST)
diff --git a/src/avx/avx.h b/src/avx/avx.h
@@ -17,8 +17,7 @@
#define RSIMD_AVX_H
#include "avxf.h"
-
-typedef __m256i v8i_T;
+#include "avxi.h"
/* Reinterpret cast */
static FINLINE v8i_T v8f_rcast_v8i(const v8f_T v) {return _mm256_castps_si256(v);}
diff --git a/src/avx/avxi.h b/src/avx/avxi.h
@@ -0,0 +1,205 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef RSIMD_AVXI_H
+#define RSIMD_AVXI_H
+
+/*
+ * 8 packed signed integers
+ */
+
+#include "avx.h"
+
+#include <rsys/math.h>
+#include <immintrin.h>
+
+typedef __m256i v8i_T;
+
+/*******************************************************************************
+ * Set operations
+ ******************************************************************************/
+static FINLINE int32_t*
+v8i_store(int32_t dst[8], v8i_T v)
+{
+ ASSERT(dst && IS_ALIGNED(dst, 32));
+ _mm256_store_si256((v8i_T*)dst, v);
+ return dst;
+}
+
+static FINLINE v8i_T
+v8i_load(const int32_t src[8])
+{
+ ASSERT(src && IS_ALIGNED(src, 32));
+ return _mm256_load_si256((const v8i_T*)src);
+}
+
+static FINLINE v8i_T
+v8i_set1(const int32_t i)
+{
+ return _mm256_set1_epi32(i);
+}
+
+static FINLINE v8i_T
+v8i_set
+ (const int32_t a, const int32_t b, const int32_t c, const int32_t d,
+ const int32_t e, const int32_t f, const int32_t g, const int32_t h)
+{
+ return _mm256_set_epi32(h, g, f, e, d, c, b, a);
+}
+
+static FINLINE v8i_T
+v8i_zero(void)
+{
+ return _mm256_setzero_si256();
+}
+
+/*******************************************************************************
+ * Extract components
+ ******************************************************************************/
+static FINLINE v4i_T
+v8i_abcd(const v8i_T v)
+{
+ return _mm256_extractf128_si256(v, 0);
+}
+
+static FINLINE v4i_T
+v8i_efgh(const v8i_T v)
+{
+ return _mm256_extractf128_si256(v, 1);
+}
+
+/*******************************************************************************
+ * Bitwise operators
+ ******************************************************************************/
+static FINLINE v8i_T
+v8i_or(const v8i_T v0, const v8i_T v1)
+{
+ const v8f_T a = _mm256_castsi256_ps(v0);
+ const v8f_T b = _mm256_castsi256_ps(v1);
+ const v8f_T c = _mm256_or_ps(a, b);
+ return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_and(const v8i_T v0, const v8i_T v1)
+{
+ const v8f_T a = _mm256_castsi256_ps(v0);
+ const v8f_T b = _mm256_castsi256_ps(v1);
+ const v8f_T c = _mm256_and_ps(a, b);
+ return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_andnot(const v8i_T v0, const v8i_T v1)
+{
+ const v8f_T a = _mm256_castsi256_ps(v0);
+ const v8f_T b = _mm256_castsi256_ps(v1);
+ const v8f_T c = _mm256_andnot_ps(a, b);
+ return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_xor(const v8i_T v0, const v8i_T v1)
+{
+ const v8f_T a = _mm256_castsi256_ps(v0);
+ const v8f_T b = _mm256_castsi256_ps(v1);
+ const v8f_T c = _mm256_xor_ps(a, b);
+ return _mm256_castps_si256(c);
+}
+
+/*******************************************************************************
+ * Comparators
+ ******************************************************************************/
+static FINLINE v8i_T
+v8i_eq(const v8i_T v0, const v8i_T v1)
+{
+ ALIGN(32) int32_t a[8];
+ ALIGN(32) int32_t b[8];
+ v8i_store(a, v0);
+ v8i_store(b, v1);
+ return v8i_set
+ (-(a[0]==b[0]),-(a[1]==b[1]),-(a[2]==b[2]),-(a[3]==b[3]),
+ -(a[4]==b[4]),-(a[5]==b[5]),-(a[6]==b[6]),-(a[7]==b[7]));
+
+}
+
+static FINLINE v8i_T
+v8i_neq(const v8i_T v0, const v8i_T v1)
+{
+ ALIGN(32) int32_t a[8];
+ ALIGN(32) int32_t b[8];
+ v8i_store(a, v0);
+ v8i_store(b, v1);
+ return v8i_set
+ (-(a[0]!=b[0]),-(a[1]!=b[1]),-(a[2]!=b[2]),-(a[3]!=b[3]),
+ -(a[4]!=b[4]),-(a[5]!=b[5]),-(a[6]!=b[6]),-(a[7]!=b[7]));
+
+}
+
+static FINLINE v8i_T
+v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond)
+{
+ const v8f_T a = _mm256_castsi256_ps(vfalse);
+ const v8f_T b = _mm256_castsi256_ps(vtrue);
+ const v8f_T c = _mm256_castsi256_ps(vcond);
+ return _mm256_castps_si256(_mm256_blendv_ps(a, b, c));
+}
+
+static FINLINE v8i_T
+v8i_min(const v8i_T v0, const v8i_T v1)
+{
+ ALIGN(32) int32_t a[8];
+ ALIGN(32) int32_t b[8];
+ v8i_store(a, v0);
+ v8i_store(b, v1);
+ return v8i_set
+ (MMIN(a[0],b[0]), MMIN(a[1],b[1]), MMIN(a[2],b[2]), MMIN(a[3],b[3]),
+ MMIN(a[4],b[4]), MMIN(a[5],b[5]), MMIN(a[6],b[6]), MMIN(a[7],b[7]));
+}
+
+static FINLINE v8i_T
+v8i_max(const v8i_T v0, const v8i_T v1)
+{
+ ALIGN(32) int32_t a[8];
+ ALIGN(32) int32_t b[8];
+ v8i_store(a, v0);
+ v8i_store(b, v1);
+ return v8i_set
+ (MMAX(a[0],b[0]), MMAX(a[1],b[1]), MMAX(a[2],b[2]), MMAX(a[3],b[3]),
+ MMAX(a[4],b[4]), MMAX(a[5],b[5]), MMAX(a[6],b[6]), MMAX(a[7],b[7]));
+}
+
+static FINLINE int32_t
+v8i_reduce_min_i32(const v8i_T v)
+{
+ ALIGN(32) int32_t tmp[8];
+ v8i_store(tmp, v);
+ return MMIN
+ (MMIN(MMIN(tmp[0], tmp[1]), MMIN(tmp[2], tmp[3])),
+ MMIN(MMIN(tmp[4], tmp[5]), MMIN(tmp[6], tmp[7])));
+}
+
+static FINLINE int32_t
+v8i_reduce_max_i32(const v8i_T v)
+{
+ ALIGN(32) int32_t tmp[8];
+ v8i_store(tmp, v);
+ return MMAX
+ (MMAX(MMAX(tmp[0], tmp[1]), MMAX(tmp[2], tmp[3])),
+ MMAX(MMAX(tmp[4], tmp[5]), MMAX(tmp[6], tmp[7])));
+}
+
+#endif /* RSIMD_AVXI_H */
+
diff --git a/src/sse/ssei.h b/src/sse/ssei.h
@@ -266,5 +266,21 @@ v4i_reduce_max(const v4i_T v)
#endif
}
+static FINLINE int32_t
+v4i_reduce_min_i32(const v4i_T v)
+{
+ ALIGN(16) int32_t a[4];
+ v4i_store(a, v);
+ return MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]));
+}
+
+static FINLINE int32_t
+v4i_reduce_max_i32(const v4i_T v)
+{
+ ALIGN(16) int32_t a[4];
+ v4i_store(a, v);
+ return MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]));
+}
+
#endif /* RSIMD_SSEI_H */
diff --git a/src/test_v4i.c b/src/test_v4i.c
@@ -217,24 +217,28 @@ main(int argc, char** argv)
CHK(v4i_y(k) == 1);
CHK(v4i_z(k) == 1);
CHK(v4i_w(k) == 1);
+ CHK(v4i_reduce_min_i32(i) == 1);
k = v4i_reduce_min(j);
CHK(v4i_x(k) == -4);
CHK(v4i_y(k) == -4);
CHK(v4i_z(k) == -4);
CHK(v4i_w(k) == -4);
+ CHK(v4i_reduce_min_i32(j) == -4);
k = v4i_reduce_max(i);
CHK(v4i_x(k) == 4);
CHK(v4i_y(k) == 4);
CHK(v4i_z(k) == 4);
CHK(v4i_w(k) == 4);
+ CHK(v4i_reduce_max_i32(i) == 4);
k = v4i_reduce_max(j);
CHK(v4i_x(k) == 6);
CHK(v4i_y(k) == 6);
CHK(v4i_z(k) == 6);
CHK(v4i_w(k) == 6);
+ CHK(v4i_reduce_max_i32(j) == 6);
return 0;
}
diff --git a/src/test_v8i.c b/src/test_v8i.c
@@ -0,0 +1,182 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "rsimd.h"
+
+int
+main(int argc, char** argv)
+{
+ v8i_T i, j, k;
+ ALIGN(32) int32_t tmp[8] = {0,1,2,3,4,5,6,7};
+ (void)argc, (void)argv;
+
+ i = v8i_load(tmp);
+ CHK(v4i_x(v8i_abcd(i)) == 0);
+ CHK(v4i_y(v8i_abcd(i)) == 1);
+ CHK(v4i_z(v8i_abcd(i)) == 2);
+ CHK(v4i_w(v8i_abcd(i)) == 3);
+ CHK(v4i_x(v8i_efgh(i)) == 4);
+ CHK(v4i_y(v8i_efgh(i)) == 5);
+ CHK(v4i_z(v8i_efgh(i)) == 6);
+ CHK(v4i_w(v8i_efgh(i)) == 7);
+
+ tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0;
+ tmp[4] = tmp[5] = tmp[6] = tmp[7] = 0;
+ CHK(v8i_store(tmp, i) == tmp);
+ CHK(tmp[0] == 0);
+ CHK(tmp[1] == 1);
+ CHK(tmp[2] == 2);
+ CHK(tmp[3] == 3);
+ CHK(tmp[4] == 4);
+ CHK(tmp[5] == 5);
+ CHK(tmp[6] == 6);
+ CHK(tmp[7] == 7);
+
+ i = v8i_set(1, 2, 3, 4, 5, 6, 7, 8);
+ CHK(v4i_x(v8i_abcd(i)) == 1);
+ CHK(v4i_y(v8i_abcd(i)) == 2);
+ CHK(v4i_z(v8i_abcd(i)) == 3);
+ CHK(v4i_w(v8i_abcd(i)) == 4);
+ CHK(v4i_x(v8i_efgh(i)) == 5);
+ CHK(v4i_y(v8i_efgh(i)) == 6);
+ CHK(v4i_z(v8i_efgh(i)) == 7);
+ CHK(v4i_w(v8i_efgh(i)) == 8);
+
+ i = v8i_set1(-1);
+ CHK(v4i_x(v8i_abcd(i)) == -1);
+ CHK(v4i_y(v8i_abcd(i)) == -1);
+ CHK(v4i_z(v8i_abcd(i)) == -1);
+ CHK(v4i_w(v8i_abcd(i)) == -1);
+ CHK(v4i_x(v8i_efgh(i)) == -1);
+ CHK(v4i_y(v8i_efgh(i)) == -1);
+ CHK(v4i_z(v8i_efgh(i)) == -1);
+ CHK(v4i_w(v8i_efgh(i)) == -1);
+
+ i = v8i_zero();
+ CHK(v4i_x(v8i_abcd(i)) == 0);
+ CHK(v4i_y(v8i_abcd(i)) == 0);
+ CHK(v4i_z(v8i_abcd(i)) == 0);
+ CHK(v4i_w(v8i_abcd(i)) == 0);
+ CHK(v4i_x(v8i_efgh(i)) == 0);
+ CHK(v4i_y(v8i_efgh(i)) == 0);
+ CHK(v4i_z(v8i_efgh(i)) == 0);
+ CHK(v4i_w(v8i_efgh(i)) == 0);
+
+ i = v8i_set
+ (0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F,
+ 0x00102030, 0x40506070, (int32_t)0x8090A0B0, (int32_t)0xC0D0E0F0);
+ j = v8i_set
+ (0x01020401, 0x70605040, 0x0F1F2F3F, 0x00000000,
+ 0x10204010, 0x06050400, (int32_t)0xF1F2F3F0, 0x10000000);
+ k = v8i_or(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030603);
+ CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647);
+ CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x0F1F2F3F);
+ CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F);
+ CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306030);
+ CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470);
+ CHK(v4i_z(v8i_efgh(k)) == (int32_t)0xF1F2F3F0);
+ CHK(v4i_w(v8i_efgh(k)) == (int32_t)0xD0D0E0F0);
+
+ k = v8i_and(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x00000001);
+ CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x00000000);
+ CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x08090A0B);
+ CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000);
+ CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x00000010);
+ CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x00000000);
+ CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x8090A0B0);
+ CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x00000000);
+
+ k = v8i_andnot(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01020400);
+ CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x70605040);
+ CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534);
+ CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000);
+ CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10204000);
+ CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x06050400);
+ CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340);
+ CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x10000000);
+
+ k = v8i_xor(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030602);
+ CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647);
+ CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534);
+ CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F);
+ CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306020);
+ CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470);
+ CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340);
+ CHK(v4i_w(v8i_efgh(k)) == (int32_t)0xD0D0E0F0);
+
+ i = v8i_set( 1, 2,3,4,5, 6,7,8);
+ j = v8i_set(-2,-4,3,6,5,-1,8,8);
+
+ k = v8i_eq(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == 0);
+ CHK(v4i_y(v8i_abcd(k)) == 0);
+ CHK(v4i_z(v8i_abcd(k)) ==~0);
+ CHK(v4i_w(v8i_abcd(k)) == 0);
+ CHK(v4i_x(v8i_efgh(k)) ==~0);
+ CHK(v4i_y(v8i_efgh(k)) == 0);
+ CHK(v4i_z(v8i_efgh(k)) == 0);
+ CHK(v4i_w(v8i_efgh(k)) ==~0);
+
+ k = v8i_neq(i, j);
+ CHK(v4i_x(v8i_abcd(k)) ==~0);
+ CHK(v4i_y(v8i_abcd(k)) ==~0);
+ CHK(v4i_z(v8i_abcd(k)) == 0);
+ CHK(v4i_w(v8i_abcd(k)) ==~0);
+ CHK(v4i_x(v8i_efgh(k)) == 0);
+ CHK(v4i_y(v8i_efgh(k)) ==~0);
+ CHK(v4i_z(v8i_efgh(k)) ==~0);
+ CHK(v4i_w(v8i_efgh(k)) == 0);
+
+ k = v8i_sel(i, j, v8i_set(~0,~0,0,~0,0,0,~0,0));
+ CHK(v4i_x(v8i_abcd(k)) ==-2);
+ CHK(v4i_y(v8i_abcd(k)) ==-4);
+ CHK(v4i_z(v8i_abcd(k)) == 3);
+ CHK(v4i_w(v8i_abcd(k)) == 6);
+ CHK(v4i_x(v8i_efgh(k)) == 5);
+ CHK(v4i_y(v8i_efgh(k)) == 6);
+ CHK(v4i_z(v8i_efgh(k)) == 8);
+ CHK(v4i_w(v8i_efgh(k)) == 8);
+
+ k = v8i_min(i, j);
+ CHK(v4i_x(v8i_abcd(k)) ==-2);
+ CHK(v4i_y(v8i_abcd(k)) ==-4);
+ CHK(v4i_z(v8i_abcd(k)) == 3);
+ CHK(v4i_w(v8i_abcd(k)) == 4);
+ CHK(v4i_x(v8i_efgh(k)) == 5);
+ CHK(v4i_y(v8i_efgh(k)) ==-1);
+ CHK(v4i_z(v8i_efgh(k)) == 7);
+ CHK(v4i_w(v8i_efgh(k)) == 8);
+
+ k = v8i_max(i, j);
+ CHK(v4i_x(v8i_abcd(k)) == 1);
+ CHK(v4i_y(v8i_abcd(k)) == 2);
+ CHK(v4i_z(v8i_abcd(k)) == 3);
+ CHK(v4i_w(v8i_abcd(k)) == 6);
+ CHK(v4i_x(v8i_efgh(k)) == 5);
+ CHK(v4i_y(v8i_efgh(k)) == 6);
+ CHK(v4i_z(v8i_efgh(k)) == 8);
+ CHK(v4i_w(v8i_efgh(k)) == 8);
+
+ CHK(v8i_reduce_min_i32(i) == 1);
+ CHK(v8i_reduce_min_i32(j) ==-4);
+ CHK(v8i_reduce_max_i32(i) == 8);
+ CHK(v8i_reduce_max_i32(j) == 8);
+
+ return 0;
+}