rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 63a3ff76ed03c58f3531cd4a479c579473fc69bc
parent ce233eeda7857b166ff431ca03da43b24f577f59
Author: vaplv <vaplv@free.fr>
Date:   Fri, 17 Oct 2014 14:28:23 +0200

Implement and test the AoS float33 SIMD functions

Diffstat:
Mcmake/CMakeLists.txt | 2++
Asrc/aosf33.h | 336+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/test_aosf33.c | 199+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 537 insertions(+), 0 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -40,6 +40,7 @@ set(VERSION_PATCH 0) set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) set(RSIMD_FILES_INC + aosf33.h rsimd.h sse/sse.h sse/ssef.h @@ -73,6 +74,7 @@ endmacro(new_test) new_test(test_v4f) new_test(test_v4i) +new_test(test_aosf33) ################################################################################ # Install directives diff --git a/src/aosf33.h b/src/aosf33.h @@ -0,0 +1,336 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef AOSF33_H +#define AOSF33_H + +#include "rsimd.h" +#include <math.h> + +/* + * SIMD Functions on column major AoS float33 matrices. A matrix is + * respresented by 3 4-wide SIMD float vectors + */ + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE v4f_T* +aosf33_set(v4f_T m[3], const v4f_T c0, const v4f_T c1, const v4f_T c2) +{ + ASSERT(m); + m[0] = c0; + m[1] = c1; + m[2] = c2; + return m; +} + +static FINLINE v4f_T* +aosf33_identity(v4f_T m[3]) +{ + ASSERT(m); + m[0] = v4f_set(1.f, 0.f, 0.f, 0.f); + m[1] = v4f_set(0.f, 1.f, 0.f, 0.f); + m[2] = v4f_set(0.f, 0.f, 1.f, 0.f); + return m; +} + +static FINLINE v4f_T* +aosf33_zero(v4f_T m[3]) +{ + ASSERT(m); + m[0] = v4f_zero(); + m[1] = v4f_zero(); + m[2] = v4f_zero(); + return m; +} + +static FINLINE v4f_T* +aosf33_set_row0(v4f_T m[3], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_ayzw(m[0], v); + m[1] = v4f_ayzw(m[1], v4f_yyww(v)); + m[2] = v4f_ayzw(m[2], v4f_zwzw(v)); + return m; +} + +static FINLINE v4f_T* +aosf33_set_row1(v4f_T m[3], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_xbzw(m[0], v4f_xxyy(v)); + m[1] = v4f_xbzw(m[1], v); + m[2] = v4f_xbzw(m[2], v4f_zzww(v)); + return m; +} + +static FINLINE v4f_T* +aosf33_set_row2(v4f_T m[3], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_xyab(m[0], v4f_xyxy(v)); + m[1] = v4f_xyab(m[1], v4f_yyzz(v)); + m[2] = v4f_xyab(m[2], v4f_zzww(v)); + return m; +} + +static FINLINE v4f_T* +aosf33_set_row(v4f_T m[3], const v4f_T v, const int id) +{ + const v4f_T mask = v4f_mask(-(id==0), -(id==1), -(id==2), 0); + ASSERT(m && id >= 0 && id <= 2); + m[0] = v4f_sel(m[0], v4f_xxxx(v), mask); + m[1] = v4f_sel(m[1], v4f_yyyy(v), mask); + m[2] = v4f_sel(m[2], v4f_zzzz(v), mask); + return m; +} + +static FINLINE v4f_T* +aosf33_set_col(v4f_T m[3], const v4f_T v, const int id) +{ + ASSERT(m && id >= 0 && id <= 2); + m[id] = v; + return m; +} + +/******************************************************************************* + * Arithmetic operations + ******************************************************************************/ +static FINLINE v4f_T* +aosf33_add(v4f_T res[3], const v4f_T m0[3], const v4f_T m1[3]) +{ + ASSERT(res && m0 && m1); + res[0] = v4f_add(m0[0], m1[0]); + res[1] = v4f_add(m0[1], m1[1]); + res[2] = v4f_add(m0[2], m1[2]); + return res; +} + +static FINLINE v4f_T* +aosf33_sub(v4f_T res[3], const v4f_T m0[3], const v4f_T m1[3]) +{ + ASSERT(res && m0 && m1); + res[0] = v4f_sub(m0[0], m1[0]); + res[1] = v4f_sub(m0[1], m1[1]); + res[2] = v4f_sub(m0[2], m1[2]); + return res; +} + +static FINLINE v4f_T* +aosf33_minus(v4f_T res[3], const v4f_T m[3]) +{ + ASSERT(res && m); + res[0] = v4f_minus(m[0]); + res[1] = v4f_minus(m[1]); + res[2] = v4f_minus(m[2]); + return res; +} + +static FINLINE v4f_T* +aosf33_abs(v4f_T res[3], const v4f_T m[3]) +{ + ASSERT(res && m); + res[0] = v4f_abs(m[0]); + res[1] = v4f_abs(m[1]); + res[2] = v4f_abs(m[2]); + return res; +} + +static FINLINE v4f_T* +aosf33_mul(v4f_T res[3], const v4f_T m[3], const v4f_T v) +{ + ASSERT(res && m); + res[0] = v4f_mul(m[0], v); + res[1] = v4f_mul(m[1], v); + res[2] = v4f_mul(m[2], v); + return res; +} + +static FINLINE v4f_T +aosf33_mulf3(const v4f_T m[3], const v4f_T v) +{ + v4f_T r0, r1; + ASSERT(m); + r0 = v4f_mul(m[0], v4f_xxxx(v)); + r1 = v4f_madd(m[1], v4f_yyyy(v), r0); + return v4f_madd(m[2], v4f_zzzz(v), r1); +} + +static FINLINE v4f_T +aosf3_mulf33(const v4f_T v, const v4f_T m[3]) +{ + v4f_T xxxx, yyyy, zzzz, yyzz; + ASSERT(m); + xxxx = v4f_dot3(v, m[0]); + yyyy = v4f_dot3(v, m[1]); + zzzz = v4f_dot3(v, m[2]); + yyzz = v4f_xyab(yyyy, zzzz); + return v4f_ayzw(yyzz, xxxx); +} + +static FINLINE v4f_T* +aosf33_mulf33(v4f_T res[3], const v4f_T a[3], const v4f_T b[3]) +{ + v4f_T c0, c1, c2; + ASSERT(res && a && b); + c0 = aosf33_mulf3(a, b[0]); + c1 = aosf33_mulf3(a, b[1]); + c2 = aosf33_mulf3(a, b[2]); + res[0] = c0; + res[1] = c1; + res[2] = c2; + return res; +} + +static FINLINE v4f_T* +aosf33_transpose(v4f_T res[3], const v4f_T m[3]) +{ + v4f_T c0, c1, c2; + v4f_T x0x2y0y2, z0z2w0w2, z1z1y1y1; + ASSERT(res && m); + c0 = m[0]; + c1 = m[1]; + c2 = m[2]; + x0x2y0y2 = v4f_xayb(c0, c2); + z0z2w0w2 = v4f_zcwd(c0, c2); + z1z1y1y1 = v4f_zzyy(c1); + res[0] = v4f_xayb(x0x2y0y2, c1); + res[1] = v4f_zcwd(x0x2y0y2, z1z1y1y1); + res[2] = v4f_xayb(z0z2w0w2, z1z1y1y1); + return res; +} + +static FINLINE v4f_T +aosf33_det(const v4f_T m[3]) +{ + ASSERT(m); + return v4f_dot3(m[2], v4f_cross3(m[0], m[1])); +} + +static FINLINE v4f_T /* Return the determinant */ +aosf33_invtrans(v4f_T res[3], const v4f_T m[3]) +{ + v4f_T t[3], det, invdet; + ASSERT(res && m); + t[0] = v4f_cross3(m[1], m[2]); + t[1] = v4f_cross3(m[2], m[0]); + t[2] = v4f_cross3(m[0], m[1]); + det = v4f_dot3(t[2], m[2]); + invdet = v4f_rcp(det); + aosf33_mul(res, t, invdet); + return det; +} + +static FINLINE v4f_T +aosf33_inverse(v4f_T res[3], const v4f_T m[3]) +{ + v4f_T det; + ASSERT(res && m); + det = aosf33_invtrans(res, m); + aosf33_transpose(res, res); + return det; +} + +/******************************************************************************* + * Get operations + ******************************************************************************/ +static FINLINE v4f_T +aosf33_row0(const v4f_T m[3]) +{ + ASSERT(m); + return v4f_ayzw(v4f_xyab(v4f_xxzz(m[1]), v4f_xxzz(m[2])), m[0]); +} + +static FINLINE v4f_T +aosf33_row1(const v4f_T m[3]) +{ + ASSERT(m); + return v4f_ayzw(v4f_xyab(v4f_yyww(m[1]), v4f_yyww(m[2])), v4f_yyww(m[0])); +} + +static FINLINE v4f_T +aosf33_row2(const v4f_T m[3]) +{ + ASSERT(m); + return v4f_ayzw(v4f_xyab(v4f_zzww(m[1]), v4f_zzww(m[2])), v4f_zzww(m[0])); +} + +static FINLINE v4f_T +aosf33_row(const v4f_T m[3], int id) +{ + v4f_T t[3]; + ASSERT(m && id >= 0 && id <= 2); + aosf33_transpose(t, m); + return t[id]; +} + +static FINLINE v4f_T +aosf33_col(const v4f_T m[3], int id) +{ + ASSERT(m && id >= 0 && id <= 2); + return m[id]; +} + +static FINLINE float* +aosf33_store(float res[9] /* Column major */, const v4f_T m[3]) +{ + ALIGN(16) float tmp[4]; + int i; + ASSERT(res && m); + FOR_EACH(i, 0, 3) { + v4f_store(tmp, m[i]); + res[i*3 + 0] = tmp[0]; + res[i*3 + 1] = tmp[1]; + res[i*3 + 2] = tmp[2]; + } + return res; +} + +/******************************************************************************* + * Build functions + ******************************************************************************/ +static FINLINE v4f_T* /* XYZ norm */ +aosf33_rotation(v4f_T res[3], float pitch, float yaw, float roll) +{ + float c1, c2, c3, s1, s2, s3; + ASSERT(res); + c1 = (float)cos(pitch); + c2 = (float)cos(yaw); + c3 = (float)cos(roll); + s1 = (float)sin(pitch); + s2 = (float)sin(yaw); + s3 = (float)sin(roll); + res[0] = v4f_set(c2*c3, c1*s3 + c3*s1*s2, s1*s3 - c1*c3*s2, 0.f); + res[1] = v4f_set(-c2*s3, c1*c3 - s1*s2*s3, c1*s2*s3 + c3*s1, 0.f); + res[2] = v4f_set(s2, -c2*s1, c1*c2, 0.f); + return res; +} + +static FINLINE v4f_T* /* rotation around the Y axis */ +aosf33_yaw_rotation(v4f_T res[3], float yaw) +{ + float c, s; + ASSERT(res); + c = (float)cos(yaw); + s = (float)sin(yaw); + res[0] = v4f_set(c, 0.f, -s, 0.f); + res[1] = v4f_set(0.f, 1.f, 0.f, 0.f); + res[2] = v4f_set(s, 0.f, c, 0.f); + return res; +} + +#endif /* AOSF33_H */ + diff --git a/src/test_aosf33.c b/src/test_aosf33.c @@ -0,0 +1,199 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "aosf33.h" +#include <rsys/float33.h> + +#define AOSF33_EQ_EPS(M, A, B, C, D, E, F, G, H, I, Eps) \ + { \ + float a[9], b[9]; \ + b[0] = (A); b[1] = (B); b[2] = (C); \ + b[3] = (D); b[4] = (E); b[5] = (F); \ + b[6] = (G); b[7] = (H); b[8] = (I); \ + CHECK(f33_eq_eps(aosf33_store(a, (M)), b, Eps), 1); \ + } (void)0 +#define AOSF33_EQ(M, A, B, C, D, E, F, G, H, I) \ + AOSF33_EQ_EPS(M, A, B, C, D, E, F, G, H, I, 0.f) + +int +main(int argc, char** argv) +{ + float tmp[9]; + v4f_T m[3], n[3], o[3], v; + (void)argc, (void)argv; + + CHECK(aosf33_set(m, + v4f_set(0.f, 1.f, 2.f, 0.f), + v4f_set(3.f, 4.f, 5.f, 0.f), + v4f_set(6.f, 7.f, 8.f, 0.f)), m); + CHECK(aosf33_store(tmp, m), tmp); + CHECK(tmp[0], 0.f); + CHECK(tmp[1], 1.f); + CHECK(tmp[2], 2.f); + CHECK(tmp[3], 3.f); + CHECK(tmp[4], 4.f); + CHECK(tmp[5], 5.f); + CHECK(tmp[6], 6.f); + CHECK(tmp[7], 7.f); + CHECK(tmp[8], 8.f); + AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); + CHECK(aosf33_identity(m), m); + AOSF33_EQ(m, 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f); + CHECK(aosf33_zero(m), m); + AOSF33_EQ(m, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); + + CHECK(aosf33_set_row0(m, v4f_set(0.f, 1.f, 2.f, 9.f)), m); + AOSF33_EQ(m, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 2.f, 0.f, 0.f); + CHECK(aosf33_set_row1(m, v4f_set(3.f, 4.f, 5.f, 10.f)), m); + AOSF33_EQ(m, 0.f, 3.f, 0.f, 1.f, 4.f, 0.f, 2.f, 5.f, 0.f); + CHECK(aosf33_set_row2(m, v4f_set(6.f, 7.f, 8.f, 11.f)), m); + AOSF33_EQ(m, 0.f, 3.f, 6.f, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f); + + CHECK(aosf33_zero(m), m); + CHECK(aosf33_set_row(m, v4f_set(0.f, 1.f, 2.f, 9.f), 0), m); + AOSF33_EQ(m, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 2.f, 0.f, 0.f); + CHECK(aosf33_set_row(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1), m); + AOSF33_EQ(m, 0.f, 3.f, 0.f, 1.f, 4.f, 0.f, 2.f, 5.f, 0.f); + CHECK(aosf33_set_row(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2), m); + AOSF33_EQ(m, 0.f, 3.f, 6.f, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f); + + CHECK(aosf33_zero(m), m); + CHECK(aosf33_set_col(m, v4f_set(0.f, 1.f, 2.f, 9.f), 0), m); + AOSF33_EQ(m, 0.f, 1.f, 2.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); + CHECK(aosf33_set_col(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1), m); + AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 0.f, 0.f, 0.f); + CHECK(aosf33_set_col(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2), m); + AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); + + v = aosf33_row0(m); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 3.f); + CHECK(v4f_z(v), 6.f); + + v = aosf33_row1(m); + CHECK(v4f_x(v), 1.f); + CHECK(v4f_y(v), 4.f); + CHECK(v4f_z(v), 7.f); + + v = aosf33_row2(m); + CHECK(v4f_x(v), 2.f); + CHECK(v4f_y(v), 5.f); + CHECK(v4f_z(v), 8.f); + + v = aosf33_row(m, 0); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 3.f); + CHECK(v4f_z(v), 6.f); + + v = aosf33_row(m, 1); + CHECK(v4f_x(v), 1.f); + CHECK(v4f_y(v), 4.f); + CHECK(v4f_z(v), 7.f); + + v = aosf33_row(m, 2); + CHECK(v4f_x(v), 2.f); + CHECK(v4f_y(v), 5.f); + CHECK(v4f_z(v), 8.f); + + v = aosf33_col(m, 0); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 1.f); + CHECK(v4f_z(v), 2.f); + + v = aosf33_col(m, 1); + CHECK(v4f_x(v), 3.f); + CHECK(v4f_y(v), 4.f); + CHECK(v4f_z(v), 5.f); + + v = aosf33_col(m, 2); + CHECK(v4f_x(v), 6.f); + CHECK(v4f_y(v), 7.f); + CHECK(v4f_z(v), 8.f); + + aosf33_set(m, + v4f_set(0.f, 1.f, 2.f, 0.f), + v4f_set(3.f, 4.f, 5.f, 0.f), + v4f_set(6.f, 7.f, 8.f, 0.f)); + aosf33_set(n, + v4f_set(1.f, 2.f, 3.f, 0.f), + v4f_set(4.f, 5.f, 6.f, 0.f), + v4f_set(7.f, 8.f, 9.f, 0.f)); + CHECK(aosf33_add(o, m, n), o); + AOSF33_EQ(o, 1.f, 3.f, 5.f, 7.f, 9.f, 11.f, 13.f, 15.f, 17.f); + CHECK(aosf33_sub(o, o, n), o); + AOSF33_EQ(o, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); + + aosf33_set(m, + v4f_set(1.f, 2.f, -3.f, 0.f), + v4f_set(-4.f, -5.f, 6.f, 0.f), + v4f_set(7.f, -8.f, 9.f, 0.f)); + CHECK(aosf33_minus(m, m), m); + AOSF33_EQ(m, -1.f, -2.f, 3.f, 4.f, 5.f, -6.f, -7.f, 8.f, -9.f); + + CHECK(aosf33_mul(o, m, v4f_set1(2.f)), o); + AOSF33_EQ(o, -2.f, -4.f, 6.f, 8.f, 10.f, -12.f, -14.f, 16.f, -18.f); + + aosf33_set(m, + v4f_set(1.f, 2.f, 3.f, 0.f), + v4f_set(4.f, 5.f, 6.f, 0.f), + v4f_set(7.f, 8.f, 9.f, 0.f)); + v = aosf33_mulf3(m, v4f_set(1.f, 2.f, 3.f, 0.f)); + CHECK(v4f_x(v), 30.f); + CHECK(v4f_y(v), 36.f); + CHECK(v4f_z(v), 42.f); + v = aosf3_mulf33(v4f_set(1.f, 2.f, 3.f, 0.f), m); + CHECK(v4f_x(v), 14.f); + CHECK(v4f_y(v), 32.f); + CHECK(v4f_z(v), 50.f); + aosf33_set(n, + v4f_set(2.f, 9.f, 8.f, 0.f), + v4f_set(1.f, -2.f, 2.f, 0.f), + v4f_set(1.f, -8.f, -4.f, 0.f)); + CHECK(aosf33_mulf33(o, m, n), o); + AOSF33_EQ(o, 94.f, 113.f, 132.f, 7.f, 8.f, 9.f, -59.f, -70.f, -81.f); + + CHECK(aosf33_transpose(o, m), o); + AOSF33_EQ(o, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f, 3.f, 6.f, 9.f); + + aosf33_set(m, + v4f_set(1.f, 2.f, 3.f, 0.f), + v4f_set(4.f, 5.f, 6.f, 0.f), + v4f_set(3.f, -4.f, 9.f, 0.f)); + v = aosf33_det(m); + CHECK(v4f_x(v), -60.f); + CHECK(v4f_y(v), -60.f); + CHECK(v4f_z(v), -60.f); + CHECK(v4f_w(v), -60.f); + + v = aosf33_inverse(n, m); + CHECK(v4f_x(v), -60.f); + CHECK(v4f_y(v), -60.f); + CHECK(v4f_z(v), -60.f); + CHECK(v4f_w(v), -60.f); + aosf33_mulf33(o, m, n); + AOSF33_EQ_EPS(o, 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f, 1.e-6f); + + v = aosf33_invtrans(o, m); + CHECK(v4f_x(v), -60.f); + CHECK(v4f_y(v), -60.f); + CHECK(v4f_z(v), -60.f); + CHECK(v4f_w(v), -60.f); + AOSF33_EQ(o, + v4f_x(n[0]), v4f_x(n[1]), v4f_x(n[2]), + v4f_y(n[0]), v4f_y(n[1]), v4f_y(n[2]), + v4f_z(n[0]), v4f_z(n[1]), v4f_z(n[2])); + return 0; +} +