commit ee22862cb041351d56c14327a801b09519e71674
parent 0add4c02c451f18ec21f41fa90ce7a360377eb28
Author: vaplv <vaplv@free.fr>
Date: Fri, 17 Oct 2014 15:44:33 +0200
Add and test the AoS float44 SIMD functions
Diffstat:
| M | cmake/CMakeLists.txt | | | 6 | +++++- |
| A | src/aosf44.c | | | 112 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/aosf44.h | | | 400 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/test_aosf44.c | | | 420 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
4 files changed, 937 insertions(+), 1 deletion(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -41,12 +41,15 @@ set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})
set(RSIMD_FILES_INC
aosf33.h
+ aosf44.h
rsimd.h
sse/sse.h
sse/ssef.h
sse/ssei.h
sse/sse_swz.h)
-set(RSIMD_FILES_SRC sse/ssef.c)
+set(RSIMD_FILES_SRC
+ aosf44.c
+ sse/ssef.c)
rcmake_prepend_path(RSIMD_FILES_INC ${RSIMD_SOURCE_DIR})
rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR})
@@ -75,6 +78,7 @@ endmacro(new_test)
new_test(test_v4f)
new_test(test_v4i)
new_test(test_aosf33)
+new_test(test_aosf44)
################################################################################
# Install directives
diff --git a/src/aosf44.c b/src/aosf44.c
@@ -0,0 +1,112 @@
+/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "aosf44.h"
+
+/* Invert the column major 4x4 matrix "m" and store the result in "res". The
+ * sixteen 3x3 minors of "m" are computed with aosf33_det. The determinant is
+ * obtained from the cofactors of the row 3, and each column of "res" is a
+ * vector of minors scaled by the reciprocal of the determinant with
+ * alternating signs. The columns and the row 3 of "m" are copied into locals
+ * before "res" is written. The determinant is not tested against zero before
+ * v4f_rcp is invoked. Return the determinant of "m". */
+v4f_T
+aosf44_inverse(v4f_T res[4], const v4f_T m[4])
+{
+ v4f_T c0, c1, c2, c3, r3;
+ /* Naming convention: f33_RRR_cN is the column N of "m" restricted to the
+  * rows RRR while f33_RRR_CCC is the 3x3 sub-matrix built from the rows RRR
+  * and the columns CCC of "m" */
+ v4f_T f33_023_c0, f33_023_c1, f33_023_c2, f33_023_c3;
+ v4f_T f33_123_c0, f33_123_c1, f33_123_c2, f33_123_c3;
+ v4f_T f33_013_c0, f33_013_c1, f33_013_c2, f33_013_c3;
+ v4f_T f33_012_012[3], f33_012_013[3], f33_012_023[3], f33_012_123[3];
+ v4f_T f33_023_012[3], f33_023_013[3], f33_023_023[3], f33_023_123[3];
+ v4f_T f33_123_012[3], f33_123_013[3], f33_123_023[3], f33_123_123[3];
+ v4f_T f33_013_012[3], f33_013_013[3], f33_013_023[3], f33_013_123[3];
+ v4f_T det_012, det_023, det_123, det_013;
+ v4f_T cofacts, det, idet, mpmp_idet, pmpm_idet;
+ ASSERT(res && m);
+
+ /* Retrieve the columns 0, 1, 2 and 3 and the row 3 of the "m" matrix. */
+ c0 = m[0];
+ c1 = m[1];
+ c2 = m[2];
+ c3 = m[3];
+ r3 = aosf44_row3(m);
+
+ /* Define the 3x3 sub-matrices built from the rows 0, 1 and 2 and compute
+  * their determinants. The columns are used as is: presumably the aosf33
+  * functions only consider their x, y and z components (TODO confirm against
+  * aosf33.h). The minors are packed in the order "column 0 removed", "column
+  * 1 removed", "column 2 removed" and "column 3 removed" */
+ aosf33_set(f33_012_012, c0, c1, c2);
+ aosf33_set(f33_012_013, c0, c1, c3);
+ aosf33_set(f33_012_023, c0, c2, c3);
+ aosf33_set(f33_012_123, c1, c2, c3);
+ det_012 = v4f_048C
+ (aosf33_det(f33_012_123),
+ aosf33_det(f33_012_023),
+ aosf33_det(f33_012_013),
+ aosf33_det(f33_012_012));
+
+ /* Same minors for the rows 0, 2 and 3: the swizzle moves the x, z and w
+  * components of each column into the first 3 components */
+ f33_023_c0 = v4f_xzww(c0);
+ f33_023_c1 = v4f_xzww(c1);
+ f33_023_c2 = v4f_xzww(c2);
+ f33_023_c3 = v4f_xzww(c3);
+ aosf33_set(f33_023_012, f33_023_c0, f33_023_c1, f33_023_c2);
+ aosf33_set(f33_023_013, f33_023_c0, f33_023_c1, f33_023_c3);
+ aosf33_set(f33_023_023, f33_023_c0, f33_023_c2, f33_023_c3);
+ aosf33_set(f33_023_123, f33_023_c1, f33_023_c2, f33_023_c3);
+ det_023 = v4f_048C
+ (aosf33_det(f33_023_123),
+ aosf33_det(f33_023_023),
+ aosf33_det(f33_023_013),
+ aosf33_det(f33_023_012));
+
+ /* Same minors for the rows 1, 2 and 3 */
+ f33_123_c0 = v4f_yzww(c0);
+ f33_123_c1 = v4f_yzww(c1);
+ f33_123_c2 = v4f_yzww(c2);
+ f33_123_c3 = v4f_yzww(c3);
+ aosf33_set(f33_123_012, f33_123_c0, f33_123_c1, f33_123_c2);
+ aosf33_set(f33_123_013, f33_123_c0, f33_123_c1, f33_123_c3);
+ aosf33_set(f33_123_023, f33_123_c0, f33_123_c2, f33_123_c3);
+ aosf33_set(f33_123_123, f33_123_c1, f33_123_c2, f33_123_c3);
+ det_123 = v4f_048C
+ (aosf33_det(f33_123_123),
+ aosf33_det(f33_123_023),
+ aosf33_det(f33_123_013),
+ aosf33_det(f33_123_012));
+
+ /* Same minors for the rows 0, 1 and 3 */
+ f33_013_c0 = v4f_xyww(c0);
+ f33_013_c1 = v4f_xyww(c1);
+ f33_013_c2 = v4f_xyww(c2);
+ f33_013_c3 = v4f_xyww(c3);
+ aosf33_set(f33_013_012, f33_013_c0, f33_013_c1, f33_013_c2);
+ aosf33_set(f33_013_013, f33_013_c0, f33_013_c1, f33_013_c3);
+ aosf33_set(f33_013_023, f33_013_c0, f33_013_c2, f33_013_c3);
+ aosf33_set(f33_013_123, f33_013_c1, f33_013_c2, f33_013_c3);
+ det_013 = v4f_048C
+ (aosf33_det(f33_013_123),
+ aosf33_det(f33_013_023),
+ aosf33_det(f33_013_013),
+ aosf33_det(f33_013_012));
+
+ /* Compute the cofactors of the row 3, i.e. the minors of the rows 0, 1 and
+  * 2 with the signs -, +, -, + */
+ cofacts = v4f_mul(det_012, v4f_set(-1.f, 1.f, -1.f, 1.f));
+
+ /* Compute the determinant of the "m" matrix: cofactor expansion along the
+  * row 3 */
+ det = v4f_dot(cofacts, r3);
+
+ /* Invert the matrix: the column "i" of the inverse is built from the minors
+  * of the sub-matrices that exclude the row "i", multiplied by 1/det. The
+  * alternating signs are applied by flipping the sign bit (0x80000000) of
+  * 1/det: "pmpm" stands for +,-,+,- and "mpmp" for -,+,-,+ */
+ idet = v4f_rcp(det);
+ mpmp_idet = v4f_xor
+ (idet, v4f_mask((int32_t)0x80000000, 0, (int32_t)0x80000000, 0));
+ pmpm_idet = v4f_xor
+ (idet, v4f_mask(0, (int32_t)0x80000000, 0, (int32_t)0x80000000));
+ res[0] = v4f_mul(det_123, pmpm_idet);
+ res[1] = v4f_mul(det_023, mpmp_idet);
+ res[2] = v4f_mul(det_013, pmpm_idet);
+ res[3] = v4f_mul(det_012, mpmp_idet);
+
+ return det;
+}
+
diff --git a/src/aosf44.h b/src/aosf44.h
@@ -0,0 +1,400 @@
+/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef AOSF44_H
+#define AOSF44_H
+
+#include "aosf33.h"
+#include "rsimd.h"
+
+/*
+ * Functions on column major AoS float44 matrices. A 4x4 matrix is a set of 4
+ * 4-wide SIMD float vectors, each representing a matrix column.
+ */
+
+/*******************************************************************************
+ * Set operations
+ ******************************************************************************/
+/* Write the 4 columns of "m" into the 16 floats of "dst", one column after
+ * the other. "dst" is not required to be aligned on 16 bytes: in that case
+ * each column goes through an aligned temporary. Return "dst". */
+static FINLINE float*
+aosf44_store(float dst[16], const v4f_T m[4])
+{
+  ALIGN(16) float column[4];
+  int icol;
+  ASSERT(m && dst);
+
+  if(IS_ALIGNED(dst, 16)) {
+    /* Aligned destination: the columns are stored directly */
+    for(icol = 0; icol < 4; ++icol)
+      v4f_store(dst + icol*4, m[icol]);
+    return dst;
+  }
+
+  /* Unaligned destination: store into the aligned scratch, then copy */
+  for(icol = 0; icol < 4; ++icol) {
+    float* out = dst + icol*4;
+    v4f_store(column, m[icol]);
+    out[0] = column[0];
+    out[1] = column[1];
+    out[2] = column[2];
+    out[3] = column[3];
+  }
+  return dst;
+}
+
+/* Load the 16 floats of "src" into the 4 columns of "m": the column "i" is
+ * read from src[i*4 + 0] to src[i*4 + 3]. "src" is not required to be aligned
+ * on 16 bytes: in that case the columns are built component per component.
+ * Return "m". */
+static FINLINE v4f_T*
+aosf44_load(v4f_T m[4], const float src[16])
+{
+  int i;
+  ASSERT(m && src);
+  if(IS_ALIGNED(src, 16)) {
+    m[0] = v4f_load(src + 0);
+    m[1] = v4f_load(src + 4);
+    m[2] = v4f_load(src + 8);
+    m[3] = v4f_load(src + 12);
+  } else {
+    FOR_EACH(i, 0, 4) {
+      /* A column is 4 floats wide: the same stride of 4 must be used for its
+       * 4 components */
+      const float* col = src + i*4;
+      m[i] = v4f_set(col[0], col[1], col[2], col[3]);
+    }
+  }
+  return m;
+}
+
+/* Define the matrix "m" from its 4 columns c0, c1, c2 and c3. Return "m". */
+static FINLINE v4f_T*
+aosf44_set
+  (v4f_T m[4], const v4f_T c0, const v4f_T c1, const v4f_T c2, const v4f_T c3)
+{
+  ASSERT(m);
+  m[0] = c0;
+  m[1] = c1;
+  m[2] = c2;
+  m[3] = c3;
+  return m;
+}
+
+/* Setup "m" as the identity matrix: the column "i" has its i^th component set
+ * to 1 and the others to 0. Return "m". */
+static FINLINE v4f_T*
+aosf44_identity(v4f_T m[4])
+{
+  v4f_T col0, col1, col2, col3;
+  ASSERT(m);
+  col0 = v4f_set(1.f, 0.f, 0.f, 0.f);
+  col1 = v4f_set(0.f, 1.f, 0.f, 0.f);
+  col2 = v4f_set(0.f, 0.f, 1.f, 0.f);
+  col3 = v4f_set(0.f, 0.f, 0.f, 1.f);
+  m[0] = col0;
+  m[1] = col1;
+  m[2] = col2;
+  m[3] = col3;
+  return m;
+}
+
+/* Assign the result of v4f_zero to the 4 columns of "m". Return "m". */
+static FINLINE v4f_T*
+aosf44_zero(v4f_T m[4])
+{
+  int icol;
+  ASSERT(m);
+  for(icol = 0; icol < 4; ++icol)
+    m[icol] = v4f_zero();
+  return m;
+}
+
+/* Overwrite the row 0 of "m", i.e. the x component of its 4 columns, with the
+ * x, y, z and w components of "v". Return "m". */
+static FINLINE v4f_T*
+aosf44_set_row0(v4f_T m[4], const v4f_T v)
+{
+  ASSERT(m);
+  /* Each swizzle moves the component to write into the x slot of the second
+   * operand of v4f_ayzw */
+  m[0] = v4f_ayzw(m[0], v);
+  m[1] = v4f_ayzw(m[1], v4f_yyww(v));
+  m[2] = v4f_ayzw(m[2], v4f_zwzw(v));
+  m[3] = v4f_ayzw(m[3], v4f_yyww(v4f_zwzw(v))); /* yyww(zwzw) <=> wwww */
+  return m;
+}
+
+/* Overwrite the row 1 of "m", i.e. the y component of its 4 columns, with the
+ * x, y, z and w components of "v". Return "m". */
+static FINLINE v4f_T*
+aosf44_set_row1(v4f_T m[4], const v4f_T v)
+{
+  v4f_T for_c0, for_c2, for_c3;
+  ASSERT(m);
+  /* Move the component to write into the y slot */
+  for_c0 = v4f_xxyy(v);
+  for_c2 = v4f_zzww(v);
+  for_c3 = v4f_zwzw(v);
+  m[0] = v4f_xbzw(m[0], for_c0);
+  m[1] = v4f_xbzw(m[1], v);
+  m[2] = v4f_xbzw(m[2], for_c2);
+  m[3] = v4f_xbzw(m[3], for_c3);
+  return m;
+}
+
+/* Overwrite the row 2 of "m", i.e. the z component of its 4 columns, with the
+ * x, y, z and w components of "v". Return "m". */
+static FINLINE v4f_T*
+aosf44_set_row2(v4f_T m[4], const v4f_T v)
+{
+  v4f_T for_c0, for_c1, for_c3;
+  ASSERT(m);
+  /* Move the component to write into the z slot */
+  for_c0 = v4f_xyxy(v);
+  for_c1 = v4f_xxyy(v);
+  for_c3 = v4f_zzww(v);
+  m[0] = v4f_xycw(m[0], for_c0);
+  m[1] = v4f_xycw(m[1], for_c1);
+  m[2] = v4f_xycw(m[2], v);
+  m[3] = v4f_xycw(m[3], for_c3);
+  return m;
+}
+
+/* Overwrite the row 3 of "m", i.e. the w component of its 4 columns, with the
+ * x, y, z and w components of "v". Return "m". */
+static FINLINE v4f_T*
+aosf44_set_row3(v4f_T m[4], const v4f_T v)
+{
+  v4f_T for_c0, for_c1, for_c2;
+  ASSERT(m);
+  /* Move the component to write into the w slot */
+  for_c0 = v4f_xxxx(v);
+  for_c1 = v4f_xxyy(v);
+  for_c2 = v4f_xxzz(v);
+  m[0] = v4f_xyzd(m[0], for_c0);
+  m[1] = v4f_xyzd(m[1], for_c1);
+  m[2] = v4f_xyzd(m[2], for_c2);
+  m[3] = v4f_xyzd(m[3], v);
+  return m;
+}
+
+/* Overwrite the row "id" of "m" with the components of "v". "id" must lie in
+ * [0, 3]. Return "m". */
+static FINLINE v4f_T*
+aosf44_set_row(v4f_T m[4], const v4f_T v, const int id)
+{
+  /* Only the component "id" of the mask is set */
+  const v4f_T row_mask = v4f_mask(-(id==0), -(id==1), -(id==2), -(id==3));
+  v4f_T vx, vy, vz, vw;
+  ASSERT(m && id >= 0 && id <= 3);
+  vx = v4f_xxxx(v);
+  vy = v4f_yyyy(v);
+  vz = v4f_zzzz(v);
+  vw = v4f_wwww(v);
+  /* For each column, replace the masked component by the value of "v"
+   * associated to this column */
+  m[0] = v4f_sel(m[0], vx, row_mask);
+  m[1] = v4f_sel(m[1], vy, row_mask);
+  m[2] = v4f_sel(m[2], vz, row_mask);
+  m[3] = v4f_sel(m[3], vw, row_mask);
+  return m;
+}
+
+/* Replace the column "id" of "m" by "v". "id" must lie in [0, 3]. Return
+ * "m". */
+static FINLINE v4f_T*
+aosf44_set_col(v4f_T m[4], const v4f_T v, const int id)
+{
+  v4f_T* column;
+  ASSERT(m && id >= 0 && id <= 3);
+  column = m + id;
+  *column = v;
+  return m;
+}
+
+/*******************************************************************************
+ * Get operations
+ ******************************************************************************/
+/* Return the row 0 of "m", i.e. the x component of its 4 columns. */
+static FINLINE v4f_T
+aosf44_row0(const v4f_T m[4])
+{
+  v4f_T x0, x1, x2, x3;
+  ASSERT(m);
+  x0 = v4f_xxxx(m[0]);
+  x1 = v4f_xxxx(m[1]);
+  x2 = v4f_xxxx(m[2]);
+  x3 = v4f_xxxx(m[3]);
+  return v4f_048C(x0, x1, x2, x3);
+}
+
+/* Return the row 1 of "m", i.e. the y component of its 4 columns. */
+static FINLINE v4f_T
+aosf44_row1(const v4f_T m[4])
+{
+  v4f_T y0, y1, y2, y3;
+  ASSERT(m);
+  y0 = v4f_yyyy(m[0]);
+  y1 = v4f_yyyy(m[1]);
+  y2 = v4f_yyyy(m[2]);
+  y3 = v4f_yyyy(m[3]);
+  return v4f_048C(y0, y1, y2, y3);
+}
+
+/* Return the row 2 of "m", i.e. the z component of its 4 columns. */
+static FINLINE v4f_T
+aosf44_row2(const v4f_T m[4])
+{
+  v4f_T z0, z1, z2, z3;
+  ASSERT(m);
+  z0 = v4f_zzzz(m[0]);
+  z1 = v4f_zzzz(m[1]);
+  z2 = v4f_zzzz(m[2]);
+  z3 = v4f_zzzz(m[3]);
+  return v4f_048C(z0, z1, z2, z3);
+}
+
+/* Return the row 3 of "m", i.e. the w component of its 4 columns. */
+static FINLINE v4f_T
+aosf44_row3(const v4f_T m[4])
+{
+  v4f_T w0, w1, w2, w3;
+  ASSERT(m);
+  w0 = v4f_wwww(m[0]);
+  w1 = v4f_wwww(m[1]);
+  w2 = v4f_wwww(m[2]);
+  w3 = v4f_wwww(m[3]);
+  return v4f_048C(w0, w1, w2, w3);
+}
+
+/* Return the row "id" of "m". "id" must lie in [0, 3]. */
+static FINLINE v4f_T
+aosf44_row(const v4f_T m[4], const int id)
+{
+  ASSERT(m && id >= 0 && id <= 3);
+  switch(id) {
+    case 0: return aosf44_row0(m);
+    case 1: return aosf44_row1(m);
+    case 2: return aosf44_row2(m);
+    default: return aosf44_row3(m);
+  }
+}
+
+/* Return the column "id" of "m". "id" must lie in [0, 3]. */
+static FINLINE v4f_T
+aosf44_col(const v4f_T m[4], const int id)
+{
+  const v4f_T* column;
+  ASSERT(m && id >= 0 && id <= 3);
+  column = m + id;
+  return *column;
+}
+
+/*******************************************************************************
+ * Arithmetic operations
+ ******************************************************************************/
+/* Compute res = m0 + m1, column per column. Return "res". */
+static FINLINE v4f_T*
+aosf44_add(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
+{
+  int icol;
+  ASSERT(res && m0 && m1);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = v4f_add(m0[icol], m1[icol]);
+  return res;
+}
+
+/* Compute res = m0 - m1, column per column. Return "res". */
+static FINLINE v4f_T*
+aosf44_sub(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
+{
+  int icol;
+  ASSERT(res && m0 && m1);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = v4f_sub(m0[icol], m1[icol]);
+  return res;
+}
+
+/* Compute res = -m by applying v4f_minus to each column. Return "res". */
+static FINLINE v4f_T*
+aosf44_minus(v4f_T res[4], const v4f_T m[4])
+{
+  int icol;
+  ASSERT(res && m);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = v4f_minus(m[icol]);
+  return res;
+}
+
+/* Store in "res" the result of v4f_abs applied to each column of "m". Return
+ * "res". */
+static FINLINE v4f_T*
+aosf44_abs(v4f_T res[4], const v4f_T m[4])
+{
+  int icol;
+  ASSERT(res && m);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = v4f_abs(m[icol]);
+  return res;
+}
+
+/* Multiply each column of "m" by "v" with v4f_mul and store the 4 products in
+ * "res". This is not a matrix/vector product; see aosf44_mulf4 for that.
+ * Return "res". */
+static FINLINE v4f_T*
+aosf44_mul(v4f_T res[4], const v4f_T m[4], const v4f_T v)
+{
+  int icol;
+  ASSERT(res && m);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = v4f_mul(m[icol], v);
+  return res;
+}
+
+/* Return the product of the matrix "m" by the column vector "v", i.e. the
+ * columns of "m" weighted by the x, y, z and w components of "v" and
+ * accumulated with v4f_madd. */
+static FINLINE v4f_T
+aosf44_mulf4(const v4f_T m[4], const v4f_T v)
+{
+  v4f_T acc;
+  ASSERT(m);
+  acc = v4f_mul(m[0], v4f_xxxx(v));
+  acc = v4f_madd(m[1], v4f_yyyy(v), acc);
+  acc = v4f_madd(m[2], v4f_zzzz(v), acc);
+  acc = v4f_madd(m[3], v4f_wwww(v), acc);
+  return acc;
+}
+
+/* Return the product of the row vector "v" by the matrix "m": the i^th
+ * component of the result is the dot product of "v" with the column "i" of
+ * "m". */
+static FINLINE v4f_T
+aosf4_mulf44(v4f_T v, const v4f_T m[4])
+{
+  v4f_T dot0, dot1, dot2, dot3;
+  v4f_T dot01, dot23;
+  ASSERT(m);
+  dot0 = v4f_dot(v, m[0]);
+  dot1 = v4f_dot(v, m[1]);
+  dot2 = v4f_dot(v, m[2]);
+  dot3 = v4f_dot(v, m[3]);
+  /* Interleave the dot products pair by pair and pack the 2 pairs */
+  dot01 = v4f_xayb(dot0, dot1);
+  dot23 = v4f_xayb(dot2, dot3);
+  return v4f_xyab(dot01, dot23);
+}
+
+/* Compute res = m0 * m1: the column "i" of the result is the product of "m0"
+ * by the column "i" of "m1". Return "res". */
+static FINLINE v4f_T*
+aosf44_mulf44
+  (v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
+{
+  v4f_T product[4];
+  int icol;
+  ASSERT(res && m0 && m1);
+  /* All the columns are computed before "res" is written */
+  for(icol = 0; icol < 4; ++icol)
+    product[icol] = aosf44_mulf4(m0, m1[icol]);
+  for(icol = 0; icol < 4; ++icol)
+    res[icol] = product[icol];
+  return res;
+}
+
+/* Store in "res" the transpose of "m". The columns of "m" are copied before
+ * "res" is written. Return "res". */
+static FINLINE v4f_T*
+aosf44_transpose(v4f_T res[4], const v4f_T m[4])
+{
+  v4f_T c0, c1, c2, c3;
+  v4f_T xy_02, xy_13, zw_02, zw_13;
+  ASSERT(res && m);
+  c0 = m[0];
+  c1 = m[1];
+  c2 = m[2];
+  c3 = m[3];
+  /* First pass: interleave the columns 0 & 2 and the columns 1 & 3 */
+  xy_02 = v4f_xayb(c0, c2); /* x0 x2 y0 y2 */
+  xy_13 = v4f_xayb(c1, c3); /* x1 x3 y1 y3 */
+  zw_02 = v4f_zcwd(c0, c2); /* z0 z2 w0 w2 */
+  zw_13 = v4f_zcwd(c1, c3); /* z1 z3 w1 w3 */
+  /* Second pass: interleave the intermediate results to build the rows */
+  res[0] = v4f_xayb(xy_02, xy_13);
+  res[1] = v4f_zcwd(xy_02, xy_13);
+  res[2] = v4f_xayb(zw_02, zw_13);
+  res[3] = v4f_zcwd(zw_02, zw_13);
+  return res;
+}
+
+/* Return the determinant of "m", computed from the determinants of the four
+ * 3x3 matrices built from 3 of its columns, with alternating signs, dotted
+ * with the row 3 of "m". */
+static FINLINE v4f_T
+aosf44_det(const v4f_T m[4])
+{
+  v4f_T sub_012[3], sub_013[3], sub_023[3], sub_123[3];
+  v4f_T cof0, cof1, cof2, cof3, cofactors;
+  ASSERT(m);
+
+  /* sub_ABC is the 3x3 matrix built from the columns A, B and C of "m" */
+  aosf33_set(sub_012, m[0], m[1], m[2]);
+  aosf33_set(sub_013, m[0], m[1], m[3]);
+  aosf33_set(sub_023, m[0], m[2], m[3]);
+  aosf33_set(sub_123, m[1], m[2], m[3]);
+
+  /* Signed determinants: -, +, -, + */
+  cof0 = v4f_minus(aosf33_det(sub_123));
+  cof1 = aosf33_det(sub_023);
+  cof2 = v4f_minus(aosf33_det(sub_013));
+  cof3 = aosf33_det(sub_012);
+
+  /* Pack the 4 values into one vector and combine them with the row 3 */
+  cofactors = v4f_xyab(v4f_xayb(cof0, cof1), v4f_xayb(cof2, cof3));
+  return v4f_dot(cofactors, aosf44_row3(m));
+}
+
+/* Store in "out" the inverse of the matrix "in". Unlike the other functions
+ * of this header, it is not inlined: it is defined in aosf44.c. */
+RSIMD_API v4f_T /* Return the determinant */
+aosf44_inverse(v4f_T out[4], const v4f_T in[4]);
+
+/* Store in "out" the transpose of the inverse of "a": "a" is inverted into
+ * "out", which is then transposed in place. */
+static FINLINE v4f_T /* Return the determinant */
+aosf44_invtrans(v4f_T out[4], const v4f_T a[4])
+{
+  v4f_T determinant;
+  ASSERT(out && a);
+  determinant = aosf44_inverse(out, a);
+  (void)aosf44_transpose(out, out);
+  return determinant;
+}
+
+/* Compare the matrices "a" and "b". The per component results of v4f_eq on
+ * the 4 columns are AND-ed together, then reduced across the 4 components so
+ * that every component of the returned mask holds the overall result. When
+ * "a" and "b" are the same pointer, v4f_true is returned without comparing
+ * the data. */
+static FINLINE v4f_T
+aosf44_eq(const v4f_T a[4], const v4f_T b[4])
+{
+  v4f_T eq_c0, eq_c1, eq_c2, eq_c3;
+  v4f_T eq_cols, eq_pairs;
+  ASSERT(a && b);
+
+  if(a == b)
+    return v4f_true();
+
+  eq_c0 = v4f_eq(a[0], b[0]);
+  eq_c1 = v4f_eq(a[1], b[1]);
+  eq_c2 = v4f_eq(a[2], b[2]);
+  eq_c3 = v4f_eq(a[3], b[3]);
+  eq_cols = v4f_and(v4f_and(eq_c0, eq_c1), v4f_and(eq_c2, eq_c3));
+
+  /* Horizontal AND: (x&y, z&w, x&y, z&w), then x&y&z&w in every component */
+  eq_pairs = v4f_and(v4f_xzxz(eq_cols), v4f_ywyw(eq_cols));
+  return v4f_and(eq_pairs, v4f_yxwz(eq_pairs));
+}
+
+#endif /* AOSF44_H */
+
diff --git a/src/test_aosf44.c b/src/test_aosf44.c
@@ -0,0 +1,420 @@
+/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "aosf44.h"
+#include <rsys/float44.h>
+
+#define AOSF44_EQ_EPS(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Eps)\
+ { \
+ float a[16], b[16]; \
+ b[0] = (A); b[1] = (B); b[2] = (C); b[3] = (D); \
+ b[4] = (E); b[5] = (F); b[6] = (G); b[7] = (H); \
+ b[8] = (I); b[9] = (J); b[10]= (K); b[11]= (L); \
+ b[12]= (M); b[13]= (N); b[14]= (O); b[15]= (P); \
+ CHECK(f44_eq_eps(aosf44_store(a, (Mat)), b, Eps), 1); \
+ } (void)0
+#define AOSF44_EQ(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+ AOSF44_EQ_EPS(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, 0.f)
+
+/* Test the aosf44 functions one after the other. Each result is checked with
+ * the CHECK macro against hard coded expected values. Return 0 once the last
+ * check has been passed. */
+int
+main(int argc, char** argv)
+{
+ v4f_T m[4], n[4], o[4], v;
+ ALIGN(16) float tmp[16];
+ (void)argc, (void)argv;
+
+ /* aosf44_set: define the matrix from its 4 columns */
+ CHECK(aosf44_set(m,
+ v4f_set(0.f, 1.f, 2.f, 3.f),
+ v4f_set(4.f, 5.f, 6.f, 7.f),
+ v4f_set(8.f, 9.f, 10.f, 11.f),
+ v4f_set(12.f, 13.f, 14.f, 15.f)), m);
+ AOSF44_EQ(m,
+ 0.f, 1.f, 2.f, 3.f,
+ 4.f, 5.f, 6.f, 7.f,
+ 8.f, 9.f, 10.f, 11.f,
+ 12.f, 13.f, 14.f, 15.f);
+
+ /* aosf44_store: the floats are written column after column */
+ CHECK(aosf44_store(tmp, m), tmp);
+ CHECK(tmp[0], 0.f);
+ CHECK(tmp[1], 1.f);
+ CHECK(tmp[2], 2.f);
+ CHECK(tmp[3], 3.f);
+ CHECK(tmp[4], 4.f);
+ CHECK(tmp[5], 5.f);
+ CHECK(tmp[6], 6.f);
+ CHECK(tmp[7], 7.f);
+ CHECK(tmp[8], 8.f);
+ CHECK(tmp[9], 9.f);
+ CHECK(tmp[10], 10.f);
+ CHECK(tmp[11], 11.f);
+ CHECK(tmp[12], 12.f);
+ CHECK(tmp[13], 13.f);
+ CHECK(tmp[14], 14.f);
+ CHECK(tmp[15], 15.f);
+
+ /* aosf44_load. NOTE(review): "tmp" is declared with ALIGN(16), so
+  * presumably only the aligned code path of aosf44_load is exercised here */
+ tmp[0] = 0.f; tmp[1] = 2.f; tmp[2] = 4.f; tmp[3] = 6.f;
+ tmp[4] = 8.f; tmp[5] = 10.f; tmp[6] = 12.f; tmp[7] = 14.f;
+ tmp[8] = 16.f; tmp[9] = 18.f; tmp[10] = 20.f; tmp[11] = 22.f;
+ tmp[12] = 24.f; tmp[13] = 26.f; tmp[14] = 28.f; tmp[15] = 30.f;
+ CHECK(aosf44_load(m, tmp), m);
+ AOSF44_EQ(m,
+ 0.f, 2.f, 4.f, 6.f,
+ 8.f, 10.f, 12.f, 14.f,
+ 16.f, 18.f, 20.f, 22.f,
+ 24.f, 26.f, 28.f, 30.f);
+
+ /* aosf44_identity */
+ CHECK(aosf44_identity(m), m);
+ AOSF44_EQ(m,
+ 1.f, 0.f, 0.f, 0.f,
+ 0.f, 1.f, 0.f, 0.f,
+ 0.f, 0.f, 1.f, 0.f,
+ 0.f, 0.f, 0.f, 1.f);
+
+ /* aosf44_zero */
+ CHECK(aosf44_zero(m), m);
+ AOSF44_EQ(m,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f);
+
+ /* aosf44_set_row<0|1|2|3>: fill the zeroed matrix row after row */
+ CHECK(aosf44_set_row0(m, v4f_set(0.f, 1.f, 2.f, 3.f)), m);
+ AOSF44_EQ(m,
+ 0.f, 0.f, 0.f, 0.f,
+ 1.f, 0.f, 0.f, 0.f,
+ 2.f, 0.f, 0.f, 0.f,
+ 3.f, 0.f, 0.f, 0.f);
+ CHECK(aosf44_set_row1(m, v4f_set(4.f, 5.f, 6.f, 7.f)), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 0.f, 0.f,
+ 1.f, 5.f, 0.f, 0.f,
+ 2.f, 6.f, 0.f, 0.f,
+ 3.f, 7.f, 0.f, 0.f);
+ CHECK(aosf44_set_row2(m, v4f_set(8.f, 9.f, 10.f, 11.f)), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 8.f, 0.f,
+ 1.f, 5.f, 9.f, 0.f,
+ 2.f, 6.f, 10.f, 0.f,
+ 3.f, 7.f, 11.f, 0.f);
+ CHECK(aosf44_set_row3(m, v4f_set(12.f, 13.f, 14.f, 15.f)), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 8.f, 12.f,
+ 1.f, 5.f, 9.f, 13.f,
+ 2.f, 6.f, 10.f, 14.f,
+ 3.f, 7.f, 11.f, 15.f);
+
+ /* aosf44_set_row: same checks with the row index given at runtime */
+ CHECK(aosf44_zero(m), m);
+ CHECK(aosf44_set_row(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m);
+ AOSF44_EQ(m,
+ 0.f, 0.f, 0.f, 0.f,
+ 1.f, 0.f, 0.f, 0.f,
+ 2.f, 0.f, 0.f, 0.f,
+ 3.f, 0.f, 0.f, 0.f);
+ CHECK(aosf44_set_row(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 0.f, 0.f,
+ 1.f, 5.f, 0.f, 0.f,
+ 2.f, 6.f, 0.f, 0.f,
+ 3.f, 7.f, 0.f, 0.f);
+ CHECK(aosf44_set_row(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 8.f, 0.f,
+ 1.f, 5.f, 9.f, 0.f,
+ 2.f, 6.f, 10.f, 0.f,
+ 3.f, 7.f, 11.f, 0.f);
+ CHECK(aosf44_set_row(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m);
+ AOSF44_EQ(m,
+ 0.f, 4.f, 8.f, 12.f,
+ 1.f, 5.f, 9.f, 13.f,
+ 2.f, 6.f, 10.f, 14.f,
+ 3.f, 7.f, 11.f, 15.f);
+
+ /* aosf44_set_col: fill the zeroed matrix column after column */
+ CHECK(aosf44_zero(m), m);
+ CHECK(aosf44_set_col(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m);
+ AOSF44_EQ(m,
+ 0.f, 1.f, 2.f, 3.f,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f);
+ CHECK(aosf44_set_col(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m);
+ AOSF44_EQ(m,
+ 0.f, 1.f, 2.f, 3.f,
+ 4.f, 5.f, 6.f, 7.f,
+ 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f);
+ CHECK(aosf44_set_col(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2), m);
+ AOSF44_EQ(m,
+ 0.f, 1.f, 2.f, 3.f,
+ 4.f, 5.f, 6.f, 7.f,
+ 8.f, 9.f, 10.f, 11.f,
+ 0.f, 0.f, 0.f, 0.f);
+ CHECK(aosf44_set_col(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m);
+ AOSF44_EQ(m,
+ 0.f, 1.f, 2.f, 3.f,
+ 4.f, 5.f, 6.f, 7.f,
+ 8.f, 9.f, 10.f, 11.f,
+ 12.f, 13.f, 14.f, 15.f);
+
+ /* aosf44_row<0|1|2|3> */
+ v = aosf44_row0(m);
+ CHECK(v4f_x(v), 0.f);
+ CHECK(v4f_y(v), 4.f);
+ CHECK(v4f_z(v), 8.f);
+ CHECK(v4f_w(v), 12.f);
+
+ v = aosf44_row1(m);
+ CHECK(v4f_x(v), 1.f);
+ CHECK(v4f_y(v), 5.f);
+ CHECK(v4f_z(v), 9.f);
+ CHECK(v4f_w(v), 13.f);
+
+ v = aosf44_row2(m);
+ CHECK(v4f_x(v), 2.f);
+ CHECK(v4f_y(v), 6.f);
+ CHECK(v4f_z(v), 10.f);
+ CHECK(v4f_w(v), 14.f);
+
+ v = aosf44_row3(m);
+ CHECK(v4f_x(v), 3.f);
+ CHECK(v4f_y(v), 7.f);
+ CHECK(v4f_z(v), 11.f);
+ CHECK(v4f_w(v), 15.f);
+
+ /* aosf44_row: same checks with the row index given at runtime */
+ v = aosf44_row(m, 0);
+ CHECK(v4f_x(v), 0.f);
+ CHECK(v4f_y(v), 4.f);
+ CHECK(v4f_z(v), 8.f);
+ CHECK(v4f_w(v), 12.f);
+
+ v = aosf44_row(m, 1);
+ CHECK(v4f_x(v), 1.f);
+ CHECK(v4f_y(v), 5.f);
+ CHECK(v4f_z(v), 9.f);
+ CHECK(v4f_w(v), 13.f);
+
+ v = aosf44_row(m, 2);
+ CHECK(v4f_x(v), 2.f);
+ CHECK(v4f_y(v), 6.f);
+ CHECK(v4f_z(v), 10.f);
+ CHECK(v4f_w(v), 14.f);
+
+ v = aosf44_row(m, 3);
+ CHECK(v4f_x(v), 3.f);
+ CHECK(v4f_y(v), 7.f);
+ CHECK(v4f_z(v), 11.f);
+ CHECK(v4f_w(v), 15.f);
+
+ /* aosf44_col */
+ v = aosf44_col(m, 0);
+ CHECK(v4f_x(v), 0.f);
+ CHECK(v4f_y(v), 1.f);
+ CHECK(v4f_z(v), 2.f);
+ CHECK(v4f_w(v), 3.f);
+
+ v = aosf44_col(m, 1);
+ CHECK(v4f_x(v), 4.f);
+ CHECK(v4f_y(v), 5.f);
+ CHECK(v4f_z(v), 6.f);
+ CHECK(v4f_w(v), 7.f);
+
+ v = aosf44_col(m, 2);
+ CHECK(v4f_x(v), 8.f);
+ CHECK(v4f_y(v), 9.f);
+ CHECK(v4f_z(v), 10.f);
+ CHECK(v4f_w(v), 11.f);
+
+ v = aosf44_col(m, 3);
+ CHECK(v4f_x(v), 12.f);
+ CHECK(v4f_y(v), 13.f);
+ CHECK(v4f_z(v), 14.f);
+ CHECK(v4f_w(v), 15.f);
+
+ /* aosf44_add */
+ CHECK(aosf44_set(m,
+ v4f_set(0.f, 1.f, 2.f, 3.f),
+ v4f_set(4.f, 5.f, 6.f, 7.f),
+ v4f_set(8.f, 9.f, 10.f, 11.f),
+ v4f_set(12.f, 13.f, 14.f, 15.f)), m);
+ CHECK(aosf44_set(n,
+ v4f_set(0.f, 2.f, 1.f, 3.f),
+ v4f_set(1.f, -2.f, -1.f, -3.f),
+ v4f_set(1.f, 0.f, 0.f, 2.f),
+ v4f_set(3.f, 2.f, 1.f, 0.f)), n);
+ CHECK(aosf44_add(o, m, n), o);
+ AOSF44_EQ(o,
+ 0.f, 3.f, 3.f, 6.f,
+ 5.f, 3.f, 5.f, 4.f,
+ 9.f, 9.f, 10.f, 13.f,
+ 15.f, 15.f, 15.f, 15.f);
+
+ /* aosf44_sub */
+ CHECK(aosf44_sub(o, m, n), o);
+ AOSF44_EQ(o,
+ 0.f, -1.f, 1.f, 0.f,
+ 3.f, 7.f, 7.f, 10.f,
+ 7.f, 9.f, 10.f, 9.f,
+ 9.f, 11.f, 13.f, 15.f);
+
+ /* aosf44_minus */
+ CHECK(aosf44_minus(o, n), o);
+ AOSF44_EQ(o,
+ 0.f, -2.f, -1.f, -3.f,
+ -1.f, 2.f, 1.f, 3.f,
+ -1.f, 0.f, 0.f, -2.f,
+ -3.f, -2.f, -1.f, 0.f);
+
+ /* aosf44_abs: the source and the destination are the same matrix */
+ CHECK(aosf44_abs(o, o), o);
+ AOSF44_EQ(o,
+ 0.f, 2.f, 1.f, 3.f,
+ 1.f, 2.f, 1.f, 3.f,
+ 1.f, 0.f, 0.f, 2.f,
+ 3.f, 2.f, 1.f, 0.f);
+
+ /* aosf44_mul: each column is multiplied per component by the vector */
+ CHECK(aosf44_mul(o, n, v4f_set(1.f, 2.f, 3.f, 2.f)), o);
+ AOSF44_EQ(o,
+ 0.f, 4.f, 3.f, 6.f,
+ 1.f, -4.f, -3.f, -6.f,
+ 1.f, 0.f, 0.f, 4.f,
+ 3.f, 4.f, 3.f, 0.f);
+
+ /* aosf44_mulf4: matrix * column vector */
+ aosf44_set(m,
+ v4f_set(0.f, 1.f, 2.f, 3.f),
+ v4f_set(4.f, 5.f, 6.f, 7.f),
+ v4f_set(8.f, 9.f, 10.f, 11.f),
+ v4f_set(12.f, 13.f, 14.f, 15.f));
+ v = aosf44_mulf4(m, v4f_set(1.f, 2.f, 3.f, 1.f));
+ CHECK(v4f_x(v), 44.f);
+ CHECK(v4f_y(v), 51.f);
+ CHECK(v4f_z(v), 58.f);
+ CHECK(v4f_w(v), 65.f);
+
+ /* aosf4_mulf44: row vector * matrix */
+ v = aosf4_mulf44(v4f_set(1.f, 2.f, 3.f, 1.f), m);
+ CHECK(v4f_x(v), 11.f);
+ CHECK(v4f_y(v), 39.f);
+ CHECK(v4f_z(v), 67.f);
+ CHECK(v4f_w(v), 95.f);
+
+ /* aosf44_mulf44: matrix * matrix */
+ aosf44_set(m,
+ v4f_set(1.f, 2.f, 3.f, 4.f),
+ v4f_set(4.f, 5.f, 6.f, 7.f),
+ v4f_set(7.f, 8.f, 9.f, 10.f),
+ v4f_set(10.f, 11.f, 12.f, 13.f));
+ aosf44_set(n,
+ v4f_set(2.f, 9.f, 8.f, 1.f),
+ v4f_set(1.f, -2.f, 2.f, 1.f),
+ v4f_set(1.f, -8.f, -4.f, 2.f),
+ v4f_set(1.f, 3.f, 4.f, 2.f));
+ CHECK(aosf44_mulf44(o, m, n), o);
+ AOSF44_EQ(o,
+ 104.f, 124.f, 144.f, 164.f,
+ 17.f, 19.f, 21.f, 23.f,
+ -39.f, -48.f, -57.f, -66.f,
+ 61.f, 71.f, 81.f, 91.f);
+
+ /* aosf44_transpose */
+ CHECK(aosf44_transpose(o, n), o);
+ AOSF44_EQ(o,
+ 2.f, 1.f, 1.f, 1.f,
+ 9.f, -2.f, -8.f, 3.f,
+ 8.f, 2.f, -4.f, 4.f,
+ 1.f, 1.f, 2.f, 2.f);
+
+ /* aosf44_det: the determinant is expected in the 4 components */
+ v = aosf44_det(n);
+ CHECK(v4f_x(v), 78.f);
+ CHECK(v4f_y(v), 78.f);
+ CHECK(v4f_z(v), 78.f);
+ CHECK(v4f_w(v), 78.f);
+
+ /* aosf44_inverse: check the returned determinant and that the product of
+  * the inverse by the matrix is the identity with a tolerance of 1.e-6 */
+ v = aosf44_inverse(m, n);
+ CHECK(v4f_x(v), 78.f);
+ CHECK(v4f_y(v), 78.f);
+ CHECK(v4f_z(v), 78.f);
+ CHECK(v4f_w(v), 78.f);
+ CHECK(aosf44_mulf44(o, m, n), o);
+ AOSF44_EQ_EPS(o,
+ 1.f, 0.f, 0.f, 0.f,
+ 0.f, 1.f, 0.f, 0.f,
+ 0.f, 0.f, 1.f, 0.f,
+ 0.f, 0.f, 0.f, 1.f,
+ 1.e-6f);
+
+ /* aosf44_invtrans: the result must be the transpose of the inverse
+  * previously stored in "m" */
+ v = aosf44_invtrans(o, n);
+ CHECK(v4f_x(v), 78.f);
+ CHECK(v4f_y(v), 78.f);
+ CHECK(v4f_z(v), 78.f);
+ CHECK(v4f_w(v), 78.f);
+ AOSF44_EQ(o,
+ v4f_x(m[0]), v4f_x(m[1]), v4f_x(m[2]), v4f_x(m[3]),
+ v4f_y(m[0]), v4f_y(m[1]), v4f_y(m[2]), v4f_y(m[3]),
+ v4f_z(m[0]), v4f_z(m[1]), v4f_z(m[2]), v4f_z(m[3]),
+ v4f_w(m[0]), v4f_w(m[1]), v4f_w(m[2]), v4f_w(m[3]));
+
+ /* aosf44_eq: start from 2 equal matrices, then alter one column at a time
+  * and restore it before moving to the next one */
+ aosf44_set(m,
+ v4f_set(0.f, 1.f, 2.f, 3.f),
+ v4f_set(5.f, 5.f, 6.f, 7.f),
+ v4f_set(8.f, 9.f, 10.f, 11.f),
+ v4f_set(12.f, 13.f, 14.f, 15.f));
+ aosf44_set(n,
+ v4f_set(0.f, 1.f, 2.f, 3.f),
+ v4f_set(5.f, 5.f, 6.f, 7.f),
+ v4f_set(8.f, 9.f, 10.f, 11.f),
+ v4f_set(12.f, 13.f, 14.f, 15.f));
+
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), ~0);
+ CHECK(v4f_mask_y(v), ~0);
+ CHECK(v4f_mask_z(v), ~0);
+ CHECK(v4f_mask_w(v), ~0);
+
+ n[0] = v4f_set(0.f, 1.0f, 2.f, 4.f);
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), 0);
+ CHECK(v4f_mask_y(v), 0);
+ CHECK(v4f_mask_z(v), 0);
+ CHECK(v4f_mask_w(v), 0);
+ n[0] = v4f_set(0.f, 1.0f, 2.f, 3.f);
+
+ n[1] = v4f_set(4.f, 5.0f, 6.f, 7.f);
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), 0);
+ CHECK(v4f_mask_y(v), 0);
+ CHECK(v4f_mask_z(v), 0);
+ CHECK(v4f_mask_w(v), 0);
+ n[1] = v4f_set(5.f, 5.0f, 6.f, 7.f);
+
+ m[2] = v4f_set(8.f, -9.0f, 10.f, 11.f);
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), 0);
+ CHECK(v4f_mask_y(v), 0);
+ CHECK(v4f_mask_z(v), 0);
+ CHECK(v4f_mask_w(v), 0);
+ m[2] = v4f_set(8.f, 9.0f, 10.f, 11.f);
+
+ n[3] = v4f_set(12.f, 13.1f, 14.f, 15.f);
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), 0);
+ CHECK(v4f_mask_y(v), 0);
+ CHECK(v4f_mask_z(v), 0);
+ CHECK(v4f_mask_w(v), 0);
+
+ /* A matrix compared to itself: same pointer for the 2 operands */
+ v = aosf44_eq(m, m);
+ CHECK(v4f_mask_x(v), ~0);
+ CHECK(v4f_mask_y(v), ~0);
+ CHECK(v4f_mask_z(v), ~0);
+ CHECK(v4f_mask_w(v), ~0);
+ n[3] = v4f_set(12.f, 13.0f, 14.f, 15.f);
+
+ v = aosf44_eq(m, n);
+ CHECK(v4f_mask_x(v), ~0);
+ CHECK(v4f_mask_y(v), ~0);
+ CHECK(v4f_mask_z(v), ~0);
+ CHECK(v4f_mask_w(v), ~0);
+ return 0;
+}
+
+