rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 16f4dd73fbd19643a8cab92f9e2af0d0efbe0d9a
parent 73009ee225c3142b746eeb020c2f2142e2d4e4aa
Author: vaplv <vaplv@free.fr>
Date:   Wed, 16 Jun 2021 09:14:49 +0200

Merge branch 'release_0.3'

Diffstat:
DCOPYING.LESSER | 165-------------------------------------------------------------------------------
MREADME.md | 44+++++++++++++++++++++++++++++++-------------
Mcmake/CMakeLists.txt | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Acmake/RSIMDConfig.cmake.in | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Acmake/RSIMDConfigVersion.cmake.in | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Acmake/SleefConfig.cmake | 35+++++++++++++++++++++++++++++++++++
Msrc/aosf33.h | 8++++----
Msrc/aosf44.c | 8++++----
Msrc/aosf44.h | 8++++----
Msrc/aosq.c | 8++++----
Msrc/aosq.h | 9+++++----
Asrc/avx/avx.h | 26++++++++++++++++++++++++++
Asrc/avx/avxf.h | 330+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/avx/avxi.h | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/math.h | 29+++++++++++++++++++++++++++++
Asrc/math4.h | 41+++++++++++++++++++++++++++++++++++++++++
Asrc/math8.h | 24++++++++++++++++++++++++
Asrc/mathX.h | 137+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/rsimd.h | 15++++++++-------
Asrc/rsimd_version.h.in | 23+++++++++++++++++++++++
Msrc/soa4f2.h | 20++++++--------------
Msrc/soa4f3.h | 24++++++------------------
Msrc/soa4f4.h | 17++++++++++-------
Dsrc/soa4fX.h | 352-------------------------------------------------------------------------------
Asrc/soa8f2.h | 22++++++++++++++++++++++
Asrc/soa8f3.h | 22++++++++++++++++++++++
Asrc/soa8f4.h | 27+++++++++++++++++++++++++++
Asrc/soaXf2.h | 33+++++++++++++++++++++++++++++++++
Asrc/soaXf3.h | 38++++++++++++++++++++++++++++++++++++++
Asrc/soaXfY.h | 356+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/soaXfY_begin.h | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/soaXfY_end.h | 31+++++++++++++++++++++++++++++++
Msrc/sse/sse.h | 8++++----
Msrc/sse/sse_swz.h | 8++++----
Dsrc/sse/ssef.c | 150-------------------------------------------------------------------------------
Msrc/sse/ssef.h | 65++++++++++++-----------------------------------------------------
Msrc/sse/ssei.h | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Msrc/test_aosf33.c | 166++++++++++++++++++++++++++++++++++++++++----------------------------------------
Msrc/test_aosf44.c | 298++++++++++++++++++++++++++++++++++++++++----------------------------------------
Msrc/test_aosq.c | 114++++++++++++++++++++++++++++++++++++++++----------------------------------------
Asrc/test_math4.c | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/test_math8.c | 172+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/test_soa4f2.c | 108+++++++------------------------------------------------------------------------
Msrc/test_soa4f3.c | 139++++++-------------------------------------------------------------------------
Msrc/test_soa4f4.c | 209++++---------------------------------------------------------------------------
Dsrc/test_soa4f_utils.h | 32--------------------------------
Asrc/test_soa8f2.c | 28++++++++++++++++++++++++++++
Asrc/test_soa8f3.c | 28++++++++++++++++++++++++++++
Asrc/test_soa8f4.c | 28++++++++++++++++++++++++++++
Asrc/test_soaXfY.h | 262+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/test_v4f.c | 811+++++++++++++++++++++++++++++++++++++------------------------------------------
Msrc/test_v4i.c | 284++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Asrc/test_v8f.c | 450+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/test_v8i.c | 192+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/vXf_begin.h | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/vXf_end.h | 31+++++++++++++++++++++++++++++++
56 files changed, 4180 insertions(+), 2140 deletions(-)

diff --git a/COPYING.LESSER b/COPYING.LESSER @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. 
- - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. 
- - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. 
(If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. 
- - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. diff --git a/README.md b/README.md @@ -4,24 +4,42 @@ This C89 library defines an interface that encapsulates and make easier the manipulation of SIMD instruction sets. It also provides a SIMD implementation of linear algebra operations for 3x3 and 4x4 matrices as well as quaternions arranged in an `Array of Structures` SIMD layout. Linear algebra functions on -`Structure of Arrays` 2/3/4 dimensions vectors are also implemented. - -Note that currently only the SSE2 instruction set is supported. +`Structure of Arrays` 2/3/4 dimensions vectors are also implemented. Finally it +exposes a vectorized version of some math functions by relying on the +[Sleef](https://sleef.org/) library. ## How to build The library uses [CMake](http://www.cmake.org) and the [RCMake](https://gitlab.com/vaplv/rcmake/) package to build. It also depends on -the [RSys](https://gitlab.com/vaplv/rsys/) library. First, install the RCMake -package and the RSys library. Then, generate the project from the -cmake/CMakeLists.txt file by appending the RCMake and RSys install directories -to the `CMAKE_PREFIX_PATH` variable. The resulting project can be edited, - built, tested and installed as any CMake project. +the [RSys](https://gitlab.com/vaplv/rsys/) and the [Sleef](https://sleef.org) +library. First, install the RCMake package, the RSys and the Sleef libraries. +Then, generate the project from the cmake/CMakeLists.txt file by appending the +RCMake, RSys and Sleef install directories to the `CMAKE_PREFIX_PATH` variable. +The resulting project can be edited, built, tested and installed as any CMake +project. -## License +## Release notes + +### Version 0.3 + +- Add 8-way vector API for the float and int32 types. 
+- Add the `v<4|8>i_[reduce_]<min|max>` functions. +- Add the `v4i_minus` function. +- Rely on the [Sleef](https://sleef.org) library to replace the hand-crafted + implementation of the trigonometric functions. +- Add math functions for both 4-way and 8-way vectors. Provided math functions are: + copysign, floor, pow, exp[<2|10>] and log[<2|10>]. -RSIMD is Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr). It is a free -software released under the [OSI](https://opensource.org)-approved LGPL v3+ -license. You are welcome to redistribute it under certain conditions; refer to -the COPYING files for details. +### Version 0.2.1 + +- If supported by the compiler, use the SSE4.1 blendv instruction in the + `v4f_sel` function +- Turn the RSIMD library into a shared library. + +## License +Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr). RSIMD is free software +released under the GPL v3+ license: GNU GPL version 3 or later. You are welcome +to redistribute it under certain conditions; refer to the COPYING file for +details. diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +# Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) # # The RSIMD CMake is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -13,78 +13,149 @@ # You should have received a copy of the GNU General Public License # along with the RSIMD CMake. If not, see <http://www.gnu.org/licenses/>. 
-cmake_minimum_required(VERSION 2.6) +cmake_minimum_required(VERSION 3.1) project(rsimd C) -cmake_policy(SET CMP0011 NEW) enable_testing() -option(BUILD_STATIC "Build RSIMD as a static library" ON) option(NO_TEST "Disable the tests" OFF) set(RSIMD_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../src) ################################################################################ # Check dependencies ################################################################################ +set(Sleef_DIR ${PROJECT_SOURCE_DIR}) + find_package(RCMake REQUIRED) -find_package(RSys REQUIRED) +find_package(RSys 0.12 REQUIRED) +find_package(PkgConfig REQUIRED) +find_package(Sleef REQUIRED) -include_directories(${RSys_INCLUDE_DIR}) +include_directories(${RSys_INCLUDE_DIR} ${Sleef_INCLUDE_DIR}) +include(CheckCCompilerFlag) set(CMAKE_MODULE_PATH ${RCMAKE_SOURCE_DIR}) include(rcmake) ################################################################################ # Check compiler features ################################################################################ -if(CMAKE_COMPILER_IS_GNUCC) - include(CheckCCompilerFlag) +if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux" +OR NOT CMAKE_COMPILER_IS_GNUCC) + message(STATUS ${CMAKE_SYSTEM_NAME}) + message(FATAL_ERROR "Unsupported platform") +endif() + +execute_process(COMMAND cat "/proc/cpuinfo" + OUTPUT_VARIABLE CPUINFO_OUT + ERROR_VARIABLE CPUINFO_ERR + RESULT_VARIABLE CPUINFO_RES) +if(NOT CPUINFO_RES EQUAL 0) + message(FATAL_ERROR "${CPUINFO_ERR}") +endif() + +string(REGEX MATCH "[ \t\r\n]+sse2[ \t\r\n]+" SSE2 ${CPUINFO_OUT}) +string(REGEX MATCH "[ \t\r\n]+sse4_1[ \t\r\n]+" SSE4_1 ${CPUINFO_OUT}) +string(REGEX MATCH "[ \t\r\n]+avx[ \t\r\n]+" AVX ${CPUINFO_OUT}) +string(REGEX MATCH "[ \t\r\n]+fma[ \t\r\n]+" FMA ${CPUINFO_OUT}) + +if(SSE2) + unset(SSE2) + CHECK_C_COMPILER_FLAG("-msse2" SSE2) + message(STATUS "Use the SSE2 instruction set ") +else() + message(FATAL_ERROR "The SSE2 instruction set must be supported.") +endif() +if(SSE4_1) + unset(SSE4_1) 
CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1) -endif(CMAKE_COMPILER_IS_GNUCC) + message(STATUS "Use the SSE4.1 instruction set") +endif() +if(AVX) + unset(AVX) + CHECK_C_COMPILER_FLAG("-mavx" AVX) + message(STATUS "Use the AVX instruction set") +endif() +if(FMA) + unset(FMA) + CHECK_C_COMPILER_FLAG("-mfma" FMA) + message(STATUS "Use the FMA instruction set") +endif() ################################################################################ # Configure and define targets ################################################################################ set(VERSION_MAJOR 0) -set(VERSION_MINOR 2) +set(VERSION_MINOR 3) set(VERSION_PATCH 0) set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) +set(RSIMD_SSE2 ${SSE2}) +set(RSIMD_SSE4_1 ${SSE4_1}) +set(RSIMD_AVX ${AVX}) +set(RSIMD_FMA ${FMA}) + +# Configure the files generic to the RSIMD version +configure_file(${RSIMD_SOURCE_DIR}/rsimd_version.h.in + ${CMAKE_CURRENT_BINARY_DIR}/rsimd_version.h @ONLY) +configure_file(${PROJECT_SOURCE_DIR}/RSIMDConfigVersion.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/RSIMDConfigVersion.cmake @ONLY) +configure_file(${PROJECT_SOURCE_DIR}/RSIMDConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/RSIMDConfig.cmake @ONLY) + set(RSIMD_FILES_INC_LEGACY aosf33.h aosf44.h aosq.h + math.h + mathX.h + math4.h + math8.h rsimd.h - soa4fX.h + soaXfY.h + soaXfY_begin.h + soaXfY_end.h + soaXf2.h + soaXf3.h soa4f2.h soa4f3.h - soa4f4.h) + soa4f4.h + soa8f2.h + soa8f3.h + soa8f4.h + vXf_begin.h + vXf_end.h) set(RSIMD_FILES_INC_SSE sse/sse.h sse/ssef.h sse/ssei.h sse/sse_swz.h) +set(RSIMD_FILES_INC_AVX + avx/avx.h + avx/avxf.h + avx/avxi.h) set(RSIMD_FILES_SRC aosf44.c - aosq.c - sse/ssef.c) + aosq.c) set(RSIMD_FILES_DOC COPYING COPYING.LESSER README.md) +set(RSIMD_FILES_CMAKE + RSIMDConfig.cmake + RSIMDConfigVersion.cmake) rcmake_prepend_path(RSIMD_FILES_INC_LEGACY ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_INC_SSE ${RSIMD_SOURCE_DIR}) +rcmake_prepend_path(RSIMD_FILES_INC_AVX ${RSIMD_SOURCE_DIR}) 
rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_DOC ${PROJECT_SOURCE_DIR}/../) -set(RSIMD_FILES_INC ${RSIMD_FILES_INC_LEGACY} ${RSIMD_FILES_INC_SSE}) - -if(BUILD_STATIC) - add_library(rsimd STATIC ${RSIMD_FILES_INC} ${RSIMD_FILES_SRC}) - set_target_properties(rsimd PROPERTIES DEFINE_SYMBOL RSIMD_STATIC) -else(BUILD_STATIC) - add_library(rsimd SHARED ${RSIMD_FILES_INC} ${RSIMD_FILES_SRC}) - set_target_properties(rsimd PROPERTIES - DEFINE_SYMBOL RSIMD_SHARED_BUILD - VERSION ${VERSION} - SOVERSION ${VERSION_MAJOR}) -endif(BUILD_STATIC) - -rcmake_setup_devel(rsimd RSIMD ${VERSION} rsimd/rsimd_version.h) +rcmake_prepend_path(RSIMD_FILES_CMAKE ${PROJECT_SOURCE_DIR}/) +set(RSIMD_FILES_INC + ${RSIMD_FILES_INC_LEGACY} + ${RSIMD_FILES_INC_SSE} + ${RSIMD_FILES_INC_AVX}) + +add_library(rsimd SHARED ${RSIMD_FILES_INC} ${RSIMD_FILES_SRC}) +target_link_libraries(rsimd Sleef) +set_target_properties(rsimd PROPERTIES DEFINE_SYMBOL RSIMD_SHARED_BUILD) +set_target_properties(rsimd PROPERTIES + VERSION ${VERSION} + SOVERSION ${VERSION_MAJOR}) ################################################################################ # Add tests @@ -96,18 +167,19 @@ if(NOT NO_TEST) add_test(${_name} ${_name}) if(NOT "${ARGN}" STREQUAL "") set_target_properties(${_name} PROPERTIES COMPILE_FLAGS ${ARGN}) - endif(NOT "${ARGN}" STREQUAL "") - endfunction(new_test_named) + endif() + endfunction() function(new_test _name) new_test_named(${_name} ${_name} ${ARGN}) - endfunction(new_test) + endfunction() new_test(test_v4f) new_test(test_v4i) new_test(test_aosf33) new_test(test_aosf44) new_test(test_aosq) + new_test(test_math4) new_test(test_soa4f2) new_test(test_soa4f3) new_test(test_soa4f4) @@ -115,7 +187,23 @@ if(NOT NO_TEST) if(SSE4_1 AND CMAKE_COMPILER_IS_GNUCC) new_test_named(test_v4f_sse4_1 test_v4f "-msse4.1") new_test_named(test_v4i_sse4_1 test_v4i "-msse4.1") - endif(SSE4_1 AND CMAKE_COMPILER_IS_GNUCC) + endif() + + if(FMA AND CMAKE_COMPILER_IS_GNUCC) + 
new_test_named(test_v4f_fma test_v4f "-mfma") + new_test_named(test_soa8f2_fma test_soa8f2 "-mfma") + new_test_named(test_soa8f3_fma test_soa8f3 "-mfma") + new_test_named(test_soa8f4_fma test_soa8f4 "-mfma") + endif() + + if(AVX AND CMAKE_COMPILER_IS_GNUCC) + new_test(test_math8 "-mavx") + new_test(test_v8f "-mavx") + new_test(test_v8i "-mavx") + new_test(test_soa8f2 "-mavx") + new_test(test_soa8f3 "-mavx") + new_test(test_soa8f4 "-mavx") + endif(AVX AND CMAKE_COMPILER_IS_GNUCC) endif(NOT NO_TEST) @@ -127,6 +215,14 @@ install(TARGETS rsimd LIBRARY DESTINATION lib RUNTIME DESTINATION bin) install(FILES ${RSIMD_FILES_INC_LEGACY} DESTINATION include/rsimd) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rsimd_version.h DESTINATION include/rsimd) install(FILES ${RSIMD_FILES_INC_SSE} DESTINATION include/rsimd/sse) +install(FILES ${RSIMD_FILES_INC_AVX} DESTINATION include/rsimd/avx) install(FILES ${RSIMD_FILES_DOC} DESTINATION share/doc/rsimd) +install(FILES ${Sleef_DIR}/SleefConfig.cmake DESTINATION lib/cmake/Sleef/) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/RSIMDConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/RSIMDConfigVersion.cmake + DESTINATION lib/cmake/RSIMD) diff --git a/cmake/RSIMDConfig.cmake.in b/cmake/RSIMDConfig.cmake.in @@ -0,0 +1,133 @@ +# Copyright (C) 2013-2021 Vincent Forest (vaplv@free.fr) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +cmake_minimum_required(VERSION 3.1) + +# Check dependencies +find_package(Sleef REQUIRED) + +set(RSIMD_SSE2 @RSIMD_SSE2@) +set(RSIMD_SSE4_1 @RSIMD_SSE4_1@) +set(RSIMD_AVX @RSIMD_AVX@) +set(RSIMD_FMA @RSIMD_FMA@) + +# Check compiler features +if(CMAKE_COMPILER_IS_GNUCC) + include(CheckCCompilerFlag) + if(RSIMD_SSE2) + CHECK_C_COMPILER_FLAG("-msse2" SSE2) + if(SSE2) + list(APPEND _compile_flags -msse2) + endif() + endif() + if(RSIMD_SSE4_1) + CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1) + if(SSE4_1) + list(APPEND _compile_flags -msse4.1) + endif() + endif() + if(RSIMD_AVX) + CHECK_C_COMPILER_FLAG("-mavx" AVX) + if(AVX) + list(APPEND _compile_flags -mavx) + endif() + endif() + if(RSIMD_FMA) + CHECK_C_COMPILER_FLAG("-mfma" FMA) + if(FMA) + list(APPEND _compile_flags -mfma) + endif() + endif() +endif() + +# Try to find the RSIMD devel. Once done this will define: +# - RSIMD_FOUND: system has RSIMD +# - RSIMD_INCLUDE_DIR: the include directory +# - RSIMD Target: Link this to use rsimd + +# Look for library header +find_path(RSIMD_INCLUDE_DIR rsimd/rsimd_version.h) + +# Look for Release, Debug, RelWithDebInfo and MinSizeRel libraries +unset(RSIMD_LIBRARY CACHE) +unset(RSIMD_LIBRARY_RELEASE CACHE) +unset(RSIMD_LIBRARY_DEBUG CACHE) +unset(RSIMD_LIBRARY_RELWITHDEBINFO CACHE) +unset(RSIMD_LIBRARY_MINSIZEREL CACHE) + +# Find per configuration type libraries +find_library(RSIMD_LIBRARY_RELEASE + rsimd + PATH_SUFFIXES bin Bin BIN + DOC "Path to the library rsimd used during release builds.") +find_library(RSIMD_LIBRARY_DEBUG + rsimd-dbg + PATH_SUFFIXES bin Bin BIN + DOC "Path to the library rsimd used during debug builds.") +find_library(RSIMD_LIBRARY_RELWITHDEBINFO + rsimd-rdbg + PATH_SUFFIXES bin Bin BIN + DOC "Path to the library rsimd used during release with debug info builds.") +find_library(RSIMD_LIBRARY_MINSIZEREL + rsimd-mszr + PATH_SUFFIXES bin Bin BIN + DOC "Path to the library rsimd used during minsize builds.") + +# Define the generic rsimd library 
+if(RSIMD_LIBRARY_RELEASE) + set(RSIMD_LIBRARY ${RSIMD_LIBRARY_RELEASE}) +elseif(RSIMD_LIBRARY_RELWITHDEBINFO) + set(RSIMD_LIBRARY ${RSIMD_LIBRARY_RELWITHDEBINFO}) +elseif(RSIMD_LIBRARY_MINSIZEREL) + set(RSIMD_LIBRARY ${RSIMD_LIBRARY_MINSIZEREL}) +elseif(RSIMD_LIBRARY_DEBUG) + set(RSIMD_LIBRARY ${RSIMD_LIBRARY_DEBUG}) +endif() + +# Define the per configuration library fallback when not found +set(_configs RELEASE DEBUG RELWITHDEBINFO MINSIZEREL) +foreach(_cfg ${_configs}) + if(NOT RSIMD_LIBRARY_${_cfg}) + get_property(_doc CACHE RSIMD_LIBRARY_${_cfg} PROPERTY HELPSTRING) + set(RSIMD_LIBRARY_${_cfg} + ${RSIMD_LIBRARY} CACHE PATH ${_doc} FORCE) + endif() +endforeach() + +# Create the imported library target +add_library(RSIMD SHARED IMPORTED) + +# Setup the properties of the imported target +if(CMAKE_HOST_WIN32) + set(_import_prop IMPORTED_IMPLIB) +else() + set(_import_prop IMPORTED_LOCATION) +endif() +set_target_properties(RSIMD PROPERTIES + ${_import_prop} ${RSIMD_LIBRARY} + ${_import_prop}_RELEASE ${RSIMD_LIBRARY_RELEASE} + ${_import_prop}_DEBUG ${RSIMD_LIBRARY_DEBUG} + ${_import_prop}_RELWITHDEBINFO ${RSIMD_LIBRARY_RELWITHDEBINFO} + ${_import_prop}_MINSIZEREL ${RSIMD_LIBRARY_MINSIZEREL} + INTERFACE_INCLUDE_DIRECTORIES ${RSIMD_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES Sleef + INTERFACE_COMPILE_OPTIONS "${_compile_flags}") + +# Check the package +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(RSIMD DEFAULT_MSG + RSIMD_INCLUDE_DIR + RSIMD_LIBRARY) + diff --git a/cmake/RSIMDConfigVersion.cmake.in b/cmake/RSIMDConfigVersion.cmake.in @@ -0,0 +1,54 @@ +# Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +set(VERSION_MAJOR @VERSION_MAJOR@) +set(VERSION_MINOR @VERSION_MINOR@) +set(VERSION_PATCH @VERSION_PATCH@) +set(PACKAGE_VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") + +if(NOT PACKAGE_FIND_VERSION + OR PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION) + set(PACKAGE_VERSION_COMPATIBLE TRUE) + set(PACKAGE_VERSION_EXACT TRUE) + set(PACKAGE_VERSION_UNSUITABLE FALSE) + return() +endif() + +if(NOT VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR) + set(PACKAGE_VERSION_COMPATIBLE FALSE) + set(PACKAGE_VERSION_EXACT FALSE) + set(PACKAGE_VERSION_UNSUITABLE TRUE) + return() +endif() + +if(VERSION_MINOR VERSION_LESS PACKAGE_FIND_VERSION_MINOR) + set(PACKAGE_VERSION_COMPATIBLE FALSE) + set(PACKAGE_VERSION_EXACT FALSE) + set(PACKAGE_VERSION_UNSUITABLE TRUE) + return() +endif() + +if(VERSION_MINOR VERSION_EQUAL PACKAGE_FIND_VERSION_MINOR) + if(VERSION_PATCH VERSION_LESS PACKAGE_FIND_VERSION_PATCH) + set(PACKAGE_VERSION_COMPATIBLE FALSE) + set(PACKAGE_VERSION_EXACT FALSE) + set(PACKAGE_VERSION_UNSUITABLE TRUE) + return() + endif() +endif() + +set(PACKAGE_VERSION_COMPATIBLE TRUE) +set(PACKAGE_VERSION_EXACT FALSE) +set(PACKAGE_VERSION_UNSUITABLE FALSE) diff --git a/cmake/SleefConfig.cmake b/cmake/SleefConfig.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) +# +# The RSIMD CMake is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# The RSIMD CMake is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with the RSIMD CMake. If not, see <http://www.gnu.org/licenses/>. + +cmake_minimum_required(VERSION 3.1) + +# Look for library header +find_path(Sleef_INCLUDE_DIR sleef.h) + +find_library(Sleef_LIBRARY sleef PATH_SUFFIXES lib64 + DOC "Path to the sleef library") + +# Create the imported library target +add_library(Sleef SHARED IMPORTED) +set_target_properties(Sleef PROPERTIES + IMPORTED_LOCATION ${Sleef_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${Sleef_INCLUDE_DIR}) + +# Check the package +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(Sleef DEFAULT_MSG + Sleef_INCLUDE_DIR + Sleef_LIBRARY) + diff --git a/src/aosf33.h b/src/aosf33.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef AOSF33_H diff --git a/src/aosf44.c b/src/aosf44.c @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #include "aosf44.h" diff --git a/src/aosf44.h b/src/aosf44.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. 
* - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #ifndef AOSF44_H diff --git a/src/aosq.c b/src/aosq.c @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #include "aosq.h" diff --git a/src/aosq.h b/src/aosq.h @@ -1,22 +1,23 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #ifndef AOSQ_H #define AOSQ_H #include "rsimd.h" +#include "math.h" /* * Functions on AoS quaternion encoded into a v4f_T as { i, j, k, a } diff --git a/src/avx/avx.h b/src/avx/avx.h @@ -0,0 +1,26 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_AVX_H +#define RSIMD_AVX_H + +#include "avxf.h" +#include "avxi.h" + +/* Reinterpret cast */ +static FINLINE v8i_T v8f_rcast_v8i(const v8f_T v) {return _mm256_castps_si256(v);} +static FINLINE v8f_T v8i_rcast_v8f(const v8i_T v) {return _mm256_castsi256_ps(v);} + +#endif /* RSIMD_AVX_H */ diff --git a/src/avx/avxf.h b/src/avx/avxf.h @@ -0,0 +1,330 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_AVXF_H +#define RSIMD_AVXF_H + +/* + * 8 packed single precision floating-point values + */ + +#include "avx.h" + +#include <rsys/math.h> +#include <immintrin.h> + +typedef __m256 v8f_T; + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE float* +v8f_store(float dst[8], v8f_T v) +{ + ASSERT(dst && IS_ALIGNED(dst, 32)); + _mm256_store_ps(dst, v); + return dst; +} + +static FINLINE v8f_T +v8f_load(const float src[8]) +{ + ASSERT(src && IS_ALIGNED(src, 32)); + return _mm256_load_ps(src); +} + +static FINLINE v8f_T +v8f_loadu(const float f[8]) +{ + ASSERT(f); + return _mm256_set_ps(f[7], f[6], f[5], f[4], f[3],f[2], f[1], f[0]); +} + +static FINLINE v8f_T +v8f_set1(const float x) +{ + return _mm256_set1_ps(x); +} + +static FINLINE v8f_T +v8f_set + (const float a, const float b, const float c, const float d, + const float e, const float f, const float g, const float h) +{ + return _mm256_set_ps(h, g, f, e, d, c, b, a); +} + +static FINLINE v8f_T +v8f_zero(void) +{ + return _mm256_setzero_ps(); +} + +static FINLINE v8f_T +v8f_mask + (const int32_t a, const int32_t b, const int32_t c, const int32_t d, + const int32_t e, const int32_t f, const int32_t g, const int32_t h) +{ + return _mm256_castsi256_ps(_mm256_set_epi32(h, g, f, e, d, c, b, a)); +} + +static FINLINE v8f_T +v8f_mask1(const int32_t x) +{ + return _mm256_castsi256_ps(_mm256_set1_epi32(x)); +} + +static FINLINE v8f_T +v8f_true(void) +{ + return _mm256_castsi256_ps(_mm256_set1_epi32(~0)); +} + +static FINLINE v8f_T +v8f_false(void) +{ + return v8f_zero(); +} + +/******************************************************************************* + * Extract components + ******************************************************************************/ +static FINLINE v4f_T +v8f_abcd(const v8f_T v) +{ + return _mm256_extractf128_ps(v, 0); +} + +static 
FINLINE v4f_T +v8f_efgh(const v8f_T v) +{ + return _mm256_extractf128_ps(v, 1); +} + +static FINLINE int +v8f_movemask(const v8f_T v) +{ + return _mm256_movemask_ps(v); +} + +/******************************************************************************* + * Bitwise operations + ******************************************************************************/ +static FINLINE v8f_T +v8f_or(const v8f_T v0, const v8f_T v1) +{ + return _mm256_or_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_and(const v8f_T v0, const v8f_T v1) +{ + return _mm256_and_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_andnot(const v8f_T v0, const v8f_T v1) +{ + return _mm256_andnot_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_xor(const v8f_T v0, const v8f_T v1) +{ + return _mm256_xor_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_sel(const v8f_T vfalse, const v8f_T vtrue, const v8f_T vcond) +{ + return _mm256_blendv_ps(vfalse, vtrue, vcond); +} + +/******************************************************************************* + * Arithmetic operations + ******************************************************************************/ +static FINLINE v8f_T +v8f_minus(const v8f_T v) +{ + return v8f_xor(v8f_set1(-0.f), v); +} + +static FINLINE v8f_T +v8f_add(const v8f_T v0, const v8f_T v1) +{ + return _mm256_add_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_sub(const v8f_T v0, const v8f_T v1) +{ + return _mm256_sub_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_mul(const v8f_T v0, const v8f_T v1) +{ + return _mm256_mul_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_div(const v8f_T v0, const v8f_T v1) +{ + return _mm256_div_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_madd(const v8f_T v0, const v8f_T v1, const v8f_T v2) +{ + return _mm256_add_ps(_mm256_mul_ps(v0, v1), v2); +} + +static FINLINE v8f_T +v8f_abs(const v8f_T v) +{ + const union { int32_t i; float f; } mask = { 0x7fffffff }; + return v8f_and(v, v8f_set1(mask.f)); +} + +static FINLINE v8f_T +v8f_sqrt(const v8f_T v) +{ + return _mm256_sqrt_ps(v); +} + 
+static FINLINE v8f_T +v8f_rsqrte(const v8f_T v) +{ + return _mm256_rsqrt_ps(v); +} + +static FINLINE v8f_T +v8f_rsqrt(const v8f_T v) +{ + const v8f_T y = v8f_rsqrte(v); + const v8f_T yyv = v8f_mul(v8f_mul(y, y), v); + const v8f_T tmp = v8f_sub(v8f_set1(1.5f), v8f_mul(yyv, v8f_set1(0.5f))); + return v8f_mul(tmp, y); +} + +static FINLINE v8f_T +v8f_rcpe(const v8f_T v) +{ + return _mm256_rcp_ps(v); +} + +static FINLINE v8f_T +v8f_rcp(const v8f_T v) +{ + const v8f_T y = v8f_rcpe(v); + const v8f_T tmp = v8f_sub(v8f_set1(2.f), v8f_mul(y, v)); + return v8f_mul(tmp, y); +} + +static FINLINE v8f_T +v8f_lerp(const v8f_T from, const v8f_T to, const v8f_T param) +{ + return v8f_madd(v8f_sub(to, from), param, from); +} + +/******************************************************************************* + * Comparators + ******************************************************************************/ +static FINLINE v8f_T +v8f_eq(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_EQ_OS); +} + +static FINLINE v8f_T +v8f_neq(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_NEQ_OS); +} + +static FINLINE v8f_T +v8f_ge(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_GE_OS); +} + +static FINLINE v8f_T +v8f_le(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_LE_OS); +} + +static FINLINE v8f_T +v8f_gt(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_GT_OS); +} + +static FINLINE v8f_T +v8f_lt(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_LT_OS); +} + +static FINLINE v8f_T +v8f_eq_eps(const v8f_T v0, const v8f_T v1, const v8f_T eps) +{ + return v8f_le(v8f_abs(v8f_sub(v0, v1)), eps); +} + +static FINLINE v8f_T +v8f_min(const v8f_T v0, const v8f_T v1) +{ + return _mm256_min_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_max(const v8f_T v0, const v8f_T v1) +{ + return _mm256_max_ps(v0, v1); +} + +static FINLINE float +v8f_reduce_min(const v8f_T v0) +{ + ALIGN(32) 
float tmp[8]; + const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); + const v8f_T v2 = _mm256_min_ps(v0, v1); + const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); + const v8f_T v4 = _mm256_min_ps(v2, v3); + _mm256_store_ps(tmp, v4); + return MMIN(tmp[0], tmp[4]); +} + +static FINLINE float +v8f_reduce_max(const v8f_T v0) +{ + ALIGN(32) float tmp[8]; + const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); + const v8f_T v2 = _mm256_max_ps(v0, v1); + const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); + const v8f_T v4 = _mm256_max_ps(v2, v3); + _mm256_store_ps(tmp, v4); + return MMAX(tmp[0], tmp[4]); +} + +static FINLINE v8f_T +v8f_clamp(const v8f_T v, const v8f_T vmin, const v8f_T vmax) +{ + return v8f_min(v8f_max(v, vmin), vmax); +} + +#endif /* RSIMD_AVXF_H */ + diff --git a/src/avx/avxi.h b/src/avx/avxi.h @@ -0,0 +1,204 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_AVXI_H +#define RSIMD_AVXI_H + +/* + * 8 packed signed integers + */ + +#include <rsys/math.h> +#include <immintrin.h> + +typedef __m256i v8i_T; + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE int32_t* +v8i_store(int32_t dst[8], v8i_T v) +{ + ASSERT(dst && IS_ALIGNED(dst, 32)); + _mm256_store_si256((v8i_T*)dst, v); + return dst; +} + +static FINLINE v8i_T +v8i_load(const int32_t src[8]) +{ + ASSERT(src && IS_ALIGNED(src, 32)); + return _mm256_load_si256((const v8i_T*)src); +} + +static FINLINE v8i_T +v8i_set1(const int32_t i) +{ + return _mm256_set1_epi32(i); +} + +static FINLINE v8i_T +v8i_set + (const int32_t a, const int32_t b, const int32_t c, const int32_t d, + const int32_t e, const int32_t f, const int32_t g, const int32_t h) +{ + return _mm256_set_epi32(h, g, f, e, d, c, b, a); +} + +static FINLINE v8i_T +v8i_zero(void) +{ + return _mm256_setzero_si256(); +} + +static FINLINE v8i_T +v8i_set_v4i(const v4i_T abcd, const v4i_T efgh) +{ + v8i_T tmp = v8i_zero(); + tmp = _mm256_insertf128_si256(tmp, abcd, 0); + tmp = _mm256_insertf128_si256(tmp, efgh, 1); + return tmp; +} + +/******************************************************************************* + * Extract components + ******************************************************************************/ +static FINLINE v4i_T +v8i_abcd(const v8i_T v) +{ + return _mm256_extractf128_si256(v, 0); +} + +static FINLINE v4i_T +v8i_efgh(const v8i_T v) +{ + return _mm256_extractf128_si256(v, 1); +} + +/******************************************************************************* + * Bitwise operators + ******************************************************************************/ +static FINLINE v8i_T +v8i_or(const v8i_T v0, const v8i_T v1) +{ + const v8f_T a = _mm256_castsi256_ps(v0); + const v8f_T b = _mm256_castsi256_ps(v1); + const v8f_T c 
= _mm256_or_ps(a, b); + return _mm256_castps_si256(c); +} + +static FINLINE v8i_T +v8i_and(const v8i_T v0, const v8i_T v1) +{ + const v8f_T a = _mm256_castsi256_ps(v0); + const v8f_T b = _mm256_castsi256_ps(v1); + const v8f_T c = _mm256_and_ps(a, b); + return _mm256_castps_si256(c); +} + +static FINLINE v8i_T +v8i_andnot(const v8i_T v0, const v8i_T v1) +{ + const v8f_T a = _mm256_castsi256_ps(v0); + const v8f_T b = _mm256_castsi256_ps(v1); + const v8f_T c = _mm256_andnot_ps(a, b); + return _mm256_castps_si256(c); +} + +static FINLINE v8i_T +v8i_xor(const v8i_T v0, const v8i_T v1) +{ + const v8f_T a = _mm256_castsi256_ps(v0); + const v8f_T b = _mm256_castsi256_ps(v1); + const v8f_T c = _mm256_xor_ps(a, b); + return _mm256_castps_si256(c); +} + +/******************************************************************************* + * Comparators + ******************************************************************************/ +static FINLINE v8i_T +v8i_eq(const v8i_T v0, const v8i_T v1) +{ + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_eq(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_eq(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); +} + +static FINLINE v8i_T +v8i_neq(const v8i_T v0, const v8i_T v1) +{ + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_neq(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_neq(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); +} + +static FINLINE v8i_T +v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond) +{ + const v8f_T a = _mm256_castsi256_ps(vfalse); + const v8f_T b = _mm256_castsi256_ps(vtrue); + const v8f_T c = _mm256_castsi256_ps(vcond); + return _mm256_castps_si256(_mm256_blendv_ps(a, b, c)); +} + +static FINLINE v8i_T +v8i_min(const v8i_T v0, const v8i_T v1) +{ + 
const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_min(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_min(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); +} + +static FINLINE v8i_T +v8i_max(const v8i_T v0, const v8i_T v1) +{ + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_max(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_max(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); +} + +static FINLINE int32_t +v8i_reduce_min_i32(const v8i_T v) +{ + const v4i_T tmp = v4i_min(v8i_abcd(v), v8i_efgh(v)); + return v4i_x(v4i_reduce_min(tmp)); +} + +static FINLINE int32_t +v8i_reduce_max_i32(const v8i_T v) +{ + const v4i_T tmp = v4i_max(v8i_abcd(v), v8i_efgh(v)); + return v4i_x(v4i_reduce_max(tmp)); +} + +#endif /* RSIMD_AVXI_H */ + diff --git a/src/math.h b/src/math.h @@ -0,0 +1,29 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_MATH_H +#define RSIMD_MATH_H + +#include <rsys/rsys.h> + +#ifdef SIMD_SSE2 + #include "math4.h" +#endif +#ifdef SIMD_AVX + #include "math8.h" +#endif + +#endif /* RSIMD_MATH_H */ + diff --git a/src/math4.h b/src/math4.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef RSIMD_MATH4_H +#define RSIMD_MATH4_H + +#define RSIMD_WIDTH__ 4 +#include "vXf_begin.h" +#include "mathX.h" +#include "vXf_end.h" + +/******************************************************************************* + * Miscellaneous + ******************************************************************************/ +static FINLINE v4f_T /* Cartesian (xyz) to spherical (r, theta, phi)*/ +v4f_xyz_to_rthetaphi(const v4f_T v) +{ + const v4f_T zero = v4f_zero(); + const v4f_T len2 = v4f_len2(v); + const v4f_T len3 = v4f_len3(v); + const v4f_T theta = v4f_sel + (v4f_acos(v4f_div(v4f_zzzz(v), len3)), zero, v4f_eq(len3, zero)); + const v4f_T tmp_phi = v4f_sel + (v4f_asin(v4f_div(v4f_yyyy(v), len2)), zero, v4f_eq(len2, zero)); + const v4f_T phi = v4f_sel + (v4f_sub(v4f_set1((float)PI), tmp_phi),tmp_phi, v4f_ge(v4f_xxxx(v), zero)); + return v4f_xyab(v4f_xayb(len3, theta), phi); +} +#endif /* RSIMD_MATH4_H */ diff --git a/src/math8.h b/src/math8.h @@ -0,0 +1,24 @@ +/* Copyright (C) 2014-2021 
Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef RSIMD_MATH8_H +#define RSIMD_MATH8_H + +#define RSIMD_WIDTH__ 8 +#include "vXf_begin.h" +#include "mathX.h" +#include "vXf_end.h" + +#endif /* RSIMD_MATH8_H */ diff --git a/src/mathX.h b/src/mathX.h @@ -0,0 +1,137 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include "rsimd.h" + +#ifdef COMPILER_GCC + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wignored-qualifiers" +#endif + +#include <sleef.h> + +#ifdef COMPILER_GCC + #pragma GCC diagnostic pop +#endif + +static FINLINE RSIMD_vXf_T__ +RSIMD_vXf__(copysign)(const RSIMD_vXf_T__ x, const RSIMD_vXf_T__ y) +{ + return RSIMD_Sleef__(copysignf)(x, y); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(floor)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef__(floorf)(x); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(pow)(const RSIMD_vXf_T__ x, const RSIMD_vXf_T__ y) +{ + return RSIMD_Sleef_ULP__(powf, u10)(x, y); +} + +/******************************************************************************* + * Exponential functions + ******************************************************************************/ +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(exp2)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(exp2f, u10)(x); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(exp)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(expf, u10)(x); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(exp10)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(exp10f, u10)(x); +} + +/******************************************************************************* + * Log functions + ******************************************************************************/ +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(log2)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(log2f, u10)(x); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(log)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(logf, u10)(x); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(log10)(const RSIMD_vXf_T__ x) +{ + return RSIMD_Sleef_ULP__(log10f, u10)(x); +} + +/******************************************************************************* + * Trigonometric functions + ******************************************************************************/ +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(sin)(const RSIMD_vXf_T__ 
v) +{ + return RSIMD_Sleef_ULP__(sinf, u10)(v); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(asin)(const RSIMD_vXf_T__ v) +{ + return RSIMD_Sleef_ULP__(asinf, u10)(v); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(cos)(const RSIMD_vXf_T__ v) +{ + return RSIMD_Sleef_ULP__(cosf, u10)(v); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(acos)(const RSIMD_vXf_T__ v) +{ + return RSIMD_Sleef_ULP__(acosf, u10)(v); +} + +static INLINE void +RSIMD_vXf__(sincos) + (const RSIMD_vXf_T__ v, RSIMD_vXf_T__* RESTRICT s, RSIMD_vXf_T__* RESTRICT c) +{ + const RSIMD_Sleef_vecf__(2) r = RSIMD_Sleef_ULP__(sincosf, u10)(v); + *s = r.x; + *c = r.y; +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(tan)(const RSIMD_vXf_T__ v) +{ + return RSIMD_Sleef_ULP__(tanf, u10)(v); +} + +static INLINE RSIMD_vXf_T__ +RSIMD_vXf__(atan)(const RSIMD_vXf_T__ v) +{ + return RSIMD_Sleef_ULP__(atanf, u10)(v); +} + + diff --git a/src/rsimd.h b/src/rsimd.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef RSIMD_H @@ -20,7 +20,7 @@ #if defined(RSIMD_SHARED_BUILD) #define RSIMD_API extern EXPORT_SYM -#elif defined(RSIMD_STATIC) +#elif defined(RSIMD_STATIC_BUILD) #define RSIMD_API extern LOCAL_SYM #else #define RSIMD_API extern IMPORT_SYM @@ -28,8 +28,9 @@ #ifdef SIMD_SSE2 #include "sse/sse.h" -#else - #error Unsupported_Platform +#endif +#ifdef SIMD_AVX + #include "avx/avx.h" #endif #endif /* RSIMD_H */ diff --git a/src/rsimd_version.h.in b/src/rsimd_version.h.in @@ -0,0 +1,23 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef RSIMD_VERSION_H +#define RSIMD_VERSION_H + +#define RSIMD_VERSION_MAJOR @VERSION_MAJOR@ +#define RSIMD_VERSION_MINOR @VERSION_MINOR@ +#define RSIMD_VERSION_PATCH @VERSION_PATCH@ + +#endif /* RSIMD_VERSION_H */ diff --git a/src/soa4f2.h b/src/soa4f2.h @@ -1,30 +1,22 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #ifndef SOA4F2_H #define SOA4F2_H -/* Generate the common soa4fX funcs */ -#define SOA4FX_DIMENSION__ 2 -#include "soa4fX.h" - -static FINLINE v4f_T -soa4f2_cross(const v4f_T a[2], const v4f_T b[2]) -{ - ASSERT(a && b); - return v4f_sub(v4f_mul(a[0], b[1]), v4f_mul(a[1], b[0])); -} +#define RSIMD_WIDTH__ 4 +#include "soaXf2.h" #endif /* SOA4F2_H */ diff --git a/src/soa4f3.h b/src/soa4f3.h @@ -1,34 +1,22 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef SOA4F3_H #define SOA4F3_H -/* Generate the common soa4fX functions */ -#define SOA4FX_DIMENSION__ 3 -#include "soa4fX.h" - -static FINLINE v4f_T* -soa4f3_cross(v4f_T dst[3], const v4f_T a[3], const v4f_T b[3]) -{ - v4f_T tmp[3]; - ASSERT(dst && a && b); - tmp[0] = v4f_sub(v4f_mul(a[1], b[2]), v4f_mul(a[2], b[1])); - tmp[1] = v4f_sub(v4f_mul(a[2], b[0]), v4f_mul(a[0], b[2])); - tmp[2] = v4f_sub(v4f_mul(a[0], b[1]), v4f_mul(a[1], b[0])); - return soa4f3_set__(dst, tmp); -} +#define RSIMD_WIDTH__ 4 +#include "soaXf3.h" #endif /* SOA4F3_H */ diff --git a/src/soa4f4.h b/src/soa4f4.h @@ -1,24 +1,27 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef SOA4F4_H #define SOA4F4_H -/* Generate the common soa4fX functions */ -#define SOA4FX_DIMENSION__ 4 -#include "soa4fX.h" +/* Generate the common soa4f4 functions */ +#define RSIMD_WIDTH__ 4 +#define RSIMD_SOA_DIMENSION__ 4 +#include "soaXfY_begin.h" +#include "soaXfY.h" +#include "soaXfY_end.h" #endif /* SOA4F4_H */ diff --git a/src/soa4fX.h b/src/soa4fX.h @@ -1,352 +0,0 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) - * - * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The RSIMD library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ - -/* - * Header used to generate funcs on SoA SIMD float vectors of X dimensions - */ -#if !defined(SOA4FX_DIMENSION__) - #error Missing arguments -#endif - -#if defined(SOA4FX_FUNC__) - #error Unexpected SOA4FX_FUNC__ macro defintion -#endif - -#include "rsimd.h" - -#ifdef COMPILER_GCC - #pragma GCC push_options - #pragma GCC optimize("unroll-loops") -#endif - -STATIC_ASSERT(SOA4FX_DIMENSION__ > 1, Unexpected_value); - -#define SOA4FX_FUNC__(Func) \ - CONCAT(CONCAT(CONCAT(soa4f, SOA4FX_DIMENSION__), _), Func) - -/* Helper macro */ -#define SIZEOF_SOA4FX__ sizeof(v4f_T[SOA4FX_DIMENSION__]) - -#if SOA4FX_DIMENSION__ <= 4 -static FINLINE v4f_T* -CONCAT(soa4f, SOA4FX_DIMENSION__) - (v4f_T* dst - ,const v4f_T x - ,const v4f_T y -#if SOA4FX_DIMENSION__ > 2 - ,const v4f_T z -#endif -#if SOA4FX_DIMENSION__ > 3 - ,const v4f_T w -#endif - ) -{ - ASSERT(dst); - dst[0] = x; - dst[1] = y; -#if SOA4FX_DIMENSION__ > 2 - dst[2] = z; -#endif -#if SOA4FX_DIMENSION__ > 3 - dst[3] = w; -#endif - return dst; -} -#endif - -static FINLINE v4f_T* -SOA4FX_FUNC__(splat)(v4f_T* dst, const v4f_T val) -{ - int i; - ASSERT(dst); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - dst[i] = val; - return dst; -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(set__)(v4f_T* dst, const v4f_T* src) -{ - int i; - ASSERT(dst && src); - ASSERT(!MEM_AREA_OVERLAP(dst, SIZEOF_SOA4FX__, src, SIZEOF_SOA4FX__)); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - dst[i] = src[i]; - return dst; -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(set)(v4f_T* dst, const v4f_T* src) -{ - ASSERT(dst && src); - if(!MEM_AREA_OVERLAP(dst, SIZEOF_SOA4FX__, src, SIZEOF_SOA4FX__)) { - return SOA4FX_FUNC__(set__)(dst, src); - } else { - v4f_T tmp[SOA4FX_DIMENSION__]; - return SOA4FX_FUNC__(set__)(dst, SOA4FX_FUNC__(set__)(tmp, src)); - } -} - -static FINLINE v4f_T -SOA4FX_FUNC__(dot)(const v4f_T* a, const v4f_T* b) -{ - v4f_T dot; - int i; - ASSERT(a && b); - dot = v4f_mul(a[0], b[0]); - FOR_EACH(i, 1, SOA4FX_DIMENSION__) { - dot = v4f_add(dot, 
v4f_mul(a[i], b[i])); - } - return dot; -} - -static FINLINE v4f_T -SOA4FX_FUNC__(len)(const v4f_T* a) -{ - ASSERT(a); - return v4f_sqrt(SOA4FX_FUNC__(dot)(a, a)); -} - -static FINLINE v4f_T -SOA4FX_FUNC__(normalize)(v4f_T* dst, const v4f_T* a) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - v4f_T sqr_len, rcp_len; - v4f_T mask; - int i; - ASSERT(dst && a); - - sqr_len = SOA4FX_FUNC__(dot)(a, a); - mask = v4f_neq(sqr_len, v4f_zero()); - rcp_len = v4f_rsqrt(sqr_len); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_and(mask, v4f_mul(a[i], rcp_len)); - SOA4FX_FUNC__(set__)(dst, tmp); - return v4f_mul(sqr_len, rcp_len); -} - -static FINLINE v4f_T -SOA4FX_FUNC__(is_normalized)(const v4f_T* a) -{ - return v4f_eq_eps(SOA4FX_FUNC__(len)(a), v4f_set1(1.f), v4f_set1(1.e-6f)); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(add)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a && b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_add(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(addf)(v4f_T* dst, const v4f_T* a, const v4f_T f) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_add(a[i], f); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(sub)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a && b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_sub(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(subf)(v4f_T* dst, const v4f_T* a, const v4f_T f) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_sub(a[i], f); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(mul)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a 
&& b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_mul(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(mulf)(v4f_T* dst, const v4f_T* a, const v4f_T f) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_mul(a[i], f); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(div)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a && b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_div(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(divf)(v4f_T* dst, const v4f_T* a, const v4f_T f) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_div(a[i], f); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(minus)(v4f_T* dst, const v4f_T* a) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_minus(a[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T -SOA4FX_FUNC__(sum)(const v4f_T* a) -{ - v4f_T f; - int i = 0; - ASSERT(a); - f = a[i]; - FOR_EACH(i, 1, SOA4FX_DIMENSION__) - f = v4f_add(f, a[i]); - return f; -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(lerp) - (v4f_T* dst, - const v4f_T* from, - const v4f_T* to, - const v4f_T t) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - v4f_T t_adjusted; - int i; - ASSERT(dst && from && to); - t_adjusted = v4f_min(v4f_max(t, v4f_zero()), v4f_set1(1.f)); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_add(from[i], v4f_mul(t_adjusted, v4f_sub(to[i], from[i]))); - SOA4FX_FUNC__(set__)(dst, tmp); - return dst; -} - -static FINLINE v4f_T -SOA4FX_FUNC__(eq)(const v4f_T* a, const v4f_T* b) -{ - v4f_T is_eq; - int i = 0; - ASSERT(a && b); - is_eq = v4f_eq(a[0], b[0]); - FOR_EACH(i, 1, SOA4FX_DIMENSION__) - is_eq = 
v4f_and(is_eq, v4f_eq(a[i], b[i])); - return is_eq; -} - -static FINLINE v4f_T -SOA4FX_FUNC__(eq_eps)(const v4f_T* a, const v4f_T* b, const v4f_T eps) -{ - v4f_T is_eq; - int i = 0; - ASSERT(a && b); - is_eq = v4f_eq_eps(a[0], b[0], eps); - FOR_EACH(i, 1, SOA4FX_DIMENSION__) - is_eq = v4f_and(is_eq, v4f_eq_eps(a[i], b[i], eps)); - return is_eq; -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(max)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a && b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_max(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(min)(v4f_T* dst, const v4f_T* a, const v4f_T* b) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && a && b); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_min(a[i], b[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(sel) - (v4f_T* dst, const v4f_T* vfalse, const v4f_T* vtrue, const v4f_T cond) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && vfalse && vtrue); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_sel(vfalse[i], vtrue[i], cond); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -static FINLINE v4f_T* -SOA4FX_FUNC__(selv) - (v4f_T* dst, const v4f_T* vfalse, const v4f_T* vtrue, const v4f_T* vcond) -{ - v4f_T tmp[SOA4FX_DIMENSION__]; - int i; - ASSERT(dst && vfalse && vtrue); - FOR_EACH(i, 0, SOA4FX_DIMENSION__) - tmp[i] = v4f_sel(vfalse[i], vtrue[i], vcond[i]); - return SOA4FX_FUNC__(set__)(dst, tmp); -} - -#undef SIZEOF_SOA4FX__ -#undef SOA4FX_DIMENSION__ -#undef SOA4FX_FUNC__ - -#ifdef COMPILER_GCC - #pragma GCC pop_options -#endif - diff --git a/src/soa8f2.h b/src/soa8f2.h @@ -0,0 +1,22 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either 
version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef SOA8F2_H +#define SOA8F2_H + +#define RSIMD_WIDTH__ 8 +#include "soaXf2.h" + +#endif /* SOA8F2_H */ diff --git a/src/soa8f3.h b/src/soa8f3.h @@ -0,0 +1,22 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef SOA8F3_H +#define SOA8F3_H + +#define RSIMD_WIDTH__ 8 +#include "soaXf3.h" + +#endif /* SOA8F3_H */ diff --git a/src/soa8f4.h b/src/soa8f4.h @@ -0,0 +1,27 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef SOA8F4_H +#define SOA8F4_H + +/* Generate the common soa8f4 functions */ +#define RSIMD_WIDTH__ 8 +#define RSIMD_SOA_DIMENSION__ 4 +#include "soaXfY_begin.h" +#include "soaXfY.h" +#include "soaXfY_end.h" + +#endif /* SOA8F4_H */ + diff --git a/src/soaXf2.h b/src/soaXf2.h @@ -0,0 +1,33 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_WIDTH__ + #error "Undefined RSIMD_WIDTH__ macro" +#endif + +#define RSIMD_SOA_DIMENSION__ 2 +#include "soaXfY_begin.h" +#include "soaXfY.h" + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(cross)(const RSIMD_vXf_T__ a[2], const RSIMD_vXf_T__ b[2]) +{ + ASSERT(a && b); + return RSIMD_vXf__(sub) + (RSIMD_vXf__(mul)(a[0], b[1]), + RSIMD_vXf__(mul)(a[1], b[0])); +} + +#include "soaXfY_end.h" diff --git a/src/soaXf3.h b/src/soaXf3.h @@ -0,0 +1,38 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_WIDTH__ + #error "Undefined RSIMD_WIDTH__ macro" +#endif + +#define RSIMD_SOA_DIMENSION__ 3 +#include "soaXfY_begin.h" +#include "soaXfY.h" + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(cross) + (RSIMD_vXf_T__ dst[3], + const RSIMD_vXf_T__ a[3], + const RSIMD_vXf_T__ b[3]) +{ + RSIMD_vXf_T__ tmp[3]; + ASSERT(dst && a && b); + tmp[0] = RSIMD_vXf__(sub)(RSIMD_vXf__(mul)(a[1], b[2]), RSIMD_vXf__(mul)(a[2], b[1])); + tmp[1] = RSIMD_vXf__(sub)(RSIMD_vXf__(mul)(a[2], b[0]), RSIMD_vXf__(mul)(a[0], b[2])); + tmp[2] = RSIMD_vXf__(sub)(RSIMD_vXf__(mul)(a[0], b[1]), RSIMD_vXf__(mul)(a[1], b[0])); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +#include "soaXfY_end.h" diff --git a/src/soaXfY.h b/src/soaXfY.h @@ -0,0 +1,356 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +/* + * Header used to generate funcs on SoA SIMD float vectors of Y dimensions + */ + +#ifndef SOAXFY_BEGIN_H + #error "The soaXfY_begin.h header must be included first" +#endif + +/* Force GCC to unroll the loops */ +#ifdef COMPILER_GCC + #pragma GCC push_options + #pragma GCC optimize("unroll-loops") +#endif + +#if RSIMD_SOA_DIMENSION__ <= 4 +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY_PREFIX__ + (RSIMD_vXf_T__* dst + ,const RSIMD_vXf_T__ x + ,const RSIMD_vXf_T__ y +#if RSIMD_SOA_DIMENSION__ > 2 + ,const RSIMD_vXf_T__ z +#endif +#if RSIMD_SOA_DIMENSION__ > 3 + ,const RSIMD_vXf_T__ w +#endif + ) +{ + ASSERT(dst); + dst[0] = x; + dst[1] = y; +#if RSIMD_SOA_DIMENSION__ > 2 + dst[2] = z; +#endif +#if RSIMD_SOA_DIMENSION__ > 3 + dst[3] = w; +#endif + return dst; +} +#endif + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(splat)(RSIMD_vXf_T__* dst, const RSIMD_vXf_T__ val) +{ + int i; + ASSERT(dst); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + dst[i] = val; + return dst; +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(set__)(RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* src) +{ + int i; + ASSERT(dst && src); + ASSERT(!MEM_AREA_OVERLAP(dst, SIZEOF_RSIMD_soaXfY__, src, SIZEOF_RSIMD_soaXfY__)); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + dst[i] = src[i]; + return dst; +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(set)(RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* src) +{ + ASSERT(dst && src); + if(!MEM_AREA_OVERLAP(dst, SIZEOF_RSIMD_soaXfY__, src, SIZEOF_RSIMD_soaXfY__)) { + return RSIMD_soaXfY__(set__)(dst, src); + } else { + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + return RSIMD_soaXfY__(set__)(dst, RSIMD_soaXfY__(set__)(tmp, src)); + } +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(dot)(const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ dot; + int i; + ASSERT(a && b); + dot = RSIMD_vXf__(mul)(a[0], b[0]); + FOR_EACH(i, 1, RSIMD_SOA_DIMENSION__) { + dot = RSIMD_vXf__(madd)(a[i], b[i], dot); + } + return dot; +} + +static FINLINE RSIMD_vXf_T__ 
+RSIMD_soaXfY__(len)(const RSIMD_vXf_T__* a) +{ + ASSERT(a); + return RSIMD_vXf__(sqrt)(RSIMD_soaXfY__(dot)(a, a)); +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(normalize)(RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + RSIMD_vXf_T__ sqr_len, rcp_len; + RSIMD_vXf_T__ mask; + int i; + ASSERT(dst && a); + + sqr_len = RSIMD_soaXfY__(dot)(a, a); + mask = RSIMD_vXf__(neq)(sqr_len, RSIMD_vXf__(zero)()); + rcp_len = RSIMD_vXf__(rsqrt)(sqr_len); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(and)(mask, RSIMD_vXf__(mul)(a[i], rcp_len)); + RSIMD_soaXfY__(set__)(dst, tmp); + return RSIMD_vXf__(mul)(sqr_len, rcp_len); +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(is_normalized)(const RSIMD_vXf_T__* a) +{ + return RSIMD_vXf__(eq_eps) + (RSIMD_soaXfY__(len)(a), + RSIMD_vXf__(set1)(1.f), + RSIMD_vXf__(set1)(1.e-6f)); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(add) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(add)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(addf) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__ f) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(add)(a[i], f); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(sub) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(sub)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(subf) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__ 
f) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(sub)(a[i], f); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(mul) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(mul)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(mulf) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__ f) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(mul)(a[i], f); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(div) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(div)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(divf) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__ f) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(div)(a[i], f); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(minus)(RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(minus)(a[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(sum)(const RSIMD_vXf_T__* a) +{ + RSIMD_vXf_T__ f; + int i = 0; + ASSERT(a); + f = a[i]; + FOR_EACH(i, 1, RSIMD_SOA_DIMENSION__) + f = 
RSIMD_vXf__(add)(f, a[i]); + return f; +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(lerp) + (RSIMD_vXf_T__* dst, + const RSIMD_vXf_T__* from, + const RSIMD_vXf_T__* to, + const RSIMD_vXf_T__ t) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && from && to); + + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(lerp)(from[i], to[i], t); + RSIMD_soaXfY__(set__)(dst, tmp); + return dst; +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(eq)(const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ is_eq; + int i = 0; + ASSERT(a && b); + is_eq = RSIMD_vXf__(eq)(a[0], b[0]); + FOR_EACH(i, 1, RSIMD_SOA_DIMENSION__) + is_eq = RSIMD_vXf__(and)(is_eq, RSIMD_vXf__(eq)(a[i], b[i])); + return is_eq; +} + +static FINLINE RSIMD_vXf_T__ +RSIMD_soaXfY__(eq_eps) + (const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b, const RSIMD_vXf_T__ eps) +{ + RSIMD_vXf_T__ is_eq; + int i = 0; + ASSERT(a && b); + is_eq = RSIMD_vXf__(eq_eps)(a[0], b[0], eps); + FOR_EACH(i, 1, RSIMD_SOA_DIMENSION__) + is_eq = RSIMD_vXf__(and)(is_eq, RSIMD_vXf__(eq_eps)(a[i], b[i], eps)); + return is_eq; +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(max) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(max)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(min) + (RSIMD_vXf_T__* dst, const RSIMD_vXf_T__* a, const RSIMD_vXf_T__* b) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && a && b); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(min)(a[i], b[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(sel) + (RSIMD_vXf_T__* dst, + const RSIMD_vXf_T__* vfalse, + const RSIMD_vXf_T__* vtrue, + const RSIMD_vXf_T__ cond) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int 
i; + ASSERT(dst && vfalse && vtrue); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(sel)(vfalse[i], vtrue[i], cond); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +static FINLINE RSIMD_vXf_T__* +RSIMD_soaXfY__(selv) + (RSIMD_vXf_T__* dst, + const RSIMD_vXf_T__* vfalse, + const RSIMD_vXf_T__* vtrue, + const RSIMD_vXf_T__* vcond) +{ + RSIMD_vXf_T__ tmp[RSIMD_SOA_DIMENSION__]; + int i; + ASSERT(dst && vfalse && vtrue); + FOR_EACH(i, 0, RSIMD_SOA_DIMENSION__) + tmp[i] = RSIMD_vXf__(sel)(vfalse[i], vtrue[i], vcond[i]); + return RSIMD_soaXfY__(set__)(dst, tmp); +} + +/* Restore compilation parameters */ +#ifdef COMPILER_GCC + #pragma GCC pop_options +#endif + diff --git a/src/soaXfY_begin.h b/src/soaXfY_begin.h @@ -0,0 +1,51 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include "rsimd.h" +#include "vXf_begin.h" + +/* This file can be included once */ +#ifdef SOAXFY_BEGIN_H +  #error "The soaXfY_begin.h header is already included" +#endif +#define SOAXFY_BEGIN_H + +/* Check parameters */ +#if !defined(RSIMD_SOA_DIMENSION__) +  #error "Undefined RSIMD_SOA_DIMENSION__ macro" +#endif +#if !defined(RSIMD_WIDTH__) +  #error "Undefined RSIMD_WIDTH__ macro" +#endif +#if RSIMD_SOA_DIMENSION__ < 1 || RSIMD_SOA_DIMENSION__ > 4 +  #error "Unexpected RSIMD_SOA_DIMENSION__ value" +#endif +#if RSIMD_WIDTH__ != 4 && RSIMD_WIDTH__ != 8 +  #error "Unexpected RSIMD_WIDTH__ value of "STR(RSIMD_WIDTH__) +#endif + +/* Check that internal macros are not already defined */ +#if defined(RSIMD_soaXfY_PREFIX__) \ +  || defined(RSIMD_soaXfY__) \ +  || defined(SIZEOF_RSIMD_soaXfY__) +  #error "Unexpected macro definition" +#endif + +/* Macros generic to RSIMD_WIDTH__ and RSIMD_SOA_DIMENSION__ */ +#define RSIMD_soaXfY_PREFIX__ \ +  CONCAT(CONCAT(CONCAT(soa, RSIMD_WIDTH__), f), RSIMD_SOA_DIMENSION__) +#define RSIMD_soaXfY__(Func) CONCAT(CONCAT(RSIMD_soaXfY_PREFIX__, _), Func) +#define SIZEOF_RSIMD_soaXfY__ sizeof(RSIMD_vXf_T__[RSIMD_SOA_DIMENSION__]) + diff --git a/src/soaXfY_end.h b/src/soaXfY_end.h @@ -0,0 +1,31 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef SOAXFY_BEGIN_H + #error "The soaXfY_begin.h file must be included" +#endif + +/* Undef helper macros */ +#undef RSIMD_soaXfY_PREFIX__ +#undef RSIMD_soaXfY__ +#undef SIZEOF_RSIMD_soaXfY__ + +/* Undef parameters */ +#undef RSIMD_SOA_DIMENSION__ +#undef RSIMD_WIDTH__ + +#undef SOAXFY_BEGIN_H + +#include "vXf_end.h" diff --git a/src/sse/sse.h b/src/sse/sse.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #ifndef RSIMD_SSE_H diff --git a/src/sse/sse_swz.h b/src/sse/sse_swz.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #ifndef RSIMD_SSE_SWZ_H diff --git a/src/sse/ssef.c b/src/sse/ssef.c @@ -1,150 +0,0 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) - * - * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The RSIMD library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ - -#include "../rsimd.h" - -#define KC0 v4f_set1(0.63661977236f) -#define KC1 v4f_set1(1.57079625129f) -#define KC2 v4f_set1(7.54978995489e-8f) -#define CC0 v4f_set1(-0.0013602249f) -#define CC1 v4f_set1(0.0416566950f) -#define CC2 v4f_set1(-0.4999990225f) -#define SC0 v4f_set1(-0.0001950727f) -#define SC1 v4f_set1(0.0083320758f) -#define SC2 v4f_set1(-0.1666665247f) -#define ONE v4f_set1(1.f) - -v4f_T -v4f_sin(const v4f_T v) -{ - const v4i_T zeroi = v4i_zero(); - const v4i_T onei = v4i_set1(1); - const v4i_T twoi = v4i_set1(2); - const v4i_T threei = v4i_set1(3); - - const v4f_T x = v4f_mul(v, KC0); - const v4i_T q = v4f_to_v4i(x); - const v4i_T off = v4i_and(q, threei); - const v4f_T qf = v4i_to_v4f(q); - - const v4f_T tmp = v4f_sub(v, v4f_mul(qf, KC1)); - const v4f_T xl = v4f_sub(tmp, v4f_mul(qf, KC2)); - const v4f_T xl2 = v4f_mul(xl, xl); - const v4f_T xl3 = v4f_mul(xl2, xl); - - const v4f_T cx = - v4f_madd(v4f_madd(v4f_madd(CC0, xl2, CC1), xl2, CC2), xl2, ONE); - const v4f_T sx = - v4f_madd(v4f_madd(v4f_madd(SC0, xl2, SC1), xl2, SC2), xl3, xl); - - const v4f_T mask0 = (v4f_T) v4i_eq(v4i_and(off, onei), zeroi); - const v4f_T mask1 = (v4f_T) v4i_eq(v4i_and(off, twoi), zeroi); - const v4f_T res = v4f_sel(cx, sx, mask0); - return v4f_sel(v4f_minus(res), res, mask1); -} - -v4f_T -v4f_cos(const v4f_T v) -{ - const v4i_T zeroi = v4i_zero(); - const v4i_T onei = v4i_set1(1); - const v4i_T twoi = v4i_set1(2); - const v4i_T threei = v4i_set1(3); - - const v4f_T x = v4f_mul(v, KC0); - const v4i_T q = v4f_to_v4i(x); - const v4i_T off = v4i_add(v4i_and(q, threei), onei); - const v4f_T qf = v4i_to_v4f(q); - - const v4f_T tmp = v4f_sub(v, v4f_mul(qf, KC1)); - const v4f_T xl = v4f_sub(tmp, v4f_mul(qf, KC2)); - const v4f_T xl2 = v4f_mul(xl, xl); - const v4f_T xl3 = v4f_mul(xl2, xl); - - const v4f_T cx = - v4f_madd(v4f_madd(v4f_madd(CC0, xl2, CC1), xl2, CC2), xl2, ONE); - const v4f_T sx = - v4f_madd(v4f_madd(v4f_madd(SC0, xl2, SC1), xl2, SC2), xl3, xl); - - const v4f_T mask0 
= (v4f_T) v4i_eq(v4i_and(off, onei), zeroi); - const v4f_T mask1 = (v4f_T) v4i_eq(v4i_and(off, twoi), zeroi); - const v4f_T res = v4f_sel(cx, sx, mask0); - return v4f_sel(v4f_minus(res), res, mask1); -} - -void -v4f_sincos(const v4f_T v, v4f_T* RESTRICT s, v4f_T* RESTRICT c) -{ - const v4i_T zeroi = v4i_zero(); - const v4i_T onei = v4i_set1(1); - const v4i_T twoi = v4i_set1(2); - const v4i_T threei = v4i_set1(3); - - const v4f_T x = v4f_mul(v, KC0); - const v4i_T q = v4f_to_v4i(x); - const v4i_T soff = v4i_and(q, threei); - const v4i_T coff = v4i_add(v4i_and(q, threei), onei); - const v4f_T qf = v4i_to_v4f(q); - - const v4f_T tmp = v4f_sub(v, v4f_mul(qf, KC1)); - const v4f_T xl = v4f_sub(tmp, v4f_mul(qf, KC2)); - const v4f_T xl2 = v4f_mul(xl, xl); - const v4f_T xl3 = v4f_mul(xl2, xl); - - const v4f_T cx = - v4f_madd(v4f_madd(v4f_madd(CC0, xl2, CC1), xl2, CC2), xl2, ONE); - const v4f_T sx = - v4f_madd(v4f_madd(v4f_madd(SC0, xl2, SC1), xl2, SC2), xl3, xl); - - const v4f_T smask0 = (v4f_T) v4i_eq(v4i_and(soff, onei), zeroi); - const v4f_T smask1 = (v4f_T) v4i_eq(v4i_and(soff, twoi), zeroi); - const v4f_T sres = v4f_sel(cx, sx, smask0); - - const v4f_T cmask0 = (v4f_T) v4i_eq(v4i_and(coff, onei), zeroi); - const v4f_T cmask1 = (v4f_T) v4i_eq(v4i_and(coff, twoi), zeroi); - const v4f_T cres = v4f_sel(cx, sx, cmask0); - - *s = v4f_sel(v4f_minus(sres), sres, smask1); - *c = v4f_sel(v4f_minus(cres), cres, cmask1); -} - -v4f_T -v4f_acos(const v4f_T v) -{ - const v4f_T absv = v4f_abs(v); - const v4f_T t0 = v4f_sqrt(v4f_sub(v4f_set1(1.f), absv)); - const v4f_T absv2 =v4f_mul(absv, absv); - const v4f_T absv4 = v4f_mul(absv2, absv2); - - const v4f_T h0 = v4f_set1(-0.0012624911f); - const v4f_T h1 = v4f_set1(0.0066700901f); - const v4f_T h2 = v4f_set1(-0.0170881256f); - const v4f_T h3 = v4f_set1(0.0308918810f); - const v4f_T hi = - v4f_madd(v4f_madd(v4f_madd(h0, absv, h1), absv, h2), absv, h3); - - const v4f_T l0 = v4f_set1(-0.0501743046f); - const v4f_T l1 = 
v4f_set1(0.0889789874f); - const v4f_T l2 = v4f_set1(-0.2145988016f); - const v4f_T l3 = v4f_set1((float)(PI*0.5)); - const v4f_T lo = - v4f_madd(v4f_madd(v4f_madd(l0, absv, l1), absv, l2), absv, l3); - - const v4f_T res = v4f_mul(v4f_madd(hi, absv4, lo), t0); - const v4f_T mask = v4f_lt(v, v4f_zero()); - - return v4f_sel(res, v4f_set1((float)PI) - res, mask); -} - diff --git a/src/sse/ssef.h b/src/sse/ssef.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef RSIMD_SSEF_H @@ -28,6 +28,9 @@ #ifdef SIMD_SSE4_1 #include <smmintrin.h> #endif +#ifdef FMADD + #include <immintrin.h> +#endif typedef __m128 v4f_T; #define V4F_AT__(Vec, Id) __builtin_ia32_vec_ext_v4sf(Vec, Id) @@ -69,7 +72,7 @@ v4f_loadu3(const float src[3]) } static FINLINE v4f_T -v4f_set1(float x) +v4f_set1(const float x) { return _mm_set1_ps(x); } @@ -315,7 +318,11 @@ v4f_div(const v4f_T v0, const v4f_T v1) static FINLINE v4f_T v4f_madd(const v4f_T v0, const v4f_T v1, const v4f_T v2) { +#ifdef FMADD + return _mm_fmadd_ps(v0, v1, v2); +#else return _mm_add_ps(_mm_mul_ps(v0, v1), v2); +#endif } static FINLINE v4f_T @@ -473,35 +480,6 @@ v4f_normalize3(const v4f_T v) } /******************************************************************************* - * Trigonometric operations - ******************************************************************************/ -RSIMD_API v4f_T v4f_sin(const v4f_T v); -RSIMD_API v4f_T v4f_cos(const v4f_T v); -RSIMD_API v4f_T v4f_acos(const v4f_T v); -RSIMD_API void v4f_sincos(const v4f_T v, v4f_T* RESTRICT s, v4f_T* RESTRICT c); - -static FINLINE v4f_T -v4f_tan(const v4f_T v) -{ - v4f_T s, c; - v4f_sincos(v, &s, &c); - return v4f_div(s, c); -} - -static FINLINE v4f_T -v4f_asin(const v4f_T v) -{ - return v4f_sub(v4f_set1((float)(PI*0.5)), v4f_acos(v)); -} - -static FINLINE v4f_T -v4f_atan(v4f_T v) -{ - const v4f_T tmp = v4f_rsqrt(v4f_madd(v, v, v4f_set1(1.f))); - return v4f_asin(v4f_mul(v, tmp)); -} - -/******************************************************************************* * Comparators ******************************************************************************/ static FINLINE v4f_T @@ -578,24 +556,5 @@ v4f_clamp(const v4f_T v, const v4f_T vmin, const v4f_T vmax) return v4f_min(v4f_max(v, vmin), vmax); } -/******************************************************************************* - * Miscellaneous - ******************************************************************************/ -static FINLINE v4f_T /* 
Cartesian (xyz) to spherical (r, theta, phi)*/ -v4f_xyz_to_rthetaphi(const v4f_T v) -{ - const v4f_T zero = v4f_zero(); - const v4f_T len2 = v4f_len2(v); - const v4f_T len3 = v4f_len3(v); - const v4f_T theta = v4f_sel - (v4f_acos(v4f_div(v4f_zzzz(v), len3)), zero, v4f_eq(len3, zero)); - const v4f_T tmp_phi = v4f_sel - (v4f_asin(v4f_div(v4f_yyyy(v), len2)), zero, v4f_eq(len2, zero)); - const v4f_T phi = v4f_sel - (v4f_sub(v4f_set1((float)PI), tmp_phi), tmp_phi, v4f_ge(v4f_xxxx(v), zero)); - - return v4f_xyab(v4f_xayb(len3, theta), phi); -} - #endif /* RSIMD_SSEF_H */ diff --git a/src/sse/ssei.h b/src/sse/ssei.h @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef RSIMD_SSEI_H @@ -155,6 +155,12 @@ v4i_sub(const v4i_T v0, const v4i_T v1) return _mm_sub_epi32(v0, v1); } +static FINLINE v4i_T +v4i_minus(const v4i_T v) +{ + return v4i_add(v4i_not(v), v4i_set1(1)); +} + /******************************************************************************* * Comparators ******************************************************************************/ @@ -204,5 +210,79 @@ v4i_sel(const v4i_T vfalse, const v4i_T vtrue, const v4i_T vcond) #endif } +static FINLINE v4i_T +v4i_min(const v4i_T v0, const v4i_T v1) +{ +#ifdef SIMD_SSE4_1 + return _mm_min_epi32(v0, v1); +#else + ALIGN(16) int32_t a[4]; + ALIGN(16) int32_t b[4]; + v4i_store(a, v0); + v4i_store(b, v1); + return v4i_set + (MMIN(a[0], b[0]), + MMIN(a[1], b[1]), + MMIN(a[2], b[2]), + MMIN(a[3], b[3])); +#endif +} + +static FINLINE v4i_T +v4i_max(const v4i_T v0, const v4i_T v1) +{ +#ifdef SIMD_SSE4_1 + return _mm_max_epi32(v0, v1); +#else + ALIGN(16) int32_t a[4]; + ALIGN(16) int32_t b[4]; + v4i_store(a, v0); + v4i_store(b, v1); + return v4i_set + (MMAX(a[0], b[0]), + MMAX(a[1], b[1]), + MMAX(a[2], b[2]), + MMAX(a[3], b[3])); +#endif +} + +static FINLINE v4i_T +v4i_reduce_min(const v4i_T v) +{ +#ifdef SIMD_SSE4_1 + const v4i_T tmp = v4i_min(v4i_yxwz(v), v); + return v4i_min(v4i_zwxy(tmp), tmp); +#else + ALIGN(16) int32_t a[4]; + v4i_store(a, v); + return v4i_set1(MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]))); +#endif +} + +static FINLINE v4i_T +v4i_reduce_max(const v4i_T v) +{ +#ifdef SIMD_SSE4_1 + const v4i_T tmp = v4i_max(v4i_yxwz(v), v); + return v4i_max(v4i_zwxy(tmp), tmp); +#else + ALIGN(16) int32_t a[4]; + v4i_store(a, v); + return v4i_set1(MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]))); +#endif +} + +static FINLINE int32_t +v4i_reduce_min_i32(const v4i_T v) +{ + return v4i_x(v4i_reduce_min(v)); +} + +static FINLINE int32_t +v4i_reduce_max_i32(const v4i_T v) +{ + return v4i_x(v4i_reduce_max(v)); +} + #endif /* RSIMD_SSEI_H */ diff --git a/src/test_aosf33.c b/src/test_aosf33.c @@ 
-1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #include "aosf33.h" @@ -22,7 +22,7 @@ b[0] = (A); b[1] = (B); b[2] = (C); \ b[3] = (D); b[4] = (E); b[5] = (F); \ b[6] = (G); b[7] = (H); b[8] = (I); \ - CHECK(f33_eq_eps(aosf33_store(a, (M)), b, Eps), 1); \ + CHK(f33_eq_eps(aosf33_store(a, (M)), b, Eps) == 1); \ } (void)0 #define AOSF33_EQ(M, A, B, C, D, E, F, G, H, I) \ AOSF33_EQ_EPS(M, A, B, C, D, E, F, G, H, I, 0.f) @@ -34,100 +34,100 @@ main(int argc, char** argv) v4f_T m[3], n[3], o[3], v; (void)argc, (void)argv; - CHECK(aosf33_set(m, + CHK(aosf33_set(m, v4f_set(0.f, 1.f, 2.f, 0.f), v4f_set(3.f, 4.f, 5.f, 0.f), - v4f_set(6.f, 7.f, 8.f, 0.f)), m); - CHECK(aosf33_store(tmp, m), tmp); - CHECK(tmp[0], 0.f); - CHECK(tmp[1], 1.f); - CHECK(tmp[2], 2.f); - CHECK(tmp[3], 3.f); - CHECK(tmp[4], 4.f); - CHECK(tmp[5], 5.f); - CHECK(tmp[6], 6.f); - CHECK(tmp[7], 7.f); - CHECK(tmp[8], 8.f); + v4f_set(6.f, 7.f, 8.f, 0.f)) == m); + CHK(aosf33_store(tmp, m) == tmp); + CHK(tmp[0] == 0.f); + CHK(tmp[1] == 1.f); + CHK(tmp[2] == 2.f); + CHK(tmp[3] 
== 3.f); + CHK(tmp[4] == 4.f); + CHK(tmp[5] == 5.f); + CHK(tmp[6] == 6.f); + CHK(tmp[7] == 7.f); + CHK(tmp[8] == 8.f); AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); - CHECK(aosf33_identity(m), m); + CHK(aosf33_identity(m) == m); AOSF33_EQ(m, 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f); - CHECK(aosf33_zero(m), m); + CHK(aosf33_zero(m) == m); AOSF33_EQ(m, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); f3(tmp+0, -1.f, -2.f, -3.f); f3(tmp+3, -4.f, -5.f, -6.f); f3(tmp+6, -7.f, -8.f, -9.f); - CHECK(aosf33_load(m, tmp), m); + CHK(aosf33_load(m, tmp) == m); AOSF33_EQ(m, -1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, -9.f); - CHECK(aosf33_zero(m), m); - CHECK(aosf33_set_row0(m, v4f_set(0.f, 1.f, 2.f, 9.f)), m); + CHK(aosf33_zero(m) == m); + CHK(aosf33_set_row0(m, v4f_set(0.f, 1.f, 2.f, 9.f)) == m); AOSF33_EQ(m, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 2.f, 0.f, 0.f); - CHECK(aosf33_set_row1(m, v4f_set(3.f, 4.f, 5.f, 10.f)), m); + CHK(aosf33_set_row1(m, v4f_set(3.f, 4.f, 5.f, 10.f)) == m); AOSF33_EQ(m, 0.f, 3.f, 0.f, 1.f, 4.f, 0.f, 2.f, 5.f, 0.f); - CHECK(aosf33_set_row2(m, v4f_set(6.f, 7.f, 8.f, 11.f)), m); + CHK(aosf33_set_row2(m, v4f_set(6.f, 7.f, 8.f, 11.f)) == m); AOSF33_EQ(m, 0.f, 3.f, 6.f, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f); - CHECK(aosf33_zero(m), m); - CHECK(aosf33_set_row(m, v4f_set(0.f, 1.f, 2.f, 9.f), 0), m); + CHK(aosf33_zero(m) == m); + CHK(aosf33_set_row(m, v4f_set(0.f, 1.f, 2.f, 9.f), 0) == m); AOSF33_EQ(m, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 2.f, 0.f, 0.f); - CHECK(aosf33_set_row(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1), m); + CHK(aosf33_set_row(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1) == m); AOSF33_EQ(m, 0.f, 3.f, 0.f, 1.f, 4.f, 0.f, 2.f, 5.f, 0.f); - CHECK(aosf33_set_row(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2), m); + CHK(aosf33_set_row(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2) == m); AOSF33_EQ(m, 0.f, 3.f, 6.f, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f); - CHECK(aosf33_zero(m), m); - CHECK(aosf33_set_col(m, v4f_set(0.f, 1.f, 2.f, 9.f), 0), m); + CHK(aosf33_zero(m) == m); + CHK(aosf33_set_col(m, 
v4f_set(0.f, 1.f, 2.f, 9.f), 0) == m); AOSF33_EQ(m, 0.f, 1.f, 2.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - CHECK(aosf33_set_col(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1), m); + CHK(aosf33_set_col(m, v4f_set(3.f, 4.f, 5.f, 10.f), 1) == m); AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 0.f, 0.f, 0.f); - CHECK(aosf33_set_col(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2), m); + CHK(aosf33_set_col(m, v4f_set(6.f, 7.f, 8.f, 11.f), 2) == m); AOSF33_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); v = aosf33_row0(m); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 3.f); - CHECK(v4f_z(v), 6.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 3.f); + CHK(v4f_z(v) == 6.f); v = aosf33_row1(m); - CHECK(v4f_x(v), 1.f); - CHECK(v4f_y(v), 4.f); - CHECK(v4f_z(v), 7.f); + CHK(v4f_x(v) == 1.f); + CHK(v4f_y(v) == 4.f); + CHK(v4f_z(v) == 7.f); v = aosf33_row2(m); - CHECK(v4f_x(v), 2.f); - CHECK(v4f_y(v), 5.f); - CHECK(v4f_z(v), 8.f); + CHK(v4f_x(v) == 2.f); + CHK(v4f_y(v) == 5.f); + CHK(v4f_z(v) == 8.f); v = aosf33_row(m, 0); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 3.f); - CHECK(v4f_z(v), 6.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 3.f); + CHK(v4f_z(v) == 6.f); v = aosf33_row(m, 1); - CHECK(v4f_x(v), 1.f); - CHECK(v4f_y(v), 4.f); - CHECK(v4f_z(v), 7.f); + CHK(v4f_x(v) == 1.f); + CHK(v4f_y(v) == 4.f); + CHK(v4f_z(v) == 7.f); v = aosf33_row(m, 2); - CHECK(v4f_x(v), 2.f); - CHECK(v4f_y(v), 5.f); - CHECK(v4f_z(v), 8.f); + CHK(v4f_x(v) == 2.f); + CHK(v4f_y(v) == 5.f); + CHK(v4f_z(v) == 8.f); v = aosf33_col(m, 0); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 1.f); - CHECK(v4f_z(v), 2.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 1.f); + CHK(v4f_z(v) == 2.f); v = aosf33_col(m, 1); - CHECK(v4f_x(v), 3.f); - CHECK(v4f_y(v), 4.f); - CHECK(v4f_z(v), 5.f); + CHK(v4f_x(v) == 3.f); + CHK(v4f_y(v) == 4.f); + CHK(v4f_z(v) == 5.f); v = aosf33_col(m, 2); - CHECK(v4f_x(v), 6.f); - CHECK(v4f_y(v), 7.f); - CHECK(v4f_z(v), 8.f); + CHK(v4f_x(v) == 6.f); + CHK(v4f_y(v) == 7.f); + CHK(v4f_z(v) == 8.f); aosf33_set(m, v4f_set(0.f, 1.f, 2.f, 
0.f), @@ -137,19 +137,19 @@ main(int argc, char** argv) v4f_set(1.f, 2.f, 3.f, 0.f), v4f_set(4.f, 5.f, 6.f, 0.f), v4f_set(7.f, 8.f, 9.f, 0.f)); - CHECK(aosf33_add(o, m, n), o); + CHK(aosf33_add(o, m, n) == o); AOSF33_EQ(o, 1.f, 3.f, 5.f, 7.f, 9.f, 11.f, 13.f, 15.f, 17.f); - CHECK(aosf33_sub(o, o, n), o); + CHK(aosf33_sub(o, o, n) == o); AOSF33_EQ(o, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); aosf33_set(m, v4f_set(1.f, 2.f, -3.f, 0.f), v4f_set(-4.f, -5.f, 6.f, 0.f), v4f_set(7.f, -8.f, 9.f, 0.f)); - CHECK(aosf33_minus(m, m), m); + CHK(aosf33_minus(m, m) == m); AOSF33_EQ(m, -1.f, -2.f, 3.f, 4.f, 5.f, -6.f, -7.f, 8.f, -9.f); - CHECK(aosf33_mul(o, m, v4f_set1(2.f)), o); + CHK(aosf33_mul(o, m, v4f_set1(2.f)) == o); AOSF33_EQ(o, -2.f, -4.f, 6.f, 8.f, 10.f, -12.f, -14.f, 16.f, -18.f); aosf33_set(m, @@ -157,21 +157,21 @@ main(int argc, char** argv) v4f_set(4.f, 5.f, 6.f, 0.f), v4f_set(7.f, 8.f, 9.f, 0.f)); v = aosf33_mulf3(m, v4f_set(1.f, 2.f, 3.f, 0.f)); - CHECK(v4f_x(v), 30.f); - CHECK(v4f_y(v), 36.f); - CHECK(v4f_z(v), 42.f); + CHK(v4f_x(v) == 30.f); + CHK(v4f_y(v) == 36.f); + CHK(v4f_z(v) == 42.f); v = aosf3_mulf33(v4f_set(1.f, 2.f, 3.f, 0.f), m); - CHECK(v4f_x(v), 14.f); - CHECK(v4f_y(v), 32.f); - CHECK(v4f_z(v), 50.f); + CHK(v4f_x(v) == 14.f); + CHK(v4f_y(v) == 32.f); + CHK(v4f_z(v) == 50.f); aosf33_set(n, v4f_set(2.f, 9.f, 8.f, 0.f), v4f_set(1.f, -2.f, 2.f, 0.f), v4f_set(1.f, -8.f, -4.f, 0.f)); - CHECK(aosf33_mulf33(o, m, n), o); + CHK(aosf33_mulf33(o, m, n) == o); AOSF33_EQ(o, 94.f, 113.f, 132.f, 7.f, 8.f, 9.f, -59.f, -70.f, -81.f); - CHECK(aosf33_transpose(o, m), o); + CHK(aosf33_transpose(o, m) == o); AOSF33_EQ(o, 1.f, 4.f, 7.f, 2.f, 5.f, 8.f, 3.f, 6.f, 9.f); aosf33_set(m, @@ -179,24 +179,24 @@ main(int argc, char** argv) v4f_set(4.f, 5.f, 6.f, 0.f), v4f_set(3.f, -4.f, 9.f, 0.f)); v = aosf33_det(m); - CHECK(v4f_x(v), -60.f); - CHECK(v4f_y(v), -60.f); - CHECK(v4f_z(v), -60.f); - CHECK(v4f_w(v), -60.f); + CHK(v4f_x(v) == -60.f); + CHK(v4f_y(v) == -60.f); + 
CHK(v4f_z(v) == -60.f); + CHK(v4f_w(v) == -60.f); v = aosf33_inverse(n, m); - CHECK(v4f_x(v), -60.f); - CHECK(v4f_y(v), -60.f); - CHECK(v4f_z(v), -60.f); - CHECK(v4f_w(v), -60.f); + CHK(v4f_x(v) == -60.f); + CHK(v4f_y(v) == -60.f); + CHK(v4f_z(v) == -60.f); + CHK(v4f_w(v) == -60.f); aosf33_mulf33(o, m, n); AOSF33_EQ_EPS(o, 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f, 1.e-6f); v = aosf33_invtrans(o, m); - CHECK(v4f_x(v), -60.f); - CHECK(v4f_y(v), -60.f); - CHECK(v4f_z(v), -60.f); - CHECK(v4f_w(v), -60.f); + CHK(v4f_x(v) == -60.f); + CHK(v4f_y(v) == -60.f); + CHK(v4f_z(v) == -60.f); + CHK(v4f_w(v) == -60.f); AOSF33_EQ(o, v4f_x(n[0]), v4f_x(n[1]), v4f_x(n[2]), v4f_y(n[0]), v4f_y(n[1]), v4f_y(n[2]), diff --git a/src/test_aosf44.c b/src/test_aosf44.c @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #include "aosf44.h" @@ -23,7 +23,7 @@ b[4] = (E); b[5] = (F); b[6] = (G); b[7] = (H); \ b[8] = (I); b[9] = (J); b[10]= (K); b[11]= (L); \ b[12]= (M); b[13]= (N); b[14]= (O); b[15]= (P); \ - CHECK(f44_eq_eps(aosf44_store(a, (Mat)), b, Eps), 1); \ + CHK(f44_eq_eps(aosf44_store(a, (Mat)), b, Eps) == 1); \ } (void)0 #define AOSF44_EQ(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ AOSF44_EQ_EPS(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, 0.f) @@ -35,131 +35,131 @@ main(int argc, char** argv) ALIGN(16) float tmp[16]; (void)argc, (void)argv; - CHECK(aosf44_set(m, + CHK(aosf44_set(m, v4f_set(0.f, 1.f, 2.f, 3.f), v4f_set(4.f, 5.f, 6.f, 7.f), v4f_set(8.f, 9.f, 10.f, 11.f), - v4f_set(12.f, 13.f, 14.f, 15.f)), m); + v4f_set(12.f, 13.f, 14.f, 15.f)) == m); AOSF44_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f); - CHECK(aosf44_store(tmp, m), tmp); - CHECK(tmp[0], 0.f); - CHECK(tmp[1], 1.f); - CHECK(tmp[2], 2.f); - CHECK(tmp[3], 3.f); - CHECK(tmp[4], 4.f); - CHECK(tmp[5], 5.f); - CHECK(tmp[6], 6.f); - CHECK(tmp[7], 7.f); - CHECK(tmp[8], 8.f); - CHECK(tmp[9], 9.f); - CHECK(tmp[10], 10.f); - CHECK(tmp[11], 11.f); - CHECK(tmp[12], 12.f); - CHECK(tmp[13], 13.f); - CHECK(tmp[14], 14.f); - CHECK(tmp[15], 15.f); + CHK(aosf44_store(tmp, m) == tmp); + CHK(tmp[0] == 0.f); + CHK(tmp[1] == 1.f); + CHK(tmp[2] == 2.f); + CHK(tmp[3] == 3.f); + CHK(tmp[4] == 4.f); + CHK(tmp[5] == 5.f); + CHK(tmp[6] == 6.f); + CHK(tmp[7] == 7.f); + CHK(tmp[8] == 8.f); + CHK(tmp[9] == 9.f); + CHK(tmp[10] == 10.f); + CHK(tmp[11] == 11.f); + CHK(tmp[12] == 12.f); + CHK(tmp[13] == 13.f); + CHK(tmp[14] == 14.f); + CHK(tmp[15] == 15.f); tmp[0] = 0.f; tmp[1] = 2.f; tmp[2] = 4.f; tmp[3] = 6.f; tmp[4] = 8.f; tmp[5] = 10.f; tmp[6] = 12.f; tmp[7] = 14.f; tmp[8] = 16.f; tmp[9] = 18.f; tmp[10] = 20.f; tmp[11] = 22.f; tmp[12] = 24.f; tmp[13] = 26.f; tmp[14] = 28.f; tmp[15] = 30.f; - CHECK(aosf44_load(m, tmp), m); + CHK(aosf44_load(m, tmp) == m); AOSF44_EQ(m, 0.f, 
2.f, 4.f, 6.f, 8.f, 10.f, 12.f, 14.f, 16.f, 18.f, 20.f, 22.f, 24.f, 26.f, 28.f, 30.f); - CHECK(aosf44_identity(m), m); + CHK(aosf44_identity(m) == m); AOSF44_EQ(m, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f); - CHECK(aosf44_zero(m), m); + CHK(aosf44_zero(m) == m); AOSF44_EQ(m, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_row0(m, v4f_set(0.f, 1.f, 2.f, 3.f)), m); + CHK(aosf44_set_row0(m, v4f_set(0.f, 1.f, 2.f, 3.f)) == m); AOSF44_EQ(m, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 2.f, 0.f, 0.f, 0.f, 3.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_row1(m, v4f_set(4.f, 5.f, 6.f, 7.f)), m); + CHK(aosf44_set_row1(m, v4f_set(4.f, 5.f, 6.f, 7.f)) == m); AOSF44_EQ(m, 0.f, 4.f, 0.f, 0.f, 1.f, 5.f, 0.f, 0.f, 2.f, 6.f, 0.f, 0.f, 3.f, 7.f, 0.f, 0.f); - CHECK(aosf44_set_row2(m, v4f_set(8.f, 9.f, 10.f, 11.f)), m); + CHK(aosf44_set_row2(m, v4f_set(8.f, 9.f, 10.f, 11.f)) == m); AOSF44_EQ(m, 0.f, 4.f, 8.f, 0.f, 1.f, 5.f, 9.f, 0.f, 2.f, 6.f, 10.f, 0.f, 3.f, 7.f, 11.f, 0.f); - CHECK(aosf44_set_row3(m, v4f_set(12.f, 13.f, 14.f, 15.f)), m); + CHK(aosf44_set_row3(m, v4f_set(12.f, 13.f, 14.f, 15.f)) == m); AOSF44_EQ(m, 0.f, 4.f, 8.f, 12.f, 1.f, 5.f, 9.f, 13.f, 2.f, 6.f, 10.f, 14.f, 3.f, 7.f, 11.f, 15.f); - CHECK(aosf44_zero(m), m); - CHECK(aosf44_set_row(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m); + CHK(aosf44_zero(m) == m); + CHK(aosf44_set_row(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0) == m); AOSF44_EQ(m, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 2.f, 0.f, 0.f, 0.f, 3.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_row(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m); + CHK(aosf44_set_row(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1) == m); AOSF44_EQ(m, 0.f, 4.f, 0.f, 0.f, 1.f, 5.f, 0.f, 0.f, 2.f, 6.f, 0.f, 0.f, 3.f, 7.f, 0.f, 0.f); - CHECK(aosf44_set_row(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2), m); + CHK(aosf44_set_row(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2) == m); AOSF44_EQ(m, 0.f, 4.f, 8.f, 0.f, 1.f, 5.f, 9.f, 0.f, 2.f, 6.f, 10.f, 0.f, 3.f, 7.f, 11.f, 
0.f); - CHECK(aosf44_set_row(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m); + CHK(aosf44_set_row(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3) == m); AOSF44_EQ(m, 0.f, 4.f, 8.f, 12.f, 1.f, 5.f, 9.f, 13.f, 2.f, 6.f, 10.f, 14.f, 3.f, 7.f, 11.f, 15.f); - CHECK(aosf44_zero(m), m); - CHECK(aosf44_set_col(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m); + CHK(aosf44_zero(m) == m); + CHK(aosf44_set_col(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0) == m); AOSF44_EQ(m, 0.f, 1.f, 2.f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_col(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m); + CHK(aosf44_set_col(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1) == m); AOSF44_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_col(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2), m); + CHK(aosf44_set_col(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2) == m); AOSF44_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 0.f, 0.f, 0.f, 0.f); - CHECK(aosf44_set_col(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m); + CHK(aosf44_set_col(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3) == m); AOSF44_EQ(m, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, @@ -167,116 +167,116 @@ main(int argc, char** argv) 12.f, 13.f, 14.f, 15.f); v = aosf44_row0(m); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 4.f); - CHECK(v4f_z(v), 8.f); - CHECK(v4f_w(v), 12.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 4.f); + CHK(v4f_z(v) == 8.f); + CHK(v4f_w(v) == 12.f); v = aosf44_row1(m); - CHECK(v4f_x(v), 1.f); - CHECK(v4f_y(v), 5.f); - CHECK(v4f_z(v), 9.f); - CHECK(v4f_w(v), 13.f); + CHK(v4f_x(v) == 1.f); + CHK(v4f_y(v) == 5.f); + CHK(v4f_z(v) == 9.f); + CHK(v4f_w(v) == 13.f); v = aosf44_row2(m); - CHECK(v4f_x(v), 2.f); - CHECK(v4f_y(v), 6.f); - CHECK(v4f_z(v), 10.f); - CHECK(v4f_w(v), 14.f); + CHK(v4f_x(v) == 2.f); + CHK(v4f_y(v) == 6.f); + CHK(v4f_z(v) == 10.f); + CHK(v4f_w(v) == 14.f); v = aosf44_row3(m); - CHECK(v4f_x(v), 3.f); - CHECK(v4f_y(v), 7.f); - CHECK(v4f_z(v), 11.f); - CHECK(v4f_w(v), 15.f); + 
CHK(v4f_x(v) == 3.f); + CHK(v4f_y(v) == 7.f); + CHK(v4f_z(v) == 11.f); + CHK(v4f_w(v) == 15.f); v = aosf44_row(m, 0); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 4.f); - CHECK(v4f_z(v), 8.f); - CHECK(v4f_w(v), 12.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 4.f); + CHK(v4f_z(v) == 8.f); + CHK(v4f_w(v) == 12.f); v = aosf44_row(m, 1); - CHECK(v4f_x(v), 1.f); - CHECK(v4f_y(v), 5.f); - CHECK(v4f_z(v), 9.f); - CHECK(v4f_w(v), 13.f); + CHK(v4f_x(v) == 1.f); + CHK(v4f_y(v) == 5.f); + CHK(v4f_z(v) == 9.f); + CHK(v4f_w(v) == 13.f); v = aosf44_row(m, 2); - CHECK(v4f_x(v), 2.f); - CHECK(v4f_y(v), 6.f); - CHECK(v4f_z(v), 10.f); - CHECK(v4f_w(v), 14.f); + CHK(v4f_x(v) == 2.f); + CHK(v4f_y(v) == 6.f); + CHK(v4f_z(v) == 10.f); + CHK(v4f_w(v) == 14.f); v = aosf44_row(m, 3); - CHECK(v4f_x(v), 3.f); - CHECK(v4f_y(v), 7.f); - CHECK(v4f_z(v), 11.f); - CHECK(v4f_w(v), 15.f); + CHK(v4f_x(v) == 3.f); + CHK(v4f_y(v) == 7.f); + CHK(v4f_z(v) == 11.f); + CHK(v4f_w(v) == 15.f); v = aosf44_col(m, 0); - CHECK(v4f_x(v), 0.f); - CHECK(v4f_y(v), 1.f); - CHECK(v4f_z(v), 2.f); - CHECK(v4f_w(v), 3.f); + CHK(v4f_x(v) == 0.f); + CHK(v4f_y(v) == 1.f); + CHK(v4f_z(v) == 2.f); + CHK(v4f_w(v) == 3.f); v = aosf44_col(m, 1); - CHECK(v4f_x(v), 4.f); - CHECK(v4f_y(v), 5.f); - CHECK(v4f_z(v), 6.f); - CHECK(v4f_w(v), 7.f); + CHK(v4f_x(v) == 4.f); + CHK(v4f_y(v) == 5.f); + CHK(v4f_z(v) == 6.f); + CHK(v4f_w(v) == 7.f); v = aosf44_col(m, 2); - CHECK(v4f_x(v), 8.f); - CHECK(v4f_y(v), 9.f); - CHECK(v4f_z(v), 10.f); - CHECK(v4f_w(v), 11.f); + CHK(v4f_x(v) == 8.f); + CHK(v4f_y(v) == 9.f); + CHK(v4f_z(v) == 10.f); + CHK(v4f_w(v) == 11.f); v = aosf44_col(m, 3); - CHECK(v4f_x(v), 12.f); - CHECK(v4f_y(v), 13.f); - CHECK(v4f_z(v), 14.f); - CHECK(v4f_w(v), 15.f); + CHK(v4f_x(v) == 12.f); + CHK(v4f_y(v) == 13.f); + CHK(v4f_z(v) == 14.f); + CHK(v4f_w(v) == 15.f); - CHECK(aosf44_set(m, + CHK(aosf44_set(m, v4f_set(0.f, 1.f, 2.f, 3.f), v4f_set(4.f, 5.f, 6.f, 7.f), v4f_set(8.f, 9.f, 10.f, 11.f), - v4f_set(12.f, 13.f, 14.f, 15.f)), 
m); - CHECK(aosf44_set(n, + v4f_set(12.f, 13.f, 14.f, 15.f)) == m); + CHK(aosf44_set(n, v4f_set(0.f, 2.f, 1.f, 3.f), v4f_set(1.f, -2.f, -1.f, -3.f), v4f_set(1.f, 0.f, 0.f, 2.f), - v4f_set(3.f, 2.f, 1.f, 0.f)), n); - CHECK(aosf44_add(o, m, n), o); + v4f_set(3.f, 2.f, 1.f, 0.f)) == n); + CHK(aosf44_add(o, m, n) == o); AOSF44_EQ(o, 0.f, 3.f, 3.f, 6.f, 5.f, 3.f, 5.f, 4.f, 9.f, 9.f, 10.f, 13.f, 15.f, 15.f, 15.f, 15.f); - CHECK(aosf44_sub(o, m, n), o); + CHK(aosf44_sub(o, m, n) == o); AOSF44_EQ(o, 0.f, -1.f, 1.f, 0.f, 3.f, 7.f, 7.f, 10.f, 7.f, 9.f, 10.f, 9.f, 9.f, 11.f, 13.f, 15.f); - CHECK(aosf44_minus(o, n), o); + CHK(aosf44_minus(o, n) == o); AOSF44_EQ(o, 0.f, -2.f, -1.f, -3.f, -1.f, 2.f, 1.f, 3.f, -1.f, 0.f, 0.f, -2.f, -3.f, -2.f, -1.f, 0.f); - CHECK(aosf44_abs(o, o), o); + CHK(aosf44_abs(o, o) == o); AOSF44_EQ(o, 0.f, 2.f, 1.f, 3.f, 1.f, 2.f, 1.f, 3.f, 1.f, 0.f, 0.f, 2.f, 3.f, 2.f, 1.f, 0.f); - CHECK(aosf44_mul(o, n, v4f_set(1.f, 2.f, 3.f, 2.f)), o); + CHK(aosf44_mul(o, n, v4f_set(1.f, 2.f, 3.f, 2.f)) == o); AOSF44_EQ(o, 0.f, 4.f, 3.f, 6.f, 1.f, -4.f, -3.f, -6.f, @@ -289,16 +289,16 @@ main(int argc, char** argv) v4f_set(8.f, 9.f, 10.f, 11.f), v4f_set(12.f, 13.f, 14.f, 15.f)); v = aosf44_mulf4(m, v4f_set(1.f, 2.f, 3.f, 1.f)); - CHECK(v4f_x(v), 44.f); - CHECK(v4f_y(v), 51.f); - CHECK(v4f_z(v), 58.f); - CHECK(v4f_w(v), 65.f); + CHK(v4f_x(v) == 44.f); + CHK(v4f_y(v) == 51.f); + CHK(v4f_z(v) == 58.f); + CHK(v4f_w(v) == 65.f); v = aosf4_mulf44(v4f_set(1.f, 2.f, 3.f, 1.f), m); - CHECK(v4f_x(v), 11.f); - CHECK(v4f_y(v), 39.f); - CHECK(v4f_z(v), 67.f); - CHECK(v4f_w(v), 95.f); + CHK(v4f_x(v) == 11.f); + CHK(v4f_y(v) == 39.f); + CHK(v4f_z(v) == 67.f); + CHK(v4f_w(v) == 95.f); aosf44_set(m, v4f_set(1.f, 2.f, 3.f, 4.f), @@ -310,14 +310,14 @@ main(int argc, char** argv) v4f_set(1.f, -2.f, 2.f, 1.f), v4f_set(1.f, -8.f, -4.f, 2.f), v4f_set(1.f, 3.f, 4.f, 2.f)); - CHECK(aosf44_mulf44(o, m, n), o); + CHK(aosf44_mulf44(o, m, n) == o); AOSF44_EQ(o, 104.f, 124.f, 144.f, 164.f, 17.f, 
19.f, 21.f, 23.f, -39.f, -48.f, -57.f, -66.f, 61.f, 71.f, 81.f, 91.f); - CHECK(aosf44_transpose(o, n), o); + CHK(aosf44_transpose(o, n) == o); AOSF44_EQ(o, 2.f, 1.f, 1.f, 1.f, 9.f, -2.f, -8.f, 3.f, @@ -325,17 +325,17 @@ main(int argc, char** argv) 1.f, 1.f, 2.f, 2.f); v = aosf44_det(n); - CHECK(v4f_x(v), 78.f); - CHECK(v4f_y(v), 78.f); - CHECK(v4f_z(v), 78.f); - CHECK(v4f_w(v), 78.f); + CHK(v4f_x(v) == 78.f); + CHK(v4f_y(v) == 78.f); + CHK(v4f_z(v) == 78.f); + CHK(v4f_w(v) == 78.f); v = aosf44_inverse(m, n); - CHECK(v4f_x(v), 78.f); - CHECK(v4f_y(v), 78.f); - CHECK(v4f_z(v), 78.f); - CHECK(v4f_w(v), 78.f); - CHECK(aosf44_mulf44(o, m, n), o); + CHK(v4f_x(v) == 78.f); + CHK(v4f_y(v) == 78.f); + CHK(v4f_z(v) == 78.f); + CHK(v4f_w(v) == 78.f); + CHK(aosf44_mulf44(o, m, n) == o); AOSF44_EQ_EPS(o, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, @@ -344,10 +344,10 @@ main(int argc, char** argv) 1.e-6f); v = aosf44_invtrans(o, n); - CHECK(v4f_x(v), 78.f); - CHECK(v4f_y(v), 78.f); - CHECK(v4f_z(v), 78.f); - CHECK(v4f_w(v), 78.f); + CHK(v4f_x(v) == 78.f); + CHK(v4f_y(v) == 78.f); + CHK(v4f_z(v) == 78.f); + CHK(v4f_w(v) == 78.f); AOSF44_EQ(o, v4f_x(m[0]), v4f_x(m[1]), v4f_x(m[2]), v4f_x(m[3]), v4f_y(m[0]), v4f_y(m[1]), v4f_y(m[2]), v4f_y(m[3]), @@ -366,54 +366,54 @@ main(int argc, char** argv) v4f_set(12.f, 13.f, 14.f, 15.f)); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), ~0); - CHECK(v4f_mask_y(v), ~0); - CHECK(v4f_mask_z(v), ~0); - CHECK(v4f_mask_w(v), ~0); + CHK(v4f_mask_x(v) == ~0); + CHK(v4f_mask_y(v) == ~0); + CHK(v4f_mask_z(v) == ~0); + CHK(v4f_mask_w(v) == ~0); n[0] = v4f_set(0.f, 1.0f, 2.f, 4.f); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), 0); - CHECK(v4f_mask_y(v), 0); - CHECK(v4f_mask_z(v), 0); - CHECK(v4f_mask_w(v), 0); + CHK(v4f_mask_x(v) == 0); + CHK(v4f_mask_y(v) == 0); + CHK(v4f_mask_z(v) == 0); + CHK(v4f_mask_w(v) == 0); n[0] = v4f_set(0.f, 1.0f, 2.f, 3.f); n[1] = v4f_set(4.f, 5.0f, 6.f, 7.f); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), 0); - CHECK(v4f_mask_y(v), 0); - 
CHECK(v4f_mask_z(v), 0); - CHECK(v4f_mask_w(v), 0); + CHK(v4f_mask_x(v) == 0); + CHK(v4f_mask_y(v) == 0); + CHK(v4f_mask_z(v) == 0); + CHK(v4f_mask_w(v) == 0); n[1] = v4f_set(5.f, 5.0f, 6.f, 7.f); m[2] = v4f_set(8.f, -9.0f, 10.f, 11.f); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), 0); - CHECK(v4f_mask_y(v), 0); - CHECK(v4f_mask_z(v), 0); - CHECK(v4f_mask_w(v), 0); + CHK(v4f_mask_x(v) == 0); + CHK(v4f_mask_y(v) == 0); + CHK(v4f_mask_z(v) == 0); + CHK(v4f_mask_w(v) == 0); m[2] = v4f_set(8.f, 9.0f, 10.f, 11.f); n[3] = v4f_set(12.f, 13.1f, 14.f, 15.f); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), 0); - CHECK(v4f_mask_y(v), 0); - CHECK(v4f_mask_z(v), 0); - CHECK(v4f_mask_w(v), 0); + CHK(v4f_mask_x(v) == 0); + CHK(v4f_mask_y(v) == 0); + CHK(v4f_mask_z(v) == 0); + CHK(v4f_mask_w(v) == 0); v = aosf44_eq(m, m); - CHECK(v4f_mask_x(v), ~0); - CHECK(v4f_mask_y(v), ~0); - CHECK(v4f_mask_z(v), ~0); - CHECK(v4f_mask_w(v), ~0); + CHK(v4f_mask_x(v) == ~0); + CHK(v4f_mask_y(v) == ~0); + CHK(v4f_mask_z(v) == ~0); + CHK(v4f_mask_w(v) == ~0); n[3] = v4f_set(12.f, 13.0f, 14.f, 15.f); v = aosf44_eq(m, n); - CHECK(v4f_mask_x(v), ~0); - CHECK(v4f_mask_y(v), ~0); - CHECK(v4f_mask_z(v), ~0); - CHECK(v4f_mask_w(v), ~0); + CHK(v4f_mask_x(v) == ~0); + CHK(v4f_mask_y(v) == ~0); + CHK(v4f_mask_z(v) == ~0); + CHK(v4f_mask_w(v) == ~0); return 0; } diff --git a/src/test_aosq.c b/src/test_aosq.c @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #include "aosq.h" @@ -23,7 +23,7 @@ b[0] = (A); b[1] = (B); b[2] = (C); \ b[3] = (D); b[4] = (E); b[5] = (F); \ b[6] = (G); b[7] = (H); b[8] = (I); \ - CHECK(f33_eq_eps(aosf33_store(a, (M)), b, Eps), 1); \ + CHK(f33_eq_eps(aosf33_store(a, (M)), b, Eps) == 1); \ } (void)0 int @@ -35,94 +35,94 @@ main(int argc, char** argv) (void)argc, (void)argv; q0 = aosq_identity(); - CHECK(v4f_x(q0), 0.f); - CHECK(v4f_y(q0), 0.f); - CHECK(v4f_z(q0), 0.f); - CHECK(v4f_w(q0), 1.f); + CHK(v4f_x(q0) == 0.f); + CHK(v4f_y(q0) == 0.f); + CHK(v4f_z(q0) == 0.f); + CHK(v4f_w(q0) == 1.f); q0 = aosq_set_axis_angle(v4f_set(2.f, 5.f, 1.f, 0.f), v4f_set1((float)PI*0.3f)); - CHECK(eq_eps(v4f_x(q0), 0.907981f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(q0), 2.269953f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(q0), 0.453991f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(q0), 0.891007f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(q0), 0.907981f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(q0), 2.269953f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(q0), 0.453991f, 1.e-6f) == 1); + CHK(eq_eps(v4f_w(q0), 0.891007f, 1.e-6f) == 1); q0 = v4f_set(1.f, 2.f, 3.f, -3.f); q1 = v4f_set(1.f, 2.f, 3.f, -3.f); t = aosq_eq(q0, q1); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0xFFFFFFFF); + cast.f = v4f_y(t); CHK(cast.i == 
(int32_t)0xFFFFFFFF); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0xFFFFFFFF); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0xFFFFFFFF); q1 = v4f_set(0.f, 2.f, 3.f, -3.f); t = aosq_eq(q0, q1); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0x00000000); q1 = v4f_set(1.f, 0.f, 3.f, -3.f); t = aosq_eq(q0, q1); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0x00000000); q1 = v4f_set(1.f, 2.f, 0.f, -3.f); t = aosq_eq(q0, q1); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0x00000000); q1 = v4f_set(1.f, 2.f, 3.f, 0.f); t = aosq_eq(q0, q1); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_x(t); CHK(cast.i == 
(int32_t)0x00000000); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0x00000000); q1 = v4f_set(1.01f, 2.f, 3.02f, -3.f); t = aosq_eq_eps(q0, q1, v4f_set1(0.01f)); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0x00000000); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0x00000000); t = aosq_eq_eps(q0, q1, v4f_set1(0.02f)); - cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); - cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_x(t); CHK(cast.i == (int32_t)0xFFFFFFFF); + cast.f = v4f_y(t); CHK(cast.i == (int32_t)0xFFFFFFFF); + cast.f = v4f_z(t); CHK(cast.i == (int32_t)0xFFFFFFFF); + cast.f = v4f_w(t); CHK(cast.i == (int32_t)0xFFFFFFFF); q0 = v4f_set(1.f, 2.f, 3.f, 4.f); q1 = v4f_set(5.f, 6.f, 7.f, 8.f); q2 = aosq_mul(q0, q1); - CHECK(v4f_x(q2), 24.f); - CHECK(v4f_y(q2), 48.f); - CHECK(v4f_z(q2), 48.f); - CHECK(v4f_w(q2), -6.f); + CHK(v4f_x(q2) == 24.f); + CHK(v4f_y(q2) == 48.f); + CHK(v4f_z(q2) == 48.f); + CHK(v4f_w(q2) == -6.f); q2 = aosq_conj(q0); - CHECK(v4f_x(q2), -1.f); - CHECK(v4f_y(q2), -2.f); - CHECK(v4f_z(q2), -3.f); - CHECK(v4f_w(q2), 4.f); + CHK(v4f_x(q2) == -1.f); + CHK(v4f_y(q2) == -2.f); + CHK(v4f_z(q2) == -3.f); + CHK(v4f_w(q2) == 4.f); q0 = v4f_normalize(v4f_set(1.f, 2.f, 5.f, 0.5f)); q1 = v4f_xyzz(q0); q1 = v4f_xyzd(q1, aosq_calca(q1)); - CHECK(v4f_x(q0), v4f_x(q1)); - CHECK(v4f_y(q0), v4f_y(q1)); - CHECK(v4f_z(q0), v4f_z(q1)); - CHECK(eq_eps(v4f_w(q0), v4f_w(q1), 1.e-6f), 1); 
+ CHK(v4f_x(q0) == v4f_x(q1)); + CHK(v4f_y(q0) == v4f_y(q1)); + CHK(v4f_z(q0) == v4f_z(q1)); + CHK(eq_eps(v4f_w(q0), v4f_w(q1), 1.e-6f) == 1); q0 = v4f_set(1.f, 2.f, 3.f, 5.f); q1 = v4f_set(2.f, 6.f, 7.f, 6.f); q2 = aosq_slerp(q0, q1, v4f_set1(0.3f)); - CHECK(eq_eps(v4f_x(q2), 1.3f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(q2), 3.2f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(q2), 4.2f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(q2), 5.3f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(q2), 1.3f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(q2), 3.2f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(q2), 4.2f, 1.e-6f) == 1); + CHK(eq_eps(v4f_w(q2), 5.3f, 1.e-6f) == 1); q0 = v4f_set(2.f, 5.f, 17.f, 9.f); aosq_to_aosf33(q0, m); diff --git a/src/test_math4.c b/src/test_math4.c @@ -0,0 +1,138 @@ +/* Copyright (C) 2013-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#define _POSIX_C_SOURCE 200112L + +#include "rsimd.h" +#include "math.h" + +#include <math.h> + +#define LOG2E 1.4426950408889634074 /* log_2 e */ +#define LN10 2.30258509299404568402 /* log_e 10 */ + +#define CHKV4_EPS(V, Ref, Eps) { \ + CHK(eq_eps(v4f_x(V), Ref[0], fabsf(Ref[0]) * Eps)); \ + CHK(eq_eps(v4f_y(V), Ref[1], fabsf(Ref[1]) * Eps)); \ + CHK(eq_eps(v4f_z(V), Ref[2], fabsf(Ref[2]) * Eps)); \ + CHK(eq_eps(v4f_w(V), Ref[3], fabsf(Ref[3]) * Eps)); \ +} (void)0 + +#define CHKV4_FUNC_EPS(V, Func, Eps) { \ + const v4f_T r__ = v4f_##Func(V); \ + float ref__[4]; \ + ref__[0] = (float)Func(v4f_x(V)); \ + ref__[1] = (float)Func(v4f_y(V)); \ + ref__[2] = (float)Func(v4f_z(V)); \ + ref__[3] = (float)Func(v4f_w(V)); \ + CHKV4_EPS(r__, ref__, Eps); \ +} (void)0 + +static void +test_trigo(void) +{ + v4f_T i, j, k; + float ref[4]; + + i = v4f_set((float)PI/2.f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f); + + CHKV4_FUNC_EPS(i, cos, 1.e-6f); + CHKV4_FUNC_EPS(i, sin, 1.e-6f); + + v4f_sincos(i, &k, &j); + ref[0] = (float)sin(v4f_x(i)); + ref[1] = (float)sin(v4f_y(i)); + ref[2] = (float)sin(v4f_z(i)); + ref[3] = (float)sin(v4f_w(i)); + CHKV4_EPS(k, ref, 1.e-6f); + ref[0] = (float)cos(v4f_x(i)); + ref[1] = (float)cos(v4f_y(i)); + ref[2] = (float)cos(v4f_z(i)); + ref[3] = (float)cos(v4f_w(i)); + CHKV4_EPS(j, ref, 1.e-6f); + + i = v4f_set((float)PI/8.f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f); + CHKV4_FUNC_EPS(i, tan, 1.e-6f); + CHKV4_FUNC_EPS(v4f_cos(i), acos, 1.e-6f); + CHKV4_FUNC_EPS(v4f_sin(i), asin, 1.e-6f); + CHKV4_FUNC_EPS(v4f_tan(i), atan, 1.e-6f); +} + +static void +test_exp(void) +{ + const v4f_T i = v4f_set(1.f, -1.234f, 0.f, 3.14156f); + v4f_T j; + float ref[4]; + + CHKV4_FUNC_EPS(i, exp, 1.e-6f); + CHKV4_FUNC_EPS(i, exp2, 1.e-6f); + + j = v4f_exp10(i); + ref[0] = (float)exp2(LOG2E * LN10 * v4f_x(i)); + ref[1] = (float)exp2(LOG2E * LN10 * v4f_y(i)); + ref[2] = (float)exp2(LOG2E * LN10 * v4f_z(i)); + ref[3] = (float)exp2(LOG2E * LN10 * v4f_w(i)); + 
CHKV4_EPS(j, ref, 1.e-6f); +} + +static void +test_log(void) +{ + const v4f_T i = v4f_set(4.675f, 3.14f, 9.99999f, 1.234e-13f); + + CHKV4_FUNC_EPS(i, log, 1.e-6f); + CHKV4_FUNC_EPS(i, log2, 1.e-6f); + CHKV4_FUNC_EPS(i, log10, 1.e-6f); +} + +static void +test_misc(void) +{ + v4f_T i, j, k; + float ref[4]; + + i = v4f_set(-1.2345f, 9.3e-7f, 3.879e9f, -10.56f); + j = v4f_set(7.89e-9f, 0.12f, -4.9e10f, 3.14f); + k = v4f_copysign(i, j); + ref[0] = (float)copysign(v4f_x(i), v4f_x(j)); + ref[1] = (float)copysign(v4f_y(i), v4f_y(j)); + ref[2] = (float)copysign(v4f_z(i), v4f_z(j)); + ref[3] = (float)copysign(v4f_w(i), v4f_w(j)); + CHKV4_EPS(k, ref, 1.e-6f); + + CHKV4_FUNC_EPS(i, floor, 1.e-6f); + + k = v4f_pow(v4f_abs(i), j); + ref[0] = (float)pow(fabsf(v4f_x(i)), v4f_x(j)); + ref[1] = (float)pow(fabsf(v4f_y(i)), v4f_y(j)); + ref[2] = (float)pow(fabsf(v4f_z(i)), v4f_z(j)); + ref[3] = (float)pow(fabsf(v4f_w(i)), v4f_w(j)); + CHKV4_EPS(k, ref, 1.e-6f); +} + +int +main(int argc, char** argv) +{ + (void)argc, (void)argv; + + test_trigo(); + test_exp(); + test_log(); + test_misc(); + + return 0; +} + diff --git a/src/test_math8.c b/src/test_math8.c @@ -0,0 +1,172 @@ +/* Copyright (C) 2013-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#define _POSIX_C_SOURCE 200112L + +#include "rsimd.h" +#include "math.h" + +#include <math.h> + +#define LOG2E 1.4426950408889634074 /* log_2 e */ +#define LN10 2.30258509299404568402 /* log_e 10 */ + +#define CHKV8_EPS(V, Ref, Eps) { \ + CHK(eq_eps(v4f_x(v8f_abcd(V)), Ref[0], fabsf(Ref[0]) * Eps)); \ + CHK(eq_eps(v4f_y(v8f_abcd(V)), Ref[1], fabsf(Ref[1]) * Eps)); \ + CHK(eq_eps(v4f_z(v8f_abcd(V)), Ref[2], fabsf(Ref[2]) * Eps)); \ + CHK(eq_eps(v4f_w(v8f_abcd(V)), Ref[3], fabsf(Ref[3]) * Eps)); \ + CHK(eq_eps(v4f_x(v8f_efgh(V)), Ref[4], fabsf(Ref[4]) * Eps)); \ + CHK(eq_eps(v4f_y(v8f_efgh(V)), Ref[5], fabsf(Ref[5]) * Eps)); \ + CHK(eq_eps(v4f_z(v8f_efgh(V)), Ref[6], fabsf(Ref[6]) * Eps)); \ + CHK(eq_eps(v4f_w(v8f_efgh(V)), Ref[7], fabsf(Ref[7]) * Eps)); \ +} (void)0 + +#define CHKV8_FUNC_EPS(V, Func, Eps) { \ + const v8f_T r__ = v8f_##Func(V); \ + float ref__[8]; \ + ref__[0] = (float)Func(v4f_x(v8f_abcd(V))); \ + ref__[1] = (float)Func(v4f_y(v8f_abcd(V))); \ + ref__[2] = (float)Func(v4f_z(v8f_abcd(V))); \ + ref__[3] = (float)Func(v4f_w(v8f_abcd(V))); \ + ref__[4] = (float)Func(v4f_x(v8f_efgh(V))); \ + ref__[5] = (float)Func(v4f_y(v8f_efgh(V))); \ + ref__[6] = (float)Func(v4f_z(v8f_efgh(V))); \ + ref__[7] = (float)Func(v4f_w(v8f_efgh(V))); \ + CHKV8_EPS(r__, ref__, Eps); \ +} (void)0 + +static void +test_trigo(void) +{ + v8f_T i, j, k; + float ref[8]; + + i = v8f_set + ((float)PI/2.f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f, + (float)PI/8.f, (float)PI/7.f, (float)PI/16.f, (float)PI/9.f); + + CHKV8_FUNC_EPS(i, cos, 1.e-6f); + CHKV8_FUNC_EPS(i, sin, 1.e-6f); + + v8f_sincos(i, &k, &j); + ref[0] = (float)sin(v4f_x(v8f_abcd(i))); + ref[1] = (float)sin(v4f_y(v8f_abcd(i))); + ref[2] = (float)sin(v4f_z(v8f_abcd(i))); + ref[3] = (float)sin(v4f_w(v8f_abcd(i))); + ref[4] = (float)sin(v4f_x(v8f_efgh(i))); + ref[5] = (float)sin(v4f_y(v8f_efgh(i))); + ref[6] = (float)sin(v4f_z(v8f_efgh(i))); + ref[7] = (float)sin(v4f_w(v8f_efgh(i))); + CHKV8_EPS(k, ref, 1.e-6f); + 
ref[0] = (float)cos(v4f_x(v8f_abcd(i))); + ref[1] = (float)cos(v4f_y(v8f_abcd(i))); + ref[2] = (float)cos(v4f_z(v8f_abcd(i))); + ref[3] = (float)cos(v4f_w(v8f_abcd(i))); + ref[4] = (float)cos(v4f_x(v8f_efgh(i))); + ref[5] = (float)cos(v4f_y(v8f_efgh(i))); + ref[6] = (float)cos(v4f_z(v8f_efgh(i))); + ref[7] = (float)cos(v4f_w(v8f_efgh(i))); + CHKV8_EPS(j, ref, 1.e-6f); + + i = v8f_set + ((float)PI/2.2f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f, + (float)PI/8.f, (float)PI/7.f, (float)PI/16.f, (float)PI/9.f); + + CHKV8_FUNC_EPS(i, tan, 1.e-6); + CHKV8_FUNC_EPS(v8f_cos(i), acos, 1.e-6f); + CHKV8_FUNC_EPS(v8f_sin(i), asin, 1.e-6f); + CHKV8_FUNC_EPS(v8f_tan(i), atan, 1.e-6f); +} + +static void +test_exp(void) +{ + const v8f_T i = v8f_set + (1.f, -1.234f, 0.f, 3.14156f, 0.9187f, 7.9f, 3.333f, 2.387e-7f); + v8f_T j; + float ref[8]; + + CHKV8_FUNC_EPS(i, exp, 1.e-6f); + CHKV8_FUNC_EPS(i, exp2, 1.e-6f); + + j = v8f_exp10(i); + ref[0] = (float)exp2(LOG2E * LN10 * v4f_x(v8f_abcd(i))); + ref[1] = (float)exp2(LOG2E * LN10 * v4f_y(v8f_abcd(i))); + ref[2] = (float)exp2(LOG2E * LN10 * v4f_z(v8f_abcd(i))); + ref[3] = (float)exp2(LOG2E * LN10 * v4f_w(v8f_abcd(i))); + ref[4] = (float)exp2(LOG2E * LN10 * v4f_x(v8f_efgh(i))); + ref[5] = (float)exp2(LOG2E * LN10 * v4f_y(v8f_efgh(i))); + ref[6] = (float)exp2(LOG2E * LN10 * v4f_z(v8f_efgh(i))); + ref[7] = (float)exp2(LOG2E * LN10 * v4f_w(v8f_efgh(i))); + CHKV8_EPS(j, ref, 1.e-6f); +} + +static void +test_log(void) +{ + const v8f_T i = v8f_set + (4.675f, 3.14f, 9.99999f, 1.234e-13f, 3.33e-3f, 0.98f, 8.f, 9.87654f); + CHKV8_FUNC_EPS(i, log, 1.e-6f); + CHKV8_FUNC_EPS(i, log2, 1.e-6f); + CHKV8_FUNC_EPS(i, log10, 1.e-6f); +} + +static void +test_misc(void) +{ + v8f_T i, j, k; + float ref[8]; + + i = v8f_set(-1.2345f, 9.3e-7f, 3.879e9f, -10.56f, 9.9f, -3.1f, 0.33e-6f, 1.f); + j = v8f_set(7.89e-9f, 0.12f, -4.9e10f, 3.14f, 5.f, 0.1e-19f, 1.234f, -0.45f); + k = v8f_copysign(i, j); + ref[0] = (float)copysign(v4f_x(v8f_abcd(i)), 
v4f_x(v8f_abcd(j))); + ref[1] = (float)copysign(v4f_y(v8f_abcd(i)), v4f_y(v8f_abcd(j))); + ref[2] = (float)copysign(v4f_z(v8f_abcd(i)), v4f_z(v8f_abcd(j))); + ref[3] = (float)copysign(v4f_w(v8f_abcd(i)), v4f_w(v8f_abcd(j))); + ref[4] = (float)copysign(v4f_x(v8f_efgh(i)), v4f_x(v8f_efgh(j))); + ref[5] = (float)copysign(v4f_y(v8f_efgh(i)), v4f_y(v8f_efgh(j))); + ref[6] = (float)copysign(v4f_z(v8f_efgh(i)), v4f_z(v8f_efgh(j))); + ref[7] = (float)copysign(v4f_w(v8f_efgh(i)), v4f_w(v8f_efgh(j))); + CHKV8_EPS(k, ref, 1.e-6f); + + CHKV8_FUNC_EPS(i, floor, 1.e-6f); + + k = v8f_pow(v8f_abs(i), j); + ref[0] = (float)pow(fabsf(v4f_x(v8f_abcd(i))), v4f_x(v8f_abcd(j))); + ref[1] = (float)pow(fabsf(v4f_y(v8f_abcd(i))), v4f_y(v8f_abcd(j))); + ref[2] = (float)pow(fabsf(v4f_z(v8f_abcd(i))), v4f_z(v8f_abcd(j))); + ref[3] = (float)pow(fabsf(v4f_w(v8f_abcd(i))), v4f_w(v8f_abcd(j))); + ref[4] = (float)pow(fabsf(v4f_x(v8f_efgh(i))), v4f_x(v8f_efgh(j))); + ref[5] = (float)pow(fabsf(v4f_y(v8f_efgh(i))), v4f_y(v8f_efgh(j))); + ref[6] = (float)pow(fabsf(v4f_z(v8f_efgh(i))), v4f_z(v8f_efgh(j))); + ref[7] = (float)pow(fabsf(v4f_w(v8f_efgh(i))), v4f_w(v8f_efgh(j))); + CHKV8_EPS(k, ref, 1.e-6f); +} + +int +main(int argc, char** argv) +{ + (void)argc, (void)argv; + + test_trigo(); + test_exp(); + test_log(); + test_misc(); + + return 0; +} + diff --git a/src/test_soa4f2.c b/src/test_soa4f2.c @@ -1,118 +1,28 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ -#include "soa4f2.h" -#include "test_soa4f_utils.h" - -#define CHECK_F2(V, A, B, C, D, E, F, G, H) \ - { \ - const v4f_T* v__ = (V); \ - CHECK_V4MASK(v4f_eq(v__[0], v4f_set((A), (B), (C), (D))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[1], v4f_set((E), (F), (G), (H))), V4TRUE); \ - } (void)0 +/* Generate the test_soa4f2 function */ +#define SOA_SIMD_WIDTH 4 +#define SOA_DIMENSION 2 +#include "test_soaXfY.h" int main(int argc, char** argv) { - v4f_T a[2], b[2], c[2], dst[2], f; (void)argc, (void)argv; - - CHECK(soa4f2_set(a, soa4f2_splat(c, v4f_set1(-1.f))), a); - CHECK_V4MASK(v4f_eq(a[0], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set1(-1.f)), V4TRUE); - - CHECK(soa4f2(c, v4f_set(0.f, 1.f, 2.f, 3.f), v4f_set(5.f, 6.f, 7.f, 8.f)), c); - CHECK(soa4f2_set(a, c), a); - CHECK_V4MASK(v4f_eq(c[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - - CHECK(soa4f2(a, v4f_set(-1.f, 2.f, 3.f,-4.f),v4f_set(5.f,-6.f,-7.f, 8.f)), a); - CHECK(soa4f2_minus(b, a), b); - CHECK_F2(b, 1.f,-2.f,-3.f, 4.f, -5.f, 6.f, 7.f,-8.f); - - CHECK(soa4f2_addf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F2(dst, 0.f, 4.f, 3.f, -1.f, 6.f, -4.f, -7.f, 11.f); - CHECK(soa4f2_add(dst, a, b), dst); - CHECK_F2(dst, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - 
CHECK(soa4f2_subf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F2(dst, -2.f, 0.f, 3.f, -7.f, 4.f, -8.f, -7.f, 5.f); - CHECK(soa4f2_sub(dst, a, b), dst); - CHECK_F2(dst, -2.f, 4.f, 6.f, -8.f, 10.f, -12.f, -14.f, 16.f); - CHECK(soa4f2_mulf(dst, a, v4f_set(2.f, 3.f, 0.f, -1.f)), dst); - CHECK_F2(dst, -2.f, 6.f, 0.f, 4.f, 10.f, -18.f, 0.f, -8.f); - CHECK(soa4f2_mul(dst, a, b), dst); - CHECK_F2(dst, -1.f, -4.f, -9.f, -16.f, -25.f, -36.f, -49.f, -64.f); - CHECK(soa4f2_divf(dst, a, v4f_set(2.f, 0.5f, 1.f, 4.f)), dst); - CHECK_F2(dst, -0.5f, 4.f, 3.f, -1.f, 2.5f, -12.f, -7.f, 2.f); - CHECK(soa4f2_div(dst, a, b), dst); - CHECK_F2(dst, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f); - - soa4f2(a, v4f_set1(0.f), v4f_set1(1.f)); - soa4f2(b, v4f_set1(1.f), v4f_set1(2.f)); - CHECK(soa4f2_lerp(dst, a, b, v4f_set1(0.5f)), dst); - CHECK_F2(dst, 0.5f, 0.5f, 0.5f, 0.5f, 1.5f, 1.5f, 1.5f, 1.5f); - soa4f2(a, v4f_set(-1.f, 2.f, 3.f,-4.f), v4f_set(5.f,-6.f,-7.f, 8.f)); - soa4f2_minus(b, a); - CHECK(soa4f2_lerp(dst, a, b, v4f_set(-0.5f, 1.f, 0.5f, 4.f)), dst); - CHECK_F2(dst, -1.f, -2.f, 0.f, 4.f, 5.f, 6.f, 0.f, -8.f); - - f = soa4f2_sum(b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-4.f, 4.f, 4.f, -4.f)), V4TRUE); - f = soa4f2_dot(a, b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-26.f, -40.f, -58.f, -80.f)), V4TRUE); - f = soa4f2_len(a); - CHECK_V4MASK - (v4f_eq_eps(f, v4f_sqrt(soa4f2_dot(a, a)), v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f2_is_normalized(b), V4FALSE); - f = soa4f2_normalize(dst, b); - CHECK_V4MASK(v4f_eq_eps(f, soa4f2_len(b), v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(soa4f2_is_normalized(b), V4FALSE); - CHECK_V4MASK(soa4f2_is_normalized(dst), V4TRUE); - soa4f2_divf(b, b, f); - CHECK_V4MASK(v4f_eq_eps(dst[0], b[0], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[1], b[1], v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f2_eq(a, a), V4TRUE); - CHECK_V4MASK(soa4f2_eq(a, b), V4FALSE); - soa4f2(a, v4f_set(-1.f, 2.f, 3.f,-4.f), v4f_set(5.f,-6.f,-7.f, 8.f)); - 
soa4f2(b, v4f_set(-1.f,-2.f, 5.f,-4.001f), v4f_set(5.f,-6.f, 7.f, 8.001f)); - CHECK_V4MASK__(soa4f2_eq(a, b), ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f2_eq_eps(a, b, v4f_set1(1.e-6f)), ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f2_eq_eps(a, b, v4f_set(0.f,0.f,0.f,1.e-6f)),~0, 0, 0, 0); - CHECK_V4MASK__(soa4f2_eq_eps(a, b, v4f_set(0.f,0.f,0.f,1.e-2f)),~0, 0, 0,~0); - - soa4f2(a, v4f_set(1.f, 2.f, 3.f,-1.f), v4f_set(-2.f, 0.f,-7.f, 0.f)); - soa4f2(b, v4f_set(3.f, 2.f, 1.f,-2.f), v4f_set(1.f,-6.f, 0.5f, 2.f)); - f = soa4f2_cross(a, b); - CHECK_V4MASK(v4f_eq(f, v4f_set(7.f, -12.f, 8.5f, -2.f)), V4TRUE); - - CHECK(soa4f2_min(dst, a, b), dst); - CHECK_F2(dst, 1.f, 2.f, 1.f, -2.f, -2.f, -6.f, -7.f, 0.f); - CHECK(soa4f2_max(dst, a, b), dst); - CHECK_F2(dst, 3.f, 2.f, 3.f, -1.f, 1.f, 0.f, 0.5f, 2.f); - - soa4f2_sel(dst, b, a, v4f_mask(~0, ~0, 0, ~0)); - CHECK_F2(dst, 1.f, 2.f, 1.f, -1.f, -2.f, 0.f, 0.5f, 0.f); - - soa4f2(c, v4f_mask(~0, ~0, 1, ~0), v4f_mask(~0, 0, 0, 0)); - soa4f2_selv(dst, b, a, c); - CHECK_F2(dst, 1.f, 2.f, 1.f, -1.f, -2.f, -6.f, 0.5f, 2.f); - + test_soa4f2(); return 0; } diff --git a/src/test_soa4f3.c b/src/test_soa4f3.c @@ -1,148 +1,27 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. 
* - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ -#include "soa4f3.h" -#include "test_soa4f_utils.h" - -#define CHECK_F3(V, A, B, C, D, E, F, G, H, I, J, K, L) \ - { \ - const v4f_T* v__ = (V); \ - CHECK_V4MASK(v4f_eq(v__[0], v4f_set((A), (B), (C), (D))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[1], v4f_set((E), (F), (G), (H))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[2], v4f_set((I), (J), (K), (L))), V4TRUE); \ - } (void)0 +/* Generate the test_soa4f3 function */ +#define SOA_SIMD_WIDTH 4 +#define SOA_DIMENSION 3 +#include "test_soaXfY.h" int main(int argc, char** argv) { - v4f_T a[3], b[3], c[3], dst[3], f; (void)argc, (void)argv; - - CHECK(soa4f3_set(a, soa4f3_splat(c, v4f_set1(-1.f))), a); - CHECK_V4MASK(v4f_eq(a[0], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[2], v4f_set1(-1.f)), V4TRUE); - CHECK(soa4f3(c, - v4f_set(0.f, 1.f, 2.f, 3.f), - v4f_set(5.f, 6.f, 7.f, 8.f), - v4f_set(9.f, 10.f, 11.f, 12.f)), c); - CHECK(soa4f3_set(a, c), a); - CHECK_V4MASK(v4f_eq(c[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[2], v4f_set(9.f, 10.f, 11.f, 12.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[2], v4f_set(9.f, 10.f, 11.f, 12.f)), V4TRUE); - - CHECK(soa4f3(a, - v4f_set(-1.f, 2.f, 3.f, -4.f), - v4f_set(5.f, -6.f, -7.f, 8.f), - v4f_set(9.f, -10.f, 1.f, -2.f)), a); - CHECK(soa4f3_minus(b, a), b); - CHECK_F3(b, 1.f,-2.f,-3.f, 4.f,-5.f, 6.f, 7.f,-8.f,-9.f, 10.f,-1.f, 2.f); - - CHECK(soa4f3_addf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F3(dst, 0.f, 4.f, 3.f,-1.f, 6.f,-4.f,-7.f, 11.f, 10.f,-8.f, 1.f, 1.f); - CHECK(soa4f3_add(dst, a, 
b), dst); - CHECK_F3(dst, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); - CHECK(soa4f3_subf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F3(dst,-2.f, 0.f, 3.f,-7.f, 4.f,-8.f,-7.f, 5.f, 8.f,-12.f, 1.f,-5.f); - CHECK(soa4f3_sub(dst, a, b), dst); - CHECK_F3(dst,-2.f, 4.f, 6.f,-8.f, 10.f,-12.f,-14.f, 16.f, 18.f,-20.f, 2.f,-4.f); - CHECK(soa4f3_mulf(dst, a, v4f_set(2.f, 3.f, 0.f, -1.f)), dst); - CHECK_F3(dst,-2.f, 6.f, 0.f, 4.f, 10.f,-18.f, 0.f,-8.f, 18.f,-30.f, 0.f, 2.f); - CHECK(soa4f3_mul(dst, a, b), dst); - CHECK_F3(dst,-1.f,-4.f,-9.f,-16.f,-25.f,-36.f,-49.f,-64.f,-81.f,-100.f,-1.f,-4.f); - CHECK(soa4f3_divf(dst, a, v4f_set(2.f, 0.5f, 1.f, 4.f)), dst); - CHECK_F3(dst,-0.5f, 4.f, 3.f,-1.f, 2.5f,-12.f,-7.f, 2.f, 4.5f,-20.f, 1.f,-0.5f); - CHECK(soa4f3_div(dst, a, b), dst); - CHECK_F3(dst,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f,-1.f); - - soa4f3(a, v4f_set1(0.f), v4f_set1(1.f), v4f_set1(2.f)); - soa4f3(b, v4f_set1(1.f), v4f_set1(2.f), v4f_set1(-1.f)); - CHECK(soa4f3_lerp(dst, a, b, v4f_set1(0.5f)), dst); - CHECK_F3(dst, - 0.5f, 0.5f, 0.5f, 0.5f, - 1.5f, 1.5f, 1.5f, 1.5f, - 0.5f, 0.5f, 0.5f, 0.5f); - CHECK(soa4f3(a, - v4f_set(-1.f, 2.f, 3.f, -4.f), - v4f_set(5.f, -6.f, -7.f, 8.f), - v4f_set(9.f, -10.f, 1.f, -2.f)), a); - CHECK(soa4f3_minus(b, a), b); - CHECK(soa4f3_lerp(dst, a, b, v4f_set(-0.5f, 1.f, 0.5f, 4.f)), dst); - CHECK_F3(dst, -1.f, -2.f, 0.f, 4.f, 5.f, 6.f, 0.f, -8.f, 9.f, 10.f, 0.f, 2.f); - - f = soa4f3_sum(b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-13.f, 14.f, 3.f, -2.f)), V4TRUE); - f = soa4f3_dot(a, b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-107.f, -140.f, -59.f, -84.f)), V4TRUE); - f = soa4f3_len(a); - CHECK_V4MASK - (v4f_eq_eps(f, v4f_sqrt(soa4f3_dot(a, a)), v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f3_is_normalized(b), V4FALSE); - f = soa4f3_normalize(dst, b); - CHECK_V4MASK(v4f_eq_eps(f, soa4f3_len(b), v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(soa4f3_is_normalized(b), V4FALSE); - CHECK_V4MASK(soa4f3_is_normalized(dst), 
V4TRUE); - soa4f3_divf(b, b, f); - CHECK_V4MASK(v4f_eq_eps(dst[0], b[0], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[1], b[1], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[2], b[2], v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f3_eq(a, a), V4TRUE); - CHECK_V4MASK(soa4f3_eq(a, b), V4FALSE); - soa4f3(a, - v4f_set(-1.f, 2.f, 3.f,-4.f), - v4f_set(5.f,-6.f,-7.f, 8.f), - v4f_set(9.f,-10.f,1.f, -2.f)); - soa4f3(b, - v4f_set(-1.f, 2.f, 5.f,-4.001f), - v4f_set(5.f,-6.03f,7.f, 8.0), - v4f_set(9.f,-10.f,0.f, -2.001f)); - CHECK_V4MASK__(soa4f3_eq(a, b), ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f3_eq_eps(a, b, v4f_set1(1.e-6f)), ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f3_eq_eps(a, b, v4f_set(0.f,0.f,0.f,1.e-6f)),~0, 0, 0, 0); - CHECK_V4MASK__(soa4f3_eq_eps(a, b, v4f_set(0.f,0.f,0.f,1.e-2f)),~0, 0, 0,~0); - CHECK_V4MASK__(soa4f3_eq_eps(a, b, v4f_set(0.f,1.e-2f,0.f,1.e-2f)),~0, 0, 0,~0); - CHECK_V4MASK__(soa4f3_eq_eps(a, b, v4f_set(0.f,1.e-1f,0.f,1.e-2f)),~0,~0, 0,~0); - - soa4f3(a, - v4f_set(1.f, 2.f, 3.f,-1.f), - v4f_set(-2.f, 0.f,-7.f, 0.f), - v4f_set(-1.f, 4.f, 3.f, 2.f)); - soa4f3(b, - v4f_set(3.f, 2.f, 1.f,-2.f), - v4f_set(1.f,-6.f, 0.5f, 2.f), - v4f_set(0.f, 1.f, 0.f, 3.f)); - CHECK(soa4f3_cross(dst, a, b), dst); - CHECK_F3(dst, 1.f, 24.f,-1.5f,-4.f,-3.f, 6.f, 3.f,-1.f, 7.f,-12.f, 8.5f,-2.f); - - CHECK(soa4f3_min(dst, a, b), dst); - CHECK_F3(dst, 1.f, 2.f, 1.f, -2.f,-2.f,-6.f,-7.f, 0.f,-1.f, 1.f, 0.f, 2.f); - CHECK(soa4f3_max(dst, a, b), dst); - CHECK_F3(dst, 3.f, 2.f, 3.f, -1.f, 1.f, 0.f, 0.5f, 2.f, 0.f, 4.f, 3.f, 3.f); - - soa4f3_sel(dst, b, a, v4f_mask(~0, ~0, 1, ~0)); - CHECK_F3(dst, 1.f, 2.f, 1.f, -1.f, -2.f, 0.f, 0.5f, 0.f, -1.f, 4.f, 0.f, 2.f); - - soa4f3(c, v4f_mask(~0,~0, 0,~0), v4f_mask(~0, 0, 0, 0), v4f_mask(0,~0,~0, 0)); - soa4f3_selv(dst, b, a, c); - CHECK_F3(dst, 1.f, 2.f, 1.f,-1.f,-2.f,-6.f, 0.5f, 2.f, 0.f, 4.f, 3.f, 3.f); - + test_soa4f3(); return 0; } - diff --git a/src/test_soa4f4.c b/src/test_soa4f4.c @@ -1,218 +1,27 @@ -/* Copyright 
(C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ -#include "soa4f4.h" -#include "test_soa4f_utils.h" - -#define CHECK_F4(V, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ - { \ - const v4f_T* v__ = (V); \ - CHECK_V4MASK(v4f_eq(v__[0], v4f_set((A), (B), (C), (D))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[1], v4f_set((E), (F), (G), (H))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[2], v4f_set((I), (J), (K), (L))), V4TRUE); \ - CHECK_V4MASK(v4f_eq(v__[3], v4f_set((M), (N), (O), (P))), V4TRUE); \ - } (void)0 +/* Generate the test_soa4f4 function */ +#define SOA_SIMD_WIDTH 4 +#define SOA_DIMENSION 4 +#include "test_soaXfY.h" int main(int argc, char** argv) { - v4f_T a[4], b[4], c[4], dst[4], f; (void)argc, (void)argv; - - CHECK(soa4f4_set(a, soa4f4_splat(c, v4f_set1(-1.f))), a); - CHECK_V4MASK(v4f_eq(a[0], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[2], v4f_set1(-1.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[3], v4f_set1(-1.f)), V4TRUE); - CHECK(soa4f4(c, - v4f_set(0.f, 1.f, 2.f, 3.f), - v4f_set(5.f, 6.f, 7.f, 8.f), - 
v4f_set(9.f, 10.f, 11.f, 12.f), - v4f_set(13.f, 14.f, 15.f, 16.f)), c); - CHECK(soa4f4_set(a, c), a); - CHECK_V4MASK(v4f_eq(c[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[2], v4f_set(9.f, 10.f, 11.f, 12.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(c[3], v4f_set(13.f, 14.f, 15.f, 16.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[0], v4f_set(0.f, 1.f, 2.f, 3.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[1], v4f_set(5.f, 6.f, 7.f, 8.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[2], v4f_set(9.f, 10.f, 11.f, 12.f)), V4TRUE); - CHECK_V4MASK(v4f_eq(a[3], v4f_set(13.f, 14.f, 15.f, 16.f)), V4TRUE); - - CHECK(soa4f4(a, - v4f_set(-1.f, 2.f, 3.f, -4.f), - v4f_set(5.f, -6.f, -7.f, 8.f), - v4f_set(9.f, -10.f, 1.f, -2.f), - v4f_set(5.f, -3.f, -7.f, 1.f)), a); - CHECK(soa4f4_minus(b, a), b); - CHECK_F4(b, - 1.f, -2.f, -3.f, 4.f, - -5.f, 6.f, 7.f, -8.f, - -9.f, 10.f, -1.f, 2.f, - -5.f, 3.f, 7.f, -1.f); - - CHECK(soa4f4_addf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F4(dst, - 0.f, 4.f, 3.f, -1.f, - 6.f, -4.f, -7.f, 11.f, - 10.f, -8.f, 1.f, 1.f, - 6.f, -1.f, -7.f, 4.f); - CHECK(soa4f4_add(dst, a, b), dst); - CHECK_F4(dst, - 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f); - - CHECK(soa4f4_subf(dst, a, v4f_set(1.f, 2.f, 0.f, 3.f)), dst); - CHECK_F4(dst, - -2.f, 0.f, 3.f, -7.f, - 4.f, -8.f, -7.f, 5.f, - 8.f,-12.f, 1.f,-5.f, - 4.f, -5.f, -7.f, -2.f); - CHECK(soa4f4_sub(dst, a, b), dst); - CHECK_F4(dst, - -2.f, 4.f, 6.f, -8.f, - 10.f, -12.f, -14.f, 16.f, - 18.f, -20.f, 2.f, -4.f, - 10.f, -6.f, -14.f, 2.f); - - CHECK(soa4f4_mulf(dst, a, v4f_set(2.f, 3.f, 0.f, -1.f)), dst); - CHECK_F4(dst, - -2.f, 6.f, 0.f, 4.f, - 10.f, -18.f, 0.f, -8.f, - 18.f, -30.f, 0.f, 2.f, - 10.f, -9.f, 0.f, -1.f); - CHECK(soa4f4_mul(dst, a, b), dst); - CHECK_F4(dst, - -1.f, -4.f, -9.f, -16.f, - -25.f, -36.f, -49.f, -64.f, - -81.f, -100.f, -1.f, -4.f, - -25.f, -9.f, -49.f, -1.f); - - CHECK(soa4f4_divf(dst, a, 
v4f_set(2.f, 0.5f, 1.f, 4.f)), dst); - CHECK_F4(dst, - -0.5f, 4.f, 3.f, -1.f, - 2.5f, -12.f, -7.f, 2.f, - 4.5f, -20.f, 1.f, -0.5f, - 2.5f, -6.f, -7.f, 0.25f); - CHECK(soa4f4_div(dst, a, b), dst); - CHECK_F4(dst, - -1.f, -1.f, -1.f, -1.f, - -1.f, -1.f, -1.f, -1.f, - -1.f, -1.f, -1.f, -1.f, - -1.f, -1.f, -1.f, -1.f); - - CHECK(soa4f4(a, - v4f_set(-1.f, 2.f, 3.f, -4.f), - v4f_set(5.f, -6.f, -7.f, 8.f), - v4f_set(9.f, -10.f, 1.f, -2.f), - v4f_set(5.f, -3.f, -7.f, 1.f)), a); - CHECK(soa4f4_minus(b, a), b); - CHECK(soa4f4_lerp(dst, a, b, v4f_set(-0.5f, 1.f, 0.5f, 4.f)), dst); - CHECK_F4(dst, - -1.f, -2.f, 0.f, 4.f, - 5.f, 6.f, 0.f, -8.f, - 9.f, 10.f, 0.f, 2.f, - 5.f, 3.f, 0.f, -1.f); - - f = soa4f4_sum(b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-18.f, 17.f, 10.f, -3.f)), V4TRUE); - f = soa4f4_dot(a, b); - CHECK_V4MASK(v4f_eq(f, v4f_set(-132.f, -149.f, -108.f, -85.f)), V4TRUE); - f = soa4f4_len(a); - CHECK_V4MASK - (v4f_eq_eps(f, v4f_sqrt(soa4f4_dot(a, a)), v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f4_is_normalized(b), V4FALSE); - f = soa4f4_normalize(dst, b); - CHECK_V4MASK(v4f_eq_eps(f, soa4f4_len(b), v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(soa4f4_is_normalized(b), V4FALSE); - CHECK_V4MASK(soa4f4_is_normalized(dst), V4TRUE); - soa4f4_divf(b, b, f); - CHECK_V4MASK(v4f_eq_eps(dst[0], b[0], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[1], b[1], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[2], b[2], v4f_set1(1.e-6f)), V4TRUE); - CHECK_V4MASK(v4f_eq_eps(dst[3], b[3], v4f_set1(1.e-6f)), V4TRUE); - - CHECK_V4MASK(soa4f4_eq(a, a), V4TRUE); - CHECK_V4MASK(soa4f4_eq(a, b), V4FALSE); - soa4f4(a, - v4f_set(-1.f, 2.f, 3.f, -4.f), - v4f_set(5.f, -6.f, -7.f, 8.f), - v4f_set(9.f, -10.f, 1.f, -2.f), - v4f_set(1.f, -1.f, 1.f, -2.f)); - soa4f4(b, - v4f_set(-1.f, 2.f, 3.f,-4.001f), - v4f_set(5.f,-6.03f,-7.f, 8.0), - v4f_set(9.f,-10.f,1.f, -2.001f), - v4f_set(1.f, -1.f, 1.0005f, -2.f)); - CHECK_V4MASK__(soa4f4_eq(a, b), ~0, 0, 0, 0); - 
CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set1(1.e-6f)), ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set(0.f, 0.f, 0.f, 1.e-6f)), - ~0, 0, 0, 0); - CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set(0.f, 0.f, 0.f, 1.e-2f)), - ~0, 0, 0,~0); - CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set(0.f, 1.e-2f, 0.f, 1.e-2f)), - ~0, 0, 0,~0); - CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set(0.f, 1.e-1f, 0.f, 1.e-2f)), - ~0,~0, 0,~0); - CHECK_V4MASK__(soa4f4_eq_eps(a, b, v4f_set(0.f, 1.e-1f, 1.e-3f, 1.e-2f)), - ~0,~0,~0,~0); - - soa4f4(a, - v4f_set(1.f, 2.f, 3.f, -1.f), - v4f_set(-2.f, 0.f, -7.f, 0.f), - v4f_set(-1.f, 4.f, 3.f, 2.f), - v4f_set(-5.f, 7.f, 0.5f, -1.f)); - soa4f4(b, - v4f_set(3.f, 2.f, 1.f,-2.f), - v4f_set(1.f,-6.f, 0.5f, 2.f), - v4f_set(0.f, 1.f, 0.f, 3.f), - v4f_set(1.f,-1.f, 0.f, 0.f)); - CHECK(soa4f4_min(dst, a, b), dst); - CHECK_F4(dst, - 1.f, 2.f, 1.f, -2.f, - -2.f, -6.f, -7.f, 0.f, - -1.f, 1.f, 0.f, 2.f, - -5.f, -1.f, 0.f, -1.f); - CHECK(soa4f4_max(dst, a, b), dst); - CHECK_F4(dst, - 3.f, 2.f, 3.f, -1.f, - 1.f, 0.f, 0.5f, 2.f, - 0.f, 4.f, 3.f, 3.f, - 1.f, 7.f, 0.5f, 0.f); - - soa4f4_sel(dst, b, a, v4f_mask(~0, ~0, 1, ~0)); - CHECK_F4(dst, - 1.f, 2.f, 1.f, -1.f, - -2.f, 0.f, 0.5f, 0.f, - -1.f, 4.f, 0.f, 2.f, - -5.f, 7.f, 0.f, -1.f); - - soa4f4(c, - v4f_mask(~0,~0, 0,~0), - v4f_mask(~0, 0, 0, 0), - v4f_mask( 0,~0,~0, 0), - v4f_mask(~0,~0, 0, 0)); - soa4f4_selv(dst, b, a, c); - CHECK_F4(dst, - 1.f, 2.f, 1.f, -1.f, - -2.f, -6.f, 0.5f, 2.f, - 0.f, 4.f, 3.f, 3.f, - -5.f, 7.f, 0.f, 0.f); - + test_soa4f4(); return 0; } diff --git a/src/test_soa4f_utils.h b/src/test_soa4f_utils.h @@ -1,32 +0,0 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) - * - * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * The RSIMD library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ - -#ifndef TEST_SOA4F_UTILS_H -#define TEST_SOA4F_UTILS_H - -#define V4TRUE ~0, ~0, ~0, ~0 -#define V4FALSE 0, 0, 0, 0 -#define CHECK_V4MASK__(Mask, A, B, C, D) \ - { \ - const v4f_T mask__ = (Mask); \ - CHECK(v4f_mask_x(mask__), (A)); \ - CHECK(v4f_mask_y(mask__), (B)); \ - CHECK(v4f_mask_z(mask__), (C)); \ - CHECK(v4f_mask_w(mask__), (D)); \ - } (void)0 -#define CHECK_V4MASK(Mask, Vec) CHECK_V4MASK__(Mask, Vec) - -#endif /* TEST_SOA4F_UTILS_H */ - diff --git a/src/test_soa8f2.c b/src/test_soa8f2.c @@ -0,0 +1,28 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +/* Instantiate test_soa8f2() from the generic template: SIMD width 8, dimension 2 */ +#define SOA_SIMD_WIDTH 8 +#define SOA_DIMENSION 2 +#include "test_soaXfY.h" + +int +main(int argc, char** argv) +{ + (void)argc, (void)argv; + test_soa8f2(); + return 0; +} + diff --git a/src/test_soa8f3.c b/src/test_soa8f3.c @@ -0,0 +1,28 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +/* Instantiate test_soa8f3() from the generic template: SIMD width 8, dimension 3 */ +#define SOA_SIMD_WIDTH 8 +#define SOA_DIMENSION 3 +#include "test_soaXfY.h" + +int +main(int argc, char** argv) +{ + (void)argc, (void)argv; + test_soa8f3(); + return 0; +} + diff --git a/src/test_soa8f4.c b/src/test_soa8f4.c @@ -0,0 +1,28 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +/* Instantiate test_soa8f4() from the generic template: SIMD width 8, dimension 4 */ +#define SOA_SIMD_WIDTH 8 +#define SOA_DIMENSION 4 +#include "test_soaXfY.h" + +int +main(int argc, char** argv) +{ + (void)argc, (void)argv; + test_soa8f4(); + return 0; +} + diff --git a/src/test_soaXfY.h b/src/test_soaXfY.h @@ -0,0 +1,262 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include "rsimd.h" +#include <rsys/rsys.h> + +/* Check that the template parameters are defined and valid */ +#ifndef SOA_SIMD_WIDTH + #error "Missing the SOA_SIMD_WIDTH macro" +#endif +#if SOA_SIMD_WIDTH != 4 && SOA_SIMD_WIDTH != 8 + #error "Invalid value for the SOA_SIMD_WIDTH macro" +#endif +#ifndef SOA_DIMENSION + #error "Missing the SOA_DIMENSION macro" +#endif +#if SOA_DIMENSION < 2 || SOA_DIMENSION > 4 + #error "Invalid value for the SOA_DIMENSION macro" +#endif + +/* Define macros generic to the SOA_SIMD_WIDTH parameter. VEC and MASK always take 8 lane values; the 4-wide variants keep the first 4 */ +#if SOA_SIMD_WIDTH == 4 + #define soaX soa4 + #define vXf(Func) CONCAT(v4f_, Func) + #define vXf_T v4f_T + #define VEC(A, B, C, D, E, F, G, H) v4f_set(A, B, C, D) + #define MASK(A, B, C, D, E, F, G, H) v4f_mask(A, B, C, D) + #define CHKVX(V0, V1) { /* Check that each lane of V0 holds the same mask value as the matching lane of V1 */ \ + const v4f_T v0__ = (V0); \ + const v4f_T v1__ = (V1); \ + CHK(v4f_mask_x(v0__) == v4f_mask_x(v1__)); \ + CHK(v4f_mask_y(v0__) == v4f_mask_y(v1__)); \ + CHK(v4f_mask_z(v0__) == v4f_mask_z(v1__)); \ + CHK(v4f_mask_w(v0__) == v4f_mask_w(v1__)); \ + } (void)0 +#elif SOA_SIMD_WIDTH == 8 + #define soaX soa8 + #define vXf(Func) CONCAT(v8f_, Func) + #define vXf_T v8f_T + #define VEC(A, B, C, D, E, F, G, H) v8f_set(A, B, C, D, E, F, G, H) + #define MASK(A, B, C, D, E, F, G, H) v8f_mask(A, B, C, D, E, F, G, H) + #define CHKVX(V0, V1) { /* Same lane-wise check, done on the abcd and efgh halves of the 8-wide vectors */ \ + const v8f_T v0__ = (V0); \ + const v8f_T v1__ = (V1); \ + CHK(v4f_mask_x(v8f_abcd(v0__)) == v4f_mask_x(v8f_abcd(v1__))); \ + CHK(v4f_mask_y(v8f_abcd(v0__)) == v4f_mask_y(v8f_abcd(v1__))); \ + CHK(v4f_mask_z(v8f_abcd(v0__)) == v4f_mask_z(v8f_abcd(v1__))); \ + CHK(v4f_mask_w(v8f_abcd(v0__)) == v4f_mask_w(v8f_abcd(v1__))); \ + CHK(v4f_mask_x(v8f_efgh(v0__)) == v4f_mask_x(v8f_efgh(v1__))); \ + CHK(v4f_mask_y(v8f_efgh(v0__)) == v4f_mask_y(v8f_efgh(v1__))); \ + CHK(v4f_mask_z(v8f_efgh(v0__)) == v4f_mask_z(v8f_efgh(v1__))); \ + CHK(v4f_mask_w(v8f_efgh(v0__)) == v4f_mask_w(v8f_efgh(v1__))); \ + } (void)0 +#endif + +/* Define macros generic to the SOA_DIMENSION parameter */ +#if SOA_DIMENSION == 2 + #define 
soaXfY(Func) CONCAT(CONCAT(soaX, f2_), Func) + #define SOA_VEC(Dst, X, Y, Z, W) CONCAT(soaX, f2)(Dst, X, Y) +#elif SOA_DIMENSION == 3 + #define soaXfY(Func) CONCAT(CONCAT(soaX, f3_), Func) + #define SOA_VEC(Dst, X, Y, Z, W) CONCAT(soaX, f3)(Dst, X, Y, Z) +#elif SOA_DIMENSION == 4 + #define soaXfY(Func) CONCAT(CONCAT(soaX, f4_), Func) + #define SOA_VEC(Dst, X, Y, Z, W) CONCAT(soaX, f4)(Dst, X, Y, Z, W) +#endif + +/* Include the header of the tested soa<WIDTH>f<DIMENSION> API */ +#if SOA_SIMD_WIDTH == 4 + #if SOA_DIMENSION == 2 + #include "soa4f2.h" + #elif SOA_DIMENSION == 3 + #include "soa4f3.h" + #elif SOA_DIMENSION == 4 + #include "soa4f4.h" + #endif +#else + #if SOA_DIMENSION == 2 + #include "soa8f2.h" + #elif SOA_DIMENSION == 3 + #include "soa8f3.h" + #elif SOA_DIMENSION == 4 + #include "soa8f4.h" + #endif +#endif + +/* Define the all-true and all-false mask constants */ +#define VXTRUE MASK(~0,~0,~0,~0,~0,~0,~0,~0) +#define VXFALSE MASK(0,0,0,0,0,0,0,0) + +static void +CONCAT(CONCAT(CONCAT(test_, soaX), f), SOA_DIMENSION)(void) /* i.e. test_soa<WIDTH>f<DIMENSION> */ +{ + vXf_T a[SOA_DIMENSION], b[SOA_DIMENSION], c[SOA_DIMENSION]; + vXf_T v[4], f, tmp, mask; + int i; + + v[0] = VEC(.5f, -1.f, -2.f, 3.f, -4.f, 5.f , 6.f , -7.f); + v[1] = VEC(-8.f, 9.f, -10.f, 11.f, 12.f, -13.f, -14.f, -15.f); + v[2] = VEC(16.f, -17.f, 18.f, -19.f, 20.f, 21.f, 22.f, -23.f); + v[3] = VEC(0.6f, -0.1f, 0.8f, -0.9f, 0.02f, 0.1f,-0.22f, -0.3f); + f = VEC(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); + + /* Setters */ + soaXfY(splat)(a, vXf(set1)(-1.f)); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(a[i], vXf(set1)(-1.f)), VXTRUE); + } + CHK(soaXfY(set)(b, a) == b); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(b[i], a[i]), VXTRUE); + } + CHK(SOA_VEC(a, v[0], v[1], v[2], v[3]) == a); + CHK(soaXfY(set)(b, a) == b); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(a[i], v[i]), VXTRUE); + CHKVX(vXf(eq)(b[i], v[i]), VXTRUE); + } + + /* Unary operator */ + CHK(soaXfY(minus)(b, a) == b); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(a[i], v[i]), VXTRUE); + 
CHKVX(vXf(eq)(b[i], vXf(minus)(v[i])), VXTRUE); + } + + /* Regular binary operators; from here b == -a */ + CHK(soaXfY(addf)(c, a, f) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(add)(a[i], f)), VXTRUE); + CHK(soaXfY(subf)(c, a, f) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(sub)(a[i], f)), VXTRUE); + CHK(soaXfY(mulf)(c, a, f) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(mul)(a[i], f)), VXTRUE); + CHK(soaXfY(divf)(c, a, f) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(div)(a[i], f)), VXTRUE); + CHK(soaXfY(add)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(zero)()), VXTRUE); + CHK(soaXfY(sub)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(sub)(a[i], b[i])), VXTRUE); + CHK(soaXfY(mul)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(mul)(a[i], b[i])), VXTRUE); + CHK(soaXfY(div)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) CHKVX(vXf(eq)(c[i], vXf(div)(a[i], b[i])), VXTRUE); + + /* Linear interpolation */ + CHK(soaXfY(lerp)(c, a, b, f) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(c[i], vXf(lerp)(a[i], b[i], f)), VXTRUE); + } + + /* Sum operator */ + f = soaXfY(sum)(a); + tmp = vXf(zero)(); + FOR_EACH(i, 0, SOA_DIMENSION) { + tmp = vXf(add)(tmp, a[i]); + } + CHKVX(vXf(eq)(f, tmp), VXTRUE); + + /* Dot operator */ + f = soaXfY(dot)(a, b); + tmp = vXf(zero)(); + FOR_EACH(i, 0, SOA_DIMENSION) { + tmp = vXf(add)(tmp, vXf(mul)(a[i], b[i])); + } + CHKVX(vXf(eq)(f, tmp), VXTRUE); + + /* Vector normalization functions */ + CHKVX(soaXfY(is_normalized)(a), VXFALSE); + f = soaXfY(normalize)(c, a); + CHKVX(vXf(eq)(soaXfY(len)(a), vXf(sqrt)(soaXfY(dot)(a, a))), VXTRUE); + tmp = vXf(sqrt)(soaXfY(dot)(a, a)); + CHKVX(vXf(eq_eps)(f, tmp, vXf(set1)(1.e-4f)), VXTRUE); + CHKVX(soaXfY(is_normalized)(c), VXTRUE); + CHKVX(vXf(eq_eps)(soaXfY(len)(c), vXf(set1)(1.f), vXf(set1)(1.e-4f)), VXTRUE); + soaXfY(divf)(b, a, f); + FOR_EACH(i, 
0, SOA_DIMENSION) { + CHKVX(vXf(eq_eps)(b[i], c[i], vXf(set1)(1.e-4f)), VXTRUE); + } + + /* Comparators */ + CHKVX(soaXfY(eq)(a, a), VXTRUE); + CHKVX(soaXfY(eq)(a, b), VXFALSE); + soaXfY(addf)(b, a, vXf(set1)(1.e-4f)); + CHKVX(soaXfY(eq)(a, b), VXFALSE); + CHKVX(soaXfY(eq_eps)(a, b, vXf(set1)(1.e-3f)), VXTRUE); + tmp = VEC(0, 0, 1.e-3f, 0, 0, 0, 1.e-3f, 1.e-3f); + mask = MASK(0, 0, ~0, 0, 0, 0, ~0, ~0); + CHKVX(soaXfY(eq_eps)(a, b, tmp), mask); + + /* Min/Max */ + CHK(soaXfY(min)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(c[i], vXf(min)(a[i], b[i])), VXTRUE); + } + CHK(soaXfY(max)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(c[i], vXf(max)(a[i], b[i])), VXTRUE); + } + + /* Select: v is reused to store the selection masks */ + v[0] = MASK(0,0,~0,~0,0,~0,~0,0); + v[1] = MASK(0,~0,~0,0,0,0,0,~0); + v[2] = MASK(0, 0, 0,0,~0,~0,0, 0); + v[3] = MASK(~0,~0,~0,0,~0,0,0,~0); + CHK(soaXfY(sel)(c, b, a, v[0]) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(c[i], vXf(sel)(b[i], a[i], v[0])), VXTRUE); + } + CHK(soaXfY(selv)(c, b, a, v) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq)(c[i], vXf(sel)(b[i], a[i], v[i])), VXTRUE); + } + + /* Cross product; tested for dimensions 2 and 3 only */ +#if SOA_DIMENSION == 2 + v[0] = vXf(mul)(a[0], b[1]); + v[1] = vXf(mul)(a[1], b[0]); + tmp = vXf(sub)(v[0], v[1]); + f = soaXfY(cross)(a, b); + CHKVX(vXf(eq_eps)(f, tmp, vXf(set1)(1.e-6f)), VXTRUE); +#elif SOA_DIMENSION == 3 + v[0] = vXf(sub)(vXf(mul)(a[1], b[2]), vXf(mul)(a[2], b[1])); + v[1] = vXf(sub)(vXf(mul)(a[2], b[0]), vXf(mul)(a[0], b[2])); + v[2] = vXf(sub)(vXf(mul)(a[0], b[1]), vXf(mul)(a[1], b[0])); + CHK(soaXfY(cross)(c, a, b) == c); + FOR_EACH(i, 0, SOA_DIMENSION) { + CHKVX(vXf(eq_eps)(c[i], v[i], vXf(set1)(1.e-6f)), VXTRUE); + } +#endif +} + +/* Generic parameters */ +#undef SOA_SIMD_WIDTH +#undef SOA_DIMENSION + +/* Macros generic to the SOA_SIMD_WIDTH parameter */ +#undef soaX +#undef vXf +#undef vXf_T +#undef VEC +#undef MASK +#undef CHKVX + +/* Macros generic to the SOA_DIMENSION parameter 
*/ +#undef soaXfY +#undef SOA_VEC + +/* Constants */ +#undef VXTRUE +#undef VXFALSE diff --git a/src/test_v4f.c b/src/test_v4f.c @@ -1,19 +1,20 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ #include "rsimd.h" +#include "math.h" int main(int argc, char** argv) @@ -25,522 +26,474 @@ main(int argc, char** argv) (void)argc, (void)argv; i = v4f_loadu(tmp+1); - CHECK(v4f_x(i), 1.f); - CHECK(v4f_y(i), 2.f); - CHECK(v4f_z(i), 3.f); - CHECK(v4f_w(i), 4.f); + CHK(v4f_x(i) == 1.f); + CHK(v4f_y(i) == 2.f); + CHK(v4f_z(i) == 3.f); + CHK(v4f_w(i) == 4.f); i = v4f_loadu3(tmp); - CHECK(v4f_x(i), 0.f); - CHECK(v4f_y(i), 1.f); - CHECK(v4f_z(i), 2.f); + CHK(v4f_x(i) == 0.f); + CHK(v4f_y(i) == 1.f); + CHK(v4f_z(i) == 2.f); i = v4f_load(tmp); - CHECK(v4f_x(i), 0.f); - CHECK(v4f_y(i), 1.f); - CHECK(v4f_z(i), 2.f); - CHECK(v4f_w(i), 3.f); + CHK(v4f_x(i) == 0.f); + CHK(v4f_y(i) == 1.f); + CHK(v4f_z(i) == 2.f); + CHK(v4f_w(i) == 3.f); tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0.f; - CHECK(v4f_store(tmp, i), tmp); - CHECK(tmp[0], 0.f); - CHECK(tmp[1], 1.f); - CHECK(tmp[2], 2.f); - CHECK(tmp[3], 3.f); + CHK(v4f_store(tmp, i) == tmp); + CHK(tmp[0] == 0.f); + CHK(tmp[1] == 1.f); + CHK(tmp[2] == 2.f); + CHK(tmp[3] == 3.f); i = v4f_set(1.f, 2.f, 3.f, 4.f); - CHECK(v4f_x(i), 1.f); - CHECK(v4f_y(i), 2.f); - CHECK(v4f_z(i), 3.f); - CHECK(v4f_w(i), 4.f); + CHK(v4f_x(i) == 1.f); + CHK(v4f_y(i) == 2.f); + CHK(v4f_z(i) == 3.f); + CHK(v4f_w(i) == 4.f); i = v4f_set1(-2.f); - CHECK(v4f_x(i), -2.f); - CHECK(v4f_y(i), -2.f); - CHECK(v4f_z(i), -2.f); - CHECK(v4f_w(i), -2.f); + CHK(v4f_x(i) == -2.f); + CHK(v4f_y(i) == -2.f); + CHK(v4f_z(i) == -2.f); + CHK(v4f_w(i) == -2.f); i = v4f_zero(); - CHECK(v4f_x(i), 0.f); - CHECK(v4f_y(i), 0.f); - CHECK(v4f_z(i), 0.f); - CHECK(v4f_w(i), 0.f); + CHK(v4f_x(i) == 0.f); + CHK(v4f_y(i) == 0.f); + CHK(v4f_z(i) == 0.f); + CHK(v4f_w(i) == 0.f); i = v4f_mask(~0, 0, ~0, ~0); - cast.f[0] = v4f_x(i); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(i); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(i); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(i); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(i); CHK(cast.i[0] == 
(int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(i); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(i); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(i); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); i = v4f_mask1(8); - cast.f[0] = v4f_x(i); CHECK(cast.i[0], 8); - cast.f[1] = v4f_y(i); CHECK(cast.i[1], 8); - cast.f[2] = v4f_z(i); CHECK(cast.i[2], 8); - cast.f[3] = v4f_w(i); CHECK(cast.i[3], 8); + cast.f[0] = v4f_x(i); CHK(cast.i[0] == 8); + cast.f[1] = v4f_y(i); CHK(cast.i[1] == 8); + cast.f[2] = v4f_z(i); CHK(cast.i[2] == 8); + cast.f[3] = v4f_w(i); CHK(cast.i[3] == 8); i = v4f_true(); - cast.f[0] = v4f_x(i); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(i); CHECK(cast.i[1], (int32_t)0xFFFFFFFF); - cast.f[2] = v4f_z(i); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(i); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(i); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(i); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(i); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(i); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); i = v4f_false(); - cast.f[0] = v4f_x(i); CHECK(cast.i[0], (int32_t)0x00000000); - cast.f[1] = v4f_y(i); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(i); CHECK(cast.i[2], (int32_t)0x00000000); - cast.f[3] = v4f_w(i); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(i); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(i); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(i); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(i); CHK(cast.i[3] == (int32_t)0x00000000); i = v4f_mask(~0, 0, ~0, ~0); j = v4f_mask(0, 0, 0, ~0); k = v4f_or(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); - CHECK(v4f_mask_x(i), ~0); - 
CHECK(v4f_mask_y(i), 0); - CHECK(v4f_mask_z(i), ~0); - CHECK(v4f_mask_w(i), ~0); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); + CHK(v4f_mask_x(i) == ~0); + CHK(v4f_mask_y(i) == 0); + CHK(v4f_mask_z(i) == ~0); + CHK(v4f_mask_w(i) == ~0); k = v4f_and(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0x00000000); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0x00000000); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); k = v4f_xor(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0x00000000); i = v4f_mask(0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F); j = v4f_mask(0x01020401, 0x70605040, 0x0F1F2F3F, 0x00000000); k = v4f_andnot(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], 0x01020400); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], 0x70605040); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], 0x07162534); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], 0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == 0x01020400); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == 0x70605040); + cast.f[2] = v4f_z(k); CHK(cast.i[2] 
== 0x07162534); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == 0x00000000); - CHECK(v4f_movemask(i), 0); + CHK(v4f_movemask(i) == 0); i = v4f_mask(0x01020401, (int32_t)0x80605040, 0x7F1F2F3F, 0x00000000); - CHECK(v4f_movemask(i), 2); + CHK(v4f_movemask(i) == 2); i = v4f_set(1.f, 2.f, 3.f, 4.f); j = v4f_set(5.f, 6.f, 7.f, 8.f); k = v4f_sel(i, j, v4f_mask(~0, 0, 0, ~0)); - CHECK(v4f_x(k), 5.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 5.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 8.f); k = v4f_xayb(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 5.f); - CHECK(v4f_z(k), 2.f); - CHECK(v4f_w(k), 6.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 5.f); + CHK(v4f_z(k) == 2.f); + CHK(v4f_w(k) == 6.f); k = v4f_xyab(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 5.f); - CHECK(v4f_w(k), 6.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 5.f); + CHK(v4f_w(k) == 6.f); k = v4f_zcwd(i, j); - CHECK(v4f_x(k), 3.f); - CHECK(v4f_y(k), 7.f); - CHECK(v4f_z(k), 4.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 3.f); + CHK(v4f_y(k) == 7.f); + CHK(v4f_z(k) == 4.f); + CHK(v4f_w(k) == 8.f); k = v4f_zwcd(i, j); - CHECK(v4f_x(k), 3.f); - CHECK(v4f_y(k), 4.f); - CHECK(v4f_z(k), 7.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 3.f); + CHK(v4f_y(k) == 4.f); + CHK(v4f_z(k) == 7.f); + CHK(v4f_w(k) == 8.f); k = v4f_ayzw(i, j); - CHECK(v4f_x(k), 5.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 5.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 4.f); k = v4f_xycd(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 7.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 7.f); + CHK(v4f_w(k) == 8.f); k = v4f_ywbd(i, j); - CHECK(v4f_x(k), 2.f); - CHECK(v4f_y(k), 4.f); - CHECK(v4f_z(k), 6.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 2.f); + CHK(v4f_y(k) == 4.f); + 
CHK(v4f_z(k) == 6.f); + CHK(v4f_w(k) == 8.f); k = v4f_xbzw(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 6.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 6.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 4.f); k = v4f_xycw(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 7.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 7.f); + CHK(v4f_w(k) == 4.f); k = v4f_xyzd(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 8.f); k = v4f_048C(v4f_set1(1.f), v4f_set1(2.f), v4f_set1(3.f), v4f_set1(4.f)); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 4.f); i = v4f_set(-1.f, 2.f, -3.f, 4.f); j = v4f_minus(i); - CHECK(v4f_x(j), 1.f); - CHECK(v4f_y(j), -2.f); - CHECK(v4f_z(j), 3.f); - CHECK(v4f_w(j), -4.f); + CHK(v4f_x(j) == 1.f); + CHK(v4f_y(j) == -2.f); + CHK(v4f_z(j) == 3.f); + CHK(v4f_w(j) == -4.f); k = v4f_add(i, j); - CHECK(v4f_x(k), 0.f); - CHECK(v4f_y(k), 0.f); - CHECK(v4f_z(k), 0.f); - CHECK(v4f_w(k), 0.f); + CHK(v4f_x(k) == 0.f); + CHK(v4f_y(k) == 0.f); + CHK(v4f_z(k) == 0.f); + CHK(v4f_w(k) == 0.f); k = v4f_sub(i, j); - CHECK(v4f_x(k), -2.f); - CHECK(v4f_y(k), 4.f); - CHECK(v4f_z(k), -6.f); - CHECK(v4f_w(k), 8.f); + CHK(v4f_x(k) == -2.f); + CHK(v4f_y(k) == 4.f); + CHK(v4f_z(k) == -6.f); + CHK(v4f_w(k) == 8.f); k = v4f_mul(i, j); - CHECK(v4f_x(k), -1.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), -9.f); - CHECK(v4f_w(k), -16.f); + CHK(v4f_x(k) == -1.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == -9.f); + CHK(v4f_w(k) == -16.f); k = v4f_div(k, i); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), -2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), -4.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) 
== -2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == -4.f); k = v4f_madd(i, j, k); - CHECK(v4f_x(k), 0.f); - CHECK(v4f_y(k), -6.f); - CHECK(v4f_z(k), -6.f); - CHECK(v4f_w(k), -20.f); + CHK(v4f_x(k) == 0.f); + CHK(v4f_y(k) == -6.f); + CHK(v4f_z(k) == -6.f); + CHK(v4f_w(k) == -20.f); k = v4f_abs(i); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 4.f); i = v4f_set(4.f, 9.f, 16.f, 25.f); k = v4f_sqrt(i); - CHECK(v4f_x(k), 2.f); - CHECK(v4f_y(k), 3.f); - CHECK(v4f_z(k), 4.f); - CHECK(v4f_w(k), 5.f); + CHK(v4f_x(k) == 2.f); + CHK(v4f_y(k) == 3.f); + CHK(v4f_z(k) == 4.f); + CHK(v4f_w(k) == 5.f); k = v4f_rsqrte(i); - CHECK(eq_eps(v4f_x(k), 1.f/2.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_y(k), 1.f/3.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_z(k), 1.f/4.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_w(k), 1.f/5.f, 1.e-3f), 1); + CHK(eq_eps(v4f_x(k), 1.f/2.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_y(k), 1.f/3.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_z(k), 1.f/4.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_w(k), 1.f/5.f, 1.e-3f) == 1); k = v4f_rsqrt(i); - CHECK(eq_eps(v4f_x(k), 1.f/2.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), 1.f/3.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), 1.f/4.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), 1.f/5.f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), 1.f/2.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), 1.f/3.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), 1.f/4.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), 1.f/5.f, 1.e-6f) == 1); k = v4f_rcpe(i); - CHECK(eq_eps(v4f_x(k), 1.f/4.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_y(k), 1.f/9.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_z(k), 1.f/16.f, 1.e-3f), 1); - CHECK(eq_eps(v4f_w(k), 1.f/25.f, 1.e-3f), 1); + CHK(eq_eps(v4f_x(k), 1.f/4.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_y(k), 1.f/9.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_z(k), 1.f/16.f, 1.e-3f) == 1); + CHK(eq_eps(v4f_w(k), 1.f/25.f, 1.e-3f) == 1); k = v4f_rcp(i); - CHECK(eq_eps(v4f_x(k), 1.f/4.f, 1.e-6f), 1); - 
CHECK(eq_eps(v4f_y(k), 1.f/9.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), 1.f/16.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), 1.f/25.f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), 1.f/4.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), 1.f/9.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), 1.f/16.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), 1.f/25.f, 1.e-6f) == 1); i = v4f_set(0.f, 1.f, 2.f, 4.f); j = v4f_set(1.f, 2.f, -1.f, 1.f); k = v4f_lerp(i, j, v4f_set1(0.5f)); - CHECK(v4f_x(k), 0.5f); - CHECK(v4f_y(k), 1.5f); - CHECK(v4f_z(k), 0.5f); - CHECK(v4f_w(k), 2.5f); + CHK(v4f_x(k) == 0.5f); + CHK(v4f_y(k) == 1.5f); + CHK(v4f_z(k) == 0.5f); + CHK(v4f_w(k) == 2.5f); k = v4f_sum(j); - CHECK(v4f_x(k), 3.f); - CHECK(v4f_y(k), 3.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 3.f); + CHK(v4f_x(k) == 3.f); + CHK(v4f_y(k) == 3.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 3.f); k = v4f_dot(i, j); - CHECK(v4f_x(k), 4.f); - CHECK(v4f_y(k), 4.f); - CHECK(v4f_z(k), 4.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 4.f); + CHK(v4f_y(k) == 4.f); + CHK(v4f_z(k) == 4.f); + CHK(v4f_w(k) == 4.f); k = v4f_len(i); - CHECK(eq_eps(v4f_x(k), (float)sqrt(21.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)sqrt(21.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)sqrt(21.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)sqrt(21.0), 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), (float)sqrt(21.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), (float)sqrt(21.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), (float)sqrt(21.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), (float)sqrt(21.0), 1.e-6f) == 1); i = v4f_set(0.f, 4.f, 2.f, 3.f); k = v4f_normalize(i); - CHECK(eq_eps(v4f_x(k), 0.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), 0.742781353f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), 0.371390676f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), 0.557086014f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), 0.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), 0.742781353f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), 0.371390676f, 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), 0.557086014f, 1.e-6f) == 1); i = v4f_set(1.f, 4.f, 
2.f, 3.f); k = v4f_sum2(i); - CHECK(v4f_x(k), 5.f); - CHECK(v4f_y(k), 5.f); - CHECK(v4f_z(k), 5.f); - CHECK(v4f_w(k), 5.f); + CHK(v4f_x(k) == 5.f); + CHK(v4f_y(k) == 5.f); + CHK(v4f_z(k) == 5.f); + CHK(v4f_w(k) == 5.f); j = v4f_set(2.f, 3.f, 5.f, 1.f); k = v4f_dot2(i, j); - CHECK(v4f_x(k), 14.f); - CHECK(v4f_y(k), 14.f); - CHECK(v4f_z(k), 14.f); - CHECK(v4f_w(k), 14.f); + CHK(v4f_x(k) == 14.f); + CHK(v4f_y(k) == 14.f); + CHK(v4f_z(k) == 14.f); + CHK(v4f_w(k) == 14.f); k = v4f_len2(i); - CHECK(eq_eps(v4f_x(k), (float)sqrt(17.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)sqrt(17.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)sqrt(17.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)sqrt(17.0), 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), (float)sqrt(17.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), (float)sqrt(17.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), (float)sqrt(17.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), (float)sqrt(17.0), 1.e-6f) == 1); i = v4f_set(1.f, -2.f, 2.f, 5.f); j = v4f_set(3.f, 1.f, 1.f, 5.f); k = v4f_cross2(i, j); - CHECK(v4f_x(k), 7.f); - CHECK(v4f_y(k), 7.f); - CHECK(v4f_z(k), 7.f); - CHECK(v4f_w(k), 7.f); + CHK(v4f_x(k) == 7.f); + CHK(v4f_y(k) == 7.f); + CHK(v4f_z(k) == 7.f); + CHK(v4f_w(k) == 7.f); k = v4f_cross2(j, i); - CHECK(v4f_x(k), -7.f); - CHECK(v4f_y(k), -7.f); - CHECK(v4f_z(k), -7.f); - CHECK(v4f_w(k), -7.f); + CHK(v4f_x(k) == -7.f); + CHK(v4f_y(k) == -7.f); + CHK(v4f_z(k) == -7.f); + CHK(v4f_w(k) == -7.f); i = v4f_set(0.f, 4.f, 5.f, 7.f); k = v4f_normalize2(i); - CHECK(eq_eps(v4f_x(k), 0.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), 1.f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), 0.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), 1.f, 1.e-6f) == 1); k = v4f_sum3(i); - CHECK(v4f_x(k), 9.f); - CHECK(v4f_y(k), 9.f); - CHECK(v4f_z(k), 9.f); - CHECK(v4f_w(k), 9.f); + CHK(v4f_x(k) == 9.f); + CHK(v4f_y(k) == 9.f); + CHK(v4f_z(k) == 9.f); + CHK(v4f_w(k) == 9.f); i = v4f_set(2.f, 3.f, 2.f, 4.f); j = v4f_set(0.f, 4.f, 2.f, 19.f); k = v4f_dot3(i, j); - CHECK(v4f_x(k), 16.f); - 
CHECK(v4f_y(k), 16.f); - CHECK(v4f_z(k), 16.f); - CHECK(v4f_w(k), 16.f); + CHK(v4f_x(k) == 16.f); + CHK(v4f_y(k) == 16.f); + CHK(v4f_z(k) == 16.f); + CHK(v4f_w(k) == 16.f); k = v4f_len3(j); - CHECK(eq_eps(v4f_x(k), (float)sqrt(20.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)sqrt(20.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)sqrt(20.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)sqrt(20.0), 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), (float)sqrt(20.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), (float)sqrt(20.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), (float)sqrt(20.0), 1.e-6f) == 1); + CHK(eq_eps(v4f_w(k), (float)sqrt(20.0), 1.e-6f) == 1); k = v4f_normalize3(j); - CHECK(eq_eps(v4f_x(k), 0.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), 0.8944271910f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), 0.4472135995f, 1.e-6f), 1); + CHK(eq_eps(v4f_x(k), 0.f, 1.e-6f) == 1); + CHK(eq_eps(v4f_y(k), 0.8944271910f, 1.e-6f) == 1); + CHK(eq_eps(v4f_z(k), 0.4472135995f, 1.e-6f) == 1); i = v4f_set(1.f, -2.f, 2.f, 4.f); j = v4f_set(3.f, 1.f, -1.5f, 2.f); k = v4f_cross3(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 7.5f); - CHECK(v4f_z(k), 7.f); - - i = v4f_set((float)PI/2.f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f); - k = v4f_cos(i); - CHECK(eq_eps(v4f_x(k), (float)cos(PI/2.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)cos(PI/3.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)cos(PI/4.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)cos(PI/6.0), 1.e-6f), 1); - - k = v4f_sin(i); - CHECK(eq_eps(v4f_x(k), (float)sin(PI/2.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)sin(PI/3.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)sin(PI/4.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)sin(PI/6.0), 1.e-6f), 1); - - v4f_sincos(i, &k, &j); - CHECK(eq_eps(v4f_x(k), (float)sin(PI/2.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)sin(PI/3.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)sin(PI/4.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)sin(PI/6.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_x(j), 
(float)cos(PI/2.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(j), (float)cos(PI/3.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(j), (float)cos(PI/4.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(j), (float)cos(PI/6.0), 1.e-6f), 1); - - i = v4f_set((float)PI/8.f, (float)PI/3.f, (float)PI/4.f, (float)PI/6.f); - k = v4f_tan(i); - CHECK(eq_eps(v4f_x(k), (float)tan(PI/8.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), (float)tan(PI/3.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), (float)tan(PI/4.0), 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), (float)tan(PI/6.0), 1.e-6f), 1); - - k = v4f_acos(v4f_cos(i)); - CHECK(eq_eps(v4f_x(k), PI/8.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), PI/3.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), PI/4.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), PI/6.f, 1.e-6f), 1); - - k = v4f_asin(v4f_sin(i)); - CHECK(eq_eps(v4f_x(k), PI/8.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), PI/3.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), PI/4.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), PI/6.f, 1.e-6f), 1); - - k = v4f_atan(v4f_tan(i)); - CHECK(eq_eps(v4f_x(k), PI/8.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_y(k), PI/3.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_z(k), PI/4.f, 1.e-6f), 1); - CHECK(eq_eps(v4f_w(k), PI/6.f, 1.e-6f), 1); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 7.5f); + CHK(v4f_z(k) == 7.f); i = v4f_set(1.f, 2.f, 3.f, 4.f); j = v4f_set(-2.f, -4.f, 3.f, 6.f); k = v4f_eq(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0x00000000); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0x00000000); k = v4f_neq(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0xFFFFFFFF); - cast.f[2] = 
v4f_z(k); CHECK(cast.i[2], (int32_t)0x00000000); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); k = v4f_gt(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0xFFFFFFFF); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0x00000000); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0x00000000); k = v4f_lt(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0x00000000); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0x00000000); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); k = v4f_ge(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0xFFFFFFFF); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0xFFFFFFFF); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0x00000000); k = v4f_le(i, j); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], 
(int32_t)0x00000000); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0x00000000); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0xFFFFFFFF); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); i = v4f_set(1.01f, 2.01f, 3.02f, 0.02f); j = v4f_set(1.f, 2.f, 3.f, 0.f); k = v4f_set(0.f, 0.01f, 0.02f, 0.f); k = v4f_eq_eps(i, j, k); - cast.f[0] = v4f_x(k); CHECK(cast.i[0], (int32_t)0x00000000); - cast.f[1] = v4f_y(k); CHECK(cast.i[1], (int32_t)0xFFFFFFFF); - cast.f[2] = v4f_z(k); CHECK(cast.i[2], (int32_t)0xFFFFFFFF); - cast.f[3] = v4f_w(k); CHECK(cast.i[3], (int32_t)0x00000000); + cast.f[0] = v4f_x(k); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(k); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(k); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(k); CHK(cast.i[3] == (int32_t)0x00000000); i = v4f_set(1.f, 2.f, 3.f, 4.f); j = v4f_set(-2.f, -4.f, 3.f, 6.f); k = v4f_min(i, j); - CHECK(v4f_x(k), -2.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == -2.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 4.f); k = v4f_max(i, j); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 6.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 6.f); k = v4f_reduce_min(i); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 1.f); - CHECK(v4f_z(k), 1.f); - CHECK(v4f_w(k), 1.f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 1.f); + CHK(v4f_z(k) == 1.f); + CHK(v4f_w(k) == 1.f); k = v4f_reduce_min(j); - CHECK(v4f_x(k), -4.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), -4.f); - CHECK(v4f_w(k), -4.f); + CHK(v4f_x(k) == -4.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) 
== -4.f); + CHK(v4f_w(k) == -4.f); k = v4f_reduce_max(i); - CHECK(v4f_x(k), 4.f); - CHECK(v4f_y(k), 4.f); - CHECK(v4f_z(k), 4.f); - CHECK(v4f_w(k), 4.f); + CHK(v4f_x(k) == 4.f); + CHK(v4f_y(k) == 4.f); + CHK(v4f_z(k) == 4.f); + CHK(v4f_w(k) == 4.f); k = v4f_reduce_max(j); - CHECK(v4f_x(k), 6.f); - CHECK(v4f_y(k), 6.f); - CHECK(v4f_z(k), 6.f); - CHECK(v4f_w(k), 6.f); + CHK(v4f_x(k) == 6.f); + CHK(v4f_y(k) == 6.f); + CHK(v4f_z(k) == 6.f); + CHK(v4f_w(k) == 6.f); k = v4f_clamp(i, v4f_set(0.f, 0.f, -1.f, 3.f), v4f_set(1.f, 3.f, 2.f, 3.1f)); - CHECK(v4f_x(k), 1.f); - CHECK(v4f_y(k), 2.f); - CHECK(v4f_z(k), 2.f); - CHECK(v4f_w(k), 3.1f); + CHK(v4f_x(k) == 1.f); + CHK(v4f_y(k) == 2.f); + CHK(v4f_z(k) == 2.f); + CHK(v4f_w(k) == 3.1f); l = v4f_to_v4i(j); - CHECK(v4i_x(l), -2); - CHECK(v4i_y(l), -4); - CHECK(v4i_z(l), 3); - CHECK(v4i_w(l), 6); + CHK(v4i_x(l) == -2); + CHK(v4i_y(l) == -4); + CHK(v4i_z(l) == 3); + CHK(v4i_w(l) == 6); k = v4i_to_v4f(l); - CHECK(v4f_x(k), -2.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), 6.f); + CHK(v4f_x(k) == -2.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == 6.f); i = v4f_set(1.5f, 2.51f, 3.2f, 4.35f); l = v4f_to_v4i(i); - CHECK(v4i_x(l), 2); - CHECK(v4i_y(l), 3); - CHECK(v4i_z(l), 3); - CHECK(v4i_w(l), 4); + CHK(v4i_x(l) == 2); + CHK(v4i_y(l) == 3); + CHK(v4i_z(l) == 3); + CHK(v4i_w(l) == 4); l = v4f_trunk_v4i(i); - CHECK(v4i_x(l), 1); - CHECK(v4i_y(l), 2); - CHECK(v4i_z(l), 3); - CHECK(v4i_w(l), 4); + CHK(v4i_x(l) == 1); + CHK(v4i_y(l) == 2); + CHK(v4i_z(l) == 3); + CHK(v4i_w(l) == 4); cast.f[0] = 1.f; cast.f[1] = 2.f; @@ -549,121 +502,121 @@ main(int argc, char** argv) i = v4f_set(cast.f[0], cast.f[1], cast.f[2], cast.f[3]); l = v4f_rcast_v4i(i); - CHECK(v4i_x(l), cast.i[0]); - CHECK(v4i_y(l), cast.i[1]); - CHECK(v4i_z(l), cast.i[2]); - CHECK(v4i_w(l), cast.i[3]); + CHK(v4i_x(l) == cast.i[0]); + CHK(v4i_y(l) == cast.i[1]); + CHK(v4i_z(l) == cast.i[2]); + CHK(v4i_w(l) == cast.i[3]); i = 
v4i_rcast_v4f(l); - CHECK(v4f_x(i), cast.f[0]); - CHECK(v4f_y(i), cast.f[1]); - CHECK(v4f_z(i), cast.f[2]); - CHECK(v4f_w(i), cast.f[3]); + CHK(v4f_x(i) == cast.f[0]); + CHK(v4f_y(i) == cast.f[1]); + CHK(v4f_z(i) == cast.f[2]); + CHK(v4f_w(i) == cast.f[3]); k = v4f_xxxx(j); - CHECK(v4f_x(k), -2.f); - CHECK(v4f_y(k), -2.f); - CHECK(v4f_z(k), -2.f); - CHECK(v4f_w(k), -2.f); + CHK(v4f_x(k) == -2.f); + CHK(v4f_y(k) == -2.f); + CHK(v4f_z(k) == -2.f); + CHK(v4f_w(k) == -2.f); k = v4f_yyxx(j); - CHECK(v4f_x(k), -4.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), -2.f); - CHECK(v4f_w(k), -2.f); + CHK(v4f_x(k) == -4.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == -2.f); + CHK(v4f_w(k) == -2.f); k = v4f_wwxy(j); - CHECK(v4f_x(k), 6.f); - CHECK(v4f_y(k), 6.f); - CHECK(v4f_z(k), -2.f); - CHECK(v4f_w(k), -4.f); + CHK(v4f_x(k) == 6.f); + CHK(v4f_y(k) == 6.f); + CHK(v4f_z(k) == -2.f); + CHK(v4f_w(k) == -4.f); k = v4f_zyzy(j); - CHECK(v4f_x(k), 3.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), 3.f); - CHECK(v4f_w(k), -4.f); + CHK(v4f_x(k) == 3.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == 3.f); + CHK(v4f_w(k) == -4.f); k = v4f_wyyz(j); - CHECK(v4f_x(k), 6.f); - CHECK(v4f_y(k), -4.f); - CHECK(v4f_z(k), -4.f); - CHECK(v4f_w(k), 3.f); + CHK(v4f_x(k) == 6.f); + CHK(v4f_y(k) == -4.f); + CHK(v4f_z(k) == -4.f); + CHK(v4f_w(k) == 3.f); i = v4f_xyz_to_rthetaphi(v4f_set(10.f, 5.f, 3.f, 0.f)); - CHECK(eq_eps(v4f_x(i), 11.575836f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.308643f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.463647f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 11.575836f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.308643f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 0.463647f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(8.56f, 7.234f, 33.587f, 0.f)); - CHECK(eq_eps(v4f_x(i), 35.407498f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 0.322063f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.701638f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 35.407498f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 0.322063f, 1.e-5f) == 1); + 
CHK(eq_eps(v4f_z(i), 0.701638f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(0.f, 0.f, 0.f, 0.f)); - CHECK(eq_eps(v4f_x(i), 0.f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 0.f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 0.f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 0.f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 0.f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, 0.f, 0.f, 0.f)); - CHECK(eq_eps(v4f_x(i), 4.53f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 4.53f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 0.f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(0.f, 7.2f, 0.f, 0.f)); - CHECK(eq_eps(v4f_x(i), 7.2f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 1.570796f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 7.2f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 1.570796f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, 7.2f, 0.f, 0.f)); - CHECK(eq_eps(v4f_x(i), 8.506521f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 8.506521f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.570796f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(0.f, 0.f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 3.1f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 0.f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 3.1f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 0.f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 0.f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, 0.f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 5.489162f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 0.970666f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 0.f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 5.489162f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 0.970666f, 1.e-5f) == 1); + 
CHK(eq_eps(v4f_z(i), 0.f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(0.f, 7.2f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 7.839005f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.164229f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 1.570796f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 7.839005f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.164229f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 1.570796f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, 7.2f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(-4.53f, 7.2f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 2.132386f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 2.132386f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(-4.53f, -7.2f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), -2.132386f, 1.e-5f) || - eq_eps(v4f_z(i), 2*PI - 2.132386f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), -2.132386f, 1.e-5f) + || eq_eps(v4f_z(i), 2*PI - 2.132386f, 1.e-5f)); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, -7.2f, 3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), -1.009206f, 1.e-5f) || - eq_eps(v4f_z(i), 2*PI - 1.009206f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.221327f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), -1.009206f, 1.e-5f) + || eq_eps(v4f_z(i), 2*PI - 1.009206f, 1.e-5f)); i = 
v4f_xyz_to_rthetaphi(v4f_set(4.53f, 7.2f, -3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 1.009206f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(-4.53f, 7.2f, -3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), 2.132386f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), 2.132386f, 1.e-5f) == 1); i = v4f_xyz_to_rthetaphi(v4f_set(4.53f, -7.2f, -3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), -1.009206f, 1.e-5f) || - eq_eps(v4f_z(i), 2*PI - 1.009206f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), -1.009206f, 1.e-5f) + || eq_eps(v4f_z(i), 2*PI - 1.009206f, 1.e-5f)); i = v4f_xyz_to_rthetaphi(v4f_set(-4.53f, -7.2f, -3.1f, 0.f)); - CHECK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f), 1); - CHECK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f), 1); - CHECK(eq_eps(v4f_z(i), -2.132386f, 1.e-5f) || - eq_eps(v4f_z(i), 2*PI - 2.132386f, 1.e-5f), 1); + CHK(eq_eps(v4f_x(i), 9.053778f, 1.e-5f) == 1); + CHK(eq_eps(v4f_y(i), 1.920264f, 1.e-5f) == 1); + CHK(eq_eps(v4f_z(i), -2.132386f, 1.e-5f) + || eq_eps(v4f_z(i), 2*PI - 2.132386f, 1.e-5f)); return 0; } diff --git a/src/test_v4i.c b/src/test_v4i.c @@ -1,16 +1,16 @@ -/* Copyright (C) 2014-2016 Vincent Forest (vaplv@free.fr) +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) * * The RSIMD library is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published + * it under the terms of the GNU General Public 
License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The RSIMD library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * GNU General Public License for more details. * - * You should have received a copy of the GNU Lesser General Public License + * You should have received a copy of the GNU General Public License * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ #include "rsimd.h" @@ -23,180 +23,228 @@ main(int argc, char** argv) (void)argc, (void)argv; i = v4i_load(tmp); - CHECK(v4i_x(i), 0); - CHECK(v4i_y(i), 1); - CHECK(v4i_z(i), 2); - CHECK(v4i_w(i), 3); + CHK(v4i_x(i) == 0); + CHK(v4i_y(i) == 1); + CHK(v4i_z(i) == 2); + CHK(v4i_w(i) == 3); tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0; - CHECK(v4i_store(tmp, i), tmp); - CHECK(tmp[0], 0); - CHECK(tmp[1], 1); - CHECK(tmp[2], 2); - CHECK(tmp[3], 3); + CHK(v4i_store(tmp, i) == tmp); + CHK(tmp[0] == 0); + CHK(tmp[1] == 1); + CHK(tmp[2] == 2); + CHK(tmp[3] == 3); i = v4i_set(1, 2, 3, 4); - CHECK(v4i_x(i), 1); - CHECK(v4i_y(i), 2); - CHECK(v4i_z(i), 3); - CHECK(v4i_w(i), 4); + CHK(v4i_x(i) == 1); + CHK(v4i_y(i) == 2); + CHK(v4i_z(i) == 3); + CHK(v4i_w(i) == 4); i = v4i_set1(-1); - CHECK(v4i_x(i), -1); - CHECK(v4i_y(i), -1); - CHECK(v4i_z(i), -1); - CHECK(v4i_w(i), -1); + CHK(v4i_x(i) == -1); + CHK(v4i_y(i) == -1); + CHK(v4i_z(i) == -1); + CHK(v4i_w(i) == -1); i = v4i_zero(); - CHECK(v4i_x(i), 0); - CHECK(v4i_y(i), 0); - CHECK(v4i_z(i), 0); - CHECK(v4i_w(i), 0); + CHK(v4i_x(i) == 0); + CHK(v4i_y(i) == 0); + CHK(v4i_z(i) == 0); + CHK(v4i_w(i) == 0); i = v4i_set(1, 2, 3, 4); j = v4i_set(5, 6, 7, 8); k = v4i_xayb(i, j); - CHECK(v4i_x(k), 1); - CHECK(v4i_y(k), 5); - CHECK(v4i_z(k), 2); - CHECK(v4i_w(k), 6); + CHK(v4i_x(k) == 1); + 
CHK(v4i_y(k) == 5); + CHK(v4i_z(k) == 2); + CHK(v4i_w(k) == 6); k = v4i_zcwd(i, j); - CHECK(v4i_x(k), 3); - CHECK(v4i_y(k), 7); - CHECK(v4i_z(k), 4); - CHECK(v4i_w(k), 8); + CHK(v4i_x(k) == 3); + CHK(v4i_y(k) == 7); + CHK(v4i_z(k) == 4); + CHK(v4i_w(k) == 8); i = v4i_set(0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F); j = v4i_set(0x01020401, 0x70605040, 0x0F1F2F3F, 0x00000000); k = v4i_or(i, j); - CHECK(v4i_x(k), (int32_t)0x01030603); - CHECK(v4i_y(k), (int32_t)0x74655647); - CHECK(v4i_z(k), (int32_t)0x0F1F2F3F); - CHECK(v4i_w(k), (int32_t)0x0C0D0E0F); + CHK(v4i_x(k) == (int32_t)0x01030603); + CHK(v4i_y(k) == (int32_t)0x74655647); + CHK(v4i_z(k) == (int32_t)0x0F1F2F3F); + CHK(v4i_w(k) == (int32_t)0x0C0D0E0F); k = v4i_and(i, j); - CHECK(v4i_x(k), (int32_t)0x00000001); - CHECK(v4i_y(k), (int32_t)0x00000000); - CHECK(v4i_z(k), (int32_t)0x08090A0B); - CHECK(v4i_w(k), (int32_t)0x00000000); + CHK(v4i_x(k) == (int32_t)0x00000001); + CHK(v4i_y(k) == (int32_t)0x00000000); + CHK(v4i_z(k) == (int32_t)0x08090A0B); + CHK(v4i_w(k) == (int32_t)0x00000000); k = v4i_andnot(i, j); - CHECK(v4i_x(k), (int32_t)0x01020400); - CHECK(v4i_y(k), (int32_t)0x70605040); - CHECK(v4i_z(k), (int32_t)0x07162534); - CHECK(v4i_w(k), (int32_t)0x00000000); + CHK(v4i_x(k) == (int32_t)0x01020400); + CHK(v4i_y(k) == (int32_t)0x70605040); + CHK(v4i_z(k) == (int32_t)0x07162534); + CHK(v4i_w(k) == (int32_t)0x00000000); k = v4i_xor(i, j); - CHECK(v4i_x(k), (int32_t)0x01030602); - CHECK(v4i_y(k), (int32_t)0x74655647); - CHECK(v4i_z(k), (int32_t)0x07162534); - CHECK(v4i_w(k), (int32_t)0x0C0D0E0F); + CHK(v4i_x(k) == (int32_t)0x01030602); + CHK(v4i_y(k) == (int32_t)0x74655647); + CHK(v4i_z(k) == (int32_t)0x07162534); + CHK(v4i_w(k) == (int32_t)0x0C0D0E0F); k = v4i_not(i); - CHECK(v4i_x(k), (int32_t)0xFFFEFDFC); - CHECK(v4i_y(k), (int32_t)0xFBFAF9F8); - CHECK(v4i_z(k), (int32_t)0xF7F6F5F4); - CHECK(v4i_w(k), (int32_t)0xF3F2F1F0); + CHK(v4i_x(k) == (int32_t)0xFFFEFDFC); + CHK(v4i_y(k) == (int32_t)0xFBFAF9F8); + 
CHK(v4i_z(k) == (int32_t)0xF7F6F5F4); + CHK(v4i_w(k) == (int32_t)0xF3F2F1F0); i = v4i_set(32, 16, 8, 4); k = v4i_rshift(i, 4); - CHECK(v4i_x(k), 2); - CHECK(v4i_y(k), 1); - CHECK(v4i_z(k), 0); - CHECK(v4i_w(k), 0); - + CHK(v4i_x(k) == 2); + CHK(v4i_y(k) == 1); + CHK(v4i_z(k) == 0); + CHK(v4i_w(k) == 0); + k = v4i_rshift(i, 1); - CHECK(v4i_x(k), 16); - CHECK(v4i_y(k), 8); - CHECK(v4i_z(k), 4); - CHECK(v4i_w(k), 2); + CHK(v4i_x(k) == 16); + CHK(v4i_y(k) == 8); + CHK(v4i_z(k) == 4); + CHK(v4i_w(k) == 2); k = v4i_lshift(i, 4); - CHECK(v4i_x(k), 512); - CHECK(v4i_y(k), 256); - CHECK(v4i_z(k), 128); - CHECK(v4i_w(k), 64); + CHK(v4i_x(k) == 512); + CHK(v4i_y(k) == 256); + CHK(v4i_z(k) == 128); + CHK(v4i_w(k) == 64); i = v4i_set(1, 2, 3, 4); j = v4i_set(-2, -4, 3, 6); k = v4i_add(i, j); - CHECK(v4i_x(k), -1); - CHECK(v4i_y(k), -2); - CHECK(v4i_z(k), 6); - CHECK(v4i_w(k), 10); + CHK(v4i_x(k) == -1); + CHK(v4i_y(k) == -2); + CHK(v4i_z(k) == 6); + CHK(v4i_w(k) == 10); k = v4i_sub(i, j); - CHECK(v4i_x(k), 3); - CHECK(v4i_y(k), 6); - CHECK(v4i_z(k), 0); - CHECK(v4i_w(k), -2); + CHK(v4i_x(k) == 3); + CHK(v4i_y(k) == 6); + CHK(v4i_z(k) == 0); + CHK(v4i_w(k) == -2); + + k = v4i_minus(j); + CHK(v4i_x(k) == -v4i_x(j)); + CHK(v4i_y(k) == -v4i_y(j)); + CHK(v4i_z(k) == -v4i_z(j)); + CHK(v4i_w(k) == -v4i_w(j)); k = v4i_eq(i, j); - CHECK(v4i_x(k), (int32_t)0x00000000); - CHECK(v4i_y(k), (int32_t)0x00000000); - CHECK(v4i_z(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_w(k), (int32_t)0x00000000); + CHK(v4i_x(k) == (int32_t)0x00000000); + CHK(v4i_y(k) == (int32_t)0x00000000); + CHK(v4i_z(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_w(k) == (int32_t)0x00000000); k = v4i_neq(i, j); - CHECK(v4i_x(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_y(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_z(k), (int32_t)0x00000000); - CHECK(v4i_w(k), (int32_t)0xFFFFFFFF); + CHK(v4i_x(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_y(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_z(k) == (int32_t)0x00000000); + CHK(v4i_w(k) == (int32_t)0xFFFFFFFF); k = 
v4i_gt(i, j); - CHECK(v4i_x(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_y(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_z(k), (int32_t)0x00000000); - CHECK(v4i_w(k), (int32_t)0x00000000); + CHK(v4i_x(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_y(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_z(k) == (int32_t)0x00000000); + CHK(v4i_w(k) == (int32_t)0x00000000); k = v4i_lt(i, j); - CHECK(v4i_x(k), (int32_t)0x00000000); - CHECK(v4i_y(k), (int32_t)0x00000000); - CHECK(v4i_z(k), (int32_t)0x00000000); - CHECK(v4i_w(k), (int32_t)0xFFFFFFFF); + CHK(v4i_x(k) == (int32_t)0x00000000); + CHK(v4i_y(k) == (int32_t)0x00000000); + CHK(v4i_z(k) == (int32_t)0x00000000); + CHK(v4i_w(k) == (int32_t)0xFFFFFFFF); k = v4i_ge(i, j); - CHECK(v4i_x(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_y(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_z(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_w(k), (int32_t)0x00000000); + CHK(v4i_x(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_y(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_z(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_w(k) == (int32_t)0x00000000); k = v4i_le(i, j); - CHECK(v4i_x(k), (int32_t)0x00000000); - CHECK(v4i_y(k), (int32_t)0x00000000); - CHECK(v4i_z(k), (int32_t)0xFFFFFFFF); - CHECK(v4i_w(k), (int32_t)0xFFFFFFFF); + CHK(v4i_x(k) == (int32_t)0x00000000); + CHK(v4i_y(k) == (int32_t)0x00000000); + CHK(v4i_z(k) == (int32_t)0xFFFFFFFF); + CHK(v4i_w(k) == (int32_t)0xFFFFFFFF); k = v4i_sel(i, j, v4i_set(~0, 0, ~0, 0)); - CHECK(v4i_x(k), -2); - CHECK(v4i_y(k), 2); - CHECK(v4i_z(k), 3); - CHECK(v4i_w(k), 4); + CHK(v4i_x(k) == -2); + CHK(v4i_y(k) == 2); + CHK(v4i_z(k) == 3); + CHK(v4i_w(k) == 4); k = v4i_xxxx(i); - CHECK(v4i_x(k), 1); - CHECK(v4i_y(k), 1); - CHECK(v4i_z(k), 1); - CHECK(v4i_w(k), 1); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 1); + CHK(v4i_z(k) == 1); + CHK(v4i_w(k) == 1); k = v4i_wwxy(i); - CHECK(v4i_x(k), 4); - CHECK(v4i_y(k), 4); - CHECK(v4i_z(k), 1); - CHECK(v4i_w(k), 2); + CHK(v4i_x(k) == 4); + CHK(v4i_y(k) == 4); + CHK(v4i_z(k) == 1); + CHK(v4i_w(k) == 2); k = v4i_xyxy(i); - CHECK(v4i_x(k), 1); - 
CHECK(v4i_y(k), 2); - CHECK(v4i_z(k), 1); - CHECK(v4i_w(k), 2); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 2); + CHK(v4i_z(k) == 1); + CHK(v4i_w(k) == 2); k = v4i_wyyz(i); - CHECK(v4i_x(k), 4); - CHECK(v4i_y(k), 2); - CHECK(v4i_z(k), 2); - CHECK(v4i_w(k), 3); + CHK(v4i_x(k) == 4); + CHK(v4i_y(k) == 2); + CHK(v4i_z(k) == 2); + CHK(v4i_w(k) == 3); + + i = v4i_set(1, 2, 3, 4); + j = v4i_set(-2, -4, 3, 6); + k = v4i_min(i, j); + CHK(v4i_x(k) == -2); + CHK(v4i_y(k) == -4); + CHK(v4i_z(k) == 3); + CHK(v4i_w(k) == 4); + + k = v4i_max(i, j); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 2); + CHK(v4i_z(k) == 3); + CHK(v4i_w(k) == 6); + + k = v4i_reduce_min(i); + CHK(v4i_x(k) == 1); + CHK(v4i_y(k) == 1); + CHK(v4i_z(k) == 1); + CHK(v4i_w(k) == 1); + CHK(v4i_reduce_min_i32(i) == 1); + + k = v4i_reduce_min(j); + CHK(v4i_x(k) == -4); + CHK(v4i_y(k) == -4); + CHK(v4i_z(k) == -4); + CHK(v4i_w(k) == -4); + CHK(v4i_reduce_min_i32(j) == -4); + + k = v4i_reduce_max(i); + CHK(v4i_x(k) == 4); + CHK(v4i_y(k) == 4); + CHK(v4i_z(k) == 4); + CHK(v4i_w(k) == 4); + CHK(v4i_reduce_max_i32(i) == 4); + + k = v4i_reduce_max(j); + CHK(v4i_x(k) == 6); + CHK(v4i_y(k) == 6); + CHK(v4i_z(k) == 6); + CHK(v4i_w(k) == 6); + CHK(v4i_reduce_max_i32(j) == 6); return 0; } diff --git a/src/test_v8f.c b/src/test_v8f.c @@ -0,0 +1,450 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. 
If not, see <http://www.gnu.org/licenses/>. */ + +#include "rsimd.h" + +int +main(int argc, char** argv) +{ + v8f_T i, j, k; + ALIGN(32) union { int32_t i[8]; float f[8]; } cast; + ALIGN(32) float tmp[9] = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + (void)argc, (void)argv; + + i = v8f_loadu(tmp+1); + CHK(v4f_x(v8f_abcd(i)) == 1.f); + CHK(v4f_y(v8f_abcd(i)) == 2.f); + CHK(v4f_z(v8f_abcd(i)) == 3.f); + CHK(v4f_w(v8f_abcd(i)) == 4.f); + CHK(v4f_x(v8f_efgh(i)) == 5.f); + CHK(v4f_y(v8f_efgh(i)) == 6.f); + CHK(v4f_z(v8f_efgh(i)) == 7.f); + CHK(v4f_w(v8f_efgh(i)) == 8.f); + + i = v8f_load(tmp); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 1.f); + CHK(v4f_z(v8f_abcd(i)) == 2.f); + CHK(v4f_w(v8f_abcd(i)) == 3.f); + CHK(v4f_x(v8f_efgh(i)) == 4.f); + CHK(v4f_y(v8f_efgh(i)) == 5.f); + CHK(v4f_z(v8f_efgh(i)) == 6.f); + CHK(v4f_w(v8f_efgh(i)) == 7.f); + + tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0.f; + tmp[4] = tmp[5] = tmp[6] = tmp[7] = 0.f; + CHK(v8f_store(tmp, i) == tmp); + CHK(tmp[0] == 0.f); + CHK(tmp[1] == 1.f); + CHK(tmp[2] == 2.f); + CHK(tmp[3] == 3.f); + CHK(tmp[4] == 4.f); + CHK(tmp[5] == 5.f); + CHK(tmp[6] == 6.f); + CHK(tmp[7] == 7.f); + CHK(tmp[8] == 8.f); + + i = v8f_set1(-2.f); + CHK(v4f_x(v8f_abcd(i)) == -2.f); + CHK(v4f_y(v8f_abcd(i)) == -2.f); + CHK(v4f_z(v8f_abcd(i)) == -2.f); + CHK(v4f_w(v8f_abcd(i)) == -2.f); + CHK(v4f_x(v8f_efgh(i)) == -2.f); + CHK(v4f_y(v8f_efgh(i)) == -2.f); + CHK(v4f_z(v8f_efgh(i)) == -2.f); + CHK(v4f_w(v8f_efgh(i)) == -2.f); + + i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 1.f); + CHK(v4f_z(v8f_abcd(i)) == 2.f); + CHK(v4f_w(v8f_abcd(i)) == 3.f); + CHK(v4f_x(v8f_efgh(i)) == 4.f); + CHK(v4f_y(v8f_efgh(i)) == 5.f); + CHK(v4f_z(v8f_efgh(i)) == 6.f); + CHK(v4f_w(v8f_efgh(i)) == 7.f); + + i = v8f_zero(); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 0.f); + CHK(v4f_z(v8f_abcd(i)) == 0.f); + CHK(v4f_w(v8f_abcd(i)) == 0.f); + CHK(v4f_x(v8f_efgh(i)) == 0.f); 
+ CHK(v4f_y(v8f_efgh(i)) == 0.f); + CHK(v4f_z(v8f_efgh(i)) == 0.f); + CHK(v4f_w(v8f_efgh(i)) == 0.f); + + i = v8f_mask(~0,~0,0,0,0,~0,~0,0); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000); + + i = v8f_mask1(~0); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + i = v8f_true(); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + i = v8f_false(); + cast.f[0] = v4f_x(v8f_abcd(i)); 
CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000); + + i = v8f_mask(~0,~0,0,0,0,~0,~0,0); + j = v8f_mask(~0,0,~0,0,0,~0,0,~0); + k = v8f_or(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + k = v8f_and(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0x00000000); + + k = v8f_andnot(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 
(int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + k = v8f_xor(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + CHK(v8f_movemask(k) == 0xC6); + i = v8f_mask + ((int32_t)0x01020401, (int32_t)0x80605040, (int32_t)0x7F1F2F3F, (int32_t)0, + (int32_t)0xF0000000, (int32_t)0xFFFFFFFF, (int32_t)0x7FFFFFFF, (int32_t)~0); + CHK(v8f_movemask(i) == 0xB2); + + i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f); + j = v8f_set(8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f); + k = v8f_sel(i, j, v8f_mask(~0,~0,0,0,0,~0,~0,0)); + CHK(v4f_x(v8f_abcd(k)) == 8.f); + CHK(v4f_y(v8f_abcd(k)) == 9.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 13.f); + CHK(v4f_z(v8f_efgh(k)) == 14.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + k = v8f_minus(i); + CHK(v4f_x(v8f_abcd(k)) == -0.f); + CHK(v4f_y(v8f_abcd(k)) == -1.f); + CHK(v4f_z(v8f_abcd(k)) == -2.f); + CHK(v4f_w(v8f_abcd(k)) == -3.f); + CHK(v4f_x(v8f_efgh(k)) == -4.f); + CHK(v4f_y(v8f_efgh(k)) == -5.f); + CHK(v4f_z(v8f_efgh(k)) == 
-6.f); + CHK(v4f_w(v8f_efgh(k)) == -7.f); + + k = v8f_add(i, j); + CHK(v4f_x(v8f_abcd(k)) == 8.f); + CHK(v4f_y(v8f_abcd(k)) == 10.f); + CHK(v4f_z(v8f_abcd(k)) == 12.f); + CHK(v4f_w(v8f_abcd(k)) == 14.f); + CHK(v4f_x(v8f_efgh(k)) == 16.f); + CHK(v4f_y(v8f_efgh(k)) == 18.f); + CHK(v4f_z(v8f_efgh(k)) == 20.f); + CHK(v4f_w(v8f_efgh(k)) == 22.f); + + k = v8f_sub(i, j); + CHK(v4f_x(v8f_abcd(k)) == -8.f); + CHK(v4f_y(v8f_abcd(k)) == -8.f); + CHK(v4f_z(v8f_abcd(k)) == -8.f); + CHK(v4f_w(v8f_abcd(k)) == -8.f); + CHK(v4f_x(v8f_efgh(k)) == -8.f); + CHK(v4f_y(v8f_efgh(k)) == -8.f); + CHK(v4f_z(v8f_efgh(k)) == -8.f); + CHK(v4f_w(v8f_efgh(k)) == -8.f); + + k = v8f_mul(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 9.f); + CHK(v4f_z(v8f_abcd(k)) == 20.f); + CHK(v4f_w(v8f_abcd(k)) == 33.f); + CHK(v4f_x(v8f_efgh(k)) == 48.f); + CHK(v4f_y(v8f_efgh(k)) == 65.f); + CHK(v4f_z(v8f_efgh(k)) == 84.f); + CHK(v4f_w(v8f_efgh(k)) == 105.f); + + k = v8f_div(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f/9.f); + CHK(v4f_z(v8f_abcd(k)) == 0.2f); + CHK(v4f_w(v8f_abcd(k)) == 3.f/11.f); + CHK(v4f_x(v8f_efgh(k)) == 1.f/3.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f/13.f); + CHK(v4f_z(v8f_efgh(k)) == 3.f/7.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f/15.f); + + k = v8f_set(0.1f,0.2f,0.3f,0.4f,0.5f,0.6f,0.7f,0.8f); + k = v8f_madd(i, j, k); + CHK(v4f_x(v8f_abcd(k)) == 0.1f); + CHK(v4f_y(v8f_abcd(k)) == 9.2f); + CHK(v4f_z(v8f_abcd(k)) == 20.3f); + CHK(v4f_w(v8f_abcd(k)) == 33.4f); + CHK(v4f_x(v8f_efgh(k)) == 48.5f); + CHK(v4f_y(v8f_efgh(k)) == 65.6f); + CHK(v4f_z(v8f_efgh(k)) == 84.7f); + CHK(v4f_w(v8f_efgh(k)) == 105.8f); + + k = v8f_abs(v8f_minus(i)); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + i = v8f_set(1.f, 4.f, 9.f, 16.f, 25.f, 36.f, 49.f, 
64.f); + k = v8f_sqrt(i); + CHK(v4f_x(v8f_abcd(k)) == 1.f); + CHK(v4f_y(v8f_abcd(k)) == 2.f); + CHK(v4f_z(v8f_abcd(k)) == 3.f); + CHK(v4f_w(v8f_abcd(k)) == 4.f); + CHK(v4f_x(v8f_efgh(k)) == 5.f); + CHK(v4f_y(v8f_efgh(k)) == 6.f); + CHK(v4f_z(v8f_efgh(k)) == 7.f); + CHK(v4f_w(v8f_efgh(k)) == 8.f); + + k = v8f_rsqrte(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f)); + + k = v8f_rsqrt(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f)); + + i = v8f_set(1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f); + k = v8f_rcpe(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f)); + + k = v8f_rcp(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f)); + 
CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f)); + + j = v8f_set(2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f); + k = v8f_lerp(i, j, v8f_set1(0.5f)); + CHK(v4f_x(v8f_abcd(k)) == 1.5f); + CHK(v4f_y(v8f_abcd(k)) == 2.5f); + CHK(v4f_z(v8f_abcd(k)) == 3.5f); + CHK(v4f_w(v8f_abcd(k)) == 4.5f); + CHK(v4f_x(v8f_efgh(k)) == 5.5f); + CHK(v4f_y(v8f_efgh(k)) == 6.5f); + CHK(v4f_z(v8f_efgh(k)) == 7.5f); + CHK(v4f_w(v8f_efgh(k)) == 8.5f); + + i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f); + j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f); + + k = v8f_eq(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0); + + k = v8f_neq(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_ge(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); 
CHK(cast.i[7] == 0); + + k = v8f_le(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_gt(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0); + + k = v8f_lt(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + j = v8f_set(0.0001f, 0.99999f, 2.f, 3.1f, 4.001f, 5.0002f, 6.f, 6.999999f); + k = v8f_eq_eps(i, j, v8f_set1(1.e-4f)); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_eq_eps(i, j, 
v8f_set(1.e-4f, 1.e-4f, 0.f, 0.1f, 1.e-3f, 2.e-4f, 0.f, 1.e-5f)); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f); + j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f); + + k = v8f_min(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) ==-1.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) ==-2.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + k = v8f_max(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 4.f); + CHK(v4f_w(v8f_abcd(k)) == 4.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 6.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 8.f); + + CHK(v8f_reduce_min(i) == 0.f); + CHK(v8f_reduce_min(j) ==-2.f); + CHK(v8f_reduce_max(i) == 7.f); + CHK(v8f_reduce_max(j) == 8.f); + + k = v8f_clamp(i, + v8f_set(1.f, 1.f, 3.1f, 5.f, 4.f, 0.f, 0.f, -1.f), + v8f_set(1.f, 1.f, 4.f, 6.f, 4.f, 1.f, 6.f, 5.f)); + + CHK(v4f_x(v8f_abcd(k)) == 1.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 3.1f); + CHK(v4f_w(v8f_abcd(k)) == 5.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 1.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 5.f); + + return 0; +} + diff --git a/src/test_v8i.c b/src/test_v8i.c @@ -0,0 +1,192 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General 
Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "rsimd.h" + /* Test of the 8-wide int32 vector API (v8i_*): load/store, the set variants, bitwise operators, comparisons, selection, min/max and horizontal reductions. Every expectation goes through CHK; lanes are read back with v8i_abcd/v8i_efgh combined with v4i_x/y/z/w. The command line arguments are unused and main returns 0 after the last check. */ +int +main(int argc, char** argv) +{ + v8i_T i, j, k; + ALIGN(32) int32_t tmp[8] = {0,1,2,3,4,5,6,7}; + (void)argc, (void)argv; + /* Load 8 integers from memory: lanes 0-3 are read through v8i_abcd and lanes 4-7 through v8i_efgh */ + i = v8i_load(tmp); + CHK(v4i_x(v8i_abcd(i)) == 0); + CHK(v4i_y(v8i_abcd(i)) == 1); + CHK(v4i_z(v8i_abcd(i)) == 2); + CHK(v4i_w(v8i_abcd(i)) == 3); + CHK(v4i_x(v8i_efgh(i)) == 4); + CHK(v4i_y(v8i_efgh(i)) == 5); + CHK(v4i_z(v8i_efgh(i)) == 6); + CHK(v4i_w(v8i_efgh(i)) == 7); + /* Clear the buffer, then check that v8i_store writes the lanes back in order and returns the destination pointer */ + tmp[0]= tmp[1] = tmp[2] = tmp[3] = 0; + tmp[4]= tmp[5] = tmp[6] = tmp[7] = 0; + CHK(v8i_store(tmp, i) == tmp); + CHK(tmp[0] == 0); + CHK(tmp[1] == 1); + CHK(tmp[2] == 2); + CHK(tmp[3] == 3); + CHK(tmp[4] == 4); + CHK(tmp[5] == 5); + CHK(tmp[6] == 6); + CHK(tmp[7] == 7); + /* One explicit value per lane */ + i = v8i_set(1, 2, 3, 4, 5, 6, 7, 8); + CHK(v4i_x(v8i_abcd(i)) == 1); + CHK(v4i_y(v8i_abcd(i)) == 2); + CHK(v4i_z(v8i_abcd(i)) == 3); + CHK(v4i_w(v8i_abcd(i)) == 4); + CHK(v4i_x(v8i_efgh(i)) == 5); + CHK(v4i_y(v8i_efgh(i)) == 6); + CHK(v4i_z(v8i_efgh(i)) == 7); + CHK(v4i_w(v8i_efgh(i)) == 8); + /* The same value in every lane */ + i = v8i_set1(-1); + CHK(v4i_x(v8i_abcd(i)) == -1); + CHK(v4i_y(v8i_abcd(i)) == -1); + CHK(v4i_z(v8i_abcd(i)) == -1); + CHK(v4i_w(v8i_abcd(i)) == -1); + CHK(v4i_x(v8i_efgh(i)) == -1); + CHK(v4i_y(v8i_efgh(i)) == -1); + CHK(v4i_z(v8i_efgh(i)) == -1); + CHK(v4i_w(v8i_efgh(i)) == -1); + /* Every lane set to zero */ + i = v8i_zero(); + CHK(v4i_x(v8i_abcd(i)) == 0); + CHK(v4i_y(v8i_abcd(i)) == 0); + CHK(v4i_z(v8i_abcd(i)) == 0); + CHK(v4i_w(v8i_abcd(i)) == 0); + 
CHK(v4i_x(v8i_efgh(i)) == 0); + CHK(v4i_y(v8i_efgh(i)) == 0); + CHK(v4i_z(v8i_efgh(i)) == 0); + CHK(v4i_w(v8i_efgh(i)) == 0); + /* Build from two 4-wide vectors: the first one fills lanes 0-3, the second one lanes 4-7 */ + i = v8i_set_v4i(v4i_set(-1,-2,3,4), v4i_set(5,6,-7,-8)); + CHK(v4i_x(v8i_abcd(i)) ==-1); + CHK(v4i_y(v8i_abcd(i)) ==-2); + CHK(v4i_z(v8i_abcd(i)) == 3); + CHK(v4i_w(v8i_abcd(i)) == 4); + CHK(v4i_x(v8i_efgh(i)) == 5); + CHK(v4i_y(v8i_efgh(i)) == 6); + CHK(v4i_z(v8i_efgh(i)) ==-7); + CHK(v4i_w(v8i_efgh(i)) ==-8); + /* Bit patterns shared by the or/and/andnot/xor checks below; the (int32_t) casts are applied to the literals whose top bit is set */ + i = v8i_set + (0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F, + 0x00102030, 0x40506070, (int32_t)0x8090A0B0, (int32_t)0xC0D0E0F0); + j = v8i_set + (0x01020401, 0x70605040, 0x0F1F2F3F, 0x00000000, + 0x10204010, 0x06050400, (int32_t)0xF1F2F3F0, 0x10000000); + k = v8i_or(i, j); + CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030603); + CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647); + CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x0F1F2F3F); + CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F); + CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306030); + CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470); + CHK(v4i_z(v8i_efgh(k)) == (int32_t)0xF1F2F3F0); + CHK(v4i_w(v8i_efgh(k)) == (int32_t)0xD0D0E0F0); + + k = v8i_and(i, j); + CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x00000001); + CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x00000000); + CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x08090A0B); + CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000); + CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x00000010); + CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x00000000); + CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x8090A0B0); + CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x00000000); + /* The expected values below match (~i) & j: the first operand is the one that is complemented */ + k = v8i_andnot(i, j); + CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01020400); + CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x70605040); + CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534); + CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000); + CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10204000); + CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x06050400); + CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340); + CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x10000000); + + k = v8i_xor(i, j); + 
CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030602); + CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647); + CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534); + CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F); + CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306020); + CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470); + CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340); + CHK(v4i_w(v8i_efgh(k)) == (int32_t)0XD0D0E0F0); + /* Operands of the comparison, selection, min/max and reduction checks */ + i = v8i_set( 1, 2,3,4,5, 6,7,8); + j = v8i_set(-2,-4,3,6,5,-1,8,8); + /* Comparisons are expected to produce ~0 in the lanes where the predicate holds and 0 in the others */ + k = v8i_eq(i, j); + CHK(v4i_x(v8i_abcd(k)) == 0); + CHK(v4i_y(v8i_abcd(k)) == 0); + CHK(v4i_z(v8i_abcd(k)) ==~0); + CHK(v4i_w(v8i_abcd(k)) == 0); + CHK(v4i_x(v8i_efgh(k)) ==~0); + CHK(v4i_y(v8i_efgh(k)) == 0); + CHK(v4i_z(v8i_efgh(k)) == 0); + CHK(v4i_w(v8i_efgh(k)) ==~0); + + k = v8i_neq(i, j); + CHK(v4i_x(v8i_abcd(k)) ==~0); + CHK(v4i_y(v8i_abcd(k)) ==~0); + CHK(v4i_z(v8i_abcd(k)) == 0); + CHK(v4i_w(v8i_abcd(k)) ==~0); + CHK(v4i_x(v8i_efgh(k)) == 0); + CHK(v4i_y(v8i_efgh(k)) ==~0); + CHK(v4i_z(v8i_efgh(k)) ==~0); + CHK(v4i_w(v8i_efgh(k)) == 0); + /* Lanes whose mask is ~0 are expected to take the value of j, the other lanes keep the value of i */ + k = v8i_sel(i, j, v8i_set(~0,~0,0,~0,0,0,~0,0)); + CHK(v4i_x(v8i_abcd(k)) ==-2); + CHK(v4i_y(v8i_abcd(k)) ==-4); + CHK(v4i_z(v8i_abcd(k)) == 3); + CHK(v4i_w(v8i_abcd(k)) == 6); + CHK(v4i_x(v8i_efgh(k)) == 5); + CHK(v4i_y(v8i_efgh(k)) == 6); + CHK(v4i_z(v8i_efgh(k)) == 8); + CHK(v4i_w(v8i_efgh(k)) == 8); + /* Per-lane signed minimum, then maximum */ + k = v8i_min(i, j); + CHK(v4i_x(v8i_abcd(k)) ==-2); + CHK(v4i_y(v8i_abcd(k)) ==-4); + CHK(v4i_z(v8i_abcd(k)) == 3); + CHK(v4i_w(v8i_abcd(k)) == 4); + CHK(v4i_x(v8i_efgh(k)) == 5); + CHK(v4i_y(v8i_efgh(k)) ==-1); + CHK(v4i_z(v8i_efgh(k)) == 7); + CHK(v4i_w(v8i_efgh(k)) == 8); + + k = v8i_max(i, j); + CHK(v4i_x(v8i_abcd(k)) == 1); + CHK(v4i_y(v8i_abcd(k)) == 2); + CHK(v4i_z(v8i_abcd(k)) == 3); + CHK(v4i_w(v8i_abcd(k)) == 6); + CHK(v4i_x(v8i_efgh(k)) == 5); + CHK(v4i_y(v8i_efgh(k)) == 6); + CHK(v4i_z(v8i_efgh(k)) == 8); + CHK(v4i_w(v8i_efgh(k)) == 8); + /* Horizontal reductions: a single scalar computed over the 8 lanes of one vector */ + CHK(v8i_reduce_min_i32(i) == 1); + CHK(v8i_reduce_min_i32(j) ==-4); + CHK(v8i_reduce_max_i32(i) == 
8); + CHK(v8i_reduce_max_i32(j) == 8); + + return 0; +} diff --git a/src/vXf_begin.h b/src/vXf_begin.h @@ -0,0 +1,57 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "rsimd.h" + +/* This file can be included only once until vXf_end.h is included in turn: vXf_end.h undefines VXF_BEGIN_H, which makes a new inclusion possible */ +#ifdef VXF_BEGIN_H + #error "The vXf_begin.h header is already included" +#endif +#define VXF_BEGIN_H + +/* Check the parameter: RSIMD_WIDTH__ must be defined before the inclusion of this file and must be equal to 4 or 8. NOTE(review): the argument of an error directive is presumably not macro-expanded, so STR(RSIMD_WIDTH__) would be reported verbatim rather than as the faulty value - to be confirmed with the targeted compilers */ +#if !defined(RSIMD_WIDTH__) + #error "Undefined RSIMD_WIDTH__ macro" +#endif +#if RSIMD_WIDTH__ != 4 && RSIMD_WIDTH__ != 8 + #error "Unexpected RSIMD_WIDTH__ value of "STR(RSIMD_WIDTH__) +#endif + +/* Check that internal macros are not already defined */ +#if defined(RSIMD_vXf__) \ + || defined(RSIMD_vXf_T__) \ + || defined(RSIMD_Sleef__) \ + || defined(RSIMD_Sleef_ULP__) \ + || defined(RSIMD_Sleef_vecf__) + #error "Unexpected macro definition" +#endif + +/* Macros generic to RSIMD_WIDTH__: RSIMD_vXf__(Func) builds the identifier v<WIDTH>f_<Func> and RSIMD_vXf_T__ builds v<WIDTH>f_T, e.g. v4f_<Func> and v4f_T for a width of 4 (this assumes that CONCAT, defined elsewhere, pastes its arguments once they are macro-expanded) */ +#define RSIMD_vXf__(Func) \ + CONCAT(CONCAT(CONCAT(CONCAT(v, RSIMD_WIDTH__), f), _), Func) +#define RSIMD_vXf_T__ CONCAT(CONCAT(v, RSIMD_WIDTH__), f_T) + +/* Sleef macros: RSIMD_Sleef__(Func) builds Sleef_<Func><WIDTH> and RSIMD_Sleef_ULP__(Func, Suffix) builds Sleef_<Func><WIDTH>_<Suffix> */ +#define RSIMD_Sleef__(Func) CONCAT(CONCAT(Sleef_, Func), RSIMD_WIDTH__) +#define RSIMD_Sleef_ULP__(Func, Suffix) \ + CONCAT(CONCAT(CONCAT(CONCAT(Sleef_, Func), RSIMD_WIDTH__), _), Suffix) + +/* Vector types of the Sleef library: Sleef___m128_<Dim> for a width of 4 and Sleef___m256_<Dim> for a width of 8 */ +#if RSIMD_WIDTH__ == 4 + #define 
RSIMD_Sleef_vecf__(Dim) CONCAT(Sleef___m128_, Dim) +#elif RSIMD_WIDTH__ == 8 + #define RSIMD_Sleef_vecf__(Dim) CONCAT(Sleef___m256_, Dim) +#endif + diff --git a/src/vXf_end.h b/src/vXf_end.h @@ -0,0 +1,31 @@ +/* Copyright (C) 2014-2021 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + /* Closing counterpart of vXf_begin.h: this file refuses to be included unless vXf_begin.h was included first, then it removes every macro set up for the current width */ +#ifndef VXF_BEGIN_H + #error "The vXf_begin.h file must be included" +#endif + +/* Undef the helper macros defined by vXf_begin.h */ +#undef RSIMD_vXf__ +#undef RSIMD_vXf_T__ +#undef RSIMD_Sleef__ +#undef RSIMD_Sleef_ULP__ +#undef RSIMD_Sleef_vecf__ + +/* Undef the width parameter: it has to be defined again before the next inclusion of vXf_begin.h */ +#undef RSIMD_WIDTH__ + /* Release the inclusion guard: vXf_begin.h raises an error as long as VXF_BEGIN_H is defined */ +#undef VXF_BEGIN_H +