star-sf

Set of surface and volume scattering functions
git clone git://git.meso-star.fr/star-sf.git
Log | Files | Refs | README | LICENSE

commit 8ecf1014665dd83abec7c6ff0395df98926221b3
parent 3a93ad1530eda7d01df6d8d68cd26b185ab12b69
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Thu, 29 Apr 2021 14:34:43 +0200

Optimise the ssf_phase_rdgfa_setup function with simd<4|8>

Diffstat:
Mcmake/CMakeLists.txt | 96++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Msrc/ssf.h | 12++++++++++--
Msrc/ssf_phase_rdgfa.c | 76+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Asrc/ssf_phase_rdgfa_simdX.h | 183+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/test_ssf_phase_rdgfa.c | 27+++++++++++++++++++++++++--
5 files changed, 367 insertions(+), 27 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -13,7 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.1) project(ssf C) enable_testing() @@ -27,13 +27,61 @@ set(SSF_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../src) find_package(RCMake 0.4 REQUIRED) find_package(RSys 0.6 REQUIRED) find_package(StarSP 0.5 REQUIRED) -include_directories(${RSys_INCLUDE_DIR} ${StarSP_INCLUDE_DIR}) +find_package(RSIMD 0.3) +if(RSIMD_FOUND) + option(USE_SIMD "Use SIMD instruction sets" ON) +endif() set(CMAKE_MODULE_PATH ${RCMAKE_SOURCE_DIR}) include(rcmake) include(rcmake_runtime) -rcmake_append_runtime_dirs(_runtime_dirs RSys StarSP) +include_directories(${RSys_INCLUDE_DIR} ${StarSP_INCLUDE_DIR}) + +set(_dependencies RSys StarSP) + +if(USE_SIMD) + include_directories(${RSIMD_INCLUDE_DIR}) + set(_dependencies ${_dependencies} RSIMD) +endif() + +rcmake_append_runtime_dirs(_runtime_dirs ${_dependencies}) + +################################################################################ +# Check SIMD support +################################################################################ +if(NOT USE_SIMD) + message(STATUS "Do not use the SIMD instruction sets") +else() + message(STATUS "Use the SIMD instruction sets") + + option(SSF_USE_SIMD_128 "Enable the SIMD-128 instruction sets" ON) + option(SSF_USE_SIMD_256 "Enable the SIMD-256 instruction sets" ON) + + if(CMAKE_COMPILER_IS_GNUCC) + include(CheckCCompilerFlag) + CHECK_C_COMPILER_FLAG("-msse2" SSE2) + CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1) + CHECK_C_COMPILER_FLAG("-mavx" AVX) + endif() + + if(SSF_USE_SIMD_128 AND NOT SSE2) + get_property(_docstring CACHE SSF_USE_SIMD_128 PROPERTY HELPSTRING) + set(SSF_USE_SIMD_128 OFF CACHE BOOL ${_docstring} FORCE) + endif() + if(SSF_USE_SIMD_256 AND NOT AVX) + get_property(_docstring CACHE SSF_USE_SIMD_256 PROPERTY HELPSTRING) + set(SSF_USE_SIMD_128 OFF CACHE BOOL ${_docstring} FORCE) + endif() + + if(SSF_USE_SIMD_128) + set(_simd_size "${_simd_size};SSF_USE_SIMD_128") + endif() + + if(SSF_USE_SIMD_256) + set(_simd_size "${_simd_size};SSF_USE_SIMD_256") + endif() +endif() ################################################################################ # Define targets @@ -77,20 +125,16 @@ rcmake_prepend_path(SSF_FILES_INC_API ${SSF_SOURCE_DIR}) rcmake_prepend_path(SSF_FILES_DOC ${PROJECT_SOURCE_DIR}/../) if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_GNUCC) - set(MATH_LIB m) + set(_dependencies ${_dependencies} m) endif() -if(BUILD_STATIC) - add_library(ssf STATIC ${SSF_FILES_INC} ${SSF_FILES_SRC} ${SSF_FILES_INC_API}) - set_target_properties(ssf PROPERTIES COMPILE_DEFINITIONS SSF_STATIC_BUILD) -else() - add_library(ssf SHARED ${SSF_FILES_INC} ${SSF_FILES_SRC}) - target_link_libraries(ssf RSys StarSP ${MATH_LIB}) - set_target_properties(ssf PROPERTIES - DEFINE_SYMBOL SSF_SHARED_BUILD - VERSION ${VERSION} - SOVERSION ${VERSION_MAJOR}) -endif() +add_library(ssf SHARED ${SSF_FILES_INC} ${SSF_FILES_SRC}) +target_link_libraries(ssf ${_dependencies}) +set_target_properties(ssf PROPERTIES + DEFINE_SYMBOL SSF_SHARED_BUILD + COMPILE_DEFINITIONS "${_simd_size}" + VERSION ${VERSION} + SOVERSION ${VERSION_MAJOR}) rcmake_setup_devel(ssf StarSF ${VERSION} star/ssf_version.h) @@ -99,10 +143,18 @@ rcmake_setup_devel(ssf StarSF ${VERSION} star/ssf_version.h) ################################################################################ if(NOT NO_TEST) - function(new_test _name) + function(build_test _name) add_executable(${_name} ${SSF_SOURCE_DIR}/${_name}.c) target_link_libraries(${_name} ssf RSys StarSP ${MATH_LIB}) - add_test(${_name} ${_name}) + endfunction() + + function(register_test _name) + add_test(${_name} ${ARGN}) + endfunction() + + function(new_test _name) + build_test(${_name} ${ARGN}) + register_test(${_name} ${_name}) endfunction() new_test(test_ssf_beckmann_distribution) @@ -120,11 +172,19 @@ if(NOT NO_TEST) new_test(test_ssf_phase) new_test(test_ssf_phase_hg) new_test(test_ssf_phase_rayleigh) - new_test(test_ssf_phase_rdgfa) new_test(test_ssf_specular_dielectric_dielectric_reflection) new_test(test_ssf_specular_reflection) new_test(test_ssf_thin_specular_dielectric) + build_test(test_ssf_phase_rdgfa) + register_test(test_ssf_phase_rdgfa_simd_none test_ssf_phase_rdgfa simd_none) + if(SSF_USE_SIMD_128) + register_test(test_ssf_phase_rdgfa_simd_128 test_ssf_phase_rdgfa simd_128) + endif() + if(SSF_USE_SIMD_256) + register_test(test_ssf_phase_rdgfa_simd_256 test_ssf_phase_rdgfa simd_256) + endif() + rcmake_copy_runtime_libraries(test_ssf_beckmann_distribution) endif() diff --git a/src/ssf.h b/src/ssf.h @@ -57,6 +57,12 @@ enum ssf_bxdf_flag { SSF_GLOSSY = BIT(4) }; +enum ssf_simd { + SSF_SIMD_NONE, + SSF_SIMD_128, + SSF_SIMD_256 +}; + /* Generic BSDF type descriptor. Note that by convention the outgoing direction * `wo' and the incoming direction `wi' point outward the surface. Furthermore, * `wo' and the normal N must point on the same side of the surface. As a @@ -192,10 +198,12 @@ struct ssf_phase_rdgfa_setup_args { double fractal_dimension; /* No unit */ double gyration_radius; /* In nm */ - /* #intervals used to discretized the angular domain */ + /* Number of #intervals to use to discretize the angular domain */ size_t nintervals; + + enum ssf_simd simd; /* SIMD instruction sets to use */ }; -#define SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__ {0,0,0,1000} +#define SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__ {0,0,0,1000,SSF_SIMD_NONE} static const struct ssf_phase_rdgfa_setup_args SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT = SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__; diff --git a/src/ssf_phase_rdgfa.c b/src/ssf_phase_rdgfa.c @@ -21,6 +21,26 @@ #include <rsys/algorithm.h> #include <rsys/dynamic_array_double.h> +#if defined(SSF_USE_SIMD_128) || defined(SSF_USE_SIMD_256) + /* Helper macro */ + #define USE_SIMD +#endif + +#ifdef USE_SIMD + #include <rsimd/math.h> + #include <rsimd/rsimd.h> + + /* Generate the struct darray_simdf dynamic array */ + #define DARRAY_NAME simdf + #define DARRAY_DATA float + #ifdef SSF_USE_SIMD_256 + #define DARRAY_ALIGNMENT 64 + #else + #define DARRAY_ALIGNMENT 16 + #endif + #include <rsys/dynamic_array.h> +#endif + #define EXP1 2.7182818284590452354 struct rdgfa { @@ -30,9 +50,14 @@ struct rdgfa { double rcp_normalize_factor; /* Reciprocal normalization factor of the CDF */ - /* Discretized cumulative (#entries = nintervals[0] + nintervals[1]) */ + /* Discretized cumulative (#entries = nintervals) */ struct darray_double cdf; +#ifdef USE_SIMD + struct darray_simdf f_list; + struct darray_simdf d_omega_list; +#endif + /* Precomputed values */ double Rg2; /* gyration_radius^2 */ double cst_4pi_div_lambda; /* (4*PI)/wavelength */ @@ -115,7 +140,6 @@ eval2 return phase; } - /* Not normalized */ static FINLINE double eval(struct rdgfa* rdgfa, const double theta) @@ -123,7 +147,7 @@ eval(struct rdgfa* rdgfa, const double theta) return eval2(rdgfa, theta, cos(theta)); } -static res_T +static INLINE res_T compute_cumulative(struct rdgfa* rdgfa) { double* cdf = NULL; @@ -131,6 +155,7 @@ compute_cumulative(struct rdgfa* rdgfa) double theta1; size_t i; res_T res = RES_OK; + ASSERT(rdgfa); /* Allocate the cumulative array */ res = darray_double_resize(&rdgfa->cdf, rdgfa->nintervals); @@ -172,6 +197,16 @@ error: goto exit; } +/* Generate the simd functions if required */ +#ifdef SSF_USE_SIMD_128 + #define SIMD_WIDTH__ 4 + #include "ssf_phase_rdgfa_simdX.h" +#endif +#ifdef SSF_USE_SIMD_256 + #define SIMD_WIDTH__ 8 + #include "ssf_phase_rdgfa_simdX.h" +#endif + /******************************************************************************* * Private functions ******************************************************************************/ @@ -182,6 +217,10 @@ rdgfa_init(struct mem_allocator* allocator, void* phase) ASSERT(phase); memset(rdgfa, 0, sizeof(*rdgfa)); darray_double_init(allocator, &rdgfa->cdf); +#ifdef USE_SIMD + darray_simdf_init(allocator, &rdgfa->f_list); + darray_simdf_init(allocator, &rdgfa->d_omega_list); +#endif /* USE_SIMD */ return RES_OK; } @@ -191,6 +230,10 @@ rdgfa_release(void* phase) struct rdgfa* rdgfa = phase; ASSERT(phase); darray_double_release(&rdgfa->cdf); +#ifdef USE_SIMD + darray_simdf_release(&rdgfa->f_list); + darray_simdf_release(&rdgfa->d_omega_list); +#endif /* USE_SIMD */ } static double @@ -325,8 +368,31 @@ ssf_phase_rdgfa_setup rdgfa->g = pow(1 + 4*k2*rdgfa->Rg2/(3*Df), -Df*0.5); /* Precompute the phase function cumulative */ - res = compute_cumulative(rdgfa); - if(res != RES_OK) goto error; + switch(args->simd) { + case SSF_SIMD_NONE: + res = compute_cumulative(rdgfa); + if(res != RES_OK) goto error; + break; + case SSF_SIMD_128: + #ifdef SSF_USE_SIMD_128 + res = compute_cumulative_simd4(rdgfa); + if(res != RES_OK) goto error; + #else + res = RES_BAD_OP; + goto error; + #endif + break; + case SSF_SIMD_256: + #ifdef SSF_USE_SIMD_256 + res = compute_cumulative_simd8(rdgfa); + if(res != RES_OK) goto error; + #else + res = RES_BAD_OP; + goto error; + #endif + break; + default: FATAL("Unreachable code.\n"); break; + } exit: return res; diff --git a/src/ssf_phase_rdgfa_simdX.h b/src/ssf_phase_rdgfa_simdX.h @@ -0,0 +1,183 @@ +/* Copyright (C) 2016-2018, 2021 |Meso|Star> (contact@meso-star.com) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#if !defined(SIMD_WIDTH__) + #error "Undefined SIMD_WIDTH__ macro" +#endif +#if SIMD_WIDTH__ != 4 && SIMD_WIDTH__ != 8 + #error "Unexpected SIMD_WIDTH__ value of "STR(RSIMD_WIDTH__) +#endif + +/* Check that internal macros are not already defined */ +#if defined(vXf__) \ + || defined(vXf_T__) \ + || defined(SIMD_FUNC__) + #error "Unexpected macro definition" +#endif + +/* Macros generic to the SIMD_WIDTH__ */ +#define vXf_T CONCAT(CONCAT(v, SIMD_WIDTH__), f_T) +#define vXf(Func) CONCAT(CONCAT(CONCAT(v, SIMD_WIDTH__), f_), Func) +#define SIMD_FUNC(Func) CONCAT(CONCAT(Func, _simd), SIMD_WIDTH__) + +static INLINE vXf_T +SIMD_FUNC(eval_f)(struct rdgfa* rdgfa, const vXf_T theta) +{ + /* Input arguments */ + const vXf_T Df = vXf(set1)((float)rdgfa->fractal_dimension); + + /* Precompute constants */ + const vXf_T Rg2 = vXf(set1)((float)rdgfa->Rg2); + const vXf_T half_theta = vXf(mul)(theta, vXf(set1)(0.5f)); + + /* Precompute values */ + const vXf_T sin_half_theta = vXf(sin)(half_theta); + const vXf_T q = vXf(mul)(vXf(set1)((float)rdgfa->cst_4pi_div_lambda), sin_half_theta); + const vXf_T q2Rg2 = vXf(mul)(vXf(mul)(q, q), Rg2); + + /* Evaluate f(theta) when q2Rg2 < 1.5*Df */ + const vXf_T val0 = vXf(exp)(vXf(mul)(vXf(set1)(-1.f/3.f), q2Rg2)); + + /* Evaluate f(theta) when q2Rg2 >= 1.5*Df */ + const vXf_T tmp0 = vXf(div)(vXf(set1)((float)rdgfa->cst_3Df_div_2E), q2Rg2); + const vXf_T half_Df = vXf(mul)(Df, vXf(set1)(0.5f)); + const vXf_T val1 = vXf(pow)(tmp0, half_Df); + + /* Setup f */ + const vXf_T mask = vXf(lt)(q2Rg2, vXf(mul)(Df, vXf(set1)(1.5f))); + const vXf_T f = vXf(sel)(val1, val0, mask); + return f; +} + +static INLINE vXf_T +SIMD_FUNC(eval2) + (struct rdgfa* rdgfa, + const vXf_T theta, + const vXf_T cos_theta) +{ + const vXf_T f = SIMD_FUNC(eval_f)(rdgfa, theta); + const vXf_T g = vXf(set1)((float)rdgfa->g); + const vXf_T cos2_theta = vXf(mul)(cos_theta, cos_theta); + const vXf_T cst0 = vXf(set1)(3.f/(16.f*(float)PI)); + const vXf_T tmp0 = vXf(div)(f, g); + const vXf_T tmp1 = vXf(add)(vXf(set1)(1), cos2_theta); + const vXf_T phase = vXf(mul)(vXf(mul)(cst0, tmp0), tmp1); + return phase; +} + +static INLINE vXf_T +SIMD_FUNC(eval)(struct rdgfa* rdgfa, const vXf_T theta) +{ + return SIMD_FUNC(eval2)(rdgfa, theta, vXf(cos)(theta)); +} + +static INLINE res_T +SIMD_FUNC(compute_cumulative)(struct rdgfa* rdgfa) +{ + vXf_T dtheta; + vXf_T theta1; + vXf_T step; + vXf_T two_PI; + float* f_list = NULL; + float* d_omega_list = NULL; + double* cdf = NULL; + size_t nangles; + size_t i; + res_T res = RES_OK; + ASSERT(rdgfa); + + /* Force the number of angles to be a multiple of the SIMD width */ + nangles = rdgfa->nintervals + 1; + nangles = (nangles + SIMD_WIDTH__-1)/ SIMD_WIDTH__ * SIMD_WIDTH__; + + /* Allocate the cumulative array */ + res = darray_double_resize(&rdgfa->cdf, rdgfa->nintervals); + if(res != RES_OK) goto error; + + /* Allocate temporaries arrays */ + res = darray_simdf_resize(&rdgfa->f_list, nangles); + if(res != RES_OK) goto error; + res = darray_simdf_resize(&rdgfa->d_omega_list, nangles); + if(res != RES_OK) goto error; + + /* Fetch data */ + cdf = darray_double_data_get(&rdgfa->cdf); + f_list = darray_simdf_data_get(&rdgfa->f_list); + d_omega_list = darray_simdf_data_get(&rdgfa->d_omega_list); + + /* Compute the angular step for the angular domain */ + rdgfa->dtheta = PI / (double)rdgfa->nintervals; + + step = vXf(set1)((float)rdgfa->dtheta*(float)SIMD_WIDTH__); + dtheta = vXf(set1)((float)rdgfa->dtheta); + two_PI = vXf(set1)((float)(2*PI)); +#if SIMD_WIDTH__ == 4 + theta1 = vXf(mul)(dtheta, vXf(set)(0, 1, 2, 3)); +#elif SIMD_WIDTH__ == 8 + theta1 = vXf(mul)(dtheta, vXf(set)(0, 1, 2, 3, 4, 5, 6, 7)); +#endif + + /* Compute f and d_omaga */ + FOR_EACH(i, 0, nangles/SIMD_WIDTH__) { + /* Compute f */ + const vXf_T f = SIMD_FUNC(eval)(rdgfa, theta1); + + /* Compute d_omega */ + const vXf_T theta2 = vXf(add)(theta1, dtheta); + const vXf_T tmp0 = vXf(mul)(vXf(add)(theta1, theta2), vXf(set1)(0.5f)); + const vXf_T d_omega = vXf(mul)(vXf(mul)(two_PI, vXf(sin)(tmp0)), dtheta); + + /* Store the result */ + vXf(store)(&f_list[i*SIMD_WIDTH__], f); + vXf(store)(&d_omega_list[i*SIMD_WIDTH__], d_omega); + + /* Go to the next angles */ + theta1 = vXf(add)(theta1, step); + } + + /* Compute the (unormalized) cumulative */ + FOR_EACH(i, 0, rdgfa->nintervals) { + const double f1 = f_list[i+0]; + const double f2 = f_list[i+1]; + const double d_omega = d_omega_list[i]; + const double tmp = (f1 + f2) * 0.5 * d_omega; + cdf[i] = (i == 0 ? tmp : tmp + cdf[i-1]); + } + + /* Save the normalization factor */ + rdgfa->rcp_normalize_factor = 1.0 / cdf[rdgfa->nintervals-1]; + + /* Finally normalize the CDF */ + FOR_EACH(i, 0, rdgfa->nintervals) { + cdf[i] *= rdgfa->rcp_normalize_factor; + } + +exit: + return res; +error: + darray_double_clear(&rdgfa->cdf); + darray_simdf_clear(&rdgfa->f_list); + darray_simdf_clear(&rdgfa->d_omega_list); + goto exit; +} + +/* Undef generic macros */ +#undef vXf_T +#undef vXf +#undef SIMD_FUNC + +/* Undef parameter */ +#undef SIMD_WIDTH__ + diff --git a/src/test_ssf_phase_rdgfa.c b/src/test_ssf_phase_rdgfa.c @@ -16,6 +16,8 @@ #include "ssf.h" #include "test_ssf_utils.h" +#include <string.h> + int main(int argc, char** argv) { @@ -30,9 +32,26 @@ main(int argc, char** argv) struct ssp_rng* rng; double cumulative_prev; double wo[3]; + int err = 0; size_t i; (void)argc, (void)argv; + if(argc <= 1) { + fprintf(stderr, "Usage: %s <simd_none|simd_128|simd_256>\n", argv[0]); + goto error; + } + + if(!strcmp(argv[1], "simd_none")) { + args.simd = SSF_SIMD_NONE; + } else if(!strcmp(argv[1], "simd_128")) { + args.simd = SSF_SIMD_128; + } else if(!strcmp(argv[1], "simd_256")) { + args.simd = SSF_SIMD_256; + } else { + fprintf(stderr, "Invalid argument '%s'.\n", argv[1]); + goto error; + } + mem_init_proxy_allocator(&allocator, &mem_default_allocator); CHK(ssp_rng_create(&allocator, &ssp_rng_mt19937_64, &rng) == RES_OK); @@ -87,7 +106,7 @@ main(int argc, char** argv) CHK(eq_eps(pdf, ssf_phase_eval(phase, wo, wi), fabs(pdf*1.e-6))); CHK(d3_is_normalized(wi)); -#if 0 +#if 1 fprintf(stderr, "v %g %g %g\n", wi[0]*pdf, wi[1]*pdf, wi[2]*pdf); fprintf(stderr, "p %lu\n", (unsigned long)(i+1)); #endif @@ -100,6 +119,10 @@ main(int argc, char** argv) check_memory_allocator(&allocator); mem_shutdown_proxy_allocator(&allocator); CHK(mem_allocated_size() == 0); - return 0; +exit: + return err; +error: + err = -1; + goto exit; }