commit 83e34981cd05b07d2b156c24cfa911f442822251
parent 477fb701f096b706c222a18958146814be6d5a8b
Author: Vincent Forest <vincent.forest@meso-star.com>
Date: Mon, 1 Feb 2021 10:53:49 +0100
Add and test the radcoesfs_compute_simd4 function
Diffstat:
3 files changed, 318 insertions(+), 6 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -13,7 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.1)
project(atrstm C)
enable_testing()
@@ -28,6 +28,7 @@ find_package(AtrTP 0.0 REQUIRED)
find_package(OpenMP 1.2 REQUIRED)
find_package(RCMake 0.4 REQUIRED)
find_package(RSys 0.12 REQUIRED)
+find_package(RSIMD 0.3 REQUIRED)
find_package(StarTetraHedra 0.0 REQUIRED)
find_package(StarUVM 0.0 REQUIRED)
find_package(StarVX 0.2 REQUIRED)
@@ -69,6 +70,7 @@ set(ATRSTM_FILES_INC
atrstm_log.h
atrstm_partition.h
atrstm_radcoefs.h
+ atrstm_radcoefs_simd4.h
atrstm_setup_octrees.h
atrstm_svx.h)
@@ -84,7 +86,7 @@ rcmake_prepend_path(ATRSTM_FILES_INC_API ${ATRSTM_SOURCE_DIR})
rcmake_prepend_path(ATRSTM_FILES_DOC ${PROJECT_SOURCE_DIR}/../)
add_library(atrstm SHARED ${ATRSTM_FILES_SRC} ${ATRSTM_FILES_INC} ${ATRSTM_FILES_INC_API})
-target_link_libraries(atrstm AtrRI AtrTP RSys StarTetraHedra StarUVM StarVX m)
+target_link_libraries(atrstm AtrRI AtrTP RSys RSIMD StarTetraHedra StarUVM StarVX m)
if(CMAKE_COMPILER_IS_GNUCC)
set_target_properties(atrstm PROPERTIES LINK_FLAGS "${OpenMP_C_FLAGS}")
diff --git a/src/atrstm_radcoefs_simd4.h b/src/atrstm_radcoefs_simd4.h
@@ -0,0 +1,154 @@
+/* Copyright (C) 2020, 2021 CNRS
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef ATRSTM_RADCOEFS_SIMD4_H
+#define ATRSTM_RADCOEFS_SIMD4_H
+
+#include "atrstm.h"
+
+#include <rsimd/math.h>
+#include <rsimd/rsimd.h>
+
+#include <rsys/math.h>
+#include <rsys/rsys.h>
+
+/* Radiative coefficients in m^-1 */
+struct radcoefs_simd4 {
+ v4f_T ka; /* Absorption coefficient */
+ v4f_T ks; /* Scattering coefficient */
+ v4f_T kext; /* Extinction coefficient */
+};
+static const struct radcoefs_simd4 RADCOEFS_SIMD4_NULL;
+
+struct radcoefs_compute_simd4_args {
+ v4f_T lambda; /* wavelength [nm] */
+ v4f_T n; /* Refractive index (real component) */
+ v4f_T kappa; /* Refractive index (imaginary component) */
+ v4f_T gyration_radius_prefactor;
+ v4f_T fractal_dimension;
+ v4f_T soot_volumic_fraction; /* In m^3(soot) / m^3 */
+ v4f_T soot_primary_particles_count;
+ v4f_T soot_primary_particles_diameter; /* In nm */
+ int radcoefs_mask; /* Combination of atrstm_radcoef_flag */
+};
+static const struct radcoefs_compute_simd4_args RADCOEFS_COMPUTE_SIMD4_ARGS_NULL;
+
+/* SIMD version of the radcoefs_compute function. Please refer to its scalar
+ * version for informations of what it does */
+static INLINE void
+radcoefs_compute_simd4
+ (struct radcoefs_simd4* radcoefs,
+ const struct radcoefs_compute_simd4_args* args)
+{
+ /* Constant */
+ const v4f_T pi = v4f_set1((float)PI);
+
+ /* Input variables */
+ const v4f_T lambda = args->lambda; /* [nm] */
+ const v4f_T n = args->n;
+ const v4f_T kappa = args->kappa;
+ const v4f_T Fv = args->soot_volumic_fraction; /* [nm^3(soot)/nm] */
+ const v4f_T Np = args->soot_primary_particles_count;
+ const v4f_T Dp = args->soot_primary_particles_diameter; /* [nm] */
+ const v4f_T kf = args->gyration_radius_prefactor;
+ const v4f_T Df = args->fractal_dimension;
+ const int ka = (args->radcoefs_mask & ATRSTM_RADCOEF_FLAG_Ka) != 0;
+ const int ks = (args->radcoefs_mask & ATRSTM_RADCOEF_FLAG_Ks) != 0;
+ const int kext = (args->radcoefs_mask & ATRSTM_RADCOEF_FLAG_Kext) != 0;
+
+ /* SIMD mask */
+ const v4f_T is_Fv_not_null = v4f_neq(Fv, v4f_zero());
+ const v4f_T is_Np_not_null = v4f_neq(Np, v4f_zero());
+ const v4f_T query_radcoefs =
+ v4f_mask1(args->radcoefs_mask & ATRSTM_RADCOEFS_MASK_ALL ? ~0 : 0);
+ const v4f_T mask =
+ v4f_and(v4f_and(is_Fv_not_null, is_Np_not_null), query_radcoefs);
+
+ /* Clean up radiative coefficients */
+ radcoefs->ka = v4f_zero();
+ radcoefs->ks = v4f_zero();
+ radcoefs->kext = v4f_zero();
+
+ if(v4f_movemask(mask) != 0) {
+ /* Precomputed values */
+ const v4f_T n2 = v4f_mul(n, n);
+ const v4f_T kappa2 = v4f_mul(kappa, kappa);
+ const v4f_T four_n2_kappa2 = v4f_mul(v4f_set1(4), v4f_mul(n2, kappa2));
+ const v4f_T xp = v4f_div(v4f_mul(pi, Dp), lambda);
+ const v4f_T xp3 = v4f_mul(v4f_mul(xp, xp), xp);
+ const v4f_T k = v4f_div(v4f_mul(v4f_set1(2), pi), lambda);
+ const v4f_T k2 = v4f_mul(k, k);
+
+ const v4f_T Dp3 = v4f_mul(v4f_mul(Dp, Dp), Dp);
+ const v4f_T Np_pi_Dp3 = v4f_mul(v4f_mul(Np, pi), Dp3);
+ const v4f_T Va = v4f_div(Np_pi_Dp3, v4f_set1(6)); /* [nm^3] */
+ const v4f_T rho = v4f_div(Fv, Va); /* [#aggregates / nm^3] */
+ const v4f_T tmp0 = v4f_mul(v4f_set1(1e9f), rho);
+
+ /* E(m), m = n + i*kappa */
+ const v4f_T tmp1 = v4f_add(v4f_sub(n2, kappa2), v4f_set1(2));
+ const v4f_T denom = v4f_madd(tmp1, tmp1, four_n2_kappa2);
+ const v4f_T six_n_kappa = v4f_mul(v4f_mul(v4f_set1(6), n), kappa);
+ const v4f_T Em = v4f_div(six_n_kappa, denom);
+
+ if(ka || kext) {
+ /* Absorption cross section, [nm^3/aggrefate] */
+ const v4f_T four_pi_xp3 = v4f_mul(v4f_mul(v4f_set1(4), pi), xp3);
+ const v4f_T Cabs = v4f_mul(v4f_mul(Np, v4f_div(four_pi_xp3, k2)), Em);
+
+ /* Absorption coefficient, [m^-1] */
+ radcoefs->ka = v4f_and(v4f_mul(tmp0, Cabs), mask);
+ }
+
+ if(ks || kext) {
+ /* F(m), m = n + i*kappa */
+ const v4f_T n2_sub_kappa2 = v4f_sub(n2, kappa2);
+ const v4f_T n2_sub_kappa2_sub_1 = v4f_sub(n2_sub_kappa2, v4f_set1(1));
+ const v4f_T n2_sub_kappa2_add_2 = v4f_add(n2_sub_kappa2, v4f_set1(2));
+ const v4f_T tmp2 = v4f_mul(n2_sub_kappa2_sub_1, n2_sub_kappa2_add_2);
+ const v4f_T tmp3 = v4f_div(v4f_add(tmp2, four_n2_kappa2), denom);
+ const v4f_T Fm = v4f_madd(tmp3, tmp3, v4f_mul(Em, Em));
+
+ /* Gyration radius */
+ const v4f_T tmp4 = v4f_pow(v4f_div(Np, kf), v4f_rcp(Df));
+ const v4f_T Rg = v4f_mul(v4f_mul(v4f_set1(0.5f), Dp), tmp4); /* [nm] */
+ const v4f_T Rg2 = v4f_mul(Rg, Rg);
+ const v4f_T four_k2_Rg2 = v4f_mul(v4f_mul(v4f_set1(4), k2), Rg2);
+ const v4f_T tmp5 = v4f_div(four_k2_Rg2, v4f_mul(v4f_set1(3), Df));
+ const v4f_T tmp6 = v4f_add(v4f_set1(1), tmp5);
+ const v4f_T g = v4f_pow(tmp6, v4f_mul(Df, v4f_set1(-0.5f)));
+
+ /* Scattering cross section, [nm^3/aggrefate] */
+ const v4f_T Np2 = v4f_mul(Np, Np);
+ const v4f_T xp6 = v4f_mul(xp3, xp3);
+ const v4f_T eight_pi_xp6 = v4f_mul(v4f_mul(v4f_set1(8), pi), xp6);
+ const v4f_T tmp7 = v4f_div(eight_pi_xp6, v4f_mul(v4f_set1(3), k2));
+ const v4f_T Csca = v4f_mul(v4f_mul(v4f_mul(Np2, tmp7), Fm), g);
+
+ /* Scattering coefficient, [m^-1] */
+ radcoefs->ks = v4f_and(v4f_mul(tmp0, Csca), mask);
+ }
+
+ if(kext) {
+ radcoefs->kext = v4f_add(radcoefs->ka, radcoefs->ks);
+ }
+
+ ASSERT(v4f_movemask(mask) && v4f_movemask(v4f_neq(denom, v4f_zero())));
+ ASSERT(v4f_movemask(mask) && v4f_movemask(v4f_neq(Df, v4f_zero())));
+ ASSERT(v4f_movemask(mask) && v4f_movemask(v4f_neq(Dp, v4f_zero())));
+ }
+}
+
+#endif /* ATRSTM_RADCOEFS_SIMD4_H */
diff --git a/src/test_atrstm_radcoefs.c b/src/test_atrstm_radcoefs.c
@@ -14,16 +14,16 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include "atrstm_radcoefs.h"
+#include "atrstm_radcoefs_simd4.h"
-int
-main(int argc, char** argv)
+static void
+test_scalar(void)
{
const double ka_ref = 5.7382401729092799E-1;
const double ks_ref = 7.2169062018378995E-6;
- struct radcoefs radcoefs;
+ struct radcoefs radcoefs = RADCOEFS_NULL;
struct radcoefs_compute_args args = RADCOEFS_COMPUTE_ARGS_NULL;
- (void)argc, (void)argv;
args.lambda = 633;
args.n = 1.90;
@@ -60,5 +60,161 @@ main(int argc, char** argv)
radcoefs_compute(&radcoefs, &args);
CHK(eq_eps(radcoefs.kext, ka_ref+ks_ref, (ka_ref+ks_ref)*1e-6));
+ args.radcoefs_mask = ATRSTM_RADCOEFS_MASK_ALL;
+
+ args.soot_primary_particles_count = 0;
+ radcoefs_compute(&radcoefs, &args);
+ CHK(radcoefs.ka == 0);
+ CHK(radcoefs.ks == 0);
+ CHK(radcoefs.kext == 0);
+
+ args.soot_primary_particles_count = 100;
+ args.soot_volumic_fraction = 0;
+ radcoefs_compute(&radcoefs, &args);
+ CHK(radcoefs.ka == 0);
+ CHK(radcoefs.ks == 0);
+ CHK(radcoefs.kext == 0);
+}
+
+static void
+test_simd(void)
+{
+ const double ka_ref = 5.7382401729092799E-1;
+ const double ks_ref = 7.2169062018378995E-6;
+
+ const double ka_ref2[4] = {
+ 0.52178067472799794,
+ 0.52178067472799794,
+ 1.0435613494559959,
+ 0.52178067472799794
+ };
+ const double ks_ref2[4] = {
+ 9.6010140939975883e-002,
+ 3.3272961678492224e-002,
+ 0.19202028187995177,
+ 9.9964602374815484e-002
+ };
+ struct radcoefs_simd4 radcoefs = RADCOEFS_SIMD4_NULL;
+ struct radcoefs_compute_simd4_args args = RADCOEFS_COMPUTE_SIMD4_ARGS_NULL;
+ float ALIGN(16) ka[4], ks[4], kext[4];
+
+ args.lambda = v4f_set1(633.f);
+ args.n = v4f_set1(1.90f);
+ args.kappa = v4f_set1(0.55f);
+ args.gyration_radius_prefactor = v4f_set1(1.70f);
+ args.fractal_dimension = v4f_set1(1.75f);
+ args.soot_volumic_fraction = v4f_set(1e-7f, 0.f, 1e-7f, 1e-7f);
+ args.soot_primary_particles_count = v4f_set(100.f, 100.f, 0.f, 100.f);
+ args.soot_primary_particles_diameter = v4f_set1(1.f);
+ args.radcoefs_mask = ATRSTM_RADCOEFS_MASK_ALL;
+
+ radcoefs_compute_simd4(&radcoefs, &args);
+
+ v4f_store(ka, radcoefs.ka);
+ v4f_store(ks, radcoefs.ks);
+ v4f_store(kext, radcoefs.kext);
+ printf("ka = {%g, %g, %g, %g}; ks = {%g, %g, %g, %g}\n",
+ SPLIT4(ka), SPLIT4(ks));
+
+ CHK(eq_eps(ka[0], ka_ref, ka_ref*1.e-6));
+ CHK(eq_eps(ka[3], ka_ref, ka_ref*1.e-6));
+ CHK(ka[1] == 0);
+ CHK(ka[2] == 0);
+
+ CHK(eq_eps(ks[0], ks_ref, ks_ref*1.e-6));
+ CHK(eq_eps(ks[3], ks_ref, ks_ref*1.e-6));
+ CHK(ks[1] == 0);
+ CHK(ks[2] == 0);
+
+ CHK(eq_eps(kext[0], ka_ref+ks_ref, (ka_ref+ks_ref)*1e-6));
+ CHK(eq_eps(kext[3], ka_ref+ks_ref, (ka_ref+ks_ref)*1e-6));
+ CHK(kext[1] == 0);
+ CHK(kext[2] == 0);
+
+ args.radcoefs_mask = ATRSTM_RADCOEF_FLAG_Ka;
+ radcoefs_compute_simd4(&radcoefs, &args);
+ v4f_store(ka, radcoefs.ka);
+ v4f_store(ks, radcoefs.ks);
+ v4f_store(kext, radcoefs.kext);
+ CHK(eq_eps(ka[0], ka_ref, ka_ref*1.e-6));
+ CHK(eq_eps(ka[3], ka_ref, ka_ref*1.e-6));
+ CHK(ka[1] == 0);
+ CHK(ka[2] == 0);
+ CHK(ks[0] == 0 && ks[1] == 0 && ks[2] == 0 && ks[0] == 0);
+ CHK(kext[0] == 0 && kext[1] == 0 && kext[2] == 0 && kext[0] == 0);
+
+ args.radcoefs_mask = ATRSTM_RADCOEF_FLAG_Ks;
+ radcoefs_compute_simd4(&radcoefs, &args);
+ v4f_store(ka, radcoefs.ka);
+ v4f_store(ks, radcoefs.ks);
+ v4f_store(kext, radcoefs.kext);
+ CHK(eq_eps(ks[0], ks_ref, ks_ref*1.e-6));
+ CHK(eq_eps(ks[3], ks_ref, ks_ref*1.e-6));
+ CHK(ks[1] == 0);
+ CHK(ks[2] == 0);
+ CHK(ka[0] == 0 && ka[1] == 0 && ka[2] == 0 && ka[0] == 0);
+ CHK(kext[0] == 0 && kext[1] == 0 && kext[2] == 0 && kext[0] == 0);
+
+ /* Note that actually even though Ka and Ks are note required they are
+ * internally computed to evaluate kext and are returned to the caller. Their
+ * value are thus not null */
+ args.radcoefs_mask = ATRSTM_RADCOEF_FLAG_Kext;
+ radcoefs_compute_simd4(&radcoefs, &args);
+ v4f_store(kext, radcoefs.kext);
+ CHK(eq_eps(kext[0], ka_ref+ks_ref, (ka_ref+ks_ref)*1e-6));
+ CHK(eq_eps(kext[3], ka_ref+ks_ref, (ka_ref+ks_ref)*1e-6));
+ CHK(kext[1] == 0);
+ CHK(kext[2] == 0);
+
+ args.lambda = v4f_set1(633.f);
+ args.n = v4f_set1(1.75f);
+ args.kappa = v4f_set1(0.435f);
+ args.gyration_radius_prefactor = v4f_set1(1.70f);
+ args.fractal_dimension = v4f_set1(1.75f);
+ args.soot_volumic_fraction = v4f_set
+ (9.9999999999999995e-008f,
+ 9.9999999999999995e-008f,
+ 1.9999999999999999e-007f,
+ 9.9999999999999995e-008f);
+ args.soot_primary_particles_count = v4f_set
+ (400.f,
+ 400.f,
+ 400.f,
+ 800.f);
+ args.soot_primary_particles_diameter = v4f_set
+ (34.000000006413003f,
+ 17.000000003206502f,
+ 34.000000006413003f,
+ 34.000000006413003f);
+ args.radcoefs_mask = ATRSTM_RADCOEFS_MASK_ALL;
+ radcoefs_compute_simd4(&radcoefs, &args);
+ v4f_store(ka, radcoefs.ka);
+ v4f_store(ks, radcoefs.ks);
+ v4f_store(kext, radcoefs.kext);
+ printf("ka = {%g, %g, %g, %g}; ks = {%g, %g, %g, %g}\n",
+ SPLIT4(ka), SPLIT4(ks));
+ CHK(eq_eps(ka[0], ka_ref2[0], ka_ref2[0]*1.e-6));
+ CHK(eq_eps(ka[1], ka_ref2[1], ka_ref2[1]*1.e-6));
+ CHK(eq_eps(ka[2], ka_ref2[2], ka_ref2[2]*1.e-6));
+ CHK(eq_eps(ka[3], ka_ref2[3], ka_ref2[3]*1.e-6));
+ CHK(eq_eps(ks[0], ks_ref2[0], ks_ref2[0]*1.e-6));
+ CHK(eq_eps(ks[1], ks_ref2[1], ks_ref2[1]*1.e-6));
+ CHK(eq_eps(ks[2], ks_ref2[2], ks_ref2[2]*1.e-6));
+ CHK(eq_eps(ks[3], ks_ref2[3], ks_ref2[3]*1.e-6));
+ CHK(eq_eps(kext[0], ka_ref2[0]+ks_ref2[0], (ka_ref2[0]+ks_ref2[0])*1.e-6));
+ CHK(eq_eps(kext[1], ka_ref2[1]+ks_ref2[1], (ka_ref2[1]+ks_ref2[1])*1.e-6));
+ CHK(eq_eps(kext[2], ka_ref2[2]+ks_ref2[2], (ka_ref2[2]+ks_ref2[2])*1.e-6));
+ CHK(eq_eps(kext[3], ka_ref2[3]+ks_ref2[3], (ka_ref2[3]+ks_ref2[3])*1.e-6));
+}
+
+
+int
+main(int argc, char** argv)
+{
+ (void)argc, (void)argv;
+
+ test_scalar();
+ test_simd();
+
return 0;
}