Do not use OpenMP to perform accumulation reduction - stardis-solver

commit 8c9afb07f5524a5f4057182f54710804d766c0ee
parent 7430763d68f3ef2e1254248eedaf77c6d9183c8b
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Fri,  8 Mar 2019 17:43:32 +0100

Do not use OpenMP to perform accumulation reduction

Diffstat:
M src/sdis_Xd_begin.h  | 26 ++++++++++++++++++++++++++
M src/sdis_solve_boundary_Xd.h  | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M src/sdis_solve_medium_Xd.h  | 14 +++++---------
M src/sdis_solve_probe_Xd.h  | 40 +++++++++++++++++++++++-----------------
M src/sdis_solve_probe_boundary_Xd.h  | 148 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------

5 files changed, 223 insertions(+), 133 deletions(-)
diff --git a/src/sdis_Xd_begin.h b/src/sdis_Xd_begin.h
@@ -22,6 +22,14 @@
 struct green_path_handle;
 struct sdis_heat_path;
 
+struct accum {
+  double sum; /* Sum of MC weights */
+  double sum2; /* Sum of square MC weights */
+  size_t count; /* #accumulated MC weights */
+};
+#define ACCUM_NULL__ {0,0,0}
+static const struct accum ACCUM_NULL = ACCUM_NULL__;
+
 struct rwalk_context {
   struct green_path_handle* green_path;
   struct sdis_heat_path* heat_path;
@@ -31,6 +39,24 @@ struct rwalk_context {
 #define RWALK_CONTEXT_NULL__ {NULL, NULL, 0, 0}
 static const struct rwalk_context RWALK_CONTEXT_NULL = RWALK_CONTEXT_NULL__;
 
+static INLINE void
+sum_accums
+  (const struct accum accums[],
+   const size_t naccums,
+   struct accum* accum)
+{
+  struct accum acc = ACCUM_NULL;
+  size_t i;
+  ASSERT(accums && naccums && accum);
+
+  FOR_EACH(i, 0, naccums) {
+    acc.sum += accums[i].sum;
+    acc.sum2 += accums[i].sum2;
+    acc.count += accums[i].count;
+  }
+  *accum = acc;
+}
+
 #endif /* SDIS_XD_BEGIN_H */
 
 #ifdef SDIS_XD_BEGIN_H__
diff --git a/src/sdis_solve_boundary_Xd.h b/src/sdis_solve_boundary_Xd.h
@@ -93,10 +93,9 @@ XD(solve_boundary)
   struct sdis_estimator* estimator = NULL;
   struct ssp_rng_proxy* rng_proxy = NULL;
   struct ssp_rng** rngs = NULL;
+  struct accum* accums = NULL;
   size_t i;
-  size_t N = 0; /* #realisations that do not fail */
   size_t view_nprims;
-  double weight=0, sqr_weight=0;
   int64_t irealisation;
   ATOMIC res = RES_OK;
 
@@ -174,16 +173,21 @@ XD(solve_boundary)
     if(res != RES_OK) goto error;
   }
 
+  /* Create the per thread accumulator */
+  accums = MEM_CALLOC(scn->dev->allocator, scn->dev->nthreads, sizeof(*accums));
+  if(!accums) { res = RES_MEM_ERR; goto error; }
+
   /* Create the estimator */
   res = estimator_create(scn->dev, SDIS_ESTIMATOR_TEMPERATURE, &estimator);
   if(res != RES_OK) goto error;
 
   omp_set_num_threads((int)scn->dev->nthreads);
-  #pragma omp parallel for schedule(static) reduction(+:weight,sqr_weight,N)
+  #pragma omp parallel for schedule(static)
   for(irealisation=0; irealisation<(int64_t)nrealisations; ++irealisation) {
     const int ithread = omp_get_thread_num();
-    struct sXd(primitive) prim;
     struct ssp_rng* rng = rngs[ithread];
+    struct accum* accum = &accums[ithread];
+    struct sXd(primitive) prim;
     enum sdis_side side;
     size_t iprim;
     double w = NaN;
@@ -226,29 +230,32 @@ XD(solve_boundary)
 
     /* Update the MC accumulators */
     if(res_local == RES_OK) {
-      weight += w;
-      sqr_weight += w*w;
-      ++N;
+      accum->sum += w;
+      accum->sum2 += w*w;
+      ++accum->count;
     } else if(res_local != RES_BAD_OP) {
       ATOMIC_SET(&res, res_local);
       continue;
     }
   }
 
+  sum_accums(accums, scn->dev->nthreads, &accums[0]);
+
   /* Setup the estimated temperature */
-  estimator_setup_realisations_count(estimator, nrealisations, N);
-  estimator_setup_temperature(estimator, weight, sqr_weight);
+  estimator_setup_realisations_count(estimator, nrealisations, accums[0].count);
+  estimator_setup_temperature(estimator, accums[0].sum, accums[0].sum2);
 
 exit:
+  if(rngs) {
+    FOR_EACH(i, 0, scn->dev->nthreads) {if(rngs[i]) SSP(rng_ref_put(rngs[i]));}
+    MEM_RM(scn->dev->allocator, rngs);
+  }
+  if(accums) MEM_RM(scn->dev->allocator, accums);
   if(scene) SXD(scene_ref_put(scene));
   if(shape) SXD(shape_ref_put(shape));
   if(view) SXD(scene_view_ref_put(view));
   if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy));
   if(out_estimator) *out_estimator = estimator;
-  if(rngs) {
-    FOR_EACH(i, 0, scn->dev->nthreads) {if(rngs[i]) SSP(rng_ref_put(rngs[i]));}
-    MEM_RM(scn->dev->allocator, rngs);
-  }
   return (res_T)res;
 error:
   if(estimator) {
@@ -278,12 +285,11 @@ XD(solve_boundary_flux)
   struct sdis_estimator* estimator = NULL;
   struct ssp_rng_proxy* rng_proxy = NULL;
   struct ssp_rng** rngs = NULL;
-  double weight_t = 0, sqr_weight_t = 0;
-  double weight_fc = 0, sqr_weight_fc = 0;
-  double weight_fr = 0, sqr_weight_fr = 0;
-  double weight_f = 0, sqr_weight_f = 0;
+  struct accum* acc_tp = NULL; /* Per thread temperature accumulator */
+  struct accum* acc_fl = NULL; /* Per thread flux accumulator */
+  struct accum* acc_fc = NULL; /* Per thread convective flux accumulator */
+  struct accum* acc_fr = NULL; /* Per thread radiative flux accumulator */
   size_t i;
-  size_t N = 0; /* #realisations that do not fail */
   size_t view_nprims;
   int64_t irealisation;
   ATOMIC res = RES_OK;
@@ -363,17 +369,31 @@ XD(solve_boundary_flux)
     if(res != RES_OK) goto error;
   }
 
+  /* Create the per thread accumulator */
+  #define ALLOC_ACCUMS(Dst) {                                                  \
+    Dst = MEM_CALLOC(scn->dev->allocator, scn->dev->nthreads, sizeof(*Dst));   \
+    if(!Dst) { res = RES_MEM_ERR; goto error; }                                \
+  } (void)0
+  ALLOC_ACCUMS(acc_tp);
+  ALLOC_ACCUMS(acc_fc);
+  ALLOC_ACCUMS(acc_fl);
+  ALLOC_ACCUMS(acc_fr);
+  #undef ALLOC_ACCUMS
+
   /* Create the estimator */
   res = estimator_create(scn->dev, SDIS_ESTIMATOR_FLUX, &estimator);
   if(res != RES_OK) goto error;
 
   omp_set_num_threads((int)scn->dev->nthreads);
-  #pragma omp parallel for schedule(static) reduction(+:weight_t,sqr_weight_t,\
-     weight_fc,sqr_weight_fc,weight_fr,sqr_weight_fr,weight_f,sqr_weight_f,N)
+  #pragma omp parallel for schedule(static)
   for(irealisation = 0; irealisation < (int64_t)nrealisations; ++irealisation) {
     const int ithread = omp_get_thread_num();
-    struct sXd(primitive) prim;
     struct ssp_rng* rng = rngs[ithread];
+    struct accum* acc_temp = &acc_tp[ithread];
+    struct accum* acc_flux = &acc_fl[ithread];
+    struct accum* acc_fcon = &acc_fc[ithread];
+    struct accum* acc_frad = &acc_fr[ithread];
+    struct sXd(primitive) prim;
     struct sdis_interface_fragment frag = SDIS_INTERFACE_FRAGMENT_NULL;
     const struct sdis_interface* interf;
     const struct sdis_medium *fmd, *bmd;
@@ -444,48 +464,66 @@ XD(solve_boundary_flux)
     if(hc > 0) flux_mask |= FLUX_FLAG_CONVECTIVE;
     res_local = XD(boundary_flux_realisation)(scn, rng, iprim, uv, time,
       solid_side, fp_to_meter, Tarad, Tref, flux_mask, T_brf);
-    if(res_local != RES_OK) {
-      if(res_local != RES_BAD_OP) {
-        ATOMIC_SET(&res, res_local);
-        continue;
-      }
-    } else {
+    if(res_local == RES_OK) {
       const double Tboundary = T_brf[0];
       const double Tradiative = T_brf[1];
       const double Tfluid = T_brf[2];
       const double w_conv = hc * (Tboundary - Tfluid);
       const double w_rad = hr * (Tboundary - Tradiative);
       const double w_total = w_conv + w_rad;
-      weight_t += Tboundary;
-      sqr_weight_t += Tboundary * Tboundary;
-      weight_fc += w_conv;
-      sqr_weight_fc += w_conv * w_conv;
-      weight_fr += w_rad;
-      sqr_weight_fr += w_rad * w_rad;
-      weight_f += w_total;
-      sqr_weight_f += w_total * w_total;
-      ++N;
+      /* Temperature */
+      acc_temp->sum += Tboundary;
+      acc_temp->sum2 += Tboundary*Tboundary;
+      ++acc_temp->count;
+      /* Overwall flux */
+      acc_flux->sum += w_total;
+      acc_flux->sum2 += w_total*w_total;
+      ++acc_flux->count;
+      /* Convective flux */
+      acc_fcon->sum  += w_conv;
+      acc_fcon->sum2 += w_conv*w_conv;
+      ++acc_fcon->count;
+      /* Radiative flux */
+      acc_frad->sum += w_rad;
+      acc_frad->sum2 += w_rad*w_rad;
+      ++acc_frad->count;
+    } else if(res_local != RES_BAD_OP) {
+      ATOMIC_SET(&res, res_local);
+      continue;
     }
   }
   if(res != RES_OK) goto error;
 
+  /* Redux the per thread accumulators  */
+  sum_accums(acc_tp, scn->dev->nthreads, &acc_tp[0]);
+  sum_accums(acc_fc, scn->dev->nthreads, &acc_fc[0]);
+  sum_accums(acc_fr, scn->dev->nthreads, &acc_fr[0]);
+  sum_accums(acc_fl, scn->dev->nthreads, &acc_fl[0]);
+  ASSERT(acc_tp[0].count == acc_fl[0].count);
+  ASSERT(acc_tp[0].count == acc_fr[0].count);
+  ASSERT(acc_tp[0].count == acc_fc[0].count);
+
   /* Setup the estimated values */
-  estimator_setup_realisations_count(estimator, nrealisations, N);
-  estimator_setup_temperature(estimator, weight_t, sqr_weight_t);
-  estimator_setup_flux(estimator, FLUX_CONVECTIVE, weight_fc, sqr_weight_fc);
-  estimator_setup_flux(estimator, FLUX_RADIATIVE, weight_fr, sqr_weight_fr);
-  estimator_setup_flux(estimator, FLUX_TOTAL, weight_f, sqr_weight_f);
+  estimator_setup_realisations_count(estimator, nrealisations, acc_tp[0].count);
+  estimator_setup_temperature(estimator, acc_tp[0].sum, acc_tp[0].sum2);
+  estimator_setup_flux(estimator, FLUX_CONVECTIVE, acc_fc[0].sum, acc_fc[0].sum2);
+  estimator_setup_flux(estimator, FLUX_RADIATIVE, acc_fr[0].sum, acc_fr[0].sum2);
+  estimator_setup_flux(estimator, FLUX_TOTAL, acc_fl[0].sum, acc_fl[0].sum2);
 
 exit:
+  if(rngs) {
+    FOR_EACH(i, 0, scn->dev->nthreads) { if(rngs[i]) SSP(rng_ref_put(rngs[i])); }
+    MEM_RM(scn->dev->allocator, rngs);
+  }
+  if(acc_tp) MEM_RM(scn->dev->allocator, acc_tp);
+  if(acc_fc) MEM_RM(scn->dev->allocator, acc_fc);
+  if(acc_fr) MEM_RM(scn->dev->allocator, acc_fr);
+  if(acc_fl) MEM_RM(scn->dev->allocator, acc_fl);
   if(scene) SXD(scene_ref_put(scene));
   if(shape) SXD(shape_ref_put(shape));
   if(view) SXD(scene_view_ref_put(view));
   if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy));
   if(out_estimator) *out_estimator = estimator;
-  if(rngs) {
-    FOR_EACH(i, 0, scn->dev->nthreads) { if(rngs[i]) SSP(rng_ref_put(rngs[i])); }
-    MEM_RM(scn->dev->allocator, rngs);
-  }
   return (res_T)res;
 error:
   if(estimator) {
diff --git a/src/sdis_solve_medium_Xd.h b/src/sdis_solve_medium_Xd.h
@@ -211,7 +211,7 @@ XD(solve_medium)
   struct ssp_rng_proxy* rng_proxy = NULL;
   struct ssp_rng** rngs = NULL;
   struct sdis_estimator* estimator = NULL;
-  struct accum { double sum, sum2; size_t naccums; }* accums = NULL;
+  struct accum* accums = NULL;
   int64_t irealisation;
   int cumul_is_init = 0;
   size_t i;
@@ -300,7 +300,7 @@ XD(solve_medium)
     } else {
       accum->sum += weight;
       accum->sum2 += weight*weight;
-      ++accum->naccums;
+      ++accum->count;
     }
 
     /* Finalize the registered path */
@@ -321,15 +321,11 @@ XD(solve_medium)
   if(res != RES_OK) goto error;
 
   /* Merge the per thread accumulators into the accumulator of the thread 0 */
-  FOR_EACH(i, 1, scn->dev->nthreads) {
-    accums[0].sum += accums[i].sum;
-    accums[0].sum2 += accums[i].sum2;
-    accums[0].naccums += accums[i].naccums;
-  }
-  ASSERT(accums[0].naccums <= nrealisations);
+  sum_accums(accums, scn->dev->nthreads, &accums[0]);
+  ASSERT(accums[0].count <= nrealisations);
 
   /* Setup the estimated temperature */
-  estimator_setup_realisations_count(estimator, nrealisations, accums[0].naccums);
+  estimator_setup_realisations_count(estimator, nrealisations, accums[0].count);
   estimator_setup_temperature(estimator, accums[0].sum, accums[0].sum2);
 
 exit:
diff --git a/src/sdis_solve_probe_Xd.h b/src/sdis_solve_probe_Xd.h
@@ -47,11 +47,8 @@ XD(solve_probe)
   struct sdis_green_function** greens = NULL;
   struct ssp_rng_proxy* rng_proxy = NULL;
   struct ssp_rng** rngs = NULL;
-  double weight = 0;
-  double sqr_weight = 0;
-  const int64_t rcount = (int64_t)nrealisations;
+  struct accum* accums = NULL;
   int64_t irealisation = 0;
-  size_t N = 0; /* #realisations that do not fail */
   size_t i;
   ATOMIC res = RES_OK;
 
@@ -91,6 +88,10 @@ XD(solve_probe)
     if(res != RES_OK) goto error;
   }
 
+  /* Create the per thread accumulator */
+  accums = MEM_CALLOC(scn->dev->allocator, scn->dev->nthreads, sizeof(*accums));
+  if(!accums) { res = RES_MEM_ERR; goto error; }
+
   /* Retrieve the medium in which the submitted position lies */
   res = scene_get_medium(scn, position, NULL, &medium);
   if(res != RES_OK) goto error;
@@ -114,17 +115,18 @@ XD(solve_probe)
 
   /* Here we go! Launch the Monte Carlo estimation */
   omp_set_num_threads((int)scn->dev->nthreads);
-  #pragma omp parallel for schedule(static) reduction(+:weight,sqr_weight,N)
-  for(irealisation = 0; irealisation < rcount; ++irealisation) {
-    res_T res_local;
-    double w = NaN;
-    double time;
+  #pragma omp parallel for schedule(static)
+  for(irealisation = 0; irealisation < (int64_t)nrealisations; ++irealisation) {
     const int ithread = omp_get_thread_num();
     struct ssp_rng* rng = rngs[ithread];
+    struct accum* accum = &accums[ithread];
     struct green_path_handle* pgreen_path = NULL;
     struct green_path_handle green_path = GREEN_PATH_HANDLE_NULL;
     struct sdis_heat_path* pheat_path = NULL;
     struct sdis_heat_path heat_path;
+    double w = NaN;
+    double time;
+    res_T res_local;
 
     if(ATOMIC_GET(&res) != RES_OK) continue; /* An error occurred */
 
@@ -146,12 +148,13 @@ XD(solve_probe)
 
     res_local = XD(probe_realisation)((size_t)irealisation, scn, rng, medium,
       position, time, fp_to_meter, Tarad, Tref, pgreen_path, pheat_path, &w);
-    if(res_local != RES_OK) {
-      if(res_local != RES_BAD_OP) { ATOMIC_SET(&res, res_local); continue; }
-    } else {
-      weight += w;
-      sqr_weight += w*w;
-      ++N;
+    if(res_local == RES_OK) {
+      accum->sum += w;
+      accum->sum2 += w*w;
+      ++accum->count;
+    } else if(res_local != RES_BAD_OP) {
+      ATOMIC_SET(&res, res_local);
+      continue;
     }
 
     if(pheat_path) {
@@ -172,8 +175,10 @@ XD(solve_probe)
 
   /* Setup the estimated temperature */
   if(out_estimator) {
-    estimator_setup_realisations_count(estimator, nrealisations, N);
-    estimator_setup_temperature(estimator, weight, sqr_weight);
+    struct accum acc;
+    sum_accums(accums, scn->dev->nthreads, &acc);
+    estimator_setup_realisations_count(estimator, nrealisations, acc.count);
+    estimator_setup_temperature(estimator, acc.sum, acc.sum2);
   }
 
   if(out_green) {
@@ -202,6 +207,7 @@ exit:
     }
     MEM_RM(scn->dev->allocator, greens);
   }
+  if(accums) MEM_RM(scn->dev->allocator, accums);
   if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy));
   if(out_green) *out_green = green;
   if(out_estimator) *out_estimator = estimator;
diff --git a/src/sdis_solve_probe_boundary_Xd.h b/src/sdis_solve_probe_boundary_Xd.h
@@ -44,11 +44,8 @@ XD(solve_probe_boundary)
   struct sdis_estimator* estimator = NULL;
   struct ssp_rng_proxy* rng_proxy = NULL;
   struct ssp_rng** rngs = NULL;
-  double weight = 0;
-  double sqr_weight = 0;
-  const int64_t rcount = (int64_t)nrealisations;
+  struct accum* accums = NULL;
   int64_t irealisation = 0;
-  size_t N = 0; /* #realisations that do not fail */
   size_t i;
   ATOMIC res = RES_OK;
 
@@ -115,28 +112,30 @@ XD(solve_probe_boundary)
   /* Create the per thread RNG */
   rngs = MEM_CALLOC
     (scn->dev->allocator, scn->dev->nthreads, sizeof(struct ssp_rng*));
-  if(!rngs) {
-    res = RES_MEM_ERR;
-    goto error;
-  }
+  if(!rngs) { res = RES_MEM_ERR; goto error; }
   FOR_EACH(i, 0, scn->dev->nthreads) {
     res = ssp_rng_proxy_create_rng(rng_proxy, i, rngs+i);
     if(res != RES_OK) goto error;
   }
 
+  /* Create the per thread accumulator */
+  accums = MEM_CALLOC(scn->dev->allocator, scn->dev->nthreads, sizeof(*accums));
+  if(!accums) { res = RES_MEM_ERR; goto error; }
+
   /* Create the estimator */
   res = estimator_create(scn->dev, SDIS_ESTIMATOR_TEMPERATURE, &estimator);
   if(res != RES_OK) goto error;
 
   /* Here we go! Launch the Monte Carlo estimation */
   omp_set_num_threads((int)scn->dev->nthreads);
-  #pragma omp parallel for schedule(static) reduction(+:weight,sqr_weight,N)
-  for(irealisation = 0; irealisation < rcount; ++irealisation) {
-    res_T res_local;
-    double w = NaN;
-    double time;
+  #pragma omp parallel for schedule(static)
+  for(irealisation = 0; irealisation < (int64_t)nrealisations; ++irealisation) {
     const int ithread = omp_get_thread_num();
     struct ssp_rng* rng = rngs[ithread];
+    struct accum* accum = &accums[ithread];
+    double w = NaN;
+    double time;
+    res_T res_local;
 
     if(ATOMIC_GET(&res) != RES_OK) continue; /* An error occurred */
 
@@ -144,22 +143,21 @@ XD(solve_probe_boundary)
 
     res_local = XD(boundary_realisation)
       (scn, rng, iprim, uv, time, side, fp_to_meter, Tarad, Tref, &w);
-    if(res_local != RES_OK) {
-      if(res_local != RES_BAD_OP) {
-        ATOMIC_SET(&res, res_local);
-        continue;
-      }
-    } else {
-      weight += w;
-      sqr_weight += w*w;
-      ++N;
+    if(res_local == RES_OK) {
+      accum->sum += w;
+      accum->sum2 += w*w;
+      ++accum->count;
+    } else if(res_local != RES_BAD_OP) {
+      ATOMIC_SET(&res, res_local);
+      continue;
     }
   }
   if(res != RES_OK) goto error;
 
   /* Setup the estimated temperature */
-  estimator_setup_realisations_count(estimator, nrealisations, N);
-  estimator_setup_temperature(estimator, weight, sqr_weight);
+  sum_accums(accums, scn->dev->nthreads, &accums[0]);
+  estimator_setup_realisations_count(estimator, nrealisations, accums[0].count);
+  estimator_setup_temperature(estimator, accums[0].sum, accums[0].sum2);
 
 exit:
   if(rngs) {
@@ -168,6 +166,7 @@ exit:
     }
     MEM_RM(scn->dev->allocator, rngs);
   }
+  if(accums) MEM_RM(scn->dev->allocator, accums);
   if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy));
   if(out_estimator) *out_estimator = estimator;
   return (res_T)res;
@@ -198,13 +197,11 @@ XD(solve_probe_boundary_flux)
   const struct sdis_medium *fmd, *bmd;
   enum sdis_side solid_side, fluid_side;
   struct sdis_interface_fragment frag;
-  double weight_t = 0, sqr_weight_t = 0;
-  double weight_fc = 0, sqr_weight_fc = 0;
-  double weight_fr = 0, sqr_weight_fr = 0;
-  double weight_f= 0, sqr_weight_f = 0;
-  const int64_t rcount = (int64_t)nrealisations;
+  struct accum* acc_tp = NULL; /* Per thread temperature accumulator */
+  struct accum* acc_fl = NULL; /* Per thread flux accumulator */
+  struct accum* acc_fc = NULL; /* Per thread convective flux accumulator */
+  struct accum* acc_fr = NULL; /* Per thread radiative flux accumulator */
   int64_t irealisation = 0;
-  size_t N = 0; /* #realisations that do not fail */
   size_t i;
   ATOMIC res = RES_OK;
 
@@ -279,15 +276,23 @@ XD(solve_probe_boundary_flux)
   /* Create the per thread RNG */
   rngs = MEM_CALLOC
     (scn->dev->allocator, scn->dev->nthreads, sizeof(struct ssp_rng*));
-  if(!rngs) {
-    res = RES_MEM_ERR;
-    goto error;
-  }
+  if(!rngs) { res = RES_MEM_ERR; goto error; }
   FOR_EACH(i, 0, scn->dev->nthreads) {
     res = ssp_rng_proxy_create_rng(rng_proxy, i, rngs + i);
     if(res != RES_OK) goto error;
   }
 
+  /* Create the per thread accumulator */
+  #define ALLOC_ACCUMS(Dst) {                                                  \
+    Dst = MEM_CALLOC(scn->dev->allocator, scn->dev->nthreads, sizeof(*Dst));   \
+    if(!Dst) { res = RES_MEM_ERR; goto error; }                                \
+  } (void)0
+  ALLOC_ACCUMS(acc_tp);
+  ALLOC_ACCUMS(acc_fc);
+  ALLOC_ACCUMS(acc_fl);
+  ALLOC_ACCUMS(acc_fr);
+  #undef ALLOC_ACCUMS
+
   /* Prebuild the interface fragment */
   res = XD(build_interface_fragment)
     (&frag, scn, (unsigned)iprim, uv, fluid_side);
@@ -299,15 +304,18 @@ XD(solve_probe_boundary_flux)
 
   /* Here we go! Launch the Monte Carlo estimation */
   omp_set_num_threads((int)scn->dev->nthreads);
-  #pragma omp parallel for schedule(static) reduction(+:weight_t,sqr_weight_t,\
-     weight_fc,sqr_weight_fc,weight_fr,sqr_weight_fr,weight_f,sqr_weight_f,N)
-  for(irealisation = 0; irealisation < rcount; ++irealisation) {
-    res_T res_local;
-    double T_brf[3] = { 0, 0, 0 };
+  #pragma omp parallel for schedule(static)
+  for(irealisation = 0; irealisation < (int64_t)nrealisations; ++irealisation) {
     const int ithread = omp_get_thread_num();
     struct ssp_rng* rng = rngs[ithread];
+    struct accum* acc_temp = &acc_tp[ithread];
+    struct accum* acc_flux = &acc_fl[ithread];
+    struct accum* acc_fcon = &acc_fc[ithread];
+    struct accum* acc_frad = &acc_fr[ithread];
     double time, epsilon, hc, hr;
     int flux_mask = 0;
+    double T_brf[3] = { 0, 0, 0 };
+    res_T res_local;
 
     if(ATOMIC_GET(&res) != RES_OK) continue; /* An error occurred */
 
@@ -325,45 +333,61 @@ XD(solve_probe_boundary_flux)
     if(hc > 0) flux_mask |= FLUX_FLAG_CONVECTIVE;
     res_local = XD(boundary_flux_realisation)(scn, rng, iprim, uv, time,
       solid_side, fp_to_meter, Tarad, Tref, flux_mask, T_brf);
-    if(res_local != RES_OK) {
-      if(res_local != RES_BAD_OP) {
-        ATOMIC_SET(&res, res_local);
-        continue;
-      }
-    } else {
+    if(res_local == RES_OK) {
       const double Tboundary = T_brf[0];
       const double Tradiative = T_brf[1];
       const double Tfluid = T_brf[2];
       const double w_conv = hc * (Tboundary - Tfluid);
       const double w_rad = hr * (Tboundary - Tradiative);
       const double w_total = w_conv + w_rad;
-      weight_t += Tboundary;
-      sqr_weight_t += Tboundary * Tboundary;
-      weight_fc += w_conv;
-      sqr_weight_fc += w_conv * w_conv;
-      weight_fr += w_rad;
-      sqr_weight_fr += w_rad * w_rad;
-      weight_f += w_total;
-      sqr_weight_f += w_total * w_total;
-      ++N;
+      /* Temperature */
+      acc_temp->sum += Tboundary;
+      acc_temp->sum2 += Tboundary*Tboundary;
+      ++acc_temp->count;
+      /* Overwall flux */
+      acc_flux->sum += w_total;
+      acc_flux->sum2 += w_total*w_total;
+      ++acc_flux->count;
+      /* Convective flux */
+      acc_fcon->sum  += w_conv;
+      acc_fcon->sum2 += w_conv*w_conv;
+      ++acc_fcon->count;
+      /* Radiative flux */
+      acc_frad->sum += w_rad;
+      acc_frad->sum2 += w_rad*w_rad;
+      ++acc_frad->count;
+    } else if(res_local != RES_BAD_OP) {
+      ATOMIC_SET(&res, res_local);
+      continue;
     }
   }
   if(res != RES_OK) goto error;
 
+  /* Redux the per thread accumulators  */
+  sum_accums(acc_tp, scn->dev->nthreads, &acc_tp[0]);
+  sum_accums(acc_fc, scn->dev->nthreads, &acc_fc[0]);
+  sum_accums(acc_fr, scn->dev->nthreads, &acc_fr[0]);
+  sum_accums(acc_fl, scn->dev->nthreads, &acc_fl[0]);
+  ASSERT(acc_tp[0].count == acc_fl[0].count);
+  ASSERT(acc_tp[0].count == acc_fr[0].count);
+  ASSERT(acc_tp[0].count == acc_fc[0].count);
+
   /* Setup the estimated values */
-  estimator_setup_realisations_count(estimator, nrealisations, N);
-  estimator_setup_temperature(estimator, weight_t, sqr_weight_t);
-  estimator_setup_flux(estimator, FLUX_CONVECTIVE, weight_fc, sqr_weight_fc);
-  estimator_setup_flux(estimator, FLUX_RADIATIVE, weight_fr, sqr_weight_fr);
-  estimator_setup_flux(estimator, FLUX_TOTAL, weight_f, sqr_weight_f);
+  estimator_setup_realisations_count(estimator, nrealisations, acc_tp[0].count);
+  estimator_setup_temperature(estimator, acc_tp[0].sum, acc_tp[0].sum2);
+  estimator_setup_flux(estimator, FLUX_CONVECTIVE, acc_fc[0].sum, acc_fc[0].sum2);
+  estimator_setup_flux(estimator, FLUX_RADIATIVE, acc_fr[0].sum, acc_fr[0].sum2);
+  estimator_setup_flux(estimator, FLUX_TOTAL, acc_fl[0].sum, acc_fl[0].sum2);
 
 exit:
   if(rngs) {
-    FOR_EACH(i, 0, scn->dev->nthreads) {
-      if(rngs[i]) SSP(rng_ref_put(rngs[i]));
-    }
+    FOR_EACH(i, 0, scn->dev->nthreads) {if(rngs[i]) SSP(rng_ref_put(rngs[i]));}
     MEM_RM(scn->dev->allocator, rngs);
   }
+  if(acc_tp) MEM_RM(scn->dev->allocator, acc_tp);
+  if(acc_fc) MEM_RM(scn->dev->allocator, acc_fc);
+  if(acc_fr) MEM_RM(scn->dev->allocator, acc_fr);
+  if(acc_fl) MEM_RM(scn->dev->allocator, acc_fl);
   if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy));
   if(out_estimator) *out_estimator = estimator;
   return (res_T)res;

	stardis-solver Solve coupled heat transfers
	git clone git://git.meso-star.fr/stardis-solver.git
	Log \| Files \| Refs \| README \| LICENSE

M	src/sdis_Xd_begin.h	\|	26	++++++++++++++++++++++++++
M	src/sdis_solve_boundary_Xd.h	\|	128	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/sdis_solve_medium_Xd.h	\|	14	+++++---------
M	src/sdis_solve_probe_Xd.h	\|	40	+++++++++++++++++++++++-----------------
M	src/sdis_solve_probe_boundary_Xd.h	\|	148	++++++++++++++++++++++++++++++++++++++++++++++---------------------------------