Plug the long wave realisation in the draw function - htrdr - Solving radiative transfer in heterogeneous media

commit 56fa5639dc8b223bf8b25a561ab6174f961bb504
parent b9aba36680492ebc8f20769da4f6f90d3c63619f
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Tue, 17 Mar 2020 10:36:41 +0100

Plug the long wave realisation in the draw function

Diffstat:
M cmake/CMakeLists.txt  | 2 +-
M src/htrdr.c  | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/htrdr.h  | 3 +++
M src/htrdr_c.h  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/htrdr_compute_radiance_lw.c  | 60 ------------------------------------------------------------
A src/htrdr_draw_radiance.c  | 897 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/htrdr_draw_radiance_sw.c  | 851 -------------------------------------------------------------------------------
M src/htrdr_solve.h  | 2 +-
M src/htrdr_sun.c  | 8 --------

9 files changed, 1047 insertions(+), 922 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -93,7 +93,7 @@ set(HTRDR_FILES_SRC
   htrdr_camera.c
   htrdr_compute_radiance_sw.c
   htrdr_compute_radiance_lw.c
-  htrdr_draw_radiance_sw.c
+  htrdr_draw_radiance.c
   htrdr_grid.c
   htrdr_ground.c
   htrdr_interface.c
diff --git a/src/htrdr.c b/src/htrdr.c
@@ -357,6 +357,74 @@ error:
   goto exit;
 }
 
+/* Setup the cumulative distribution function (CDF) used to sample a long wave
+ * spectral band. The probability of a band is proportional to the value
+ * returned by `planck' for the band bounds at the reference temperature. On
+ * success, htrdr->lw_cdf stores one cumulated probability per band and its
+ * last entry is forced to 1. If the CDF cannot be allocated, the error is
+ * logged, the CDF is cleared and the error code is returned. */
+static res_T
+setup_lw_cdf(struct htrdr* htrdr)
+{
+  /* Reference temperature used to compute the Long Wave cumulative */
+  const double Tref = 290; /* In Kelvin */
+  double* cdf = NULL;
+  double* pdf = NULL;
+  double sum = 0;
+  size_t i;
+  size_t nbands;
+  res_T res = RES_OK;
+  ASSERT(htrdr && htsky_is_long_wave(htrdr->sky));
+
+  nbands = htsky_get_spectral_bands_count(htrdr->sky);
+  res = darray_double_resize(&htrdr->lw_cdf, nbands);
+  if(res != RES_OK) {
+    htrdr_log_err(htrdr,
+      "error allocating the CDF of the long wave spectral bands -- %s.\n",
+      res_to_cstr(res));
+    goto error;
+  }
+
+  /* Alias the same array by the cdf and the pdf variable to make easier the
+   * reading of the code */
+  pdf = darray_double_data_get(&htrdr->lw_cdf);
+  cdf = darray_double_data_get(&htrdr->lw_cdf);
+
+  /* Compute the *unnormalized* probability to sample a long wave band */
+  sum = 0;
+  FOR_EACH(i, 0, nbands) {
+    const size_t iband = htsky_get_spectral_band_id(htrdr->sky, i);
+    double wlens[2];
+    HTSKY(get_spectral_band_bounds(htrdr->sky, iband, wlens));
+
+    /* Convert from nanometer to meter, i.e. the unit expected by `planck':
+     * 1 nm = 1e-9 m */
+    wlens[0] = wlens[0] * 1.e-9;
+    wlens[1] = wlens[1] * 1.e-9;
+
+    /* Compute the probability of the current band */
+    pdf[i] = planck(wlens[0], wlens[1], Tref);
+
+    /* Update the norm */
+    sum += pdf[i];
+  }
+
+  /* Compute the cumulative of the previously computed probabilities */
+  FOR_EACH(i, 0, nbands) {
+    /* Normalize the probability */
+    pdf[i] /= sum;
+
+    /* Setup the cumulative. Since pdf and cdf alias the same memory, the
+     * entry `i' is overwritten by its cumulated value once normalized */
+    if(i == 0) {
+      cdf[i] = pdf[i];
+    } else {
+      cdf[i] = pdf[i] + cdf[i-1];
+      ASSERT(cdf[i] >= cdf[i-1]);
+    }
+  }
+
+  /* Force the last entry to 1 whatever the accumulated rounding errors */
+  cdf[nbands - 1] = 1.0;
+
+exit:
+  return res;
+error:
+  darray_double_clear(&htrdr->lw_cdf);
+  goto exit;
+}
+
 /*******************************************************************************
  * Local functions
  ******************************************************************************/
@@ -386,6 +454,8 @@ htrdr_init
 
   str_init(htrdr->allocator, &htrdr->output_name);
 
+  darray_double_init(htrdr->allocator, &htrdr->lw_cdf);
+
   nthreads_max = MMAX(omp_get_max_threads(), omp_get_num_procs());
   htrdr->dump_vtk = args->dump_vtk;
   htrdr->verbose = args->verbose;
@@ -482,6 +552,12 @@ htrdr_init
   res = htsky_create(&htrdr->logger, htrdr->allocator, &htsky_args, &htrdr->sky);
   if(res != RES_OK) goto error;
 
+  if(htsky_is_long_wave(htrdr->sky)) {
+    /* Define the CDF used to sample a long wave band */
+    res = setup_lw_cdf(htrdr);
+    if(res != RES_OK) goto error;
+  }
+
   htrdr->lifo_allocators = MEM_CALLOC
     (htrdr->allocator, htrdr->nthreads, sizeof(*htrdr->lifo_allocators));
   if(!htrdr->lifo_allocators) {
@@ -530,6 +606,7 @@ htrdr_release(struct htrdr* htrdr)
     MEM_RM(htrdr->allocator, htrdr->lifo_allocators);
   }
   str_release(&htrdr->output_name);
+  darray_double_release(&htrdr->lw_cdf);
   logger_release(&htrdr->logger);
 }
 
@@ -556,7 +633,7 @@ htrdr_run(struct htrdr* htrdr)
       }
     }
   } else {
-    res = htrdr_draw_radiance_sw(htrdr, htrdr->cam, htrdr->width,
+    res = htrdr_draw_radiance(htrdr, htrdr->cam, htrdr->width,
       htrdr->height, htrdr->spp, htrdr->buf);
     if(res != RES_OK) goto error;
     if(htrdr->mpi_rank == 0) {
diff --git a/src/htrdr.h b/src/htrdr.h
@@ -17,6 +17,7 @@
 #ifndef HTRDR_H
 #define HTRDR_H
 
+#include <rsys/dynamic_array_double.h>
 #include <rsys/logger.h>
 #include <rsys/ref_count.h>
 #include <rsys/str.h>
@@ -54,6 +55,8 @@ struct htrdr {
 
   struct htsky* sky;
 
+  struct darray_double lw_cdf; /* CDF to sample a Long Waves band */
+
   size_t spp; /* #samples per pixel */
   size_t width; /* Image width */
   size_t height; /* Image height */
diff --git a/src/htrdr_c.h b/src/htrdr_c.h
@@ -103,6 +103,65 @@ morton_xyz_decode_u21(const uint64_t code, uint32_t xyz[3])
   xyz[2] = (uint32_t)morton3D_decode_u21(code >> 0);
 }
 
+static INLINE double /* Result lies in [0, 1]; this is checked by the final ASSERT */
+wiebelt(const double v) /* Evaluates to 1 for v == 0 and tends toward 0 as v grows */
+{
+  int m; /* Index of the series term summed when v >= 2 */
+  double w, v2, v4; /* w: result; v2 and v4: second and fourth power of v */
+  /* 15/PI^4 ~= .153989717364e+00 */
+  const double fifteen_over_pi_power_4 = 15.0/(PI*PI*PI*PI);
+  const double z0 = 1.0/3.0; /* z0 ... z8: coefficients of the polynomial used when v < 2 */
+  const double z1 = 1.0/8.0;
+  const double z2 = 1.0/60.0;
+  const double z4 = 1.0/5040.0;
+  const double z6 = 1.0/272160.0;
+  const double z8 = 1.0/13305600.0;
+
+  if(v >= 2.) { /* Sum the 5 first terms (m in [1, 5]) of a series of decaying exponentials */
+    w = 0;
+    for(m=1; m<6 ;m++)
+      w+=exp(-m*v)/(m*m*m*m) * (((m*v+3)*m*v+6)*m*v+6);
+    w = w * fifteen_over_pi_power_4;
+  } else { /* Otherwise evaluate a polynomial expansion in v */
+    v2 = v*v;
+    v4 = v2*v2;
+    w = z0 - z1*v + z2*v2 - z4*v2*v2 + z6*v4*v2 - z8*v4*v4;
+    w = 1. - fifteen_over_pi_power_4*v2*v*w;
+  }
+  ASSERT(w >= 0.0 && w <= 1.0);
+  return w;
+}
+
+/* Return the difference between the `wiebelt' values evaluated at the upper
+ * bound (lambda1) and at the lower bound (lambda0) of a wavelength interval
+ * for the submitted temperature. */
+static INLINE double
+blackbody_fraction
+  (const double lambda0, /* In meter */
+   const double lambda1, /* In meter */
+   const double temperature) /* In Kelvin */
+{
+  const double C2 = 1.43877735e-2; /* m.K */
+  /* Dimensionless variable C2/(lambda*T) at each bound of the interval */
+  const double v_lower = (C2 / lambda0) / temperature;
+  const double v_upper = (C2 / lambda1) / temperature;
+  const double w_lower = wiebelt(v_lower);
+  const double w_upper = wiebelt(v_upper);
+  return w_upper - w_lower;
+}
+
+/* Return the product of the black body fraction of the interval
+ * [lambda_min, lambda_max] by sigma*T^4, with sigma expressed in W/m^2/K^4.
+ * lambda_min must be strictly less than lambda_max. */
+static INLINE double
+planck
+  (const double lambda_min, /* In meter */
+   const double lambda_max, /* In meter */
+   const double temperature) /* In Kelvin  */
+{
+  const double sigma = 5.6696e-8; /* W/m^2/K^4 */
+  const double T2 = temperature*temperature;
+  double fraction;
+  ASSERT(lambda_min < lambda_max && temperature >= 0);
+
+  fraction = blackbody_fraction(lambda_min, lambda_max, temperature);
+  return fraction * sigma * (T2*T2);
+}
+
 extern LOCAL_SYM  res_T
 open_output_stream
   (struct htrdr* htrdr,
@@ -146,5 +205,13 @@ update_mpi_progress(struct htrdr* htrdr, const enum htrdr_mpi_message progress)
   print_mpi_progress(htrdr, progress);
 }
 
+/* Three-way comparison of the doubles pointed to by `a' and `b': return -1 if
+ * *a < *b, 1 if *a > *b and 0 otherwise. */
+static FINLINE int
+cmp_dbl(const void* a, const void* b)
+{
+  const double* lhs = (const double*)a;
+  const double* rhs = (const double*)b;
+  if(*lhs < *rhs) return -1;
+  if(*lhs > *rhs) return 1;
+  return 0;
+}
+
 #endif /* HTRDR_C_H */
 
diff --git a/src/htrdr_compute_radiance_lw.c b/src/htrdr_compute_radiance_lw.c
@@ -30,8 +30,6 @@
 #include <rsys/double2.h>
 #include <rsys/double3.h>
 
-#define BOLTZMANN_CONSTANT 5.6696e-8 /* W/m^2/K^4 */
-
 enum event {
   EVENT_ABSORPTION,
   EVENT_SCATTERING,
@@ -56,64 +54,6 @@ static const struct filter_context FILTER_CONTEXT_NULL = {
 /*******************************************************************************
  * Helper functions
  ******************************************************************************/
-static FINLINE double
-wiebelt(const double v)
-{
-  int m;
-  double w, v2, v4;
-  /*.153989717364e+00;*/
-  const double fifteen_over_pi_power_4 = 15.0/(PI*PI*PI*PI);
-  const double z0 = 1.0/3.0;
-  const double z1 = 1.0/8.0;
-  const double z2 = 1.0/60.0;
-  const double z4 = 1.0/5040.0;
-  const double z6 = 1.0/272160.0;
-  const double z8 = 1.0/13305600.0;
-
-  if(v >= 2.) {
-    w = 0;
-    for(m=1; m<6 ;m++)
-      w+=exp(-m*v)/(m*m*m*m) * (((m*v+3)*m*v+6)*m*v+6);
-    w = w * fifteen_over_pi_power_4;
-  } else {
-    v2 = v*v;
-    v4 = v2*v2;
-    w = z0 - z1*v + z2*v2 - z4*v2*v2 + z6*v4*v2 - z8*v4*v4;
-    w = 1. - fifteen_over_pi_power_4*v2*v*w;
-  }
-  ASSERT(w >= 0.0 && w <= 1.0);
-  return w;
-}
-
-static FINLINE double
-blackbody_fraction
-  (const double lambda0, /* In meter */
-   const double lambda1, /* In meter */
-   const double temperature) /* In Kelvin */
-{
-  const double C2 = 1.43877735e-2; /* m.K */
-  double x0 = C2 / lambda0;
-  double x1 = C2 / lambda1;
-  double v0 = x0 / temperature;
-  double v1 = x1 / temperature;
-  double w0 = wiebelt(v0);
-  double w1 = wiebelt(v1);
-  return w1 - w0;
-}
-
-static FINLINE double
-planck
-  (const double lambda_min, /* In meter */
-   const double lambda_max, /* In meter */
-   const double temperature) /* In Kelvin  */
-{
-  const double T2 = temperature*temperature;
-  const double T4 = T2*T2;
-  ASSERT(lambda_min < lambda_max && temperature >= 0);
-  return blackbody_fraction(lambda_min, lambda_max, temperature)
-       * BOLTZMANN_CONSTANT * T4;
-}
-
 static int
 hit_filter
   (const struct svx_hit* hit,
diff --git a/src/htrdr_draw_radiance.c b/src/htrdr_draw_radiance.c
@@ -0,0 +1,897 @@
+/* Copyright (C) 2018, 2019, 2020 |Meso|Star> (contact@meso-star.com)
+ * Copyright (C) 2018, 2019 CNRS, Université Paul Sabatier
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#define _POSIX_C_SOURCE 200112L /* nanosleep && nextafter */
+
+#include "htrdr.h"
+#include "htrdr_c.h"
+#include "htrdr_buffer.h"
+#include "htrdr_camera.h"
+#include "htrdr_solve.h"
+
+#include <high_tune/htsky.h>
+
+#include <rsys/algorithm.h>
+#include <rsys/clock_time.h>
+#include <rsys/cstr.h>
+#include <rsys/dynamic_array_u32.h>
+#include <rsys/math.h>
+#include <rsys/mutex.h>
+#include <star/ssp.h>
+
+#include <omp.h>
+#include <mpi.h>
+#include <time.h>
+#include <unistd.h>
+
+#define RNG_SEQUENCE_SIZE 10000
+
+#define TILE_MCODE_NULL UINT32_MAX
+#define TILE_SIZE 32 /* Definition in X & Y of a tile */
+STATIC_ASSERT(IS_POW2(TILE_SIZE), TILE_SIZE_must_be_a_power_of_2);
+
+/* Tile of row ordered image pixels: TILE_SIZE x TILE_SIZE pixels per tile */
+struct tile {
+  struct list_node node; /* Links the tile into a list of tiles */
+  struct mem_allocator* allocator; /* Allocator used to free the tile */
+  ref_T ref; /* Reference counter of the tile */
+
+  struct tile_data { /* Part of the tile exchanged between the MPI processes */
+    uint16_t x, y; /* 2D coordinates of the tile in tile space */
+    /* Simulate the flexible array member of the C99 standard: the tile is
+     * allocated with HTRDR_ESTIMATES_COUNT__ accumulators per pixel. */
+    struct htrdr_accum accums[1/*dummy element*/];
+  } data;
+};
+
+/* List of tiles to compute on the MPI process. */
+struct proc_work {
+  struct mutex* mutex; /* Locked around each access to `tiles' and `itile' */
+  struct darray_u32 tiles; /* Morton codes of the tiles to render */
+  size_t itile; /* Next tile to render in the above list of tiles */
+};
+
+/*******************************************************************************
+ * Helper functions
+ ******************************************************************************/
+/* Gather the even bits of `u32' into a 16-bit integer: the bit 2*i of the
+ * input becomes the bit i of the result. The odd bits are ignored. */
+static FINLINE uint16_t
+morton2D_decode(const uint32_t u32)
+{
+  /* Mask applied after each folding step */
+  static const uint32_t masks[4] = {
+    0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
+  };
+  uint32_t bits = u32 & 0x55555555; /* Keep the even bits only */
+  unsigned istep;
+
+  /* Fold the kept bits by successively shifting them of 1, 2, 4 and 8 bits */
+  for(istep = 0; istep < 4; ++istep) {
+    bits = (bits | (bits >> (1u << istep))) & masks[istep];
+  }
+  return (uint16_t)bits;
+}
+
+/* Spread the bits of `u16' onto the even bits of a 32-bit integer: the bit i
+ * of the input becomes the bit 2*i of the result. The odd bits are zero. */
+static FINLINE uint32_t
+morton2D_encode(const uint16_t u16)
+{
+  /* Mask applied after each spreading step */
+  static const uint32_t masks[4] = {
+    0x00FF00FF, 0x0F0F0F0F, 0x33333333, 0x55555555
+  };
+  uint32_t bits = u16;
+  unsigned istep;
+
+  /* Spread the bits by successively shifting them of 8, 4, 2 and 1 bits */
+  for(istep = 0; istep < 4; ++istep) {
+    bits = (bits | (bits << (8u >> istep))) & masks[istep];
+  }
+  return bits;
+}
+
+/* Allocate a tile whose trailing buffer can store HTRDR_ESTIMATES_COUNT__
+ * accumulators for each of its TILE_SIZE x TILE_SIZE pixels. The function
+ * sets up the reference counter, the list node and the allocator of the tile
+ * but it does not initialise its accumulators. Return NULL if the allocation
+ * fails. */
+static FINLINE struct tile*
+tile_create(struct mem_allocator* allocator)
+{
+  /* Size of the tile without its dummy accumulator */
+  const size_t header_sz = sizeof(struct tile) - sizeof(struct htrdr_accum);
+  /* Size of the accumulators that stand for the flexible array member */
+  const size_t accums_sz =
+    TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]);
+  struct tile* tile = NULL;
+  ASSERT(allocator);
+
+  tile = MEM_ALLOC(allocator, header_sz + accums_sz);
+  if(tile) {
+    ref_init(&tile->ref);
+    list_init(&tile->node);
+    tile->allocator = allocator;
+    ASSERT(IS_ALIGNED(&tile->data.accums, ALIGNOF(struct htrdr_accum)));
+  }
+  return tile;
+}
+
+/* Take a reference onto the tile. It has to be released with tile_ref_put. */
+static INLINE void
+tile_ref_get(struct tile* tile)
+{
+  ASSERT(tile);
+  /* Increment the reference counter of the tile. Calling tile_ref_get here
+   * would recurse endlessly without ever touching the counter. */
+  ref_get(&tile->ref);
+}
+
+/* Functor submitted to ref_put: free the tile that embeds the reference
+ * counter `ref' with the allocator registered into the tile. */
+static INLINE void
+release_tile(ref_T* ref)
+{
+  struct tile* tile = NULL;
+  ASSERT(ref);
+  tile = CONTAINER_OF(ref, struct tile, ref);
+  MEM_RM(tile->allocator, tile);
+}
+
+/* Release a reference onto the tile; release_tile is the functor handed to
+ * ref_put to dispose of the tile. */
+static INLINE void
+tile_ref_put(struct tile* tile)
+{
+  ref_T* counter = NULL;
+  ASSERT(tile);
+  counter = &tile->ref;
+  ref_put(counter, release_tile);
+}
+
+/* Return the first of the HTRDR_ESTIMATES_COUNT__ accumulators of the pixel
+ * (x, y) of the tile. Pixels are stored row by row. */
+static FINLINE struct htrdr_accum*
+tile_at
+  (struct tile* tile,
+   const size_t x, /* In tile space */
+   const size_t y) /* In tile space */
+{
+  size_t ipix; /* Row ordered index of the pixel into the tile */
+  ASSERT(tile && x < TILE_SIZE && y < TILE_SIZE);
+  ipix = y*TILE_SIZE + x;
+  return &tile->data.accums[ipix * HTRDR_ESTIMATES_COUNT__];
+}
+
+/* Copy the accumulators of a tile into the image buffer. The rows and columns
+ * of the tile that lie outside the buffer are not copied. */
+static void
+write_tile_data(struct htrdr_buffer* buf, const struct tile_data* tile_data)
+{
+  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
+  size_t x0, y0; /* Origin of the tile into the buffer, in pixels */
+  size_t ncols, nrows; /* #tile columns/rows to write into the buffer */
+  size_t y;
+  char* dst;
+  ASSERT(buf && tile_data);
+
+  htrdr_buffer_get_layout(buf, &layout);
+  dst = htrdr_buffer_get_data(buf);
+  ASSERT(layout.elmt_size == sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]));
+
+  /* Locate the tile into the buffer */
+  x0 = tile_data->x * (size_t)TILE_SIZE;
+  y0 = tile_data->y * (size_t)TILE_SIZE;
+
+  /* Clamp the tile definition against the buffer borders */
+  ncols = MMIN(x0 + TILE_SIZE, layout.width)  - x0;
+  nrows = MMIN(y0 + TILE_SIZE, layout.height) - y0;
+
+  /* Write the tile one row at a time */
+  FOR_EACH(y, 0, nrows) {
+    const struct htrdr_accum* src =
+      tile_data->accums + y*TILE_SIZE*HTRDR_ESTIMATES_COUNT__;
+    char* dst_row = dst + (y0 + y) * layout.pitch;
+    memcpy(dst_row + x0*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]),
+      src, ncols*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]));
+  }
+}
+
+/* Initialise an empty list of tiles and create its mutex. The CHK macro
+ * checks that the mutex was created. */
+static INLINE void
+proc_work_init(struct mem_allocator* allocator, struct proc_work* work)
+{
+  ASSERT(work);
+  darray_u32_init(allocator, &work->tiles);
+  work->itile = 0;
+  work->mutex = mutex_create();
+  CHK(work->mutex);
+}
+
+/* Release the list of tiles and destroy the mutex of the work. */
+static INLINE void
+proc_work_release(struct proc_work* work)
+{
+  struct mutex* mutex = work->mutex;
+  darray_u32_release(&work->tiles);
+  mutex_destroy(mutex);
+}
+
+/* Empty the list of tiles and rewind the index of the next tile to render.
+ * Both updates are performed with the mutex of the work locked. */
+static INLINE void
+proc_work_reset(struct proc_work* work)
+{
+  ASSERT(work);
+  mutex_lock(work->mutex);
+  work->itile = 0;
+  darray_u32_clear(&work->tiles);
+  mutex_unlock(work->mutex);
+}
+
+/* Append the Morton code of a tile to the list of tiles to render. The CHK
+ * macro checks that the insertion succeeded. */
+static INLINE void
+proc_work_add_tile(struct proc_work* work, const uint32_t mcode)
+{
+  res_T res;
+  mutex_lock(work->mutex);
+  res = darray_u32_push_back(&work->tiles, &mcode);
+  CHK(res == RES_OK);
+  mutex_unlock(work->mutex);
+}
+
+/* Return the Morton code of the next tile to render and step to the following
+ * one. Return TILE_MCODE_NULL once all the listed tiles were returned. */
+static INLINE uint32_t
+proc_work_get_tile(struct proc_work* work)
+{
+  uint32_t mcode = TILE_MCODE_NULL;
+  ASSERT(work);
+
+  mutex_lock(work->mutex);
+  if(work->itile < darray_u32_size_get(&work->tiles)) {
+    mcode = darray_u32_cdata_get(&work->tiles)[work->itile];
+    work->itile += 1;
+  }
+  mutex_unlock(work->mutex);
+
+  return mcode;
+}
+
+/* Return the overall number of tiles registered into the work, whether they
+ * were already fetched or not. */
+static INLINE size_t
+proc_work_get_ntiles(struct proc_work* work)
+{
+  size_t ntiles;
+  ASSERT(work);
+
+  mutex_lock(work->mutex);
+  ntiles = darray_u32_size_get(&work->tiles);
+  mutex_unlock(work->mutex);
+
+  return ntiles;
+}
+
+/* Block the calling thread until the MPI request is complete. The request is
+ * tested with the MPI mutex locked and the thread sleeps 10 milliseconds
+ * between two tests. */
+static void
+mpi_wait_for_request(struct htrdr* htrdr, MPI_Request* req)
+{
+  int complete = 0;
+  ASSERT(htrdr && req);
+
+  do {
+    mutex_lock(htrdr->mpi_mutex);
+    MPI(Test(req, &complete, MPI_STATUS_IGNORE));
+    mutex_unlock(htrdr->mpi_mutex);
+
+    if(!complete) { /* Not yet: sleep before testing the request again */
+      struct timespec delay;
+      delay.tv_sec = 0;
+      delay.tv_nsec = 10000000; /* 10ms */
+      nanosleep(&delay, NULL);
+    }
+  } while(!complete);
+}
+
+static void /* Serve the steal requests of the other processes while *probe_thieves is set */
+mpi_probe_thieves
+  (struct htrdr* htrdr,
+   struct proc_work* work, /* List from which the tiles given to the thieves are taken */
+   ATOMIC* probe_thieves) /* The function returns once it reads as 0 */
+{
+  uint32_t tiles[UINT8_MAX]; /* Morton codes sent to the thief */
+  struct timespec t; /* Delay between two probes */
+  ASSERT(htrdr && work && probe_thieves);
+
+  if(htrdr->mpi_nprocs == 1) /* The process is alone. No thief is possible */
+    return;
+
+  t.tv_sec = 0;
+
+  /* Protect MPI calls of multiple invocations from concurrent threads */
+  #define P_MPI(Func) {                                                        \
+    mutex_lock(htrdr->mpi_mutex);                                              \
+    MPI(Func);                                                                 \
+    mutex_unlock(htrdr->mpi_mutex);                                            \
+  } (void)0
+
+  while(ATOMIC_GET(probe_thieves)) {
+    MPI_Status status;
+    size_t itile;
+    int msg;
+
+    /* Probe if a steal request was submitted by any processes */
+    P_MPI(Iprobe(MPI_ANY_SOURCE, HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &msg,
+      &status));
+
+    if(msg) { /* A steal request was posted */
+      MPI_Request req;
+      uint8_t ntiles_to_steal; /* #tiles requested by the thief */
+
+      /* Asynchronously receive the steal request */
+      P_MPI(Irecv(&ntiles_to_steal, 1, MPI_UINT8_T, status.MPI_SOURCE,
+        HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &req));
+
+      /* Wait for the completion of the steal request */
+      mpi_wait_for_request(htrdr, &req);
+
+      /* Fetch the tiles to give; entries are TILE_MCODE_NULL once the list is exhausted */
+      FOR_EACH(itile, 0, ntiles_to_steal) {
+        tiles[itile] = proc_work_get_tile(work);
+      }
+      P_MPI(Send(&tiles, ntiles_to_steal, MPI_UINT32_T, status.MPI_SOURCE,
+        HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD));
+    }
+    t.tv_nsec = 500000000; /* 500ms */
+    nanosleep(&t, NULL); /* Sleep before probing again */
+  }
+  #undef P_MPI
+}
+
+/* Uniformly sample a process among the ones flagged as working and return its
+ * MPI rank. At least one working process must be registered. */
+static int
+mpi_sample_working_process(struct htrdr* htrdr, struct ssp_rng* rng)
+{
+  int nskip; /* #working processes that precede the sampled one */
+  int rank;
+  ASSERT(htrdr && rng && htrdr->mpi_nworking_procs);
+
+  /* Sample the index of the process in the set of working processes */
+  nskip = (int)(ssp_rng_canonical(rng) * (double)htrdr->mpi_nworking_procs);
+
+  /* Linearly look for the rank of the sampled process: the overall number of
+   * processes should be quite low, at most a few dozens. */
+  FOR_EACH(rank, 0, htrdr->mpi_nprocs) {
+    if(htrdr->mpi_working_procs[rank] == 0) continue; /* Inactive process */
+    if(nskip == 0) break; /* The rank of the sampled process is found */
+    --nskip;
+  }
+  ASSERT(rank < htrdr->mpi_nprocs);
+  return rank;
+}
+
+/* Ask a randomly sampled working process for tiles and register the received
+ * tiles into `work'. A TILE_MCODE_NULL entry in the answer means that the
+ * queried process has no more tile to give: it is then flagged as no more
+ * working. Return the number of stolen tiles; it is 0 when no working process
+ * remains. */
+static size_t
+mpi_steal_work
+  (struct htrdr* htrdr,
+   struct ssp_rng* rng,
+   struct proc_work* work)
+{
+  MPI_Request req;
+  size_t itile;
+  size_t nthieves = 0;
+  uint32_t tiles[UINT8_MAX]; /* Morton code of the stolen tile */
+  int proc_to_steal; /* Process to steal */
+  uint8_t ntiles_to_steal;
+  ASSERT(htrdr && rng && work && htrdr->nthreads < UINT8_MAX);
+
+  /* Request 2 tiles per thread, up to 16 tiles. Clamp *before* narrowing to 8
+   * bits: narrowing first wraps the value around, e.g. 128 threads would lead
+   * to 256 and thus to a request of 0 tile. */
+  ntiles_to_steal = (uint8_t)MMIN(htrdr->nthreads*2, 16);
+
+  /* Protect MPI calls of multiple invocations from concurrent threads */
+  #define P_MPI(Func) {                                                        \
+    mutex_lock(htrdr->mpi_mutex);                                              \
+    MPI(Func);                                                                 \
+    mutex_unlock(htrdr->mpi_mutex);                                            \
+  } (void)0
+
+  /* No more working process => nothing to steal */
+  if(!htrdr->mpi_nworking_procs) return 0;
+
+  /* Sample a process to steal */
+  proc_to_steal = mpi_sample_working_process(htrdr, rng);
+
+  /* Send a steal request to the sampled process and wait for a response */
+  P_MPI(Send(&ntiles_to_steal, 1, MPI_UINT8_T, proc_to_steal,
+    HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD));
+
+  /* Receive the stolen tile from the sampled process */
+  P_MPI(Irecv(tiles, ntiles_to_steal, MPI_UINT32_T, proc_to_steal,
+    HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD, &req));
+
+  mpi_wait_for_request(htrdr, &req);
+
+  FOR_EACH(itile, 0, ntiles_to_steal) {
+    if(tiles[itile] == TILE_MCODE_NULL) {
+      /* The queried process ran out of tiles: do not query it anymore */
+      ASSERT(htrdr->mpi_working_procs[proc_to_steal] != 0);
+      htrdr->mpi_working_procs[proc_to_steal] = 0;
+      htrdr->mpi_nworking_procs--;
+      break;
+    }
+    proc_work_add_tile(work, tiles[itile]);
+    ++nthieves;
+  }
+  #undef P_MPI
+  return nthieves;
+}
+
+/* Gather the rendered tiles onto the master process, i.e. the process of rank
+ * 0. A non master process sends the data of each tile of its list to the
+ * master. The master writes the tiles of its own list into `buf' and then
+ * receives and writes tiles until `ntiles' tiles were written. Return
+ * RES_MEM_ERR if the master cannot allocate its reception tile. */
+static res_T
+mpi_gather_tiles
+  (struct htrdr* htrdr,
+   struct htrdr_buffer* buf,
+   const size_t ntiles,
+   struct list_node* tiles)
+{
+  /* Compute the size of the tile_data */
+  const size_t msg_sz =
+    sizeof(struct tile_data) - sizeof(struct htrdr_accum)/*dummy*/
+  + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]);
+
+  struct list_node* node = NULL;
+  struct tile* recv_tile = NULL; /* Temporary tile used by the master */
+  size_t nwritten = 0; /* #tiles written into the buffer */
+  res_T res = RES_OK;
+  ASSERT(htrdr && tiles);
+  ASSERT(htrdr->mpi_rank != 0 || buf);
+  (void)ntiles;
+
+  if(htrdr->mpi_rank != 0) {
+    /* Non master process: send the computed tiles to the master process */
+    LIST_FOR_EACH(node, tiles) {
+      struct tile* t = CONTAINER_OF(node, struct tile, node);
+      MPI(Send(&t->data, (int)msg_sz, MPI_CHAR, 0,
+        HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD));
+    }
+    goto exit;
+  }
+
+  /* Master process: first write the tiles that it rendered itself */
+  LIST_FOR_EACH(node, tiles) {
+    struct tile* t = CONTAINER_OF(node, struct tile, node);
+    write_tile_data(buf, &t->data);
+    ++nwritten;
+  }
+  if(nwritten == ntiles) goto exit; /* Nothing to receive */
+
+  ASSERT(htrdr->mpi_nprocs > 1);
+
+  /* Create a temporary tile to receive the tile data computed by the
+   * concurrent MPI processes */
+  recv_tile = tile_create(htrdr->allocator);
+  if(!recv_tile) {
+    res = RES_MEM_ERR;
+    htrdr_log_err(htrdr,
+      "could not allocate the temporary tile used to gather the process "
+      "output data -- %s.\n", res_to_cstr(res));
+    goto error;
+  }
+
+  /* Receive the tile data of the concurrent MPI processes */
+  while(nwritten < ntiles) {
+    MPI(Recv(&recv_tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE,
+      HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+    write_tile_data(buf, &recv_tile->data);
+    ++nwritten;
+  }
+
+exit:
+  if(recv_tile) tile_ref_put(recv_tile);
+  return res;
+error:
+  goto exit;
+}
+
+/* Sample an entry of the long wave CDF from the random number `r' in [0, 1).
+ * Return the index `i' of the first CDF entry strictly greater than `r', i.e.
+ * cdf[i-1] <= r < cdf[i].
+ * NOTE(review): the returned value is an index into htrdr->lw_cdf. The CDF is
+ * built from the band identifiers returned by htsky_get_spectral_band_id;
+ * confirm that the caller can use this index as a band identifier. */
+static INLINE size_t
+sample_lw_spectral_interval(struct htrdr* htrdr, const double r)
+{
+  const double* cdf = NULL;
+  const double* find = NULL;
+  double r_next = nextafter(r, DBL_MAX);
+  size_t cdf_length = 0;
+  size_t i;
+  /* Only `htrdr' and `r' can be checked: no `iband' exists in this scope */
+  ASSERT(htrdr);
+  ASSERT(r >= 0 && r < 1);
+
+  cdf = darray_double_cdata_get(&htrdr->lw_cdf);
+  cdf_length = darray_double_size_get(&htrdr->lw_cdf);
+
+  /* The lower bound is the first entry that is not less than the searched
+   * value. Search for r_next, i.e. the double that directly follows r, in
+   * order to find the first entry that is strictly greater than r */
+  find = search_lower_bound(&r_next, cdf, cdf_length, sizeof(double), cmp_dbl);
+  ASSERT(find);
+
+  i = (size_t)(find - cdf);
+  ASSERT(i < cdf_length && cdf[i] > r && (!i || cdf[i-1] <= r));
+  return i;
+}
+
+static res_T /* Always RES_OK: no statement of the function reports an error */
+draw_tile /* Estimate the accumulators of the pixels of one tile */
+  (struct htrdr* htrdr,
+   const size_t ithread,
+   const int64_t tile_mcode, /* For debug only */
+   const size_t tile_org[2], /* Origin of the tile in pixel space */
+   const size_t tile_sz[2], /* Definition of the tile */
+   const double pix_sz[2], /* Size of a pixel in the normalized image plane */
+   const struct htrdr_camera* cam,
+   const size_t spp, /* #samples per pixel */
+   struct ssp_rng* rng,
+   struct tile* tile)
+{
+  size_t nchannels;
+  size_t npixels;
+  size_t mcode; /* Morton code of tile pixel */
+  ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && tile);
+  (void)tile_mcode;
+  /* Adjust the #pixels to process them wrt a morton order */
+  npixels = round_up_pow2(MMAX(tile_sz[0], tile_sz[1]));
+  npixels *= npixels;
+
+  /* Define how many channels to handle: 1 in long wave, 3 otherwise */
+  nchannels = htsky_is_long_wave(htrdr->sky) ? 1 : 3;
+
+  FOR_EACH(mcode, 0, npixels) {
+    struct htrdr_accum* pix_accums;
+    size_t ipix_tile[2]; /* Pixel coord in the tile */
+    size_t ipix[2]; /* Pixel coord in the buffer */
+    size_t ichannel;
+
+    ipix_tile[0] = morton2D_decode((uint32_t)(mcode>>0));
+    if(ipix_tile[0] >= tile_sz[0]) continue; /* Pixel is out of tile */
+    ipix_tile[1] = morton2D_decode((uint32_t)(mcode>>1));
+    if(ipix_tile[1] >= tile_sz[1]) continue; /* Pixel is out of tile */
+
+    /* Fetch the accumulators of the pixel */
+    pix_accums = tile_at(tile, ipix_tile[0], ipix_tile[1]);
+
+    /* Reset the pixel accumulators */
+    pix_accums[HTRDR_ESTIMATE_X] = HTRDR_ACCUM_NULL;
+    pix_accums[HTRDR_ESTIMATE_Y] = HTRDR_ACCUM_NULL;
+    pix_accums[HTRDR_ESTIMATE_Z] = HTRDR_ACCUM_NULL;
+    pix_accums[HTRDR_ESTIMATE_TIME] = HTRDR_ACCUM_NULL;
+
+    /* Compute the pixel coordinate */
+    ipix[0] = tile_org[0] + ipix_tile[0];
+    ipix[1] = tile_org[1] + ipix_tile[1];
+
+    FOR_EACH(ichannel, 0, nchannels) {
+      /* Check that the X, Y and Z estimates are stored in accumulators 0, 1
+       * and 2, respectively, since `ichannel' indexes the accumulators */
+      STATIC_ASSERT
+      (  HTRDR_ESTIMATE_X == 0
+      && HTRDR_ESTIMATE_Y == 1
+      && HTRDR_ESTIMATE_Z == 2,
+      Unexpected_htrdr_estimate_enumerate);
+      size_t isamp;
+
+      FOR_EACH(isamp, 0, spp) {
+        struct time t0, t1;
+        double pix_samp[2];
+        double ray_org[3];
+        double ray_dir[3];
+        double weight;
+        double r0, r1;
+        size_t iband;
+        size_t iquad;
+        double usec;
+
+        /* Begin the registration of the time spent in the realisation */
+        time_current(&t0);
+
+        /* Sample a position into the pixel, in the normalized image plane */
+        pix_samp[0] = ((double)ipix[0] + ssp_rng_canonical(rng)) * pix_sz[0];
+        pix_samp[1] = ((double)ipix[1] + ssp_rng_canonical(rng)) * pix_sz[1];
+
+        /* Generate a ray starting from the pinhole camera and passing through the
+         * pixel sample */
+        htrdr_camera_ray(cam, pix_samp, ray_org, ray_dir);
+
+        r0 = ssp_rng_canonical(rng); /* Used to sample the spectral band */
+        r1 = ssp_rng_canonical(rng); /* Used to sample the quadrature point */
+
+        if(htsky_is_long_wave(htrdr->sky)) {
+          /* Sample a spectral band and a quadrature point */
+          iband = sample_lw_spectral_interval(htrdr, r0); /* NOTE(review): index into the CDF used as band -- confirm */
+          iquad = htsky_spectral_band_sample_quadrature(htrdr->sky, r1, iband);
+          /* Compute the luminance */
+          weight = htrdr_compute_radiance_lw
+            (htrdr, ithread, rng, ray_org, ray_dir, iband, iquad);
+        } else {
+          /* Sample a spectral band and a quadrature point */
+          switch(ichannel) {
+            case 0:
+              htsky_sample_sw_spectral_data_CIE_1931_X
+                (htrdr->sky, r0, r1, &iband, &iquad);
+              break;
+            case 1:
+              htsky_sample_sw_spectral_data_CIE_1931_Y
+                (htrdr->sky, r0, r1, &iband, &iquad);
+              break;
+            case 2:
+              htsky_sample_sw_spectral_data_CIE_1931_Z
+                (htrdr->sky, r0, r1, &iband, &iquad);
+              break;
+            default: FATAL("Unreachable code.\n"); break;
+          }
+          /* Compute the luminance */
+          weight = htrdr_compute_radiance_sw
+            (htrdr, ithread, rng, ray_org, ray_dir, iband, iquad);
+        }
+        ASSERT(weight >= 0);
+
+        /* End the registration of the per realisation time */
+        time_sub(&t0, time_current(&t1), &t0);
+        usec = (double)time_val(&t0, TIME_NSEC) * 0.001; /* Nanoseconds to microseconds */
+
+        /* Update the pixel accumulator of the current channel */
+        pix_accums[ichannel].sum_weights += weight;
+        pix_accums[ichannel].sum_weights_sqr += weight*weight;
+        pix_accums[ichannel].nweights += 1;
+
+        /* Update the pixel accumulator of per realisation time */
+        pix_accums[HTRDR_ESTIMATE_TIME].sum_weights += usec;
+        pix_accums[HTRDR_ESTIMATE_TIME].sum_weights_sqr += usec*usec;
+        pix_accums[HTRDR_ESTIMATE_TIME].nweights += 1;
+      }
+    }
+  }
+  return RES_OK;
+}
+
+static res_T
+draw_image
+  (struct htrdr* htrdr,
+   const struct htrdr_camera* cam,
+   const size_t width, /* Image width */
+   const size_t height, /* Image height */
+   const size_t spp,
+   const size_t ntiles_x,
+   const size_t ntiles_y,
+   const size_t ntiles_adjusted,
+   const double pix_sz[2], /* Pixel size in the normalized image plane */
+   struct proc_work* work,
+   struct list_node* tiles)
+{
+  struct ssp_rng* rng_proc = NULL;
+  size_t nthreads = 0;
+  size_t nthieves = 0;
+  size_t proc_ntiles = 0;
+  ATOMIC nsolved_tiles = 0;
+  ATOMIC res = RES_OK;
+  ASSERT(htrdr && cam && spp && ntiles_adjusted && work && tiles);
+  ASSERT(pix_sz && pix_sz[0] > 0 && pix_sz[1] > 0);
+  ASSERT(width && height);
+  (void)ntiles_x, (void)ntiles_y;
+
+  res = ssp_rng_create(htrdr->allocator, &ssp_rng_mt19937_64, &rng_proc);
+  if(res != RES_OK) {
+    htrdr_log_err(htrdr, "could not create the RNG used to sample a process "
+      "to steal -- %s.\n", res_to_cstr((res_T)res));
+    goto error;
+  }
+
+  proc_ntiles = proc_work_get_ntiles(work);
+  nthreads = MMIN(htrdr->nthreads, proc_ntiles);
+
+  /* The process is not considered as a working process for himself */
+  htrdr->mpi_working_procs[htrdr->mpi_rank] = 0;
+  --htrdr->mpi_nworking_procs;
+
+  omp_set_num_threads((int)nthreads);
+  #pragma omp parallel
+  for(;;) {
+    const int ithread = omp_get_thread_num();
+    struct ssp_rng_proxy* rng_proxy = NULL;
+    struct ssp_rng* rng;
+    struct tile* tile;
+    uint32_t mcode = TILE_MCODE_NULL;
+    size_t tile_org[2];
+    size_t tile_sz[2];
+    size_t n;
+    res_T res_local = RES_OK;
+    int32_t pcent;
+
+    /* Get a tile to draw */
+    #pragma omp critical
+    {
+      mcode = proc_work_get_tile(work);
+      if(mcode == TILE_MCODE_NULL) { /* No more work on this process */
+        /* Try to steal works to concurrent processes */
+        proc_work_reset(work);
+        nthieves = mpi_steal_work(htrdr, rng_proc, work);
+        if(nthieves != 0) {
+          mcode = proc_work_get_tile(work);
+        }
+      }
+    }
+    if(mcode == TILE_MCODE_NULL) break; /* No more work */
+
+    /* Decode the morton code to retrieve the tile index  */
+    tile_org[0] = morton2D_decode((uint32_t)(mcode>>0));
+    tile_org[1] = morton2D_decode((uint32_t)(mcode>>1));
+    ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y);
+
+    /* Create the tile */
+    tile = tile_create(htrdr->allocator);
+    if(!tile) {
+      ATOMIC_SET(&res, RES_MEM_ERR);
+      htrdr_log_err(htrdr,
+        "could not allocate the memory space of the tile (%lu, %lu) -- %s.\n",
+        (unsigned long)tile_org[0], (unsigned long)tile_org[1],
+         res_to_cstr((res_T)ATOMIC_GET(&res)));
+      break;
+    }
+
+    /* Register the tile */
+    #pragma omp critical
+    list_add_tail(tiles, &tile->node);
+
+    tile->data.x = (uint16_t)tile_org[0];
+    tile->data.y = (uint16_t)tile_org[1];
+
+    /* Define the tile origin in pixel space */
+    tile_org[0] *= TILE_SIZE;
+    tile_org[1] *= TILE_SIZE;
+
+    /* Compute the size of the tile clamped by the borders of the buffer */
+    tile_sz[0] = MMIN(TILE_SIZE, width - tile_org[0]);
+    tile_sz[1] = MMIN(TILE_SIZE, height - tile_org[1]);
+
+    /* Create a proxy RNG for the current tile. This proxy is used for the
+     * current thread only and thus it has to manage only one RNG. This proxy
+     * is initialised in order to ensure that a unique and predictable set of
+     * random numbers is used for the current tile. */
+    SSP(rng_proxy_create2
+      (&htrdr->lifo_allocators[ithread],
+       &ssp_rng_threefry,
+       RNG_SEQUENCE_SIZE * (size_t)mcode, /* Offset */
+       RNG_SEQUENCE_SIZE, /* Size */
+       RNG_SEQUENCE_SIZE * (size_t)ntiles_adjusted, /* Pitch */
+       1, &rng_proxy));
+    SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
+
+    /* Launch the tile rendering */
+    res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
+      pix_sz, cam, spp, rng, tile);
+
+    SSP(rng_proxy_ref_put(rng_proxy));
+    SSP(rng_ref_put(rng));
+
+    if(res_local != RES_OK) {
+      ATOMIC_SET(&res, res_local);
+      break;
+    }
+
+    /* Update the progress status */
+    n = (size_t)ATOMIC_INCR(&nsolved_tiles);
+    pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/);
+
+    #pragma omp critical
+    if(pcent > htrdr->mpi_progress_render[0]) {
+      htrdr->mpi_progress_render[0] = pcent;
+      if(htrdr->mpi_rank == 0) {
+        update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
+      } else { /* Send the progress percentage to the master process */
+        send_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING, pcent);
+      }
+    }
+  }
+
+  if(ATOMIC_GET(&res) != RES_OK) goto error;
+
+  /* Synchronize the process */
+  mutex_lock(htrdr->mpi_mutex);
+  MPI(Barrier(MPI_COMM_WORLD));
+  mutex_unlock(htrdr->mpi_mutex);
+
+exit:
+  if(rng_proc) SSP(rng_ref_put(rng_proc));
+  return (res_T)res;
+error:
+  goto exit;
+}
+
+/*******************************************************************************
+ * Local functions
+ ******************************************************************************/
+res_T
+htrdr_draw_radiance
+  (struct htrdr* htrdr,
+   const struct htrdr_camera* cam,
+   const size_t width,
+   const size_t height,
+   const size_t spp,
+   struct htrdr_buffer* buf)
+{
+  char strbuf[128];
+  struct time t0, t1;
+  struct list_node tiles;
+  size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted;
+  size_t itile;
+  struct proc_work work;
+  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
+  size_t proc_ntiles_adjusted;
+  double pix_sz[2];
+  ATOMIC probe_thieves = 1;
+  ATOMIC res = RES_OK;
+  ASSERT(htrdr && cam && width && height);
+  ASSERT(htrdr->mpi_rank != 0 || buf);
+
+  list_init(&tiles);
+  proc_work_init(htrdr->allocator, &work);
+
+  if(htrdr->mpi_rank == 0) {
+    htrdr_buffer_get_layout(buf, &layout);
+    ASSERT(layout.width || layout.height || layout.elmt_size);
+    ASSERT(layout.width == width && layout.height == height);
+
+    if(layout.elmt_size != sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__])
+    || layout.alignment < ALIGNOF(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__])) {
+      htrdr_log_err(htrdr,
+        "%s: invalid buffer layout. "
+        "The pixel size must be the size of %lu accumulators.\n",
+        FUNC_NAME, (unsigned long)HTRDR_ESTIMATES_COUNT__);
+      res = RES_BAD_ARG;
+      goto error;
+    }
+  }
+
+  /* Compute the overall number of tiles */
+  ntiles_x = (width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
+  ntiles_y = (height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
+  ntiles = ntiles_x * ntiles_y;
+
+  /* Compute the pixel size in the normalized image plane */
+  pix_sz[0] = 1.0 / (double)width;
+  pix_sz[1] = 1.0 / (double)height;
+
+  /* Adjust the #tiles for the morton-encoding procedure */
+  ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y));
+  ntiles_adjusted *= ntiles_adjusted;
+
+  /* Define the initial number of tiles of the current process */
+  proc_ntiles_adjusted = ntiles_adjusted / (size_t)htrdr->mpi_nprocs;
+  if(htrdr->mpi_rank == 0) { /* Affect the remaining tiles to the master proc */
+    proc_ntiles_adjusted +=
+      ntiles_adjusted - proc_ntiles_adjusted*(size_t)htrdr->mpi_nprocs;
+  }
+
+  /* Define the initial list of tiles of the process */
+  FOR_EACH(itile, 0, proc_ntiles_adjusted) {
+    uint32_t mcode;
+    uint16_t tile_org[2];
+
+    mcode = (uint32_t)itile*(uint32_t)htrdr->mpi_nprocs
+          + (uint32_t)htrdr->mpi_rank;
+
+    tile_org[0] = morton2D_decode(mcode>>0);
+    if(tile_org[0] >= ntiles_x) continue;
+    tile_org[1] = morton2D_decode(mcode>>1);
+    if(tile_org[1] >= ntiles_y) continue;
+    proc_work_add_tile(&work, mcode);
+  }
+
+  if(htrdr->mpi_rank == 0) {
+    fetch_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
+    print_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
+  }
+
+  time_current(&t0);
+
+  omp_set_nested(1); /* Enable nested threads for draw_image */
+  #pragma omp parallel sections num_threads(2)
+  {
+    #pragma omp section
+    mpi_probe_thieves(htrdr, &work, &probe_thieves);
+
+    #pragma omp section
+    {
+      draw_image(htrdr, cam, width, height, spp, ntiles_x, ntiles_y,
+        ntiles_adjusted, pix_sz, &work, &tiles);
+      /* The processes have no more work to do. Stop probing for thieves */
+      ATOMIC_SET(&probe_thieves, 0);
+    }
+  }
+
+  if(htrdr->mpi_rank == 0) {
+    update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
+    fprintf(stderr, "\n"); /* Add a new line after the progress statuses */
+  }
+
+  time_sub(&t0, time_current(&t1), &t0);
+  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
+  htrdr_log(htrdr, "Rendering time: %s\n", strbuf);
+
+  /* Gather accum buffers from the group of processes */
+  time_current(&t0);
+  res = mpi_gather_tiles(htrdr, buf, ntiles, &tiles);
+  if(res != RES_OK) goto error;
+  time_sub(&t0, time_current(&t1), &t0);
+  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
+  htrdr_log(htrdr, "Image gathering time: %s\n", strbuf);
+
+exit:
+  { /* Free allocated tiles */
+    struct list_node* node;
+    struct list_node* tmp;
+    LIST_FOR_EACH_SAFE(node, tmp, &tiles) {
+      struct tile* tile = CONTAINER_OF(node, struct tile, node);
+      list_del(node);
+      tile_ref_put(tile);
+    }
+  }
+  proc_work_release(&work);
+  return (res_T)res;
+error:
+  goto exit;
+}
+
diff --git a/src/htrdr_draw_radiance_sw.c b/src/htrdr_draw_radiance_sw.c
@@ -1,851 +0,0 @@
-/* Copyright (C) 2018, 2019, 2020 |Meso|Star> (contact@meso-star.com)
- * Copyright (C) 2018, 2019 CNRS, Université Paul Sabatier
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-#define _POSIX_C_SOURCE 199309L /* nanosleep */
-
-#include "htrdr.h"
-#include "htrdr_c.h"
-#include "htrdr_buffer.h"
-#include "htrdr_camera.h"
-#include "htrdr_solve.h"
-
-#include <high_tune/htsky.h>
-
-#include <rsys/clock_time.h>
-#include <rsys/cstr.h>
-#include <rsys/dynamic_array_u32.h>
-#include <rsys/math.h>
-#include <rsys/mutex.h>
-#include <star/ssp.h>
-
-#include <omp.h>
-#include <mpi.h>
-#include <time.h>
-#include <unistd.h>
-
-#define RNG_SEQUENCE_SIZE 10000
-
-#define TILE_MCODE_NULL UINT32_MAX
-#define TILE_SIZE 32 /* Definition in X & Y of a tile */
-STATIC_ASSERT(IS_POW2(TILE_SIZE), TILE_SIZE_must_be_a_power_of_2);
-
-/* Tile of row ordered image pixels */
-struct tile {
-  struct list_node node;
-  struct mem_allocator* allocator;
-  ref_T ref;
-
-  struct tile_data {
-    uint16_t x, y; /* 2D coordinates of the tile in tile space */
-    /* Simulate the flexible array member of the C99 standard. */
-    struct htrdr_accum accums[1/*dummy element*/];
-  } data;
-};
-
-/* List of tile to compute onto the MPI process. */
-struct proc_work {
-  struct mutex* mutex;
-  struct darray_u32 tiles; /* #tiles to render */
-  size_t itile; /* Next tile to render in the above list of tiles */
-};
-
-/*******************************************************************************
- * Helper functions
- ******************************************************************************/
-static FINLINE uint16_t
-morton2D_decode(const uint32_t u32)
-{
-  uint32_t x = u32 & 0x55555555;
-  x = (x | (x >> 1)) & 0x33333333;
-  x = (x | (x >> 2)) & 0x0F0F0F0F;
-  x = (x | (x >> 4)) & 0x00FF00FF;
-  x = (x | (x >> 8)) & 0x0000FFFF;
-  return (uint16_t)x;
-}
-
-static FINLINE uint32_t
-morton2D_encode(const uint16_t u16)
-{
-  uint32_t u32 = u16;
-  u32 = (u32 | (u32 << 8)) & 0x00FF00FF;
-  u32 = (u32 | (u32 << 4)) & 0X0F0F0F0F;
-  u32 = (u32 | (u32 << 2)) & 0x33333333;
-  u32 = (u32 | (u32 << 1)) & 0x55555555;
-  return u32;
-}
-
-static FINLINE struct tile*
-tile_create(struct mem_allocator* allocator)
-{
-  struct tile* tile;
-  const size_t tile_sz =
-    sizeof(struct tile) - sizeof(struct htrdr_accum)/*rm dummy accum*/;
-  const size_t buf_sz = /* Flexiblbe array element */
-    TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]);
-  ASSERT(allocator);
-
-  tile = MEM_ALLOC(allocator, tile_sz+buf_sz);
-  if(!tile) return NULL;
-
-  ref_init(&tile->ref);
-  list_init(&tile->node);
-  tile->allocator = allocator;
-  ASSERT(IS_ALIGNED(&tile->data.accums, ALIGNOF(struct htrdr_accum)));
-
-  return tile;
-}
-
-static INLINE void
-tile_ref_get(struct tile* tile)
-{
-  ASSERT(tile);
-  tile_ref_get(tile);
-}
-
-static INLINE void
-release_tile(ref_T* ref)
-{
-  struct tile* tile = CONTAINER_OF(ref, struct tile, ref);
-  ASSERT(ref);
-  MEM_RM(tile->allocator, tile);
-}
-
-static INLINE void
-tile_ref_put(struct tile* tile)
-{
-  ASSERT(tile);
-  ref_put(&tile->ref, release_tile);
-}
-
-static FINLINE struct htrdr_accum*
-tile_at
-  (struct tile* tile,
-   const size_t x, /* In tile space */
-   const size_t y) /* In tile space */
-{
-  ASSERT(tile && x < TILE_SIZE && y < TILE_SIZE);
-  return tile->data.accums + (y*TILE_SIZE + x) * HTRDR_ESTIMATES_COUNT__;
-}
-
-static void
-write_tile_data(struct htrdr_buffer* buf, const struct tile_data* tile_data)
-{
-  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
-  size_t icol, irow;
-  size_t irow_tile;
-  size_t ncols_tile, nrows_tile;
-  char* buf_mem;
-  ASSERT(buf && tile_data);
-
-  htrdr_buffer_get_layout(buf, &layout);
-  buf_mem = htrdr_buffer_get_data(buf);
-  ASSERT(layout.elmt_size == sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]));
-
-  /* Compute the row/column of the tile origin into the buffer */
-  icol = tile_data->x * (size_t)TILE_SIZE;
-  irow = tile_data->y * (size_t)TILE_SIZE;
-
-  /* Define the number of tile row/columns to write into the buffer */
-  ncols_tile = MMIN(icol + TILE_SIZE, layout.width)  - icol;
-  nrows_tile = MMIN(irow + TILE_SIZE, layout.height) - irow;
-
-  /* Copy the tile data, row by row */
-  FOR_EACH(irow_tile, 0, nrows_tile) {
-    char* buf_row = buf_mem + (irow + irow_tile) * layout.pitch;
-    const struct htrdr_accum* tile_row =
-      tile_data->accums + irow_tile*TILE_SIZE*HTRDR_ESTIMATES_COUNT__;
-    memcpy(buf_row + icol*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]),
-      tile_row, ncols_tile*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]));
-  }
-}
-
-static INLINE void
-proc_work_init(struct mem_allocator* allocator, struct proc_work* work)
-{
-  ASSERT(work);
-  darray_u32_init(allocator, &work->tiles);
-  work->itile = 0;
-  CHK(work->mutex = mutex_create());
-}
-
-static INLINE void
-proc_work_release(struct proc_work* work)
-{
-  darray_u32_release(&work->tiles);
-  mutex_destroy(work->mutex);
-}
-
-static INLINE void
-proc_work_reset(struct proc_work* work)
-{
-  ASSERT(work);
-  mutex_lock(work->mutex);
-  darray_u32_clear(&work->tiles);
-  work->itile = 0;
-  mutex_unlock(work->mutex);
-}
-
-static INLINE void
-proc_work_add_tile(struct proc_work* work, const uint32_t mcode)
-{
-  mutex_lock(work->mutex);
-  CHK(darray_u32_push_back(&work->tiles, &mcode) == RES_OK);
-  mutex_unlock(work->mutex);
-}
-
-static INLINE uint32_t
-proc_work_get_tile(struct proc_work* work)
-{
-  uint32_t mcode;
-  ASSERT(work);
-  mutex_lock(work->mutex);
-  if(work->itile >= darray_u32_size_get(&work->tiles)) {
-    mcode = TILE_MCODE_NULL;
-  } else {
-    mcode = darray_u32_cdata_get(&work->tiles)[work->itile];
-    ++work->itile;
-  }
-  mutex_unlock(work->mutex);
-  return mcode;
-}
-
-static INLINE size_t
-proc_work_get_ntiles(struct proc_work* work)
-{
-  size_t sz = 0;
-  ASSERT(work);
-  mutex_lock(work->mutex);
-  sz = darray_u32_size_get(&work->tiles);
-  mutex_unlock(work->mutex);
-  return sz;
-}
-
-static void
-mpi_wait_for_request(struct htrdr* htrdr, MPI_Request* req)
-{
-  ASSERT(htrdr && req);
-
-  /* Wait for process synchronisation */
-  for(;;) {
-    struct timespec t;
-    int complete;
-    t.tv_sec = 0;
-    t.tv_nsec = 10000000; /* 10ms */
-
-    mutex_lock(htrdr->mpi_mutex);
-    MPI(Test(req, &complete, MPI_STATUS_IGNORE));
-    mutex_unlock(htrdr->mpi_mutex);
-    if(complete) break;
-
-    nanosleep(&t, NULL);
-  }
-}
-
-static void
-mpi_probe_thieves
-  (struct htrdr* htrdr,
-   struct proc_work* work,
-   ATOMIC* probe_thieves)
-{
-  uint32_t tiles[UINT8_MAX];
-  struct timespec t;
-  ASSERT(htrdr && work && probe_thieves);
-
-  if(htrdr->mpi_nprocs == 1) /* The process is alone. No thief is possible */
-    return;
-
-  t.tv_sec = 0;
-
-  /* Protect MPI calls of multiple invocations from concurrent threads */
-  #define P_MPI(Func) {                                                        \
-    mutex_lock(htrdr->mpi_mutex);                                              \
-    MPI(Func);                                                                 \
-    mutex_unlock(htrdr->mpi_mutex);                                            \
-  } (void)0
-
-  while(ATOMIC_GET(probe_thieves)) {
-    MPI_Status status;
-    size_t itile;
-    int msg;
-
-    /* Probe if a steal request was submitted by any processes */
-    P_MPI(Iprobe(MPI_ANY_SOURCE, HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &msg,
-      &status));
-
-    if(msg) { /* A steal request was posted */
-      MPI_Request req;
-      uint8_t ntiles_to_steal;
-
-      /* Asynchronously receive the steal request */
-      P_MPI(Irecv(&ntiles_to_steal, 1, MPI_UINT8_T, status.MPI_SOURCE,
-        HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &req));
-
-      /* Wait for the completion of the steal request */
-      mpi_wait_for_request(htrdr, &req);
-
-      /* Thief some tiles */
-      FOR_EACH(itile, 0, ntiles_to_steal) {
-        tiles[itile] = proc_work_get_tile(work);
-      }
-      P_MPI(Send(&tiles, ntiles_to_steal, MPI_UINT32_T, status.MPI_SOURCE,
-        HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD));
-    }
-    t.tv_nsec = 500000000; /* 500ms */
-    nanosleep(&t, NULL);
-  }
-  #undef P_MPI
-}
-
-static int
-mpi_sample_working_process(struct htrdr* htrdr, struct ssp_rng* rng)
-{
-  int iproc, i;
-  int dst_rank;
-  ASSERT(htrdr && rng && htrdr->mpi_nworking_procs);
-
-  /* Sample the index of the 1st active process */
-  iproc = (int)(ssp_rng_canonical(rng) * (double)htrdr->mpi_nworking_procs);
-
-  /* Find the rank of the sampled active process. Use a simple linear search
-   * since the overall number of processes should be quite low; at most few
-   * dozens.  */
-  i = 0;
-  FOR_EACH(dst_rank, 0, htrdr->mpi_nprocs) {
-    if(htrdr->mpi_working_procs[dst_rank] == 0) continue; /* Inactive process */
-    if(i == iproc) break; /* The rank of the sampled process is found */
-    ++i;
-  }
-  ASSERT(dst_rank < htrdr->mpi_nprocs);
-  return dst_rank;
-}
-
-/* Return the number of stolen tiles */
-static size_t
-mpi_steal_work
-  (struct htrdr* htrdr,
-   struct ssp_rng* rng,
-   struct proc_work* work)
-{
-  MPI_Request req;
-  size_t itile;
-  size_t nthieves = 0;
-  uint32_t tiles[UINT8_MAX]; /* Morton code of the stolen tile */
-  int proc_to_steal; /* Process to steal */
-  uint8_t ntiles_to_steal = MMIN((uint8_t)(htrdr->nthreads*2), 16);
-  ASSERT(htrdr && rng && work && htrdr->nthreads < UINT8_MAX);
-
-  /* Protect MPI calls of multiple invocations from concurrent threads */
-  #define P_MPI(Func) {                                                        \
-    mutex_lock(htrdr->mpi_mutex);                                              \
-    MPI(Func);                                                                 \
-    mutex_unlock(htrdr->mpi_mutex);                                            \
-  } (void)0
-
-  /* No more working process => nohting to steal */
-  if(!htrdr->mpi_nworking_procs) return 0;
-
-  /* Sample a process to steal */
-  proc_to_steal = mpi_sample_working_process(htrdr, rng);
-
-  /* Send a steal request to the sampled process and wait for a response */
-  P_MPI(Send(&ntiles_to_steal, 1, MPI_UINT8_T, proc_to_steal,
-    HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD));
-
-  /* Receive the stolen tile from the sampled process */
-  P_MPI(Irecv(tiles, ntiles_to_steal, MPI_UINT32_T, proc_to_steal,
-    HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD, &req));
-
-  mpi_wait_for_request(htrdr, &req);
-
-  FOR_EACH(itile, 0, ntiles_to_steal) {
-    if(tiles[itile] == TILE_MCODE_NULL) {
-      ASSERT(htrdr->mpi_working_procs[proc_to_steal] != 0);
-      htrdr->mpi_working_procs[proc_to_steal] = 0;
-      htrdr->mpi_nworking_procs--;
-      break;
-    }
-    proc_work_add_tile(work, tiles[itile]);
-    ++nthieves;
-  }
-  #undef P_MPI
-  return nthieves;
-}
-
-static res_T
-mpi_gather_tiles
-  (struct htrdr* htrdr,
-   struct htrdr_buffer* buf,
-   const size_t ntiles,
-   struct list_node* tiles)
-{
-  /* Compute the size of the tile_data */
-  const size_t msg_sz =
-    sizeof(struct tile_data) - sizeof(struct htrdr_accum)/*dummy*/
-  + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__]);
-
-  struct list_node* node = NULL;
-  struct tile* tile = NULL;
-  res_T res = RES_OK;
-  ASSERT(htrdr && tiles);
-  ASSERT(htrdr->mpi_rank != 0 || buf);
-  (void)ntiles;
-
-  if(htrdr->mpi_rank != 0) { /* Non master process */
-    /* Send the computed tile to the master process */
-    LIST_FOR_EACH(node, tiles) {
-      struct tile* t = CONTAINER_OF(node, struct tile, node);
-      MPI(Send(&t->data, (int)msg_sz, MPI_CHAR, 0,
-        HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD));
-    }
-  } else { /* Master process */
-    size_t itile = 0;
-
-    LIST_FOR_EACH(node, tiles) {
-      struct tile* t = CONTAINER_OF(node, struct tile, node);
-      write_tile_data(buf, &t->data);
-      ++itile;
-    }
-
-    if(itile != ntiles) {
-      ASSERT(htrdr->mpi_nprocs > 1);
-
-      /* Create a temporary tile to receive the tile data computed by the
-       * concurrent MPI processes */
-      tile = tile_create(htrdr->allocator);
-      if(!tile) {
-        res = RES_MEM_ERR;
-        htrdr_log_err(htrdr,
-          "could not allocate the temporary tile used to gather the process "
-          "output data -- %s.\n", res_to_cstr(res));
-        goto error;
-      }
-
-      /* Receive the tile data of the concurret MPI processes */
-      FOR_EACH(itile, itile, ntiles) {
-        MPI(Recv(&tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE,
-          HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-        write_tile_data(buf, &tile->data);
-      }
-    }
-  }
-
-exit:
-  if(tile) tile_ref_put(tile);
-  return res;
-error:
-  goto exit;
-}
-
-static res_T
-draw_tile
-  (struct htrdr* htrdr,
-   const size_t ithread,
-   const int64_t tile_mcode, /* For debug only */
-   const size_t tile_org[2], /* Origin of the tile in pixel space */
-   const size_t tile_sz[2], /* Definition of the tile */
-   const double pix_sz[2], /* Size of a pixel in the normalized image plane */
-   const struct htrdr_camera* cam,
-   const size_t spp, /* #samples per pixel */
-   struct ssp_rng* rng,
-   struct tile* tile)
-{
-  size_t npixels;
-  size_t mcode; /* Morton code of tile pixel */
-  ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && tile);
-  (void)tile_mcode;
-  /* Adjust the #pixels to process them wrt a morton order */
-  npixels = round_up_pow2(MMAX(tile_sz[0], tile_sz[1]));
-  npixels *= npixels;
-
-  FOR_EACH(mcode, 0, npixels) {
-    struct htrdr_accum* pix_accums;
-    size_t ipix_tile[2]; /* Pixel coord in the tile */
-    size_t ipix[2]; /* Pixel coord in the buffer */
-    size_t ichannel;
-
-    ipix_tile[0] = morton2D_decode((uint32_t)(mcode>>0));
-    if(ipix_tile[0] >= tile_sz[0]) continue; /* Pixel is out of tile */
-    ipix_tile[1] = morton2D_decode((uint32_t)(mcode>>1));
-    if(ipix_tile[1] >= tile_sz[1]) continue; /* Pixel is out of tile */
-
-    /* Fetch and reset the pixel accumulator */
-    pix_accums = tile_at(tile, ipix_tile[0], ipix_tile[1]);
-    pix_accums[HTRDR_ESTIMATE_TIME] = HTRDR_ACCUM_NULL; /* Reset time per radiative path */
-
-    /* Compute the pixel coordinate */
-    ipix[0] = tile_org[0] + ipix_tile[0];
-    ipix[1] = tile_org[1] + ipix_tile[1];
-
-    FOR_EACH(ichannel, 0, 3) {
-      /* Check that the X, Y and Z estimates are stored in accumulators 0, 1 et
-       * 2, respectively */
-      STATIC_ASSERT
-      (  HTRDR_ESTIMATE_X == 0
-      && HTRDR_ESTIMATE_Y == 1
-      && HTRDR_ESTIMATE_Z == 2,
-      Unexpected_htrdr_estimate_enumerate);
-      size_t isamp;
-
-      pix_accums[ichannel] = HTRDR_ACCUM_NULL;
-      FOR_EACH(isamp, 0, spp) {
-        struct time t0, t1;
-        double pix_samp[2];
-        double ray_org[3];
-        double ray_dir[3];
-        double weight;
-        double r0, r1;
-        size_t iband;
-        size_t iquad;
-        double usec;
-
-        /* Sample a position into the pixel, in the normalized image plane */
-        pix_samp[0] = ((double)ipix[0] + ssp_rng_canonical(rng)) * pix_sz[0];
-        pix_samp[1] = ((double)ipix[1] + ssp_rng_canonical(rng)) * pix_sz[1];
-
-        /* Generate a ray starting from the pinhole camera and passing through the
-         * pixel sample */
-        htrdr_camera_ray(cam, pix_samp, ray_org, ray_dir);
-
-        /* Sample a spectral band and a quadrature point */
-        r0 = ssp_rng_canonical(rng);
-        r1 = ssp_rng_canonical(rng);
-        switch(ichannel) {
-          case 0:
-            htsky_sample_sw_spectral_data_CIE_1931_X
-              (htrdr->sky, r0, r1, &iband, &iquad);
-            break;
-          case 1:
-            htsky_sample_sw_spectral_data_CIE_1931_Y
-              (htrdr->sky, r0, r1, &iband, &iquad);
-            break;
-          case 2:
-            htsky_sample_sw_spectral_data_CIE_1931_Z
-              (htrdr->sky, r0, r1, &iband, &iquad);
-            break;
-          default: FATAL("Unreachable code.\n"); break;
-        }
-
-        /* Compute the radiance that reach the pixel through the ray */
-        time_current(&t0);
-        weight = htrdr_compute_radiance_sw
-          (htrdr, ithread, rng, ray_org, ray_dir, iband, iquad);
-        ASSERT(weight >= 0);
-        time_sub(&t0, time_current(&t1), &t0);
-        usec = (double)time_val(&t0, TIME_NSEC) * 0.001;
-
-        /* Update the pixel accumulator of the current channel */
-        pix_accums[ichannel].sum_weights += weight;
-        pix_accums[ichannel].sum_weights_sqr += weight*weight;
-        pix_accums[ichannel].nweights += 1;
-
-        /* Update the pixel accumulator of per realisation time */
-        pix_accums[HTRDR_ESTIMATE_TIME].sum_weights += usec;
-        pix_accums[HTRDR_ESTIMATE_TIME].sum_weights_sqr += usec*usec;
-        pix_accums[HTRDR_ESTIMATE_TIME].nweights += 1;
-      }
-    }
-  }
-  return RES_OK;
-}
-
-static res_T
-draw_image
-  (struct htrdr* htrdr,
-   const struct htrdr_camera* cam,
-   const size_t width, /* Image width */
-   const size_t height, /* Image height */
-   const size_t spp,
-   const size_t ntiles_x,
-   const size_t ntiles_y,
-   const size_t ntiles_adjusted,
-   const double pix_sz[2], /* Pixel size in the normalized image plane */
-   struct proc_work* work,
-   struct list_node* tiles)
-{
-  struct ssp_rng* rng_proc = NULL;
-  size_t nthreads = 0;
-  size_t nthieves = 0;
-  size_t proc_ntiles = 0;
-  ATOMIC nsolved_tiles = 0;
-  ATOMIC res = RES_OK;
-  ASSERT(htrdr && cam && spp && ntiles_adjusted && work && tiles);
-  ASSERT(pix_sz && pix_sz[0] > 0 && pix_sz[1] > 0);
-  ASSERT(width && height);
-  (void)ntiles_x, (void)ntiles_y;
-
-  res = ssp_rng_create(htrdr->allocator, &ssp_rng_mt19937_64, &rng_proc);
-  if(res != RES_OK) {
-    htrdr_log_err(htrdr, "could not create the RNG used to sample a process "
-      "to steal -- %s.\n", res_to_cstr((res_T)res));
-    goto error;
-  }
-
-  proc_ntiles = proc_work_get_ntiles(work);
-  nthreads = MMIN(htrdr->nthreads, proc_ntiles);
-
-  /* The process is not considered as a working process for himself */
-  htrdr->mpi_working_procs[htrdr->mpi_rank] = 0;
-  --htrdr->mpi_nworking_procs;
-
-  omp_set_num_threads((int)nthreads);
-  #pragma omp parallel
-  for(;;) {
-    const int ithread = omp_get_thread_num();
-    struct ssp_rng_proxy* rng_proxy = NULL;
-    struct ssp_rng* rng;
-    struct tile* tile;
-    uint32_t mcode = TILE_MCODE_NULL;
-    size_t tile_org[2];
-    size_t tile_sz[2];
-    size_t n;
-    res_T res_local = RES_OK;
-    int32_t pcent;
-
-    /* Get a tile to draw */
-    #pragma omp critical
-    {
-      mcode = proc_work_get_tile(work);
-      if(mcode == TILE_MCODE_NULL) { /* No more work on this process */
-        /* Try to steal works to concurrent processes */
-        proc_work_reset(work);
-        nthieves = mpi_steal_work(htrdr, rng_proc, work);
-        if(nthieves != 0) {
-          mcode = proc_work_get_tile(work);
-        }
-      }
-    }
-    if(mcode == TILE_MCODE_NULL) break; /* No more work */
-
-    /* Decode the morton code to retrieve the tile index  */
-    tile_org[0] = morton2D_decode((uint32_t)(mcode>>0));
-    tile_org[1] = morton2D_decode((uint32_t)(mcode>>1));
-    ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y);
-
-    /* Create the tile */
-    tile = tile_create(htrdr->allocator);
-    if(!tile) {
-      ATOMIC_SET(&res, RES_MEM_ERR);
-      htrdr_log_err(htrdr,
-        "could not allocate the memory space of the tile (%lu, %lu) -- %s.\n",
-        (unsigned long)tile_org[0], (unsigned long)tile_org[1],
-         res_to_cstr((res_T)ATOMIC_GET(&res)));
-      break;
-    }
-
-    /* Register the tile */
-    #pragma omp critical
-    list_add_tail(tiles, &tile->node);
-
-    tile->data.x = (uint16_t)tile_org[0];
-    tile->data.y = (uint16_t)tile_org[1];
-
-    /* Define the tile origin in pixel space */
-    tile_org[0] *= TILE_SIZE;
-    tile_org[1] *= TILE_SIZE;
-
-    /* Compute the size of the tile clamped by the borders of the buffer */
-    tile_sz[0] = MMIN(TILE_SIZE, width - tile_org[0]);
-    tile_sz[1] = MMIN(TILE_SIZE, height - tile_org[1]);
-
-    /* Create a proxy RNG for the current tile. This proxy is used for the
-     * current thread only and thus it has to manage only one RNG. This proxy
-     * is initialised in order to ensure that a unique and predictable set of
-     * random numbers is used for the current tile. */
-    SSP(rng_proxy_create2
-      (&htrdr->lifo_allocators[ithread],
-       &ssp_rng_threefry,
-       RNG_SEQUENCE_SIZE * (size_t)mcode, /* Offset */
-       RNG_SEQUENCE_SIZE, /* Size */
-       RNG_SEQUENCE_SIZE * (size_t)ntiles_adjusted, /* Pitch */
-       1, &rng_proxy));
-    SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
-
-    /* Launch the tile rendering */
-    res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
-      pix_sz, cam, spp, rng, tile);
-
-    SSP(rng_proxy_ref_put(rng_proxy));
-    SSP(rng_ref_put(rng));
-
-    if(res_local != RES_OK) {
-      ATOMIC_SET(&res, res_local);
-      break;
-    }
-
-    /* Update the progress status */
-    n = (size_t)ATOMIC_INCR(&nsolved_tiles);
-    pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/);
-
-    #pragma omp critical
-    if(pcent > htrdr->mpi_progress_render[0]) {
-      htrdr->mpi_progress_render[0] = pcent;
-      if(htrdr->mpi_rank == 0) {
-        update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
-      } else { /* Send the progress percentage to the master process */
-        send_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING, pcent);
-      }
-    }
-  }
-
-  if(ATOMIC_GET(&res) != RES_OK) goto error;
-
-  /* Synchronize the process */
-  mutex_lock(htrdr->mpi_mutex);
-  MPI(Barrier(MPI_COMM_WORLD));
-  mutex_unlock(htrdr->mpi_mutex);
-
-exit:
-  if(rng_proc) SSP(rng_ref_put(rng_proc));
-  return (res_T)res;
-error:
-  goto exit;
-}
-
-/*******************************************************************************
- * Local functions
- ******************************************************************************/
-res_T
-htrdr_draw_radiance_sw
-  (struct htrdr* htrdr,
-   const struct htrdr_camera* cam,
-   const size_t width,
-   const size_t height,
-   const size_t spp,
-   struct htrdr_buffer* buf)
-{
-  char strbuf[128];
-  struct time t0, t1;
-  struct list_node tiles;
-  size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted;
-  size_t itile;
-  struct proc_work work;
-  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
-  size_t proc_ntiles_adjusted;
-  double pix_sz[2];
-  ATOMIC probe_thieves = 1;
-  ATOMIC res = RES_OK;
-  ASSERT(htrdr && cam && width && height);
-  ASSERT(htrdr->mpi_rank != 0 || buf);
-
-  list_init(&tiles);
-  proc_work_init(htrdr->allocator, &work);
-
-  if(htrdr->mpi_rank == 0) {
-    htrdr_buffer_get_layout(buf, &layout);
-    ASSERT(layout.width || layout.height || layout.elmt_size);
-    ASSERT(layout.width == width && layout.height == height);
-
-    if(layout.elmt_size != sizeof(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__])
-    || layout.alignment < ALIGNOF(struct htrdr_accum[HTRDR_ESTIMATES_COUNT__])) {
-      htrdr_log_err(htrdr,
-        "%s: invalid buffer layout. "
-        "The pixel size must be the size of %lu accumulators.\n",
-        FUNC_NAME, (unsigned long)HTRDR_ESTIMATES_COUNT__);
-      res = RES_BAD_ARG;
-      goto error;
-    }
-  }
-
-  /* Compute the overall number of tiles */
-  ntiles_x = (width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
-  ntiles_y = (height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
-  ntiles = ntiles_x * ntiles_y;
-
-  /* Compute the pixel size in the normalized image plane */
-  pix_sz[0] = 1.0 / (double)width;
-  pix_sz[1] = 1.0 / (double)height;
-
-  /* Adjust the #tiles for the morton-encoding procedure */
-  ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y));
-  ntiles_adjusted *= ntiles_adjusted;
-
-  /* Define the initial number of tiles of the current process */
-  proc_ntiles_adjusted = ntiles_adjusted / (size_t)htrdr->mpi_nprocs;
-  if(htrdr->mpi_rank == 0) { /* Affect the remaining tiles to the master proc */
-    proc_ntiles_adjusted +=
-      ntiles_adjusted - proc_ntiles_adjusted*(size_t)htrdr->mpi_nprocs;
-  }
-
-  /* Define the initial list of tiles of the process */
-  FOR_EACH(itile, 0, proc_ntiles_adjusted) {
-    uint32_t mcode;
-    uint16_t tile_org[2];
-
-    mcode = (uint32_t)itile*(uint32_t)htrdr->mpi_nprocs
-          + (uint32_t)htrdr->mpi_rank;
-
-    tile_org[0] = morton2D_decode(mcode>>0);
-    if(tile_org[0] >= ntiles_x) continue;
-    tile_org[1] = morton2D_decode(mcode>>1);
-    if(tile_org[1] >= ntiles_y) continue;
-    proc_work_add_tile(&work, mcode);
-  }
-
-  if(htrdr->mpi_rank == 0) {
-    fetch_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
-    print_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
-  }
-
-  time_current(&t0);
-
-  omp_set_nested(1); /* Enable nested threads for draw_image */
-  #pragma omp parallel sections num_threads(2)
-  {
-    #pragma omp section
-    mpi_probe_thieves(htrdr, &work, &probe_thieves);
-
-    #pragma omp section
-    {
-      draw_image(htrdr, cam, width, height, spp, ntiles_x, ntiles_y,
-        ntiles_adjusted, pix_sz, &work, &tiles);
-      /* The processes have no more work to do. Stop probing for thieves */
-      ATOMIC_SET(&probe_thieves, 0);
-    }
-  }
-
-  if(htrdr->mpi_rank == 0) {
-    update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
-    fprintf(stderr, "\n"); /* Add a new line after the progress statuses */
-  }
-
-  time_sub(&t0, time_current(&t1), &t0);
-  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
-  htrdr_log(htrdr, "Rendering time: %s\n", strbuf);
-
-  /* Gather accum buffers from the group of processes */
-  time_current(&t0);
-  res = mpi_gather_tiles(htrdr, buf, ntiles, &tiles);
-  if(res != RES_OK) goto error;
-  time_sub(&t0, time_current(&t1), &t0);
-  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
-  htrdr_log(htrdr, "Image gathering time: %s\n", strbuf);
-
-exit:
-  { /* Free allocated tiles */
-    struct list_node* node;
-    struct list_node* tmp;
-    LIST_FOR_EACH_SAFE(node, tmp, &tiles) {
-      struct tile* tile = CONTAINER_OF(node, struct tile, node);
-      list_del(node);
-      tile_ref_put(tile);
-    }
-  }
-  proc_work_release(&work);
-  return (res_T)res;
-error:
-  goto exit;
-}
-
diff --git a/src/htrdr_solve.h b/src/htrdr_solve.h
@@ -56,7 +56,7 @@ htrdr_compute_radiance_lw
    const size_t iquad); /* Index of the quadrature point into the band */
 
 extern LOCAL_SYM res_T
-htrdr_draw_radiance_sw
+htrdr_draw_radiance
   (struct htrdr* htrdr,
    const struct htrdr_camera* cam,
    const size_t width, /* Image width */
diff --git a/src/htrdr_sun.c b/src/htrdr_sun.c
@@ -45,14 +45,6 @@ struct htrdr_sun {
 /*******************************************************************************
  * Helper functions
  ******************************************************************************/
-static INLINE int
-cmp_dbl(const void* a, const void* b)
-{
-  const double d0 = *((const double*)a);
-  const double d1 = *((const double*)b);
-  return d0 < d1 ? -1 : (d0 > d1 ? 1 : 0);
-}
-
 static void
 release_sun(ref_T* ref)
 {

	htrdr Solving radiative transfer in heterogeneous media
	git clone git://git.meso-star.fr/htrdr.git
	Log \| Files \| Refs \| README \| LICENSE

M	cmake/CMakeLists.txt	\|	2	+-
M	src/htrdr.c	\|	79	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	src/htrdr.h	\|	3	+++
M	src/htrdr_c.h	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/htrdr_compute_radiance_lw.c	\|	60	------------------------------------------------------------
A	src/htrdr_draw_radiance.c	\|	897	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	src/htrdr_draw_radiance_sw.c	\|	851	-------------------------------------------------------------------------------
M	src/htrdr_solve.h	\|	2	+-
M	src/htrdr_sun.c	\|	8	--------