htrdr

Solving radiative transfer in heterogeneous media
git clone git://git.meso-star.fr/htrdr.git

commit 6f6b95522a6e61a7af9ea6980b75fe8fd26e77cd
parent c7b47ff81cd070a3254159cdc5574528ee77a82f
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Thu,  8 Nov 2018 14:22:05 +0100

Merge branch 'feature_mpi'

Diffstat:
M README.md | 24 ++++++++++++++++++++----
M cmake/CMakeLists.txt | 6 +++++-
M src/htrdr.c | 460 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M src/htrdr.h | 43 ++++++++++++++++++++++++++++++++++++++++---
M src/htrdr_args.c | 18 +++++++++++++++++-
M src/htrdr_args.h | 4 ++++
M src/htrdr_buffer.c | 1 +
M src/htrdr_c.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/htrdr_compute_radiance_sw.c | 3 ++-
M src/htrdr_draw_radiance_sw.c | 742 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M src/htrdr_ground.c | 11 +++++++++++
M src/htrdr_ground.h | 5 +++++
M src/htrdr_main.c | 34 ++++++++++++++++++++++++++++++++++
M src/htrdr_sky.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M src/htrdr_slab.c | 6 +++---
M src/htrdr_solve.h | 5 ++++-
16 files changed, 1407 insertions(+), 195 deletions(-)
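
Note on usage: the README hunk below states that the htrdr binary can be run either directly or through an MPI process launcher such as `mpirun`. A minimal launch sketch follows; the file names are placeholders, only the -a, -c, -g and -e options are taken from the src/htrdr_args.c changes in this merge, and the exact command line is an assumption rather than documented usage.

    # Sketch: 4 MPI ranks, each rank still using its own OpenMP threads.
    #   -a  atmospheric gas properties file
    #   -c  cloud (LES) properties file
    #   -g  OBJ file describing the ground geometry
    #   -e  ground reflectivity in [0, 1] (option added by this merge)
    # File names below are placeholders.
    mpirun -np 4 htrdr -a gas.dat -c clouds.dat -g ground.obj -e 0.3
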

diff --git a/README.md b/README.md @@ -1,7 +1,22 @@ # High-Tune: RenDeRer -This programs is used to test the implementation of radiative transfer -Monte-Carlo algorithms in cloudy atmopsheres. +This program is a part of the [High-Tune](http://www.umr-cnrm.fr/high-tune/) +project: it illustrates the implementation of efficient radiative transfer +Monte-Carlo algorithms in cloudy atmospheres. + +This program implements a rendering algorithm that computes the radiance in the +spectral interval [380, 780] nanometres that reaches an image through a pinhole +camera. The rendered scene is at least composed of an infinite 1D atmosphere +along the Z axis. Optionally, one can add 3D data describing the cloud +properties and/or a geometry describing the ground with a lambertian +reflectivity. The clouds and the ground, can be both infinitely repeated along +the X and Y axis. + +In addition of shared memory parallelism, htrdr supports the [*M*essage +*P*assing *I*nterface](https://www.mpi-forum.org/) specification to +parallelise its computations in a distribute memory environment; the HTRDR +binary can be run either directly or through a MPI process launcher like +`mpirun`. ## How to build @@ -17,8 +32,9 @@ on the [Star-3DAW](https://gitlab.com/meso-star/star-3daw/), [Star-SF](https://gitlab.com/meso-star/star-sf/), [Star-SP](https://gitlab.com/meso-star/stat-sp/) and -[Star-VX](https://gitlab.com/meso-star/star-vx/) libraries and on the -[OpenMP](http://www.openmp.org) 1.2 specification to parallelize its +[Star-VX](https://gitlab.com/meso-star/star-vx/) libraries and on +[OpenMP](http://www.openmp.org) 1.2 and the +[MPI](https://www.mpi-forum.org/) 2.0 specification to parallelize its computations. First ensure that CMake is installed on your system. Then install the RCMake diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -34,11 +34,14 @@ find_package(HTCP REQUIRED) find_package(HTGOP REQUIRED) find_package(HTMIE REQUIRED) find_package(OpenMP 1.2 REQUIRED) +find_package(MPI 2 REQUIRED) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${RCMAKE_SOURCE_DIR}) include(rcmake) include(rcmake_runtime) +set(CMAKE_C_COMPILER ${MPI_C_COMPILER}) + include_directories( ${RSys_INCLUDE_DIR} ${Star3D_INCLUDE_DIR} @@ -48,7 +51,8 @@ include_directories( ${StarVX_INCLUDE_DIR} ${HTCP_INCLUDE_DIR} ${HTGOP_INCLUDE_DIR} - ${HTMIE_INCLUDE_DIR}) + ${HTMIE_INCLUDE_DIR} + ${MPI_INCLUDE_PATH}) ################################################################################ # Configure and define targets diff --git a/src/htrdr.c b/src/htrdr.c @@ -25,7 +25,7 @@ #include "htrdr_sun.h" #include "htrdr_solve.h" -#include <rsys/clock_time.h> +#include <rsys/cstr.h> #include <rsys/mem_allocator.h> #include <rsys/str.h> @@ -39,10 +39,12 @@ #include <stdarg.h> #include <stdio.h> #include <unistd.h> +#include <time.h> #include <sys/time.h> /* timespec */ #include <sys/stat.h> /* S_IRUSR & S_IWUSR */ #include <omp.h> +#include <mpi.h> /******************************************************************************* * Helper functions @@ -243,6 +245,185 @@ spherical_to_cartesian_dir dir[2] = sin_elevation; } +static void +release_mpi(struct htrdr* htrdr) +{ + ASSERT(htrdr); + if(htrdr->mpi_working_procs) { + MEM_RM(htrdr->allocator, htrdr->mpi_working_procs); + htrdr->mpi_working_procs = NULL; + } + if(htrdr->mpi_progress_octree) { + MEM_RM(htrdr->allocator, htrdr->mpi_progress_octree); + htrdr->mpi_progress_octree = NULL; + } + if(htrdr->mpi_progress_render) { + MEM_RM(htrdr->allocator, htrdr->mpi_progress_render); + 
htrdr->mpi_progress_render = NULL; + } + if(htrdr->mpi_err_str) { + MEM_RM(htrdr->allocator, htrdr->mpi_err_str); + htrdr->mpi_err_str = NULL; + } + if(htrdr->mpi_mutex) { + mutex_destroy(htrdr->mpi_mutex); + htrdr->mpi_mutex = NULL; + } +} + +static res_T +mpi_print_proc_info(struct htrdr* htrdr) +{ + char proc_name[MPI_MAX_PROCESSOR_NAME]; + int proc_name_len; + char* proc_names = NULL; + uint32_t* proc_nthreads = NULL; + uint32_t nthreads = 0; + int iproc; + res_T res = RES_OK; + ASSERT(htrdr); + + if(htrdr->mpi_rank == 0) { + proc_names = MEM_CALLOC(htrdr->allocator, (size_t)htrdr->mpi_nprocs, + MPI_MAX_PROCESSOR_NAME*sizeof(*proc_names)); + if(!proc_names) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the temporary memory for MPI process names -- " + "%s.\n", res_to_cstr(res)); + goto error; + } + + proc_nthreads = MEM_CALLOC(htrdr->allocator, (size_t)htrdr->mpi_nprocs, + sizeof(*proc_nthreads)); + if(!proc_nthreads) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the temporary memory for the #threads of the MPI " + "processes -- %s.\n", res_to_cstr(res)); + goto error; + } + } + + /* Gather process name */ + MPI(Get_processor_name(proc_name, &proc_name_len)); + MPI(Gather(proc_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, proc_names, + MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD)); + + /* Gather process #threads */ + nthreads = (uint32_t)htrdr->nthreads; + MPI(Gather(&nthreads, 1, MPI_UINT32_T, proc_nthreads, 1, MPI_UINT32_T, 0, + MPI_COMM_WORLD)); + + if(htrdr->mpi_rank == 0) { + FOR_EACH(iproc, 0, htrdr->mpi_nprocs) { + htrdr_log(htrdr, "Process %d -- %s; #threads: %u\n", + iproc, proc_names + iproc*MPI_MAX_PROCESSOR_NAME, proc_nthreads[iproc]); + } + } + +exit: + if(proc_names) MEM_RM(htrdr->allocator, proc_names); + if(proc_nthreads) MEM_RM(htrdr->allocator, proc_nthreads); + return res; +error: + goto exit; +} + +static res_T +init_mpi(struct htrdr* htrdr) +{ + size_t n; + int err; + res_T res = RES_OK; + ASSERT(htrdr); + + htrdr->mpi_err_str = MEM_CALLOC + (htrdr->allocator, htrdr->nthreads, MPI_MAX_ERROR_STRING); + if(!htrdr->mpi_err_str) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the MPI error strings -- %s.\n", + res_to_cstr(res)); + goto error; + } + + err = MPI_Comm_rank(MPI_COMM_WORLD, &htrdr->mpi_rank); + if(err != MPI_SUCCESS) { + htrdr_log_err(htrdr, + "could not determine the MPI rank of the calling process -- %s.\n", + htrdr_mpi_error_string(htrdr, err)); + res = RES_UNKNOWN_ERR; + goto error; + } + + err = MPI_Comm_size(MPI_COMM_WORLD, &htrdr->mpi_nprocs); + if(err != MPI_SUCCESS) { + htrdr_log_err(htrdr, + "could retrieve the size of the MPI group -- %s.\n", + htrdr_mpi_error_string(htrdr, err)); + res = RES_UNKNOWN_ERR; + goto error; + } + + htrdr->mpi_working_procs = MEM_CALLOC(htrdr->allocator, + (size_t)htrdr->mpi_nprocs, sizeof(*htrdr->mpi_working_procs)); + if(!htrdr->mpi_working_procs) { + htrdr_log_err(htrdr, + "could not allocate the list of working processes.\n"); + res = RES_MEM_ERR; + goto error; + } + + /* Initialy, all the processes are working */ + htrdr->mpi_nworking_procs = (size_t)htrdr->mpi_nprocs; + memset(htrdr->mpi_working_procs, 0xFF, + htrdr->mpi_nworking_procs*sizeof(*htrdr->mpi_working_procs)); + + /* Allocate #processes progress statuses on the master process and only 1 + * progress status on the other ones: the master process will gather the + * status of the other processes to report their progression. */ + n = (size_t)(htrdr->mpi_rank == 0 ? 
htrdr->mpi_nprocs : 1); + + htrdr->mpi_progress_octree = MEM_CALLOC + (htrdr->allocator, n, sizeof(*htrdr->mpi_progress_octree)); + if(!htrdr->mpi_progress_octree) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the progress state of the octree building -- %s.\n", + res_to_cstr(res)); + goto error; + } + + htrdr->mpi_progress_render = MEM_CALLOC + (htrdr->allocator, n, sizeof(*htrdr->mpi_progress_render)); + if(!htrdr->mpi_progress_render) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the progress state of the scene rendering -- %s.\n", + res_to_cstr(res)); + goto error; + } + + htrdr->mpi_mutex = mutex_create(); + if(!htrdr->mpi_mutex) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not create the mutex to protect MPI calls from concurrent " + "threads -- %s.\n", res_to_cstr(res)); + goto error; + } + + if(htrdr->mpi_nprocs != 1) + mpi_print_proc_info(htrdr); + +exit: + return res; +error: + release_mpi(htrdr); + goto exit; +} + /******************************************************************************* * Local functions ******************************************************************************/ @@ -271,9 +452,20 @@ htrdr_init str_init(htrdr->allocator, &htrdr->output_name); htrdr->dump_vtk = args->dump_vtk; + htrdr->cache_grids = args->cache_grids; htrdr->verbose = args->verbose; htrdr->nthreads = MMIN(args->nthreads, (unsigned)omp_get_num_procs()); htrdr->spp = args->image.spp; + htrdr->width = args->image.definition[0]; + htrdr->height = args->image.definition[1]; + + res = init_mpi(htrdr); + if(res != RES_OK) goto error; + + if(htrdr->cache_grids && htrdr->mpi_nprocs != 1) { + htrdr_log_warn(htrdr, "cached grids are not supported in a MPI execution.\n"); + htrdr->cache_grids = 0; + } if(!args->output) { htrdr->output = stdout; @@ -287,14 +479,17 @@ htrdr_init res = str_set(&htrdr->output_name, output_name); if(res != RES_OK) { htrdr_log_err(htrdr, - "could not store the name of the output stream `%s'.\n", output_name); + "%s: could not store the name of the output stream `%s' -- %s.\n", + FUNC_NAME, output_name, res_to_cstr(res)); goto error; } res = svx_device_create (&htrdr->logger, htrdr->allocator, args->verbose, &htrdr->svx); if(res != RES_OK) { - htrdr_log_err(htrdr, "could not create the Star-VoXel device.\n"); + htrdr_log_err(htrdr, + "%s: could not create the Star-VoXel device -- %s.\n", + FUNC_NAME, res_to_cstr(res)); goto error; } @@ -304,12 +499,14 @@ htrdr_init res = s3d_device_create (&htrdr->logger, htrdr->allocator, 0, &htrdr->s3d); if(res != RES_OK) { - htrdr_log_err(htrdr, "could not create the Star-3D device.\n"); + htrdr_log_err(htrdr, + "%s: could not create the Star-3D device -- %s.\n", + FUNC_NAME, res_to_cstr(res)); goto error; } - res = htrdr_ground_create - (htrdr, args->filename_obj, args->repeat_ground, &htrdr->ground); + res = htrdr_ground_create(htrdr, args->filename_obj, + args->ground_reflectivity, args->repeat_ground, &htrdr->ground); if(res != RES_OK) goto error; proj_ratio = @@ -319,14 +516,18 @@ htrdr_init args->camera.up, proj_ratio, MDEG2RAD(args->camera.fov_y), &htrdr->cam); if(res != RES_OK) goto error; - res = htrdr_buffer_create(htrdr, - args->image.definition[0], /* Width */ - args->image.definition[1], /* Height */ - args->image.definition[0]*sizeof(struct htrdr_accum[3]), /* Pitch */ - sizeof(struct htrdr_accum[3]), - ALIGNOF(struct htrdr_accum[3]), /* Alignment */ - &htrdr->buf); - if(res != RES_OK) goto error; + /* Create the image buffer only on the master process; the image parts 
+ * rendered by the processes are gathered onto the master process. */ + if(!htrdr->dump_vtk && htrdr->mpi_rank == 0) { + res = htrdr_buffer_create(htrdr, + args->image.definition[0], /* Width */ + args->image.definition[1], /* Height */ + args->image.definition[0]*sizeof(struct htrdr_accum[3]), /* Pitch */ + sizeof(struct htrdr_accum[3]), + ALIGNOF(struct htrdr_accum[3]), /* Alignment */ + &htrdr->buf); + if(res != RES_OK) goto error; + } res = htrdr_sun_create(htrdr, &htrdr->sun); if(res != RES_OK) goto error; @@ -342,24 +543,25 @@ htrdr_init htrdr->lifo_allocators = MEM_CALLOC (htrdr->allocator, htrdr->nthreads, sizeof(*htrdr->lifo_allocators)); if(!htrdr->lifo_allocators) { - htrdr_log_err(htrdr, - "could not allocate the list of per thread LIFO allocator.\n"); res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "%s: could not allocate the list of per thread LIFO allocator -- %s.\n", + FUNC_NAME, res_to_cstr(res)); goto error; } FOR_EACH(ithread, 0, htrdr->nthreads) { res = mem_init_lifo_allocator - (&htrdr->lifo_allocators[ithread], htrdr->allocator, 4096); + (&htrdr->lifo_allocators[ithread], htrdr->allocator, 16384); if(res != RES_OK) { htrdr_log_err(htrdr, - "could not initialise the LIFO allocator of the thread %lu.\n", - (unsigned long)ithread); + "%s: could not initialise the LIFO allocator of the thread %lu -- %s.\n", + FUNC_NAME, (unsigned long)ithread, res_to_cstr(res)); goto error; } } - exit: +exit: return res; error: htrdr_release(htrdr); @@ -370,6 +572,7 @@ void htrdr_release(struct htrdr* htrdr) { ASSERT(htrdr); + release_mpi(htrdr); if(htrdr->s3d) S3D(device_ref_put(htrdr->s3d)); if(htrdr->svx) SVX(device_ref_put(htrdr->svx)); if(htrdr->ground) htrdr_ground_ref_put(htrdr->ground); @@ -395,6 +598,10 @@ htrdr_run(struct htrdr* htrdr) if(htrdr->dump_vtk) { const size_t nbands = htrdr_sky_get_sw_spectral_bands_count(htrdr->sky); size_t i; + + /* Nothing to do */ + if(htrdr->mpi_rank != 0) goto exit; + FOR_EACH(i, 0, nbands) { const size_t iband = htrdr_sky_get_sw_spectral_band_id(htrdr->sky, i); const size_t nquads = htrdr_sky_get_sw_spectral_band_quadrature_length @@ -407,19 +614,14 @@ htrdr_run(struct htrdr* htrdr) } } } else { - struct time t0, t1; - char buf[128]; - - time_current(&t0); - res = htrdr_draw_radiance_sw(htrdr, htrdr->cam, htrdr->spp, htrdr->buf); - if(res != RES_OK) goto error; - time_sub(&t0, time_current(&t1), &t0); - time_dump(&t0, TIME_ALL, NULL, buf, sizeof(buf)); - htrdr_log(htrdr, "Rendering time: %s\n", buf); - - res = dump_accum_buffer - (htrdr, htrdr->buf, str_cget(&htrdr->output_name), htrdr->output); + res = htrdr_draw_radiance_sw(htrdr, htrdr->cam, htrdr->width, + htrdr->height, htrdr->spp, htrdr->buf); if(res != RES_OK) goto error; + if(htrdr->mpi_rank == 0) { + res = dump_accum_buffer + (htrdr, htrdr->buf, str_cget(&htrdr->output_name), htrdr->output); + if(res != RES_OK) goto error; + } } exit: return res; @@ -430,11 +632,14 @@ error: void htrdr_log(struct htrdr* htrdr, const char* msg, ...) { - va_list vargs_list; ASSERT(htrdr && msg); - va_start(vargs_list, msg); - log_msg(htrdr, LOG_OUTPUT, msg, vargs_list); - va_end(vargs_list); + /* Log standard message only on master process */ + if(htrdr->mpi_rank == 0) { + va_list vargs_list; + va_start(vargs_list, msg); + log_msg(htrdr, LOG_OUTPUT, msg, vargs_list); + va_end(vargs_list); + } } void @@ -442,6 +647,7 @@ htrdr_log_err(struct htrdr* htrdr, const char* msg, ...) 
{ va_list vargs_list; ASSERT(htrdr && msg); + /* Log errors on all processes */ va_start(vargs_list, msg); log_msg(htrdr, LOG_ERROR, msg, vargs_list); va_end(vargs_list); @@ -450,11 +656,48 @@ htrdr_log_err(struct htrdr* htrdr, const char* msg, ...) void htrdr_log_warn(struct htrdr* htrdr, const char* msg, ...) { - va_list vargs_list; ASSERT(htrdr && msg); - va_start(vargs_list, msg); - log_msg(htrdr, LOG_WARNING, msg, vargs_list); - va_end(vargs_list); + /* Log warnings only on master process */ + if(htrdr->mpi_rank == 0) { + va_list vargs_list; + va_start(vargs_list, msg); + log_msg(htrdr, LOG_WARNING, msg, vargs_list); + va_end(vargs_list); + } +} + +const char* +htrdr_mpi_error_string(struct htrdr* htrdr, const int mpi_err) +{ + const int ithread = omp_get_thread_num(); + char* str; + int strlen_err; + int err; + ASSERT(htrdr && (size_t)ithread < htrdr->nthreads); + str = htrdr->mpi_err_str + ithread*MPI_MAX_ERROR_STRING; + err = MPI_Error_string(mpi_err, str, &strlen_err); + return err == MPI_SUCCESS ? str : "Invalid MPI error"; +} + +void +htrdr_fprintf(struct htrdr* htrdr, FILE* stream, const char* msg, ...) +{ + ASSERT(htrdr && msg); + if(htrdr->mpi_rank == 0) { + va_list vargs_list; + va_start(vargs_list, msg); + vfprintf(stream, msg, vargs_list); + va_end(vargs_list); + } +} + +void +htrdr_fflush(struct htrdr* htrdr, FILE* stream) +{ + ASSERT(htrdr); + if(htrdr->mpi_rank == 0) { + fflush(stream); + } } /******************************************************************************* @@ -642,3 +885,140 @@ exit: error: goto exit; } + +void +send_mpi_progress + (struct htrdr* htrdr, const enum htrdr_mpi_message msg, const int32_t percent) +{ + ASSERT(htrdr); + ASSERT(msg == HTRDR_MPI_PROGRESS_RENDERING + || msg == HTRDR_MPI_PROGRESS_BUILD_OCTREE); + (void)htrdr; + mutex_lock(htrdr->mpi_mutex); + MPI(Send(&percent, 1, MPI_INT32_T, 0, msg, MPI_COMM_WORLD)); + mutex_unlock(htrdr->mpi_mutex); +} + +void +fetch_mpi_progress(struct htrdr* htrdr, const enum htrdr_mpi_message msg) +{ + struct timespec t; + int32_t* progress = NULL; + int iproc; + ASSERT(htrdr && htrdr->mpi_rank == 0); + + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + + switch(msg) { + case HTRDR_MPI_PROGRESS_BUILD_OCTREE: + progress = htrdr->mpi_progress_octree; + break; + case HTRDR_MPI_PROGRESS_RENDERING: + progress = htrdr->mpi_progress_render; + break; + default: FATAL("Unreachable code.\n"); break; + } + + FOR_EACH(iproc, 1, htrdr->mpi_nprocs) { + /* Flush the last sent percentage of the process `iproc' */ + for(;;) { + MPI_Request req; + int flag; + int complete; + + mutex_lock(htrdr->mpi_mutex); + MPI(Iprobe(iproc, msg, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE)); + mutex_unlock(htrdr->mpi_mutex); + + if(flag == 0) break; /* No more message */ + + mutex_lock(htrdr->mpi_mutex); + MPI(Irecv(&progress[iproc], 1, MPI_INT32_T, iproc, msg, MPI_COMM_WORLD, &req)); + mutex_unlock(htrdr->mpi_mutex); + for(;;) { + mutex_lock(htrdr->mpi_mutex); + MPI(Test(&req, &complete, MPI_STATUS_IGNORE)); + mutex_unlock(htrdr->mpi_mutex); + if(complete) break; + nanosleep(&t, NULL); + } + } + } +} + +void +print_mpi_progress(struct htrdr* htrdr, const enum htrdr_mpi_message msg) +{ + ASSERT(htrdr && htrdr->mpi_rank == 0); + + if(htrdr->mpi_nprocs == 1) { + switch(msg) { + case HTRDR_MPI_PROGRESS_BUILD_OCTREE: + htrdr_fprintf(htrdr, stderr, "\033[2K\rBuilding octree: %3d%%", + htrdr->mpi_progress_octree[0]); + break; + case HTRDR_MPI_PROGRESS_RENDERING: + htrdr_fprintf(htrdr, stderr, "\033[2K\rRendering: %3d%%", + 
htrdr->mpi_progress_render[0]); + break; + default: FATAL("Unreachable code.\n"); break; + } + htrdr_fflush(htrdr, stderr); + } else { + int iproc; + FOR_EACH(iproc, 0, htrdr->mpi_nprocs) { + switch(msg) { + case HTRDR_MPI_PROGRESS_BUILD_OCTREE: + htrdr_fprintf(htrdr, stderr, + "\033[2K\rProcess %d -- building octree: %3d%%%c", + iproc, htrdr->mpi_progress_octree[iproc], + iproc == htrdr->mpi_nprocs - 1 ? '\r' : '\n'); + break; + case HTRDR_MPI_PROGRESS_RENDERING: + htrdr_fprintf(htrdr, stderr, + "\033[2K\rProcess %d -- rendering: %3d%%%c", + iproc, htrdr->mpi_progress_render[iproc], + iproc == htrdr->mpi_nprocs - 1 ? '\r' : '\n'); + break; + default: FATAL("Unreachable code.\n"); break; + } + } + } +} + +void +clear_mpi_progress(struct htrdr* htrdr, const enum htrdr_mpi_message msg) +{ + ASSERT(htrdr); + (void)msg; + if(htrdr->mpi_nprocs > 1) { + htrdr_fprintf(htrdr, stderr, "\033[%dA", htrdr->mpi_nprocs-1); + } +} + +int +total_mpi_progress(const struct htrdr* htrdr, const enum htrdr_mpi_message msg) +{ + const int* progress = NULL; + int total = 0; + int iproc; + ASSERT(htrdr && htrdr->mpi_rank == 0); + + switch(msg) { + case HTRDR_MPI_PROGRESS_BUILD_OCTREE: + progress = htrdr->mpi_progress_octree; + break; + case HTRDR_MPI_PROGRESS_RENDERING: + progress = htrdr->mpi_progress_render; + break; + default: FATAL("Unreachable code.\n"); break; + } + + FOR_EACH(iproc, 0, htrdr->mpi_nprocs) { + total += progress[iproc]; + } + total = total / htrdr->mpi_nprocs; + return total; +} + diff --git a/src/htrdr.h b/src/htrdr.h @@ -35,6 +35,7 @@ struct htrdr_buffer; struct htrdr_sky; struct htrdr_rectangle; struct mem_allocator; +struct mutext; struct s3d_device; struct s3d_scene; struct ssf_bsdf; @@ -52,13 +53,28 @@ struct htrdr { struct htrdr_camera* cam; struct htrdr_buffer* buf; size_t spp; /* #samples per pixel */ + size_t width; /* Image width */ + size_t height; /* Image height */ FILE* output; struct str output_name; - unsigned nthreads; - int dump_vtk; - int verbose; + unsigned nthreads; /* #threads of the process */ + int dump_vtk; /* Dump octree VTK */ + int cache_grids; /* Use/Precompute grid caches */ + int verbose; /* Verbosity level */ + + int mpi_rank; /* Rank of the process in the MPI group */ + int mpi_nprocs; /* Overall #processes in the MPI group */ + char* mpi_err_str; /* Temp buffer used to store MPI error string */ + int8_t* mpi_working_procs; /* Define the rank of active processes */ + size_t mpi_nworking_procs; + + /* Process progress percentage */ + int32_t* mpi_progress_octree; + int32_t* mpi_progress_render; + + struct mutex* mpi_mutex; /* Protect MPI calls from concurrent threads */ struct logger logger; struct mem_allocator* allocator; @@ -109,5 +125,26 @@ htrdr_log_warn #endif ; +extern LOCAL_SYM const char* +htrdr_mpi_error_string + (struct htrdr* htrdr, + const int mpi_err); + +extern LOCAL_SYM void +htrdr_fprintf + (struct htrdr* htrdr, + FILE* stream, + const char* msg, + ...) +#ifdef COMPILER_GCC + __attribute((format(printf, 3, 4))) +#endif + ; + +extern LOCAL_SYM void +htrdr_fflush + (struct htrdr* htrdr, + FILE* stream); + #endif /* HTRDR_H */ diff --git a/src/htrdr_args.c b/src/htrdr_args.c @@ -46,10 +46,19 @@ print_help(const char* cmd) printf( " -d dump octree data to OUTPUT wrt the VTK ASCII file format.\n"); printf( +" -e ground reflectivity in [0, 1]. 
By default its value is `%g'.\n", + HTRDR_ARGS_DEFAULT.ground_reflectivity); + printf( " -f overwrite the OUTPUT file if it already exists.\n"); printf( " -g FILENAME path of an OBJ file representing the ground geometry.\n"); printf( +" -G precompute/use grids of raw cloud opitical properties built\n" +" from the HTCP file, the atmospheric profile and the Mie's\n" +" data. If the corresponding grids were generated in a\n" +" previous run, reuse them as far as it is possible, i.e. if\n" +" the HTCP, Mie and atmospheric files were not updated.\n"); + printf( " -h display this help and exit.\n"); printf( " -i <image> define the image to compute.\n"); @@ -320,7 +329,7 @@ htrdr_args_init(struct htrdr_args* args, int argc, char** argv) *args = HTRDR_ARGS_DEFAULT; - while((opt = getopt(argc, argv, "a:C:c:D:dfg:hi:m:o:RrT:t:v")) != -1) { + while((opt = getopt(argc, argv, "a:C:c:D:de:fGg:hi:m:o:RrT:t:v")) != -1) { switch(opt) { case 'a': args->filename_gas = optarg; break; case 'C': @@ -330,7 +339,14 @@ htrdr_args_init(struct htrdr_args* args, int argc, char** argv) case 'c': args->filename_les = optarg; break; case 'D': res = parse_sun_dir(args, optarg); break; case 'd': args->dump_vtk = 1; break; + case 'e': + res = cstr_to_double(optarg, &args->ground_reflectivity); + if(args->ground_reflectivity < 0 || args->ground_reflectivity > 1) { + res = RES_BAD_ARG; + } + break; case 'f': args->force_overwriting = 1; break; + case 'G': args->cache_grids = 1; break; case 'g': args->filename_obj = optarg; break; case 'h': print_help(argv[0]); diff --git a/src/htrdr_args.h b/src/htrdr_args.h @@ -47,10 +47,12 @@ struct htrdr_args { double sun_azimuth; /* In degrees */ double sun_elevation; /* In degrees */ double optical_thickness; /* Threshold used during octree building */ + double ground_reflectivity; /* Reflectivity of the ground */ unsigned nthreads; /* Hint on the number of threads to use */ int force_overwriting; int dump_vtk; /* Dump the loaded cloud properties in a VTK file */ + int cache_grids; /* Use grid caching mechanism */ int verbose; /* Verbosity level */ int repeat_clouds; /* Make the clouds infinite in X and Y */ int repeat_ground; /* Make the ground infinite in X and Y */ @@ -80,9 +82,11 @@ struct htrdr_args { 0, /* Sun azimuth */ \ 90, /* Sun elevation */ \ 1.0, /* Optical thickness */ \ + 0.5, /* Ground reflectivity */ \ (unsigned)~0, /* #threads */ \ 0, /* Force overwriting */ \ 0, /* dump VTK */ \ + 0, /* Grid cache */ \ 0, /* Verbose flag */ \ 0, /* Repeat clouds */ \ 0, /* Repeat ground */ \ diff --git a/src/htrdr_buffer.c b/src/htrdr_buffer.c @@ -163,3 +163,4 @@ htrdr_buffer_at(struct htrdr_buffer* buf, const size_t x, const size_t y) ASSERT(buf && x < buf->width && y < buf->height); return buf->mem + y*buf->pitch + x*buf->elmtsz; } + diff --git a/src/htrdr_c.h b/src/htrdr_c.h @@ -18,6 +18,20 @@ #include <rsys/rsys.h> +#ifndef NDEBUG + #define MPI(Func) ASSERT(MPI_##Func == MPI_SUCCESS) +#else + #define MPI(Func) MPI_##Func +#endif + +enum htrdr_mpi_message { + HTRDR_MPI_PROGRESS_BUILD_OCTREE, + HTRDR_MPI_PROGRESS_RENDERING, + HTRDR_MPI_STEAL_REQUEST, + HTRDR_MPI_WORK_STEALING, + HTRDR_MPI_TILE_DATA +}; + struct htrdr; #define SW_WAVELENGTH_MIN 380 /* In nanometer */ @@ -96,7 +110,7 @@ is_file_updated extern LOCAL_SYM res_T update_file_stamp - (struct htrdr* htrdr, + (struct htrdr* htrdr, const char* filename); extern LOCAL_SYM res_T @@ -104,5 +118,40 @@ create_directory (struct htrdr* htrdt, const char* path); +extern LOCAL_SYM void +send_mpi_progress + (struct htrdr* htrdr, + const 
enum htrdr_mpi_message progress, + const int32_t percent); + +extern LOCAL_SYM void +fetch_mpi_progress + (struct htrdr* htrdr, + const enum htrdr_mpi_message progress); + +extern LOCAL_SYM void +print_mpi_progress + (struct htrdr* htrdr, + const enum htrdr_mpi_message progress); + +extern LOCAL_SYM void +clear_mpi_progress + (struct htrdr* htrdr, + const enum htrdr_mpi_message progress); + +extern int32_t +total_mpi_progress + (const struct htrdr* htrdr, + const enum htrdr_mpi_message progress); + +static INLINE void +update_mpi_progress(struct htrdr* htrdr, const enum htrdr_mpi_message progress) +{ + ASSERT(htrdr); + fetch_mpi_progress(htrdr, progress); + clear_mpi_progress(htrdr, progress); + print_mpi_progress(htrdr, progress); +} + #endif /* HTRDR_C_H */ diff --git a/src/htrdr_compute_radiance_sw.c b/src/htrdr_compute_radiance_sw.c @@ -285,7 +285,8 @@ htrdr_compute_radiance_sw CHK(RES_OK == ssf_phase_create (&htrdr->lifo_allocators[ithread], &ssf_phase_rayleigh, &phase_rayleigh)); - SSF(lambertian_reflection_setup(bsdf, 0.5)); + SSF(lambertian_reflection_setup + (bsdf, htrdr_ground_get_reflectivity(htrdr->ground))); /* Setup the phase function for this spectral band & quadrature point */ g = htrdr_sky_fetch_particle_phase_function_asymmetry_parameter diff --git a/src/htrdr_draw_radiance_sw.c b/src/htrdr_draw_radiance_sw.c @@ -20,14 +20,44 @@ #include "htrdr_sky.h" #include "htrdr_solve.h" +#include <rsys/clock_time.h> +#include <rsys/cstr.h> +#include <rsys/dynamic_array_u32.h> +#include <rsys/math.h> +#include <rsys/mutex.h> #include <star/ssp.h> #include <omp.h> -#include <rsys/math.h> +#include <mpi.h> +#include <time.h> +#include <unistd.h> + +#define RNG_SEQUENCE_SIZE 10000 -#define TILE_SIZE 32 /* definition in X & Y of a tile */ +#define TILE_MCODE_NULL UINT32_MAX +#define TILE_SIZE 32 /* Definition in X & Y of a tile */ STATIC_ASSERT(IS_POW2(TILE_SIZE), TILE_SIZE_must_be_a_power_of_2); +/* Tile of row ordered image pixels */ +struct tile { + struct list_node node; + struct mem_allocator* allocator; + ref_T ref; + + struct tile_data { + uint16_t x, y; /* 2D coordinates of the tile in tile space */ + /* Simulate the flexible array member of the C99 standard. */ + struct htrdr_accum accums[1/*dummy element*/]; + } data; +}; + +/* List of tile to compute onto the MPI process. 
*/ +struct proc_work { + struct mutex* mutex; + struct darray_u32 tiles; /* #tiles to render */ + size_t itile; /* Next tile to render in the above list of tiles */ +}; + /******************************************************************************* * Helper functions ******************************************************************************/ @@ -42,6 +72,378 @@ morton2D_decode(const uint32_t u32) return (uint16_t)x; } +static FINLINE uint32_t +morton2D_encode(const uint16_t u16) +{ + uint32_t u32 = u16; + u32 = (u32 | (u32 << 8)) & 0x00FF00FF; + u32 = (u32 | (u32 << 4)) & 0X0F0F0F0F; + u32 = (u32 | (u32 << 2)) & 0x33333333; + u32 = (u32 | (u32 << 1)) & 0x55555555; + return u32; +} + +static FINLINE struct tile* +tile_create(struct mem_allocator* allocator) +{ + struct tile* tile; + const size_t tile_sz = + sizeof(struct tile) - sizeof(struct htrdr_accum)/*rm dummy accum*/; + const size_t buf_sz = /* Flexiblbe array element */ + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/; + ASSERT(allocator); + + tile = MEM_ALLOC(allocator, tile_sz+buf_sz); + if(!tile) return NULL; + + ref_init(&tile->ref); + list_init(&tile->node); + tile->allocator = allocator; + ASSERT(IS_ALIGNED(&tile->data.accums, ALIGNOF(struct htrdr_accum))); + + return tile; +} + +static INLINE void +tile_ref_get(struct tile* tile) +{ + ASSERT(tile); + tile_ref_get(tile); +} + +static INLINE void +release_tile(ref_T* ref) +{ + struct tile* tile = CONTAINER_OF(ref, struct tile, ref); + ASSERT(ref); + MEM_RM(tile->allocator, tile); +} + +static INLINE void +tile_ref_put(struct tile* tile) +{ + ASSERT(tile); + ref_put(&tile->ref, release_tile); +} + +static FINLINE struct htrdr_accum* +tile_at + (struct tile* tile, + const size_t x, /* In tile space */ + const size_t y) /* In tile space */ +{ + ASSERT(tile && x < TILE_SIZE && y < TILE_SIZE); + return tile->data.accums + (y*TILE_SIZE + x)*3/*#channels*/; +} + +static void +write_tile_data(struct htrdr_buffer* buf, const struct tile_data* tile_data) +{ + struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL; + size_t icol, irow; + size_t irow_tile; + size_t ncols_tile, nrows_tile; + char* buf_mem; + ASSERT(buf && tile_data); + + htrdr_buffer_get_layout(buf, &layout); + buf_mem = htrdr_buffer_get_data(buf); + + /* Compute the row/column of the tile origin into the buffer */ + icol = tile_data->x * (size_t)TILE_SIZE; + irow = tile_data->y * (size_t)TILE_SIZE; + + /* Define the number of tile row/columns to write into the buffer */ + ncols_tile = MMIN(icol + TILE_SIZE, layout.width) - icol; + nrows_tile = MMIN(irow + TILE_SIZE, layout.height) - irow; + + /* Copy the tile data, row by row */ + FOR_EACH(irow_tile, 0, nrows_tile) { + char* buf_row = buf_mem + (irow + irow_tile) * layout.pitch; + const struct htrdr_accum* tile_row = + tile_data->accums + irow_tile*TILE_SIZE*3/*#channels*/; + memcpy(buf_row + icol*sizeof(struct htrdr_accum)*3, tile_row, + ncols_tile*sizeof(struct htrdr_accum)*3/*#channels*/); + } +} + +static INLINE void +proc_work_init(struct mem_allocator* allocator, struct proc_work* work) +{ + ASSERT(work); + darray_u32_init(allocator, &work->tiles); + work->itile = 0; + CHK(work->mutex = mutex_create()); +} + +static INLINE void +proc_work_release(struct proc_work* work) +{ + darray_u32_release(&work->tiles); + mutex_destroy(work->mutex); +} + +static INLINE void +proc_work_reset(struct proc_work* work) +{ + ASSERT(work); + mutex_lock(work->mutex); + darray_u32_clear(&work->tiles); + work->itile = 0; + mutex_unlock(work->mutex); +} + +static 
INLINE void +proc_work_add_tile(struct proc_work* work, const uint32_t mcode) +{ + mutex_lock(work->mutex); + CHK(darray_u32_push_back(&work->tiles, &mcode) == RES_OK); + mutex_unlock(work->mutex); +} + +static INLINE uint32_t +proc_work_get_tile(struct proc_work* work) +{ + uint32_t mcode; + ASSERT(work); + mutex_lock(work->mutex); + if(work->itile >= darray_u32_size_get(&work->tiles)) { + mcode = TILE_MCODE_NULL; + } else { + mcode = darray_u32_cdata_get(&work->tiles)[work->itile]; + ++work->itile; + } + mutex_unlock(work->mutex); + return mcode; +} + +static INLINE size_t +proc_work_get_ntiles(struct proc_work* work) +{ + size_t sz = 0; + ASSERT(work); + mutex_lock(work->mutex); + sz = darray_u32_size_get(&work->tiles); + mutex_unlock(work->mutex); + return sz; +} + +static void +mpi_wait_for_request(struct htrdr* htrdr, MPI_Request* req) +{ + ASSERT(htrdr && req); + + /* Wait for process synchronisation */ + for(;;) { + struct timespec t; + int complete; + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + + mutex_lock(htrdr->mpi_mutex); + MPI(Test(req, &complete, MPI_STATUS_IGNORE)); + mutex_unlock(htrdr->mpi_mutex); + if(complete) break; + + nanosleep(&t, NULL); + } +} + +static void +mpi_probe_thieves + (struct htrdr* htrdr, + struct proc_work* work, + ATOMIC* probe_thieves) +{ + uint32_t tiles[UINT8_MAX]; + struct timespec t; + ASSERT(htrdr && work && probe_thieves); + + if(htrdr->mpi_nprocs == 1) /* The process is alone. No thief is possible */ + return; + + t.tv_sec = 0; + + /* Protect MPI calls of multiple invocations from concurrent threads */ + #define P_MPI(Func) { \ + mutex_lock(htrdr->mpi_mutex); \ + MPI(Func); \ + mutex_unlock(htrdr->mpi_mutex); \ + } (void)0 + + while(ATOMIC_GET(probe_thieves)) { + MPI_Status status; + size_t itile; + int msg; + + /* Probe if a steal request was submitted by any processes */ + P_MPI(Iprobe(MPI_ANY_SOURCE, HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &msg, + &status)); + + if(msg) { /* A steal request was posted */ + MPI_Request req; + uint8_t ntiles_to_steal; + + /* Asynchronously receive the steal request */ + P_MPI(Irecv(&ntiles_to_steal, 1, MPI_UINT8_T, status.MPI_SOURCE, + HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD, &req)); + + /* Wait for the completion of the steal request */ + mpi_wait_for_request(htrdr, &req); + + /* Thief some tiles */ + FOR_EACH(itile, 0, ntiles_to_steal) { + tiles[itile] = proc_work_get_tile(work); + } + P_MPI(Send(&tiles, ntiles_to_steal, MPI_UINT32_T, status.MPI_SOURCE, + HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD)); + } + t.tv_nsec = 500000000; /* 500ms */ + nanosleep(&t, NULL); + } + #undef P_MPI +} + +static int +mpi_sample_working_process(struct htrdr* htrdr, struct ssp_rng* rng) +{ + int iproc, i; + int dst_rank; + ASSERT(htrdr && rng && htrdr->mpi_nworking_procs); + + /* Sample the index of the 1st active process */ + iproc = (int)(ssp_rng_canonical(rng) * (double)htrdr->mpi_nworking_procs); + + /* Find the rank of the sampled active process. Use a simple linear search + * since the overall number of processes should be quite low; at most few + * dozens. 
*/ + i = 0; + FOR_EACH(dst_rank, 0, htrdr->mpi_nprocs) { + if(htrdr->mpi_working_procs[dst_rank] == 0) continue; /* Inactive process */ + if(i == iproc) break; /* The rank of the sampled process is found */ + ++i; + } + ASSERT(dst_rank < htrdr->mpi_nprocs); + return dst_rank; +} + +/* Return the number of stolen tiles */ +static size_t +mpi_steal_work + (struct htrdr* htrdr, + struct ssp_rng* rng, + struct proc_work* work) +{ + MPI_Request req; + size_t itile; + size_t nthieves = 0; + uint32_t tiles[UINT8_MAX]; /* Morton code of the stolen tile */ + int proc_to_steal; /* Process to steal */ + uint8_t ntiles_to_steal = MMIN((uint8_t)(htrdr->nthreads*2), 16); + ASSERT(htrdr && rng && work && htrdr->nthreads < UINT8_MAX); + + /* Protect MPI calls of multiple invocations from concurrent threads */ + #define P_MPI(Func) { \ + mutex_lock(htrdr->mpi_mutex); \ + MPI(Func); \ + mutex_unlock(htrdr->mpi_mutex); \ + } (void)0 + + /* No more working process => nohting to steal */ + if(!htrdr->mpi_nworking_procs) return 0; + + /* Sample a process to steal */ + proc_to_steal = mpi_sample_working_process(htrdr, rng); + + /* Send a steal request to the sampled process and wait for a response */ + P_MPI(Send(&ntiles_to_steal, 1, MPI_UINT8_T, proc_to_steal, + HTRDR_MPI_STEAL_REQUEST, MPI_COMM_WORLD)); + + /* Receive the stolen tile from the sampled process */ + P_MPI(Irecv(tiles, ntiles_to_steal, MPI_UINT32_T, proc_to_steal, + HTRDR_MPI_WORK_STEALING, MPI_COMM_WORLD, &req)); + + mpi_wait_for_request(htrdr, &req); + + FOR_EACH(itile, 0, ntiles_to_steal) { + if(tiles[itile] == TILE_MCODE_NULL) { + ASSERT(htrdr->mpi_working_procs[proc_to_steal] != 0); + htrdr->mpi_working_procs[proc_to_steal] = 0; + htrdr->mpi_nworking_procs--; + break; + } + proc_work_add_tile(work, tiles[itile]); + ++nthieves; + } + #undef P_MPI + return nthieves; +} + +static res_T +mpi_gather_tiles + (struct htrdr* htrdr, + struct htrdr_buffer* buf, + const size_t ntiles, + struct list_node* tiles) +{ + /* Compute the size of the tile_data */ + const size_t msg_sz = + sizeof(struct tile_data) - sizeof(struct htrdr_accum)/*dummy*/ + + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/; + + struct list_node* node = NULL; + struct tile* tile = NULL; + res_T res = RES_OK; + ASSERT(htrdr && tiles); + ASSERT(htrdr->mpi_rank != 0 || buf); + (void)ntiles; + + if(htrdr->mpi_rank != 0) { /* Non master process */ + /* Send the computed tile to the master process */ + LIST_FOR_EACH(node, tiles) { + struct tile* t = CONTAINER_OF(node, struct tile, node); + MPI(Send(&t->data, (int)msg_sz, MPI_CHAR, 0, + HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD)); + } + } else { /* Master process */ + size_t itile = 0; + + LIST_FOR_EACH(node, tiles) { + struct tile* t = CONTAINER_OF(node, struct tile, node); + write_tile_data(buf, &t->data); + ++itile; + } + + if(itile != ntiles) { + ASSERT(htrdr->mpi_nprocs > 1); + + /* Create a temporary tile to receive the tile data computed by the + * concurrent MPI processes */ + tile = tile_create(htrdr->allocator); + if(!tile) { + res = RES_MEM_ERR; + htrdr_log_err(htrdr, + "could not allocate the temporary tile used to gather the process " + "output data -- %s.\n", res_to_cstr(res)); + goto error; + } + + /* Receive the tile data of the concurret MPI processes */ + FOR_EACH(itile, itile, ntiles) { + MPI(Recv(&tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE, + HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + write_tile_data(buf, &tile->data); + } + } + } + +exit: + if(tile) tile_ref_put(tile); + return res; 
+error: + goto exit; +} + static res_T draw_tile (struct htrdr* htrdr, @@ -53,11 +455,11 @@ draw_tile const struct htrdr_camera* cam, const size_t spp, /* #samples per pixel */ struct ssp_rng* rng, - struct htrdr_buffer* buf) + struct tile* tile) { size_t npixels; size_t mcode; /* Morton code of tile pixel */ - ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && buf); + ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && tile); (void)tile_mcode; /* Adjust the #pixels to process them wrt a morton order */ npixels = round_up_pow2(MMAX(tile_sz[0], tile_sz[1])); @@ -74,13 +476,13 @@ draw_tile ipix_tile[1] = morton2D_decode((uint32_t)(mcode>>1)); if(ipix_tile[1] >= tile_sz[1]) continue; /* Pixel is out of tile */ + /* Fetch and reset the pixel accumulator */ + pix_accums = tile_at(tile, ipix_tile[0], ipix_tile[1]); + /* Compute the pixel coordinate */ ipix[0] = tile_org[0] + ipix_tile[0]; ipix[1] = tile_org[1] + ipix_tile[1]; - /* Fetch and reset the pixel accumulator */ - pix_accums = htrdr_buffer_at(buf, ipix[0], ipix[1]); - FOR_EACH(ichannel, 0, 3) { size_t isamp; pix_accums[ichannel] = HTRDR_ACCUM_NULL; @@ -133,131 +535,293 @@ draw_tile return RES_OK; } -/******************************************************************************* - * Local functions - ******************************************************************************/ -res_T -htrdr_draw_radiance_sw +static res_T +draw_image (struct htrdr* htrdr, const struct htrdr_camera* cam, + const size_t width, /* Image width */ + const size_t height, /* Image height */ const size_t spp, - struct htrdr_buffer* buf) + const size_t ntiles_x, + const size_t ntiles_y, + const size_t ntiles_adjusted, + const double pix_sz[2], /* Pixel size in the normalized image plane */ + struct proc_work* work, + struct list_node* tiles) { - struct ssp_rng_proxy* rng_proxy = NULL; - struct ssp_rng** rngs = NULL; - size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted; - size_t progress = 0; - size_t i; - int32_t mcode; /* Morton code of the tile */ - struct htrdr_buffer_layout layout; - double pix_sz[2]; /* Pixel size in the normalized image plane */ + struct ssp_rng* rng_proc = NULL; + MPI_Request req; + size_t nthreads = 0; + size_t nthieves = 0; + size_t proc_ntiles = 0; ATOMIC nsolved_tiles = 0; ATOMIC res = RES_OK; - ASSERT(htrdr && cam && buf); + ASSERT(htrdr && cam && spp && ntiles_adjusted && work && tiles); + ASSERT(pix_sz && pix_sz[0] > 0 && pix_sz[1] > 0); + ASSERT(width && height); + (void)ntiles_x, (void)ntiles_y; - htrdr_buffer_get_layout(buf, &layout); - ASSERT(layout.width || layout.height || layout.elmt_size); - - if(layout.elmt_size != sizeof(struct htrdr_accum[3])/*#channels*/ - || layout.alignment < ALIGNOF(struct htrdr_accum[3])) { - htrdr_log_err(htrdr, - "%s: invalid buffer layout. 
" - "The pixel size must be the size of 3 * accumulators.\n", - FUNC_NAME); - res = RES_BAD_ARG; - goto error; - } - - res = ssp_rng_proxy_create - (htrdr->allocator, &ssp_rng_mt19937_64, htrdr->nthreads, &rng_proxy); + res = ssp_rng_create(htrdr->allocator, &ssp_rng_mt19937_64, &rng_proc); if(res != RES_OK) { - htrdr_log_err(htrdr, "%s: could not create the RNG proxy.\n", FUNC_NAME); + htrdr_log_err(htrdr, "could not create the RNG used to sample a process " + "to steal -- %s.\n", res_to_cstr((res_T)res)); goto error; } - rngs = MEM_CALLOC(htrdr->allocator, htrdr->nthreads, sizeof(*rngs)); - if(!rngs) { - htrdr_log_err(htrdr, "%s: could not allocate the RNGs list.\n", FUNC_NAME); - res = RES_MEM_ERR; - goto error; - } + proc_ntiles = proc_work_get_ntiles(work); + nthreads = MMIN(htrdr->nthreads, proc_ntiles); - FOR_EACH(i, 0, htrdr->nthreads) { - res = ssp_rng_proxy_create_rng(rng_proxy, i, &rngs[i]); - if(res != RES_OK) { - htrdr_log_err(htrdr,"%s: could not create the per thread RNG.\n", FUNC_NAME); - goto error; - } - } - - ntiles_x = (layout.width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE; - ntiles_y = (layout.height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE; - ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y)); - ntiles_adjusted *= ntiles_adjusted; - ntiles = ntiles_x * ntiles_y; - - pix_sz[0] = 1.0 / (double)layout.width; - pix_sz[1] = 1.0 / (double)layout.height; - - fprintf(stderr, "Rendering: %3i%%", 0); - fflush(stderr); + /* The process is not considered as a working process for himself */ + htrdr->mpi_working_procs[htrdr->mpi_rank] = 0; + --htrdr->mpi_nworking_procs; - omp_set_num_threads((int)htrdr->nthreads); - - #pragma omp parallel for schedule(static, 1/*chunck size*/) - for(mcode=0; mcode<(int64_t)ntiles_adjusted; ++mcode) { + omp_set_num_threads((int)nthreads); + #pragma omp parallel + for(;;) { const int ithread = omp_get_thread_num(); - struct ssp_rng* rng = rngs[ithread]; + struct ssp_rng_proxy* rng_proxy = NULL; + struct ssp_rng* rng; + struct tile* tile; + uint32_t mcode = TILE_MCODE_NULL; size_t tile_org[2]; size_t tile_sz[2]; - size_t pcent; size_t n; res_T res_local = RES_OK; + int32_t pcent; + + /* Get a tile to draw */ + #pragma omp critical + { + mcode = proc_work_get_tile(work); + if(mcode == TILE_MCODE_NULL) { /* No more work on this process */ + /* Try to steal works to concurrent processes */ + proc_work_reset(work); + nthieves = mpi_steal_work(htrdr, rng_proc, work); + if(nthieves != 0) { + mcode = proc_work_get_tile(work); + } + } + } + if(mcode == TILE_MCODE_NULL) break; /* No more work */ /* Decode the morton code to retrieve the tile index */ tile_org[0] = morton2D_decode((uint32_t)(mcode>>0)); - if(tile_org[0] >= ntiles_x) continue; /* Skip border tile */ tile_org[1] = morton2D_decode((uint32_t)(mcode>>1)); - if(tile_org[1] >= ntiles_y) continue; /* Skip border tile */ + ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y); + + /* Create the tile */ + tile = tile_create(htrdr->allocator); + if(!tile) { + ATOMIC_SET(&res, RES_MEM_ERR); + htrdr_log_err(htrdr, + "could not allocate the memory space of the tile (%lu, %lu) -- %s.\n", + (unsigned long)tile_org[0], (unsigned long)tile_org[1], + res_to_cstr((res_T)ATOMIC_GET(&res))); + break; + } + + /* Register the tile */ + #pragma omp critical + list_add_tail(tiles, &tile->node); + + tile->data.x = (uint16_t)tile_org[0]; + tile->data.y = (uint16_t)tile_org[1]; /* Define the tile origin in pixel space */ tile_org[0] *= TILE_SIZE; tile_org[1] *= TILE_SIZE; /* Compute the size of the tile clamped by the borders 
of the buffer */ - tile_sz[0] = MMIN(TILE_SIZE, layout.width - tile_org[0]); - tile_sz[1] = MMIN(TILE_SIZE, layout.height - tile_org[1]); - + tile_sz[0] = MMIN(TILE_SIZE, width - tile_org[0]); + tile_sz[1] = MMIN(TILE_SIZE, height - tile_org[1]); + + /* Create a proxy RNG for the current tile. This proxy is used for the + * current thread only and thus it has to manage only one RNG. This proxy + * is initialised in order to ensure that a unique and predictable set of + * random numbers is used for the current tile. */ + SSP(rng_proxy_create2 + (&htrdr->lifo_allocators[ithread], + &ssp_rng_threefry, + RNG_SEQUENCE_SIZE * (size_t)mcode, /* Offset */ + RNG_SEQUENCE_SIZE, /* Size */ + RNG_SEQUENCE_SIZE * (size_t)ntiles_adjusted, /* Pitch */ + 1, &rng_proxy)); + SSP(rng_proxy_create_rng(rng_proxy, 0, &rng)); + + /* Launch the tile rendering */ res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz, - pix_sz, cam, spp, rng, buf); + pix_sz, cam, spp, rng, tile); + + SSP(rng_proxy_ref_put(rng_proxy)); + SSP(rng_ref_put(rng)); + if(res_local != RES_OK) { ATOMIC_SET(&res, res_local); - continue; + break; } + /* Update the progress status */ n = (size_t)ATOMIC_INCR(&nsolved_tiles); - pcent = n * 100 / ntiles; + pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/); #pragma omp critical - if(pcent > progress) { - progress = pcent; - fprintf(stderr, "%c[2K\rRendering: %3lu%%", - 27, (unsigned long)pcent); - fflush(stderr); + if(pcent > htrdr->mpi_progress_render[0]) { + htrdr->mpi_progress_render[0] = pcent; + if(htrdr->mpi_rank == 0) { + update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING); + } else { /* Send the progress percentage to the master process */ + send_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING, pcent); + } } + } + + if(ATOMIC_GET(&res) != RES_OK) goto error; + + /* Synchronize the process */ + mutex_lock(htrdr->mpi_mutex); + MPI(Ibarrier(MPI_COMM_WORLD, &req)); + mutex_unlock(htrdr->mpi_mutex); + + /* Wait for processes synchronization */ + mpi_wait_for_request(htrdr, &req); + +exit: + if(rng_proc) SSP(rng_ref_put(rng_proc)); + return (res_T)res; +error: + goto exit; +} - if(ATOMIC_GET(&res) != RES_OK) continue; +/******************************************************************************* + * Local functions + ******************************************************************************/ +res_T +htrdr_draw_radiance_sw + (struct htrdr* htrdr, + const struct htrdr_camera* cam, + const size_t width, + const size_t height, + const size_t spp, + struct htrdr_buffer* buf) +{ + char strbuf[128]; + struct time t0, t1; + struct list_node tiles; + size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted; + size_t itile; + struct proc_work work; + struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL; + size_t proc_ntiles_adjusted; + double pix_sz[2]; + ATOMIC probe_thieves = 1; + ATOMIC res = RES_OK; + ASSERT(htrdr && cam && width && height); + ASSERT(htrdr->mpi_rank != 0 || buf); + + list_init(&tiles); + proc_work_init(htrdr->allocator, &work); + + if(htrdr->mpi_rank == 0) { + htrdr_buffer_get_layout(buf, &layout); + ASSERT(layout.width || layout.height || layout.elmt_size); + ASSERT(layout.width == width && layout.height == height); + + if(layout.elmt_size != sizeof(struct htrdr_accum[3])/*#channels*/ + || layout.alignment < ALIGNOF(struct htrdr_accum[3])) { + htrdr_log_err(htrdr, + "%s: invalid buffer layout. 
" + "The pixel size must be the size of 3 * accumulators.\n", + FUNC_NAME); + res = RES_BAD_ARG; + goto error; + } } - fprintf(stderr, "\n"); + + /* Compute the overall number of tiles */ + ntiles_x = (width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE; + ntiles_y = (height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE; + ntiles = ntiles_x * ntiles_y; + + /* Compute the pixel size in the normalized image plane */ + pix_sz[0] = 1.0 / (double)width; + pix_sz[1] = 1.0 / (double)height; + + /* Adjust the #tiles for the morton-encoding procedure */ + ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y)); + ntiles_adjusted *= ntiles_adjusted; + + /* Define the initial number of tiles of the current process */ + proc_ntiles_adjusted = ntiles_adjusted / (size_t)htrdr->mpi_nprocs; + if(htrdr->mpi_rank == 0) { /* Affect the remaining tiles to the master proc */ + proc_ntiles_adjusted += + ntiles_adjusted - proc_ntiles_adjusted*(size_t)htrdr->mpi_nprocs; + } + + /* Define the initial list of tiles of the process */ + FOR_EACH(itile, 0, proc_ntiles_adjusted) { + uint32_t mcode; + uint16_t tile_org[2]; + + mcode = (uint32_t)itile*(uint32_t)htrdr->mpi_nprocs + + (uint32_t)htrdr->mpi_rank; + + tile_org[0] = morton2D_decode(mcode>>0); + if(tile_org[0] >= ntiles_x) continue; + tile_org[1] = morton2D_decode(mcode>>1); + if(tile_org[1] >= ntiles_y) continue; + proc_work_add_tile(&work, mcode); + } + + if(htrdr->mpi_rank == 0) { + fetch_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING); + print_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING); + } + + time_current(&t0); + + omp_set_nested(1); /* Enable nested threads for draw_image */ + #pragma omp parallel sections num_threads(2) + { + #pragma omp section + mpi_probe_thieves(htrdr, &work, &probe_thieves); + + #pragma omp section + { + draw_image(htrdr, cam, width, height, spp, ntiles_x, ntiles_y, + ntiles_adjusted, pix_sz, &work, &tiles); + /* The processes have no more work to do. 
Stop probing for thieves */ + ATOMIC_SET(&probe_thieves, 0); + } + } + + if(htrdr->mpi_rank == 0) { + update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING); + fprintf(stderr, "\n"); /* Add a new line after the progress statuses */ + } + + time_sub(&t0, time_current(&t1), &t0); + time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf)); + htrdr_log(htrdr, "Rendering time: %s\n", strbuf); + + /* Gather accum buffers from the group of processes */ + time_current(&t0); + res = mpi_gather_tiles(htrdr, buf, ntiles, &tiles); + if(res != RES_OK) goto error; + time_sub(&t0, time_current(&t1), &t0); + time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf)); + htrdr_log(htrdr, "Image gathering time: %s\n", strbuf); exit: - if(rng_proxy) SSP(rng_proxy_ref_put(rng_proxy)); - if(rngs) { - FOR_EACH(i, 0, htrdr->nthreads) { - if(rngs[i]) SSP(rng_ref_put(rngs[i])); + { /* Free allocated tiles */ + struct list_node* node; + struct list_node* tmp; + LIST_FOR_EACH_SAFE(node, tmp, &tiles) { + struct tile* tile = CONTAINER_OF(node, struct tile, node); + list_del(node); + tile_ref_put(tile); } - MEM_RM(htrdr->allocator, rngs); } + proc_work_release(&work); return (res_T)res; error: goto exit; diff --git a/src/htrdr_ground.c b/src/htrdr_ground.c @@ -48,6 +48,7 @@ struct htrdr_ground { struct s3d_scene_view* view; float lower[3]; /* Ground lower bound */ float upper[3]; /* Ground upper bound */ + double reflectivity; int repeat; /* Make the ground infinite in X and Y */ struct htrdr* htrdr; @@ -223,6 +224,7 @@ res_T htrdr_ground_create (struct htrdr* htrdr, const char* obj_filename, + const double reflectivity, const int repeat_ground, /* Infinitely repeat the ground in X and Y */ struct htrdr_ground** out_ground) { @@ -231,6 +233,7 @@ htrdr_ground_create struct time t0, t1; res_T res = RES_OK; ASSERT(htrdr && obj_filename && out_ground); + ASSERT(reflectivity >= 0 || reflectivity <= 1); ground = MEM_CALLOC(htrdr->allocator, 1, sizeof(*ground)); if(!ground) { @@ -243,6 +246,7 @@ htrdr_ground_create ref_init(&ground->ref); ground->htrdr = htrdr; ground->repeat = repeat_ground; + ground->reflectivity = reflectivity; time_current(&t0); res = setup_ground(ground, obj_filename); @@ -276,6 +280,13 @@ htrdr_ground_ref_put(struct htrdr_ground* ground) ref_put(&ground->ref, release_ground); } +double +htrdr_ground_get_reflectivity(const struct htrdr_ground* ground) +{ + ASSERT(ground); + return ground->reflectivity; +} + res_T htrdr_ground_trace_ray (struct htrdr_ground* ground, diff --git a/src/htrdr_ground.h b/src/htrdr_ground.h @@ -27,6 +27,7 @@ extern LOCAL_SYM res_T htrdr_ground_create (struct htrdr* htrdr, const char* obj_filename, + const double reflectivity, /* In [0, 1] */ const int repeat_ground, /* Infinitely repeat the ground in X and Y */ struct htrdr_ground** ground); @@ -38,6 +39,10 @@ extern LOCAL_SYM void htrdr_ground_ref_put (struct htrdr_ground* ground); +extern LOCAL_SYM double +htrdr_ground_get_reflectivity + (const struct htrdr_ground* ground); + extern LOCAL_SYM res_T htrdr_ground_trace_ray (struct htrdr_ground* ground, diff --git a/src/htrdr_main.c b/src/htrdr_main.c @@ -16,8 +16,21 @@ #include "htrdr.h" #include "htrdr_args.h" +#include <mpi.h> #include <rsys/mem_allocator.h> +static const char* +thread_support_string(const int val) +{ + switch(val) { + case MPI_THREAD_SINGLE: return "MPI_THREAD_SINGLE"; + case MPI_THREAD_FUNNELED: return "MPI_THREAD_FUNNELED"; + case MPI_THREAD_SERIALIZED: return "MPI_THREAD_SERIALIZED"; + case MPI_THREAD_MULTIPLE: return "MPI_THREAD_MULTIPLE"; + default: 
FATAL("Unreachable code.\n"); break; + } +} + /******************************************************************************* * Program ******************************************************************************/ @@ -29,12 +42,32 @@ main(int argc, char** argv) size_t memsz = 0; int err = 0; int is_htrdr_init = 0; + int thread_support = 0; res_T res = RES_OK; + err = MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support); + if(err != MPI_SUCCESS) { + fprintf(stderr, "Error initializing MPI.\n"); + goto error; + } + + if(thread_support != MPI_THREAD_SERIALIZED) { + fprintf(stderr, "The provided MPI implementation does not support " + "serialized API calls from multiple threads. Provided thread support: " + "%s.\n", thread_support_string(thread_support)); + goto error; + } + res = htrdr_args_init(&args, argc, argv); if(res != RES_OK) goto error; if(args.quit) goto exit; + if(args.dump_vtk) { + int rank; + CHK(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); + if(rank != 0) goto exit; /* Nothing to do except for the master process */ + } + res = htrdr_init(NULL, &args, &htrdr); if(res != RES_OK) goto error; is_htrdr_init = 1; @@ -43,6 +76,7 @@ main(int argc, char** argv) if(res != RES_OK) goto error; exit: + MPI_Finalize(); if(is_htrdr_init) htrdr_release(&htrdr); htrdr_args_release(&args); if((memsz = mem_allocated_size()) != 0) { diff --git a/src/htrdr_sky.c b/src/htrdr_sky.c @@ -38,6 +38,7 @@ #include <fcntl.h> /* open */ #include <libgen.h> #include <math.h> +#include <mpi.h> #include <omp.h> #include <string.h> #include <sys/stat.h> /* mkdir */ @@ -65,6 +66,15 @@ struct split { #define DARRAY_DATA struct split #include <rsys/dynamic_array.h> +struct spectral_data { + size_t iband; /* Index of the spectral band */ + size_t iquad; /* Quadrature point into the band */ +}; + +#define DARRAY_NAME specdata +#define DARRAY_DATA struct spectral_data +#include <rsys/dynamic_array.h> + struct build_tree_context { const struct htrdr_sky* sky; double vxsz[3]; @@ -1009,8 +1019,9 @@ setup_cloud_grid /* Compute the overall number of voxels in the htrdr_grid */ ncells = definition[0] * definition[1] * definition[2]; - fprintf(stderr, "Generating cloud grid %lu.%lu: %3u%%", iband, iquad, 0); - fflush(stderr); + htrdr_fprintf(sky->htrdr, stderr, + "Generating cloud grid %lu.%lu: %3u%%", iband, iquad, 0); + htrdr_fflush(sky->htrdr, stderr); omp_set_num_threads((int)sky->htrdr->nthreads); #pragma omp parallel for @@ -1033,12 +1044,13 @@ setup_cloud_grid #pragma omp critical if(pcent > progress) { progress = pcent; - fprintf(stderr, "%c[2K\rGenerating cloud grid %lu.%lu: %3u%%", + htrdr_fprintf(sky->htrdr, stderr, + "%c[2K\rGenerating cloud grid %lu.%lu: %3u%%", 27, iband, iquad, (unsigned)pcent); - fflush(stderr); + htrdr_fflush(sky->htrdr, stderr); } } - fprintf(stderr, "\n"); + htrdr_fprintf(sky->htrdr, stderr, "\n"); exit: *out_grid = grid; @@ -1063,16 +1075,20 @@ setup_clouds const double optical_thickness_threshold, const int force_cache_update) { - struct svx_voxel_desc vox_desc = SVX_VOXEL_DESC_NULL; - struct build_tree_context ctx = BUILD_TREE_CONTEXT_NULL; + struct darray_specdata specdata; size_t nvoxs[3]; + double vxsz[3]; double low[3]; double upp[3]; + int64_t ispecdata; size_t nbands; size_t i; - res_T res = RES_OK; + ATOMIC nbuilt_octrees = 0; + ATOMIC res = RES_OK; ASSERT(sky && sky->sw_bands && optical_thickness_threshold >= 0); + darray_specdata_init(sky->htrdr->allocator, &specdata); + res = htcp_get_desc(sky->htcp, &sky->htcp_desc); if(res != RES_OK) { 
diff --git a/src/htrdr_sky.c b/src/htrdr_sky.c
@@ -38,6 +38,7 @@
 #include <fcntl.h> /* open */
 #include <libgen.h>
 #include <math.h>
+#include <mpi.h>
 #include <omp.h>
 #include <string.h>
 #include <sys/stat.h> /* mkdir */
@@ -65,6 +66,15 @@ struct split {
 #define DARRAY_DATA struct split
 #include <rsys/dynamic_array.h>
 
+struct spectral_data {
+  size_t iband; /* Index of the spectral band */
+  size_t iquad; /* Quadrature point into the band */
+};
+
+#define DARRAY_NAME specdata
+#define DARRAY_DATA struct spectral_data
+#include <rsys/dynamic_array.h>
+
 struct build_tree_context {
   const struct htrdr_sky* sky;
   double vxsz[3];
@@ -1009,8 +1019,9 @@ setup_cloud_grid
   /* Compute the overall number of voxels in the htrdr_grid */
   ncells = definition[0] * definition[1] * definition[2];
 
-  fprintf(stderr, "Generating cloud grid %lu.%lu: %3u%%", iband, iquad, 0);
-  fflush(stderr);
+  htrdr_fprintf(sky->htrdr, stderr,
+    "Generating cloud grid %lu.%lu: %3u%%", iband, iquad, 0);
+  htrdr_fflush(sky->htrdr, stderr);
 
   omp_set_num_threads((int)sky->htrdr->nthreads);
   #pragma omp parallel for
@@ -1033,12 +1044,13 @@ setup_cloud_grid
       #pragma omp critical
       if(pcent > progress) {
         progress = pcent;
-        fprintf(stderr, "%c[2K\rGenerating cloud grid %lu.%lu: %3u%%",
+        htrdr_fprintf(sky->htrdr, stderr,
+          "%c[2K\rGenerating cloud grid %lu.%lu: %3u%%",
           27, iband, iquad, (unsigned)pcent);
-        fflush(stderr);
+        htrdr_fflush(sky->htrdr, stderr);
       }
     }
-  fprintf(stderr, "\n");
+  htrdr_fprintf(sky->htrdr, stderr, "\n");
 
 exit:
   *out_grid = grid;
@@ -1063,16 +1075,20 @@ setup_clouds
   const double optical_thickness_threshold,
   const int force_cache_update)
 {
-  struct svx_voxel_desc vox_desc = SVX_VOXEL_DESC_NULL;
-  struct build_tree_context ctx = BUILD_TREE_CONTEXT_NULL;
+  struct darray_specdata specdata;
   size_t nvoxs[3];
+  double vxsz[3];
   double low[3];
   double upp[3];
+  int64_t ispecdata;
   size_t nbands;
   size_t i;
-  res_T res = RES_OK;
+  ATOMIC nbuilt_octrees = 0;
+  ATOMIC res = RES_OK;
   ASSERT(sky && sky->sw_bands && optical_thickness_threshold >= 0);
 
+  darray_specdata_init(sky->htrdr->allocator, &specdata);
+
   res = htcp_get_desc(sky->htcp, &sky->htcp_desc);
   if(res != RES_OK) {
     htrdr_log_err(sky->htrdr, "could not retrieve the HTCP descriptor.\n");
@@ -1126,27 +1142,21 @@ setup_clouds
       const double upp_z = (double)(iz + 1) * sky->lut_cell_sz + low[2];
       darray_split_data_get(&sky->svx2htcp_z)[iz].index = iz2;
       darray_split_data_get(&sky->svx2htcp_z)[iz].height = upp[2];
-      if(upp_z >= upp[2]) upp[2] += sky->htcp_desc.vxsz_z[++iz2];
+      if(upp_z >= upp[2] && iz + 1 < nsplits) {
+        ASSERT(iz2 + 1 < sky->htcp_desc.spatial_definition[2]);
+        upp[2] += sky->htcp_desc.vxsz_z[++iz2];
+      }
     }
     ASSERT(eq_eps(upp[2] - low[2], len_z, 1.e-6));
   }
 
   /* Setup the build context */
-  ctx.sky = sky;
-  ctx.tau_threshold = optical_thickness_threshold;
-  ctx.vxsz[0] = sky->htcp_desc.upper[0] - sky->htcp_desc.lower[0];
-  ctx.vxsz[1] = sky->htcp_desc.upper[1] - sky->htcp_desc.lower[1];
-  ctx.vxsz[2] = sky->htcp_desc.upper[2] - sky->htcp_desc.lower[2];
-  ctx.vxsz[0] = ctx.vxsz[0] / (double)sky->htcp_desc.spatial_definition[0];
-  ctx.vxsz[1] = ctx.vxsz[1] / (double)sky->htcp_desc.spatial_definition[1];
-  ctx.vxsz[2] = ctx.vxsz[2] / (double)sky->htcp_desc.spatial_definition[2];
-
-  /* Setup the voxel descriptor */
-  vox_desc.get = cloud_vox_get;
-  vox_desc.merge = cloud_vox_merge;
-  vox_desc.challenge_merge = cloud_vox_challenge_merge;
-  vox_desc.context = &ctx;
-  vox_desc.size = sizeof(float) * NFLOATS_PER_VOXEL;
+  vxsz[0] = sky->htcp_desc.upper[0] - sky->htcp_desc.lower[0];
+  vxsz[1] = sky->htcp_desc.upper[1] - sky->htcp_desc.lower[1];
+  vxsz[2] = sky->htcp_desc.upper[2] - sky->htcp_desc.lower[2];
+  vxsz[0] = vxsz[0] / (double)sky->htcp_desc.spatial_definition[0];
+  vxsz[1] = vxsz[1] / (double)sky->htcp_desc.spatial_definition[1];
+  vxsz[2] = vxsz[2] / (double)sky->htcp_desc.spatial_definition[2];
 
   /* Create as many cloud data structure than considered SW spectral bands */
   nbands = htrdr_sky_get_sw_spectral_bands_count(sky);
@@ -1158,58 +1168,135 @@ setup_clouds
     goto error;
   }
 
+  /* Compute how many octrees are going to be built */
   FOR_EACH(i, 0, nbands) {
-    size_t iquad;
     struct htgop_spectral_interval band;
-    ctx.iband = i + sky->sw_bands_range[0];
+    const size_t iband = i + sky->sw_bands_range[0];
+    size_t iquad;
 
-    HTGOP(get_sw_spectral_interval(sky->htgop, ctx.iband, &band));
+    HTGOP(get_sw_spectral_interval(sky->htgop, iband, &band));
 
     sky->clouds[i] = MEM_CALLOC(sky->htrdr->allocator, band.quadrature_length,
       sizeof(*sky->clouds[i]));
     if(!sky->clouds[i]) {
       htrdr_log_err(sky->htrdr,
        "could not create the list of per quadrature point cloud data "
-       "for the band %lu.\n", (unsigned long)ctx.iband);
+       "for the band %lu.\n", (unsigned long)iband);
      res = RES_MEM_ERR;
      goto error;
    }

-    /* Build a cloud octree for each quadrature point of the considered
-     * spectral band */
     FOR_EACH(iquad, 0, band.quadrature_length) {
-      ctx.quadrature_range[0] = iquad;
-      ctx.quadrature_range[1] = iquad;
-
-      /* Compute grid of voxels data */
-      res = setup_cloud_grid(sky, nvoxs, ctx.iband, iquad, htcp_filename,
-        htgop_filename, htmie_filename, force_cache_update, &ctx.grid);
-      if(res != RES_OK) goto error;
-
-      /* Create the octree */
-      res = svx_octree_create(sky->htrdr->svx, low, upp, nvoxs, &vox_desc,
-        &sky->clouds[i][iquad].octree);
+      struct spectral_data spectral_data;
+      spectral_data.iband = iband;
+      spectral_data.iquad = iquad;
+      res = darray_specdata_push_back(&specdata, &spectral_data);
       if(res != RES_OK) {
-        htrdr_log_err(sky->htrdr, "could not create the octree of the cloud "
-          "properties for the band %lu.\n", (unsigned long)ctx.iband);
+        htrdr_log_err(sky->htrdr,
+          "could not register the quadrature point %lu of the spectral band "
+          "%lu.\n", (unsigned long)iquad, (unsigned long)iband);
         goto error;
       }
+    }
+  }
 
-      /* Fetch the octree descriptor for future use */
-      SVX(tree_get_desc
-        (sky->clouds[i][iquad].octree, &sky->clouds[i][iquad].octree_desc));
+  /* Build the octrees sequentially when the grid cache is enabled and in
+   * parallel otherwise: with the cache enabled, the threads are spent on
+   * generating the cached grids rather than on building the octrees. */
+  if(sky->htrdr->cache_grids) {
+    omp_set_num_threads(1);
+  } else {
+    omp_set_num_threads((int)sky->htrdr->nthreads);
+    if(sky->htrdr->mpi_rank == 0) {
+      fetch_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE);
+      print_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE);
+    }
+  }
+  #pragma omp parallel for schedule(dynamic, 1/*chunksize*/)
+  for(ispecdata=0;
+      (size_t)ispecdata<darray_specdata_size_get(&specdata);
+      ++ispecdata) {
+    struct svx_voxel_desc vox_desc = SVX_VOXEL_DESC_NULL;
+    struct build_tree_context ctx = BUILD_TREE_CONTEXT_NULL;
+    const size_t iband = darray_specdata_data_get(&specdata)[ispecdata].iband;
+    const size_t iquad = darray_specdata_data_get(&specdata)[ispecdata].iquad;
+    const size_t id = iband - sky->sw_bands_range[0];
+    int32_t pcent;
+    size_t n;
+    res_T res_local = RES_OK;
+
+    if(ATOMIC_GET(&res) != RES_OK) continue;
+
+    /* Setup the build context */
+    ctx.sky = sky;
+    ctx.vxsz[0] = vxsz[0];
+    ctx.vxsz[1] = vxsz[1];
+    ctx.vxsz[2] = vxsz[2];
+    ctx.tau_threshold = optical_thickness_threshold;
+    ctx.iband = iband;
+    ctx.quadrature_range[0] = iquad;
+    ctx.quadrature_range[1] = iquad;
+
+    /* Setup the voxel descriptor */
+    vox_desc.get = cloud_vox_get;
+    vox_desc.merge = cloud_vox_merge;
+    vox_desc.challenge_merge = cloud_vox_challenge_merge;
+    vox_desc.context = &ctx;
+    vox_desc.size = sizeof(float) * NFLOATS_PER_VOXEL;
+
+    /* Compute grid of voxels data */
+    if(sky->htrdr->cache_grids) {
+      res_local = setup_cloud_grid(sky, nvoxs, iband, iquad, htcp_filename,
+        htgop_filename, htmie_filename, force_cache_update, &ctx.grid);
+      if(res_local != RES_OK) continue;
+    }
 
-      if(ctx.grid) {
-        htrdr_grid_ref_put(ctx.grid);
-        ctx.grid = NULL;
+    /* Create the octree */
+    res_local = svx_octree_create(sky->htrdr->svx, low, upp, nvoxs, &vox_desc,
+      &sky->clouds[id][iquad].octree);
+    if(ctx.grid) htrdr_grid_ref_put(ctx.grid);
+    if(res_local != RES_OK) {
+      htrdr_log_err(sky->htrdr, "could not create the octree of the cloud "
+        "properties for the band %lu.\n", (unsigned long)ctx.iband);
+      ATOMIC_SET(&res, res_local);
+      continue;
+    }
+
+    /* Fetch the octree descriptor for future use */
+    SVX(tree_get_desc
+      (sky->clouds[id][iquad].octree, &sky->clouds[id][iquad].octree_desc));
+
+    if(!sky->htrdr->cache_grids) {
+      /* Update the progress message */
+      n = (size_t)ATOMIC_INCR(&nbuilt_octrees);
+      pcent = (int32_t)(n * 100 / darray_specdata_size_get(&specdata));
+
+      #pragma omp critical
+      if(pcent > sky->htrdr->mpi_progress_octree[0]) {
+        sky->htrdr->mpi_progress_octree[0] = pcent;
+        if(sky->htrdr->mpi_rank == 0) {
+          update_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE);
+        } else { /* Send the progress percentage to the master process */
+          send_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE, pcent);
+        }
      }
    }
  }
+
+  if(!sky->htrdr->cache_grids && sky->htrdr->mpi_rank == 0) {
+    while(total_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE) != 100) {
+      update_mpi_progress(sky->htrdr, HTRDR_MPI_PROGRESS_BUILD_OCTREE);
+      sleep(1);
+    }
+    fprintf(stderr,
+      "\n"); /* Add a new line after the progress statuses */
+  }
+
 exit:
-  return res;
+  darray_specdata_release(&specdata);
+  return (res_T)res;
 error:
-  if(ctx.grid) htrdr_grid_ref_put(ctx.grid);
   clean_clouds(sky);
   darray_split_clear(&sky->svx2htcp_z);
   goto exit;
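
The parallel loop above flattens the (band, quadrature point) pairs into a single work list, hands the items out one at a time with schedule(dynamic, 1) because their costs are uneven, and uses a shared status so that the remaining iterations bail out after the first failure. The standalone sketch below illustrates that pattern with hypothetical work_item and process() placeholders rather than the htrdr types.

#include <omp.h>
#include <stddef.h>
#include <stdio.h>

struct work_item { size_t iband, iquad; };

static int
process(const struct work_item* item)
{
  (void)item; /* a real job would build one octree per (band, quadrature) */
  return 0; /* 0 on success */
}

static int
run_work_list(const struct work_item* items, const long count)
{
  int err = 0;
  long i;
  /* Dynamic scheduling keeps the threads busy until the list is exhausted. */
  #pragma omp parallel for schedule(dynamic, 1)
  for(i = 0; i < count; ++i) {
    int local;
    #pragma omp atomic read
    local = err;
    if(local) continue; /* a previous item failed: skip the remaining work */
    local = process(items + i);
    if(local) {
      #pragma omp atomic write
      err = local;
    }
  }
  return err;
}

int
main(void)
{
  struct work_item items[4] = {{0,0}, {0,1}, {1,0}, {1,1}};
  printf("status: %d\n", run_work_list(items, 4));
  return 0;
}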
"\n"); /* Add a new line after the progress statuses */ + } + exit: - return res; + darray_specdata_release(&specdata); + return (res_T)res; error: - if(ctx.grid) htrdr_grid_ref_put(ctx.grid); clean_clouds(sky); darray_split_clear(&sky->svx2htcp_z); goto exit; diff --git a/src/htrdr_slab.c b/src/htrdr_slab.c @@ -41,7 +41,7 @@ htrdr_slab_trace_ray double cell_sz[3]; /* Size of a cell */ double t_max[3], t_delta[2], t_min_z; size_t istep; - int xy[2]; /* 2D index of the repeated cell */ + int64_t xy[2]; /* 2D index of the repeated cell */ int incr[2]; /* Index increment */ res_T res = RES_OK; ASSERT(htrdr && org && dir && range && cell_low && cell_upp && trace_cell); @@ -64,8 +64,8 @@ htrdr_slab_trace_ray * non duplicated cell */ pos[0] = org[0] + t_min_z*dir[0]; pos[1] = org[1] + t_min_z*dir[1]; - xy[0] = (int)floor((pos[0] - cell_low[0]) / cell_sz[0]); - xy[1] = (int)floor((pos[1] - cell_low[1]) / cell_sz[1]); + xy[0] = (int64_t)floor((pos[0] - cell_low[0]) / cell_sz[0]); + xy[1] = (int64_t)floor((pos[1] - cell_low[1]) / cell_sz[1]); /* Define the 2D index increment wrt dir sign */ incr[0] = dir[0] < 0 ? -1 : 1; diff --git a/src/htrdr_solve.h b/src/htrdr_solve.h @@ -48,8 +48,11 @@ extern LOCAL_SYM res_T htrdr_draw_radiance_sw (struct htrdr* htrdr, const struct htrdr_camera* cam, + const size_t width, /* Image width */ + const size_t height, /* Image height */ const size_t spp, /* #samples per pixel, i.e. #realisations */ - struct htrdr_buffer* buf); /* Buffer of struct htrdr_accum[3] */ + /* Buffer of struct htrdr_accum[3]. May be NULL on on non master processes */ + struct htrdr_buffer* buf); extern LOCAL_SYM int htrdr_ground_filter