Alloc the image buffer only on the master process - htrdr - Solving radiative transfer in heterogeneous media

commit f5938d975152de903266c5c10a322ed6580c0508
parent b47ba0173b5236c65c13abb7aa9fe6588aa89371
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Mon,  5 Nov 2018 14:05:34 +0100

Alloc the image buffer only on the master process

Diffstat:
M src/htrdr.c  | 34 ++++++++++++++++------------------
M src/htrdr.h  | 2 ++
M src/htrdr_draw_radiance_sw.c  | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M src/htrdr_solve.h  | 5 ++++-

4 files changed, 113 insertions(+), 69 deletions(-)
diff --git a/src/htrdr.c b/src/htrdr.c
@@ -25,7 +25,6 @@
 #include "htrdr_sun.h"
 #include "htrdr_solve.h"
 
-#include <rsys/clock_time.h>
 #include <rsys/cstr.h>
 #include <rsys/mem_allocator.h>
 #include <rsys/str.h>
@@ -457,6 +456,8 @@ htrdr_init
   htrdr->verbose = args->verbose;
   htrdr->nthreads = MMIN(args->nthreads, (unsigned)omp_get_num_procs());
   htrdr->spp = args->image.spp;
+  htrdr->width = args->image.definition[0];
+  htrdr->height = args->image.definition[1];
 
   res = init_mpi(htrdr);
   if(res != RES_OK) goto error;
@@ -515,14 +516,18 @@ htrdr_init
     args->camera.up, proj_ratio, MDEG2RAD(args->camera.fov_y), &htrdr->cam);
   if(res != RES_OK) goto error;
 
-  res = htrdr_buffer_create(htrdr,
-    args->image.definition[0], /* Width */
-    args->image.definition[1], /* Height */
-    args->image.definition[0]*sizeof(struct htrdr_accum[3]), /* Pitch */
-    sizeof(struct htrdr_accum[3]),
-    ALIGNOF(struct htrdr_accum[3]), /* Alignment */
-    &htrdr->buf);
-  if(res != RES_OK) goto error;
+  /* Create the image buffer only on the master process; the image parts
+   * rendered by the processes are gathered onto the master process. */
+  if(!htrdr->dump_vtk && htrdr->mpi_rank == 0) {
+    res = htrdr_buffer_create(htrdr,
+      args->image.definition[0], /* Width */
+      args->image.definition[1], /* Height */
+      args->image.definition[0]*sizeof(struct htrdr_accum[3]), /* Pitch */
+      sizeof(struct htrdr_accum[3]),
+      ALIGNOF(struct htrdr_accum[3]), /* Alignment */
+      &htrdr->buf);
+    if(res != RES_OK) goto error;
+  }
 
   res = htrdr_sun_create(htrdr, &htrdr->sun);
   if(res != RES_OK) goto error;
@@ -609,16 +614,9 @@ htrdr_run(struct htrdr* htrdr)
       }
     }
   } else {
-    struct time t0, t1;
-    char buf[128];
-
-    time_current(&t0);
-    res = htrdr_draw_radiance_sw(htrdr, htrdr->cam, htrdr->spp, htrdr->buf);
+    res = htrdr_draw_radiance_sw(htrdr, htrdr->cam, htrdr->width,
+      htrdr->height, htrdr->spp, htrdr->buf);
     if(res != RES_OK) goto error;
-    time_sub(&t0, time_current(&t1), &t0);
-    time_dump(&t0, TIME_ALL, NULL, buf, sizeof(buf));
-    htrdr_log(htrdr, "Rendering time: %s\n", buf);
-
     if(htrdr->mpi_rank == 0) {
       res = dump_accum_buffer
         (htrdr, htrdr->buf, str_cget(&htrdr->output_name), htrdr->output);
diff --git a/src/htrdr.h b/src/htrdr.h
@@ -53,6 +53,8 @@ struct htrdr {
   struct htrdr_camera* cam;
   struct htrdr_buffer* buf;
   size_t spp; /* #samples per pixel */
+  size_t width; /* Image width */
+  size_t height; /* Image height */
 
   FILE* output;
   struct str output_name;
diff --git a/src/htrdr_draw_radiance_sw.c b/src/htrdr_draw_radiance_sw.c
@@ -20,6 +20,7 @@
 #include "htrdr_sky.h"
 #include "htrdr_solve.h"
 
+#include <rsys/clock_time.h>
 #include <rsys/cstr.h>
 #include <rsys/dynamic_array_u32.h>
 #include <rsys/math.h>
@@ -31,12 +32,13 @@
 #include <time.h>
 #include <unistd.h>
 
-#define RNG_SEQUENCE_SIZE 100000
+#define RNG_SEQUENCE_SIZE 10000
 
 #define TILE_MCODE_NULL UINT32_MAX
 #define TILE_SIZE 32 /* Definition in X & Y of a tile */
 STATIC_ASSERT(IS_POW2(TILE_SIZE), TILE_SIZE_must_be_a_power_of_2);
 
+/* Tile of row ordered image pixels */
 struct tile {
   struct list_node node;
   struct mem_allocator* allocator;
@@ -44,11 +46,12 @@ struct tile {
 
   struct tile_data {
     uint16_t x, y; /* 2D coordinates of the tile in tile space */
-    struct htrdr_accum accums[1/*dummy element*/]; /* Row ordered */
+    /* Simulate the flexible array member of the C99 standard. */
+    struct htrdr_accum accums[1/*dummy element*/];
   } data;
 };
 
-/* Overall work of a process */
+/* List of tile to compute onto the MPI process. */
 struct proc_work {
   struct mutex* mutex;
   struct darray_u32 tiles; /* #tiles to render */
@@ -86,7 +89,7 @@ tile_create(struct mem_allocator* allocator)
   struct tile* tile;
   const size_t tile_sz =
     sizeof(struct tile) - sizeof(struct htrdr_accum)/*rm dummy accum*/;
-  const size_t buf_sz =
+  const size_t buf_sz = /* Flexiblbe array element */
     TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/;
   ASSERT(allocator);
 
@@ -146,20 +149,21 @@ write_tile_data(struct htrdr_buffer* buf, const struct tile_data* tile_data)
   htrdr_buffer_get_layout(buf, &layout);
   buf_mem = htrdr_buffer_get_data(buf);
 
+  /* Compute the row/column of the tile origin into the buffer */
   icol = tile_data->x * (size_t)TILE_SIZE;
   irow = tile_data->y * (size_t)TILE_SIZE;
+
+  /* Define the number of tile row/columns to write into the buffer */
   ncols_tile = MMIN(icol + TILE_SIZE, layout.width)  - icol;
   nrows_tile = MMIN(irow + TILE_SIZE, layout.height) - irow;
 
+  /* Copy the tile data, row by row */
   FOR_EACH(irow_tile, 0, nrows_tile) {
     char* buf_row = buf_mem + (irow + irow_tile) * layout.pitch;
     const struct htrdr_accum* tile_row =
       tile_data->accums + irow_tile*TILE_SIZE*3/*#channels*/;
-
-    memcpy
-      (buf_row + icol*sizeof(struct htrdr_accum)*3,
-       tile_row,
-       ncols_tile*sizeof(struct htrdr_accum)*3/*#channels*/);
+    memcpy(buf_row + icol*sizeof(struct htrdr_accum)*3, tile_row,
+      ncols_tile*sizeof(struct htrdr_accum)*3/*#channels*/);
   }
 }
 
@@ -311,7 +315,7 @@ mpi_sample_working_process(struct htrdr* htrdr, struct ssp_rng* rng)
   iproc = (int)(ssp_rng_canonical(rng) * (double)htrdr->mpi_nworking_procs);
 
   /* Find the rank of the sampled active process. Use a simple linear search
-   * since the overall number of process should be quite low; at most few
+   * since the overall number of processes should be quite low; at most few
    * dozens.  */
   i = 0;
   FOR_EACH(dst_rank, 0, htrdr->mpi_nprocs) {
@@ -382,43 +386,54 @@ mpi_gather_tiles
    const size_t ntiles,
    struct list_node* tiles)
 {
+  /* Compute the size of the tile_data */
   const size_t msg_sz =
     sizeof(struct tile_data) - sizeof(struct htrdr_accum)/*dummy*/
   + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/;
-  struct list_node* node;
+
+  struct list_node* node = NULL;
   struct tile* tile = NULL;
   res_T res = RES_OK;
-  ASSERT(htrdr && buf && tiles);
+  ASSERT(htrdr && tiles);
+  ASSERT(htrdr->mpi_rank != 0 || buf);
   (void)ntiles;
 
-  if(htrdr->mpi_rank != 0) {
+  if(htrdr->mpi_rank != 0) { /* Non master process */
+    /* Send the computed tile to the master process */
     LIST_FOR_EACH(node, tiles) {
       struct tile* t = CONTAINER_OF(node, struct tile, node);
       MPI(Send(&t->data, (int)msg_sz, MPI_CHAR, 0,
         HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD));
     }
-  } else {
+  } else { /* Master process */
     size_t itile = 0;
 
-    tile = tile_create(htrdr->allocator);
-    if(!tile) {
-      res = RES_MEM_ERR;
-      htrdr_log_err(htrdr,
-        "could not allocate the temporary tile used to gather the process "
-        "output data -- %s.\n", res_to_cstr(res));
-      goto error;
-
-    }
     LIST_FOR_EACH(node, tiles) {
       struct tile* t = CONTAINER_OF(node, struct tile, node);
       write_tile_data(buf, &t->data);
       ++itile;
     }
 
-    FOR_EACH(itile, itile, ntiles) {
-      MPI(Recv(&tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE,
-        HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-      write_tile_data(buf, &tile->data);
+    if(itile != ntiles) {
+      ASSERT(htrdr->mpi_nprocs > 1);
+
+      /* Create a temporary tile to receive the tile data computed by the
+       * concurrent MPI processes */
+      tile = tile_create(htrdr->allocator);
+      if(!tile) {
+        res = RES_MEM_ERR;
+        htrdr_log_err(htrdr,
+          "could not allocate the temporary tile used to gather the process "
+          "output data -- %s.\n", res_to_cstr(res));
+        goto error;
+      }
+
+      /* Receive the tile data of the concurret MPI processes */
+      FOR_EACH(itile, itile, ntiles) {
+        MPI(Recv(&tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE,
+          HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        write_tile_data(buf, &tile->data);
+      }
     }
   }
 
@@ -524,13 +539,14 @@ static res_T
 draw_image
   (struct htrdr* htrdr,
    const struct htrdr_camera* cam,
+   const size_t width, /* Image width */
+   const size_t height, /* Image height */
    const size_t spp,
    const size_t ntiles_x,
    const size_t ntiles_y,
    const size_t ntiles_adjusted,
    const double pix_sz[2], /* Pixel size in the normalized image plane */
    struct proc_work* work,
-   const struct htrdr_buffer_layout* layout,
    struct list_node* tiles)
 {
   struct ssp_rng* rng_proc = NULL;
@@ -542,6 +558,7 @@ draw_image
   ATOMIC res = RES_OK;
   ASSERT(htrdr && cam && spp && ntiles_adjusted && work && tiles);
   ASSERT(pix_sz && pix_sz[0] > 0 && pix_sz[1] > 0);
+  ASSERT(width && height);
   (void)ntiles_x, (void)ntiles_y;
 
   res = ssp_rng_create(htrdr->allocator, &ssp_rng_mt19937_64, &rng_proc);
@@ -615,9 +632,13 @@ draw_image
     tile_org[1] *= TILE_SIZE;
 
     /* Compute the size of the tile clamped by the borders of the buffer */
-    tile_sz[0] = MMIN(TILE_SIZE, layout->width - tile_org[0]);
-    tile_sz[1] = MMIN(TILE_SIZE, layout->height - tile_org[1]);
+    tile_sz[0] = MMIN(TILE_SIZE, width - tile_org[0]);
+    tile_sz[1] = MMIN(TILE_SIZE, height - tile_org[1]);
 
+    /* Create a proxy RNG for the current tile. This proxy is used for the
+     * current thread only and thus it has to manage only one RNG. This proxy
+     * is initialised in order to ensure that a unique and predictable set of
+     * random numbers is used for the current tile. */
     SSP(rng_proxy_create2
       (&htrdr->lifo_allocators[ithread],
        &ssp_rng_threefry,
@@ -627,6 +648,7 @@ draw_image
        1, &rng_proxy));
     SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
 
+    /* Launch the tile rendering */
     res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
       pix_sz, cam, spp, rng, tile);
 
@@ -638,6 +660,7 @@ draw_image
       break;
     }
 
+    /* Update the progress status */
     n = (size_t)ATOMIC_INCR(&nsolved_tiles);
     pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/);
 
@@ -659,7 +682,7 @@ draw_image
   MPI(Ibarrier(MPI_COMM_WORLD, &req));
   mutex_unlock(htrdr->mpi_mutex);
 
-  /* Wait for synchronization */
+  /* Wait for processes synchronization */
   mpi_wait_for_request(htrdr, &req);
 
 exit:
@@ -676,44 +699,52 @@ res_T
 htrdr_draw_radiance_sw
   (struct htrdr* htrdr,
    const struct htrdr_camera* cam,
+   const size_t width,
+   const size_t height,
    const size_t spp,
    struct htrdr_buffer* buf)
 {
+  char strbuf[128];
+  struct time t0, t1;
   struct list_node tiles;
   size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted;
   size_t itile;
   struct proc_work work;
-  struct htrdr_buffer_layout layout;
+  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
   size_t proc_ntiles_adjusted;
   double pix_sz[2];
   ATOMIC probe_thieves = 1;
   ATOMIC res = RES_OK;
-  ASSERT(htrdr && cam && buf);
+  ASSERT(htrdr && cam && width && height);
+  ASSERT(htrdr->mpi_rank != 0 || buf);
 
   list_init(&tiles);
   proc_work_init(htrdr->allocator, &work);
 
-  htrdr_buffer_get_layout(buf, &layout);
-  ASSERT(layout.width || layout.height || layout.elmt_size);
-
-  if(layout.elmt_size != sizeof(struct htrdr_accum[3])/*#channels*/
-  || layout.alignment < ALIGNOF(struct htrdr_accum[3])) {
-    htrdr_log_err(htrdr,
-      "%s: invalid buffer layout. "
-      "The pixel size must be the size of 3 * accumulators.\n",
-      FUNC_NAME);
-    res = RES_BAD_ARG;
-    goto error;
+  if(htrdr->mpi_rank == 0) {
+    htrdr_buffer_get_layout(buf, &layout);
+    ASSERT(layout.width || layout.height || layout.elmt_size);
+    ASSERT(layout.width == width && layout.height == height);
+
+    if(layout.elmt_size != sizeof(struct htrdr_accum[3])/*#channels*/
+    || layout.alignment < ALIGNOF(struct htrdr_accum[3])) {
+      htrdr_log_err(htrdr,
+        "%s: invalid buffer layout. "
+        "The pixel size must be the size of 3 * accumulators.\n",
+        FUNC_NAME);
+      res = RES_BAD_ARG;
+      goto error;
+    }
   }
 
   /* Compute the overall number of tiles */
-  ntiles_x = (layout.width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
-  ntiles_y = (layout.height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
+  ntiles_x = (width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
+  ntiles_y = (height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
   ntiles = ntiles_x * ntiles_y;
 
   /* Compute the pixel size in the normalized image plane */
-  pix_sz[0] = 1.0 / (double)layout.width;
-  pix_sz[1] = 1.0 / (double)layout.height;
+  pix_sz[0] = 1.0 / (double)width;
+  pix_sz[1] = 1.0 / (double)height;
 
   /* Adjust the #tiles for the morton-encoding procedure */
   ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y));
@@ -746,6 +777,8 @@ htrdr_draw_radiance_sw
     print_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
   }
 
+  time_current(&t0);
+
   omp_set_nested(1); /* Enable nested threads for draw_image */
   #pragma omp parallel sections num_threads(2)
   {
@@ -754,8 +787,8 @@ htrdr_draw_radiance_sw
 
     #pragma omp section
     {
-      draw_image(htrdr, cam, spp, ntiles_x, ntiles_y, ntiles_adjusted, pix_sz,
-        &work, &layout, &tiles);
+      draw_image(htrdr, cam, width, height, spp, ntiles_x, ntiles_y,
+        ntiles_adjusted, pix_sz, &work, &tiles);
       /* The processes have no more work to do. Stop probing for thieves */
       ATOMIC_SET(&probe_thieves, 0);
     }
@@ -766,9 +799,17 @@ htrdr_draw_radiance_sw
     fprintf(stderr, "\n"); /* Add a new line after the progress statuses */
   }
 
+  time_sub(&t0, time_current(&t1), &t0);
+  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
+  htrdr_log(htrdr, "Rendering time: %s\n", strbuf);
+
   /* Gather accum buffers from the group of processes */
+  time_current(&t0);
   res = mpi_gather_tiles(htrdr, buf, ntiles, &tiles);
   if(res != RES_OK) goto error;
+  time_sub(&t0, time_current(&t1), &t0);
+  time_dump(&t0, TIME_ALL, NULL, strbuf, sizeof(strbuf));
+  htrdr_log(htrdr, "Image gathering time: %s\n", strbuf);
 
 exit:
   { /* Free allocated tiles */
diff --git a/src/htrdr_solve.h b/src/htrdr_solve.h
@@ -48,8 +48,11 @@ extern LOCAL_SYM res_T
 htrdr_draw_radiance_sw
   (struct htrdr* htrdr,
    const struct htrdr_camera* cam,
+   const size_t width, /* Image width */
+   const size_t height, /* Image height */
    const size_t spp, /* #samples per pixel, i.e. #realisations */
-   struct htrdr_buffer* buf); /* Buffer of struct htrdr_accum[3] */
+   /* Buffer of struct htrdr_accum[3]. May be NULL on on non master processes */
+   struct htrdr_buffer* buf); 
 
 extern LOCAL_SYM int
 htrdr_ground_filter

	htrdr Solving radiative transfer in heterogeneous media
	git clone git://git.meso-star.fr/htrdr.git
	Log \| Files \| Refs \| README \| LICENSE

M	src/htrdr.c	\|	34	++++++++++++++++------------------
M	src/htrdr.h	\|	2	++
M	src/htrdr_draw_radiance_sw.c	\|	141	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/htrdr_solve.h	\|	5	++++-