Send only the computed tile to the master process - htrdr - Solving radiative transfer in heterogeneous media

commit b47ba0173b5236c65c13abb7aa9fe6588aa89371
parent 9c3d5d0cec87f0ea316c2ec9247d3b149248cb3c
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Wed, 31 Oct 2018 17:47:13 +0100

Send only the computed tile to the master process

Do not gather the whole images of a process. Each process lists the set
of tiles that it computes and send them to the master process.

Diffstat:
M src/htrdr_c.h  | 3 ++-
M src/htrdr_draw_radiance_sw.c  | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------

2 files changed, 185 insertions(+), 83 deletions(-)
diff --git a/src/htrdr_c.h b/src/htrdr_c.h
@@ -28,7 +28,8 @@ enum htrdr_mpi_message {
   HTRDR_MPI_PROGRESS_BUILD_OCTREE,
   HTRDR_MPI_PROGRESS_RENDERING,
   HTRDR_MPI_STEAL_REQUEST,
-  HTRDR_MPI_WORK_STEALING
+  HTRDR_MPI_WORK_STEALING,
+  HTRDR_MPI_TILE_DATA
 };
 
 struct htrdr;
diff --git a/src/htrdr_draw_radiance_sw.c b/src/htrdr_draw_radiance_sw.c
@@ -33,9 +33,21 @@
 
 #define RNG_SEQUENCE_SIZE 100000
 
+#define TILE_MCODE_NULL UINT32_MAX
 #define TILE_SIZE 32 /* Definition in X & Y of a tile */
 STATIC_ASSERT(IS_POW2(TILE_SIZE), TILE_SIZE_must_be_a_power_of_2);
 
+struct tile {
+  struct list_node node;
+  struct mem_allocator* allocator;
+  ref_T ref;
+
+  struct tile_data {
+    uint16_t x, y; /* 2D coordinates of the tile in tile space */
+    struct htrdr_accum accums[1/*dummy element*/]; /* Row ordered */
+  } data;
+};
+
 /* Overall work of a process */
 struct proc_work {
   struct mutex* mutex;
@@ -43,8 +55,6 @@ struct proc_work {
   size_t itile; /* Next tile to render in the above list of tiles */
 };
 
-#define TILE_MCODE_NULL UINT32_MAX
-
 /*******************************************************************************
  * Helper functions
  ******************************************************************************/
@@ -70,6 +80,89 @@ morton2D_encode(const uint16_t u16)
   return u32;
 }
 
+static FINLINE struct tile*
+tile_create(struct mem_allocator* allocator)
+{
+  struct tile* tile;
+  const size_t tile_sz =
+    sizeof(struct tile) - sizeof(struct htrdr_accum)/*rm dummy accum*/;
+  const size_t buf_sz =
+    TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/;
+  ASSERT(allocator);
+
+  tile = MEM_ALLOC(allocator, tile_sz+buf_sz);
+  if(!tile) return NULL;
+
+  ref_init(&tile->ref);
+  list_init(&tile->node);
+  tile->allocator = allocator;
+  ASSERT(IS_ALIGNED(&tile->data.accums, ALIGNOF(struct htrdr_accum)));
+
+  return tile;
+}
+
+static INLINE void
+tile_ref_get(struct tile* tile)
+{
+  ASSERT(tile);
+  tile_ref_get(tile);
+}
+
+static INLINE void
+release_tile(ref_T* ref)
+{
+  struct tile* tile = CONTAINER_OF(ref, struct tile, ref);
+  ASSERT(ref);
+  MEM_RM(tile->allocator, tile);
+}
+
+static INLINE void
+tile_ref_put(struct tile* tile)
+{
+  ASSERT(tile);
+  ref_put(&tile->ref, release_tile);
+}
+
+static FINLINE struct htrdr_accum*
+tile_at
+  (struct tile* tile,
+   const size_t x, /* In tile space */
+   const size_t y) /* In tile space */
+{
+  ASSERT(tile && x < TILE_SIZE && y < TILE_SIZE);
+  return tile->data.accums + (y*TILE_SIZE + x)*3/*#channels*/;
+}
+
+static void
+write_tile_data(struct htrdr_buffer* buf, const struct tile_data* tile_data)
+{
+  struct htrdr_buffer_layout layout = HTRDR_BUFFER_LAYOUT_NULL;
+  size_t icol, irow;
+  size_t irow_tile;
+  size_t ncols_tile, nrows_tile;
+  char* buf_mem;
+  ASSERT(buf && tile_data);
+
+  htrdr_buffer_get_layout(buf, &layout);
+  buf_mem = htrdr_buffer_get_data(buf);
+
+  icol = tile_data->x * (size_t)TILE_SIZE;
+  irow = tile_data->y * (size_t)TILE_SIZE;
+  ncols_tile = MMIN(icol + TILE_SIZE, layout.width)  - icol;
+  nrows_tile = MMIN(irow + TILE_SIZE, layout.height) - irow;
+
+  FOR_EACH(irow_tile, 0, nrows_tile) {
+    char* buf_row = buf_mem + (irow + irow_tile) * layout.pitch;
+    const struct htrdr_accum* tile_row =
+      tile_data->accums + irow_tile*TILE_SIZE*3/*#channels*/;
+
+    memcpy
+      (buf_row + icol*sizeof(struct htrdr_accum)*3,
+       tile_row,
+       ncols_tile*sizeof(struct htrdr_accum)*3/*#channels*/);
+  }
+}
+
 static INLINE void
 proc_work_init(struct mem_allocator* allocator, struct proc_work* work)
 {
@@ -283,78 +376,54 @@ mpi_steal_work
 }
 
 static res_T
-mpi_gather_buffer
+mpi_gather_tiles
   (struct htrdr* htrdr,
-   struct htrdr_buffer* buf)
+   struct htrdr_buffer* buf,
+   const size_t ntiles,
+   struct list_node* tiles)
 {
-  struct htrdr_buffer_layout layout;
-  struct htrdr_accum* gathered_accums = NULL;
-  size_t x, y;
-  int iproc;
+  const size_t msg_sz =
+    sizeof(struct tile_data) - sizeof(struct htrdr_accum)/*dummy*/
+  + TILE_SIZE*TILE_SIZE*sizeof(struct htrdr_accum)*3/*#channels*/;
+  struct list_node* node;
+  struct tile* tile = NULL;
   res_T res = RES_OK;
-  ASSERT(htrdr && buf);
-
-  /* The process is alone. There is nothing to gather since all work was done
-   * by it */
-  if(htrdr->mpi_nprocs == 1)
-    goto exit;
-
-  /* Fetch the memory layout of the submitted buffer */
-  htrdr_buffer_get_layout(buf, &layout);
-  ASSERT(layout.elmt_size == sizeof(struct htrdr_accum) * 3/*#channels*/);
-  ASSERT(layout.alignment <= ALIGNOF(struct htrdr_accum));
+  ASSERT(htrdr && buf && tiles);
+  (void)ntiles;
+
+  if(htrdr->mpi_rank != 0) {
+    LIST_FOR_EACH(node, tiles) {
+      struct tile* t = CONTAINER_OF(node, struct tile, node);
+      MPI(Send(&t->data, (int)msg_sz, MPI_CHAR, 0,
+        HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD));
+    }
+  } else {
+    size_t itile = 0;
 
-  /* The process 0 allocates the memory used to store the gathered buffer lines
-   * of the MPI processes */
-  if(htrdr->mpi_rank == 0) {
-    gathered_accums = MEM_ALLOC
-      (htrdr->allocator, layout.pitch * (size_t)htrdr->mpi_nprocs);
-    if(!gathered_accums) {
+    tile = tile_create(htrdr->allocator);
+    if(!tile) {
       res = RES_MEM_ERR;
       htrdr_log_err(htrdr,
-        "could not allocate the temporary memory for MPI gathering -- %s.\n",
-        res_to_cstr(res));
+        "could not allocate the temporary tile used to gather the process "
+        "output data -- %s.\n", res_to_cstr(res));
       goto error;
-    }
-  }
 
-  FOR_EACH(y, 0, layout.height) {
-    struct htrdr_accum* buf_row_accums = (struct htrdr_accum*)
-      ((char*)htrdr_buffer_get_data(buf) + y * layout.pitch);
-    int err; /* MPI error */
-
-    /* Gather the buffer lines */
-    mutex_lock(htrdr->mpi_mutex);
-    err = MPI_Gather(buf_row_accums, (int)layout.pitch, MPI_CHAR, gathered_accums,
-      (int)layout.pitch, MPI_CHAR, 0, MPI_COMM_WORLD);
-    mutex_unlock(htrdr->mpi_mutex);
-    if(err != MPI_SUCCESS) {
-      htrdr_log_err(htrdr,
-        "could not gather the buffer line `%lu' from the group of processes -- "
-        "%s.\n",
-        (unsigned long)y, htrdr_mpi_error_string(htrdr, err));
-      res = RES_UNKNOWN_ERR;
-      goto error;
+    }
+    LIST_FOR_EACH(node, tiles) {
+      struct tile* t = CONTAINER_OF(node, struct tile, node);
+      write_tile_data(buf, &t->data);
+      ++itile;
     }
 
-    /* Accumulates the gathered lines into the buffer of the process 0 */
-    if(htrdr->mpi_rank == 0) {
-      memset(buf_row_accums, 0, layout.pitch);
-      FOR_EACH(iproc, 0, htrdr->mpi_nprocs) {
-        struct htrdr_accum* proc_accums = (struct htrdr_accum*)
-          ((char*)gathered_accums + (size_t)iproc * layout.pitch);
-        FOR_EACH(x, 0, layout.width * 3/*#channels*/) {
-          buf_row_accums[x].sum_weights += proc_accums[x].sum_weights;
-          buf_row_accums[x].sum_weights_sqr += proc_accums[x].sum_weights_sqr;
-          buf_row_accums[x].nweights += proc_accums[x].nweights;
-          buf_row_accums[x].nfailures += proc_accums[x].nfailures;
-        }
-      }
+    FOR_EACH(itile, itile, ntiles) {
+      MPI(Recv(&tile->data, (int)msg_sz, MPI_CHAR, MPI_ANY_SOURCE,
+        HTRDR_MPI_TILE_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+      write_tile_data(buf, &tile->data);
     }
   }
 
 exit:
-  if(gathered_accums) MEM_RM(htrdr->allocator, gathered_accums);
+  if(tile) tile_ref_put(tile);
   return res;
 error:
   goto exit;
@@ -371,11 +440,11 @@ draw_tile
    const struct htrdr_camera* cam,
    const size_t spp, /* #samples per pixel */
    struct ssp_rng* rng,
-   struct htrdr_buffer* buf)
+   struct tile* tile)
 {
   size_t npixels;
   size_t mcode; /* Morton code of tile pixel */
-  ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && buf);
+  ASSERT(htrdr && tile_org && tile_sz && pix_sz && cam && spp && tile);
   (void)tile_mcode;
   /* Adjust the #pixels to process them wrt a morton order */
   npixels = round_up_pow2(MMAX(tile_sz[0], tile_sz[1]));
@@ -392,13 +461,13 @@ draw_tile
     ipix_tile[1] = morton2D_decode((uint32_t)(mcode>>1));
     if(ipix_tile[1] >= tile_sz[1]) continue; /* Pixel is out of tile */
 
+    /* Fetch and reset the pixel accumulator */
+    pix_accums = tile_at(tile, ipix_tile[0], ipix_tile[1]);
+
     /* Compute the pixel coordinate */
     ipix[0] = tile_org[0] + ipix_tile[0];
     ipix[1] = tile_org[1] + ipix_tile[1];
 
-    /* Fetch and reset the pixel accumulator */
-    pix_accums = htrdr_buffer_at(buf, ipix[0], ipix[1]);
-
     FOR_EACH(ichannel, 0, 3) {
       size_t isamp;
       pix_accums[ichannel] = HTRDR_ACCUM_NULL;
@@ -459,19 +528,20 @@ draw_image
    const size_t ntiles_x,
    const size_t ntiles_y,
    const size_t ntiles_adjusted,
+   const double pix_sz[2], /* Pixel size in the normalized image plane */
    struct proc_work* work,
-   struct htrdr_buffer* buf)
+   const struct htrdr_buffer_layout* layout,
+   struct list_node* tiles)
 {
-  struct htrdr_buffer_layout layout;
   struct ssp_rng* rng_proc = NULL;
   MPI_Request req;
-  double pix_sz[2];
   size_t nthreads = 0;
   size_t nthieves = 0;
   size_t proc_ntiles = 0;
   ATOMIC nsolved_tiles = 0;
   ATOMIC res = RES_OK;
-  ASSERT(htrdr && cam && spp && ntiles_adjusted && work && buf);
+  ASSERT(htrdr && cam && spp && ntiles_adjusted && work && tiles);
+  ASSERT(pix_sz && pix_sz[0] > 0 && pix_sz[1] > 0);
   (void)ntiles_x, (void)ntiles_y;
 
   res = ssp_rng_create(htrdr->allocator, &ssp_rng_mt19937_64, &rng_proc);
@@ -481,11 +551,6 @@ draw_image
     goto error;
   }
 
-  /* Compute the size of a pixel in the normalized image plane */
-  htrdr_buffer_get_layout(buf, &layout);
-  pix_sz[0] = 1.0 / (double)layout.width;
-  pix_sz[1] = 1.0 / (double)layout.height;
-
   proc_ntiles = proc_work_get_ntiles(work);
   nthreads = MMIN(htrdr->nthreads, proc_ntiles);
 
@@ -499,7 +564,8 @@ draw_image
     const int ithread = omp_get_thread_num();
     struct ssp_rng_proxy* rng_proxy = NULL;
     struct ssp_rng* rng;
-    uint32_t mcode;
+    struct tile* tile;
+    uint32_t mcode = TILE_MCODE_NULL;
     size_t tile_org[2];
     size_t tile_sz[2];
     size_t n;
@@ -526,13 +592,31 @@ draw_image
     tile_org[1] = morton2D_decode((uint32_t)(mcode>>1));
     ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y);
 
+    /* Create the tile */
+    tile = tile_create(htrdr->allocator);
+    if(!tile) {
+      ATOMIC_SET(&res, RES_MEM_ERR);
+      htrdr_log_err(htrdr,
+        "could not allocate the memory space of the tile (%lu, %lu) -- %s.\n",
+        (unsigned long)tile_org[0], (unsigned long)tile_org[1],
+         res_to_cstr((res_T)ATOMIC_GET(&res)));
+      break;
+    }
+
+    /* Register the tile */
+    #pragma omp critical
+    list_add_tail(tiles, &tile->node);
+
+    tile->data.x = (uint16_t)tile_org[0];
+    tile->data.y = (uint16_t)tile_org[1];
+
     /* Define the tile origin in pixel space */
     tile_org[0] *= TILE_SIZE;
     tile_org[1] *= TILE_SIZE;
 
     /* Compute the size of the tile clamped by the borders of the buffer */
-    tile_sz[0] = MMIN(TILE_SIZE, layout.width - tile_org[0]);
-    tile_sz[1] = MMIN(TILE_SIZE, layout.height - tile_org[1]);
+    tile_sz[0] = MMIN(TILE_SIZE, layout->width - tile_org[0]);
+    tile_sz[1] = MMIN(TILE_SIZE, layout->height - tile_org[1]);
 
     SSP(rng_proxy_create2
       (&htrdr->lifo_allocators[ithread],
@@ -544,7 +628,7 @@ draw_image
     SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
 
     res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
-      pix_sz, cam, spp, rng, buf);
+      pix_sz, cam, spp, rng, tile);
 
     SSP(rng_proxy_ref_put(rng_proxy));
     SSP(rng_ref_put(rng));
@@ -595,15 +679,18 @@ htrdr_draw_radiance_sw
    const size_t spp,
    struct htrdr_buffer* buf)
 {
-  size_t ntiles_x, ntiles_y, ntiles_adjusted;
+  struct list_node tiles;
+  size_t ntiles_x, ntiles_y, ntiles, ntiles_adjusted;
   size_t itile;
   struct proc_work work;
   struct htrdr_buffer_layout layout;
   size_t proc_ntiles_adjusted;
+  double pix_sz[2];
   ATOMIC probe_thieves = 1;
   ATOMIC res = RES_OK;
   ASSERT(htrdr && cam && buf);
 
+  list_init(&tiles);
   proc_work_init(htrdr->allocator, &work);
 
   htrdr_buffer_get_layout(buf, &layout);
@@ -622,6 +709,11 @@ htrdr_draw_radiance_sw
   /* Compute the overall number of tiles */
   ntiles_x = (layout.width + (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
   ntiles_y = (layout.height+ (TILE_SIZE-1)/*ceil*/)/TILE_SIZE;
+  ntiles = ntiles_x * ntiles_y;
+
+  /* Compute the pixel size in the normalized image plane */
+  pix_sz[0] = 1.0 / (double)layout.width;
+  pix_sz[1] = 1.0 / (double)layout.height;
 
   /* Adjust the #tiles for the morton-encoding procedure */
   ntiles_adjusted = round_up_pow2(MMAX(ntiles_x, ntiles_y));
@@ -662,8 +754,8 @@ htrdr_draw_radiance_sw
 
     #pragma omp section
     {
-      draw_image(htrdr, cam, spp, ntiles_x, ntiles_y, ntiles_adjusted, &work,
-        buf);
+      draw_image(htrdr, cam, spp, ntiles_x, ntiles_y, ntiles_adjusted, pix_sz,
+        &work, &layout, &tiles);
       /* The processes have no more work to do. Stop probing for thieves */
       ATOMIC_SET(&probe_thieves, 0);
     }
@@ -675,10 +767,19 @@ htrdr_draw_radiance_sw
   }
 
   /* Gather accum buffers from the group of processes */
-  res = mpi_gather_buffer(htrdr, buf);
+  res = mpi_gather_tiles(htrdr, buf, ntiles, &tiles);
   if(res != RES_OK) goto error;
 
 exit:
+  { /* Free allocated tiles */
+    struct list_node* node;
+    struct list_node* tmp;
+    LIST_FOR_EACH_SAFE(node, tmp, &tiles) {
+      struct tile* tile = CONTAINER_OF(node, struct tile, node);
+      list_del(node);
+      tile_ref_put(tile);
+    }
+  }
   proc_work_release(&work);
   return (res_T)res;
 error:

	htrdr Solving radiative transfer in heterogeneous media
	git clone git://git.meso-star.com/htrdr.git
	Log \| Files \| Refs \| README \| LICENSE

M	src/htrdr_c.h	\|	3	++-
M	src/htrdr_draw_radiance_sw.c	\|	265	++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------