From ebcf1e0101e7fcf7495b1f0e3f460b2bcc4ad524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20Donval?= Date: Thu, 5 Oct 2023 10:56:10 +0100 Subject: [PATCH] Make use of the full range of AUX matrices The gist of it is that the signature `mr2d_malloc(Int n)` bounds the internal `malloc` argument to `Int`, which is ScaLAPACK's indexing type. This is not how `malloc` is defined and means that the maximum AUX matrix size is artificially limited to 2GB on 64-bit systems. This PR applies the transformation `mr2d_malloc(Int n) -> mr2d_malloc(size_t n)`, while ensuring that the `Int -> size_t` conversion never involves negative `Int` values and that no 64-bit sizes are passed to `malloc` on 32-bit systems. The main advantage on 64-bit systems is that the **full** _signed_ 32-bit indexing range can be used instead of `range / element size`: e.g. the max AUX matrix size is now 16GB instead of the previous 2GB. With this, "standard" 32-bit ScaLAPACK and BLAS/LAPACK can still be used in programs like GPAW. Details (and the full commit history) are in https://github.com/Reference-ScaLAPACK/scalapack/pull/85, which does not seem to be receiving much attention. This patch is a way to provide that feature to conda users in the meantime. --- recipe/high_mem_32bits.patch | 617 +++++++++++++++++++++++++++++++++++ recipe/meta.yaml | 4 +- 2 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 recipe/high_mem_32bits.patch diff --git a/recipe/high_mem_32bits.patch b/recipe/high_mem_32bits.patch new file mode 100644 index 0000000..b047f24 --- /dev/null +++ b/recipe/high_mem_32bits.patch @@ -0,0 +1,617 @@ +diff --git a/REDIST/SRC/pcgemr.c b/REDIST/SRC/pcgemr.c +index bd6de2f..d3a1482 100644 +--- a/REDIST/SRC/pcgemr.c ++++ b/REDIST/SRC/pcgemr.c +@@ -342,7 +342,7 @@ Cpcgemr2d(m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -467,10 +467,10 @@ Cpcgemr2d(m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + it.
Then for each processor, we compute the size of message to be +@@ -570,7 +570,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -706,7 +706,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pcgemr2.c b/REDIST/SRC/pcgemr2.c +index ce5370d..c6b20c8 100644 +--- a/REDIST/SRC/pcgemr2.c ++++ b/REDIST/SRC/pcgemr2.c +@@ -121,7 +121,7 @@ setmemory(complex **adpointer, Int blocksize) + return; + } + *adpointer = (complex *) mr2d_malloc( +- blocksize * sizeof(complex)); ++ (size_t)blocksize * sizeof(complex)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pctrmr.c b/REDIST/SRC/pctrmr.c +index 0122528..1516690 100644 +--- a/REDIST/SRC/pctrmr.c ++++ b/REDIST/SRC/pctrmr.c +@@ -359,7 +359,7 @@ Cpctrmr2d(uplo, diag, m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -484,10 +484,10 @@ Cpctrmr2d(uplo, diag, m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -592,7 +592,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -664,7 +664,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pctrmr2.c b/REDIST/SRC/pctrmr2.c +index 99e34f9..0e6d04b 100644 +--- a/REDIST/SRC/pctrmr2.c ++++ b/REDIST/SRC/pctrmr2.c +@@ -121,7 +121,7 @@ setmemory(complex **adpointer, Int blocksize) + return; + } + *adpointer = (complex *) mr2d_malloc( +- blocksize * sizeof(complex)); ++ (size_t)blocksize * sizeof(complex)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pdgemr.c b/REDIST/SRC/pdgemr.c +index 706d7b2..2cc2d01 100644 +--- a/REDIST/SRC/pdgemr.c ++++ b/REDIST/SRC/pdgemr.c +@@ -339,7 +339,7 @@ Cpdgemr2d(m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -464,10 +464,10 @@ Cpdgemr2d(m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -569,7 +569,7 @@ init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -713,7 +713,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pdgemr2.c b/REDIST/SRC/pdgemr2.c +index b6498cf..a030570 100644 +--- a/REDIST/SRC/pdgemr2.c ++++ b/REDIST/SRC/pdgemr2.c +@@ -118,7 +118,7 @@ setmemory(double **adpointer, Int blocksize) + return; + } + *adpointer = (double *) mr2d_malloc( +- blocksize * sizeof(double)); ++ (size_t)blocksize * sizeof(double)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pdtrmr.c b/REDIST/SRC/pdtrmr.c +index 7cf8d01..7381652 100644 +--- a/REDIST/SRC/pdtrmr.c ++++ b/REDIST/SRC/pdtrmr.c +@@ -356,7 +356,7 @@ Cpdtrmr2d(uplo, diag, m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -481,10 +481,10 @@ Cpdtrmr2d(uplo, diag, m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -589,7 +589,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -661,7 +661,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pdtrmr2.c b/REDIST/SRC/pdtrmr2.c +index 65e970f..c35085f 100644 +--- a/REDIST/SRC/pdtrmr2.c ++++ b/REDIST/SRC/pdtrmr2.c +@@ -118,7 +118,7 @@ setmemory(double **adpointer, Int blocksize) + return; + } + *adpointer = (double *) mr2d_malloc( +- blocksize * sizeof(double)); ++ (size_t)blocksize * sizeof(double)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pgemraux.c b/REDIST/SRC/pgemraux.c +index 17dc301..404af7a 100644 +--- a/REDIST/SRC/pgemraux.c ++++ b/REDIST/SRC/pgemraux.c +@@ -104,11 +104,12 @@ extern void Cpigemr2d(); + #include + #include + #include ++const size_t NEGFLAG = ~( ((size_t)-1) >> 1); + void * +-mr2d_malloc(Int n) ++mr2d_malloc(size_t n) + { + void *ptr; +- assert(n > 0); ++ assert((n & NEGFLAG) == 0); + ptr = (void *) malloc(n); + if (ptr == NULL) { + fprintf(stderr, "xxmr2d:out of memory\n"); +diff --git a/REDIST/SRC/pigemr.c b/REDIST/SRC/pigemr.c +index e9e0f99..ddcaf57 100644 +--- a/REDIST/SRC/pigemr.c ++++ b/REDIST/SRC/pigemr.c +@@ -339,7 +339,7 @@ Cpigemr2d(m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -464,10 +464,10 @@ Cpigemr2d(m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -567,7 +567,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -703,7 +703,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pigemr2.c b/REDIST/SRC/pigemr2.c +index 0e6d11d..4d3f9f9 100644 +--- a/REDIST/SRC/pigemr2.c ++++ b/REDIST/SRC/pigemr2.c +@@ -118,7 +118,7 @@ setmemory(Int **adpointer, Int blocksize) + return; + } + *adpointer = (Int *) mr2d_malloc( +- blocksize * sizeof(Int)); ++ (size_t)blocksize * sizeof(Int)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pitrmr.c b/REDIST/SRC/pitrmr.c +index 043c37f..65acaf0 100644 +--- a/REDIST/SRC/pitrmr.c ++++ b/REDIST/SRC/pitrmr.c +@@ -356,7 +356,7 @@ Cpitrmr2d(uplo, diag, m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -481,10 +481,10 @@ Cpitrmr2d(uplo, diag, m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)(ma->nbcol) * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -589,7 +589,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -661,7 +661,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pitrmr2.c b/REDIST/SRC/pitrmr2.c +index a86f207..ac36cfc 100644 +--- a/REDIST/SRC/pitrmr2.c ++++ b/REDIST/SRC/pitrmr2.c +@@ -118,7 +118,7 @@ setmemory(Int **adpointer, Int blocksize) + return; + } + *adpointer = (Int *) mr2d_malloc( +- blocksize * sizeof(Int)); ++ (size_t)blocksize * sizeof(Int)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/psgemr.c b/REDIST/SRC/psgemr.c +index 6e053bf..4e12e70 100644 +--- a/REDIST/SRC/psgemr.c ++++ b/REDIST/SRC/psgemr.c +@@ -339,7 +339,7 @@ Cpsgemr2d(m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -464,10 +464,10 @@ Cpsgemr2d(m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -567,7 +567,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -703,7 +703,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/psgemr2.c b/REDIST/SRC/psgemr2.c +index 07b3568..55442db 100644 +--- a/REDIST/SRC/psgemr2.c ++++ b/REDIST/SRC/psgemr2.c +@@ -118,7 +118,7 @@ setmemory(float **adpointer, Int blocksize) + return; + } + *adpointer = (float *) mr2d_malloc( +- blocksize * sizeof(float)); ++ (size_t)blocksize * sizeof(float)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pstrmr.c b/REDIST/SRC/pstrmr.c +index e63b93f..89ed468 100644 +--- a/REDIST/SRC/pstrmr.c ++++ b/REDIST/SRC/pstrmr.c +@@ -350,7 +350,7 @@ Cpstrmr2d(char *uplo, char *diag, Int m, Int n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -475,10 +475,10 @@ Cpstrmr2d(char *uplo, char *diag, Int m, Int n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -583,7 +583,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -655,7 +655,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pstrmr2.c b/REDIST/SRC/pstrmr2.c +index dcec2b5..7b59761 100644 +--- a/REDIST/SRC/pstrmr2.c ++++ b/REDIST/SRC/pstrmr2.c +@@ -118,7 +118,7 @@ setmemory(float **adpointer, Int blocksize) + return; + } + *adpointer = (float *) mr2d_malloc( +- blocksize * sizeof(float)); ++ (size_t)blocksize * sizeof(float)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pzgemr.c b/REDIST/SRC/pzgemr.c +index 4b2f014..6047599 100644 +--- a/REDIST/SRC/pzgemr.c ++++ b/REDIST/SRC/pzgemr.c +@@ -342,7 +342,7 @@ Cpzgemr2d(m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -467,10 +467,10 @@ Cpzgemr2d(m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -570,7 +570,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -706,7 +706,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pzgemr2.c b/REDIST/SRC/pzgemr2.c +index 979748e..862fdff 100644 +--- a/REDIST/SRC/pzgemr2.c ++++ b/REDIST/SRC/pzgemr2.c +@@ -121,7 +121,7 @@ setmemory(dcomplex **adpointer, Int blocksize) + return; + } + *adpointer = (dcomplex *) mr2d_malloc( +- blocksize * sizeof(dcomplex)); ++ (size_t)blocksize * sizeof(dcomplex)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ +diff --git a/REDIST/SRC/pztrmr.c b/REDIST/SRC/pztrmr.c +index 2de759a..a0c18d8 100644 +--- a/REDIST/SRC/pztrmr.c ++++ b/REDIST/SRC/pztrmr.c +@@ -341,6 +341,7 @@ Cpztrmr2d(uplo, diag, m, n, + Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); + gcontext = globcontext; + nprocs = nprow * npcol; ++ assert (nprocs > 0); + /* if the global context that is given to us has not the shape of a line + * (nprow != 1), create a new context. TODO: to be optimal, we should + * avoid this because it is an uncessary synchronisation */ +@@ -359,7 +360,7 @@ Cpztrmr2d(uplo, diag, m, n, + assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); + /* exchange the missing parameters among the processors: shape of grids and + * location of the processors */ +- param = (Int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(Int)); ++ param = (Int *) mr2d_malloc(3 * ((size_t)nprocs * 2 + NBPARAM) * sizeof(Int)); + ra = param + nprocs * 2 + NBPARAM; + ca = param + (nprocs * 2 + NBPARAM) * 2; + for (i = 0; i < nprocs * 2 + NBPARAM; i++) +@@ -484,10 +485,10 @@ Cpztrmr2d(uplo, diag, m, n, + /* allocing room for the tabs, alloc for the worst case,local_n or local_m + * intervals, in fact the worst case should be less, perhaps half that,I + * should think of that one day. */ +- h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * +- ma->nbcol * sizeof(IDESC)); +- v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) +- * ma->nbrow * sizeof(IDESC)); ++ h_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->n, q0 * ma->nbcol)) * ++ (size_t)ma->nbcol * sizeof(IDESC)); ++ v_inter = (IDESC *) mr2d_malloc((size_t)(DIVUP(ma->m, p0 * ma->nbrow)) ++ * (size_t)ma->nbrow * sizeof(IDESC)); + /* We go for the scanning of indices. For each processor including mypnum, + * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send + * it. 
Then for each processor, we compute the size of message to be +@@ -592,7 +593,7 @@ init_chenille(Int mypnum, Int nprocs, Int n0, Int *proc0, Int n1, Int *proc1, In + Int ns, nr, i, tot; + Int *sender, *recver, *g0, *g1; + tot = max(n0, n1); +- sender = (Int *) mr2d_malloc((nprocs + tot) * sizeof(Int) * 2); ++ sender = (Int *) mr2d_malloc((size_t)(nprocs + tot) * sizeof(Int) * 2); + recver = sender + tot; + *psend = sender; + *precv = recver; +@@ -664,7 +665,7 @@ gridreshape(Int *ctxtp) + Int i, j; + ori = *ctxtp; + Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); +- usermap = mr2d_malloc(sizeof(Int) * nprow * npcol); ++ usermap = mr2d_malloc(sizeof(Int) * (size_t)nprow * (size_t)npcol); + for (i = 0; i < nprow; i++) + for (j = 0; j < npcol; j++) { + usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); +diff --git a/REDIST/SRC/pztrmr2.c b/REDIST/SRC/pztrmr2.c +index c75abce..29f2290 100644 +--- a/REDIST/SRC/pztrmr2.c ++++ b/REDIST/SRC/pztrmr2.c +@@ -121,7 +121,7 @@ setmemory(dcomplex **adpointer, Int blocksize) + return; + } + *adpointer = (dcomplex *) mr2d_malloc( +- blocksize * sizeof(dcomplex)); ++ (size_t) blocksize * sizeof(dcomplex)); + } + /******************************************************************/ + /* Free the memory space after the malloc */ diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 52d3def..5bcbd8e 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -14,9 +14,10 @@ source: sha256: {{ sha256 }} patches: - fortran_mangling_cross.patch + - high_mem_32bits.patch build: - number: 3 + number: 4 skip: true # [win] run_exports: - {{ pin_subpackage("scalapack", max_pin="x.x") }} @@ -59,3 +60,4 @@ extra: - davidbrochart - SylvainCorlay - traversaro + - gdonval
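
Note for reviewers (commentary only, not part of the patch above): the hunks all follow one pattern, sketched below as a minimal standalone C program. The `checked_mr2d_malloc` name, the `Int` typedef and the simplified error handling are assumptions made for this example; only the `NEGFLAG` mask and the call-site `(size_t)` cast mirror the actual changes. The allocator now takes a `size_t`, rejects any value whose top bit is set (i.e. anything that was a negative `Int` before conversion, or that exceeds half the `size_t` range), and call sites cast one operand to `size_t` so the byte count is computed in `size_t` arithmetic rather than in the 32-bit `Int` type.

```c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

typedef int Int;   /* stand-in for ScaLAPACK's 32-bit indexing type */

/* Most-significant bit of size_t: set for any value that would be
 * negative if reinterpreted as a signed integer of the same width. */
static const size_t NEGFLAG = ~(((size_t)-1) >> 1);

/* Illustrative counterpart of the patched mr2d_malloc(size_t n);
 * error handling is simplified, this is not the ScaLAPACK code itself. */
static void *checked_mr2d_malloc(size_t n)
{
    void *ptr;
    /* Catches sizes coming from a negative Int (the sign bit survives the
     * Int -> size_t conversion) and sizes above half the size_t range. */
    assert((n & NEGFLAG) == 0);
    ptr = malloc(n);
    if (ptr == NULL) {
        fprintf(stderr, "xxmr2d:out of memory\n");
        exit(EXIT_FAILURE);
    }
    return ptr;
}

int main(void)
{
    Int nelem = 1000 * 1000;   /* element count held in the 32-bit Int type */

    /* Cast one operand before multiplying: the byte count is computed in
     * size_t, so on 64-bit systems counts up to 2^31 - 1 doubles (~16GB)
     * no longer overflow the Int type. */
    double *buf = checked_mr2d_malloc((size_t)nelem * sizeof(double));

    buf[0] = 1.0;
    buf[nelem - 1] = 2.0;
    free(buf);
    return 0;
}
```

Casting a single operand is enough because the usual arithmetic conversions then carry out the whole multiplication (and any subsequent additions) in `size_t`.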