diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 3851bb41..86c6f784 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -16,7 +16,7 @@ jobs:
     - name: Install libraries
       run: |
         sudo apt-get update
-        sudo apt-get install libfftw3-dev libcunit1-dev liboctave-dev julia
+        sudo apt-get install libfftw3-dev libcunit1-dev liboctave-dev doxygen-latex graphviz
     - name: bootstrap
       run: ./bootstrap.sh
     - name: configure
diff --git a/README.md b/README.md
index 906ee997..00bacf23 100644
--- a/README.md
+++ b/README.md
@@ -37,16 +37,16 @@ Some examples for application of these transforms are provided:
 Detailed API documentation in HTML format can be found in
 `doc/html/index.html`, if you are working from a release tarball.
 When working from a source repository, the documentation can be
-generated with Doxygen.
+generated with Doxygen (which requires the `doxygen-latex` and `perl` packages):
 ```
 make doc
 ```
 
 Building
 --------
-The NFFT depends on the [FFTW](https://fftw.org) library, which is available for many Linux distros, Homebrew on macOS and MSYS2 on Windows. If you compile the FFTW yourself, it should be configured with the flag `--enable-shared`.
+The NFFT depends on the [FFTW](https://fftw.org) library, which is available for many Linux distros, Homebrew on macOS and MSYS2 on Windows. If you compile the FFTW yourself, it should be configured with the flag `--enable-shared` (and `--enable-threads` for the multi-threaded version). Building the NFFT requires `make` and a C compiler such as `gcc`.
 
-When working from a source repository, you need to run libtoolize and autoreconf first. A bash script to do this is provided.
+When working from a source repository, you need to run libtoolize and autoreconf first. A bash script to do this is provided. This step requires the tools `autoconf`, `automake` and `libtool`.
 ```
 ./bootstrap.sh
 ```
@@ -65,7 +65,7 @@ Here are some useful optional flags for `./configure`:
 * `--enable-all` specifies that all modules should be compiled,
 * `--enable-openmp` enables the multicore support and
 * `--enable-julia` specifies that the julia interface will be compiled.
-* `--with-matlab=/path/to/matlab` specifies a path of Matlab, and
+* `--with-matlab=/path/to/matlab` specifies the path of a Matlab installation, and
 * `--with-octave=/path/to/octave` does the same for GNU Octave.
 * For a list of all available options, run `./configure --help`.
 
@@ -86,15 +86,25 @@ make install
 
 Citing
 ------
-The most current general paper, the one that we recommend if you wish to cite NFFT, is *Keiner, J., Kunis, S., and Potts, D.
+The current general paper, the one that we recommend if you wish to cite NFFT, is *Keiner, J., Kunis, S., and Potts, D.
 ''Using NFFT 3 - a software library for various nonequispaced fast Fourier transforms''
-ACM Trans. Math. Software,36, Article 19, 1-30, 2009*.
+ACM Trans. Math. Software 36, Article 19, 1-30, 2009*. BibTeX entry:
+```
+@article{KeKuPo09,
+ author = {Jens Keiner and Stefan Kunis and Daniel Potts},
+ title = {Using {NFFT3} - a Software Library for Various Nonequispaced Fast {Fourier} Transforms},
+ journal = {{ACM} Trans. Math. Software},
+ year = {2009},
+ volume = {36},
+ pages = {Article 19, 1--30},
+ doi = {10.1145/1555386.1555388}}
+```
 
 Feedback
 --------
 Your comments are welcome! This is the third version of the library and may
 not be as robust or well documented as it should be. Please keep track of bugs
-or missing/confusing instructions and report them to
+or missing/confusing instructions and report them in our issue tracker or directly to
 [Daniel Potts](mailto:potts@mathematik.tu-chemnitz.de).
 The postal address is
 
diff --git a/applications/fastsum/fastsum.m b/applications/fastsum/fastsum.m
index 56c534ce..3aefc435 100644
--- a/applications/fastsum/fastsum.m
+++ b/applications/fastsum/fastsum.m
@@ -48,7 +48,9 @@
 %   'cot'                     K(x) = cot(cx)
 %   'one_over_cube'           K(x) = 1/x^3
 %   'laplacian_rbf'           K(x) = EXP(-|x|/c)
+%   'der_laplacian_rbf'       K(x) = |x|/c EXP(-|x|/c)
 %   'xx_gaussian'             K(x) = x^2/c^2 EXP(-x^2/c^2)
+%   'absx'                    K(x) = |x|
 %
 % Markus Fenn, 2006.
 
diff --git a/applications/fastsum/fastsum_matlab.c b/applications/fastsum/fastsum_matlab.c
index 522ef6d5..952cb91f 100644
--- a/applications/fastsum/fastsum_matlab.c
+++ b/applications/fastsum/fastsum_matlab.c
@@ -122,8 +122,12 @@ int main(int argc, char **argv)
       kernel = log_sin;
     else if (strcmp(s, "laplacian_rbf") == 0)
       kernel = laplacian_rbf;
+    else if (strcmp(s, "der_laplacian_rbf") == 0)
+      kernel = der_laplacian_rbf;
     else if (strcmp(s, "xx_gaussian") == 0)
       kernel = xx_gaussian;
+    else if (strcmp(s, "absx") == 0)
+      kernel = absx;
     else
     {
       printf("Unrecognized kernel function!\n");
diff --git a/applications/fastsum/fastsum_test.c b/applications/fastsum/fastsum_test.c
index 467202e1..a9a0ea99 100644
--- a/applications/fastsum/fastsum_test.c
+++ b/applications/fastsum/fastsum_test.c
@@ -124,8 +124,12 @@ int main(int argc, char **argv)
       kernel = log_sin;
     else if (strcmp(s, "laplacian_rbf") == 0)
       kernel = laplacian_rbf;
+    else if (strcmp(s, "der_laplacian_rbf") == 0)
+      kernel = der_laplacian_rbf;
     else if (strcmp(s, "xx_gaussian") == 0)
       kernel = xx_gaussian;
+    else if (strcmp(s, "absx") == 0)
+      kernel = absx;
     else
     {
       s = "multiquadric";
diff --git a/applications/fastsum/kernels.c b/applications/fastsum/kernels.c
index 34b3ca90..de27b90a 100644
--- a/applications/fastsum/kernels.c
+++ b/applications/fastsum/kernels.c
@@ -431,6 +431,22 @@ C laplacian_rbf(R x, int der, const R *param)    /* K(x)=EXP(-|x|/c) */
   return value;
 }
 
+C der_laplacian_rbf(R x, int der, const R *param)    /* K(x)=|x|/c EXP(-|x|/c); der selects the derivative order (0 = kernel value) */
+{
+  R c = param[0];    /* kernel parameter c, read from the first entry of param */
+  R value = K(0.0);
+
+  switch (der)
+  {
+    case  0 : value = (FABS(x)/c)*EXP(-FABS(x)/c); break;    /* the kernel itself */
+    default:    /* any der != 0: (-1)^der (|x| - der*c)/c^(der+1) EXP(-|x|/c) */
+        value = (POW(K(-1.0),(R)der))*((FABS(x)-(R)der*c)/POW(c,(R)der+1))*EXP(-FABS(x)/c);
+        value *= 1 - 2 * ((x < K(0.0)) && (der % 2));    /* negate the result when x < 0 and der is odd */
+  }
+
+  return value;
+}
+
 C xx_gaussian(R x, int der, const R *param)    /* K(x)=x^2/c^2 EXP(-x^2/c^2) */
 {
   R c = param[0];
@@ -457,6 +473,22 @@ C xx_gaussian(R x, int der, const R *param)    /* K(x)=x^2/c^2 EXP(-x^2/c^2) */
   return value / (c*c);
 }
 
+C absx(R x, int der, const R *param)    /* K(x)=|x|; der selects the derivative order (0 = kernel value) */
+{
+  R value=K(0.0);
+
+  (void)param;    /* param is not used by this kernel */
+  
+  if (der == 0) value=FABS(x);
+  else if (der == 1){    /* first derivative: -1 for x < 0, otherwise +1 (including x == 0) */
+    if (x<0) value=K(-1.0);
+    else value=K(1.0);
+  }
+  else value=K(0.0);    /* every other value of der yields 0 */
+  
+  return value;
+}
+
 /* \} */
 
 /* kernels.c */
diff --git a/applications/fastsum/kernels.h b/applications/fastsum/kernels.h
index 006a376e..0742c1b8 100644
--- a/applications/fastsum/kernels.h
+++ b/applications/fastsum/kernels.h
@@ -56,7 +56,9 @@ C kcot(R x, int der, const R *param);                  /**< K(x) = cot(cx) */
 C one_over_cube(R x, int der, const R *param);         /**< K(x) = 1/x^3 */
 C log_sin(R x, int der, const R *param);               /**< K(x) = log(|sin(cx)|) */
 C laplacian_rbf(R x, int der, const R *param);         /**< K(x) = exp(-|x|/c) */
+C der_laplacian_rbf(R x, int der, const R *param);     /**< K(x) = |x|/c exp(-|x|/c) */
 C xx_gaussian(R x, int der, const R *param);           /**< K(x) = x^2/c^2 exp(-x^2/c^2) */
+C absx(R x, int der, const R *param);                  /**< K(x) = |x| */
 /* \} */
 
 #ifdef __cplusplus
diff --git a/configure.ac b/configure.ac
index 7af5ce6d..fe302ac2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -20,7 +20,7 @@
 
 m4_define([nfft_version_major], [3])
 m4_define([nfft_version_minor], [5])
-m4_define([nfft_version_patch], [3])
+m4_define([nfft_version_patch], [4])
 m4_define([nfft_version_type], [alpha])
 m4_append([NFFT_VERSION], m4_expand([nfft_version_major.nfft_version_minor.nfft_version_patch]))
 m4_append([NFFT_VERSION], m4_expand([nfft_version_type]))
@@ -278,7 +278,7 @@ AC_LANG(C)
 AX_COMPILER_VENDOR
 
 # check for C99 compliant mode (possibly with GNU extensions)
-AC_PROG_CC
+AC_PROG_CC_C99
 
 # per-target flags
 AM_PROG_CC_C_O
diff --git a/julia/fastsum/libfastsumjulia.c b/julia/fastsum/libfastsumjulia.c
index dff69909..45f35773 100644
--- a/julia/fastsum/libfastsumjulia.c
+++ b/julia/fastsum/libfastsumjulia.c
@@ -55,6 +55,12 @@ int jfastsum_init( fastsum_plan* p, int d, char* s, double* c, unsigned int f, i
 		kernel = log_sin;
 	else if ( strcmp(s, "laplacian_rbf") == 0 )
 		kernel = laplacian_rbf;
+    else if ( strcmp(s, "der_laplacian_rbf") == 0 )
+		kernel = der_laplacian_rbf;
+	else if ( strcmp(s, "xx_gaussian") == 0 )
+		kernel = xx_gaussian;
+	else if ( strcmp(s, "absx") == 0 )
+		kernel = absx;
 	else {
 		return 1;
 	}
diff --git a/linux-build-mex.sh b/linux-build-mex.sh
index 332687fd..d98a937e 100755
--- a/linux-build-mex.sh
+++ b/linux-build-mex.sh
@@ -31,8 +31,8 @@ set -ex
 exec > >(tee linux-build-mex.log)
 exec 2>&1
 
-FFTWVERSION=3.3.9
-GCCVERSION=8.5.0
+FFTWVERSION=3.3.10
+GCCVERSION=11.2.0
 GCCARCH=haswell
 BINARIES_ARCH_README='
 Please note that since the binaries were compiled with gcc flag -march=haswell,
diff --git a/m4/ax_prog_matlab.m4 b/m4/ax_prog_matlab.m4
index 25ff7f3a..2f6a131c 100644
--- a/m4/ax_prog_matlab.m4
+++ b/m4/ax_prog_matlab.m4
@@ -104,6 +104,7 @@ AC_DEFUN([AX_PROG_MATLAB],
         mac) matlab_mexext="mexmac";;
         maci) matlab_mexext="mexmaci";;
         maci64) matlab_mexext="mexmaci64";;
+	maca64) matlab_mexext="mexmaca64";;
         sol64) matlab_mexext="mexs64";;
         win32) matlab_mexext="mexw32";;
         win64) matlab_mexext="mexw64";;
@@ -171,6 +172,7 @@ AC_DEFUN([AX_PROG_MATLAB],
           mexmac) matlab_arch="mac";;
           mexmaci) matlab_arch="maci";;
           mexmaci64) matlab_arch="maci64";;
+	  mexmaca64) matlab_arch="maca64";;
           mexs64) matlab_arch="sol64";;
           mexw32) matlab_arch="win32";;
           mexw64) matlab_arch="win64";;
@@ -185,6 +187,7 @@ AC_DEFUN([AX_PROG_MATLAB],
           mac) matlab_mexext="mexmac";;
           maci) matlab_mexext="mexmaci";;
           maci64) matlab_mexext="mexmaci64";;
+	  maca64) matlab_mexext="mexmaca64";;
           sol64) matlab_mexext="mexs64";;
           win32) matlab_mexext="mexw32";;
           win64) matlab_mexext="mexw64";;
@@ -205,7 +208,7 @@ AC_DEFUN([AX_PROG_MATLAB],
     # dynamic library extension for architecture
     case $matlab_arch in
       glnx86|glnxa64|sol|sol64) matlab_libext=".so";;
-      mac|mac64|maci|maci64) matlab_libext=".dylib";;
+      mac|mac64|maci|maci64|maca64) matlab_libext=".dylib";;
       win32|win64) matlab_libext=".dll";;
       *) AC_MSG_ERROR([Unsupported or invalid architecture ${matlab_arch}.]);;
     esac
diff --git a/macos-build-mex.sh b/macos-build-mex.sh
index 2660cd48..14fd4e66 100755
--- a/macos-build-mex.sh
+++ b/macos-build-mex.sh
@@ -4,7 +4,7 @@
 # A Matlab installation must be specified in order to build the
 # Matlab interface. The paths should not contain spaces!
 #
-# The script is known to work on macOS 10.5 Catalina with Homebrew.
+# The script is known to work on macOS 11 Big Sur with Homebrew.
 #
 # At least the following packages are required:
 # octave gnu-sed cunit
@@ -19,11 +19,11 @@ set -ex
 
 GCCARCH=haswell
 FFTWDIR=/usr/local
-GCC="gcc-9 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk"
+GCC="gcc-11"
 
 # default values (to be overwritten if respective parameters are set)
 OCTAVEDIR=/usr/local
-
+MATLABDIR=/Applications/MATLAB_R2021b.app
 # read the options
 TEMP=`getopt -o o:m:f:v: --long octave:,matlab:,matlab-version:,fftw: -n 'macos-build-mex.sh' -- "$@"`
 eval set -- "$TEMP"
diff --git a/matlab/fastsum/fastsummex.c b/matlab/fastsum/fastsummex.c
index e0e725f7..ea50560a 100644
--- a/matlab/fastsum/fastsummex.c
+++ b/matlab/fastsum/fastsummex.c
@@ -134,8 +134,12 @@ static kernel get_kernel(const mxArray *p)
     ker = log_sin;
   else if (strcmp(s, "laplacian_rbf") == 0)
     ker = laplacian_rbf;
+  else if (strcmp(s, "der_laplacian_rbf") == 0)
+    ker = der_laplacian_rbf;
   else if (strcmp(s, "xx_gaussian") == 0)
     ker = xx_gaussian;
+  else if (strcmp(s, "absx") == 0)
+    ker = absx;
   else
     mexErrMsgTxt("fastsum: Unknown kernel function.");
   return ker;
diff --git a/matlab/fastsum/simple_test.m b/matlab/fastsum/simple_test.m
index 14052637..6ed15571 100644
--- a/matlab/fastsum/simple_test.m
+++ b/matlab/fastsum/simple_test.m
@@ -39,7 +39,9 @@
 % 'one_over_cube'           K(x) = 1/x^3
 % 'log_sin'                 K(x) = LOG(|SIN(cx)|)
 % 'laplacian_rbf'           K(x) = EXP(-|x|/c)
+% 'der_laplacian_rbf'       K(x) = |x|/c EXP(-|x|/c)
 % 'xx_gaussian'             K(x) = x^2/c^2 EXP(-x^2/c^2) 
+% 'absx'                    K(x) = |x| 
 
 %% Initialize parameters
 d = 2;          % number of dimensions
diff --git a/matlab/fastsum/test_fastsum.m b/matlab/fastsum/test_fastsum.m
index 7e2dc015..ab1aff68 100644
--- a/matlab/fastsum/test_fastsum.m
+++ b/matlab/fastsum/test_fastsum.m
@@ -39,7 +39,9 @@
 % 'one_over_cube'           K(x) = 1/x^3
 % 'log_sin'                 K(x) = LOG(|SIN(cx)|)
 % 'laplacian_rbf'           K(x) = EXP(-|x|/c)
+% 'der_laplacian_rbf'       K(x) = |x|/c EXP(-|x|/c)
 % 'xx_gaussian'             K(x) = x^2/c^2 EXP(-x^2/c^2) 
+% 'absx'                    K(x) = |x| 
 
 %% Initialize parameters
 d = 2;          % number of dimensions
diff --git a/windows-build-dll.sh b/windows-build-dll.sh
index 5fd81d86..1476b093 100644
--- a/windows-build-dll.sh
+++ b/windows-build-dll.sh
@@ -6,7 +6,7 @@
 # The Matlab path should not contain spaces!
 # 
 # Example call:
-# ./nfft-build-dll.sh --fftw=3.3.8 --octave=5.2.0 --matlab=/c/path/to/matlab
+# ./windows-build-dll.sh --fftw=3.3.10 --octave=6.4.0 --matlab=/c/path/to/matlab
 # 
 # WARNING: This script downloads and compiles FFTW and downloads GCC, Julia and Octave (requires ~ 3GB).
 # 
@@ -23,8 +23,8 @@ set -ex
 
 
 # default values (to be overwritten if respective parameters are set)
-FFTWVERSION=3.3.8
-OCTAVEVERSION=5.2.0
+FFTWVERSION=3.3.10
+OCTAVEVERSION=6.4.0
 MATLABVERSION=""
 ARCH=64
 GCCARCH=""