fortran-lang · jalvesz · Dec 24, 2024 · Dec 24, 2024 · Dec 26, 2024 · Dec 26, 2024
diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
@@ -0,0 +1,154 @@
+---
+title: intrinsics
+---
+
+# The `stdlib_intrinsics` module
+
+[TOC]
+
+## Introduction
+
+The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fsum` function
+
+#### Description
+
+The `fsum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )`
+
+`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x, dim [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: N-D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`.
+
+`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fsum_kahan` function
+
+#### Description
+
+The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
+
+```fortran
+elemental subroutine kahan_kernel_<kind>(a,s,c)
+    type(<kind>), intent(in) :: a
+    type(<kind>), intent(inout) :: s
+    type(<kind>), intent(inout) :: c
+    type(<kind>) :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine
+```
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fsum_kahan(interface)]] ` (x [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x`.
+
+#### Example
+
+```fortran
+{!example/intrinsics/example_sum.f90!}
+```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fprod` function
+
+#### Description
+
+The `fprod` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fprod(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fprod_kahan` function
+
+#### Description
+
+The `fprod_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `fsum` to reduce the round-off error.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fprod_kahan(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
+
+```fortran
+{!example/intrinsics/example_dot_product.f90!}
+```
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
@@ -13,6 +13,7 @@ add_subdirectory(constants)
 add_subdirectory(error)
 add_subdirectory(hashmaps)
 add_subdirectory(hash_procedures)
+add_subdirectory(intrinsics)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)

diff --git a/example/intrinsics/CMakeLists.txt b/example/intrinsics/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_EXAMPLE(sum)
+ADD_EXAMPLE(dot_product)
diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90
@@ -0,0 +1,18 @@
+program example_dot_product
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: fprod, fprod_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:), y(:)
+    real(sp) :: total_prod(3)
+
+    allocate( x(1000), y(1000) )
+    call random_number(x)
+    call random_number(y)
+
+    total_prod(1) = dot_product(x,y) !> compiler intrinsic
+    total_prod(2) = fprod(x,y)       !> chunked summation over inner product
+    total_prod(3) = fprod_kahan(x,y) !> chunked kahan summation over inner product
+    print *, total_prod(1:3)
+
+end program example_dot_product
diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90
@@ -0,0 +1,17 @@
+program example_sum
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: fsum, fsum_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:)
+    real(sp) :: total_sum(3)
+
+    allocate( x(1000) )
+    call random_number(x)
+
+    total_sum(1) = sum(x)       !> compiler intrinsic
+    total_sum(2) = fsum(x)      !> chunked summation
+    total_sum(3) = fsum_kahan(x)!> chunked kahan summation
+    print *, total_sum(1:3)
+
+end program example_sum
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -16,6 +16,9 @@ set(fppFiles
     stdlib_hash_64bit_fnv.fypp
     stdlib_hash_64bit_pengy.fypp
     stdlib_hash_64bit_spookyv2.fypp
+    stdlib_intrinsics_dot_product.fypp
+    stdlib_intrinsics_sum.fypp
+    stdlib_intrinsics.fypp
     stdlib_io.fypp
     stdlib_io_npy.fypp
     stdlib_io_npy_load.fypp

diff --git a/src/stdlib_constants.fypp b/src/stdlib_constants.fypp
@@ -1,5 +1,8 @@
 #:include "common.fypp"
 #:set KINDS = REAL_KINDS
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+
 module stdlib_constants
     !! Constants
     !! ([Specification](../page/specs/stdlib_constants.html))
@@ -60,5 +63,13 @@ module stdlib_constants
     real(dp), parameter, public :: u = ATOMIC_MASS_CONSTANT%value !! Atomic mass constant
 
     ! Additional constants if needed
+    #:for k, t, s in R_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = 0._${k}$
+    ${t}$, parameter, public :: one_${s}$  = 1._${k}$
+    #:endfor
+    #:for k, t, s in C_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = (0._${k}$,0._${k}$)
+    ${t}$, parameter, public :: one_${s}$  = (1._${k}$,0._${k}$)
+    #:endfor
 
 end module stdlib_constants
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
@@ -0,0 +1,114 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+#:set RANKS = range(2, MAXRANK + 1)
+
+! This module is based on https://github.com/jalvesz/fast_math
+module stdlib_intrinsics
+    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    implicit none
+    private
+
+    interface fsum 
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        pure module function fsum_1d_${rs}$(a) result(s)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$ :: s
+        end function
+        pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
+            ${rt}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${rt}$ :: s
+        end function
+        #:for rank in RANKS
+        pure module function fsum_${rank}$d_${rs}$( x, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s
+        end function
+        pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            integer, intent(in):: dim
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$
+        end function
+        #:endfor
+        #:endfor
+    end interface
+    public :: fsum
+
+    interface fsum_kahan
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        pure module function fsum_kahan_1d_${rs}$(a) result(s)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$ :: s
+        end function
+        pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+            ${rt}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${rt}$ :: s
+        end function
+        #:endfor
+    end interface
+    public :: fsum_kahan
+
+    interface fprod
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        pure module function fprod_${rs}$(a,b) result(p)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$, intent(in) :: b(:)
+            ${rt}$ :: p
+        end function
+        #:endfor
+    end interface
+    public :: fprod
+
+    interface fprod_kahan
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        pure module function fprod_kahan_${rs}$(a,b) result(p)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$, intent(in) :: b(:)
+            ${rt}$ :: p
+        end function
+        #:endfor
+    end interface
+    public :: fprod_kahan
+
+    interface kahan_kernel 
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: kahan_kernel_${rs}$
+        module procedure :: kahan_kernel_m_${rs}$
+        #:endfor
+    end interface
+    public :: kahan_kernel
+
+contains
+
+#:for rk, rt, rs in RC_KINDS_TYPES
+elemental subroutine kahan_kernel_${rs}$(a,s,c)
+    ${rt}$, intent(in) :: a
+    ${rt}$, intent(inout) :: s
+    ${rt}$, intent(inout) :: c
+    ${rt}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine  
+elemental subroutine kahan_kernel_m_${rs}$(a,s,c,m)
+    ${rt}$, intent(in) :: a
+    ${rt}$, intent(inout) :: s
+    ${rt}$, intent(inout) :: c
+    logical, intent(in) :: m
+    ${rt}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = merge( s , t , m )
+end subroutine 
+#:endfor
+
+end module stdlib_intrinsics