Compare revisions

Pradipta Samanta · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta
--- a/.cmake-format.py
+++ b/.cmake-format.py
@@ -37,3 +37,4 @@ with section("lint"):
 	local_var_pattern = '[a-zA-Z][0-9a-zA-z_]+'
 	private_var_pattern = '[a-z][a-z0-9_]+'
 	public_var_pattern = '[A-Z][0-9a-zA-Z_]+'
+	global_var_pattern = '[A-Z][0-9a-zA-Z_]+'
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ iconmath_Tests

 # Test stage files:
 /**/Testing/*
+iconmath_Tests
 run_tests.f90
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,9 @@ cmake_minimum_required(VERSION 3.18)
 project(
  iconmath
  VERSION 1.0.0
-  LANGUAGES Fortran)
+  LANGUAGES Fortran CXX)
+
+set(CMAKE_CXX_STANDARD 17)

 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 option(BUILD_TESTING "Build tests" ON)
@@ -23,6 +25,7 @@ option(BUILD_ICONMATH_HORIZONTAL "Build horizontal library" ON)

 option(IM_ENABLE_MIXED_PRECISION "Enable mixed precision" OFF)
 option(IM_ENABLE_LOOP_EXCHANGE "Enable loop exchange" OFF)
+option(IM_USE_CPP_BINDINGS "Use C++ bindings" OFF)
 option(IM_ENABLE_DIM_SWAP "Enable dimension swap" OFF)
 option(IM_ENABLE_OPENACC "Enable OpenACC support" OFF)
 option(IM_ENABLE_OPENMP "Enable OpenMP support" OFF)
@@ -107,6 +110,22 @@ else()
  endif()
 endif()

+include(FetchContent)
+# configure kokkos 4.4 repository link
+FetchContent_Declare(
+  kokkos
+  URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz
+  URL_HASH MD5=eafd0d42c9831858aa84fde78576644c)
+
+# disable build of C++23 mdspan experimental support for now
+set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support")
+
+# by default, build the Kokkos serial backend for CPU
+set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Kokkos Serial backend")
+set(Kokkos_ARCH_NATIVE ON CACHE BOOL "Kokkos native architecture optimisations")
+
+FetchContent_MakeAvailable(kokkos)
+
 add_subdirectory(src)

 # Allow for 'make test' even if the tests are disabled:

--- a/src/support/CMakeLists.txt
+++ b/src/support/CMakeLists.txt
@@ -13,9 +13,11 @@ add_library(
  iconmath-support
  mo_gridman_constants.f90
  mo_lib_grid_geometry_info.f90
-  mo_lib_loopindices.f90
+  mo_lib_loopindices.cpp
+  mo_lib_loopindices.F90
  mo_math_constants.f90
  mo_math_types.f90
+  mo_math_utilities.cpp
  mo_math_utilities.F90
  mo_random_number_generators.F90)

@@ -40,6 +42,10 @@ if(IM_ENABLE_DIM_SWAP)
  target_compile_definitions(iconmath-support PRIVATE __SWAPDIM)
 endif()

+if(IM_USE_CPP_BINDINGS)
+  target_compile_definitions(iconmath-support PUBLIC __USE_CPP_BINDINGS)
+endif()
+
 if(IM_ENABLE_OPENACC)
  # If _OPENACC is defined, assume that the required compiler flags are already
  # provided, e.g. in CMAKE_Fortran_FLAGS:
@@ -57,9 +63,23 @@ target_include_directories(
    # Path to the Fortran modules:
    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:${Fortran_MODULE_DIRECTORY}>>
    $<INSTALL_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>>
-)
+  INTERFACE
+    # Path to the internal C/C++ headers (for testing): Requires CMake 3.15+ for
+    # multiple compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_SOURCE_DIR}>>
+  PRIVATE
+    # Path to config.h (for C and C++ only): Requires CMake 3.15+ for multiple
+    # compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_BINARY_DIR}>>)

-target_link_libraries(iconmath-support PUBLIC fortran-support::fortran-support)
+target_link_libraries(iconmath-support
+    PUBLIC
+        fortran-support::fortran-support
+    PRIVATE
+        Kokkos::kokkos
+)

 install(TARGETS iconmath-support EXPORT "${PROJECT_NAME}-targets")


--- a/src/support/mo_lib_loopindices.f90
+++ b/src/support/mo_lib_loopindices.f90
@@ -16,12 +16,18 @@

 MODULE mo_lib_loopindices

+#ifdef __USE_CPP_BINDINGS
+  USE, INTRINSIC :: ISO_C_BINDING
+#endif
+
  IMPLICIT NONE

  PRIVATE

  PUBLIC :: get_indices_c_lib, get_indices_e_lib, get_indices_v_lib

+#ifndef __USE_CPP_BINDINGS
+
 CONTAINS

 !-------------------------------------------------------------------------
@@ -121,5 +127,31 @@ CONTAINS

  END SUBROUTINE get_indices_v_lib

-END MODULE mo_lib_loopindices
+#else
+
+  INTERFACE
+    SUBROUTINE get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_c_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_c_lib
+
+    SUBROUTINE get_indices_e_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_e_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_e_lib
+
+    SUBROUTINE get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_v_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_v_lib
+  END INTERFACE
+
+#endif

+END MODULE mo_lib_loopindices
--- a/src/support/mo_lib_loopindices.cpp
+++ b/src/support/mo_lib_loopindices.cpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#include <algorithm> // For std::max
+
+extern "C" {
+    // get_indices_c_lib function
+    void get_indices_c_lib(int i_startidx_in, int i_endidx_in, int nproma, int i_blk, int i_startblk, int i_endblk,
+                           int &i_startidx_out, int &i_endidx_out) {
+        if (i_blk == i_startblk) {
+            i_startidx_out = std::max(1, i_startidx_in);
+            i_endidx_out = nproma;
+            if (i_blk == i_endblk) {
+                i_endidx_out = i_endidx_in;
+            }
+        } else if (i_blk == i_endblk) {
+            i_startidx_out = 1;
+            i_endidx_out = i_endidx_in;
+        } else {
+            i_startidx_out = 1;
+            i_endidx_out = nproma;
+        }
+    }
+
+    // get_indices_e_lib function
+    void get_indices_e_lib(int i_startidx_in, int i_endidx_in, int nproma, int i_blk, int i_startblk, int i_endblk,
+                           int &i_startidx_out, int &i_endidx_out) {
+        i_startidx_out = (i_blk != i_startblk) ? 1 : std::max(1, i_startidx_in);
+        i_endidx_out = (i_blk != i_endblk) ? nproma : i_endidx_in;
+    }
+
+    // get_indices_v_lib function
+    void get_indices_v_lib(int i_startidx_in, int i_endidx_in, int nproma, int i_blk, int i_startblk, int i_endblk,
+                           int &i_startidx_out, int &i_endidx_out) {
+        if (i_blk == i_startblk) {
+            i_startidx_out = i_startidx_in;
+            i_endidx_out = nproma;
+            if (i_blk == i_endblk) {
+                i_endidx_out = i_endidx_in;
+            }
+        } else if (i_blk == i_endblk) {
+            i_startidx_out = 1;
+            i_endidx_out = i_endidx_in;
+        } else {
+            i_startidx_out = 1;
+            i_endidx_out = nproma;
+        }
+    }
+}
--- a/src/support/mo_math_utilities.F90
+++ b/src/support/mo_math_utilities.F90
@@ -22,6 +22,7 @@
 ! #endif
 MODULE mo_math_utilities

+  USE, INTRINSIC :: ISO_C_BINDING
  USE mo_iconlib_kind, ONLY: wp, dp, sp
  USE mo_math_constants, ONLY: pi, pi_2, dbl_eps
  USE mo_gridman_constants, ONLY: SUCCESS, TORUS_MAX_LAT
@@ -160,8 +161,100 @@ MODULE mo_math_utilities

  CHARACTER(LEN=*), PARAMETER :: modname = 'mo_math_utilities'

+  !-------------------------------------------------------------------------
+  !>
+  !! TDMA tridiagonal matrix solver for a_i*x_(i-1) + b_i*x_i + c_i*x_(i+1) = d_i
+  !!
+  !!       a - sub-diagonal (means it is the diagonal below the main diagonal)
+  !!       b - the main diagonal
+  !!       c - sup-diagonal (means it is the diagonal above the main diagonal)
+  !!       d - right part
+  !!  varout - the answer (identical to x in description above)
+  !!    slev - start level (top)
+  !!    elev - end level (bottom)
+
+! Preprocessor directive to conditionally include the tdma_solver_vec implementation
+#ifndef __USE_CPP_BINDINGS
+
 CONTAINS

+  SUBROUTINE tdma_solver_vec(a, b, c, d, slev, elev, startidx, endidx, varout, opt_acc_queue)
+    INTEGER, INTENT(IN) :: slev, elev
+    INTEGER, INTENT(IN) :: startidx, endidx
+    REAL(wp), INTENT(IN) :: a(:, :), b(:, :), c(:, :), d(:, :)
+    REAL(wp), INTENT(OUT) :: varout(:, :)
+    INTEGER, OPTIONAL, INTENT(IN) :: opt_acc_queue
+
+    !
+    ! local
+    REAL(wp):: m, c_p(SIZE(a, 1), SIZE(a, 2)), d_p(SIZE(a, 1), SIZE(a, 2))
+    INTEGER :: i
+    INTEGER :: jc
+    INTEGER :: acc_queue
+
+    IF (PRESENT(opt_acc_queue)) THEN
+      acc_queue = opt_acc_queue
+    ELSE
+      acc_queue = 1
+    END IF
+
+    ! initialize c-prime and d-prime
+    !$ACC PARALLEL DEFAULT(PRESENT) CREATE(c_p, d_p) ASYNC(acc_queue)
+    !$ACC LOOP GANG(STATIC: 1) VECTOR
+    DO jc = startidx, endidx
+      c_p(jc, slev) = c(jc, slev)/b(jc, slev)
+      d_p(jc, slev) = d(jc, slev)/b(jc, slev)
+    END DO
+    ! solve for vectors c-prime and d-prime
+    !$ACC LOOP SEQ
+!NEC$ outerloop_unroll(4)
+    DO i = slev + 1, elev
+      !$ACC LOOP GANG(STATIC: 1) VECTOR PRIVATE(m)
+      DO jc = startidx, endidx
+        m = 1._wp/(b(jc, i) - c_p(jc, i - 1)*a(jc, i))
+        c_p(jc, i) = c(jc, i)*m
+        d_p(jc, i) = (d(jc, i) - d_p(jc, i - 1)*a(jc, i))*m
+      END DO
+    END DO
+    ! initialize varout
+    !$ACC LOOP GANG(STATIC: 1) VECTOR
+    DO jc = startidx, endidx
+      varout(jc, elev) = d_p(jc, elev)
+    END DO
+    ! solve for varout from the vectors c-prime and d-prime
+    !$ACC LOOP SEQ
+!NEC$ outerloop_unroll(4)
+    DO i = elev - 1, slev, -1
+      !$ACC LOOP GANG(STATIC: 1) VECTOR
+      DO jc = startidx, endidx
+        varout(jc, i) = d_p(jc, i) - c_p(jc, i)*varout(jc, i + 1)
+      END DO
+    END DO
+    !$ACC END PARALLEL
+
+    IF (.NOT. PRESENT(opt_acc_queue)) THEN
+      !$ACC WAIT(acc_queue)
+    END IF
+
+  END SUBROUTINE tdma_solver_vec
+
+#else
+
+  ! C++ binding for tdma_solver_vec
+  INTERFACE
+    SUBROUTINE tdma_solver_vec(a, b, c, d, slev, elev, startidx, endidx, nrows, ncols, varout, opt_acc_queue) &
+      BIND(C, NAME="tdma_solver_vec")
+      IMPORT :: c_double, c_int
+      REAL(c_double), INTENT(IN) :: a(*), b(*), c(*), d(*)
+      INTEGER(c_int), VALUE :: slev, elev, startidx, endidx, nrows, ncols
+      REAL(c_double), INTENT(OUT) :: varout(*)
+      INTEGER(c_int), OPTIONAL :: opt_acc_queue
+    END SUBROUTINE tdma_solver_vec
+  END INTERFACE
+
+CONTAINS
+#endif
+
  !-------------------------------------------------------------------------
  ! Variant for double-precision (or working-precision=dp) lon+lat in ICON
  INTEGER FUNCTION check_orientation_dp(lonc, lon, lat, n)
@@ -2019,78 +2112,6 @@ CONTAINS

  END SUBROUTINE tdma_solver

-  !-------------------------------------------------------------------------
-  !>
-  !! TDMA tridiagonal matrix solver for a_i*x_(i-1) + b_i*x_i + c_i*x_(i+1) = d_i
-  !!
-  !!       a - sub-diagonal (means it is the diagonal below the main diagonal)
-  !!       b - the main diagonal
-  !!       c - sup-diagonal (means it is the diagonal above the main diagonal)
-  !!       d - right part
-  !!  varout - the answer (identical to x in description above)
-  !!    slev - start level (top)
-  !!    elev - end level (bottom)
-  SUBROUTINE tdma_solver_vec(a, b, c, d, slev, elev, startidx, endidx, varout, opt_acc_queue)
-    INTEGER, INTENT(IN) :: slev, elev
-    INTEGER, INTENT(IN) :: startidx, endidx
-    REAL(wp), INTENT(IN) :: a(:, :), b(:, :), c(:, :), d(:, :)
-    REAL(wp), INTENT(OUT) :: varout(:, :)
-    INTEGER, OPTIONAL, INTENT(IN) :: opt_acc_queue
-
-    !
-    ! local
-    REAL(wp):: m, c_p(SIZE(a, 1), SIZE(a, 2)), d_p(SIZE(a, 1), SIZE(a, 2))
-    INTEGER :: i
-    INTEGER :: jc
-    INTEGER :: acc_queue
-
-    IF (PRESENT(opt_acc_queue)) THEN
-      acc_queue = opt_acc_queue
-    ELSE
-      acc_queue = 1
-    END IF
-
-    ! initialize c-prime and d-prime
-    !$ACC PARALLEL DEFAULT(PRESENT) CREATE(c_p, d_p) ASYNC(acc_queue)
-    !$ACC LOOP GANG(STATIC: 1) VECTOR
-    DO jc = startidx, endidx
-      c_p(jc, slev) = c(jc, slev)/b(jc, slev)
-      d_p(jc, slev) = d(jc, slev)/b(jc, slev)
-    END DO
-    ! solve for vectors c-prime and d-prime
-    !$ACC LOOP SEQ
-!NEC$ outerloop_unroll(4)
-    DO i = slev + 1, elev
-      !$ACC LOOP GANG(STATIC: 1) VECTOR PRIVATE(m)
-      DO jc = startidx, endidx
-        m = 1._wp/(b(jc, i) - c_p(jc, i - 1)*a(jc, i))
-        c_p(jc, i) = c(jc, i)*m
-        d_p(jc, i) = (d(jc, i) - d_p(jc, i - 1)*a(jc, i))*m
-      END DO
-    END DO
-    ! initialize varout
-    !$ACC LOOP GANG(STATIC: 1) VECTOR
-    DO jc = startidx, endidx
-      varout(jc, elev) = d_p(jc, elev)
-    END DO
-    ! solve for varout from the vectors c-prime and d-prime
-    !$ACC LOOP SEQ
-!NEC$ outerloop_unroll(4)
-    DO i = elev - 1, slev, -1
-      !$ACC LOOP GANG(STATIC: 1) VECTOR
-      DO jc = startidx, endidx
-        varout(jc, i) = d_p(jc, i) - c_p(jc, i)*varout(jc, i + 1)
-      END DO
-    END DO
-    !$ACC END PARALLEL
-
-    IF (.NOT. PRESENT(opt_acc_queue)) THEN
-      !$ACC WAIT(acc_queue)
-    END IF
-
-  END SUBROUTINE tdma_solver_vec
-  !-------------------------------------------------------------------------
-
  !-------------------------------------------------------------------------
  !
  !> Helper functions for computing the vertical layer structure

--- a/src/support/mo_math_utilities.cpp
+++ b/src/support/mo_math_utilities.cpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#include <vector>
+#include <iostream>
+#include <chrono> // For timing
+#include <Kokkos_Core.hpp>
+
+extern "C" {
+
+void tdma_solver_vec_kokkos(const double* a, const double* b, const double* c, const double* d,
+                              int slev, int elev, int startidx, int endidx,
+                              int nrows, int ncols, double* varout) {
+
+  // Start timing
+  auto start_time = std::chrono::high_resolution_clock::now();
+
+  // Allocate temporary arrays using Kokkos::View.
+  // The views c_p and d_p are allocated as 2D arrays with dimensions [nrows][ncols].
+  // Kokkos::View automatically handles memory management.
+  Kokkos::View<double**> c_p("c_p", nrows, ncols);
+  Kokkos::View<double**> d_p("d_p", nrows, ncols);
+
+  // Wrap the input arrays in unmanaged views.
+  // We assume that the input arrays are laid out in column-major order as in the original code.
+  // Here we use LayoutLeft so that the first index (row) is contiguous.
+  typedef Kokkos::View<const double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedConst2D;
+  typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> Unmanaged2D;
+  UnmanagedConst2D a_view(a, nrows, ncols);
+  UnmanagedConst2D b_view(b, nrows, ncols);
+  UnmanagedConst2D c_view(c, nrows, ncols);
+  UnmanagedConst2D d_view(d, nrows, ncols);
+  Unmanaged2D varout_view(varout, nrows, ncols);
+
+  // Initialize c-prime and d-prime at the starting level (slev)
+  Kokkos::parallel_for("init_c_p_d_p", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+    c_p(jc, slev) = c_view(jc, slev) / b_view(jc, slev);
+    d_p(jc, slev) = d_view(jc, slev) / b_view(jc, slev);
+  });
+  Kokkos::fence();
+
+  // Forward sweep: compute c-prime and d-prime for each column from slev+1 to elev-1.
+  for (int i = slev + 1; i < elev; ++i) {
+    Kokkos::parallel_for("forward_sweep", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+      double m = 1.0 / (b_view(jc, i) - c_p(jc, i - 1) * a_view(jc, i));
+      c_p(jc, i) = c_view(jc, i) * m;
+      d_p(jc, i) = (d_view(jc, i) - d_p(jc, i - 1) * a_view(jc, i)) * m;
+    });
+    Kokkos::fence();
+  }
+
+  // Initialize the output array at the last level (elev-1)
+  Kokkos::parallel_for("init_varout", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+    varout_view(jc, elev-1) = d_p(jc, elev-1);
+  });
+  Kokkos::fence();
+
+  // Back substitution: update varout for columns from elev-2 down to slev.
+  for (int i = elev - 2; i >= slev; --i) {
+    Kokkos::parallel_for("back_substitution", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+      varout_view(jc, i) = d_p(jc, i) - c_p(jc, i) * varout_view(jc, i + 1);
+    });
+    Kokkos::fence();
+  }
+
+  // End timing and print the elapsed time
+  auto end_time = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed_time = end_time - start_time;
+  std::cout << "Elapsed time for tdma_solver_vec (Kokkos): " << elapsed_time.count() << " seconds" << std::endl;
+}
+
+}
--- a/test/fortran/test_loopindices.f90
+++ b/test/fortran/test_loopindices.f90
+! ICON
+!
+! ---------------------------------------------------------------
+! Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+! Contact information: icon-model.org
+!
+! See AUTHORS.TXT for a list of authors
+! See LICENSES/ for license information
+! SPDX-License-Identifier: BSD-3-Clause
+! ---------------------------------------------------------------
+
+MODULE TEST_mo_lib_loopindices
+  USE FORTUTF
+
+  IMPLICIT NONE
+
+  PRIVATE
+
+  PUBLIC :: TEST_get_indices_lib
+
+CONTAINS
+
+  SUBROUTINE TEST_get_indices_lib
+    USE mo_lib_loopindices, ONLY: get_indices_c_lib, get_indices_e_lib, get_indices_v_lib
+
+    INTEGER :: i_startidx_in ! Start index as input
+    INTEGER :: i_endidx_in ! End index as input
+    INTEGER :: nproma ! inner loop length/vector length
+    INTEGER :: i_blk ! Current block (variable jb in do loops)
+    INTEGER :: i_startblk ! Start block of do loop
+    INTEGER :: i_endblk ! End block of do loop
+
+    INTEGER :: i_startidx_out, i_endidx_out ! Start and end indices (jc loop), as output
+
+    i_startidx_in = 2
+    i_endidx_in = 15
+    nproma = 32
+    i_startblk = 1
+    i_endblk = 40
+
+    ! CASE: I -> i_blk == i_startblk
+    i_blk = 1
+    CALL get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_c_start_1")
+    CALL ASSERT_EQUAL(i_startidx_out, MAX(1, i_startidx_in))
+    CALL TAG_TEST("TEST_get_indices_c_end_1")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_e_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_e_start_1")
+    CALL ASSERT_EQUAL(i_startidx_out, MAX(1, i_startidx_in))
+    CALL TAG_TEST("TEST_get_indices_e_end_1")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_v_start_1")
+    CALL ASSERT_EQUAL(i_startidx_out, MAX(1, i_startidx_in))
+    CALL TAG_TEST("TEST_get_indices_v_end_1")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    ! CASE: II -> i_blk == i_endblk
+    i_blk = 40
+    CALL get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_c_start_2")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_c_end_2")
+    CALL ASSERT_EQUAL(i_endidx_out, i_endidx_in)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_e_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_e_start_2")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_e_end_2")
+    CALL ASSERT_EQUAL(i_endidx_out, i_endidx_in)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_v_start_2")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_v_end_2")
+    CALL ASSERT_EQUAL(i_endidx_out, i_endidx_in)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    ! CASE: III -> Every other cases
+    i_blk = 20
+    CALL get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_c_start_3")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_c_end_3")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_e_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_e_start_3")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_e_end_3")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+    i_startidx_out = 0
+    i_endidx_out = 0
+
+    CALL get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                           i_startidx_out, i_endidx_out)
+
+    CALL TAG_TEST("TEST_get_indices_v_start_3")
+    CALL ASSERT_EQUAL(i_startidx_out, 1)
+    CALL TAG_TEST("TEST_get_indices_v_end_3")
+    CALL ASSERT_EQUAL(i_endidx_out, nproma)
+
+  END SUBROUTINE TEST_get_indices_lib
+
+END MODULE TEST_mo_lib_loopindices
--- a/test/fortran/test_math_utilities.F90
+++ b/test/fortran/test_math_utilities.F90
+! ICON
+!
+! ---------------------------------------------------------------
+! Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+! Contact information: icon-model.org
+!
+! See AUTHORS.TXT for a list of authors
+! See LICENSES/ for license information
+! SPDX-License-Identifier: BSD-3-Clause
+! ---------------------------------------------------------------
+
+MODULE TEST_mo_math_utilities
+
+  USE FORTUTF
+  USE mo_math_types, ONLY: t_cartesian_coordinates, t_geographical_coordinates, &
+    &                      t_line, t_tangent_vectors
+  USE mo_math_constants, ONLY: pi, pi_2, pi_4
+  USE mo_lib_grid_geometry_info
+  ! USE mo_physical_constants, ONLY: earth_radius
+  USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY: wp => real64
+
+  IMPLICIT NONE
+
+  PRIVATE
+
+  PUBLIC :: TEST_cc2gc, TEST_gc2cc, TEST_cc2tv, TEST_gvec2cvec, TEST_cvec2gvec, TEST_tdma_solver_vec
+  REAL(wp), PARAMETER :: earth_radius = 6.371229e6_wp !! [m]    average radius
+
+CONTAINS
+
+  SUBROUTINE TEST_cc2gc
+
+    USE mo_math_utilities, ONLY: cc2gc
+
+    TYPE(t_cartesian_coordinates)    :: coord
+    TYPE(t_grid_geometry_info)       :: geometry_info
+    TYPE(t_geographical_coordinates) :: pos
+
+    REAL(wp) :: lon_ref, lat_ref
+
+    coord%x(1) = 10.0_wp
+    coord%x(2) = 10.0_wp
+    coord%x(3) = 5.0_wp
+
+    geometry_info%geometry_type = sphere_geometry
+
+    pos = cc2gc(coord, geometry_info)
+
+    lon_ref = 0.78539816339744828_wp
+    lat_ref = 0.33983690945412193_wp
+
+    CALL TAG_TEST("TEST_cc2gc_sphere_lon")
+    CALL ASSERT_EQUAL(pos%lon, lon_ref)
+    CALL TAG_TEST("TEST_cc2gc_sphere_lat")
+    CALL ASSERT_EQUAL(pos%lat, lat_ref)
+
+    geometry_info%geometry_type = planar_torus_geometry
+
+    geometry_info%domain_length = 2.0_wp*pi*earth_radius
+    geometry_info%domain_height = 2.0_wp*pi*earth_radius
+
+    coord%x(1) = 1000000.0_wp
+    coord%x(2) = 100.0_wp
+
+    pos = cc2gc(coord, geometry_info)
+
+    lon_ref = 0.15695558894524117_wp
+    lat_ref = -0.17453205322393880_wp
+
+    CALL TAG_TEST("TEST_cc2gc_torus_lon")
+    CALL ASSERT_EQUAL(pos%lon, lon_ref)
+    CALL TAG_TEST("TEST_cc2gc_torus_lat")
+    CALL ASSERT_EQUAL(pos%lat, lat_ref)
+
+  END SUBROUTINE TEST_cc2gc
+
+  SUBROUTINE TEST_gc2cc
+
+    USE mo_math_utilities, ONLY: gc2cc
+
+    TYPE(t_cartesian_coordinates)    :: coord, coord_ref
+    TYPE(t_grid_geometry_info)       :: geometry_info
+    TYPE(t_geographical_coordinates) :: pos
+
+    REAL(wp) :: lon_ref, lat_ref, tol
+    INTEGER :: i
+    CHARACTER(LEN=32) :: tag
+
+    tol = 1d-15
+    pos%lon = 0.78_wp
+    pos%lat = 0.34_wp
+
+    geometry_info%geometry_type = sphere_geometry
+
+    coord = gc2cc(pos, geometry_info)
+
+    coord_ref%x(1) = 0.6702170547483377_wp
+    coord_ref%x(2) = 0.6630199536212522_wp
+    coord_ref%x(3) = 0.3334870921408144_wp
+
+    DO i = 1, SIZE(coord%x)
+!     write(*,"(i4,a,f24.16)") i, ' coord%x(i): ', coord%x(i)
+      WRITE (tag, '(a,i1)') "TEST_gc2cc_sphere_", i
+      CALL TAG_TEST(tag)
+      CALL ASSERT_ALMOST_EQUAL(coord%x(i), coord_ref%x(i), tol)
+    END DO
+
+    geometry_info%geometry_type = planar_torus_geometry
+
+    geometry_info%domain_length = 2.0_wp*pi*earth_radius
+    geometry_info%domain_height = 2.0_wp*pi*earth_radius
+
+    pos%lon = 0.15_wp
+    pos%lat = -0.17_wp
+
+    coord = gc2cc(pos, geometry_info)
+
+    coord_ref%x(1) = 955684.3499999998603016_wp
+    coord_ref%x(2) = 519845.4807382422150113_wp
+    coord_ref%x(3) = 0.0_wp
+
+    DO i = 1, SIZE(coord%x)
+!     write(*,"(i4,a,f24.16)") i, ' coord%x(i): ', coord%x(i)
+      WRITE (tag, '(a,i1)') "TEST_gc2cc_torus_", i
+      CALL TAG_TEST(tag)
+      CALL ASSERT_EQUAL(coord%x(i), coord_ref%x(i))
+    END DO
+
+  END SUBROUTINE TEST_gc2cc
+
+  SUBROUTINE TEST_cc2tv
+    USE mo_math_utilities, ONLY: cc2tv
+
+    TYPE(t_cartesian_coordinates) :: coord
+    TYPE(t_geographical_coordinates) :: pos
+    TYPE(t_tangent_vectors):: tt
+    REAL(wp) :: v1_ref, v2_ref
+
+    pos%lon = pi_2
+    pos%lat = pi_4
+    coord%x(1) = 10.0_wp
+    coord%x(2) = 15.0_wp
+    coord%x(3) = 5.0_wp
+
+    tt = cc2tv(coord, pos)
+
+    v1_ref = -9.9999999999999982_wp
+    v2_ref = -7.0710678118654737_wp
+
+    CALL TAG_TEST("TEST_cc2tv_v1")
+    CALL ASSERT_EQUAL(tt%v1, v1_ref)
+    CALL TAG_TEST("TEST_cc2tv_v2")
+    CALL ASSERT_EQUAL(tt%v2, v2_ref)
+
+  END SUBROUTINE TEST_cc2tv
+
+  SUBROUTINE TEST_gvec2cvec
+    USE mo_math_utilities, ONLY: gvec2cvec
+
+    REAL(wp) :: p_gu, p_gv ! zonal and meridional vec. component
+    REAL(wp) :: p_long, p_lat ! geo. coord. of data point
+    TYPE(t_grid_geometry_info) :: geometry_info
+
+    REAL(wp) :: p_cu, p_cv, p_cw ! Cart. vector
+    REAL(wp) :: p_cu_ref, p_cv_ref, p_cw_ref ! Cart. vector ref
+
+    geometry_info%geometry_type = sphere_geometry
+
+    p_gu = 10.0_wp
+    p_gv = 5.0_wp
+
+    p_long = pi_2
+    p_lat = pi_4
+
+    CALL gvec2cvec(p_gu, p_gv, p_long, p_lat, p_cu, p_cv, p_cw, geometry_info)
+
+    p_cu_ref = -10.0_wp
+    p_cv_ref = -3.5355339059327369_wp
+    p_cw_ref = 3.5355339059327378_wp
+
+    CALL TAG_TEST("TEST_gvec2cvec_sphere_cu")
+    CALL ASSERT_EQUAL(p_cu, p_cu_ref)
+    CALL TAG_TEST("TEST_gvec2cvec_sphere_cv")
+    CALL ASSERT_EQUAL(p_cv, p_cv_ref)
+    CALL TAG_TEST("TEST_gvec2cvec_sphere_cw")
+    CALL ASSERT_EQUAL(p_cw, p_cw_ref)
+
+    geometry_info%geometry_type = planar_torus_geometry
+
+    CALL gvec2cvec(p_gu, p_gv, p_long, p_lat, p_cu, p_cv, p_cw, geometry_info)
+
+    CALL TAG_TEST("TEST_gvec2cvec_torus_cu")
+    CALL ASSERT_EQUAL(p_cu, p_gu)
+    CALL TAG_TEST("TEST_gvec2cvec_torus_cv")
+    CALL ASSERT_EQUAL(p_cv, p_gv)
+    CALL TAG_TEST("TEST_gvec2cvec_torus_cw")
+    CALL ASSERT_EQUAL(p_cw, 0.0_wp)
+
+  END SUBROUTINE TEST_gvec2cvec
+
+  SUBROUTINE TEST_cvec2gvec
+    USE mo_math_utilities, ONLY: cvec2gvec
+
+    REAL(wp) :: p_cu, p_cv, p_cw ! Cart. vector
+    REAL(wp) :: p_long, p_lat ! geo. coord. of data point
+    TYPE(t_grid_geometry_info) :: geometry_info
+
+    REAL(wp) :: p_gu, p_gv ! zonal and meridional vec. comp.
+    REAL(wp) :: p_gu_ref, p_gv_ref
+
+    geometry_info%geometry_type = sphere_geometry
+
+    p_cu = -10.0_wp
+    p_cv = -3.0_wp
+    p_cw = 3.0_wp
+
+    p_long = pi_2
+    p_lat = pi_4
+
+    CALL cvec2gvec(p_cu, p_cv, p_cw, p_long, p_lat, p_gu, p_gv, geometry_info)
+
+!   write(*,"(a,f24.16)") ' p_gu: ', p_gu
+!   write(*,"(a,f24.16)") ' p_gv: ', p_gv
+
+    p_gu_ref = 10.0_wp
+    p_gv_ref = 4.2426406871192857_wp
+
+    CALL TAG_TEST("TEST_cvec2gvec_sphere_gu")
+    CALL ASSERT_EQUAL(p_gu, p_gu_ref)
+    CALL TAG_TEST("TEST_cvec2gvec_sphere_gv")
+    CALL ASSERT_EQUAL(p_gv, p_gv_ref)
+
+    geometry_info%geometry_type = planar_torus_geometry
+
+    CALL cvec2gvec(p_cu, p_cv, p_cw, p_long, p_lat, p_gu, p_gv, geometry_info)
+
+    CALL TAG_TEST("TEST_cvec2gvec_torus_gu")
+    CALL ASSERT_EQUAL(p_gu, p_cu)
+    CALL TAG_TEST("TEST_cvec2gvec_torus_gv")
+    CALL ASSERT_EQUAL(p_gv, p_cv)
+
+  END SUBROUTINE TEST_cvec2gvec
+
+  SUBROUTINE TEST_tdma_solver_vec
+
+    USE mo_math_utilities, ONLY: tdma_solver_vec
+
+    INTEGER, PARAMETER :: n = 10
+    REAL(wp) :: a(n, n), b(n, n), c(n, n), d(n, n), x(n, n)
+    INTEGER :: i, j
+    REAL(wp) :: sum, sum_ref, tol
+    REAL(wp) :: start_time, end_time, elapsed_time
+
+    DO i = 1, n
+      DO j = 1, n
+        a(i, j) = 1.0_wp*(i + j)
+        b(i, j) = 2.0_wp*(i + j)
+        c(i, j) = 1.0_wp*(i + j)
+        d(i, j) = 1.0_wp*(i + j)
+      END DO
+    END DO
+
+    tol = 1d-15
+
+    CALL CPU_TIME(start_time)
+#ifndef __USE_CPP_BINDINGS
+    CALL tdma_solver_vec(a, b, c, d, 1, n, 1, n, x)
+#else
+    CALL tdma_solver_vec(a, b, c, d, 0, n, 0, n, n, n, x, -1)
+#endif
+    CALL CPU_TIME(end_time)
+
+    ! Compute elapsed time
+    elapsed_time = end_time - start_time
+
+    ! Output timing result
+    WRITE (*, *) "Elapsed time for tdma_solver_vec: ", elapsed_time, " seconds"
+
+    sum = 0.0_wp
+    DO i = 1, n
+      DO j = 1, n
+        sum = sum + x(i, j)
+      END DO
+    END DO
+    sum_ref = 27.2727272727272769_wp
+    CALL TAG_TEST("TEST_tdma_solver_vec_full")
+    CALL ASSERT_ALMOST_EQUAL(sum, sum_ref, tol)
+
+    x = 0.0_wp
+#ifndef __USE_CPP_BINDINGS
+    CALL tdma_solver_vec(a, b, c, d, 2, n - 1, 2, n - 1, x)
+#else
+    CALL tdma_solver_vec(a, b, c, d, 1, n - 1, 1, n - 1, n, n, x, -1)
+#endif
+    sum = 0.0_wp
+    DO i = 2, n - 1
+      DO j = 2, n - 1
+        sum = sum + x(i, j)
+      END DO
+    END DO
+    sum_ref = 17.7777777777777679_wp
+
+    CALL TAG_TEST("TEST_tdma_solver_vec_partial")
+    CALL ASSERT_ALMOST_EQUAL(sum, sum_ref, tol)
+
+  END SUBROUTINE TEST_tdma_solver_vec
+
+END MODULE TEST_mo_math_utilities
No results found