Compare revisions

Dylan Kierans · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta · Pradipta Samanta
--- a/.cmake-format.py
+++ b/.cmake-format.py
@@ -37,3 +37,4 @@ with section("lint"):
 	local_var_pattern = '[a-zA-Z][0-9a-zA-z_]+'
 	private_var_pattern = '[a-z][a-z0-9_]+'
 	public_var_pattern = '[A-Z][0-9a-zA-Z_]+'
+	global_var_pattern = '[A-Z][0-9a-zA-Z_]+'
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,12 @@ cmake_install.cmake
 iconmath-config-version.cmake
 iconmath-config.cmake
 iconmath-targets.cmake
+KokkosConfig.cmake
+KokkosConfigVersion.cmake
+KokkosTargets.cmake
+KokkosConfigCommon.cmake
+Kokkos_Version_Info.cpp
+Kokkos_Version_Info.hpp

 # Build stage files:
 *.L
@@ -22,4 +28,5 @@ iconmath_Tests

 # Test stage files:
 /**/Testing/*
+iconmath_Tests
 run_tests.f90
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,9 @@ cmake_minimum_required(VERSION 3.18)
 project(
  iconmath
  VERSION 1.0.0
-  LANGUAGES Fortran)
+  LANGUAGES Fortran CXX)
+
+set(CMAKE_CXX_STANDARD 17)

 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 option(BUILD_TESTING "Build tests" ON)
@@ -23,9 +25,11 @@ option(BUILD_ICONMATH_HORIZONTAL "Build horizontal library" ON)

 option(IM_ENABLE_MIXED_PRECISION "Enable mixed precision" OFF)
 option(IM_ENABLE_LOOP_EXCHANGE "Enable loop exchange" OFF)
+option(IM_USE_CPP_BINDINGS "Use C++ bindings" OFF)
 option(IM_ENABLE_DIM_SWAP "Enable dimension swap" OFF)
 option(IM_ENABLE_OPENACC "Enable OpenACC support" OFF)
 option(IM_ENABLE_OPENMP "Enable OpenMP support" OFF)
+set(IM_ENABLE_GPU OFF CACHE STRING "Enable Kokkos GPU support for arch. Valid values: OFF, nvidia-sm80")

 # GNUInstallDirs issues a warning if CMAKE_SIZEOF_VOID_P is not defined, which
 # is the case with NAG. One way to circumvent that is to enable C language for
@@ -107,6 +111,32 @@ else()
  endif()
 endif()

+include(FetchContent)
+# configure kokkos 4.4 repository link
+FetchContent_Declare(
+  kokkos
+  URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz
+  URL_HASH MD5=eafd0d42c9831858aa84fde78576644c)
+
+# disable build of C++23 mdspan experimental support for now
+set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support")
+
+# by default, build the Kokkos serial backend for CPU
+set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Kokkos Serial backend")
+set(Kokkos_ARCH_NATIVE ON CACHE BOOL "Kokkos native architecture optimisations")
+
+if ("${IM_ENABLE_GPU}" STREQUAL "nvidia-sm80")
+  # NVIDIA A100
+  set(Kokkos_ENABLE_CUDA ON CACHE BOOL "Kokkos CUDA backend")
+  set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "CUDA architecture: Ampere cc80")
+endif()
+
+if (${IM_ENABLE_OPENMP})
+  set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "Kokkos OpenMP backend")
+endif()
+
+FetchContent_MakeAvailable(kokkos)
+
 add_subdirectory(src)

 # Allow for 'make test' even if the tests are disabled:

--- a/_typos.toml
+++ b/_typos.toml
@@ -2,6 +2,9 @@
 extend-ignore-re = [
 	".*_pn",
 ]
+extend-ignore-words-re = [
+  "Comput",
+]

 [default.extend-words]
 Wirth = "Wirth" # author name

--- a/src/horizontal/CMakeLists.txt
+++ b/src/horizontal/CMakeLists.txt
@@ -39,7 +39,9 @@ if(IM_ENABLE_OPENACC)
  # If _OPENACC is defined, assume that the required compiler flags are already
  # provided, e.g. in CMAKE_Fortran_FLAGS:
  if(NOT HAS_OPENACC_MACRO)
-    target_compile_options(iconmath-horizontal PRIVATE ${OpenACC_Fortran_OPTIONS})
+    target_compile_options(iconmath-horizontal
+                           PRIVATE
+                           $<$<COMPILE_LANGUAGE:Fortran>:${OpenACC_Fortran_OPTIONS}>)
    # This make sures that unit tests (FortUTF) compiles without the need of
    # passing OpenACC compile option.
    target_link_libraries(iconmath-horizontal PRIVATE OpenACC::OpenACC_Fortran)
@@ -52,11 +54,22 @@ target_include_directories(
    # Path to the Fortran modules:
    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:${Fortran_MODULE_DIRECTORY}>>
    $<INSTALL_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>>
+    # Path to the internal C/C++ headers (for testing): Requires CMake 3.15+ for
+    # multiple compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_SOURCE_DIR}>>
+  PRIVATE
+    # Path to config.h (for C and C++ only): Requires CMake 3.15+ for multiple
+    # compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_BINARY_DIR}>>
 )

 target_link_libraries(iconmath-horizontal PUBLIC fortran-support::fortran-support)
 target_link_libraries(iconmath-horizontal PUBLIC iconmath-support)
 target_link_libraries(iconmath-horizontal PUBLIC iconmath-interpolation)
+target_link_libraries(iconmath-horizontal PRIVATE Kokkos::kokkos)
+set_target_properties(iconmath-horizontal PROPERTIES LINKER_LANGUAGE Fortran)

 install(TARGETS iconmath-horizontal EXPORT "${PROJECT_NAME}-targets")


--- a/src/interpolation/CMakeLists.txt
+++ b/src/interpolation/CMakeLists.txt
@@ -12,8 +12,12 @@
 add_library(
  iconmath-interpolation
  mo_lib_interpolation_scalar.F90
+  mo_lib_interpolation_scalar.cpp
  mo_lib_interpolation_vector.F90
-  mo_lib_intp_rbf.F90)
+  mo_lib_interpolation_vector.cpp
+  mo_lib_intp_rbf.F90
+  mo_lib_intp_rbf.cpp
+  interpolation_bindings.cpp)

 add_library(${PROJECT_NAME}::interpolation ALIAS iconmath-interpolation)

@@ -40,7 +44,8 @@ if(IM_ENABLE_OPENACC)
  # provided, e.g. in CMAKE_Fortran_FLAGS:
  if(NOT HAS_OPENACC_MACRO)
    target_compile_options(iconmath-interpolation
-                           PRIVATE ${OpenACC_Fortran_OPTIONS})
+                           PRIVATE
+                           $<$<COMPILE_LANGUAGE:Fortran>:${OpenACC_Fortran_OPTIONS}>)
    # This make sures that unit tests (FortUTF) compiles without the need of
    # passing OpenACC compile option.
    target_link_libraries(iconmath-interpolation PRIVATE OpenACC::OpenACC_Fortran)
@@ -55,10 +60,21 @@ target_include_directories(
    $<INSTALL_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>>
    # Path to internal include directory
    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:${PROJECT_SOURCE_DIR}/include>>
+    # Path to the internal C/C++ headers (for testing): Requires CMake 3.15+ for
+    # multiple compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_SOURCE_DIR}>>
+  PRIVATE
+    # Path to config.h (for C and C++ only): Requires CMake 3.15+ for multiple
+    # compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_BINARY_DIR}>>
 )

 target_link_libraries(iconmath-interpolation PUBLIC fortran-support::fortran-support)
 target_link_libraries(iconmath-interpolation PUBLIC iconmath-support)
+target_link_libraries(iconmath-interpolation PRIVATE Kokkos::kokkos)
+set_target_properties(iconmath-interpolation PROPERTIES LINKER_LANGUAGE Fortran)

 install(TARGETS iconmath-interpolation EXPORT "${PROJECT_NAME}-targets")


--- a/src/interpolation/interpolation_bindings.cpp
+++ b/src/interpolation/interpolation_bindings.cpp
--- a/src/interpolation/interpolation_bindings.h
+++ b/src/interpolation/interpolation_bindings.h
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+#pragma once
+
+extern "C" {
+
+// mo_lib_interpolation_vector.F90
+void edges2cells_vector_lib_dp(const double *p_vn_in, const double *p_vt_in,
+                               const int *cell_edge_idx,
+                               const int *cell_edge_blk,
+                               const double *e_bln_c_u, const double *e_bln_c_v,
+                               double *p_u_out, double *p_v_out, int i_startblk,
+                               int i_endblk, int i_startidx_in, int i_endidx_in,
+                               int slev, int elev, int nproma, int nlev,
+                               int nblks_e, int nblks_c);
+
+void edges2cells_vector_lib_sp(const float *p_vn_in, const float *p_vt_in,
+                               const int *cell_edge_idx,
+                               const int *cell_edge_blk, const float *e_bln_c_u,
+                               const float *e_bln_c_v, float *p_u_out,
+                               float *p_v_out, int i_startblk, int i_endblk,
+                               int i_startidx_in, int i_endidx_in, int slev,
+                               int elev, int nproma, int nlev, int nblks_e,
+                               int nblks_c);
+
+// mo_lib_interpolation_scalar.F90
+void verts2edges_scalar_lib_dp(
+    const double *p_vertex_in, const int *edge_vertex_idx,
+    const int *edge_vertex_blk, const double *coeff_int, double *p_edge_out,
+    const int i_startblk, const int i_endblk, const int i_startidx_in,
+    const int i_endidx_in, const int slev, const int elev, const int nproma,
+    const int nlev, const int nblks_v, const int nblks_e, const bool lacc);
+
+void verts2edges_scalar_lib_sp(
+    const float *p_vertex_in, const int *edge_vertex_idx,
+    const int *edge_vertex_blk, const float *coeff_int, float *p_edge_out,
+    const int i_startblk, const int i_endblk, const int i_startidx_in,
+    const int i_endidx_in, const int slev, const int elev, const int nproma,
+    const int nlev, const int nblks_v, const int nblks_e, const bool lacc);
+
+void cells2edges_scalar_lib_dp(
+    const double *p_cell_in, const int *edge_cell_idx, const int *edge_cell_blk,
+    const double *coeff_int, double *p_edge_out, const int *i_startblk_in,
+    const int *i_endblk_in, const int *i_startidx_in, const int *i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblk_c, const int nblks_e, const int patch_id,
+    const bool l_limited_area, const bool lfill_latbc, const bool lacc);
+
+void cells2edges_scalar_lib_sp(const float *p_cell_in, const int *edge_cell_idx,
+                               const int *edge_cell_blk, const float *coeff_int,
+                               float *p_edge_out, const int *i_startblk_in,
+                               const int *i_endblk_in, const int *i_startidx_in,
+                               const int *i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblk_c, const int nblks_e,
+                               const int patch_id, const bool l_limited_area,
+                               const bool lfill_latbc, const bool lacc);
+
+void cells2edges_scalar_lib_sp2dp(
+    const float *p_cell_in, const int *edge_cell_idx, const int *edge_cell_blk,
+    const double *coeff_int, double *p_edge_out, const int *i_startblk_in,
+    const int *i_endblk_in, const int *i_startidx_in, const int *i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblk_c, const int nblks_e, const int patch_id,
+    const bool l_limited_area, const bool lfill_latbc, const bool lacc);
+
+void edges2verts_scalar_lib_dp(
+    const double *p_edge_in, const int *vert_edge_idx, const int *vert_edge_blk,
+    const double *v_int, double *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_e, const int nblks_v, const bool lacc);
+
+void edges2verts_scalar_lib_sp(const float *p_edge_in, const int *vert_edge_idx,
+                               const int *vert_edge_blk, const float *v_int,
+                               float *p_vert_out, const int i_startblk,
+                               const int i_endblk, const int i_startidx_in,
+                               const int i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblks_e, const int nblks_v,
+                               const bool lacc);
+
+void edges2cells_scalar_lib_dp(const double *p_edge_in, const int *edge_idx,
+                               const int *edge_blk, const double *coeff_int,
+                               double *p_cell_out, const int i_startblk,
+                               const int i_endblk, const int i_startidx_in,
+                               const int i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblks_e, const int nblks_c,
+                               const bool lacc);
+void edges2cells_scalar_lib_sp(const float *p_edge_in, const int *edge_idx,
+                               const int *edge_blk, const float *coeff_int,
+                               float *p_cell_out, const int i_startblk,
+                               const int i_endblk, const int i_startidx_in,
+                               const int i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblks_e, const int nblks_c,
+                               const bool lacc);
+
+/////////////////////////////////////////////
+
+void cells2verts_scalar_lib_dp(
+    const double *p_cell_in, const int *vert_cell_idx, const int *vert_cell_blk,
+    const double *coeff_int, double *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_c, const int nblks_v, const bool lacc,
+    const bool acc_async);
+void cells2verts_scalar_lib_dp2sp(
+    const double *p_cell_in, const int *vert_cell_idx, const int *vert_cell_blk,
+    const float *coeff_int, float *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_c, const int nblks_v, const bool lacc,
+    const bool acc_async);
+void cells2verts_scalar_lib_sp(const float *p_cell_in, const int *vert_cell_idx,
+                               const int *vert_cell_blk, const float *coeff_int,
+                               float *p_vert_out, const int i_startblk,
+                               const int i_endblk, const int i_startidx_in,
+                               const int i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblks_c, const int nblks_v,
+                               const bool lacc, const bool acc_async);
+
+/////////////////////////////////////////////
+
+void cells2verts_scalar_ri_lib_dp(
+    const double *p_cell_in, const int *vert_cell_idx, const int *vert_cell_blk,
+    const double *coeff_int, double *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_c, const int nblks_v, const bool lacc,
+    const bool acc_async);
+
+void cells2verts_scalar_ri_lib_dp2sp(
+    const double *p_cell_in, const int *vert_cell_idx, const int *vert_cell_blk,
+    const double *coeff_int, float *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_c, const int nblks_v, const bool lacc,
+    const bool acc_async);
+
+void cells2verts_scalar_ri_lib_sp(
+    const float *p_cell_in, const int *vert_cell_idx, const int *vert_cell_blk,
+    const float *coeff_int, float *p_vert_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const int nlev,
+    const int nblks_c, const int nblks_v, const bool lacc,
+    const bool acc_async);
+
+/////////////////////////////////////////////
+
+void verts2cells_scalar_lib_dp(
+    const double *p_vert_in, const int *cell_index_idx,
+    const int *cell_vertex_blk, const double *coeff_int, double *p_cell_out,
+    const int nblks_c, const int npromz_c, const int slev, const int elev,
+    const int nproma, const int nlev, const int nblks_v, const bool lacc);
+
+void verts2cells_scalar_lib_sp(
+    const float *p_vert_in, const int *cell_index_idx,
+    const int *cell_vertex_blk, const float *coeff_int, float *p_cell_out,
+    const int nblks_c, const int npromz_c, const int slev, const int elev,
+    const int nproma, const int nlev, const int nblks_v, const bool lacc);
+
+/////////////////////////////////////////////
+
+void cell_avg_lib_dp(const double *psi_c, const int *cell_neighbor_idx,
+                     const int *cell_neighbor_blk, const double *avg_coeff,
+                     double *avg_psi_c, const int i_startblk,
+                     const int i_endblk, const int i_startidx_in,
+                     const int i_endidx_in, const int slev, const int elev,
+                     const int nproma, const int nlev, const int nblks_c,
+                     const bool lacc);
+void cell_avg_lib_sp(const float *psi_c, const int *cell_neighbor_idx,
+                     const int *cell_neighbor_blk, const float *avg_coeff,
+                     float *avg_psi_c, const int i_startblk, const int i_endblk,
+                     const int i_startidx_in, const int i_endidx_in,
+                     const int slev, const int elev, const int nproma,
+                     const int nlev, const int nblks_c, const bool lacc);
+
+void rbf_vec_interpol_vertex_lib_dp(
+    const double *p_e_in, const int *rbf_vec_idx_v, const int *rbf_vec_blk_v,
+    const double *rbf_vec_coeff_v, double *p_u_out, double *p_v_out,
+    const int i_startblk, const int i_endblk, const int i_startidx_in,
+    const int i_endidx_in, const int slev, const int elev, const int nproma,
+    const bool lacc, const bool acc_async, const int nlev, const int nblks_e,
+    const int nblks_v);
+
+void rbf_vec_interpol_vertex_lib_sp(
+    const float *p_e_in, const int *rbf_vec_idx_v, const int *rbf_vec_blk_v,
+    const float *rbf_vec_coeff_v, float *p_u_out, float *p_v_out,
+    const int i_startblk, const int i_endblk, const int i_startidx_in,
+    const int i_endidx_in, const int slev, const int elev, const int nproma,
+    const bool lacc, const bool acc_async, const int nlev, const int nblks_e,
+    const int nblks_v);
+
+void rbf_vec_interpol_vertex_lib_dpsp(
+    const double *p_e_in, const int *rbf_vec_idx_v, const int *rbf_vec_blk_v,
+    const double *rbf_vec_coeff_v, float *p_u_out, float *p_v_out,
+    const int i_startblk, const int i_endblk, const int i_startidx_in,
+    const int i_endidx_in, const int slev, const int elev, const int nproma,
+    const bool lacc, const bool acc_async, const int nlev, const int nblks_e,
+    const int nblks_v);
+
+void rbf_interpol_c2grad_lib_sp(
+    const float *p_cell_in, const int *rbf_c2grad_idx,
+    const int *rbf_c2grad_blk, const float *rbf_c2grad_coeff, float *grad_x,
+    const float *grad_y, int i_startblk, int i_endblk, int i_startidx_in,
+    int i_endidx_in, int slev, int elev, int nproma, int rbf_c2grad_dim,
+    int nlev, int nblk_c, bool lacc);
+
+void rbf_interpol_c2grad_lib_dp(
+    const double *p_cell_in, const int *rbf_c2grad_idx,
+    const int *rbf_c2grad_blk, const double *rbf_c2grad_coeff, double *grad_x,
+    const double *grad_y, int i_startblk, int i_endblk, int i_startidx_in,
+    int i_endidx_in, int slev, int elev, int nproma, int rbf_c2grad_dim,
+    int nlev, int nblk_c, bool lacc);
+
+void rbf_vec_interpol_cell_lib_sp(
+    const float *p_vn_in, const int *rbf_vec_idx_c, const int *rbf_vec_blk_c,
+    const float *rbf_vec_coeff_c, float *p_u_out, float *p_v_out,
+    int i_startblk, int i_endblk, int i_startidx_in, int i_endidx_in, int slev,
+    int elev, int nproma, int nlev, int nblks_c, int nblks_e, int rbf_vec_dim_c,
+    bool lacc, bool acc_async);
+
+void rbf_vec_interpol_cell_lib_dp(
+    const double *p_vn_in, const int *rbf_vec_idx_c, const int *rbf_vec_blk_c,
+    const double *rbf_vec_coeff_c, double *p_u_out, double *p_v_out,
+    int i_startblk, int i_endblk, int i_startidx_in, int i_endidx_in, int slev,
+    int elev, int nproma, int nlev, int nblks_c, int nblks_e, int rbf_vec_dim_c,
+    bool lacc, bool acc_async);
+
+void rbf_vec_interpol_edge_lib_dp(
+    const double *p_vn_in, const int *rbf_vec_idx_e, const int *rbf_vec_blk_e,
+    const double *rbf_vec_coeff_e, double *p_vt_out, int i_startblk,
+    int i_endblk, int i_startidx_in, int i_endidx_in, int slev, int elev,
+    int nlev, int nproma, int rbf_vec_dim_e, int nblks_e, bool lacc,
+    bool acc_async);
+
+void rbf_vec_interpol_edge_lib_sp(
+    const float *p_vn_in, const int *rbf_vec_idx_e, const int *rbf_vec_blk_e,
+    const float *rbf_vec_coeff_e, float *p_vt_out, int i_startblk, int i_endblk,
+    int i_startidx_in, int i_endidx_in, int slev, int elev, int nlev,
+    int nproma, int rbf_vec_dim_e, int nblks_e, bool lacc, bool acc_async);
+}
--- a/src/interpolation/mo_lib_interpolation_scalar.cpp
+++ b/src/interpolation/mo_lib_interpolation_scalar.cpp
--- a/src/interpolation/mo_lib_interpolation_scalar.hpp
+++ b/src/interpolation/mo_lib_interpolation_scalar.hpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#pragma once
+
+template <typename T>
+void verts2edges_scalar_lib(const T *p_vertex_in, const int *edge_vertex_idx,
+                            const int *edge_vertex_blk, const T *coeff_int,
+                            T *p_edge_out, const int i_startblk,
+                            const int i_endblk, const int i_startidx_in,
+                            const int i_endidx_in, const int slev,
+                            const int elev, const int nproma, const int nlev,
+                            const int nblks_v, const int nblks_e,
+                            const bool lacc);
+;
+
+template <typename T, typename S>
+void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx,
+                            const int *edge_cell_blk, const S *coeff_int,
+                            S *p_edge_out, const int *i_startblk_in,
+                            const int *i_endblk_in, const int *i_startidx_in,
+                            const int *i_endidx_in, const int slev,
+                            const int elev, const int nproma, const int nlev,
+                            const int nblk_c, const int nblks_e,
+                            const int patch_id, const bool l_limited_area,
+                            const bool lfill_latbc, const bool lacc);
+
+template <typename T>
+void edges2verts_scalar_lib(const T *p_edge_in, const int *vert_edge_idx,
+                            const int *vert_edge_blk, const T *v_int,
+                            T *p_vert_out, const int i_startblk,
+                            const int i_endblk, const int i_startidx_in,
+                            const int i_endidx_in, const int slev,
+                            const int elev, const int nproma, const int nlev,
+                            const int nblks_e, const int nblks_v,
+                            const bool lacc);
+
+template <typename T>
+void edges2cells_scalar_lib(const T *p_edge_in, const int *edge_idx,
+                            const int *edge_blk, const T *coeff_int,
+                            T *p_cell_out, const int i_startblk,
+                            const int i_endblk, const int i_startidx_in,
+                            const int i_endidx_in, const int slev,
+                            const int elev, const int nproma, const int nlev,
+                            const int nblks_e, const int nblks_c,
+                            const bool lacc);
+
+template <typename T, typename S>
+void cells2verts_scalar_lib(const T *p_cell_in, const int *vert_cell_idx,
+                            const int *vert_cell_blk, const S *coeff_int,
+                            S *p_vert_out, const int i_startblk,
+                            const int i_endblk, const int i_startidx_in,
+                            const int i_endidx_in, const int slev,
+                            const int elev, const int nproma, const int nlev,
+                            const int nblks_c, const int nblks_v,
+                            const bool lacc, const bool acc_async);
+
+template <typename T, typename S>
+void cells2verts_scalar_ri_lib(const T *p_cell_in, const int *vert_cell_idx,
+                               const int *vert_cell_blk, const T *coeff_int,
+                               S *p_vert_out, const int i_startblk,
+                               const int i_endblk, const int i_startidx_in,
+                               const int i_endidx_in, const int slev,
+                               const int elev, const int nproma, const int nlev,
+                               const int nblks_c, const int nblks_v,
+                               const bool lacc, const bool acc_async);
+
+template <typename T>
+void verts2cells_scalar_lib(const T *p_vert_in, const int *cell_index_idx,
+                            const int *cell_vertex_blk, const T *coeff_int,
+                            T *p_cell_out, const int nblks_c,
+                            const int npromz_c, const int slev, const int elev,
+                            const int nproma, const int nlev, const int nblks_v,
+                            const bool lacc);
+
+template <typename T>
+void cell_avg_lib(const T *psi_c, const int *cell_neighbor_idx,
+                  const int *cell_neighbor_blk, const T *avg_coeff,
+                  T *avg_psi_c, const int i_startblk, const int i_endblk,
+                  const int i_startidx_in, const int i_endidx_in,
+                  const int slev, const int elev, const int nproma,
+                  const int nlev, const int nblks_c, const bool lacc);
--- a/src/interpolation/mo_lib_interpolation_vector.cpp
+++ b/src/interpolation/mo_lib_interpolation_vector.cpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#include "mo_lib_interpolation_vector.hpp"
+
+template <typename T>
+void edges2cells_vector_lib(const T *p_vn_in, const T *p_vt_in,
+                            const int *cell_edge_idx, const int *cell_edge_blk,
+                            const T *e_bln_c_u, const T *e_bln_c_v, T *p_u_out,
+                            T *p_v_out,
+                            // Additional integer parameters.
+                            int i_startblk, int i_endblk, int i_startidx_in,
+                            int i_endidx_in, int slev, int elev, int nproma,
+                            // Dimensions for the arrays.
+                            int nlev, int nblks_e, int nblks_c) {
+
+  // Wrap raw pointers in unmanaged Kokkos Views.
+  typedef Kokkos::View<const T ***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged>
+      UnmanagedConstT3D;
+  typedef Kokkos::View<T ***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged>
+      UnmanagedT3D;
+  typedef Kokkos::View<const int ***, Kokkos::LayoutLeft,
+                       Kokkos::MemoryUnmanaged>
+      UnmanagedConstInt3D;
+
+  UnmanagedConstT3D p_vn_in_view(p_vn_in, nproma, nlev, nblks_e);
+  UnmanagedConstT3D p_vt_in_view(p_vt_in, nproma, nlev, nblks_e);
+
+  UnmanagedConstInt3D cell_edge_idx_view(cell_edge_idx, nproma, nblks_c, 3);
+  UnmanagedConstInt3D cell_edge_blk_view(cell_edge_blk, nproma, nblks_c, 3);
+
+  UnmanagedConstT3D e_bln_c_u_view(e_bln_c_u, nproma, 6, nblks_c);
+  UnmanagedConstT3D e_bln_c_v_view(e_bln_c_v, nproma, 6, nblks_c);
+
+  UnmanagedT3D p_u_out_view(p_u_out, nproma, nlev, nblks_c);
+  UnmanagedT3D p_v_out_view(p_v_out, nproma, nlev, nblks_c);
+
+  // Loop over cell blocks as in the original Fortran code.
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
+    // Call get_indices_c_lib to get inner loop indices for block jb.
+    int i_startidx, i_endidx;
+    get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
+                      i_endblk, i_startidx, i_endidx);
+
+    Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy(
+        {slev, i_startidx}, {elev + 1, i_endidx + 1});
+    Kokkos::parallel_for(
+        "edges2cells_inner", innerPolicy,
+        KOKKOS_LAMBDA(const int jk, const int jc) {
+          // Compute the bilinear interpolation for cell (jc, jk, jb).
+          p_u_out_view(jc, jk, jb) =
+              e_bln_c_u_view(jc, 0, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 0) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 0) - 1) +
+              e_bln_c_u_view(jc, 1, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 0) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 0) - 1) +
+              e_bln_c_u_view(jc, 2, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 1) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 1) - 1) +
+              e_bln_c_u_view(jc, 3, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 1) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 1) - 1) +
+              e_bln_c_u_view(jc, 4, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 2) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 2) - 1) +
+              e_bln_c_u_view(jc, 5, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 2) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 2) - 1);
+
+          p_v_out_view(jc, jk, jb) =
+              e_bln_c_v_view(jc, 0, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 0) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 0) - 1) +
+              e_bln_c_v_view(jc, 1, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 0) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 0) - 1) +
+              e_bln_c_v_view(jc, 2, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 1) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 1) - 1) +
+              e_bln_c_v_view(jc, 3, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 1) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 1) - 1) +
+              e_bln_c_v_view(jc, 4, jb) *
+                  p_vn_in_view(cell_edge_idx_view(jc, jb, 2) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 2) - 1) +
+              e_bln_c_v_view(jc, 5, jb) *
+                  p_vt_in_view(cell_edge_idx_view(jc, jb, 2) - 1, jk,
+                               cell_edge_blk_view(jc, jb, 2) - 1);
+        });
+    // Optionally fence after each block if required.
+    Kokkos::fence();
+  }
+}
+
+template void edges2cells_vector_lib<double>(
+    const double *p_vn_in, const double *p_vt_in, const int *cell_edge_idx,
+    const int *cell_edge_blk, const double *e_bln_c_u, const double *e_bln_c_v,
+    double *p_u_out, double *p_v_out,
+    // Additional integer parameters.
+    int i_startblk, int i_endblk, int i_startidx_in, int i_endidx_in, int slev,
+    int elev, int nproma,
+    // Dimensions for the arrays.
+    int nlev, int nblks_e, int nblks_c);
+
+template void edges2cells_vector_lib<float>(
+    const float *p_vn_in, const float *p_vt_in, const int *cell_edge_idx,
+    const int *cell_edge_blk, const float *e_bln_c_u, const float *e_bln_c_v,
+    float *p_u_out, float *p_v_out,
+    // Additional integer parameters.
+    int i_startblk, int i_endblk, int i_startidx_in, int i_endidx_in, int slev,
+    int elev, int nproma,
+    // Dimensions for the arrays.
+    int nlev, int nblks_e, int nblks_c);
\ No newline at end of file
--- a/src/interpolation/mo_lib_interpolation_vector.hpp
+++ b/src/interpolation/mo_lib_interpolation_vector.hpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+#pragma once
+
+#include "mo_lib_loopindices.hpp"
+#include <Kokkos_Core.hpp>
+#include <vector>
+
+// The templated C++ function using Kokkos.
+// Raw pointer arguments are wrapped into unmanaged Kokkos::Views.
+// Note: The dimensions below must match the Fortran arrays.
+//   - p_vn_in and p_vt_in:   dimensions [nproma, nlev, nblks_e]
+//   - cell_edge_idx and cell_edge_blk: dimensions [nproma, nblks_c, 3]
+//   - e_bln_c_u and e_bln_c_v: dimensions [nproma, 6, nblks_c]
+//   - p_u_out and p_v_out:   dimensions [nproma, nlev, nblks_c]
+template <typename T>
+void edges2cells_vector_lib(const T *p_vn_in, const T *p_vt_in,
+                            const int *cell_edge_idx, const int *cell_edge_blk,
+                            const T *e_bln_c_u, const T *e_bln_c_v, T *p_u_out,
+                            T *p_v_out,
+                            // Additional integer parameters.
+                            int i_startblk, int i_endblk, int i_startidx_in,
+                            int i_endidx_in, int slev, int elev, int nproma,
+                            // Dimensions for the arrays.
+                            int nlev, int nblks_e, int nblks_c);
\ No newline at end of file
--- a/src/interpolation/mo_lib_intp_rbf.cpp
+++ b/src/interpolation/mo_lib_intp_rbf.cpp
--- a/src/interpolation/mo_lib_intp_rbf.hpp
+++ b/src/interpolation/mo_lib_intp_rbf.hpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+#pragma once
+
+#include "mo_lib_loopindices.hpp"
+#include <Kokkos_Core.hpp>
+#include <vector>
+
+template <typename T, typename S>
+void rbf_vec_interpol_vertex_lib(
+    const T *p_e_in, const int *rbf_vec_idx_v, const int *rbf_vec_blk_v,
+    const T *rbf_vec_coeff_v, S *p_u_out, S *p_v_out, const int i_startblk,
+    const int i_endblk, const int i_startidx_in, const int i_endidx_in,
+    const int slev, const int elev, const int nproma, const bool lacc,
+    const bool acc_async, const int nlev, const int nblks_e, const int nblks_c);
+
+template <typename T>
+void rbf_interpol_c2grad_lib(const T *p_cell_in, const int *rbf_c2grad_idx,
+                             const int *rbf_c2grad_blk,
+                             const T *rbf_c2grad_coeff, T *grad_x, T *grad_y,
+                             int i_startblk, int i_endblk, int i_startidx_in,
+                             int i_endidx_in, int slev, int elev, int nproma,
+                             int rbf_c2grad_dim, int nlev, int nblks_c,
+                             bool lacc);
+
+template <typename T>
+void rbf_vec_interpol_cell_lib(const T *p_vn_in, const int *rbf_vec_idx_c,
+                               const int *rbf_vec_blk_c,
+                               const T *rbf_vec_coeff_c, T *p_u_out, T *p_v_out,
+                               int i_startblk, int i_endblk, int i_startidx_in,
+                               int i_endidx_in, int slev, int elev, int nproma,
+                               int nlev, int nblks_c, int nblks_e,
+                               int rbf_vec_dim_c, bool lacc, bool acc_async);
+
+template <typename T>
+void rbf_vec_interpol_edge_lib(const T *p_vn_in, const int *rbf_vec_idx_e,
+                               const int *rbf_vec_blk_e,
+                               const T *rbf_vec_coeff_e, T *p_vt_out,
+                               int i_startblk, int i_endblk, int i_startidx_in,
+                               int i_endidx_in, int slev, int elev, int nlev,
+                               int nproma, int rbf_vec_dim_e, int nblks_e,
+                               bool lacc, bool acc_async);
--- a/src/support/CMakeLists.txt
+++ b/src/support/CMakeLists.txt
@@ -13,11 +13,14 @@ add_library(
  iconmath-support
  mo_gridman_constants.f90
  mo_lib_grid_geometry_info.f90
-  mo_lib_loopindices.f90
+  mo_lib_loopindices.cpp
+  mo_lib_loopindices.F90
  mo_math_constants.f90
  mo_math_types.f90
+  mo_math_utilities.cpp
  mo_math_utilities.F90
-  mo_random_number_generators.F90)
+  mo_random_number_generators.F90
+  support_bindings.cpp)

 add_library(${PROJECT_NAME}::support ALIAS iconmath-support)

@@ -40,11 +43,17 @@ if(IM_ENABLE_DIM_SWAP)
  target_compile_definitions(iconmath-support PRIVATE __SWAPDIM)
 endif()

+if(IM_USE_CPP_BINDINGS)
+  target_compile_definitions(iconmath-support PUBLIC __USE_CPP_BINDINGS)
+endif()
+
 if(IM_ENABLE_OPENACC)
  # If _OPENACC is defined, assume that the required compiler flags are already
  # provided, e.g. in CMAKE_Fortran_FLAGS:
  if(NOT HAS_OPENACC_MACRO)
-    target_compile_options(iconmath-support PRIVATE ${OpenACC_Fortran_OPTIONS})
+    target_compile_options(iconmath-support
+                           PRIVATE
+                           $<$<COMPILE_LANGUAGE:Fortran>:${OpenACC_Fortran_OPTIONS}>)
    # This make sures that unit tests (FortUTF) compiles without the need of
    # passing OpenACC compile option.
    target_link_libraries(iconmath-support PRIVATE OpenACC::OpenACC_Fortran)
@@ -57,9 +66,24 @@ target_include_directories(
    # Path to the Fortran modules:
    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:${Fortran_MODULE_DIRECTORY}>>
    $<INSTALL_INTERFACE:$<$<COMPILE_LANGUAGE:Fortran>:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>>
-)
+  INTERFACE
+    # Path to the internal C/C++ headers (for testing): Requires CMake 3.15+ for
+    # multiple compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_SOURCE_DIR}>>
+  PRIVATE
+    # Path to config.h (for C and C++ only): Requires CMake 3.15+ for multiple
+    # compile languages
+    # https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html
+    $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:C,CXX>:${CMAKE_CURRENT_BINARY_DIR}>>)

-target_link_libraries(iconmath-support PUBLIC fortran-support::fortran-support)
+target_link_libraries(iconmath-support
+    PUBLIC
+        fortran-support::fortran-support
+    PRIVATE
+        Kokkos::kokkos
+)
+set_target_properties(iconmath-support PROPERTIES LINKER_LANGUAGE Fortran)

 install(TARGETS iconmath-support EXPORT "${PROJECT_NAME}-targets")


--- a/src/support/mo_lib_loopindices.f90
+++ b/src/support/mo_lib_loopindices.f90
@@ -16,12 +16,18 @@

 MODULE mo_lib_loopindices

+#ifdef __USE_CPP_BINDINGS
+  USE, INTRINSIC :: ISO_C_BINDING
+#endif
+
  IMPLICIT NONE

  PRIVATE

  PUBLIC :: get_indices_c_lib, get_indices_e_lib, get_indices_v_lib

+#ifndef __USE_CPP_BINDINGS
+
 CONTAINS

 !-------------------------------------------------------------------------
@@ -121,5 +127,31 @@ CONTAINS

  END SUBROUTINE get_indices_v_lib

-END MODULE mo_lib_loopindices
+#else
+
+  INTERFACE
+    SUBROUTINE get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_c_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_c_lib
+
+    SUBROUTINE get_indices_e_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_e_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_e_lib
+
+    SUBROUTINE get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk, &
+                                 i_startidx_out, i_endidx_out) BIND(C, NAME="get_indices_v_lib")
+      IMPORT :: c_int
+      INTEGER(c_int), VALUE :: i_startidx_in, i_endidx_in, nproma, i_blk, i_startblk, i_endblk
+      INTEGER(c_int) :: i_startidx_out, i_endidx_out
+    END SUBROUTINE get_indices_v_lib
+  END INTERFACE
+
+#endif

+END MODULE mo_lib_loopindices
--- a/src/support/mo_lib_loopindices.cpp
+++ b/src/support/mo_lib_loopindices.cpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#include <algorithm> // For std::max
+
+// get_indices_c_lib function
+void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+                        const int i_blk, const int i_startblk, const int i_endblk,
+                        int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
+    
+    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from
+    //(either fortran or c++), the first index should be either 0 or 1.
+    int first_index;
+    if (called_from_cpp)
+        first_index = 0;
+    else
+        first_index = 1;                   
+    
+    if (i_blk == i_startblk) {
+        i_startidx_out = std::max(first_index, i_startidx_in);
+        i_endidx_out = nproma;
+        if (i_blk == i_endblk) {
+            i_endidx_out = i_endidx_in;
+        }
+    } else if (i_blk == i_endblk) {
+        i_startidx_out = first_index;
+        i_endidx_out = i_endidx_in;
+    } else {
+        i_startidx_out = first_index;
+        i_endidx_out = nproma;
+    }
+}
+
+// get_indices_e_lib function
+void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+                        const int i_blk, const int i_startblk, const int i_endblk,
+                        int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
+    
+    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, 
+    //the first index should be either 0 or 1.
+    int first_index;
+    if (called_from_cpp)
+        first_index = 0;
+    else
+        first_index = 1;
+
+    i_startidx_out = (i_blk != i_startblk) ? first_index : std::max(first_index, i_startidx_in);
+    i_endidx_out = (i_blk != i_endblk) ? nproma : i_endidx_in;
+}
+
+// get_indices_v_lib function
+void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+                        const int i_blk, const int i_startblk, const int i_endblk,
+                        int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
+    
+    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, 
+    //the first index should be either 0 or 1.
+    int first_index;
+    if (called_from_cpp)
+        first_index = 0;
+    else
+        first_index = 1;
+
+    if (i_blk == i_startblk) {
+        i_startidx_out = i_startidx_in;
+        i_endidx_out = nproma;
+        if (i_blk == i_endblk) {
+            i_endidx_out = i_endidx_in;
+        }
+    } else if (i_blk == i_endblk) {
+        i_startidx_out = first_index;
+        i_endidx_out = i_endidx_in;
+    } else {
+        i_startidx_out = first_index;
+        i_endidx_out = nproma;
+    }
+}
\ No newline at end of file
--- a/src/support/mo_lib_loopindices.hpp
+++ b/src/support/mo_lib_loopindices.hpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+#pragma once
+
+void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+    const int i_blk, const int i_startblk, const int i_endblk,
+    int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true);
+
+void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+                        const int i_blk, const int i_startblk, const int i_endblk,
+                        int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true);
+
+void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+                            const int i_blk, const int i_startblk, const int i_endblk,
+                            int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true);
+    
\ No newline at end of file
--- a/src/support/mo_math_utilities.F90
+++ b/src/support/mo_math_utilities.F90
@@ -22,6 +22,7 @@
 ! #endif
 MODULE mo_math_utilities

+  USE, INTRINSIC :: ISO_C_BINDING
  USE mo_iconlib_kind, ONLY: wp, dp, sp
  USE mo_math_constants, ONLY: pi, pi_2, dbl_eps
  USE mo_gridman_constants, ONLY: SUCCESS, TORUS_MAX_LAT
@@ -78,7 +79,11 @@ MODULE mo_math_utilities
  PUBLIC :: line_intersect
  PUBLIC :: lintersect
  PUBLIC :: tdma_solver
+#ifndef __USE_CPP_BINDINGS
  PUBLIC :: tdma_solver_vec
+#else
+  PUBLIC :: tdma_solver_vec_dp
+#endif
  PUBLIC :: check_orientation

  !  vertical coordinates routines
@@ -126,6 +131,11 @@ MODULE mo_math_utilities
    MODULE PROCEDURE check_orientation_sp
  END INTERFACE

+  INTERFACE rotate_latlon
+    MODULE PROCEDURE rotate_latlon_dp
+    MODULE PROCEDURE rotate_latlon_sp
+  END INTERFACE
+
  !> Derived type specifying a set of REAL(dp) values.
  !
  TYPE t_value_set
@@ -160,7 +170,99 @@ MODULE mo_math_utilities

  CHARACTER(LEN=*), PARAMETER :: modname = 'mo_math_utilities'

+  !-------------------------------------------------------------------------
+  !>
+  !! TDMA tridiagonal matrix solver for a_i*x_(i-1) + b_i*x_i + c_i*x_(i+1) = d_i
+  !!
+  !!       a - sub-diagonal (means it is the diagonal below the main diagonal)
+  !!       b - the main diagonal
+  !!       c - sup-diagonal (means it is the diagonal above the main diagonal)
+  !!       d - right part
+  !!  varout - the answer (identical to x in description above)
+  !!    slev - start level (top)
+  !!    elev - end level (bottom)
+
+! Preprocessor directive to conditionally include the tdma_solver_vec implementation
+#ifndef __USE_CPP_BINDINGS
+
+CONTAINS
+
+  SUBROUTINE tdma_solver_vec(a, b, c, d, slev, elev, startidx, endidx, varout, opt_acc_queue)
+    INTEGER, INTENT(IN) :: slev, elev
+    INTEGER, INTENT(IN) :: startidx, endidx
+    REAL(wp), INTENT(IN) :: a(:, :), b(:, :), c(:, :), d(:, :)
+    REAL(wp), INTENT(OUT) :: varout(:, :)
+    INTEGER, OPTIONAL, INTENT(IN) :: opt_acc_queue
+
+    !
+    ! local
+    REAL(wp):: m, c_p(SIZE(a, 1), SIZE(a, 2)), d_p(SIZE(a, 1), SIZE(a, 2))
+    INTEGER :: i
+    INTEGER :: jc
+    INTEGER :: acc_queue
+
+    IF (PRESENT(opt_acc_queue)) THEN
+      acc_queue = opt_acc_queue
+    ELSE
+      acc_queue = 1
+    END IF
+
+    ! initialize c-prime and d-prime
+    !$ACC PARALLEL DEFAULT(PRESENT) CREATE(c_p, d_p) ASYNC(acc_queue)
+    !$ACC LOOP GANG(STATIC: 1) VECTOR
+    DO jc = startidx, endidx
+      c_p(jc, slev) = c(jc, slev)/b(jc, slev)
+      d_p(jc, slev) = d(jc, slev)/b(jc, slev)
+    END DO
+    ! solve for vectors c-prime and d-prime
+    !$ACC LOOP SEQ
+!NEC$ outerloop_unroll(4)
+    DO i = slev + 1, elev
+      !$ACC LOOP GANG(STATIC: 1) VECTOR PRIVATE(m)
+      DO jc = startidx, endidx
+        m = 1._wp/(b(jc, i) - c_p(jc, i - 1)*a(jc, i))
+        c_p(jc, i) = c(jc, i)*m
+        d_p(jc, i) = (d(jc, i) - d_p(jc, i - 1)*a(jc, i))*m
+      END DO
+    END DO
+    ! initialize varout
+    !$ACC LOOP GANG(STATIC: 1) VECTOR
+    DO jc = startidx, endidx
+      varout(jc, elev) = d_p(jc, elev)
+    END DO
+    ! solve for varout from the vectors c-prime and d-prime
+    !$ACC LOOP SEQ
+!NEC$ outerloop_unroll(4)
+    DO i = elev - 1, slev, -1
+      !$ACC LOOP GANG(STATIC: 1) VECTOR
+      DO jc = startidx, endidx
+        varout(jc, i) = d_p(jc, i) - c_p(jc, i)*varout(jc, i + 1)
+      END DO
+    END DO
+    !$ACC END PARALLEL
+
+    IF (.NOT. PRESENT(opt_acc_queue)) THEN
+      !$ACC WAIT(acc_queue)
+    END IF
+
+  END SUBROUTINE tdma_solver_vec
+
+#else
+
+  ! C++ binding for tdma_solver_vec
+  INTERFACE
+    SUBROUTINE tdma_solver_vec_dp(a, b, c, d, slev, elev, startidx, endidx, nrows, ncols, varout, opt_acc_queue) &
+      BIND(C, NAME="tdma_solver_vec_dp")
+      IMPORT :: c_double, c_int
+      REAL(c_double), INTENT(IN) :: a(*), b(*), c(*), d(*)
+      INTEGER(c_int), VALUE :: slev, elev, startidx, endidx, nrows, ncols
+      REAL(c_double), INTENT(OUT) :: varout(*)
+      INTEGER(c_int), OPTIONAL :: opt_acc_queue
+    END SUBROUTINE tdma_solver_vec_dp
+  END INTERFACE
+
 CONTAINS
+#endif

  !-------------------------------------------------------------------------
  ! Variant for double-precision (or working-precision=dp) lon+lat in ICON
@@ -1674,12 +1776,12 @@ CONTAINS
  !! See the routine "mo_lonlat_grid::rotate_latlon_grid" for a
  !! detailed description of the transformation process.
  !!
-  SUBROUTINE rotate_latlon(lat, lon, pollat, pollon)
+  SUBROUTINE rotate_latlon_dp(lat, lon, pollat, pollon)

-    REAL(wp), INTENT(inout) :: lat, lon
-    REAL(wp), INTENT(in)    :: pollat, pollon
+    REAL(dp), INTENT(inout) :: lat, lon
+    REAL(dp), INTENT(in)    :: pollat, pollon

-    REAL(wp) :: rotlat, rotlon
+    REAL(dp) :: rotlat, rotlon

    !-----------------------------------------------------------------------
    rotlat = ASIN(SIN(lat)*SIN(pollat) + COS(lat)*COS(pollat)*COS(lon - pollon))
@@ -1689,7 +1791,24 @@ CONTAINS
    lat = rotlat
    lon = rotlon

-  END SUBROUTINE rotate_latlon
+  END SUBROUTINE rotate_latlon_dp
+
+  SUBROUTINE rotate_latlon_sp(lat, lon, pollat, pollon)
+
+    REAL(sp), INTENT(inout) :: lat, lon
+    REAL(sp), INTENT(in)    :: pollat, pollon
+
+    REAL(sp) :: rotlat, rotlon
+
+    !-----------------------------------------------------------------------
+    rotlat = ASIN(SIN(lat)*SIN(pollat) + COS(lat)*COS(pollat)*COS(lon - pollon))
+    rotlon = ATAN2(COS(lat)*SIN(lon - pollon), &
+      & (COS(lat)*SIN(pollat)*COS(lon - pollon) - SIN(lat)*COS(pollat)))
+
+    lat = rotlat
+    lon = rotlon
+
+  END SUBROUTINE rotate_latlon_sp
  !-------------------------------------------------------------------------

  !-----------------------------------------------------------------------
@@ -2019,96 +2138,23 @@ CONTAINS

  END SUBROUTINE tdma_solver

-  !-------------------------------------------------------------------------
-  !>
-  !! TDMA tridiagonal matrix solver for a_i*x_(i-1) + b_i*x_i + c_i*x_(i+1) = d_i
-  !!
-  !!       a - sub-diagonal (means it is the diagonal below the main diagonal)
-  !!       b - the main diagonal
-  !!       c - sup-diagonal (means it is the diagonal above the main diagonal)
-  !!       d - right part
-  !!  varout - the answer (identical to x in description above)
-  !!    slev - start level (top)
-  !!    elev - end level (bottom)
-  SUBROUTINE tdma_solver_vec(a, b, c, d, slev, elev, startidx, endidx, varout, opt_acc_queue)
-    INTEGER, INTENT(IN) :: slev, elev
-    INTEGER, INTENT(IN) :: startidx, endidx
-    REAL(wp), INTENT(IN) :: a(:, :), b(:, :), c(:, :), d(:, :)
-    REAL(wp), INTENT(OUT) :: varout(:, :)
-    INTEGER, OPTIONAL, INTENT(IN) :: opt_acc_queue
-
-    !
-    ! local
-    REAL(wp):: m, c_p(SIZE(a, 1), SIZE(a, 2)), d_p(SIZE(a, 1), SIZE(a, 2))
-    INTEGER :: i
-    INTEGER :: jc
-    INTEGER :: acc_queue
-
-    IF (PRESENT(opt_acc_queue)) THEN
-      acc_queue = opt_acc_queue
-    ELSE
-      acc_queue = 1
-    END IF
-
-    ! initialize c-prime and d-prime
-    !$ACC PARALLEL DEFAULT(PRESENT) CREATE(c_p, d_p) ASYNC(acc_queue)
-    !$ACC LOOP GANG(STATIC: 1) VECTOR
-    DO jc = startidx, endidx
-      c_p(jc, slev) = c(jc, slev)/b(jc, slev)
-      d_p(jc, slev) = d(jc, slev)/b(jc, slev)
-    END DO
-    ! solve for vectors c-prime and d-prime
-    !$ACC LOOP SEQ
-!NEC$ outerloop_unroll(4)
-    DO i = slev + 1, elev
-      !$ACC LOOP GANG(STATIC: 1) VECTOR PRIVATE(m)
-      DO jc = startidx, endidx
-        m = 1._wp/(b(jc, i) - c_p(jc, i - 1)*a(jc, i))
-        c_p(jc, i) = c(jc, i)*m
-        d_p(jc, i) = (d(jc, i) - d_p(jc, i - 1)*a(jc, i))*m
-      END DO
-    END DO
-    ! initialize varout
-    !$ACC LOOP GANG(STATIC: 1) VECTOR
-    DO jc = startidx, endidx
-      varout(jc, elev) = d_p(jc, elev)
-    END DO
-    ! solve for varout from the vectors c-prime and d-prime
-    !$ACC LOOP SEQ
-!NEC$ outerloop_unroll(4)
-    DO i = elev - 1, slev, -1
-      !$ACC LOOP GANG(STATIC: 1) VECTOR
-      DO jc = startidx, endidx
-        varout(jc, i) = d_p(jc, i) - c_p(jc, i)*varout(jc, i + 1)
-      END DO
-    END DO
-    !$ACC END PARALLEL
-
-    IF (.NOT. PRESENT(opt_acc_queue)) THEN
-      !$ACC WAIT(acc_queue)
-    END IF
-
-  END SUBROUTINE tdma_solver_vec
-  !-------------------------------------------------------------------------
-
  !-------------------------------------------------------------------------
  !
  !> Helper functions for computing the vertical layer structure
-  !> zlev double-precision in ICON
  SUBROUTINE set_zlev(zlev_i, zlev_m, n_zlev, dzlev_m)
    INTEGER, INTENT(IN)  :: n_zlev
-    REAL(dp), INTENT(OUT) :: zlev_i(n_zlev + 1), zlev_m(n_zlev)
-    REAL(dp), INTENT(IN)  :: dzlev_m(:) ! namelist input of layer thickness
+    REAL(wp), INTENT(OUT) :: zlev_i(n_zlev + 1), zlev_m(n_zlev)
+    REAL(wp), INTENT(IN)  :: dzlev_m(:) ! namelist input of layer thickness

    INTEGER :: jk
-    REAL(dp) :: accum
+    REAL(wp) :: accum

-    accum = 0.0_dp
+    accum = 0.0_wp
    ! zlev_i    : upper border surface of vertical cells
    ! zlev_m    : position of coordinate surfaces in meters below zero surface.
    DO jk = 1, n_zlev
      zlev_i(jk) = accum
-      zlev_m(jk) = 0.5_dp*(2.0_dp*accum + dzlev_m(jk))
+      zlev_m(jk) = 0.5_wp*(2.0_wp*accum + dzlev_m(jk))
      accum = accum + dzlev_m(jk)
    END DO
    zlev_i(n_zlev + 1) = accum

--- a/src/support/mo_math_utilities.cpp
+++ b/src/support/mo_math_utilities.cpp
+// ICON
+//
+// ---------------------------------------------------------------
+// Copyright (C) 2004-2024, DWD, MPI-M, DKRZ, KIT, ETH, MeteoSwiss
+// Contact information: icon-model.org
+//
+// See AUTHORS.TXT for a list of authors
+// See LICENSES/ for license information
+// SPDX-License-Identifier: BSD-3-Clause
+// ---------------------------------------------------------------
+
+#include "mo_math_utilities.hpp"
+
+template <typename T>
+void tdma_solver_vec(const T* a, const T* b, const T* c, const T* d,
+                              int slev, int elev, int startidx, int endidx,
+                              int nrows, int ncols, T* varout) {
+
+  // Start timing
+  auto start_time = std::chrono::high_resolution_clock::now();
+
+  // Allocate temporary arrays using Kokkos::View.
+  // The views c_p and d_p are allocated as 2D arrays with dimensions [nrows][ncols].
+  // Kokkos::View automatically handles memory management.
+  Kokkos::View<T**> c_p("c_p", nrows, ncols);
+  Kokkos::View<T**> d_p("d_p", nrows, ncols);
+
+  // Wrap the input arrays in unmanaged views.
+  // We assume that the input arrays are laid out in column-major order as in the original code.
+  // Here we use LayoutLeft so that the first index (row) is contiguous.
+  typedef Kokkos::View<const T**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedConst2D;
+  typedef Kokkos::View<T**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> Unmanaged2D;
+  UnmanagedConst2D a_view(a, nrows, ncols);
+  UnmanagedConst2D b_view(b, nrows, ncols);
+  UnmanagedConst2D c_view(c, nrows, ncols);
+  UnmanagedConst2D d_view(d, nrows, ncols);
+  Unmanaged2D varout_view(varout, nrows, ncols);
+
+  // Initialize c-prime and d-prime at the starting level (slev)
+  Kokkos::parallel_for("init_c_p_d_p", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+    c_p(jc, slev) = c_view(jc, slev) / b_view(jc, slev);
+    d_p(jc, slev) = d_view(jc, slev) / b_view(jc, slev);
+  });
+  Kokkos::fence();
+
+  // Forward sweep: compute c-prime and d-prime for each column from slev+1 to elev-1.
+  for (int i = slev + 1; i < elev; ++i) {
+    Kokkos::parallel_for("forward_sweep", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+      T m = 1.0 / (b_view(jc, i) - c_p(jc, i - 1) * a_view(jc, i));
+      c_p(jc, i) = c_view(jc, i) * m;
+      d_p(jc, i) = (d_view(jc, i) - d_p(jc, i - 1) * a_view(jc, i)) * m;
+    });
+    Kokkos::fence();
+  }
+
+  // Initialize the output array at the last level (elev-1)
+  Kokkos::parallel_for("init_varout", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+    varout_view(jc, elev-1) = d_p(jc, elev-1);
+  });
+  Kokkos::fence();
+
+  // Back substitution: update varout for columns from elev-2 down to slev.
+  for (int i = elev - 2; i >= slev; --i) {
+    Kokkos::parallel_for("back_substitution", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
+      varout_view(jc, i) = d_p(jc, i) - c_p(jc, i) * varout_view(jc, i + 1);
+    });
+    Kokkos::fence();
+  }
+
+  c_p = Kokkos::View<T**>();
+  d_p = Kokkos::View<T**>();
+  // End timing and print the elapsed time
+  auto end_time = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed_time = end_time - start_time;
+  std::cout << "Elapsed time for tdma_solver_vec (Kokkos): " << elapsed_time.count() << " seconds" << std::endl;
+}
+
+template
+void tdma_solver_vec<double>(const double* a, const double* b, const double* c, const double* d,
+  int slev, int elev, int startidx, int endidx,
+  int nrows, int ncols, double* varout);
+
+template
+void tdma_solver_vec<float>(const float* a, const float* b, const float* c, const float* d,
+  int slev, int elev, int startidx, int endidx,
+  int nrows, int ncols, float* varout);
+
No results found