diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8dca0f4882ffed6d6e03f1a8affd070865fba39a..536ebb55487f6c1dc7c0a1e9c0a2406065582671 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.20)
-project(demo LANGUAGES CXX VERSION 0.0.1)
+project(demo LANGUAGES CXX Fortran VERSION 0.0.1)
 
 
 include(FetchContent)
@@ -30,15 +30,42 @@ elseif("${MU_ARCH}" STREQUAL "a100")
     set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
     set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
     set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
+elseif("${MU_ARCH}" STREQUAL "mi250x")
+    set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+    set(Kokkos_ENABLE_HIP ON CACHE BOOL "" FORCE)
+    set(Kokkos_ARCH_AMD_GFX90A ON CACHE BOOL "" FORCE)
 else()
     message(FATAL_ERROR "${MU_ARCH} is not a valid/tested configuration, select architecture, x86_64, a100, h100, mi250x, mi300a(-unified), h100(-unified)")
 endif()
 
 FetchContent_MakeAvailable(kokkos)
 
-add_executable(demo main.cpp)
+add_executable(demo cdriver.cpp main.cpp)
 target_link_libraries(demo PUBLIC Kokkos::kokkos)
 
+set_source_files_properties(
+  fdriver.f90
+  PROPERTIES Fortran_PREPROCESS ON  # fdriver.f90 contains #ifdef/#if directives
+)
+# The Fortran driver is built with OpenACC only when Kokkos targets a GPU backend.
+if (Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
+  set(WITH_OPENACC_FORTRAN 1)
+else()
+  set(WITH_OPENACC_FORTRAN 0)
+endif()
+# fdemo: Fortran PROGRAM (fdriver.f90) driving the Kokkos scenarios in main.cpp.
+add_executable(fdemo fdriver.f90 main.cpp)
+target_link_libraries(fdemo PUBLIC Kokkos::kokkos)
+if (WITH_OPENACC_FORTRAN)
+  # Test the variable by name: an empty or unset value must read as false, not break if().
+  find_package(OpenACC)
+  if (NOT OpenACC_Fortran_FOUND)
+    message(FATAL_ERROR "OpenACC_Fortran_FOUND=${OpenACC_Fortran_FOUND}")
+  endif()
+  target_link_libraries(fdemo PUBLIC OpenACC::OpenACC_Fortran)
+endif()
+set_property(TARGET fdemo PROPERTY LINKER_LANGUAGE Fortran)  # entry point is the Fortran PROGRAM
+
 if ("${MU_ARCH}" STREQUAL "a100")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -Minfo=accel -gpu=cc80")
     add_compile_definitions(DEMO_DEVICE)
@@ -46,3 +73,7 @@ if ("${MU_ARCH}" STREQUAL "a100")
     target_link_options(demo PUBLIC "-gpu=pinned")
 endif()
 
+if ("${MU_ARCH}" STREQUAL "mi250x")
+  add_compile_definitions(DEMO_DEVICE)
+endif()
+
diff --git a/cdriver.cpp b/cdriver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..beac2f23b91cffa788c7c9143c3b82bc0fab717f
--- /dev/null
+++ b/cdriver.cpp
@@ -0,0 +1,11 @@
+#include <Kokkos_Core.hpp>
+
+void cpp_run_tests();  // implemented in main.cpp
+// Entry point of the C++-only demo executable: brackets the test run with Kokkos setup and teardown.
+int main() {
+  Kokkos::initialize();
+  cpp_run_tests();
+  Kokkos::finalize();
+  return 0;
+}
+
diff --git a/conf.sh b/conf.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b81fc040face5968bbbbd39a62b09da7488bdfe7
--- /dev/null
+++ b/conf.sh
@@ -0,0 +1,16 @@
+# source this
+
+case $(hostname) in
+    levante*.dkrz.de|vader*.dkrz.de)
+        export DEVICE=a100
+        ;;
+    nid*) # lumi-g
+        export DEVICE=mi250x
+        ;;
+    *)
+        echo "unknown system: $(hostname)"
+        exit 1
+        ;;
+esac
+
+echo "$0: DEVICE=$DEVICE"
diff --git a/debug_script.sh b/debug_script.sh
index b2200668469b472497b9bd8c4f7be505c3e802a2..73a621d605de024afff29c5fe74a7264dd461806 100755
--- a/debug_script.sh
+++ b/debug_script.sh
@@ -8,10 +8,15 @@ ulimit -s unlimited
 
 set -e
 
+# set DEVICE:
+source conf.sh
+
+export VERBOSE_LEVEL=2
+        
 if [ "$1" == 'gpu' ]
 then
     #rm -rf build_gpu
-    cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O0 -g"
+    cmake -B build_gpu -S . -DMU_ARCH=$DEVICE -DCMAKE_CXX_FLAGS="-O0 -g"
     cmake --build build_gpu --parallel 
 
     ncells=(8)
@@ -28,7 +33,7 @@ else
 
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores
-    export OMP_NUM_THREADS=8
+    export OMP_NUM_THREADS=1
 fi
 
 for jb in ${ncells[*]}; do 
@@ -39,9 +44,11 @@ for jb in ${ncells[*]}; do
             export NCELLS=$jb
             if [ "$1" == 'gpu' ] 
             then
-                ./build_gpu/demo 
+                #./build_gpu/demo
+                ./build_gpu/fdemo
             else
-                ./build/demo
+                #./build/demo
+                ./build/fdemo
             fi
             echo "---"
         done
diff --git a/fdriver.f90 b/fdriver.f90
new file mode 100644
index 0000000000000000000000000000000000000000..d4b93c8dcb95d1582ed5f66be476e73af0071b74
--- /dev/null
+++ b/fdriver.f90
@@ -0,0 +1,177 @@
+MODULE fsupport
+  USE, INTRINSIC:: iso_c_binding
+#ifdef _OPENACC
+  USE openacc
+#endif
+  IMPLICIT NONE
+  PRIVATE
+  PUBLIC :: dp
+  PUBLIC :: t_domain
+  PUBLIC :: get_domain
+  PUBLIC :: init_field
+  PUBLIC :: zero_field
+  PUBLIC :: physop
+  PUBLIC :: f2c_default_run_all
+  !PUBLIC :: show_field
+  PUBLIC :: f2c_kokkos_initialize
+  PUBLIC :: f2c_kokkos_finalize
+  INTEGER, PARAMETER :: dp = c_double
+
+  TYPE, BIND(c):: t_domain
+    INTEGER(c_int) :: ncells
+    INTEGER(c_int) :: nlev
+    INTEGER(c_int) :: nblocks
+    INTEGER(c_int) :: nproma
+  END TYPE t_domain
+
+  INTERFACE
+    SUBROUTINE f2c_kokkos_initialize() BIND(c)  ! C++ side calls Kokkos::initialize()
+    END SUBROUTINE f2c_kokkos_initialize
+
+    SUBROUTINE f2c_kokkos_finalize() BIND(c)  ! C++ side calls Kokkos::finalize()
+    END SUBROUTINE f2c_kokkos_finalize
+
+    SUBROUTINE f2c_get_domain(dom) BIND(c)  ! C++ side fills dom from the environment
+      IMPORT t_domain
+      TYPE(t_domain) :: dom
+    END SUBROUTINE f2c_get_domain
+
+    SUBROUTINE f2c_default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma) BIND(c)
+      IMPORT c_double, c_int
+      REAL(c_double) :: field(*), ref_in(*), ref_out(*)
+      INTEGER(c_int), VALUE ::  nblocks, nlev, nproma  ! the C++ side takes int by value
+    END SUBROUTINE f2c_default_run_all
+  END INTERFACE
+
+CONTAINS
+
+  SUBROUTINE get_domain(dom)  ! public wrapper around the private C binding f2c_get_domain
+    TYPE(t_domain), INTENT(out) :: dom
+    CALL f2c_get_domain(dom)
+  END SUBROUTINE get_domain
+
+  SUBROUTINE init_field(field)
+    ! Store in every element its zero-based linear offset in column-major order.
+    REAL(dp), INTENT(out) :: field(:,:,:)
+    INTEGER :: ic, il, ib
+    INTEGER :: n1, n2, n3
+    n1 = SIZE(field, DIM=1)
+    n2 = SIZE(field, DIM=2)
+    n3 = SIZE(field, DIM=3)
+    DO ib = 1, n3
+!$ACC parallel default(present)
+!$ACC LOOP GANG VECTOR COLLAPSE(2)
+      DO il = 1, n2
+        DO ic = 1, n1
+          ! offset = (ic-1) + (il-1)*n1 + (ib-1)*n1*n2
+          field(ic,il,ib) = REAL(ic-1 + (il-1)*n1 + (ib-1)*n1*n2, dp)
+        END DO
+      END DO
+!$ACC END PARALLEL
+    END DO
+  END SUBROUTINE init_field
+
+  SUBROUTINE zero_field(v)
+    ! Set every element of v to zero, one OpenACC parallel region per slice of dimension 3.
+    REAL(dp), INTENT(out) :: v(:,:,:)
+    INTEGER :: ic, il, ib, n1, n2, n3
+    n1 = SIZE(v, DIM=1)
+    n2 = SIZE(v, DIM=2)
+    n3 = SIZE(v, DIM=3)
+    DO ib = 1, n3
+!$ACC parallel default(present)
+!$ACC LOOP GANG VECTOR COLLAPSE(2)
+      DO il = 1, n2
+        DO ic = 1, n1
+          v(ic,il,ib) = 0.0_dp
+        END DO
+      END DO
+!$ACC END PARALLEL
+    END DO
+  END SUBROUTINE zero_field
+
+  SUBROUTINE physop(v)
+    ! In-place vertical update of every column of v; v is read before it is written, hence INTENT(inout).
+    REAL(dp), INTENT(inout) :: v(:,:,:)
+    INTEGER :: j,k,i, nj,nk,ni
+    nj = SIZE(v,3)
+    nk = SIZE(v,2)
+    ni = SIZE(v,1)
+    PRINT*,'physop: ni,nk,nj=',ni,nk,nj
+    DO j = 1, nj
+!$ACC PARALLEL DEFAULT(PRESENT)
+      k = 1
+!$ACC LOOP GANG VECTOR
+      DO i = 1, ni
+        v(i,k,j) = v(i,k,j) + v(i,k+1,j)/10000
+      ENDDO
+!$ACC LOOP SEQ
+      DO k = 2, nk-1
+!$ACC LOOP GANG VECTOR
+        DO i = 1, ni
+          v(i,k,j) = v(i,k,j) +  v(i,k-1,j)/100 + v(i,k+1,j)/10000
+        ENDDO
+      ENDDO
+      k = nk
+!$ACC LOOP GANG VECTOR
+      DO i = 1, ni
+        v(i,k,j) = v(i,k,j) + v(i,k-1,j)/100
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE physop
+
+#if 0
+  SUBROUTINE show_field(label,f)
+    ! Debug helper (compiled out): prints every element of f together with its indices.
+    CHARACTER(len=*), INTENT(in) :: label
+    REAL(dp), INTENT(in) :: f(:,:,:)
+    INTEGER :: i,k,j, ni,nk,nj
+    ni = SIZE(f,1)
+    nk = SIZE(f,2)
+    nj = SIZE(f,3)
+    DO j = 1, nj
+!$ACC PARALLEL DEFAULT(PRESENT)
+!$ACC LOOP SEQ
+      DO k = 1, nk
+        DO i = 1, ni
+          PRINT '(A,3I8,F)','show_field: '//label,j,k,i,f(i,k,j)
+        ENDDO
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE show_field
+#endif
+END MODULE fsupport
+
+PROGRAM fdriver
+  USE fsupport, ONLY: dp, t_domain, get_domain, init_field, zero_field, physop, &
+       & f2c_default_run_all, f2c_kokkos_initialize, f2c_kokkos_finalize
+  IMPLICIT NONE
+  ! Fortran driver: sets up the fields, then runs all Kokkos scenarios via f2c_default_run_all.
+  TYPE(t_domain) :: dom
+  REAL(dp), ALLOCATABLE :: field(:,:,:), ref_in(:,:,:), ref_out(:,:,:)
+  CALL f2c_kokkos_initialize()
+  PRINT*,'fdriver::start'
+  CALL get_domain(dom)
+  PRINT '(A,4I8)','ncells,lev,nblocks,nproma=',dom%ncells,dom%nlev,dom%nblocks,dom%nproma
+  ! All three arrays share the shape (nproma, nlev, nblocks).
+  ALLOCATE( field(dom%nproma,dom%nlev,dom%nblocks), &
+       &    ref_in(dom%nproma,dom%nlev,dom%nblocks), &
+       &    ref_out(dom%nproma,dom%nlev,dom%nblocks) )
+  ! With OpenACC: create device copies, then fill them through the fsupport kernels.
+!$acc enter data create(field, ref_in, ref_out)
+  CALL zero_field(field)
+  CALL init_field(ref_in)
+  CALL init_field(ref_out)
+  CALL physop(ref_out)
+  ! With OpenACC: host_data use_device hands the device addresses of the arrays to the C++ side.
+!$acc host_data use_device(field, ref_in, ref_out)
+  CALL f2c_default_run_all(field, ref_in, ref_out, dom%nblocks, dom%nlev,  dom%nproma)
+!$acc end host_data
+  ! With OpenACC: copy the device data back to the host arrays.
+!$acc exit data copyout(field, ref_in, ref_out)
+
+  CALL f2c_kokkos_finalize()
+  PRINT*,'fdriver:end'
+END PROGRAM fdriver
diff --git a/lumi-cpu-job.sh b/lumi-cpu-job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dbc60acb1ff6cd2387d833ef5cef689d52582ecd
--- /dev/null
+++ b/lumi-cpu-job.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=demo_cpu
+#SBATCH --output=demo_cpu.o%j
+#SBATCH --error=demo_cpu.o%j
+#SBATCH --partition=dev-g
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --time=00:10:00
+#SBATCH --account=project_465001141
+
+export VERBOSE=1
+#./debug_script.sh
+./script.sh
diff --git a/lumi-gpu-job.sh b/lumi-gpu-job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9e98eb954132a085bc787ffc971382765b7edfd4
--- /dev/null
+++ b/lumi-gpu-job.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#SBATCH --job-name=demo_gpu   # Job name
+#SBATCH --output=demo_gpu.o%j # Name of stdout output file
+#SBATCH --error=demo_gpu.o%j  # Name of stderr error file
+#SBATCH --partition=dev-g  # partition name : standard-g
+#SBATCH --gpus=1
+#SBATCH --time=00:10:00       # Run time (d-hh:mm:ss)
+#SBATCH --account=project_465001141  # Project for billing
+
+
+pwd
+
+#./debug_script.sh gpu
+./script.sh gpu
diff --git a/lumi_cpu_setup.sh b/lumi_cpu_setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f70aaf4bbe0adfab3ed6298eb30bb7d4cd8e249d
--- /dev/null
+++ b/lumi_cpu_setup.sh
@@ -0,0 +1,22 @@
+# source this
+module load LUMI/24.03
+module load buildtools/24.03
+module load googletest/1.14.0-cpeCray-24.03
+
+
+
+module load PrgEnv-cray
+export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+
+# load GPU support
+#module load craype-accel-amd-gfx90a 
+module load rocm
+module load cce/17.0.1 craype-x86-milan
+
+# new: python with jinja2
+#source ~/venv/bin/activate
+
+#export CRAYPE_LINK_TYPE=dynamic
+export CXX=CC
+export FC=ftn
diff --git a/lumi_gpu_setup.sh b/lumi_gpu_setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e6c6e6fc617af3d80a23a221e08b2ebc119ce69
--- /dev/null
+++ b/lumi_gpu_setup.sh
@@ -0,0 +1,20 @@
+# source this
+module load LUMI/24.03
+module load buildtools/24.03
+module load googletest/1.14.0-cpeCray-24.03
+
+
+module load PrgEnv-cray
+export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+
+# load GPU support
+module load craype-accel-amd-gfx90a 
+module load rocm
+
+# new: python with jinja2
+source ~/venv/bin/activate
+
+export CRAYPE_LINK_TYPE=dynamic
+export CXX=CC
+export FC=ftn
diff --git a/main.cpp b/main.cpp
index eaa71b98ad2b38e86cf39254017a79b5cb5507cf..0326de323dd76658cacf8727bf70cee3bfba1b4d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,5 +1,6 @@
 #include <Kokkos_Core.hpp>
 #include <functional>
+#undef NDEBUG // always use assertion
 #include <cassert>
 #include <iostream>
 #include <utility>
@@ -13,15 +14,10 @@ using space_t = Kokkos::DefaultExecutionSpace::memory_space;
 typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
 
 Kokkos::Timer timer;
-//Kokkos::Timer transposition_timer;
-//Kokkos::Timer merge_horizontal_timer;
-
-// constexpr int nblocks = 2;
-// constexpr int nlev = 90;
-// constexpr int nproma = 55000;
 
 #define ENABLE_CHECK_BOUNDS
 
+static int verbose_level = 0;
 
 struct AllLevels {
   constexpr static int value = 0;
@@ -50,6 +46,27 @@ static void validate(double* array, const double *ref, int nblocks, int nlev, in
   }
 }
 
+
+// Relative deviation of x1 from the reference x0; falls back to the absolute
+// deviation when |x0| <= 1e-14, so the quotient is never taken for a tiny reference.
+inline static HOST_DEVICE_ATTRIBUTES double  ferror(double x1, double x0) {
+  constexpr double tiny = 1.e-14;
+  const double abs_diff = fabs(x1-x0);
+  const double abs_ref  = fabs(x0);
+  return (abs_ref > tiny) ? abs_diff/abs_ref : abs_diff;
+}
+
+// Max-reduces ferror(array[i], ref[i]) over all nblocks*nlev*nproma entries and asserts the result is below 1e-14.
+static void validate_max_error(double* array, const double *ref, int nblocks, int nlev, int nproma) {
+  const int n_entries = nblocks*nlev*nproma;
+  double worst;  // written by the Max reducer
+  Kokkos::parallel_reduce("validator", n_entries, KOKKOS_LAMBDA(const int &idx, double &running_max) {
+      running_max = fmax(running_max, ferror(array[idx],ref[idx]));
+    }, Kokkos::Max<double>(worst));
+  if (verbose_level > 0) printf("validate_max_error: total max error=%.10e\n",worst);
+  assert(worst < 1.e-14);
+}
+
 KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int i2, int n0, int n1, int n2) {
 #ifdef ENABLE_CHECK_BOUNDS
   assert(i0 >= 0 && i1 >= 0 && i2 >= 0 && i0 < n0 && i1 < n1 && i2 < n2);
@@ -62,37 +79,6 @@ KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int n0, int n1) {
 #endif
 }
 
-template<typename ViewType>
-void show_view(const std::string &label, const ViewType &view, const bool with_values=false) {
-  printf("show_view: label=%s, ",label.c_str());
-  using MyLayout = typename ViewType::array_layout;
-  if (std::is_same<MyLayout, Kokkos::LayoutLeft>::value) {
-    printf("Layout=LayoutLeft\n");
-  } else if (std::is_same<MyLayout, Kokkos::LayoutRight>::value) {
-    printf("Layout=LayoutRight\n");
-  } else {
-    printf("Layout: **unknown**\n");
-  }
-  if (with_values) {
-#ifdef DEMO_DEVICE
-    auto h_view = Kokkos::create_mirror_view_and_copy(Kokkos::Serial(),view);
-#else
-    auto &h_view = view;
-#endif
-    const int n0 = h_view.extent(0);
-    const int n1 = h_view.extent(1);
-    const int n2 = h_view.extent(2);
-    printf("n0=%d, n1=%d, n2=%d\n",n0,n1,n2);
-    for (int i0=0; i0 < n0; ++i0) {
-      for (int i2=0; i2 < n2; ++i2) {
-        for (int i1=0; i1 < n1; ++i1) {
-          printf("show_view: i0=%d, i1=%d, i2=%d, v=%f\n",i0,i1,i2, h_view(i0,i1,i2));
-        }
-      };
-    };
-  }
-  Kokkos::fence();
-}
 
 template<typename ViewType>
 inline HOST_DEVICE_ATTRIBUTES void update_column(ViewType &v) {
@@ -125,7 +111,6 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
   auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
   timer.reset();
-
   for (int jb = 0; jb < nblocks; ++jb)
     Kokkos::parallel_for(
         "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
@@ -133,18 +118,20 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
         });
   Kokkos::fence();
 
-  Kokkos::fence();
   if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
   Kokkos::deep_copy(view, d_view);
 
-  //validate(array, nblocks, nlev, nproma);
+}
+
+void scenario_1_noprint(double* array, int nblocks, int nlev, int nproma) {  // 4-argument form of scenario_1 with print=false
+  printf("scenario_1_noprint\n");
+  scenario_1(array, nblocks, nlev, nproma, false);
 }
 
 
 void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
   std::cout << "scenario 1b: always LayoutRight; view(array, nblocks, nlev, nproma); parallel: nproma ----- "
-              << std::endl;
-
+            << std::endl;
   Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
 
   timer.reset();
@@ -170,6 +157,8 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
   //Kokkos::View<double**, Kokkos::MemoryUnmanaged> d_view(array, ncells, nlev);
   Kokkos::View<double**> d_view2d("d_view2d", ncells, nlev);
   Kokkos::Timer tr_timer;
+  Kokkos::Timer total_timer;
+  total_timer.reset();
 
   // view3d ->  view2d:
   tr_timer.reset();
@@ -182,7 +171,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
         });
     });
   Kokkos::fence();
-  printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
+  printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
   
   timer.reset();
     Kokkos::parallel_for(
@@ -192,7 +181,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
            update_column(column);
         });
   Kokkos::fence();
-  printf("Time = %f ms\n", timer.seconds() * 1000);
+  printf("Kernel = %f ms\n", timer.seconds() * 1000);
 
   // view2d ->  view3d:
   tr_timer.reset();
@@ -205,7 +194,9 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
         });
     });
   Kokkos::fence();
-  printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
+  printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
+
+  printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
 }
 
 
@@ -223,6 +214,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
 
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view3d(array, nproma, nlev, nblocks);
   Kokkos::Timer tr_timer;
+  Kokkos::Timer total_timer;
+  total_timer.reset();
 
   // view3d ->  view2d:
   tr_timer.reset();
@@ -235,7 +228,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
         });
     });
   Kokkos::fence();
-  printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
+  printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
 
   timer.reset();
 
@@ -247,7 +240,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
         });
   Kokkos::fence();
 
-  printf("Time = %f ms\n", timer.seconds() * 1000);
+  printf("Kernel = %f ms\n", timer.seconds() * 1000);
 
   // view2d ->  view3d:
   tr_timer.reset();
@@ -260,7 +253,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
         });
     });
   Kokkos::fence();
-  printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
+  printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
+  printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
 
 }
 
@@ -580,8 +574,7 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru
 #endif
 
 void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
-
-  std::cout << "scenario 7b: +ACC modifiedDefault layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
+  std::cout << "scenario 7b: +ACC Default layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
 #ifdef DEMO_DEVICE
   Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
 #else
@@ -606,14 +599,13 @@ void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
 
 void scenario_7bmacro(double* array, int nblocks, int nlev, int nproma) {
 
-  std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) parallel: asICON----- " << std::endl;
+  std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) para: asICON" << std::endl;
 
 #if defined(DEMO_DEVICE)
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
 #else
   Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
 #endif
-  show_view("7bmacro",d_view);
   timer.reset();
   outer_for("", 0, nblocks, outer_lambda(const int jb){
       inner_for("",0, nproma, inner_lambda(const int jc) {
@@ -649,7 +641,6 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
     std::cout << "scenario 8: +ACC always LayoutLeft view(array, nproma, nlev, nblocks) parallel: nproma ----- " << std::endl;
 
     Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks) ;
-    show_view("scenario_8", d_view);
     timer.reset();
     for (int jb = 0; jb < nblocks; ++jb)
       Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
@@ -662,7 +653,7 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
 
 
 void scenario_8macro(double* array, int nblocks, int nlev, int nproma) {
-    std::cout << "scenario 8macro: +ACC always LayoutLeftm, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
+    std::cout << "scenario 8macro: +ACC always LayoutLeft, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
     using space_t = Kokkos::DefaultExecutionSpace::memory_space;
     Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
     timer.reset();
@@ -733,8 +724,47 @@ void physop(double *array, int nproma, int nlev, int nblocks) {
   }
 }
 
+// Reports whether ptr is a device address according to the runtime of the active Kokkos GPU backend.
+// When the default execution space is Cuda or HIP, the pointer attributes are queried and the
+// query itself must succeed (assert). ptr counts as a device pointer only if the runtime reports
+// a non-null device pointer that is identical to ptr. All other configurations return false.
+bool is_device_ptr(const void* ptr) {
+  using ExecSpace = Kokkos::DefaultExecutionSpace;
+#ifdef KOKKOS_ENABLE_CUDA
+  constexpr bool exec_on_cuda = std::is_same<ExecSpace,Kokkos::Cuda>::value;
+  if constexpr (exec_on_cuda) {
+    cudaPointerAttributes att;
+    const auto status = cudaPointerGetAttributes (&att, ptr);
+    assert(status == cudaSuccess);
+    if (verbose_level > 2 ) {  // dump the raw attributes at high verbosity
+      printf("is_device_ptr: device=%d\n",att.device);
+      printf("is_device_ptr: memtype=%d\n",att.type );
+      printf("is_device_ptr:   ptr=%p\n",ptr);
+      printf("is_device_ptr: d_ptr=%p\n",att.devicePointer);
+      printf("is_device_ptr: h_ptr=%p\n",att.hostPointer);
+    }
+    return att.devicePointer && ptr == att.devicePointer;
+  }
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+  constexpr bool exec_on_hip = std::is_same<ExecSpace,Kokkos::HIP>::value;
+  if constexpr (exec_on_hip) {
+    hipPointerAttribute_t att;
+    const auto status = hipPointerGetAttributes(&att, ptr);
+    assert(status == hipSuccess);
+    // Same criterion as in the CUDA branch.
+    return att.devicePointer && ptr == att.devicePointer;
+  }
+#endif
+
+  // Neither Cuda nor HIP is the default execution space.
+  return false;
 
-void run_scenario(double *array, const double*array_in_ref, const double *array_out_ref,
+}
+    
+void run_scenario_using_host_data(double *array, const double *array_in_ref, const double *array_out_ref,
                   int nblocks, int nlev, int nproma,
                   const std::function<void(double*, int, int, int)> &scenario) {
   const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
@@ -742,17 +772,113 @@ void run_scenario(double *array, const double*array_in_ref, const double *array_
   openacc_calls(array, array_out_ref, nblocks, nlev, nproma, scenario);
 }
 
-int main() {
-  int64_t ncells64 = atoi(std::getenv("NCELLS"));
-  assert(ncells64 < INT_MAX);
-  int64_t nproma64  = atoi(std::getenv("NPROMA"));
-  assert(nproma64 < INT_MAX);
 
-  int ncells  = ncells64;
-  int nlev    = atoi(std::getenv("NLEV"));
-  int nproma  = nproma64;
+// One scenario round trip: reset array from array_in_ref, run scenario on it, compare with array_out_ref.
+void default_run_scenario(double *array, const double *array_in_ref, const double *array_out_ref,
+                          int nblocks, int nlev, int nproma,
+                          const std::function<void(double*, int, int, int)> &scenario) {
+  using flat_view_t       = Kokkos::View<double*, Kokkos::MemoryUnmanaged>;
+  using flat_const_view_t = Kokkos::View<const double*, Kokkos::MemoryUnmanaged>;
+  const int n_entries = nblocks*nlev*nproma;
+  // Step 1: restore the input state through unmanaged 1-D views over the raw pointers.
+  flat_view_t work(array, n_entries);
+  Kokkos::deep_copy(work, flat_const_view_t(array_in_ref, n_entries));
+  // Step 2: run the scenario. Step 3: validate against the reference result.
+  scenario(work.data(), nblocks, nlev, nproma);
+  validate_max_error(array, array_out_ref, nblocks, nlev, nproma);
+}
+
+struct Domain {  // run dimensions; fdriver.f90 mirrors this as TYPE, BIND(c) :: t_domain with the same field order
+  int ncells;   // from NCELLS
+  int nlev;     // from NLEV
+  int nblocks;  // derived in get_domain(): number of nproma-sized blocks covering ncells
+  int nproma;   // from NPROMA
+};
+// Integer value of environment variable str; returns default_value (with a notice) when it is unset.
+int get_int(const char* str, const int default_value) {
+  const char* istr = std::getenv(str);
+  if (not istr) {
+    printf("get_int: key=%s, using default=%d\n",str, default_value);
+    return default_value;
+  }
+  char* parse_end = nullptr;
+  const long long i64 = std::strtoll(istr, &parse_end, 10);  // atoi would narrow to int before the range check
+  assert(parse_end != istr);  // no digits parsed: refuse instead of silently using 0
+  assert(i64 >= INT_MIN and i64 <= INT_MAX);
+  return static_cast<int>(i64);
+}
+// Builds the Domain from NCELLS, NPROMA and NLEV; NCELLS must be an exact multiple of NPROMA.
+Domain get_domain() {
+  const int ncells = get_int("NCELLS", 0);
+  const int nproma = get_int("NPROMA", 0);
+  const int nlev   = get_int("NLEV", 0);
+  assert(nproma > 0);  // the default of 0 (NPROMA unset) would divide by zero below
+  const int nblocks = (ncells - 1) / nproma + 1;
+  assert(nproma*nblocks == ncells);
+  Domain dom;
+  dom.ncells  = ncells;
+  dom.nlev    = nlev;
+  dom.nproma  = nproma; dom.nblocks = nblocks;
+  return dom;
+}
+
+extern "C"  // C-linkage entry point bound from Fortran (f2c_get_domain in fdriver.f90)
+void f2c_get_domain(Domain *dom) {
+  // Query once: get_domain() reads the environment and may print "using default" notices.
+  *dom = get_domain();
+}
+
+// Runs every scenario listed below on field, each time reset from ref_in and validated against ref_out.
+void default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
+  verbose_level=get_int("VERBOSE_LEVEL", 0);
+  if (verbose_level > 0) printf("default_run_all: start\n");
+  // is_device_ptr must give the same answer for all three buffers.
+  assert(is_device_ptr(field) == is_device_ptr(ref_in));
+  assert(is_device_ptr(field) == is_device_ptr(ref_out));
+  // The element type of funs is deduced from the listed functions (class template argument deduction).
+  std::vector funs = {
+    scenario_1_noprint,
+    scenario_1b, scenario_1c, scenario_1cmacro,
+    scenario_6b,
+    scenario_7bmacro, scenario_7btemplate,
+    scenario_8, scenario_8macro, scenario_8template
+  };
+  // scenario_7b is appended last, and skipped on a device when nproma exceeds 50000.
+  if (exec_on_device() and (nproma > 50000)) {
+    printf("skip scenario_7b: GPU parallel over blocks way too slow\n\n");
+  } else {
+    funs.push_back(scenario_7b);
+  }
+  // Each scenario goes through default_run_scenario (reset, run, validate).
+  for (const auto &f : funs) {
+    default_run_scenario(field, ref_in, ref_out,
+                         nblocks, nlev, nproma,
+                         f);
+    printf("====\n");
+  }
+  if (verbose_level > 0) printf("default_run_all: end\n");
+}
+
+extern "C"  // C-linkage wrapper around default_run_all, bound from Fortran via BIND(c) in fdriver.f90
+void f2c_default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
+  default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma);
+}
+
+extern "C" void f2c_kokkos_initialize() {  // C-linkage wrapper so the Fortran driver can start Kokkos
+  Kokkos::initialize();
+}
+
+extern "C" void f2c_kokkos_finalize() {  // C-linkage wrapper so the Fortran driver can shut Kokkos down
+  Kokkos::finalize();
+}
+
+void cpp_run_tests() {
+  const Domain dom = get_domain();
+  int nlev    = dom.nlev;
+  int nproma  = dom.nproma;
+  int nblocks = dom.nblocks;
+
+  verbose_level=get_int("VERBOSE_LEVEL", 0);
 
   std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
 
@@ -760,7 +886,6 @@ int main() {
   double array_in_ref[nblocks * nlev * nproma];
   double array[nblocks * nlev * nproma];
   const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
-  Kokkos::initialize();
   {
 
     init_array(array_in_ref, nproma,nlev,nblocks);
@@ -774,9 +899,9 @@ int main() {
     //memcpy(array, array_in_ref, nbytes);
     //scenario_1(array, nblocks, nlev, nproma);
     //validate(array, array_out_ref, nblocks, nlev, nproma);
-    run_scenario(array, array_in_ref, array_out_ref,
-                 nblocks, nlev, nproma,
-                 scenario_6b);
+    run_scenario_using_host_data(array, array_in_ref, array_out_ref,
+                                 nblocks, nlev, nproma,
+                                 scenario_6b);
     memcpy(array, array_in_ref, nbytes);
     std::function<void(double*, int, int, int)> s_1b = scenario_1b;
     openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1b);
@@ -834,7 +959,6 @@ int main() {
 */
 
   }
-  Kokkos::finalize();
 
-  return 0;
+
 }
diff --git a/script.sh b/script.sh
index ae78a01db5b7d5443b81900aa409e70930af2964..cfa780001f6aeb7d247289350b9762e69fbbbef6 100755
--- a/script.sh
+++ b/script.sh
@@ -4,14 +4,20 @@
 #nvhpc/24.7-gcc-11.2.0 
 #export LD_LIBRARY_PATH
 
+
 ulimit -s unlimited
 
 set -e
 
+# set DEVICE:
+source conf.sh
+
+export VERBOSE_LEVEL=2
+
 if [ "$1" == 'gpu' ]
 then
     #rm -rf build_gpu
-    cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
+    cmake -B build_gpu -S . -DMU_ARCH=$DEVICE -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build_gpu --parallel 
 
     ncells=(5000064)
@@ -20,7 +26,7 @@ then
     #nproma=$((449*29))
 else 
     #rm -rf build
-    cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
+    cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3" -DCMAKE_Fortran_FLAGS="-O3 -fopenmp"
     cmake --build build --parallel
 
     ncells=(5000064)
@@ -30,6 +36,7 @@ else
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores
     export OMP_NUM_THREADS=8
+    echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
 fi
 
 for jb in ${ncells[*]}; do 
@@ -40,10 +47,12 @@ for jb in ${ncells[*]}; do
             export NCELLS=$jb
             if [ "$1" == 'gpu' ] 
             then
-                ./build_gpu/demo 
+                #./build_gpu/demo
+                ./build_gpu/fdemo 
             else
-                ./build/demo
-                echo "---"
+                #./build/demo
+                ./build/fdemo
+                echo "--------------------------------------------------------"
             fi
         done
     done