diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100755
index 0000000000000000000000000000000000000000..413f35f3c197d1aeb0e4309de3e144656daf05cd
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 3.20)
+project(demo LANGUAGES CXX VERSION 0.0.1)
+
+# Builds the `demo` executable from main.cpp against a Kokkos release that is
+# downloaded at configure time. The Kokkos backend is chosen with
+# -DMU_ARCH=<x86_64|arm|a100>.
+
+include(FetchContent)
+
+# Opt in to policy CMP0135 where it exists; this silences the warning that
+# FetchContent otherwise emits for URL downloads.
+if (POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif ()
+
+# if using kokkos as shared library, -fPIC is needed
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+
+# Kokkos 4.4.01 release tarball
+FetchContent_Declare(kokkos
+        URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz)
+
+if (("${MU_ARCH}" STREQUAL "x86_64") OR ("${MU_ARCH}" STREQUAL "arm"))
+    # CPU targets: use OpenMP when the toolchain provides it, otherwise fall
+    # back to the serial backend.
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "" FORCE)
+    else()
+        set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+    endif()
+    set(Kokkos_ARCH_NATIVE ON CACHE BOOL "" FORCE)
+elseif("${MU_ARCH}" STREQUAL "a100")
+    # NVIDIA A100: CUDA backend plus the serial host backend.
+    set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+    set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
+    set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE)
+    set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
+else()
+    # Only list the values that the branches above actually handle.
+    message(FATAL_ERROR "MU_ARCH='${MU_ARCH}' is not a valid/tested configuration; select one of: x86_64, arm, a100")
+endif()
+
+FetchContent_MakeAvailable(kokkos)
+
+add_executable(demo main.cpp)
+# An executable has no consumers, so the dependency does not need to be PUBLIC.
+target_link_libraries(demo PRIVATE Kokkos::kokkos)
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7d6f4231f05f46bf30ab0444f349f020f8a7aa5
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,227 @@
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Timer.hpp"
+
+// Memory space that belongs to the default execution space.
+using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+
+// Rank-3 multi-dimensional range policy on the default execution space, using int indices.
+using md_range_policy =
+    Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>>;
+
+// Timer shared by all scenarios; each scenario resets it right before launching its kernels.
+Kokkos::Timer timer;
+   
+
+//constexpr int nblocks = 2;
+//constexpr int nlev = 90;
+//constexpr int nproma = 55000;
+
+/**
+ * Checks that array[i] == i for every i in [0, nblocks * nlev * nproma).
+ *
+ * On the first mismatch the offending index and value are written to stderr
+ * and the program is aborted. The comparison is explicit rather than an
+ * assert() so that it is still carried out when NDEBUG is defined, and the
+ * element count is computed in std::size_t so that the product cannot
+ * overflow int.
+ *
+ * @param array   buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks first factor of the element count
+ * @param nlev    second factor of the element count
+ * @param nproma  third factor of the element count
+ */
+static void validate(double* array, int nblocks, int nlev, int nproma) {
+    // Nothing to check for an empty (or nonsensical) extent.
+    if (nblocks <= 0 || nlev <= 0 || nproma <= 0) {
+        return;
+    }
+
+    const std::size_t total = static_cast<std::size_t>(nblocks)
+                            * static_cast<std::size_t>(nlev)
+                            * static_cast<std::size_t>(nproma);
+
+    for (std::size_t i = 0; i < total; ++i) {
+        if (array[i] != static_cast<double>(i)) {
+            std::cerr << "validate: array[" << i << "] = " << array[i]
+                      << ", expected " << i << std::endl;
+            std::abort();
+        }
+    }
+}
+
+/**
+ * Scenario 1: view without an explicit layout, extents (nblocks, nlev, nproma),
+ * written as d_view(jb, jk, jc).
+ *
+ * Wraps `array` in an unmanaged host view, obtains a counterpart in the default
+ * execution space's memory space, and launches one parallel_for over jc per
+ * block jb (with a serial jk loop inside). Each element is set to
+ * p = jb * nlev * nproma + jk * nproma + jc. The result is copied back and
+ * validate() then requires array[i] == i.
+ *
+ * @param array   host buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks extent of the block index jb
+ * @param nlev    extent of the level index jk
+ * @param nproma  extent of the cell index jc
+ * @param print   when false, the banner and the timing line are suppressed
+ */
+void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if (print) {
+        std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+    }
+
+    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+
+    using device_space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(device_space_t(), view);
+
+    timer.reset();
+
+    for (int jb = 0; jb < nblocks; ++jb) {
+        Kokkos::parallel_for("scenario_1", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+            for (int jk = 0; jk < nlev; ++jk) {
+                const int p = jb * nlev * nproma + jk * nproma + jc;
+                d_view(jb, jk, jc) = p;
+            }
+        });
+    }
+
+    // Kernel launches may return before the work has finished (e.g. on CUDA);
+    // wait for completion so the timer covers execution, not just the launches.
+    Kokkos::fence();
+    const double elapsed_ms = timer.seconds() * 1000;
+
+    if (print) {
+        printf("Time = %f ms\n\n", elapsed_ms);
+    }
+    Kokkos::deep_copy(view, d_view);
+
+    validate(array, nblocks, nlev, nproma);
+}
+
+/**
+ * Scenario 2: LayoutRight view with extents (nproma, nlev, nblocks), written
+ * as d_view(jc, jk, jb).
+ *
+ * Wraps `array` in an unmanaged host view, obtains a counterpart in the default
+ * execution space's memory space, and launches one parallel_for over jc per
+ * block jb (with a serial jk loop inside). Each element is set to
+ * p = jc * nlev * nblocks + jk * nblocks + jb. The result is copied back and
+ * validate() then requires array[i] == i.
+ *
+ * @param array   host buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks extent of the block index jb
+ * @param nlev    extent of the level index jk
+ * @param nproma  extent of the cell index jc
+ * @param print   when false, the banner and the timing line are suppressed
+ */
+void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if (print) {
+        // The banner states the extents and index order this scenario really uses.
+        std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+    }
+
+    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+
+    using device_space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(device_space_t(), view);
+
+    timer.reset();
+
+    for (int jb = 0; jb < nblocks; ++jb) {
+        Kokkos::parallel_for("scenario_2", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+            for (int jk = 0; jk < nlev; ++jk) {
+                const int p = jc * nlev * nblocks + jk * nblocks + jb;
+                d_view(jc, jk, jb) = p;
+            }
+        });
+    }
+
+    // Kernel launches may return before the work has finished (e.g. on CUDA);
+    // wait for completion so the timer covers execution, not just the launches.
+    Kokkos::fence();
+    const double elapsed_ms = timer.seconds() * 1000;
+
+    if (print) {
+        printf("Time = %f ms\n\n", elapsed_ms);
+    }
+    Kokkos::deep_copy(view, d_view);
+
+    validate(array, nblocks, nlev, nproma);
+}
+
+/**
+ * Scenario 2b: LayoutRight view with extents (nblocks, nlev, nproma), written
+ * as d_view(jb, jk, jc).
+ *
+ * Wraps `array` in an unmanaged host view, obtains a counterpart in the default
+ * execution space's memory space, and launches one parallel_for over jc per
+ * block jb (with a serial jk loop inside). Each element is set to
+ * p = jb * nlev * nproma + jk * nproma + jc. The result is copied back and
+ * validate() then requires array[i] == i.
+ *
+ * @param array   host buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks extent of the block index jb
+ * @param nlev    extent of the level index jk
+ * @param nproma  extent of the cell index jc
+ * @param print   when false, the banner and the timing line are suppressed
+ */
+void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if (print) {
+        std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+    }
+
+    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+
+    using device_space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(device_space_t(), view);
+
+    timer.reset();
+
+    for (int jb = 0; jb < nblocks; ++jb) {
+        Kokkos::parallel_for("scenario_2b", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+            for (int jk = 0; jk < nlev; ++jk) {
+                const int p = jb * nlev * nproma + jk * nproma + jc;
+                d_view(jb, jk, jc) = p;
+            }
+        });
+    }
+
+    // Kernel launches may return before the work has finished (e.g. on CUDA);
+    // wait for completion so the timer covers execution, not just the launches.
+    Kokkos::fence();
+    const double elapsed_ms = timer.seconds() * 1000;
+
+    if (print) {
+        printf("Time = %f ms\n\n", elapsed_ms);
+    }
+    Kokkos::deep_copy(view, d_view);
+
+    validate(array, nblocks, nlev, nproma);
+}
+
+/**
+ * Scenario 3: LayoutLeft view with extents (nproma, nlev, nblocks), written
+ * as d_view(jc, jk, jb).
+ *
+ * Wraps `array` in an unmanaged host view, obtains a counterpart in the default
+ * execution space's memory space, and launches one parallel_for over jc per
+ * block jb (with a serial jk loop inside). Each element is set to
+ * p = jb * nlev * nproma + jk * nproma + jc. The result is copied back and
+ * validate() then requires array[i] == i.
+ *
+ * @param array   host buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks extent of the block index jb
+ * @param nlev    extent of the level index jk
+ * @param nproma  extent of the cell index jc
+ * @param print   when false, the banner and the timing line are suppressed
+ */
+void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if (print) {
+        // The banner states the extents and index order this scenario really uses.
+        std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+    }
+
+    Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+
+    using device_space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(device_space_t(), view);
+
+    timer.reset();
+
+    for (int jb = 0; jb < nblocks; ++jb) {
+        Kokkos::parallel_for("scenario_3", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+            for (int jk = 0; jk < nlev; ++jk) {
+                const int p = jb * nlev * nproma + jk * nproma + jc;
+                d_view(jc, jk, jb) = p;
+            }
+        });
+    }
+
+    // Kernel launches may return before the work has finished (e.g. on CUDA);
+    // wait for completion so the timer covers execution, not just the launches.
+    Kokkos::fence();
+    const double elapsed_ms = timer.seconds() * 1000;
+
+    if (print) {
+        printf("Time = %f ms\n\n", elapsed_ms);
+    }
+    Kokkos::deep_copy(view, d_view);
+
+    validate(array, nblocks, nlev, nproma);
+}
+
+/**
+ * Scenario 4: view without an explicit layout, extents (nproma, nlev, nblocks),
+ * written as d_view(jc, jk, jb).
+ *
+ * Wraps `array` in an unmanaged host view, obtains a counterpart in the default
+ * execution space's memory space, and launches one parallel_for over jc per
+ * block jb (with a serial jk loop inside). Each element is set to
+ * p = jc * nlev * nblocks + jk * nblocks + jb. The result is copied back and
+ * validate() then requires array[i] == i.
+ *
+ * @param array   host buffer holding at least nblocks * nlev * nproma doubles
+ * @param nblocks extent of the block index jb
+ * @param nlev    extent of the level index jk
+ * @param nproma  extent of the cell index jc
+ * @param print   when false, the banner and the timing line are suppressed
+ */
+void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if (print) {
+        // The banner states the extents and index order this scenario really
+        // uses, which also distinguishes its output from scenario_1's.
+        std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+    }
+
+    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+
+    using device_space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(device_space_t(), view);
+
+    timer.reset();
+
+    for (int jb = 0; jb < nblocks; ++jb) {
+        Kokkos::parallel_for("scenario_4", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+            for (int jk = 0; jk < nlev; ++jk) {
+                const int p = jc * nlev * nblocks + jk * nblocks + jb;
+                d_view(jc, jk, jb) = p;
+            }
+        });
+    }
+
+    // Kernel launches may return before the work has finished (e.g. on CUDA);
+    // wait for completion so the timer covers execution, not just the launches.
+    Kokkos::fence();
+    const double elapsed_ms = timer.seconds() * 1000;
+
+    if (print) {
+        printf("Time = %f ms\n\n", elapsed_ms);
+    }
+    Kokkos::deep_copy(view, d_view);
+
+    validate(array, nblocks, nlev, nproma);
+}
+
+
+
+/**
+ * Reads the environment variable `name` as an integer.
+ * Prints an error and exits with EXIT_FAILURE when the variable is unset or
+ * does not parse to a positive integer.
+ */
+static int read_positive_env(const char* name) {
+    const char* text = std::getenv(name);
+    const int value = (text != nullptr) ? std::atoi(text) : 0;
+    if (value <= 0) {
+        std::cerr << "Environment variable " << name
+                  << " must be set to a positive integer" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    return value;
+}
+
+/**
+ * Entry point: reads NCELLS, NLEV and NPROMA from the environment, derives the
+ * number of blocks as ceil(NCELLS / NPROMA), allocates one work buffer of
+ * nblocks * nlev * nproma doubles and runs every scenario on it.
+ *
+ * @return 0 on success; exits with EXIT_FAILURE on invalid configuration
+ */
+int main() {
+    const int ncells = read_positive_env("NCELLS");
+    const int nlev = read_positive_env("NLEV");
+    const int nproma = read_positive_env("NPROMA");
+    const int nblocks = (ncells - 1) / nproma + 1;
+
+    std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
+
+    const std::size_t total = static_cast<std::size_t>(nblocks)
+                            * static_cast<std::size_t>(nlev)
+                            * static_cast<std::size_t>(nproma);
+
+    // The scenarios compute linear element indices in int.
+    if (total > static_cast<std::size_t>(INT_MAX)) {
+        std::cerr << "nblocks * nlev * nproma = " << total
+                  << " exceeds the supported maximum of " << INT_MAX << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Heap allocation instead of a variable-length array on the stack, which is
+    // not standard C++ and needs an unlimited stack for realistic sizes. The
+    // buffer is deliberately left uninitialised, as before: every scenario
+    // overwrites all elements.
+    std::unique_ptr<double[]> storage(new double[total]);
+    double* array = storage.get();
+
+    Kokkos::initialize();
+    {
+        // First run with output suppressed - presumably a warm-up so that
+        // one-time start-up costs do not show up in the reported timings.
+        scenario_1(array, nblocks, nlev, nproma, false);
+
+        scenario_1(array, nblocks, nlev, nproma);
+        scenario_2(array, nblocks, nlev, nproma);
+        scenario_2b(array, nblocks, nlev, nproma);
+        scenario_3(array, nblocks, nlev, nproma);
+        scenario_4(array, nblocks, nlev, nproma);
+    }
+    Kokkos::finalize();
+
+    return 0;
+}
+
+/**
+ * 
+ * #if 0
+    Kokkos::parallel_for(
+      "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}),
+      KOKKOS_LAMBDA(const int jb, const int jk, const int jc) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            d_view(jb, jk, jc) += p;
+            printf("%f ", d_view(jb, jk, jc));        
+    });
+    std::cout << "\n";
+#endif
+    for (int jb = 0 ; jb < nblocks; ++jb) 
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+        for (int jk = 0; jk < nlev; ++jk) {
+          //  int p = jb * nlev * nproma + jk * nproma + jc;  left
+            int p = jc * nlev * nblocks + jk * nblocks + jb;
+           // d_view(jb, jk, jc) = p;
+            d_view(jc, jk, jb) = p;
+
+          //  printf("%f ", d_view(jb, jk, jc));      
+        }});
+*/
\ No newline at end of file
diff --git a/script.sh b/script.sh
new file mode 100755
index 0000000000000000000000000000000000000000..636792996cc0154654387fb7c42cb7b5046a30ae
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+#gcc
+#nvhpc/24.7-gcc-11.2.0 
+#export LD_LIBRARY_PATH
+
+ulimit -s unlimited
+
+if [ "$1" == 'gpu' ]
+then
+    cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
+    cmake --build build_gpu --parallel 
+    ncells=(5000000)
+    nlev=(90)
+    nproma=(10000 30000 50000 100000 1000000 5000000)
+else 
+    cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
+    cmake --build build --parallel
+    ncells=(5000000)
+    nlev=(90)   
+    nproma=(32) # 64 96 128)
+
+    export OMP_PROC_BIND=close
+    export OMP_PLACES=cores
+    export OMP_NUM_THREADS=8
+fi
+
+for jb in ${ncells[*]}; do 
+    for jk in ${nlev[*]}; do
+        for jc in ${nproma[*]}; do
+            export NPROMA=$jc
+            export NLEV=$jk
+            export NCELLS=$jb
+            if [ "$1" == 'gpu' ] 
+            then
+                ./build_gpu/demo 
+            else
+                ./build/demo
+            fi
+        done
+    done
+done