Skip to content
Snippets Groups Projects
Commit 82662749 authored by Georgiana Mania
Browse files

initial commit

parents
No related branches found
No related tags found
1 merge request: !1 initial commit
# Build script for the Kokkos layout demo.
#
# Kokkos is downloaded and built as part of this project through FetchContent.
# The backend is selected with -DMU_ARCH=<arch>, where <arch> is one of:
#   x86_64, arm : host build (OpenMP when available, otherwise Serial)
#   a100        : CUDA build targeting NVIDIA Ampere (sm_80)
cmake_minimum_required(VERSION 3.20)
project(demo LANGUAGES CXX VERSION 0.0.1)

include(FetchContent)

# Silence the download-timestamp warning on CMake versions that know CMP0135.
if(POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
endif()

# Position-independent code is needed if Kokkos is used as a shared library.
# Let CMake pick the right flag for the compiler instead of hard-coding -fPIC.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Kokkos 4.4.01 release tarball.
FetchContent_Declare(kokkos
  URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz)

# The Kokkos options are written to the cache so that they are already visible
# when FetchContent_MakeAvailable() configures the Kokkos subproject.
if(("${MU_ARCH}" STREQUAL "x86_64") OR ("${MU_ARCH}" STREQUAL "arm"))
  # Host build: prefer OpenMP, fall back to the serial backend.
  find_package(OpenMP)
  if(OpenMP_FOUND)
    set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "" FORCE)
  else()
    set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
  endif()
  set(Kokkos_ARCH_NATIVE ON CACHE BOOL "" FORCE)
elseif("${MU_ARCH}" STREQUAL "a100")
  # GPU build: CUDA device backend plus the serial host backend.
  set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
  set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
  set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE)
  set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
else()
  # Only list the architectures this script actually handles.
  message(FATAL_ERROR
    "MU_ARCH='${MU_ARCH}' is not a valid/tested configuration; "
    "pass -DMU_ARCH=<arch> with one of: x86_64, arm, a100")
endif()

FetchContent_MakeAvailable(kokkos)

add_executable(demo main.cpp)
# An executable has no consumers, so the dependency is PRIVATE.
target_link_libraries(demo PRIVATE Kokkos::kokkos)
main.cpp 0 → 100644
#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <Kokkos_Core.hpp>
#include "Kokkos_Timer.hpp"
// Memory space that belongs to the default Kokkos execution space.
using space_t = Kokkos::DefaultExecutionSpace::memory_space;

// Rank-3 range policy with int indices on the default execution space.
// Only the commented-out snippets in this file refer to it.
using md_range_policy = Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace,
                                              Kokkos::IndexType<int>,
                                              Kokkos::Rank<3>>;

// Stopwatch shared by all scenarios; each scenario resets it before its loop.
Kokkos::Timer timer;
//constexpr int nblocks = 2;
//constexpr int nlev = 90;
//constexpr int nproma = 55000;
/**
 * Asserts that `array` holds the sequence 0, 1, 2, ... in its first
 * nblocks * nlev * nproma elements, i.e. that element i equals i.
 *
 * The check uses assert(), so it is compiled out when NDEBUG is defined.
 */
static void validate(double* array, int nblocks, int nlev, int nproma) {
  const int count = nblocks * nlev * nproma;
  int idx = 0;
  while (idx < count) {
    assert(array[idx] == static_cast<double>(idx));
    ++idx;
  }
}
void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jb, jk, jc) = p;
}});
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
std::cout << "Right layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jc * nlev * nblocks + jk * nblocks + jb;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}});
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jb, jk, jc) = p;
// printf("%f ", d_view(jb, jk, jc));
}});
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
std::cout << "Left layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}});
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jc * nlev * nblocks + jk * nblocks + jb;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}});
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
/**
 * Reads environment variable `name` and parses it as a strictly positive int.
 * Prints a diagnostic and terminates the program when the variable is unset,
 * is not a plain decimal number, or is outside the range 1..INT_MAX.
 */
static int read_positive_env(const char* name) {
  const char* text = std::getenv(name);
  if (text == nullptr) {
    std::cerr << "Environment variable " << name << " is not set" << std::endl;
    std::exit(EXIT_FAILURE);
  }
  char* end = nullptr;
  const long value = std::strtol(text, &end, 10);
  if (end == text || *end != '\0' || value <= 0 || value > INT_MAX) {
    std::cerr << "Environment variable " << name << "='" << text
              << "' must be a positive integer" << std::endl;
    std::exit(EXIT_FAILURE);
  }
  return static_cast<int>(value);
}

/**
 * Entry point: reads the problem size from the environment variables NCELLS,
 * NLEV and NPROMA, allocates the working array and runs every scenario.
 * scenario_1 is run once without printing before the printed run.
 *
 * @return 0 on success, EXIT_FAILURE when the problem size is invalid
 */
int main() {
  const int ncells = read_positive_env("NCELLS");
  const int nlev = read_positive_env("NLEV");
  const int nproma = read_positive_env("NPROMA");
  // Number of blocks of length nproma needed to hold ncells (rounded up).
  const int nblocks = (ncells - 1) / nproma + 1;
  std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;

  // The scenarios and validate() compute element indices in int, so the total
  // element count has to fit in an int. Checked in 64-bit to avoid overflow.
  const long long per_block = 1LL * nlev * nproma;
  if (per_block > INT_MAX || per_block * nblocks > INT_MAX) {
    std::cerr << "nblocks * nlev * nproma does not fit in an int" << std::endl;
    return EXIT_FAILURE;
  }

  // Heap storage: a variable-length array is not standard C++ and a buffer of
  // this size does not fit in a default-sized stack.
  std::vector<double> array(static_cast<std::size_t>(per_block * nblocks));

  Kokkos::initialize();
  {
    scenario_1(array.data(), nblocks, nlev, nproma, false);
    scenario_1(array.data(), nblocks, nlev, nproma);
    scenario_2(array.data(), nblocks, nlev, nproma);
    scenario_2b(array.data(), nblocks, nlev, nproma);
    scenario_3(array.data(), nblocks, nlev, nproma);
    scenario_4(array.data(), nblocks, nlev, nproma);
  }
  Kokkos::finalize();
  return 0;
}
/**
*
* #if 0
Kokkos::parallel_for(
"print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}),
KOKKOS_LAMBDA(const int jb, const int jk, const int jc) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jb, jk, jc) += p;
printf("%f ", d_view(jb, jk, jc));
});
std::cout << "\n";
#endif
for (int jb = 0 ; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
// int p = jb * nlev * nproma + jk * nproma + jc; left
int p = jc * nlev * nblocks + jk * nblocks + jb;
// d_view(jb, jk, jc) = p;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}});
*/
\ No newline at end of file
#!/bin/bash
# Configures and builds the demo, then runs it once for every combination of
# NCELLS, NLEV and NPROMA in the sweep below.
#
# Usage: <this script> [gpu]
#   gpu       build for MU_ARCH=a100 in ./build_gpu
#   otherwise build for MU_ARCH=x86_64 in ./build and run with 8 OpenMP threads
#
# Toolchain notes kept from the original script (presumably the compilers or
# modules it was used with -- confirm):
#   gcc
#   nvhpc/24.7-gcc-11.2.0
#   export LD_LIBRARY_PATH

# Stop at the first failing command so that a broken configure or build never
# leads to running a stale or missing binary.
set -euo pipefail

# Lift the stack size limit for the demo's large working array; keep going
# with a warning when the limit cannot be changed.
ulimit -s unlimited || echo "warning: could not raise the stack size limit" >&2

mode="${1:-}"
if [ "$mode" == 'gpu' ]; then
  build_dir=build_gpu
  arch=a100
else
  build_dir=build
  arch=x86_64
fi

cmake -B "$build_dir" -S . -DMU_ARCH="$arch" -DCMAKE_CXX_FLAGS="-O3"
cmake --build "$build_dir" --parallel

# Parameter sweep.
ncells=(5000000)
nlev=(90)
if [ "$mode" == 'gpu' ]; then
  nproma=(10000 30000 50000 100000 1000000 5000000)
else
  nproma=(32) # 64 96 128)
  export OMP_PROC_BIND=close
  export OMP_PLACES=cores
  export OMP_NUM_THREADS=8
fi

for cells in "${ncells[@]}"; do
  for levels in "${nlev[@]}"; do
    for block_len in "${nproma[@]}"; do
      # The demo reads its problem size from these environment variables.
      export NPROMA=$block_len
      export NLEV=$levels
      export NCELLS=$cells
      "./$build_dir/demo"
    done
  done
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment