From 0b304fca1c684577c9b09df56c7bd21f29a1b218 Mon Sep 17 00:00:00 2001 From: Dmitry Alexeev <dalexeev@nvidia.com> Date: Tue, 4 Mar 2025 06:14:32 -0800 Subject: [PATCH 1/2] init and copy APIs are always async and accept optional queue argument. this change is required to enable CUDA graphs in TMX --- src/mo_fortran_tools.F90 | 299 ++++++++++++++++++++++----------------- 1 file changed, 171 insertions(+), 128 deletions(-) diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90 index bc16f01..84356a3 100644 --- a/src/mo_fortran_tools.F90 +++ b/src/mo_fortran_tools.F90 @@ -55,6 +55,7 @@ MODULE mo_fortran_tools PUBLIC :: assert_acc_device_only PUBLIC :: assert_lacc_equals_i_am_accel_node PUBLIC :: set_acc_host_or_device + PUBLIC :: set_acc_async_queue PRIVATE @@ -512,19 +513,21 @@ CONTAINS END SUBROUTINE resize_arr_c1d !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:) REAL(dp), INTENT(OUT) :: dest(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do private(i1) DO i1 = 1, m1 dest(i1) = src(i1) @@ -532,24 +535,25 @@ CONTAINS !$omp end do nowait !$ACC END PARALLEL LOOP - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_1d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :) REAL(dp), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #ifdef __INTEL_COMPILER !$omp do private(i1,i2) #else @@ -561,26 +565,27 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -595,26 +600,27 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) m4 = SIZE(dest, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -631,19 +637,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_4d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -651,7 +658,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -670,19 +677,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :, :) REAL(sp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -690,7 +698,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -709,24 +717,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_sp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :) REAL(dp), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -738,26 +747,27 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -771,27 +781,28 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) m4 = SIZE(dest, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -807,20 +818,21 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_4d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -828,7 +840,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -847,24 +859,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -877,25 +890,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -910,19 +924,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :, :, :, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -930,7 +945,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -949,19 +964,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async_queue) LOGICAL, INTENT(IN) :: src(:, :, :, :, :) LOGICAL, INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -969,7 +985,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -988,62 +1004,65 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_l - SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do DO i1 = 1, m1 init_var(i1) = 0.0_dp END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_1d_dp - SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do DO i1 = 1, m1 init_var(i1) = 0.0_dp END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_1d_sp - SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1056,22 +1075,23 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_2d_dp - SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1084,23 +1104,24 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_2d_i4 - SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3) #else @@ -1115,23 +1136,24 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_3d_dp - SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1146,24 +1168,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_3d_sp - SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1178,24 +1201,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_3d_i4 - SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3,i4) #else @@ -1213,24 +1237,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_dp - SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3,i4) #else @@ -1248,24 +1273,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_sp - SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -1283,21 +1309,22 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_i4 - SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do private(i1) DO i1 = 1, m1 init_var(i1) = init_val @@ -1305,24 +1332,25 @@ CONTAINS !$omp end do nowait !$ACC END PARALLEL LOOP - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_1d_dp - SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1335,25 +1363,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_2d_dp - SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1368,25 +1397,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_3d_dp - SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1401,19 +1431,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_3d_spdp - SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1421,7 +1452,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1440,19 +1471,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_dp - SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :, :, :) REAL(sp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1460,7 +1492,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1479,19 +1511,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_sp - SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :, :) INTEGER(ik4), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1499,7 +1532,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1518,19 +1551,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_i4 - SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async_queue) LOGICAL, INTENT(OUT) :: init_var(:, :, :, :, :) LOGICAL, INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1538,7 +1572,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1557,7 +1591,6 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_l SUBROUTINE var_scale_3d_dp(var, scale_val, lacc, opt_acc_async) @@ -2377,4 +2410,14 @@ CONTAINS #endif END SUBROUTINE set_acc_host_or_device + SUBROUTINE set_acc_async_queue(acc_async_queue, opt_acc_async_queue) + INTEGER, INTENT(OUT) :: acc_async_queue + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue + + acc_async_queue = 1 + IF (PRESENT(opt_acc_async_queue)) THEN + acc_async_queue = opt_acc_async_queue + END IF + END SUBROUTINE set_acc_async_queue + END MODULE mo_fortran_tools -- GitLab From 9694fb35aac6b6268f4441fff0594faf481f84e6 Mon Sep 17 00:00:00 2001 From: Dmitry Alexeev <dalexeev@nvidia.com> Date: Thu, 13 Mar 2025 10:45:19 -0700 Subject: [PATCH 2/2] formatting --- src/mo_fortran_tools.F90 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90 index 84356a3..4cc282e 100644 --- a/src/mo_fortran_tools.F90 +++ b/src/mo_fortran_tools.F90 @@ -1168,7 +1168,6 @@ CONTAINS END DO !$omp end do nowait - END SUBROUTINE init_zero_3d_sp SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async_queue) -- GitLab