From 0b304fca1c684577c9b09df56c7bd21f29a1b218 Mon Sep 17 00:00:00 2001
From: Dmitry Alexeev <dalexeev@nvidia.com>
Date: Tue, 4 Mar 2025 06:14:32 -0800
Subject: [PATCH 1/2] Make init and copy APIs always async with an optional
 queue argument. This change is required to enable CUDA graphs in TMX.

---
 src/mo_fortran_tools.F90 | 299 ++++++++++++++++++++++-----------------
 1 file changed, 171 insertions(+), 128 deletions(-)

diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90
index bc16f01..84356a3 100644
--- a/src/mo_fortran_tools.F90
+++ b/src/mo_fortran_tools.F90
@@ -55,6 +55,7 @@ MODULE mo_fortran_tools
   PUBLIC :: assert_acc_device_only
   PUBLIC :: assert_lacc_equals_i_am_accel_node
   PUBLIC :: set_acc_host_or_device
+  PUBLIC :: set_acc_async_queue
 
   PRIVATE
 
@@ -512,19 +513,21 @@ CONTAINS
   END SUBROUTINE resize_arr_c1d
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(IN) :: src(:)
     REAL(dp), INTENT(OUT) :: dest(:)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, m1
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc)
     !$omp do private(i1)
     DO i1 = 1, m1
       dest(i1) = src(i1)
@@ -532,24 +535,25 @@ CONTAINS
     !$omp end do nowait
     !$ACC END PARALLEL LOOP
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_1d_dp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(IN) :: src(:, :)
     REAL(dp), INTENT(OUT) :: dest(:, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #ifdef __INTEL_COMPILER
 !$omp do private(i1,i2)
 #else
@@ -561,26 +565,27 @@ CONTAINS
       END DO
     END DO
 !$omp end do nowait
-    CALL acc_wait_if_requested(1, opt_acc_async)
 
   END SUBROUTINE copy_2d_dp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(IN) :: src(:, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
     m3 = SIZE(dest, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -595,26 +600,27 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_3d_dp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(IN) :: src(:, :, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
     m3 = SIZE(dest, 3)
     m4 = SIZE(dest, 4)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc)
 #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4)
 #else
@@ -631,19 +637,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_4d_dp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(IN) :: src(:, :, :, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
@@ -651,7 +658,7 @@ CONTAINS
     m4 = SIZE(dest, 4)
     m5 = SIZE(dest, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -670,19 +677,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_5d_dp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(IN) :: src(:, :, :, :, :)
     REAL(sp), INTENT(OUT) :: dest(:, :, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
@@ -690,7 +698,7 @@ CONTAINS
     m4 = SIZE(dest, 4)
     m5 = SIZE(dest, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -709,24 +717,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_5d_sp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(IN) :: src(:, :)
     REAL(dp), INTENT(OUT) :: dest(:, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2)
 #else
@@ -738,26 +747,27 @@ CONTAINS
       END DO
     END DO
 !$omp end do nowait
-    CALL acc_wait_if_requested(1, opt_acc_async)
 
   END SUBROUTINE copy_2d_spdp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(IN) :: src(:, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
     m3 = SIZE(dest, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -771,27 +781,28 @@ CONTAINS
       END DO
     END DO
 !$omp end do nowait
-    CALL acc_wait_if_requested(1, opt_acc_async)
 
   END SUBROUTINE copy_3d_spdp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(IN) :: src(:, :, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
     m3 = SIZE(dest, 3)
     m4 = SIZE(dest, 4)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4)
 #else
@@ -807,20 +818,21 @@ CONTAINS
       END DO
     END DO
 !$omp end do nowait
-    CALL acc_wait_if_requested(1, opt_acc_async)
 
   END SUBROUTINE copy_4d_spdp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(IN) :: src(:, :, :, :, :)
     REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
@@ -828,7 +840,7 @@ CONTAINS
     m4 = SIZE(dest, 4)
     m5 = SIZE(dest, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -847,24 +859,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_5d_spdp
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(IN) :: src(:, :)
     INTEGER(ik4), INTENT(OUT) :: dest(:, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2)
 #else
@@ -877,25 +890,26 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_2d_i4
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(IN) :: src(:, :, :)
     INTEGER(ik4), INTENT(OUT) :: dest(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
     m3 = SIZE(dest, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -910,19 +924,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_3d_i4
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(IN) :: src(:, :, :, :, :)
     INTEGER(ik4), INTENT(OUT) :: dest(:, :, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
@@ -930,7 +945,7 @@ CONTAINS
     m4 = SIZE(dest, 4)
     m5 = SIZE(dest, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -949,19 +964,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_5d_i4
 
   !> copy state, omp parallel, does not wait for other threads to complete
-  SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async)
+  SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async_queue)
     LOGICAL, INTENT(IN) :: src(:, :, :, :, :)
     LOGICAL, INTENT(OUT) :: dest(:, :, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(dest, 1)
     m2 = SIZE(dest, 2)
@@ -969,7 +985,7 @@ CONTAINS
     m4 = SIZE(dest, 4)
     m5 = SIZE(dest, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
     !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -988,62 +1004,65 @@ CONTAINS
     END DO
     !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE copy_5d_l
 
-  SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, m1
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc)
 !$omp do
     DO i1 = 1, m1
       init_var(i1) = 0.0_dp
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_1d_dp
 
-  SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(OUT) :: init_var(:)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, m1
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc)
     !$omp do
     DO i1 = 1, m1
       init_var(i1) = 0.0_dp
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_1d_sp
 
-  SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2)
 #else
@@ -1056,22 +1075,23 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_2d_dp
 
-  SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(OUT) :: init_var(:, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2)
 #else
@@ -1084,23 +1104,24 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_2d_i4
 
-  SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN))
 !$omp do private(i1,i2,i3)
 #else
@@ -1115,23 +1136,24 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_3d_dp
 
-  SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(OUT) :: init_var(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -1146,24 +1168,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
 
   END SUBROUTINE init_zero_3d_sp
 
-  SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -1178,24 +1201,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_3d_i4
 
-  SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
     m4 = SIZE(init_var, 4)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc)
 #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN))
 !$omp do private(i1,i2,i3,i4)
 #else
@@ -1213,24 +1237,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_4d_dp
 
-  SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(OUT) :: init_var(:, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
     m4 = SIZE(init_var, 4)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc)
 #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN))
 !$omp do private(i1,i2,i3,i4)
 #else
@@ -1248,24 +1273,25 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_4d_sp
 
-  SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async)
+  SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :)
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
     m4 = SIZE(init_var, 4)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4)
 #else
@@ -1283,21 +1309,22 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_zero_4d_i4
 
-  SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:)
     REAL(dp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
     INTEGER :: i1, m1
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc)
     !$omp do private(i1)
     DO i1 = 1, m1
       init_var(i1) = init_val
@@ -1305,24 +1332,25 @@ CONTAINS
     !$omp end do nowait
     !$ACC END PARALLEL LOOP
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_1d_dp
 
-  SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :)
     REAL(dp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, m1, m2
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2)
 #else
@@ -1335,25 +1363,26 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_2d_dp
 
-  SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :, :)
     REAL(dp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -1368,25 +1397,26 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_3d_dp
 
-  SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(OUT) :: init_var(:, :, :)
     REAL(dp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, m1, m2, m3
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
     m3 = SIZE(init_var, 3)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3)
 #else
@@ -1401,19 +1431,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_3d_spdp
 
-  SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(dp), INTENT(OUT) :: init_var(:, :, :, :, :)
     REAL(dp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
@@ -1421,7 +1452,7 @@ CONTAINS
     m4 = SIZE(init_var, 4)
     m5 = SIZE(init_var, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -1440,19 +1471,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_5d_dp
 
-  SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async_queue)
     REAL(sp), INTENT(OUT) :: init_var(:, :, :, :, :)
     REAL(sp), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
@@ -1460,7 +1492,7 @@ CONTAINS
     m4 = SIZE(init_var, 4)
     m5 = SIZE(init_var, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -1479,19 +1511,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_5d_sp
 
-  SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async_queue)
     INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :, :)
     INTEGER(ik4), INTENT(IN) :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
@@ -1499,7 +1532,7 @@ CONTAINS
     m4 = SIZE(init_var, 4)
     m5 = SIZE(init_var, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -1518,19 +1551,20 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_5d_i4
 
-  SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async)
+  SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async_queue)
     LOGICAL, INTENT(OUT) :: init_var(:, :, :, :, :)
     LOGICAL, INTENT(IN)  :: init_val
     LOGICAL, INTENT(IN) :: lacc
-    LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
 
     INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5
     LOGICAL :: lzacc
+    INTEGER :: acc_async_queue
 
     CALL set_acc_host_or_device(lzacc, lacc)
+    CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
 
     m1 = SIZE(init_var, 1)
     m2 = SIZE(init_var, 2)
@@ -1538,7 +1572,7 @@ CONTAINS
     m4 = SIZE(init_var, 4)
     m5 = SIZE(init_var, 5)
 
-    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc)
+    !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc)
 #if (defined(__INTEL_COMPILER))
 !$omp do private(i1,i2,i3,i4,i5)
 #else
@@ -1557,7 +1591,6 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-    CALL acc_wait_if_requested(1, opt_acc_async)
   END SUBROUTINE init_5d_l
 
   SUBROUTINE var_scale_3d_dp(var, scale_val, lacc, opt_acc_async)
@@ -2377,4 +2410,14 @@ CONTAINS
 #endif
   END SUBROUTINE set_acc_host_or_device
 
+  !> Resolve the OpenACC async queue id: the caller's value if given, else 1.
+  SUBROUTINE set_acc_async_queue(acc_async_queue, opt_acc_async_queue)
+    INTEGER, INTENT(OUT) :: acc_async_queue
+    INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue
+
+    ! Default first; overwritten only when the optional argument was passed.
+    acc_async_queue = 1
+    IF (PRESENT(opt_acc_async_queue)) acc_async_queue = opt_acc_async_queue
+  END SUBROUTINE set_acc_async_queue
+
 END MODULE mo_fortran_tools
-- 
GitLab


From 9694fb35aac6b6268f4441fff0594faf481f84e6 Mon Sep 17 00:00:00 2001
From: Dmitry Alexeev <dalexeev@nvidia.com>
Date: Thu, 13 Mar 2025 10:45:19 -0700
Subject: [PATCH 2/2] formatting

---
 src/mo_fortran_tools.F90 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90
index 84356a3..4cc282e 100644
--- a/src/mo_fortran_tools.F90
+++ b/src/mo_fortran_tools.F90
@@ -1168,7 +1168,6 @@ CONTAINS
     END DO
 !$omp end do nowait
 
-
   END SUBROUTINE init_zero_3d_sp
 
   SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async_queue)
-- 
GitLab