diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90 index 708296f6be9f2258417571dd56489527e9b162bf..8079c1834e3e888297f84a28c432c980eaf5deb1 100644 --- a/src/mo_fortran_tools.F90 +++ b/src/mo_fortran_tools.F90 @@ -54,6 +54,7 @@ MODULE mo_fortran_tools PUBLIC :: assert_acc_host_only PUBLIC :: assert_acc_device_only PUBLIC :: set_acc_host_or_device + PUBLIC :: set_acc_async_queue PRIVATE @@ -511,19 +512,21 @@ CONTAINS END SUBROUTINE resize_arr_c1d !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_1d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:) REAL(dp), INTENT(OUT) :: dest(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do private(i1) DO i1 = 1, m1 dest(i1) = src(i1) @@ -531,24 +534,25 @@ CONTAINS !$omp end do nowait !$ACC END PARALLEL LOOP - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_1d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :) REAL(dp), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #ifdef __INTEL_COMPILER !$omp do private(i1,i2) #else @@ -560,26 +564,27 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -594,26 +599,27 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_4d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) m4 = SIZE(dest, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(_CRAYFTN) || defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -630,19 +636,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_4d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_dp(src, dest, lacc, opt_acc_async_queue) REAL(dp), INTENT(IN) :: src(:, :, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -650,7 +657,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -669,19 +676,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_dp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_sp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :, :) REAL(sp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -689,7 +697,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -708,24 +716,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_sp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :) REAL(dp), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -737,26 +746,27 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -770,27 +780,28 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_4d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) m4 = SIZE(dest, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -806,20 +817,21 @@ CONTAINS END DO END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_4d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_spdp(src, dest, lacc, opt_acc_async_queue) REAL(sp), INTENT(IN) :: src(:, :, :, :, :) REAL(dp), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -827,7 +839,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -846,24 +858,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_spdp !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_2d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -876,25 +889,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_2d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_3d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) m3 = SIZE(dest, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -909,19 +923,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_3d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_i4(src, dest, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(IN) :: src(:, :, :, :, :) INTEGER(ik4), INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -929,7 +944,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -948,19 +963,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_i4 !> copy state, omp parallel, does not wait for other threads to complete - SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async) + SUBROUTINE copy_5d_l(src, dest, lacc, opt_acc_async_queue) LOGICAL, INTENT(IN) :: src(:, :, :, :, :) LOGICAL, INTENT(OUT) :: dest(:, :, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(dest, 1) m2 = SIZE(dest, 2) @@ -968,7 +984,7 @@ CONTAINS m4 = SIZE(dest, 4) m5 = SIZE(dest, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -987,62 +1003,65 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE copy_5d_l - SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_1d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do DO i1 = 1, m1 init_var(i1) = 0.0_dp END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_1d_dp - SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_1d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do DO i1 = 1, m1 init_var(i1) = 0.0_dp END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_1d_sp - SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_2d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1055,22 +1074,23 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_2d_dp - SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_2d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1083,23 +1103,24 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_2d_i4 - SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3) #else @@ -1114,23 +1135,24 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_3d_dp - SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1145,24 +1167,24 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) - END SUBROUTINE init_zero_3d_sp - SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_3d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1177,24 +1199,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_3d_i4 - SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_dp(init_var, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3,i4) #else @@ -1212,24 +1235,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_dp - SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_sp(init_var, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER) || defined(_CRAYFTN)) !$omp do private(i1,i2,i3,i4) #else @@ -1247,24 +1271,25 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_sp - SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async) + SUBROUTINE init_zero_4d_i4(init_var, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :) LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, m1, m2, m3, m4 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) m4 = SIZE(init_var, 4) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(4) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(4) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4) #else @@ -1282,21 +1307,22 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_zero_4d_i4 - SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_1d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, m1 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) IF(lzacc) !$omp do private(i1) DO i1 = 1, m1 init_var(i1) = init_val @@ -1304,24 +1330,25 @@ CONTAINS !$omp end do nowait !$ACC END PARALLEL LOOP - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_1d_dp - SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_2d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, m1, m2 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(2) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(2) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2) #else @@ -1334,25 +1361,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_2d_dp - SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_3d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1367,25 +1395,26 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_3d_dp - SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_3d_spdp(init_var, init_val, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, m1, m2, m3 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) m3 = SIZE(init_var, 3) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(3) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(3) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3) #else @@ -1400,19 +1429,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_3d_spdp - SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_dp(init_var, init_val, lacc, opt_acc_async_queue) REAL(dp), INTENT(OUT) :: init_var(:, :, :, :, :) REAL(dp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1420,7 +1450,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1439,19 +1469,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_dp - SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_sp(init_var, init_val, lacc, opt_acc_async_queue) REAL(sp), INTENT(OUT) :: init_var(:, :, :, :, :) REAL(sp), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1459,7 +1490,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1478,19 +1509,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_sp - SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_i4(init_var, init_val, lacc, opt_acc_async_queue) INTEGER(ik4), INTENT(OUT) :: init_var(:, :, :, :, :) INTEGER(ik4), INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1498,7 +1530,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1517,19 +1549,20 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_i4 - SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async) + SUBROUTINE init_5d_l(init_var, init_val, lacc, opt_acc_async_queue) LOGICAL, INTENT(OUT) :: init_var(:, :, :, :, :) LOGICAL, INTENT(IN) :: init_val LOGICAL, INTENT(IN) :: lacc - LOGICAL, INTENT(IN), OPTIONAL :: opt_acc_async + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue INTEGER :: i1, i2, i3, i4, i5, m1, m2, m3, m4, m5 LOGICAL :: lzacc + INTEGER :: acc_async_queue CALL set_acc_host_or_device(lzacc, lacc) + CALL set_acc_async_queue(acc_async_queue, opt_acc_async_queue) m1 = SIZE(init_var, 1) m2 = SIZE(init_var, 2) @@ -1537,7 +1570,7 @@ CONTAINS m4 = SIZE(init_var, 4) m5 = SIZE(init_var, 5) - !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(1) COLLAPSE(5) IF(lzacc) + !$ACC PARALLEL LOOP DEFAULT(PRESENT) ASYNC(acc_async_queue) COLLAPSE(5) IF(lzacc) #if (defined(__INTEL_COMPILER)) !$omp do private(i1,i2,i3,i4,i5) #else @@ -1556,7 +1589,6 @@ CONTAINS END DO !$omp end do nowait - CALL acc_wait_if_requested(1, opt_acc_async) END SUBROUTINE init_5d_l SUBROUTINE var_scale_3d_dp(var, scale_val, lacc, opt_acc_async) @@ -2363,4 +2395,14 @@ CONTAINS #endif END SUBROUTINE set_acc_host_or_device + SUBROUTINE set_acc_async_queue(acc_async_queue, opt_acc_async_queue) + INTEGER, INTENT(OUT) :: acc_async_queue + INTEGER, INTENT(IN), OPTIONAL :: opt_acc_async_queue + + acc_async_queue = 1 + IF (PRESENT(opt_acc_async_queue)) THEN + acc_async_queue = opt_acc_async_queue + END IF + END SUBROUTINE set_acc_async_queue + END MODULE mo_fortran_tools