From 2cbccd735f2b4fa14bf5e29a7982d5e4f7cbc3d1 Mon Sep 17 00:00:00 2001
From: Daniel Reinert <daniel.reinert@dwd.de>
Date: Mon, 4 Nov 2024 14:11:16 +0000
Subject: [PATCH] Performance optimization for NEC SX AURORA
 (icon-libraries/libfortran-support!97)

The subroutines `init_zero_4d_[dp,sp,i4]` do not vectorize properly on NEC SX AURORA.

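To fix this, the compiler directive `!NEC$ forced_collapse` is added directly ahead of the nested loops in each of the subroutines `init_zero_4d_[dp,sp,i4]`. It forces the NEC compiler to collapse the loop nest, which ensures proper vectorization on NEC SX AURORA.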

Approved-by: Yen-Chen Chen <yen-chen.chen@kit.edu>
Merged-by: Yen-Chen Chen <yen-chen.chen@kit.edu>
Changelog: feature
---
 src/mo_fortran_tools.F90 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mo_fortran_tools.F90 b/src/mo_fortran_tools.F90
index 6f9fbe4..74e7ec7 100644
--- a/src/mo_fortran_tools.F90
+++ b/src/mo_fortran_tools.F90
@@ -1201,6 +1201,7 @@ CONTAINS
 #else
 !$omp do collapse(4)
 #endif
+!NEC$ forced_collapse
     DO i4 = 1, m4
       DO i3 = 1, m3
         DO i2 = 1, m2
@@ -1235,6 +1236,7 @@ CONTAINS
 #else
 !$omp do collapse(4)
 #endif
+!NEC$ forced_collapse
     DO i4 = 1, m4
       DO i3 = 1, m3
         DO i2 = 1, m2
@@ -1269,6 +1271,7 @@ CONTAINS
 #else
 !$omp do collapse(4)
 #endif
+!NEC$ forced_collapse
     DO i4 = 1, m4
       DO i3 = 1, m3
         DO i2 = 1, m2
-- 
GitLab