generate and submit benchmark script

36a26385 · Xingran Wang · a67d0da7 · 36a26385 · 36a26385
Commit 36a26385 authored 3 years ago by Xingran Wang
--- a/tests/template_file.sh.jinja
+++ b/tests/template_file.sh.jinja
+#! /bin/bash
+# Jinja template for a Slurm batch script. The placeholders written in
+# double curly braces are substituted when the template is rendered.
+# The job loads the build environment and then runs ./pio_write.parallel
+# through srun.
+#
+# Notes on the directives below:
+#   - %x and %j in the log file names expand to the job name and job ID;
+#     stdout and stderr are both written to the same file.
+#   - --parsable makes sbatch print only the job ID (optionally followed
+#     by ";cluster name"), which is convenient for scripted submission.
+#SBATCH --job-name {{ name_job }}
+#SBATCH -t 00:10:00
+#SBATCH --nodes {{ num_node }}
+#SBATCH --tasks-per-node 48
+#SBATCH --partition {{ partition }}
+#SBATCH --account highresmonsoon
+#SBATCH --output ./LOG.%x.%j.o
+#SBATCH --error ./LOG.%x.%j.o
+#SBATCH --gres=gpu:1
+#SBATCH --parsable
+
+
+# abort the job as soon as any command fails
+set -e
+
+# toolchain versions used in the module names below
+nvhpc_version=21.5
+openmpi_version=4.1.1
+# start from an empty module environment, then select Stages/2020
+# (ml is presumably the Lmod shorthand for "module load" -- confirm)
+module --force purge
+ml use "$OTHERSTAGES"
+ml Stages/2020
+ml NVHPC/${nvhpc_version}-GCC-10.3.0
+ml OpenMPI/${openmpi_version}
+ml Ruby/2.7.2
+ml UCX/1.10.1
+ml netCDF-Fortran
+ml CMake/3.18.0
+# ecCodes is not directly available, load dependencies instead...
+ml OpenGL/2020 libaec
+ml ecCodes/2.21.0-nompi
+# replace the serial netCDF build with netCDF/4.7.4
+module swap netCDF/4.7.4-serial netCDF/4.7.4
+
+# echo every command from here on into the job log
+set -x
+
+# make sure the scratch directory exists; qprefix then becomes that
+# directory plus the name prefix "files_pio_write" and is handed to the
+# benchmark through its -qprefix option
+qprefix="/p/scratch/highresmonsoon/cdi-pio-test_dir"
+mkdir -p "$qprefix"
+qprefix="$qprefix/files_pio_write"
+
+# NOTE(review): the preloaded UCX libraries are version 1.11.1 while the
+# module loaded above is UCX/1.10.1 -- presumably a deliberate
+# workaround; confirm before changing either version.
+export LD_PRELOAD="/p/software/juwelsbooster/stages/2020/software/UCX/1.11.1/lib/libuct.so.0:/p/software/juwelsbooster/stages/2020/software/UCX/1.11.1/lib/libucp.so.0:/p/software/juwelsbooster/stages/2020/software/UCX/1.11.1/lib/libucs.so.0:/p/software/juwelsbooster/stages/2020/software/UCX/1.11.1/lib/libucm.so.0"
+
+# SCT settings exported to the benchmark's environment; the value of
+# SCT_PROC_CHOICE is supplied by the template
+export SCT_PROC_CHOICE={{ sct_proc_choice }}
+export SCT_CALLSTATS=1
+
+# reduce output data to decrease test duration
+# time ../libtool --mode=execute srun \
+#   ./pio_write_deco2d.parallel "-qprefix=$qprefix" -qpio-role-scheme={{ pio_role_scheme }} \
+#   -c -m 384 -n 192 -z 95 -t 20 -y 60 -s 7 \
+#   -f grb2 -p PIO_MPI_FW_AT_ALL -w 16
+# active invocation: role scheme, output format and the value of -w are
+# supplied by the template; the wall time is reported by "time"
+time ../libtool --mode=execute srun \
+  ./pio_write.parallel "-qprefix=$qprefix" -qpio-role-scheme={{ pio_role_scheme }} \
+  -c -m 768 -n 384 -z 95 -t 10 -y 120 -s 7 \
+  -f {{ format }} -p PIO_MPI_FW_AT_ALL -w {{ num_io_task }}
+set +x
--- a/tests/template_generate_submit.py
+++ b/tests/template_generate_submit.py
+from jinja2 import Environment, FileSystemLoader
+from subprocess import run, PIPE
+
+# Render template_file.sh.jinja once per (node count, I/O task count)
+# combination and submit every rendered script several times with sbatch.
+# Each submission depends (afterok) on the previously submitted job, so
+# the benchmark jobs run strictly one after another.
+
+partition = "booster"
+compiler = "nvhpc_ompi"
+branch_short = "1.8.x-tj20220307"
+# pio_role_scheme candidates:
+#   - balanced
+#   - last
+pio_role_scheme = "last"
+# output_format candidates: nc2, nc4, grb2, grb, ...
+output_format = "nc4"
+# how many times each rendered script is submitted
+num_repetitions = 3
+
+# values substituted into the template placeholders
+config = {
+    'partition': partition,
+    'sct_proc_choice': "SCT_REDUCE_ALL",
+    'format': output_format,
+    # taken from the variable above so the rendered script always agrees
+    # with the scheme encoded in the job and file names
+    'pio_role_scheme': pio_role_scheme,
+}
+dependency = "--dependency=afterok:"
+# ID of the most recently submitted job; None until a submission succeeds
+job_id = None
+
+file_loader = FileSystemLoader(".")
+env = Environment(loader=file_loader)
+template = env.get_template("template_file.sh.jinja")
+
+num_nodes = list(range(5, 11))  # 5, 6, ..., 10
+num_io_tasks = list(range(8, 56, 8))  # 8, 16, ..., 48
+for num_node in num_nodes:
+    for num_io_task in num_io_tasks:
+        config['name_job'] = (f"pio_benchmark.{partition}.{compiler}"
+                              f".{branch_short}.num_node-{num_node}"
+                              f".num_io_task-{num_io_task}"
+                              f".{pio_role_scheme}"
+                              f".{output_format}")
+        config['num_node'] = num_node
+        config['num_io_task'] = num_io_task
+
+        run_script = (f"pio_benchmark.num_node-{num_node}"
+                      f".num_io_task-{num_io_task}"
+                      f".{pio_role_scheme}.run"
+                      f".{output_format}")
+        # leaving the with block closes (and thereby flushes) the file
+        # before sbatch reads it
+        with open(run_script, mode="w") as f:
+            f.write(template.render(**config))
+
+        for _ in range(num_repetitions):
+            command = ['sbatch']
+            if job_id is not None:
+                # chain to the last successfully submitted job
+                command.append(dependency + job_id)
+            command.append(run_script)
+            shell_process = run(command, stdout=PIPE)
+
+            if shell_process.returncode != 0:
+                # keep the previous job_id so later jobs stay chained
+                print(f"failed to submit {run_script}")
+                continue
+
+            # with --parsable sbatch prints "<job id>" or
+            # "<job id>;<cluster name>"; only the ID may be used in a
+            # --dependency argument
+            submitted = shell_process.stdout.decode('UTF-8').strip()
+            job_id = submitted.split(';')[0]
+            print(f"{run_script} submitted with ID {job_id}.")