Skip to content
Snippets Groups Projects
Commit ce2b537b authored by Jan Frederik Engels's avatar Jan Frederik Engels :new_moon:
Browse files

Remove parallelism

"Nuke the entire site from orbit--it's the only way to be sure"
parent d37b4f3a
No related branches found
No related tags found
1 merge request!72Draft: Compute devices lecture
Pipeline #70256 passed
Showing with 2 additions and 785 deletions
......@@ -36,7 +36,7 @@ website:
- "lectures/user-experience/slides.qmd"
# - "lectures/testing/slides.qmd"
# - "lectures/git2/slides.qmd"
- "lectures/parallelism/slides.qmd"
# - "lectures/parallelism/slides.qmd"
- "lectures/hardware/slides.qmd"
# - "lectures/file-and-data-systems/slides.qmd"
# - "lectures/memory-hierarchies/slides.qmd"
......@@ -52,7 +52,7 @@ website:
- "exercises/user-experience.qmd"
# - "exercises/testing.qmd"
# - "exercises/git2.qmd"
- "exercises/parallelism/parallelism.qmd"
# - "exercises/parallelism/parallelism.qmd"
# - "exercises/hardware.qmd"
# - "exercises/file_and_data_systems.qmd"
# - "exercises/memory_hierarchies.qmd"
......
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#ifdef _OPENMP
#include <omp.h>
#else
#ifdef WRITE_DECOMP
#error You cannot write the decomposition without OpenMP.
#endif
#endif
//compile using (for example):
//gcc -lm -fopenmp -g main.c
//definition of constants
//some values are given in meter
//some in seconds
//
#define x_0 10000
#define delta_x 1
#define coeff_sech 0.01
//depth of sea at deepest point
#define b_0 1000
//gravitational acceleration
#define g 10
#define delta_t 0.001
double *h;
double *h1;
double *h2;
double *u;
double *u1;
double *u2;
int *point_to_thread;
/* Total number of grid points spanning the domain [-x_0, x_0).
 * Fix: the original cast the result to (int) before widening back to
 * long; for larger domains (x_0/delta_x) that narrows and overflows.
 * Cast directly to the return type instead. */
long count_points () {
    return (long)(2*x_0/delta_x);
}
/* Map grid index i to its x coordinate: x = i*delta_x shifted so the
 * domain is centred on 0, i.e. x in [-x_0, x_0). */
double get_x(long i){
    const double offset = x_0;
    return i*delta_x - offset;
}
/* Parabolic bathymetry: sea-floor elevation at grid index i.
 * Deepest (-b_0) at x = 0, rising to 0 at both shores x = +/- x_0. */
double waterdepth(long i){
    const double x = get_x(i);
    const double x0sq = x_0*x_0; /* fits in int: 1e8 */
    return b_0*(x*x/x0sq-1);
}
/* Initial surface elevation: a squared hyperbolic-secant bump centred
 * at x = 0 (sech(cx) = 2/(e^{cx}+e^{-cx})), truncated to exactly 0
 * where the amplitude drops below 1e-10. */
void initialise_h(double *h){
    const long n = count_points();
    for (long i = 0; i < n; i++) {
        const double x = get_x(i);
        const double sech = 2/(exp(coeff_sech*x)+exp(-coeff_sech*x));
        if (sech > 1e-10)
            h[i] = sech*sech;
        else
            h[i] = 0;
    }
}
/* Initial velocity field: fluid starts at rest everywhere. */
void initialise_u(double *u){
    const long n = count_points();
    for (long i = 0; i < n; i++)
        u[i] = 0;
}
/* Dump the full state as a tab-separated table (x, h, u, b) to `data`,
 * preceded by a gnuplot-style comment header line. */
void print_values(FILE *data, double *h, double *u){
    const long n = count_points();
    fprintf(data, "#x\th\tu\tb\n");
    for (long i = 0; i < n; i++)
        fprintf(data, "%g\t%g\t%g\t%g\n", get_x(i), h[i], u[i], waterdepth(i));
}
/* Spatial operator of the velocity equation at interior point i:
 * advection term u*du plus pressure-gradient term g*dh, both using
 * central differences (the 2*delta_x divisor is applied by callers). */
double lu(double *h, double *u, long i){
    const double du = u[i+1]-u[i-1];
    const double dh = h[i+1]-h[i-1];
    return u[i]*du+g*dh;
}
/* Spatial operator of the height equation at interior point i:
 * central differences of h, of the bathymetry b, and of u, combined as
 * u*(dh - db) + (h - b)*du; divisor 2*delta_x is applied by callers. */
double lh(double *h, double *u, long i){
    const double dh = h[i+1]-h[i-1];
    const double db = waterdepth(i+1)-waterdepth(i-1);
    const double du = u[i+1]-u[i-1];
    return u[i]*dh-u[i]*db+(h[i]-waterdepth(i))*du;
}
//sets h1,u1 using values (startconditions) from u2/h2
/* Bootstrap step: compute h1/u1 from the initial state h2/u2 with a
 * single first-order Euler step of half size (delta_t / (2*delta_x)),
 * holding both ends of the domain fixed at zero. */
void euler(double *h1, double *h2, double *u1, double *u2){
    const long n = count_points();
    const double factor = delta_t/(2.*delta_x);
    u1[0] = 0;      /* boundary value */
    h1[0] = 0;      /* boundary value */
    u1[n-1] = 0;
    h1[n-1] = 0;
    for (long i = 1; i < n-1; i++) {
        u1[i] = u2[i] - factor*lu(h2,u2,i);
        h1[i] = h2[i] - factor*lh(h2,u2,i);
    }
}
/* Track the wave crest: search for the index of the largest h inside a
 * window around the previous crest position oldmaxi (the crest cannot
 * travel far between calls, so a full scan is unnecessary).
 * NOTE(review): the window start oldmaxi-10 is not clamped to 0 —
 * presumably callers guarantee oldmaxi >= 10; verify. */
long findmax(double *h, long oldmaxi,long outputmth){
    const long start = oldmaxi - 10;
    const long stop = oldmaxi + (int)(outputmth/delta_x) + 30;
    const long n = count_points();
    long best = start;
    for (long i = start; i < stop && i < n; i++) {
        if (h[i] > h[best])
            best = i;
    }
    return best;
}
// Return the largest surface elevation over the whole grid (serial scan).
// Left byte-identical on purpose: this loop is the parallelisation
// exercise referenced by the homework sheet.
double maxval(double *h) {
long npoints = count_points();
long i;
// Sentinel below any expected h; assumes h never drops under -10 — TODO confirm.
double hmax = -10;
// HOMEWORK: Parallelise this loop!
for(i = 0; i < npoints; i++){
if (h[i] > hmax)
hmax = h[i];
}
return hmax;
}
/* Backward finite-difference estimate of a derivative:
 * (xi - xi_1) / deltax. Used to estimate the crest speed. */
double ng_derivative(double xi, double xi_1, double deltax){
    const double rise = xi - xi_1;
    return rise / deltax;
}
// SIGINT handler: dump the current state to stdout, then terminate.
// NOTE(review): fprintf/exit are not async-signal-safe; tolerable for a
// teaching code (and the signal() registration is currently under #if 0),
// but worth confirming before re-enabling.
void siginthandler(){
print_values(stdout,h,u);
exit(0);
}
#ifdef _OPENMP
// Record which OpenMP thread processed grid point i, for the
// decomposition printout done by print_decomp().
void save_decomp(long i) {
point_to_thread[i] = omp_get_thread_num();
}
/* Print, one line per OpenMP thread, the contiguous index ranges of grid
 * points that thread processed (as recorded via save_decomp).
 * Must be called from inside a parallel region: the loop over thread ids
 * plus the barrier serializes the output so each thread prints only its
 * own line, in thread-id order.
 * Fix: a run of length one (first == j-1) used to print `j`, i.e. the
 * index one past the point actually owned; it now prints `first`. */
void print_decomp() {
    int i,j;
    int npoints = (int)count_points();
    for (i=0; i<omp_get_num_threads(); i++) {
        #pragma omp barrier
        if (i != omp_get_thread_num())
            continue;
        printf("%d\t", i);
        int first=-1;                 /* start of the open run, -1 = none */
        /* scan to npoints inclusive so the final run is flushed */
        for (j=0; j<npoints+1; j++) {
            if (j < npoints && point_to_thread[j] == i) {
                if (first == -1)
                    first = j;        /* open a new run */
            } else {
                if (first == -1)
                    continue;         /* no open run to close */
                if (first == j-1)
                    printf("%d ", first);          /* single point */
                else
                    printf("%d-%d ", first, j-1);  /* inclusive range */
                first = -1;
            }
        }
        printf("\n");
    }
}
#endif
/* Cyclically rotate three time-level pointers: *var2 <- *var1,
 * *var1 <- *var, *var <- old *var2. Advances the time levels of the
 * integration without copying any array data. */
void swap(double **var, double **var1, double **var2) {
    double *oldest = *var2;
    *var2 = *var1;
    *var1 = *var;
    *var = oldest;
}
/* Enforce the fixed boundary condition: zero at both ends of the grid. */
void set_boundaries(double *var) {
    const long last = count_points() - 1;
    var[0] = 0;
    var[last] = 0;
}
/*
 * Driver: integrate the 1-D shallow-water soliton with a leapfrog-style
 * scheme on three rotating time levels (h2/u2 oldest, h1/u1 middle,
 * h/u newest), bootstrapped by one half Euler step. Periodically reports
 * t and max(h) to stderr; prints total wall-clock runtime at the end.
 * With -DWRITE_DECOMP (requires OpenMP) also prints which thread owned
 * which grid points.
 *
 * Fixes vs. the original: standard-conforming signature (char **argv
 * instead of char *argv), the WRITE_DECOMP allocation is now checked,
 * the unused local `temp` is gone, and allocations are freed on exit.
 */
int main(int argc, char **argv){
    (void)argc;  /* no command-line options are parsed */
    (void)argv;
    long i;
    long npoints = count_points();
    h  = malloc(sizeof(double)*npoints);
    h1 = malloc(sizeof(double)*npoints);
    h2 = malloc(sizeof(double)*npoints);
    u  = malloc(sizeof(double)*npoints);
    u1 = malloc(sizeof(double)*npoints);
    u2 = malloc(sizeof(double)*npoints);
#ifdef WRITE_DECOMP
    point_to_thread = malloc(sizeof(int)*count_points());
    if (point_to_thread == NULL) {
        fprintf(stderr, "Insufficient memory");
        exit(23);
    }
#endif
    if(h==NULL || h1==NULL || h2==NULL || u==NULL || u1==NULL || u2==NULL){
        fprintf(stderr, "Insufficient memory");
        exit(23);
    }
#if 0
    signal(SIGINT, siginthandler);
#endif
    long maxi=((npoints)/2)+5;        /* crest index; bump starts mid-domain */
    long oldmaxi;
    long oldmaxi_output=(npoints)/2;  /* used by the disabled speed output */
    int maximth=20;                   /* update crest position every 20 steps */
    double t=0;
    long n=0;
    int outputmth=1000;  /* write every mth set of values to stderr */
    int dataqth=200;     /* write every qth set of values to output# */
    char file[60];       /* buffer for snapshot filenames (disabled block) */
    FILE* out;
    int first = 1;       /* first iteration must not rotate time levels */
    initialise_u(u2);
    initialise_h(h2);
    euler(h1,h2,u1,u2);  /* half Euler step gives the second time level */
#if 0
    out = fopen("t-1.dat","w");
    print_values(out,h1,u1);
    out = fopen("t-2.dat","w");
    print_values(out,h2,u2);
#endif
    time_t t_before = time(NULL);
    fprintf(stderr, "#time h_max\n");
    //while ( t < 150 ){
    while ( t < 50 ){
        if (!first) {
            /* rotate the three time levels: newest becomes middle, etc. */
            swap(&h, &h1, &h2);
            swap(&u, &u1, &u2);
        } else
            first = 0;
        set_boundaries(u);
        set_boundaries(h);
        /* leapfrog update of all interior points; each i is independent */
#pragma omp parallel for // schedule(dynamic, 10)
        for(i=0;i<npoints;i++){
#ifdef WRITE_DECOMP
            save_decomp(i);
#endif
            if (i == 0 || i == npoints-1)
                continue;  /* boundaries already fixed by set_boundaries */
            u[i]=u2[i]-(delta_t/delta_x)*lu(h1,u1,i);
            h[i]=h2[i]-(delta_t/delta_x)*lh(h1,u1,i);
        }
        t+=delta_t;
        if((n%maximth)==0){
            oldmaxi=maxi;
            maxi=findmax(h,oldmaxi,maximth);
            if((n%outputmth)==0){
                double maxh = maxval(h);
                fprintf(stderr, "%g\t%g\n", t, maxh);
#if 0
                fprintf(stderr, "%g \t",t);
                fprintf(stderr, "%g\t%g \t%g\n",get_x(maxi),h[maxi],ng_derivative(get_x(maxi),get_x(oldmaxi_output),outputmth*delta_t));
                oldmaxi_output=maxi;
#endif
            }
        }
#if 0
        if((n%dataqth)==0){
            sprintf(file,"output%04ld.dat",n/dataqth);
            out = fopen(file,"w");
            print_values(out,h,u);
        }
#endif
        n++;
    }
    time_t t_after = time(NULL);
    double duration = difftime(t_after, t_before);
    printf("Run took %g seconds.\n", duration);
    //print_values(stdout,h,u);
#ifdef WRITE_DECOMP
#pragma omp parallel
    print_decomp();
#endif
    free(h); free(h1); free(h2);
    free(u); free(u1); free(u2);
#ifdef WRITE_DECOMP
    free(point_to_thread);
#endif
    return 0;
}
---
title: "Parallelism"
---
### Tasks:
1. Revisit `schedule` and try `schedule(dynamic,100)` as in the hands-on and
   explain what happens and why that differs from `schedule(static, 100)`.
2. Parallelize the loop in `maxval`. Search for the comment
   `// HOMEWORK: Parallelise this loop!`. All information needed can be found
   in the lecture slides.
3. Do a strong-scaling experiment starting with 2 threads and up to 32
   threads and plot the result.
4. If you were to increase the number of threads, do you expect the speedup
   to continue indefinitely? If not, which limits can you imagine? Feel free
   to use kitchen metaphors.
---
title: "Parallelism"
author: "Georgiana Mania, Claudia Frauen, Jan Frederik Engels"
---
# Motivation
* We have a serial code and want to make it faster
* Plan of action:
* Cut problem into smaller pieces
* Use independent compute resources for each piece
* Outlook for next week: The individual computing element does no longer
get much faster, but there are more of them
## This lecture
* Is mostly about parallelism as a concept
* Next week: Complications due to existing hardware
# Our example problem
:::: {.columns}
::: {.column width="50%"}
* 1d Tsunami equation
* Korteweg–De Vries equation - "PDE modeling waves on shallow water surfaces" [Wikipedia](https://en.wikipedia.org/wiki/Korteweg%E2%80%93De_Vries_equation)
* Discretization not numerically accurate
:::
::: {.column width="50%"}
FIXME show plot of a soliton
:::
::::
# Our example problem
```c
while ( t < 50 ){
//[...]
for(i=0; i<npoints; i++){
//[...]
u[i]=u2[i]-(delta_t/delta_x)*lu(h1,u1,i);
h[i]=h2[i]-(delta_t/delta_x)*lh(h1,u1,i);
}
}
```
# Decomposing problem domains
## Our example problem domain
![](static/parallel_lecture_domain.png){width=75%}
## Other problem domains
:::: {.columns}
::: {.column width="50%"}
<br>
* Climate model (e.g. ICON) horizontal grid decomposition
:::
::: {.column width="50%"}
![](static/csm_icon_grid_single_noborder_decomp.png){width=100%}
::: {.tiny}
adapted from original image by MPI-M
:::
:::
::::
# Introducing OpenMP
* A popular way to parallelize code
* Pragma-based parallelization API
* You annotate your code with parallel regions and the compiler does the rest
```c++
#pragma omp parallel for
for (int i = 0; i < N; ++i)
a[i] = 2 * i;
```
# OpenMP threads
* OpenMP uses something called threads
* Wait until next week for a definition
:::: {.columns}
::: {.column width="50%"}
<br>
```c++
N = 8
#pragma omp parallel for
for (int i = 0; i < N; ++i)
a[i] = 2 * i;
```
:::
::: {.column width="50%"}
![](static/threads.jpg){width=100%}
:::
::::
# Hands-on Session! {background-color=var(--dark-bg-color) .leftalign}
1. Load the GNU compiler on Levante
```bash
module load gcc
```
2. Compile and run the serial example
```bash
gcc main.c -o serial.x -lm
time ./serial.x # use `time` to check the runtime
```
3. Compile and run the example using OpenMP
```bash
gcc -fopenmp main.c -o parallel.x -lm
export OMP_NUM_THREADS=2
time ./parallel.x
```
4. See next slide!
# Hands-on Session! {background-color=var(--dark-bg-color) .leftalign}
4. Now compile/run with
```bash
gcc -fopenmp main.c -o parallel.x -lm -DWRITE_DECOMP
export OMP_NUM_THREADS=4
time ./parallel.x
```
5. What does the additional output mean?
6. Now uncomment/adapt
* `schedule(static,100)`
* `schedule(static,10)`
and interpret the results.
# Scaling
Bake mini-pancake dessert
:::: {.columns}
::: {.column width="25%"}
![](static/pancakes_stack.png){width=100%}
:::
::: {.column width="25%" }
:::{.fragment}
![](static/one_pancake.png){width=100%}
:::
:::
::: {.column width="25%"}
:::{.fragment}
![](static/four_pans_cake.png){width=100%}
:::
:::
::: {.column width="25%"}
:::{.fragment}
![](static/four_pancakes.png){width=100%}
:::
:::
::::
:::{.info .tiny}
Images generated by Pradipta Samanta with DALL-E
:::
## Strong vs weak scaling
:::{.smaller}
Starting with 1 pan, we can (perfect) scale this by using $P$ pans in two ways:
:::
:::{.fragment}
|Parameter / Scaling type |Strong|Weak|
|-------|---|---|
|Resources <br> (e.g. pans) | $P$| $P$ |
|Total workload <br> (e.g. pancake count)| $N$ | $N \times P$ |
|Workload per worker <br> (e.g. pancakes per pan) | $N/P$ | $N$ |
|Total time | $T_1 \times N/P$ | $T_1 \times N \times P$ |
:::
<!--
* What happens if one uses more threads, but keep the problem size?
* Strong scaling
* What happens if one uses more threads, but also increases the problem size by
the same factor?
* Weak scaling
-->
# Aggregate parallel results
## What is happening here?
```c++
int a[] = {2, 4, 6};
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
## What is happening here?
```c++
int a[] = {2, 4, 6};
#pragma omp parallel for
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
[comment]: # (Can something go wrong?)
## Solution
```c++
int a[] = {2, 4, 6};
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
For other reduction operations like e.g. `max` see
[OpenMP documentation](https://www.openmp.org/spec-html/5.1/openmpsu117.html#x152-1720002.21.5)
# Doing stuff wrong
## What is going wrong here?
```c++
temp = 0;
#pragma omp parallel for
for (int i = 0; i < N; ++i) {
temp = 2 * a[i];
b[i] = temp + 4;
}
```
## Solution
```c++
temp = 0;
#pragma omp parallel for private(temp)
for (int i = 0; i < N; ++i) {
temp = 2 * a[i];
b[i] = temp + 4;
}
```
The problem is called "data race".
## Other common errors
* Race conditions
* The outcome of a program depends on the relative timing of multiple threads.
* Deadlocks
* Multiple threads wait for a resource that cannot be fulfilled.
* Starvation
* A thread is blocked indefinitely waiting on a resource.
# Finally: A definition of parallelism
<!--"Parallel computing is a type of computation in which many calculations or processes are carried out simultaneously."
Wikipedia
-->
"Parallel computing is the simultaneous use of multiple compute resources to solve a computational problem"
:::{.smaller}
-- *Introduction to Parallel Computing Tutorial, LLNL*
:::
## Types of parallelism
* Data-level parallelism
* what we've been discussing
* Task-level parallelism
* Example: Atmosphere ocean coupling
## Precondition for parallel execution
<br>
*"Two consecutive instructions or code segments can be executed in parallel if they are **independent**."*
<br>
:::{.fragment .info .smaller}
What does **dependence** mean?
:::
## Code and data dependence {.leftalign}
:::{.fragment .fade-in fragment-index=1}
* Data dependence - the instructions share the same data
:::
:::{.fragment .fade-in-then-semi-out fragment-index=2}
```c++
a = b;
c = a + b; // flow dependence
```
:::
:::{.fragment .fade-in fragment-index=3}
* Control dependence - execution order decided at runtime
:::
:::{.fragment .fade-in-then-semi-out fragment-index=4}
```c++
for (int i = 1; i < n ; i++) {
if (a[i-1] > a[i]) {
a[i] = a[i] + 1;
} else {
a[i] = 0;
}
}
```
:::
:::{.fragment .fade-in fragment-index=5}
* Resource dependence - share the same resource
:::
:::{.fragment .fade-in-then-semi-out fragment-index=6}
```c++
a = b;
b = 42; // write after read (anti-dependence): if reordered, a gets the new value
```
:::
## Bernstein's parallelism conditions {.alignleft}
For data dependence, use the Bernstein's conditions: *"the intersection between read-write set, write-read set and write-write set of instructions is null"*.
:::{.fragment}
```c++
c = a + b; // S1
d = a - b; // S2
```
:::
:::{.smaller}
:::: {.columns}
::: {.column width="50%"}
:::{.fragment}
Read and write sets for S1 and S2:
$$
R_1 = \{a,b\} ; W_1 = \{c\} \\
R_2 = \{a,b\} ; W_2 = \{d\} \\
$$
:::
:::
::: {.column width="50%"}
:::{.fragment}
Bernstein's conditions:
$$
R_1 \cap W_2 = \emptyset \\
W_1 \cap R_2 = \emptyset \\
W_1 \cap W_2 = \emptyset
$$
:::
:::
:::
::::
:::{.fragment}
S1 and S2 can be executed in parallel!
:::
:::{.notes}
How about these two? replace a in S2 with c
```c++
c = a + b; // S1
d = c - b; // S2
```
:::
## Bernstein's parallelism conditions {.alignleft}
:::{.fragment .semi-fade-out}
```c++
c = a + b; // S1
d = a - b; // S2
```
:::
:::{.fragment}
Replace `a` with `c` in statement 2
```c++
c = a + b; // S1
d = c - b; // S2
```
:::
:::{.smaller}
:::: {.columns}
::: {.column width="50%"}
:::{.fragment}
Read and write sets for S1 and S2:
$$
R_1 = \{a,b\} ; W_1 = \{c\} \\
R_2 = \{c,b\} ; W_2 = \{d\} \\
$$
:::
:::
::: {.column width="50%"}
:::{.fragment}
Bernstein's conditions:
$$
R_1 \cap W_2 = \emptyset \\
W_1 \cap R_2 = \{c\} \\
W_1 \cap W_2 = \emptyset
$$
:::
:::
:::
::::
:::{.fragment}
S1 and S2 can NOT be executed in parallel!
:::
## Best practices
* Parallelisation should not change the results! Exceptions to be discussed next week!
# Additional reading
* "Computer Architecture - A Quantitative Approach" - J. Hennessy and D. Patterson
* "Introduction to High Performance Computing for Scientists and Engineers" - G. Hager and G. Wellein
* "Introduction to Parallel Computing Tutorial" - [Online](https://hpc.llnl.gov/documentation/tutorials/introduction-parallel-computing-tutorial), Lawrence Livermore National Laboratory
lectures/parallelism/static/csm_icon_grid_16198badca.jpg

136 KiB

lectures/parallelism/static/csm_icon_grid_single_noborder_decomp.png

193 KiB

lectures/parallelism/static/four_pancakes.png

326 KiB

lectures/parallelism/static/four_pans_cake.png

352 KiB

lectures/parallelism/static/one_pancake.png

218 KiB

lectures/parallelism/static/pancakes_stack.png

198 KiB

lectures/parallelism/static/parallel_lecture_domain.png

19.6 KiB

lectures/parallelism/static/threads.jpg

23.3 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment