Skip to content
Snippets Groups Projects
Commit ce2b537b authored by Jan Frederik Engels's avatar Jan Frederik Engels :new_moon:
Browse files

Remove parallelism

"Nuke the entire site from orbit--it's the only way to be sure"
parent d37b4f3a
No related branches found
No related tags found
1 merge request!72Draft: Compute devices lecture
Pipeline #70256 passed
Showing with 2 additions and 785 deletions
......@@ -36,7 +36,7 @@ website:
- "lectures/user-experience/slides.qmd"
# - "lectures/testing/slides.qmd"
# - "lectures/git2/slides.qmd"
- "lectures/parallelism/slides.qmd"
# - "lectures/parallelism/slides.qmd"
- "lectures/hardware/slides.qmd"
# - "lectures/file-and-data-systems/slides.qmd"
# - "lectures/memory-hierarchies/slides.qmd"
......@@ -52,7 +52,7 @@ website:
- "exercises/user-experience.qmd"
# - "exercises/testing.qmd"
# - "exercises/git2.qmd"
- "exercises/parallelism/parallelism.qmd"
# - "exercises/parallelism/parallelism.qmd"
# - "exercises/hardware.qmd"
# - "exercises/file_and_data_systems.qmd"
# - "exercises/memory_hierarchies.qmd"
......
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#ifdef _OPENMP
#include <omp.h>
#else
#ifdef WRITE_DECOMP
#error You cannot write the decomposition without OpenMP.
#endif
#endif
//compile using (for example):
//gcc -lm -fopenmp -g main.c
//definition of constants
//some values are given in meter
//some in seconds
//
#define x_0 10000
#define delta_x 1
#define coeff_sech 0.01
//depth of sea at deepest point
#define b_0 1000
//gravitational acceleration
#define g 10
#define delta_t 0.001
double *h;
double *h1;
double *h2;
double *u;
double *u1;
double *u2;
int *point_to_thread;
/* Total number of grid points spanning the domain [-x_0, x_0).
 * Fix: the original cast the result to (int) before widening back to
 * long; for larger domains (x_0/delta_x) that narrows and overflows.
 * Cast directly to the return type instead. */
long count_points () {
    return (long)(2*x_0/delta_x);
}
/* Map grid index i to its x coordinate: x = i*delta_x shifted so the
 * domain is centred on 0, i.e. x in [-x_0, x_0). */
double get_x(long i){
    const double offset = x_0;
    return i*delta_x - offset;
}
/* Parabolic bathymetry: sea-floor elevation at grid index i.
 * Deepest (-b_0) at x = 0, rising to 0 at both shores x = +/- x_0. */
double waterdepth(long i){
    const double x = get_x(i);
    const double x0sq = x_0*x_0; /* fits in int: 1e8 */
    return b_0*(x*x/x0sq-1);
}
/* Initial surface elevation: a squared hyperbolic-secant bump centred
 * at x = 0 (sech(cx) = 2/(e^{cx}+e^{-cx})), truncated to exactly 0
 * where the amplitude drops below 1e-10. */
void initialise_h(double *h){
    const long n = count_points();
    for (long i = 0; i < n; i++) {
        const double x = get_x(i);
        const double sech = 2/(exp(coeff_sech*x)+exp(-coeff_sech*x));
        if (sech > 1e-10)
            h[i] = sech*sech;
        else
            h[i] = 0;
    }
}
/* Initial velocity field: fluid starts at rest everywhere. */
void initialise_u(double *u){
    const long n = count_points();
    for (long i = 0; i < n; i++)
        u[i] = 0;
}
/* Dump the full state as a tab-separated table (x, h, u, b) to `data`,
 * preceded by a gnuplot-style comment header line. */
void print_values(FILE *data, double *h, double *u){
    const long n = count_points();
    fprintf(data, "#x\th\tu\tb\n");
    for (long i = 0; i < n; i++)
        fprintf(data, "%g\t%g\t%g\t%g\n", get_x(i), h[i], u[i], waterdepth(i));
}
/* Spatial operator of the velocity equation at interior point i:
 * advection term u*du plus pressure-gradient term g*dh, both using
 * central differences (the 2*delta_x divisor is applied by callers). */
double lu(double *h, double *u, long i){
    const double du = u[i+1]-u[i-1];
    const double dh = h[i+1]-h[i-1];
    return u[i]*du+g*dh;
}
/* Spatial operator of the height equation at interior point i:
 * central differences of h, of the bathymetry b, and of u, combined as
 * u*(dh - db) + (h - b)*du; divisor 2*delta_x is applied by callers. */
double lh(double *h, double *u, long i){
    const double dh = h[i+1]-h[i-1];
    const double db = waterdepth(i+1)-waterdepth(i-1);
    const double du = u[i+1]-u[i-1];
    return u[i]*dh-u[i]*db+(h[i]-waterdepth(i))*du;
}
//sets h1,u1 using values (startconditions) from u2/h2
/* Bootstrap step: compute h1/u1 from the initial state h2/u2 with a
 * single first-order Euler step of half size (delta_t / (2*delta_x)),
 * holding both ends of the domain fixed at zero. */
void euler(double *h1, double *h2, double *u1, double *u2){
    const long n = count_points();
    const double factor = delta_t/(2.*delta_x);
    u1[0] = 0;      /* boundary value */
    h1[0] = 0;      /* boundary value */
    u1[n-1] = 0;
    h1[n-1] = 0;
    for (long i = 1; i < n-1; i++) {
        u1[i] = u2[i] - factor*lu(h2,u2,i);
        h1[i] = h2[i] - factor*lh(h2,u2,i);
    }
}
/* Track the wave crest: search for the index of the largest h inside a
 * window around the previous crest position oldmaxi (the crest cannot
 * travel far between calls, so a full scan is unnecessary).
 * NOTE(review): the window start oldmaxi-10 is not clamped to 0 —
 * presumably callers guarantee oldmaxi >= 10; verify. */
long findmax(double *h, long oldmaxi,long outputmth){
    const long start = oldmaxi - 10;
    const long stop = oldmaxi + (int)(outputmth/delta_x) + 30;
    const long n = count_points();
    long best = start;
    for (long i = start; i < stop && i < n; i++) {
        if (h[i] > h[best])
            best = i;
    }
    return best;
}
// Return the largest surface elevation over the whole grid (serial scan).
// Left byte-identical on purpose: this loop is the parallelisation
// exercise referenced by the homework sheet.
double maxval(double *h) {
long npoints = count_points();
long i;
// Sentinel below any expected h; assumes h never drops under -10 — TODO confirm.
double hmax = -10;
// HOMEWORK: Parallelise this loop!
for(i = 0; i < npoints; i++){
if (h[i] > hmax)
hmax = h[i];
}
return hmax;
}
/* Backward finite-difference estimate of a derivative:
 * (xi - xi_1) / deltax. Used to estimate the crest speed. */
double ng_derivative(double xi, double xi_1, double deltax){
    const double rise = xi - xi_1;
    return rise / deltax;
}
// SIGINT handler: dump the current state to stdout, then terminate.
// NOTE(review): fprintf/exit are not async-signal-safe; tolerable for a
// teaching code (and the signal() registration is currently under #if 0),
// but worth confirming before re-enabling.
void siginthandler(){
print_values(stdout,h,u);
exit(0);
}
#ifdef _OPENMP
// Record which OpenMP thread processed grid point i, for the
// decomposition printout done by print_decomp().
void save_decomp(long i) {
point_to_thread[i] = omp_get_thread_num();
}
/* Print, one line per OpenMP thread, the contiguous index ranges of grid
 * points that thread processed (as recorded via save_decomp).
 * Must be called from inside a parallel region: the loop over thread ids
 * plus the barrier serializes the output so each thread prints only its
 * own line, in thread-id order.
 * Fix: a run of length one (first == j-1) used to print `j`, i.e. the
 * index one past the point actually owned; it now prints `first`. */
void print_decomp() {
    int i,j;
    int npoints = (int)count_points();
    for (i=0; i<omp_get_num_threads(); i++) {
        #pragma omp barrier
        if (i != omp_get_thread_num())
            continue;
        printf("%d\t", i);
        int first=-1;                 /* start of the open run, -1 = none */
        /* scan to npoints inclusive so the final run is flushed */
        for (j=0; j<npoints+1; j++) {
            if (j < npoints && point_to_thread[j] == i) {
                if (first == -1)
                    first = j;        /* open a new run */
            } else {
                if (first == -1)
                    continue;         /* no open run to close */
                if (first == j-1)
                    printf("%d ", first);          /* single point */
                else
                    printf("%d-%d ", first, j-1);  /* inclusive range */
                first = -1;
            }
        }
        printf("\n");
    }
}
#endif
/* Cyclically rotate three time-level pointers: *var2 <- *var1,
 * *var1 <- *var, *var <- old *var2. Advances the time levels of the
 * integration without copying any array data. */
void swap(double **var, double **var1, double **var2) {
    double *oldest = *var2;
    *var2 = *var1;
    *var1 = *var;
    *var = oldest;
}
/* Enforce the fixed boundary condition: zero at both ends of the grid. */
void set_boundaries(double *var) {
    const long last = count_points() - 1;
    var[0] = 0;
    var[last] = 0;
}
/*
 * Driver: integrate the 1-D shallow-water soliton with a leapfrog-style
 * scheme on three rotating time levels (h2/u2 oldest, h1/u1 middle,
 * h/u newest), bootstrapped by one half Euler step. Periodically reports
 * t and max(h) to stderr; prints total wall-clock runtime at the end.
 * With -DWRITE_DECOMP (requires OpenMP) also prints which thread owned
 * which grid points.
 *
 * Fixes vs. the original: standard-conforming signature (char **argv
 * instead of char *argv), the WRITE_DECOMP allocation is now checked,
 * the unused local `temp` is gone, and allocations are freed on exit.
 */
int main(int argc, char **argv){
    (void)argc;  /* no command-line options are parsed */
    (void)argv;
    long i;
    long npoints = count_points();
    h  = malloc(sizeof(double)*npoints);
    h1 = malloc(sizeof(double)*npoints);
    h2 = malloc(sizeof(double)*npoints);
    u  = malloc(sizeof(double)*npoints);
    u1 = malloc(sizeof(double)*npoints);
    u2 = malloc(sizeof(double)*npoints);
#ifdef WRITE_DECOMP
    point_to_thread = malloc(sizeof(int)*count_points());
    if (point_to_thread == NULL) {
        fprintf(stderr, "Insufficient memory");
        exit(23);
    }
#endif
    if(h==NULL || h1==NULL || h2==NULL || u==NULL || u1==NULL || u2==NULL){
        fprintf(stderr, "Insufficient memory");
        exit(23);
    }
#if 0
    signal(SIGINT, siginthandler);
#endif
    long maxi=((npoints)/2)+5;        /* crest index; bump starts mid-domain */
    long oldmaxi;
    long oldmaxi_output=(npoints)/2;  /* used by the disabled speed output */
    int maximth=20;                   /* update crest position every 20 steps */
    double t=0;
    long n=0;
    int outputmth=1000;  /* write every mth set of values to stderr */
    int dataqth=200;     /* write every qth set of values to output# */
    char file[60];       /* buffer for snapshot filenames (disabled block) */
    FILE* out;
    int first = 1;       /* first iteration must not rotate time levels */
    initialise_u(u2);
    initialise_h(h2);
    euler(h1,h2,u1,u2);  /* half Euler step gives the second time level */
#if 0
    out = fopen("t-1.dat","w");
    print_values(out,h1,u1);
    out = fopen("t-2.dat","w");
    print_values(out,h2,u2);
#endif
    time_t t_before = time(NULL);
    fprintf(stderr, "#time h_max\n");
    //while ( t < 150 ){
    while ( t < 50 ){
        if (!first) {
            /* rotate the three time levels: newest becomes middle, etc. */
            swap(&h, &h1, &h2);
            swap(&u, &u1, &u2);
        } else
            first = 0;
        set_boundaries(u);
        set_boundaries(h);
        /* leapfrog update of all interior points; each i is independent */
#pragma omp parallel for // schedule(dynamic, 10)
        for(i=0;i<npoints;i++){
#ifdef WRITE_DECOMP
            save_decomp(i);
#endif
            if (i == 0 || i == npoints-1)
                continue;  /* boundaries already fixed by set_boundaries */
            u[i]=u2[i]-(delta_t/delta_x)*lu(h1,u1,i);
            h[i]=h2[i]-(delta_t/delta_x)*lh(h1,u1,i);
        }
        t+=delta_t;
        if((n%maximth)==0){
            oldmaxi=maxi;
            maxi=findmax(h,oldmaxi,maximth);
            if((n%outputmth)==0){
                double maxh = maxval(h);
                fprintf(stderr, "%g\t%g\n", t, maxh);
#if 0
                fprintf(stderr, "%g \t",t);
                fprintf(stderr, "%g\t%g \t%g\n",get_x(maxi),h[maxi],ng_derivative(get_x(maxi),get_x(oldmaxi_output),outputmth*delta_t));
                oldmaxi_output=maxi;
#endif
            }
        }
#if 0
        if((n%dataqth)==0){
            sprintf(file,"output%04ld.dat",n/dataqth);
            out = fopen(file,"w");
            print_values(out,h,u);
        }
#endif
        n++;
    }
    time_t t_after = time(NULL);
    double duration = difftime(t_after, t_before);
    printf("Run took %g seconds.\n", duration);
    //print_values(stdout,h,u);
#ifdef WRITE_DECOMP
#pragma omp parallel
    print_decomp();
#endif
    free(h); free(h1); free(h2);
    free(u); free(u1); free(u2);
#ifdef WRITE_DECOMP
    free(point_to_thread);
#endif
    return 0;
}
---
title: "Parallelism"
---
### Tasks:
1. Revisit `schedule` and try `schedule(dynamic,100)` as in the hands-on and
   explain what happens and why that differs from `schedule(static, 100)`.
2. Parallelize the loop in `maxval`. Search for the comment
   `// HOMEWORK: Parallelise this loop!`. All information needed can be found
   in the lecture slides.
3. Do a strong-scaling experiment starting with 2 threads and up to 32
   threads and plot the result.
4. If you were to increase the number of threads, do you expect the speedup
   to continue indefinitely? If not, which limits can you imagine? Feel free
   to use kitchen metaphors.
---
title: "Parallelism"
author: "Georgiana Mania, Claudia Frauen, Jan Frederik Engels"
---
# Motivation
* We have a serial code and want to make it faster
* Plan of action:
* Cut problem into smaller pieces
* Use independent compute resources for each piece
* Outlook for next week: The individual computing element does no longer
get much faster, but there are more of them
## This lecture
* Is mostly about parallelism as a concept
* Next week: Complications due to existing hardware
# Our example problem
:::: {.columns}
::: {.column width="50%"}
* 1d Tsunami equation
* Korteweg–De Vries equation - "PDE modeling waves on shallow water surfaces" [Wikipedia](https://en.wikipedia.org/wiki/Korteweg%E2%80%93De_Vries_equation)
* Discretization not numerically accurate
:::
::: {.column width="50%"}
FIXME show plot of a soliton
:::
::::
# Our example problem
```c
while ( t < 50 ){
//[...]
for(i=0; i<npoints; i++){
//[...]
u[i]=u2[i]-(delta_t/delta_x)*lu(h1,u1,i);
h[i]=h2[i]-(delta_t/delta_x)*lh(h1,u1,i);
}
}
```
# Decomposing problem domains
## Our example problem domain
![](static/parallel_lecture_domain.png){width=75%}
## Other problem domains
:::: {.columns}
::: {.column width="50%"}
<br>
* Climate model (e.g. ICON) horizontal grid decomposition
:::
::: {.column width="50%"}
![](static/csm_icon_grid_single_noborder_decomp.png){width=100%}
::: {.tiny}
adapted from original image by MPI-M
:::
:::
::::
# Introducing OpenMP
* A popular way to parallelize code
* Pragma-based parallelization API
* You annotate your code with parallel regions and the compiler does the rest
```c++
#pragma omp parallel for
for (int i = 0; i < N; ++i)
a[i] = 2 * i;
```
# OpenMP threads
* OpenMP uses something called threads
* Wait until next week for a definition
:::: {.columns}
::: {.column width="50%"}
<br>
```c++
N = 8
#pragma omp parallel for
for (int i = 0; i < N; ++i)
a[i] = 2 * i;
```
:::
::: {.column width="50%"}
![](static/threads.jpg){width=100%}
:::
::::
# Hands-on Session! {background-color=var(--dark-bg-color) .leftalign}
1. Load the GNU compiler on Levante
```bash
module load gcc
```
2. Compile and run the serial example
```bash
gcc main.c -o serial.x -lm
time ./serial.x # use `time` to check the runtime
```
3. Compile and run the example using OpenMP
```bash
gcc -fopenmp main.c -o parallel.x -lm
export OMP_NUM_THREADS=2
time ./parallel.x
```
4. See next slide!
# Hands-on Session! {background-color=var(--dark-bg-color) .leftalign}
4. Now compile/run with
```bash
gcc -fopenmp main.c -o parallel.x -lm -DWRITE_DECOMP
export OMP_NUM_THREADS=4
time ./parallel.x
```
5. What does the additional output mean?
6. Now uncomment/adapt
* `schedule(static,100)`
* `schedule(static,10)`
and interpret the results.
# Scaling
Bake mini-pancake dessert
:::: {.columns}
::: {.column width="25%"}
![](static/pancakes_stack.png){width=100%}
:::
::: {.column width="25%" }
:::{.fragment}
![](static/one_pancake.png){width=100%}
:::
:::
::: {.column width="25%"}
:::{.fragment}
![](static/four_pans_cake.png){width=100%}
:::
:::
::: {.column width="25%"}
:::{.fragment}
![](static/four_pancakes.png){width=100%}
:::
:::
::::
:::{.info .tiny}
Images generated by Pradipta Samanta with DALL-E
:::
## Strong vs weak scaling
:::{.smaller}
Starting with 1 pan, we can (perfect) scale this by using $P$ pans in two ways:
:::
:::{.fragment}
|Parameter / Scaling type |Strong|Weak|
|-------|---|---|
|Resources <br> (e.g. pans) | $P$| $P$ |
|Total workload <br> (e.g. pancake count)| $N$ | $N \times P$ |
|Workload per worker <br> (e.g. pancakes per pan) | $N/P$ | $N$ |
|Total time | $T_1 \times N/P$ | $T_1 \times N \times P$ |
:::
<!--
* What happens if one uses more threads, but keep the problem size?
* Strong scaling
* What happens if one uses more threads, but also increases the problem size by
the same factor?
* Weak scaling
-->
# Aggregate parallel results
## What is happening here?
```c++
int a[] = {2, 4, 6};
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
## What is happening here?
```c++
int a[] = {2, 4, 6};
#pragma omp parallel for
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
[comment]: # (Can something go wrong?)
## Solution
```c++
int a[] = {2, 4, 6};
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; ++i)
sum = sum + a[i];
```
For other reduction operations like e.g. `max` see
[OpenMP documentation](https://www.openmp.org/spec-html/5.1/openmpsu117.html#x152-1720002.21.5)
# Doing stuff wrong
## What is going wrong here?
```c++
temp = 0;
#pragma omp parallel for
for (int i = 0; i < N; ++i) {
temp = 2 * a[i];
b[i] = temp + 4;
}
```
## Solution
```c++
temp = 0;
#pragma omp parallel for private(temp)
for (int i = 0; i < N; ++i) {
temp = 2 * a[i];
b[i] = temp + 4;
}
```
The problem is called "data race".
## Other common errors
* Race conditions
* The outcome of a program depends on the relative timing of multiple threads.
* Deadlocks
* Multiple threads wait for a resource that cannot be fulfilled.
* Starvation
* A thread is blocked indefinitely waiting on a resource.
# Finally: A definition of parallelism
<!--"Parallel computing is a type of computation in which many calculations or processes are carried out simultaneously."
Wikipedia
-->
"Parallel computing is the simultaneous use of multiple compute resources to solve a computational problem"
:::{.smaller}
-- *Introduction to Parallel Computing Tutorial, LLNL*
:::
## Types of parallelism
* Data-level parallelism
* what we've been discussing
* Task-level parallelism
* Example: Atmosphere ocean coupling
## Precondition for parallel execution
<br>
*"Two consecutive instructions or code segments can be executed in parallel if they are **independent**."*
<br>
:::{.fragment .info .smaller}
What does **dependence** mean?
:::
## Code and data dependence {.leftalign}
:::{.fragment .fade-in fragment-index=1}
* Data dependence - the instructions share the same data
:::
:::{.fragment .fade-in-then-semi-out fragment-index=2}
```c++
a = b;
c = a + b; // flow dependence
```
:::
:::{.fragment .fade-in fragment-index=3}
* Control dependence - execution order decided at runtime
:::
:::{.fragment .fade-in-then-semi-out fragment-index=4}
```c++
for (int i = 1; i < n ; i++) {
if (a[i-1] > a[i]) {
a[i] = a[i] + 1;
} else {
a[i] = 0;
}
}
```
:::
:::{.fragment .fade-in fragment-index=5}
* Resource dependence - share the same resource
:::
:::{.fragment .fade-in-then-semi-out fragment-index=6}
```c++
a = b;
b = 42; // write after read (anti-dependence): if reordered, a gets the new value
```
:::
## Bernstein's parallelism conditions {.alignleft}
For data dependence, use the Bernstein's conditions: *"the intersection between read-write set, write-read set and write-write set of instructions is null"*.
:::{.fragment}
```c++
c = a + b; // S1
d = a - b; // S2
```
:::
:::{.smaller}
:::: {.columns}
::: {.column width="50%"}
:::{.fragment}
Read and write sets for S1 and S2:
$$
R_1 = \{a,b\} ; W_1 = \{c\} \\
R_2 = \{a,b\} ; W_2 = \{d\} \\
$$
:::
:::
::: {.column width="50%"}
:::{.fragment}
Bernstein's conditions:
$$
R_1 \cap W_2 = \emptyset \\
W_1 \cap R_2 = \emptyset \\
W_1 \cap W_2 = \emptyset
$$
:::
:::
:::
::::
:::{.fragment}
S1 and S2 can be executed in parallel!
:::
:::{.notes}
How about these two? replace a in S2 with c
```c++
c = a + b; // S1
d = c - b; // S2
```
:::
## Bernstein's parallelism conditions {.alignleft}
:::{.fragment .semi-fade-out}
```c++
c = a + b; // S1
d = a - b; // S2
```
:::
:::{.fragment}
Replace `a` with `c` in statement 2
```c++
c = a + b; // S1
d = c - b; // S2
```
:::
:::{.smaller}
:::: {.columns}
::: {.column width="50%"}
:::{.fragment}
Read and write sets for S1 and S2:
$$
R_1 = \{a,b\} ; W_1 = \{c\} \\
R_2 = \{c,b\} ; W_2 = \{d\} \\
$$
:::
:::
::: {.column width="50%"}
:::{.fragment}
Bernstein's conditions:
$$
R_1 \cap W_2 = \emptyset \\
W_1 \cap R_2 = \{c\} \\
W_1 \cap W_2 = \emptyset
$$
:::
:::
:::
::::
:::{.fragment}
S1 and S2 can NOT be executed in parallel!
:::
## Best practices
* Parallelisation should not change the results! Exceptions to be discussed next week!
# Additional reading
* "Computer Architecture - A Quantitative Approach" - J. Hennessy and D. Patterson
* "Introduction to High Performance Computing for Scientists and Engineers" - G. Hager and G. Wellein
* "Introduction to Parallel Computing Tutorial" - [Online](https://hpc.llnl.gov/documentation/tutorials/introduction-parallel-computing-tutorial), Lawrence Livermore National Laboratory
lectures/parallelism/static/csm_icon_grid_16198badca.jpg

136 KiB

lectures/parallelism/static/csm_icon_grid_single_noborder_decomp.png

193 KiB

lectures/parallelism/static/four_pancakes.png

326 KiB

lectures/parallelism/static/four_pans_cake.png

352 KiB

lectures/parallelism/static/one_pancake.png

218 KiB

lectures/parallelism/static/pancakes_stack.png

198 KiB

lectures/parallelism/static/parallel_lecture_domain.png

19.6 KiB

lectures/parallelism/static/threads.jpg

23.3 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment