diff --git a/_quarto.yml b/_quarto.yml index 1d12eb26daa5455f691e66652fba2488ff5c2b65..dd0f9f57d281ede6b55cc660c51255816b6ce695 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -41,7 +41,7 @@ website: - "lectures/parallelism/slides.qmd" - "lectures/hardware/slides.qmd" - "lectures/file-and-data-systems/slides.qmd" - # - "lectures/memory-hierarchies/slides.qmd" + - "lectures/memory-hierarchies/slides.qmd" # - "lectures/student-talks/slides.qmd" - section: "Exercises" contents: @@ -57,7 +57,7 @@ website: - "exercises/parallelism/parallelism.qmd" - "exercises/hardware/hardware.qmd" - "exercises/file-and-data-systems.qmd" - # - "exercises/memory_hierarchies.qmd" + - "exercises/memory-hierarchies.qmd" # - "exercises/student_talks.qmd" format: diff --git a/exercises/memory-hierarchies.qmd b/exercises/memory-hierarchies.qmd new file mode 100644 index 0000000000000000000000000000000000000000..d31771fc73f1a60485417080c93453e350a75956 --- /dev/null +++ b/exercises/memory-hierarchies.qmd @@ -0,0 +1,11 @@ +--- +title: "Memory Hierarchies" +--- + +Review the memory mountain hands-on and look into the source code: +In `main()` the function `run()` is called for each combination of `size` and `stride` +which in turn calls `fcyc2()` and `fcyc2_full()`. + +What happens there to ensure that only the desired cache effects for the test function `f` are measured? +Describe the mechanism and why it is necessary for accurate measurements in a short paragraph. +Refer to the code and what you learned in the lecture about it. 
diff --git a/lectures/memory-hierarchies/slides.qmd b/lectures/memory-hierarchies/slides.qmd new file mode 100644 index 0000000000000000000000000000000000000000..4c2c046c1388417a2a4f9048a507a3728091abb8 --- /dev/null +++ b/lectures/memory-hierarchies/slides.qmd @@ -0,0 +1,666 @@ +--- +title: "Memory Hierarchies" +author: "Dominik Zobel and Florian Ziemen" +--- + +# Memory Hierarchies + + - Background + - Why you should care + - How to use it to your advantage + + + +## Intended Takeaways + + - Locality matters: data-centric view + - Think workbench: Operating with parts of the data + - Processor tries to be busy all the time (prefetching) + - Latency and memory sizes of components + + + +## Questions {.handson .incremental} + + - Why not keep everything in memory? + - What to do with really big data? + - How to speed up processing data? + + + +## Memory Pyramid (upwards) + + +:::{.r-stack} + +{.fragment width=70% fragment-index=1} + +{.fragment width=70% fragment-index=2} + +{.fragment width=70% fragment-index=3} + +{.fragment width=70% fragment-index=4} + +{.fragment width=70% fragment-index=5} + +{.fragment width=70% fragment-index=6} + +::: + + + +# Speed and access time + +## Processor speed vs. main memory speed + +{width=70%} + + + +:::{.smaller} +Based on figure from "Computer Architecture" by _J. Hennessy_ and _D. 
Patterson_ +::: + + + +## Disk I/O timings {.leftalign} + +<!-- +File `data.py`: + +```{.python} +import time +import pickle +import numpy as np + + +def create_random_data(): + np.random.seed(3922) + start = time.perf_counter() + data = np.random.randint(0, 2**20, size=(128, 128, 128, 128)) + end = time.perf_counter() + print('{:10.5f}: Create "random" data'.format(end-start)) + return data + + +def create_42_data(): + start = time.perf_counter() + data = np.full((128, 128, 128, 128), 42) + end = time.perf_counter() + print('{:10.5f}: Create "42" data'.format(end-start)) + return data + + +def store_data(filename, data, dataname): + start = time.perf_counter() + with open(filename, 'wb') as outfile: + pickle.dump(data, outfile) + + end = time.perf_counter() + print('{:10.5f}: Store "{:s}" data'.format(end-start, dataname)) + + +def load_data(filename, dataname): + start = time.perf_counter() + with open(filename, 'rb') as infile: + data = pickle.load(infile) + + end = time.perf_counter() + print('{:10.5f}: Load "{:s}" data'.format(end-start, dataname)) + return data + + +def operate_on_data(data, dataname): + start = time.perf_counter() + new_data = data + 1.1 + end = time.perf_counter() + print('{:10.5f}: Operate on "{:s}" data'.format(end-start, dataname)) + return new_data +``` + +File `save_it.py`: + +```{.python} +from data import * + +dataname = 'random' +data = create_random_data() +store_data(filename='temp01.dat', + data=data, dataname=dataname) + +dataname = '42' +data = create_42_data() +store_data(filename='temp02.dat', + data=data, dataname=dataname) +``` + +File `from_disk.py`: + +```{.python} +from data import * + +dataname = 'random' +data = load_data(filename='temp01.dat', + dataname=dataname) +new_data = operate_on_data(data=data, + dataname=dataname) + +dataname = '42' +data = load_data(filename='temp02.dat', + dataname=dataname) +new_data = operate_on_data(data=data, + dataname=dataname) +``` + +File `in_memory.py`: + +```{.python} +from data 
import * + +dataname = 'random' +data = create_random_data() +new_data = operate_on_data(data=data, + dataname=dataname) + +dataname = '42' +data = create_42_data() +new_data = operate_on_data(data=data, + dataname=dataname) +``` +--> + +| | Levante (Fixed) | Laptop (Fixed) | Levante (Random) | Laptop (Random) | +| -------------------------- | ----------------- | ----------------- | ----------------- | ----------------- | +| Create data | 0.56 | 0.23 | 1.66 | 0.88 | +| Store data | 2.23 | 4.04 | 2.23 | 3.44 | +| Load data | 0.76$^*$ | 0.88$^*$ | 0.76$^*$ | 0.92$^*$ | +| Process data | 0.76 | 0.38 | 0.76 | 0.37 | + +:::{.smaller} +Time in seconds using a 2 GB numpy array ($128 \times 128 \times 128 \times 128$) either with a fixed number or random number in each entry + +$^*$: Lower than one due to caching effects +::: + + + +## Disk I/O {.leftalign} + + - Reading/writing to file is rather expensive + - If necessary during computation, try doing it asynchronously + - If possible, keep data in memory + + + +## Memory access patterns + +Execution speed depends on data layout in memory + +```{.fortranfree} +program loop_exchange + implicit none + integer, parameter :: nel = 20000 + ! Note: Stay below 46000 to prevent overflows below + integer, dimension(nel, nel) :: mat + integer :: i, j + + do i = 1, nel + do j = 1, nel + mat(j, i) = (i-1)*nel + j-1 + end do + end do +end program loop_exchange +``` + +Loop order with optimal access of elements (contiguous memory). + + + +## Hands-On {.handson} + +1. Compile the Fortran code from the previous slide (also [here](static/loops.f90)) + on Levante or your PC. On Levante load the `gcc` module first (`module load gcc`). + Then measure the time needed to run the program, i.e. + +```{.Bash} +gfortran loops.f90 -o loops +time ./loops +``` + +2. Exchange the loop variables `i` and `j` in line 8 and 9 and compile again. + How does it impact the run time? + +3. 
Try different values for `nel` (for the original loop order and the exchanged one). + How does the matrix size relate to the effect of exchanged loops? + + + + +# Memory Models + + - Study effect of latencies, cache sizes, block sizes, ... + - Here just focus on latency + + +## First model version + +One layer of RAM cache between the CPU and the disk. + +:::{.r-stack} + +{.fragment width=100% fragment-index=1} + +{.fragment width=100% fragment-index=2} + +{.fragment width=100% fragment-index=3} + +::: + + +## Memory access time for first version + +| Cache | Access Time | Hit Ratio | +| ------ | ------------ | ---------- | +| Memory | $T_M$ | $H_M$ | +| Disk | $T_D$ | | + + - Parallel and serial requests possible + +:::{.smaller} +\begin{align} + T_{avg,p} &= H_M T_M + (1-H_M) \cdot \color{blue}{T_D}\\ + T_{avg,s} &= H_M T_M + (1-H_M) \cdot \color{blue}{(T_M + T_D)} +\end{align} +::: + + +## Second model version + +Three layers of caches + +:::{.r-stack} + +{.fragment width=100% fragment-index=1} + +{.fragment width=100% fragment-index=2} + +::: + + + +## Memory access time for second version (1/2) + +| Cache | Access Time | Hit Ratio | +| ------ | ------------ | ---------- | +| $L_1$ | $T_1$ | $H_1$ | +| $L_2$ | $T_2$ | $H_2$ | +| $L_3$ | $T_3$ | | + + + +## Memory access time for second version (2/2) + + - Average memory access time $T_{avg,p}$ for parallel access (processor connected to all caches) + +:::{.smaller} +\begin{align} +T_{avg,p} &= H_1 T_1 + ((1-H_1)\cdot H_2)\cdot \color{blue}{T_2}\\ + &+ ((1-H_1)\cdot(1-H_2))\cdot \color{blue}{T_3} +\end{align} +::: + + - Average memory access time $T_{avg,s}$ for serial access + +:::{.smaller} +\begin{align} +T_{avg,s} &= H_1 T_1 + ((1-H_1)\cdot H_2)\cdot \color{blue}{(T_1+T_2)}\\ + &+ ((1-H_1)\cdot(1-H_2))\cdot \color{blue}{(T_1+T_2+T_3)} +\end{align} +::: + + + +# Processor techniques + + +## The general view + + +:::{.r-stack} + +{.fragment width=100% fragment-index=1} + +{.fragment width=100% 
fragment-index=2} + +{.fragment width=100% fragment-index=3} + +::: + + + +## If data is not available in the current memory level {.leftalign} + + - register spilling + +:::{.smaller} +_Register has to look for the data in the L1 cache_ +::: + + - cache miss + +:::{.smaller} +_The current cache has to fetch the data from the next cache or main memory_ +::: + + - page fault + +:::{.smaller} +_Data was not found in main memory and has to be loaded from disk_ +::: + + + +## Requesting unavailable data + +:::{.r-stack} + +{.fragment width=100% fragment-index=1} + +{.fragment width=100% fragment-index=2} + +{.fragment width=100% fragment-index=3} + +{.fragment width=100% fragment-index=4} + +{.fragment width=100% fragment-index=5} + +::: + + +::::::::{.columns .leftalign} + +:::{.column width=50%} + +:::{.fragment width=100% fragment-index=2} + + - Sending request + +::: + +:::{.fragment width=100% fragment-index=3} + + - If needed, forward request until found + +::: + +::: + +:::{.column width=50%} + +:::{.fragment width=100% fragment-index=4} + + - Load data into cache(s) + +::: + +:::{.fragment width=100% fragment-index=5} + + - Process available data + +::: + +::: + +:::::::: + + +## Provide data which might be needed {.leftalign} + + - caching + +:::{.smaller} +_Keep data around which was needed once_ +::: + + - prefetching + +:::{.smaller} +_Load data which might be needed soon (spatial or temporal proximity, heuristics)_ +::: + + - branch prediction + +:::{.smaller} +_Similar to prefetching, load data needed for different code paths_ +::: + + +## Use cached data + +:::{.r-stack} + +{.fragment width=100% fragment-index=1} + +{.fragment width=100% fragment-index=2} + +{.fragment width=100% fragment-index=3} + +::: + +::::::::{.columns .leftalign} + +:::{.column width=50%} + +:::{.fragment width=100% fragment-index=2} + + - Sending request + - Data is already present + +::: + +::: + +:::{.column width=50%} + +:::{.fragment width=100% fragment-index=3} + + - Load 
data into cache + - Process it + +::: + +::: + +:::::::: + + + +# Memory hierarchy on Levante + +## Memory Pyramid (downwards) {auto-animate=true} + +Based on a typical Levante node (AMD EPYC 7763) + +- Base frequency: 2.45 GHz + + + +## Memory Pyramid (downwards) {auto-animate=true} + +Based on a typical Levante node (AMD EPYC 7763) + +<table data-auto-animate-target="memtbl"><thead> +<tr class="header"><th data-id="h11"></th><th data-id="h12" style="text-align: left;">Latency</th><th data-id="h13" style="text-align: left;">Capacity</th></tr> +</thead><tbody> +<tr class="odd"><td data-id="c11">Register</td><td data-id="c12" style="text-align: left;">~0.4 ns</td><td data-id="c13" style="text-align: left;">1 KB</td></tr> +</tbody></table> + +:::{.incremental} + + - L1-L3 Cache are a few times slower + +::: + + +## Memory Pyramid (downwards) {auto-animate=true} + +Based on a typical Levante node (AMD EPYC 7763) + +<table data-auto-animate-target="memtbl"><thead> +<tr class="header"><th data-id="h11"></th><th data-id="h12" style="text-align: left;">Latency</th><th data-id="h13" style="text-align: left;">Capacity</th></tr> +</thead><tbody> +<tr class="odd"><td data-id="c11">Register</td><td data-id="c12" style="text-align: left;">~0.4 ns</td><td data-id="c13" style="text-align: left;">1 KB</td></tr> +<tr class="even"><td data-id="c21">L1 Cache</td><td data-id="c22" style="text-align: left;">~1 ns</td><td data-id="c22" style="text-align: left;">32 KB</td></tr> +<tr class="odd"><td data-id="c31">L2 Cache</td><td data-id="c32" style="text-align: left;">a few ns</td><td data-id="c33" style="text-align: left;">512 KB</td></tr> +<tr class="even"><td data-id="c41">L3 Cache</td><td data-id="c42" style="text-align: left;">~10 ns</td><td data-id="c43" style="text-align: left;">32 MB</td></tr> +</tbody></table> + +:::{.incremental} + + - 256 GB of main memory (default) with a theoretical memory bandwidth of ~200 GB/s + +::: + + + +## Memory Pyramid (downwards) {auto-animate=true} 
+ +Based on a typical Levante node (AMD EPYC 7763) + +<table data-auto-animate-target="memtbl"><thead> +<tr class="header"><th data-id="h11"></th><th data-id="h12" style="text-align: left;">Latency</th><th data-id="h13" style="text-align: left;">Capacity</th></tr> +</thead><tbody> +<tr class="odd"><td data-id="c11">Register</td><td data-id="c12" style="text-align: left;">~0.4 ns</td><td data-id="c13" style="text-align: left;">1 KB</td></tr> +<tr class="even"><td data-id="c21">L1 Cache</td><td data-id="c22" style="text-align: left;">~1 ns</td><td data-id="c22" style="text-align: left;">32 KB</td></tr> +<tr class="odd"><td data-id="c31">L2 Cache</td><td data-id="c32" style="text-align: left;">a few ns</td><td data-id="c33" style="text-align: left;">512 KB</td></tr> +<tr class="even"><td data-id="c41">L3 Cache</td><td data-id="c42" style="text-align: left;">~10 ns</td><td data-id="c43" style="text-align: left;">32 MB</td></tr> +<tr class="odd"><td data-id="c51">Main Memory</td><td data-id="c52" style="text-align: left;">10s of ns</td><td data-id="c53" style="text-align: left;">256 GB</td></tr> +</tbody></table> + +:::{.incremental} + + - Fast Data as Flash based file system + +::: + + + +## Memory Pyramid (downwards) {auto-animate=true} + +Based on a typical Levante node (AMD EPYC 7763) + +<table data-auto-animate-target="memtbl"><thead> +<tr class="header"><th data-id="h11"></th><th data-id="h12" style="text-align: left;">Latency</th><th data-id="h13" style="text-align: left;">Capacity</th></tr> +</thead><tbody> +<tr class="odd"><td data-id="c11">Register</td><td data-id="c12" style="text-align: left;">~0.4 ns</td><td data-id="c13" style="text-align: left;">1 KB</td></tr> +<tr class="even"><td data-id="c21">L1 Cache</td><td data-id="c22" style="text-align: left;">~1 ns</td><td data-id="c22" style="text-align: left;">32 KB</td></tr> +<tr class="odd"><td data-id="c31">L2 Cache</td><td data-id="c32" style="text-align: left;">a few ns</td><td data-id="c33" 
style="text-align: left;">512 KB</td></tr> +<tr class="even"><td data-id="c41">L3 Cache</td><td data-id="c42" style="text-align: left;">~10 ns</td><td data-id="c43" style="text-align: left;">32 MB</td></tr> +<tr class="odd"><td data-id="c51">Main Memory</td><td data-id="c52" style="text-align: left;">10s of ns</td><td data-id="c53" style="text-align: left;">256 GB</td></tr> +<tr class="even"><td data-id="c61">SSD</td><td data-id="c62" style="text-align: left;">100s of µs</td><td data-id="c63" style="text-align: left;">200 TB</td></tr> +</tbody></table> + +:::{.incremental} + + - File system at Levante ~130 PB, limited by quota for project + +::: + + + +## Memory Pyramid (downwards) {auto-animate=true} + +Based on a typical Levante node (AMD EPYC 7763) + +<table data-auto-animate-target="memtbl"><thead> +<tr class="header"><th data-id="h11"></th><th data-id="h12" style="text-align: left;">Latency</th><th data-id="h13" style="text-align: left;">Capacity</th></tr> +</thead><tbody> +<tr class="odd"><td data-id="c11">Register</td><td data-id="c12" style="text-align: left;">~0.4 ns</td><td data-id="c13" style="text-align: left;">1 KB</td></tr> +<tr class="even"><td data-id="c21">L1 Cache</td><td data-id="c22" style="text-align: left;">~1 ns</td><td data-id="c22" style="text-align: left;">32 KB</td></tr> +<tr class="odd"><td data-id="c31">L2 Cache</td><td data-id="c32" style="text-align: left;">a few ns</td><td data-id="c33" style="text-align: left;">512 KB</td></tr> +<tr class="even"><td data-id="c41">L3 Cache</td><td data-id="c42" style="text-align: left;">~10 ns</td><td data-id="c43" style="text-align: left;">32 MB</td></tr> +<tr class="odd"><td data-id="c51">Main Memory</td><td data-id="c52" style="text-align: left;">10s of ns</td><td data-id="c53" style="text-align: left;">256 GB</td></tr> +<tr class="even"><td data-id="c61">SSD</td><td data-id="c62" style="text-align: left;">100s of µs</td><td data-id="c63" style="text-align: left;">200 TB</td></tr> +<tr 
class="odd"><td data-id="c71">Hard disk</td><td data-id="c72" style="text-align: left;">a few ms</td><td data-id="c73" style="text-align: left;">130 PB</td></tr> +</tbody></table> + + + +## Memory Mountain (1/2) + +:::{.smaller} + +Code for program contained in "Computer Systems": + +<https://csapp.cs.cmu.edu/3e/mountain.tar> + +::: + + - Process a representative amount of data + - Use `stride` between array elements to control spatial locality + - Use `size` of array to control temporal locality + - Also warm up the cache before the actual measurements + + + +## Memory Mountain (2/2) + +:::{r-stack} + +{width=70%} + +::: + +:::{.smaller} +$\approx$ Factor 20 between best and worst access +::: + + +## Hands-On {.handson} + +1. Download and extract the C source code of the memory mountain program linked on the previous slides +2. Compile the program and run it on your PC or a Levante compute node +3. Which factor do you get between best and worst performance? +4. (optional) Visualize your results + + + +## Different architectures + + - Different caches are available + - Speed and size of caches varies + - Basic understanding helps in all cases + - Hardware-specific knowledge allows additional fine-tuning + + + +## Memory on Levante GPUs + + - For a NVIDIA A100 80GB GPU (4x in a Levante GPU node) + - Register and L1 Cache for one (of 108) Streaming Multiprocessor of a GPU + +| | Latency | Capacity | +| -------------------- | ------------ | ---------- | +| Register | ~1 ns | 4 x 64 KB | +| L1 Cache | a few ns | 192 KB | +| L2 Cache (shared) | ~10 ns | 40 MB | +| Main Memory (HBM2e) | 10s of ns | 80 GB | + + + +# Summary + +## Observations + + - Gap between processor and memory speeds. + Hierarchy needed because of discrepancy between speed of CPU and (main) memory + + + - Exploit accessing data and code stored close to each other (temporal and spatial locality) + + + +# Resources {.leftalign} + + - "Computer Systems: A Programmer's Perspective" by _R. 
Bryant_ and _D. O'Hallaron_, Pearson + - "Computer Architecture" by _J. Hennessy_ and _D. Patterson_, O'Reilly diff --git a/lectures/memory-hierarchies/static/concepts_model01.png b/lectures/memory-hierarchies/static/concepts_model01.png new file mode 100644 index 0000000000000000000000000000000000000000..1f1adfa89a1353dda0a0f3acc64677dbd6750318 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model01.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model02.png b/lectures/memory-hierarchies/static/concepts_model02.png new file mode 100644 index 0000000000000000000000000000000000000000..577fa48cba1c87392bbf4e634c9cf7f54eb7fc87 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model02.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model03.png b/lectures/memory-hierarchies/static/concepts_model03.png new file mode 100644 index 0000000000000000000000000000000000000000..9422569d12d5f3aafbeaea45a8af2715bcdfd8d1 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model03.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model04.png b/lectures/memory-hierarchies/static/concepts_model04.png new file mode 100644 index 0000000000000000000000000000000000000000..e4fedd5f70c16e89c0c814599747340c394f7426 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model04.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model05.png b/lectures/memory-hierarchies/static/concepts_model05.png new file mode 100644 index 0000000000000000000000000000000000000000..b7f907051530386d430f3ba1c254de9a28f00444 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model05.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model06.png b/lectures/memory-hierarchies/static/concepts_model06.png new file mode 100644 index 0000000000000000000000000000000000000000..bce684a372a34f28733bb345485b08e28ba60c6f Binary 
files /dev/null and b/lectures/memory-hierarchies/static/concepts_model06.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model07.png b/lectures/memory-hierarchies/static/concepts_model07.png new file mode 100644 index 0000000000000000000000000000000000000000..45568d35c632118a7aed775713d3544ec5e1251c Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model07.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model08.png b/lectures/memory-hierarchies/static/concepts_model08.png new file mode 100644 index 0000000000000000000000000000000000000000..2be4e16604f4543cd00f2c32a8295c758a709f10 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model08.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model09.png b/lectures/memory-hierarchies/static/concepts_model09.png new file mode 100644 index 0000000000000000000000000000000000000000..fd3271d7ac11256a574b9ffa8e3e15d3b49e5e4e Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model09.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model10.png b/lectures/memory-hierarchies/static/concepts_model10.png new file mode 100644 index 0000000000000000000000000000000000000000..c5df10c96c145b2618d9d36f79f648fc719479f0 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model10.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model11.png b/lectures/memory-hierarchies/static/concepts_model11.png new file mode 100644 index 0000000000000000000000000000000000000000..5f2e0de92a6ea1e716484fc48f55b47468f8ea37 Binary files /dev/null and b/lectures/memory-hierarchies/static/concepts_model11.png differ diff --git a/lectures/memory-hierarchies/static/concepts_model12.png b/lectures/memory-hierarchies/static/concepts_model12.png new file mode 100644 index 0000000000000000000000000000000000000000..e580a69b8e3cfec6bfcf76adebbd20bbc494789a Binary files /dev/null and 
b/lectures/memory-hierarchies/static/concepts_model12.png differ diff --git a/lectures/memory-hierarchies/static/loops.f90 b/lectures/memory-hierarchies/static/loops.f90 new file mode 100644 index 0000000000000000000000000000000000000000..5b0fbf16efe75d235d3d5d8e1b95ee08253302cb --- /dev/null +++ b/lectures/memory-hierarchies/static/loops.f90 @@ -0,0 +1,13 @@ +program loop_exchange + implicit none + integer, parameter :: nel = 20000 + ! Note: Stay below 46000 to prevent overflows below + integer, dimension(nel, nel) :: mat + integer :: i, j + + do i = 1, nel + do j = 1, nel + mat(j, i) = (i-1)*nel + j-1 + end do + end do +end program loop_exchange diff --git a/lectures/memory-hierarchies/static/memory_mountain.png b/lectures/memory-hierarchies/static/memory_mountain.png new file mode 100644 index 0000000000000000000000000000000000000000..478ef562075c8a8054035cd9683264763501c8be Binary files /dev/null and b/lectures/memory-hierarchies/static/memory_mountain.png differ diff --git a/lectures/memory-hierarchies/static/pyramid01.png b/lectures/memory-hierarchies/static/pyramid01.png new file mode 100644 index 0000000000000000000000000000000000000000..2abd73fd111a13b582ae43777fc3748e7b9292a6 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid01.png differ diff --git a/lectures/memory-hierarchies/static/pyramid02.png b/lectures/memory-hierarchies/static/pyramid02.png new file mode 100644 index 0000000000000000000000000000000000000000..c41c306a272851d6f9c5b08e6a3d89031f7c11ec Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid02.png differ diff --git a/lectures/memory-hierarchies/static/pyramid03.png b/lectures/memory-hierarchies/static/pyramid03.png new file mode 100644 index 0000000000000000000000000000000000000000..06f1f461a3e69fbc122dd3dc3539a3a39ecdff35 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid03.png differ diff --git a/lectures/memory-hierarchies/static/pyramid04.png 
b/lectures/memory-hierarchies/static/pyramid04.png new file mode 100644 index 0000000000000000000000000000000000000000..e3c85099e402c3a598dc92888a350fa29330381e Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid04.png differ diff --git a/lectures/memory-hierarchies/static/pyramid05.png b/lectures/memory-hierarchies/static/pyramid05.png new file mode 100644 index 0000000000000000000000000000000000000000..f8fc86075f4949db058e4133019e936bb010fc53 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid05.png differ diff --git a/lectures/memory-hierarchies/static/pyramid06.png b/lectures/memory-hierarchies/static/pyramid06.png new file mode 100644 index 0000000000000000000000000000000000000000..4094088f15c6865749e39072f6822289624404f6 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid06.png differ diff --git a/lectures/memory-hierarchies/static/speed.png b/lectures/memory-hierarchies/static/speed.png new file mode 100644 index 0000000000000000000000000000000000000000..d45d3158c5dd9118866be5eed925ea6ca512e596 Binary files /dev/null and b/lectures/memory-hierarchies/static/speed.png differ