diff --git a/_quarto.yml b/_quarto.yml index 1d12eb26daa5455f691e66652fba2488ff5c2b65..f861f100c788abeb4ff15de6d2d76eae08732b58 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -41,7 +41,7 @@ website: - "lectures/parallelism/slides.qmd" - "lectures/hardware/slides.qmd" - "lectures/file-and-data-systems/slides.qmd" - # - "lectures/memory-hierarchies/slides.qmd" + - "lectures/memory-hierarchies/slides.qmd" # - "lectures/student-talks/slides.qmd" - section: "Exercises" contents: diff --git a/lectures/memory-hierarchies/slides.qmd b/lectures/memory-hierarchies/slides.qmd new file mode 100644 index 0000000000000000000000000000000000000000..53f40e71ed21fc0d850d9e4704bc6c3bf6840bb3 --- /dev/null +++ b/lectures/memory-hierarchies/slides.qmd @@ -0,0 +1,298 @@ +--- +title: "Memory Hierarchies" +author: "Dominik Zobel and Florian Ziemen" +--- + +# Memory Hierarchies + + - Background + - Why you should care + - How to use it to your advantage + + + +## Questions {.handson .incremental} + + - Why not keep everything in memory? + - What to do with really big data? + - How to speed up processing data? + + + +## Access time example (1/2) {.leftalign} + +<!-- +File `data.py`: + +```{.python} +import time +import pickle +import numpy as np + + +def create_random_data(): + np.random.seed(3922) + start = time.perf_counter() + data = np.random.randint(0, 2**20, size=(128, 128, 128, 128)) + end = time.perf_counter() + print('{:10.5f}: Create "random" data'.format(end-start)) + return data + + +def create_42_data(): + start = time.perf_counter() + data = np.full((128, 128, 128, 128), 42) + end = time.perf_counter() + print('{:10.5f}: Create "42" data'.format(end-start)) + return data + + +def store_data(filename, data, dataname): + start = time.perf_counter() + with open(filename, 'wb') as outfile: + pickle.dump(data, outfile) + + end = time.perf_counter() + print('{:10.5f}: Store "{:s}" data'.format(end-start, dataname)) + + +def load_data(filename, dataname): + start = time.perf_counter() + with open(filename, 'rb') as infile: + data = pickle.load(infile) + + end = time.perf_counter() + print('{:10.5f}: Load "{:s}" data'.format(end-start, dataname)) + return data + + +def operate_on_data(data, dataname): + start = time.perf_counter() + new_data = data + 1.1 + end = time.perf_counter() + print('{:10.5f}: Operate on "{:s}" data'.format(end-start, dataname)) + return new_data +``` + +File `save_it.py`: + +```{.python} +from data import * + +dataname = 'random' +data = create_random_data() +store_data(filename='temp01.dat', + data=data, dataname=dataname) + +dataname = '42' +data = create_42_data() +store_data(filename='temp02.dat', + data=data, dataname=dataname) +``` + +File `from_disk.py`: + +```{.python} +from data import * + +dataname = 'random' +data = load_data(filename='temp01.dat', + dataname=dataname) +new_data = operate_on_data(data=data, + dataname=dataname) + +dataname = '42' +data = load_data(filename='temp02.dat', + dataname=dataname) +new_data = operate_on_data(data=data, + dataname=dataname) +``` + +File `in_memory.py`: + +```{.python} +from data import * + +dataname = 'random' +data = create_random_data() +new_data = operate_on_data(data=data, + dataname=dataname) + +dataname = '42' +data = create_42_data() +new_data = operate_on_data(data=data, + dataname=dataname) +``` +--> + +| | Levante (Fixed) | Local (Fixed) | Levante (Random) | Local (Random) | +| -------------------------- | ----------------- | ----------------- | ----------------- | ----------------- | +| Create data | 0.5607 | 0.2280 | 1.6623 | 0.8759 | +| Load data | 0.7605 | 0.8767 | 0.7615 | 0.9225 | +| Store data | 2.2317 | 4.0427 | 2.2327 | 3.4378 | +| Process data | 0.7618 | 0.3792 | 0.7572 | 0.3747 | + +:::{.smaller} +Time in seconds using a 2 GB numpy array ($128 \times 128 \times 128 \times 128$) either with a fixed number or random number in each entry +::: + + + +## Access time example (2/2) {.leftalign} + + - Reading/writing to file is rather expensive + - If necessary during computation, try doing it asynchronously + - If possible, keep data in memory + + + +## Techniques + + - Caching + - Prefetching + - Branch prediction + + + +## Unavailable data + +If data is not available in the current memory level + + - register spilling (register -> cache) + - cache missing (cache -> main memory) + - page fault (main memory -> disk) + +Miss description + + + +## External Memory Model + +balance block size to get from next level to latency it takes to get it + + + +## Memory access time model (1/3) + +| Cache | Access Time | Hit Ratio | +| ------ | ------------ | ---------- | +| $L_1$ | $T_1$ | $H_1$ | +| $L_2$ | $T_2$ | | + + - Parallel and serial requests possible + + + +## Memory access time model (2/3) + +| Cache | Access Time | Hit Ratio | +| ------ | ------------ | ---------- | +| $L_1$ | $T_1$ | $H_1$ | +| $L_2$ | $T_2$ | $H_2$ | +| $L_3$ | $T_3$ | | + + + +## Memory access time model (3/3) + + - Average memory access time $T_{avg,p}$ for parallel access (processor connected to all caches) + +:::{.smaller} +\begin{align} +T_{avg,p} &= H_1 T_1 + ((1-H_1)\cdot H_2)\cdot T_2\\ + &+ ((1-H_1)\cdot(1-H_2))\cdot T_3 +\end{align} +::: + + - Average memory access time $T_{avg,s}$ for serial access + +:::{.smaller} +\begin{align} +T_{avg,s} &= H_1 T_1 + ((1-H_1)\cdot H_2)\cdot(T_1+T_2)\\ + &+ ((1-H_1)\cdot(1-H_2))\cdot(T_1+T_2+T_3) +\end{align} +::: + + + +## Overview/Exercise + + - Introduction from base of pyramid (file access) + - example with access from disk and directly from memory + - background on memory hierarchy with focus on today instead of history + + - Working our way up the pyramid + - reference values for Levante CPU + - example with optimal and sub-optimal memory access, i.e. cache blocking (see `nproma`) + - openmp reduction (hand implementation), reference/continuation from parallelism lecture + + + +## Observations + + - Gap between processor and memory speeds. + Hierarchy needed because of discrepancy between speed of CPU and (main) memory + (include image) + + - exploit accessing data and code stored close to each other (temporal and spatial locality) + + + +## Memory Pyramid + + +:::{.r-stack} + +{.fragment width=70% fragment-index=1} + +{.fragment width=70% fragment-index=2} + +{.fragment width=70% fragment-index=3} + +{.fragment width=70% fragment-index=4} + +{.fragment width=70% fragment-index=5} + +{.fragment width=70% fragment-index=6} + +::: + + + +## Memory Mountain (1/2) + +Describe what is done + + - Influence of block size + - Influence of stride + - Measurements done on Levante compute node + + + +## Memory Mountain (2/2) + +:::{r-stack} + +{width=70%} + +::: + +:::{.smaller} +$\approx$ Factor 20 between best and worst access +::: + + + +## Different architectures + + - Different caches are available + - Speed and size of caches varies + - Basic understanding helps in all cases + - Hardware-specific knowledge allows additional fine-tuning + + + +# Resources {.leftalign} + + - "Computer Systems: A Programmer's Perspective" by _R. Bryant_ and _D. O'Hallaron_, Pearson + diff --git a/lectures/memory-hierarchies/static/memory_mountain.png b/lectures/memory-hierarchies/static/memory_mountain.png new file mode 100644 index 0000000000000000000000000000000000000000..478ef562075c8a8054035cd9683264763501c8be Binary files /dev/null and b/lectures/memory-hierarchies/static/memory_mountain.png differ diff --git a/lectures/memory-hierarchies/static/pyramid01.png b/lectures/memory-hierarchies/static/pyramid01.png new file mode 100644 index 0000000000000000000000000000000000000000..2abd73fd111a13b582ae43777fc3748e7b9292a6 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid01.png differ diff --git a/lectures/memory-hierarchies/static/pyramid02.png b/lectures/memory-hierarchies/static/pyramid02.png new file mode 100644 index 0000000000000000000000000000000000000000..c41c306a272851d6f9c5b08e6a3d89031f7c11ec Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid02.png differ diff --git a/lectures/memory-hierarchies/static/pyramid03.png b/lectures/memory-hierarchies/static/pyramid03.png new file mode 100644 index 0000000000000000000000000000000000000000..06f1f461a3e69fbc122dd3dc3539a3a39ecdff35 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid03.png differ diff --git a/lectures/memory-hierarchies/static/pyramid04.png b/lectures/memory-hierarchies/static/pyramid04.png new file mode 100644 index 0000000000000000000000000000000000000000..e3c85099e402c3a598dc92888a350fa29330381e Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid04.png differ diff --git a/lectures/memory-hierarchies/static/pyramid05.png b/lectures/memory-hierarchies/static/pyramid05.png new file mode 100644 index 0000000000000000000000000000000000000000..f8fc86075f4949db058e4133019e936bb010fc53 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid05.png differ diff --git a/lectures/memory-hierarchies/static/pyramid06.png b/lectures/memory-hierarchies/static/pyramid06.png new file mode 100644 index 0000000000000000000000000000000000000000..4094088f15c6865749e39072f6822289624404f6 Binary files /dev/null and b/lectures/memory-hierarchies/static/pyramid06.png differ