From b14210b4ffd0b136e50e82848aaa505c32132293 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron
Date: Tue, 4 Dec 2018 13:01:39 +0000
Subject: [PATCH] Public version of the NUMA ACPI 6.3 White Paper.

This version is ready to share with the open source community.
I've squashed down all of the commits to present a single coherent
base to move forward from.

Given uncertainty around this being an official UEFI white paper,
I've commented out the UEFI logo include and trademark statement
until that is resolved.

Signed-off-by: Jonathan Cameron
---
 Makefile                          |  20 +
 imageconvert.bat                  |   1 +
 make.bat                          |  36 ++
 source/Unrolling.svg              | 318 +++++++++++++
 source/_templates/latex.tex_t     |  50 ++
 source/_templates/longtable.tex_t |  32 ++
 source/_templates/tabular.tex_t   |  27 ++
 source/_templates/tabulary.tex_t  |  27 ++
 source/acpitables1.rst            | 692 ++++++++++++++++++++++++++++
 source/ccixsanodes.svg            | 184 ++++++++
 source/conf.py                    | 199 ++++++++
 source/genericinitiator.rst       | 167 +++++++
 source/hotplug.rst                |  28 ++
 source/index.rst                  |  20 +
 source/intro.rst                  | 350 ++++++++++++++
 source/memorysidecaches.rst       | 346 ++++++++++++++
 source/nonobvious.rst             |  68 +++
 source/sharedsidecache1.svg       | 312 +++++++++++++
 source/sharedsidecache2.svg       | 733 ++++++++++++++++++++++++++++++
 source/simple.svg                 | 169 +++++++
 source/simplenodes.svg            | 166 +++++++
 source/simplenodesplus_non_ga.svg | 193 ++++++++
 source/simplenodesplusga.svg      | 205 +++++++++
 source/simplenodesplusgapci.svg   | 246 ++++++++++
 source/simplenodesunrolled.svg    | 307 +++++++++++++
 source/simplesidecache1.svg       | 192 ++++++++
 source/typical-2p.svg             | 282 ++++++++++++
 27 files changed, 5370 insertions(+)
 create mode 100644 Makefile
 create mode 100644 imageconvert.bat
 create mode 100644 make.bat
 create mode 100644 source/Unrolling.svg
 create mode 100644 source/_templates/latex.tex_t
 create mode 100644 source/_templates/longtable.tex_t
 create mode 100644 source/_templates/tabular.tex_t
 create mode 100644 source/_templates/tabulary.tex_t
 create mode 100644 source/acpitables1.rst
 create mode 100644 source/ccixsanodes.svg
 create mode 100644 source/conf.py
 create mode 100644 source/genericinitiator.rst
 create mode 100644 source/hotplug.rst
 create mode 100644 source/index.rst
 create mode 100644 source/intro.rst
 create mode 100644 source/memorysidecaches.rst
 create mode 100644 source/nonobvious.rst
 create mode 100644 source/sharedsidecache1.svg
 create mode 100644 source/sharedsidecache2.svg
 create mode 100644 source/simple.svg
 create mode 100644 source/simplenodes.svg
 create mode 100644 source/simplenodesplus_non_ga.svg
 create mode 100644 source/simplenodesplusga.svg
 create mode 100644 source/simplenodesplusgapci.svg
 create mode 100644 source/simplenodesunrolled.svg
 create mode 100644 source/simplesidecache1.svg
 create mode 100644 source/typical-2p.svg

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0505cc5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = NUMAGUIDE
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/imageconvert.bat b/imageconvert.bat new file mode 100644 index 0000000..eca4460 --- /dev/null +++ b/imageconvert.bat @@ -0,0 +1 @@ + for /f "tokens=1* delims=." %%i in ('dir /b source\*.svg') do inkscape --without-gui --file=source\%%i.svg --export-pdf=source\%%i.pdf diff --git a/make.bat b/make.bat new file mode 100644 index 0000000..efd4eb7 --- /dev/null +++ b/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=NUMAGUIDE + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/source/Unrolling.svg b/source/Unrolling.svg new file mode 100644 index 0000000..3b29e5a --- /dev/null +++ b/source/Unrolling.svg @@ -0,0 +1,318 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + CPU 0 + + + + Memory 0 + + + + CPU 0 + + + + + CPU 1 + + + + + Memory 1 + + + + CPU 1 + + + + CPU 0 + + + + Memory 0 + + + + CPU 1 + + + + Memory 1 + + + + + + + + + diff --git a/source/_templates/latex.tex_t b/source/_templates/latex.tex_t new file mode 100644 index 0000000..4fd8b1f --- /dev/null +++ b/source/_templates/latex.tex_t @@ -0,0 +1,50 @@ +%% Generated by Sphinx. 
+\def\sphinxdocclass{<%= docclass %>} +<% if latex_engine == 'lualatex' -%> +\IfFileExists{luatex85.sty} + {\RequirePackage{luatex85}} + {\ifdefined\luatexversion\ifnum\luatexversion>84\relax + \PackageError{sphinx} + {** With this LuaTeX (\the\luatexversion),Sphinx requires luatex85.sty **} + {** Add the LaTeX package luatex85 to your TeX installation, and try again **} + \endinput\fi\fi} +<% endif -%> +\documentclass[<%= papersize %>,<%= pointsize %><%= classoptions %>]{<%= wrapperclass %>} +\ifdefined\pdfpxdimen + \let\sphinxpxdimen\pdfpxdimen\else\newdimen\sphinxpxdimen +\fi \sphinxpxdimen=<%= pxunit %>\relax +<%= passoptionstopackages %> +<%= inputenc %> +<%= utf8extra %> +<%= cmappkg %> +<%= fontenc %> +<%= amsmath %> +<%= multilingual %> +<%= fontpkg %> +<%= fncychap %> +\usepackage<%= sphinxpkgoptions %>{sphinx} +<%= sphinxsetup %> +<%= geometry %> +<%= usepackages %> +<%= hyperref %> +<%= contentsname %> +<%= numfig_format %> +<%= literalblockpto %> +<%= pageautorefname %> +<%= tocdepth %> +<%= secnumdepth %> +<%= preamble %> + +\title{<%= title %>} +\date{<%= date %>} +\release{<%= release %>} +\author{<%= author %>} +\newcommand{\sphinxlogo}{<%= logo %>} +\renewcommand{\releasename}{<%= releasename %>} +<%= makeindex %> +<%= body %> +<%= atendofbody %> +<%= indices %> +\renewcommand{\indexname}{<%= indexname %>} +<%= printindex %> +\end{document} diff --git a/source/_templates/longtable.tex_t b/source/_templates/longtable.tex_t new file mode 100644 index 0000000..b7310a7 --- /dev/null +++ b/source/_templates/longtable.tex_t @@ -0,0 +1,32 @@ +\begin{savenotes}\sphinxatlongtablestart\begin{longtable} +<%- if table.align == 'center' -%> + [c] +<%- elif table.align == 'left' -%> + [l] +<%- elif table.align == 'right' -%> + [r] +<%- endif -%> +<%= table.get_colspec() %> +<%- if table.caption -%> +\caption{<%= ''.join(table.caption) %>\strut}<%= labels %>\\*[\sphinxlongtablecapskipadjust] +<% endif -%> +\hline +<%= ''.join(table.header) %> +\endfirsthead + +\multicolumn{<%= table.colcount %>}{c}% +{\makebox[0pt]{\sphinxtablecontinued{\tablename\ \thetable{} -- <%= _('continued from previous page') %>}}}\\ +\hline +<%= ''.join(table.header) %> +\endhead + +\hline +\multicolumn{<%= table.colcount %>}{r}{\makebox[0pt][r]{\sphinxtablecontinued{<%= _('Continued on next page') %>}}}\\ +\endfoot + +\endlastfoot +<% if table.caption_footnotetexts -%> +<%= ''.join(table.caption_footnotetexts) %> +<% endif -%> +<%= ''.join(table.body) %> +\end{longtable}\sphinxatlongtableend\end{savenotes} diff --git a/source/_templates/tabular.tex_t b/source/_templates/tabular.tex_t new file mode 100644 index 0000000..3fd347e --- /dev/null +++ b/source/_templates/tabular.tex_t @@ -0,0 +1,27 @@ +\begin{savenotes}\sphinxattablestart +<% if table.align -%> + <%- if table.align == 'center' -%> + \centering + <%- elif table.align == 'left' -%> + \raggedright + <%- else -%> + \raggedleft + <%- endif %> +<%- else -%> + \centering +<%- endif %> +<% if table.caption -%> +\sphinxcapstartof{table} +\sphinxcaption{<%= ''.join(table.caption) %>}<%= labels %> +\sphinxaftercaption +<% endif -%> +\begin{tabular}[t]<%= table.get_colspec() -%> +\hline +<%= ''.join(table.header) %> +<%- if table.caption_footnotetexts -%> +<%= ''.join(table.caption_footnotetexts) -%> +<%- endif -%> +<%=- ''.join(table.body) %> +\end{tabular} +\par +\sphinxattableend\end{savenotes} diff --git a/source/_templates/tabulary.tex_t b/source/_templates/tabulary.tex_t new file mode 100644 index 0000000..16d1519 --- /dev/null +++ 
b/source/_templates/tabulary.tex_t
@@ -0,0 +1,27 @@
+\begin{savenotes}\sphinxattablestart
+<% if table.align -%>
+  <%- if table.align == 'center' -%>
+    \centering
+  <%- elif table.align == 'left' -%>
+    \raggedright
+  <%- else -%>
+    \raggedleft
+  <%- endif %>
+<%- else -%>
+  \centering
+<%- endif %>
+<% if table.caption -%>
+\sphinxcapstartof{table}
+\sphinxcaption{<%= ''.join(table.caption) %>}<%= labels %>
+\sphinxaftercaption
+<% endif -%>
+\begin{tabulary}{\linewidth}[t]<%= table.get_colspec() -%>
+\hline
+<%= ''.join(table.header) %>
+<%- if table.caption_footnotetexts -%>
+<%= ''.join(table.caption_footnotetexts) -%>
+<%- endif -%>
+<%=- ''.join(table.body) %>
+\end{tabulary}
+\par
+\sphinxattableend\end{savenotes}
diff --git a/source/acpitables1.rst b/source/acpitables1.rst
new file mode 100644
index 0000000..52fb23c
--- /dev/null
+++ b/source/acpitables1.rst
@@ -0,0 +1,692 @@
+.. include:: <isonum.txt>
+
+=====================================
+Simple NUMA description
+=====================================
+
+We will continue with the simple example we have already introduced in :numref:`figsimple`
+and consider how this is represented in the ACPI tables passed from the firmware to the
+Operating System. For this we will use the NUMA node allocations shown in
+:numref:`figsimplenodes`.
+
+For now we will consider only boot time information.
+
+Enumerating the NUMA Nodes - System Resource Affinity Table (SRAT)
+==================================================================
+
+The System Resource Affinity Table is the boot time description of the elements
+that make up each NUMA node. In ACPI a NUMA node is referred to as a Proximity Domain.
+
+Like all ACPI tables, SRAT starts with some preamble, all of which is
+straightforward, so we will skip on to the Static Resource Allocation Structures.
+
+Static Resource Allocation Structure
+************************************
+
+This structure takes a number of forms, allowing us to define the various elements that make up
+each individual NUMA node.
+
+Processor Affinity Structures
+-----------------------------
+
+There are several affinity structure variants, depending on how CPUs are identified in the system.
+They each carry some additional information, but it is not of interest to us here.
+
+* **Processor Local APIC/SAPIC Affinity Structure** APIC ID or SAPIC ID/EID to proximity domain.
+
+* **Processor Local x2APIC Affinity Structure** x2APIC ID to proximity domain.
+
+* **GICC Affinity Structure** ACPI Processor UID to proximity domain.
+
+Memory Affinity Structures
+--------------------------
+
+The Memory Affinity Structure provides an association between memory,
+identified via its address range in the system's physical address map,
+and the proximity domain in which we wish to represent it. This structure
+also carries some additional information on the memory region:
+
+* Hot-plug - does this region support hot-plug.
+* Non-Volatile - is this region served by non-volatile memory.
+
+For our example, :numref:`figsimplenodes`, we have two volatile memories to
+describe, one within proximity domain 0 and one within proximity domain 1.
+
+Interrupt Translation Service Structures (ITS) Affinity Structure
+-----------------------------------------------------------------
+
+This one is mostly outside the scope of this description, but is used to allow an Operating System
+to identify what memory is close to an ITS so as to place its management tables and command queue
+appropriately.
+
+It may be thought of in a similar way to a generic initiator (see :numref:`gasect`) in that it isn't a processor, but
+does make use of memory.
+
+For our example, :numref:`figsimplenodes`, we assume a single ITS in Proximity Domain 0.
+
+So on to the description of our simple topology. Note that we have broken the
+table up for readability, but the following tables will all be concatenated.
+In this example the processors are described via the GICC affinity structure;
+for other systems they may be described using either the
+*Local APIC/SAPIC Affinity Structure* or the
+*Processor Local x2APIC Affinity Structure*.
+
+Simple Topology SRAT
+********************
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Header
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Signature          | SRAT       | Identify Table            |
+   +--------------------+------------+---------------------------+
+   | Length             | N          |                           |
+   +--------------------+------------+---------------------------+
+   | Revision           | 3          |                           |
+   +--------------------+------------+---------------------------+
+   | Checksum           | XXXXXXXX   | See spec                  |
+   +--------------------+------------+---------------------------+
+   | OEMID              | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+   | OEM Revision       | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+   | Creator ID         | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+   | Creator Revision   | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Entry 0: Processor 0
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Type               | 3          | GICC Affinity Structure   |
+   +--------------------+------------+---------------------------+
+   | Length             | 18         |                           |
+   +--------------------+------------+---------------------------+
+   | Proximity Domain   | 0          |                           |
+   +--------------------+------------+---------------------------+
+   | ACPI Processor UID | XXXXXX     | See spec                  |
+   +--------------------+------------+---------------------------+
+   | Flags              | 1          | Enabled                   |
+   +--------------------+------------+---------------------------+
+   | Clock Domain       | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Entry 1: Processor 1
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Type               | 3          | GICC Affinity Structure   |
+   +--------------------+------------+---------------------------+
+   | Length             | 18         |                           |
+   +--------------------+------------+---------------------------+
+   | Proximity Domain   | 1          |                           |
+   +--------------------+------------+---------------------------+
+   | ACPI Processor UID | XXXXXX     | See spec                  |
+   +--------------------+------------+---------------------------+
+   | Flags              | 1          | Enabled                   |
+   +--------------------+------------+---------------------------+
+   | Clock Domain       | XXXX       | See spec                  |
+   +--------------------+------------+---------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Entry 2: Memory 0
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Type               | 1          | Memory Affinity Structure |
+   +--------------------+------------+---------------------------+
+   | Length             | 40         |                           |
+   +--------------------+------------+---------------------------+
+   | Proximity Domain   | 0          |                           |
+   +--------------------+------------+---------------------------+
+   | Base Address Low   | 0x00000000 | Low 32 bits of start of   |
+   |                    |            | address range             |
+   |                    |            |                           |
+   |                    |            | (Address 0x2_0000_0000)   |
+   +--------------------+------------+---------------------------+
+   | Base Address High  | 0x00000002 | High 32 bits of start of  |
+   |                    |            | address range             |
+   +--------------------+------------+---------------------------+
+   | Length Low         | 0x00000000 | Low 32 bits of length of  |
+   |                    |            | address range             |
+   |                    |            |                           |
+   |                    |            | (Address length = 4G)     |
+   +--------------------+------------+---------------------------+
+   | Length High        | 0x00000001 | High 32 bits of length of |
+   |                    |            | address range             |
+   +--------------------+------------+---------------------------+
+   | Flags              | 1          | Enabled                   |
+   +--------------------+------------+---------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Entry 3: Memory 1
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Type               | 1          | Memory Affinity Structure |
+   +--------------------+------------+---------------------------+
+   | Length             | 40         |                           |
+   +--------------------+------------+---------------------------+
+   | Proximity Domain   | 1          |                           |
+   +--------------------+------------+---------------------------+
+   | Base Address Low   | 0x00000000 | Low 32 bits of start of   |
+   |                    |            | address range             |
+   |                    |            |                           |
+   |                    |            | (Address 0x3_0000_0000)   |
+   +--------------------+------------+---------------------------+
+   | Base Address High  | 0x00000003 | High 32 bits of start of  |
+   |                    |            | address range             |
+   +--------------------+------------+---------------------------+
+   | Length Low         | 0x00000000 | Low 32 bits of length of  |
+   |                    |            | address range             |
+   |                    |            |                           |
+   |                    |            | (Address length = 4G)     |
+   +--------------------+------------+---------------------------+
+   | Length High        | 0x00000001 | High 32 bits of length of |
+   |                    |            | address range             |
+   +--------------------+------------+---------------------------+
+   | Flags              | 1          | Enabled                   |
+   +--------------------+------------+---------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SRAT example - Entry 4: ITS 0
+   :widths: 70 55 200
+
+   +--------------------+------------+---------------------------+
+   | Field              | Value      | Notes                     |
+   +====================+============+===========================+
+   | Type               | 4          | GIC ITS Affinity Structure|
+   +--------------------+------------+---------------------------+
+   | Length             | 12         |                           |
+   +--------------------+------------+---------------------------+
+   | Proximity Domain   | 0          |                           |
+   +--------------------+------------+---------------------------+
+   | ITS ID             | XXXXXXXXX  | Match the ITS ID of the   |
+   |                    |            | GIC ITS entry in MADT     |
+   +--------------------+------------+---------------------------+
+
+It is worth noting at this point that SRAT is the only way of defining
+Proximity Domains in ACPI. They cannot be defined other than at boot
+time. We shall revisit this restriction in :numref:`sechotplug`.
+
+Describing Node Relationships - System Locality Information Table (SLIT)
+========================================================================
+
+The System Locality Information Table provides the first level of description
+of the relationships between nodes. Note that, if the Operating System
+is able to interpret it and an HMAT is present, the Operating System is expected
+to use the data in HMAT rather than that in SLIT. It will be
+necessary to provide SLIT tables for quite some time, as Operating Systems have
+not yet fully transitioned to HMAT.
+
+After a standard ACPI preamble, this table consists of a matrix providing
+a measure of *distance* between nodes.
+
+What is this distance?
+**********************
+
+Unfortunately the definition of this distance is somewhat vague.
+It is defined as the relative latency between nodes.
+
+The special value 10 is used as the local reference value and is always
+present on the diagonal.
+
+A second magic value, 255, is defined as indicating that there is
+no path between particular nodes.
+
+Distance values of 0-9 are reserved and have no meaning.
+The reference value was set at 10 so as to allow fractional relative
+distances to be expressed.
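+
+As a quick worked reading of the matrix (a sketch using the conventions
+above, not an additional requirement from the specification), dividing an
+entry by the reference value of 10 recovers the relative latency an
+Operating System should expect:
+
+.. math::
+
+   ratio[i][j] = \frac{Entry[i][j]}{10}
+
+So an entry of 15 would indicate that accesses from node *i* to memory on
+node *j* are expected to take 1.5\ |times| as long as a local access.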
+
+For our example, let us assume that the latency ratio between
+local accesses (the DDR connected directly to the processor) and
+remote accesses (the DDR connected to the other processor) is 2.
+This gives us SLIT distances of 10 and 20 for the various
+paths.
+
+Simple Topology SLIT
+********************
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology SLIT example
+   :widths: 70 55 200
+
+   +--------------------+-----------+----------------------------------+
+   | Field              | Value     | Notes                            |
+   +====================+===========+==================================+
+   | Signature          | SLIT      | Identify Table                   |
+   +--------------------+-----------+----------------------------------+
+   | Length             | N         |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Revision           | 1         | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Checksum           | XXXXXXXX  | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | OEMID              | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | OEM Revision       | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Creator ID         | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Creator Revision   | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Number of System   | 2         | Our total node count.            |
+   | Localities         |           | Sets the SLIT matrix dimension   |
+   +--------------------+-----------+----------------------------------+
+   | Entry[0][0]        | 10        | Node 0 to directly attached DDR  |
+   +--------------------+-----------+----------------------------------+
+   | Entry[0][1]        | 20        | Node 0 to Node 1 DDR             |
+   +--------------------+-----------+----------------------------------+
+   | Entry[1][0]        | 20        | Node 1 to Node 0 DDR             |
+   +--------------------+-----------+----------------------------------+
+   | Entry[1][1]        | 10        | Node 1 to directly attached DDR  |
+   +--------------------+-----------+----------------------------------+
+
+Limitations of SLIT
+*******************
+
+There is no clear definition of how to measure latency. For a given
+initiator and target, there are many application-related factors that will
+change the apparent latency:
+
+* Packet size
+* Access pattern, random vs sequential
+* Other traffic
+* Cache effects
+* Link bandwidth, as this affects the latency if the load is high.
+* Any intermediate buffering that would have an effect on latency as
+  the load on the link increases.
+
+The HMAT table was introduced to provide some more information to those operating
+systems and user-space processes that choose to make use of it. However, it
+is worth noting that the performance of a memory from the standpoint of a
+particular initiator depends on many factors that are not described by HMAT.
+
+Note that there are some additional restrictions upon SLIT tables applied
+by operating systems. This is considered in :numref:`slitoslimit`.
+
+Heterogeneous Memory Attribute Table (HMAT)
+===========================================
+
+The Heterogeneous Memory Attribute Table was introduced to provide additional
+information beyond that provided by SRAT and SLIT. Some of this we will
+deliberately not cover in detail here, as it is not relevant to our simple
+example and will be introduced later.
+
+Note that this table has changed considerably in the ACPI 6.3 specification.
+It is thought that few Operating Systems were making use of the ACPI 6.2
+version of this table, so we will not consider that version here.
+
+For Proximity Domains, as enumerated by SRAT, HMAT describes:
+
+* Memory Attributes
+
+  - memory-side caches
+
+  - latency of memory and all levels of memory-side cache
+
+  - bandwidth of memory and all levels of memory-side cache
+
+  - connectivity of memory
+
+Note that HMAT does not define the Proximity Domains; they are defined in SRAT.
+However, the introduction of HMAT may lead system designers to choose to
+represent a finer-grained set of Proximity Domains than they would if the
+extra information in HMAT were not to be made available to the Operating System.
+This extra level of detail is also reflected in a larger SLIT table, and this
+may lead to unwanted complexity, or inefficient use of the memory by a
+non-HMAT-aware Operating System.
+
+Elements of HMAT
+****************
+
+As it is an ACPI table, HMAT includes the standard preamble followed by
+a series of HMAT Table Structures. All of these structures are optional
+and should only be provided if they deliver information *of use* to the
+Operating System in making decisions. As use-cases are hard to know
+in advance, a very broad concept of *useful* should be applied rather
+than focusing on any predefined use case.
+
+Preamble
+--------
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Heterogeneous Memory Attribute Table Preamble example
+   :widths: 70 55 200
+
+   +--------------------+-----------+----------------------------------+
+   | Field              | Value     | Notes                            |
+   +====================+===========+==================================+
+   | Signature          | HMAT      | Identify Table                   |
+   +--------------------+-----------+----------------------------------+
+   | Length             | N         |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Revision           | 2         |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Checksum           | XXXXXXXX  | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | OEMID              | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | OEM Table ID       | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | OEM Revision       | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Creator ID         | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+   | Creator Revision   | XXXX      | See spec                         |
+   +--------------------+-----------+----------------------------------+
+
+This is followed directly by as many entries as necessary to describe different
+NUMA aspects of the system.
+
+Proximity Domain Attributes Structure
+-------------------------------------
+
+Note that this structure is effectively new for ACPI 6.3.
+
+This substructure of HMAT is currently only used as a way to apply flags
+to a particular relationship between an initiator domain, which must contain
+either a Processor or Generic Initiator, and a memory domain which it may
+access. The only flag present in ACPI 6.3 indicates direct attachment between
+the two. For our simple example, this flag is not relevant, so no
+Proximity Domain Attributes Structures will be present. Note that Linux will
+use this flag as a hint to *prefer* such memory to other memories which
+otherwise have the same characteristics.
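+
+For readers who prefer a structural view, here is a minimal C sketch of
+this structure's 40-byte layout as described in ACPI 6.3. The field names
+are our own, and, contrary to the convention used for the tables in this
+document, the reserved fields are retained so that the offsets and total
+size add up.
+
+.. code-block:: c
+
+   #include <stdint.h>
+
+   /* Sketch of the ACPI 6.3 Proximity Domain Attributes Structure
+    * (HMAT structure type 0). Field names are illustrative, not
+    * taken from any OS or specification header.
+    */
+   struct hmat_prox_domain_attrs {
+           uint16_t type;           /* 0 identifies this structure type */
+           uint16_t reserved0;
+           uint32_t length;         /* 40 bytes */
+           uint16_t flags;          /* bit 0 set: the initiator domain
+                                     * field is valid, i.e. the memory is
+                                     * directly attached to that domain */
+           uint16_t reserved1;
+           uint32_t initiator_pd;   /* attached initiator proximity domain */
+           uint32_t memory_pd;      /* memory proximity domain */
+           uint32_t reserved2;
+           uint64_t reserved3;
+           uint64_t reserved4;
+   };  /* all fields naturally aligned, giving the expected 40 bytes */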
+
+System Locality Latency and Bandwidth Information Structure
+-----------------------------------------------------------
+
+This structure provides a more detailed version of the proximity distance
+previously provided by SLIT.
+
+Note that all entries have an associated initiator proximity domain and
+memory domain, allowing the table to be incomplete in cases where only
+particular combinations are of interest (not all Proximity Domains will
+have memory or initiators - though they will have one or the other).
+
+What is Latency?
+................
+
+The latency is the lowest expected read / write latency between the initiator
+and the memory. It is not defined for a particular type of transfer but
+rather is expected to show the best possible value under optimum conditions
+for the system.
+
+The format is designed to be extremely flexible, combining a 64-bit base
+unit, which is a multiplication factor applied to all the table
+entries, with individual 16-bit entries. The resulting computed value
+is in picoseconds. Entries may be marked as not provided using the special
+value 0.
+
+.. math::
+
+   lat[i][j] = lat_{base} \times lat_{entry}[i][j]
+
+What is Bandwidth?
+..................
+
+The bandwidth is the highest expected read / write bandwidth between the
+initiator and the memory. The choice of transfer type should reflect the
+optimum choice for this particular pairing.
+
+In a similar fashion to latency, the format is defined in terms of a shared,
+64-bit, base unit and individual 16-bit entries. The resulting computed value is
+in MiB/s.
+
+.. math::
+
+   bw[i][j] = bw_{base} \times bw_{entry}[i][j]
+
+Back to our example, let us suppose the following characteristics
+(node 1 has slightly lower-performance DDR than node 0, and the
+interconnect is assumed symmetric but has relatively low
+bandwidth):
+
+* Latency
+
+  - Node 0 Initiator to Local DDR read latency 90 ns.
+  - Node 0 Initiator to Node 1 DDR read latency 150 ns.
+  - Node 1 Initiator to Local DDR read latency 100 ns.
+  - Node 1 Initiator to Node 0 DDR read latency 140 ns.
+  - Node 0 Initiator to Local DDR write latency 100 ns.
+  - Node 0 Initiator to Node 1 DDR write latency 160 ns.
+  - Node 1 Initiator to Local DDR write latency 110 ns.
+  - Node 1 Initiator to Node 0 DDR write latency 150 ns.
+
+* Bandwidth
+
+  - Node 0 Initiator to Local DDR read bandwidth 3200 MiB/s
+  - Node 0 Initiator to Node 1 DDR read bandwidth 1600 MiB/s
+  - Node 0 Initiator to Local DDR write bandwidth 3200 MiB/s
+  - Node 0 Initiator to Node 1 DDR write bandwidth 1600 MiB/s
+  - Node 1 Initiator to Local DDR read bandwidth 3000 MiB/s
+  - Node 1 Initiator to Node 0 DDR read bandwidth 1600 MiB/s
+  - Node 1 Initiator to Local DDR write bandwidth 3000 MiB/s
+  - Node 1 Initiator to Node 0 DDR write bandwidth 1600 MiB/s
+
+This leads to the following three entries.
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. _hmat_sllbis_rl:
+.. table:: Simple Topology HMAT System Locality Latency and Bandwidth
+           Information Structure for memory read latency
+   :widths: 70 55 200
+
+   +--------------------+-----------+-----------------------------------+
+   | Field              | Value     | Notes                             |
+   +====================+===========+===================================+
+   | Type               | 1         | Identifies this as the "System    |
+   |                    |           | Locality Latency and Bandwidth    |
+   |                    |           | Information Structure"            |
+   +--------------------+-----------+-----------------------------------+
+   | Length             | N         |                                   |
+   +--------------------+-----------+-----------------------------------+
+   | Flags              | 0         | For now we are only dealing with  |
+   |                    |           | memory.                           |
+   +--------------------+-----------+-----------------------------------+
+   | Data Type          | 1         | This table is for DDR read        |
+   |                    |           | latency. We cannot use access     |
+   |                    |           | latency as the read and write     |
+   |                    |           | latencies are not equal.          |
+   +--------------------+-----------+-----------------------------------+
+   | Number of Initiator| 2         | We have processors, which are one |
+   | Proximity Domains  |           | type of initiator, in Node 0 and  |
+   |                    |           | Node 1.                           |
+   +--------------------+-----------+-----------------------------------+
+   | Number of Target   | 2         | We have DDR memory as targets     |
+   | Proximity Domains  |           | in Node 0 and Node 1.             |
+   +--------------------+-----------+-----------------------------------+
+   | Entry Base Unit    | 1000      | 1000 corresponds to units of      |
+   |                    |           | nanoseconds. We could have used   |
+   |                    |           | many different units here due to  |
+   |                    |           | the relatively minor differences  |
+   |                    |           | in latencies, but 1000 gives      |
+   |                    |           | an easy to understand scaling.    |
+   +--------------------+-----------+-----------------------------------+
+   | Initiator Proximity| 0         | 1st Initiator Proximity Domain    |
+   | Domain List Entry 0|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Initiator Proximity| 1         | 2nd Initiator Proximity Domain    |
+   | Domain List Entry 1|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Target Proximity   | 0         | 1st Target Proximity Domain       |
+   | Domain List Entry 0|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Target Proximity   | 1         | 2nd Target Proximity Domain       |
+   | Domain List Entry 1|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[0][0]        | 90        | Node 0 to local DDR read          |
+   |                    |           | latency, 90ns                     |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[0][1]        | 150       | Node 0 to Node 1 DDR read         |
+   |                    |           | latency, 150ns                    |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[1][0]        | 140       | Node 1 to Node 0 DDR read         |
+   |                    |           | latency, 140ns                    |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[1][1]        | 100       | Node 1 to local DDR read          |
+   |                    |           | latency, 100ns                    |
+   +--------------------+-----------+-----------------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology HMAT System Locality Latency and Bandwidth
+           Information Structure for memory write latency
+   :widths: 70 55 200
+
+   +--------------------+-----------+-----------------------------------+
+   | Field              | Value     | Notes                             |
+   +====================+===========+===================================+
+   | Type               | 1         | Identifies this as the "System    |
+   |                    |           | Locality Latency and Bandwidth    |
+   |                    |           | Information Structure"            |
+   +--------------------+-----------+-----------------------------------+
+   | Length             | N         |                                   |
+   +--------------------+-----------+-----------------------------------+
+   | Flags              | 0         | For now we are only dealing with  |
+   |                    |           | memory.                           |
+   +--------------------+-----------+-----------------------------------+
+   | Data Type          | 2         | This table is for DDR write       |
+   |                    |           | latency. We cannot use access     |
+   |                    |           | latency as the read and write     |
+   |                    |           | latencies are not equal.          |
+   +--------------------+-----------+-----------------------------------+
+   | Number of Initiator| 2         | We have processors, which are one |
+   | Proximity Domains  |           | type of initiator, in Node 0 and  |
+   |                    |           | Node 1.                           |
+   +--------------------+-----------+-----------------------------------+
+   | Number of Target   | 2         | We have DDR memory as targets     |
+   | Proximity Domains  |           | in Node 0 and Node 1.             |
+   +--------------------+-----------+-----------------------------------+
+   | Entry Base Unit    | 1000      | 1000 corresponds to units of      |
+   |                    |           | nanoseconds. We could have used   |
+   |                    |           | many different units here due to  |
+   |                    |           | the relatively minor differences  |
+   |                    |           | in latencies, but 1000 gives      |
+   |                    |           | an easy to understand scaling.    |
+   +--------------------+-----------+-----------------------------------+
+   | Initiator Proximity| 0         | 1st Initiator Proximity Domain    |
+   | Domain List Entry 0|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Initiator Proximity| 1         | 2nd Initiator Proximity Domain    |
+   | Domain List Entry 1|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Target Proximity   | 0         | 1st Target Proximity Domain       |
+   | Domain List Entry 0|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Target Proximity   | 1         | 2nd Target Proximity Domain       |
+   | Domain List Entry 1|           | for which we are providing        |
+   |                    |           | latency information.              |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[0][0]        | 100       | Node 0 to local DDR write         |
+   |                    |           | latency, 100ns                    |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[0][1]        | 160       | Node 0 to Node 1 DDR write        |
+   |                    |           | latency, 160ns                    |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[1][0]        | 150       | Node 1 to Node 0 DDR write        |
+   |                    |           | latency, 150ns                    |
+   +--------------------+-----------+-----------------------------------+
+   | Entry[1][1]        | 110       | Node 1 to local DDR write         |
+   |                    |           | latency, 110ns                    |
+   +--------------------+-----------+-----------------------------------+
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Simple Topology HMAT System Locality Latency and Bandwidth
+           Information Structure for memory access bandwidth
+   :widths: 70 55 200
+
+   +--------------------+-----------+----------------------------------+
+   | Field              | Value     | Notes                            |
+   +====================+===========+==================================+
+   | Type               | 1         | See :numref:`hmat_sllbis_rl`.    |
+   +--------------------+-----------+----------------------------------+
+   | Length             | N         |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Flags              | 0         | For now we are only dealing with |
+   |                    |           | memory.                          |
+   +--------------------+-----------+----------------------------------+
+   | Data Type          | 3         | This table is for memory access  |
+   |                    |           | bandwidth. The read and write    |
+   |                    |           | bandwidths are symmetric,        |
+   |                    |           | allowing one entry to cover both.|
+   +--------------------+-----------+----------------------------------+
+   | Number of Initiator| 2         | See :numref:`hmat_sllbis_rl`.    |
+   | Proximity Domains  |           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Number of Target   | 2         | See :numref:`hmat_sllbis_rl`.    |
+   | Proximity Domains  |           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Entry Base Unit    | 100       | Each entry is in units of        |
+   |                    |           | 100 MiB/s.                       |
+   +--------------------+-----------+----------------------------------+
+   | Initiator Proximity| 0         | See :numref:`hmat_sllbis_rl`.    |
+   | Domain List Entry 0|           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Initiator Proximity| 1         | See :numref:`hmat_sllbis_rl`.    |
+   | Domain List Entry 1|           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Target Proximity   | 0         | See :numref:`hmat_sllbis_rl`.    |
+   | Domain List Entry 0|           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Target Proximity   | 1         | See :numref:`hmat_sllbis_rl`.    |
+   | Domain List Entry 1|           |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Entry[0][0]        | 32        | Node 0 to local memory access    |
+   |                    |           | bandwidth, 3200 MiB/s            |
+   +--------------------+-----------+----------------------------------+
+   | Entry[0][1]        | 16        | Node 0 to Node 1 memory access   |
+   |                    |           | bandwidth, 1600 MiB/s            |
+   +--------------------+-----------+----------------------------------+
+   | Entry[1][0]        | 16        | Node 1 to Node 0 memory access   |
+   |                    |           | bandwidth, 1600 MiB/s            |
+   +--------------------+-----------+----------------------------------+
+   | Entry[1][1]        | 32        | Node 1 to local memory access    |
+   |                    |           | bandwidth, 3200 MiB/s            |
+   +--------------------+-----------+----------------------------------+
+
+We will leave the structures describing Memory-Side Caches until :numref:`secmemorysidecache`.
+
+.. _secpxm:
+
+Proximity Domain Specification in DSDT / SSDT
+=============================================
+
+The ACPI specification provides no means of specifying additional Proximity
+Domains outside of affinity entries in SRAT. However, it does provide a means
+of assigning additional devices to a Proximity Domain which has been defined
+by means of an SRAT affinity entry for other devices.
+
+This is done using the Proximity (_PXM) object for a device in the
+Differentiated System Description Table (DSDT) or Secondary System
+Description Tables (SSDTs).
+
+Prior to ACPI 6.3, this was the only way of providing proximity information
+about types of device that are not CPUs, Memory or ITSes.
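+
+As an illustration of the Operating System side of this, below is a
+minimal sketch of how _PXM may be evaluated for a device, modelled on the
+Linux kernel's ACPI helpers. The function name is hypothetical;
+acpi_evaluate_integer() and NUMA_NO_NODE are real Linux interfaces, and
+Linux's own acpi_get_node() performs a very similar lookup.
+
+.. code-block:: c
+
+   #include <linux/acpi.h>
+   #include <linux/numa.h>
+
+   /* Resolve the proximity domain of a device from its _PXM object.
+    * A device without a _PXM object reports no affinity of its own;
+    * an OS may then fall back to the domain of a parent device.
+    */
+   static int example_device_to_pxm(acpi_handle handle)
+   {
+           unsigned long long pxm;
+           acpi_status status;
+
+           status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+           if (ACPI_FAILURE(status))
+                   return NUMA_NO_NODE;  /* no _PXM object provided */
+
+           return (int)pxm;  /* map to an OS node id as appropriate */
+   }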
+ +It is not currently possible to provide NUMA information for a new hot-plugged +domain that had no previous elements in SRAT. This means that *potential* +hardware must be presented at boot, rather than simply what is present at +that time. It is possible to modify the NUMA characteristics of that hardware +to reflect the reality of what was plugged in. This is covered in more detail +in :numref:`sechotplug`. diff --git a/source/ccixsanodes.svg b/source/ccixsanodes.svg new file mode 100644 index 0000000..794e2dd --- /dev/null +++ b/source/ccixsanodes.svg @@ -0,0 +1,184 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + Memory 0 + + + + Initiator 0 + + + + + Memory 1 + + Memory 2 + + + + diff --git a/source/conf.py b/source/conf.py new file mode 100644 index 0000000..bb58049 --- /dev/null +++ b/source/conf.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'NUMA under ACPI 6.3' +copyright = '2018, Jonathan Cameron - Huawei' +author = 'Jonathan Cameron - Huawei' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.93' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ 'sphinx.ext.todo', 'sphinxcontrib.bibtex', 'sphinx.ext.mathjax' ] +todo_include_todos = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +numfig = True + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. 
For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'NUMAGUIDEdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + 'papersize': 'a4paper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', + 'preamble': r""" + \usepackage{colortbl} + \usepackage[absolute]{textpos} + \setlength{\arrayrulewidth}{1pt} + \definecolor{light-gray}{gray}{0.98} + \protected\def\sphinxstyletheadfamily {\cellcolor{light-gray}\sffamily} + \makeatletter + \renewcommand{\maketitle}{ + \begin{titlepage} + \begin{textblock}{100}(1.2,2) + % \includegraphics{uefi_logo_red.pdf} + \end{textblock} + \begin{textblock}{10}(3, 7) + \Huge + \textsc + \@title + \end{textblock} + \begin{textblock}{10}(2, 12) + \textbf{Author:} + + \textbf{Jonathan Cameron} + + Huawei Research and Development (UK), Ltd. + + Jonathan.Cameron@huawei.com + \end{textblock} + \begin{textblock}{100}(12,15) + \@date + \end{textblock} + + \begin{textblock}{1}(1,2.5) + \rule{0.5mm}{23cm} + \end{textblock} + \null + \newpage + This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/. + + % The UEFI logo is used under the UEFI Logo Usage Guidelines available from https://uefi.org. + \null + \end{titlepage} + } + \makeatother + """ +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'NUMAGUIDE.tex', 'Non Uniform Memory Access Description under ACPI 6.3', + 'Jonathan Cameron - Huawei', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'numaguide', 'NUMA under ACPI 6.3', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'NUMAGUIDE', 'Non Uniform Memory Access Description under ACPI 6.3',
     author, 'NUMAGUIDE', 'One line description of project.',
     'Miscellaneous'),
]
\ No newline at end of file
diff --git a/source/genericinitiator.rst b/source/genericinitiator.rst
new file mode 100644
index 0000000..0cf42de
--- /dev/null
+++ b/source/genericinitiator.rst
@@ -0,0 +1,167 @@
+.. |nbsp| unicode:: 0xA0
+   :trim:
+
+.. _gasect:
+
+=========================
+Generic Initiators - Why?
+=========================
+
+In the modern world of heterogeneous compute, some of the traditional underlying
+assumptions about which devices will be accessing memory, and how they
+will be doing it, are no longer true.
+
+We will first address a very simple example, but the increasing use
+of computing accelerators and memory over coherent, pluggable fabrics
+means that a generic solution to this description problem is needed.
+
+Original Assumptions
+====================
+
+1. Most accesses to memory are done by devices that are closely coupled
+   with CPUs on the host, hence it is sufficient to describe the CPU
+   access properties to memory and assume it is similar for other
+   memory users.
+
+2. The properties of accesses from peripheral devices will be closely
+   correlated with those of the host to which they are most
+   closely coupled.
+
+3. An Operating System only needs to make relative decisions, so it doesn't
+   matter if there are additional constraints on bandwidth or additional
+   latency on the path to peripheral devices.
+
+.. _figsimplenodesplusga:
+.. figure:: simplenodesplusga.*
+   :figclass: align-center
+
+   A simple configuration where these legacy assumptions do not hold.
+
+Taking the hypothetical example in :numref:`figsimplenodesplusga`, where we have an
+IO device separate from the devices containing the processors and DDR controllers,
+it becomes clear that these assumptions do not always hold.
+
+1. The IO device has an RDMA adapter. To which existing node do we assign this
+   device, given it is equidistant from Node 0 DDR and Node 1 DDR?
+
+2. The IO device is on a similar ring bus connection to that between the NUMA nodes.
+   As shown in :numref:`figsimplenodesplus_non_ga`, putting the IO device in either
+   of the existing nodes would imply one set of DDR was much nearer than the other.
+
+.. _figsimplenodesplus_non_ga:
+.. figure:: simplenodesplus_non_ga.*
+   :figclass: align-center
+
+   The ACPI view of our balanced case, as shown in :numref:`figsimplenodesplusga`, prior to
+   the introduction of Generic Initiators. Note the relative distance from our
+   RDMA adapter is falsely represented as different for Memory |nbsp| 0 and Memory |nbsp| 1.
+
+3. An RDMA adapter, in common with many modern IO devices, has many separate
+   contexts, each with its own resources. Traditionally we would place these
+   in the *nearest* memory that can be found. If, however, we have two
+   equally distant memory resources and the bandwidth to each is independent,
+   it may be beneficial for the Operating System to decide to put the data for
+   some of those contexts in Node 0 and some in Node 1 - a form of NUMA balancing.
+
+This simple example shows why we need the new (in ACPI 6.3) concept of a
+Proximity Domain with a Generic Initiator.
+Note that Generic Initiators can
+share a Proximity Domain with Memory and/or CPUs, but they can also, as in
+:numref:`figsimplenodesplusga`, have a domain all of their own. Under previous
+versions of the ACPI specification, a GI within an existing Proximity Domain
+could be described using _PXM; the case where they are on their own could not
+be described at all.
+
+Generic Initiator Additions
+===========================
+
+The ACPI specification was not greatly modified to allow for Generic Initiators,
+but as this is new in ACPI 6.3, we will highlight a few necessary changes that
+were made.
+
+SRAT Generic Initiator Affinity Structure
+-----------------------------------------
+
+Alongside the existing CPU, Memory and ITS Affinity Structures in SRAT, a new
+simple one was added to allow Generic Initiator Proximity Domains to
+be described.
+
+One complexity here is that we need a means of identifying which device
+is our Generic Initiator. Currently two means are defined for doing this:
+either via a PCI Device Handle, or an ACPI Device Handle. Here we shall
+only consider the PCI Device Handle option, but the ACPI Device Handle
+follows a similar approach.
+
+Expanding our example
+.....................
+
+Let us assume that the RDMA adapter is on a PCIe bus (the root complex may
+be in the IO device as well, so we have a standard PCIe bus attached to
+a point on a ring interconnect). This amended example is shown in
+:numref:`figsimplenodesplusgapci`.
+
+.. _figsimplenodesplusgapci:
+.. figure:: simplenodesplusgapci.*
+   :figclass: align-center
+
+   An expanded GI example including a PCIe bus, with the root port also being found
+   within our Generic Initiator domain.
+
+Rather than repeating the whole of the SRAT above, this entry may simply be added
+to the end:
+
+.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
+.. table:: Generic Initiator Affinity Structure
+   :widths: 70 55 200
+
+   +--------------------+-----------+----------------------------------+
+   | Field              | Value     | Notes                            |
+   +====================+===========+==================================+
+   | Type               | 5         | Generic Initiator Affinity       |
+   |                    |           | Structure                        |
+   +--------------------+-----------+----------------------------------+
+   | Length             | 32        |                                  |
+   +--------------------+-----------+----------------------------------+
+   | Device Handle Type | 1         | Providing a PCI Device Handle    |
+   +--------------------+-----------+----------------------------------+
+   | Proximity Domain   | 2         | Our new Generic Initiator        |
+   |                    |           | Proximity Domain - in this case  |
+   |                    |           | we have no other SRAT Affinity   |
+   |                    |           | Structures referring to domain 2.|
+   +--------------------+-----------+----------------------------------+
+   | PCI Segment        | 0         |                                  |
+   +--------------------+-----------+----------------------------------+
+   | PCI BDF            | 0x0100    | Bus 1, device 0, function 0      |
+   +--------------------+-----------+----------------------------------+
+   | Flags              | 1         | Enabled                          |
+   +--------------------+-----------+----------------------------------+
+
+Legacy OS handling of these new domains
+---------------------------------------
+
+Operating Systems that have ACPI support predating ACPI 6.3 are naturally
+unaware of Generic Initiator Structures. Linux at least is known to
+ignore Affinity Structures in SRAT if their type is not one that is already
+handled. Thus, there is no direct side effect of ACPI being used to tell
+a legacy operating system about them.
+
+However, as we mentioned in :numref:`secpxm`, there is another means
+of assigning devices to a Proximity Domain. An entry describing the
+device in the Differentiated System Description Table (DSDT) or a
+Secondary System Description Table (SSDT) may use the Proximity (_PXM) object
+to specify which Proximity Domain a device lies within.
+
+What, then, does a legacy operating system do if a device is thus
+assigned to a proximity domain which it does not know exists? In the case
+of some versions of Linux the answer is, unfortunately, that it crashes.
+Even assuming this less-than-ideal response is fixed, there is no means for
+the OS to know the 'best alternative' proximity domain to put the device in,
+given the OS is not ready to handle Generic Initiator Domains.
+
+This problem is worked around by use of a new _OSC bit, defined in ACPI 6.3,
+which allows the proximity domain provided by _PXM to be changed depending
+on whether or not the OS communicates that it supports Generic Initiators.
+This *fallback domain* should be chosen to describe a topology that allows
+the Operating System to make the best decision it can under the constraints
+of pre-Generic-Initiator ACPI. In our example, :numref:`figsimplenodesplusga`,
+we simply pick one of the nodes on the basis that they are equal in all ways.
diff --git a/source/hotplug.rst b/source/hotplug.rst
new file mode 100644
index 0000000..b6b464c
--- /dev/null
+++ b/source/hotplug.rst
@@ -0,0 +1,28 @@
+.. _sechotplug:
+
+=============================================
+Hotplug and dynamically updating the topology
+=============================================
+
+In this case we are only interested in hotplug of elements of the
+system which would, had they been present at boot, have appeared
+as entries in SRAT.
+
+The ACPI specification does not allow *new* proximity nodes to be
+created when such hotplug events occur, but it does allow for
+existing NUMA characteristics to be updated. This allows
+proximity nodes to be set aside for *potential* hotplug entities
+and their characteristics to be modified once the actual system
+elements being hotplugged are known.
+
+The Linux kernel, at the time of writing, does not make use of these
+ACPI objects, and it is not known if other operating systems do
+so. They are included here to make the reader aware of the features
+available in the ACPI specification, even if operating systems
+do not yet make use of them.
+
+* **_SLI** provides updates of the information found at boot time
+  via the SLIT table. Only information about a new node is provided.
+
+* **_HMA** provides an updated HMAT in its entirety, overriding the
+  existing boot-time HMAT table.
\ No newline at end of file
diff --git a/source/index.rst b/source/index.rst
new file mode 100644
index 0000000..3ad077f
--- /dev/null
+++ b/source/index.rst
@@ -0,0 +1,20 @@
+.. NUMA GUIDE documentation master file, created by
+   sphinx-quickstart on Tue Sep 18 13:49:28 2018.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+NUMA Guide
+==========
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   intro
+   acpitables1
+   genericinitiator
+   memorysidecaches
+   nonobvious
+   hotplug
+   conclusion
+
+.. bibliography:: refs.bib
diff --git a/source/intro.rst b/source/intro.rst
new file mode 100644
index 0000000..4f56d60
--- /dev/null
+++ b/source/intro.rst
@@ -0,0 +1,350 @@
+
.. .. todolist::

============
Introduction
============

Scope
=====

This guide is intended to act as an example-led introduction to the
Non-Uniform Memory Access (NUMA) description
available in the ACPI 6.3 specification.

NUMA descriptions are about describing the relative *distance*
between components in a shared memory computer. As we shall see, this
concept of distance has a complex definition in a modern system.

Intended Audience
=================

Whilst the content may be of interest to userspace application developers
wanting to understand *what* is being described, the primary audience is:

* Firmware developers who want to know how to describe their systems
* OS developers who wish to better understand what the description they are
  seeing actually means.

Document Conventions
====================

A few conventions are adopted to aid readability.

* Reserved fields are not shown in examples - implement them as per the ACPI
  specification.

References
==========

There are numerous good references on general NUMA concepts:

* Linux kernel documentation https://www.kernel.org/doc/html/v4.20/vm/numa.html

* ACPI Specification 6.3 http://www.uefi.org

* UEFI Specification 2.8 http://www.uefi.org



Changes since ACPI 6.2
======================

A major focus of the ACPI 6.3 cycle was to enhance the existing specification
to better support heterogeneous systems. Key aspects introduced were:

* Generic Initiators
* HMAT Changes
* Specific Purpose Memory (also defined in UEFI 2.8)

Generic Initiators
******************

A Generic Initiator (GI) is a non-host-processor initiator of memory operations.
Note that by non-host-processor we mean one that is not responsible for running
normal applications or parts of the operating system.

Prior to ACPI 6.3, these were typically only described via DSDT, with _PXM
(see :cite:`2019:ACPI` Section 6.2.14 _PXM (Proximity))
being used to associate them with Proximity Domains defined in SRAT
(see :cite:`2019:ACPI` Section 5.2.16 System Resource Affinity Table (SRAT)).

.. todo:: Look at reference styles as this is a bit ugly.


These SRAT-defined Proximity Domains primarily included Memory
and/or Processors. This limitation prevented accurate description of systems
where the characteristics of accesses from these GIs were not the same as
those from the existing domains. All that could be done was to put them into the
*best available* domain. With the advent of accelerators connected to complex
coherent interconnects, this was no longer good enough.

These are described in more detail in :numref:`gasect`.

HMAT Changes
************

Several minor changes in HMAT contribute to improving the ability to
describe the inter-domain properties in a consistent and useful fashion.

As the HMAT table was little used prior to ACPI 6.3, we shall not focus
on these changes, but instead describe the current situation.

Specific Purpose Memory
***********************

This concept is not directly related to NUMA systems, but it is important
to the sort of complex NUMA system ACPI 6.3 allows you to describe, so
we will include a brief description here.

The UEFI specification, :cite:`2019:UEFI` Section 7.2,
defines a range of different memory types, according
to their different operating attributes / characteristics.
For example,
Linux considers any memory to be *normal memory* if it is:

* Cacheable with a Write Back Policy
* Not non-volatile, and hence not described in the NFIT ACPI table.

Some memory in a system, whilst functionally capable of being used as normal
Operating System managed memory, may be intended for particular use cases.
Examples of this include coherent memory located at a GPU, or a large memory
intended for use by an in-memory database.

These memories may have characteristics that make them unsuitable for
general usage, including:

* Large latencies
* Different reliability characteristics to main memory.

Some operating systems, including Linux, will by default allocate some
data structures evenly across all memory proximity domains in the system.
One use of SPM may be to mark some regions of memory as being unsuitable
for this use, thus keeping them available for their intended use.

ACPI 6.3 does not specify *how* this information will be used, as that is
something that may take considerable discussion and experimentation to
pin down. Whilst an interesting topic to explore, we are not intending
to cover it any further in this version of this document.

Why does an Operating System Care?
==================================

There are a lot of aspects to this - we need to consider
some of them here to justify the design of the ACPI tables.
In particular, we will aim to highlight when the way the
operating system uses this information may affect the way
in which a firmware decides to present it.

First we shall introduce some basic concepts.

ACPI NUMA Description - Major Elements
======================================

There are many possible ways of describing the topology of
a computer system. The ACPI NUMA description takes some
simple concepts onto which complex topologies may be mapped
in a useful fashion. The most fundamental concept is that
of a Proximity Domain or NUMA node.

Before considering specific examples, let us look at a somewhat
more realistic system, :numref:`figsimple-2p`.

.. _figsimple-2p:
.. figure:: typical-2p.*
   :figclass: align-center

   A Simplified 2 Socket Server with a reasonable NUMA node assignment.

This configuration
contains a number of CPU cores as well as PCIe-connected peripherals.
For the majority of this document, a single CPU core per node will be
shown. This is sufficient to convey the important points and reduces
repetition in the examples.
Similarly, peripherals will only be introduced when we consider
their NUMA representation.

NUMA Nodes
**********

A NUMA node consists of a group of elements of the system. These
may include:

* Processors
* Memory
* Peripheral Buses
* Networking devices, storage controllers
* Chipset

These elements can be considered to be in a single node if there is
no benefit in describing them separately. The benefits that may
be derived from separating elements into different nodes will be
addressed later.

This lack of benefit typically means that no information can be provided that would
lead to a particular placement or usage decision by the Operating System.
Any given implementation may decide to make further simplifying
assumptions suitable for its targeted application area, perhaps deciding
not to differentiate between memories that are *similar* in characteristics.

Simple cases of such information include:

* Bandwidth between an initiator on a NUMA node and memory on a different NUMA node.
* Latency between an initiator on a NUMA node and memory on a different NUMA node.
* Bandwidth and latency between a user of memory and the memory found in the
  same NUMA node.
* Different caches available in front of the possible memory choices.
* The older concept of NUMA distance, a relative measure of the
  memory latency between different NUMA nodes.

As we shall see, there can be more complex reasons to describe separate NUMA nodes.
Some of these only become apparent when we consider how the Operating System
makes use of the memory.

What we mean by memory
**********************

A modern system contains a number of different types of memory with different
access characteristics and restrictions. For example, we have DDR attached
to memory controllers on an SoC, as well as large memories closely coupled to GPUs
which may not be coherently cached by the CPU. Not all of these
memories are described in the NUMA description that ACPI 6.3 provides.

It is important to note that the ACPI 6.3 specification's
NUMA description is concerned only with memory which may be used for
general purpose allocations. This means that, for the memory in question:

* Cache coherency must be maintained, so that different initiators
  within the system obtain the latest version of what is in memory, without any
  software interaction.

* Atomic operations consistent with those of the relevant CPU architecture must
  be supported.

Another way of looking at this is that, aside from bandwidth and latency characteristics,
this memory must behave the same as the system's RAM. That the memory
may be used for general purpose allocations, from the point of view of correct operation,
does not mean that it is suitable for such use when performance is considered.
The Specific Purpose Memory attribute is intended to provide information
to the operating system on whether such memory is suitable / intended for
such general purpose use.

Unrolling the topology
**********************

The ACPI 6.3 description of the NUMA properties of the system may be thought of
as unrolling each path across the interconnect and system topology, so as to be
able to describe end-to-end properties between any two points.

.. _figsimple:
.. figure:: simple.*
   :figclass: align-center

   A very simple NUMA topology.

:numref:`figsimple` shows a very simple NUMA topology with two single-CPU SoCs, each of
which has local DDR memory, and the interconnect between them.
The ACPI NUMA representation simplifies this topology somewhat in order to
make it easier to describe and use.
This can be thought of as an *unrolling* process. An example of the results
of this is shown in :numref:`figunrolled`.

.. _figunrolled:
.. figure:: Unrolling.*
   :figclass: align-center

   An *unrolled* representation of the NUMA topology.

The intermediate elements in this unrolling are only of interest for the restrictions that
they may place upon the link between the requester (here always a CPU) and the memory it is
working with. This means that ACPI only ever considers the *aggregate* properties all the
way between the initiator and the memory.

Combining elements into nodes
*****************************

Now, at first glance, it might seem to make sense to have four separate nodes for the simple
topology seen in :numref:`figsimple`, one for each of the CPUs and one for each of the Memories.
This would correspond to the underlying physical layout and would be a correct description.
However, ACPI allows the properties between an initiator
(here a CPU) and memory to be described even if they are within the same NUMA node.

This allows us to create NUMA nodes as shown in :numref:`figsimplenodes`.
:numref:`figsimplenodesunrolled` shows how these NUMA nodes map to the unrolled representation.

.. _figsimplenodes:
.. figure:: simplenodes.*
   :figclass: align-center

   NUMA nodes for the simple topology.

.. _figsimplenodesunrolled:
.. figure:: simplenodesunrolled.*
   :figclass: align-center

   The nodes shown in :numref:`figsimplenodes` in their unrolled representation.

Why would we want to do this combining?
***************************************

Some of the topology descriptions, which we will come to shortly, use dense matrices to represent
the characteristics of these nodes. It is therefore useful to combine potential nodes as we
have done here, as long as there is no loss of representative power.

The concept of *local memory* is also used by operating systems to provide a simple best
choice when trying to locate data near to the initiator making use of it. It is defined
as being that memory which is the best choice for a particular initiator. Whilst
co-locating memory and initiator in a particular domain makes the choice obvious, Operating
Systems will often fall back to a search of Proximity Domains so as to provide a good
answer for initiators that are in nodes without local memory. In Linux these are termed
*Memoryless Nodes*.

How Operating Systems use NUMA Information
==========================================

Linux
*****

For each node containing memory, Linux manages the memory separately. This means:

* Separate free page lists
* Separate in-use page lists
* Separate usage statistics

In Linux, there is another level of subdivision done within each NUMA node. This subdivision
is into Memory Zones. Each Zone represents memory that shares certain
characteristics which restrict what allocations it may be used to satisfy.
For example, ZONE_DMA is memory suitable for DMA access from initiators with limited address
range support, whilst ZONE_MOVABLE is used to prevent allocations that cannot be moved,
thus allowing for the migration needed when hot removing memory.

For each of these zones, within each NUMA node, the Linux kernel maintains a fallback list.
The ordering is such that allocations first fall back to the same zone on other NUMA nodes
(ordered by NUMA distance), and only once the zone is full across all nodes do they fall back
to other zones on the local node. This choice was made to preserve those zones which can
be used for any allocation, but which are a limited resource.

Subject to there being space, by default Linux always attempts to allocate memory
from the NUMA node from which the request originates - the so-called 'local' node
(typically the node containing the CPU running the allocation call).
There are exceptions to this. One of the biggest is that
it will allocate memory associated with a particular hardware device, such as a network
card, on the NUMA node in which that device is found (note this is only true of
well-constructed drivers).

The scheduler, which is responsible for deciding which processor each task runs on,
uses the NUMA information to try to minimize the migration of processes to other NUMA
nodes, as this would put a large load on the system interconnect.
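The default local allocation policy just described can also be exercised
explicitly from user space. Below is a minimal hedged sketch using libnuma;
it assumes libnuma and its ``numa.h`` header are installed (link with
``-lnuma``) and that node 1 exists on the machine in question.

.. code-block:: c

   /* Illustrative only: allocate memory locally, then on an explicit
    * NUMA node, using libnuma. Build with: cc demo.c -lnuma */
   #include <numa.h>
   #include <stdio.h>
   #include <stdlib.h>

   int main(void)
   {
       if (numa_available() < 0) {
           fprintf(stderr, "No NUMA support on this system\n");
           return EXIT_FAILURE;
       }

       size_t sz = 1 << 20; /* 1 MiB */

       /* Default policy: pages will come from the local node. */
       void *local = numa_alloc_local(sz);

       /* Explicit placement, e.g. near a device known to sit on node 1. */
       void *on_node1 = numa_alloc_onnode(sz, 1);

       if (local)
           numa_free(local, sz);
       if (on_node1)
           numa_free(on_node1, sz);
       return EXIT_SUCCESS;
   }

Which node counts as 'local', and which node a device sits on, is of course
exactly what the ACPI description discussed in this document determines.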
The kernel provides user-space applications with the ability to set a mask of the NUMA
nodes a process is limited to, and of the nodes its memory will be allocated from.

Advanced topics such as NUMA balancing are out of the scope of this particular document.

Windows
*******

Some information on Windows' use of NUMA characteristics may be found at
https://docs.microsoft.com/en-us/windows/desktop/ProcThread/numa-support



diff --git a/source/memorysidecaches.rst b/source/memorysidecaches.rst
new file mode 100644
index 0000000..699613a
--- /dev/null
+++ b/source/memorysidecaches.rst
@@ -0,0 +1,346 @@
.. _secmemorysidecache:

===============================
Representing Memory-Side Caches
===============================

In a modern heterogeneous computing system, there are a number of types
of caching that the operating system should be aware of.

* **CPU/CPU Cluster caches**

  Representation of these is outside the scope of this document, as they are
  most often reflected in the performance of a particular initiator
  irrespective of the NUMA characteristics of the system.

* **Memory-Side caches**

  These are typically physically found somewhere in the path from the
  memory controller to the actual memory elements. In a similar fashion
  to processor caches they can have several levels.

* **Transparent caches in the fabric**

  Some new fabrics, e.g. CCIX, may allow for transparent caches between
  the initiator and the memory's point of coherency or home.
  In the current Initiator / Target NUMA description these are effectively
  the same as memory-side caches.

Memory-Side Caches
==================

The Heterogeneous Memory Attribute Table (HMAT) has structures to represent
the nature of these Memory-Side caches, and also to provide latency and bandwidth
information for all of the elements of the memory system. The effect these caches
will have on the aggregate NUMA performance is workload dependent. The
intent of this information is therefore to provide the inputs for detailed
workload modelling, or for simpler heuristics that can use this information to
improve placement and scheduling decisions.

ACPI Table elements for memory-side caches
==========================================

In order to introduce the ACPI representation of a side cache
system, let us first consider a simple example: :numref:`figsidecache1`.
Here we have two proximity domains. The first contains the host processor and
some directly attached DDR memory. The second domain contains memory with
a memory-side cache, for example a non-volatile memory with a RAM cache.

.. _figsidecache1:
.. figure:: simplesidecache1.*
   :figclass: align-center

   A straightforward memory-side cache example. Memory 1 might be
   Storage Class Memory with a DDR cache, for example.


Memory Side Cache Information Structure
---------------------------------------

This structure provides the Operating System with information on the
type of a given memory-side cache. Taking the example in :numref:`figsidecache1`,
we will see how this works.
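The Cache Attributes value used in the table below packs five sub-fields into
a single 32-bit word. As a sanity check on the 0x00802211 encoding, here is a
small hedged C helper; the bit positions follow ACPI 6.3, but the function
name is ours.

.. code-block:: c

   #include <assert.h>
   #include <stdint.h>

   /* Pack the Memory Side Cache Information Structure "Cache Attributes"
    * field, ACPI 6.3: total levels [3:0], level described [7:4],
    * associativity [11:8], write policy [15:12], line size [31:16]. */
   static uint32_t cache_attributes(uint32_t total_levels, uint32_t level,
                                    uint32_t associativity,
                                    uint32_t write_policy, uint32_t line_size)
   {
       return (total_levels   & 0xf)
            | ((level         & 0xf) << 4)
            | ((associativity & 0xf) << 8)
            | ((write_policy  & 0xf) << 12)
            | ((line_size  & 0xffff) << 16);
   }

   int main(void)
   {
       /* 1-level cache, describing level 1, complex indexing (2),
        * write through (2), 128 byte cache line. */
       assert(cache_attributes(1, 1, 2, 2, 128) == 0x00802211);
       return 0;
   }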
.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
.. table:: Memory Side Cache Information Structure (HMAT)
   :widths: 70 55 200

   +--------------------+-----------+------------------------------------+
   | Field              | Value     | Notes                              |
   +====================+===========+====================================+
   | Type               | 2         | Memory-Side Cache Information      |
   |                    |           | Structure                          |
   +--------------------+-----------+------------------------------------+
   | Length             | N         |                                    |
   +--------------------+-----------+------------------------------------+
   | Memory Proximity   | 1         | This is our new domain with the    |
   | Domain             |           | side cache.                        |
   +--------------------+-----------+------------------------------------+
   | Cache Attributes   | 0x00802211| 1 Level Cache [3:0] = 1            |
   |                    |           |                                    |
   |                    |           | Describing Level 1 [7:4] = 1       |
   |                    |           |                                    |
   |                    |           | Complex Cache Indexing [11:8] = 2  |
   |                    |           |                                    |
   |                    |           | Write Through [15:12] = 2          |
   |                    |           |                                    |
   |                    |           | 128 Byte Cache Line [31:16] = 0x80 |
   +--------------------+-----------+------------------------------------+
   | No. SMBIOS Handles | 1         |                                    |
   +--------------------+-----------+------------------------------------+
   | SMBIOS Handle 0    |           | Handle to Physical Memory          |
   |                    |           | Component Structure                |
   +--------------------+-----------+------------------------------------+

Now we also need to represent the access characteristics of this multi-level
memory system. We shall use the following characteristics. Unlike in
the original HMAT example, we will assume symmetric characteristics.

* Latency

  - Node 0 Initiator to Local DDR access latency 90 ns.
  - Node 0 Initiator to Node 1 Memory-Side Cache (Level 1) access latency 70 ns.
  - Node 0 Initiator to Node 1 Memory (Level 0) access latency 200 ns.

* Bandwidth

  - Node 0 Initiator to Local DDR access bandwidth 3200 MB/s.
  - Node 0 Initiator to Node 1 Memory-Side Cache (Level 1) access bandwidth 3200 MB/s.
  - Node 0 Initiator to Node 1 Memory (Level 0) access bandwidth 1600 MB/s.

As this is a straightforward SRAT example, we will just assume appropriate SRAT
entries exist.

.. todo:: Shall we provide full examples of the various tables in an appendix?
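Since reading the tables that follow means repeatedly applying the Entry Base
Unit, a small hedged helper may make the scaling explicit. Per ACPI 6.3 the
latency base unit is in picoseconds (hence a base unit of 1000 giving
nanoseconds) and the bandwidth base unit is in MB/s; the function names are
ours.

.. code-block:: c

   #include <stdint.h>

   /* Convert raw 2-byte HMAT System Locality Latency and Bandwidth
    * entries to conventional units (ACPI 6.3: latency base unit in
    * picoseconds, bandwidth base unit in MB/s). */
   static double hmat_latency_ns(uint16_t entry, uint64_t base_unit_ps)
   {
       return (double)entry * (double)base_unit_ps / 1000.0;
   }

   static uint64_t hmat_bandwidth_mbs(uint16_t entry, uint64_t base_unit_mbs)
   {
       return (uint64_t)entry * base_unit_mbs;
   }

With the base unit of 1000 used below, a raw latency entry of 90 decodes to
90 ns; with a bandwidth base unit of 100, an entry of 32 decodes to 3200 MB/s.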
.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
.. table:: Side Cache Example Topology HMAT System Locality Latency and Bandwidth
   Information Structure for memory access latency
   :widths: 70 55 200

   +--------------------+-----------+----------------------------------+
   | Field              | Value     | Notes                            |
   +====================+===========+==================================+
   | Type               | 1         | Identifies this as the "System   |
   |                    |           | Locality Latency and Bandwidth   |
   |                    |           | Information Structure"           |
   +--------------------+-----------+----------------------------------+
   | Length             | N         |                                  |
   +--------------------+-----------+----------------------------------+
   | Flags              | 0         | Memory Hierarchy 0 (the memory)  |
   +--------------------+-----------+----------------------------------+
   | Data Type          | 0         | This table is for memory access  |
   |                    |           | latency.                         |
   +--------------------+-----------+----------------------------------+
   | Number of Initiator| 1         | We have processors, which are one|
   | Proximity Domains  |           | type of initiator, in Node 0 only|
   +--------------------+-----------+----------------------------------+
   | Number of Target   | 2         | We have DDR memory in Node 0     |
   | Proximity Domains  |           | and SCM in Node 1.               |
   +--------------------+-----------+----------------------------------+
   | Entry Base Unit    | 1000      | 1000 corresponds to units of     |
   |                    |           | nanoseconds. We could have used  |
   |                    |           | many different units here due to |
   |                    |           | the relatively minor differences |
   |                    |           | in latencies, but 1000 gives     |
   |                    |           | an easy to understand scaling.   |
   +--------------------+-----------+----------------------------------+
   | Initiator Proximity| 0         | 1st Initiator Proximity Domain   |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | latency information.             |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 0         | 1st Target Proximity Domain      |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | latency information.             |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 1         | 2nd Target Proximity Domain      |
   | Domain List Entry 1|           | for which we are providing       |
   |                    |           | latency information.             |
   +--------------------+-----------+----------------------------------+
   | Entry[0][0]        | 90        | Node 0 to local DDR access       |
   |                    |           | latency, 90 ns                   |
   +--------------------+-----------+----------------------------------+
   | Entry[0][1]        | 200       | Node 0 to Node 1 SCM access      |
   |                    |           | latency, 200 ns                  |
   +--------------------+-----------+----------------------------------+

.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
.. table:: Side Cache Example Topology HMAT System Locality Latency and Bandwidth
   Information Structure for cache level 1 access latency
   :widths: 70 55 200

   +--------------------+-----------+----------------------------------+
   | Field              | Value     | Notes                            |
   +====================+===========+==================================+
   | Type               | 1         | Identifies this as the "System   |
   |                    |           | Locality Latency and Bandwidth   |
   |                    |           | Information Structure"           |
   +--------------------+-----------+----------------------------------+
   | Length             | N         |                                  |
   +--------------------+-----------+----------------------------------+
   | Flags              | 1         | Memory Hierarchy 1 (side cache)  |
   +--------------------+-----------+----------------------------------+
   | Data Type          | 0         | This table is for memory-side    |
   |                    |           | cache level 1 access latency.    |
   +--------------------+-----------+----------------------------------+
   | Number of Initiator| 1         | We have processors, which are one|
   | Proximity Domains  |           | type of initiator, in Node 0 only|
   +--------------------+-----------+----------------------------------+
   | Number of Target   | 1         | We only have a memory-side cache |
   | Proximity Domains  |           | for Node 1.                      |
   +--------------------+-----------+----------------------------------+
   | Entry Base Unit    | 1000      | 1000 corresponds to units of     |
   |                    |           | nanoseconds. We could have used  |
   |                    |           | many different units here due to |
   |                    |           | the relatively minor differences |
   |                    |           | in latencies, but 1000 gives     |
   |                    |           | an easy to understand scaling.   |
   +--------------------+-----------+----------------------------------+
   | Initiator Proximity| 0         | 1st Initiator Proximity Domain   |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | latency information.             |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 1         | 1st Target Proximity Domain      |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | latency information.             |
   +--------------------+-----------+----------------------------------+
   | Entry[0][0]        | 70        | Node 0 to a hit on the           |
   |                    |           | memory-side cache, latency 70 ns |
   +--------------------+-----------+----------------------------------+
.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
.. table:: Side Cache Example Topology HMAT System Locality Latency and Bandwidth
   Information Structure for memory access bandwidth
   :widths: 70 55 200

   +--------------------+-----------+----------------------------------+
   | Field              | Value     | Notes                            |
   +====================+===========+==================================+
   | Type               | 1         | Identifies this as the "System   |
   |                    |           | Locality Latency and Bandwidth   |
   |                    |           | Information Structure"           |
   +--------------------+-----------+----------------------------------+
   | Length             | N         |                                  |
   +--------------------+-----------+----------------------------------+
   | Flags              | 0         | Memory Hierarchy 0 (the memory)  |
   +--------------------+-----------+----------------------------------+
   | Data Type          | 3         | This table is for memory access  |
   |                    |           | bandwidth.                       |
   +--------------------+-----------+----------------------------------+
   | Number of Initiator| 1         | We have processors, which are one|
   | Proximity Domains  |           | type of initiator, in Node 0 only|
   +--------------------+-----------+----------------------------------+
   | Number of Target   | 2         | We have DDR memory in Node 0     |
   | Proximity Domains  |           | and SCM in Node 1.               |
   +--------------------+-----------+----------------------------------+
   | Entry Base Unit    | 100       | Entries are in units of          |
   |                    |           | 100 MB/s.                        |
   +--------------------+-----------+----------------------------------+
   | Initiator Proximity| 0         | 1st Initiator Proximity Domain   |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | bandwidth information.           |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 0         | 1st Target Proximity Domain      |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | bandwidth information.           |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 1         | 2nd Target Proximity Domain      |
   | Domain List Entry 1|           | for which we are providing       |
   |                    |           | bandwidth information.           |
   +--------------------+-----------+----------------------------------+
   | Entry[0][0]        | 32        | Node 0 to local DDR access       |
   |                    |           | bandwidth, 3200 MB/s             |
   +--------------------+-----------+----------------------------------+
   | Entry[0][1]        | 16        | Node 0 to Node 1 SCM access      |
   |                    |           | bandwidth, 1600 MB/s             |
   +--------------------+-----------+----------------------------------+
.. tabularcolumns:: |p{0.20\linewidth}|>{\centering\arraybackslash}p{0.15\linewidth}|p{0.50\linewidth}|
.. table:: Side Cache Example Topology HMAT System Locality Latency and Bandwidth
   Information Structure for cache level 1 access bandwidth
   :widths: 70 55 200

   +--------------------+-----------+----------------------------------+
   | Field              | Value     | Notes                            |
   +====================+===========+==================================+
   | Type               | 1         | Identifies this as the "System   |
   |                    |           | Locality Latency and Bandwidth   |
   |                    |           | Information Structure"           |
   +--------------------+-----------+----------------------------------+
   | Length             | N         |                                  |
   +--------------------+-----------+----------------------------------+
   | Flags              | 1         | Memory Hierarchy 1 (side cache)  |
   +--------------------+-----------+----------------------------------+
   | Data Type          | 3         | This table is for memory-side    |
   |                    |           | cache level 1 access bandwidth.  |
   +--------------------+-----------+----------------------------------+
   | Number of Initiator| 1         | We have processors, which are one|
   | Proximity Domains  |           | type of initiator, in Node 0 only|
   +--------------------+-----------+----------------------------------+
   | Number of Target   | 1         | We only have a memory-side cache |
   | Proximity Domains  |           | for Node 1.                      |
   +--------------------+-----------+----------------------------------+
   | Entry Base Unit    | 100       | Entries are in units of          |
   |                    |           | 100 MB/s.                        |
   +--------------------+-----------+----------------------------------+
   | Initiator Proximity| 0         | 1st Initiator Proximity Domain   |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | bandwidth information.           |
   +--------------------+-----------+----------------------------------+
   | Target Proximity   | 1         | 1st Target Proximity Domain      |
   | Domain List Entry 0|           | for which we are providing       |
   |                    |           | bandwidth information.           |
   +--------------------+-----------+----------------------------------+
   | Entry[0][0]        | 32        | Node 0 to a hit on the           |
   |                    |           | memory-side cache, bandwidth     |
   |                    |           | 3200 MB/s                        |
   +--------------------+-----------+----------------------------------+

Complex Cases - Shared Caches
-----------------------------

In a modern system, it is not unusual to find a complex series of interconnects
between the CPU and coherent *far* memory. These far memories may have very
different characteristics to local DDR, and ACPI provides the means to cleanly
describe those characteristics.

However, these complex systems can sometimes throw up cases for which the
*correct* description in ACPI 6.3 is not obvious. :numref:`figsharedsidecache1`
shows one such example. Here we have two *expansion memory* devices, each of which
has its own local transparent cache. These can be described as we did for
:numref:`figsidecache1`. The system in question has a device responsible for
maintaining cache coherent access to the memories external to the host processor.
This device also has a transparent cache, but in this case it caches memory
from both of the devices behind it. Conceptually this is similar to how a
multicore processor may have per-core L1 caches but share cache at higher levels
across all cores.

.. _figsharedsidecache1:
.. figure:: sharedsidecache1.*
   :figclass: align-center

   A more complex memory-side cache example, including a shared level 2 memory-side
   cache.

In ACPI 6.3 there is no explicit means of representing this sharing. There are three
approaches that a system firmware might use to describe this structure; which is
the optimum choice is currently unclear.

1. Combine the two memories and their local caches into a single proximity domain
   with only one set of properties. This can be represented as a normal memory
   with two levels of memory-side cache. Unfortunately this obscures any difference
   between the two nodes, and as we shall see in :numref:`figccix1` there can
   be good system engineering reasons to represent the fact that these memories are
   separate.

2. Pretend we have two caches, sized in proportion to the memory behind them.
   This can result in underestimating the potential performance in the case where
   only one of the memories is in use.

3. Pretend there is no contention on the shared cache. Without contention this
   can be represented as two separate caches, providing the information necessary
   to estimate expected performance as long as the other memory is not in use.
.. _figsharedsidecache2:
.. figure:: sharedsidecache2.*
   :figclass: align-center

   The ways in which the memory-side caches in :numref:`figsharedsidecache1` may be
   represented.
\ No newline at end of file
diff --git a/source/nonobvious.rst b/source/nonobvious.rst
new file mode 100644
index 0000000..ad9f632
--- /dev/null
+++ b/source/nonobvious.rst
@@ -0,0 +1,68 @@
========================
Non-obvious corner cases
========================

As NUMA systems become more complex, it can become increasingly
difficult to know exactly when certain topologies should be represented
in a particular fashion. This is ultimately driven by the question of
whether the Operating System or user-space processes may make use of the
information.


When not to merge nodes
=======================

One use the operating system will make of the NUMA description is to perform
NUMA memory interleaving. In this case it can use the ACPI description
to identify suitable memories to interleave data over (often at the Page
or Huge Page level) in order to reduce the pressure on particular elements
of the topology. Clearly, when it is possible to do such interleaving
in hardware, that is normally preferable. It is common for systems to
perform memory interleaving over all the DDR controllers on a single socket.

However, in some cases the hardware does not support such functionality,
perhaps because these are widely disaggregated components connected to
a coherent fabric. An example is shown in :numref:`figccix1`.

.. _figccix1:

.. figure:: ccixsanodes.*
   :figclass: align-center

   Memory 1 and Memory 2 have identical properties; should we put them in
   one node?

A reasonable firmware design might choose to represent these two
nodes separately despite their identical characteristics.

.. _slitoslimit:

SLIT - Legacy OS limitations
============================

Taking Linux as an example, the following restrictions are applied to SLIT
which are not present under the ACPI specification.

Non-local nodes with a local node distance
******************************************

With SLIT alone, it seems odd that there might be a situation where
we would like to say that an apparently different NUMA node is at local
distance. However, the example of deliberately not merging nodes, to allow
applications to request software NUMA balancing, shows this is not true.

With the introduction of HMAT we may want to specify additional nodes
just to describe their memory-side caches, or some really subtle difference.
For operating systems that are still using SLIT, it might seem logical to
just have a representation where nodes other than the local one are given
distance 10. However, it seems that Linux rejects this option, so it
must be avoided.

Asymmetry
*********

There are some fairly simple topologies that will result in asymmetric
characteristics. Unfortunately, legacy operating systems (Linux for
example) are not set up to allow for this asymmetry if specified in SLIT.
As such it should be avoided.
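The first of these restrictions can be made concrete with a sketch of the
acceptance test Linux applies to SLIT. The code below is a simplified model
of the kernel's slit_valid() check (in its ACPI NUMA code); the struct is
reduced to the fields used, and names are abbreviated.

.. code-block:: c

   #include <stdint.h>

   #define LOCAL_DISTANCE 10  /* ACPI-defined distance of a node to itself */

   /* Reduced view of the SLIT: locality_count followed by a flattened
    * locality_count x locality_count matrix of relative distances. */
   struct slit {
       uint64_t locality_count;
       uint8_t entry[];  /* entry[count * i + j] = distance from i to j */
   };

   /* Sketch of Linux's acceptance test: the diagonal must be exactly
    * LOCAL_DISTANCE and every off-diagonal entry strictly greater, so
    * a "remote node at local distance" SLIT is discarded. */
   static int slit_valid(const struct slit *slit)
   {
       uint64_t d = slit->locality_count;

       for (uint64_t i = 0; i < d; i++) {
           for (uint64_t j = 0; j < d; j++) {
               uint8_t val = slit->entry[d * i + j];

               if (i == j) {
                   if (val != LOCAL_DISTANCE)
                       return 0;
               } else if (val <= LOCAL_DISTANCE) {
                   return 0;
               }
           }
       }
       return 1;
   }

Note that this check says nothing about symmetry: an asymmetric SLIT is not
rejected outright, but instead trips up allocator and scheduler assumptions
later, which is why it too should be avoided.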
+ diff --git a/source/sharedsidecache1.svg b/source/sharedsidecache1.svg new file mode 100644 index 0000000..aa6e0eb --- /dev/null +++ b/source/sharedsidecache1.svg @@ -0,0 +1,312 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + + + + + Memory 1(Level 0 Cache) + + + + Level 1 Cache + + + + Memory 2(Level 0 Cache) + + + + Level 1 Cache + + + + + + Shared AgentOffering Point of Coherency + + + + + Level 2 Cache + + + + + + diff --git a/source/sharedsidecache2.svg b/source/sharedsidecache2.svg new file mode 100644 index 0000000..d67dc31 --- /dev/null +++ b/source/sharedsidecache2.svg @@ -0,0 +1,733 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + + Memory 1 + 2(Level 0 Cache) + + + + Level 1 Cache (1 + 2) + + + + + Shared AgentOffering Point of Coherency + + + + + Level 2 Cache + + + + + + Memory 0 + + + + CPU 0 + + + + + + Memory 1(Level 0 Cache) + + + + Level 1 Cache + + + Memory 2(Level 0 Cache) + + Level 1 Cache + + Shared AgentOffering Point of Coherency + + + Lvl 2 Cache (a) + + + + + Lvl 2 Cache (b) + + + Memory 0 + + + + CPU 0 + + + + + + Memory 1(Level 0 Cache) + + + + Level 1 Cache + + + Memory 2(Level 0 Cache) + + Level 1 Cache + + Shared AgentOffering Point of Coherency + + + Level 2 Cache + + + + + Level 2 Cache + 1. + 2. + 3. + + diff --git a/source/simple.svg b/source/simple.svg new file mode 100644 index 0000000..d9f71be --- /dev/null +++ b/source/simple.svg @@ -0,0 +1,169 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1 + + + + CPU 1 + + + + + + diff --git a/source/simplenodes.svg b/source/simplenodes.svg new file mode 100644 index 0000000..335b6d5 --- /dev/null +++ b/source/simplenodes.svg @@ -0,0 +1,166 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1 + + + + CPU 1 + + + + + diff --git a/source/simplenodesplus_non_ga.svg b/source/simplenodesplus_non_ga.svg new file mode 100644 index 0000000..e60d90d --- /dev/null +++ b/source/simplenodesplus_non_ga.svg @@ -0,0 +1,193 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1 + + + + CPU 1 + + + + + + RDMA Adapter + + + diff --git a/source/simplenodesplusga.svg b/source/simplenodesplusga.svg new file mode 100644 index 0000000..83274a7 --- /dev/null +++ b/source/simplenodesplusga.svg @@ -0,0 +1,205 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1 + + + + CPU 1 + + + + + + + RDMA Adapter + + + + + + + diff --git a/source/simplenodesplusgapci.svg b/source/simplenodesplusgapci.svg new file mode 100644 index 0000000..137a428 --- /dev/null +++ b/source/simplenodesplusgapci.svg @@ -0,0 +1,246 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1 + + + + CPU 1 + + + + + + + RDMA Adapter + + + + + + + PCIe Root Complex + + Root Port + + + diff --git a/source/simplenodesunrolled.svg b/source/simplenodesunrolled.svg new file mode 100644 index 0000000..aecc152 --- /dev/null +++ b/source/simplenodesunrolled.svg @@ -0,0 +1,307 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + CPU 0 + + + + Memory 0 + + + + CPU 0 + + + + CPU 1 + + + + Memory 1 + + + + CPU 1 + + + + CPU 0 + + + + Memory 0 + + + + CPU 1 + + + + Memory 1 + + + + + + + + + diff --git a/source/simplesidecache1.svg b/source/simplesidecache1.svg new file mode 100644 index 0000000..cd57bcd --- /dev/null +++ 
b/source/simplesidecache1.svg @@ -0,0 +1,192 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + Memory 0 + + + + CPU 0 + + + + + Memory 1(Level 0 Cache) + + + + Level 1 Cache + + + + + diff --git a/source/typical-2p.svg b/source/typical-2p.svg new file mode 100644 index 0000000..44dfa36 --- /dev/null +++ b/source/typical-2p.svg @@ -0,0 +1,282 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + Memory 0 + + CPU 0-15 + + + Memory 1 + + CPU 16-31 + + + + + PCIe RPs + + PCIe RPs + + NIC + + + NVME + + + GPU + + NVME + + +