
set cpu affinity and membind for better oob performance #853

Merged · 19 commits · Aug 27, 2024
Showing changes from 8 commits
9 changes: 5 additions & 4 deletions docker/Dockerfile.intel
@@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    libpng-dev \
    python3 \
    python3-pip \
    python3-dev \
    libnuma-dev \
    && rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
@@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \
    torchaudio==${TORCHAUDIO_VERSION} \
    -f https://download.pytorch.org/whl/torch_stable.html && \
    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
    python3 -m pip install --no-cache-dir numa

ARG OMP_NUM_THREADS=1
ENV OMP_NUM_THREADS=${OMP_NUM_THREADS}
ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so"
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
1 change: 0 additions & 1 deletion optimum/intel/ipex/modeling_base.py
@@ -129,7 +129,6 @@ def ipex_jit_trace(model, task, use_cache):

    return trace_model


class IPEXModel(OptimizedModel):
    auto_model_class = AutoModel
    export_feature = "feature-extraction"
82 changes: 82 additions & 0 deletions optimum/intel/utils/modeling_utils.py
@@ -18,6 +18,7 @@

import torch
from huggingface_hub import HfApi, HfFolder
import os


MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"}
@@ -110,3 +111,84 @@ def _find_files_matching_pattern(
    files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder]

    return files

def get_int_from_env(env_keys, default):
    """Returns the first non-negative integer found in the environment variables listed in `env_keys`, or `default` if none is set."""
    for e in env_keys:
        val = int(os.environ.get(e, -1))
        if val >= 0:
            return val
    return default
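
# Example (illustrative): with WORLD_SIZE unset and PMI_SIZE=2 exported,
# get_int_from_env(["WORLD_SIZE", "PMI_SIZE"], 1) returns 2; with neither
# variable set, it falls back to the default (1).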

def bind_cores_for_best_perf():
    """
    Sets the number of threads per rank, the NUMA CPU affinity and the NUMA memory binding, if not already set, for better out-of-the-box (OOB) performance.
    Works for world_size >= 1 and rank >= 0.

    Example:
    .. code-block:: python

        import torch
        from transformers import AutoTokenizer

        from optimum.intel.ipex import IPEXModelForCausalLM
        from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf

        bind_cores_for_best_perf()
        model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        input_sentence = ["tell me a story about a trip to the moon"]
        model_inputs = tokenizer(input_sentence, return_tensors="pt")
        generation_kwargs = dict(max_new_tokens=500)
        generated_ids = model.generate(**model_inputs, **generation_kwargs)

    Returns:
        None

    """

    import importlib.util
    import platform
[Inline review — Member]: why not at the top?
[Author]: moved
    system = platform.system()
    if system == "Linux":
        if importlib.util.find_spec("numa") is not None:
            import numa
            import psutil
            import math

            world_size = get_int_from_env(
                ["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1
            )
            rank_id = get_int_from_env(
                ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
            )
            nodes = numa.get_max_node() + 1
            rank_per_node = math.ceil(world_size / nodes)
            num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
            node_id = int(rank_id / rank_per_node)
            rank_offset_per_node = rank_id % rank_per_node
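            # Worked example (illustrative): world_size=4 on nodes=2 NUMA nodes
            # gives rank_per_node=2; rank_id=3 then maps to node_id=1 with
            # rank_offset_per_node=1 (the second rank on that node).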
if os.getenv("OMP_NUM_THREADS") is None:
# set OMP_NUM_THREADS to num of physical cores per socket
num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
print("setting OMP_NUM_THREADS to", num_cpus_per_rank)
            else:
                num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
                print("OMP_NUM_THREADS already set to", num_cpus_per_rank)
            if len(numa.get_membind()) == nodes:
                # if numa memory binding is not set, bind memory to the node where this rank runs
                numa.set_membind([node_id])

            torch.set_num_threads(num_cpus_per_rank)
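            # torch.set_num_threads sets intra-op parallelism for this rank,
            # keeping it consistent with the OMP_NUM_THREADS value chosen above.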


            if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
                # if numa affinity is unset (default is all logical cores), pin this rank to its physical cores
                cpu_start = num_cpus_per_rank * rank_offset_per_node
                numa.set_affinity(
                    0,
                    list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank],
                )
            print(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
        else:
            print("numa module not found, skipping binding cores")
    else:
        print("OS not supported, skipping binding cores")