Add CI/CD for unit tests #41

Merged Feb 16, 2024 · 107 commits

Changes from 93 commits
1c79951
add CI/CD for unit tests
xrsrke Jan 19, 2024
04491d3
fix
xrsrke Jan 19, 2024
fdd5d1e
fix syntax
xrsrke Jan 19, 2024
91208dd
fix
xrsrke Jan 19, 2024
8da087d
fix
xrsrke Jan 19, 2024
00875c0
update actions/checkout
xrsrke Jan 19, 2024
cca7e56
new runner label
glegendre01 Jan 19, 2024
338c042
fix typo
glegendre01 Jan 19, 2024
0c6433c
add workflow dispatch
glegendre01 Jan 19, 2024
6de2472
remove path filter for triggering
glegendre01 Jan 19, 2024
79b22d8
test ci
xrsrke Jan 23, 2024
c73623b
update python version
xrsrke Jan 23, 2024
5efc135
add code quality
xrsrke Jan 23, 2024
4fb80a4
refactor
xrsrke Jan 23, 2024
ceb21c2
only check src
xrsrke Jan 23, 2024
05aa557
fix
xrsrke Jan 23, 2024
0010cfa
use docker image
xrsrke Jan 23, 2024
dba1eed
fix
xrsrke Jan 23, 2024
b2af5d0
use python 10
xrsrke Jan 23, 2024
8914de7
change docker image
xrsrke Jan 24, 2024
368beba
fix pip install
xrsrke Jan 24, 2024
565e081
add fa2-related tests
xrsrke Jan 24, 2024
7b38326
fix
xrsrke Jan 24, 2024
906477b
update FA2 version
xrsrke Jan 24, 2024
4491ce7
add on push
xrsrke Jan 24, 2024
5b22ede
update FA2 to flash-attn>=2.5.0
xrsrke Jan 24, 2024
5f3ce67
Merge branch 'main' of github.com:huggingface/nanotron into xrsrke/se…
xrsrke Jan 29, 2024
9a03a04
add searching for free ports in unit tests
xrsrke Jan 29, 2024
1cf4da2
remove searching port
xrsrke Jan 29, 2024
f6d9847
move searching ports to distributed
xrsrke Jan 29, 2024
f675daf
Update 3d_parallelism_unit_tests.yaml
xrsrke Jan 29, 2024
0908b74
Update 3d_parallelism_unit_tests.yaml
xrsrke Jan 29, 2024
df7cb9d
Update distributed.py
xrsrke Jan 29, 2024
839677a
Update 3d_parallelism_unit_tests.yaml
xrsrke Jan 29, 2024
b631186
Update 3d_parallelism_unit_tests.yaml
xrsrke Jan 30, 2024
128eea5
Update distributed.py
xrsrke Jan 30, 2024
f96808a
Refactor test_clip_grads_with_tp parameters
NouamaneTazi Jan 31, 2024
d123d1b
Skip test cases for ALL_REDUCE mode with async communication
NouamaneTazi Jan 31, 2024
b899564
Update init_method to use env://localhost:port
NouamaneTazi Jan 31, 2024
ff32ddb
tests run for all PRs
NouamaneTazi Jan 31, 2024
abe42c6
Update branch filter in GitHub workflows
NouamaneTazi Jan 31, 2024
0a754a1
skip ALL_REDUCE with async comm
NouamaneTazi Jan 31, 2024
5d822bb
make sure total_norm in clip grad is a scalar
NouamaneTazi Jan 31, 2024
e5e2045
Merge branch 'main' of github.com:huggingface/nanotron into xrsrke/se…
xrsrke Jan 31, 2024
5d9652a
refactor
xrsrke Jan 31, 2024
063020a
zeros([]
NouamaneTazi Feb 1, 2024
741966b
Merge pull request #52 from huggingface/nouamane/fix_ci
NouamaneTazi Feb 1, 2024
e2ed85f
exclude sanity_checks.py from CoL
xrsrke Feb 1, 2024
91234fa
exclude sanity_checks.py from CoL
xrsrke Feb 1, 2024
a57cb9b
Merge branch 'main' of github.com:huggingface/nanotron into xrsrke/se…
xrsrke Feb 10, 2024
8a98cfc
fix expectation
xrsrke Feb 10, 2024
29672db
remove empty context manager in tp tests
xrsrke Feb 10, 2024
0a34e65
add reruning a tests if a port is in used
xrsrke Feb 10, 2024
e3c3d11
fix checking total_norm should be a scalar
xrsrke Feb 10, 2024
63ca0d2
fix
xrsrke Feb 10, 2024
44c0e05
add more retrying
xrsrke Feb 10, 2024
b8eeb1e
fix clip grads
xrsrke Feb 10, 2024
b553c4e
remove testing dim in clip grads
xrsrke Feb 10, 2024
0b97c38
fuk
xrsrke Feb 10, 2024
8c7355e
refactor
xrsrke Feb 10, 2024
2a4e735
run tests in parallel
xrsrke Feb 10, 2024
d47555e
not run fa2
xrsrke Feb 10, 2024
3b70271
only run 5 tests in parallel
xrsrke Feb 10, 2024
30b8004
only run a test at a time
xrsrke Feb 10, 2024
51a804c
add forking RNG
xrsrke Feb 10, 2024
cec0c04
fix circular import
xrsrke Feb 10, 2024
f42a43e
fix rng
xrsrke Feb 10, 2024
5b375f5
remove parallel tests
xrsrke Feb 10, 2024
081b17d
add python random seed
xrsrke Feb 11, 2024
4dce881
remove dist test, and add destroying process group after running a test
xrsrke Feb 11, 2024
00bb0bf
fix
xrsrke Feb 11, 2024
957826e
edit
xrsrke Feb 11, 2024
dc65581
fix
xrsrke Feb 11, 2024
0fe7bdd
fix
xrsrke Feb 11, 2024
de52fc6
removing destroy pg
xrsrke Feb 11, 2024
f2afea3
add destroying parallel_context in unit tests
xrsrke Feb 11, 2024
97ebff4
ignore layer norm
xrsrke Feb 11, 2024
6a5fd81
wtf is going on
xrsrke Feb 11, 2024
9c7e1a7
add small run
xrsrke Feb 13, 2024
b2c71b0
run small with dist test
xrsrke Feb 13, 2024
0d21bba
debug missing destroy
xrsrke Feb 13, 2024
6bb69ff
fuck
xrsrke Feb 13, 2024
b39c831
f
xrsrke Feb 13, 2024
3bd346d
.
NouamaneTazi Feb 13, 2024
dd0079e
.
NouamaneTazi Feb 13, 2024
91cf7e3
try timeout-minutes and --rm
NouamaneTazi Feb 13, 2024
7e0fcce
try -v
NouamaneTazi Feb 13, 2024
6dcb73d
try
NouamaneTazi Feb 13, 2024
b64f04f
bring back parallel_context.destroy()
NouamaneTazi Feb 13, 2024
2d44ec7
add 3d tests
xrsrke Feb 14, 2024
5d03579
add all cicd
xrsrke Feb 14, 2024
ab09576
run parallel tests
xrsrke Feb 14, 2024
77e0764
only run 1 test
xrsrke Feb 14, 2024
f43687f
add directly spawning processes
xrsrke Feb 15, 2024
004e7f4
refactor spawn function as init_distributed
xrsrke Feb 15, 2024
558b341
please work
xrsrke Feb 15, 2024
98046f8
catch overlaping port from find_free_port
xrsrke Feb 15, 2024
d96c7fa
clean up
xrsrke Feb 15, 2024
f56f8a7
fix circular import
xrsrke Feb 15, 2024
a48b7bf
skip fp8 tests in FA2
xrsrke Feb 15, 2024
033aca9
update code quality
xrsrke Feb 15, 2024
d4c27e7
fix
xrsrke Feb 15, 2024
39e5846
fix
xrsrke Feb 15, 2024
6f7e4b2
remove uncessary files
xrsrke Feb 15, 2024
cd51bd9
fix search free poorts
xrsrke Feb 15, 2024
6c30d2c
set ParallelContext in wrapper
xrsrke Feb 16, 2024
c705f4d
remove uncessary comments
xrsrke Feb 16, 2024
63 changes: 63 additions & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -0,0 +1,63 @@
name: Run non-FA2-related unit tests

on:
  push:
    branches: [ main ]
    # Only run tests if we modify the following files
    paths:
      - "src/**/*.py"
      - "examples/**/*.py"
      - "tests/**/*.py"

  pull_request:
    branches: [ '**' ]
    paths:
      - "src/**/*.py"
      - "examples/**/*.py"
      - "tests/**/*.py"

jobs:
  tests:
    runs-on: [multi-gpu, nvidia-gpu, 8-t4, ci]
    container:
      image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
      ports:
        - 80
      options: --gpus all --shm-size "8G"
    steps:
      - uses: actions/checkout@v3
      - name: Python environment
        run: |
          which python
          python --version

      - name: Check Pytorch version
        run: |
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install nanotron
        run: |
          python -m pip install --upgrade pip
          pip install packaging
          pip install wheel
          pip install -e .
          pip install -e .[dev]
          pip install -e .[test]

      - name: Show installed libraries and their versions
        run: pip freeze | tee installed.txt

      - name: Run tests
        # NOTE: -m "not fa2" runs all the unit tests that don't carry the
        # "fa2" mark (FA2-related tests, which can't run on T4s)
        run: |
          pytest \
          -n 1 \
          -m "not fa2" \
          --color=yes \
          --durations=0 \
          --ignore tests/kernels \
          --verbose \
          tests/
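
For context, pytest treats "fa2" as a known marker only if it is registered in the test configuration. A minimal sketch of such a registration, assuming a conftest.py at the test root (the project's actual marker registration may live elsewhere, for example in pyproject.toml):

# conftest.py (illustrative sketch only; nanotron's real conftest may differ)
def pytest_configure(config):
    # Register the custom "fa2" marker so that -m fa2 and -m "not fa2"
    # select tests without unknown-marker warnings.
    config.addinivalue_line("markers", "fa2: tests that require flash-attn (FA2) and a compatible GPU")
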
26 changes: 26 additions & 0 deletions .github/workflows/code_quality.yaml
@@ -0,0 +1,26 @@
name: Code Quality

on:
  workflow_dispatch:
  push:
    branches: [ main ]
    # Only run tests if we modify the following files
    paths:
      - "src/**/*.py"

  pull_request:
    branches: [ '**' ]
    paths:
      - "src/**/*.py"

jobs:
  cloc:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Count Lines of Code (cloc)
        uses: djdefi/cloc-action@6
        with:
          options: --exclude-dir=docs,tests,examples --exclude-lang=YAML --exclude-list-file=sanity_checks.py
58 changes: 58 additions & 0 deletions .github/workflows/fa2_unit_tests.yaml
@@ -0,0 +1,58 @@
name: Run FA2-related unit tests

on:
  workflow_dispatch:
  push:
    branches: [ main ]
    # Only run tests if we modify the following files
    paths:
      - "src/**/*.py"
      - "examples/**/*.py"
      - "tests/**/*.py"

  pull_request:
    branches: [ '**' ]
    paths:
      - "src/**/*.py"
      - "examples/**/*.py"
      - "tests/**/*.py"

jobs:
  tests:
    runs-on: [single-gpu, nvidia-gpu, a10, ci]
    container:
      image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
      ports:
        - 80
      options: --gpus all --shm-size "8G"
    steps:
      - uses: actions/checkout@v3

      - name: Python environment
        run: |
          which python
          python --version

      - name: Check Pytorch version
        run: |
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install nanotron
        run: |
          python -m pip install --upgrade pip
          pip install packaging
          pip install wheel
          pip install "flash-attn>=2.5.0" --no-build-isolation
          pip install -e .
          pip install -e .[dev]
          pip install -e .[test]

      - name: Show installed libraries and their versions
        run: pip freeze | tee installed.txt

      - name: Run tests
        # NOTE: -m fa2 runs only the unit tests that carry the "fa2" mark (FA2-related tests)
        run: pytest -m fa2 --color=yes --durations=0 --verbose tests/
1 change: 0 additions & 1 deletion .gitignore
@@ -160,6 +160,5 @@ cython_debug/
#.idea/

.vscode
-.github

checkpoints/
8 changes: 7 additions & 1 deletion src/nanotron/distributed.py
@@ -9,6 +9,8 @@
from torch.distributed import * # noqa
from torch.distributed.distributed_c10d import ProcessGroup

from nanotron.utils import find_free_port

torch_version_above_1_13 = version.parse(torch.__version__) >= version.parse("1.13.0")
Work = dist.Work if torch_version_above_1_13 else dist._Work
default_pg_timeout = datetime.timedelta(minutes=10)
@@ -257,5 +259,9 @@ def initialize_torch_distributed():
        backend = "gloo"

    # Call the init process.
-   dist.init_process_group(backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+   port = find_free_port()
+   init_method = f"env://localhost:{port}"
+   dist.init_process_group(
+       init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+   )
    return True
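
The rendezvous now goes through find_free_port so that concurrent test runs do not collide on a fixed port. A rough, hypothetical illustration of the same idea in isolation (single process, gloo backend; a tcp:// URL is used here purely for demonstration, whereas the diff above builds an env:// init_method):

import torch.distributed as dist

from nanotron.utils import find_free_port

port = find_free_port()
# Illustration only: start a one-process group on the freshly discovered port.
dist.init_process_group(backend="gloo", init_method=f"tcp://127.0.0.1:{port}", rank=0, world_size=1)
dist.barrier()
dist.destroy_process_group()
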
4 changes: 2 additions & 2 deletions src/nanotron/optim/clip_grads.py
@@ -56,7 +56,7 @@ def clip_grad_norm(
                torch.stack([torch.linalg.vector_norm(g.detach(), ord=torch.inf, dtype=torch.float) for g in grads])
            )
        else:
-           total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+           total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
        dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.MAX)

    else:
@@ -68,7 +68,7 @@
                dtype=torch.float,
            ).pow(norm_type)
        else:
-           total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+           total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
        dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.SUM)
        total_norm.pow_(1.0 / norm_type)
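
The torch.zeros([]) change matters because torch.zeros(1) is a one-element vector while torch.zeros([]) is a true 0-dim scalar, which is what the "total_norm should be a scalar" check introduced in this PR expects. A quick illustration:

import torch

vec = torch.zeros(1)      # shape (1,), ndim 1: a one-element vector
scalar = torch.zeros([])  # shape (),  ndim 0: a genuine scalar tensor
assert vec.shape == (1,) and vec.ndim == 1
assert scalar.shape == () and scalar.ndim == 0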

9 changes: 8 additions & 1 deletion src/nanotron/parallel/context.py
@@ -35,7 +35,7 @@ def __init__(
        )

        if not dist.is_available():
-           raise ValueError("`torch.distributed is not available as a package, please install it.")
+           raise ValueError("torch.distributed is not available as a package, please install it.")

        self.tensor_parallel_size = tensor_parallel_size
        self.pipeline_parallel_size = pipeline_parallel_size
@@ -148,3 +148,10 @@ def get_3d_ranks(self, world_rank: int) -> Tuple[int, int, int]:
        dp_rank = (world_rank // self.tp_pg.size()) % self.dp_pg.size()
        tp_rank = world_rank % self.tp_pg.size()
        return (pp_rank, dp_rank, tp_rank)

    def destroy(self):
        if not dist.is_initialized():
            return

        dist.barrier()
        dist.destroy_process_group()
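
destroy() gives the test helpers a clean teardown hook between runs. A minimal sketch of how a test wrapper might call it (the helper shown here is illustrative and not part of this diff):

from nanotron.parallel import ParallelContext


def teardown(parallel_context: ParallelContext) -> None:
    # Safe to call unconditionally: destroy() returns early if no process group
    # was ever initialized, otherwise it synchronizes and tears the group down.
    parallel_context.destroy()
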
14 changes: 14 additions & 0 deletions src/nanotron/utils.py
@@ -4,6 +4,8 @@
import os
from contextlib import ExitStack, contextmanager
from typing import Callable, ContextManager, List, Optional
import random
import socket

import torch
from packaging import version
@@ -147,3 +149,15 @@ def tensor_from_untyped_storage(untyped_storage: torch.UntypedStorage, dtype: to
    tensor = torch.empty([], dtype=dtype, device=device)
    tensor.set_(source=untyped_storage)
    return tensor


def find_free_port(min_port: int = 2000, max_port: int = 65000) -> int:
    while True:
        port = random.randint(min_port, max_port)
        try:
            with socket.socket() as sock:
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                sock.bind(("localhost", port))
                return port
        except OSError as e:
            raise e
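
Note that the except branch re-raises immediately, so a port collision surfaces as an error and the retry happens at the test level through the rerun_if_address_is_in_use helper added to tests/helpers/utils.py below. A hypothetical variant that retries inside the function instead could look roughly like this (illustration only, not part of this PR):

import random
import socket


def find_free_port_with_retry(min_port: int = 2000, max_port: int = 65000, max_tries: int = 100) -> int:
    # Keep sampling ports until one binds, instead of raising on the first collision.
    for _ in range(max_tries):
        port = random.randint(min_port, max_port)
        try:
            with socket.socket() as sock:
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                sock.bind(("localhost", port))
                return port
        except OSError:
            continue
    raise RuntimeError(f"no free port found after {max_tries} attempts")
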
130 changes: 126 additions & 4 deletions tests/helpers/utils.py
@@ -1,10 +1,15 @@
import contextlib
import os
import random
import re
import time
import uuid
from typing import Any, Dict, List, Optional, Tuple
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Tuple

import torch.cuda
from nanotron.parallel import ParallelContext
from packaging import version
from torch.distributed.launcher import elastic_launch


@@ -72,10 +77,10 @@ def __init__(self, func, args, kwargs, tp: int, dp: int, pp: int):

    def __call__(self):
        with mock_os_environ(update_key_values={"WORLD_SIZE": f"{self.tp * self.dp * self.pp}"}):
+           # NOTE: we use a different random seed, so that each unit test doesn't generate the same port
+           random.seed(time.time())
            parallel_context = ParallelContext(
-               data_parallel_size=self.dp,
-               pipeline_parallel_size=self.pp,
-               tensor_parallel_size=self.tp,
+               data_parallel_size=self.dp, pipeline_parallel_size=self.pp, tensor_parallel_size=self.tp
            )

            assert "parallel_context" not in self.kwargs
@@ -185,3 +190,120 @@ def get_all_3d_configurations(gpus: int) -> List[Tuple[int, int, int]]:
        if tp * dp * pp == gpus:
            result.append((pp, dp, tp))
    return result
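
For intuition, get_all_3d_configurations enumerates every (pp, dp, tp) split whose product equals the GPU count. A small illustrative check (the import path mirrors the file location and the comparison uses a set because the loop order is truncated in this diff):

from tests.helpers.utils import get_all_3d_configurations

configs = get_all_3d_configurations(4)
assert set(configs) == {
    (1, 1, 4), (1, 2, 2), (1, 4, 1),
    (2, 1, 2), (2, 2, 1), (4, 1, 1),
}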


def rerun_if_address_is_in_use(max_try: int = 500):
    """
    This function reruns a wrapped function if "address already in use" occurs
    in testing spawned with torch.multiprocessing

    Credits: https://github.com/hpcaitech/ColossalAI/blob/adae123df3badfb15d044bd416f0cf29f250bc86/colossalai/testing/utils.py#L157

    Usage::

        @rerun_if_address_is_in_use()
        def test_something():
            ...

    """
    # check version
    torch_version = version.parse(torch.__version__)
    assert torch_version.major >= 1

    # only torch >= 1.8 has ProcessRaisedException
    if torch_version >= version.parse("1.8.0"):
        exception = torch.multiprocessing.ProcessRaisedException
    else:
        exception = Exception

    func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*Address already in use.*", max_try=max_try)
    return func_wrapper


def rerun_on_exception(exception_type: Exception = Exception, pattern: str = None, max_try: int = 10) -> Callable:
    """
    A decorator on a function to re-run when an exception occurs.

    Credits: https://github.com/hpcaitech/ColossalAI/blob/adae123df3badfb15d044bd416f0cf29f250bc86/colossalai/testing/utils.py#L71

    Usage::

        # rerun for all kinds of exception
        @rerun_on_exception()
        def test_method():
            print('hey')
            raise RuntimeError('Address already in use')

        # rerun for RuntimeError only
        @rerun_on_exception(exception_type=RuntimeError)
        def test_method():
            print('hey')
            raise RuntimeError('Address already in use')

        # rerun for maximum 10 times if Runtime error occurs
        @rerun_on_exception(exception_type=RuntimeError, max_try=10)
        def test_method():
            print('hey')
            raise RuntimeError('Address already in use')

        # rerun for infinite times if Runtime error occurs
        @rerun_on_exception(exception_type=RuntimeError, max_try=None)
        def test_method():
            print('hey')
            raise RuntimeError('Address already in use')

        # rerun only if the exception message matches the pattern,
        # for infinite times if Runtime error occurs
        @rerun_on_exception(exception_type=RuntimeError, pattern="^Address.*$")
        def test_method():
            print('hey')
            raise RuntimeError('Address already in use')

    Args:
        exception_type (Exception, Optional): The type of exception to detect for rerun
        pattern (str, Optional): The pattern to match the exception message.
            If the pattern is not None and matches the exception message,
            the exception will be detected for rerun
        max_try (int, Optional): Maximum reruns for this function. The default value is 10.
            If max_try is None, it will rerun forever if the exception keeps occurring
    """

    def _match_lines(lines, pattern):
        for line in lines:
            if re.match(pattern, line):
                return True
        return False

    def _wrapper(func):
        def _run_until_success(*args, **kwargs):
            try_count = 0
            assert max_try is None or isinstance(
                max_try, int
            ), f"Expected max_try to be None or int, but got {type(max_try)}"

            while max_try is None or try_count < max_try:
                try:
                    try_count += 1
                    ret = func(*args, **kwargs)
                    return ret
                except exception_type as e:
                    error_lines = str(e).split("\n")
                    if try_count < max_try and (pattern is None or _match_lines(error_lines, pattern)):
                        print("Exception is caught, retrying...")
                        # when pattern is not specified, we always skip the exception
                        # when pattern is specified, we only skip when pattern is matched
                        continue
                    else:
                        print("Maximum number of attempts is reached or pattern is not matched, no more retrying...")
                        raise e

        # Override signature
        # otherwise pytest.mark.parameterize will raise the following error:
        # function does not use argument xxx
        sig = signature(func)
        _run_until_success.__signature__ = sig

        return _run_until_success

    return _wrapper
1 change: 1 addition & 0 deletions tests/kernels/test_layer_norm.py
@@ -23,6 +23,7 @@


# @pytest.mark.skipif(available_gpus() < 1, reason="Testing test_fused_layer_norm requires at least 1 gpus")
+@pytest.mark.fa2
@pytest.mark.parametrize(
    "hidden_size",
    [1024, 1025],  # fused layer norm supports 1024 as hidden size but not 1025