Fix a few typos (#843)
Co-authored-by: Saaketh Narayan <saaketh.narayan@databricks.com>
srstevenson and snarayan21 authored Dec 9, 2024
1 parent 69304c5 commit a0d491e
Showing 11 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ pytest -vv -s . # run all the unittests
cd docs && make clean && make doctest # run doctests
```

-6\. [Optional] Compile and visualize the documentation locally. If you have a documentation changes, running the below commands is mandatory.
+6\. [Optional] Compile and visualize the documentation locally. If you have documentation changes, running the below commands is mandatory.

<!--pytest.mark.skip-->
```bash
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# several pytest settings
PYTHON ?= python # Python command
PYTEST ?= pytest # Pytest command
-PYRIGHT ?= pyright # Pyright command. Pyright must be installed seperately -- e.g. `node install -g pyright`
+PYRIGHT ?= pyright # Pyright command. Pyright must be installed separately -- e.g. `node install -g pyright`
EXTRA_ARGS ?= # extra arguments for pytest

dirs := streaming tests docs
2 changes: 1 addition & 1 deletion docs/source/_templates/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
version = fragments[1].split("/")[0]

// NOTE: The version string will resolve to the PR number for RTD sites.
-// Checking whether first charater is a number.
+// Checking whether first character is a number.
if (version[0] >= '0' && version[0] <= '9') {
version = undefined
}
2 changes: 1 addition & 1 deletion docs/source/dataset_configuration/shuffling.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,4 @@ Samples within each shard are shuffled both before and after shards are split am

Globally shuffles all samples. This is useful for single-node training on small data, where you want the most random shuffle possible, but is the least download-efficient of all shuffle algorithms. Training throughput is often much lower when using the `naive` shuffling algorithm.

-If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [perfomance tuning page](../distributed_training/performance_tuning.md).
+If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [performance tuning page](../distributed_training/performance_tuning.md).
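
For context on the hunk above: the shuffle algorithm described in this page is selected via the `shuffle_algo` argument of `StreamingDataset`. A minimal sketch follows; the remote and local paths are placeholders, not values from this commit.

```python
from streaming import StreamingDataset

# A minimal sketch, assuming placeholder paths. `shuffle_algo='naive'` gives
# the most random shuffle but the worst download efficiency; the default
# ('py1e') balances shuffle quality against downloads.
dataset = StreamingDataset(
    remote='s3://my-bucket/my-dataset',  # placeholder remote location
    local='/tmp/my-dataset',             # placeholder local cache directory
    shuffle=True,
    shuffle_algo='naive',
)
```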
2 changes: 1 addition & 1 deletion docs/source/distributed_training/performance_tuning.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ $$L = 2 \cdot S \cdot \lceil\frac{C}{P}\rceil $$

Where $L$ is the required minimum cache limit per node, in MB, $S$ is the average shard size, in MB, $C$ is the number of canonical nodes (see [here](../dataset_configuration/shuffling.md#how-shuffling-works) and [here](../distributed_training/elastic_determinism.md#requirements)), and $P$ is the number of physical nodes. This is because only a single shard, plus a potentially predownloaded subsequent shard, needs to be resident per canonical node to make progress during training.

-If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minumum cache limit per node will be approximately:
+If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minimum cache limit per node will be approximately:

$$L = k \cdot S \lceil \frac{B}{Q} \rceil \cdot \lceil\frac{C}{P}\rceil $$
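
As a worked illustration of the two formulas above, here is a rough helper. This is a sketch only: it assumes that $B$ is the shuffle block size in samples, $Q$ is the average number of samples per shard, and $k$ is a small constant factor, since those definitions fall outside the hunk shown here.

```python
import math

def min_cache_limit_mb(S: float, C: int, P: int,
                       shuffle_block_based: bool = False,
                       k: float = 1.0, B: int = 0, Q: int = 1) -> float:
    """Estimate the minimum per-node cache limit L, in MB.

    A sketch of the formulas above: S = average shard size (MB),
    C = canonical nodes, P = physical nodes, and, for shuffle-block-based
    algorithms, B = shuffle block size (samples), Q = samples per shard,
    and k = constant factor (assumed meanings; defined outside this hunk).
    """
    if shuffle_block_based:
        # L = k * S * ceil(B / Q) * ceil(C / P)
        return k * S * math.ceil(B / Q) * math.ceil(C / P)
    # L = 2 * S * ceil(C / P): one resident shard plus one potentially
    # predownloaded shard per canonical node.
    return 2 * S * math.ceil(C / P)

# Illustrative numbers: 64 MB shards, 8 canonical nodes, 4 physical nodes.
print(min_cache_limit_mb(S=64, C=8, P=4))  # -> 256 (MB)
```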

4 changes: 2 additions & 2 deletions scripts/samples/bench_and_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
args (Namespace): Command-line arguments.
bench_name (str): What to call this benchmark.
desc (str): Brief description of the data.
-generate (Callable): Method to genereate the dataset.
+generate (Callable): Method to generate the dataset.
formats (List[str]): List of shard formats to benchmark this data in.
"""
print(f'Bench: {bench_name}')
@@ -373,7 +373,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
y *= args.plot_bins
y = y.astype(np.int64)

-# Truncate the higest ``args.truncate_highest_frac`` timings because they get further
+# Truncate the highest ``args.truncate_highest_frac`` timings because they get further
# and further spaced as you ascend, which would ruin the plot.
y = y[np.nonzero(y < args.plot_bins)[0]]

2 changes: 1 addition & 1 deletion simulation/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_batches_epochs(dataset: SimulationDataset, max_duration: Time) -> tuple[
Returns:
Tuple[int, int, int]: batches per epoch, epochs, and the total batches.
"""
-# get epochs, batches_per_epoch, and total_batches from a Time obect
+# get epochs, batches_per_epoch, and total_batches from a Time object
dataset_batches = dataset.get_num_batches()
batches_per_epoch = dataset_batches
epochs = 1
2 changes: 1 addition & 1 deletion streaming/base/batching/stratified.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def generate_work_stratified_batching(dataset: StreamingDataset, world: World, e
f'Number of samples for stream {stream_id} is {batch_portion} because the portion '
+
f'of this stream in the global batch, which is of size {global_batch_size}, is ' +
-f'too low. Please increase the global batch size or increase the porportion of ' +
+f'too low. Please increase the global batch size or increase the proportion of ' +
f'total samples that come from stream {stream_id}.')

# We now merge the partitions from each stream to get our final partition over all
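
The condition behind the error message in this hunk can be sketched as follows. This is hypothetical illustration code, not the library's actual implementation; all names are made up for clarity.

```python
# Hypothetical sketch of the check that raises the error above; names are
# illustrative, not the library's actual variables.
def stream_batch_portion(stream_samples: int, total_samples: int,
                         global_batch_size: int) -> int:
    """Each stream contributes to every global batch in proportion to its
    share of the total samples; a stream whose share rounds down to zero
    samples per batch cannot appear in each batch."""
    proportion = stream_samples / total_samples
    batch_portion = int(global_batch_size * proportion)
    if batch_portion < 1:
        raise ValueError('Stream portion of the global batch is too low. '
                         'Increase the global batch size or the proportion '
                         'of total samples that come from this stream.')
    return batch_portion
```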
2 changes: 1 addition & 1 deletion streaming/text/convert/enwiki/mds/merge_shard_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def parse_args() -> Namespace:
"""Parse commmand-line arguments.
"""Parse command-line arguments.
Returns:
Namespace: Command-line arguments.
@@ -1,4 +1,4 @@
"""Script for picking certain number of sampels.
"""Script for picking certain number of samples.
"""

import argparse
2 changes: 1 addition & 1 deletion tests/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def test_stratified_batching_Exception(local_remote_dir: tuple[str, str], stream

with pytest.raises(ValueError, match=f'Number of samples for stream*'):
# When we iterate through the dataloader, the samples will be partitioned.
-# This should thow ValueError since stream 2 is too small to be included in each batch.
+# This should throw ValueError since stream 2 is too small to be included in each batch.
for _ in dataloader:
continue

