Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history.
  • Loading branch information
msaroufim committed Dec 5, 2024
1 parent d65b8ae commit 9fb8e71
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions torchtitan/datasets/hf_datasets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,16 +9,16 @@
from typing import Any, Callable, Dict, List, NamedTuple, Optional

import torch

- from datasets import Dataset, load_dataset
- from datasets.distributed import split_dataset_by_node
from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset
from torchdata.stateful_dataloader import StatefulDataLoader

from torchtitan.datasets.tokenizer import Tokenizer
from torchtitan.logging import logger

+ from datasets import Dataset, load_dataset
+ from datasets.distributed import split_dataset_by_node


def load_c4_dataset(dataset_path: str):
"""Load C4 dataset with default configuration."""
Expand All @@ -30,7 +30,8 @@ def process_c4_text(sample: Dict[str, Any]) -> str:
return sample["text"]


- class DatasetConfig(NamedTuple):
+ @dataclass
+ class DatasetConfig:
path: str
loader: Callable
text_processor: Callable
Expand Down

0 comments on commit 9fb8e71

Please sign in to comment.