This repository has been archived by the owner on Apr 11, 2024. It is now read-only.

Release: 0.0.1
BobaZooba committed Oct 14, 2023
1 parent 0672e56 commit d880c8f
Showing 27 changed files with 188 additions and 72 deletions.
35 changes: 20 additions & 15 deletions README.md
@@ -14,7 +14,6 @@
[![PyPI - Downloads](https://img.shields.io/pypi/dm/xllm.svg?color=blue&label=Downloads&logo=pypi&logoColor=gold)](https://pypi.org/project/xllm/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/xllm.svg?logo=python&label=Python&logoColor=gold)](https://pypi.org/project/xllm/)


Easy & cutting-edge LLM finetuning using the most advanced methods (QLoRA, DeepSpeed, GPTQ, Flash Attention 2, FSDP,
etc.)

@@ -98,13 +97,9 @@ from xllm.experiments import Experiment

config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")

-train_data = [
-    {
-        "text": "Hello!"
-    },
-] * 100
+train_data = ["Hello!"] * 100

-train_dataset = GeneralDataset(data=train_data)
+train_dataset = GeneralDataset.from_list(data=train_data)
experiment = Experiment(config=config, train_dataset=train_dataset)

experiment.build()
@@ -307,14 +302,24 @@ Please support me for more updates!
# Call to partnership 🛰
## Advisor
**Looking for an expert in modern LLMs?** I've got the experience you need. I'll guide you through every step,
from data collection to model training and improvement.
**Why me?** With six years of experience in deep learning R&D projects, I've mastered a range of roles, from
leading a team to rolling up my sleeves as an engineer. I've built and improved products from scratch, and I'm keen to do
the same for you.

**Worried about your team?** Don't be. With four years as a lecturer at Russia's best university, I can equip them with
the skills they need to succeed.
If your team is hunting for the insights of an adept advisor to propel your projects forward, don't hesitate to
reach out through my website: https://komplete.framer.ai
**Want to know more?** Check
out [my CV](https://docs.google.com/document/d/1BhFvIHQ1mpm81P-n2A-lhNac-U2wOGc6F2uS9gKvk88/edit?usp=sharing), [LinkedIn](https://www.linkedin.com/in/boriszubarev/),
and [past projects](https://komplete.framer.ai/cases) for the full scoop.
**Ready to start?** Let's arrange a free intro meeting. I'll outline the resources we'll need to make your project a
success.

[Contact me form](https://komplete.framer.ai/#contact)

## Full-time job

Are you seeking a dynamic addition to your team with the prowess and know-how to train such innovative
models? Then consider
sharing [my CV](https://docs.google.com/document/d/1BhFvIHQ1mpm81P-n2A-lhNac-U2wOGc6F2uS9gKvk88/edit?usp=sharing)
or [LinkedIn](https://www.linkedin.com/in/boriszubarev/) with your manager.
If you're an engineer, I'd appreciate it if you could pass
along [my LinkedIn](https://www.linkedin.com/in/boriszubarev/) or [website](https://komplete.framer.ai/) to your
manager.
43 changes: 35 additions & 8 deletions examples/minimal/train.py
@@ -1,8 +1,35 @@
-from examples.minimal.dataset import AntropicDataset
-from src.xllm.cli.train import cli_run_train
-from src.xllm.core.config import HuggingFaceConfig
-from src.xllm.datasets.registry import datasets_registry
-
-if __name__ == "__main__":
-    datasets_registry.add(key="antropic", value=AntropicDataset)
-    cli_run_train(config_cls=HuggingFaceConfig)
+import datasets
+from tqdm import tqdm
+from xllm import HuggingFaceConfig
+from xllm.datasets import GeneralDataset
+from xllm.experiments import Experiment
+
+
+def run():
+    rlhf_dataset = datasets.load_dataset("Anthropic/hh-rlhf")
+
+    parsed_data = dict()
+
+    for split in ["train", "test"]:
+        parsed_data[split] = list()
+
+        for sample in tqdm(rlhf_dataset[split], desc=f"Parsing {split}"):
+            text_parts = sample["chosen"].split("\n\n")[1:]
+
+            parsed_data[split].append(text_parts)
+
+    train = parsed_data["train"]
+    evaluation = parsed_data["test"]
+
+    train_dataset = GeneralDataset.from_list(data=train)
+    eval_dataset = GeneralDataset.from_list(data=evaluation)
+
+    config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")
+
+    experiment = Experiment(config=config, train_dataset=train_dataset, eval_dataset=eval_dataset)
+
+    experiment.build()
+
+    experiment.run()
+
+    experiment.push_to_hub(repo_id="YOUR_NAME/MODEL_NAME")
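Note: as committed, train.py defines run() but never invokes it. A hypothetical entry point (an editor's sketch, not part of the commit) that would make the script directly runnable:

    if __name__ == "__main__":
        run()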
5 changes: 5 additions & 0 deletions examples/minimal_using_cli/.env.template
@@ -0,0 +1,5 @@
+HUGGING_FACE_HUB_TOKEN=
+WANDB_API_KEY=
+WANDB_ENTITY=
+WANDB_PROJECT=
+TOKENIZERS_PARALLELISM=false
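These variables are presumably read at startup: HuggingFaceConfig exposes a "Custom path to .env file" option (see src/xllm/core/config.py below). A sketch of loading them with python-dotenv; the loader choice is an assumption, since the diff does not show how xllm consumes the file:

    import os

    from dotenv import load_dotenv

    # assumption: xllm may load the file through its own config machinery instead
    load_dotenv(".env")
    hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")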
5 changes: 5 additions & 0 deletions examples/minimal_using_cli/README.md
@@ -0,0 +1,5 @@
+# Minimal
+
+## Run
+
+python ./train.py
@@ -13,7 +13,7 @@ class AntropicDataset(BaseDataset):
    _HF_DATASET_ID = "Anthropic/hh-rlhf"

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Tuple[List[RawSample], Optional[List[RawSample]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Tuple[List[RawSample], Optional[List[RawSample]]]:
        rlhf_dataset = datasets.load_dataset(cls._HF_DATASET_ID)

        parsed_data: Dict[str, List[RawSample]] = dict()
8 changes: 8 additions & 0 deletions examples/minimal_using_cli/train.py
@@ -0,0 +1,8 @@
+from examples.minimal_using_cli.dataset import AntropicDataset
+from src.xllm.cli.train import cli_run_train
+from src.xllm.core.config import HuggingFaceConfig
+from src.xllm.datasets.registry import datasets_registry
+
+if __name__ == "__main__":
+    datasets_registry.add(key="antropic", value=AntropicDataset)
+    cli_run_train(config_cls=HuggingFaceConfig)
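cli_run_train presumably parses HuggingFaceConfig fields from the command line via HfArgumentParser, the same pattern as the prepare entry point below, so flags mirror config field names. Because the script imports from the examples and src packages, a module-style launch from the repository root is the safest assumption; the flag shown is illustrative:

    python -m examples.minimal_using_cli.train --model_name_or_path facebook/opt-350m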
2 changes: 1 addition & 1 deletion setup.py
@@ -57,7 +57,7 @@
# Setup
setup(
    name="xllm",
-    version="0.3.24",
+    version="0.0.1",
    description="Simple & Cutting Edge LLM Finetuning",
    license_files=["LICENSE"],
    long_description=open("README.md", "r", encoding="utf-8").read(),
6 changes: 3 additions & 3 deletions src/xllm/__init__.py
@@ -14,15 +14,15 @@

# ruff: noqa: F401

-__version__ = "0.3.24"
+__version__ = "0.0.1"

from . import enums, types
-from .cli.download import cli_run_download
from .cli.fuse import cli_run_fuse
+from .cli.prepare import cli_run_preprare
from .cli.train import cli_run_train
from .core.config import HuggingFaceConfig
-from .run.download import download
from .run.fuse import fuse
+from .run.prepare import prepare
from .run.train import train
from .utils.cli import setup_cli
from .utils.logger import dist_logger
2 changes: 1 addition & 1 deletion src/xllm/cli/__init__.py
@@ -14,7 +14,7 @@

# ruff: noqa: F401

-from .download import cli_run_download
from .fuse import cli_run_fuse
+from .prepare import cli_run_preprare
from .quantize import cli_run_quantize
from .train import cli_run_train
10 changes: 5 additions & 5 deletions src/xllm/cli/download.py → src/xllm/cli/prepare.py
@@ -17,19 +17,19 @@
from transformers import HfArgumentParser, PreTrainedModel, PreTrainedTokenizer

from ..core.config import HuggingFaceConfig
-from ..run.download import download
+from ..run.prepare import prepare
from ..utils.cli import setup_cli


-def cli_run_download(
+def cli_run_preprare(
    config_cls: Type[HuggingFaceConfig] = HuggingFaceConfig,
) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
    parser = HfArgumentParser(config_cls)
    config = parser.parse_args_into_dataclasses()[0]
-    setup_cli(config=config, logger_path="./xllm_download.log")
-    tokenizer, model = download(config=config)
+    setup_cli(config=config, logger_path="./xllm_prepare.log")
+    tokenizer, model = prepare(config=config)
    return tokenizer, model


if __name__ == "__main__":
-    cli_run_download(config_cls=HuggingFaceConfig)
+    cli_run_preprare(config_cls=HuggingFaceConfig)
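Note the spelling of cli_run_preprare: the typo lives in the committed identifier itself (it is also re-exported in src/xllm/__init__.py above). A sketch of calling the renamed entry point from Python instead of the shell, mirroring the __main__ guard; it still reads any flags from sys.argv:

    from src.xllm.cli.prepare import cli_run_preprare
    from src.xllm.core.config import HuggingFaceConfig

    tokenizer, model = cli_run_preprare(config_cls=HuggingFaceConfig)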
8 changes: 2 additions & 6 deletions src/xllm/core/config.py
@@ -125,11 +125,11 @@ class HuggingFaceConfig:
        metadata={"help": "Custom path to .env file"},
    )

-    # download
+    # prepare
    prepare_dataset: bool = field(
        default=True,
        metadata={
-            "help": "Prepare the dataset. Works only when you use download",
+            "help": 'Prepare the dataset. Works only at "prepare" stage',
        },
    )
@@ -277,10 +277,6 @@ class HuggingFaceConfig:
            "help": "Use or not flash attention 2. Requires 1) CUDA >= 11.6; 2) install flash-attn 3) compatible model",
        },
    )
-    low_cpu_mem_usage: Optional[bool] = field(
-        default=None,
-        metadata={"help": "low_cpu_mem_usage when loading model"},
-    )
    trust_remote_code: bool = field(
        default=True,
        metadata={
11 changes: 8 additions & 3 deletions src/xllm/core/dependencies.py
@@ -179,6 +179,7 @@ def build_quantization_config(
def build_model(
    config: HuggingFaceConfig,
    quantization_config: Union[BitsAndBytesConfig, GPTQConfig, None],
+    low_cpu_mem_usage: Optional[bool] = None,
) -> PreTrainedModel:
    if config.bnb_quantize_after_model_init:
        quantization_config = None
@@ -194,8 +195,8 @@ def build_model(
    if config.use_flash_attention_2:
        kwargs["use_flash_attention_2"] = True

-    if config.low_cpu_mem_usage is not None:
-        kwargs["low_cpu_mem_usage"] = config.low_cpu_mem_usage
+    if low_cpu_mem_usage is not None:
+        kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=config.model_name_or_path,
@@ -212,6 +213,10 @@ def build_model(
        model = prepare_model_for_kbit_training(
            model=model, use_gradient_checkpointing=config.use_gradient_checkpointing
        )
+        dist_logger(
+            message=f"Model prepared for kbit training. Gradient checkpointing: {config.use_gradient_checkpointing}",
+            local_rank=config.local_rank,
+        )

    return model

@@ -246,7 +251,7 @@ def build_trainer(
        model.config.use_cache = False  # type: ignore
    except Exception as exception:
        dist_logger.warning(
-            message=f"Can't set use cache to false, because: {exception}",
+            message=f"Can't set use cache to false. Exception: {exception}",
            local_rank=config.local_rank,
        )

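With this change, low_cpu_mem_usage is passed directly to build_model instead of being read from the config. A minimal sketch of the new call; quantization_config=None and low_cpu_mem_usage=True are illustrative choices, not values this diff prescribes:

    from src.xllm.core.config import HuggingFaceConfig
    from src.xllm.core.dependencies import build_model

    config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")
    model = build_model(config=config, quantization_config=None, low_cpu_mem_usage=True)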
14 changes: 6 additions & 8 deletions src/xllm/datasets/base.py
@@ -36,12 +36,12 @@ def __init__(self, data: List[RawSample]):

    @classmethod
    def prepare(cls, config: HuggingFaceConfig) -> None:
-        download_result = cls.download(config=config)
+        raw_data = cls.get_data(config=config)

-        if download_result is None:
-            raise ValueError("download method returned None")
+        if raw_data is None:
+            raise ValueError("Method get_data returned None")
        else:
-            train_data, eval_data = download_result
+            train_data, eval_data = raw_data

        if config.eval_local_path_to_data is None and eval_data is not None:
            logger.warning("eval_local_path_to_data is None, but eval_data is not None")
@@ -80,9 +80,7 @@ def load(cls, path_to_data: str, **kwargs: Any) -> "BaseDataset":
        data = list()

        if not os.path.isfile(path_to_data):
-            raise FileNotFoundError(
-                f"File {path_to_data} not found. Probably you should run download_and_prepare before"
-            )
+            raise FileNotFoundError(f"File {path_to_data} not found. Probably you should run .prepare before")

        with open(path_to_data) as file_object:
            for line in file_object:
@@ -105,7 +103,7 @@ def __getitem__(self, index: int) -> RawSample:

    @classmethod
    @abstractmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        raise NotImplementedError

    @abstractmethod
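Custom datasets now override get_data instead of download. A minimal sketch of a subclass under the new contract; the class name, sample contents, and get_sample body are illustrative, and treating get_sample as the second abstract method is an assumption based on the subclasses in this commit:

    class MyDataset(BaseDataset):
        @classmethod
        def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
            train_data: List[RawSample] = [{"text": "Hello!"}]
            return train_data, None  # the second element is the optional eval split

        def get_sample(self, index: int) -> RawSample:
            return self.data[index]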
17 changes: 14 additions & 3 deletions src/xllm/datasets/general.py
@@ -34,14 +34,25 @@ def __init__(
        self.separator = separator

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        dist_logger.warning(
-            "This is a special type of dataset in which it is not supposed to download anything. "
-            "You must pass the data here through __init__, "
+            "This is a special type of dataset in which it is not supposed to get_data anything. "
+            "You must pass the data here through __init__ or use from_list, "
            "or through the path in config.train_local_path_to_data and config.eval_local_path_to_data (optional)"
        )
        return None

+    @classmethod
+    def from_list(
+        cls,
+        data: List[str],
+        sample_field: str = enums.General.default_sample_field,
+        separator: Optional[str] = None,
+    ) -> "GeneralDataset":
+        prepared_data: List[RawSample] = [{sample_field: text} for text in data]
+        dataset = cls(data=prepared_data, sample_field=sample_field, separator=separator)
+        return dataset

    def get_sample(self, index: int) -> RawSample:
        text = self.data[index][self.sample_field]

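A usage sketch for the new from_list constructor (not part of the diff; assumes enums.General.default_sample_field is "text"):

    from xllm.datasets import GeneralDataset

    dataset = GeneralDataset.from_list(data=["Hello!", "How are you?"])
    # from_list wraps each string into a RawSample dict before calling __init__
    assert dataset.data[0] == {"text": "Hello!"}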
1 change: 1 addition & 0 deletions src/xllm/datasets/registry.py
@@ -19,5 +19,6 @@

datasets_registry = Registry(name=enums.Registry.datasets)

+datasets_registry.add(key=enums.Datasets.default, value=GeneralDataset)
datasets_registry.add(key=enums.Datasets.general, value=GeneralDataset)
datasets_registry.add(key=enums.Datasets.soda, value=SodaDataset)
2 changes: 1 addition & 1 deletion src/xllm/datasets/soda.py
@@ -35,7 +35,7 @@ def __init__(self, data: List[RawSample], header_drop_probability: float = 0.05):
        self.header_drop_probability = header_drop_probability

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        soda_dataset = datasets.load_dataset(cls._HF_DATASET_ID)

        parsed_data: Dict[str, List[RawSample]] = dict()
1 change: 1 addition & 0 deletions src/xllm/enums.py
@@ -39,6 +39,7 @@ class Registry:

@dataclass
class Datasets:
+    default: str = "default"
    general: str = "general"
    soda: str = "soda"
