This repository has been archived by the owner on Apr 11, 2024. It is now read-only.

Release: 0.0.1
BobaZooba committed Oct 14, 2023
1 parent 0672e56 commit d880c8f
Showing 27 changed files with 188 additions and 72 deletions.
35 changes: 20 additions & 15 deletions README.md
@@ -14,7 +14,6 @@
[![PyPI - Downloads](https://img.shields.io/pypi/dm/xllm.svg?color=blue&label=Downloads&logo=pypi&logoColor=gold)](https://pypi.org/project/xllm/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/xllm.svg?logo=python&label=Python&logoColor=gold)](https://pypi.org/project/xllm/)


Easy & cutting-edge LLM finetuning using the most advanced methods (QLoRA, DeepSpeed, GPTQ, Flash Attention 2, FSDP,
etc.)

@@ -98,13 +97,9 @@ from xllm.experiments import Experiment

config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")

-train_data = [
-    {
-        "text": "Hello!"
-    },
-] * 100
+train_data = ["Hello!"] * 100

-train_dataset = GeneralDataset(data=train_data)
+train_dataset = GeneralDataset.from_list(data=train_data)
experiment = Experiment(config=config, train_dataset=train_dataset)

experiment.build()
@@ -307,14 +302,24 @@ Please support me for more updates!
# Call to partnership 🛰
## Advisor
**Looking for an expert in modern LLMs?** I've got the experience you need. I'll guide you through every step,
from data collection to model training and improvement.
**Why me?** With six years of experience in deep learning R&D projects, I've mastered a range of roles, from
leading a team to rolling up my sleeves as an engineer. I've built and improved products from scratch, and I'm keen to do
the same for you.

**Worried about your team?** Don't be. With four years as a lecturer at Russia's best university, I can equip them with
the skills they need to succeed.
If your team is hunting for the insights of an adept advisor to propel your projects forward, don't hesitate to
reach out through my website: https://komplete.framer.ai
**Want to know more?** Check
out [my CV](https://docs.google.com/document/d/1BhFvIHQ1mpm81P-n2A-lhNac-U2wOGc6F2uS9gKvk88/edit?usp=sharing), [LinkedIn](https://www.linkedin.com/in/boriszubarev/),
and [past projects](https://komplete.framer.ai/cases) for the full scoop.
**Ready to start?** Let's arrange a free intro meeting. I'll outline the resources we'll need to make your project a
success.

[Contact me form](https://komplete.framer.ai/#contact)

## Full-time job

Are you seeking a dynamic addition to your team with the prowess and know-how to train such innovative
models? Then consider
sharing [my CV](https://docs.google.com/document/d/1BhFvIHQ1mpm81P-n2A-lhNac-U2wOGc6F2uS9gKvk88/edit?usp=sharing)
or [LinkedIn](https://www.linkedin.com/in/boriszubarev/) with your manager.
If you're an engineer, I'd appreciate it if you could pass
along [my LinkedIn](https://www.linkedin.com/in/boriszubarev/) or [website](https://komplete.framer.ai/) to your
manager.
43 changes: 35 additions & 8 deletions examples/minimal/train.py
@@ -1,8 +1,35 @@
-from examples.minimal.dataset import AntropicDataset
-from src.xllm.cli.train import cli_run_train
-from src.xllm.core.config import HuggingFaceConfig
-from src.xllm.datasets.registry import datasets_registry
-
-if __name__ == "__main__":
-    datasets_registry.add(key="antropic", value=AntropicDataset)
-    cli_run_train(config_cls=HuggingFaceConfig)
+import datasets
+from tqdm import tqdm
+from xllm import HuggingFaceConfig
+from xllm.datasets import GeneralDataset
+from xllm.experiments import Experiment
+
+
+def run():
+    rlhf_dataset = datasets.load_dataset("Anthropic/hh-rlhf")
+
+    parsed_data = dict()
+
+    for split in ["train", "test"]:
+        parsed_data[split] = list()
+
+        for sample in tqdm(rlhf_dataset[split], desc=f"Parsing {split}"):
+            text_parts = sample["chosen"].split("\n\n")[1:]
+
+            parsed_data[split].append(text_parts)
+
+    train = parsed_data["train"]
+    evaluation = parsed_data["test"]
+
+    train_dataset = GeneralDataset.from_list(data=train)
+    eval_dataset = GeneralDataset.from_list(data=evaluation)
+
+    config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")
+
+    experiment = Experiment(config=config, train_dataset=train_dataset, eval_dataset=eval_dataset)
+
+    experiment.build()
+
+    experiment.run()
+
+    experiment.push_to_hub(repo_id="YOUR_NAME/MODEL_NAME")
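Note: as committed, train.py defines run() but never invokes it. A hypothetical entry point (an editor's sketch, not part of the commit) that would make the script directly runnable:

    if __name__ == "__main__":
        run()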
5 changes: 5 additions & 0 deletions examples/minimal_using_cli/.env.template
@@ -0,0 +1,5 @@
+HUGGING_FACE_HUB_TOKEN=
+WANDB_API_KEY=
+WANDB_ENTITY=
+WANDB_PROJECT=
+TOKENIZERS_PARALLELISM=false
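These variables are presumably read at startup: HuggingFaceConfig exposes a "Custom path to .env file" option (see src/xllm/core/config.py below). A sketch of loading them with python-dotenv; the loader choice is an assumption, since the diff does not show how xllm consumes the file:

    import os

    from dotenv import load_dotenv

    # assumption: xllm may load the file through its own config machinery instead
    load_dotenv(".env")
    hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")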
5 changes: 5 additions & 0 deletions examples/minimal_using_cli/README.md
@@ -0,0 +1,5 @@
+# Minimal
+
+## Run
+
+python ./train.py
@@ -13,7 +13,7 @@ class AntropicDataset(BaseDataset):
    _HF_DATASET_ID = "Anthropic/hh-rlhf"

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Tuple[List[RawSample], Optional[List[RawSample]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Tuple[List[RawSample], Optional[List[RawSample]]]:
        rlhf_dataset = datasets.load_dataset(cls._HF_DATASET_ID)

        parsed_data: Dict[str, List[RawSample]] = dict()
8 changes: 8 additions & 0 deletions examples/minimal_using_cli/train.py
@@ -0,0 +1,8 @@
+from examples.minimal_using_cli.dataset import AntropicDataset
+from src.xllm.cli.train import cli_run_train
+from src.xllm.core.config import HuggingFaceConfig
+from src.xllm.datasets.registry import datasets_registry
+
+if __name__ == "__main__":
+    datasets_registry.add(key="antropic", value=AntropicDataset)
+    cli_run_train(config_cls=HuggingFaceConfig)
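cli_run_train presumably parses HuggingFaceConfig fields from the command line via HfArgumentParser, the same pattern as the prepare entry point below, so flags mirror config field names. Because the script imports from the examples and src packages, a module-style launch from the repository root is the safest assumption; the flag shown is illustrative:

    python -m examples.minimal_using_cli.train --model_name_or_path facebook/opt-350m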
2 changes: 1 addition & 1 deletion setup.py
@@ -57,7 +57,7 @@
# Setup
setup(
    name="xllm",
-    version="0.3.24",
+    version="0.0.1",
    description="Simple & Cutting Edge LLM Finetuning",
    license_files=["LICENSE"],
    long_description=open("README.md", "r", encoding="utf-8").read(),
6 changes: 3 additions & 3 deletions src/xllm/__init__.py
@@ -14,15 +14,15 @@

# ruff: noqa: F401

-__version__ = "0.3.24"
+__version__ = "0.0.1"

from . import enums, types
-from .cli.download import cli_run_download
from .cli.fuse import cli_run_fuse
+from .cli.prepare import cli_run_preprare
from .cli.train import cli_run_train
from .core.config import HuggingFaceConfig
-from .run.download import download
from .run.fuse import fuse
+from .run.prepare import prepare
from .run.train import train
from .utils.cli import setup_cli
from .utils.logger import dist_logger
2 changes: 1 addition & 1 deletion src/xllm/cli/__init__.py
@@ -14,7 +14,7 @@

# ruff: noqa: F401

-from .download import cli_run_download
from .fuse import cli_run_fuse
+from .prepare import cli_run_preprare
from .quantize import cli_run_quantize
from .train import cli_run_train
10 changes: 5 additions & 5 deletions src/xllm/cli/download.py → src/xllm/cli/prepare.py
@@ -17,19 +17,19 @@
from transformers import HfArgumentParser, PreTrainedModel, PreTrainedTokenizer

from ..core.config import HuggingFaceConfig
-from ..run.download import download
+from ..run.prepare import prepare
from ..utils.cli import setup_cli


-def cli_run_download(
+def cli_run_preprare(
    config_cls: Type[HuggingFaceConfig] = HuggingFaceConfig,
) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
    parser = HfArgumentParser(config_cls)
    config = parser.parse_args_into_dataclasses()[0]
-    setup_cli(config=config, logger_path="./xllm_download.log")
-    tokenizer, model = download(config=config)
+    setup_cli(config=config, logger_path="./xllm_prepare.log")
+    tokenizer, model = prepare(config=config)
    return tokenizer, model


if __name__ == "__main__":
-    cli_run_download(config_cls=HuggingFaceConfig)
+    cli_run_preprare(config_cls=HuggingFaceConfig)
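Note the spelling of cli_run_preprare: the typo lives in the committed identifier itself (it is also re-exported in src/xllm/__init__.py above). A sketch of calling the renamed entry point from Python instead of the shell, mirroring the __main__ guard; it still reads any flags from sys.argv:

    from src.xllm.cli.prepare import cli_run_preprare
    from src.xllm.core.config import HuggingFaceConfig

    tokenizer, model = cli_run_preprare(config_cls=HuggingFaceConfig)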
8 changes: 2 additions & 6 deletions src/xllm/core/config.py
@@ -125,11 +125,11 @@ class HuggingFaceConfig:
        metadata={"help": "Custom path to .env file"},
    )

-    # download
+    # prepare
    prepare_dataset: bool = field(
        default=True,
        metadata={
-            "help": "Prepare the dataset. Works only when you use download",
+            "help": 'Prepare the dataset. Works only at "prepare" stage',
        },
    )
@@ -277,10 +277,6 @@ class HuggingFaceConfig:
            "help": "Use or not flash attention 2. Requires 1) CUDA >= 11.6; 2) install flash-attn 3) compatible model",
        },
    )
-    low_cpu_mem_usage: Optional[bool] = field(
-        default=None,
-        metadata={"help": "low_cpu_mem_usage when loading model"},
-    )
    trust_remote_code: bool = field(
        default=True,
        metadata={
11 changes: 8 additions & 3 deletions src/xllm/core/dependencies.py
@@ -179,6 +179,7 @@ def build_quantization_config(
def build_model(
    config: HuggingFaceConfig,
    quantization_config: Union[BitsAndBytesConfig, GPTQConfig, None],
+    low_cpu_mem_usage: Optional[bool] = None,
) -> PreTrainedModel:
    if config.bnb_quantize_after_model_init:
        quantization_config = None
@@ -194,8 +195,8 @@ def build_model(
    if config.use_flash_attention_2:
        kwargs["use_flash_attention_2"] = True

-    if config.low_cpu_mem_usage is not None:
-        kwargs["low_cpu_mem_usage"] = config.low_cpu_mem_usage
+    if low_cpu_mem_usage is not None:
+        kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=config.model_name_or_path,
@@ -212,6 +213,10 @@ def build_model(
        model = prepare_model_for_kbit_training(
            model=model, use_gradient_checkpointing=config.use_gradient_checkpointing
        )
+        dist_logger(
+            message=f"Model prepared for kbit training. Gradient checkpointing: {config.use_gradient_checkpointing}",
+            local_rank=config.local_rank,
+        )

    return model

@@ -246,7 +251,7 @@ def build_trainer(
        model.config.use_cache = False  # type: ignore
    except Exception as exception:
        dist_logger.warning(
-            message=f"Can't set use cache to false, because: {exception}",
+            message=f"Can't set use cache to false. Exception: {exception}",
            local_rank=config.local_rank,
        )

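With this change, low_cpu_mem_usage is passed directly to build_model instead of being read from the config. A minimal sketch of the new call; quantization_config=None and low_cpu_mem_usage=True are illustrative choices, not values this diff prescribes:

    from src.xllm.core.config import HuggingFaceConfig
    from src.xllm.core.dependencies import build_model

    config = HuggingFaceConfig(model_name_or_path="facebook/opt-350m")
    model = build_model(config=config, quantization_config=None, low_cpu_mem_usage=True)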
14 changes: 6 additions & 8 deletions src/xllm/datasets/base.py
@@ -36,12 +36,12 @@ def __init__(self, data: List[RawSample]):

    @classmethod
    def prepare(cls, config: HuggingFaceConfig) -> None:
-        download_result = cls.download(config=config)
+        raw_data = cls.get_data(config=config)

-        if download_result is None:
-            raise ValueError("download method returned None")
+        if raw_data is None:
+            raise ValueError("Method get_data returned None")
        else:
-            train_data, eval_data = download_result
+            train_data, eval_data = raw_data

        if config.eval_local_path_to_data is None and eval_data is not None:
            logger.warning("eval_local_path_to_data is None, but eval_data is not None")
@@ -80,9 +80,7 @@ def load(cls, path_to_data: str, **kwargs: Any) -> "BaseDataset":
        data = list()

        if not os.path.isfile(path_to_data):
-            raise FileNotFoundError(
-                f"File {path_to_data} not found. Probably you should run download_and_prepare before"
-            )
+            raise FileNotFoundError(f"File {path_to_data} not found. Probably you should run .prepare before")

        with open(path_to_data) as file_object:
            for line in file_object:
@@ -105,7 +103,7 @@ def __getitem__(self, index: int) -> RawSample:

    @classmethod
    @abstractmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        raise NotImplementedError

    @abstractmethod
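Custom datasets now override get_data instead of download. A minimal sketch of a subclass under the new contract; the class name, sample contents, and get_sample body are illustrative, and treating get_sample as the second abstract method is an assumption based on the subclasses in this commit:

    class MyDataset(BaseDataset):
        @classmethod
        def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
            train_data: List[RawSample] = [{"text": "Hello!"}]
            return train_data, None  # the second element is the optional eval split

        def get_sample(self, index: int) -> RawSample:
            return self.data[index]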
17 changes: 14 additions & 3 deletions src/xllm/datasets/general.py
@@ -34,14 +34,25 @@ def __init__(
        self.separator = separator

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        dist_logger.warning(
-            "This is a special type of dataset in which it is not supposed to download anything. "
-            "You must pass the data here through __init__, "
+            "This is a special type of dataset in which it is not supposed to get_data anything. "
+            "You must pass the data here through __init__ or use from_list, "
            "or through the path in config.train_local_path_to_data and config.eval_local_path_to_data (optional)"
        )
        return None

+    @classmethod
+    def from_list(
+        cls,
+        data: List[str],
+        sample_field: str = enums.General.default_sample_field,
+        separator: Optional[str] = None,
+    ) -> "GeneralDataset":
+        prepared_data: List[RawSample] = [{sample_field: text} for text in data]
+        dataset = cls(data=prepared_data, sample_field=sample_field, separator=separator)
+        return dataset

    def get_sample(self, index: int) -> RawSample:
        text = self.data[index][self.sample_field]

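A usage sketch for the new from_list constructor (not part of the diff; assumes enums.General.default_sample_field is "text"):

    from xllm.datasets import GeneralDataset

    dataset = GeneralDataset.from_list(data=["Hello!", "How are you?"])
    # from_list wraps each string into a RawSample dict before calling __init__
    assert dataset.data[0] == {"text": "Hello!"}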
1 change: 1 addition & 0 deletions src/xllm/datasets/registry.py
@@ -19,5 +19,6 @@

datasets_registry = Registry(name=enums.Registry.datasets)

+datasets_registry.add(key=enums.Datasets.default, value=GeneralDataset)
datasets_registry.add(key=enums.Datasets.general, value=GeneralDataset)
datasets_registry.add(key=enums.Datasets.soda, value=SodaDataset)
2 changes: 1 addition & 1 deletion src/xllm/datasets/soda.py
@@ -35,7 +35,7 @@ def __init__(self, data: List[RawSample], header_drop_probability: float = 0.05):
        self.header_drop_probability = header_drop_probability

    @classmethod
-    def download(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
+    def get_data(cls, config: HuggingFaceConfig) -> Optional[Tuple[List[RawSample], Optional[List[RawSample]]]]:
        soda_dataset = datasets.load_dataset(cls._HF_DATASET_ID)

        parsed_data: Dict[str, List[RawSample]] = dict()
1 change: 1 addition & 0 deletions src/xllm/enums.py
@@ -39,6 +39,7 @@ class Registry:

@dataclass
class Datasets:
+    default: str = "default"
    general: str = "general"
    soda: str = "soda"
