From c7dce07100fb590f3832a0703385cb316f83f776 Mon Sep 17 00:00:00 2001
From: wangye
Date: Thu, 26 Oct 2023 08:52:01 +0000
Subject: [PATCH] feat: add base model generate demo

---
 demo/README.md                        | 12 +++++++++++-
 demo/README_EN.md                     | 12 ++++++++++++
 demo/generate_demo.py                 |  9 +++++++++
 demo/stream_generate_demo.py          |  9 +++++++++
 demo/tensor_parallel_generate_demo.py | 37 +++++++++++++++++++++++++++++++++++++
 requirements.txt                      |  1 +
 6 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 demo/README_EN.md
 create mode 100644 demo/generate_demo.py
 create mode 100644 demo/stream_generate_demo.py
 create mode 100644 demo/tensor_parallel_generate_demo.py
 create mode 100644 requirements.txt

diff --git a/demo/README.md b/demo/README.md
index 660985f3..fe295d5c 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -1,2 +1,12 @@
+# Demo
+## generate_demo.py
+- **Description**: language model generation demo
+- **Usage**: `python generate_demo.py`
 
-This is a placeholder folder
+## stream_generate_demo.py
+- **Description**: language model streaming generation demo
+- **Usage**: `python stream_generate_demo.py`
+
+## tensor_parallel_generate_demo.py
+- **Description**: language model model-parallel generation demo
+- **Usage**: `torchrun --nproc_per_node=2 tensor_parallel_generate_demo.py`
\ No newline at end of file
diff --git a/demo/README_EN.md b/demo/README_EN.md
new file mode 100644
index 00000000..eb2f6d72
--- /dev/null
+++ b/demo/README_EN.md
@@ -0,0 +1,12 @@
+# Demo
+## generate_demo.py
+- **Description**: LLM generation demo
+- **Usage**: `python generate_demo.py`
+
+## stream_generate_demo.py
+- **Description**: LLM streaming generation demo
+- **Usage**: `python stream_generate_demo.py`
+
+## tensor_parallel_generate_demo.py
+- **Description**: LLM tensor-parallel generation demo
+- **Usage**: `torchrun --nproc_per_node=2 tensor_parallel_generate_demo.py`
\ No newline at end of file
diff --git a/demo/generate_demo.py b/demo/generate_demo.py
new file mode 100644
index 00000000..5b107337
--- /dev/null
+++ b/demo/generate_demo.py
@@ -0,0 +1,9 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    "Yi-6b", device_map="auto", torch_dtype="auto", trust_remote_code=True
+)
+inputs = tokenizer("Hello", return_tensors="pt")
+outputs = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024)
+print(tokenizer.decode(outputs[0]))
diff --git a/demo/stream_generate_demo.py b/demo/stream_generate_demo.py
new file mode 100644
index 00000000..20567232
--- /dev/null
+++ b/demo/stream_generate_demo.py
@@ -0,0 +1,9 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+
+tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True)
+streamer = TextStreamer(tokenizer)
+model = AutoModelForCausalLM.from_pretrained(
+    "Yi-6b", device_map="auto", torch_dtype="auto", trust_remote_code=True
+)
+inputs = tokenizer("Hello", return_tensors="pt")
+_ = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024, streamer=streamer)
diff --git a/demo/tensor_parallel_generate_demo.py b/demo/tensor_parallel_generate_demo.py
new file mode 100644
index 00000000..9ef87707
--- /dev/null
+++ b/demo/tensor_parallel_generate_demo.py
@@ -0,0 +1,37 @@
+import os
+
+import deepspeed
+import torch
+from deepspeed.module_inject import auto_tp
+from torch import nn
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def is_load_module(module):
+    load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm]
+    load_layer_names = [
+        "LPLayerNorm",
+        "SharedEmbedding",
+        "OPTLearnedPositionalEmbedding",
"LlamaRMSNorm", + "YiRMSNorm", + ] + return module.__class__ in load_layers or module._get_name() in load_layer_names + + +auto_tp.Loading.is_load_module = is_load_module + +torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) +model = AutoModelForCausalLM.from_pretrained( + "Yi-6b", device_map="cuda", torch_dtype="auto", trust_remote_code=True +) + +model = deepspeed.init_inference( + model, mp_size=int(os.environ["WORLD_SIZE"]), replace_with_kernel_inject=False +) +torch.cuda.empty_cache() + +tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True) +inputs = tokenizer("Hello", return_tensors="pt") +outputs = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024) +print(tokenizer.decode(outputs[0])) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..8cf51987 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +deepspeed==0.11.1 \ No newline at end of file