Merge pull request #2 from 01-ai/wangye/add_generate_demo
feat: add base model generate demo
learninmou authored Oct 30, 2023
2 parents 1e163ac + c7dce07 commit e630976
Showing 6 changed files with 79 additions and 1 deletion.
12 changes: 11 additions & 1 deletion demo/README.md
@@ -1,2 +1,12 @@
# Demo
## generate_demo.py
- **Description**: LLM generation demo
- **Usage**: `python generate_demo.py`

This is a placeholder folder
## stream_generate_demo.py
- **Description**: LLM streaming generation demo
- **Usage**: `python stream_generate_demo.py`

## tensor_parallel_generate_demo.py
- **Description**: LLM model-parallel generation demo
- **Usage**: `torchrun --nproc_per_node=2 tensor_parallel_generate_demo.py`
12 changes: 12 additions & 0 deletions demo/README_EN.md
@@ -0,0 +1,12 @@
# Demo
## generate_demo.py
- **Description**: LLM generation demo
- **Usage**: `python generate_demo.py`

## stream_generate_demo.py
- **Description**: LLM streaming generation demo
- **Usage**: `python stream_generate_demo.py`

## tensor_parallel_generate_demo.py
- **Description**: LLM tensor-parallel generation demo
- **Usage**: `torchrun --nproc_per_node=2 tensor_parallel_generate_demo.py`
9 changes: 9 additions & 0 deletions demo/generate_demo.py
@@ -0,0 +1,9 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Yi-6b tokenizer and model; device_map="auto" places the weights across available GPUs.
tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Yi-6b", device_map="auto", torch_dtype="auto", trust_remote_code=True
)
# Tokenize a prompt, generate up to 1024 new tokens, and print the decoded output.
inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024)
print(tokenizer.decode(outputs[0]))
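
The demo above uses greedy decoding by default. A minimal sketch, not part of this commit, of passing sampling options to `generate`; the parameter values below are illustrative assumptions, not project defaults:

# Hypothetical variation of the call above: sample instead of greedy decoding.
outputs = model.generate(
    inputs.input_ids.cuda(),
    max_new_tokens=256,
    do_sample=True,    # enable sampling
    temperature=0.7,   # soften the token distribution
    top_p=0.9,         # nucleus sampling
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))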
9 changes: 9 additions & 0 deletions demo/stream_generate_demo.py
@@ -0,0 +1,9 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Load the Yi-6b tokenizer and model, and attach a TextStreamer so tokens are printed as they are generated.
tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True)
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    "Yi-6b", device_map="auto", torch_dtype="auto", trust_remote_code=True
)
inputs = tokenizer("Hello", return_tensors="pt")
_ = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024, streamer=streamer)
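
`TextStreamer` writes directly to stdout. If the streamed text needs to be consumed programmatically instead, a minimal sketch (assuming the same `model`, `tokenizer`, and `inputs` objects as in the demo above) using `TextIteratorStreamer` with a background thread:

from threading import Thread

from transformers import TextIteratorStreamer

# Generation runs in a background thread; the main thread iterates over decoded chunks as they arrive.
iter_streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
thread = Thread(
    target=model.generate,
    kwargs=dict(inputs=inputs.input_ids.cuda(), max_new_tokens=1024, streamer=iter_streamer),
)
thread.start()
for chunk in iter_streamer:
    print(chunk, end="", flush=True)
thread.join()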
37 changes: 37 additions & 0 deletions demo/tensor_parallel_generate_demo.py
@@ -0,0 +1,37 @@
import os

import deepspeed
import torch
from deepspeed.module_inject import auto_tp
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer


# Patch DeepSpeed AutoTP's module-loading check so that Yi's custom normalization layer
# (YiRMSNorm) is recognized alongside the built-in layer types.
def is_load_module(module):
    load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm]
    load_layer_names = [
        "LPLayerNorm",
        "SharedEmbedding",
        "OPTLearnedPositionalEmbedding",
        "LlamaRMSNorm",
        "YiRMSNorm",
    ]
    return module.__class__ in load_layers or module._get_name() in load_layer_names


auto_tp.Loading.is_load_module = is_load_module

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
model = AutoModelForCausalLM.from_pretrained(
    "Yi-6b", device_map="cuda", torch_dtype="auto", trust_remote_code=True
)

# Shard the model across WORLD_SIZE GPUs with DeepSpeed tensor parallelism (AutoTP; no kernel injection).
model = deepspeed.init_inference(
    model, mp_size=int(os.environ["WORLD_SIZE"]), replace_with_kernel_inject=False
)
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained("Yi-6b", trust_remote_code=True)
inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(inputs.input_ids.cuda(), max_new_tokens=1024)
print(tokenizer.decode(outputs[0]))
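
Under `torchrun` every rank runs the whole script, so the decoded output is printed once per process. A minimal adjustment, not part of this commit, that limits printing to rank 0 (assuming the `RANK` environment variable that torchrun sets):

# Only rank 0 prints, so the completion appears once rather than once per GPU.
if int(os.environ.get("RANK", "0")) == 0:
    print(tokenizer.decode(outputs[0]))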
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
deepspeed==0.11.1
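
The demos also import `torch` and `transformers`, which this file does not pin. A sketch of a fuller requirements list; the version bounds are assumptions, not constraints tested in this commit:

deepspeed==0.11.1
torch>=2.0
transformers>=4.34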
