# generation.py (forked from chats-bug/hugging_face_peft_gen)
from finetuned_model import FinetunedModel
from hf_models import HfModel
from models import InputPayload
import os
from transformers import BitsAndBytesConfig
from typing import Union
import torch
def parse_env_flag(raw_value: Union[str, None]) -> bool:
    """Interpret the raw text of an environment variable as an on/off switch.

    ``bool("false")`` and ``bool("0")`` are both ``True`` because any
    non-empty string is truthy, so the "off" spellings are checked explicitly.

    Args:
        raw_value: The string read from the environment, or ``None`` when the
            variable is not set.

    Returns:
        ``False`` when the variable is unset, blank, or one of ``0``,
        ``false``, ``no``, ``off`` (case-insensitive, surrounding whitespace
        ignored). ``True`` for every other value, so any value that used to
        enable a flag still enables it.
    """
    if raw_value is None:
        return False
    return raw_value.strip().lower() not in {"", "0", "false", "no", "off"}


# Identifier handed to the model wrappers as ``model_id``; None when unset.
MODEL_REPO_ID = os.environ.get("MODEL_ID")
# NOTE: the variable really is spelled "FINETUNNED"; the name is kept as is
# so existing deployments that already set it keep working.
PEFT_MODEL = parse_env_flag(os.environ.get("FINETUNNED"))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LOAD_IN_4BIT = parse_env_flag(os.environ.get("LOAD_IN_4BIT"))
LOAD_IN_8BIT = parse_env_flag(os.environ.get("LOAD_IN_8BIT"))
# Quantization settings live here; edit them as needed.
# The available options are described in the Hugging Face documentation.
if LOAD_IN_4BIT and LOAD_IN_8BIT:
    # The two precisions are mutually exclusive, so fail at import time.
    raise Exception("Cannot load in both 4bit and 8bit")

# NOTE(review): this config always requests 4-bit loading, whatever
# LOAD_IN_4BIT / LOAD_IN_8BIT say -- confirm that is intended.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)
####################################
# Build the module-level ``model`` used by ``inference``.
# PEFT_MODEL chooses between the plain wrapper and the PEFT-adapter wrapper.
####################################
if not PEFT_MODEL:
    # Plain model (no PEFT adapters).
    model = HfModel(
        model_id=MODEL_REPO_ID,
        quantization_config=quantization_config,
        load_in_4bit=LOAD_IN_4BIT,
        load_in_8bit=LOAD_IN_8BIT,
    )  # Pass any other arguments to the model here
else:
    # Finetuned model (PEFT adapters).
    # NOTE(review): load_in_4bit is hard-coded to True on this path and the
    # LOAD_IN_4BIT / LOAD_IN_8BIT flags are not consulted -- confirm intended.
    model = FinetunedModel(
        model_id=MODEL_REPO_ID,
        quantization_config=quantization_config,
        load_in_4bit=True,
    )
# Some Helper functions
def process_output(llm_responses: Union[str, list[str]]) -> list[str]:
    """Extract the bracketed payload that follows ``OUTPUT:`` in each response.

    Each response is stripped, the text after the first ``"OUTPUT:\\n"``
    marker is taken, and it is cut at the first ``"]"`` (a closing ``"]"`` is
    always appended to the result).

    Every response is handled on its own: one malformed response no longer
    discards the results already extracted, and the function always returns
    a list rather than a list on success and a bare string on failure.

    Args:
        llm_responses: A single response string or a list of them. ``None``
            is treated as "no responses".

    Returns:
        One entry per input response, in order. A response without the
        marker is returned stripped but otherwise unchanged; a non-string
        entry is passed through as is.
    """
    if llm_responses is None:
        return []
    if isinstance(llm_responses, str):
        llm_responses = [llm_responses]

    marker = "OUTPUT:\n"
    outputs = []
    for llm_response in llm_responses:
        if not isinstance(llm_response, str):
            # Best effort: report it and keep the entry so positions line up.
            print(
                "Error processing the response: "
                f"expected str, got {type(llm_response).__name__}"
            )
            outputs.append(llm_response)
            continue

        text = llm_response.strip()
        segments = text.split(marker)
        if len(segments) < 2:
            # Marker missing: fall back to the stripped raw response.
            print(f"Error processing the response: {marker!r} marker not found")
            outputs.append(text)
            continue

        # Keep only the text between the first marker and the first "]".
        outputs.append(segments[1].split("]")[0] + "]")
    return outputs
def inference(payload: InputPayload):
    """Generate text for ``payload`` with the module-level ``model``.

    The prompt comes from ``payload.inputs`` and the sampling settings from
    ``payload.parameters``. When ``payload.superagi_task_gen`` is truthy the
    generated text is passed through ``process_output`` before being returned;
    otherwise it is returned exactly as the model produced it.
    """
    params = payload.parameters
    completions = model.generate(
        message=payload.inputs,
        max_new_tokens=params.max_new_tokens,
        temperature=params.temperature,
        top_p=params.top_p,
        num_return_sequences=params.num_return_sequences,
        device=DEVICE,
    )
    if not payload.superagi_task_gen:
        return completions
    return process_output(completions)