
Commit

demo fixes & readme
goliaro committed Nov 8, 2024
1 parent fca3d95 commit 9a1eae5
Showing 3 changed files with 34 additions and 4 deletions.
18 changes: 18 additions & 0 deletions inference/python/streamlit/README.md
@@ -0,0 +1,18 @@
# Streamlit demo

## Instructions

1. Build and install FlexFlow, or build it and run `source ./set_python_envs.sh` from the build folder
2. Edit `FlexFlow/inference/python/streamlit/fastapi_incr.py` to select the model to run and set the system configuration (number of GPUs, amount of memory, etc.); see the example sketch after this list
3. In one terminal, launch the LLM engine with the commands below, and wait until the model weights finish loading
```
cd FlexFlow/inference/python/streamlit
python fastapi_incr.py
```
4. In another terminal, launch the streamlit app:
```
cd FlexFlow/inference/python/streamlit
streamlit run app.py
```
5. Open the URL printed to the terminal, e.g. `http://localhost:8501`, and interact with the app in your browser
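
As a rough illustration of the settings step 2 refers to, the sketch below shows a typical FlexFlow serve initialization. The parameter names follow the FlexFlow serve Python API, but the exact options and values used by `fastapi_incr.py` may differ, so treat this as a starting point and check that file.
```
# Illustrative sketch only; check fastapi_incr.py for the configuration
# it actually uses. The model name and values below are placeholders.
import flexflow.serve as ff

ff.init(
    num_gpus=4,                       # number of GPUs to run on
    memory_per_gpu=14000,             # GPU memory per device, in MB
    zero_copy_memory_per_node=30000,  # pinned host memory per node, in MB
    tensor_parallelism_degree=4,
    pipeline_parallelism_degree=1,
)

llm = ff.LLM("meta-llama/Meta-Llama-3-8B-Instruct")  # model to serve
```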

16 changes: 12 additions & 4 deletions python/flexflow/serve/serve.py
@@ -521,7 +521,7 @@ def compile(

atexit.register(self.rm.stop_server)

def _generate(self, requests: List[Request]):
def _generate(self, requests: List[Request]) -> List[GenerationResult]:
if len(requests) == 0:
return []
for req in requests:
@@ -554,7 +554,7 @@ def _generate(self, requests: List[Request]):
)
return self.model.ffmodel.generate(requests)

def __chat2prompt(self, messages: List[dict]):
def __chat2prompt(self, messages: List[dict]) -> str:
"""Convert a list of messages to a single prompt string
:param messages: The list of messages to convert
@@ -573,6 +573,12 @@ def __chat2prompt(self, messages: List[dict]):
if self.tokenizer.chat_template is None:
raise ValueError(f"Model {self.model_name} does not support chat completion")
return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]:
assert(len(requests) == len(outputs))
for i in range(len(outputs)):
outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):]
return outputs

def generate(
self,
@@ -626,7 +632,8 @@ def generate(
max_new_tokens=max_new_tokens,
add_special_tokens=False,
)
return self._generate([request])
outputs = self._generate([request])
return self.__output2chat_response([request], outputs)
elif type(requests_or_prompts[0]) == list:
prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts]
requests = [
@@ -639,7 +646,8 @@
)
for prompt in prompts
]
return self._generate(requests)
outputs = self._generate(requests)
return self.__output2chat_response(requests, outputs)
elif type(requests_or_prompts[0]) == Request:
print(requests_or_prompts)
return self._generate(requests_or_prompts)
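
Taken together, the `serve.py` changes above make chat-style calls return only the newly generated text: `__chat2prompt` builds the prompt with the tokenizer's chat template, and `__output2chat_response` strips the echoed prompt from each result. A minimal usage sketch follows; it assumes an `LLM` object has already been compiled and its server started, and the sampling settings are placeholders.
```
# Sketch of the chat-completion path; assumes `llm` is a compiled
# flexflow.serve.LLM whose server is already running.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is incremental decoding?"},
]

# generate() templates the messages into a prompt, runs inference, and,
# after this commit, strips the echoed prompt from the output text.
results = llm.generate(messages, max_new_tokens=128)
print(results[0].output_text)  # assistant reply only, without the prompt
```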
4 changes: 4 additions & 0 deletions src/runtime/request_manager.cc
@@ -765,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
assert(processed_tokens < request.tokens.size());
bool request_completed = check_inf_req_completion(old_bc, i);
if (request_completed) {
if (is_eos_token(request.tokens.back())) {
// remove the EOS token
request.tokens.pop_back();
}
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
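
The `request_manager.cc` change pops the EOS token before decoding so that the end-of-sequence marker does not leak into the returned text. The sketch below illustrates the same effect with a Hugging Face tokenizer in Python (not the sentencepiece C++ path used here); the tokenizer name is just an example.
```
# Illustration only: why the EOS id is removed before decoding.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer
ids = tok("The answer is 42.")["input_ids"] + [tok.eos_token_id]

print(tok.decode(ids))       # "The answer is 42.<|endoftext|>"  (EOS leaks into the text)
print(tok.decode(ids[:-1]))  # "The answer is 42."
```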
