
Commit

demo fixes & readme
goliaro committed Nov 8, 2024
1 parent fca3d95 commit 9a1eae5
Showing 3 changed files with 34 additions and 4 deletions.
18 changes: 18 additions & 0 deletions inference/python/streamlit/README.md
@@ -0,0 +1,18 @@
# Streamlit demo

## Instructions

1. Build and install FlexFlow, or build it and run `source ./set_python_envs.sh` from the build folder
2. Edit `FlexFlow/inference/python/streamlit/fastapi_incr.py` to select the model to run and set the system configuration (number of GPUs, amount of memory, etc.); see the example sketch after this list
3. In one terminal, launch the LLM engine with the commands below, and wait until the model weights finish loading
```
cd FlexFlow/inference/python/streamlit
python fastapi_incr.py
```
4. In another terminal, launch the streamlit app:
```
cd FlexFlow/inference/python/streamlit
streamlit run app.py
```
5. Open the URL printed to the terminal, e.g. `http://localhost:8501`, and interact with the app in your browser
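
As a rough illustration of the settings step 2 refers to, the sketch below shows a typical FlexFlow serve initialization. The parameter names follow the FlexFlow serve Python API, but the exact options and values used by `fastapi_incr.py` may differ, so treat this as a starting point and check that file.
```
# Illustrative sketch only; check fastapi_incr.py for the configuration
# it actually uses. The model name and values below are placeholders.
import flexflow.serve as ff

ff.init(
    num_gpus=4,                       # number of GPUs to run on
    memory_per_gpu=14000,             # GPU memory per device, in MB
    zero_copy_memory_per_node=30000,  # pinned host memory per node, in MB
    tensor_parallelism_degree=4,
    pipeline_parallelism_degree=1,
)

llm = ff.LLM("meta-llama/Meta-Llama-3-8B-Instruct")  # model to serve
```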

16 changes: 12 additions & 4 deletions python/flexflow/serve/serve.py
@@ -521,7 +521,7 @@ def compile(

atexit.register(self.rm.stop_server)

def _generate(self, requests: List[Request]):
def _generate(self, requests: List[Request]) -> List[GenerationResult]:
if len(requests) == 0:
return []
for req in requests:
@@ -554,7 +554,7 @@ def _generate(self, requests: List[Request]):
)
return self.model.ffmodel.generate(requests)

def __chat2prompt(self, messages: List[dict]):
def __chat2prompt(self, messages: List[dict]) -> str:
"""Convert a list of messages to a single prompt string
:param messages: The list of messages to convert
@@ -573,6 +573,12 @@ def __chat2prompt(self, messages: List[dict]):
if self.tokenizer.chat_template is None:
raise ValueError(f"Model {self.model_name} does not support chat completion")
return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]:
assert(len(requests) == len(outputs))
for i in range(len(outputs)):
outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):]
return outputs

def generate(
self,
@@ -626,7 +632,8 @@ def generate(
max_new_tokens=max_new_tokens,
add_special_tokens=False,
)
return self._generate([request])
outputs = self._generate([request])
return self.__output2chat_response([request], outputs)
elif type(requests_or_prompts[0]) == list:
prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts]
requests = [
@@ -639,7 +646,8 @@
)
for prompt in prompts
]
return self._generate(requests)
outputs = self._generate(requests)
return self.__output2chat_response(requests, outputs)
elif type(requests_or_prompts[0]) == Request:
print(requests_or_prompts)
return self._generate(requests_or_prompts)
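
Taken together, the `serve.py` changes above make chat-style calls return only the newly generated text: `__chat2prompt` builds the prompt with the tokenizer's chat template, and `__output2chat_response` strips the echoed prompt from each result. A minimal usage sketch follows; it assumes an `LLM` object has already been compiled and its server started, and the sampling settings are placeholders.
```
# Sketch of the chat-completion path; assumes `llm` is a compiled
# flexflow.serve.LLM whose server is already running.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is incremental decoding?"},
]

# generate() templates the messages into a prompt, runs inference, and,
# after this commit, strips the echoed prompt from the output text.
results = llm.generate(messages, max_new_tokens=128)
print(results[0].output_text)  # assistant reply only, without the prompt
```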
4 changes: 4 additions & 0 deletions src/runtime/request_manager.cc
@@ -765,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
assert(processed_tokens < request.tokens.size());
bool request_completed = check_inf_req_completion(old_bc, i);
if (request_completed) {
if (is_eos_token(request.tokens.back())) {
// remove the EOS token
request.tokens.pop_back();
}
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
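
The `request_manager.cc` change pops the EOS token before decoding so that the end-of-sequence marker does not leak into the returned text. The sketch below illustrates the same effect with a Hugging Face tokenizer in Python (not the sentencepiece C++ path used here); the tokenizer name is just an example.
```
# Illustration only: why the EOS id is removed before decoding.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer
ids = tok("The answer is 42.")["input_ids"] + [tok.eos_token_id]

print(tok.decode(ids))       # "The answer is 42.<|endoftext|>"  (EOS leaks into the text)
print(tok.decode(ids[:-1]))  # "The answer is 42."
```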
