diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
deleted file mode 100644
index 5a85bff..0000000
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 0594372..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 474a048..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/sumGPT.iml b/.idea/sumGPT.iml
deleted file mode 100644
index a29c46d..0000000
--- a/.idea/sumGPT.iml
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/RUN.bat b/RUN.bat
index 6afd014..7488c88 100644
--- a/RUN.bat
+++ b/RUN.bat
@@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" (
echo "Requirements file has not been modified. Skipping update."
)
-streamlit run src/SumGPT.py
\ No newline at end of file
+streamlit run SumGPT/main.py
\ No newline at end of file
diff --git a/SumGPT/__init__.py b/SumGPT/__init__.py
new file mode 100644
index 0000000..a576e4b
--- /dev/null
+++ b/SumGPT/__init__.py
@@ -0,0 +1,4 @@
+import streamlit as st
+
+if "summaries" not in st.session_state:
+ st.session_state["summaries"] = []
diff --git a/SumGPT/app/__init__.py b/SumGPT/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py
new file mode 100644
index 0000000..8becb93
--- /dev/null
+++ b/SumGPT/app/body_handler.py
@@ -0,0 +1,243 @@
+import asyncio
+from typing import Any, Dict, List, Optional, Tuple
+
+import streamlit as st
+import utils.io as io
+from core.llm import LLM
+from core.tokenizer import Tokenizer
+from datamodel.chunk import Chunk
+from datamodel.llm_params import LLMParams
+
+
+class BodyHandler:
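+ """Render the main page body: file upload, chunking, and per-chunk summary generation."""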
+ def file_uploader(self, type: List[str] = ["txt"]) -> List[Dict[str, str]]:
+ uploaded_files = st.file_uploader("Upload a file", type=type, accept_multiple_files=True)
+ files = []
+ if not uploaded_files:
+ st.warning("File is not uploaded.")
+ st.stop()
+ for file in uploaded_files:
+ text = io.read_to_string(file)
+ filename = file.name
+ files.append({"filename": filename, "text": text})
+ return files
+
+ def segment_text(
+ self, text: str, chunk_size: int, model: str, input_id: int
+ ) -> Tuple[List[Chunk], int]:
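+ """Encode the text with the model's tiktoken encoding and split it into chunks of at most chunk_size tokens; returns the chunks and the total token count."""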
+ chunks: List[Chunk] = []
+ tokenizer = Tokenizer(model)
+ total_tokens = tokenizer.tokenize(text)
+ count = 0
+ for i in range(0, len(total_tokens), chunk_size):
+ chunk_tokens = total_tokens[i : i + chunk_size]
+ content = tokenizer.detokenize(chunk_tokens)
+ chunks.append(Chunk(count, content, len(chunk_tokens), input_id))
+ count += 1
+ return chunks, len(total_tokens)
+
+ def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]:
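+ """Pull completion, prompt and cached token counts out of the LangChain response metadata (zero when a field is missing)."""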
+ completion_tokens = response_meta.get("token_usage", {}).get("completion_tokens", 0)
+ prompt_tokens = response_meta.get("token_usage", {}).get("prompt_tokens", 0)
+ cached_tokens = (
+ response_meta.get("token_usage", {})
+ .get("prompt_tokens_details", {})
+ .get("cached_tokens", 0)
+ )
+ return completion_tokens, prompt_tokens, cached_tokens
+
+ def generate(
+ self,
+ chunks: List[Chunk],
+ gpt_params: LLMParams,
+ role: str,
+ api_key: Optional[str],
+ ) -> None:
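+ """Summarize chunks sequentially (one blocking LLM call per chunk), render the results grouped by filename and cache them in session state."""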
+ generate_button = st.button("Generate summary")
+ if generate_button:
+ if not api_key:
+ st.error("❌ Please enter your OpenAI API key in the sidebar.")
+ return
+ if not role:
+ st.error("❌ Please enter a role description in the sidebar.")
+ return
+
+ st.session_state["summaries"] = [] # Initialize or reset summaries
+
+ progress_text = st.empty()
+ progress_bar = st.progress(0)
+ total_chunks = len(chunks)
+
+ # Group chunks by filename
+ filename_chunks = {}
+ for chunk in chunks:
+ if chunk.filename not in filename_chunks:
+ filename_chunks[chunk.filename] = []
+ filename_chunks[chunk.filename].append(chunk)
+
+ llm = LLM(api_key, gpt_params)
+ processed_chunks = 0
+
+ # Process chunks by filename
+ for filename, file_chunks in filename_chunks.items():
+ expander = st.expander(f"{filename}")
+ for chunk in file_chunks:
+ processed_chunks += 1
+ progress_text.write(f"Generating summaries {processed_chunks}/{total_chunks}")
+ progress_bar.progress(processed_chunks / total_chunks)
+
+ summary = llm.generate(chunk.content, role)
+ with expander:
+ with st.chat_message("🤖"):
+ st.write(summary.content)
+ completion_tokens, prompt_tokens, cached_tokens = self._get_tokens(
+ summary.response_metadata
+ )
+ price = round(
+ llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6
+ )
+ st.write(
+ f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`"
+ )
+ # Store the summary in session state
+ st.session_state["summaries"].append(
+ {
+ "filename": filename,
+ "content": summary.content,
+ "tokens": completion_tokens + prompt_tokens,
+ "price": price,
+ }
+ )
+
+ progress_text.write("✅ All chunks processed!")
+ progress_bar.progress(1.0)
+ else:
+ # Check if summaries exist in session state and display them
+ if "summaries" in st.session_state:
+ # Group summaries by filename
+ filename_summaries = {}
+ for summary_data in st.session_state["summaries"]:
+ filename = summary_data["filename"]
+ if filename not in filename_summaries:
+ filename_summaries[filename] = []
+ filename_summaries[filename].append(summary_data)
+
+ # Display summaries grouped by filename
+ for filename, summaries in filename_summaries.items():
+ with st.expander(f"{filename}"):
+ for summary_data in summaries:
+ with st.chat_message("🤖"):
+ st.write(summary_data["content"])
+ st.write(
+ f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`"
+ )
+
+ def agenerate(
+ self,
+ chunks: List[Chunk],
+ gpt_params: LLMParams,
+ role: str,
+ api_key: Optional[str],
+ ) -> None:
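+ """Summarize chunks concurrently: all LLM calls are dispatched via asyncio.gather, then results are rendered in chunk order, grouped by filename."""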
+ generate_button = st.button("Generate summary")
+ if generate_button:
+ if not api_key:
+ st.error("❌ Please enter your OpenAI API key in the sidebar.")
+ return
+ if not role:
+ st.error("❌ Please enter a role description in the sidebar.")
+ return
+
+ st.session_state["summaries"] = [] # Initialize or reset summaries
+
+ async def process_chunks():
+ llm = LLM(api_key, gpt_params)
+ total_chunks = len(chunks)
+ progress_text = st.empty()
+ progress_text.write(f"Generating summaries 0/{total_chunks}")
+ total_price_text = st.empty()
+ total_price = 0
+
+ progress_bar = st.progress(0)
+ completed_chunks = 0
+
+ # Sort chunks by chunk.id
+ sorted_chunks = sorted(chunks, key=lambda c: c.id)
+
+ # Group chunks by filename
+ filename_chunks = {}
+ for chunk in sorted_chunks:
+ if chunk.filename not in filename_chunks:
+ filename_chunks[chunk.filename] = []
+ filename_chunks[chunk.filename].append(chunk)
+
+ # Create expanders for each file
+ expanders = {
+ filename: st.expander(f"{filename}") for filename in filename_chunks.keys()
+ }
+
+ # Create tasks for all chunks (sorted by chunk.id)
+ tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks]
+
+ # Run all tasks and get the results in the same order
+ summaries = await asyncio.gather(*tasks)
+
+ # Process the results in order
+ for summary, current_chunk in zip(summaries, sorted_chunks):
+ completed_chunks += 1
+ progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}")
+ progress_bar.progress(completed_chunks / total_chunks)
+
+ with expanders[current_chunk.filename]:
+ with st.chat_message("ai"):
+ st.write(summary.content)
+ completion_tokens, prompt_tokens, cached_tokens = self._get_tokens(
+ summary.response_metadata
+ )
+ price = round(
+ llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6
+ )
+ st.write(
+ f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`"
+ )
+ total_price += price
+
+ # Store the summary in session state
+ st.session_state["summaries"].append(
+ {
+ "filename": current_chunk.filename,
+ "content": summary.content,
+ "tokens": completion_tokens + prompt_tokens,
+ "price": price,
+ }
+ )
+
+ progress_text.write("✅ All chunks processed!")
+ progress_bar.progress(1.0)
+ total_price_text.write(f"Total price: `${round(total_price, 6)}`")
+
+ # Run the async processing
+ asyncio.run(process_chunks())
+ else:
+ # Check if summaries exist in session state and display them
+ if "summaries" in st.session_state:
+ total_price = 0
+ # Group summaries by filename
+ filename_summaries = {}
+ for summary_data in st.session_state["summaries"]:
+ filename = summary_data["filename"]
+ if filename not in filename_summaries:
+ filename_summaries[filename] = []
+ filename_summaries[filename].append(summary_data)
+
+ # Display summaries grouped by filename
+ for filename, summaries in filename_summaries.items():
+ with st.expander(f"{filename}"):
+ for summary_data in summaries:
+ with st.chat_message("ai"):
+ st.write(summary_data["content"])
+ st.write(
+ f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`"
+ )
+ total_price += summary_data["price"]
+ st.write(f"Total price: `${round(total_price, 6)}`")
diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py
new file mode 100644
index 0000000..aed232b
--- /dev/null
+++ b/SumGPT/app/page.py
@@ -0,0 +1,69 @@
+from typing import Dict, List, Optional
+
+import streamlit as st
+from datamodel.llm_params import LLMParams
+
+from app.body_handler import BodyHandler
+from app.sidebar_handler import SidebarHandler
+
+
+class Page:
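+ """Top-level page layout: header, sidebar (settings) and body (upload, chunk preview, summaries)."""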
+ def __init__(self):
+ self.chunk_size: Optional[int] = None
+ self.role: Optional[str] = None
+ self.api_key: Optional[str] = None
+ self.llm_params: Optional[LLMParams] = None
+
+ def draw_header(self, version):
+ st.title(f"📝 SumGPT {version}")
+ st.markdown("##### Summarize your text with OpenAI's API")
+ st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)")
+ st.warning(
+ "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo."
+ )
+
+ def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str]]) -> None:
+ with st.sidebar:
+ sb = SidebarHandler()
+ sb.header()
+ sb.import_config()
+ self.api_key = sb.api_key_entry()
+ with st.expander("Role settings"):
+ self.role = sb.role_settings_panel()
+ with st.expander("Configuration"):
+ self.llm_params, self.chunk_size = sb.config_control_panel(models_data)
+ sb.export_config()
+ sb.footer(manifest)
+
+ def draw_body(self) -> None:
+ if not self.chunk_size:
+ st.error("❌ Please set the chunk size in the sidebar.")
+ return
+ if not self.llm_params:
+ st.error("❌ Please set the model in the sidebar.")
+ return
+ if not self.role:
+ st.error("❌ Please set the role in the sidebar.")
+ return
+
+ body = BodyHandler()
+ texts = body.file_uploader(["txt", "md"])
+
+ total_chunks = []
+ filenames = []
+
+ for idx, text in enumerate(texts):
+ filename = text["filename"]
+ filenames.append(filename)
+ chunks, total_token_size = body.segment_text(
+ text["text"], self.chunk_size, self.llm_params.model.name, idx
+ )
+ with st.expander(f"`{filename}` **(chunks: {len(chunks)})**"):
+ for chunk in chunks:
+ chunk.set_filename_from_list(filenames)
+ st.write([chunk.to_dict() for chunk in chunks])
+ st.write(f"Tokens: `{total_token_size}`")
+
+ total_chunks.extend(chunks)
+
+ body.agenerate(total_chunks, self.llm_params, self.role, self.api_key)
diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py
new file mode 100644
index 0000000..378e681
--- /dev/null
+++ b/SumGPT/app/sidebar_handler.py
@@ -0,0 +1,100 @@
+from typing import Any, Dict, List, Tuple
+
+import streamlit as st
+import utils.helpers as helpers
+from datamodel.llm_params import LLMModel, LLMParams
+
+
+class SidebarHandler:
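+ """Sidebar widgets for the API key, role prompt and model/chunk configuration."""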
+ def __init__(self):
+ self.config = {}
+ self.chunk_size = None
+
+ def header(self):
+ st.title("SumGPT")
+ st.markdown("Select the model and parameters for summarization.")
+
+ def api_key_entry(self) -> str:
+ st.markdown("### API Key")
+ return st.text_input("Enter your OpenAI API key", type="password")
+
+ def role_settings_panel(self, height=300) -> str:
+ language = st.selectbox(
+ "Role language",
+ ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"],
+ )
+ role = st.text_area(
+ "Role settings",
+ self.config.get(
+ "role",
+ f"Write a detailed summary in perfect {language} that is concise, clear and coherent while capturing the main ideas of the text. "
+ "The summary should be well-structured and free of grammatical errors.\n\n"
+ "The summary is to be written in markdown format, with a heading (###) that encapsulates the core concept of the content. It should be concise and specific. Avoid generic headings like 'Summary' or 'Introduction'.",
+ ),
+ height=height,
+ )
+ if not role:
+ st.warning("Role settings are not set.")
+ st.stop()
+ return role
+
+ def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]:
+ model_names = helpers.extract_values(models_data, "model")
+ model_name = st.selectbox("Model", model_names, self.config.get("model_index", 0))
+ model = LLMModel.construct_from_dict(self._get_model_dict(models_data, model_name))
+
+ _param = self._construct_param(models_data, model_name)
+
+ chunk_size = st.number_input(
+ "Chunk size (tokens)",
+ 32,
+ _param["context_window"],
+ self.config.get("chunk_size", 2048),
+ step=1024,
+ )
+ max_tokens: int = st.number_input(
+ "Max output (tokens)",
+ 32,
+ _param["max_output_tokens"],
+ self.config.get("max_tokens", 512),
+ )
+ temperature: float = st.slider("Temperature", 0.0, 1.0, self.config.get("temperature", 0.7))
+ return (
+ LLMParams(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ ),
+ chunk_size,
+ )
+
+ def _get_model_dict(self, models_data, selected_model) -> Dict[str, Any]:
+ model_index = helpers.extract_dict_index(models_data, "model", selected_model)
+ return models_data[model_index]
+
+ def _construct_param(self, models_data, selected_model):
+ model_dict = self._get_model_dict(models_data, selected_model)
+ param = {
+ "max_output_tokens": model_dict["max_output_tokens"],
+ "context_window": model_dict["context_window"],
+ }
+ return param
+
+ def import_config(self):
+ st.markdown("### Import Configuration")
+ if st.button("Import configuration"):
+ raise NotImplementedError # TODO: implement
+
+ def export_config(self):
+ st.markdown("### Export Configuration")
+ if st.button("Export configuration"):
+ raise NotImplementedError # TODO: implement
+
+ def footer(self, data: Dict[str, Any]):
+ st.markdown("---")
+ st.markdown("### SumGPT")
+ st.markdown(f"Version: `{data.get('version')}`")
+ st.markdown(f"Author: {data.get('author')}")
+ st.markdown(f"[Report a bug]({data['bugs']['url']})")
+ st.markdown(f"[GitHub repo]({data['repository']['url']})")
+ st.markdown(f"License: [{data['license']['type']}]({data['license']['url']})")
diff --git a/SumGPT/core/__init__.py b/SumGPT/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SumGPT/core/llm.py b/SumGPT/core/llm.py
new file mode 100644
index 0000000..06b7421
--- /dev/null
+++ b/SumGPT/core/llm.py
@@ -0,0 +1,51 @@
+from datamodel.llm_params import LLMParams
+from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+from langchain_openai import ChatOpenAI
+from pydantic.types import SecretStr
+
+
+class LLM:
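+ """Thin wrapper around LangChain's ChatOpenAI for single-prompt synchronous and asynchronous generation."""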
+ def __init__(self, api_key: str, gpt_params: LLMParams):
+ self.api_key: str = api_key
+ self.llm_params: LLMParams = gpt_params
+ self.model: ChatOpenAI = self._set_llm()
+
+ def _set_llm(self) -> ChatOpenAI:
+ return ChatOpenAI(
+ api_key=SecretStr(self.api_key),
+ model=self.llm_params.model.name,
+ max_tokens=self.llm_params.max_tokens,
+ temperature=self.llm_params.temperature,
+ )
+
+ def generate(_self, prompt: str, system: str = "") -> BaseMessage:
+ messages = [
+ SystemMessage(content=system),
+ HumanMessage(content=prompt),
+ ]
+ return _self.model.invoke(messages)
+
+ async def agenerate(_self, prompt: str, system: str = "") -> BaseMessage:
+ messages = [
+ SystemMessage(content=system),
+ HumanMessage(content=prompt),
+ ]
+ return await _self.model.ainvoke(messages)
+
+ def Calc_price(
+ self,
+ input_tokens: int,
+ output_tokens: int,
+ cached_tokens: int = 0,
+ scale_factor: int = 1000000,
+ ) -> float:
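+ """Return the request cost in USD; prices in models.json are per one million tokens (scale_factor), and cached prompt tokens, when supported, are billed at the discounted cached rate."""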
+ pricing = self.llm_params.model.pricing
+ if cached_tokens != 0 and pricing.cached is not None:
+ input_tokens -= cached_tokens
+ return (
+ input_tokens * pricing.input
+ + output_tokens * pricing.output
+ + cached_tokens * pricing.cached
+ ) / scale_factor
+
+ return (input_tokens * pricing.input + output_tokens * pricing.output) / scale_factor
diff --git a/SumGPT/core/tokenizer.py b/SumGPT/core/tokenizer.py
new file mode 100644
index 0000000..d0e7d12
--- /dev/null
+++ b/SumGPT/core/tokenizer.py
@@ -0,0 +1,20 @@
+from typing import List
+
+import tiktoken
+
+
+class Tokenizer:
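+ """Wrapper around the tiktoken encoding for the selected model."""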
+ def __init__(self, model: str):
+ self.tokenizer = tiktoken.encoding_for_model(model)
+
+ def tokenize(self, text: str) -> List[int]:
+ return self.tokenizer.encode(text)
+
+ def detokenize(self, tokens: List[int]) -> str:
+ return self.tokenizer.decode(tokens)
+
+ def detokenize_single(self, tokens: List[int]) -> List[str]:
+ results = []
+ for token in tokens:
+ results.append(self.tokenizer.decode_single_token_bytes(token).decode("utf-8"))
+ return results
diff --git a/SumGPT/datamodel/chunk.py b/SumGPT/datamodel/chunk.py
new file mode 100644
index 0000000..64666d2
--- /dev/null
+++ b/SumGPT/datamodel/chunk.py
@@ -0,0 +1,22 @@
+class Chunk:
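+ """A slice of an input document: chunk id, text content, token count and the index of the source file."""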
+ def __init__(self, id: int, content: str, tokens: int, input_id: int):
+ self.id = id
+ self.content = content
+ self.tokens = tokens
+ self.input_id = input_id
+ self.filename = None
+
+ def __str__(self) -> str:
+ return f"Chunk(content={self.content}, tokens={self.tokens}, input_id={self.input_id})"
+
+ def set_filename_from_list(self, filenames: list[str]) -> str:
+ self.filename = filenames[self.input_id]
+ return self.filename
+
+ def to_dict(self) -> dict:
+ return {
+ "id": self.id,
+ "content": self.content,
+ "tokens": self.tokens,
+ "input_id": self.input_id,
+ }
diff --git a/SumGPT/datamodel/llm_model.py b/SumGPT/datamodel/llm_model.py
new file mode 100644
index 0000000..3c714c1
--- /dev/null
+++ b/SumGPT/datamodel/llm_model.py
@@ -0,0 +1,31 @@
+from typing import Optional
+
+
+class LLMModelPricing:
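+ """Token prices in USD per one million tokens (see models.json); cached is the discounted rate for cached prompt tokens."""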
+ def __init__(self, input: int, output: int, cached: Optional[int] = None):
+ self.input = input
+ self.output = output
+ self.cached = cached
+
+
+class LLMModel:
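+ """Static description of a model: context window, output token limit and pricing."""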
+ def __init__(
+ self, name: str, context_window: int, max_output_tokens: int, pricing: LLMModelPricing
+ ):
+ self.name = name
+ self.context_window = context_window
+ self.max_output_tokens = max_output_tokens
+ self.pricing = pricing
+
+ @staticmethod
+ def construct_from_dict(data: dict) -> "LLMModel":
+ pricing = LLMModelPricing(data["pricing"]["input"], data["pricing"]["output"])
+ if "cached" in data["pricing"]:
+ pricing.cached = data["pricing"]["cached"]
+
+ return LLMModel(
+ name=data["model"],
+ context_window=data["context_window"],
+ max_output_tokens=data["max_output_tokens"],
+ pricing=pricing,
+ )
diff --git a/SumGPT/datamodel/llm_params.py b/SumGPT/datamodel/llm_params.py
new file mode 100644
index 0000000..9de6306
--- /dev/null
+++ b/SumGPT/datamodel/llm_params.py
@@ -0,0 +1,13 @@
+from datamodel.llm_model import LLMModel, LLMModelPricing # noqa: F401
+
+
+class LLMParams:
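+ """Generation settings passed to the LLM wrapper: model, max output tokens and temperature."""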
+ def __init__(
+ self,
+ model: LLMModel,
+ max_tokens=2048,
+ temperature=0.7,
+ ):
+ self.model: LLMModel = model
+ self.max_tokens: int = max_tokens
+ self.temperature: float = temperature
diff --git a/SumGPT/main.py b/SumGPT/main.py
new file mode 100644
index 0000000..f33c425
--- /dev/null
+++ b/SumGPT/main.py
@@ -0,0 +1,16 @@
+from app.page import Page
+from utils import io
+
+
+def main():
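+ # Paths are resolved relative to the working directory; RUN.bat launches `streamlit run SumGPT/main.py` from the repository root.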
+ manifest = io.read_json_file("SumGPT/manifest.json")
+ models = io.read_json_file("SumGPT/models.json")
+
+ pg = Page()
+ pg.draw_header(manifest["version"])
+ pg.draw_sidebar(manifest, models)
+ pg.draw_body()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/SumGPT/manifest.json b/SumGPT/manifest.json
new file mode 100644
index 0000000..7a33f79
--- /dev/null
+++ b/SumGPT/manifest.json
@@ -0,0 +1,17 @@
+{
+ "name": "SumGPT",
+ "version": "2.0.0",
+ "license": {
+ "type": "MIT",
+ "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
+ },
+ "author": "Zeke Zhang",
+ "homepage": "https://github.com/sean1832/SumGPT",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/sean1832/SumGPT"
+ },
+ "bugs": {
+ "url": "https://github.com/sean1832/SumGPT/issues"
+ }
+ }
\ No newline at end of file
diff --git a/SumGPT/models.json b/SumGPT/models.json
new file mode 100644
index 0000000..9a3cd16
--- /dev/null
+++ b/SumGPT/models.json
@@ -0,0 +1,40 @@
+[
+ {
+ "model": "gpt-4o-mini",
+ "context_window": 128000,
+ "max_output_tokens": 16384,
+ "pricing": {
+ "input": 0.15,
+ "output": 0.6,
+ "cached": 0.075
+ }
+ },
+ {
+ "model": "gpt-4o",
+ "context_window": 128000,
+ "max_output_tokens": 4096,
+ "pricing": {
+ "input": 2.5,
+ "output": 10,
+ "cached": 1.25
+ }
+ },
+ {
+ "model": "gpt-4-turbo",
+ "context_window": 128000,
+ "max_output_tokens": 4096,
+ "pricing": {
+ "input": 10,
+ "output": 30
+ }
+ },
+ {
+ "model": "gpt-3.5-turbo",
+ "context_window": 16385,
+ "max_output_tokens": 4096,
+ "pricing": {
+ "input": 0.5,
+ "output": 1.5
+ }
+ }
+]
\ No newline at end of file
diff --git a/SumGPT/prompt.json b/SumGPT/prompt.json
new file mode 100644
index 0000000..f76ec60
--- /dev/null
+++ b/SumGPT/prompt.json
@@ -0,0 +1,22 @@
+[
+ {
+ "type": "recursive",
+ "legacy": false,
+ "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:",
+ "variables": [
+ {
+ "name": "[LANGUAGE]"
+ }
+ ]
+ },
+ {
+ "type": "final",
+ "legacy": false,
+ "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information. Structured markdown summary with headings in perfect [LANGUAGE] (####): ",
+ "variables": [
+ {
+ "name": "[LANGUAGE]"
+ }
+ ]
+ }
+]
\ No newline at end of file
diff --git a/SumGPT/utils/__init__.py b/SumGPT/utils/__init__.py
new file mode 100644
index 0000000..7979e22
--- /dev/null
+++ b/SumGPT/utils/__init__.py
@@ -0,0 +1,4 @@
+import utils.helpers as helpers
+import utils.io as io
+
+__all__ = ["helpers", "io"]
diff --git a/SumGPT/utils/helpers.py b/SumGPT/utils/helpers.py
new file mode 100644
index 0000000..25afa4f
--- /dev/null
+++ b/SumGPT/utils/helpers.py
@@ -0,0 +1,35 @@
+def extract_values(dicts, key, parent_key=None):
+ """
+ Extracts values from a list of dictionaries based on a specified key.
+ If the key is nested, a parent key can be specified.
+
+ :param dicts: List of dictionaries to query
+ :param key: The key for which values are to be extracted
+ :param parent_key: Optional parent key if the key is nested within another dictionary
+ :return: List of values corresponding to the specified key
+ """
+ values = []
+ for d in dicts:
+ if parent_key:
+ # Access the nested dictionary and then the key if parent_key is specified
+ if parent_key in d and key in d[parent_key]:
+ values.append(d[parent_key][key])
+ else:
+ # Access the key directly if there is no parent_key
+ if key in d:
+ values.append(d[key])
+ return values
+
+def extract_dict_index(dicts, key, value):
+ """
+ Extracts the index of a dictionary in a list of dictionaries based on a specified key-value pair.
+
+ :param dicts: List of dictionaries to query
+ :param key: The key to search for
+ :param value: The value to search for
+ :return: Index of the dictionary containing the specified key-value pair
+ """
+ for i, d in enumerate(dicts):
+ if key in d and d[key] == value:
+ return i
+ return None
\ No newline at end of file
diff --git a/SumGPT/utils/io.py b/SumGPT/utils/io.py
new file mode 100644
index 0000000..fef6d69
--- /dev/null
+++ b/SumGPT/utils/io.py
@@ -0,0 +1,17 @@
+import json
+from io import StringIO
+
+
+def read_json_file(file):
+ with open(file, "r") as f:
+ return json.load(f)
+
+
+def write_json_file(file, data: dict):
+ with open(file, "w") as f:
+ json.dump(data, f, indent=4)
+
+
+def read_to_string(file):
+ stringio = StringIO(file.getvalue().decode("utf-8"))
+ return stringio.read()
diff --git a/requirements.txt b/requirements.txt
index 9770e95..8c8c682 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,14 @@
docx==0.2.4
-python_docx==0.8.11
-langchain==0.0.123
+python_docx==1.1.2
langdetect==1.0.9
-numpy==1.24.2
-openai==0.27.2
-pydub==0.25.1
PyPDF4==1.27.0
-pytube==12.1.3
-streamlit==1.20.0
-streamlit_toggle_switch==1.0.2
-tiktoken==0.3.1
-requests==2.29.0
-youtube_transcript_api==0.6.0
+tiktoken==0.8.0
+requests==2.32.3
+
+# langchain
+langchain==0.3.4
+langchain-openai==0.2.3
+
+# streamlit
+streamlit==1.39.0
diff --git a/src/Components/Info.py b/src/Components/Info.py
deleted file mode 100644
index 72f324f..0000000
--- a/src/Components/Info.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import streamlit as st
-import Modules.file_io as file_io
-
-
-def info():
- info_panel = st.container()
-
- manifest = 'src/manifest.json'
- st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest)
-
- with info_panel:
- st.markdown('---')
- st.markdown(f"# {manifest_data['name']}")
- st.markdown(f"Version: `{manifest_data['version']}`")
- st.markdown(f"Author: {manifest_data['author']}")
- st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})")
- st.markdown(f"[GitHub repo]({manifest_data['homepage']})")
- st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})")
\ No newline at end of file
diff --git a/src/Components/StreamlitSetup.py b/src/Components/StreamlitSetup.py
deleted file mode 100644
index b57f3ce..0000000
--- a/src/Components/StreamlitSetup.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import streamlit as st
-import Data.caption_languages as data
-import Modules.file_io as file_io
-
-def setup():
- st.set_page_config(page_title="SumGPT", page_icon="π", layout="wide")
-
- if not st.session_state.get('OPENAI_API_KEY'):
- st.session_state['OPENAI_API_KEY'] = None
-
- if not st.session_state.get('OPENAI_PERSONA_REC'):
- st.session_state['OPENAI_PERSONA_REC'] = None
-
- if not st.session_state.get('OPENAI_PERSONA_SUM'):
- st.session_state['OPENAI_PERSONA_SUM'] = None
-
- if not st.session_state.get('CHUNK_SIZE'):
- st.session_state['CHUNK_SIZE'] = None
-
- if not st.session_state.get('OPENAI_PARAMS'):
- st.session_state['OPENAI_PARAMS'] = None
-
- if not st.session_state.get('DELAY'):
- st.session_state['DELAY'] = 0
-
- if not st.session_state.get('FINAL_SUMMARY_MODE'):
- st.session_state['FINAL_SUMMARY_MODE'] = False
-
- if not st.session_state.get('CAPTION_LANGUAGES'):
- st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages
-
- if not st.session_state.get('PREVIOUS_RESULTS'):
- st.session_state['PREVIOUS_RESULTS'] = None
-
- if not st.session_state.get('MANIFEST'):
- st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json")
\ No newline at end of file
diff --git a/src/Components/__init__.py b/src/Components/__init__.py
deleted file mode 100644
index 9391db9..0000000
--- a/src/Components/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from Components import sidebar
-from Components import StreamlitSetup
-from Components import Info
-__all__ = ['sidebar', 'StreamlitSetup', 'Info']
\ No newline at end of file
diff --git a/src/Components/sidebar.py b/src/Components/sidebar.py
deleted file mode 100644
index b5b3d41..0000000
--- a/src/Components/sidebar.py
+++ /dev/null
@@ -1,187 +0,0 @@
-import streamlit as st
-import GPT
-import Modules.file_io as file_io
-from streamlit_toggle import st_toggle_switch
-import Components
-from typing import Any, Dict, List, Tuple, Union
-import json
-
-
-def set_openai_api_key(api_key: str):
- st.session_state["OPENAI_API_KEY"] = api_key
-
-
-def set_openai_persona(persona_rec: str, persona_sum: str):
- st.session_state["OPENAI_PERSONA_REC"] = persona_rec
- st.session_state["OPENAI_PERSONA_SUM"] = persona_sum
-
-
-def set_param(params: GPT.param):
- st.session_state["OPENAI_PARAMS"] = params
-
-
-def set_chunk_size(size: int):
- st.session_state['CHUNK_SIZE'] = size
-
-
-def set_delay(time: int):
- st.session_state['DELAY'] = time
-
-
-def set_final_summary_mode(mode: bool):
- st.session_state['FINAL_SUMMARY_MODE'] = mode
-
-
-def _set_config(config_file, key: str, default_value):
- if config_file:
- return file_io.read_json_upload(config_file, key)
- else:
- return default_value
-
-def _set_language(language: str):
- st.session_state['OUTPUT_LANGUAGE'] = language
-
-def _set_legacy(enable: bool):
- st.session_state['LEGACY'] = enable
-def _legacy(enable: bool, legacy, experimental):
- if not enable:
- return experimental
- else:
- return legacy
-def _extract_prompt(json_data: List[Dict[str,Union[bool, str]]], target_type: str, target_legacy: bool, language: str = "English") -> str | None:
- for item in json_data:
- if item["type"] == target_type and item["legacy"] == target_legacy:
- prompt = item["prompt"]
- new_prompt = prompt.replace("[LANGUAGE]", language)
- return new_prompt
- return None
-
-def sidebar():
- with st.sidebar:
- st.markdown("## How to use\n"
- "1. π Enter your [OpenAI API key](https://beta.openai.com/account/api-keys)\n"
- "2. π upload your file\n"
- "3. π Run\n"
- "---")
-
- config_file = st.file_uploader("π Import Configs", type=['json'])
-
- api_input = st.text_input(label="π OpenAI API Key",
- placeholder="Enter your OpenAI API key (sk-...)",
- type="password",
- help="You can get your API key from https://beta.openai.com/account/api-keys",
- value=_set_config(config_file, "OPENAI_API_KEY", ""))
-
- enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False))
- enable_final_summary = st_toggle_switch(label="Enable Final Summary",
- default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False))
- if enable_final_summary:
- set_final_summary_mode(True)
- if st.session_state['FINAL_SUMMARY_MODE'] != enable_final_summary:
- set_final_summary_mode(enable_final_summary)
-
- with st.expander('π€ Bot Persona'):
- language_options = ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German']
- language_index = language_options.index(_set_config(config_file, "LANGUAGE", 'English'))
- language = st.selectbox('Language', options=language_options, index=language_index)
- _set_language(language)
-
- prompts = file_io.read_json("resources/prompt.json")
-
- persona_rec_legacy = _extract_prompt(prompts, "recursive", True, language)
- persona_rec = _extract_prompt(prompts, "recursive", False, language)
- persona_rec = st.text_area('Bot Persona Recursive',
- value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, persona_rec_legacy, persona_rec)),
- help='System message is a pre-defined message used to instruct the assistant at the '
- 'beginning of a conversation. iterating and '
- 'experimenting with potential improvements can help to generate better outputs.'
- 'Make sure to use casual language.',
- height=250)
- if enable_final_summary:
- persona_sum_legacy = _extract_prompt(prompts, "final", True, language)
- persona_sum = _extract_prompt(prompts, "final", False, language)
-
- persona_sum = st.text_area('Bot Persona Total Sum',
- value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, persona_sum_legacy, persona_sum)),
- help='This is a pre-defined message for total summarization that is used to'
- 'instruct the assistant at the beginning of a conversation. ',
- height=300)
- else:
- persona_sum = ""
-
- with st.expander('π₯ Advanced Options'):
- model_options = ['gpt-3.5-turbo','gpt-3.5-turbo-16k', 'gpt-4']
- model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo'))
- model = st.selectbox("Model", options=model_options, index=model_index)
-
- if model == 'gpt-4':
- max_chunk = 4000
- elif model == 'gpt-3.5-turbo-16k':
- max_chunk = 16000
- else:
- max_chunk = 2500
- chunk_size = st.slider('Chunk Size (word count)', min_value=0, max_value=max_chunk, step=20,
- value=_set_config(config_file, "CHUNK_SIZE", 800))
- max_tokens_rec = st.slider('Max Tokens - Recursive Summary', min_value=0, max_value=4090, step=20,
- value=_set_config(config_file, "MAX_TOKENS_REC", 250))
- if enable_final_summary:
- max_tokens_final = st.slider('Max Tokens - Final Summary', min_value=0, max_value=4090, step=20,
- value=_set_config(config_file, "MAX_TOKENS_FINAL", 650))
- else:
- max_tokens_final = 0
- temperature = st.slider('Temperature', min_value=0.0, max_value=1.0, step=0.05,
- value=_set_config(config_file, "TEMPERATURE", 0.7))
- top_p = st.slider('Top P', min_value=0.0, max_value=1.0, step=0.05,
- value=_set_config(config_file, "TOP_P", 1.0))
- frequency_penalty = st.slider('Frequency Penalty', min_value=0.0, max_value=2.0, step=0.1,
- value=_set_config(config_file, "FREQUENCY_PENALTY", 0.0))
- presence_penalty = st.slider('Presence Penalty', min_value=0.0, max_value=2.0, step=0.1,
- value=_set_config(config_file, "PRESENCE_PENALTY", 0.0))
- if st_toggle_switch(label="Delay (free openAI API user)",
- default_value=_set_config(config_file, "ENABLE_DELAY", False)):
- delay = st.slider('Delay (seconds)', min_value=0, max_value=60, step=1,
- value=_set_config(config_file, "DELAY_TIME", 1))
- else:
- delay = 0
- param = GPT.param.gpt_param(
- model=model,
- max_tokens_final=max_tokens_final,
- max_tokens_rec=max_tokens_rec,
- temperature=temperature,
- top_p=top_p,
- frequency_penalty=frequency_penalty,
- presence_penalty=presence_penalty
- )
-
- st.download_button(label="π₯ Export Configs",
- data=json.dumps({
- "OPENAI_API_KEY": api_input,
- "FINAL_SUMMARY_MODE": enable_final_summary,
- "OPENAI_PERSONA_REC": persona_rec,
- "OPENAI_PERSONA_SUM": persona_sum,
- "CHUNK_SIZE": chunk_size,
- "MAX_TOKENS_REC": max_tokens_rec,
- "MAX_TOKENS_FINAL": max_tokens_final,
- "TEMPERATURE": temperature,
- "TOP_P": top_p,
- "FREQUENCY_PENALTY": frequency_penalty,
- "PRESENCE_PENALTY": presence_penalty,
- "MODEL": model,
- "ENABLE_DELAY": delay > 0,
- "DELAY_TIME": delay,
- "LANGUAGE": language,
- "LEGACY": enable_legacy
- }, indent=4),
- file_name="configs.json")
- Components.Info.info()
-
- if api_input:
- set_openai_api_key(api_input)
-
- if persona_rec:
- set_openai_persona(persona_rec, persona_sum)
-
- set_chunk_size(chunk_size)
- set_param(param)
- set_delay(delay)
- _set_legacy(enable_legacy)
\ No newline at end of file
diff --git a/src/Data/__init__.py b/src/Data/__init__.py
deleted file mode 100644
index 4de9124..0000000
--- a/src/Data/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from Data import caption_languages
-
-__all__ = ['caption_languages']
\ No newline at end of file
diff --git a/src/Data/caption_languages.py b/src/Data/caption_languages.py
deleted file mode 100644
index acec65e..0000000
--- a/src/Data/caption_languages.py
+++ /dev/null
@@ -1,6 +0,0 @@
-languages = [
- 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th'
-]
-
-auto_languages = ['a.' + _language for _language in languages]
-
diff --git a/src/GPT/__init__.py b/src/GPT/__init__.py
deleted file mode 100644
index 0bcd76d..0000000
--- a/src/GPT/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from GPT import misc
-from GPT import embeddings
-from GPT import bot
-from GPT import param
-from GPT import generate
-
-__all__ = ['misc', 'embeddings', 'bot', 'param', 'generate']
\ No newline at end of file
diff --git a/src/GPT/bot.py b/src/GPT/bot.py
deleted file mode 100644
index ce8dd86..0000000
--- a/src/GPT/bot.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import openai
-from typing import Any, Dict, List, Tuple, Union
-
-
-class OpenAIChatBot:
- """A class to interact with the OpenAI API."""
-
- def __init__(self, api_key: str, persona: str, model: str, max_tokens: int, temperature: float, top_p: float,
- frequency_penalty: float, presence_penalty: float):
- openai.api_key = api_key
- self.persona = persona
- self.model = model
- self.max_tokens = max_tokens
- self.temperature = temperature
- self.top_p = top_p
- self.frequency_penalty = frequency_penalty
- self.presence_penalty = presence_penalty
-
- def chat_stream(self, prompt: str) -> openai.api_resources.chat_completion.ChatCompletion:
- """Returns the streamed response from the OpenAI API."""
- completions = openai.ChatCompletion.create(
- model=self.model,
- max_tokens=self.max_tokens,
- temperature=self.temperature,
- top_p=self.top_p,
- frequency_penalty=self.frequency_penalty,
- presence_penalty=self.presence_penalty,
- stream=True,
- messages=[
- {"role": "system", "content": self.persona},
- {"role": "user", "content": prompt}
- ])
- return completions
-
- def chat(self, prompt: str) -> Tuple[str, str]:
- """Returns the response from the OpenAI API."""
- completions = openai.ChatCompletion.create(
- model=self.model,
- max_tokens=self.max_tokens,
- temperature=self.temperature,
- top_p=self.top_p,
- frequency_penalty=self.frequency_penalty,
- presence_penalty=self.presence_penalty,
- messages=[
- {"role": "system", "content": self.persona},
- {"role": "user", "content": f"{self.persona} '{prompt}'"}
- ])
- return completions['choices'][0]['message']['content'], completions['choices'][0]['finish_reason']
diff --git a/src/GPT/embeddings.py b/src/GPT/embeddings.py
deleted file mode 100644
index 3e6cb50..0000000
--- a/src/GPT/embeddings.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import openai
-
-
-class openAIEmbeddings:
- def __init__(self, api_key: str):
- openai.api_key = api_key
-
- def embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> float:
- """Returns the embedding vector of a string."""
- response = openai.Embedding.create(input=content, engine=engine)
- vector = response['data'][0]['embedding']
- return vector
diff --git a/src/GPT/generate.py b/src/GPT/generate.py
deleted file mode 100644
index 627faf6..0000000
--- a/src/GPT/generate.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import GPT.bot
-import streamlit as st
-import GPT.param
-from typing import Any, Dict, List, Tuple, Union
-
-
-def get_answer_stream(content: str):
- """Returns a stream of responses from the OpenAI API."""
- params = st.session_state["OPENAI_PARAMS"]
- previous_char = ''
- bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"],
- st.session_state["OPENAI_PERSONA"],
- params.model,
- params.max_tokens_rec,
- params.temperature,
- params.top_p,
- params.frequency_penalty,
- params.presence_penalty)
- responses = bot.chat_stream(content)
- response_panel = st.empty()
- for response_json in responses:
- choice = response_json['choices'][0]
- if choice['finish_reason'] == 'stop':
- break
-
- # error handling
- if choice['finish_reason'] == 'length':
- st.warning('⚠️Result cut off due to length. Consider increasing the max tokens parameter.')
- break
-
- delta = choice['delta']
- if 'role' in delta or delta == {}:
- char = ''
- else:
- char = delta['content']
- answer = previous_char + char
- response_panel.info(answer)
-
-
-def get_answer(content: str, max_tokens, persona: str) -> Tuple[str, str]:
- """Returns a response from the OpenAI API."""
- params = st.session_state["OPENAI_PARAMS"]
- bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"],
- persona,
- params.model,
- max_tokens,
- params.temperature,
- params.top_p,
- params.frequency_penalty,
- params.presence_penalty)
- response, finish_reason = bot.chat(content)
- return response, finish_reason
diff --git a/src/GPT/misc.py b/src/GPT/misc.py
deleted file mode 100644
index b93481c..0000000
--- a/src/GPT/misc.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import openai
-from langchain.llms import OpenAI
-import os
-import streamlit as st
-from typing import Any, Dict, List, Tuple, Union
-
-
-def validate_api_key(api_key: str) -> bool:
- """Validates the OpenAI API key by trying to create a completion."""
- openai.api_key = api_key
- try:
- openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- max_tokens=1,
- messages=[
- {"role": "user", "content": "Hello!"}
- ]
- )
- return True
- except openai.error.AuthenticationError:
- return False
-
-
-def predict_token(param, chunks) -> Dict[str, int]:
- """predict how many tokens to generate."""
- if st.session_state["OPENAI_API_KEY"] is not None:
- os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
- llm = OpenAI()
- prompt_token_total = 0
- completion_token_total = 0
- for chunk in chunks:
- prompt_token = llm.get_num_tokens(chunk['content'])
- prompt_token_total += prompt_token
- completion_token_total += param.max_tokens_rec
-
- if st.session_state['FINAL_SUMMARY_MODE']:
- completion_token_total += param.max_tokens_final
- total_token = prompt_token_total + completion_token_total
- token = {'total': total_token,
- 'prompt': prompt_token_total,
- 'completion': completion_token_total}
-
- return token
- else:
- return {'total': 0, 'prompt': 0, 'completion': 0}
-
-
-def predict_token_single(chunk: Dict[str, Union[str, float]] | str, max_tokens: int = None) -> int:
- """predict how many tokens to generate."""
- if st.session_state["OPENAI_API_KEY"] is not None:
- os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
- llm = OpenAI()
- if isinstance(chunk, str):
- chunk_content = chunk
- else:
- chunk_content = chunk['content']
- chunk_token = llm.get_num_tokens(chunk_content)
- if max_tokens is not None:
- chunk_token += max_tokens
-
- return chunk_token
- else:
- return 0
-
-
-def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[bool, str]]:
- """Checks if the number of tokens used has exceeded the limit."""
-
- # check recursive chunks tokens
- rec_chunks_token = []
- for chunk in chunks:
- chunk_token = predict_token_single(chunk, param.max_tokens_rec)
- rec_chunks_token.append(chunk_token)
-
-
- # check final chunks tokens
- final_prompt_token = len(chunks) * param.max_tokens_rec
- final_completion_token = param.max_tokens_final
- final_chunks_token = final_prompt_token + final_completion_token
-
- # evaluate
- if max(rec_chunks_token) > max_token:
- return {'exceeded': True,
- 'reason': 'recursive',
- 'message': f"**[ Recursive summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {max(rec_chunks_token)}\n"
- f"(Prompt: {max(rec_chunks_token) - param.max_tokens_rec}, "
- f"Completion: {param.max_tokens_rec})"}
-
- elif final_chunks_token > max_token and st.session_state['FINAL_SUMMARY_MODE']:
- return {'exceeded': True,
- 'reason': 'final',
- 'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n"
- f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"}
-
- else:
- return {'exceeded': False,
- 'reason': '',
- 'message': ''}
diff --git a/src/GPT/param.py b/src/GPT/param.py
deleted file mode 100644
index 866f112..0000000
--- a/src/GPT/param.py
+++ /dev/null
@@ -1,11 +0,0 @@
-
-class gpt_param:
- def __init__(self, model: str, max_tokens_final: int, max_tokens_rec: int, temperature: float, top_p: float,
- frequency_penalty: float, presence_penalty: float):
- self.model = model
- self.max_tokens_rec = max_tokens_rec
- self.max_tokens_final = max_tokens_final
- self.temperature = temperature
- self.top_p = top_p
- self.frequency_penalty = frequency_penalty
- self.presence_penalty = presence_penalty
diff --git a/src/Modules/Youtube.py b/src/Modules/Youtube.py
deleted file mode 100644
index f399cb3..0000000
--- a/src/Modules/Youtube.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import requests
-import re
-from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
-import streamlit as st
-from typing import Any, Dict, List, Tuple, Union
-
-
-manifest = st.session_state["MANIFEST"]
-def _error_report_msg(youtube_url):
- return f"Please create an issue on [GitHub]({manifest['bugs']['url']}). " \
- f"Please include the YouTube URL ({youtube_url}), version number ({manifest['version']}) " \
- f"and all necessary information to replicate the error. " \
- f"**Before creating a new issue, please check if the problem has already been reported.**"
-
-def _extract_video_id_from_url(url):
- video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/)([^?&"\'>]+)'
- match = re.search(video_id_pattern, url)
- if match:
- return match.group(1)
- else:
- raise ValueError("Invalid YouTube URL")
-
-def get_video_title(youtube_url):
- video_id = _extract_video_id_from_url(youtube_url)
- url = f'https://www.youtube.com/watch?v={video_id}'
- response = requests.get(url)
- title_pattern = r'<title>(.+?) - YouTube<\/title>'
- match = re.search(title_pattern, response.text)
- if match:
- title = match.group(1)
- return title
- else:
- return None
-
-def get_available_subtitle_languages(video_id):
- try:
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
- languages = [transcript.language_code for transcript in transcript_list]
- return languages
- except Exception as e:
- print(f"Error fetching available subtitle languages: {e}")
- return []
-
-def get_video_captions(youtube_url, languages):
- video_id = _extract_video_id_from_url(youtube_url)
- simplified_url = f'https://www.youtube.com/watch?v={video_id}'
-
- available_language = get_available_subtitle_languages(video_id)
-
- if not any(lang in languages for lang in available_language) and available_language != []:
- print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.")
- st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url))
- st.stop()
-
- for language in languages:
- try:
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
- captions = ""
- for item in transcript:
- captions += item['text'] + "\n"
- return captions
-
- except NoTranscriptFound as e:
- if language == languages[-1]:
- print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}")
- st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n'
- f'languages = {available_language}\n\n'
- f'language list = {languages}\n\n'
- + _error_report_msg(simplified_url))
- st.stop()
- else:
- continue
-
- except TranscriptsDisabled:
- print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}")
- st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---'
- f'\n**Instruction:**\n\n'
- f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n'
- f"2. If you are confident that subtitles are available in the video but could not be retrieved, "
- + _error_report_msg(simplified_url))
- st.stop()
- raise TranscriptsDisabled
-
- except Exception as e:
- print(e)
- st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. \n\n'
- f'{_error_report_msg(simplified_url)}'
- f'\n\nError: \n\n---\n\n{e}')
- st.stop()
- break
-
-@st.cache_data(show_spinner=False)
-def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]:
- """Extracts the transcript from a YouTube video."""
- transcript = get_video_captions(url, lang_code)
- title = get_video_title(url)
- return transcript, title
diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py
deleted file mode 100644
index 412ace4..0000000
--- a/src/Modules/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from Modules import file_io
-
-__all__ = ['file_io']
\ No newline at end of file
diff --git a/src/Modules/file_io.py b/src/Modules/file_io.py
deleted file mode 100644
index 0f214e8..0000000
--- a/src/Modules/file_io.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import re
-import PyPDF4
-import docx
-from typing import Any, Dict, List, Tuple, Union
-from pydub import AudioSegment
-import math
-import json
-import streamlit as st
-
-
-
-@st.cache_data()
-def read_json(file, key: str = None) -> Any:
- """Reads a json file and returns the value of a key."""
- with open(file, "r") as f:
- data = json.load(f)
- if key and isinstance(data, dict):
- return data[key]
- elif key and isinstance(data, list):
- return [d[key] for d in data]
- else:
- return data
-
-
-@st.cache_data()
-def read_json_upload(file, key: str) -> Any:
- """Reads a json file and returns the value of a key."""
- if not isinstance(file, str):
- f = file.getvalue().decode("utf-8")
- data = json.loads(f)
- return data[key]
-
-
-@st.cache_data()
-def read_txt(file, encoding: str = "utf-8") -> str:
- """Reads a text file."""
- return file.read().decode(encoding)
-
-
-@st.cache_data()
-def read_pdf(file) -> List[str]:
- """Reads a pdf file."""
- pdfReader = PyPDF4.PdfFileReader(file, strict=False)
- texts = []
- for page in range(pdfReader.numPages):
- text = pdfReader.getPage(page).extractText()
- # Merge hyphenated words
- text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
- # Fix newlines in the middle of sentences
- text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
- texts.append(text)
- return texts
-
-
-@st.cache_data()
-def read_docx(file) -> str:
- """Reads a docx file."""
- doc = docx.Document(file)
- text = ""
- for para in doc.paragraphs:
- # Remove multiple newlines
- t = re.sub(r"\n\s*\n", "\n\n", para.text)
- text += t + "\n"
- return text
-
-
-@st.cache_data()
-def _split_audio(audio, chunk_size=2) -> List[AudioSegment]:
- """Split audio into chunks of 10 minutes."""
- # load audio
- audio = AudioSegment.from_file(audio, format="mp3")
- # Define the chunk size (10 minutes default)
- chunk_size = chunk_size * 60 * 1000
- # calculate the number of chunks
- num_chunks = math.ceil(len(audio) / chunk_size)
- chunks = []
- # split audio into chunks
- for i in range(num_chunks):
- start = i * chunk_size
- end = start + chunk_size
- chunk = audio[start:end]
- chunks.append(chunk)
- return chunks
-
-
-@st.cache_data()
-def read(file) -> str | List[str]:
- """Reads a file and returns the content."""
- if file.name.endswith(".txt") or file.name.endswith(".md"):
- return read_txt(file)
- elif file.name.endswith(".pdf"):
- return read_pdf(file)
- elif file.name.endswith(".docx"):
- return read_docx(file)
- else:
- raise ValueError("File type not supported")
diff --git a/src/SumGPT.py b/src/SumGPT.py
deleted file mode 100644
index 355ebd4..0000000
--- a/src/SumGPT.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import asyncio
-
-import streamlit as st
-
-import Components.StreamlitSetup as StreamlitSetup
-
-StreamlitSetup.setup()
-
-import time # noqa: E402
-
-import GPT # noqa: E402
-import Modules.file_io as file_io # noqa: E402
-import Modules.Youtube # noqa: E402
-import utils.helpers as helpers # noqa: E402
-from Components.sidebar import sidebar # noqa: E402
-
-app_header = st.container()
-
-file_handler = st.container()
-content_handler = st.container()
-result_handler = st.container()
-
-with app_header:
- st.title("📝 SumGPT")
- st.markdown("##### Summarize your text with OpenAI's GPT-3.5 / GPT-4 API")
- st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)")
- st.warning(
- "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo."
- )
-
-sidebar()
-
-with file_handler:
- if st.button("π Refresh"):
- st.cache_data.clear()
- youtube_link_empty = st.empty()
- upload_file_emtpy = st.empty()
-
- youtube_link = youtube_link_empty.text_input(
- label="π YouTube Link",
- placeholder="Enter your YouTube link",
- help="Enter your YouTube link to download the video and extract the audio",
- )
- upload_file = upload_file_emtpy.file_uploader(
- "π Upload your file", type=["txt", "pdf", "docx", "md"]
- )
- if youtube_link:
- upload_file_emtpy.empty()
- with st.spinner("π Extracting transcript..."):
- transcript, title = Modules.Youtube.extract_youtube_transcript(
- youtube_link, st.session_state["CAPTION_LANGUAGES"]
- )
- file_content = {"name": f"{title}.txt", "content": transcript}
- elif upload_file:
- youtube_link_empty.empty()
- with st.spinner("π Reading file... (mp3 file might take a while)"):
- file_content = {"name": upload_file.name, "content": file_io.read(upload_file)}
- elif youtube_link and upload_file:
- st.warning("Please only upload one file at a time")
- else:
- file_content = None
-
-with content_handler:
- if file_content:
- with st.expander("File Preview"):
- if file_content["name"].endswith(".pdf"):
- content = "\n\n".join(file_content["content"])
- st.text_area(file_content["name"], content, height=200)
- else:
- content = file_content["content"]
- st.text_area(file_content["name"], content, height=200)
-
-with result_handler:
- if file_content:
- chunks = []
- content = file_content["content"]
- if file_content["name"].endswith(".pdf"):
- content = "\n\n".join(file_content["content"])
- chunks.extend(helpers.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"]))
-
- with st.expander(f"Chunks ({len(chunks)})"):
- for chunk in chunks:
- st.write(chunk)
-
- token_usage = GPT.misc.predict_token(st.session_state["OPENAI_PARAMS"], chunks)
- param = st.session_state["OPENAI_PARAMS"]
- prompt_token = token_usage["prompt"]
- completion_token = token_usage["completion"]
- if param.model == "gpt-4":
- price = round(prompt_token * 0.00003 + completion_token * 0.00006, 5)
- elif param.model == "gpt-3.5-turbo-16k":
- price = round(prompt_token * 0.000003 + completion_token * 0.000004, 5)
- else:
- price = round(prompt_token * 0.0000015 + completion_token * 0.000002, 5)
- st.markdown(
- f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`"
- )
- # max tokens exceeded warning
- exceeded = helpers.exceeded_token_handler(
- param=st.session_state["OPENAI_PARAMS"], chunks=chunks
- )
-
- # load cached results
- if st.session_state["PREVIOUS_RESULTS"] is not None:
- rec_responses = st.session_state["PREVIOUS_RESULTS"]["rec_responses"]
- rec_id = st.session_state["PREVIOUS_RESULTS"]["rec_ids"]
- final_response = st.session_state["PREVIOUS_RESULTS"]["final_response"]
- finish_reason_rec = st.session_state["PREVIOUS_RESULTS"]["finish_reason_rec"]
- finish_reason_final = st.session_state["PREVIOUS_RESULTS"]["finish_reason_final"]
- else:
- rec_responses = None
- rec_id = None
- final_response = None
- finish_reason_rec = None
- finish_reason_final = None
-
- # finish_reason_rec = None
- if st.button("π Run", disabled=exceeded):
- start_time = time.time()
- st.cache_data.clear()
- API_KEY = st.session_state["OPENAI_API_KEY"]
- if not API_KEY and not GPT.misc.validate_api_key(API_KEY):
- st.error(
- "❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys)."
- )
- else:
- with st.spinner("Summarizing... (this might take a while)"):
- if st.session_state["LEGACY"]:
- rec_max_token = st.session_state["OPENAI_PARAMS"].max_tokens_rec
- rec_responses, finish_reason_rec = helpers.recursive_summarize(
- chunks, rec_max_token
- )
- if st.session_state["FINAL_SUMMARY_MODE"]:
- final_response, finish_reason_final = helpers.summarize(rec_responses)
- else:
- final_response = None
- else:
- completions, final_response = asyncio.run(
- helpers.summarize_experimental_concurrently(
- content, st.session_state["CHUNK_SIZE"]
- )
- )
- rec_responses = [d["content"] for d in completions]
- rec_ids = [d["chunk_id"] for d in completions]
- # save previous completions
- resp = {
- "rec_responses": rec_responses,
- "rec_ids": rec_ids,
- "final_response": final_response,
- "finish_reason_rec": finish_reason_rec,
- "finish_reason_final": finish_reason_final,
- }
- if resp != st.session_state["PREVIOUS_RESULTS"]:
- st.session_state["PREVIOUS_RESULTS"] = resp
-
- end_time = time.time()
- st.markdown(f"β±οΈ Time taken: `{round(end_time - start_time, 2)}s`")
-
- if rec_responses is not None:
- with st.expander(
- "Recursive Summaries", expanded=not st.session_state["FINAL_SUMMARY_MODE"]
- ):
- for i, response in enumerate(rec_responses):
- st.info(f"{response}")
- if finish_reason_rec == "length":
- st.warning(
- "⚠️ Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter."
- )
-
- if final_response is not None:
- st.header("📝 Summary")
- st.info(final_response)
- if finish_reason_final == "length":
- st.warning(
- "⚠️ Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter."
- )
- if final_response is not None or rec_responses is not None:
- helpers.download_results(rec_responses, final_response)
diff --git a/src/manifest.json b/src/manifest.json
deleted file mode 100644
index 731522c..0000000
--- a/src/manifest.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "name": "SumGPT",
- "version": "1.0.8",
- "license": {
- "type": "MIT",
- "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
- },
- "author": "Zeke Zhang",
- "homepage": "https://github.com/sean1832/SumGPT",
- "repository": {
- "type": "git",
- "url": "https://github.com/sean1832/SumGPT"
- },
- "bugs": {
- "url": "https://github.com/sean1832/SumGPT/issues"
- }
-}
\ No newline at end of file
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
deleted file mode 100644
index 5fe7f8d..0000000
--- a/src/utils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from utils import helpers
-
-__all__ = ["helpers"]
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
deleted file mode 100644
index 6fd2618..0000000
--- a/src/utils/helpers.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import os
-import asyncio
-
-import numpy as np
-from typing import Any, Dict, List, Tuple, Union
-
-from GPT.embeddings import openAIEmbeddings
-import streamlit as st
-import re
-import GPT
-import textwrap
-from langdetect import detect
-import time
-from datetime import datetime
-
-from langchain.chat_models import ChatOpenAI
-from langchain.docstore.document import Document
-from langchain.prompts import PromptTemplate
-from langchain.chains.summarize import load_summarize_chain
-from langchain.chains import LLMChain
-
-def _similarity(v1, v2) -> float:
- """Returns the cosine similarity between two vectors.
-
- The plain dot product is sufficient here because OpenAI embeddings are unit-normalized.
- """
- return np.dot(v1, v2)
-
-@st.cache_data(show_spinner=False)
-def _chunk_spliter(content: str, chunk_size: int = 1000, lang_base: str = 'latin') -> List[str]:
- """Splits a string into chunks of a given size."""
-
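- # Split on sentence-ending punctuation (Latin and CJK), keeping the delimiter via a lookbehind.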
- sentences = re.split(r'(?<=[.?!,。？、！，·])\s+', content)
- if lang_base == 'latin':
- chunks = []
- chunk = ''
- word_count = 0
- for sentence in sentences:
- sentence += ' ' # add space at end to compensate for split
- words = sentence.split()
- sentence_word_count = len(words)
- if word_count + sentence_word_count <= chunk_size:
- chunk += sentence
- word_count += sentence_word_count
- else:
- chunks.append(chunk.strip())
- chunk = sentence
- word_count = sentence_word_count
- # add the last chunk
- if chunk:
- chunks.append(chunk.strip())
-
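- # Re-split any chunk that still exceeds the word budget (25-word tolerance) into fixed-size word windows.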
- new_chunks = []
- for c in chunks:
- if c == '':
- continue
- if len(c.split()) > chunk_size + 25:
- words = c.split()
- small_chunks = []
- for i in range(0, len(words), chunk_size):
- small_chunks.append(' '.join(words[i:i + chunk_size]))
- new_chunks.extend(small_chunks)
- else:
- new_chunks.append(c)
- return new_chunks
-
- else:
- chunks = textwrap.wrap(content, width=chunk_size)
- return chunks
-
-
-def language_base(string: str) -> str:
- try:
- lang_code = detect(string)
- latin_based = ['en', 'fr-ca', 'es']
- east_asian_based = ['zh', 'ja', 'ko']
- for lang in latin_based:
- if lang_code.startswith(lang):
- return 'latin'
- for lang in east_asian_based:
- if lang_code.startswith(lang):
- return 'east_asian'
- return 'other'
- except Exception: # langdetect raises LangDetectException when no language can be detected
- return 'other'
-
-@st.cache_data(show_spinner=False)
-def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bool = False) -> List[Dict[str, Any]]:
- """Converts a string into chunks of a given size."""
- chunks_text = _chunk_spliter(content, chunk_size, language_base(content))
- chunks = []
- for i, chunk in enumerate(chunks_text):
- if enable_embedding:
- embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
- chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)})
- else:
- chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i})
- return chunks
-
-
-def search_chunks(query: str, chunks: List[Dict[str, Any]], count: int = 1) -> List[Dict[str, Any]]:
- """Returns the top `count` chunks that are most similar to the query."""
- embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
- vectors = embedding.embedding(query)
- points = []
-
- for chunk in chunks:
- point = _similarity(vectors, chunk['vector'])
- points.append({'content': chunk['content'], 'point': point})
-
- # sort the points in descending order
- ordered = sorted(points, key=lambda x: x['point'], reverse=True)
- return ordered[0:count]
-
-@st.cache_data(show_spinner=False)
-def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document]:
- """Converts a list of chunks into a list of documents."""
- docs = []
- for chunk in chunks:
- content = chunk['content']
- metadata = {'chunk_id': chunk['chunk_id']}
- doc = Document(page_content=content, metadata=metadata)
- docs.append(doc)
- return docs
-
-async def async_generate(chain, chunk) -> Dict[str, Union[str, int]]:
- """Generates a summary asynchronously."""
- resp = await chain.arun(text=chunk['content'])
- return {'content': resp, 'chunk_id': chunk['chunk_id']}
-
-async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]:
- """Summarizes a string asynchronously."""
- os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
- params = st.session_state['OPENAI_PARAMS']
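- # Two model configurations that differ only in max_tokens: one for the per-chunk (recursive) summaries, one for the final merged summary.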
- llm_rec = ChatOpenAI(model_name=params.model,
- max_tokens=params.max_tokens_rec,
- temperature=params.temperature,
- top_p=params.top_p,
- frequency_penalty=params.frequency_penalty,
- presence_penalty=params.presence_penalty)
- llm_final = ChatOpenAI(model_name=params.model,
- max_tokens=params.max_tokens_final,
- temperature=params.temperature,
- top_p=params.top_p,
- frequency_penalty=params.frequency_penalty,
- presence_penalty=params.presence_penalty)
- chunks = convert_to_chunks(content, chunk_size)
-
- REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text'])
- chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT)
-
- tasks = []
- for chunk in chunks:
- task = async_generate(chain, chunk)
- tasks.append(task)
-
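- # Chunk summaries run concurrently; as_completed yields them in finish order, so results are re-sorted by chunk_id before the final pass.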
- outputs_rec = []
- progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}")
- count = 1
- for coro in asyncio.as_completed(tasks):
- output_rec = await coro
- outputs_rec.append(output_rec)
- progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}")
- count += 1
- rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id'])
- if st.session_state['FINAL_SUMMARY_MODE']:
- FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text'])
- chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT)
- docs = convert_to_docs(rec_result)
- final_result = chain.run(docs)
- else:
- final_result = None
- return rec_result, final_result
-
-@st.cache_data(show_spinner=False)
-def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]:
- """Returns a recursive summary of the given content."""
- recursiveSumTexts = []
- finish_reason = ''
- chunks_length = len(chunks)
- count = 0
- progress_bar = st.progress(0)
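- # Legacy path: summarize chunks one at a time, sleeping DELAY seconds between requests to throttle API calls.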
- for chunk in chunks:
- content = chunk['content']
- text, finish_reason = GPT.generate.get_answer(content,
- max_tokens=max_tokens,
- persona=st.session_state['OPENAI_PERSONA_REC'])
- recursiveSumTexts.append(text)
- progress_bar.progress((count + 1) / chunks_length)
- count += 1
- time.sleep(st.session_state['DELAY'])
-
- return recursiveSumTexts, finish_reason
-
-
-@st.cache_data(show_spinner=False)
-def summarize(message: List[str] | str) -> Tuple[str, str]:
- """Returns a summary of the given content."""
- if isinstance(message, list):
- join_msg = ' '.join(message)
- else:
- join_msg = message
-
- params = st.session_state['OPENAI_PARAMS']
- max_asw_tokens_final = params.max_tokens_final
-
- answer, finish_reason = GPT.generate.get_answer(join_msg, max_tokens=max_asw_tokens_final,
- persona=st.session_state['OPENAI_PERSONA_SUM'])
- return answer, finish_reason
-
-
-def download_results(rec_responses, final_response):
- """Downloads the results as a txt file."""
- joint_rec_response = "=====recursive responses=====\n\n" + '\n\n'.join(rec_responses)
- joint_final_response = f"{joint_rec_response}\n\n=====final response=====\n\n{final_response}"
- now = datetime.now()
- if final_response is not None:
- st.download_button("📥 Download Summary",
- joint_final_response,
- file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
- else:
- st.download_button("📥 Download Summary",
- joint_rec_response,
- file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
-
-
-def exceeded_token_handler(param, chunks) -> bool:
- """Handles the case where the user has exceeded the number of tokens."""
- if param.model == 'gpt-4':
- max_token = 8100
- elif param.model == 'gpt-3.5-turbo-16k':
- max_token = 16385
- else:
- max_token = 4096
- info = GPT.misc.is_tokens_exceeded(param, chunks, max_token)
- if info['exceeded']:
- st.error(f"β {info['message']}")
- return True
- else:
- return False