diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 5a85bff..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 0594372..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 474a048..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/sumGPT.iml b/.idea/sumGPT.iml deleted file mode 100644 index a29c46d..0000000 --- a/.idea/sumGPT.iml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/RUN.bat b/RUN.bat index 6afd014..7488c88 100644 --- a/RUN.bat +++ b/RUN.bat @@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" ( echo "Requirements file has not been modified. Skipping update." 
) -streamlit run src/SumGPT.py \ No newline at end of file +streamlit run SumGPT/main.py \ No newline at end of file diff --git a/SumGPT/__init__.py b/SumGPT/__init__.py new file mode 100644 index 0000000..a576e4b --- /dev/null +++ b/SumGPT/__init__.py @@ -0,0 +1,4 @@ +import streamlit as st + +if "summaries" not in st.session_state: + st.session_state["summaries"] = [] diff --git a/SumGPT/app/__init__.py b/SumGPT/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py new file mode 100644 index 0000000..8becb93 --- /dev/null +++ b/SumGPT/app/body_handler.py @@ -0,0 +1,243 @@ +import asyncio +from typing import Any, Dict, List, Optional, Tuple + +import streamlit as st +import utils.io as io +from core.llm import LLM +from core.tokenizer import Tokenizer +from datamodel.chunk import Chunk +from datamodel.llm_params import LLMParams + + +class BodyHandler: + def file_uploader(self, type: List[str] = ["txt"]) -> List[Dict[str, str]]: + uploaded_files = st.file_uploader("Upload a file", type=type, accept_multiple_files=True) + files = [] + if uploaded_files is None: + st.stop() + st.warning("File is not uploaded.") + for file in uploaded_files: + text = io.read_to_string(file) + filename = file.name + files.append({"filename": filename, "text": text}) + return files + + def segment_text( + self, text: str, chunk_size: int, model: str, input_id: int + ) -> Tuple[List[Chunk], int]: + chunks: List[Chunk] = [] + tokenizer = Tokenizer(model) + total_tokens = tokenizer.tokenize(text) + count = 0 + for i in range(0, len(total_tokens), chunk_size): + chunk_tokens = total_tokens[i : i + chunk_size] + content = tokenizer.detokenize(chunk_tokens) + chunks.append(Chunk(count, content, len(chunk_tokens), input_id)) + count += 1 + return chunks, len(total_tokens) + + def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]: + completion_tokens = response_meta.get("token_usage", {}).get("completion_tokens", 0) + prompt_tokens = response_meta.get("token_usage", {}).get("prompt_tokens", 0) + cached_tokens = ( + response_meta.get("token_usage", {}) + .get("prompt_tokens_details", {}) + .get("cached_tokens", 0) + ) + return completion_tokens, prompt_tokens, cached_tokens + + def generate( + self, + chunks: List[Chunk], + gpt_params: LLMParams, + role: str, + api_key: Optional[str], + ) -> None: + generate_button = st.button("Generate summary") + if generate_button: + if not api_key: + st.error("❌ Please enter your OpenAI API key in the sidebar.") + return + if not role: + st.error("❌ Please enter a role description in the sidebar.") + return + + st.session_state["summaries"] = [] # Initialize or reset summaries + + progress_text = st.empty() + progress_bar = st.progress(0) + total_chunks = len(chunks) + + # Group chunks by filename + filename_chunks = {} + for chunk in chunks: + if chunk.filename not in filename_chunks: + filename_chunks[chunk.filename] = [] + filename_chunks[chunk.filename].append(chunk) + + llm = LLM(api_key, gpt_params) + processed_chunks = 0 + + # Process chunks by filename + for filename, file_chunks in filename_chunks.items(): + expander = st.expander(f"{filename}") + for chunk in file_chunks: + processed_chunks += 1 + progress_text.write(f"Generating summaries {processed_chunks}/{total_chunks}") + progress_bar.progress(processed_chunks / total_chunks) + + summary = llm.generate(chunk.content, role) + with expander: + with st.chat_message("πŸ€–"): + st.write(summary.content) + completion_tokens, 
prompt_tokens, cached_tokens = self._get_tokens( + summary.response_metadata + ) + price = round( + llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 + ) + st.write( + f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" + ) + # Store the summary in session state + st.session_state["summaries"].append( + { + "filename": filename, + "content": summary.content, + "tokens": completion_tokens + prompt_tokens, + "price": price, + } + ) + + progress_text.write("βœ… All chunks processed!") + progress_bar.progress(1.0) + else: + # Check if summaries exist in session state and display them + if "summaries" in st.session_state: + # Group summaries by filename + filename_summaries = {} + for summary_data in st.session_state["summaries"]: + filename = summary_data["filename"] + if filename not in filename_summaries: + filename_summaries[filename] = [] + filename_summaries[filename].append(summary_data) + + # Display summaries grouped by filename + for filename, summaries in filename_summaries.items(): + with st.expander(f"{filename}"): + for summary_data in summaries: + with st.chat_message("πŸ€–"): + st.write(summary_data["content"]) + st.write( + f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" + ) + + def agenerate( + self, + chunks: List[Chunk], + gpt_params: LLMParams, + role: str, + api_key: Optional[str], + ) -> None: + generate_button = st.button("Generate summary") + if generate_button: + if not api_key: + st.error("❌ Please enter your OpenAI API key in the sidebar.") + return + if not role: + st.error("❌ Please enter a role description in the sidebar.") + return + + st.session_state["summaries"] = [] # Initialize or reset summaries + + async def process_chunks(): + llm = LLM(api_key, gpt_params) + total_chunks = len(chunks) + progress_text = st.empty() + progress_text.write(f"Generating summaries 0/{total_chunks}") + total_price_text = st.empty() + total_price = 0 + + progress_bar = st.progress(0) + completed_chunks = 0 + + # Sort chunks by chunk.id + sorted_chunks = sorted(chunks, key=lambda c: c.id) + + # Group chunks by filename + filename_chunks = {} + for chunk in sorted_chunks: + if chunk.filename not in filename_chunks: + filename_chunks[chunk.filename] = [] + filename_chunks[chunk.filename].append(chunk) + + # Create expanders for each file + expanders = { + filename: st.expander(f"{filename}") for filename in filename_chunks.keys() + } + + # Create tasks for all chunks (sorted by chunk.id) + tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks] + + # Run all tasks and get the results in the same order + summaries = await asyncio.gather(*tasks) + + # Process the results in order + for summary, current_chunk in zip(summaries, sorted_chunks): + completed_chunks += 1 + progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}") + progress_bar.progress(completed_chunks / total_chunks) + + with expanders[current_chunk.filename]: + with st.chat_message("ai"): + st.write(summary.content) + completion_tokens, prompt_tokens, cached_tokens = self._get_tokens( + summary.response_metadata + ) + price = round( + llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 + ) + st.write( + f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" + ) + total_price += price + + # Store the summary in session state + st.session_state["summaries"].append( + { + "filename": current_chunk.filename, + "content": summary.content, + "tokens": completion_tokens + prompt_tokens, + "price": price, + 
} + ) + + progress_text.write("βœ… All chunks processed!") + progress_bar.progress(1.0) + total_price_text.write(f"Total price: `${round(total_price, 6)}`") + + # Run the async processing + asyncio.run(process_chunks()) + else: + # Check if summaries exist in session state and display them + if "summaries" in st.session_state: + total_price = 0 + # Group summaries by filename + filename_summaries = {} + for summary_data in st.session_state["summaries"]: + filename = summary_data["filename"] + if filename not in filename_summaries: + filename_summaries[filename] = [] + filename_summaries[filename].append(summary_data) + + # Display summaries grouped by filename + for filename, summaries in filename_summaries.items(): + with st.expander(f"{filename}"): + for summary_data in summaries: + with st.chat_message("ai"): + st.write(summary_data["content"]) + st.write( + f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" + ) + total_price += summary_data["price"] + st.write(f"Total price: `${round(total_price, 6)}`") diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py new file mode 100644 index 0000000..aed232b --- /dev/null +++ b/SumGPT/app/page.py @@ -0,0 +1,69 @@ +from typing import Dict, List, Optional + +import streamlit as st +from datamodel.llm_params import LLMParams + +from app.body_handler import BodyHandler +from app.sidebar_handler import SidebarHandler + + +class Page: + def __init__(self): + self.chunk_size: Optional[int] = None + self.role: Optional[str] = None + self.api_key: Optional[str] = None + self.llm_params: Optional[LLMParams] = None + + def draw_header(self, version): + st.title(f"πŸ“ SumGPT {version}") + st.markdown("##### Summarize your text with OpenAI's API") + st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)") + st.warning( + "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo." 
+ ) + + def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str]]) -> None: + with st.sidebar: + sb = SidebarHandler() + sb.header() + sb.import_config() + self.api_key = sb.api_key_entry() + with st.expander("Role settings"): + self.role = sb.role_settings_panel() + with st.expander("Configuration"): + self.llm_params, self.chunk_size = sb.config_control_panel(models_data) + sb.export_config() + sb.footer(manifest) + + def draw_body(self) -> None: + if not self.chunk_size: + st.error("❌ Please set the chunk size in the sidebar.") + return + if not self.llm_params: + st.error("❌ Please set the model in the sidebar.") + return + if not self.role: + st.error("❌ Please set the role in the sidebar.") + return + + body = BodyHandler() + texts = body.file_uploader(["txt", "md"]) + + total_chunks = [] + filenames = [] + + for idx, text in enumerate(texts): + filename = text["filename"] + filenames.append(filename) + chunks, total_token_size = body.segment_text( + text["text"], self.chunk_size, self.llm_params.model.name, idx + ) + with st.expander(f"`{filename}` **(chunks: {len(chunks)})**"): + for chunk in chunks: + chunk.set_filename_from_list(filenames) + st.write([chunk.to_dict() for chunk in chunks]) + st.write(f"Tokens: `{total_token_size}`") + + total_chunks.extend(chunks) + + body.agenerate(total_chunks, self.llm_params, self.role, self.api_key) diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py new file mode 100644 index 0000000..378e681 --- /dev/null +++ b/SumGPT/app/sidebar_handler.py @@ -0,0 +1,100 @@ +from typing import Any, Dict, List, Tuple + +import streamlit as st +import utils.helpers as helpers +from datamodel.llm_params import LLMModel, LLMParams + + +class SidebarHandler: + def __init__(self): + self.config = {} + self.chunk_size = None + + def header(self): + st.title("SumGPT") + st.markdown("Select the model and parameters for summarization.") + + def api_key_entry(self) -> str: + st.markdown("### API Key") + return st.text_input("Enter your OpenAI API key", type="password") + + def role_settings_panel(self, height=300) -> str: + language = st.selectbox( + "Role language", + ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"], + ) + role = st.text_area( + "Role settings", + self.config.get( + "role", + f"Write a detailed summary in perfect {language} that is concise, clear and coherent while capturing the main ideas the text. " + "The summary should be well-structured and free of grammatical errors.\n\n" + "The summary is to be written in markdown format, with a heading (###) that encapsulate the core concept of the content. It should be concise and specific. 
avoid generic headings like 'Summary' or 'Introduction'.", + ), + height=height, + ) + if role is None: + st.stop() + st.warning("Role settings are not set.") + return role + + def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]: + model_names = helpers.extract_values(models_data, "model") + model_name = st.selectbox("Model", model_names, self.config.get("model_index", 0)) + model = LLMModel.construct_from_dict(self._get_model_dict(models_data, model_name)) + + _param = self._construct_param(models_data, model_name) + + chunk_size = st.number_input( + "Chunk size (tokens)", + 32, + _param["context_window"], + self.config.get("chunk_size", 2048), + step=1024, + ) + max_tokens: int = st.number_input( + "Max output (tokens)", + 32, + _param["max_output_tokens"], + self.config.get("max_tokens", 512), + ) + temperature: float = st.slider("Temperature", 0.0, 1.0, self.config.get("temperature", 0.7)) + return ( + LLMParams( + model=model, + max_tokens=max_tokens, + temperature=temperature, + ), + chunk_size, + ) + + def _get_model_dict(self, models_data, selected_model) -> Dict[str, Any]: + model_index = helpers.extract_dict_index(models_data, "model", selected_model) + return models_data[model_index] + + def _construct_param(self, models_data, selected_model): + model_dict = self._get_model_dict(models_data, selected_model) + param = { + "max_output_tokens": model_dict["max_output_tokens"], + "context_window": model_dict["context_window"], + } + return param + + def import_config(self): + st.markdown("### Import Configuration") + if st.button("Import configuration"): + raise NotImplementedError # TODO: implement + + def export_config(self): + st.markdown("### Export Configuration") + if st.button("Export configuration"): + raise NotImplementedError # TODO: implement + + def footer(self, data: Dict[str, Any]): + st.markdown("---") + st.markdown("### SumGPT") + st.markdown(f"Version: `{data.get('version')}`") + st.markdown(f"Author: {data.get('author')}") + st.markdown(f"[Report a bug]({data['bugs']['url']})") + st.markdown(f"[GitHub repo]({data['repository']['url']})") + st.markdown(f"License: [{data['license']['type']}]({data['license']['url']})") diff --git a/SumGPT/core/__init__.py b/SumGPT/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/core/llm.py b/SumGPT/core/llm.py new file mode 100644 index 0000000..06b7421 --- /dev/null +++ b/SumGPT/core/llm.py @@ -0,0 +1,51 @@ +from datamodel.llm_params import LLMParams +from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage +from langchain_openai import ChatOpenAI +from pydantic.types import SecretStr + + +class LLM: + def __init__(self, api_key: str, gpt_params: LLMParams): + self.api_key: str = api_key + self.llm_params: LLMParams = gpt_params + self.model: ChatOpenAI = self._set_llm() + + def _set_llm(self) -> ChatOpenAI: + return ChatOpenAI( + api_key=SecretStr(self.api_key), + model=self.llm_params.model.name, + max_tokens=self.llm_params.max_tokens, + temperature=self.llm_params.temperature, + ) + + def generate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return _self.model.invoke(messages) + + async def agenerate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return await _self.model.ainvoke(messages) + + def Calc_price( + self, + input_tokens: int, + output_tokens: 
int, + cached_tokens: int = 0, + scale_factor: int = 1000000, + ) -> float: + pricing = self.llm_params.model.pricing + if cached_tokens != 0 and pricing.cached is not None: + input_tokens -= cached_tokens + return ( + input_tokens * pricing.input + + output_tokens * pricing.output + + cached_tokens * pricing.cached + ) / scale_factor + + return (input_tokens * pricing.input + output_tokens * pricing.output) / scale_factor diff --git a/SumGPT/core/tokenizer.py b/SumGPT/core/tokenizer.py new file mode 100644 index 0000000..d0e7d12 --- /dev/null +++ b/SumGPT/core/tokenizer.py @@ -0,0 +1,20 @@ +from typing import List + +import tiktoken + + +class Tokenizer: + def __init__(self, model: str): + self.tokenizer = tiktoken.encoding_for_model(model) + + def tokenize(self, text: str) -> List[int]: + return self.tokenizer.encode(text) + + def detokenize(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + def detokenize_single(self, tokens: List[int]) -> List[str]: + results = [] + for token in tokens: + results.append(self.tokenizer.decode_single_token_bytes(token).decode("utf-8")) + return results diff --git a/SumGPT/datamodel/chunk.py b/SumGPT/datamodel/chunk.py new file mode 100644 index 0000000..64666d2 --- /dev/null +++ b/SumGPT/datamodel/chunk.py @@ -0,0 +1,22 @@ +class Chunk: + def __init__(self, id: int, content: str, tokens: int, input_id: int): + self.id = id + self.content = content + self.tokens = tokens + self.input_id = input_id + self.filename = None + + def __str__(self) -> str: + return f"Chunk(content={self.content}, tokens={self.tokens}, input_id={self.input_id})" + + def set_filename_from_list(self, filenames: list[str]) -> str: + self.filename = filenames[self.input_id] + return self.filename + + def to_dict(self) -> dict: + return { + "id": self.id, + "content": self.content, + "tokens": self.tokens, + "input_id": self.input_id, + } diff --git a/SumGPT/datamodel/llm_model.py b/SumGPT/datamodel/llm_model.py new file mode 100644 index 0000000..3c714c1 --- /dev/null +++ b/SumGPT/datamodel/llm_model.py @@ -0,0 +1,31 @@ +from typing import Optional + + +class LLMModelPricing: + def __init__(self, input: int, output: int, cached: Optional[int] = None): + self.input = input + self.output = output + self.cached = cached + + +class LLMModel: + def __init__( + self, name: str, context_window: int, max_output_tokens: int, pricing: LLMModelPricing + ): + self.name = name + self.context_window = context_window + self.max_output_tokens = max_output_tokens + self.pricing = pricing + + @staticmethod + def construct_from_dict(data: dict) -> "LLMModel": + pricing = LLMModelPricing(data["pricing"]["input"], data["pricing"]["output"]) + if "cached" in data["pricing"]: + pricing.cached = data["pricing"]["cached"] + + return LLMModel( + name=data["model"], + context_window=data["context_window"], + max_output_tokens=data["max_output_tokens"], + pricing=pricing, + ) diff --git a/SumGPT/datamodel/llm_params.py b/SumGPT/datamodel/llm_params.py new file mode 100644 index 0000000..9de6306 --- /dev/null +++ b/SumGPT/datamodel/llm_params.py @@ -0,0 +1,13 @@ +from datamodel.llm_model import LLMModel, LLMModelPricing # noqa: F401 + + +class LLMParams: + def __init__( + self, + model: LLMModel, + max_tokens=2048, + temperature=0.7, + ): + self.model: LLMModel = model + self.max_tokens: int = max_tokens + self.temperature: float = temperature diff --git a/SumGPT/main.py b/SumGPT/main.py new file mode 100644 index 0000000..f33c425 --- /dev/null +++ b/SumGPT/main.py @@ -0,0 +1,16 @@ +from 
app.page import Page +from utils import io + + +def main(): + manifest = io.read_json_file("SumGPT/manifest.json") + models = io.read_json_file("SumGPT/models.json") + + pg = Page() + pg.draw_header(manifest["version"]) + pg.draw_sidebar(manifest, models) + pg.draw_body() + + +if __name__ == "__main__": + main() diff --git a/SumGPT/manifest.json b/SumGPT/manifest.json new file mode 100644 index 0000000..7a33f79 --- /dev/null +++ b/SumGPT/manifest.json @@ -0,0 +1,17 @@ +{ + "name": "SumGPT", + "version": "2.0.0", + "license": { + "type": "MIT", + "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE" + }, + "author": "Zeke Zhang", + "homepage": "https://github.com/sean1832/SumGPT", + "repository": { + "type": "git", + "url": "https://github.com/sean1832/SumGPT" + }, + "bugs": { + "url": "https://github.com/sean1832/SumGPT/issues" + } + } \ No newline at end of file diff --git a/SumGPT/models.json b/SumGPT/models.json new file mode 100644 index 0000000..9a3cd16 --- /dev/null +++ b/SumGPT/models.json @@ -0,0 +1,40 @@ +[ + { + "model": "gpt-4o-mini", + "context_window": 128000, + "max_output_tokens": 16384, + "pricing": { + "input": 0.15, + "output": 0.6, + "cached": 0.075 + } + }, + { + "model": "gpt-4o", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 2.5, + "output": 10, + "cached": 1.25 + } + }, + { + "model": "gpt-4-turbo", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 10, + "output": 30 + } + }, + { + "model": "gpt-3.5-turbo", + "context_window": 16385, + "max_output_tokens": 4096, + "pricing": { + "input": 0.5, + "output": 1.5 + } + } +] \ No newline at end of file diff --git a/SumGPT/prompt.json b/SumGPT/prompt.json new file mode 100644 index 0000000..f76ec60 --- /dev/null +++ b/SumGPT/prompt.json @@ -0,0 +1,22 @@ +[ + { + "type": "recursive", + "legacy": false, + "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + }, + { + "type": "final", + "legacy": false, + "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information. Structured markdown summary with headings in perfect [LANGUAGE] (####): ", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + } +] \ No newline at end of file diff --git a/SumGPT/utils/__init__.py b/SumGPT/utils/__init__.py new file mode 100644 index 0000000..7979e22 --- /dev/null +++ b/SumGPT/utils/__init__.py @@ -0,0 +1,4 @@ +import utils.helpers as helpers +import utils.io as io + +__all__ = ["helpers", "io"] diff --git a/SumGPT/utils/helpers.py b/SumGPT/utils/helpers.py new file mode 100644 index 0000000..25afa4f --- /dev/null +++ b/SumGPT/utils/helpers.py @@ -0,0 +1,35 @@ +def extract_values(dicts, key, parent_key=None): + """ + Extracts values from a list of dictionaries based on a specified key. + If the key is nested, a parent key can be specified. 
+ + :param dicts: List of dictionaries to query + :param key: The key for which values are to be extracted + :param parent_key: Optional parent key if the key is nested within another dictionary + :return: List of values corresponding to the specified key + """ + values = [] + for dict in dicts: + if parent_key: + # Access the nested dictionary and then the key if parent_key is specified + if parent_key in dict and key in dict[parent_key]: + values.append(dict[parent_key][key]) + else: + # Access the key directly if there is no parent_key + if key in dict: + values.append(dict[key]) + return values + +def extract_dict_index(dicts, key, value): + """ + Extracts the index of a dictionary in a list of dictionaries based on a specified key-value pair. + + :param dicts: List of dictionaries to query + :param key: The key to search for + :param value: The value to search for + :return: Index of the dictionary containing the specified key-value pair + """ + for i, dict in enumerate(dicts): + if key in dict and dict[key] == value: + return i + return None \ No newline at end of file diff --git a/SumGPT/utils/io.py b/SumGPT/utils/io.py new file mode 100644 index 0000000..fef6d69 --- /dev/null +++ b/SumGPT/utils/io.py @@ -0,0 +1,17 @@ +import json +from io import StringIO + + +def read_json_file(file): + with open(file, "r") as f: + return json.load(f) + + +def write_json_file(file, data: dict): + with open(file, "w") as f: + json.dump(data, f, indent=4) + + +def read_to_string(file): + stringio = StringIO(file.getvalue().decode("utf-8")) + return stringio.read() diff --git a/requirements.txt b/requirements.txt index 9770e95..8c8c682 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,14 @@ docx==0.2.4 -python_docx==0.8.11 -langchain==0.0.123 +python_docx==1.1.2 langdetect==1.0.9 -numpy==1.24.2 -openai==0.27.2 -pydub==0.25.1 PyPDF4==1.27.0 -pytube==12.1.3 -streamlit==1.20.0 -streamlit_toggle_switch==1.0.2 -tiktoken==0.3.1 -requests==2.29.0 -youtube_transcript_api==0.6.0 +tiktoken==0.8.0 +requests==2.32.3 + +# langchain +langchain==0.3.4 +langchain-openai==0.2.3 + +# streamlit +streamlit==1.39.0 diff --git a/src/Components/Info.py b/src/Components/Info.py deleted file mode 100644 index 72f324f..0000000 --- a/src/Components/Info.py +++ /dev/null @@ -1,18 +0,0 @@ -import streamlit as st -import Modules.file_io as file_io - - -def info(): - info_panel = st.container() - - manifest = 'src/manifest.json' - st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest) - - with info_panel: - st.markdown('---') - st.markdown(f"# {manifest_data['name']}") - st.markdown(f"Version: `{manifest_data['version']}`") - st.markdown(f"Author: {manifest_data['author']}") - st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})") - st.markdown(f"[GitHub repo]({manifest_data['homepage']})") - st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})") \ No newline at end of file diff --git a/src/Components/StreamlitSetup.py b/src/Components/StreamlitSetup.py deleted file mode 100644 index b57f3ce..0000000 --- a/src/Components/StreamlitSetup.py +++ /dev/null @@ -1,36 +0,0 @@ -import streamlit as st -import Data.caption_languages as data -import Modules.file_io as file_io - -def setup(): - st.set_page_config(page_title="SumGPT", page_icon="πŸ“", layout="wide") - - if not st.session_state.get('OPENAI_API_KEY'): - st.session_state['OPENAI_API_KEY'] = None - - if not st.session_state.get('OPENAI_PERSONA_REC'): - st.session_state['OPENAI_PERSONA_REC'] = None 
- - if not st.session_state.get('OPENAI_PERSONA_SUM'): - st.session_state['OPENAI_PERSONA_SUM'] = None - - if not st.session_state.get('CHUNK_SIZE'): - st.session_state['CHUNK_SIZE'] = None - - if not st.session_state.get('OPENAI_PARAMS'): - st.session_state['OPENAI_PARAMS'] = None - - if not st.session_state.get('DELAY'): - st.session_state['DELAY'] = 0 - - if not st.session_state.get('FINAL_SUMMARY_MODE'): - st.session_state['FINAL_SUMMARY_MODE'] = False - - if not st.session_state.get('CAPTION_LANGUAGES'): - st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages - - if not st.session_state.get('PREVIOUS_RESULTS'): - st.session_state['PREVIOUS_RESULTS'] = None - - if not st.session_state.get('MANIFEST'): - st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json") \ No newline at end of file diff --git a/src/Components/__init__.py b/src/Components/__init__.py deleted file mode 100644 index 9391db9..0000000 --- a/src/Components/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from Components import sidebar -from Components import StreamlitSetup -from Components import Info -__all__ = ['sidebar', 'StreamlitSetup', 'Info'] \ No newline at end of file diff --git a/src/Components/sidebar.py b/src/Components/sidebar.py deleted file mode 100644 index b5b3d41..0000000 --- a/src/Components/sidebar.py +++ /dev/null @@ -1,187 +0,0 @@ -import streamlit as st -import GPT -import Modules.file_io as file_io -from streamlit_toggle import st_toggle_switch -import Components -from typing import Any, Dict, List, Tuple, Union -import json - - -def set_openai_api_key(api_key: str): - st.session_state["OPENAI_API_KEY"] = api_key - - -def set_openai_persona(persona_rec: str, persona_sum: str): - st.session_state["OPENAI_PERSONA_REC"] = persona_rec - st.session_state["OPENAI_PERSONA_SUM"] = persona_sum - - -def set_param(params: GPT.param): - st.session_state["OPENAI_PARAMS"] = params - - -def set_chunk_size(size: int): - st.session_state['CHUNK_SIZE'] = size - - -def set_delay(time: int): - st.session_state['DELAY'] = time - - -def set_final_summary_mode(mode: bool): - st.session_state['FINAL_SUMMARY_MODE'] = mode - - -def _set_config(config_file, key: str, default_value): - if config_file: - return file_io.read_json_upload(config_file, key) - else: - return default_value - -def _set_language(language: str): - st.session_state['OUTPUT_LANGUAGE'] = language - -def _set_legacy(enable: bool): - st.session_state['LEGACY'] = enable -def _legacy(enable: bool, legacy, experimental): - if not enable: - return experimental - else: - return legacy -def _extract_prompt(json_data: List[Dict[str,Union[bool, str]]], target_type: str, target_legacy: bool, language: str = "English") -> str | None: - for item in json_data: - if item["type"] == target_type and item["legacy"] == target_legacy: - prompt = item["prompt"] - new_prompt = prompt.replace("[LANGUAGE]", language) - return new_prompt - return None - -def sidebar(): - with st.sidebar: - st.markdown("## How to use\n" - "1. πŸ”‘ Enter your [OpenAI API key](https://beta.openai.com/account/api-keys)\n" - "2. πŸ“ upload your file\n" - "3. 
πŸƒ Run\n" - "---") - - config_file = st.file_uploader("πŸ“ Import Configs", type=['json']) - - api_input = st.text_input(label="πŸ”‘ OpenAI API Key", - placeholder="Enter your OpenAI API key (sk-...)", - type="password", - help="You can get your API key from https://beta.openai.com/account/api-keys", - value=_set_config(config_file, "OPENAI_API_KEY", "")) - - enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False)) - enable_final_summary = st_toggle_switch(label="Enable Final Summary", - default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False)) - if enable_final_summary: - set_final_summary_mode(True) - if st.session_state['FINAL_SUMMARY_MODE'] != enable_final_summary: - set_final_summary_mode(enable_final_summary) - - with st.expander('πŸ€– Bot Persona'): - language_options = ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German'] - language_index = language_options.index(_set_config(config_file, "LANGUAGE", 'English')) - language = st.selectbox('Language', options=language_options, index=language_index) - _set_language(language) - - prompts = file_io.read_json("resources/prompt.json") - - persona_rec_legacy = _extract_prompt(prompts, "recursive", True, language) - persona_rec = _extract_prompt(prompts, "recursive", False, language) - persona_rec = st.text_area('Bot Persona Recursive', - value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, persona_rec_legacy, persona_rec)), - help='System message is a pre-defined message used to instruct the assistant at the ' - 'beginning of a conversation. iterating and ' - 'experimenting with potential improvements can help to generate better outputs.' - 'Make sure to use casual language.', - height=250) - if enable_final_summary: - persona_sum_legacy = _extract_prompt(prompts, "final", True, language) - persona_sum = _extract_prompt(prompts, "final", False, language) - - persona_sum = st.text_area('Bot Persona Total Sum', - value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, persona_sum_legacy, persona_sum)), - help='This is a pre-defined message for total summarization that is used to' - 'instruct the assistant at the beginning of a conversation. 
', - height=300) - else: - persona_sum = "" - - with st.expander('πŸ”₯ Advanced Options'): - model_options = ['gpt-3.5-turbo','gpt-3.5-turbo-16k', 'gpt-4'] - model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo')) - model = st.selectbox("Model", options=model_options, index=model_index) - - if model == 'gpt-4': - max_chunk = 4000 - elif model == 'gpt-3.5-turbo-16k': - max_chunk = 16000 - else: - max_chunk = 2500 - chunk_size = st.slider('Chunk Size (word count)', min_value=0, max_value=max_chunk, step=20, - value=_set_config(config_file, "CHUNK_SIZE", 800)) - max_tokens_rec = st.slider('Max Tokens - Recursive Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_REC", 250)) - if enable_final_summary: - max_tokens_final = st.slider('Max Tokens - Final Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_FINAL", 650)) - else: - max_tokens_final = 0 - temperature = st.slider('Temperature', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TEMPERATURE", 0.7)) - top_p = st.slider('Top P', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TOP_P", 1.0)) - frequency_penalty = st.slider('Frequency Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "FREQUENCY_PENALTY", 0.0)) - presence_penalty = st.slider('Presence Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "PRESENCE_PENALTY", 0.0)) - if st_toggle_switch(label="Delay (free openAI API user)", - default_value=_set_config(config_file, "ENABLE_DELAY", False)): - delay = st.slider('Delay (seconds)', min_value=0, max_value=60, step=1, - value=_set_config(config_file, "DELAY_TIME", 1)) - else: - delay = 0 - param = GPT.param.gpt_param( - model=model, - max_tokens_final=max_tokens_final, - max_tokens_rec=max_tokens_rec, - temperature=temperature, - top_p=top_p, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty - ) - - st.download_button(label="πŸ“₯ Export Configs", - data=json.dumps({ - "OPENAI_API_KEY": api_input, - "FINAL_SUMMARY_MODE": enable_final_summary, - "OPENAI_PERSONA_REC": persona_rec, - "OPENAI_PERSONA_SUM": persona_sum, - "CHUNK_SIZE": chunk_size, - "MAX_TOKENS_REC": max_tokens_rec, - "MAX_TOKENS_FINAL": max_tokens_final, - "TEMPERATURE": temperature, - "TOP_P": top_p, - "FREQUENCY_PENALTY": frequency_penalty, - "PRESENCE_PENALTY": presence_penalty, - "MODEL": model, - "ENABLE_DELAY": delay > 0, - "DELAY_TIME": delay, - "LANGUAGE": language, - "LEGACY": enable_legacy - }, indent=4), - file_name="configs.json") - Components.Info.info() - - if api_input: - set_openai_api_key(api_input) - - if persona_rec: - set_openai_persona(persona_rec, persona_sum) - - set_chunk_size(chunk_size) - set_param(param) - set_delay(delay) - _set_legacy(enable_legacy) \ No newline at end of file diff --git a/src/Data/__init__.py b/src/Data/__init__.py deleted file mode 100644 index 4de9124..0000000 --- a/src/Data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Data import caption_languages - -__all__ = ['caption_languages'] \ No newline at end of file diff --git a/src/Data/caption_languages.py b/src/Data/caption_languages.py deleted file mode 100644 index acec65e..0000000 --- a/src/Data/caption_languages.py +++ /dev/null @@ -1,6 +0,0 @@ -languages = [ - 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th' -] - -auto_languages = ['a.' 
+ _language for _language in languages] - diff --git a/src/GPT/__init__.py b/src/GPT/__init__.py deleted file mode 100644 index 0bcd76d..0000000 --- a/src/GPT/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from GPT import misc -from GPT import embeddings -from GPT import bot -from GPT import param -from GPT import generate - -__all__ = ['misc', 'embeddings', 'bot', 'param', 'generate'] \ No newline at end of file diff --git a/src/GPT/bot.py b/src/GPT/bot.py deleted file mode 100644 index ce8dd86..0000000 --- a/src/GPT/bot.py +++ /dev/null @@ -1,48 +0,0 @@ -import openai -from typing import Any, Dict, List, Tuple, Union - - -class OpenAIChatBot: - """A class to interact with the OpenAI API.""" - - def __init__(self, api_key: str, persona: str, model: str, max_tokens: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - openai.api_key = api_key - self.persona = persona - self.model = model - self.max_tokens = max_tokens - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty - - def chat_stream(self, prompt: str) -> openai.api_resources.chat_completion.ChatCompletion: - """Returns the streamed response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - stream=True, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": prompt} - ]) - return completions - - def chat(self, prompt: str) -> Tuple[str, str]: - """Returns the response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": f"{self.persona} '{prompt}'"} - ]) - return completions['choices'][0]['message']['content'], completions['choices'][0]['finish_reason'] diff --git a/src/GPT/embeddings.py b/src/GPT/embeddings.py deleted file mode 100644 index 3e6cb50..0000000 --- a/src/GPT/embeddings.py +++ /dev/null @@ -1,12 +0,0 @@ -import openai - - -class openAIEmbeddings: - def __init__(self, api_key: str): - openai.api_key = api_key - - def embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> float: - """Returns the embedding vector of a string.""" - response = openai.Embedding.create(input=content, engine=engine) - vector = response['data'][0]['embedding'] - return vector diff --git a/src/GPT/generate.py b/src/GPT/generate.py deleted file mode 100644 index 627faf6..0000000 --- a/src/GPT/generate.py +++ /dev/null @@ -1,52 +0,0 @@ -import GPT.bot -import streamlit as st -import GPT.param -from typing import Any, Dict, List, Tuple, Union - - -def get_answer_stream(content: str): - """Returns a stream of responses from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - previous_char = '' - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - st.session_state["OPENAI_PERSONA"], - params.model, - params.max_tokens_rec, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - responses = bot.chat_stream(content) - response_panel = st.empty() - for response_json in responses: - choice = response_json['choices'][0] 
- if choice['finish_reason'] == 'stop': - break - - # error handling - if choice['finish_reason'] == 'length': - st.warning('⚠️Result cut off due to length. Consider increasing the max tokens parameter.') - break - - delta = choice['delta'] - if 'role' in delta or delta == {}: - char = '' - else: - char = delta['content'] - answer = previous_char + char - response_panel.info(answer) - - -def get_answer(content: str, max_tokens, persona: str) -> Tuple[str, str]: - """Returns a response from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - persona, - params.model, - max_tokens, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - response, finish_reason = bot.chat(content) - return response, finish_reason diff --git a/src/GPT/misc.py b/src/GPT/misc.py deleted file mode 100644 index b93481c..0000000 --- a/src/GPT/misc.py +++ /dev/null @@ -1,98 +0,0 @@ -import openai -from langchain.llms import OpenAI -import os -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -def validate_api_key(api_key: str) -> bool: - """Validates the OpenAI API key by trying to create a completion.""" - openai.api_key = api_key - try: - openai.ChatCompletion.create( - model="gpt-3.5-turbo", - max_tokens=1, - messages=[ - {"role": "user", "content": "Hello!"} - ] - ) - return True - except openai.error.AuthenticationError: - return False - - -def predict_token(param, chunks) -> Dict[str, int]: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - prompt_token_total = 0 - completion_token_total = 0 - for chunk in chunks: - prompt_token = llm.get_num_tokens(chunk['content']) - prompt_token_total += prompt_token - completion_token_total += param.max_tokens_rec - - if st.session_state['FINAL_SUMMARY_MODE']: - completion_token_total += param.max_tokens_final - total_token = prompt_token_total + completion_token_total - token = {'total': total_token, - 'prompt': prompt_token_total, - 'completion': completion_token_total} - - return token - else: - return {'total': 0, 'prompt': 0, 'completion': 0} - - -def predict_token_single(chunk: Dict[str, Union[str, float]] | str, max_tokens: int = None) -> int: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - if isinstance(chunk, str): - chunk_content = chunk - else: - chunk_content = chunk['content'] - chunk_token = llm.get_num_tokens(chunk_content) - if max_tokens is not None: - chunk_token += max_tokens - - return chunk_token - else: - return 0 - - -def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[bool, str]]: - """Checks if the number of tokens used has exceeded the limit.""" - - # check recursive chunks tokens - rec_chunks_token = [] - for chunk in chunks: - chunk_token = predict_token_single(chunk, param.max_tokens_rec) - rec_chunks_token.append(chunk_token) - - - # check final chunks tokens - final_prompt_token = len(chunks) * param.max_tokens_rec - final_completion_token = param.max_tokens_final - final_chunks_token = final_prompt_token + final_completion_token - - # evaluate - if max(rec_chunks_token) > max_token: - return {'exceeded': True, - 'reason': 'recursive', - 'message': f"**[ Recursive summary ]** tokens exceeded. 
Max tokens allowed: {max_token}. Tokens used: {max(rec_chunks_token)}\n" - f"(Prompt: {max(rec_chunks_token) - param.max_tokens_rec}, " - f"Completion: {param.max_tokens_rec})"} - - elif final_chunks_token > max_token and st.session_state['FINAL_SUMMARY_MODE']: - return {'exceeded': True, - 'reason': 'final', - 'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n" - f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"} - - else: - return {'exceeded': False, - 'reason': '', - 'message': ''} diff --git a/src/GPT/param.py b/src/GPT/param.py deleted file mode 100644 index 866f112..0000000 --- a/src/GPT/param.py +++ /dev/null @@ -1,11 +0,0 @@ - -class gpt_param: - def __init__(self, model: str, max_tokens_final: int, max_tokens_rec: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - self.model = model - self.max_tokens_rec = max_tokens_rec - self.max_tokens_final = max_tokens_final - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty diff --git a/src/Modules/Youtube.py b/src/Modules/Youtube.py deleted file mode 100644 index f399cb3..0000000 --- a/src/Modules/Youtube.py +++ /dev/null @@ -1,97 +0,0 @@ -import requests -import re -from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -manifest = st.session_state["MANIFEST"] -def _error_report_msg(youtube_url): - return f"Please create an issue on [GitHub]({manifest['bugs']['url']}). " \ - f"Please include the YouTube URL ({youtube_url}), version number ({manifest['version']}) " \ - f"and all necessary information to replicate the error. " \ - f"**Before creating a new issue, please check if the problem has already been reported.**" - -def _extract_video_id_from_url(url): - video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/)([^?&"\'>]+)' - match = re.search(video_id_pattern, url) - if match: - return match.group(1) - else: - raise ValueError("Invalid YouTube URL") - -def get_video_title(youtube_url): - video_id = _extract_video_id_from_url(youtube_url) - url = f'https://www.youtube.com/watch?v={video_id}' - response = requests.get(url) - title_pattern = r'(.+?) 
- YouTube<\/title>' - match = re.search(title_pattern, response.text) - if match: - title = match.group(1) - return title - else: - return None - -def get_available_subtitle_languages(video_id): - try: - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) - languages = [transcript.language_code for transcript in transcript_list] - return languages - except Exception as e: - print(f"Error fetching available subtitle languages: {e}") - return [] - -def get_video_captions(youtube_url, languages): - video_id = _extract_video_id_from_url(youtube_url) - simplified_url = f'https://www.youtube.com/watch?v={video_id}' - - available_language = get_available_subtitle_languages(video_id) - - if not any(lang in languages for lang in available_language) and available_language != []: - print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.") - st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url)) - st.stop() - - for language in languages: - try: - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language]) - captions = "" - for item in transcript: - captions += item['text'] + "\n" - return captions - - except NoTranscriptFound as e: - if language == languages[-1]: - print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}") - st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n' - f'languages = {available_language}\n\n' - f'language list = {languages}\n\n' - + _error_report_msg(simplified_url)) - st.stop() - else: - continue - - except TranscriptsDisabled: - print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}") - st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---' - f'\n**Instruction:**\n\n' - f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n' - f"2. If you are confident that subtitles are available in the video but could not be retrieved, " - + _error_report_msg(simplified_url)) - st.stop() - raise TranscriptsDisabled - - except Exception as e: - print(e) - st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. 
\n\n' - f'{_error_report_msg(simplified_url)}' - f'\n\nError: \n\n---\n\n{e}') - st.stop() - break - -@st.cache_data(show_spinner=False) -def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]: - """Extracts the transcript from a YouTube video.""" - transcript = get_video_captions(url, lang_code) - title = get_video_title(url) - return transcript, title diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py deleted file mode 100644 index 412ace4..0000000 --- a/src/Modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Modules import file_io - -__all__ = ['file_io'] \ No newline at end of file diff --git a/src/Modules/file_io.py b/src/Modules/file_io.py deleted file mode 100644 index 0f214e8..0000000 --- a/src/Modules/file_io.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -import PyPDF4 -import docx -from typing import Any, Dict, List, Tuple, Union -from pydub import AudioSegment -import math -import json -import streamlit as st - - - -@st.cache_data() -def read_json(file, key: str = None) -> Any: - """Reads a json file and returns the value of a key.""" - with open(file, "r") as f: - data = json.load(f) - if key and isinstance(data, dict): - return data[key] - elif key and isinstance(data, list): - return [d[key] for d in data] - else: - return data - - -@st.cache_data() -def read_json_upload(file, key: str) -> Any: - """Reads a json file and returns the value of a key.""" - if not isinstance(file, str): - f = file.getvalue().decode("utf-8") - data = json.loads(f) - return data[key] - - -@st.cache_data() -def read_txt(file, encoding: str = "utf-8") -> str: - """Reads a text file.""" - return file.read().decode(encoding) - - -@st.cache_data() -def read_pdf(file) -> List[str]: - """Reads a pdf file.""" - pdfReader = PyPDF4.PdfFileReader(file, strict=False) - texts = [] - for page in range(pdfReader.numPages): - text = pdfReader.getPage(page).extractText() - # Merge hyphenated words - text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text) - # Fix newlines in the middle of sentences - text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip()) - # Remove multiple newlines - text = re.sub(r"\n\s*\n", "\n\n", text) - - texts.append(text) - return texts - - -@st.cache_data() -def read_docx(file) -> str: - """Reads a docx file.""" - doc = docx.Document(file) - text = "" - for para in doc.paragraphs: - # Remove multiple newlines - t = re.sub(r"\n\s*\n", "\n\n", para.text) - text += t + "\n" - return text - - -@st.cache_data() -def _split_audio(audio, chunk_size=2) -> List[AudioSegment]: - """Split audio into chunks of 10 minutes.""" - # load audio - audio = AudioSegment.from_file(audio, format="mp3") - # Define the chunk size (10 minutes default) - chunk_size = chunk_size * 60 * 1000 - # calculate the number of chunks - num_chunks = math.ceil(len(audio) / chunk_size) - chunks = [] - # split audio into chunks - for i in range(num_chunks): - start = i * chunk_size - end = start + chunk_size - chunk = audio[start:end] - chunks.append(chunk) - return chunks - - -@st.cache_data() -def read(file) -> str | List[str]: - """Reads a file and returns the content.""" - if file.name.endswith(".txt") or file.name.endswith(".md"): - return read_txt(file) - elif file.name.endswith(".pdf"): - return read_pdf(file) - elif file.name.endswith(".docx"): - return read_docx(file) - else: - raise ValueError("File type not supported") diff --git a/src/SumGPT.py b/src/SumGPT.py deleted file mode 100644 index 355ebd4..0000000 --- a/src/SumGPT.py +++ /dev/null @@ -1,178 +0,0 @@ -import 
asyncio
-
-import streamlit as st
-
-import Components.StreamlitSetup as StreamlitSetup
-
-StreamlitSetup.setup()
-
-import time  # noqa: E402
-
-import GPT  # noqa: E402
-import Modules.file_io as file_io  # noqa: E402
-import Modules.Youtube  # noqa: E402
-import utils.helpers as helpers  # noqa: E402
-from Components.sidebar import sidebar  # noqa: E402
-
-app_header = st.container()
-
-file_handler = st.container()
-content_handler = st.container()
-result_handler = st.container()
-
-with app_header:
-    st.title("πŸ“ SumGPT")
-    st.markdown("##### Summarize your text with OpenAI's GPT-3.5 / GPT-4 API")
-    st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)")
-    st.warning(
-        "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo."
-    )
-
-sidebar()
-
-with file_handler:
-    if st.button("πŸ”ƒ Refresh"):
-        st.cache_data.clear()
-    youtube_link_empty = st.empty()
-    upload_file_emtpy = st.empty()
-
-    youtube_link = youtube_link_empty.text_input(
-        label="πŸ”— YouTube Link",
-        placeholder="Enter your YouTube link",
-        help="Enter your YouTube link to download the video and extract the audio",
-    )
-    upload_file = upload_file_emtpy.file_uploader(
-        "πŸ“ Upload your file", type=["txt", "pdf", "docx", "md"]
-    )
-    if youtube_link:
-        upload_file_emtpy.empty()
-        with st.spinner("πŸ” Extracting transcript..."):
-            transcript, title = Modules.Youtube.extract_youtube_transcript(
-                youtube_link, st.session_state["CAPTION_LANGUAGES"]
-            )
-            file_content = {"name": f"{title}.txt", "content": transcript}
-    elif upload_file:
-        youtube_link_empty.empty()
-        with st.spinner("πŸ” Reading file... (mp3 file might take a while)"):
-            file_content = {"name": upload_file.name, "content": file_io.read(upload_file)}
-    elif youtube_link and upload_file:
-        st.warning("Please only upload one file at a time")
-    else:
-        file_content = None
-
-with content_handler:
-    if file_content:
-        with st.expander("File Preview"):
-            if file_content["name"].endswith(".pdf"):
-                content = "\n\n".join(file_content["content"])
-                st.text_area(file_content["name"], content, height=200)
-            else:
-                content = file_content["content"]
-                st.text_area(file_content["name"], content, height=200)
-
-with result_handler:
-    if file_content:
-        chunks = []
-        content = file_content["content"]
-        if file_content["name"].endswith(".pdf"):
-            content = "\n\n".join(file_content["content"])
-        chunks.extend(helpers.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"]))
-
-        with st.expander(f"Chunks ({len(chunks)})"):
-            for chunk in chunks:
-                st.write(chunk)
-
-        token_usage = GPT.misc.predict_token(st.session_state["OPENAI_PARAMS"], chunks)
-        param = st.session_state["OPENAI_PARAMS"]
-        prompt_token = token_usage["prompt"]
-        completion_token = token_usage["completion"]
-        if param.model == "gpt-4":
-            price = round(prompt_token * 0.00003 + completion_token * 0.00006, 5)
-        elif param.model == "gpt-3.5-turbo-16k":
-            price = round(prompt_token * 0.000003 + completion_token * 0.000004, 5)
-        else:
-            price = round(prompt_token * 0.0000015 + completion_token * 0.000002, 5)
-        st.markdown(
-            f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`"
-        )
-        # max tokens exceeded warning
-        exceeded = helpers.exceeded_token_handler(
-            param=st.session_state["OPENAI_PARAMS"], chunks=chunks
-        )
-
-        # load cached results
-        if st.session_state["PREVIOUS_RESULTS"] is not None:
-            rec_responses = st.session_state["PREVIOUS_RESULTS"]["rec_responses"]
-            rec_id = st.session_state["PREVIOUS_RESULTS"]["rec_ids"]
-            final_response = st.session_state["PREVIOUS_RESULTS"]["final_response"]
-            finish_reason_rec = st.session_state["PREVIOUS_RESULTS"]["finish_reason_rec"]
-            finish_reason_final = st.session_state["PREVIOUS_RESULTS"]["finish_reason_final"]
-        else:
-            rec_responses = None
-            rec_id = None
-            final_response = None
-            finish_reason_rec = None
-            finish_reason_final = None
-
-        # finish_reason_rec = None
-        if st.button("πŸš€ Run", disabled=exceeded):
-            start_time = time.time()
-            st.cache_data.clear()
-            API_KEY = st.session_state["OPENAI_API_KEY"]
-            if not API_KEY and not GPT.misc.validate_api_key(API_KEY):
-                st.error(
-                    "❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys)."
-                )
-            else:
-                with st.spinner("Summarizing... (this might take a while)"):
-                    if st.session_state["LEGACY"]:
-                        rec_max_token = st.session_state["OPENAI_PARAMS"].max_tokens_rec
-                        rec_responses, finish_reason_rec = helpers.recursive_summarize(
-                            chunks, rec_max_token
-                        )
-                        if st.session_state["FINAL_SUMMARY_MODE"]:
-                            final_response, finish_reason_final = helpers.summarize(rec_responses)
-                        else:
-                            final_response = None
-                    else:
-                        completions, final_response = asyncio.run(
-                            helpers.summarize_experimental_concurrently(
-                                content, st.session_state["CHUNK_SIZE"]
-                            )
-                        )
-                        rec_responses = [d["content"] for d in completions]
-                        rec_ids = [d["chunk_id"] for d in completions]
-                # save previous completions
-                resp = {
-                    "rec_responses": rec_responses,
-                    "rec_ids": rec_ids,
-                    "final_response": final_response,
-                    "finish_reason_rec": finish_reason_rec,
-                    "finish_reason_final": finish_reason_final,
-                }
-                if resp != st.session_state["PREVIOUS_RESULTS"]:
-                    st.session_state["PREVIOUS_RESULTS"] = resp
-
-                end_time = time.time()
-                st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`")
-
-        if rec_responses is not None:
-            with st.expander(
-                "Recursive Summaries", expanded=not st.session_state["FINAL_SUMMARY_MODE"]
-            ):
-                for i, response in enumerate(rec_responses):
-                    st.info(f"{response}")
-            if finish_reason_rec == "length":
-                st.warning(
-                    "⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter."
-                )
-
-        if final_response is not None:
-            st.header("πŸ“Summary")
-            st.info(final_response)
-            if finish_reason_final == "length":
-                st.warning(
-                    "⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter."
-                )
-        if final_response is not None or rec_responses is not None:
-            helpers.download_results(rec_responses, final_response)
diff --git a/src/manifest.json b/src/manifest.json
deleted file mode 100644
index 731522c..0000000
--- a/src/manifest.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "name": "SumGPT",
-  "version": "1.0.8",
-  "license": {
-    "type": "MIT",
-    "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
-  },
-  "author": "Zeke Zhang",
-  "homepage": "https://github.com/sean1832/SumGPT",
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/sean1832/SumGPT"
-  },
-  "bugs": {
-    "url": "https://github.com/sean1832/SumGPT/issues"
-  }
-}
\ No newline at end of file
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
deleted file mode 100644
index 5fe7f8d..0000000
--- a/src/utils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from utils import helpers
-
-__all__ = ["helpers"]
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
deleted file mode 100644
index 6fd2618..0000000
--- a/src/utils/helpers.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import os
-import asyncio
-
-import numpy as np
-from typing import Any, Dict, List, Tuple, Union
-
-from GPT.embeddings import openAIEmbeddings
-import streamlit as st
-import re
-import GPT
-import textwrap
-from langdetect import detect
-import time
-from datetime import datetime
-
-from langchain.chat_models import ChatOpenAI
-from langchain.docstore.document import Document
-from langchain.prompts import PromptTemplate
-from langchain.chains.summarize import load_summarize_chain
-from langchain.chains import LLMChain
-
-def _similarity(v1, v2) -> np.ndarray:
-    """Returns the cosine similarity between two vectors."""
-    return np.dot(v1, v2)
-
-@st.cache_data(show_spinner=False)
-def _chunk_spliter(content: str, chunk_size: int = 1000, lang_base: str = 'latin') -> List[str]:
-    """Splits a string into chunks of a given size."""
-
-    sentences = re.split(r'(?<=[.?!,γ€‚οΌŒγ€οΌοΌŸΒ·])\s+', content)
-    if lang_base == 'latin':
-        chunks = []
-        chunk = ''
-        word_count = 0
-        for sentence in sentences:
-            sentence += ' '  # add space at end to compensate for split
-            words = sentence.split()
-            sentence_word_count = len(words)
-            if word_count + sentence_word_count <= chunk_size:
-                chunk += sentence
-                word_count += sentence_word_count
-            else:
-                chunks.append(chunk.strip())
-                chunk = sentence
-                word_count = sentence_word_count
-        # add the last chunk
-        if chunk:
-            chunks.append(chunk.strip())
-
-        new_chunks = []
-        for c in chunks:
-            if c == '':
-                continue
-            if len(c.split()) > chunk_size + 25:
-                words = c.split()
-                small_chunks = []
-                for i in range(0, len(words), chunk_size):
-                    small_chunks.append(' '.join(words[i:i + chunk_size]))
-                new_chunks.extend(small_chunks)
-            else:
-                new_chunks.append(c)
-        return new_chunks
-
-    else:
-        chunks = textwrap.wrap(content, width=chunk_size)
-        return chunks
-
-
-def language_base(string: str) -> str:
-    try:
-        lang_code = detect(string)
-        latin_based = ['en', 'fr-ca', 'es']
-        east_asian_based = ['zh', 'ja', 'ko']
-        for lang in latin_based:
-            if lang_code.startswith(lang):
-                return 'latin'
-        for lang in east_asian_based:
-            if lang_code.startswith(lang):
-                return 'east_asian'
-        return 'other'
-    except KeyError:
-        return 'other'
-
-@st.cache_data(show_spinner=False)
-def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bool = False) -> List[Dict[str, float]]:
-    """Converts a string into chunks of a given size."""
-    chunks_text = _chunk_spliter(content, chunk_size, language_base(content))
-    chunks = []
-    for i, chunk in enumerate(chunks_text):
-        if enable_embedding:
-            embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
-            chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)})
-        else:
-            chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i})
-    return chunks
-
-
-def search_chunks(query: str, chunks: List[Dict[str, float]], count: int = 1) -> List[Dict[str, np.ndarray]]:
-    """Returns the top `count` chunks that are most similar to the query."""
-    embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
-    vectors = embedding.embedding(query)
-    points = []
-
-    for chunk in chunks:
-        point = _similarity(vectors, chunk['vector'])
-        points.append({'content': chunk['content'], 'point': point})
-
-    # sort the points in descending order
-    ordered = sorted(points, key=lambda x: x['point'], reverse=True)
-    return ordered[0:count]
-
-@st.cache_data(show_spinner=False)
-def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document] | Document:
-    """Converts a list of chunks into a list of documents."""
-    docs = []
-    for chunk in chunks:
-        content = chunk['content']
-        metadata = {'chunk_id': chunk['chunk_id']}
-        doc = Document(page_content=content, metadata=metadata)
-        docs.append(doc)
-    return docs
-
-async def async_generate(chain, chunk)-> Dict[str, Union[str, int]]:
-    """Generates a summary asynchronously."""
-    resp = await chain.arun(text=chunk['content'])
-    return {'content': resp, 'chunk_id': chunk['chunk_id']}
-
-async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]:
-    """Summarizes a string asynchronously."""
-    os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
-    params = st.session_state['OPENAI_PARAMS']
-    llm_rec = ChatOpenAI(model_name=params.model,
-                         max_tokens=params.max_tokens_rec,
-                         temperature=params.temperature,
-                         top_p=params.top_p,
-                         frequency_penalty=params.frequency_penalty,
-                         presence_penalty=params.presence_penalty)
-    llm_final = ChatOpenAI(model_name=params.model,
-                           max_tokens=params.max_tokens_final,
-                           temperature=params.temperature,
-                           top_p=params.top_p,
-                           frequency_penalty=params.frequency_penalty,
-                           presence_penalty=params.presence_penalty)
-    chunks = convert_to_chunks(content, chunk_size)
-
-    REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text'])
-    chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT)
-
-    tasks = []
-    for chunk in chunks:
-        task = async_generate(chain, chunk)
-        tasks.append(task)
-
-    outputs_rec = []
-    progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}")
-    count = 1
-    for coro in asyncio.as_completed(tasks):
-        output_rec = await coro
-        outputs_rec.append(output_rec)
-        progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}")
-        count += 1
-    rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id'])
-    if st.session_state['FINAL_SUMMARY_MODE']:
-        FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text'])
-        chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT)
-        docs = convert_to_docs(rec_result)
-        final_result = chain.run(docs)
-    else:
-        final_result = None
-    return rec_result, final_result
-
-@st.cache_data(show_spinner=False)
-def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]:
-    """Returns a recursive summary of the given content."""
-    recursiveSumTexts = []
-    finish_reason = ''
-    chunks_length = len(chunks)
-    count = 0
-    progress_bar = st.progress(0)
-    for chunk in chunks:
-        content = chunk['content']
-        text, finish_reason = GPT.generate.get_answer(content,
-                                                      max_tokens=max_tokens,
-                                                      persona=st.session_state['OPENAI_PERSONA_REC'])
-        recursiveSumTexts.append(text)
-        progress_bar.progress((count + 1) / chunks_length)
-        count += 1
-        time.sleep(st.session_state['DELAY'])
-
-    return recursiveSumTexts, finish_reason
-
-
-@st.cache_data(show_spinner=False)
-def summarize(message: List[str] | str) -> Tuple[str, str]:
-    """Returns a summary of the given content."""
-    if isinstance(message, list):
-        join_msg = ' '.join(message)
-    else:
-        join_msg = message
-
-    params = st.session_state['OPENAI_PARAMS']
-    max_asw_tokens_final = params.max_tokens_final
-
-    answer, finish_reason = GPT.generate.get_answer(join_msg, max_tokens=max_asw_tokens_final,
-                                                    persona=st.session_state['OPENAI_PERSONA_SUM'])
-    return answer, finish_reason
-
-
-def download_results(rec_responses, final_response):
-    """Downloads the results as a txt file."""
-    joint_rec_response = f"=====recursive responses=====\n\n" + '\n\n'.join(rec_responses)
-    joint_final_response = f"{joint_rec_response}\n\n======final response=====\n\n{final_response}"
-    now = datetime.now()
-    if final_response is not None:
-        st.download_button("πŸ“₯ Download Summary",
-                           joint_final_response,
-                           file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
-    else:
-        st.download_button("πŸ“₯ Download Summary",
-                           joint_rec_response,
-                           file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
-
-
-def exceeded_token_handler(param, chunks) -> bool:
-    """Handles the case where the user has exceeded the number of tokens."""
-    if param.model == 'gpt-4':
-        max_token = 8100
-    elif param.model == 'gpt-3.5-turbo-16k':
-        max_token = 16385
-    else:
-        max_token = 4096
-    info = GPT.misc.is_tokens_exceeded(param, chunks, max_token)
-    if info['exceeded']:
-        st.error(f"❌ {info['message']}")
-        return True
-    else:
-        return False