Skip to content

Commit

Permalink
Merge pull request #452 from ElishaKay/document-reports
Browse files Browse the repository at this point in the history
Research with local documents
  • Loading branch information
assafelovic authored May 20, 2024
2 parents 2015f3c + 3328898 commit 5c0f4e7
Show file tree
Hide file tree
Showing 26 changed files with 1,825 additions and 1,058 deletions.
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
OPENAI_API_KEY=
TAVILY_API_KEY=
LANGCHAIN_API_KEY=
LANGCHAIN_API_KEY=
DOC_PATH=./my-docs
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ venv.bak/
#Ignore generated outputs
outputs/

#Ignore my local docs
my-docs/

#Ignore pycache
**/__pycache__/

Expand Down
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ More specifically:
- [Live Demo](https://www.loom.com/share/6a3385db4e8747a1913dd85a7834846f?sid=a740fd5b-2aa3-457e-8fb7-86976f59f9b8)

## Features
- 📝 Generate research, outlines, resources and lessons reports
- 📝 Generate research, outlines, resources and lessons reports with local documents and web sources
- 📜 Can generate long and detailed research reports (over 2K words)
- 🌐 Aggregates over 20 web sources per research to form objective and factual conclusions
- 🖥️ Includes an easy-to-use web interface (HTML/CSS/JS)
Expand Down Expand Up @@ -131,6 +131,22 @@ report = await researcher.write_report()

**For more examples and configurations, please refer to the [PIP documentation](https://docs.gptr.dev/docs/gpt-researcher/pip-package) page.**

## 📄 Research on Local Documents

You can instruct the GPT Researcher to run research tasks based on your local documents. Currently supported file formats are: PDF, plain text, CSV, Excel, Markdown, PowerPoint, and Word documents.

Step 1: Add the env variable `DOC_PATH` pointing to the folder where your documents are located.

```bash
export DOC_PATH="./my-docs"
```

Step 2:
- If you're running the frontend app on localhost:8000, simply select "My Documents" from the "Report Source" dropdown options.
- If you're running GPT Researcher with the [PIP package](https://docs.tavily.com/docs/gpt-researcher/pip-package), pass the `report_source` argument as "documents" when you instantiate the `GPTResearcher` class [code sample here](https://docs.tavily.com/docs/gpt-researcher/tailored-research).



## 👪 Multi-Agent Assistant
As AI evolves from prompt engineering and RAG to multi-agent systems, we're excited to introduce our new multi-agent assistant built with [LangGraph](https://python.langchain.com/v0.1/docs/langgraph/).

Expand Down
5 changes: 3 additions & 2 deletions backend/report_type/basic_report/basic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
from fastapi import WebSocket

class BasicReport():
def __init__(self, query: str, report_type: str, source_urls, config_path: str, websocket: WebSocket):
def __init__(self, query: str, report_type: str, report_source:str, source_urls, config_path: str, websocket: WebSocket):
self.query = query
self.report_type = report_type
self.report_source = report_source
self.source_urls = source_urls
self.config_path = config_path
self.websocket = websocket

async def run(self):
# Initialize researcher
researcher = GPTResearcher(self.query, self.report_type, self.source_urls, self.config_path, self.websocket)
researcher = GPTResearcher(self.query, self.report_type, self.report_source, self.source_urls, self.config_path, self.websocket)

# Run research
await researcher.conduct_research()
Expand Down
7 changes: 4 additions & 3 deletions backend/report_type/detailed_report/detailed_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@


class DetailedReport():
def __init__(self, query: str, source_urls, config_path: str, websocket: WebSocket, subtopics=[]):
def __init__(self, query: str, report_type: str, report_source: str, source_urls, config_path: str, websocket: WebSocket, subtopics=[]):
self.query = query
self.report_type = report_type
self.report_source = report_source
self.source_urls = source_urls
self.config_path = config_path
self.websocket = websocket
self.subtopics = subtopics

# A parent task assistant. Adding research_report as default
self.main_task_assistant = GPTResearcher(self.query, "research_report", self.source_urls, self.config_path, self.websocket)

self.main_task_assistant = GPTResearcher(self.query, "research_report", self.report_source, self.source_urls, self.config_path, self.websocket)
self.existing_headers = []
# This is a global variable to store the entire context accumulated at any point through searching and scraping
self.global_context = []
Expand Down
3 changes: 2 additions & 1 deletion backend/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ async def websocket_endpoint(websocket: WebSocket):
task = json_data.get("task")
report_type = json_data.get("report_type")
filename = f"task_{int(time.time())}_{task}"
report_source = json_data.get("report_source")
if task and report_type:
report = await manager.start_streaming(task, report_type, websocket)
report = await manager.start_streaming(task, report_type, report_source, websocket)
# Saving report as pdf
pdf_path = await write_md_to_pdf(report, filename)
# Saving report as docx
Expand Down
11 changes: 6 additions & 5 deletions backend/websocket_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,24 @@ async def disconnect(self, websocket: WebSocket):
del self.sender_tasks[websocket]
del self.message_queues[websocket]

async def start_streaming(self, task, report_type, websocket):
async def start_streaming(self, task, report_type, report_source, websocket):
"""Start streaming the output."""
report = await run_agent(task, report_type, websocket)
report = await run_agent(task, report_type, report_source, websocket)
return report


async def run_agent(task, report_type, websocket):
async def run_agent(task, report_type, report_source, websocket):
"""Run the agent."""
# measure time
start_time = datetime.datetime.now()
# add customized JSON config file path here
config_path = ""
# Instead of running the agent directly run it through the different report type classes
if report_type == ReportType.DetailedReport.value:
researcher = DetailedReport(query=task, source_urls=None, config_path=config_path, websocket=websocket)
researcher = DetailedReport(query=task, report_type=report_type, report_source=report_source,
source_urls=None, config_path=config_path, websocket=websocket)
else:
researcher = BasicReport(query=task, report_type=report_type,
researcher = BasicReport(query=task, report_type=report_type, report_source=report_source,
source_urls=None, config_path=config_path, websocket=websocket)

report = await researcher.run()
Expand Down
4 changes: 3 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
version: '3'
services:
gpt-researcher:
image: assafelovic/gpt-researcher
image: assafelovic/gpt-researcher
build: ./
environment:
OPENAI_API_KEY: ${OPENAI_API_KEY}
TAVILY_API_KEY: ${TAVILY_API_KEY}
DOC_PATH: "./my-docs"
LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
ports:
- 8000:8000
35 changes: 33 additions & 2 deletions docs/docs/gpt-researcher/tailored-research.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if __name__ == "__main__":
### Specify Agent Prompt 📝

You can specify the agent prompt instruction upon which the research is conducted. This allows you to guide the research in a specific direction and tailor the report layout.
Simplay pass the prompt as the `query` argument to the `GPTResearcher` class and the "custom_report" `report_type`.
Simply pass the prompt as the `query` argument to the `GPTResearcher` class and the "custom_report" `report_type`.

```python
from gpt_researcher import GPTResearcher
Expand All @@ -48,4 +48,35 @@ if __name__ == "__main__":
```

### Research on Local Documents 📄
TBD!
You can instruct the GPT Researcher to research on local documents by providing the path to those documents. Currently supported file formats are: PDF, plain text, CSV, Excel, Markdown, PowerPoint, and Word documents.

*Step 1*: Add the env variable `DOC_PATH` pointing to the folder where your documents are located.

For example:

```bash
export DOC_PATH="./my-docs"
```

*Step 2*: When you create an instance of the `GPTResearcher` class, pass the `report_source` argument as `"local"`.

GPT Researcher will then conduct research on the provided documents.

```python
from gpt_researcher import GPTResearcher
import asyncio

async def get_report(query: str, report_type: str, report_source: str) -> str:
researcher = GPTResearcher(query=query, report_type=report_type, report_source=report_source)
await researcher.conduct_research()
report = await researcher.write_report()
return report

if __name__ == "__main__":
query = "What can you tell me about myself based on my documents?"
report_type = "research_report"
report_source = "local" # "local" or "web"

report = asyncio.run(get_report(query=query, report_type=report_type, report_source=report_source))
print(report)
```
8 changes: 7 additions & 1 deletion frontend/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ <h1 class="text-4xl font-extrabold mx-auto lg:text-7xl">
</select>
</div>
<div class="form-group">
</div>
<label for="report_source" class="agent-question">What sources would you like me to research from?</label>
<p class="text-left mt-0 pt-0" style="font-size: 0.7rem;">You can now do research on local documents as well. Please make sure to add the DOC_PATH env variable pointing to your documents folder.</p>
<select name="report_source" class="form-control" required>
<option value="web">The Web</option>
<option value="local">My Documents</option>
</select>
</div>
<input type="submit" value="Research" class="btn btn-primary button-padding">
</form>

Expand Down
2 changes: 2 additions & 0 deletions frontend/scripts.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ const GPTResearcher = (() => {
socket.onopen = (event) => {
const task = document.querySelector('input[name="task"]').value;
const report_type = document.querySelector('select[name="report_type"]').value;
const report_source = document.querySelector('select[name="report_source"]').value;
const agent = document.querySelector('input[name="agent"]:checked').value;

const requestData = {
task: task,
report_type: report_type,
report_source: report_source,
agent: agent,
};

Expand Down
8 changes: 6 additions & 2 deletions frontend/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
100% {background-position: 0% 50%;}
}

html {
scroll-behavior: smooth;
}

body {
font-family: 'Montserrat', sans-serif;
color: #fff;
Expand Down Expand Up @@ -84,9 +88,9 @@ input:hover, input:focus, select:hover, select:focus {
}

.agent_question {
font-size: 1.2rem;
font-size: 1.4rem;
font-weight: 500;
margin-bottom: 0.5rem;
margin-bottom: 0.2rem;
}

footer {
Expand Down
13 changes: 10 additions & 3 deletions gpt_researcher/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,16 @@ def __init__(self, config_file: str = None):
self.agent_role = os.getenv('AGENT_ROLE', None)
self.scraper = os.getenv("SCRAPER", "bs")
self.max_subtopics = os.getenv("MAX_SUBTOPICS", 3)

self.doc_path = os.getenv("DOC_PATH", "")

self.load_config_file()

if self.doc_path:
self.validate_doc_path()

def validate_doc_path(self):
    """Make sure the folder configured via ``DOC_PATH`` exists on disk."""
    target_folder = self.doc_path
    # ``exist_ok`` keeps repeated start-ups idempotent: an already-present
    # folder is accepted instead of raising FileExistsError.
    os.makedirs(target_folder, exist_ok=True)

def load_config_file(self) -> None:
"""Load the config file."""
Expand All @@ -39,5 +47,4 @@ def load_config_file(self) -> None:
with open(self.config_file, "r") as f:
config = json.load(f)
for key, value in config.items():
setattr(self, key.lower(), value)

setattr(self, key.lower(), value)
3 changes: 3 additions & 0 deletions gpt_researcher/document/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .document import DocumentLoader

__all__ = ['DocumentLoader']
65 changes: 65 additions & 0 deletions gpt_researcher/document/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import asyncio
import os

from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
UnstructuredCSVLoader,
UnstructuredExcelLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader
)


class DocumentLoader:
    """Load every supported document under a directory tree into raw-content dicts.

    Supported formats: PDF, plain text, Word (doc/docx), PowerPoint (pptx),
    CSV, Excel (xls/xlsx) and Markdown.
    """

    # Extension -> (loader class, extra keyword arguments).
    # Classes (not instances) are stored so that only the loader that
    # matches a given file is actually constructed.
    _LOADERS = {
        "pdf": (PyMuPDFLoader, {}),
        "txt": (TextLoader, {}),
        "doc": (UnstructuredWordDocumentLoader, {}),
        "docx": (UnstructuredWordDocumentLoader, {}),
        "pptx": (UnstructuredPowerPointLoader, {}),
        "csv": (UnstructuredCSVLoader, {"mode": "elements"}),
        "xls": (UnstructuredExcelLoader, {"mode": "elements"}),
        "xlsx": (UnstructuredExcelLoader, {"mode": "elements"}),
        "md": (UnstructuredMarkdownLoader, {}),
    }

    def __init__(self, path):
        # Root folder that is walked recursively for documents.
        self.path = path

    async def load(self) -> list:
        """Walk ``self.path`` and load all supported documents concurrently.

        Returns:
            list[dict]: one dict per non-empty page, with ``raw_content``
            (the page text) and ``url`` (the source file's base name).

        Raises:
            ValueError: if not a single document could be loaded.
        """
        tasks = []
        for root, dirs, files in os.walk(self.path):
            for file in files:
                file_path = os.path.join(root, file)
                file_name, file_extension_with_dot = os.path.splitext(file_path)
                # Normalize to a bare, lower-case extension so ".PDF" matches too.
                file_extension = file_extension_with_dot.strip(".").lower()
                tasks.append(self._load_document(file_path, file_extension))

        docs = []
        for pages in await asyncio.gather(*tasks):
            for page in pages:
                if page.page_content:
                    docs.append({
                        "raw_content": page.page_content,
                        "url": os.path.basename(page.metadata['source'])
                    })

        if not docs:
            raise ValueError("🤷 Failed to load any documents!")

        return docs

    async def _load_document(self, file_path: str, file_extension: str) -> list:
        """Load a single file; return ``[]`` for unsupported types or on error.

        Returning a list in every path is essential: the original fell
        through and returned ``None`` for unsupported extensions, which made
        ``load`` crash with a TypeError while iterating the gathered results.
        """
        entry = self._LOADERS.get(file_extension)
        if entry is None:
            # Unsupported extension — skip this file silently.
            return []
        loader_cls, loader_kwargs = entry
        try:
            return loader_cls(file_path, **loader_kwargs).load()
        except Exception as e:
            print(f"Failed to load document : {file_path}")
            print(e)
            return []
Loading

0 comments on commit 5c0f4e7

Please sign in to comment.