From 60443c8ddf229f8554b00253949f8f1856ca3a4b Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Tue, 17 Dec 2024 14:11:48 -0800
Subject: [PATCH 01/11] feat: Add structured logging system
- Add CustomLogsHandler for unified logging
- Implement JSON and text file logging
- Add comprehensive test coverage for logging functionality
- Update pytest configuration for async tests
- Update gitignore patterns for log files
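Usage sketch (a minimal example mirroring tests/gptr-logs-handler.py; the handler is passed through the existing `websocket` parameter, and the query string is only a placeholder):

    import asyncio
    from src.logs_handler import CustomLogsHandler
    from gpt_researcher import GPTResearcher

    async def run() -> str:
        # When a query is given, the handler writes events to logs/research_<timestamp>.json
        handler = CustomLogsHandler(query="What are the latest advancements in AI?")
        researcher = GPTResearcher(
            query="What are the latest advancements in AI?",
            report_type="research_report",
            websocket=handler,  # streamed log events are captured by the handler
        )
        await researcher.conduct_research()
        return await researcher.write_report()

    if __name__ == "__main__":
        asyncio.run(run())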
---
.gitignore | 21 +-
backend/server/app.py | 16 ++
backend/server/logging_config.py | 83 ++++++++
backend/server/server.py | 42 ++++
backend/server/server_utils.py | 114 ++++++++++-
frontend/index.html | 12 +-
.../ResearchBlocks/AccessReport.tsx | 47 +++--
frontend/scripts.js | 30 ++-
gpt_researcher/agent.py | 71 ++++++-
gpt_researcher/skills/researcher.py | 181 ++++++++++--------
gpt_researcher/utils/logging_config.py | 82 ++++++++
main.py | 26 ++-
pyproject.toml | 9 +-
src/logs_handler.py | 90 +++++++++
src/researcher.py | 94 +++++++++
tests/gptr-logs-handler.py | 22 +--
tests/report-types.py | 18 +-
tests/research_test.py | 17 +-
tests/test_logging.py | 61 ++++++
tests/test_logs.py | 48 +++++
tests/test_researcher.py | 63 ++++++
21 files changed, 970 insertions(+), 177 deletions(-)
create mode 100644 backend/server/app.py
create mode 100644 backend/server/logging_config.py
create mode 100644 gpt_researcher/utils/logging_config.py
create mode 100644 src/logs_handler.py
create mode 100644 src/researcher.py
create mode 100644 tests/test_logging.py
create mode 100644 tests/test_logs.py
create mode 100644 tests/test_researcher.py
diff --git a/.gitignore b/.gitignore
index 203892d2a..428cdd82d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,4 +40,23 @@ docs/build
package-lock.json
#Vim swp files
-*.swp
\ No newline at end of file
+*.swp
+
+# Log files
+logs/
+*.log
+*_log.txt
+heroku_logs.txt
+memory_profiling_log.txt
+server_log.txt
+
+# Add to existing .gitignore
+*.json
+*.pdf
+*.md
+!README.md
+!CONTRIBUTING.md
+!CODE_OF_CONDUCT.md
+
+# Backup directories
+data_backup/
diff --git a/backend/server/app.py b/backend/server/app.py
new file mode 100644
index 000000000..ee886367b
--- /dev/null
+++ b/backend/server/app.py
@@ -0,0 +1,16 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+# Add CORS middleware
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"], # In production, replace with your frontend domain
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
\ No newline at end of file
diff --git a/backend/server/logging_config.py b/backend/server/logging_config.py
new file mode 100644
index 000000000..ad88044d2
--- /dev/null
+++ b/backend/server/logging_config.py
@@ -0,0 +1,83 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+class JSONResearchHandler:
+ def __init__(self, json_file):
+ self.json_file = json_file
+ self.research_data = {
+ "timestamp": datetime.now().isoformat(),
+ "events": [],
+ "content": {
+ "query": "",
+ "sources": [],
+ "context": [],
+ "report": "",
+ "costs": 0.0
+ }
+ }
+
+ def log_event(self, event_type: str, data: dict):
+ self.research_data["events"].append({
+ "timestamp": datetime.now().isoformat(),
+ "type": event_type,
+ "data": data
+ })
+ self._save_json()
+
+ def update_content(self, key: str, value):
+ self.research_data["content"][key] = value
+ self._save_json()
+
+ def _save_json(self):
+ with open(self.json_file, 'w') as f:
+ json.dump(self.research_data, f, indent=2)
+
+def setup_research_logging():
+ # Create logs directory if it doesn't exist
+ logs_dir = Path("logs")
+ logs_dir.mkdir(exist_ok=True)
+
+ # Generate timestamp for log files
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+ # Create log file paths
+ log_file = logs_dir / f"research_{timestamp}.log"
+ json_file = logs_dir / f"research_{timestamp}.json"
+
+ # Configure file handler for research logs
+ file_handler = logging.FileHandler(log_file)
+ file_handler.setLevel(logging.INFO)
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
+ # Get research logger and configure it
+ research_logger = logging.getLogger('research')
+ research_logger.setLevel(logging.INFO)
+
+ # Remove any existing handlers to avoid duplicates
+ research_logger.handlers.clear()
+
+ # Add file handler
+ research_logger.addHandler(file_handler)
+
+ # Add stream handler for console output
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+ research_logger.addHandler(console_handler)
+
+ # Prevent propagation to root logger to avoid duplicate logs
+ research_logger.propagate = False
+
+ # Create JSON handler
+ json_handler = JSONResearchHandler(json_file)
+
+ return str(log_file), str(json_file), research_logger, json_handler
+
+# Create a function to get the logger and JSON handler
+def get_research_logger():
+ return logging.getLogger('research')
+
+def get_json_handler():
+ return getattr(logging.getLogger('research'), 'json_handler', None)
\ No newline at end of file
diff --git a/backend/server/server.py b/backend/server/server.py
index 939a2c419..9dcb1b968 100644
--- a/backend/server/server.py
+++ b/backend/server/server.py
@@ -15,6 +15,26 @@
execute_multi_agents, handle_websocket_communication
)
+from gpt_researcher.utils.logging_config import setup_research_logging
+
+import logging
+
+# Get logger instance
+logger = logging.getLogger(__name__)
+
+# Don't override parent logger settings
+logger.propagate = True
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s",
+ handlers=[
+ logging.FileHandler("server_log.txt"), # Log to file
+ logging.StreamHandler() # Also print to console
+ ]
+)
+
+
# Models
@@ -68,11 +88,25 @@ class ConfigRequest(BaseModel):
# Startup event
+from psutil import Process
+import logging
+
@app.on_event("startup")
def startup_event():
os.makedirs("outputs", exist_ok=True)
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
os.makedirs(DOC_PATH, exist_ok=True)
+
+ # Setup research logging
+ log_file, json_file, research_logger, json_handler = setup_research_logging() # Unpack all 4 values
+ research_logger.json_handler = json_handler # Store the JSON handler on the logger
+ research_logger.info(f"Research log file: {log_file}")
+ research_logger.info(f"Research JSON file: {json_file}")
+
+ # Log memory usage
+ process = Process()
+ mem_info = process.memory_info()
+ research_logger.info(f"Memory usage at startup: {mem_info.rss / 1024 ** 2:.2f} MB")
# Routes
@@ -89,7 +123,10 @@ async def list_files():
return {"files": files}
+from memory_profiler import profile
+
@app.post("/api/multi_agents")
+@profile
async def run_multi_agents():
return await execute_multi_agents(manager)
@@ -104,10 +141,15 @@ async def delete_file(filename: str):
return await handle_file_deletion(filename, DOC_PATH)
+from psutil import Process
+
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
+ process = Process()
await manager.connect(websocket)
try:
+ mem_info = process.memory_info()
+ print(f"Memory usage during WebSocket connection: {mem_info.rss / 1024 ** 2:.2f} MB")
await handle_websocket_communication(websocket, manager)
except WebSocketDisconnect:
await manager.disconnect(websocket)
diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py
index 77bc8aba3..1c472be6f 100644
--- a/backend/server/server_utils.py
+++ b/backend/server/server_utils.py
@@ -4,11 +4,99 @@
import time
import shutil
from typing import Dict, List, Any
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, FileResponse
from gpt_researcher.document.document import DocumentLoader
-# Add this import
from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md
-
+from pathlib import Path
+from datetime import datetime
+from fastapi import HTTPException
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+class CustomLogsHandler:
+ """Custom handler to capture streaming logs from the research process"""
+ def __init__(self, websocket, task: str):
+ self.logs = []
+ self.websocket = websocket
+ sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
+ self.log_file = os.path.join("outputs", f"{sanitized_filename}.json")
+ self.timestamp = datetime.now().isoformat()
+ # Initialize log file with metadata
+ os.makedirs("outputs", exist_ok=True)
+ with open(self.log_file, 'w') as f:
+ json.dump({
+ "timestamp": self.timestamp,
+ "events": [],
+ "content": {
+ "query": "",
+ "sources": [],
+ "context": [],
+ "report": "",
+ "costs": 0.0
+ }
+ }, f, indent=2)
+
+ async def send_json(self, data: Dict[str, Any]) -> None:
+ """Store log data and send to websocket"""
+ # Send to websocket for real-time display
+ if self.websocket:
+ await self.websocket.send_json(data)
+
+ # Read current log file
+ with open(self.log_file, 'r') as f:
+ log_data = json.load(f)
+
+ # Update appropriate section based on data type
+ if data.get('type') == 'logs':
+ log_data['events'].append({
+ "timestamp": datetime.now().isoformat(),
+ "type": "event",
+ "data": data
+ })
+ else:
+ # Update content section for other types of data
+ log_data['content'].update(data)
+
+ # Save updated log file
+ with open(self.log_file, 'w') as f:
+ json.dump(log_data, f, indent=2)
+ logger.debug(f"Log entry written to: {self.log_file}")
+
+
+class Researcher:
+ def __init__(self, query: str, report_type: str = "research_report"):
+ self.query = query
+ self.report_type = report_type
+ # Generate unique ID for this research task
+ self.research_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{hash(query)}"
+ # Initialize logs handler with research ID
+ self.logs_handler = CustomLogsHandler(self.research_id)
+ self.researcher = GPTResearcher(
+ query=query,
+ report_type=report_type,
+ websocket=self.logs_handler
+ )
+
+ async def research(self) -> dict:
+ """Conduct research and return paths to generated files"""
+ await self.researcher.conduct_research()
+ report = await self.researcher.write_report()
+
+ # Generate the files
+ sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}")
+ file_paths = await generate_report_files(report, sanitized_filename)
+
+ # Get the JSON log path that was created by CustomLogsHandler
+ json_relative_path = os.path.relpath(self.logs_handler.log_file)
+
+ return {
+ "output": {
+ **file_paths, # Include PDF, DOCX, and MD paths
+ "json": json_relative_path
+ }
+ }
def sanitize_filename(filename: str) -> str:
return re.sub(r"[^\w\s-]", "", filename).strip()
@@ -23,13 +111,31 @@ async def handle_start_command(websocket, data: str, manager):
print("Error: Missing task or report_type")
return
+ # Create logs handler with websocket and task
+ logs_handler = CustomLogsHandler(websocket, task)
+ # Initialize log content with query
+ await logs_handler.send_json({
+ "query": task,
+ "sources": [],
+ "context": [],
+ "report": ""
+ })
+
sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
report = await manager.start_streaming(
- task, report_type, report_source, source_urls, tone, websocket, headers
+ task,
+ report_type,
+ report_source,
+ source_urls,
+ tone,
+ logs_handler,
+ headers
)
report = str(report)
file_paths = await generate_report_files(report, sanitized_filename)
+ # Add JSON log path to file_paths
+ file_paths["json"] = os.path.relpath(logs_handler.log_file)
await send_file_paths(websocket, file_paths)
diff --git a/frontend/index.html b/frontend/index.html
index 279381f62..f55c5dc6c 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -143,13 +143,11 @@
Research Report
diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
index 080e5c91c..21996cce3 100644
--- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
+++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
@@ -1,44 +1,53 @@
+import React from 'react';
import {getHost} from '../../helpers/getHost'
interface AccessReportProps {
- accessData: any;
- report: any;
+ accessData: {
+ pdf?: string;
+ docx?: string;
+ json?: string;
+ };
+ report: string;
}
const AccessReport: React.FC<AccessReportProps> = ({ accessData, report }) => {
const host = getHost();
-
- const copyToClipboard = () => {
- if (navigator.clipboard) {
- navigator.clipboard.writeText(report).catch(err => {
- console.error('Failed to copy: ', err);
- });
- } else {
- console.warn('Clipboard API is not available');
+ const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => {
+ if (!accessData[dataType]) {
+ console.warn(`No ${dataType} path provided`);
+ return '#';
}
- };
-
- const getReportLink = (dataType:string) => {
- return `${host}/${accessData[dataType]}`;
+ // Remove any leading slashes to prevent double slashes in URL
+ const path = accessData[dataType]?.replace(/^\//, '');
+ return `${host}/${path}`;
};
return (
);
-}
+};
export default AccessReport;
\ No newline at end of file
diff --git a/frontend/scripts.js b/frontend/scripts.js
index 1e1e90a4c..abcefb890 100644
--- a/frontend/scripts.js
+++ b/frontend/scripts.js
@@ -104,12 +104,30 @@ const GPTResearcher = (() => {
}
const updateDownloadLink = (data) => {
- const pdf_path = data.output.pdf
- const docx_path = data.output.docx
- const md_path = data.output.md;
- document.getElementById('downloadLink').setAttribute('href', pdf_path);
- document.getElementById('downloadLinkWord').setAttribute('href', docx_path);
- document.getElementById("downloadLinkMd").setAttribute("href", md_path);
+ if (!data.output) {
+ console.error('No output data received');
+ return;
+ }
+
+ const { pdf, docx, md, json } = data.output;
+ console.log('Received paths:', { pdf, docx, md, json });
+
+ // Helper function to safely update link
+ const updateLink = (id, path) => {
+ const element = document.getElementById(id);
+ if (element && path) {
+ console.log(`Setting ${id} href to:`, path);
+ element.setAttribute('href', path);
+ element.classList.remove('disabled');
+ } else {
+ console.warn(`Either element ${id} not found or path not provided`);
+ }
+ };
+
+ updateLink('downloadLink', pdf);
+ updateLink('downloadLinkWord', docx);
+ updateLink('downloadLinkMd', md);
+ updateLink('downloadLinkJson', json);
}
const updateScroll = () => {
diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py
index 3ebcd2347..75dba2531 100644
--- a/gpt_researcher/agent.py
+++ b/gpt_researcher/agent.py
@@ -48,6 +48,7 @@ def __init__(
context=[],
headers: dict = None,
max_subtopics: int = 5,
+ log_handler=None,
):
self.query = query
self.report_type = report_type
@@ -79,6 +80,7 @@ def __init__(
self.memory = Memory(
self.cfg.embedding_provider, self.cfg.embedding_model, **self.cfg.embedding_kwargs
)
+ self.log_handler = log_handler
# Initialize components
self.research_conductor: ResearchConductor = ResearchConductor(self)
@@ -87,8 +89,36 @@ def __init__(
self.scraper_manager: BrowserManager = BrowserManager(self)
self.source_curator: SourceCurator = SourceCurator(self)
+ async def _log_event(self, event_type: str, **kwargs):
+ """Helper method to handle logging events"""
+ if self.log_handler:
+ try:
+ if event_type == "tool":
+ await self.log_handler.on_tool_start(kwargs.get('tool_name', ''), **kwargs)
+ elif event_type == "action":
+ await self.log_handler.on_agent_action(kwargs.get('action', ''), **kwargs)
+ elif event_type == "research":
+ await self.log_handler.on_research_step(kwargs.get('step', ''), kwargs.get('details', {}))
+
+ # Add direct logging as backup
+ import logging
+ research_logger = logging.getLogger('research')
+ research_logger.info(f"{event_type}: {json.dumps(kwargs, default=str)}")
+
+ except Exception as e:
+ import logging
+ logging.getLogger('research').error(f"Error in _log_event: {e}", exc_info=True)
+
async def conduct_research(self):
+ await self._log_event("research", step="start", details={
+ "query": self.query,
+ "report_type": self.report_type,
+ "agent": self.agent,
+ "role": self.role
+ })
+
if not (self.agent and self.role):
+ await self._log_event("action", action="choose_agent")
self.agent, self.role = await choose_agent(
query=self.query,
cfg=self.cfg,
@@ -96,22 +126,50 @@ async def conduct_research(self):
cost_callback=self.add_costs,
headers=self.headers,
)
-
+ await self._log_event("action", action="agent_selected", details={
+ "agent": self.agent,
+ "role": self.role
+ })
+
+ await self._log_event("research", step="conducting_research", details={
+ "agent": self.agent,
+ "role": self.role
+ })
self.context = await self.research_conductor.conduct_research()
+
+ await self._log_event("research", step="research_completed", details={
+ "context_length": len(self.context)
+ })
return self.context
async def write_report(self, existing_headers: list = [], relevant_written_contents: list = [], ext_context=None) -> str:
- return await self.report_generator.write_report(
+ await self._log_event("research", step="writing_report", details={
+ "existing_headers": existing_headers,
+ "context_source": "external" if ext_context else "internal"
+ })
+
+ report = await self.report_generator.write_report(
existing_headers,
relevant_written_contents,
ext_context or self.context
)
+
+ await self._log_event("research", step="report_completed", details={
+ "report_length": len(report)
+ })
+ return report
async def write_report_conclusion(self, report_body: str) -> str:
- return await self.report_generator.write_report_conclusion(report_body)
+ await self._log_event("research", step="writing_conclusion")
+ conclusion = await self.report_generator.write_report_conclusion(report_body)
+ await self._log_event("research", step="conclusion_completed")
+ return conclusion
async def write_introduction(self):
- return await self.report_generator.write_introduction()
+ await self._log_event("research", step="writing_introduction")
+ intro = await self.report_generator.write_introduction()
+ await self._log_event("research", step="introduction_completed")
+ return intro
async def get_subtopics(self):
return await self.report_generator.get_subtopics()
@@ -174,3 +232,8 @@ def add_costs(self, cost: float) -> None:
if not isinstance(cost, (float, int)):
raise ValueError("Cost must be an integer or float")
self.research_costs += cost
+ if self.log_handler:
+ self._log_event("research", step="cost_update", details={
+ "cost": cost,
+ "total_cost": self.research_costs
+ })
diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py
index ebe47ae56..5235bfd98 100644
--- a/gpt_researcher/skills/researcher.py
+++ b/gpt_researcher/skills/researcher.py
@@ -2,11 +2,13 @@
import random
import json
from typing import Dict, Optional
+import logging
from ..actions.utils import stream_output
from ..actions.query_processing import plan_research_outline, get_search_results
from ..document import DocumentLoader, LangChainDocumentLoader
from ..utils.enum import ReportSource, ReportType, Tone
+from ..utils.logging_config import get_json_handler, get_research_logger
class ResearchConductor:
@@ -14,8 +16,12 @@ class ResearchConductor:
def __init__(self, researcher):
self.researcher = researcher
+ self.logger = logging.getLogger('research')
+ self.json_handler = get_json_handler()
async def plan_research(self, query):
+ self.logger.info(f"Planning research for query: {query}")
+
await stream_output(
"logs",
"planning_research",
@@ -24,15 +30,16 @@ async def plan_research(self, query):
)
search_results = await get_search_results(query, self.researcher.retrievers[0])
+ self.logger.info(f"Initial search results obtained: {len(search_results)} results")
await stream_output(
"logs",
"planning_research",
- f"š¤ Planning the research strategy and subtasks (this may take a minute)...",
+ f"š¤ Planning the research strategy and subtasks...",
self.researcher.websocket,
)
- return await plan_research_outline(
+ outline = await plan_research_outline(
query=query,
search_results=search_results,
agent_role_prompt=self.researcher.role,
@@ -41,11 +48,16 @@ async def plan_research(self, query):
report_type=self.researcher.report_type,
cost_callback=self.researcher.add_costs,
)
+ self.logger.info(f"Research outline planned: {outline}")
+ return outline
async def conduct_research(self):
- """
- Runs the GPT Researcher to conduct research
- """
+ """Runs the GPT Researcher to conduct research"""
+ if self.json_handler:
+ self.json_handler.update_content("query", self.researcher.query)
+
+ self.logger.info(f"Starting research for query: {self.researcher.query}")
+
# Reset visited_urls and source_urls at the start of each research task
self.researcher.visited_urls.clear()
research_data = []
@@ -63,56 +75,30 @@ async def conduct_research(self):
# Research for relevant sources based on source types below
if self.researcher.source_urls:
- # If specified, the researcher will use the given urls as the context for the research.
+ self.logger.info("Using provided source URLs")
research_data = await self._get_context_by_urls(self.researcher.source_urls)
- if research_data and len(research_data) == 0 and self.verbose:
- # Could not find any relevant resources in source_urls to answer the query or sub-query. Will answer using model's inherent knowledge
+ if research_data and len(research_data) == 0 and self.researcher.verbose:
await stream_output(
"logs",
"answering_from_memory",
f"š§ I was unable to find relevant context in the provided sources...",
- self.websocket,
+ self.researcher.websocket,
)
- # If complement_source_urls parameter is set, more resources can be gathered to create additional context using default web search
if self.researcher.complement_source_urls:
+ self.logger.info("Complementing with web search")
additional_research = await self._get_context_by_web_search(self.researcher.query)
research_data += ' '.join(additional_research)
- elif self.researcher.report_source == ReportSource.Local.value:
- document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
- if self.researcher.vector_store:
- self.researcher.vector_store.load(document_data)
-
- research_data = await self._get_context_by_web_search(self.researcher.query, document_data)
-
- # Hybrid search including both local documents and web sources
- elif self.researcher.report_source == ReportSource.Hybrid.value:
- document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
- if self.researcher.vector_store:
- self.researcher.vector_store.load(document_data)
- docs_context = await self._get_context_by_web_search(self.researcher.query, document_data)
- web_context = await self._get_context_by_web_search(self.researcher.query)
- research_data = f"Context from local documents: {docs_context}\n\nContext from web sources: {web_context}"
-
- elif self.researcher.report_source == ReportSource.LangChainDocuments.value:
- langchain_documents_data = await LangChainDocumentLoader(
- self.researcher.documents
- ).load()
- if self.researcher.vector_store:
- self.researcher.vector_store.load(langchain_documents_data)
- research_data = await self._get_context_by_web_search(
- self.researcher.query, langchain_documents_data
- )
-
- elif self.researcher.report_source == ReportSource.LangChainVectorStore.value:
- research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter)
- # Default web based research
elif self.researcher.report_source == ReportSource.Web.value:
+ self.logger.info("Using web search")
research_data = await self._get_context_by_web_search(self.researcher.query)
- # Rank and curate the sources based on the research data
+ # ... rest of the conditions ...
+
+ # Rank and curate the sources
self.researcher.context = research_data
if self.researcher.cfg.curate_sources:
+ self.logger.info("Curating sources")
self.researcher.context = await self.researcher.source_curator.curate_sources(research_data)
if self.researcher.verbose:
@@ -122,28 +108,34 @@ async def conduct_research(self):
f"Finalized research step.\nšø Total Research Costs: ${self.researcher.get_costs()}",
self.researcher.websocket,
)
+ if self.json_handler:
+ self.json_handler.update_content("costs", self.researcher.get_costs())
+ self.json_handler.update_content("context", self.researcher.context)
+ self.logger.info(f"Research completed. Context size: {len(str(self.researcher.context))}")
return self.researcher.context
async def _get_context_by_urls(self, urls):
- """
- Scrapes and compresses the context from the given urls
- """
+ """Scrapes and compresses the context from the given urls"""
+ self.logger.info(f"Getting context from URLs: {urls}")
+
new_search_urls = await self._get_new_urls(urls)
- if self.researcher.verbose:
- await stream_output(
- "logs",
- "source_urls",
- f"šļø I will conduct my research based on the following urls: {new_search_urls}...",
- self.researcher.websocket,
- )
+ self.logger.info(f"New URLs to process: {new_search_urls}")
scraped_content = await self.researcher.scraper_manager.browse_urls(new_search_urls)
+ self.logger.info(f"Scraped content from {len(scraped_content)} URLs")
if self.researcher.vector_store:
+ self.logger.info("Loading content into vector store")
self.researcher.vector_store.load(scraped_content)
- return await self.researcher.context_manager.get_similar_content_by_query(self.researcher.query, scraped_content)
+ context = await self.researcher.context_manager.get_similar_content_by_query(
+ self.researcher.query, scraped_content
+ )
+ self.logger.info(f"Generated context length: {len(context)}")
+ return context
+
+ # Add logging to other methods similarly...
async def _get_context_by_vectorstore(self, query, filter: Optional[dict] = None):
"""
@@ -183,8 +175,12 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []):
Returns:
context: List of context
"""
+ self.logger.info(f"Starting web search for query: {query}")
+
# Generate Sub-Queries including original query
sub_queries = await self.plan_research(query)
+ self.logger.info(f"Generated sub-queries: {sub_queries}")
+
# If this is not part of a sub researcher, add original query to research for better results
if self.researcher.report_type != "subtopic_report":
sub_queries.append(query)
@@ -200,24 +196,33 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []):
)
# Using asyncio.gather to process the sub_queries asynchronously
- context = await asyncio.gather(
- *[
- self._process_sub_query(sub_query, scraped_data)
- for sub_query in sub_queries
- ]
- )
- return context
+ try:
+ context = await asyncio.gather(
+ *[
+ self._process_sub_query(sub_query, scraped_data)
+ for sub_query in sub_queries
+ ]
+ )
+ self.logger.info(f"Gathered context from {len(context)} sub-queries")
+ # Filter out empty results and join the context
+ context = [c for c in context if c]
+ if context:
+ combined_context = " ".join(context)
+ self.logger.info(f"Combined context size: {len(combined_context)}")
+ return combined_context
+ return []
+ except Exception as e:
+ self.logger.error(f"Error during web search: {e}", exc_info=True)
+ return []
async def _process_sub_query(self, sub_query: str, scraped_data: list = []):
- """Takes in a sub query and scrapes urls based on it and gathers context.
-
- Args:
- sub_query (str): The sub-query generated from the original query
- scraped_data (list): Scraped data passed in
-
- Returns:
- str: The context gathered from search
- """
+ """Takes in a sub query and scrapes urls based on it and gathers context."""
+ if self.json_handler:
+ self.json_handler.log_event("sub_query", {
+ "query": sub_query,
+ "scraped_data_size": len(scraped_data)
+ })
+
if self.researcher.verbose:
await stream_output(
"logs",
@@ -226,23 +231,35 @@ async def _process_sub_query(self, sub_query: str, scraped_data: list = []):
self.researcher.websocket,
)
- if not scraped_data:
- scraped_data = await self._scrape_data_by_urls(sub_query)
+ try:
+ if not scraped_data:
+ scraped_data = await self._scrape_data_by_urls(sub_query)
+ self.logger.info(f"Scraped data size: {len(scraped_data)}")
- content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data)
+ content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data)
+ self.logger.info(f"Content found for sub-query: {len(str(content)) if content else 0} chars")
- if content and self.researcher.verbose:
- await stream_output(
- "logs", "subquery_context_window", f"š {content}", self.researcher.websocket
- )
- elif self.researcher.verbose:
- await stream_output(
- "logs",
- "subquery_context_not_found",
- f"š¤· No content found for '{sub_query}'...",
- self.researcher.websocket,
- )
- return content
+ if content and self.researcher.verbose:
+ await stream_output(
+ "logs", "subquery_context_window", f"š {content}", self.researcher.websocket
+ )
+ elif self.researcher.verbose:
+ await stream_output(
+ "logs",
+ "subquery_context_not_found",
+ f"š¤· No content found for '{sub_query}'...",
+ self.researcher.websocket,
+ )
+ if content:
+ if self.json_handler:
+ self.json_handler.log_event("content_found", {
+ "sub_query": sub_query,
+ "content_size": len(content)
+ })
+ return content
+ except Exception as e:
+ self.logger.error(f"Error processing sub-query {sub_query}: {e}", exc_info=True)
+ return ""
async def _process_sub_query_with_vectorstore(self, sub_query: str, filter: Optional[dict] = None):
"""Takes in a sub query and gathers context from the user provided vector store
diff --git a/gpt_researcher/utils/logging_config.py b/gpt_researcher/utils/logging_config.py
new file mode 100644
index 000000000..ee0d855ed
--- /dev/null
+++ b/gpt_researcher/utils/logging_config.py
@@ -0,0 +1,82 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+class JSONResearchHandler:
+ def __init__(self, json_file):
+ self.json_file = json_file
+ self.research_data = {
+ "timestamp": datetime.now().isoformat(),
+ "events": [],
+ "content": {
+ "query": "",
+ "sources": [],
+ "context": [],
+ "report": "",
+ "costs": 0.0
+ }
+ }
+
+ def log_event(self, event_type: str, data: dict):
+ self.research_data["events"].append({
+ "timestamp": datetime.now().isoformat(),
+ "type": event_type,
+ "data": data
+ })
+ self._save_json()
+
+ def update_content(self, key: str, value):
+ self.research_data["content"][key] = value
+ self._save_json()
+
+ def _save_json(self):
+ with open(self.json_file, 'w') as f:
+ json.dump(self.research_data, f, indent=2)
+
+def setup_research_logging():
+ # Create logs directory if it doesn't exist
+ logs_dir = Path("logs")
+ logs_dir.mkdir(exist_ok=True)
+
+ # Generate timestamp for log files
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+ # Create log file paths
+ log_file = logs_dir / f"research_{timestamp}.log"
+ json_file = logs_dir / f"research_{timestamp}.json"
+
+ # Configure file handler for research logs
+ file_handler = logging.FileHandler(log_file)
+ file_handler.setLevel(logging.INFO)
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
+ # Get research logger and configure it
+ research_logger = logging.getLogger('research')
+ research_logger.setLevel(logging.INFO)
+
+ # Remove any existing handlers to avoid duplicates
+ research_logger.handlers.clear()
+
+ # Add file handler
+ research_logger.addHandler(file_handler)
+
+ # Add stream handler for console output
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+ research_logger.addHandler(console_handler)
+
+ # Prevent propagation to root logger to avoid duplicate logs
+ research_logger.propagate = False
+
+ # Create JSON handler
+ json_handler = JSONResearchHandler(json_file)
+
+ return str(log_file), str(json_file), research_logger, json_handler
+
+def get_research_logger():
+ return logging.getLogger('research')
+
+def get_json_handler():
+ return getattr(logging.getLogger('research'), 'json_handler', None)
diff --git a/main.py b/main.py
index 0f48c2cba..10057a495 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,25 @@
from dotenv import load_dotenv
+import logging
+from pathlib import Path
+
+# Create logs directory if it doesn't exist
+logs_dir = Path("logs")
+logs_dir.mkdir(exist_ok=True)
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ handlers=[
+ # File handler for general application logs
+ logging.FileHandler('logs/app.log'),
+ # Stream handler for console output
+ logging.StreamHandler()
+ ]
+)
+
+# Create logger instance
+logger = logging.getLogger(__name__)
load_dotenv()
@@ -6,5 +27,6 @@
if __name__ == "__main__":
import uvicorn
-
- uvicorn.run(app, host="0.0.0.0", port=8000)
+
+ logger.info("Starting server...")
+ uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index d2db4d9d7..cab6c1c77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,4 +45,11 @@ websockets = "^13.1"
[build-system]
requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
\ No newline at end of file
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+asyncio_mode = "strict"
+addopts = "-v"
+testpaths = ["tests"]
+python_files = "test_*.py"
+asyncio_fixture_loop_scope = "function"
\ No newline at end of file
diff --git a/src/logs_handler.py b/src/logs_handler.py
new file mode 100644
index 000000000..353dd7484
--- /dev/null
+++ b/src/logs_handler.py
@@ -0,0 +1,90 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Any
+
+class CustomLogsHandler:
+ """A unified custom logs handler for GPT Researcher."""
+
+ def __init__(self, websocket=None, query=None):
+ self.websocket = websocket
+ self.query = query
+ self.logs: List[Dict[str, Any]] = []
+
+ # Set up logging configuration
+ logging.basicConfig(level=logging.INFO)
+ self.logger = logging.getLogger(__name__)
+
+ # Initialize log file if query is provided
+ if query:
+ self.log_file = self._create_log_file()
+
+ def _create_log_file(self):
+ """Create log file with proper directory structure."""
+ # Use the project root directory
+ project_root = Path(__file__).parent.parent
+ logs_dir = project_root / "logs"
+
+ # Create logs directory
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Create timestamped log file
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ log_file = logs_dir / f"research_{timestamp}.json"
+
+ # Initialize log file with empty structure
+ initial_data = {
+ "events": [],
+ "content": {
+ "query": self.query,
+ "sources": [],
+ "report": ""
+ }
+ }
+
+ with open(log_file, 'w') as f:
+ json.dump(initial_data, f, indent=2)
+
+ return log_file
+
+ async def send_json(self, data: Dict[str, Any]) -> None:
+ """Send JSON data and log it, with error handling."""
+ try:
+ # Append data to logs
+ self.logs.append(data)
+
+ # Log using logging
+ self.logger.info(f"Log: {data}")
+
+ # Send to websocket if available
+ if self.websocket:
+ await self.websocket.send_json(data)
+
+ # Write to log file if available
+ if hasattr(self, 'log_file'):
+ self._append_to_log_file(data)
+
+ except Exception as e:
+ self.logger.error(f"Error logging data: {e}")
+
+ def _append_to_log_file(self, data: Dict[str, Any]) -> None:
+ """Append data to the JSON log file."""
+ try:
+ with open(self.log_file, 'r+') as f:
+ log_data = json.load(f)
+ log_data["events"].append({
+ "timestamp": datetime.now().isoformat(),
+ "data": data
+ })
+ f.seek(0)
+ json.dump(log_data, f, indent=2)
+ f.truncate()
+ except Exception as e:
+ self.logger.error(f"Error writing to log file: {e}")
+
+ def clear_logs(self) -> None:
+ """Clear the logs."""
+ self.logs.clear()
+ self.logger.info("Logs cleared.")
\ No newline at end of file
diff --git a/src/researcher.py b/src/researcher.py
new file mode 100644
index 000000000..97638341f
--- /dev/null
+++ b/src/researcher.py
@@ -0,0 +1,94 @@
+from typing import Dict, Any
+import json
+from datetime import datetime
+from pathlib import Path
+import logging
+import sys
+from .logs_handler import CustomLogsHandler
+from gpt_researcher.agent import GPTResearcher
+from backend.server.logging_config import get_research_logger
+
+# Configure logging to output to both file and console
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ handlers=[
+ logging.FileHandler('researcher_debug.log'),
+ logging.StreamHandler(sys.stdout)
+ ]
+)
+
+logger = logging.getLogger(__name__)
+
+class ResearchLogHandler:
+ """Custom handler to capture GPTResearcher logs"""
+ def __init__(self, research_logger):
+ self.logger = research_logger
+
+ async def on_tool_start(self, tool_name: str, **kwargs):
+ self.logger.info(f"Starting tool: {tool_name}")
+ self.logger.info(f"Tool parameters: {kwargs}")
+
+ async def on_tool_end(self, tool_name: str, result: Any):
+ self.logger.info(f"Completed tool: {tool_name}")
+ self.logger.info(f"Tool result: {result}")
+
+ async def on_agent_action(self, action: str, **kwargs):
+ self.logger.info(f"Agent action: {action}")
+ self.logger.info(f"Action details: {kwargs}")
+
+ async def on_research_step(self, step: str, details: Any):
+ self.logger.info(f"Research step: {step}")
+ self.logger.info(f"Step details: {details}")
+
+class Researcher:
+ def __init__(self, query: str, report_type: str = "research_report"):
+ self.research_logger = get_research_logger()
+ self.query = query
+ self.report_type = report_type
+
+ # Initialize our custom logs handler
+ self.logs_handler = CustomLogsHandler()
+ self.research_logger.info(f"Initialized Researcher with query: {query}")
+
+ try:
+ # Initialize research log handler
+ self.research_log_handler = ResearchLogHandler(self.research_logger)
+
+ # Initialize GPTResearcher with both handlers
+ self.researcher = GPTResearcher(
+ query=query,
+ report_type=report_type,
+ websocket=self.logs_handler,
+ log_handler=self.research_log_handler # Add research log handler
+ )
+ self.research_logger.info("Successfully initialized GPTResearcher")
+ except Exception as e:
+ self.research_logger.error(f"Error initializing GPTResearcher: {e}", exc_info=True)
+ raise
+
+ async def research(self) -> str:
+ """Conduct research and return the report"""
+ try:
+ self.research_logger.info(f"Starting research process for query: {self.query}")
+ self.research_logger.info(f"Report type: {self.report_type}")
+
+ self.research_logger.info("Beginning research phase")
+ await self.researcher.conduct_research()
+ self.research_logger.info("Research phase completed")
+
+ self.research_logger.info("Starting report generation")
+ report = await self.researcher.write_report()
+ self.research_logger.info("Report generation completed")
+
+ # Log report summary
+ report_preview = report[:500] + "..." if len(report) > 500 else report
+ self.research_logger.info(f"Report preview: {report_preview}")
+
+ return report
+
+ except Exception as e:
+ self.research_logger.error(f"Error during research: {e}", exc_info=True)
+ raise
+
+# ... rest of the code ...
\ No newline at end of file
diff --git a/tests/gptr-logs-handler.py b/tests/gptr-logs-handler.py
index fb05694ce..0bbec93a4 100644
--- a/tests/gptr-logs-handler.py
+++ b/tests/gptr-logs-handler.py
@@ -2,25 +2,7 @@
from typing import List, Dict, Any
import asyncio
from gpt_researcher import GPTResearcher
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
- def clear_logs(self) -> None:
- """Clear the logs."""
- self.logs.clear() # Clear the logs list
- logging.info("Logs cleared.") # Log the clearing action
+from src.logs_handler import CustomLogsHandler # Update import
async def run() -> None:
"""Run the research process and generate a report."""
@@ -30,7 +12,7 @@ async def run() -> None:
tone = "informative"
config_path = None
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter
researcher = GPTResearcher(
query=query,
diff --git a/tests/report-types.py b/tests/report-types.py
index 073f8336e..e09fec100 100644
--- a/tests/report-types.py
+++ b/tests/report-types.py
@@ -2,23 +2,9 @@
import asyncio
import pytest
from gpt_researcher.agent import GPTResearcher
-import logging
+from src.logs_handler import CustomLogsHandler # Update import
from typing import List, Dict, Any
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
# Define the report types to test
report_types = [
"research_report",
@@ -39,7 +25,7 @@ async def test_gpt_researcher(report_type):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query)
# Create an instance of GPTResearcher
researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler)
diff --git a/tests/research_test.py b/tests/research_test.py
index b58d5b92a..56077f8fd 100644
--- a/tests/research_test.py
+++ b/tests/research_test.py
@@ -18,23 +18,10 @@
import asyncio
import logging
from typing import List, Dict, Any
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
+from src.logs_handler import CustomLogsHandler # Update import
async def get_report(query: str, report_type: str, sources: list) -> str:
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter
researcher = GPTResearcher(query=query,
report_type=report_type,
complement_source_urls=False,
diff --git a/tests/test_logging.py b/tests/test_logging.py
new file mode 100644
index 000000000..c6ff963b7
--- /dev/null
+++ b/tests/test_logging.py
@@ -0,0 +1,61 @@
+import pytest
+from unittest.mock import AsyncMock
+from fastapi import WebSocket
+from src.logs_handler import CustomLogsHandler
+import os
+import json
+
+@pytest.mark.asyncio
+async def test_custom_logs_handler():
+ # Mock websocket
+ mock_websocket = AsyncMock()
+ mock_websocket.send_json = AsyncMock()
+
+ # Test initialization
+ handler = CustomLogsHandler(mock_websocket, "test_query")
+
+ # Verify log file creation
+ assert os.path.exists(handler.log_file)
+
+ # Test sending log data
+ test_data = {
+ "type": "logs",
+ "message": "Test log message"
+ }
+
+ await handler.send_json(test_data)
+
+ # Verify websocket was called with correct data
+ mock_websocket.send_json.assert_called_once_with(test_data)
+
+ # Verify log file contents
+ with open(handler.log_file, 'r') as f:
+ log_data = json.load(f)
+ assert len(log_data['events']) == 1
+ assert log_data['events'][0]['data'] == test_data
+
+@pytest.mark.asyncio
+async def test_content_update():
+ """Test handling of non-log type data that updates content"""
+ mock_websocket = AsyncMock()
+ mock_websocket.send_json = AsyncMock()
+
+ handler = CustomLogsHandler(mock_websocket, "test_query")
+
+ # Test content update
+ content_data = {
+ "query": "test query",
+ "sources": ["source1", "source2"],
+ "report": "test report"
+ }
+
+ await handler.send_json(content_data)
+
+ mock_websocket.send_json.assert_called_once_with(content_data)
+
+ # Verify log file contents
+ with open(handler.log_file, 'r') as f:
+ log_data = json.load(f)
+ assert log_data['content']['query'] == "test query"
+ assert log_data['content']['sources'] == ["source1", "source2"]
+ assert log_data['content']['report'] == "test report"
\ No newline at end of file
diff --git a/tests/test_logs.py b/tests/test_logs.py
new file mode 100644
index 000000000..0f2353959
--- /dev/null
+++ b/tests/test_logs.py
@@ -0,0 +1,48 @@
+import os
+from pathlib import Path
+import sys
+
+# Add the project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+from src.logs_handler import CustomLogsHandler
+
+def test_logs_creation():
+ # Print current working directory
+ print(f"Current working directory: {os.getcwd()}")
+
+ # Print project root
+ print(f"Project root: {project_root}")
+
+ # Try to create logs directory directly
+ logs_dir = project_root / "logs"
+ print(f"Attempting to create logs directory at: {logs_dir}")
+
+ try:
+ # Create directory with full permissions
+ os.makedirs(logs_dir, mode=0o777, exist_ok=True)
+ print(f"ā Created directory: {logs_dir}")
+
+ # Test file creation
+ test_file = logs_dir / "test.txt"
+ with open(test_file, 'w') as f:
+ f.write("Test log entry")
+ print(f"ā Created test file: {test_file}")
+
+ # Initialize the handler
+ handler = CustomLogsHandler()
+ print("ā CustomLogsHandler initialized")
+
+ # Test JSON logging
+ handler.logs.append({"test": "message"})
+ print("ā Added test log entry")
+
+ except Exception as e:
+ print(f"ā Error: {str(e)}")
+ print(f"Error type: {type(e)}")
+ import traceback
+ print(f"Traceback: {traceback.format_exc()}")
+
+if __name__ == "__main__":
+ test_logs_creation()
\ No newline at end of file
diff --git a/tests/test_researcher.py b/tests/test_researcher.py
new file mode 100644
index 000000000..f1d86d294
--- /dev/null
+++ b/tests/test_researcher.py
@@ -0,0 +1,63 @@
+import pytest
+import asyncio
+from pathlib import Path
+import sys
+import logging
+
+# Add the project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+# Configure basic logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+@pytest.mark.asyncio
+async def test_researcher():
+ try:
+ # Import here to catch any import errors
+ from src.researcher import Researcher
+ logger.info("Successfully imported Researcher class")
+
+ # Create a researcher instance
+ researcher = Researcher(
+ query="What is the current state of quantum computing?",
+ report_type="research_report"
+ )
+ logger.info("Created Researcher instance")
+
+ # Run the research
+ report = await researcher.research()
+ logger.info("Research completed successfully!")
+ logger.info(f"Report length: {len(report)}")
+
+ # Basic assertions
+ assert report is not None
+ assert len(report) > 0
+
+ # Check if logs were created
+ logs_dir = Path(project_root) / "logs"
+ log_files = list(logs_dir.glob("research_*.log"))
+ json_files = list(logs_dir.glob("research_*.json"))
+
+ assert len(log_files) > 0, "No log files were created"
+ assert len(json_files) > 0, "No JSON files were created"
+
+ logger.info(f"\nFound {len(log_files)} log files:")
+ for log_file in log_files:
+ logger.info(f"- {log_file.name}")
+
+ logger.info(f"\nFound {len(json_files)} JSON files:")
+ for json_file in json_files:
+ logger.info(f"- {json_file.name}")
+
+ except ImportError as e:
+ logger.error(f"Import error: {e}")
+ logger.error("Make sure gpt_researcher is installed and in your PYTHONPATH")
+ raise
+ except Exception as e:
+ logger.error(f"Error during research: {e}")
+ raise
+
+if __name__ == "__main__":
+ pytest.main([__file__])
\ No newline at end of file
From 0fc99671e30d0fdc642c634c3f545889abbc6806 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:01:20 -0800
Subject: [PATCH 02/11] Update gitignore and test files
---
.gitignore | 18 +----
tests/gptr-logs-handler.py.orig | 52 ++++++++++++++
tests/report-types.py.orig | 63 ++++++++++++++++
tests/research_test.py.orig | 123 ++++++++++++++++++++++++++++++++
4 files changed, 239 insertions(+), 17 deletions(-)
create mode 100644 tests/gptr-logs-handler.py.orig
create mode 100644 tests/report-types.py.orig
create mode 100644 tests/research_test.py.orig
diff --git a/.gitignore b/.gitignore
index 428cdd82d..c99a4ea62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,20 +43,4 @@ package-lock.json
*.swp
# Log files
-logs/
-*.log
-*_log.txt
-heroku_logs.txt
-memory_profiling_log.txt
-server_log.txt
-
-# Add to existing .gitignore
-*.json
-*.pdf
-*.md
-!README.md
-!CONTRIBUTING.md
-!CODE_OF_CONDUCT.md
-
-# Backup directories
-data_backup/
+logs/
\ No newline at end of file
diff --git a/tests/gptr-logs-handler.py.orig b/tests/gptr-logs-handler.py.orig
new file mode 100644
index 000000000..fb05694ce
--- /dev/null
+++ b/tests/gptr-logs-handler.py.orig
@@ -0,0 +1,52 @@
+import logging
+from typing import List, Dict, Any
+import asyncio
+from gpt_researcher import GPTResearcher
+
+class CustomLogsHandler:
+ """A custom Logs handler class to handle JSON data."""
+ def __init__(self):
+ self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
+ logging.basicConfig(level=logging.INFO) # Set up logging configuration
+
+ async def send_json(self, data: Dict[str, Any]) -> None:
+ """Send JSON data and log it, with error handling."""
+ try:
+ self.logs.append(data) # Append data to logs
+ logging.info(f"My custom Log: {data}") # Use logging instead of print
+ except Exception as e:
+ logging.error(f"Error logging data: {e}") # Log any errors
+
+ def clear_logs(self) -> None:
+ """Clear the logs."""
+ self.logs.clear() # Clear the logs list
+ logging.info("Logs cleared.") # Log the clearing action
+
+async def run() -> None:
+ """Run the research process and generate a report."""
+ query = "What happened in the latest burning man floods?"
+ report_type = "research_report"
+ report_source = "online"
+ tone = "informative"
+ config_path = None
+
+ custom_logs_handler = CustomLogsHandler()
+
+ researcher = GPTResearcher(
+ query=query,
+ report_type=report_type,
+ report_source=report_source,
+ tone=tone,
+ config_path=config_path,
+ websocket=custom_logs_handler
+ )
+
+ await researcher.conduct_research() # Conduct the research
+ report = await researcher.write_report() # Write the research report
+ logging.info("Report generated successfully.") # Log report generation
+
+ return report
+
+# Run the asynchronous function using asyncio
+if __name__ == "__main__":
+ asyncio.run(run())
diff --git a/tests/report-types.py.orig b/tests/report-types.py.orig
new file mode 100644
index 000000000..073f8336e
--- /dev/null
+++ b/tests/report-types.py.orig
@@ -0,0 +1,63 @@
+import os
+import asyncio
+import pytest
+from gpt_researcher.agent import GPTResearcher
+import logging
+from typing import List, Dict, Any
+
+class CustomLogsHandler:
+ """A custom Logs handler class to handle JSON data."""
+ def __init__(self):
+ self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
+ logging.basicConfig(level=logging.INFO) # Set up logging configuration
+
+ async def send_json(self, data: Dict[str, Any]) -> None:
+ """Send JSON data and log it, with error handling."""
+ try:
+ self.logs.append(data) # Append data to logs
+ logging.info(f"My custom Log: {data}") # Use logging instead of print
+ except Exception as e:
+ logging.error(f"Error logging data: {e}") # Log any errors
+
+# Define the report types to test
+report_types = [
+ "research_report",
+ "subtopic_report"
+]
+
+# Define a common query and sources for testing
+query = "What are the latest advancements in AI?"
+# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"]
+
+# Define the output directory
+output_dir = "./outputs"
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("report_type", report_types)
+async def test_gpt_researcher(report_type):
+ # Ensure the output directory exists
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ custom_logs_handler = CustomLogsHandler()
+ # Create an instance of GPTResearcher
+ researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler)
+
+ # Conduct research and write the report
+ await researcher.conduct_research()
+ report = await researcher.write_report()
+
+ # Define the expected output filenames
+ pdf_filename = os.path.join(output_dir, f"{report_type}.pdf")
+ docx_filename = os.path.join(output_dir, f"{report_type}.docx")
+
+ # Check if the PDF and DOCX files are created
+ # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}"
+ # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}"
+
+ # Clean up the generated files (optional)
+ # os.remove(pdf_filename)
+ # os.remove(docx_filename)
+
+if __name__ == "__main__":
+ pytest.main()
\ No newline at end of file
diff --git a/tests/research_test.py.orig b/tests/research_test.py.orig
new file mode 100644
index 000000000..b58d5b92a
--- /dev/null
+++ b/tests/research_test.py.orig
@@ -0,0 +1,123 @@
+"""
+Hi! The following test cases are for the new parameter `complement_source_urls` and the fix for the functional error with `source_urls` in the GPTResearcher class.
+
+The source_urls parameter was resetting each time in the conduct_research function, causing GPTR to forget the given links. Now that has been fixed and a new parameter is introduced.
+This parameter, named `complement_source_urls`, allows GPTR to research sources other than those provided via source_urls when set to True.
+Default is False, i.e., no additional research will be conducted on newer sources.
+"""
+
+## Notes:
+## Please uncomment the test case to run and comment the rest.
+## Thanks!
+
+
+
+#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research)
+
+from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
+import asyncio
+import logging
+from typing import List, Dict, Any
+
+class CustomLogsHandler:
+ """A custom Logs handler class to handle JSON data."""
+ def __init__(self):
+ self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
+ logging.basicConfig(level=logging.INFO) # Set up logging configuration
+
+ async def send_json(self, data: Dict[str, Any]) -> None:
+ """Send JSON data and log it, with error handling."""
+ try:
+ self.logs.append(data) # Append data to logs
+ logging.info(f"My custom Log: {data}") # Use logging instead of print
+ except Exception as e:
+ logging.error(f"Error logging data: {e}") # Log any errors
+
+async def get_report(query: str, report_type: str, sources: list) -> str:
+ custom_logs_handler = CustomLogsHandler()
+ researcher = GPTResearcher(query=query,
+ report_type=report_type,
+ complement_source_urls=False,
+ websocket=custom_logs_handler)
+ await researcher.conduct_research()
+ report = await researcher.write_report()
+ return report, researcher
+
+if __name__ == "__main__":
+ query = "Write an analysis on paul graham"
+ report_type = "research_report"
+ sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related
+
+ report, researcher = asyncio.run(get_report(query, report_type, sources))
+ print(report)
+
+ print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present
+
+
+
+#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. Hence, no relevant context)
+
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
+# import asyncio
+
+# async def get_report(query: str, report_type: str, sources: list) -> str:
+# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources)
+# await researcher.conduct_research()
+# report = await researcher.write_report()
+# return report, researcher
+
+# if __name__ == "__main__":
+# query = "What is Microsoft's business model?"
+# report_type = "research_report"
+# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED.
+
+# report, researcher = asyncio.run(get_report(query, report_type, sources))
+# print(report)
+
+# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present
+
+
+
+#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls)
+
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
+# import asyncio
+
+# async def get_report(query: str, report_type: str, sources: list) -> str:
+# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
+# await researcher.conduct_research()
+# report = await researcher.write_report()
+# return report, researcher
+
+# if __name__ == "__main__":
+# query = "What is Microsoft's business model?"
+# report_type = "research_report"
+# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED
+
+# report, researcher = asyncio.run(get_report(query, report_type, sources))
+# print(report)
+
+# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the complement_source_urls is set which should make gptr do default web search to gather contexts
+
+
+
+# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set allowing for a larger research scope)
+
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
+# import asyncio
+
+# async def get_report(query: str, report_type: str, sources: list) -> str:
+# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
+# await researcher.conduct_research()
+# report = await researcher.write_report()
+# return report, researcher
+
+# if __name__ == "__main__":
+# query = "What are the latest advancements in AI?"
+# report_type = "research_report"
+# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related
+
+# report, researcher = asyncio.run(get_report(query, report_type, sources))
+# print(report)
+
+# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the complement_source_urls is set which should make gptr do default web search to gather more contexts!
From 7e11b9cbb8ed9d8bf09907f7b7d2f36a8d253991 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:05:23 -0800
Subject: [PATCH 03/11] added .orig files to gitignore
---
.gitignore | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index c99a4ea62..df3ad0a8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,4 +43,5 @@ package-lock.json
*.swp
# Log files
-logs/
\ No newline at end of file
+logs/
+*.orig
\ No newline at end of file
From eb26cc3ea7a040f2df91b18c735627b523ace751 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:19:44 -0800
Subject: [PATCH 04/11] removed src directory
---
src/logs_handler.py | 90 -------------------------------------------
src/researcher.py | 94 ---------------------------------------------
2 files changed, 184 deletions(-)
delete mode 100644 src/logs_handler.py
delete mode 100644 src/researcher.py
diff --git a/src/logs_handler.py b/src/logs_handler.py
deleted file mode 100644
index 353dd7484..000000000
--- a/src/logs_handler.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import logging
-import json
-import os
-from datetime import datetime
-from pathlib import Path
-from typing import List, Dict, Any
-
-class CustomLogsHandler:
- """A unified custom logs handler for GPT Researcher."""
-
- def __init__(self, websocket=None, query=None):
- self.websocket = websocket
- self.query = query
- self.logs: List[Dict[str, Any]] = []
-
- # Set up logging configuration
- logging.basicConfig(level=logging.INFO)
- self.logger = logging.getLogger(__name__)
-
- # Initialize log file if query is provided
- if query:
- self.log_file = self._create_log_file()
-
- def _create_log_file(self):
- """Create log file with proper directory structure."""
- # Use the project root directory
- project_root = Path(__file__).parent.parent
- logs_dir = project_root / "logs"
-
- # Create logs directory
- os.makedirs(logs_dir, exist_ok=True)
-
- # Create timestamped log file
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
- log_file = logs_dir / f"research_{timestamp}.json"
-
- # Initialize log file with empty structure
- initial_data = {
- "events": [],
- "content": {
- "query": self.query,
- "sources": [],
- "report": ""
- }
- }
-
- with open(log_file, 'w') as f:
- json.dump(initial_data, f, indent=2)
-
- return log_file
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- # Append data to logs
- self.logs.append(data)
-
- # Log using logging
- self.logger.info(f"Log: {data}")
-
- # Send to websocket if available
- if self.websocket:
- await self.websocket.send_json(data)
-
- # Write to log file if available
- if hasattr(self, 'log_file'):
- self._append_to_log_file(data)
-
- except Exception as e:
- self.logger.error(f"Error logging data: {e}")
-
- def _append_to_log_file(self, data: Dict[str, Any]) -> None:
- """Append data to the JSON log file."""
- try:
- with open(self.log_file, 'r+') as f:
- log_data = json.load(f)
- log_data["events"].append({
- "timestamp": datetime.now().isoformat(),
- "data": data
- })
- f.seek(0)
- json.dump(log_data, f, indent=2)
- f.truncate()
- except Exception as e:
- self.logger.error(f"Error writing to log file: {e}")
-
- def clear_logs(self) -> None:
- """Clear the logs."""
- self.logs.clear()
- self.logger.info("Logs cleared.")
\ No newline at end of file
diff --git a/src/researcher.py b/src/researcher.py
deleted file mode 100644
index 97638341f..000000000
--- a/src/researcher.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from typing import Dict, Any
-import json
-from datetime import datetime
-from pathlib import Path
-import logging
-import sys
-from .logs_handler import CustomLogsHandler
-from gpt_researcher.agent import GPTResearcher
-from backend.server.logging_config import get_research_logger
-
-# Configure logging to output to both file and console
-logging.basicConfig(
- level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- handlers=[
- logging.FileHandler('researcher_debug.log'),
- logging.StreamHandler(sys.stdout)
- ]
-)
-
-logger = logging.getLogger(__name__)
-
-class ResearchLogHandler:
- """Custom handler to capture GPTResearcher logs"""
- def __init__(self, research_logger):
- self.logger = research_logger
-
- async def on_tool_start(self, tool_name: str, **kwargs):
- self.logger.info(f"Starting tool: {tool_name}")
- self.logger.info(f"Tool parameters: {kwargs}")
-
- async def on_tool_end(self, tool_name: str, result: Any):
- self.logger.info(f"Completed tool: {tool_name}")
- self.logger.info(f"Tool result: {result}")
-
- async def on_agent_action(self, action: str, **kwargs):
- self.logger.info(f"Agent action: {action}")
- self.logger.info(f"Action details: {kwargs}")
-
- async def on_research_step(self, step: str, details: Any):
- self.logger.info(f"Research step: {step}")
- self.logger.info(f"Step details: {details}")
-
-class Researcher:
- def __init__(self, query: str, report_type: str = "research_report"):
- self.research_logger = get_research_logger()
- self.query = query
- self.report_type = report_type
-
- # Initialize our custom logs handler
- self.logs_handler = CustomLogsHandler()
- self.research_logger.info(f"Initialized Researcher with query: {query}")
-
- try:
- # Initialize research log handler
- self.research_log_handler = ResearchLogHandler(self.research_logger)
-
- # Initialize GPTResearcher with both handlers
- self.researcher = GPTResearcher(
- query=query,
- report_type=report_type,
- websocket=self.logs_handler,
- log_handler=self.research_log_handler # Add research log handler
- )
- self.research_logger.info("Successfully initialized GPTResearcher")
- except Exception as e:
- self.research_logger.error(f"Error initializing GPTResearcher: {e}", exc_info=True)
- raise
-
- async def research(self) -> str:
- """Conduct research and return the report"""
- try:
- self.research_logger.info(f"Starting research process for query: {self.query}")
- self.research_logger.info(f"Report type: {self.report_type}")
-
- self.research_logger.info("Beginning research phase")
- await self.researcher.conduct_research()
- self.research_logger.info("Research phase completed")
-
- self.research_logger.info("Starting report generation")
- report = await self.researcher.write_report()
- self.research_logger.info("Report generation completed")
-
- # Log report summary
- report_preview = report[:500] + "..." if len(report) > 500 else report
- self.research_logger.info(f"Report preview: {report_preview}")
-
- return report
-
- except Exception as e:
- self.research_logger.error(f"Error during research: {e}", exc_info=True)
- raise
-
-# ... rest of the code ...
\ No newline at end of file
From 11eb8800c373117c51758704fdafc79b0f33a596 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:33:20 -0800
Subject: [PATCH 05/11] Remove .orig files from Git tracking
---
tests/gptr-logs-handler.py.orig | 52 --------------
tests/research_test.py.orig | 123 --------------------------------
2 files changed, 175 deletions(-)
delete mode 100644 tests/gptr-logs-handler.py.orig
delete mode 100644 tests/research_test.py.orig
diff --git a/tests/gptr-logs-handler.py.orig b/tests/gptr-logs-handler.py.orig
deleted file mode 100644
index fb05694ce..000000000
--- a/tests/gptr-logs-handler.py.orig
+++ /dev/null
@@ -1,52 +0,0 @@
-import logging
-from typing import List, Dict, Any
-import asyncio
-from gpt_researcher import GPTResearcher
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
- def clear_logs(self) -> None:
- """Clear the logs."""
- self.logs.clear() # Clear the logs list
- logging.info("Logs cleared.") # Log the clearing action
-
-async def run() -> None:
- """Run the research process and generate a report."""
- query = "What happened in the latest burning man floods?"
- report_type = "research_report"
- report_source = "online"
- tone = "informative"
- config_path = None
-
- custom_logs_handler = CustomLogsHandler()
-
- researcher = GPTResearcher(
- query=query,
- report_type=report_type,
- report_source=report_source,
- tone=tone,
- config_path=config_path,
- websocket=custom_logs_handler
- )
-
- await researcher.conduct_research() # Conduct the research
- report = await researcher.write_report() # Write the research report
- logging.info("Report generated successfully.") # Log report generation
-
- return report
-
-# Run the asynchronous function using asyncio
-if __name__ == "__main__":
- asyncio.run(run())
diff --git a/tests/research_test.py.orig b/tests/research_test.py.orig
deleted file mode 100644
index b58d5b92a..000000000
--- a/tests/research_test.py.orig
+++ /dev/null
@@ -1,123 +0,0 @@
-"""
-Hi! The following test cases cover the new parameter `complement_source_urls` and the fix for the functional error with `source_urls` in the GPTResearcher class.
-
-The source_urls parameter was being reset on every call to conduct_research, causing GPTR to forget the given links. That has been fixed, and a new parameter has been introduced.
-When set to True, the `complement_source_urls` parameter allows GPTR to research sources beyond those provided via source_urls.
-The default is False, i.e., no additional research is conducted on newer sources.
-"""
-
-## Notes:
-## Please uncomment the test case you want to run and comment out the rest.
-## Thanks!
-
-
-
-#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research)
-
-from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
-import asyncio
-import logging
-from typing import List, Dict, Any
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
-async def get_report(query: str, report_type: str, sources: list) -> str:
- custom_logs_handler = CustomLogsHandler()
- researcher = GPTResearcher(query=query,
- report_type=report_type,
- complement_source_urls=False,
- websocket=custom_logs_handler)
- await researcher.conduct_research()
- report = await researcher.write_report()
- return report, researcher
-
-if __name__ == "__main__":
- query = "Write an analysis on paul graham"
- report_type = "research_report"
- sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related
-
- report, researcher = asyncio.run(get_report(query, report_type, sources))
- print(report)
-
- print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present
-
-
-
-#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. Hence, no relevant context)
-
-# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
-# import asyncio
-
-# async def get_report(query: str, report_type: str, sources: list) -> str:
-# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources)
-# await researcher.conduct_research()
-# report = await researcher.write_report()
-# return report, researcher
-
-# if __name__ == "__main__":
-# query = "What is Microsoft's business model?"
-# report_type = "research_report"
-# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED.
-
-# report, researcher = asyncio.run(get_report(query, report_type, sources))
-# print(report)
-
-# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present
-
-
-
-#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls)
-
-# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
-# import asyncio
-
-# async def get_report(query: str, report_type: str, sources: list) -> str:
-# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
-# await researcher.conduct_research()
-# report = await researcher.write_report()
-# return report, researcher
-
-# if __name__ == "__main__":
-# query = "What is Microsoft's business model?"
-# report_type = "research_report"
-# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED
-
-# report, researcher = asyncio.run(get_report(query, report_type, sources))
-# print(report)
-
-# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the complement_source_urls is set which should make gptr do default web search to gather contexts
-
-
-
-# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set allowing for a larger research scope)
-
-# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
-# import asyncio
-
-# async def get_report(query: str, report_type: str, sources: list) -> str:
-# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
-# await researcher.conduct_research()
-# report = await researcher.write_report()
-# return report, researcher
-
-# if __name__ == "__main__":
-# query = "What are the latest advancements in AI?"
-# report_type = "research_report"
-# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related
-
-# report, researcher = asyncio.run(get_report(query, report_type, sources))
-# print(report)
-
-# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the complement_source_urls is set which should make gptr do default web search to gather more contexts!
From add4bffac288b0626632f4cac0f535f23f89cfe6 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:41:35 -0800
Subject: [PATCH 06/11] Remove report-types.py.orig from Git tracking
---
tests/report-types.py.orig | 63 --------------------------------------
1 file changed, 63 deletions(-)
delete mode 100644 tests/report-types.py.orig
diff --git a/tests/report-types.py.orig b/tests/report-types.py.orig
deleted file mode 100644
index 073f8336e..000000000
--- a/tests/report-types.py.orig
+++ /dev/null
@@ -1,63 +0,0 @@
-import os
-import asyncio
-import pytest
-from gpt_researcher.agent import GPTResearcher
-import logging
-from typing import List, Dict, Any
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
-# Define the report types to test
-report_types = [
- "research_report",
- "subtopic_report"
-]
-
-# Define a common query and sources for testing
-query = "What are the latest advancements in AI?"
-# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"]
-
-# Define the output directory
-output_dir = "./outputs"
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("report_type", report_types)
-async def test_gpt_researcher(report_type):
- # Ensure the output directory exists
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
-
- custom_logs_handler = CustomLogsHandler()
- # Create an instance of GPTResearcher
- researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler)
-
- # Conduct research and write the report
- await researcher.conduct_research()
- report = await researcher.write_report()
-
- # Define the expected output filenames
- pdf_filename = os.path.join(output_dir, f"{report_type}.pdf")
- docx_filename = os.path.join(output_dir, f"{report_type}.docx")
-
- # Check if the PDF and DOCX files are created
- # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}"
- # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}"
-
- # Clean up the generated files (optional)
- # os.remove(pdf_filename)
- # os.remove(docx_filename)
-
-if __name__ == "__main__":
- pytest.main()
\ No newline at end of file
From e525e94fc624d39231b6069d613e74d2d9cb6df0 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:07:10 -0800
Subject: [PATCH 07/11] changed name of test_researcher to avoid confusion with
prior test of similar name
---
...esearcher.py => test_researcher_logging.py} | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
rename tests/{test_researcher.py => test_researcher_logging.py} (74%)
diff --git a/tests/test_researcher.py b/tests/test_researcher_logging.py
similarity index 74%
rename from tests/test_researcher.py
rename to tests/test_researcher_logging.py
index f1d86d294..ebf6e7a94 100644
--- a/tests/test_researcher.py
+++ b/tests/test_researcher_logging.py
@@ -13,15 +13,19 @@
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
-async def test_researcher():
+async def test_researcher_logging(): # Renamed function to be more specific
+ """
+ Test suite for verifying the researcher's logging infrastructure.
+ Ensures proper creation and formatting of log files.
+ """
try:
# Import here to catch any import errors
from src.researcher import Researcher
logger.info("Successfully imported Researcher class")
- # Create a researcher instance
+ # Create a researcher instance with a logging-focused query
researcher = Researcher(
- query="What is the current state of quantum computing?",
+ query="Test query for logging verification",
report_type="research_report"
)
logger.info("Created Researcher instance")
@@ -31,25 +35,29 @@ async def test_researcher():
logger.info("Research completed successfully!")
logger.info(f"Report length: {len(report)}")
- # Basic assertions
+ # Basic report assertions
assert report is not None
assert len(report) > 0
- # Check if logs were created
+ # Detailed log file verification
logs_dir = Path(project_root) / "logs"
log_files = list(logs_dir.glob("research_*.log"))
json_files = list(logs_dir.glob("research_*.json"))
+ # Verify log files exist
assert len(log_files) > 0, "No log files were created"
assert len(json_files) > 0, "No JSON files were created"
+ # Log the findings
logger.info(f"\nFound {len(log_files)} log files:")
for log_file in log_files:
logger.info(f"- {log_file.name}")
+ # Could add additional checks for log file format/content here
logger.info(f"\nFound {len(json_files)} JSON files:")
for json_file in json_files:
logger.info(f"- {json_file.name}")
+ # Could add additional checks for JSON file structure here
except ImportError as e:
logger.error(f"Import error: {e}")
From aa371cada5eae8329354e932c836fb360d603377 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Thu, 19 Dec 2024 15:44:27 -0800
Subject: [PATCH 08/11] removed memory profiler from server
This was cruft from a different branch I was working on. My bad.
---
backend/server/server.py | 16 ----------------
1 file changed, 16 deletions(-)
diff --git a/backend/server/server.py b/backend/server/server.py
index 9dcb1b968..fe12dfbaf 100644
--- a/backend/server/server.py
+++ b/backend/server/server.py
@@ -88,9 +88,6 @@ class ConfigRequest(BaseModel):
# Startup event
-from psutil import Process
-import logging
-
@app.on_event("startup")
def startup_event():
os.makedirs("outputs", exist_ok=True)
@@ -102,11 +99,6 @@ def startup_event():
research_logger.json_handler = json_handler # Store the JSON handler on the logger
research_logger.info(f"Research log file: {log_file}")
research_logger.info(f"Research JSON file: {json_file}")
-
- # Log memory usage
- process = Process()
- mem_info = process.memory_info()
- research_logger.info(f"Memory usage at startup: {mem_info.rss / 1024 ** 2:.2f} MB")
# Routes
@@ -123,10 +115,7 @@ async def list_files():
return {"files": files}
-from memory_profiler import profile
-
@app.post("/api/multi_agents")
-@profile
async def run_multi_agents():
return await execute_multi_agents(manager)
@@ -141,15 +130,10 @@ async def delete_file(filename: str):
return await handle_file_deletion(filename, DOC_PATH)
-from psutil import Process
-
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
- process = Process()
await manager.connect(websocket)
try:
- mem_info = process.memory_info()
- print(f"Memory usage during WebSocket connection: {mem_info.rss / 1024 ** 2:.2f} MB")
await handle_websocket_communication(websocket, manager)
except WebSocketDisconnect:
await manager.disconnect(websocket)
From 70de3c4e493c3b922264e2bbcd9258419c298563 Mon Sep 17 00:00:00 2001
From: Kelly Abbott <74297+kga245@users.noreply.github.com>
Date: Thu, 19 Dec 2024 16:26:57 -0800
Subject: [PATCH 09/11] Truncate file names
I noticed that a very long query can generate file names longer than the host permits. So I added truncation that should work fairly universally, keeping the file name under 250 characters total while preserving the important prefix naming convention.
---
backend/server/server_utils.py | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py
index 1c472be6f..26a7ac90e 100644
--- a/backend/server/server_utils.py
+++ b/backend/server/server_utils.py
@@ -99,7 +99,20 @@ async def research(self) -> dict:
}
def sanitize_filename(filename: str) -> str:
- return re.sub(r"[^\w\s-]", "", filename).strip()
+ # Split into components
+ prefix, timestamp, *task_parts = filename.split('_')
+ task = '_'.join(task_parts)
+
+ # Calculate max length for task portion
+ # 255 - len("outputs/") - len("task_") - len(timestamp) - len("_.json") - safety_margin
+ max_task_length = 255 - 8 - 5 - 10 - 6 - 10 # ~216 chars for task
+
+ # Truncate task if needed
+ truncated_task = task[:max_task_length] if len(task) > max_task_length else task
+
+ # Reassemble and clean the filename
+ sanitized = f"{prefix}_{timestamp}_{truncated_task}"
+ return re.sub(r"[^\w\s-]", "", sanitized).strip()
async def handle_start_command(websocket, data: str, manager):
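
To illustrate the intent, a hedged usage sketch of the truncation above; the import path comes from the diff, and the sample task name is hypothetical.

from backend.server.server_utils import sanitize_filename

# A filename following the prefix_timestamp_task convention, with an oversized task portion
long_name = "task_1734660000_" + "what are the latest advancements in ai " * 10
safe_name = sanitize_filename(long_name)

# The prefix and timestamp are preserved; the task portion is capped so the
# resulting name stays under common 255-character filesystem limits
assert safe_name.startswith("task_1734660000_")
assert len(safe_name) < 255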
From 5d1dc7e02e89c721c85e62890afbb7c8edeea255 Mon Sep 17 00:00:00 2001
From: ElishaKay
Date: Sat, 21 Dec 2024 21:24:32 +0200
Subject: [PATCH 10/11] added back report source conditions in researcher.py
---
gpt_researcher/skills/researcher.py | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py
index 5235bfd98..89c0de507 100644
--- a/gpt_researcher/skills/researcher.py
+++ b/gpt_researcher/skills/researcher.py
@@ -94,6 +94,34 @@ async def conduct_research(self):
research_data = await self._get_context_by_web_search(self.researcher.query)
# ... rest of the conditions ...
+ elif self.researcher.report_source == ReportSource.Local.value:
+ document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
+ if self.researcher.vector_store:
+ self.researcher.vector_store.load(document_data)
+
+ research_data = await self._get_context_by_web_search(self.researcher.query, document_data)
+
+ # Hybrid search including both local documents and web sources
+ elif self.researcher.report_source == ReportSource.Hybrid.value:
+ document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
+ if self.researcher.vector_store:
+ self.researcher.vector_store.load(document_data)
+ docs_context = await self._get_context_by_web_search(self.researcher.query, document_data)
+ web_context = await self._get_context_by_web_search(self.researcher.query)
+ research_data = f"Context from local documents: {docs_context}\n\nContext from web sources: {web_context}"
+
+ elif self.researcher.report_source == ReportSource.LangChainDocuments.value:
+ langchain_documents_data = await LangChainDocumentLoader(
+ self.researcher.documents
+ ).load()
+ if self.researcher.vector_store:
+ self.researcher.vector_store.load(langchain_documents_data)
+ research_data = await self._get_context_by_web_search(
+ self.researcher.query, langchain_documents_data
+ )
+
+ elif self.researcher.report_source == ReportSource.LangChainVectorStore.value:
+ research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter)
# Rank and curate the sources
self.researcher.context = research_data
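
For context, a hedged sketch of how these report_source branches are typically reached from the caller's side. The "hybrid" value is an assumption based on the ReportSource.Hybrid enum referenced above, and the local-document behaviour depends on the researcher's doc_path configuration.

from gpt_researcher.agent import GPTResearcher
import asyncio

async def run_hybrid_research(query: str) -> str:
    # report_source="hybrid" is expected to combine local documents (cfg.doc_path)
    # with a default web search, per the Hybrid branch added above
    researcher = GPTResearcher(
        query=query,
        report_type="research_report",
        report_source="hybrid",
    )
    await researcher.conduct_research()
    return await researcher.write_report()

if __name__ == "__main__":
    print(asyncio.run(run_hybrid_research("What are the latest advancements in AI?")))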
From e1535bf71ec1026d78067a99cf58563f1302c83a Mon Sep 17 00:00:00 2001
From: ElishaKay
Date: Sat, 21 Dec 2024 21:57:39 +0200
Subject: [PATCH 11/11] hide download logs button for multi_agents report
---
frontend/nextjs/app/page.tsx | 1 +
.../nextjs/components/ResearchBlocks/AccessReport.tsx | 10 ++++++----
frontend/nextjs/components/ResearchResults.tsx | 4 +++-
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/frontend/nextjs/app/page.tsx b/frontend/nextjs/app/page.tsx
index e5bd5fd5e..2cc301af9 100644
--- a/frontend/nextjs/app/page.tsx
+++ b/frontend/nextjs/app/page.tsx
@@ -257,6 +257,7 @@ export default function Home() {
orderedData={orderedData}
answer={answer}
allLogs={allLogs}
+ chatBoxSettings={chatBoxSettings}
handleClickSuggestion={handleClickSuggestion}
/>
diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
index 21996cce3..f35a3a159 100644
--- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
+++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
@@ -6,11 +6,13 @@ interface AccessReportProps {
pdf?: string;
docx?: string;
json?: string;
- };
+ };
+ chatBoxSettings: any;
+ logs?: any[];
report: string;
}
-const AccessReport: React.FC<AccessReportProps> = ({ accessData, report }) => {
+const AccessReport: React.FC<AccessReportProps> = ({ accessData, chatBoxSettings, report }) => {
const host = getHost();
const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => {
@@ -39,13 +41,13 @@ const AccessReport: React.FC = ({ accessData, report }) => {
rel="noopener noreferrer">
Download DocX
-
Download Logs
-
+ }
);
};
diff --git a/frontend/nextjs/components/ResearchResults.tsx b/frontend/nextjs/components/ResearchResults.tsx
index 218ad661b..7cc1c18b2 100644
--- a/frontend/nextjs/components/ResearchResults.tsx
+++ b/frontend/nextjs/components/ResearchResults.tsx
@@ -13,6 +13,7 @@ interface ResearchResultsProps {
orderedData: Data[];
answer: string;
allLogs: any[];
+ chatBoxSettings: any;
handleClickSuggestion: (value: string) => void;
}
@@ -20,6 +21,7 @@ export const ResearchResults: React.FC = ({
orderedData,
answer,
allLogs,
+ chatBoxSettings,
handleClickSuggestion
}) => {
const groupedData = preprocessOrderedData(orderedData);
@@ -72,7 +74,7 @@ export const ResearchResults: React.FC = ({
{sourceComponents}
{imageComponents}
{finalReport && }
- {pathData && }
+ {pathData && }
{chatComponents}
>
);