diff --git a/.gitignore b/.gitignore
index 203892d2a..df3ad0a8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,4 +40,8 @@ docs/build
package-lock.json
#Vim swp files
-*.swp
\ No newline at end of file
+*.swp
+
+# Log files and patch/merge backup files
+logs/
+*.orig
\ No newline at end of file
diff --git a/backend/server/app.py b/backend/server/app.py
new file mode 100644
index 000000000..ee886367b
--- /dev/null
+++ b/backend/server/app.py
@@ -0,0 +1,16 @@
+"""FastAPI application instance with CORS configured for the frontend."""
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+# Allowed origins come from CORS_ALLOW_ORIGINS as a comma-separated list, e.g.
+#   CORS_ALLOW_ORIGINS="https://app.example.com,http://localhost:3000"
+# When the variable is unset or empty, every origin is allowed (development default).
+_cors_origins = [
+    origin.strip()
+    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
+    if origin.strip()
+] or ["*"]
+
+# The CORS protocol does not allow a wildcard origin on credentialed requests,
+# so cookies/authorization headers are only enabled for an explicit origin list.
+_allow_credentials = "*" not in _cors_origins
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=_cors_origins,
+    allow_credentials=_allow_credentials,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
\ No newline at end of file
diff --git a/backend/server/logging_config.py b/backend/server/logging_config.py
new file mode 100644
index 000000000..ad88044d2
--- /dev/null
+++ b/backend/server/logging_config.py
@@ -0,0 +1,83 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+class JSONResearchHandler:
+    """Keep a research record in memory and mirror it to a JSON file.
+
+    The record has three top-level keys: ``timestamp`` (creation time),
+    ``events`` (an append-only list) and ``content`` (query, sources,
+    context, report and costs). The whole file is rewritten after every
+    change.
+    """
+
+    def __init__(self, json_file):
+        """Build the empty record; nothing is written until the first update.
+
+        Args:
+            json_file: Path of the JSON file to write (anything ``open`` accepts).
+        """
+        self.json_file = json_file
+        self.research_data = {
+            "timestamp": datetime.now().isoformat(),
+            "events": [],
+            "content": {
+                "query": "",
+                "sources": [],
+                "context": [],
+                "report": "",
+                "costs": 0.0,
+            },
+        }
+
+    def log_event(self, event_type: str, data: dict):
+        """Append a timestamped event of ``event_type`` carrying ``data`` and save."""
+        self.research_data["events"].append({
+            "timestamp": datetime.now().isoformat(),
+            "type": event_type,
+            "data": data,
+        })
+        self._save_json()
+
+    def update_content(self, key: str, value):
+        """Set ``content[key]`` to ``value`` and save."""
+        self.research_data["content"][key] = value
+        self._save_json()
+
+    def _save_json(self):
+        """Rewrite the JSON file from the in-memory record.
+
+        ``default=str`` stringifies values the json module cannot encode
+        (for example ``datetime`` or ``Path`` objects) instead of raising
+        ``TypeError`` from inside a logging call. The file is always written
+        as UTF-8 so the result does not depend on the platform locale.
+        """
+        with open(self.json_file, "w", encoding="utf-8") as f:
+            json.dump(self.research_data, f, indent=2, default=str)
+
+def setup_research_logging():
+    """Configure the ``research`` logger and create a JSON handler for it.
+
+    Creates a ``logs`` directory in the current working directory if needed,
+    then gives the ``research`` logger a timestamped file handler and a
+    console handler. Propagation to the root logger is switched off so
+    records are not printed twice.
+
+    Returns:
+        tuple: ``(log_file, json_file, research_logger, json_handler)``;
+        the two paths are returned as strings.
+    """
+    logs_dir = Path("logs")
+    logs_dir.mkdir(exist_ok=True)
+
+    # One timestamp so the .log and .json files of a run share a name
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_file = logs_dir / f"research_{timestamp}.log"
+    json_file = logs_dir / f"research_{timestamp}.json"
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # UTF-8 so non-ASCII log text does not depend on the platform default encoding
+    file_handler = logging.FileHandler(log_file, encoding="utf-8")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(formatter)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+
+    research_logger = logging.getLogger('research')
+    research_logger.setLevel(logging.INFO)
+
+    # Close handlers left over from an earlier call before dropping them;
+    # clearing the list alone would leave the previous log file open.
+    for old_handler in list(research_logger.handlers):
+        research_logger.removeHandler(old_handler)
+        old_handler.close()
+
+    research_logger.addHandler(file_handler)
+    research_logger.addHandler(console_handler)
+
+    # Prevent propagation to root logger to avoid duplicate logs
+    research_logger.propagate = False
+
+    json_handler = JSONResearchHandler(json_file)
+    # get_json_handler() reads this attribute from the logger, so attach it
+    # here rather than relying on every caller to do it.
+    research_logger.json_handler = json_handler
+
+    return str(log_file), str(json_file), research_logger, json_handler
+
+# Accessors for the shared 'research' logger and the JSON handler stored on it
+def get_research_logger():
+    """Return the logger registered under the name ``research``."""
+    research_logger = logging.getLogger('research')
+    return research_logger
+
+def get_json_handler():
+    """Return the ``json_handler`` attribute of the ``research`` logger.
+
+    Returns ``None`` when no such attribute has been set on the logger.
+    """
+    research_logger = logging.getLogger('research')
+    return getattr(research_logger, 'json_handler', None)
\ No newline at end of file
diff --git a/backend/server/server.py b/backend/server/server.py
index 939a2c419..fe12dfbaf 100644
--- a/backend/server/server.py
+++ b/backend/server/server.py
@@ -15,6 +15,26 @@
execute_multi_agents, handle_websocket_communication
)
+from gpt_researcher.utils.logging_config import setup_research_logging
+
+import logging
+
+# Get logger instance
+logger = logging.getLogger(__name__)
+
+# Don't override parent logger settings
+logger.propagate = True
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s",
+ handlers=[
+ logging.FileHandler("server_log.txt"), # Log to file
+ logging.StreamHandler() # Also print to console
+ ]
+)
+
+
# Models
@@ -73,6 +93,12 @@ def startup_event():
os.makedirs("outputs", exist_ok=True)
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
os.makedirs(DOC_PATH, exist_ok=True)
+
+ # Setup research logging
+ log_file, json_file, research_logger, json_handler = setup_research_logging() # Unpack all 4 values
+ research_logger.json_handler = json_handler # Store the JSON handler on the logger
+ research_logger.info(f"Research log file: {log_file}")
+ research_logger.info(f"Research JSON file: {json_file}")
# Routes
diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py
index 77bc8aba3..26a7ac90e 100644
--- a/backend/server/server_utils.py
+++ b/backend/server/server_utils.py
@@ -4,14 +4,115 @@
import time
import shutil
from typing import Dict, List, Any
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, FileResponse
from gpt_researcher.document.document import DocumentLoader
-# Add this import
from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md
+from pathlib import Path
+from datetime import datetime
+from fastapi import HTTPException
+import logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+class CustomLogsHandler:
+    """Capture streaming research messages, forward them and mirror them to disk.
+
+    Each instance owns one JSON file under ``outputs/`` with the keys
+    ``timestamp``, ``events`` and ``content``. The class exposes the same
+    ``send_json`` coroutine as a websocket, so it can be passed wherever a
+    websocket is expected.
+    """
+
+    def __init__(self, websocket, task: str):
+        """Create the handler and write the initial, empty log document.
+
+        Args:
+            websocket: Object with an awaitable ``send_json`` method, or a
+                falsy value to skip forwarding.
+            task: Task description; it is sanitized and used in the file name.
+        """
+        self.logs = []
+        self.websocket = websocket
+        sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
+        self.log_file = os.path.join("outputs", f"{sanitized_filename}.json")
+        self.timestamp = datetime.now().isoformat()
+        # Initialize log file with metadata
+        os.makedirs("outputs", exist_ok=True)
+        self._write_log({
+            "timestamp": self.timestamp,
+            "events": [],
+            "content": {
+                "query": "",
+                "sources": [],
+                "context": [],
+                "report": "",
+                "costs": 0.0
+            }
+        })
+
+    async def send_json(self, data: Dict[str, Any]) -> None:
+        """Store ``data``, send it to the websocket and persist it to the log file.
+
+        Messages whose ``type`` is ``'logs'`` are appended to ``events``;
+        any other message is merged key by key into ``content``.
+        """
+        # Keep an in-memory copy, as the docstring promises
+        self.logs.append(data)
+
+        # Send to websocket for real-time display
+        if self.websocket:
+            await self.websocket.send_json(data)
+
+        # Read current log file
+        with open(self.log_file, 'r', encoding="utf-8") as f:
+            log_data = json.load(f)
+
+        # Update appropriate section based on data type
+        if data.get('type') == 'logs':
+            log_data['events'].append({
+                "timestamp": datetime.now().isoformat(),
+                "type": "event",
+                "data": data
+            })
+        else:
+            # Update content section for other types of data
+            log_data['content'].update(data)
+
+        # Save updated log file
+        self._write_log(log_data)
+        logger.debug(f"Log entry written to: {self.log_file}")
+
+    def _write_log(self, log_data: Dict[str, Any]) -> None:
+        """Write ``log_data`` to the log file as UTF-8 JSON.
+
+        ``default=str`` stringifies values json cannot encode so that a
+        logging call never raises ``TypeError``.
+        """
+        with open(self.log_file, 'w', encoding="utf-8") as f:
+            json.dump(log_data, f, indent=2, default=str)
+
+
+class Researcher:
+    """Run one research task without a client websocket and collect its output files."""
+
+    def __init__(self, query: str, report_type: str = "research_report"):
+        """Set up the log handler and the underlying GPTResearcher.
+
+        Args:
+            query: The research question.
+            report_type: Report type passed through to GPTResearcher.
+        """
+        # Imported locally: GPTResearcher is not among the imports visible at
+        # the top of this module (NOTE(review): drop if it is imported there).
+        from gpt_researcher import GPTResearcher
+
+        self.query = query
+        self.report_type = report_type
+        # Generate unique ID for this research task.
+        # NOTE: hash() of a str is salted per interpreter process, so the
+        # suffix is not reproducible across runs.
+        self.research_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{hash(query)}"
+        # CustomLogsHandler takes (websocket, task). There is no client socket
+        # here, so pass None (the handler skips forwarding for a falsy
+        # websocket) and use the research ID as the task name.
+        self.logs_handler = CustomLogsHandler(None, self.research_id)
+        self.researcher = GPTResearcher(
+            query=query,
+            report_type=report_type,
+            websocket=self.logs_handler
+        )
+
+    async def research(self) -> dict:
+        """Conduct research and return paths to generated files.
+
+        Returns:
+            dict: ``{"output": {...}}`` holding the paths returned by
+            ``generate_report_files`` plus ``"json"``, the relative path of
+            the handler's log file.
+        """
+        await self.researcher.conduct_research()
+        report = await self.researcher.write_report()
+
+        # Generate the files
+        sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}")
+        file_paths = await generate_report_files(report, sanitized_filename)
+
+        # Get the JSON log path that was created by CustomLogsHandler
+        json_relative_path = os.path.relpath(self.logs_handler.log_file)
+
+        return {
+            "output": {
+                **file_paths,  # Include PDF, DOCX, and MD paths
+                "json": json_relative_path
+            }
+        }
def sanitize_filename(filename: str) -> str:
- return re.sub(r"[^\w\s-]", "", filename).strip()
+    """Strip unsafe characters from ``filename`` and cap its length.
+
+    Names shaped like ``<prefix>_<timestamp>_<task>`` have only the task
+    part truncated, so the prefix and timestamp always survive. Any other
+    name is truncated as a whole instead of raising ``ValueError``.
+
+    Args:
+        filename: Proposed file name, without directory or extension.
+
+    Returns:
+        The name with every character other than word characters,
+        whitespace and ``-`` removed, stripped of surrounding whitespace.
+    """
+    # Calculate max length for task portion
+    # 255 - len("outputs/") - len("task_") - len(timestamp) - len("_.json") - safety_margin
+    max_task_length = 255 - 8 - 5 - 10 - 6 - 10  # = 216 chars for task
+
+    # Split into at most three components; the task keeps its own underscores
+    parts = filename.split('_', 2)
+    if len(parts) == 3:
+        prefix, timestamp, task = parts
+        candidate = f"{prefix}_{timestamp}_{task[:max_task_length]}"
+    else:
+        # Not in task_<timestamp>_<task> form: sanitize the whole name
+        candidate = filename[:max_task_length]
+
+    return re.sub(r"[^\w\s-]", "", candidate).strip()
async def handle_start_command(websocket, data: str, manager):
@@ -23,13 +124,31 @@ async def handle_start_command(websocket, data: str, manager):
print("Error: Missing task or report_type")
return
+ # Create logs handler with websocket and task
+ logs_handler = CustomLogsHandler(websocket, task)
+ # Initialize log content with query
+ await logs_handler.send_json({
+ "query": task,
+ "sources": [],
+ "context": [],
+ "report": ""
+ })
+
sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
report = await manager.start_streaming(
- task, report_type, report_source, source_urls, tone, websocket, headers
+ task,
+ report_type,
+ report_source,
+ source_urls,
+ tone,
+ logs_handler,
+ headers
)
report = str(report)
file_paths = await generate_report_files(report, sanitized_filename)
+ # Add JSON log path to file_paths
+ file_paths["json"] = os.path.relpath(logs_handler.log_file)
await send_file_paths(websocket, file_paths)
diff --git a/frontend/index.html b/frontend/index.html
index 279381f62..f55c5dc6c 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -143,13 +143,11 @@
Research Report
diff --git a/frontend/nextjs/app/page.tsx b/frontend/nextjs/app/page.tsx
index e5bd5fd5e..2cc301af9 100644
--- a/frontend/nextjs/app/page.tsx
+++ b/frontend/nextjs/app/page.tsx
@@ -257,6 +257,7 @@ export default function Home() {
orderedData={orderedData}
answer={answer}
allLogs={allLogs}
+ chatBoxSettings={chatBoxSettings}
handleClickSuggestion={handleClickSuggestion}
/>
diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
index 080e5c91c..f35a3a159 100644
--- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
+++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx
@@ -1,44 +1,55 @@
+import React from 'react';
import {getHost} from '../../helpers/getHost'
interface AccessReportProps {
- accessData: any;
- report: any;
+ accessData: {
+ pdf?: string;
+ docx?: string;
+ json?: string;
+ };
+ chatBoxSettings: any;
+ logs?: any[];
+ report: string;
}
-const AccessReport: React.FC = ({ accessData, report }) => {
+const AccessReport: React.FC = ({ accessData, chatBoxSettings, report }) => {
const host = getHost();
-
- const copyToClipboard = () => {
- if (navigator.clipboard) {
- navigator.clipboard.writeText(report).catch(err => {
- console.error('Failed to copy: ', err);
- });
- } else {
- console.warn('Clipboard API is not available');
+ const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => {
+ if (!accessData[dataType]) {
+ console.warn(`No ${dataType} path provided`);
+ return '#';
}
- };
-
- const getReportLink = (dataType:string) => {
- return `${host}/${accessData[dataType]}`;
+ // Remove any leading slashes to prevent double slashes in URL
+ const path = accessData[dataType]?.replace(/^\//, '');
+ return `${host}/${path}`;
};
return (
);
-}
+};
export default AccessReport;
\ No newline at end of file
diff --git a/frontend/nextjs/components/ResearchResults.tsx b/frontend/nextjs/components/ResearchResults.tsx
index 218ad661b..7cc1c18b2 100644
--- a/frontend/nextjs/components/ResearchResults.tsx
+++ b/frontend/nextjs/components/ResearchResults.tsx
@@ -13,6 +13,7 @@ interface ResearchResultsProps {
orderedData: Data[];
answer: string;
allLogs: any[];
+ chatBoxSettings: any;
handleClickSuggestion: (value: string) => void;
}
@@ -20,6 +21,7 @@ export const ResearchResults: React.FC = ({
orderedData,
answer,
allLogs,
+ chatBoxSettings,
handleClickSuggestion
}) => {
const groupedData = preprocessOrderedData(orderedData);
@@ -72,7 +74,7 @@ export const ResearchResults: React.FC = ({
{sourceComponents}
{imageComponents}
{finalReport && }
- {pathData && }
+ {pathData && }
{chatComponents}
>
);
diff --git a/frontend/scripts.js b/frontend/scripts.js
index 1e1e90a4c..abcefb890 100644
--- a/frontend/scripts.js
+++ b/frontend/scripts.js
@@ -104,12 +104,30 @@ const GPTResearcher = (() => {
}
const updateDownloadLink = (data) => {
- const pdf_path = data.output.pdf
- const docx_path = data.output.docx
- const md_path = data.output.md;
- document.getElementById('downloadLink').setAttribute('href', pdf_path);
- document.getElementById('downloadLinkWord').setAttribute('href', docx_path);
- document.getElementById("downloadLinkMd").setAttribute("href", md_path);
+ // Point the download links at the generated report files.
+ // Reads the optional pdf, docx, md and json paths from data.output;
+ // logs an error and does nothing when data.output is missing.
+ if (!data.output) {
+ console.error('No output data received');
+ return;
+ }
+
+ const { pdf, docx, md, json } = data.output;
+ console.log('Received paths:', { pdf, docx, md, json });
+
+ // Set the href of the element with the given id and remove its 'disabled'
+ // class, but only when both the element and the path exist; otherwise warn.
+ const updateLink = (id, path) => {
+ const element = document.getElementById(id);
+ if (element && path) {
+ console.log(`Setting ${id} href to:`, path);
+ element.setAttribute('href', path);
+ element.classList.remove('disabled');
+ } else {
+ console.warn(`Either element ${id} not found or path not provided`);
+ }
+ };
+
+ updateLink('downloadLink', pdf);
+ updateLink('downloadLinkWord', docx);
+ updateLink('downloadLinkMd', md);
+ updateLink('downloadLinkJson', json);
}
const updateScroll = () => {
diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py
index 3ebcd2347..75dba2531 100644
--- a/gpt_researcher/agent.py
+++ b/gpt_researcher/agent.py
@@ -48,6 +48,7 @@ def __init__(
context=[],
headers: dict = None,
max_subtopics: int = 5,
+ log_handler=None,
):
self.query = query
self.report_type = report_type
@@ -79,6 +80,7 @@ def __init__(
self.memory = Memory(
self.cfg.embedding_provider, self.cfg.embedding_model, **self.cfg.embedding_kwargs
)
+ self.log_handler = log_handler
# Initialize components
self.research_conductor: ResearchConductor = ResearchConductor(self)
@@ -87,8 +89,36 @@ def __init__(
self.scraper_manager: BrowserManager = BrowserManager(self)
self.source_curator: SourceCurator = SourceCurator(self)
+ async def _log_event(self, event_type: str, **kwargs):
+     """Forward an event to ``self.log_handler`` and mirror it to the ``research`` logger.
+
+     Does nothing when no log handler is set. A failure inside the handler
+     hook is logged and swallowed so that logging can never abort research,
+     and the mirror log line is written regardless of that failure.
+
+     Args:
+         event_type: ``"tool"``, ``"action"`` or ``"research"``; any other
+             value is only written to the ``research`` logger.
+         **kwargs: Event payload. ``tool_name``, ``action``, ``step`` and
+             ``details`` are read depending on ``event_type``.
+     """
+     if not self.log_handler:
+         return
+
+     # Local imports keep this method self-contained
+     import json
+     import logging
+     research_logger = logging.getLogger('research')
+
+     try:
+         # NOTE(review): 'tool_name' / 'action' are passed both positionally
+         # and inside **kwargs; confirm the handler signature accepts that.
+         if event_type == "tool":
+             await self.log_handler.on_tool_start(kwargs.get('tool_name', ''), **kwargs)
+         elif event_type == "action":
+             await self.log_handler.on_agent_action(kwargs.get('action', ''), **kwargs)
+         elif event_type == "research":
+             await self.log_handler.on_research_step(kwargs.get('step', ''), kwargs.get('details', {}))
+     except Exception as e:
+         research_logger.error(f"Error in _log_event: {e}", exc_info=True)
+
+     # Direct logging as backup: kept outside the try above so a failing
+     # handler hook does not suppress it.
+     try:
+         research_logger.info(f"{event_type}: {json.dumps(kwargs, default=str)}")
+     except (TypeError, ValueError) as e:
+         research_logger.error(f"Error in _log_event: {e}", exc_info=True)
+
async def conduct_research(self):
+ await self._log_event("research", step="start", details={
+ "query": self.query,
+ "report_type": self.report_type,
+ "agent": self.agent,
+ "role": self.role
+ })
+
if not (self.agent and self.role):
+ await self._log_event("action", action="choose_agent")
self.agent, self.role = await choose_agent(
query=self.query,
cfg=self.cfg,
@@ -96,22 +126,50 @@ async def conduct_research(self):
cost_callback=self.add_costs,
headers=self.headers,
)
-
+ await self._log_event("action", action="agent_selected", details={
+ "agent": self.agent,
+ "role": self.role
+ })
+
+ await self._log_event("research", step="conducting_research", details={
+ "agent": self.agent,
+ "role": self.role
+ })
self.context = await self.research_conductor.conduct_research()
+
+ await self._log_event("research", step="research_completed", details={
+ "context_length": len(self.context)
+ })
return self.context
async def write_report(self, existing_headers: list = [], relevant_written_contents: list = [], ext_context=None) -> str:
- return await self.report_generator.write_report(
+ await self._log_event("research", step="writing_report", details={
+ "existing_headers": existing_headers,
+ "context_source": "external" if ext_context else "internal"
+ })
+
+ report = await self.report_generator.write_report(
existing_headers,
relevant_written_contents,
ext_context or self.context
)
+
+ await self._log_event("research", step="report_completed", details={
+ "report_length": len(report)
+ })
+ return report
async def write_report_conclusion(self, report_body: str) -> str:
- return await self.report_generator.write_report_conclusion(report_body)
+ await self._log_event("research", step="writing_conclusion")
+ conclusion = await self.report_generator.write_report_conclusion(report_body)
+ await self._log_event("research", step="conclusion_completed")
+ return conclusion
async def write_introduction(self):
- return await self.report_generator.write_introduction()
+ await self._log_event("research", step="writing_introduction")
+ intro = await self.report_generator.write_introduction()
+ await self._log_event("research", step="introduction_completed")
+ return intro
async def get_subtopics(self):
return await self.report_generator.get_subtopics()
@@ -174,3 +232,8 @@ def add_costs(self, cost: float) -> None:
if not isinstance(cost, (float, int)):
raise ValueError("Cost must be an integer or float")
self.research_costs += cost
+ if self.log_handler:
+     # add_costs is synchronous, so the _log_event coroutine cannot be
+     # awaited here. Calling it bare only creates a coroutine object that
+     # never runs; schedule it on the running event loop instead.
+     import asyncio
+     try:
+         loop = asyncio.get_running_loop()
+     except RuntimeError:
+         loop = None  # called outside an event loop: nothing to schedule on
+     if loop is not None:
+         task = loop.create_task(self._log_event("research", step="cost_update", details={
+             "cost": cost,
+             "total_cost": self.research_costs
+         }))
+         # The event loop holds only weak references to tasks, so keep a
+         # strong one until the task has finished.
+         pending = getattr(self, "_pending_log_tasks", None)
+         if pending is None:
+             pending = self._pending_log_tasks = set()
+         pending.add(task)
+         task.add_done_callback(pending.discard)
diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py
index ebe47ae56..89c0de507 100644
--- a/gpt_researcher/skills/researcher.py
+++ b/gpt_researcher/skills/researcher.py
@@ -2,11 +2,13 @@
import random
import json
from typing import Dict, Optional
+import logging
from ..actions.utils import stream_output
from ..actions.query_processing import plan_research_outline, get_search_results
from ..document import DocumentLoader, LangChainDocumentLoader
from ..utils.enum import ReportSource, ReportType, Tone
+from ..utils.logging_config import get_json_handler, get_research_logger
class ResearchConductor:
@@ -14,8 +16,12 @@ class ResearchConductor:
def __init__(self, researcher):
self.researcher = researcher
+ self.logger = logging.getLogger('research')
+ self.json_handler = get_json_handler()
async def plan_research(self, query):
+ self.logger.info(f"Planning research for query: {query}")
+
await stream_output(
"logs",
"planning_research",
@@ -24,15 +30,16 @@ async def plan_research(self, query):
)
search_results = await get_search_results(query, self.researcher.retrievers[0])
+ self.logger.info(f"Initial search results obtained: {len(search_results)} results")
await stream_output(
"logs",
"planning_research",
- f"š¤ Planning the research strategy and subtasks (this may take a minute)...",
+ f"š¤ Planning the research strategy and subtasks...",
self.researcher.websocket,
)
- return await plan_research_outline(
+ outline = await plan_research_outline(
query=query,
search_results=search_results,
agent_role_prompt=self.researcher.role,
@@ -41,11 +48,16 @@ async def plan_research(self, query):
report_type=self.researcher.report_type,
cost_callback=self.researcher.add_costs,
)
+ self.logger.info(f"Research outline planned: {outline}")
+ return outline
async def conduct_research(self):
- """
- Runs the GPT Researcher to conduct research
- """
+ """Runs the GPT Researcher to conduct research"""
+ if self.json_handler:
+ self.json_handler.update_content("query", self.researcher.query)
+
+ self.logger.info(f"Starting research for query: {self.researcher.query}")
+
# Reset visited_urls and source_urls at the start of each research task
self.researcher.visited_urls.clear()
research_data = []
@@ -63,21 +75,25 @@ async def conduct_research(self):
# Research for relevant sources based on source types below
if self.researcher.source_urls:
- # If specified, the researcher will use the given urls as the context for the research.
+ self.logger.info("Using provided source URLs")
research_data = await self._get_context_by_urls(self.researcher.source_urls)
- if research_data and len(research_data) == 0 and self.verbose:
- # Could not find any relevant resources in source_urls to answer the query or sub-query. Will answer using model's inherent knowledge
+ if research_data and len(research_data) == 0 and self.researcher.verbose:
await stream_output(
"logs",
"answering_from_memory",
f"š§ I was unable to find relevant context in the provided sources...",
- self.websocket,
+ self.researcher.websocket,
)
- # If complement_source_urls parameter is set, more resources can be gathered to create additional context using default web search
if self.researcher.complement_source_urls:
+ self.logger.info("Complementing with web search")
additional_research = await self._get_context_by_web_search(self.researcher.query)
research_data += ' '.join(additional_research)
+ elif self.researcher.report_source == ReportSource.Web.value:
+ self.logger.info("Using web search")
+ research_data = await self._get_context_by_web_search(self.researcher.query)
+
+ # Other report sources (local documents, vector store, ...) are handled below
elif self.researcher.report_source == ReportSource.Local.value:
document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
if self.researcher.vector_store:
@@ -106,13 +122,11 @@ async def conduct_research(self):
elif self.researcher.report_source == ReportSource.LangChainVectorStore.value:
research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter)
- # Default web based research
- elif self.researcher.report_source == ReportSource.Web.value:
- research_data = await self._get_context_by_web_search(self.researcher.query)
- # Rank and curate the sources based on the research data
+ # Rank and curate the sources
self.researcher.context = research_data
if self.researcher.cfg.curate_sources:
+ self.logger.info("Curating sources")
self.researcher.context = await self.researcher.source_curator.curate_sources(research_data)
if self.researcher.verbose:
@@ -122,28 +136,34 @@ async def conduct_research(self):
f"Finalized research step.\nšø Total Research Costs: ${self.researcher.get_costs()}",
self.researcher.websocket,
)
+ if self.json_handler:
+ self.json_handler.update_content("costs", self.researcher.get_costs())
+ self.json_handler.update_content("context", self.researcher.context)
+ self.logger.info(f"Research completed. Context size: {len(str(self.researcher.context))}")
return self.researcher.context
async def _get_context_by_urls(self, urls):
- """
- Scrapes and compresses the context from the given urls
- """
+ """Scrapes and compresses the context from the given urls"""
+ self.logger.info(f"Getting context from URLs: {urls}")
+
new_search_urls = await self._get_new_urls(urls)
- if self.researcher.verbose:
- await stream_output(
- "logs",
- "source_urls",
- f"šļø I will conduct my research based on the following urls: {new_search_urls}...",
- self.researcher.websocket,
- )
+ self.logger.info(f"New URLs to process: {new_search_urls}")
scraped_content = await self.researcher.scraper_manager.browse_urls(new_search_urls)
+ self.logger.info(f"Scraped content from {len(scraped_content)} URLs")
if self.researcher.vector_store:
+ self.logger.info("Loading content into vector store")
self.researcher.vector_store.load(scraped_content)
- return await self.researcher.context_manager.get_similar_content_by_query(self.researcher.query, scraped_content)
+ context = await self.researcher.context_manager.get_similar_content_by_query(
+ self.researcher.query, scraped_content
+ )
+ self.logger.info(f"Generated context length: {len(context)}")
+ return context
+
+ # TODO: add equivalent logging to the remaining context-gathering methods
async def _get_context_by_vectorstore(self, query, filter: Optional[dict] = None):
"""
@@ -183,8 +203,12 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []):
Returns:
context: List of context
"""
+ self.logger.info(f"Starting web search for query: {query}")
+
# Generate Sub-Queries including original query
sub_queries = await self.plan_research(query)
+ self.logger.info(f"Generated sub-queries: {sub_queries}")
+
# If this is not part of a sub researcher, add original query to research for better results
if self.researcher.report_type != "subtopic_report":
sub_queries.append(query)
@@ -200,24 +224,33 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []):
)
# Using asyncio.gather to process the sub_queries asynchronously
- context = await asyncio.gather(
- *[
- self._process_sub_query(sub_query, scraped_data)
- for sub_query in sub_queries
- ]
- )
- return context
+ try:
+ context = await asyncio.gather(
+ *[
+ self._process_sub_query(sub_query, scraped_data)
+ for sub_query in sub_queries
+ ]
+ )
+ self.logger.info(f"Gathered context from {len(context)} sub-queries")
+ # Filter out empty results and join the context
+ context = [c for c in context if c]
+ if context:
+ combined_context = " ".join(context)
+ self.logger.info(f"Combined context size: {len(combined_context)}")
+ return combined_context
+ return []
+ except Exception as e:
+ self.logger.error(f"Error during web search: {e}", exc_info=True)
+ return []
async def _process_sub_query(self, sub_query: str, scraped_data: list = []):
- """Takes in a sub query and scrapes urls based on it and gathers context.
-
- Args:
- sub_query (str): The sub-query generated from the original query
- scraped_data (list): Scraped data passed in
-
- Returns:
- str: The context gathered from search
- """
+ """Takes in a sub query and scrapes urls based on it and gathers context."""
+ if self.json_handler:
+ self.json_handler.log_event("sub_query", {
+ "query": sub_query,
+ "scraped_data_size": len(scraped_data)
+ })
+
if self.researcher.verbose:
await stream_output(
"logs",
@@ -226,23 +259,35 @@ async def _process_sub_query(self, sub_query: str, scraped_data: list = []):
self.researcher.websocket,
)
- if not scraped_data:
- scraped_data = await self._scrape_data_by_urls(sub_query)
+ try:
+ if not scraped_data:
+ scraped_data = await self._scrape_data_by_urls(sub_query)
+ self.logger.info(f"Scraped data size: {len(scraped_data)}")
- content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data)
+ content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data)
+ self.logger.info(f"Content found for sub-query: {len(str(content)) if content else 0} chars")
- if content and self.researcher.verbose:
- await stream_output(
- "logs", "subquery_context_window", f"š {content}", self.researcher.websocket
- )
- elif self.researcher.verbose:
- await stream_output(
- "logs",
- "subquery_context_not_found",
- f"š¤· No content found for '{sub_query}'...",
- self.researcher.websocket,
- )
- return content
+ if content and self.researcher.verbose:
+ await stream_output(
+ "logs", "subquery_context_window", f"š {content}", self.researcher.websocket
+ )
+ elif self.researcher.verbose:
+ await stream_output(
+ "logs",
+ "subquery_context_not_found",
+ f"š¤· No content found for '{sub_query}'...",
+ self.researcher.websocket,
+ )
+ if content:
+ if self.json_handler:
+ self.json_handler.log_event("content_found", {
+ "sub_query": sub_query,
+ "content_size": len(content)
+ })
+ return content
+ except Exception as e:
+ self.logger.error(f"Error processing sub-query {sub_query}: {e}", exc_info=True)
+ return ""
async def _process_sub_query_with_vectorstore(self, sub_query: str, filter: Optional[dict] = None):
"""Takes in a sub query and gathers context from the user provided vector store
diff --git a/gpt_researcher/utils/logging_config.py b/gpt_researcher/utils/logging_config.py
new file mode 100644
index 000000000..ee0d855ed
--- /dev/null
+++ b/gpt_researcher/utils/logging_config.py
@@ -0,0 +1,82 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+class JSONResearchHandler:
+    """Keep a research record in memory and mirror it to a JSON file.
+
+    The record has three top-level keys: ``timestamp`` (creation time),
+    ``events`` (an append-only list) and ``content`` (query, sources,
+    context, report and costs). The whole file is rewritten after every
+    change.
+    """
+
+    def __init__(self, json_file):
+        """Build the empty record; nothing is written until the first update.
+
+        Args:
+            json_file: Path of the JSON file to write (anything ``open`` accepts).
+        """
+        self.json_file = json_file
+        self.research_data = {
+            "timestamp": datetime.now().isoformat(),
+            "events": [],
+            "content": {
+                "query": "",
+                "sources": [],
+                "context": [],
+                "report": "",
+                "costs": 0.0,
+            },
+        }
+
+    def log_event(self, event_type: str, data: dict):
+        """Append a timestamped event of ``event_type`` carrying ``data`` and save."""
+        self.research_data["events"].append({
+            "timestamp": datetime.now().isoformat(),
+            "type": event_type,
+            "data": data,
+        })
+        self._save_json()
+
+    def update_content(self, key: str, value):
+        """Set ``content[key]`` to ``value`` and save."""
+        self.research_data["content"][key] = value
+        self._save_json()
+
+    def _save_json(self):
+        """Rewrite the JSON file from the in-memory record.
+
+        ``default=str`` stringifies values the json module cannot encode
+        (for example ``datetime`` or ``Path`` objects) instead of raising
+        ``TypeError`` from inside a logging call. The file is always written
+        as UTF-8 so the result does not depend on the platform locale.
+        """
+        with open(self.json_file, "w", encoding="utf-8") as f:
+            json.dump(self.research_data, f, indent=2, default=str)
+
+def setup_research_logging():
+    """Configure the ``research`` logger and create a JSON handler for it.
+
+    Creates a ``logs`` directory in the current working directory if needed,
+    then gives the ``research`` logger a timestamped file handler and a
+    console handler. Propagation to the root logger is switched off so
+    records are not printed twice.
+
+    Returns:
+        tuple: ``(log_file, json_file, research_logger, json_handler)``;
+        the two paths are returned as strings.
+    """
+    logs_dir = Path("logs")
+    logs_dir.mkdir(exist_ok=True)
+
+    # One timestamp so the .log and .json files of a run share a name
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_file = logs_dir / f"research_{timestamp}.log"
+    json_file = logs_dir / f"research_{timestamp}.json"
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # UTF-8 so non-ASCII log text does not depend on the platform default encoding
+    file_handler = logging.FileHandler(log_file, encoding="utf-8")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(formatter)
+
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+
+    research_logger = logging.getLogger('research')
+    research_logger.setLevel(logging.INFO)
+
+    # Close handlers left over from an earlier call before dropping them;
+    # clearing the list alone would leave the previous log file open.
+    for old_handler in list(research_logger.handlers):
+        research_logger.removeHandler(old_handler)
+        old_handler.close()
+
+    research_logger.addHandler(file_handler)
+    research_logger.addHandler(console_handler)
+
+    # Prevent propagation to root logger to avoid duplicate logs
+    research_logger.propagate = False
+
+    json_handler = JSONResearchHandler(json_file)
+    # get_json_handler() reads this attribute from the logger, so attach it
+    # here rather than relying on every caller to do it.
+    research_logger.json_handler = json_handler
+
+    return str(log_file), str(json_file), research_logger, json_handler
+
+def get_research_logger():
+    """Return the logger registered under the name ``research``."""
+    research_logger = logging.getLogger('research')
+    return research_logger
+
+def get_json_handler():
+    """Return the ``json_handler`` attribute of the ``research`` logger.
+
+    Returns ``None`` when no such attribute has been set on the logger.
+    """
+    research_logger = logging.getLogger('research')
+    return getattr(research_logger, 'json_handler', None)
diff --git a/main.py b/main.py
index 0f48c2cba..10057a495 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,25 @@
from dotenv import load_dotenv
+import logging
+from pathlib import Path
+
+# Create logs directory if it doesn't exist
+logs_dir = Path("logs")
+logs_dir.mkdir(exist_ok=True)
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ handlers=[
+ # File handler for general application logs
+ logging.FileHandler('logs/app.log'),
+ # Stream handler for console output
+ logging.StreamHandler()
+ ]
+)
+
+# Create logger instance
+logger = logging.getLogger(__name__)
load_dotenv()
@@ -6,5 +27,6 @@
if __name__ == "__main__":
import uvicorn
-
- uvicorn.run(app, host="0.0.0.0", port=8000)
+
+ logger.info("Starting server...")
+ uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index d2db4d9d7..cab6c1c77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,4 +45,11 @@ websockets = "^13.1"
[build-system]
requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
\ No newline at end of file
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+asyncio_mode = "strict"
+addopts = "-v"
+testpaths = ["tests"]
+python_files = "test_*.py"
+asyncio_fixture_loop_scope = "function"
\ No newline at end of file
diff --git a/tests/gptr-logs-handler.py b/tests/gptr-logs-handler.py
index fb05694ce..0bbec93a4 100644
--- a/tests/gptr-logs-handler.py
+++ b/tests/gptr-logs-handler.py
@@ -2,25 +2,7 @@
from typing import List, Dict, Any
import asyncio
from gpt_researcher import GPTResearcher
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
- def clear_logs(self) -> None:
- """Clear the logs."""
- self.logs.clear() # Clear the logs list
- logging.info("Logs cleared.") # Log the clearing action
+from src.logs_handler import CustomLogsHandler # Update import
async def run() -> None:
"""Run the research process and generate a report."""
@@ -30,7 +12,7 @@ async def run() -> None:
tone = "informative"
config_path = None
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter
researcher = GPTResearcher(
query=query,
diff --git a/tests/report-types.py b/tests/report-types.py
index 073f8336e..e09fec100 100644
--- a/tests/report-types.py
+++ b/tests/report-types.py
@@ -2,23 +2,9 @@
import asyncio
import pytest
from gpt_researcher.agent import GPTResearcher
-import logging
+from src.logs_handler import CustomLogsHandler # Update import
from typing import List, Dict, Any
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
-
# Define the report types to test
report_types = [
"research_report",
@@ -39,7 +25,7 @@ async def test_gpt_researcher(report_type):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query)
# Create an instance of GPTResearcher
researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler)
diff --git a/tests/research_test.py b/tests/research_test.py
index b58d5b92a..56077f8fd 100644
--- a/tests/research_test.py
+++ b/tests/research_test.py
@@ -18,23 +18,10 @@
import asyncio
import logging
from typing import List, Dict, Any
-
-class CustomLogsHandler:
- """A custom Logs handler class to handle JSON data."""
- def __init__(self):
- self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data
- logging.basicConfig(level=logging.INFO) # Set up logging configuration
-
- async def send_json(self, data: Dict[str, Any]) -> None:
- """Send JSON data and log it, with error handling."""
- try:
- self.logs.append(data) # Append data to logs
- logging.info(f"My custom Log: {data}") # Use logging instead of print
- except Exception as e:
- logging.error(f"Error logging data: {e}") # Log any errors
+from src.logs_handler import CustomLogsHandler # Update import
async def get_report(query: str, report_type: str, sources: list) -> str:
- custom_logs_handler = CustomLogsHandler()
+ custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter
researcher = GPTResearcher(query=query,
report_type=report_type,
complement_source_urls=False,
diff --git a/tests/test_logging.py b/tests/test_logging.py
new file mode 100644
index 000000000..c6ff963b7
--- /dev/null
+++ b/tests/test_logging.py
@@ -0,0 +1,61 @@
+import pytest
+from unittest.mock import AsyncMock
+from fastapi import WebSocket
+from src.logs_handler import CustomLogsHandler
+import os
+import json
+
+@pytest.mark.asyncio
+async def test_custom_logs_handler():
+    """A "logs" payload is relayed to the websocket and stored as one event."""
+    # Stand-in websocket whose send_json calls can be awaited and inspected.
+    websocket_stub = AsyncMock()
+    websocket_stub.send_json = AsyncMock()
+
+    logs_handler = CustomLogsHandler(websocket_stub, "test_query")
+
+    # The log file is expected to exist as soon as the handler is built.
+    assert os.path.exists(logs_handler.log_file)
+
+    payload = {
+        "type": "logs",
+        "message": "Test log message",
+    }
+    await logs_handler.send_json(payload)
+
+    # The websocket must have received the payload exactly once, unchanged.
+    websocket_stub.send_json.assert_called_once_with(payload)
+
+    # Read the file back: it should hold a single event wrapping the payload.
+    with open(logs_handler.log_file, 'r') as log_fp:
+        persisted = json.load(log_fp)
+
+    recorded_events = persisted['events']
+    assert len(recorded_events) == 1
+    assert recorded_events[0]['data'] == payload
+
+@pytest.mark.asyncio
+async def test_content_update():
+    """A payload of content fields is relayed and saved under "content"."""
+    websocket_stub = AsyncMock()
+    websocket_stub.send_json = AsyncMock()
+    logs_handler = CustomLogsHandler(websocket_stub, "test_query")
+
+    # No "type" key here: only fields that belong to the content section.
+    content_data = {
+        "query": "test query",
+        "sources": ["source1", "source2"],
+        "report": "test report",
+    }
+    await logs_handler.send_json(content_data)
+
+    # The websocket still receives the payload exactly once, unchanged.
+    websocket_stub.send_json.assert_called_once_with(content_data)
+
+    # Each field must show up under "content" in the persisted JSON.
+    with open(logs_handler.log_file, 'r') as log_fp:
+        saved_content = json.load(log_fp)['content']
+
+    assert saved_content['query'] == "test query"
+    assert saved_content['sources'] == ["source1", "source2"]
+    assert saved_content['report'] == "test report"
\ No newline at end of file
diff --git a/tests/test_logs.py b/tests/test_logs.py
new file mode 100644
index 000000000..0f2353959
--- /dev/null
+++ b/tests/test_logs.py
@@ -0,0 +1,48 @@
+import os
+from pathlib import Path
+import sys
+
+# Put the repository root (the parent of this tests/ directory) on sys.path so the "src" import below resolves
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+from src.logs_handler import CustomLogsHandler
+
+def test_logs_creation():
+    """Check that logs/ is writable and that CustomLogsHandler can be created.
+
+    Progress is printed for manual runs; any failure is reported and then
+    re-raised so the test runner records it instead of passing silently.
+    """
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Project root: {project_root}")
+
+    logs_dir = project_root / "logs"
+    print(f"Attempting to create logs directory at: {logs_dir}")
+
+    try:
+        # mode is only a request: the process umask may clear some bits.
+        os.makedirs(logs_dir, mode=0o777, exist_ok=True)
+        print(f"[OK] Created directory: {logs_dir}")
+
+        test_file = logs_dir / "test.txt"
+        with open(test_file, 'w') as f:
+            f.write("Test log entry")
+        print(f"[OK] Created test file: {test_file}")
+
+        # Built with no arguments, i.e. without a websocket or a query.
+        handler = CustomLogsHandler()
+        print("[OK] CustomLogsHandler initialized")
+
+        handler.logs.append({"test": "message"})
+        print("[OK] Added test log entry")
+
+    except Exception as e:
+        print(f"[FAIL] Error: {str(e)}")
+        print(f"Error type: {type(e)}")
+        import traceback
+        print(f"Traceback: {traceback.format_exc()}")
+        raise  # without this the error is swallowed and the test cannot fail
+
+if __name__ == "__main__":  # lets this file be run directly as a script, without pytest
+ test_logs_creation()
\ No newline at end of file
diff --git a/tests/test_researcher_logging.py b/tests/test_researcher_logging.py
new file mode 100644
index 000000000..ebf6e7a94
--- /dev/null
+++ b/tests/test_researcher_logging.py
@@ -0,0 +1,71 @@
+import pytest
+import asyncio
+from pathlib import Path
+import sys
+import logging
+
+# Put the repository root (the parent of this tests/ directory) on sys.path so the "src" import inside the test resolves
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+# Root logging at INFO so the progress messages this module logs are emitted
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+@pytest.mark.asyncio
+async def test_researcher_logging():
+    """
+    End-to-end check that running a Researcher leaves log artifacts behind.
+
+    Runs one research task, requires a non-empty report, then requires at
+    least one research_*.log and one research_*.json file under logs/.
+    Any failure is logged and re-raised so the test still fails.
+    """
+    try:
+        # Deferred import: a missing module is reported by the handler below.
+        from src.researcher import Researcher
+        logger.info("Successfully imported Researcher class")
+
+        research_kwargs = {
+            "query": "Test query for logging verification",
+            "report_type": "research_report",
+        }
+        researcher = Researcher(**research_kwargs)
+        logger.info("Created Researcher instance")
+
+        report = await researcher.research()
+        logger.info("Research completed successfully!")
+        logger.info(f"Report length: {len(report)}")
+
+        # The report must exist and carry some content.
+        assert report is not None
+        assert len(report) > 0
+
+        # Collect whatever research artifacts are present under logs/.
+        logs_dir = project_root / "logs"
+        log_files = [*logs_dir.glob("research_*.log")]
+        json_files = [*logs_dir.glob("research_*.json")]
+
+        assert log_files, "No log files were created"
+        assert json_files, "No JSON files were created"
+
+        # Report what was found; only the file names are listed, their
+        # contents are not inspected by this test.
+        logger.info(f"\nFound {len(log_files)} log files:")
+        for found_log in log_files:
+            logger.info(f"- {found_log.name}")
+
+        logger.info(f"\nFound {len(json_files)} JSON files:")
+        for found_json in json_files:
+            logger.info(f"- {found_json.name}")
+
+    except ImportError as import_exc:
+        logger.error(f"Import error: {import_exc}")
+        logger.error("Make sure gpt_researcher is installed and in your PYTHONPATH")
+        raise
+    except Exception as exc:
+        logger.error(f"Error during research: {exc}")
+        raise
+
+if __name__ == "__main__":  # running the file directly hands this same file to pytest
+ pytest.main([__file__])
\ No newline at end of file