diff --git a/.gitignore b/.gitignore index 203892d2a..df3ad0a8a 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,8 @@ docs/build package-lock.json #Vim swp files -*.swp \ No newline at end of file +*.swp + +# Log files +logs/ +*.orig \ No newline at end of file diff --git a/backend/server/app.py b/backend/server/app.py new file mode 100644 index 000000000..ee886367b --- /dev/null +++ b/backend/server/app.py @@ -0,0 +1,16 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +import logging + +logger = logging.getLogger(__name__) + +app = FastAPI() + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # In production, replace with your frontend domain + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) \ No newline at end of file diff --git a/backend/server/logging_config.py b/backend/server/logging_config.py new file mode 100644 index 000000000..ad88044d2 --- /dev/null +++ b/backend/server/logging_config.py @@ -0,0 +1,83 @@ +import logging +import json +import os +from datetime import datetime +from pathlib import Path + +class JSONResearchHandler: + def __init__(self, json_file): + self.json_file = json_file + self.research_data = { + "timestamp": datetime.now().isoformat(), + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + } + + def log_event(self, event_type: str, data: dict): + self.research_data["events"].append({ + "timestamp": datetime.now().isoformat(), + "type": event_type, + "data": data + }) + self._save_json() + + def update_content(self, key: str, value): + self.research_data["content"][key] = value + self._save_json() + + def _save_json(self): + with open(self.json_file, 'w') as f: + json.dump(self.research_data, f, indent=2) + +def setup_research_logging(): + # Create logs directory if it doesn't exist + logs_dir = Path("logs") + logs_dir.mkdir(exist_ok=True) + + # Generate timestamp for log files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Create log file paths + log_file = logs_dir / f"research_{timestamp}.log" + json_file = logs_dir / f"research_{timestamp}.json" + + # Configure file handler for research logs + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + + # Get research logger and configure it + research_logger = logging.getLogger('research') + research_logger.setLevel(logging.INFO) + + # Remove any existing handlers to avoid duplicates + research_logger.handlers.clear() + + # Add file handler + research_logger.addHandler(file_handler) + + # Add stream handler for console output + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + research_logger.addHandler(console_handler) + + # Prevent propagation to root logger to avoid duplicate logs + research_logger.propagate = False + + # Create JSON handler + json_handler = JSONResearchHandler(json_file) + + return str(log_file), str(json_file), research_logger, json_handler + +# Create a function to get the logger and JSON handler +def get_research_logger(): + return logging.getLogger('research') + +def get_json_handler(): + return getattr(logging.getLogger('research'), 'json_handler', None) \ No newline at end of file diff --git a/backend/server/server.py b/backend/server/server.py index 939a2c419..fe12dfbaf 100644 
--- a/backend/server/server.py +++ b/backend/server/server.py @@ -15,6 +15,26 @@ execute_multi_agents, handle_websocket_communication ) +from gpt_researcher.utils.logging_config import setup_research_logging + +import logging + +# Get logger instance +logger = logging.getLogger(__name__) + +# Don't override parent logger settings +logger.propagate = True + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[ + logging.FileHandler("server_log.txt"), # Log to file + logging.StreamHandler() # Also print to console + ] +) + + # Models @@ -73,6 +93,12 @@ def startup_event(): os.makedirs("outputs", exist_ok=True) app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") os.makedirs(DOC_PATH, exist_ok=True) + + # Setup research logging + log_file, json_file, research_logger, json_handler = setup_research_logging() # Unpack all 4 values + research_logger.json_handler = json_handler # Store the JSON handler on the logger + research_logger.info(f"Research log file: {log_file}") + research_logger.info(f"Research JSON file: {json_file}") # Routes diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py index 77bc8aba3..26a7ac90e 100644 --- a/backend/server/server_utils.py +++ b/backend/server/server_utils.py @@ -4,14 +4,115 @@ import time import shutil from typing import Dict, List, Any -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, FileResponse from gpt_researcher.document.document import DocumentLoader -# Add this import from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md +from pathlib import Path +from datetime import datetime +from fastapi import HTTPException +import logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +class CustomLogsHandler: + """Custom handler to capture streaming logs from the research process""" + def __init__(self, websocket, task: str): + self.logs = [] + self.websocket = websocket + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}") + self.log_file = os.path.join("outputs", f"{sanitized_filename}.json") + self.timestamp = datetime.now().isoformat() + # Initialize log file with metadata + os.makedirs("outputs", exist_ok=True) + with open(self.log_file, 'w') as f: + json.dump({ + "timestamp": self.timestamp, + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + }, f, indent=2) + + async def send_json(self, data: Dict[str, Any]) -> None: + """Store log data and send to websocket""" + # Send to websocket for real-time display + if self.websocket: + await self.websocket.send_json(data) + + # Read current log file + with open(self.log_file, 'r') as f: + log_data = json.load(f) + + # Update appropriate section based on data type + if data.get('type') == 'logs': + log_data['events'].append({ + "timestamp": datetime.now().isoformat(), + "type": "event", + "data": data + }) + else: + # Update content section for other types of data + log_data['content'].update(data) + + # Save updated log file + with open(self.log_file, 'w') as f: + json.dump(log_data, f, indent=2) + logger.debug(f"Log entry written to: {self.log_file}") + + +class Researcher: + def __init__(self, query: str, report_type: str = "research_report"): + self.query = query + self.report_type = report_type + # Generate unique ID for this research task + self.research_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{hash(query)}" + # 
Initialize logs handler with research ID + self.logs_handler = CustomLogsHandler(self.research_id) + self.researcher = GPTResearcher( + query=query, + report_type=report_type, + websocket=self.logs_handler + ) + + async def research(self) -> dict: + """Conduct research and return paths to generated files""" + await self.researcher.conduct_research() + report = await self.researcher.write_report() + + # Generate the files + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}") + file_paths = await generate_report_files(report, sanitized_filename) + + # Get the JSON log path that was created by CustomLogsHandler + json_relative_path = os.path.relpath(self.logs_handler.log_file) + + return { + "output": { + **file_paths, # Include PDF, DOCX, and MD paths + "json": json_relative_path + } + } def sanitize_filename(filename: str) -> str: - return re.sub(r"[^\w\s-]", "", filename).strip() + # Split into components + prefix, timestamp, *task_parts = filename.split('_') + task = '_'.join(task_parts) + + # Calculate max length for task portion + # 255 - len("outputs/") - len("task_") - len(timestamp) - len("_.json") - safety_margin + max_task_length = 255 - 8 - 5 - 10 - 6 - 10 # ~216 chars for task + + # Truncate task if needed + truncated_task = task[:max_task_length] if len(task) > max_task_length else task + + # Reassemble and clean the filename + sanitized = f"{prefix}_{timestamp}_{truncated_task}" + return re.sub(r"[^\w\s-]", "", sanitized).strip() async def handle_start_command(websocket, data: str, manager): @@ -23,13 +124,31 @@ async def handle_start_command(websocket, data: str, manager): print("Error: Missing task or report_type") return + # Create logs handler with websocket and task + logs_handler = CustomLogsHandler(websocket, task) + # Initialize log content with query + await logs_handler.send_json({ + "query": task, + "sources": [], + "context": [], + "report": "" + }) + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}") report = await manager.start_streaming( - task, report_type, report_source, source_urls, tone, websocket, headers + task, + report_type, + report_source, + source_urls, + tone, + logs_handler, + headers ) report = str(report) file_paths = await generate_report_files(report, sanitized_filename) + # Add JSON log path to file_paths + file_paths["json"] = os.path.relpath(logs_handler.log_file) await send_file_paths(websocket, file_paths) diff --git a/frontend/index.html b/frontend/index.html index 279381f62..f55c5dc6c 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -143,13 +143,11 @@

 Research Report

-  Copy to clipboard (markdown)
-  Download as Markdown
-  Download as PDF
-  Download as Docx
+  Copy to clipboard (markdown)
+  Download as Markdown
+  Download as PDF
+  Download as Docx
+  Download Log
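The new "Download Log" entry above (together with the `downloadLinkJson` handling added to frontend/scripts.js further down) assumes the server now reports a "json" path next to the pdf/docx/md paths. A minimal sketch of that payload follows; the body of `send_file_paths` shown here is an assumption for illustration, not part of this patch — only the keys of the "output" object are taken from the diff.

# Illustrative sketch, not part of the patch: assumed shape of the message that
# updateDownloadLink() in frontend/scripts.js reads via data.output.
async def send_file_paths(websocket, file_paths: dict) -> None:
    # file_paths is expected to carry the paths assembled in server_utils.py, e.g.
    # {"pdf":  "outputs/task_1700000000_query.pdf",
    #  "docx": "outputs/task_1700000000_query.docx",
    #  "md":   "outputs/task_1700000000_query.md",
    #  "json": "outputs/task_1700000000_query.json"}
    await websocket.send_json({"type": "path", "output": file_paths})
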
diff --git a/frontend/nextjs/app/page.tsx b/frontend/nextjs/app/page.tsx index e5bd5fd5e..2cc301af9 100644 --- a/frontend/nextjs/app/page.tsx +++ b/frontend/nextjs/app/page.tsx @@ -257,6 +257,7 @@ export default function Home() { orderedData={orderedData} answer={answer} allLogs={allLogs} + chatBoxSettings={chatBoxSettings} handleClickSuggestion={handleClickSuggestion} /> diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx index 080e5c91c..f35a3a159 100644 --- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx +++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx @@ -1,44 +1,55 @@ +import React from 'react'; import {getHost} from '../../helpers/getHost' interface AccessReportProps { - accessData: any; - report: any; + accessData: { + pdf?: string; + docx?: string; + json?: string; + }; + chatBoxSettings: any; + logs?: any[]; + report: string; } -const AccessReport: React.FC = ({ accessData, report }) => { +const AccessReport: React.FC = ({ accessData, chatBoxSettings, report }) => { const host = getHost(); - - const copyToClipboard = () => { - if (navigator.clipboard) { - navigator.clipboard.writeText(report).catch(err => { - console.error('Failed to copy: ', err); - }); - } else { - console.warn('Clipboard API is not available'); + const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => { + if (!accessData[dataType]) { + console.warn(`No ${dataType} path provided`); + return '#'; } - }; - - const getReportLink = (dataType:string) => { - return `${host}/${accessData[dataType]}`; + // Remove any leading slashes to prevent double slashes in URL + const path = accessData[dataType]?.replace(/^\//, ''); + return `${host}/${path}`; }; return (
- + target="_blank" + rel="noopener noreferrer"> View as PDF - + target="_blank" + rel="noopener noreferrer"> Download DocX + {chatBoxSettings.report_type === 'research_report' && + Download Logs + }
); -} +}; export default AccessReport; \ No newline at end of file diff --git a/frontend/nextjs/components/ResearchResults.tsx b/frontend/nextjs/components/ResearchResults.tsx index 218ad661b..7cc1c18b2 100644 --- a/frontend/nextjs/components/ResearchResults.tsx +++ b/frontend/nextjs/components/ResearchResults.tsx @@ -13,6 +13,7 @@ interface ResearchResultsProps { orderedData: Data[]; answer: string; allLogs: any[]; + chatBoxSettings: any; handleClickSuggestion: (value: string) => void; } @@ -20,6 +21,7 @@ export const ResearchResults: React.FC = ({ orderedData, answer, allLogs, + chatBoxSettings, handleClickSuggestion }) => { const groupedData = preprocessOrderedData(orderedData); @@ -72,7 +74,7 @@ export const ResearchResults: React.FC = ({ {sourceComponents} {imageComponents} {finalReport && } - {pathData && } + {pathData && } {chatComponents} ); diff --git a/frontend/scripts.js b/frontend/scripts.js index 1e1e90a4c..abcefb890 100644 --- a/frontend/scripts.js +++ b/frontend/scripts.js @@ -104,12 +104,30 @@ const GPTResearcher = (() => { } const updateDownloadLink = (data) => { - const pdf_path = data.output.pdf - const docx_path = data.output.docx - const md_path = data.output.md; - document.getElementById('downloadLink').setAttribute('href', pdf_path); - document.getElementById('downloadLinkWord').setAttribute('href', docx_path); - document.getElementById("downloadLinkMd").setAttribute("href", md_path); + if (!data.output) { + console.error('No output data received'); + return; + } + + const { pdf, docx, md, json } = data.output; + console.log('Received paths:', { pdf, docx, md, json }); + + // Helper function to safely update link + const updateLink = (id, path) => { + const element = document.getElementById(id); + if (element && path) { + console.log(`Setting ${id} href to:`, path); + element.setAttribute('href', path); + element.classList.remove('disabled'); + } else { + console.warn(`Either element ${id} not found or path not provided`); + } + }; + + updateLink('downloadLink', pdf); + updateLink('downloadLinkWord', docx); + updateLink('downloadLinkMd', md); + updateLink('downloadLinkJson', json); } const updateScroll = () => { diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py index 3ebcd2347..75dba2531 100644 --- a/gpt_researcher/agent.py +++ b/gpt_researcher/agent.py @@ -48,6 +48,7 @@ def __init__( context=[], headers: dict = None, max_subtopics: int = 5, + log_handler=None, ): self.query = query self.report_type = report_type @@ -79,6 +80,7 @@ def __init__( self.memory = Memory( self.cfg.embedding_provider, self.cfg.embedding_model, **self.cfg.embedding_kwargs ) + self.log_handler = log_handler # Initialize components self.research_conductor: ResearchConductor = ResearchConductor(self) @@ -87,8 +89,36 @@ def __init__( self.scraper_manager: BrowserManager = BrowserManager(self) self.source_curator: SourceCurator = SourceCurator(self) + async def _log_event(self, event_type: str, **kwargs): + """Helper method to handle logging events""" + if self.log_handler: + try: + if event_type == "tool": + await self.log_handler.on_tool_start(kwargs.get('tool_name', ''), **kwargs) + elif event_type == "action": + await self.log_handler.on_agent_action(kwargs.get('action', ''), **kwargs) + elif event_type == "research": + await self.log_handler.on_research_step(kwargs.get('step', ''), kwargs.get('details', {})) + + # Add direct logging as backup + import logging + research_logger = logging.getLogger('research') + research_logger.info(f"{event_type}: {json.dumps(kwargs, 
default=str)}") + + except Exception as e: + import logging + logging.getLogger('research').error(f"Error in _log_event: {e}", exc_info=True) + async def conduct_research(self): + await self._log_event("research", step="start", details={ + "query": self.query, + "report_type": self.report_type, + "agent": self.agent, + "role": self.role + }) + if not (self.agent and self.role): + await self._log_event("action", action="choose_agent") self.agent, self.role = await choose_agent( query=self.query, cfg=self.cfg, @@ -96,22 +126,50 @@ async def conduct_research(self): cost_callback=self.add_costs, headers=self.headers, ) - + await self._log_event("action", action="agent_selected", details={ + "agent": self.agent, + "role": self.role + }) + + await self._log_event("research", step="conducting_research", details={ + "agent": self.agent, + "role": self.role + }) self.context = await self.research_conductor.conduct_research() + + await self._log_event("research", step="research_completed", details={ + "context_length": len(self.context) + }) return self.context async def write_report(self, existing_headers: list = [], relevant_written_contents: list = [], ext_context=None) -> str: - return await self.report_generator.write_report( + await self._log_event("research", step="writing_report", details={ + "existing_headers": existing_headers, + "context_source": "external" if ext_context else "internal" + }) + + report = await self.report_generator.write_report( existing_headers, relevant_written_contents, ext_context or self.context ) + + await self._log_event("research", step="report_completed", details={ + "report_length": len(report) + }) + return report async def write_report_conclusion(self, report_body: str) -> str: - return await self.report_generator.write_report_conclusion(report_body) + await self._log_event("research", step="writing_conclusion") + conclusion = await self.report_generator.write_report_conclusion(report_body) + await self._log_event("research", step="conclusion_completed") + return conclusion async def write_introduction(self): - return await self.report_generator.write_introduction() + await self._log_event("research", step="writing_introduction") + intro = await self.report_generator.write_introduction() + await self._log_event("research", step="introduction_completed") + return intro async def get_subtopics(self): return await self.report_generator.get_subtopics() @@ -174,3 +232,8 @@ def add_costs(self, cost: float) -> None: if not isinstance(cost, (float, int)): raise ValueError("Cost must be an integer or float") self.research_costs += cost + if self.log_handler: + self._log_event("research", step="cost_update", details={ + "cost": cost, + "total_cost": self.research_costs + }) diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index ebe47ae56..89c0de507 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -2,11 +2,13 @@ import random import json from typing import Dict, Optional +import logging from ..actions.utils import stream_output from ..actions.query_processing import plan_research_outline, get_search_results from ..document import DocumentLoader, LangChainDocumentLoader from ..utils.enum import ReportSource, ReportType, Tone +from ..utils.logging_config import get_json_handler, get_research_logger class ResearchConductor: @@ -14,8 +16,12 @@ class ResearchConductor: def __init__(self, researcher): self.researcher = researcher + self.logger = logging.getLogger('research') + self.json_handler 
= get_json_handler() async def plan_research(self, query): + self.logger.info(f"Planning research for query: {query}") + await stream_output( "logs", "planning_research", @@ -24,15 +30,16 @@ async def plan_research(self, query): ) search_results = await get_search_results(query, self.researcher.retrievers[0]) + self.logger.info(f"Initial search results obtained: {len(search_results)} results") await stream_output( "logs", "planning_research", - f"šŸ¤” Planning the research strategy and subtasks (this may take a minute)...", + f"šŸ¤” Planning the research strategy and subtasks...", self.researcher.websocket, ) - return await plan_research_outline( + outline = await plan_research_outline( query=query, search_results=search_results, agent_role_prompt=self.researcher.role, @@ -41,11 +48,16 @@ async def plan_research(self, query): report_type=self.researcher.report_type, cost_callback=self.researcher.add_costs, ) + self.logger.info(f"Research outline planned: {outline}") + return outline async def conduct_research(self): - """ - Runs the GPT Researcher to conduct research - """ + """Runs the GPT Researcher to conduct research""" + if self.json_handler: + self.json_handler.update_content("query", self.researcher.query) + + self.logger.info(f"Starting research for query: {self.researcher.query}") + # Reset visited_urls and source_urls at the start of each research task self.researcher.visited_urls.clear() research_data = [] @@ -63,21 +75,25 @@ async def conduct_research(self): # Research for relevant sources based on source types below if self.researcher.source_urls: - # If specified, the researcher will use the given urls as the context for the research. + self.logger.info("Using provided source URLs") research_data = await self._get_context_by_urls(self.researcher.source_urls) - if research_data and len(research_data) == 0 and self.verbose: - # Could not find any relevant resources in source_urls to answer the query or sub-query. Will answer using model's inherent knowledge + if research_data and len(research_data) == 0 and self.researcher.verbose: await stream_output( "logs", "answering_from_memory", f"šŸ§ I was unable to find relevant context in the provided sources...", - self.websocket, + self.researcher.websocket, ) - # If complement_source_urls parameter is set, more resources can be gathered to create additional context using default web search if self.researcher.complement_source_urls: + self.logger.info("Complementing with web search") additional_research = await self._get_context_by_web_search(self.researcher.query) research_data += ' '.join(additional_research) + elif self.researcher.report_source == ReportSource.Web.value: + self.logger.info("Using web search") + research_data = await self._get_context_by_web_search(self.researcher.query) + + # ... rest of the conditions ... 
elif self.researcher.report_source == ReportSource.Local.value: document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() if self.researcher.vector_store: @@ -106,13 +122,11 @@ async def conduct_research(self): elif self.researcher.report_source == ReportSource.LangChainVectorStore.value: research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter) - # Default web based research - elif self.researcher.report_source == ReportSource.Web.value: - research_data = await self._get_context_by_web_search(self.researcher.query) - # Rank and curate the sources based on the research data + # Rank and curate the sources self.researcher.context = research_data if self.researcher.cfg.curate_sources: + self.logger.info("Curating sources") self.researcher.context = await self.researcher.source_curator.curate_sources(research_data) if self.researcher.verbose: @@ -122,28 +136,34 @@ async def conduct_research(self): f"Finalized research step.\nšŸ’ø Total Research Costs: ${self.researcher.get_costs()}", self.researcher.websocket, ) + if self.json_handler: + self.json_handler.update_content("costs", self.researcher.get_costs()) + self.json_handler.update_content("context", self.researcher.context) + self.logger.info(f"Research completed. Context size: {len(str(self.researcher.context))}") return self.researcher.context async def _get_context_by_urls(self, urls): - """ - Scrapes and compresses the context from the given urls - """ + """Scrapes and compresses the context from the given urls""" + self.logger.info(f"Getting context from URLs: {urls}") + new_search_urls = await self._get_new_urls(urls) - if self.researcher.verbose: - await stream_output( - "logs", - "source_urls", - f"šŸ—‚ļø I will conduct my research based on the following urls: {new_search_urls}...", - self.researcher.websocket, - ) + self.logger.info(f"New URLs to process: {new_search_urls}") scraped_content = await self.researcher.scraper_manager.browse_urls(new_search_urls) + self.logger.info(f"Scraped content from {len(scraped_content)} URLs") if self.researcher.vector_store: + self.logger.info("Loading content into vector store") self.researcher.vector_store.load(scraped_content) - return await self.researcher.context_manager.get_similar_content_by_query(self.researcher.query, scraped_content) + context = await self.researcher.context_manager.get_similar_content_by_query( + self.researcher.query, scraped_content + ) + self.logger.info(f"Generated context length: {len(context)}") + return context + + # Add logging to other methods similarly... 
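The "# Add logging to other methods similarly..." note above is left open-ended. As a sketch only (not part of the patch), the same entry/exit logging could be factored into a small decorator and applied to the remaining ResearchConductor methods; all names below are hypothetical.

# Hypothetical helper illustrating the "add logging to other methods similarly" note.
import functools

def log_research_step(step_name: str):
    """Log entry, completion and failure of an async ResearchConductor method."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(self, *args, **kwargs):
            self.logger.info(f"{step_name}: started")
            try:
                result = await func(self, *args, **kwargs)
                self.logger.info(f"{step_name}: completed")
                return result
            except Exception as e:
                self.logger.error(f"{step_name}: failed: {e}", exc_info=True)
                raise
        return wrapper
    return decorator

# Usage (hypothetical):
# @log_research_step("vectorstore_context")
# async def _get_context_by_vectorstore(self, query, filter=None): ...
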
async def _get_context_by_vectorstore(self, query, filter: Optional[dict] = None): """ @@ -183,8 +203,12 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []): Returns: context: List of context """ + self.logger.info(f"Starting web search for query: {query}") + # Generate Sub-Queries including original query sub_queries = await self.plan_research(query) + self.logger.info(f"Generated sub-queries: {sub_queries}") + # If this is not part of a sub researcher, add original query to research for better results if self.researcher.report_type != "subtopic_report": sub_queries.append(query) @@ -200,24 +224,33 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []): ) # Using asyncio.gather to process the sub_queries asynchronously - context = await asyncio.gather( - *[ - self._process_sub_query(sub_query, scraped_data) - for sub_query in sub_queries - ] - ) - return context + try: + context = await asyncio.gather( + *[ + self._process_sub_query(sub_query, scraped_data) + for sub_query in sub_queries + ] + ) + self.logger.info(f"Gathered context from {len(context)} sub-queries") + # Filter out empty results and join the context + context = [c for c in context if c] + if context: + combined_context = " ".join(context) + self.logger.info(f"Combined context size: {len(combined_context)}") + return combined_context + return [] + except Exception as e: + self.logger.error(f"Error during web search: {e}", exc_info=True) + return [] async def _process_sub_query(self, sub_query: str, scraped_data: list = []): - """Takes in a sub query and scrapes urls based on it and gathers context. - - Args: - sub_query (str): The sub-query generated from the original query - scraped_data (list): Scraped data passed in - - Returns: - str: The context gathered from search - """ + """Takes in a sub query and scrapes urls based on it and gathers context.""" + if self.json_handler: + self.json_handler.log_event("sub_query", { + "query": sub_query, + "scraped_data_size": len(scraped_data) + }) + if self.researcher.verbose: await stream_output( "logs", @@ -226,23 +259,35 @@ async def _process_sub_query(self, sub_query: str, scraped_data: list = []): self.researcher.websocket, ) - if not scraped_data: - scraped_data = await self._scrape_data_by_urls(sub_query) + try: + if not scraped_data: + scraped_data = await self._scrape_data_by_urls(sub_query) + self.logger.info(f"Scraped data size: {len(scraped_data)}") - content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data) + content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data) + self.logger.info(f"Content found for sub-query: {len(str(content)) if content else 0} chars") - if content and self.researcher.verbose: - await stream_output( - "logs", "subquery_context_window", f"šŸ“ƒ {content}", self.researcher.websocket - ) - elif self.researcher.verbose: - await stream_output( - "logs", - "subquery_context_not_found", - f"šŸ¤· No content found for '{sub_query}'...", - self.researcher.websocket, - ) - return content + if content and self.researcher.verbose: + await stream_output( + "logs", "subquery_context_window", f"šŸ“ƒ {content}", self.researcher.websocket + ) + elif self.researcher.verbose: + await stream_output( + "logs", + "subquery_context_not_found", + f"šŸ¤· No content found for '{sub_query}'...", + self.researcher.websocket, + ) + if content: + if self.json_handler: + self.json_handler.log_event("content_found", { + "sub_query": sub_query, 
+ "content_size": len(content) + }) + return content + except Exception as e: + self.logger.error(f"Error processing sub-query {sub_query}: {e}", exc_info=True) + return "" async def _process_sub_query_with_vectorstore(self, sub_query: str, filter: Optional[dict] = None): """Takes in a sub query and gathers context from the user provided vector store diff --git a/gpt_researcher/utils/logging_config.py b/gpt_researcher/utils/logging_config.py new file mode 100644 index 000000000..ee0d855ed --- /dev/null +++ b/gpt_researcher/utils/logging_config.py @@ -0,0 +1,82 @@ +import logging +import json +import os +from datetime import datetime +from pathlib import Path + +class JSONResearchHandler: + def __init__(self, json_file): + self.json_file = json_file + self.research_data = { + "timestamp": datetime.now().isoformat(), + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + } + + def log_event(self, event_type: str, data: dict): + self.research_data["events"].append({ + "timestamp": datetime.now().isoformat(), + "type": event_type, + "data": data + }) + self._save_json() + + def update_content(self, key: str, value): + self.research_data["content"][key] = value + self._save_json() + + def _save_json(self): + with open(self.json_file, 'w') as f: + json.dump(self.research_data, f, indent=2) + +def setup_research_logging(): + # Create logs directory if it doesn't exist + logs_dir = Path("logs") + logs_dir.mkdir(exist_ok=True) + + # Generate timestamp for log files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Create log file paths + log_file = logs_dir / f"research_{timestamp}.log" + json_file = logs_dir / f"research_{timestamp}.json" + + # Configure file handler for research logs + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + + # Get research logger and configure it + research_logger = logging.getLogger('research') + research_logger.setLevel(logging.INFO) + + # Remove any existing handlers to avoid duplicates + research_logger.handlers.clear() + + # Add file handler + research_logger.addHandler(file_handler) + + # Add stream handler for console output + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + research_logger.addHandler(console_handler) + + # Prevent propagation to root logger to avoid duplicate logs + research_logger.propagate = False + + # Create JSON handler + json_handler = JSONResearchHandler(json_file) + + return str(log_file), str(json_file), research_logger, json_handler + +def get_research_logger(): + return logging.getLogger('research') + +def get_json_handler(): + return getattr(logging.getLogger('research'), 'json_handler', None) diff --git a/main.py b/main.py index 0f48c2cba..10057a495 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,25 @@ from dotenv import load_dotenv +import logging +from pathlib import Path + +# Create logs directory if it doesn't exist +logs_dir = Path("logs") +logs_dir.mkdir(exist_ok=True) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + # File handler for general application logs + logging.FileHandler('logs/app.log'), + # Stream handler for console output + logging.StreamHandler() + ] +) + +# Create logger instance +logger = 
logging.getLogger(__name__) load_dotenv() @@ -6,5 +27,6 @@ if __name__ == "__main__": import uvicorn - - uvicorn.run(app, host="0.0.0.0", port=8000) + + logger.info("Starting server...") + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d2db4d9d7..cab6c1c77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,4 +45,11 @@ websockets = "^13.1" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +asyncio_mode = "strict" +addopts = "-v" +testpaths = ["tests"] +python_files = "test_*.py" +asyncio_fixture_loop_scope = "function" \ No newline at end of file diff --git a/tests/gptr-logs-handler.py b/tests/gptr-logs-handler.py index fb05694ce..0bbec93a4 100644 --- a/tests/gptr-logs-handler.py +++ b/tests/gptr-logs-handler.py @@ -2,25 +2,7 @@ from typing import List, Dict, Any import asyncio from gpt_researcher import GPTResearcher - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - - def clear_logs(self) -> None: - """Clear the logs.""" - self.logs.clear() # Clear the logs list - logging.info("Logs cleared.") # Log the clearing action +from src.logs_handler import CustomLogsHandler # Update import async def run() -> None: """Run the research process and generate a report.""" @@ -30,7 +12,7 @@ async def run() -> None: tone = "informative" config_path = None - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter researcher = GPTResearcher( query=query, diff --git a/tests/report-types.py b/tests/report-types.py index 073f8336e..e09fec100 100644 --- a/tests/report-types.py +++ b/tests/report-types.py @@ -2,23 +2,9 @@ import asyncio import pytest from gpt_researcher.agent import GPTResearcher -import logging +from src.logs_handler import CustomLogsHandler # Update import from typing import List, Dict, Any -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - # Define the report types to test report_types = [ "research_report", @@ -39,7 +25,7 @@ async def test_gpt_researcher(report_type): if not os.path.exists(output_dir): os.makedirs(output_dir) - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Create an instance of GPTResearcher researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler) diff --git 
a/tests/research_test.py b/tests/research_test.py index b58d5b92a..56077f8fd 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -18,23 +18,10 @@ import asyncio import logging from typing import List, Dict, Any - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors +from src.logs_handler import CustomLogsHandler # Update import async def get_report(query: str, report_type: str, sources: list) -> str: - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter researcher = GPTResearcher(query=query, report_type=report_type, complement_source_urls=False, diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 000000000..c6ff963b7 --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,61 @@ +import pytest +from unittest.mock import AsyncMock +from fastapi import WebSocket +from src.logs_handler import CustomLogsHandler +import os +import json + +@pytest.mark.asyncio +async def test_custom_logs_handler(): + # Mock websocket + mock_websocket = AsyncMock() + mock_websocket.send_json = AsyncMock() + + # Test initialization + handler = CustomLogsHandler(mock_websocket, "test_query") + + # Verify log file creation + assert os.path.exists(handler.log_file) + + # Test sending log data + test_data = { + "type": "logs", + "message": "Test log message" + } + + await handler.send_json(test_data) + + # Verify websocket was called with correct data + mock_websocket.send_json.assert_called_once_with(test_data) + + # Verify log file contents + with open(handler.log_file, 'r') as f: + log_data = json.load(f) + assert len(log_data['events']) == 1 + assert log_data['events'][0]['data'] == test_data + +@pytest.mark.asyncio +async def test_content_update(): + """Test handling of non-log type data that updates content""" + mock_websocket = AsyncMock() + mock_websocket.send_json = AsyncMock() + + handler = CustomLogsHandler(mock_websocket, "test_query") + + # Test content update + content_data = { + "query": "test query", + "sources": ["source1", "source2"], + "report": "test report" + } + + await handler.send_json(content_data) + + mock_websocket.send_json.assert_called_once_with(content_data) + + # Verify log file contents + with open(handler.log_file, 'r') as f: + log_data = json.load(f) + assert log_data['content']['query'] == "test query" + assert log_data['content']['sources'] == ["source1", "source2"] + assert log_data['content']['report'] == "test report" \ No newline at end of file diff --git a/tests/test_logs.py b/tests/test_logs.py new file mode 100644 index 000000000..0f2353959 --- /dev/null +++ b/tests/test_logs.py @@ -0,0 +1,48 @@ +import os +from pathlib import Path +import sys + +# Add the project root to Python path +project_root = Path(__file__).parent.parent +sys.path.append(str(project_root)) + +from src.logs_handler import CustomLogsHandler + +def test_logs_creation(): + # Print current working directory + print(f"Current working directory: {os.getcwd()}") + + # Print project 
root + print(f"Project root: {project_root}") + + # Try to create logs directory directly + logs_dir = project_root / "logs" + print(f"Attempting to create logs directory at: {logs_dir}") + + try: + # Create directory with full permissions + os.makedirs(logs_dir, mode=0o777, exist_ok=True) + print(f"āœ“ Created directory: {logs_dir}") + + # Test file creation + test_file = logs_dir / "test.txt" + with open(test_file, 'w') as f: + f.write("Test log entry") + print(f"āœ“ Created test file: {test_file}") + + # Initialize the handler + handler = CustomLogsHandler() + print("āœ“ CustomLogsHandler initialized") + + # Test JSON logging + handler.logs.append({"test": "message"}) + print("āœ“ Added test log entry") + + except Exception as e: + print(f"āŒ Error: {str(e)}") + print(f"Error type: {type(e)}") + import traceback + print(f"Traceback: {traceback.format_exc()}") + +if __name__ == "__main__": + test_logs_creation() \ No newline at end of file diff --git a/tests/test_researcher_logging.py b/tests/test_researcher_logging.py new file mode 100644 index 000000000..ebf6e7a94 --- /dev/null +++ b/tests/test_researcher_logging.py @@ -0,0 +1,71 @@ +import pytest +import asyncio +from pathlib import Path +import sys +import logging + +# Add the project root to Python path +project_root = Path(__file__).parent.parent +sys.path.append(str(project_root)) + +# Configure basic logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +@pytest.mark.asyncio +async def test_researcher_logging(): # Renamed function to be more specific + """ + Test suite for verifying the researcher's logging infrastructure. + Ensures proper creation and formatting of log files. + """ + try: + # Import here to catch any import errors + from src.researcher import Researcher + logger.info("Successfully imported Researcher class") + + # Create a researcher instance with a logging-focused query + researcher = Researcher( + query="Test query for logging verification", + report_type="research_report" + ) + logger.info("Created Researcher instance") + + # Run the research + report = await researcher.research() + logger.info("Research completed successfully!") + logger.info(f"Report length: {len(report)}") + + # Basic report assertions + assert report is not None + assert len(report) > 0 + + # Detailed log file verification + logs_dir = Path(project_root) / "logs" + log_files = list(logs_dir.glob("research_*.log")) + json_files = list(logs_dir.glob("research_*.json")) + + # Verify log files exist + assert len(log_files) > 0, "No log files were created" + assert len(json_files) > 0, "No JSON files were created" + + # Log the findings + logger.info(f"\nFound {len(log_files)} log files:") + for log_file in log_files: + logger.info(f"- {log_file.name}") + # Could add additional checks for log file format/content here + + logger.info(f"\nFound {len(json_files)} JSON files:") + for json_file in json_files: + logger.info(f"- {json_file.name}") + # Could add additional checks for JSON file structure here + + except ImportError as e: + logger.error(f"Import error: {e}") + logger.error("Make sure gpt_researcher is installed and in your PYTHONPATH") + raise + except Exception as e: + logger.error(f"Error during research: {e}") + raise + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file
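The updated tests import CustomLogsHandler from src.logs_handler (and Researcher from src.researcher), but neither module appears in this diff. For orientation only, here is a minimal sketch of what src/logs_handler.py might contain, assuming it mirrors the CustomLogsHandler added to backend/server/server_utils.py; the permissive constructor is an assumption made to satisfy the different call styles used across the tests (positional websocket/task, query=..., and no arguments).

# src/logs_handler.py -- hypothetical sketch, not part of this diff.
import json
import os
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

class CustomLogsHandler:
    """Minimal stand-in mirroring backend/server/server_utils.py:CustomLogsHandler."""

    def __init__(self, websocket=None, task: str = "", query: Optional[str] = None):
        self.websocket = websocket
        self.task = task or query or "untitled"
        self.logs: List[Dict[str, Any]] = []
        os.makedirs("outputs", exist_ok=True)
        self.log_file = os.path.join("outputs", f"task_{int(time.time())}_{self.task}.json")
        # Initialize the JSON log with the same structure the server-side handler uses
        with open(self.log_file, "w") as f:
            json.dump({"timestamp": datetime.now().isoformat(),
                       "events": [],
                       "content": {"query": "", "sources": [], "context": [],
                                   "report": "", "costs": 0.0}}, f, indent=2)

    async def send_json(self, data: Dict[str, Any]) -> None:
        """Record the event locally, forward it to the websocket if any, and persist it."""
        self.logs.append(data)
        if self.websocket:
            await self.websocket.send_json(data)
        with open(self.log_file, "r") as f:
            log_data = json.load(f)
        if data.get("type") == "logs":
            log_data["events"].append({"timestamp": datetime.now().isoformat(),
                                       "type": "event", "data": data})
        else:
            log_data["content"].update(data)
        with open(self.log_file, "w") as f:
            json.dump(log_data, f, indent=2)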