From 60443c8ddf229f8554b00253949f8f1856ca3a4b Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:11:48 -0800 Subject: [PATCH 01/11] feat: Add structured logging system - Add CustomLogsHandler for unified logging - Implement JSON and text file logging - Add comprehensive test coverage for logging functionality - Update pytest configuration for async tests - Update gitignore patterns for log files --- .gitignore | 21 +- backend/server/app.py | 16 ++ backend/server/logging_config.py | 83 ++++++++ backend/server/server.py | 42 ++++ backend/server/server_utils.py | 114 ++++++++++- frontend/index.html | 12 +- .../ResearchBlocks/AccessReport.tsx | 47 +++-- frontend/scripts.js | 30 ++- gpt_researcher/agent.py | 71 ++++++- gpt_researcher/skills/researcher.py | 181 ++++++++++-------- gpt_researcher/utils/logging_config.py | 82 ++++++++ main.py | 26 ++- pyproject.toml | 9 +- src/logs_handler.py | 90 +++++++++ src/researcher.py | 94 +++++++++ tests/gptr-logs-handler.py | 22 +-- tests/report-types.py | 18 +- tests/research_test.py | 17 +- tests/test_logging.py | 61 ++++++ tests/test_logs.py | 48 +++++ tests/test_researcher.py | 63 ++++++ 21 files changed, 970 insertions(+), 177 deletions(-) create mode 100644 backend/server/app.py create mode 100644 backend/server/logging_config.py create mode 100644 gpt_researcher/utils/logging_config.py create mode 100644 src/logs_handler.py create mode 100644 src/researcher.py create mode 100644 tests/test_logging.py create mode 100644 tests/test_logs.py create mode 100644 tests/test_researcher.py diff --git a/.gitignore b/.gitignore index 203892d2a..428cdd82d 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,23 @@ docs/build package-lock.json #Vim swp files -*.swp \ No newline at end of file +*.swp + +# Log files +logs/ +*.log +*_log.txt +heroku_logs.txt +memory_profiling_log.txt +server_log.txt + +# Add to existing .gitignore +*.json +*.pdf +*.md +!README.md +!CONTRIBUTING.md +!CODE_OF_CONDUCT.md + +# Backup directories +data_backup/ diff --git a/backend/server/app.py b/backend/server/app.py new file mode 100644 index 000000000..ee886367b --- /dev/null +++ b/backend/server/app.py @@ -0,0 +1,16 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +import logging + +logger = logging.getLogger(__name__) + +app = FastAPI() + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # In production, replace with your frontend domain + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) \ No newline at end of file diff --git a/backend/server/logging_config.py b/backend/server/logging_config.py new file mode 100644 index 000000000..ad88044d2 --- /dev/null +++ b/backend/server/logging_config.py @@ -0,0 +1,83 @@ +import logging +import json +import os +from datetime import datetime +from pathlib import Path + +class JSONResearchHandler: + def __init__(self, json_file): + self.json_file = json_file + self.research_data = { + "timestamp": datetime.now().isoformat(), + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + } + + def log_event(self, event_type: str, data: dict): + self.research_data["events"].append({ + "timestamp": datetime.now().isoformat(), + "type": event_type, + "data": data + }) + self._save_json() + + def update_content(self, key: str, value): + self.research_data["content"][key] = value + self._save_json() + + def _save_json(self): + with 
open(self.json_file, 'w') as f: + json.dump(self.research_data, f, indent=2) + +def setup_research_logging(): + # Create logs directory if it doesn't exist + logs_dir = Path("logs") + logs_dir.mkdir(exist_ok=True) + + # Generate timestamp for log files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Create log file paths + log_file = logs_dir / f"research_{timestamp}.log" + json_file = logs_dir / f"research_{timestamp}.json" + + # Configure file handler for research logs + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + + # Get research logger and configure it + research_logger = logging.getLogger('research') + research_logger.setLevel(logging.INFO) + + # Remove any existing handlers to avoid duplicates + research_logger.handlers.clear() + + # Add file handler + research_logger.addHandler(file_handler) + + # Add stream handler for console output + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + research_logger.addHandler(console_handler) + + # Prevent propagation to root logger to avoid duplicate logs + research_logger.propagate = False + + # Create JSON handler + json_handler = JSONResearchHandler(json_file) + + return str(log_file), str(json_file), research_logger, json_handler + +# Create a function to get the logger and JSON handler +def get_research_logger(): + return logging.getLogger('research') + +def get_json_handler(): + return getattr(logging.getLogger('research'), 'json_handler', None) \ No newline at end of file diff --git a/backend/server/server.py b/backend/server/server.py index 939a2c419..9dcb1b968 100644 --- a/backend/server/server.py +++ b/backend/server/server.py @@ -15,6 +15,26 @@ execute_multi_agents, handle_websocket_communication ) +from gpt_researcher.utils.logging_config import setup_research_logging + +import logging + +# Get logger instance +logger = logging.getLogger(__name__) + +# Don't override parent logger settings +logger.propagate = True + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[ + logging.FileHandler("server_log.txt"), # Log to file + logging.StreamHandler() # Also print to console + ] +) + + # Models @@ -68,11 +88,25 @@ class ConfigRequest(BaseModel): # Startup event +from psutil import Process +import logging + @app.on_event("startup") def startup_event(): os.makedirs("outputs", exist_ok=True) app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") os.makedirs(DOC_PATH, exist_ok=True) + + # Setup research logging + log_file, json_file, research_logger, json_handler = setup_research_logging() # Unpack all 4 values + research_logger.json_handler = json_handler # Store the JSON handler on the logger + research_logger.info(f"Research log file: {log_file}") + research_logger.info(f"Research JSON file: {json_file}") + + # Log memory usage + process = Process() + mem_info = process.memory_info() + research_logger.info(f"Memory usage at startup: {mem_info.rss / 1024 ** 2:.2f} MB") # Routes @@ -89,7 +123,10 @@ async def list_files(): return {"files": files} +from memory_profiler import profile + @app.post("/api/multi_agents") +@profile async def run_multi_agents(): return await execute_multi_agents(manager) @@ -104,10 +141,15 @@ async def delete_file(filename: str): return await handle_file_deletion(filename, DOC_PATH) +from 
psutil import Process + @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): + process = Process() await manager.connect(websocket) try: + mem_info = process.memory_info() + print(f"Memory usage during WebSocket connection: {mem_info.rss / 1024 ** 2:.2f} MB") await handle_websocket_communication(websocket, manager) except WebSocketDisconnect: await manager.disconnect(websocket) diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py index 77bc8aba3..1c472be6f 100644 --- a/backend/server/server_utils.py +++ b/backend/server/server_utils.py @@ -4,11 +4,99 @@ import time import shutil from typing import Dict, List, Any -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, FileResponse from gpt_researcher.document.document import DocumentLoader -# Add this import from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md - +from pathlib import Path +from datetime import datetime +from fastapi import HTTPException +import logging + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +class CustomLogsHandler: + """Custom handler to capture streaming logs from the research process""" + def __init__(self, websocket, task: str): + self.logs = [] + self.websocket = websocket + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}") + self.log_file = os.path.join("outputs", f"{sanitized_filename}.json") + self.timestamp = datetime.now().isoformat() + # Initialize log file with metadata + os.makedirs("outputs", exist_ok=True) + with open(self.log_file, 'w') as f: + json.dump({ + "timestamp": self.timestamp, + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + }, f, indent=2) + + async def send_json(self, data: Dict[str, Any]) -> None: + """Store log data and send to websocket""" + # Send to websocket for real-time display + if self.websocket: + await self.websocket.send_json(data) + + # Read current log file + with open(self.log_file, 'r') as f: + log_data = json.load(f) + + # Update appropriate section based on data type + if data.get('type') == 'logs': + log_data['events'].append({ + "timestamp": datetime.now().isoformat(), + "type": "event", + "data": data + }) + else: + # Update content section for other types of data + log_data['content'].update(data) + + # Save updated log file + with open(self.log_file, 'w') as f: + json.dump(log_data, f, indent=2) + logger.debug(f"Log entry written to: {self.log_file}") + + +class Researcher: + def __init__(self, query: str, report_type: str = "research_report"): + self.query = query + self.report_type = report_type + # Generate unique ID for this research task + self.research_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{hash(query)}" + # Initialize logs handler with research ID + self.logs_handler = CustomLogsHandler(self.research_id) + self.researcher = GPTResearcher( + query=query, + report_type=report_type, + websocket=self.logs_handler + ) + + async def research(self) -> dict: + """Conduct research and return paths to generated files""" + await self.researcher.conduct_research() + report = await self.researcher.write_report() + + # Generate the files + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}") + file_paths = await generate_report_files(report, sanitized_filename) + + # Get the JSON log path that was created by CustomLogsHandler + json_relative_path = os.path.relpath(self.logs_handler.log_file) + + return { + 
"output": { + **file_paths, # Include PDF, DOCX, and MD paths + "json": json_relative_path + } + } def sanitize_filename(filename: str) -> str: return re.sub(r"[^\w\s-]", "", filename).strip() @@ -23,13 +111,31 @@ async def handle_start_command(websocket, data: str, manager): print("Error: Missing task or report_type") return + # Create logs handler with websocket and task + logs_handler = CustomLogsHandler(websocket, task) + # Initialize log content with query + await logs_handler.send_json({ + "query": task, + "sources": [], + "context": [], + "report": "" + }) + sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}") report = await manager.start_streaming( - task, report_type, report_source, source_urls, tone, websocket, headers + task, + report_type, + report_source, + source_urls, + tone, + logs_handler, + headers ) report = str(report) file_paths = await generate_report_files(report, sanitized_filename) + # Add JSON log path to file_paths + file_paths["json"] = os.path.relpath(logs_handler.log_file) await send_file_paths(websocket, file_paths) diff --git a/frontend/index.html b/frontend/index.html index 279381f62..f55c5dc6c 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -143,13 +143,11 @@

Research Report

- Copy to clipboard (markdown)
- Download as Markdown
- Download as PDF
- Download as Docx
+ Copy to clipboard (markdown)
+ Download as Markdown
+ Download as PDF
+ Download as Docx
+ Download Log
diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx index 080e5c91c..21996cce3 100644 --- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx +++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx @@ -1,44 +1,53 @@ +import React from 'react'; import {getHost} from '../../helpers/getHost' interface AccessReportProps { - accessData: any; - report: any; + accessData: { + pdf?: string; + docx?: string; + json?: string; + }; + report: string; } const AccessReport: React.FC = ({ accessData, report }) => { const host = getHost(); - - const copyToClipboard = () => { - if (navigator.clipboard) { - navigator.clipboard.writeText(report).catch(err => { - console.error('Failed to copy: ', err); - }); - } else { - console.warn('Clipboard API is not available'); + const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => { + if (!accessData[dataType]) { + console.warn(`No ${dataType} path provided`); + return '#'; } - }; - - const getReportLink = (dataType:string) => { - return `${host}/${accessData[dataType]}`; + // Remove any leading slashes to prevent double slashes in URL + const path = accessData[dataType]?.replace(/^\//, ''); + return `${host}/${path}`; }; return (
- + target="_blank" + rel="noopener noreferrer"> View as PDF - + target="_blank" + rel="noopener noreferrer"> Download DocX + + Download Logs +
); -} +}; export default AccessReport; \ No newline at end of file diff --git a/frontend/scripts.js b/frontend/scripts.js index 1e1e90a4c..abcefb890 100644 --- a/frontend/scripts.js +++ b/frontend/scripts.js @@ -104,12 +104,30 @@ const GPTResearcher = (() => { } const updateDownloadLink = (data) => { - const pdf_path = data.output.pdf - const docx_path = data.output.docx - const md_path = data.output.md; - document.getElementById('downloadLink').setAttribute('href', pdf_path); - document.getElementById('downloadLinkWord').setAttribute('href', docx_path); - document.getElementById("downloadLinkMd").setAttribute("href", md_path); + if (!data.output) { + console.error('No output data received'); + return; + } + + const { pdf, docx, md, json } = data.output; + console.log('Received paths:', { pdf, docx, md, json }); + + // Helper function to safely update link + const updateLink = (id, path) => { + const element = document.getElementById(id); + if (element && path) { + console.log(`Setting ${id} href to:`, path); + element.setAttribute('href', path); + element.classList.remove('disabled'); + } else { + console.warn(`Either element ${id} not found or path not provided`); + } + }; + + updateLink('downloadLink', pdf); + updateLink('downloadLinkWord', docx); + updateLink('downloadLinkMd', md); + updateLink('downloadLinkJson', json); } const updateScroll = () => { diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py index 3ebcd2347..75dba2531 100644 --- a/gpt_researcher/agent.py +++ b/gpt_researcher/agent.py @@ -48,6 +48,7 @@ def __init__( context=[], headers: dict = None, max_subtopics: int = 5, + log_handler=None, ): self.query = query self.report_type = report_type @@ -79,6 +80,7 @@ def __init__( self.memory = Memory( self.cfg.embedding_provider, self.cfg.embedding_model, **self.cfg.embedding_kwargs ) + self.log_handler = log_handler # Initialize components self.research_conductor: ResearchConductor = ResearchConductor(self) @@ -87,8 +89,36 @@ def __init__( self.scraper_manager: BrowserManager = BrowserManager(self) self.source_curator: SourceCurator = SourceCurator(self) + async def _log_event(self, event_type: str, **kwargs): + """Helper method to handle logging events""" + if self.log_handler: + try: + if event_type == "tool": + await self.log_handler.on_tool_start(kwargs.get('tool_name', ''), **kwargs) + elif event_type == "action": + await self.log_handler.on_agent_action(kwargs.get('action', ''), **kwargs) + elif event_type == "research": + await self.log_handler.on_research_step(kwargs.get('step', ''), kwargs.get('details', {})) + + # Add direct logging as backup + import logging + research_logger = logging.getLogger('research') + research_logger.info(f"{event_type}: {json.dumps(kwargs, default=str)}") + + except Exception as e: + import logging + logging.getLogger('research').error(f"Error in _log_event: {e}", exc_info=True) + async def conduct_research(self): + await self._log_event("research", step="start", details={ + "query": self.query, + "report_type": self.report_type, + "agent": self.agent, + "role": self.role + }) + if not (self.agent and self.role): + await self._log_event("action", action="choose_agent") self.agent, self.role = await choose_agent( query=self.query, cfg=self.cfg, @@ -96,22 +126,50 @@ async def conduct_research(self): cost_callback=self.add_costs, headers=self.headers, ) - + await self._log_event("action", action="agent_selected", details={ + "agent": self.agent, + "role": self.role + }) + + await self._log_event("research", step="conducting_research", 
details={ + "agent": self.agent, + "role": self.role + }) self.context = await self.research_conductor.conduct_research() + + await self._log_event("research", step="research_completed", details={ + "context_length": len(self.context) + }) return self.context async def write_report(self, existing_headers: list = [], relevant_written_contents: list = [], ext_context=None) -> str: - return await self.report_generator.write_report( + await self._log_event("research", step="writing_report", details={ + "existing_headers": existing_headers, + "context_source": "external" if ext_context else "internal" + }) + + report = await self.report_generator.write_report( existing_headers, relevant_written_contents, ext_context or self.context ) + + await self._log_event("research", step="report_completed", details={ + "report_length": len(report) + }) + return report async def write_report_conclusion(self, report_body: str) -> str: - return await self.report_generator.write_report_conclusion(report_body) + await self._log_event("research", step="writing_conclusion") + conclusion = await self.report_generator.write_report_conclusion(report_body) + await self._log_event("research", step="conclusion_completed") + return conclusion async def write_introduction(self): - return await self.report_generator.write_introduction() + await self._log_event("research", step="writing_introduction") + intro = await self.report_generator.write_introduction() + await self._log_event("research", step="introduction_completed") + return intro async def get_subtopics(self): return await self.report_generator.get_subtopics() @@ -174,3 +232,8 @@ def add_costs(self, cost: float) -> None: if not isinstance(cost, (float, int)): raise ValueError("Cost must be an integer or float") self.research_costs += cost + if self.log_handler: + self._log_event("research", step="cost_update", details={ + "cost": cost, + "total_cost": self.research_costs + }) diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index ebe47ae56..5235bfd98 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -2,11 +2,13 @@ import random import json from typing import Dict, Optional +import logging from ..actions.utils import stream_output from ..actions.query_processing import plan_research_outline, get_search_results from ..document import DocumentLoader, LangChainDocumentLoader from ..utils.enum import ReportSource, ReportType, Tone +from ..utils.logging_config import get_json_handler, get_research_logger class ResearchConductor: @@ -14,8 +16,12 @@ class ResearchConductor: def __init__(self, researcher): self.researcher = researcher + self.logger = logging.getLogger('research') + self.json_handler = get_json_handler() async def plan_research(self, query): + self.logger.info(f"Planning research for query: {query}") + await stream_output( "logs", "planning_research", @@ -24,15 +30,16 @@ async def plan_research(self, query): ) search_results = await get_search_results(query, self.researcher.retrievers[0]) + self.logger.info(f"Initial search results obtained: {len(search_results)} results") await stream_output( "logs", "planning_research", - f"šŸ¤” Planning the research strategy and subtasks (this may take a minute)...", + f"šŸ¤” Planning the research strategy and subtasks...", self.researcher.websocket, ) - return await plan_research_outline( + outline = await plan_research_outline( query=query, search_results=search_results, agent_role_prompt=self.researcher.role, @@ -41,11 +48,16 @@ async 
def plan_research(self, query): report_type=self.researcher.report_type, cost_callback=self.researcher.add_costs, ) + self.logger.info(f"Research outline planned: {outline}") + return outline async def conduct_research(self): - """ - Runs the GPT Researcher to conduct research - """ + """Runs the GPT Researcher to conduct research""" + if self.json_handler: + self.json_handler.update_content("query", self.researcher.query) + + self.logger.info(f"Starting research for query: {self.researcher.query}") + # Reset visited_urls and source_urls at the start of each research task self.researcher.visited_urls.clear() research_data = [] @@ -63,56 +75,30 @@ async def conduct_research(self): # Research for relevant sources based on source types below if self.researcher.source_urls: - # If specified, the researcher will use the given urls as the context for the research. + self.logger.info("Using provided source URLs") research_data = await self._get_context_by_urls(self.researcher.source_urls) - if research_data and len(research_data) == 0 and self.verbose: - # Could not find any relevant resources in source_urls to answer the query or sub-query. Will answer using model's inherent knowledge + if research_data and len(research_data) == 0 and self.researcher.verbose: await stream_output( "logs", "answering_from_memory", f"šŸ§ I was unable to find relevant context in the provided sources...", - self.websocket, + self.researcher.websocket, ) - # If complement_source_urls parameter is set, more resources can be gathered to create additional context using default web search if self.researcher.complement_source_urls: + self.logger.info("Complementing with web search") additional_research = await self._get_context_by_web_search(self.researcher.query) research_data += ' '.join(additional_research) - elif self.researcher.report_source == ReportSource.Local.value: - document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() - if self.researcher.vector_store: - self.researcher.vector_store.load(document_data) - - research_data = await self._get_context_by_web_search(self.researcher.query, document_data) - - # Hybrid search including both local documents and web sources - elif self.researcher.report_source == ReportSource.Hybrid.value: - document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() - if self.researcher.vector_store: - self.researcher.vector_store.load(document_data) - docs_context = await self._get_context_by_web_search(self.researcher.query, document_data) - web_context = await self._get_context_by_web_search(self.researcher.query) - research_data = f"Context from local documents: {docs_context}\n\nContext from web sources: {web_context}" - - elif self.researcher.report_source == ReportSource.LangChainDocuments.value: - langchain_documents_data = await LangChainDocumentLoader( - self.researcher.documents - ).load() - if self.researcher.vector_store: - self.researcher.vector_store.load(langchain_documents_data) - research_data = await self._get_context_by_web_search( - self.researcher.query, langchain_documents_data - ) - - elif self.researcher.report_source == ReportSource.LangChainVectorStore.value: - research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter) - # Default web based research elif self.researcher.report_source == ReportSource.Web.value: + self.logger.info("Using web search") research_data = await self._get_context_by_web_search(self.researcher.query) - # Rank and curate the sources based on the 
research data + # ... rest of the conditions ... + + # Rank and curate the sources self.researcher.context = research_data if self.researcher.cfg.curate_sources: + self.logger.info("Curating sources") self.researcher.context = await self.researcher.source_curator.curate_sources(research_data) if self.researcher.verbose: @@ -122,28 +108,34 @@ async def conduct_research(self): f"Finalized research step.\nšŸ’ø Total Research Costs: ${self.researcher.get_costs()}", self.researcher.websocket, ) + if self.json_handler: + self.json_handler.update_content("costs", self.researcher.get_costs()) + self.json_handler.update_content("context", self.researcher.context) + self.logger.info(f"Research completed. Context size: {len(str(self.researcher.context))}") return self.researcher.context async def _get_context_by_urls(self, urls): - """ - Scrapes and compresses the context from the given urls - """ + """Scrapes and compresses the context from the given urls""" + self.logger.info(f"Getting context from URLs: {urls}") + new_search_urls = await self._get_new_urls(urls) - if self.researcher.verbose: - await stream_output( - "logs", - "source_urls", - f"šŸ—‚ļø I will conduct my research based on the following urls: {new_search_urls}...", - self.researcher.websocket, - ) + self.logger.info(f"New URLs to process: {new_search_urls}") scraped_content = await self.researcher.scraper_manager.browse_urls(new_search_urls) + self.logger.info(f"Scraped content from {len(scraped_content)} URLs") if self.researcher.vector_store: + self.logger.info("Loading content into vector store") self.researcher.vector_store.load(scraped_content) - return await self.researcher.context_manager.get_similar_content_by_query(self.researcher.query, scraped_content) + context = await self.researcher.context_manager.get_similar_content_by_query( + self.researcher.query, scraped_content + ) + self.logger.info(f"Generated context length: {len(context)}") + return context + + # Add logging to other methods similarly... 
async def _get_context_by_vectorstore(self, query, filter: Optional[dict] = None): """ @@ -183,8 +175,12 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []): Returns: context: List of context """ + self.logger.info(f"Starting web search for query: {query}") + # Generate Sub-Queries including original query sub_queries = await self.plan_research(query) + self.logger.info(f"Generated sub-queries: {sub_queries}") + # If this is not part of a sub researcher, add original query to research for better results if self.researcher.report_type != "subtopic_report": sub_queries.append(query) @@ -200,24 +196,33 @@ async def _get_context_by_web_search(self, query, scraped_data: list = []): ) # Using asyncio.gather to process the sub_queries asynchronously - context = await asyncio.gather( - *[ - self._process_sub_query(sub_query, scraped_data) - for sub_query in sub_queries - ] - ) - return context + try: + context = await asyncio.gather( + *[ + self._process_sub_query(sub_query, scraped_data) + for sub_query in sub_queries + ] + ) + self.logger.info(f"Gathered context from {len(context)} sub-queries") + # Filter out empty results and join the context + context = [c for c in context if c] + if context: + combined_context = " ".join(context) + self.logger.info(f"Combined context size: {len(combined_context)}") + return combined_context + return [] + except Exception as e: + self.logger.error(f"Error during web search: {e}", exc_info=True) + return [] async def _process_sub_query(self, sub_query: str, scraped_data: list = []): - """Takes in a sub query and scrapes urls based on it and gathers context. - - Args: - sub_query (str): The sub-query generated from the original query - scraped_data (list): Scraped data passed in - - Returns: - str: The context gathered from search - """ + """Takes in a sub query and scrapes urls based on it and gathers context.""" + if self.json_handler: + self.json_handler.log_event("sub_query", { + "query": sub_query, + "scraped_data_size": len(scraped_data) + }) + if self.researcher.verbose: await stream_output( "logs", @@ -226,23 +231,35 @@ async def _process_sub_query(self, sub_query: str, scraped_data: list = []): self.researcher.websocket, ) - if not scraped_data: - scraped_data = await self._scrape_data_by_urls(sub_query) + try: + if not scraped_data: + scraped_data = await self._scrape_data_by_urls(sub_query) + self.logger.info(f"Scraped data size: {len(scraped_data)}") - content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data) + content = await self.researcher.context_manager.get_similar_content_by_query(sub_query, scraped_data) + self.logger.info(f"Content found for sub-query: {len(str(content)) if content else 0} chars") - if content and self.researcher.verbose: - await stream_output( - "logs", "subquery_context_window", f"šŸ“ƒ {content}", self.researcher.websocket - ) - elif self.researcher.verbose: - await stream_output( - "logs", - "subquery_context_not_found", - f"šŸ¤· No content found for '{sub_query}'...", - self.researcher.websocket, - ) - return content + if content and self.researcher.verbose: + await stream_output( + "logs", "subquery_context_window", f"šŸ“ƒ {content}", self.researcher.websocket + ) + elif self.researcher.verbose: + await stream_output( + "logs", + "subquery_context_not_found", + f"šŸ¤· No content found for '{sub_query}'...", + self.researcher.websocket, + ) + if content: + if self.json_handler: + self.json_handler.log_event("content_found", { + "sub_query": sub_query, 
+ "content_size": len(content) + }) + return content + except Exception as e: + self.logger.error(f"Error processing sub-query {sub_query}: {e}", exc_info=True) + return "" async def _process_sub_query_with_vectorstore(self, sub_query: str, filter: Optional[dict] = None): """Takes in a sub query and gathers context from the user provided vector store diff --git a/gpt_researcher/utils/logging_config.py b/gpt_researcher/utils/logging_config.py new file mode 100644 index 000000000..ee0d855ed --- /dev/null +++ b/gpt_researcher/utils/logging_config.py @@ -0,0 +1,82 @@ +import logging +import json +import os +from datetime import datetime +from pathlib import Path + +class JSONResearchHandler: + def __init__(self, json_file): + self.json_file = json_file + self.research_data = { + "timestamp": datetime.now().isoformat(), + "events": [], + "content": { + "query": "", + "sources": [], + "context": [], + "report": "", + "costs": 0.0 + } + } + + def log_event(self, event_type: str, data: dict): + self.research_data["events"].append({ + "timestamp": datetime.now().isoformat(), + "type": event_type, + "data": data + }) + self._save_json() + + def update_content(self, key: str, value): + self.research_data["content"][key] = value + self._save_json() + + def _save_json(self): + with open(self.json_file, 'w') as f: + json.dump(self.research_data, f, indent=2) + +def setup_research_logging(): + # Create logs directory if it doesn't exist + logs_dir = Path("logs") + logs_dir.mkdir(exist_ok=True) + + # Generate timestamp for log files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Create log file paths + log_file = logs_dir / f"research_{timestamp}.log" + json_file = logs_dir / f"research_{timestamp}.json" + + # Configure file handler for research logs + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + + # Get research logger and configure it + research_logger = logging.getLogger('research') + research_logger.setLevel(logging.INFO) + + # Remove any existing handlers to avoid duplicates + research_logger.handlers.clear() + + # Add file handler + research_logger.addHandler(file_handler) + + # Add stream handler for console output + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + research_logger.addHandler(console_handler) + + # Prevent propagation to root logger to avoid duplicate logs + research_logger.propagate = False + + # Create JSON handler + json_handler = JSONResearchHandler(json_file) + + return str(log_file), str(json_file), research_logger, json_handler + +def get_research_logger(): + return logging.getLogger('research') + +def get_json_handler(): + return getattr(logging.getLogger('research'), 'json_handler', None) diff --git a/main.py b/main.py index 0f48c2cba..10057a495 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,25 @@ from dotenv import load_dotenv +import logging +from pathlib import Path + +# Create logs directory if it doesn't exist +logs_dir = Path("logs") +logs_dir.mkdir(exist_ok=True) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + # File handler for general application logs + logging.FileHandler('logs/app.log'), + # Stream handler for console output + logging.StreamHandler() + ] +) + +# Create logger instance +logger = 
logging.getLogger(__name__) load_dotenv() @@ -6,5 +27,6 @@ if __name__ == "__main__": import uvicorn - - uvicorn.run(app, host="0.0.0.0", port=8000) + + logger.info("Starting server...") + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d2db4d9d7..cab6c1c77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,4 +45,11 @@ websockets = "^13.1" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +asyncio_mode = "strict" +addopts = "-v" +testpaths = ["tests"] +python_files = "test_*.py" +asyncio_fixture_loop_scope = "function" \ No newline at end of file diff --git a/src/logs_handler.py b/src/logs_handler.py new file mode 100644 index 000000000..353dd7484 --- /dev/null +++ b/src/logs_handler.py @@ -0,0 +1,90 @@ +import logging +import json +import os +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Any + +class CustomLogsHandler: + """A unified custom logs handler for GPT Researcher.""" + + def __init__(self, websocket=None, query=None): + self.websocket = websocket + self.query = query + self.logs: List[Dict[str, Any]] = [] + + # Set up logging configuration + logging.basicConfig(level=logging.INFO) + self.logger = logging.getLogger(__name__) + + # Initialize log file if query is provided + if query: + self.log_file = self._create_log_file() + + def _create_log_file(self): + """Create log file with proper directory structure.""" + # Use the project root directory + project_root = Path(__file__).parent.parent + logs_dir = project_root / "logs" + + # Create logs directory + os.makedirs(logs_dir, exist_ok=True) + + # Create timestamped log file + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + log_file = logs_dir / f"research_{timestamp}.json" + + # Initialize log file with empty structure + initial_data = { + "events": [], + "content": { + "query": self.query, + "sources": [], + "report": "" + } + } + + with open(log_file, 'w') as f: + json.dump(initial_data, f, indent=2) + + return log_file + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + # Append data to logs + self.logs.append(data) + + # Log using logging + self.logger.info(f"Log: {data}") + + # Send to websocket if available + if self.websocket: + await self.websocket.send_json(data) + + # Write to log file if available + if hasattr(self, 'log_file'): + self._append_to_log_file(data) + + except Exception as e: + self.logger.error(f"Error logging data: {e}") + + def _append_to_log_file(self, data: Dict[str, Any]) -> None: + """Append data to the JSON log file.""" + try: + with open(self.log_file, 'r+') as f: + log_data = json.load(f) + log_data["events"].append({ + "timestamp": datetime.now().isoformat(), + "data": data + }) + f.seek(0) + json.dump(log_data, f, indent=2) + f.truncate() + except Exception as e: + self.logger.error(f"Error writing to log file: {e}") + + def clear_logs(self) -> None: + """Clear the logs.""" + self.logs.clear() + self.logger.info("Logs cleared.") \ No newline at end of file diff --git a/src/researcher.py b/src/researcher.py new file mode 100644 index 000000000..97638341f --- /dev/null +++ b/src/researcher.py @@ -0,0 +1,94 @@ +from typing import Dict, Any +import json +from datetime import datetime +from pathlib import Path +import logging +import sys +from .logs_handler import 
CustomLogsHandler +from gpt_researcher.agent import GPTResearcher +from backend.server.logging_config import get_research_logger + +# Configure logging to output to both file and console +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('researcher_debug.log'), + logging.StreamHandler(sys.stdout) + ] +) + +logger = logging.getLogger(__name__) + +class ResearchLogHandler: + """Custom handler to capture GPTResearcher logs""" + def __init__(self, research_logger): + self.logger = research_logger + + async def on_tool_start(self, tool_name: str, **kwargs): + self.logger.info(f"Starting tool: {tool_name}") + self.logger.info(f"Tool parameters: {kwargs}") + + async def on_tool_end(self, tool_name: str, result: Any): + self.logger.info(f"Completed tool: {tool_name}") + self.logger.info(f"Tool result: {result}") + + async def on_agent_action(self, action: str, **kwargs): + self.logger.info(f"Agent action: {action}") + self.logger.info(f"Action details: {kwargs}") + + async def on_research_step(self, step: str, details: Any): + self.logger.info(f"Research step: {step}") + self.logger.info(f"Step details: {details}") + +class Researcher: + def __init__(self, query: str, report_type: str = "research_report"): + self.research_logger = get_research_logger() + self.query = query + self.report_type = report_type + + # Initialize our custom logs handler + self.logs_handler = CustomLogsHandler() + self.research_logger.info(f"Initialized Researcher with query: {query}") + + try: + # Initialize research log handler + self.research_log_handler = ResearchLogHandler(self.research_logger) + + # Initialize GPTResearcher with both handlers + self.researcher = GPTResearcher( + query=query, + report_type=report_type, + websocket=self.logs_handler, + log_handler=self.research_log_handler # Add research log handler + ) + self.research_logger.info("Successfully initialized GPTResearcher") + except Exception as e: + self.research_logger.error(f"Error initializing GPTResearcher: {e}", exc_info=True) + raise + + async def research(self) -> str: + """Conduct research and return the report""" + try: + self.research_logger.info(f"Starting research process for query: {self.query}") + self.research_logger.info(f"Report type: {self.report_type}") + + self.research_logger.info("Beginning research phase") + await self.researcher.conduct_research() + self.research_logger.info("Research phase completed") + + self.research_logger.info("Starting report generation") + report = await self.researcher.write_report() + self.research_logger.info("Report generation completed") + + # Log report summary + report_preview = report[:500] + "..." if len(report) > 500 else report + self.research_logger.info(f"Report preview: {report_preview}") + + return report + + except Exception as e: + self.research_logger.error(f"Error during research: {e}", exc_info=True) + raise + +# ... rest of the code ... 
\ No newline at end of file diff --git a/tests/gptr-logs-handler.py b/tests/gptr-logs-handler.py index fb05694ce..0bbec93a4 100644 --- a/tests/gptr-logs-handler.py +++ b/tests/gptr-logs-handler.py @@ -2,25 +2,7 @@ from typing import List, Dict, Any import asyncio from gpt_researcher import GPTResearcher - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - - def clear_logs(self) -> None: - """Clear the logs.""" - self.logs.clear() # Clear the logs list - logging.info("Logs cleared.") # Log the clearing action +from src.logs_handler import CustomLogsHandler # Update import async def run() -> None: """Run the research process and generate a report.""" @@ -30,7 +12,7 @@ async def run() -> None: tone = "informative" config_path = None - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter researcher = GPTResearcher( query=query, diff --git a/tests/report-types.py b/tests/report-types.py index 073f8336e..e09fec100 100644 --- a/tests/report-types.py +++ b/tests/report-types.py @@ -2,23 +2,9 @@ import asyncio import pytest from gpt_researcher.agent import GPTResearcher -import logging +from src.logs_handler import CustomLogsHandler # Update import from typing import List, Dict, Any -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - # Define the report types to test report_types = [ "research_report", @@ -39,7 +25,7 @@ async def test_gpt_researcher(report_type): if not os.path.exists(output_dir): os.makedirs(output_dir) - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Create an instance of GPTResearcher researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler) diff --git a/tests/research_test.py b/tests/research_test.py index b58d5b92a..56077f8fd 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -18,23 +18,10 @@ import asyncio import logging from typing import List, Dict, Any - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - 
logging.error(f"Error logging data: {e}") # Log any errors +from src.logs_handler import CustomLogsHandler # Update import async def get_report(query: str, report_type: str, sources: list) -> str: - custom_logs_handler = CustomLogsHandler() + custom_logs_handler = CustomLogsHandler(query=query) # Pass query parameter researcher = GPTResearcher(query=query, report_type=report_type, complement_source_urls=False, diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 000000000..c6ff963b7 --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,61 @@ +import pytest +from unittest.mock import AsyncMock +from fastapi import WebSocket +from src.logs_handler import CustomLogsHandler +import os +import json + +@pytest.mark.asyncio +async def test_custom_logs_handler(): + # Mock websocket + mock_websocket = AsyncMock() + mock_websocket.send_json = AsyncMock() + + # Test initialization + handler = CustomLogsHandler(mock_websocket, "test_query") + + # Verify log file creation + assert os.path.exists(handler.log_file) + + # Test sending log data + test_data = { + "type": "logs", + "message": "Test log message" + } + + await handler.send_json(test_data) + + # Verify websocket was called with correct data + mock_websocket.send_json.assert_called_once_with(test_data) + + # Verify log file contents + with open(handler.log_file, 'r') as f: + log_data = json.load(f) + assert len(log_data['events']) == 1 + assert log_data['events'][0]['data'] == test_data + +@pytest.mark.asyncio +async def test_content_update(): + """Test handling of non-log type data that updates content""" + mock_websocket = AsyncMock() + mock_websocket.send_json = AsyncMock() + + handler = CustomLogsHandler(mock_websocket, "test_query") + + # Test content update + content_data = { + "query": "test query", + "sources": ["source1", "source2"], + "report": "test report" + } + + await handler.send_json(content_data) + + mock_websocket.send_json.assert_called_once_with(content_data) + + # Verify log file contents + with open(handler.log_file, 'r') as f: + log_data = json.load(f) + assert log_data['content']['query'] == "test query" + assert log_data['content']['sources'] == ["source1", "source2"] + assert log_data['content']['report'] == "test report" \ No newline at end of file diff --git a/tests/test_logs.py b/tests/test_logs.py new file mode 100644 index 000000000..0f2353959 --- /dev/null +++ b/tests/test_logs.py @@ -0,0 +1,48 @@ +import os +from pathlib import Path +import sys + +# Add the project root to Python path +project_root = Path(__file__).parent.parent +sys.path.append(str(project_root)) + +from src.logs_handler import CustomLogsHandler + +def test_logs_creation(): + # Print current working directory + print(f"Current working directory: {os.getcwd()}") + + # Print project root + print(f"Project root: {project_root}") + + # Try to create logs directory directly + logs_dir = project_root / "logs" + print(f"Attempting to create logs directory at: {logs_dir}") + + try: + # Create directory with full permissions + os.makedirs(logs_dir, mode=0o777, exist_ok=True) + print(f"āœ“ Created directory: {logs_dir}") + + # Test file creation + test_file = logs_dir / "test.txt" + with open(test_file, 'w') as f: + f.write("Test log entry") + print(f"āœ“ Created test file: {test_file}") + + # Initialize the handler + handler = CustomLogsHandler() + print("āœ“ CustomLogsHandler initialized") + + # Test JSON logging + handler.logs.append({"test": "message"}) + print("āœ“ Added test log entry") + + except Exception as e: + 
print(f"āŒ Error: {str(e)}") + print(f"Error type: {type(e)}") + import traceback + print(f"Traceback: {traceback.format_exc()}") + +if __name__ == "__main__": + test_logs_creation() \ No newline at end of file diff --git a/tests/test_researcher.py b/tests/test_researcher.py new file mode 100644 index 000000000..f1d86d294 --- /dev/null +++ b/tests/test_researcher.py @@ -0,0 +1,63 @@ +import pytest +import asyncio +from pathlib import Path +import sys +import logging + +# Add the project root to Python path +project_root = Path(__file__).parent.parent +sys.path.append(str(project_root)) + +# Configure basic logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +@pytest.mark.asyncio +async def test_researcher(): + try: + # Import here to catch any import errors + from src.researcher import Researcher + logger.info("Successfully imported Researcher class") + + # Create a researcher instance + researcher = Researcher( + query="What is the current state of quantum computing?", + report_type="research_report" + ) + logger.info("Created Researcher instance") + + # Run the research + report = await researcher.research() + logger.info("Research completed successfully!") + logger.info(f"Report length: {len(report)}") + + # Basic assertions + assert report is not None + assert len(report) > 0 + + # Check if logs were created + logs_dir = Path(project_root) / "logs" + log_files = list(logs_dir.glob("research_*.log")) + json_files = list(logs_dir.glob("research_*.json")) + + assert len(log_files) > 0, "No log files were created" + assert len(json_files) > 0, "No JSON files were created" + + logger.info(f"\nFound {len(log_files)} log files:") + for log_file in log_files: + logger.info(f"- {log_file.name}") + + logger.info(f"\nFound {len(json_files)} JSON files:") + for json_file in json_files: + logger.info(f"- {json_file.name}") + + except ImportError as e: + logger.error(f"Import error: {e}") + logger.error("Make sure gpt_researcher is installed and in your PYTHONPATH") + raise + except Exception as e: + logger.error(f"Error during research: {e}") + raise + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From 0fc99671e30d0fdc642c634c3f545889abbc6806 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:01:20 -0800 Subject: [PATCH 02/11] Update gitignore and test files --- .gitignore | 18 +---- tests/gptr-logs-handler.py.orig | 52 ++++++++++++++ tests/report-types.py.orig | 63 ++++++++++++++++ tests/research_test.py.orig | 123 ++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 17 deletions(-) create mode 100644 tests/gptr-logs-handler.py.orig create mode 100644 tests/report-types.py.orig create mode 100644 tests/research_test.py.orig diff --git a/.gitignore b/.gitignore index 428cdd82d..c99a4ea62 100644 --- a/.gitignore +++ b/.gitignore @@ -43,20 +43,4 @@ package-lock.json *.swp # Log files -logs/ -*.log -*_log.txt -heroku_logs.txt -memory_profiling_log.txt -server_log.txt - -# Add to existing .gitignore -*.json -*.pdf -*.md -!README.md -!CONTRIBUTING.md -!CODE_OF_CONDUCT.md - -# Backup directories -data_backup/ +logs/ \ No newline at end of file diff --git a/tests/gptr-logs-handler.py.orig b/tests/gptr-logs-handler.py.orig new file mode 100644 index 000000000..fb05694ce --- /dev/null +++ b/tests/gptr-logs-handler.py.orig @@ -0,0 +1,52 @@ +import logging +from typing import List, Dict, Any +import asyncio +from gpt_researcher import GPTResearcher + +class 
CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors + + def clear_logs(self) -> None: + """Clear the logs.""" + self.logs.clear() # Clear the logs list + logging.info("Logs cleared.") # Log the clearing action + +async def run() -> None: + """Run the research process and generate a report.""" + query = "What happened in the latest burning man floods?" + report_type = "research_report" + report_source = "online" + tone = "informative" + config_path = None + + custom_logs_handler = CustomLogsHandler() + + researcher = GPTResearcher( + query=query, + report_type=report_type, + report_source=report_source, + tone=tone, + config_path=config_path, + websocket=custom_logs_handler + ) + + await researcher.conduct_research() # Conduct the research + report = await researcher.write_report() # Write the research report + logging.info("Report generated successfully.") # Log report generation + + return report + +# Run the asynchronous function using asyncio +if __name__ == "__main__": + asyncio.run(run()) diff --git a/tests/report-types.py.orig b/tests/report-types.py.orig new file mode 100644 index 000000000..073f8336e --- /dev/null +++ b/tests/report-types.py.orig @@ -0,0 +1,63 @@ +import os +import asyncio +import pytest +from gpt_researcher.agent import GPTResearcher +import logging +from typing import List, Dict, Any + +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors + +# Define the report types to test +report_types = [ + "research_report", + "subtopic_report" +] + +# Define a common query and sources for testing +query = "What are the latest advancements in AI?" 
+# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] + +# Define the output directory +output_dir = "./outputs" + +@pytest.mark.asyncio +@pytest.mark.parametrize("report_type", report_types) +async def test_gpt_researcher(report_type): + # Ensure the output directory exists + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + custom_logs_handler = CustomLogsHandler() + # Create an instance of GPTResearcher + researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler) + + # Conduct research and write the report + await researcher.conduct_research() + report = await researcher.write_report() + + # Define the expected output filenames + pdf_filename = os.path.join(output_dir, f"{report_type}.pdf") + docx_filename = os.path.join(output_dir, f"{report_type}.docx") + + # Check if the PDF and DOCX files are created + # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}" + # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}" + + # Clean up the generated files (optional) + # os.remove(pdf_filename) + # os.remove(docx_filename) + +if __name__ == "__main__": + pytest.main() \ No newline at end of file diff --git a/tests/research_test.py.orig b/tests/research_test.py.orig new file mode 100644 index 000000000..b58d5b92a --- /dev/null +++ b/tests/research_test.py.orig @@ -0,0 +1,123 @@ +""" +Hi! The following test cases are for the new parameter `complement_source_urls` and fix on the functional error with `source_urls` in GPTResearcher class. + +The source_urls parameter was resetting each time in conduct_research function causing gptr to forget the given links. Now, that has been fixed and a new parameter is introduced. +This parameter named will `complement_source_urls` allow GPTR to research on sources other than the provided sources via source_urls if set to True. +Default is False, i.e., no additional research will be conducted on newer sources. +""" + +## Notes: +## Please uncomment the test case to run and comment the rest. +## Thanks! 
+ + + +#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research) + +from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +import asyncio +import logging +from typing import List, Dict, Any + +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors + +async def get_report(query: str, report_type: str, sources: list) -> str: + custom_logs_handler = CustomLogsHandler() + researcher = GPTResearcher(query=query, + report_type=report_type, + complement_source_urls=False, + websocket=custom_logs_handler) + await researcher.conduct_research() + report = await researcher.write_report() + return report, researcher + +if __name__ == "__main__": + query = "Write an analysis on paul graham" + report_type = "research_report" + sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related + + report, researcher = asyncio.run(get_report(query, report_type, sources)) + print(report) + + print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present + + + +#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. Hence, no relevant context) + +# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" +# report_type = "research_report" +# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED. + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present + + + +#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls) + +# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" 
+# report_type = "research_report" +# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the complement_source_urls is set which should make gptr do default web search to gather contexts + + + +# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set allowing for a larger research scope) + +# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What are the latest advancements in AI?" +# report_type = "research_report" +# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the complement_source_urls is set which should make gptr do default web search to gather more contexts! From 7e11b9cbb8ed9d8bf09907f7b7d2f36a8d253991 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:05:23 -0800 Subject: [PATCH 03/11] added .orig files to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c99a4ea62..df3ad0a8a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,5 @@ package-lock.json *.swp # Log files -logs/ \ No newline at end of file +logs/ +*.orig \ No newline at end of file From eb26cc3ea7a040f2df91b18c735627b523ace751 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:19:44 -0800 Subject: [PATCH 04/11] removed src directory --- src/logs_handler.py | 90 ------------------------------------------- src/researcher.py | 94 --------------------------------------------- 2 files changed, 184 deletions(-) delete mode 100644 src/logs_handler.py delete mode 100644 src/researcher.py diff --git a/src/logs_handler.py b/src/logs_handler.py deleted file mode 100644 index 353dd7484..000000000 --- a/src/logs_handler.py +++ /dev/null @@ -1,90 +0,0 @@ -import logging -import json -import os -from datetime import datetime -from pathlib import Path -from typing import List, Dict, Any - -class CustomLogsHandler: - """A unified custom logs handler for GPT Researcher.""" - - def __init__(self, websocket=None, query=None): - self.websocket = websocket - self.query = query - self.logs: List[Dict[str, Any]] = [] - - # Set up logging configuration - logging.basicConfig(level=logging.INFO) - self.logger = logging.getLogger(__name__) - - # Initialize log file if query is provided - if query: - self.log_file = self._create_log_file() - - def _create_log_file(self): - """Create log file with proper directory structure.""" 
- # Use the project root directory - project_root = Path(__file__).parent.parent - logs_dir = project_root / "logs" - - # Create logs directory - os.makedirs(logs_dir, exist_ok=True) - - # Create timestamped log file - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - log_file = logs_dir / f"research_{timestamp}.json" - - # Initialize log file with empty structure - initial_data = { - "events": [], - "content": { - "query": self.query, - "sources": [], - "report": "" - } - } - - with open(log_file, 'w') as f: - json.dump(initial_data, f, indent=2) - - return log_file - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - # Append data to logs - self.logs.append(data) - - # Log using logging - self.logger.info(f"Log: {data}") - - # Send to websocket if available - if self.websocket: - await self.websocket.send_json(data) - - # Write to log file if available - if hasattr(self, 'log_file'): - self._append_to_log_file(data) - - except Exception as e: - self.logger.error(f"Error logging data: {e}") - - def _append_to_log_file(self, data: Dict[str, Any]) -> None: - """Append data to the JSON log file.""" - try: - with open(self.log_file, 'r+') as f: - log_data = json.load(f) - log_data["events"].append({ - "timestamp": datetime.now().isoformat(), - "data": data - }) - f.seek(0) - json.dump(log_data, f, indent=2) - f.truncate() - except Exception as e: - self.logger.error(f"Error writing to log file: {e}") - - def clear_logs(self) -> None: - """Clear the logs.""" - self.logs.clear() - self.logger.info("Logs cleared.") \ No newline at end of file diff --git a/src/researcher.py b/src/researcher.py deleted file mode 100644 index 97638341f..000000000 --- a/src/researcher.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Dict, Any -import json -from datetime import datetime -from pathlib import Path -import logging -import sys -from .logs_handler import CustomLogsHandler -from gpt_researcher.agent import GPTResearcher -from backend.server.logging_config import get_research_logger - -# Configure logging to output to both file and console -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('researcher_debug.log'), - logging.StreamHandler(sys.stdout) - ] -) - -logger = logging.getLogger(__name__) - -class ResearchLogHandler: - """Custom handler to capture GPTResearcher logs""" - def __init__(self, research_logger): - self.logger = research_logger - - async def on_tool_start(self, tool_name: str, **kwargs): - self.logger.info(f"Starting tool: {tool_name}") - self.logger.info(f"Tool parameters: {kwargs}") - - async def on_tool_end(self, tool_name: str, result: Any): - self.logger.info(f"Completed tool: {tool_name}") - self.logger.info(f"Tool result: {result}") - - async def on_agent_action(self, action: str, **kwargs): - self.logger.info(f"Agent action: {action}") - self.logger.info(f"Action details: {kwargs}") - - async def on_research_step(self, step: str, details: Any): - self.logger.info(f"Research step: {step}") - self.logger.info(f"Step details: {details}") - -class Researcher: - def __init__(self, query: str, report_type: str = "research_report"): - self.research_logger = get_research_logger() - self.query = query - self.report_type = report_type - - # Initialize our custom logs handler - self.logs_handler = CustomLogsHandler() - self.research_logger.info(f"Initialized Researcher with query: {query}") - - try: - # Initialize 
research log handler - self.research_log_handler = ResearchLogHandler(self.research_logger) - - # Initialize GPTResearcher with both handlers - self.researcher = GPTResearcher( - query=query, - report_type=report_type, - websocket=self.logs_handler, - log_handler=self.research_log_handler # Add research log handler - ) - self.research_logger.info("Successfully initialized GPTResearcher") - except Exception as e: - self.research_logger.error(f"Error initializing GPTResearcher: {e}", exc_info=True) - raise - - async def research(self) -> str: - """Conduct research and return the report""" - try: - self.research_logger.info(f"Starting research process for query: {self.query}") - self.research_logger.info(f"Report type: {self.report_type}") - - self.research_logger.info("Beginning research phase") - await self.researcher.conduct_research() - self.research_logger.info("Research phase completed") - - self.research_logger.info("Starting report generation") - report = await self.researcher.write_report() - self.research_logger.info("Report generation completed") - - # Log report summary - report_preview = report[:500] + "..." if len(report) > 500 else report - self.research_logger.info(f"Report preview: {report_preview}") - - return report - - except Exception as e: - self.research_logger.error(f"Error during research: {e}", exc_info=True) - raise - -# ... rest of the code ... \ No newline at end of file From 11eb8800c373117c51758704fdafc79b0f33a596 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:33:20 -0800 Subject: [PATCH 05/11] Remove .orig files from Git tracking --- tests/gptr-logs-handler.py.orig | 52 -------------- tests/research_test.py.orig | 123 -------------------------------- 2 files changed, 175 deletions(-) delete mode 100644 tests/gptr-logs-handler.py.orig delete mode 100644 tests/research_test.py.orig diff --git a/tests/gptr-logs-handler.py.orig b/tests/gptr-logs-handler.py.orig deleted file mode 100644 index fb05694ce..000000000 --- a/tests/gptr-logs-handler.py.orig +++ /dev/null @@ -1,52 +0,0 @@ -import logging -from typing import List, Dict, Any -import asyncio -from gpt_researcher import GPTResearcher - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - - def clear_logs(self) -> None: - """Clear the logs.""" - self.logs.clear() # Clear the logs list - logging.info("Logs cleared.") # Log the clearing action - -async def run() -> None: - """Run the research process and generate a report.""" - query = "What happened in the latest burning man floods?" 
- report_type = "research_report" - report_source = "online" - tone = "informative" - config_path = None - - custom_logs_handler = CustomLogsHandler() - - researcher = GPTResearcher( - query=query, - report_type=report_type, - report_source=report_source, - tone=tone, - config_path=config_path, - websocket=custom_logs_handler - ) - - await researcher.conduct_research() # Conduct the research - report = await researcher.write_report() # Write the research report - logging.info("Report generated successfully.") # Log report generation - - return report - -# Run the asynchronous function using asyncio -if __name__ == "__main__": - asyncio.run(run()) diff --git a/tests/research_test.py.orig b/tests/research_test.py.orig deleted file mode 100644 index b58d5b92a..000000000 --- a/tests/research_test.py.orig +++ /dev/null @@ -1,123 +0,0 @@ -""" -Hi! The following test cases are for the new parameter `complement_source_urls` and fix on the functional error with `source_urls` in GPTResearcher class. - -The source_urls parameter was resetting each time in conduct_research function causing gptr to forget the given links. Now, that has been fixed and a new parameter is introduced. -This parameter named will `complement_source_urls` allow GPTR to research on sources other than the provided sources via source_urls if set to True. -Default is False, i.e., no additional research will be conducted on newer sources. -""" - -## Notes: -## Please uncomment the test case to run and comment the rest. -## Thanks! - - - -#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research) - -from gpt_researcher.agent import GPTResearcher # Ensure this path is correct -import asyncio -import logging -from typing import List, Dict, Any - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - -async def get_report(query: str, report_type: str, sources: list) -> str: - custom_logs_handler = CustomLogsHandler() - researcher = GPTResearcher(query=query, - report_type=report_type, - complement_source_urls=False, - websocket=custom_logs_handler) - await researcher.conduct_research() - report = await researcher.write_report() - return report, researcher - -if __name__ == "__main__": - query = "Write an analysis on paul graham" - report_type = "research_report" - sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related - - report, researcher = asyncio.run(get_report(query, report_type, sources)) - print(report) - - print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present - - - -#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. 
Hence, no relevant context) - -# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct -# import asyncio - -# async def get_report(query: str, report_type: str, sources: list) -> str: -# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) -# await researcher.conduct_research() -# report = await researcher.write_report() -# return report, researcher - -# if __name__ == "__main__": -# query = "What is Microsoft's business model?" -# report_type = "research_report" -# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED. - -# report, researcher = asyncio.run(get_report(query, report_type, sources)) -# print(report) - -# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present - - - -#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls) - -# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct -# import asyncio - -# async def get_report(query: str, report_type: str, sources: list) -> str: -# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) -# await researcher.conduct_research() -# report = await researcher.write_report() -# return report, researcher - -# if __name__ == "__main__": -# query = "What is Microsoft's business model?" -# report_type = "research_report" -# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED - -# report, researcher = asyncio.run(get_report(query, report_type, sources)) -# print(report) - -# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the complement_source_urls is set which should make gptr do default web search to gather contexts - - - -# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set allowing for a larger research scope) - -# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct -# import asyncio - -# async def get_report(query: str, report_type: str, sources: list) -> str: -# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) -# await researcher.conduct_research() -# report = await researcher.write_report() -# return report, researcher - -# if __name__ == "__main__": -# query = "What are the latest advancements in AI?" -# report_type = "research_report" -# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related - -# report, researcher = asyncio.run(get_report(query, report_type, sources)) -# print(report) - -# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the complement_source_urls is set which should make gptr do default web search to gather more contexts! 
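
Taken together, the removed test file boils down to the following minimal usage sketch of the complement_source_urls flag (a hypothetical standalone script; it assumes the same GPTResearcher constructor arguments exercised in the test cases above):

import asyncio
from gpt_researcher.agent import GPTResearcher

async def get_report(query: str, sources: list) -> str:
    # complement_source_urls=True lets the agent search the web in addition to
    # the provided source_urls; the default (False) restricts research to them.
    researcher = GPTResearcher(
        query=query,
        report_type="research_report",
        source_urls=sources,
        complement_source_urls=True,
    )
    await researcher.conduct_research()
    return await researcher.write_report()

if __name__ == "__main__":
    query = "What are the latest advancements in AI?"
    sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"]
    print(asyncio.run(get_report(query, sources)))
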
From add4bffac288b0626632f4cac0f535f23f89cfe6 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:41:35 -0800 Subject: [PATCH 06/11] Remove report-types.py.orig from Git tracking --- tests/report-types.py.orig | 63 -------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 tests/report-types.py.orig diff --git a/tests/report-types.py.orig b/tests/report-types.py.orig deleted file mode 100644 index 073f8336e..000000000 --- a/tests/report-types.py.orig +++ /dev/null @@ -1,63 +0,0 @@ -import os -import asyncio -import pytest -from gpt_researcher.agent import GPTResearcher -import logging -from typing import List, Dict, Any - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data - logging.basicConfig(level=logging.INFO) # Set up logging configuration - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it, with error handling.""" - try: - self.logs.append(data) # Append data to logs - logging.info(f"My custom Log: {data}") # Use logging instead of print - except Exception as e: - logging.error(f"Error logging data: {e}") # Log any errors - -# Define the report types to test -report_types = [ - "research_report", - "subtopic_report" -] - -# Define a common query and sources for testing -query = "What are the latest advancements in AI?" -# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] - -# Define the output directory -output_dir = "./outputs" - -@pytest.mark.asyncio -@pytest.mark.parametrize("report_type", report_types) -async def test_gpt_researcher(report_type): - # Ensure the output directory exists - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - custom_logs_handler = CustomLogsHandler() - # Create an instance of GPTResearcher - researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler) - - # Conduct research and write the report - await researcher.conduct_research() - report = await researcher.write_report() - - # Define the expected output filenames - pdf_filename = os.path.join(output_dir, f"{report_type}.pdf") - docx_filename = os.path.join(output_dir, f"{report_type}.docx") - - # Check if the PDF and DOCX files are created - # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}" - # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}" - - # Clean up the generated files (optional) - # os.remove(pdf_filename) - # os.remove(docx_filename) - -if __name__ == "__main__": - pytest.main() \ No newline at end of file From e525e94fc624d39231b6069d613e74d2d9cb6df0 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:07:10 -0800 Subject: [PATCH 07/11] changed name of test_researcher to avoid confusion with prior test of similar name --- ...esearcher.py => test_researcher_logging.py} | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) rename tests/{test_researcher.py => test_researcher_logging.py} (74%) diff --git a/tests/test_researcher.py b/tests/test_researcher_logging.py similarity index 74% rename from tests/test_researcher.py rename to tests/test_researcher_logging.py index f1d86d294..ebf6e7a94 100644 --- a/tests/test_researcher.py +++ b/tests/test_researcher_logging.py @@ -13,15 +13,19 @@ 
logger = logging.getLogger(__name__) @pytest.mark.asyncio -async def test_researcher(): +async def test_researcher_logging(): # Renamed function to be more specific + """ + Test suite for verifying the researcher's logging infrastructure. + Ensures proper creation and formatting of log files. + """ try: # Import here to catch any import errors from src.researcher import Researcher logger.info("Successfully imported Researcher class") - # Create a researcher instance + # Create a researcher instance with a logging-focused query researcher = Researcher( - query="What is the current state of quantum computing?", + query="Test query for logging verification", report_type="research_report" ) logger.info("Created Researcher instance") @@ -31,25 +35,29 @@ async def test_researcher(): logger.info("Research completed successfully!") logger.info(f"Report length: {len(report)}") - # Basic assertions + # Basic report assertions assert report is not None assert len(report) > 0 - # Check if logs were created + # Detailed log file verification logs_dir = Path(project_root) / "logs" log_files = list(logs_dir.glob("research_*.log")) json_files = list(logs_dir.glob("research_*.json")) + # Verify log files exist assert len(log_files) > 0, "No log files were created" assert len(json_files) > 0, "No JSON files were created" + # Log the findings logger.info(f"\nFound {len(log_files)} log files:") for log_file in log_files: logger.info(f"- {log_file.name}") + # Could add additional checks for log file format/content here logger.info(f"\nFound {len(json_files)} JSON files:") for json_file in json_files: logger.info(f"- {json_file.name}") + # Could add additional checks for JSON file structure here except ImportError as e: logger.error(f"Import error: {e}") From aa371cada5eae8329354e932c836fb360d603377 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:44:27 -0800 Subject: [PATCH 08/11] removed memory profiler from server This was cruff from a different branch I was working on. My bad. 
--- backend/server/server.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/backend/server/server.py b/backend/server/server.py index 9dcb1b968..fe12dfbaf 100644 --- a/backend/server/server.py +++ b/backend/server/server.py @@ -88,9 +88,6 @@ class ConfigRequest(BaseModel): # Startup event -from psutil import Process -import logging - @app.on_event("startup") def startup_event(): os.makedirs("outputs", exist_ok=True) @@ -102,11 +99,6 @@ def startup_event(): research_logger.json_handler = json_handler # Store the JSON handler on the logger research_logger.info(f"Research log file: {log_file}") research_logger.info(f"Research JSON file: {json_file}") - - # Log memory usage - process = Process() - mem_info = process.memory_info() - research_logger.info(f"Memory usage at startup: {mem_info.rss / 1024 ** 2:.2f} MB") # Routes @@ -123,10 +115,7 @@ async def list_files(): return {"files": files} -from memory_profiler import profile - @app.post("/api/multi_agents") -@profile async def run_multi_agents(): return await execute_multi_agents(manager) @@ -141,15 +130,10 @@ async def delete_file(filename: str): return await handle_file_deletion(filename, DOC_PATH) -from psutil import Process - @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): - process = Process() await manager.connect(websocket) try: - mem_info = process.memory_info() - print(f"Memory usage during WebSocket connection: {mem_info.rss / 1024 ** 2:.2f} MB") await handle_websocket_communication(websocket, manager) except WebSocketDisconnect: await manager.disconnect(websocket) From 70de3c4e493c3b922264e2bbcd9258419c298563 Mon Sep 17 00:00:00 2001 From: Kelly Abbott <74297+kga245@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:26:57 -0800 Subject: [PATCH 09/11] Truncate file names I noticed that if you do a very long query the file names that get generated can be larger than the host permits. So I added some truncation that should work pretty universally to limit the file name to under <250 characters total including the important naming convention of the prefixes. 
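
The intent is easiest to see as a small sketch (illustrative names only, assuming a 255-character limit; the actual change to sanitize_filename is in the diff that follows):

def truncate_task_filename(filename: str) -> str:
    # Split "task_<timestamp>_<query text>" into its components.
    prefix, timestamp, *task_parts = filename.split('_')
    task = '_'.join(task_parts)
    # Budget: 255 minus "outputs/", "task_", the timestamp, "_.json" and a
    # safety margin, which leaves roughly 216 characters for the query text.
    max_task_length = 255 - 8 - 5 - 10 - 6 - 10
    return f"{prefix}_{timestamp}_{task[:max_task_length]}"

# Example: a 500-character query still yields a name of about 230 characters,
# safely under the assumed 255-character limit.
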
--- backend/server/server_utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/server/server_utils.py b/backend/server/server_utils.py index 1c472be6f..26a7ac90e 100644 --- a/backend/server/server_utils.py +++ b/backend/server/server_utils.py @@ -99,7 +99,20 @@ async def research(self) -> dict: } def sanitize_filename(filename: str) -> str: - return re.sub(r"[^\w\s-]", "", filename).strip() + # Split into components + prefix, timestamp, *task_parts = filename.split('_') + task = '_'.join(task_parts) + + # Calculate max length for task portion + # 255 - len("outputs/") - len("task_") - len(timestamp) - len("_.json") - safety_margin + max_task_length = 255 - 8 - 5 - 10 - 6 - 10 # ~216 chars for task + + # Truncate task if needed + truncated_task = task[:max_task_length] if len(task) > max_task_length else task + + # Reassemble and clean the filename + sanitized = f"{prefix}_{timestamp}_{truncated_task}" + return re.sub(r"[^\w\s-]", "", sanitized).strip() async def handle_start_command(websocket, data: str, manager): From 5d1dc7e02e89c721c85e62890afbb7c8edeea255 Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Sat, 21 Dec 2024 21:24:32 +0200 Subject: [PATCH 10/11] added back report source conditions in researcher.py --- gpt_researcher/skills/researcher.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index 5235bfd98..89c0de507 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -94,6 +94,34 @@ async def conduct_research(self): research_data = await self._get_context_by_web_search(self.researcher.query) # ... rest of the conditions ... + elif self.researcher.report_source == ReportSource.Local.value: + document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() + if self.researcher.vector_store: + self.researcher.vector_store.load(document_data) + + research_data = await self._get_context_by_web_search(self.researcher.query, document_data) + + # Hybrid search including both local documents and web sources + elif self.researcher.report_source == ReportSource.Hybrid.value: + document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() + if self.researcher.vector_store: + self.researcher.vector_store.load(document_data) + docs_context = await self._get_context_by_web_search(self.researcher.query, document_data) + web_context = await self._get_context_by_web_search(self.researcher.query) + research_data = f"Context from local documents: {docs_context}\n\nContext from web sources: {web_context}" + + elif self.researcher.report_source == ReportSource.LangChainDocuments.value: + langchain_documents_data = await LangChainDocumentLoader( + self.researcher.documents + ).load() + if self.researcher.vector_store: + self.researcher.vector_store.load(langchain_documents_data) + research_data = await self._get_context_by_web_search( + self.researcher.query, langchain_documents_data + ) + + elif self.researcher.report_source == ReportSource.LangChainVectorStore.value: + research_data = await self._get_context_by_vectorstore(self.researcher.query, self.researcher.vector_store_filter) # Rank and curate the sources self.researcher.context = research_data From e1535bf71ec1026d78067a99cf58563f1302c83a Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Sat, 21 Dec 2024 21:57:39 +0200 Subject: [PATCH 11/11] hide download logs button for mutli_agents report --- frontend/nextjs/app/page.tsx | 1 + 
.../nextjs/components/ResearchBlocks/AccessReport.tsx | 10 ++++++---- frontend/nextjs/components/ResearchResults.tsx | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/frontend/nextjs/app/page.tsx b/frontend/nextjs/app/page.tsx index e5bd5fd5e..2cc301af9 100644 --- a/frontend/nextjs/app/page.tsx +++ b/frontend/nextjs/app/page.tsx @@ -257,6 +257,7 @@ export default function Home() { orderedData={orderedData} answer={answer} allLogs={allLogs} + chatBoxSettings={chatBoxSettings} handleClickSuggestion={handleClickSuggestion} /> diff --git a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx index 21996cce3..f35a3a159 100644 --- a/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx +++ b/frontend/nextjs/components/ResearchBlocks/AccessReport.tsx @@ -6,11 +6,13 @@ interface AccessReportProps { pdf?: string; docx?: string; json?: string; - }; + }; + chatBoxSettings: any; + logs?: any[]; report: string; } -const AccessReport: React.FC = ({ accessData, report }) => { +const AccessReport: React.FC = ({ accessData, chatBoxSettings, report }) => { const host = getHost(); const getReportLink = (dataType: 'pdf' | 'docx' | 'json'): string => { @@ -39,13 +41,13 @@ const AccessReport: React.FC = ({ accessData, report }) => { rel="noopener noreferrer"> Download DocX - Download Logs - + } ); }; diff --git a/frontend/nextjs/components/ResearchResults.tsx b/frontend/nextjs/components/ResearchResults.tsx index 218ad661b..7cc1c18b2 100644 --- a/frontend/nextjs/components/ResearchResults.tsx +++ b/frontend/nextjs/components/ResearchResults.tsx @@ -13,6 +13,7 @@ interface ResearchResultsProps { orderedData: Data[]; answer: string; allLogs: any[]; + chatBoxSettings: any; handleClickSuggestion: (value: string) => void; } @@ -20,6 +21,7 @@ export const ResearchResults: React.FC = ({ orderedData, answer, allLogs, + chatBoxSettings, handleClickSuggestion }) => { const groupedData = preprocessOrderedData(orderedData); @@ -72,7 +74,7 @@ export const ResearchResults: React.FC = ({ {sourceComponents} {imageComponents} {finalReport && } - {pathData && } + {pathData && } {chatComponents} );