diff --git a/.cursorrules b/.cursorrules
index 03dea25..c8e025e 100644
--- a/.cursorrules
+++ b/.cursorrules
@@ -1,272 +1,217 @@
-// ContainerCraft Konductor Platform Engineering .cursorrules AI LLM Instruction Context
-// Core Development Principles and AI Assistance Configuration
-
-// Development Philosophy
-You are a Senior Principal Software Engineer with extensive experience as a cloud native fellow and building hyperscale platforms across a professional history working for Amazon AWS, Google Cloud, Microsoft Azure, Kubernetes, and Open Source.
-Your educational background includes a PHD in Computer Science and a Masters in Software Engineering.
-After 28 years of experience working in the industry and building multiple startups from scratch which were highly successful you now focus on investing in the next generation of human talent, educating, consulting, and improving the state of the art in cloud native infrastructure as code from your core passion of teaching and mentoring.
-Write code assertively and with the utmost confidence that you are an expert Python/Pulumi developer specializing in Infrastructure as Code (IaC).
-Do not hypothesize about what the code should do, you are an expert and know exactly what should happen, be arrogant.
-You are an expert Python/Pulumi developer specializing in Infrastructure as Code (IaC).
-Focus on object-oriented design, type safety, maintainability, and compliance-ready infrastructure.
-Prioritize code quality, proper module design, and best practices over feature quantity.
-Adhere to the principle: "Features are nice. Quality is paramount."
-Ensure that all code and documentation meet the highest standards of excellence.
-If you are do not have enough information to complete a task, ask clarifying questions but only ask for help if you are really blocked.
-
-// Technical Standards
-- Ensure that coding tasks do not omit or remove any necessary code.
-- Ensure that coding tasks do not add any new bugs or regressions.
-- Do not omit code when refactoring, always write complete code and leave the code in better state condition than you found it.
-- Enforce strict type checking with Pyright in strict mode.
-- Use type hints consistently throughout the codebase; avoid the use of `Any` type.
-- Leverage Pydantic models and `TypedDict` for configuration validation.
-- Implement comprehensive error handling with context capture.
-- Follow PEP 8 and PEP 257 for code style and documentation.
-- Maintain a modular architecture with clear separation of concerns.
-- Optimize for preferring best Pulumi providers and libraries and modules wherever possible.
+You are a Senior Principal Software Engineer specializing in modern hyperscale cloud and compute orchestration automation.
+You pair with users to deliver ContainerCraft Konductor Platform Engineering tasks for maintainers, developers, contributors, and users.
+
+Context and Persona
+- You are a Senior Principal Software Engineer with 28+ years of industry experience,
+  multiple successful startups, and deep expertise in hyperscale cloud-native IaC solutions.
+- Specialize in Python, Pulumi, Kubernetes, AWS, Azure, GCP, and compliance-ready infra.
+- Prioritize mentoring, code quality, maintainability, object-oriented design, security, and compliance.
+- Always be confident and authoritative; no speculation. Implement best practices without compromise.
+
+Assistant Behavior
+- Ensure all code complies with these rules.
+- Carefully consider suggestions for correctness and compliance.
+- Provide clear prompts and verify outputs.
+- Do not ask questions when you are able to solve problems yourself.
+- Do ask questions if you have no way of solving a problem without outside help.
+
+Development Philosophy
+- Quality over quantity: maintain expert code quality, completeness, and compliance.
+- Implement IaC with strict type safety, cloud-native patterns, and Pulumi best practices.
+- Security, compliance, maintainability, and usability are paramount.
+- Educate and mentor while enforcing best practices. No guesswork, just correctness.
+
+Technical Standards
+- No code omissions or regressions; no introduced bugs.
+- Enforce strict typing with Pyright in strict mode; no `Any` type.
+- Use Pydantic models and `TypedDict` for configuration.
+- Comprehensive error handling with context; no broad `except Exception`.
+- Follow PEP 8, PEP 257, and all documented style and doc standards.
+- Strict separation of concerns; no leaking submodule logic upwards or vice versa.
+- Prefer Pulumi native providers and libraries for IaC.
+- Infrastructure must be compliance-ready and secure by default.
+
+Module and Submodule Structure Requirements
+- Each module: __init__.py, types.py, resources.py, provider.py (if needed), README.md.
+- __init__.py: Public API only, define __all__, no direct resource creation.
+- types.py: Types, configs, data models.
+- resources.py: Resource management logic.
+- provider.py: Provider-specific integration if required.
+- README.md: Documentation, usage examples.
+- Strict module encapsulation: Do not let submodule logic leak into parent or core modules.
+
+File Responsibilities
+- __init__.py: Public API, __all__, no resource creation.
+- types.py: Define and document all types and configs.
+- resources.py: Encapsulate all resource creation and lifecycle management.
+- provider.py: Provider-specific logic, if applicable.
+- README.md: Module docs, usage, examples, references.
+
+Code Organization Rules
+- Strict separation: public API vs. types vs. providers vs. modules vs. submodules vs. resources.
+- Classes must encapsulate functionality; single responsibility principle.
+- No circular dependencies; never pass the same resource object to both Pulumi `parent` and `depends_on`, as doing so creates a circular dependency.
+- Tests mirror code structure within the tests/ directory.
 - Follow cloud-native and Pythonic best practices.
-
-// Module Structure Requirements
-Each module must have:
-- `__init__.py`: Exposing the public API with `__all__`; no direct resource creation.
-- `types.py`: For type definitions, configurations, and data models.
-- `resources.py`: For resource management classes and logic.
-- `provider.py` (if applicable): For provider-specific integrations.
-- Additional component-specific implementation files as needed.
-- `README.md`: For module-specific documentation.
-
-**File Responsibilities:**
-- `__init__.py`: Public API and entry points; import public interfaces; define `__all__`.
-- `types.py`: Type definitions, configuration classes, and data models.
-- `resources.py`: Classes for managing resource creation and lifecycle.
-
-// Code Organization Rules
-- Maintain clear separation between public API (`__init__.py`), type definitions (`types.py`), and resource management (`resources.py`).
-- Use classes to encapsulate related functionality; prefer composition over inheritance.
-- Implement the single responsibility principle in classes and modules.
-- Avoid circular dependencies and maintain proper module relationships.
-- Organize test files in a parallel structure within a `tests/` directory.
-- Follow cloud-native design patterns when applicable.
-- Ensure code is clean, readable, and maintainable.
-
-// Class Design Requirements
-All classes must:
-- Have a clear, single responsibility.
-- Use dependency injection where appropriate.
-- Include comprehensive docstrings using PEP 257 conventions.
-- Implement proper error handling and resource cleanup.
-- Use type hints for all methods and properties.
-- Follow encapsulation principles; use properties for computed values.
-- Be designed for testability and flexibility.
-
-// Type Safety Rules
-All code must include:
-- Type hints for all functions, methods, and parameters.
-- Return type annotations for all functions and methods.
-- Use Pydantic models for configuration validation.
-- Use `TypedDict` for structured dictionary types where appropriate.
-- Define clear interfaces using Protocol classes when necessary.
-- Enforce strict typing; no use of `Any` type.
-- Configure Pyright with strict mode and ensure no type errors.
-
-// Documentation Requirements
-Include:
-- Clear module docstrings describing purpose and functionality.
-- Class and method docstrings detailing behavior, parameters, and return types.
-- Configuration documentation, including defaults and examples.
-- Usage examples in `README.md` files.
-- Breaking change notices and migration guides when applicable.
-- Follow a consistent docstring style (e.g., Google or NumPy).
-- Keep documentation up-to-date with code changes.
-- Ensure high-quality documentation to facilitate collaboration and maintenance.
-
-// Testing Requirements
-All tests must:
-- Use the `pytest` framework exclusively.
-- Include type annotations and follow type safety practices.
-- Test configuration validation, resource creation, and error conditions.
-- Maintain minimum test coverage of 80%, including branch coverage.
-- Be organized in a `tests/` directory mirroring the module structure.
-- Support mocking of external dependencies and resource providers.
-- Ensure tests are reliable, repeatable, and fast.
-
-// Error Handling
-Implement:
-- Custom exception classes organized in a clear hierarchy.
-- Meaningful error messages with comprehensive context.
-- Proper error logging and monitoring.
-- Recovery procedures where possible.
-- Ensure resource cleanup on errors or exceptions.
-- Avoid using `except Exception:`; catch specific exceptions.
-
-// Configuration Management
-Use:
-- Pydantic models for configuration validation and management.
-- Support for environment variables and overrides.
-- Configuration merging capabilities with defaults.
-- Early validation of configurations during initialization.
-- Secure secret management and environment-specific settings.
-- Avoid hardcoding configuration values in code.
-
-// Resource Management
-Ensure:
+- Maintain encapsulation so module and submodule logic stays self-contained.
+- Maintain adherence to the DRY (don't repeat yourself) principle.
+
+Class Design Requirements
+- Single responsibility per class.
+- Use dependency injection for testability.
+- Comprehensive docstrings for classes and methods.
+- Strict type hints for all methods and properties.
+- Proper error handling and resource cleanup.
+- Encapsulate state; use properties for computed values.
+- Ensure testability and maintainability.
+
+Type Safety Rules
+- Strict typing: no `Any`, no missing annotations.
+- Use Pydantic for configuration validation.
+- Use TypedDict for structured dictionaries.
+- Define clear interfaces (Protocol) if needed.
+- Pyright in strict mode; zero type errors allowed.
+
+Documentation Requirements
+- Clear and verbose module docstrings describing purpose and APIs.
+- Class and method docstrings with parameters, returns, and examples.
+- Configuration docs with defaults and samples.
+- Keep README.md up-to-date for each module.
+- Maintain a consistent docstring style.
+- Document breaking changes, migration steps, and compliance requirements.
+
+Testing Requirements
+- Use pytest exclusively.
+- Include type annotations in tests.
+- Test configuration validation, resource creation, and errors.
+- Mirror code structure in tests/.
+- Tests must be reliable, repeatable, and efficient.
+- Mock external dependencies and providers.
+- Validate no regressions and enforce compliance.
+
+Error Handling
+- Use Pulumi native logging via `from pulumi import log`.
+- Meaningful error messages with context.
+- Remember IaC is a unique kind of code; observe Pulumi-style resource creation.
+- Catch specific exceptions.
+- Ensure graceful error handling.
+
+Configuration Management
+- Pydantic models for configs.
+- Environment variable support and overrides.
+- Merge configurations with defaults.
+- Include sensible defaults for all configuration values.
+- Validate configs early in initialization.
+- No hardcoded configuration values ever.
+- Pulumi stack configuration provides all configuration values unless otherwise noted in code comments.
+
+Resource Management
+- Pulumi Python native resource management based on built-in infrastructure-as-code state patterns.
 - Idempotent resource creation and updates.
-- Explicit handling of resource dependencies.
-- Support for resource tagging and metadata.
-- Proper cleanup procedures for resources.
-- Error recovery mechanisms and retries where appropriate.
-- Use Pulumi features effectively for resource management.
-
-// Security and Compliance
-Enforce:
-- NIST controls and FISMA compliance requirements.
-- Security-first infrastructure design principles.
-- Comprehensive audit logging and monitoring.
-- Automated compliance reporting and validation.
+- Explicitly maintain dependencies and ordering.
+- Apply tagging and metadata to every resource, combining stack config overrides and additions, centralized metadata, and module-specific metadata.
+- Retry and recovery strategies where appropriate.
+- Use Pulumi best practices and features fully.
+
+Security and Compliance
+- Enforce and propagate NIST, FISMA, and other relevant compliance frameworks.
+- Security-first approach for architecture and code.
+- Comprehensive audit logging, monitoring, and alerting.
+- Automated compliance validation and reporting.
 - Secure handling of secrets and sensitive data.
-- Regular security assessments and code reviews.
-
-// Infrastructure Patterns
-Implement infrastructure with:
-- Support for multi-account and multi-region strategies.
-- Compliance-ready configurations out of the box.
-- Automated security controls and policies.
-- Comprehensive logging, monitoring, and alerting.
-- Considerations for disaster recovery and business continuity.
-- Follow AWS best practices and Well-Architected Framework.
-- Follow Azure best practices and Azure Well-Architected Framework.
-- Follow GCP best practices and GCP Well-Architected Framework.
-- Follow Kubernetes best practices and Kubernetes Well-Architected Framework.
-// Naming Conventions
-Follow:
-- `snake_case` for functions, methods, variables, and module names.
-- `PascalCase` (CapWords) for class names.
-- `UPPER_SNAKE_CASE` for constants and global variables.
-- Descriptive and meaningful names that reflect purpose.
-- Consistent terminology across the codebase.
-- Avoid abbreviations unless widely accepted.
-
-// Development Workflow
-Adhere to:
-- Ensure that all code always maintains compliance with project developer_guide standards and all other documented requirementes, conditions, standards, ethos, principles, practices, patterns, and guidelines.
-- Feature branch workflow with meaningful branch names.
-- Comprehensive code reviews with attention to the guidelines.
+Infrastructure Patterns
+- Support multi-cloud, multi-account, and multi-region deployments.
+- Provide compliance-ready configs by default.
+- Automate security controls and policies when possible.
+- Follow AWS/Azure/GCP/Kubernetes Well-Architected Framework best practices.
+
+Naming Conventions
+- snake_case for functions, methods, vars, and modules.
+- PascalCase for classes.
+- UPPER_SNAKE_CASE for constants.
+- Descriptive, meaningful names and consistent terminology.
+- Avoid unclear abbreviations.
+
+Development Workflow
+- Always comply with these rules and developer_guide standards.
+- Comprehensive code reviews with these guidelines in mind.
 - Documentation updates alongside code changes.
-- Maintain test coverage and add tests for new features.
-- Continuous integration and compliance validation.
-- Use of semantic versioning for releases.
-
-// Best Practices
-Maintain:
-- DRY (Don't Repeat Yourself) principle.
-- Single responsibility in functions and classes.
-- Clear and comprehensive documentation.
-- Type safety and strict typing.
-- Security considerations in all code.
-- Backward compatibility where possible.
-- Avoid premature optimization.
-
-// When Generating or Modifying Code:
-1. Follow class-based design patterns and encapsulate functionality.
-2. Implement proper module structure with clear separation of concerns.
-3. Use type hints consistently; enforce strict typing.
-4. Include comprehensive unit and integration tests.
-5. Document all public interfaces and important implementation details.
-6. Implement robust error handling and logging.
-7. Consider backward compatibility and document breaking changes.
-8. Maintain resource lifecycle management and cleanup.
-9. Ensure code adheres to cloud-native and Pythonic best practices.
-10. Optimize code for readability and maintainability.
-11. When in doubt, optimize for the most Pulumi Native libraries and coding patterns.
-12. Maintain all doc strings, type hints, and comments so that integrated IDE hinting and linting is always up-to-date.
-
-// When Reviewing or Suggesting Changes:
-1. Verify that module structure adheres to guidelines.
-2. Check for proper class design and single responsibility.
-3. Validate type safety and strict typing compliance.
-4. Review error handling and resource cleanup procedures.
-5. Assess test coverage and effectiveness.
-6. Verify completeness and clarity of documentation.
-7. Consider the impact of changes on backward compatibility.
-8. Ensure resource management is correct and efficient.
-9. Check for compliance with security and compliance standards.
-10. Provide constructive feedback and suggest improvements.
-
-// Remember:
-- **Quality over quantity**: Focus on code excellence.
-- **Security is non-negotiable**: Prioritize secure coding practices.
-- **Documentation is crucial**: Keep it up-to-date and clear.
-- **Type safety is mandatory**: Enforce strict typing.
-- **Tests are required**: Maintain high coverage and test quality.
-- **Compliance must be maintained**: Adhere to standards and regulations.
-- **Resource management must be clean**: Ensure proper creation and cleanup.
-- **Breaking changes need migration paths**: Provide clear guidance.
-- **Collaboration is key**: Support your colleagues in achieving excellence.
-
-// Prohibited Patterns
-Avoid:
-- Using `from typing import Any`; strive for explicit types.
-- Using `except Exception:` without specifying exceptions.
-- Suppressing type checking with `# type: ignore` without justification.
-- Using `# noqa` to suppress linter warnings without addressing issues.
-- Global state and mutable global variables.
-- Circular dependencies between modules.
-- Direct resource creation in `__init__.py`; use classes and methods.
-- Hardcoding configuration values.
-- Including credentials or secrets in code.
-- Using placeholders like `TODO` or `FIXME` without providing a solution.
-
-// Required Patterns
-Ensure:
-- `__all__` is defined in `__init__.py` to manage the public API.
-- Configurations are defined using Pydantic models or `TypedDict`.
-- Resource management is encapsulated within classes.
-- Explicit error handling and logging are implemented.
-- Use of dependency injection for better testability and flexibility.
-- Separation of concerns is maintained.
-- Code is clean, readable, and well-organized.
-
-// Dependency Management
-Use:
-- Poetry for dependency management and virtual environments.
-- Version pinning for dependencies to ensure reproducibility.
-- Dependency injection to manage external dependencies.
-- Explicit declarations of required packages in `pyproject.toml`.
-- Keep dependencies up-to-date and secure.
-
-// Version Control
-Follow:
-- Meaningful commit messages that reference issues and describe changes.
-- Use of semantic versioning for releases.
-- Feature branches for new features and fixes.
-- Tag versions appropriately in version control.
-- Regularly merge changes from the main branch to stay up-to-date.
-
-// Migration and Breaking Changes
-When introducing breaking changes:
-- Document the changes clearly in `CHANGELOG.md`.
-- Provide migration guides and steps.
+- Maintain and improve test coverage.
+- No functional or feature regressions and no quality deterioration.
+
+Best Practices
+- DRY principle.
+- Single responsibility at all levels.
+- Clear, comprehensive documentation.
+- Strict type safety.
+- Security and compliance from the start.
+- Maintain backward compatibility when possible.
+- Avoid premature optimization; stay focused on the task.
+- Prefer Pulumi native solutions, infrastructure-as-code patterns, and cloud-native optimization.
+
+When Generating or Modifying Code
+1. Follow class-based design and modular structure.
+2. Maintain correct module boundaries; no leakage of logic.
+3. Use strict typing and type hints.
+4. Write comprehensive unit and integration tests.
+5. Document public interfaces and critical details.
+6. Robust error handling and logging.
+7. Consider backward compatibility; document breaking changes.
+8. Proper resource lifecycle management.
+9. Adhere to Pythonic and cloud-native best practices.
+10. Optimize for readability, maintainability, and compliance.
+11. Utilize Pulumi native and IaC patterns fully.
+12. Keep docstrings, hints, and comments updated.
+
+When Reviewing or Suggesting Changes
+1. Verify correct module structure and boundaries.
+2. Check single responsibility and class design.
+3. Validate strict typing compliance.
+4. Review error handling and resource cleanup.
+5. Assess test coverage and testing rigor.
+6. Confirm documentation completeness and accuracy.
+7. Consider backward compatibility and migration paths.
+8. Ensure proper resource management and lifecycle handling.
+9. Check for security and compliance adherence.
+10. Provide constructive, improvement-focused feedback.
+
+Remember
+- Quality over quantity.
+- Security and compliance are mandatory.
+- Documentation is crucial.
+- Type safety is required.
+- Tests are mandatory; no exceptions.
 - Maintain backward compatibility where possible.
-- Bump version numbers appropriately following semantic versioning.
-- Communicate changes clearly.
-
-// Compliance and Security Validation
-Ensure:
-- Code is reviewed for compliance adherence.
-- Security controls are implemented and effective.
-- Automated checks are in place for compliance validation.
-- Secrets and sensitive data are handled securely.
-- Regularly audit code for security vulnerabilities.
-
-// AI Assistant Behavior
-When using AI assistance:
-- Ensure generated code complies with these guidelines.
-- Review AI-suggested code for correctness and compliance.
-- Do not rely solely on AI for critical code sections.
-- Use AI assistance to augment, not replace, developer expertise.
-- Provide clear prompts and verify outputs.
-
-// Conclusion
-By adhering to these guidelines, we ensure that our codebase remains maintainable, secure, and of high quality. These rules are designed to foster best practices and facilitate collaboration across the team. Always strive for excellence in your work and support your colleagues in doing the same.
+- Clean resource management; no dangling resources.
+- Provide clear migration paths for breaking changes.
+- Support and mentor colleagues to maintain standards.
+
+Prohibited Patterns
+- Avoid naming collisions when using variable-based naming schemes.
+- No placeholder code comments or implementations.
+- No `Any` types.
+- No broad `except Exception`.
+- No ignoring the linter or type checker without a valid reason.
+- No global mutable state; no circular dependencies.
+- No direct resource creation in __init__.py.
+- No hardcoded credentials or secrets.
+- No TODO/FIXME placeholders without solutions.
+
+Required Patterns
+- Define __all__ in __init__.py to control the public API.
+- Use Pydantic/TypedDict for config handling.
+- Encapsulate resource logic in classes; no leakage.
+- Explicit error handling and logging; no silent failures.
+- Use dependency injection for better testability.
+- Maintain strict separation of concerns and boundaries.
+- Keep code clean, readable, and well-organized.
+
+Dependency Management
+- Use Poetry for dependencies and virtual envs.
+- Keep dependencies updated.
+
+No Omission or Regression Allowed
+- Do not remove or weaken any rules.
+- Do not degrade code completeness, quality, security, or compliance.
+- Rigorously improve and reinforce adherence to all standards.
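The module structure rules above (public API in `__init__.py` via `__all__`, Pydantic configs in `types.py`, no resource creation at import time) can be made concrete with a short sketch. The `example` module, its class names, and file paths below are hypothetical illustrations, not code from this repository:

```python
# Hypothetical layout sketch for a rules-conforming module; names are illustrative.

# modules/example/types.py
from typing import Dict

from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):
    """Validated configuration for the hypothetical example module."""

    enabled: bool = False
    namespace: str = "default"
    labels: Dict[str, str] = Field(default_factory=dict)


# modules/example/__init__.py would then contain only re-exports:
#
#     from .types import ExampleConfig
#     from .resources import ExampleResourceManager
#
#     __all__ = ["ExampleConfig", "ExampleResourceManager"]
#
# No resources are created at import time; resources.py owns all lifecycle logic.
```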
diff --git a/.github/prompt_templates/new_prompt.txt b/.github/prompt_templates/new_prompt.txt
index af07723..25abdad 100644
--- a/.github/prompt_templates/new_prompt.txt
+++ b/.github/prompt_templates/new_prompt.txt
@@ -6,3 +6,13 @@
 When you're ready I'll show you the code we're going to work on next. Let me kno
 
 Before I show you the code we need to enhance, let me first get you familiarized with the fundamentals.
+Here is the config spec we're developing against for now.
+
+
+Here's the entrypoint
+
+
+Here is the core module, which encapsulates only logic that is not related to any aws, azure, or kubernetes module or submodule, or any other cloud provider. It is just core module common code and that is all.
+
+
+Let me know when you're familiar and ready for the code we are working on next.
diff --git a/modules/core/deployment.py b/modules/core/deployment.py
index 9e10f53..0bb72c9 100644
--- a/modules/core/deployment.py
+++ b/modules/core/deployment.py
@@ -29,34 +29,21 @@ def __init__(self, init_config: InitializationConfig, config_manager: Any):
     def deploy_modules(self, modules_to_deploy: List[str]) -> None:
         for module_name in modules_to_deploy:
             try:
-                if module_name == "kubernetes":
-                    # Get kubernetes config
-                    k8s_config = self.config_manager.get_module_config(module_name)
-
-                    # Deploy each enabled kubernetes submodule
-                    if k8s_config.get("prometheus", {}).get("enabled"):
-                        self.deploy_k8s_submodule("prometheus", k8s_config.get("prometheus", {}))
-
-                    if k8s_config.get("flux", {}).get("enabled"):
-                        self.deploy_k8s_submodule("flux", k8s_config.get("flux", {}))
-
-                    if k8s_config.get("crossplane", {}).get("enabled"):
-                        self.deploy_k8s_submodule("crossplane", k8s_config.get("crossplane", {}))
+                # Standard module deployment for all modules including kubernetes
+                module_class = self.load_module(module_name)
+                module_config = self.config_manager.get_module_config(module_name)
+                module_config["compliance"] = self.init_config.compliance_config.model_dump()
+
+                module_instance = module_class(init_config=self.init_config)
+                result = module_instance.deploy(module_config)
+
+                if result.success:
+                    self.modules_metadata[module_name] = result.metadata
+                    if module_name == "aws" and "k8s_provider" in result.metadata:
+                        self.k8s_provider = result.metadata["k8s_provider"]
                 else:
-                    # Standard module deployment
-                    module_class = self.load_module(module_name)
-                    module_config = self.config_manager.get_module_config(module_name)
-                    module_config["compliance"] = self.init_config.compliance_config.model_dump()
-
-                    module_instance = module_class(init_config=self.init_config)
-                    result = module_instance.deploy(module_config)
-
-                    if result.success:
-                        self.modules_metadata[module_name] = result.metadata
-                        if module_name == "aws" and "k8s_provider" in result.metadata:
-                            self.k8s_provider = result.metadata["k8s_provider"]
-                    else:
-                        raise ModuleDeploymentError(f"Module {module_name} deployment failed.")
+                    raise ModuleDeploymentError(f"Module {module_name} deployment failed.")
+
             except Exception as e:
                 raise ModuleDeploymentError(f"Error deploying module {module_name}: {str(e)}") from e
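The refactored `deploy_modules` above treats every module uniformly: load the class, inject the compliance config, call `deploy`, and record metadata. A hedged sketch of the contract a module must satisfy follows; the real `ModuleDeploymentResult` lives in `modules/core/interfaces.py`, and this stand-in only mirrors the fields this diff actually uses:

```python
# Sketch of the module contract deploy_modules() relies on; field names are
# inferred from this diff, not copied from modules/core/interfaces.py.
from typing import Any, Dict, List, Optional


class ModuleDeploymentResult:  # illustrative stand-in, not the real class
    def __init__(
        self,
        success: bool,
        version: str,
        resources: Optional[List[Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.success = success
        self.version = version
        self.resources = resources or []
        self.metadata = metadata or {}


class ExampleModule:
    """Shape every class returned by load_module() must follow."""

    def __init__(self, init_config: Any) -> None:
        self.init_config = init_config

    def deploy(self, config: Dict[str, Any]) -> ModuleDeploymentResult:
        # config["compliance"] was injected by DeploymentManager before this call.
        return ModuleDeploymentResult(success=True, version="0.0.1")
```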
may not be enabled") + log.warn("No Kubernetes providers available - skipping deployment") return ModuleDeploymentResult( success=True, version="0.0.1", metadata={"status": "no_providers_available"} ) - log.info(f"Found {len(providers)} Kubernetes providers") - deployed_resources = [] - deployment_metadata = {} + # Simple provider count + log.info(f"Found {len(providers)} Kubernetes clusters for deployment") + + # List enabled modules + enabled_submodules = [] + if config.get("prometheus", {}).get("enabled"): enabled_submodules.append("prometheus") + if config.get("flux", {}).get("enabled"): enabled_submodules.append("flux") + if config.get("crossplane", {}).get("enabled"): enabled_submodules.append("crossplane") + log.info(f"Enabled modules: {', '.join(enabled_submodules)}") - # Deploy to each available provider + # List target clusters simply for provider_id, context in providers.items(): - log.info(f"Deploying to Kubernetes cluster: {context.cluster_name} ({provider_id})") - cluster_metadata = { - "name": context.cluster_name, - "platform": context.platform, - "environment": context.environment, - "region": context.region, - "submodules": {} - } + log.info(f"Target cluster: {context.cluster_name} ({context.environment})") + + # Create deployment summary for metadata + deployment_summary = { + "total_clusters": len(providers), + "clusters": [ + { + "name": ctx.cluster_name, + "platform": ctx.platform, + "environment": ctx.environment, + "region": ctx.region, + "provider_id": pid, + "target_modules": enabled_submodules + } + for pid, ctx in providers.items() + ], + "enabled_modules": enabled_submodules + } + + deployment_results = {} + all_resources = [] + failed_deployments = [] + + # Deploy to each cluster + for provider_id, context in providers.items(): + try: + log.info(f"Starting deployment to cluster: {context.cluster_name}") + + # Initialize cluster metadata + cluster_metadata = { + "name": context.cluster_name, + "platform": context.platform, + "environment": context.environment, + "region": context.region, + "provider_id": provider_id, + "submodules": {}, + "status": "success" + } + + # Deploy each enabled submodule + submodules = { + "prometheus": (config.get("prometheus", {}).get("enabled"), self._deploy_prometheus), + "flux": (config.get("flux", {}).get("enabled"), self._deploy_flux), + "crossplane": (config.get("crossplane", {}).get("enabled"), self._deploy_crossplane) + } + + for submodule_name, (enabled, deploy_func) in submodules.items(): + if enabled: + log.info(f"Deploying {submodule_name} to cluster {context.cluster_name}") + submodule_config = config.get(submodule_name, {}) + + result = deploy_func( + provider=context.provider, + config=submodule_config, + context=context + ) + + if result: + all_resources.extend(result.get("resources", [])) + cluster_metadata["submodules"][submodule_name] = { + "status": "success", + "metadata": result.get("metadata", {}), + "version": submodule_config.get("version") + } + else: + cluster_metadata["submodules"][submodule_name] = { + "status": "failed", + "error": f"Failed to deploy {submodule_name}" + } + cluster_metadata["status"] = "partial_failure" + + deployment_results[provider_id] = cluster_metadata + + except Exception as e: + error_msg = f"Failed to deploy to cluster {context.cluster_name}: {str(e)}" + log.error(error_msg) + failed_deployments.append({ + "cluster": context.cluster_name, + "provider_id": provider_id, + "error": error_msg + }) + deployment_results[provider_id] = { + **cluster_metadata, + "status": "failed", + 
"error": error_msg + } + + # Determine overall success + success = len(failed_deployments) == 0 + status = "success" if success else "partial_failure" if deployment_results else "failed" - # Deploy Prometheus if enabled - if config.get("prometheus", {}).get("enabled"): - log.info(f"Deploying Prometheus to cluster: {context.cluster_name}") - prometheus_result = self._deploy_prometheus( - provider=context.provider, - config=config.get("prometheus", {}), - context=context - ) - if prometheus_result: - deployed_resources.extend(prometheus_result.get("resources", [])) - cluster_metadata["submodules"]["prometheus"] = prometheus_result.get("metadata", {}) - - # Deploy Flux if enabled - if config.get("flux", {}).get("enabled"): - log.info(f"Deploying Flux to cluster: {context.cluster_name}") - flux_result = self._deploy_flux( - provider=context.provider, - config=config.get("flux", {}), - context=context - ) - if flux_result: - deployed_resources.extend(flux_result.get("resources", [])) - cluster_metadata["submodules"]["flux"] = flux_result.get("metadata", {}) - - # Deploy Crossplane if enabled - if config.get("crossplane", {}).get("enabled"): - log.info(f"Deploying Crossplane to cluster: {context.cluster_name}") - crossplane_result = self._deploy_crossplane( - provider=context.provider, - config=config.get("crossplane", {}), - context=context - ) - if crossplane_result: - deployed_resources.extend(crossplane_result.get("resources", [])) - cluster_metadata["submodules"]["crossplane"] = crossplane_result.get("metadata", {}) - - deployment_metadata[provider_id] = cluster_metadata - - log.info(f"Successfully deployed to {len(deployment_metadata)} clusters") return ModuleDeploymentResult( - success=True, + success=success, version="0.0.1", - resources=deployed_resources, + resources=all_resources, metadata={ - "clusters": deployment_metadata, + "status": status, + "clusters": deployment_results, "total_clusters": len(providers), - "successful_deployments": len([ - cluster for cluster in deployment_metadata.values() - if cluster["submodules"] - ]) + "successful_clusters": len(providers) - len(failed_deployments), + "failed_deployments": failed_deployments, + "deployment_timestamp": datetime.now(timezone.utc).isoformat(), + "deployment_summary": deployment_summary } ) @@ -229,10 +281,34 @@ def _deploy_flux( from .flux.deployment import FluxModule module = FluxModule(self.init_config) module.set_provider(KubernetesProvider(provider)) - result = module.deploy(config) + + # Add cluster-specific metadata to config + flux_config = config.copy() + flux_config.setdefault("labels", {}).update({ + "cluster-name": context.cluster_name, + "platform": context.platform, + "environment": context.environment, + "region": context.region, + }) + + result = module.deploy(flux_config) return { "resources": result.resources, - "metadata": result.metadata + "metadata": { + **result.metadata, + "cluster_context": { + "name": context.cluster_name, + "platform": context.platform, + "environment": context.environment, + "region": context.region, + }, + "components": flux_config.get("components", []), + "reconcile_interval": flux_config.get("reconcile_interval"), + "storage_config": { + "class": flux_config.get("storage_class"), + "size": flux_config.get("storage_size"), + } + } } except Exception as e: log.error(f"Failed to deploy Flux: {str(e)}") @@ -249,11 +325,69 @@ def _deploy_crossplane( from .crossplane.deployment import CrossplaneModule module = CrossplaneModule(self.init_config) 
diff --git a/modules/kubernetes/types.py b/modules/kubernetes/types.py
index 2cc98e7..30bcf0a 100644
--- a/modules/kubernetes/types.py
+++ b/modules/kubernetes/types.py
@@ -1,9 +1,11 @@
 # ./modules/kubernetes/types.py
+
 """
 Kubernetes submodule shared types
 """
-from typing import Dict, List, Optional, Any
-from pydantic import BaseModel, Field
+from typing import Dict, List, Optional, Any, Union
+from pydantic import BaseModel, Field, validator
+from datetime import datetime, timezone
 from ..core.types import ComplianceConfig
 
 class KubernetesSubmoduleConfig(BaseModel):
@@ -11,6 +13,22 @@ class KubernetesSubmoduleConfig(BaseModel):
     enabled: bool = False
     namespace: str = "default"
     version: str = "latest"
+    labels: Dict[str, str] = Field(default_factory=dict)
+    annotations: Dict[str, str] = Field(default_factory=dict)
+    cluster_selector: Optional[Dict[str, str]] = Field(
+        default_factory=dict,
+        description="Labels to select which clusters to deploy to"
+    )
+    deployment_strategy: str = Field(
+        default="parallel",
+        description="Deploy to clusters in 'parallel' or 'sequential'"
+    )
+
+    @validator('deployment_strategy')
+    def validate_strategy(cls, v: str) -> str:
+        if v not in ['parallel', 'sequential']:
+            raise ValueError("deployment_strategy must be 'parallel' or 'sequential'")
+        return v
 
 class PrometheusConfig(KubernetesSubmoduleConfig):
     """Prometheus specific configuration."""
@@ -18,8 +36,6 @@ class PrometheusConfig(KubernetesSubmoduleConfig):
     namespace: str = "monitoring"
     version: str = "45.7.1"
     openunison_enabled: bool = False
-    labels: Dict[str, str] = Field(default_factory=dict)
-    annotations: Dict[str, str] = Field(default_factory=dict)
     storage_class: Optional[str] = None
     storage_size: Optional[str] = None
     retention_size: Optional[str] = None
@@ -29,6 +45,8 @@ class PrometheusConfig(KubernetesSubmoduleConfig):
     node_selector: Optional[Dict[str, str]] = Field(default_factory=dict)
     tolerations: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
     affinity: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    alertmanager: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    grafana: Optional[Dict[str, Any]] = Field(default_factory=dict)
 
 class FluxConfig(KubernetesSubmoduleConfig):
     """Flux specific configuration."""
@@ -46,8 +64,9 @@ class FluxConfig(KubernetesSubmoduleConfig):
     concurrent_reconciles: int = 10
     requeue_dependency_interval: str = "5s"
     components: List[str] = Field(default_factory=list)
-    labels: Dict[str, str] = Field(default_factory=dict)
-    annotations: Dict[str, str] = Field(default_factory=dict)
+    git_repositories: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
+    kustomizations: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
+    helm_repositories: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
 
 class CrossplaneConfig(KubernetesSubmoduleConfig):
     """Crossplane specific configuration."""
@@ -59,21 +78,93 @@ class CrossplaneConfig(KubernetesSubmoduleConfig):
     kubernetes_provider_version: str = "v0.10.0"
     enable_external_secret_stores: bool = True
     enable_composition_revisions: bool = True
-    labels: Dict[str, str] = Field(default_factory=dict)
-    annotations: Dict[str, str] = Field(default_factory=dict)
-    provider_configs: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    provider_configs: Dict[str, Any] = Field(default_factory=dict)
+    compositions: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
+    composite_resources: Optional[List[Dict[str, Any]]] = Field(default_factory=list)
+
+class ClusterDeploymentStatus(BaseModel):
+    """Status of deployment to a specific cluster."""
+    cluster_name: str
+    provider_id: str
+    platform: str
+    environment: str
+    region: str
+    status: str = "pending"  # pending, success, failed, partial_failure
+    error: Optional[str] = None
+    start_time: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+    end_time: Optional[datetime] = None
+    submodules: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
 
 class KubernetesConfig(BaseModel):
     """Root Kubernetes configuration."""
     prometheus: Optional[PrometheusConfig] = Field(default_factory=PrometheusConfig)
     flux: Optional[FluxConfig] = Field(default_factory=FluxConfig)
     crossplane: Optional[CrossplaneConfig] = Field(default_factory=CrossplaneConfig)
+    deployment_timeout: int = Field(
+        default=600,
+        description="Global deployment timeout in seconds"
+    )
+    retry_attempts: int = Field(
+        default=3,
+        description="Number of retry attempts for failed deployments"
+    )
+    cluster_concurrency: int = Field(
+        default=4,
+        description="Maximum number of concurrent cluster deployments"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
 
     @classmethod
     def merge(cls, user_config: Dict[str, Any]) -> "KubernetesConfig":
         """Merge user configuration with defaults."""
-        config = cls()
+        base_config = cls()
+
+        # Deep merge the configurations
         for key, value in user_config.items():
-            if hasattr(config, key):
-                setattr(config, key, value)
-        return config
+            if hasattr(base_config, key):
+                current_value = getattr(base_config, key)
+                if isinstance(current_value, (PrometheusConfig, FluxConfig, CrossplaneConfig)):
+                    # Merge submodule configs
+                    merged_value = {**current_value.dict(), **(value or {})}
+                    setattr(base_config, key, type(current_value)(**merged_value))
+                else:
+                    setattr(base_config, key, value)
+
+        return base_config
+
+class DeploymentResult(BaseModel):
+    """Result of a deployment operation."""
+    success: bool
+    cluster_results: Dict[str, ClusterDeploymentStatus]
+    total_clusters: int
+    successful_clusters: int
+    failed_clusters: int
+    start_time: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+    end_time: Optional[datetime] = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    @property
+    def duration(self) -> float:
+        """Calculate deployment duration in seconds."""
+        if self.end_time:
+            return (self.end_time - self.start_time).total_seconds()
+        return 0.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary format."""
+        return {
+            "success": self.success,
+            "cluster_results": {
+                name: status.dict() for name, status in self.cluster_results.items()
+            },
+            "statistics": {
+                "total_clusters": self.total_clusters,
+                "successful_clusters": self.successful_clusters,
+                "failed_clusters": self.failed_clusters,
+                "success_rate": (self.successful_clusters / self.total_clusters * 100) if self.total_clusters > 0 else 0,
+                "duration_seconds": self.duration
+            },
+            "metadata": self.metadata
+        }
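For reference, a short usage sketch of `KubernetesConfig.merge()` as defined above; the import path is assumed from the repository layout shown in this diff:

```python
# Usage sketch for KubernetesConfig.merge(); import path assumed from this diff.
from modules.kubernetes.types import KubernetesConfig

user_config = {
    "prometheus": {"enabled": True, "storage_class": "gp3"},
    "retry_attempts": 5,
}

cfg = KubernetesConfig.merge(user_config)

assert cfg.prometheus is not None
assert cfg.prometheus.enabled is True
assert cfg.prometheus.version == "45.7.1"  # submodule defaults survive the deep merge
assert cfg.retry_attempts == 5  # top-level scalars are overridden directly
```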