# File-listing metadata (was bare text, which is a Python syntax error): 133 lines, 7.1 KiB, Python

from typing import List, Dict, Optional
from pathlib import Path
import shutil
import tempfile
import logging
from configuration import Configuration
from rule_structure import SourceRule, AssetRule
from .asset_context import AssetProcessingContext
from .stages.base_stage import ProcessingStage
log = logging.getLogger(__name__)
class PipelineOrchestrator:
    """
    Orchestrates the processing of assets based on source rules and a series
    of processing stages.

    Each asset rule within a source rule is run through every configured
    stage in order.  A stage may end an asset's run early by raising (the
    asset is marked failed) or by setting the ``skip_asset`` status flag on
    the context (the asset is marked skipped).
    """

    def __init__(self, config_obj: Configuration, stages: List[ProcessingStage]):
        """
        Initializes the PipelineOrchestrator.

        Args:
            config_obj: The main configuration object.
            stages: A list of processing stages to be executed in order.
        """
        self.config_obj: Configuration = config_obj
        self.stages: List[ProcessingStage] = stages

    def process_source_rule(
        self,
        source_rule: SourceRule,
        workspace_path: Path,
        output_base_path: Path,
        overwrite: bool,  # Not used yet; kept in the signature for future overwrite support.
        incrementing_value: Optional[str],
        sha5_value: Optional[str],  # NOTE(review): name kept for caller compatibility; forwarded as the context's sha256_value.
    ) -> Dict[str, List[str]]:
        """
        Processes a single source rule, iterating through its asset rules and
        applying all stages.

        Args:
            source_rule: The source rule to process.
            workspace_path: The base path of the workspace (the source files,
                e.g. an extracted archive).
            output_base_path: The base path for output files.
            overwrite: Whether to overwrite existing files (not implemented yet).
            incrementing_value: An optional incrementing value for versioning
                or naming.
            sha5_value: An optional hash value for the asset; stored on the
                processing context as ``sha256_value``.

        Returns:
            A dictionary with "processed", "skipped" and "failed" lists of
            asset names summarizing the outcome for every asset rule.
        """
        overall_status: Dict[str, List[str]] = {
            "processed": [],
            "skipped": [],
            "failed": [],
        }
        engine_temp_dir_path: Optional[Path] = None
        try:
            # One temporary directory for the whole source-rule run (not per
            # asset); individual stages may create their own sub-temp dirs.
            engine_temp_dir_path = Path(
                tempfile.mkdtemp(
                    prefix="asset_processor_orchestrator_temp_",
                    dir=self.config_obj.get_temp_directory_base(),
                )
            )
            log.debug(f"PipelineOrchestrator created temporary directory: {engine_temp_dir_path}")
            for asset_rule in source_rule.assets:
                log.debug(f"Orchestrator: Processing asset '{asset_rule.name}'")
                context = self._build_context(
                    source_rule,
                    asset_rule,
                    workspace_path,
                    engine_temp_dir_path,
                    output_base_path,
                    incrementing_value,
                    sha5_value,
                )
                context = self._run_stages(context)
                self._record_status(asset_rule.name, context, overall_status)
        except Exception as e:
            log.error(f"PipelineOrchestrator.process_source_rule failed: {e}", exc_info=True)
            # A top-level error aborts the loop; mark every asset with no
            # recorded outcome as failed so the summary stays complete.
            already_recorded = set(
                overall_status["processed"] + overall_status["skipped"] + overall_status["failed"]
            )
            for asset_rule in source_rule.assets:
                if asset_rule.name not in already_recorded:
                    overall_status["failed"].append(f"{asset_rule.name} (Orchestrator Error)")
        finally:
            if engine_temp_dir_path and engine_temp_dir_path.exists():
                log.debug(f"PipelineOrchestrator cleaning up temporary directory: {engine_temp_dir_path}")
                # ignore_errors=True already suppresses removal failures, so
                # no surrounding try/except is needed (the original wrapper
                # was dead code).
                shutil.rmtree(engine_temp_dir_path, ignore_errors=True)
        return overall_status

    def _build_context(
        self,
        source_rule: SourceRule,
        asset_rule: AssetRule,
        workspace_path: Path,
        engine_temp_dir: Path,
        output_base_path: Path,
        incrementing_value: Optional[str],
        sha256_value: Optional[str],
    ) -> AssetProcessingContext:
        """Creates a fresh processing context for one asset rule."""
        return AssetProcessingContext(
            source_rule=source_rule,
            asset_rule=asset_rule,
            workspace_path=workspace_path,  # Path to the source files (e.g. extracted archive).
            engine_temp_dir=engine_temp_dir,  # Shared orchestrator temp dir.
            output_base_path=output_base_path,
            effective_supplier=None,  # Set later by SupplierDeterminationStage.
            asset_metadata={},  # Populated by stages.
            processed_maps_details={},  # Populated by stages.
            merged_maps_details={},  # Populated by stages.
            files_to_process=[],  # Populated by FileRuleFilterStage.
            loaded_data_cache={},  # Image-loading cache scoped to this asset.
            config_obj=self.config_obj,
            status_flags={"skip_asset": False, "asset_failed": False},  # Common flags.
            incrementing_value=incrementing_value,
            sha256_value=sha256_value,
        )

    def _run_stages(self, context: AssetProcessingContext) -> AssetProcessingContext:
        """
        Executes all stages on the context in order, stopping early when a
        stage raises (asset marked failed) or sets the ``skip_asset`` flag.

        Returns:
            The context as left by the last stage that ran.
        """
        asset_name = context.asset_rule.name
        for stage_idx, stage in enumerate(self.stages):
            stage_name = stage.__class__.__name__  # Hoisted: used in up to three messages below.
            log.debug(f"Asset '{asset_name}': Executing stage {stage_idx + 1}/{len(self.stages)}: {stage_name}")
            try:
                context = stage.execute(context)
            except Exception as e:
                log.error(f"Asset '{asset_name}': Error during stage '{stage_name}': {e}", exc_info=True)
                context.status_flags["asset_failed"] = True
                context.asset_metadata["status"] = f"Failed: Error in stage {stage_name}"
                context.asset_metadata["error_message"] = str(e)
                break  # Stop processing stages for this asset on error.
            if context.status_flags.get("skip_asset"):
                log.info(f"Asset '{asset_name}': Skipped by stage '{stage_name}'. Reason: {context.status_flags.get('skip_reason', 'N/A')}")
                break  # Skip remaining stages for this asset.
        return context

    def _record_status(
        self,
        asset_name: str,
        context: AssetProcessingContext,
        overall_status: Dict[str, List[str]],
    ) -> None:
        """Classifies one asset's final outcome into processed/skipped/failed."""
        if context.status_flags.get("skip_asset"):
            overall_status["skipped"].append(asset_name)
        elif context.status_flags.get("asset_failed") or str(context.asset_metadata.get("status", "")).startswith("Failed"):
            overall_status["failed"].append(asset_name)
        elif context.asset_metadata.get("status") == "Processed":
            overall_status["processed"].append(asset_name)
        else:
            # No stage produced a recognizable terminal status; treat this as
            # a failure rather than silently dropping the asset.
            log.warning(f"Asset '{asset_name}': Unknown status after pipeline execution. Metadata status: '{context.asset_metadata.get('status')}'. Marking as failed.")
            overall_status["failed"].append(f"{asset_name} (Unknown Status: {context.asset_metadata.get('status')})")
        log.debug(f"Asset '{asset_name}' final status: {context.asset_metadata.get('status', 'N/A')}, Flags: {context.status_flags}")