Asset-Frameworker/processing_engine.py
2025-05-09 13:57:22 +02:00

211 lines
10 KiB
Python

# processing_engine.py
import os
import math
import shutil
import tempfile
import logging
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set
# Create the module logger BEFORE the guarded imports below: their ImportError
# handlers call log.error(), so the name must already be bound or the handler
# itself would die with NameError and mask the real import problem.
log = logging.getLogger(__name__)

# Attempt to import image processing libraries
try:
    import cv2
    import numpy as np
except ImportError as e:
    log.error(f"Failed to import cv2 or numpy in processing_engine.py: {e}", exc_info=True)
    print("ERROR: Missing required image processing libraries. Please install opencv-python and numpy:")
    print("pip install opencv-python numpy")
    # Allow import to fail but log error; execution will likely fail later
    cv2 = None
    np = None

try:
    from configuration import Configuration, ConfigurationError
    from rule_structure import SourceRule, AssetRule, FileRule
    from utils.path_utils import generate_path_from_pattern, sanitize_filename
    from processing.utils import image_processing_utils as ipu  # Corrected import
except ImportError as e:
    # Keep the console output as well: handlers may not be configured yet, in
    # which case the log record only reaches logging's last-resort stderr handler.
    print(f"ERROR during initial imports in processing_engine.py: {e}")
    log.error(f"Failed to import Configuration or rule_structure classes in processing_engine.py: {e}", exc_info=True)
    print("ERROR: Cannot import Configuration or rule_structure classes.")
    print("Ensure configuration.py and rule_structure.py are in the same directory or Python path.")
    # Allow import to fail but log error; execution will likely fail later
    Configuration = None
    SourceRule = None
    AssetRule = None
    FileRule = None

# Basic config if logger hasn't been set up elsewhere (e.g., during testing).
# Deliberately left AFTER the imports above so that any logging setup performed
# by those modules still takes precedence over this fallback.
if not log.hasHandlers():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
# Use logger defined in main.py (or configure one here if run standalone)
from processing.pipeline.orchestrator import PipelineOrchestrator
# from processing.pipeline.asset_context import AssetProcessingContext # AssetProcessingContext is used by the orchestrator
from processing.pipeline.stages.supplier_determination import SupplierDeterminationStage
from processing.pipeline.stages.asset_skip_logic import AssetSkipLogicStage
from processing.pipeline.stages.metadata_initialization import MetadataInitializationStage
from processing.pipeline.stages.file_rule_filter import FileRuleFilterStage
from processing.pipeline.stages.gloss_to_rough_conversion import GlossToRoughConversionStage
from processing.pipeline.stages.alpha_extraction_to_mask import AlphaExtractionToMaskStage
from processing.pipeline.stages.normal_map_green_channel import NormalMapGreenChannelStage
from processing.pipeline.stages.individual_map_processing import IndividualMapProcessingStage
from processing.pipeline.stages.map_merging import MapMergingStage
from processing.pipeline.stages.metadata_finalization_save import MetadataFinalizationAndSaveStage
from processing.pipeline.stages.output_organization import OutputOrganizationStage
# --- Custom Exception ---
class ProcessingEngineError(Exception):
    """Error type raised by the processing engine for invalid setup or call arguments."""
# Helper functions moved to processing.utils.image_processing_utils
# --- Processing Engine Class ---
class ProcessingEngine:
    """
    Handles the core processing pipeline for assets based on explicit rules
    provided in a SourceRule object and static configuration.
    It does not perform classification, prediction, or rule fallback internally.
    """

    def __init__(self, config_obj: Configuration):
        """
        Initializes the processing engine with static configuration.

        Args:
            config_obj: The loaded Configuration object containing static settings.

        Raises:
            ProcessingEngineError: If OpenCV, NumPy, Configuration or SourceRule
                failed to import at module load, or if ``config_obj`` is not a
                Configuration instance.
        """
        if cv2 is None or np is None or Configuration is None or SourceRule is None:
            raise ProcessingEngineError("Essential libraries (OpenCV, NumPy) or classes (Configuration, SourceRule) are not available.")
        if not isinstance(config_obj, Configuration):
            raise ProcessingEngineError("config_obj must be a valid Configuration object.")

        self.config_obj: Configuration = config_obj
        self.temp_dir: Optional[Path] = None  # Path to the temporary working directory for a process run
        self.loaded_data_cache: dict = {}  # Cache for loaded/resized data within a single process call
        # Per-run optional path-token values; overwritten at the start of every process() call.
        self.current_incrementing_value: Optional[str] = None
        self.current_sha5_value: Optional[str] = None

        # --- Pipeline Orchestrator Setup ---
        self.stages = [
            SupplierDeterminationStage(),
            AssetSkipLogicStage(),
            MetadataInitializationStage(),
            FileRuleFilterStage(),
            GlossToRoughConversionStage(),
            AlphaExtractionToMaskStage(),
            NormalMapGreenChannelStage(),
            IndividualMapProcessingStage(),
            MapMergingStage(),
            MetadataFinalizationAndSaveStage(),
            OutputOrganizationStage(),
        ]
        try:
            self.pipeline_orchestrator = PipelineOrchestrator(config_obj=self.config_obj, stages=self.stages)
            log.info("PipelineOrchestrator initialized successfully in ProcessingEngine.")
        except Exception as e:
            # Construction failure is tolerated here; process() then reports
            # every asset as failed instead of raising.
            log.error(f"Failed to initialize PipelineOrchestrator in ProcessingEngine: {e}", exc_info=True)
            self.pipeline_orchestrator = None  # Ensure it's None if init fails
        log.debug("ProcessingEngine initialized.")

    def process(
        self,
        source_rule: SourceRule,
        workspace_path: Path,
        output_base_path: Path,
        overwrite: bool = False,
        incrementing_value: Optional[str] = None,
        sha5_value: Optional[str] = None
    ) -> Dict[str, List[str]]:
        """
        Executes the processing pipeline for all assets defined in the SourceRule.

        Args:
            source_rule: The SourceRule object containing explicit instructions for all assets and files.
            workspace_path: The path to the directory containing the source files (e.g., extracted archive).
            output_base_path: The base directory where processed output will be saved.
            overwrite: If True, forces reprocessing even if output exists for an asset.
            incrementing_value: Optional incrementing value for path tokens.
            sha5_value: Optional SHA5 hash value for path tokens.

        Returns:
            Dict[str, List[str]]: A dictionary summarizing the status of each asset:
                {"processed": [asset_name1, ...],
                 "skipped": [asset_name2, ...],
                 "failed": [asset_name3, ...]}

        Raises:
            ProcessingEngineError: If ``source_rule`` is not a SourceRule,
                ``workspace_path`` is not an existing directory Path, or
                ``output_base_path`` is not a Path. Errors raised while the
                pipeline runs are caught, logged and reflected in the returned
                status instead of propagating.
        """
        # Validate BEFORE touching any attribute of the arguments, so a wrong
        # type surfaces as ProcessingEngineError rather than AttributeError.
        if not isinstance(source_rule, SourceRule):
            raise ProcessingEngineError("process() requires a valid SourceRule object.")
        if not isinstance(workspace_path, Path) or not workspace_path.is_dir():
            raise ProcessingEngineError(f"Invalid workspace path provided: {workspace_path}")
        if not isinstance(output_base_path, Path):
            raise ProcessingEngineError(f"Invalid output base path provided: {output_base_path}")

        log.info(f"VERIFY: ProcessingEngine.process called with rule for input: {source_rule.input_path}")  # DEBUG Verify
        log.debug(f"  VERIFY Rule Details: {source_rule}")  # DEBUG Verify (Optional detailed log)
        log.info(f"ProcessingEngine starting process for {len(source_rule.assets)} asset(s) defined in SourceRule.")

        overall_status = {"processed": [], "skipped": [], "failed": []}
        self.loaded_data_cache = {}  # Reset cache for this run

        # Store incoming optional values for use in path generation
        self.current_incrementing_value = incrementing_value
        self.current_sha5_value = sha5_value
        log.debug(f"Received incrementing_value: {self.current_incrementing_value}, sha5_value: {self.current_sha5_value}")

        # Use a temporary directory for intermediate files (like saved maps)
        try:
            self.temp_dir = Path(tempfile.mkdtemp(prefix=self.config_obj.temp_dir_prefix))
            log.debug(f"Created temporary workspace for engine: {self.temp_dir}")

            # --- NEW PIPELINE ORCHESTRATOR LOGIC ---
            if getattr(self, 'pipeline_orchestrator', None):
                log.info("Processing source rule using PipelineOrchestrator.")
                overall_status = self.pipeline_orchestrator.process_source_rule(
                    source_rule=source_rule,
                    workspace_path=workspace_path,  # This is the path to the source files (e.g. extracted archive)
                    output_base_path=output_base_path,
                    overwrite=overwrite,
                    incrementing_value=self.current_incrementing_value,
                    sha5_value=self.current_sha5_value
                )
            else:
                log.error(f"PipelineOrchestrator not available for SourceRule '{source_rule.input_path}'. Marking all {len(source_rule.assets)} assets as failed.")
                for asset_rule in source_rule.assets:
                    overall_status["failed"].append(asset_rule.asset_name)

            log.info(f"ProcessingEngine finished. Summary: {overall_status}")
            return overall_status
        except Exception as e:
            log.exception(f"Processing engine failed unexpectedly: {e}")
            # Every asset without a recorded outcome is reported as failed;
            # names already present in any list are not added a second time.
            accounted_for = set(
                overall_status["processed"] + overall_status["skipped"] + overall_status["failed"]
            )
            for asset_rule in source_rule.assets:
                if asset_rule.asset_name not in accounted_for:
                    overall_status["failed"].append(asset_rule.asset_name)
                    accounted_for.add(asset_rule.asset_name)
            return overall_status  # Return partial status if possible
        finally:
            self._cleanup_workspace()

    def _cleanup_workspace(self):
        """Removes the temporary workspace directory if it exists and clears the data cache."""
        if self.temp_dir and self.temp_dir.exists():
            workspace = self.temp_dir
            try:
                log.debug(f"Cleaning up engine temporary workspace: {workspace}")
                # Ignore errors during cleanup (e.g., permission errors on copied .git files)
                shutil.rmtree(workspace, ignore_errors=True)
                self.temp_dir = None
                # ignore_errors=True hides failures, so check the outcome
                # before claiming success in the log.
                if workspace.exists():
                    log.warning(f"Engine temporary workspace could not be fully removed: {workspace}")
                else:
                    log.debug("Engine temporary workspace cleaned up successfully.")
            except Exception as e:
                log.error(f"Failed to remove engine temporary workspace {workspace}: {e}", exc_info=True)
        self.loaded_data_cache = {}  # Clear cache after cleanup