LLM Restructure - UNTESTED!

2025-05-04 12:56:16 +02:00
parent 74b3d008ea
commit 01c8f68ea0
6 changed files with 467 additions and 397 deletions
--- a/gui/llm_prediction_handler.py
+++ b/gui/llm_prediction_handler.py
@@ -236,124 +236,191 @@ class LLMPredictionHandler(BasePredictionHandler):

    def _parse_llm_response(self, llm_response_json_str: str) -> List[SourceRule]:
        """
-        Parses the LLM's JSON response string into a list of SourceRule objects.
+        Parses the LLM's JSON response string (new two-part format) into a
+        list containing a single SourceRule object (empty list if cancelled).
+        Sanitizes first: strips /* */ and // comments, markdown fences, <think> tags.
        """
        # Note: Exceptions (JSONDecodeError, ValueError) raised here
        # will be caught by the _perform_prediction method's handler.

-        # Strip potential markdown code fences before parsing
+        # --- Sanitize Input String ---
        clean_json_str = llm_response_json_str.strip()
+
+        # 1. Remove multi-line /* */ comments
+        clean_json_str = re.sub(r'/\*.*?\*/', '', clean_json_str, flags=re.DOTALL)
+
+        # 2. Remove single-line // comments (handle potential URLs carefully)
+        #    A // starts a comment only when it is not directly preceded by ':'
+        #    (as in http://) and does not sit inside a quoted string.
+        lines = clean_json_str.splitlines()
+        cleaned_lines = []
+        for line in lines:
+            stripped_line = line.strip()
+            # Find the first // that isn't preceded by a : (to avoid breaking URLs like http://)
+            comment_index = -1
+            search_start = 0
+            while True:
+                idx = stripped_line.find('//', search_start)
+                if idx == -1:
+                    break # No more // found
+                if idx == 0 or stripped_line[idx-1] != ':':
+                    # Found a potential comment marker
+                    # Check if it's inside quotes
+                    in_quotes = False
+                    quote_char = ''
+                    for i in range(idx):
+                        char = stripped_line[i]
+                        if char in ('"', "'") and (i == 0 or stripped_line[i-1] != '\\'): # Handle escaped quotes
+                            if not in_quotes:
+                                in_quotes = True
+                                quote_char = char
+                            elif char == quote_char:
+                                in_quotes = False
+                                quote_char = ''
+                    if not in_quotes:
+                        comment_index = idx
+                        break # Found valid comment marker
+                    else:
+                        # // is inside quotes, continue searching after it
+                        search_start = idx + 2
+                else:
+                    # Found ://, likely a URL, continue searching after it
+                    search_start = idx + 2
+
+            if comment_index != -1:
+                # Locate the comment in the unstripped line (NOTE(review): find() takes the first match, which may precede the real comment)
+                original_comment_start = line.find(stripped_line[comment_index:])
+                cleaned_lines.append(line[:original_comment_start].rstrip())
+            else:
+                cleaned_lines.append(line)
+        clean_json_str = "\n".join(cleaned_lines)
+
+
+        # 3. Remove markdown code fences
+        clean_json_str = clean_json_str.strip()
        if clean_json_str.startswith("```json"):
            clean_json_str = clean_json_str[7:] # Remove ```json\n
        if clean_json_str.endswith("```"):
            clean_json_str = clean_json_str[:-3] # Remove ```
        clean_json_str = clean_json_str.strip() # Remove any extra whitespace

-        # --- ADDED: Remove <think> tags ---
+        # 4. Remove <think> tags (NOTE(review): a leading <think> block hides the opening fence from step 3, which has already run)
        clean_json_str = re.sub(r'<think>.*?</think>', '', clean_json_str, flags=re.DOTALL | re.IGNORECASE)
-        clean_json_str = clean_json_str.strip() # Strip again after potential removal
-        # ---------------------------------
+        clean_json_str = clean_json_str.strip()

+        # --- Parse Sanitized JSON ---
        try:
            response_data = json.loads(clean_json_str)
        except json.JSONDecodeError as e:
-            # Log the full cleaned string that caused the error for better debugging
-            error_detail = f"Failed to decode LLM JSON response: {e}\nFull Cleaned Response:\n{clean_json_str}"
-            log.error(f"ERROR: {error_detail}") # Log full error detail to console
-            raise ValueError(error_detail) # Raise the error with full detail
+            error_detail = f"Failed to decode LLM JSON response after sanitization: {e}\nSanitized Response Attempted:\n{clean_json_str}"
+            log.error(f"ERROR: {error_detail}")
+            raise ValueError(error_detail)

-        if "predicted_assets" not in response_data or not isinstance(response_data["predicted_assets"], list):
-            raise ValueError("Invalid LLM response format: 'predicted_assets' key missing or not a list.")
+        # --- Validate Top-Level Structure ---
+        if not isinstance(response_data, dict):
+             raise ValueError("Invalid LLM response: Root element is not a JSON object.")

-        source_rules = []
-        # We assume one SourceRule per input source processed by this handler instance
-        # Use self.input_source_identifier from the base class
+        if "individual_file_analysis" not in response_data or not isinstance(response_data["individual_file_analysis"], list):
+            raise ValueError("Invalid LLM response format: 'individual_file_analysis' key missing or not a list.")
+
+        if "asset_group_classifications" not in response_data or not isinstance(response_data["asset_group_classifications"], dict):
+            raise ValueError("Invalid LLM response format: 'asset_group_classifications' key missing or not a dictionary.")
+
+        # --- Prepare for Rule Creation ---
        source_rule = SourceRule(input_path=self.input_source_identifier)
-
-        # Access valid types from the settings dictionary
        valid_asset_types = list(self.llm_settings.get('asset_types', {}).keys())
        valid_file_types = list(self.llm_settings.get('file_types', {}).keys())
+        asset_rules_map: Dict[str, AssetRule] = {} # Maps group_name to AssetRule

-        for asset_data in response_data["predicted_assets"]:
+        # --- Process Individual Files and Build Rules ---
+        for file_data in response_data["individual_file_analysis"]:
            # Check for cancellation within the loop
            if self._is_cancelled:
-                log.info("LLM prediction cancelled during response parsing (assets).")
+                log.info("LLM prediction cancelled during response parsing (files).")
                return []

-            if not isinstance(asset_data, dict):
-                log.warning(f"Skipping invalid asset data (not a dict): {asset_data}")
+            if not isinstance(file_data, dict):
+                log.warning(f"Skipping invalid file data entry (not a dict): {file_data}")
                continue

-            asset_name = asset_data.get("suggested_asset_name", "Unnamed_Asset")
-            asset_type = asset_data.get("predicted_asset_type")
+            file_path_rel = file_data.get("relative_file_path")
+            file_type = file_data.get("classified_file_type")
+            group_name = file_data.get("proposed_asset_group_name") # Can be string or null
+
+            # --- Validate File Data ---
+            if not file_path_rel or not isinstance(file_path_rel, str):
+                log.warning(f"Missing or invalid 'relative_file_path' in file data: {file_data}. Skipping file.")
+                continue
+
+            if not file_type or not isinstance(file_type, str):
+                log.warning(f"Missing or invalid 'classified_file_type' for file '{file_path_rel}'. Skipping file.")
+                continue
+
+            # Handle FILE_IGNORE explicitly
+            if file_type == "FILE_IGNORE":
+                log.debug(f"Ignoring file as per LLM prediction: {file_path_rel}")
+                continue # Skip creating a rule for this file
+
+            # Validate file_type against definitions
+            if file_type not in valid_file_types:
+                log.warning(f"Invalid predicted_file_type '{file_type}' for file '{file_path_rel}'. Defaulting to EXTRA.")
+                file_type = "EXTRA"
+
+            # --- Handle Grouping and Asset Type ---
+            if not group_name or not isinstance(group_name, str):
+                log.warning(f"File '{file_path_rel}' has missing, null, or invalid 'proposed_asset_group_name' ({group_name}). Cannot assign to an asset. Skipping file.")
+                continue # Skip files that cannot be grouped
+
+            asset_type = response_data["asset_group_classifications"].get(group_name)
+
+            if not asset_type:
+                log.warning(f"No classification found in 'asset_group_classifications' for group '{group_name}' (proposed for file '{file_path_rel}'). Skipping file.")
+                continue # Skip files belonging to unclassified groups

            if asset_type not in valid_asset_types:
-                 log.warning(f"Invalid predicted_asset_type '{asset_type}' for asset '{asset_name}'. Skipping asset.")
-                 continue # Skip this asset
+                 log.warning(f"Invalid asset_type '{asset_type}' found in 'asset_group_classifications' for group '{group_name}'. Skipping file '{file_path_rel}'.")
+                 continue # Skip files belonging to groups with invalid types

-            asset_rule = AssetRule(asset_name=asset_name, asset_type=asset_type)
-            source_rule.assets.append(asset_rule)
-
-            if "files" not in asset_data or not isinstance(asset_data["files"], list):
-                log.warning(f"'files' key missing or not a list in asset '{asset_name}'. Skipping files for this asset.")
+            # --- Construct Absolute Path ---
+            try:
+                base_path = Path(self.input_source_identifier)
+                if base_path.is_file():
+                    base_path = base_path.parent
+                clean_rel_path = Path(file_path_rel.strip().replace('\\', '/'))
+                file_path_abs = str(base_path / clean_rel_path)
+            except Exception as path_e:
+                log.warning(f"Error constructing absolute path for '{file_path_rel}' relative to '{self.input_source_identifier}': {path_e}. Skipping file.")
                continue

-            for file_data in asset_data["files"]:
-                 # Check for cancellation within the inner loop
-                if self._is_cancelled:
-                    log.info("LLM prediction cancelled during response parsing (files).")
-                    return []
+            # --- Get or Create Asset Rule ---
+            asset_rule = asset_rules_map.get(group_name)
+            if not asset_rule:
+                # Create new AssetRule if this is the first file for this group
+                log.debug(f"Creating new AssetRule for group '{group_name}' with type '{asset_type}'.")
+                asset_rule = AssetRule(asset_name=group_name, asset_type=asset_type)
+                source_rule.assets.append(asset_rule)
+                asset_rules_map[group_name] = asset_rule
+            # else: use existing asset_rule

-                if not isinstance(file_data, dict):
-                    log.warning(f"Skipping invalid file data (not a dict) in asset '{asset_name}': {file_data}")
-                    continue
-
-                file_path_rel = file_data.get("file_path") # LLM provides relative path
-                file_type = file_data.get("predicted_file_type")
-
-                if not file_path_rel:
-                    log.warning(f"Missing 'file_path' in file data for asset '{asset_name}'. Skipping file.")
-                    continue
-
-                # Convert relative path from LLM (using '/') back to absolute OS-specific path
-                # We need the original input path (directory or archive) to make it absolute
-                # Use self.input_source_identifier which holds the original path
-                # IMPORTANT: Ensure the LLM is actually providing paths relative to the *root* of the input source.
-                try:
-                    # Use Pathlib for safer joining, assuming input_source_identifier is the parent dir/archive path
-                    # If input_source_identifier is an archive file, this logic might need adjustment
-                    # depending on where files were extracted. For now, assume it's the base path.
-                    base_path = Path(self.input_source_identifier)
-                    # If the input was a file (like a zip), use its parent directory as the base for joining relative paths
-                    if base_path.is_file():
-                        base_path = base_path.parent
-                    # Clean the relative path potentially coming from LLM
-                    clean_rel_path = Path(file_path_rel.strip().replace('\\', '/'))
-                    file_path_abs = str(base_path / clean_rel_path)
-                except Exception as path_e:
-                    log.warning(f"Error constructing absolute path for '{file_path_rel}' relative to '{self.input_source_identifier}': {path_e}. Skipping file.")
-                    continue
+            # --- Create and Add File Rule ---
+            file_rule = FileRule(
+                file_path=file_path_abs,
+                item_type=file_type,
+                item_type_override=file_type, # Initial override based on LLM
+                target_asset_name_override=group_name, # Use the group name
+                output_format_override=None,
+                is_gloss_source=False,
+                resolution_override=None,
+                channel_merge_instructions={}
+            )
+            asset_rule.files.append(file_rule)
+            log.debug(f"Added file '{file_path_rel}' (type: {file_type}) to asset '{group_name}'.")


-                if file_type not in valid_file_types:
-                    log.warning(f"Invalid predicted_file_type '{file_type}' for file '{file_path_rel}'. Defaulting to EXTRA.")
-                    file_type = "EXTRA" # Default to EXTRA if invalid type from LLM
+        # Log if no assets were created
+        if not source_rule.assets:
+            log.warning(f"LLM prediction for '{self.input_source_identifier}' resulted in zero valid assets after parsing.")

-                # Create the FileRule instance
-                # Add default values for fields not provided by LLM
-                file_rule = FileRule(
-                    file_path=file_path_abs,
-                    item_type=file_type,
-                    item_type_override=file_type, # Initial override
-                    target_asset_name_override=asset_name, # Default to asset name
-                    output_format_override=None,
-                    is_gloss_source=False, # LLM doesn't predict this
-                    resolution_override=None,
-                    channel_merge_instructions={}
-                )
-                asset_rule.files.append(file_rule)
-
-        source_rules.append(source_rule)
-        return source_rules
+        return [source_rule] # Return list containing the single SourceRule

 # Removed conceptual example usage comments