LLM Restructure - UNTESTED!
This commit is contained in:
@@ -236,124 +236,191 @@ class LLMPredictionHandler(BasePredictionHandler):
|
||||
|
||||
def _parse_llm_response(self, llm_response_json_str: str) -> List[SourceRule]:
    """
    Parse the LLM's JSON response string (two-part format) into a list
    containing a single SourceRule.

    The expected response shape is:
        {
            "individual_file_analysis": [
                {"relative_file_path": str,
                 "classified_file_type": str,
                 "proposed_asset_group_name": str | null},
                ...
            ],
            "asset_group_classifications": {group_name: asset_type, ...}
        }

    The raw string is sanitized first (block/line comments, markdown code
    fences, and <think> tags are stripped) because LLM output frequently
    includes such decoration around the JSON payload.

    Args:
        llm_response_json_str: Raw text returned by the LLM.

    Returns:
        A one-element list [SourceRule] on success, or [] if cancellation
        was requested while parsing.

    Raises:
        ValueError: If the sanitized string is not valid JSON, or the JSON
            does not match the expected two-part structure. Callers
            (_perform_prediction) are expected to handle these.
    """
    # --- Sanitize input string -------------------------------------------
    clean_json_str = llm_response_json_str.strip()

    # 1. Remove multi-line /* */ comments.
    clean_json_str = re.sub(r'/\*.*?\*/', '', clean_json_str, flags=re.DOTALL)

    # 2. Remove single-line // comments. A '//' preceded by ':' is assumed
    #    to be part of a URL (http://...), and '//' inside a quoted string
    #    is left alone.
    cleaned_lines = []
    for line in clean_json_str.splitlines():
        stripped_line = line.strip()
        comment_index = self._find_comment_start(stripped_line)
        if comment_index != -1:
            # Map the comment position back into the non-stripped line.
            original_comment_start = line.find(stripped_line[comment_index:])
            cleaned_lines.append(line[:original_comment_start].rstrip())
        else:
            cleaned_lines.append(line)
    clean_json_str = "\n".join(cleaned_lines)

    # 3. Remove markdown code fences.
    clean_json_str = clean_json_str.strip()
    if clean_json_str.startswith("```json"):
        clean_json_str = clean_json_str[7:]  # Remove ```json
    if clean_json_str.endswith("```"):
        clean_json_str = clean_json_str[:-3]  # Remove trailing ```

    # 4. Remove <think> tags (some reasoning models emit them).
    clean_json_str = re.sub(r'<think>.*?</think>', '', clean_json_str,
                            flags=re.DOTALL | re.IGNORECASE)
    clean_json_str = clean_json_str.strip()

    # --- Parse sanitized JSON --------------------------------------------
    try:
        response_data = json.loads(clean_json_str)
    except json.JSONDecodeError as e:
        # Include the full cleaned string for debugging bad LLM output.
        error_detail = (
            f"Failed to decode LLM JSON response after sanitization: {e}"
            f"\nSanitized Response Attempted:\n{clean_json_str}"
        )
        log.error(f"ERROR: {error_detail}")
        raise ValueError(error_detail)

    # --- Validate top-level structure ------------------------------------
    if not isinstance(response_data, dict):
        raise ValueError("Invalid LLM response: Root element is not a JSON object.")

    if "individual_file_analysis" not in response_data or not isinstance(response_data["individual_file_analysis"], list):
        raise ValueError("Invalid LLM response format: 'individual_file_analysis' key missing or not a list.")

    if "asset_group_classifications" not in response_data or not isinstance(response_data["asset_group_classifications"], dict):
        raise ValueError("Invalid LLM response format: 'asset_group_classifications' key missing or not a dictionary.")

    # --- Prepare for rule creation ---------------------------------------
    # One SourceRule per input source processed by this handler instance.
    source_rule = SourceRule(input_path=self.input_source_identifier)

    # Valid type names come from the settings dictionary.
    valid_asset_types = list(self.llm_settings.get('asset_types', {}).keys())
    valid_file_types = list(self.llm_settings.get('file_types', {}).keys())
    asset_rules_map: Dict[str, AssetRule] = {}  # Maps group_name -> AssetRule

    # --- Process individual files and build rules ------------------------
    for file_data in response_data["individual_file_analysis"]:
        # Check for cancellation within the loop.
        if self._is_cancelled:
            log.info("LLM prediction cancelled during response parsing (files).")
            return []

        if not isinstance(file_data, dict):
            log.warning(f"Skipping invalid file data entry (not a dict): {file_data}")
            continue

        file_path_rel = file_data.get("relative_file_path")
        file_type = file_data.get("classified_file_type")
        group_name = file_data.get("proposed_asset_group_name")  # str or null

        # --- Validate file data ---
        if not file_path_rel or not isinstance(file_path_rel, str):
            log.warning(f"Missing or invalid 'relative_file_path' in file data: {file_data}. Skipping file.")
            continue

        if not file_type or not isinstance(file_type, str):
            log.warning(f"Missing or invalid 'classified_file_type' for file '{file_path_rel}'. Skipping file.")
            continue

        # Handle FILE_IGNORE explicitly: no rule is created for these files.
        if file_type == "FILE_IGNORE":
            log.debug(f"Ignoring file as per LLM prediction: {file_path_rel}")
            continue

        # Validate file_type against definitions; fall back to EXTRA.
        if file_type not in valid_file_types:
            log.warning(f"Invalid predicted_file_type '{file_type}' for file '{file_path_rel}'. Defaulting to EXTRA.")
            file_type = "EXTRA"

        # --- Handle grouping and asset type ---
        if not group_name or not isinstance(group_name, str):
            log.warning(f"File '{file_path_rel}' has missing, null, or invalid 'proposed_asset_group_name' ({group_name}). Cannot assign to an asset. Skipping file.")
            continue  # Skip files that cannot be grouped

        asset_type = response_data["asset_group_classifications"].get(group_name)

        if not asset_type:
            log.warning(f"No classification found in 'asset_group_classifications' for group '{group_name}' (proposed for file '{file_path_rel}'). Skipping file.")
            continue  # Skip files belonging to unclassified groups

        if asset_type not in valid_asset_types:
            log.warning(f"Invalid asset_type '{asset_type}' found in 'asset_group_classifications' for group '{group_name}'. Skipping file '{file_path_rel}'.")
            continue  # Skip files belonging to groups with invalid types

        # --- Construct absolute path ---
        # The LLM provides paths relative to the root of the input source
        # (self.input_source_identifier). If the input is a file (e.g. an
        # archive), join relative paths against its parent directory.
        try:
            base_path = Path(self.input_source_identifier)
            if base_path.is_file():
                base_path = base_path.parent
            # Normalize separators potentially coming from the LLM.
            clean_rel_path = Path(file_path_rel.strip().replace('\\', '/'))
            file_path_abs = str(base_path / clean_rel_path)
        except Exception as path_e:
            log.warning(f"Error constructing absolute path for '{file_path_rel}' relative to '{self.input_source_identifier}': {path_e}. Skipping file.")
            continue

        # --- Get or create asset rule ---
        asset_rule = asset_rules_map.get(group_name)
        if not asset_rule:
            # First file seen for this group: create its AssetRule.
            log.debug(f"Creating new AssetRule for group '{group_name}' with type '{asset_type}'.")
            asset_rule = AssetRule(asset_name=group_name, asset_type=asset_type)
            source_rule.assets.append(asset_rule)
            asset_rules_map[group_name] = asset_rule

        # --- Create and add file rule ---
        # Defaults are used for fields the LLM does not predict.
        file_rule = FileRule(
            file_path=file_path_abs,
            item_type=file_type,
            item_type_override=file_type,  # Initial override based on LLM
            target_asset_name_override=group_name,  # Use the group name
            output_format_override=None,
            is_gloss_source=False,  # LLM doesn't predict this
            resolution_override=None,
            channel_merge_instructions={}
        )
        asset_rule.files.append(file_rule)
        log.debug(f"Added file '{file_path_rel}' (type: {file_type}) to asset '{group_name}'.")

    # Log if no assets were created.
    if not source_rule.assets:
        log.warning(f"LLM prediction for '{self.input_source_identifier}' resulted in zero valid assets after parsing.")

    return [source_rule]  # Return list containing the single SourceRule

@staticmethod
def _find_comment_start(stripped_line: str) -> int:
    """
    Return the index of the first '//' in *stripped_line* that starts a
    comment, or -1 if none.

    A '//' is NOT treated as a comment when it is immediately preceded by
    ':' (likely part of a URL such as http://) or when it lies inside a
    quoted string (escaped quotes are honored).
    """
    search_start = 0
    while True:
        idx = stripped_line.find('//', search_start)
        if idx == -1:
            return -1  # No more // found
        if idx == 0 or stripped_line[idx - 1] != ':':
            # Potential comment marker; check whether it is inside quotes.
            in_quotes = False
            quote_char = ''
            for i in range(idx):
                char = stripped_line[i]
                # Handle escaped quotes.
                if char in ('"', "'") and (i == 0 or stripped_line[i - 1] != '\\'):
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = ''
            if not in_quotes:
                return idx  # Valid comment marker
            # // is inside quotes; keep searching after it.
            search_start = idx + 2
        else:
            # Found '://' — likely a URL; keep searching after it.
            search_start = idx + 2
|
||||
Reference in New Issue
Block a user