This is page 39 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/text_redline_tools.py:
--------------------------------------------------------------------------------
```python
1 | # -*- coding: utf-8 -*-
2 | from __future__ import annotations
3 |
4 | import base64
5 | import datetime as _dt
6 | import difflib
7 | import hashlib
8 | import html as html_stdlib
9 | import itertools
10 | import json
11 | import re
12 | import subprocess
13 | import tempfile
14 | import textwrap
15 | import time
16 | from dataclasses import dataclass
17 | from copy import deepcopy
18 | from datetime import timezone
19 | from pathlib import Path
20 | from typing import Any, Dict, List, Optional, Set, Tuple, Union
21 |
22 | import markdown
23 | from bs4 import BeautifulSoup
24 | from bs4.element import NavigableString, Tag
25 | from lxml import etree
26 | from lxml import html as lxml_html
27 | from lxml.etree import _Element, _ElementTree
28 | from xmldiff import formatting, main
29 | from xmldiff.actions import (
30 | DeleteAttrib,
31 | DeleteNode,
32 | InsertAttrib,
33 | InsertNode,
34 | MoveNode,
35 | RenameAttrib,
36 | UpdateAttrib,
37 | UpdateTextIn,
38 | )
39 |
40 | try:
41 | from xmldiff.actions import InsertComment
42 | except ImportError:
43 | InsertComment = None
44 | try:
45 | from xmldiff.actions import RenameNode
46 | except ImportError:
47 | RenameNode = None
48 | try:
49 | from xmldiff.actions import UpdateTextAfter
50 | except ImportError:
51 | UpdateTextAfter = None
52 | try:
53 | from xmldiff.actions import UpdateTextBefore
54 | except ImportError:
55 | UpdateTextBefore = None
56 | try:
57 | from xmldiff.actions import (
58 | DeleteTextAfter,
59 | DeleteTextBefore,
60 | DeleteTextIn,
61 | InsertTextAfter,
62 | InsertTextBefore,
63 | InsertTextIn,
64 | )
65 |
66 | _fine_grained_text_actions = True
67 | except ImportError:
68 | InsertTextIn = InsertTextBefore = InsertTextAfter = None
69 | DeleteTextIn = DeleteTextBefore = DeleteTextAfter = None
70 | _fine_grained_text_actions = False
71 | _INSERT_TEXT_CLS = tuple()
72 | _DELETE_TEXT_CLS = tuple()
73 | _UPDATE_TEXT_CLS = tuple(c for c in (UpdateTextIn, UpdateTextBefore, UpdateTextAfter) if c)
74 |
75 | if _fine_grained_text_actions:
76 | _INSERT_TEXT_CLS = tuple(c for c in (InsertTextIn, InsertTextBefore, InsertTextAfter) if c)
77 | _DELETE_TEXT_CLS = tuple(c for c in (DeleteTextIn, DeleteTextBefore, DeleteTextAfter) if c)
78 | _UPDATE_TEXT_CLS = tuple(c for c in (UpdateTextIn, UpdateTextBefore, UpdateTextAfter) if c)
79 | else:
80 |     # Fine-grained text actions are unavailable: the except block above has
81 |     # already set _INSERT_TEXT_CLS and _DELETE_TEXT_CLS to empty tuples, and
82 |     # _UPDATE_TEXT_CLS (computed unconditionally above) already holds the
83 |     # UpdateText* classes that did import, so there is nothing left to do.
84 |     pass
85 |
86 |
87 |
88 | _ATTR_UPDATE_CLS = (UpdateAttrib,)
89 | _ATTR_INSERT_CLS = (InsertAttrib,)
90 | _ATTR_DELETE_CLS = (DeleteAttrib,)
91 | _ATTR_RENAME_CLS = (RenameAttrib,)
92 | # ------------------------------------------------------------------
93 |
94 | from ultimate_mcp_server.exceptions import ToolError, ToolInputError # noqa: E402
95 | from ultimate_mcp_server.tools.base import with_error_handling, with_tool_metrics # noqa: E402
96 | from ultimate_mcp_server.utils import get_logger # noqa: E402
97 |
98 | # --- Document Conversion Import ---
99 | try:
100 | from ultimate_mcp_server.tools.document_conversion_and_processing import convert_document
101 |
102 | _DOC_CONVERSION_AVAILABLE = True
103 | except ImportError:
104 | convert_document = None
105 | _DOC_CONVERSION_AVAILABLE = False
106 | print("Document conversion tool not available. Non-HTML input comparison will fail.")
107 | # ---------------------------------
108 |
109 | logger = get_logger("ultimate_mcp_server.tools.redline")
110 |
111 | # --- Add logger.exception if missing ---
112 | if not hasattr(logger, "exception"):
113 |
114 | def _logger_exception(msg, *args, **kw):
115 | logger.error(msg, *args, exc_info=True, **kw)
116 |
117 | logger.exception = _logger_exception # type: ignore[attr-defined]
118 | # -----------------------------------------
119 |
120 | # Namespace and configuration constants
121 | _DIFF_NS = "http://namespaces.shoobx.com/diff"
122 | _DIFF_PREFIX = "diff"
123 |
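# A quick sketch of how these constants get used below: lxml addresses
# namespaced attributes in Clark notation, "{namespace-uri}localname", so the
# diff:op marker written by this formatter is just a qualified attribute name.
# (_demo_diff_attribute is a hypothetical illustration, not called by the tool.)
def _demo_diff_attribute() -> None:
    el = etree.Element("p")
    el.set(f"{{{_DIFF_NS}}}op", "insert")  # serializes as diff:op="insert" once the prefix is registered
    assert el.get(f"{{{_DIFF_NS}}}op") == "insert"
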
124 | # --- Synthetic ID Generation ---
125 | _id_counter = itertools.count(1)
126 |
127 | def _normalize_text(text: Optional[str]) -> str:
128 | """Collapses whitespace and strips leading/trailing space."""
129 | if text is None:
130 | return ""
131 | # Replace various whitespace chars with a single space, then strip
132 | normalized = re.sub(r"\s+", " ", text).strip()
133 | return normalized
134 |
135 | def _get_normalized_full_text(el: _Element) -> str:
136 | """Gets normalized text content of an element and its descendants,
137 | excluding script and style tags."""
138 | if el is None:
139 | return ""
140 | # Get text from all descendant text nodes, excluding those within script/style
141 | # We join with space to handle cases like <p>Text<b>bold</b> more</p>
142 | try:
143 | texts = el.xpath(".//text()[not(ancestor::script) and not(ancestor::style)]")
144 | full_text = " ".join(t.strip() for t in texts if t.strip())
145 | # Normalize the combined text
146 | return _normalize_text(full_text)
147 | except Exception as e:
148 | # Fallback for safety, though xpath should be robust
149 | logger.warning(f"XPath text extraction failed for <{el.tag}>: {e}. Falling back.")
150 | texts = [t for t in el.itertext() if t.strip()] # Less precise about script/style
151 | full_text = " ".join(t.strip() for t in texts)
152 | return _normalize_text(full_text)
153 |
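# A small illustration of the two helpers above (hypothetical demo, not called
# anywhere in the tool): whitespace collapses and script/style text is dropped.
def _demo_normalization() -> None:
    frag = lxml_html.fromstring("<div> Hello <b>bold</b>\n world <script>x()</script></div>")
    assert _normalize_text("  a \t b\nc ") == "a b c"
    assert _get_normalized_full_text(frag) == "Hello bold world"
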
154 | # Define significant attributes (adjust as needed)
155 | # These are attributes likely to uniquely identify an element or its purpose
156 | # Avoid volatile attributes like style, or overly common ones like class (unless very specific)
157 | _SIGNIFICANT_ATTRIBUTES = {"id", "href", "src", "name", "value", "title", "alt", "rel", "type"}
158 | # Consider adding data-* attributes if they are known to be stable identifiers in your source HTML
159 |
160 | # --- Fuzzy move detection configuration ---
161 | _MOVE_MIN_TOKENS: int = 5 # Ignore tiny fragments; raise to 8 for stricter pairing
162 | _MOVE_MIN_CHARS: int = 24 # Shorter text tends to be noise
163 | _MOVE_SIM_THRESHOLD: float = 0.82 # 0..1; lower to catch more, higher to be stricter
164 | _MOVE_MAX_CANDIDATES: int = 2000 # Safety bound for O(N*M) matching
165 | # Tags considered "blocky enough" to attempt fuzzy move pairing even if tags differ
166 | _BLOCK_TAGS: Set[str] = {
167 | "p","li","ul","ol","h1","h2","h3","h4","h5","h6","blockquote","pre","code",
168 | "section","article","aside","figure","figcaption","table","thead","tbody","tr","td","th","dl","dt","dd"
169 | }
170 | # Attributes that help anchor identity/context
171 | _ANCHOR_ATTRS: Set[str] = {"id","name","href","src","data-id","data-key","data-uid","aria-label","title"}
172 |
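# The pairing pass that consumes these knobs appears later in this file. The
# gate it implies can be pictured roughly like this -- a simplified sketch
# assuming token/character minimums plus a SequenceMatcher ratio, not the
# exact heuristic used downstream:
def _demo_move_gate(deleted_text: str, inserted_text: str) -> bool:
    a, b = _normalize_text(deleted_text), _normalize_text(inserted_text)
    if len(a.split()) < _MOVE_MIN_TOKENS or len(a) < _MOVE_MIN_CHARS:
        return False  # fragment too small to pair reliably
    return difflib.SequenceMatcher(None, a, b).ratio() >= _MOVE_SIM_THRESHOLD
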
173 | def _inject_synthetic_ids(root: _Element, *, attr: str = "data-diff-id") -> None:
174 | """Inject synthetic IDs into elements based on tag, normalized full text,
175 | and significant attributes."""
176 | global _id_counter
177 | if root is None:
178 | return
179 |
180 | processed_elements = 0
181 | elements_with_ids = 0
182 |
183 | # Iterate through all elements in the tree
184 | for el in root.iter():
185 | if not isinstance(el, _Element):
186 | continue
187 |
188 | processed_elements += 1
189 |
190 | # Skip if ID already exists (e.g., from previous run or source)
191 | if el.get(attr):
192 |             elements_with_ids += 1
193 | continue
194 |
195 | # 1. Get Tag
196 | tag = el.tag
197 |
198 | # 2. Get Normalized Full Text Content
199 | norm_text = _get_normalized_full_text(el)
200 |
201 | # 3. Get Normalized Significant Attributes
202 | sig_attrs = {}
203 | for k, v in el.attrib.items():
204 | # Check if attribute is considered significant OR if it's a data-* attribute
205 | # (often used for stable identifiers)
206 | # Exclude the synthetic ID attribute itself if looping
207 | if (k in _SIGNIFICANT_ATTRIBUTES or k.startswith("data-")) and k != attr:
208 | # Normalize attribute value's whitespace
209 | sig_attrs[k] = _normalize_text(v)
210 |
211 | # Sort significant attributes by key for consistent signature
212 | sorted_sig_attrs = tuple(sorted(sig_attrs.items()))
213 |
214 | # 4. Create Signature Tuple
215 | # Using a hash of the potentially long text to keep the signature manageable
216 | text_hash = hashlib.blake2b(norm_text.encode('utf-8', 'replace'), digest_size=8).hexdigest()
217 | sig_tuple = (tag, text_hash, sorted_sig_attrs)
218 |
219 | # 5. Generate Hash and Synthetic ID
220 | try:
221 | # Hash the representation of the signature tuple
222 | sig_repr = repr(sig_tuple).encode("utf-8", "replace")
223 | h = hashlib.blake2b(sig_repr, digest_size=8).hexdigest()
224 | # Combine counter and hash for uniqueness
225 | synthetic_id = f"synid_{next(_id_counter):06d}_{h}"
226 | el.set(attr, synthetic_id)
227 | elements_with_ids += 1
228 | except Exception as e:
229 | logger.warning(
230 | f"Failed to generate/set synthetic ID for element <{el.tag}> "
231 | f"(Text hash: {text_hash}, Attrs: {sorted_sig_attrs}): {e}"
232 | )
233 |
234 | logger.debug(f"ID Injection: Processed {processed_elements} elements, {elements_with_ids} have IDs.")
235 |
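# Hypothetical demo of the injection above (not invoked by the tool): every
# element ends up carrying a counter-plus-hash identifier.
def _demo_inject_ids() -> None:
    root = lxml_html.fromstring('<div><p id="a">Hi</p><p>Bye</p></div>')
    _inject_synthetic_ids(root)
    ids = [el.get("data-diff-id") for el in root.iter()]
    assert all(i and i.startswith("synid_") for i in ids)
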
236 | # Helper to safely get attributes from actions
237 | def _safe_get_attr(action: Any, *attr_names: str, default: Any = None) -> Any:
238 | if action is None:
239 | return default
240 | for name in attr_names:
241 | if "." in name:
242 | parts = name.split(".")
243 | obj = action
244 | try:
245 |                 for part in parts:
246 |                     if obj is None or not hasattr(obj, part):
247 |                         obj = None
248 |                         break
249 |                     obj = getattr(obj, part)
250 | if obj is not None:
251 | return obj
252 | except (AttributeError, TypeError):
253 | continue
254 | elif hasattr(action, name):
255 | val = getattr(action, name)
256 | if val is not None:
257 | return val
258 | return default
259 |
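# Sketch of the lookup order (hypothetical example, not used by the tool):
# plain names are tried first-match-wins, and dotted names walk nested
# attributes, so differently spelled action fields resolve the same way.
def _demo_safe_get_attr() -> None:
    from types import SimpleNamespace
    action = SimpleNamespace(node="/0/p[1]", meta=SimpleNamespace(pos=2))
    assert _safe_get_attr(action, "node_xpath", "node") == "/0/p[1]"
    assert _safe_get_attr(action, "meta.pos") == 2
    assert _safe_get_attr(action, "missing", default="n/a") == "n/a"
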
260 |
261 | # --- Redline XML Formatter ---
262 | class RedlineXMLFormatter:
263 | """Applies xmldiff actions using standardized diff:* attributes."""
264 |
265 | def __init__(self, **kwargs):
266 | self.detect_moves = kwargs.get("detect_moves", True)
267 | self.normalize = kwargs.get("normalize", formatting.WS_BOTH)
268 | self._orig_root: Optional[_Element] = None
269 | self._mod_root: Optional[_Element] = None
270 | self._annotated_copy_root: Optional[_Element] = None
271 | self._annotated_copy_tree: Optional[_ElementTree] = None
272 | self._actions: List[Any] = []
273 | self._node_map_orig_to_copy: Dict[_Element, _Element] = {}
274 | self._xpath_cache_orig: Dict[str, List[_Element]] = {}
275 | self._xpath_cache_mod: Dict[str, List[_Element]] = {}
276 | self.processed_actions: Dict[str, int] = {
277 | "insertions": 0,
278 | "deletions": 0,
279 | "moves": 0,
280 | "text_updates": 0,
281 | "attr_updates": 0,
282 | "renames": 0,
283 | "other_changes": 0,
284 | "errors": 0,
285 | "inline_insertions": 0,
286 | "inline_deletions": 0,
287 | }
288 | self._attr_changes: Dict[_Element, List[Dict[str, str]]] = {}
289 |
290 | def _reset_state(self):
291 | self._orig_root = None
292 | self._mod_root = None
293 | self._annotated_copy_root = None
294 | self._annotated_copy_tree = None
295 | self._actions = []
296 | self._node_map_orig_to_copy = {}
297 | self._xpath_cache_orig.clear()
298 | self._xpath_cache_mod.clear()
299 | self.processed_actions = {k: 0 for k in self.processed_actions}
300 | self._attr_changes.clear()
301 |
302 | @staticmethod
303 | def _add_diff_attribute(elem: _Element, name: str, value: Optional[str] = "true"):
304 | """Adds a diff:* attribute."""
305 | if elem is None:
306 | return
307 | if not isinstance(elem, _Element):
308 | return
309 | qname = f"{{{_DIFF_NS}}}{name}"
310 | val_str = str(value) if value is not None else ""
311 | try:
312 | elem.set(qname, val_str)
313 | except ValueError as e:
314 | logger.error(f"Failed to set attr '{qname}'='{val_str}' on <{elem.tag}>: {e}")
315 |
316 | def _add_attribute_change_detail(self, node: _Element, change_info: Dict[str, str]):
317 | if node not in self._attr_changes:
318 | self._attr_changes[node] = []
319 | self._attr_changes[node].append(change_info)
320 |
321 | def _aggregate_attribute_changes(self):
322 | for node, changes in self._attr_changes.items():
323 | if node is None or not changes:
324 | continue
325 | try:
326 | change_summary = json.dumps(changes)
327 | self._add_diff_attribute(node, "attributes", change_summary)
328 | except (TypeError, ValueError) as e:
329 | logger.error(f"Could not serialize attr changes for {node.tag}: {e}")
330 | self._add_diff_attribute(node, "attributes", "[Serialization Error]")
331 |
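    # The aggregation above collapses the per-node records into one JSON blob,
    # e.g. a node whose class was updated and title deleted would carry
    # (illustrative values, not taken from a real run):
    #   diff:attributes='[{"op": "update", "name": "class", "old": "a", "new": "b"},
    #                     {"op": "delete", "name": "title", "old": "x"}]'
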
332 | def _get_node_from_xpath(self, xpath: str, tree_type: str) -> Optional[_Element]:
333 | if not xpath:
334 | return None
335 | root = (
336 | self._orig_root
337 | if tree_type == "original"
338 | else self._mod_root
339 | if tree_type == "modified"
340 | else None
341 | )
342 | cache = (
343 | self._xpath_cache_orig
344 | if tree_type == "original"
345 | else self._xpath_cache_mod
346 | if tree_type == "modified"
347 | else None
348 | )
349 | if root is None or cache is None:
350 | return None
351 | if xpath in cache:
352 | nodes = cache[xpath]
353 | return nodes[0] if nodes else None
354 | try:
355 | adjusted_xpath = xpath[2:] if xpath.startswith("/0/") else xpath
356 | nodes = root.xpath(adjusted_xpath)
357 | element_nodes = [n for n in nodes if isinstance(n, _Element)]
358 | cache[xpath] = element_nodes
359 | return element_nodes[0] if element_nodes else None
360 | except Exception:
361 | cache[xpath] = []
362 | return None
363 |
364 | def _get_corresponding_node_in_copy(self, orig_node: _Element) -> Optional[_Element]:
365 | if orig_node is None:
366 | return None
367 | if orig_node in self._node_map_orig_to_copy:
368 | return self._node_map_orig_to_copy[orig_node]
369 | if self._orig_root is not None and self._annotated_copy_root is not None:
370 | try:
371 | orig_xpath = self._orig_root.getroottree().getpath(orig_node)
372 | if orig_xpath:
373 | copy_nodes = self._annotated_copy_root.xpath(orig_xpath)
374 | if copy_nodes and isinstance(copy_nodes[0], _Element):
375 | self._node_map_orig_to_copy[orig_node] = copy_nodes[0]
376 | return copy_nodes[0]
377 | except Exception:
378 | pass
379 | return None
380 |
381 | def _build_initial_node_map(self):
382 | if self._orig_root is None or self._annotated_copy_root is None:
383 | return
384 | self._node_map_orig_to_copy.clear()
385 | orig_iter = self._orig_root.iter()
386 | copy_iter = self._annotated_copy_root.iter()
387 | try:
388 | while True:
389 | orig_node = next(orig_iter)
390 | copy_node = next(copy_iter)
391 | if isinstance(orig_node, _Element) and isinstance(copy_node, _Element):
392 | if (
393 | hasattr(orig_node, "tag")
394 | and hasattr(copy_node, "tag")
395 | and orig_node.tag == copy_node.tag
396 | ):
397 | self._node_map_orig_to_copy[orig_node] = copy_node
398 | except StopIteration:
399 | pass
400 | except Exception as e:
401 | logger.error(f"Error during initial node mapping: {e}")
402 | logger.debug(f"Built initial node map with {len(self._node_map_orig_to_copy)} entries.")
403 |
404 | def _find_node_in_copy_by_xpath(self, xpath: str) -> Optional[_Element]:
405 | if not xpath or self._annotated_copy_root is None:
406 | return None
407 | try:
408 | adjusted_xpath = xpath[2:] if xpath.startswith("/0/") else xpath
409 | nodes = self._annotated_copy_root.xpath(adjusted_xpath)
410 | if nodes and isinstance(nodes[0], _Element):
411 | return nodes[0]
412 | elif nodes:
413 | try:
414 | parent = nodes[0].getparent()
415 | if isinstance(parent, _Element):
416 | return parent
417 | except AttributeError:
418 | pass
419 | return None
420 | else:
421 | return None
422 | except Exception:
423 | return None
424 |
425 | # --- Action Handlers ---
426 |
427 | def _handle_delete_node(self, action: DeleteNode):
428 | node_xpath = _safe_get_attr(action, "node", "node_xpath", "target")
429 | if not node_xpath:
430 | logger.error(f"DeleteNode missing XPath: {action}")
431 | self.processed_actions["errors"] += 1
432 | return
433 | orig_node = self._get_node_from_xpath(node_xpath, "original")
434 | if orig_node is None:
435 | logger.warning(f"DeleteNode: Original node {node_xpath} not found.")
436 | return
437 | copy_node = self._get_corresponding_node_in_copy(orig_node)
438 | if copy_node is None:
439 | logger.warning(f"DeleteNode: Copy node for {node_xpath} not found.")
440 | return
441 |
442 | move_id = _safe_get_attr(action, "move_id")
443 | if not move_id:
444 | move_node = next(
445 | (
446 | a
447 | for a in self._actions
448 | if isinstance(a, MoveNode) and _safe_get_attr(a, "node", "source") == node_xpath
449 | ),
450 | None,
451 | )
452 | move_id = _safe_get_attr(move_node, "move_id")
453 |
454 | if move_id:
455 | self._add_diff_attribute(copy_node, "op", "move-source")
456 | self._add_diff_attribute(copy_node, "move-id", move_id)
457 | else:
458 | self._add_diff_attribute(copy_node, "op", "delete")
459 | self.processed_actions["deletions"] += 1
460 |
461 | def _handle_insert_node(self, action: InsertNode):
462 | parent_xpath = _safe_get_attr(action, "parent_xpath", "target")
463 | node_structure = _safe_get_attr(action, "node")
464 | tag = _safe_get_attr(action, "tag")
465 | pos = _safe_get_attr(action, "pos", "position")
466 | sibling_xpath = _safe_get_attr(action, "sibling_xpath")
467 | if not parent_xpath:
468 | logger.error(f"InsertNode missing parent: {action}")
469 | self.processed_actions["errors"] += 1
470 | return
471 |
472 | node_to_insert = None
473 | if node_structure is not None and isinstance(node_structure, _Element):
474 | try:
475 | # Attempt to clone from the action object first
476 | node_to_insert = deepcopy(node_structure) # Use deepcopy
477 | # node_to_insert = etree.fromstring(etree.tostring(node_structure))
478 | except Exception as e:
479 | logger.error(f"InsertNode clone failed: {e}")
480 |
501 | # If cloning/fetching failed, create a placeholder
502 | if node_to_insert is None:
503 | if tag:
504 | attrs = _safe_get_attr(action, "attrib", "attributes", default={}) or {}
505 | node_to_insert = etree.Element(tag, attrs)
506 | # Make placeholder text more distinct
507 | node_to_insert.text = f"[Placeholder: Inserted <{tag}> content missing]"
508 | logger.warning(f"InsertNode created placeholder <{tag}> because node structure was missing in action and couldn't be fetched.")
509 | else:
510 | logger.error("InsertNode failed: No structure/tag provided in action.")
511 | self.processed_actions["errors"] += 1
512 | return
513 |
514 | move_id = _safe_get_attr(action, "move_id")
515 | is_move_target = bool(move_id)
516 | if not move_id:
517 | move_node = next(
518 | (
519 | a
520 | for a in self._actions
521 | if isinstance(a, MoveNode)
522 | and _safe_get_attr(a, "target") == parent_xpath
523 | and str(_safe_get_attr(a, "pos", "position")) == str(pos)
524 | ),
525 | None,
526 | )
527 | if move_node:
528 | move_id = _safe_get_attr(move_node, "move_id")
529 | is_move_target = bool(move_id)
530 | if not move_id:
531 | logger.warning(f"Insert seems move target but MoveNode lacks ID: {action}")
532 |
533 | if is_move_target and move_id:
534 | if not any(isinstance(a, MoveNode) for a in self._actions):
535 | self.processed_actions["moves"] += 1
536 |
537 | else:
538 | self._add_diff_attribute(node_to_insert, "op", "insert")
539 | if is_move_target:
540 | self.processed_actions["errors"] += 1
541 | logger.warning(
542 | f"Marking node <{node_to_insert.tag}> as insert (was move target w/o ID)."
543 | )
544 | self.processed_actions["insertions"] += 1
545 |
546 | if not (is_move_target and any(isinstance(a, MoveNode) for a in self._actions)):
547 | target_node_in_copy = self._find_node_in_copy_by_xpath(
548 | parent_xpath if pos == "into" or isinstance(pos, int) else sibling_xpath
549 | )
550 | if target_node_in_copy is None:
551 | logger.error(
552 | f"InsertNode: Target node not found in COPY. XPath: '{parent_xpath if pos == 'into' or isinstance(pos, int) else sibling_xpath}'."
553 | )
554 | self.processed_actions["errors"] += 1
555 | return
556 | try:
557 | if pos == "into" or isinstance(pos, int):
558 | parent = target_node_in_copy
559 | idx = (
560 | int(pos)
561 | if isinstance(pos, int) or (isinstance(pos, str) and pos.isdigit())
562 | else len(parent)
563 | )
564 | idx = max(0, min(idx, len(parent)))
565 | parent.insert(idx, node_to_insert)
566 | elif pos == "before":
567 | sibling = target_node_in_copy
568 | parent = sibling.getparent()
569 | parent.insert(parent.index(sibling), node_to_insert)
570 | elif pos == "after":
571 | sibling = target_node_in_copy
572 | parent = sibling.getparent()
573 | parent.insert(parent.index(sibling) + 1, node_to_insert)
574 | else:
575 | raise ValueError(f"Unknown pos '{pos}'")
576 | except Exception as e:
577 | logger.exception(f"InsertNode insert error: {e}")
578 | self.processed_actions["errors"] += 1
579 |
580 | def _handle_move_node(self, action: MoveNode):
581 | """Handle move: Ensure source marked, insert clone at target."""
582 | src_xpath = _safe_get_attr(action, "node", "source")
583 | tgt_xpath = _safe_get_attr(action, "target")
584 | pos = _safe_get_attr(action, "pos", "position", default="into") # Keep default 'into' if missing
585 | move_id = _safe_get_attr(action, "move_id")
586 |
587 | if not src_xpath or not tgt_xpath or not move_id:
588 | # Use ToolError for critical diff engine issues
589 | raise ToolError(
590 | f"xmldiff produced a MoveNode without complete data "
591 | f"(src={src_xpath!r}, tgt={tgt_xpath!r}, id={move_id!r}). "
592 | "This indicates malfunction in the diff stage.",
593 | code="DIFF_ENGINE_ERROR",
594 | )
595 |
596 | orig_src_node = self._get_node_from_xpath(src_xpath, "original")
597 | if orig_src_node is None:
598 | logger.error(f"MoveNode {move_id}: Original source node {src_xpath} not found.")
599 | self.processed_actions["errors"] += 1
600 | return
601 | copy_src_node = self._get_corresponding_node_in_copy(orig_src_node)
602 | if copy_src_node is None:
603 | # Log error but attempt to continue if possible - maybe source was deleted then moved? Unlikely but cover edge case.
604 | logger.error(f"MoveNode {move_id}: Corresponding copy source node for {src_xpath} not found.")
605 | # If the source isn't in the copy, we can't mark it, but we still need to insert the target.
606 | # No need to return here, proceed to insert the target.
607 | # self.processed_actions["errors"] += 1 # Maybe not an error if source was already removed by another action?
608 | else:
609 | # Ensure the source node in the copy is marked correctly
610 | # It might have been marked by _handle_delete_node already if xmldiff emits Delete then Move
611 | # Check if marking is already correct to avoid redundant logging/work
612 | if (
613 | copy_src_node.get(f"{{{_DIFF_NS}}}op") != "move-source"
614 | or copy_src_node.get(f"{{{_DIFF_NS}}}move-id") != move_id
615 | ):
616 | logger.debug(f"MoveNode {move_id}: Marking source node {src_xpath} in copy.")
617 | self._add_diff_attribute(copy_src_node, "op", "move-source")
618 | self._add_diff_attribute(copy_src_node, "move-id", move_id)
619 |
620 | # --- Determine the node to clone ---
621 | # The goal is to clone the node *as it exists in the modified document*
622 | # The `action` tells us where it ended up (tgt_xpath, pos).
623 | node_to_clone = None
624 | mod_target_parent = self._get_node_from_xpath(tgt_xpath, "modified")
625 |
626 | if mod_target_parent is not None and isinstance(pos, int) and pos >= 0:
627 | try:
628 | # Get the actual node from the modified tree at the target position
629 | node_to_clone = mod_target_parent[pos]
630 | logger.debug(f"MoveNode {move_id}: Found node to clone in MODIFIED tree at {tgt_xpath}[{pos}].")
631 | except IndexError:
632 | logger.warning(f"MoveNode {move_id}: Index {pos} out of bounds for target parent {tgt_xpath} in MODIFIED tree. Parent has {len(mod_target_parent)} children.")
633 | except Exception as e:
634 | logger.warning(f"MoveNode {move_id}: Error accessing node at {tgt_xpath}[{pos}] in MODIFIED tree: {e}")
635 | elif mod_target_parent is not None and pos == "into": # Handle insertion 'into' as append
636 | try:
637 | # If pos is 'into', it usually implies appending. The moved node would be the last child.
638 | # However, xmldiff usually gives an integer position for moves.
639 | # Let's try finding based on the source node's ID if possible, as a fallback.
640 | mod_node_with_same_id = mod_target_parent.xpath(f".//*[@data-diff-id='{orig_src_node.get('data-diff-id')}']")
641 | if mod_node_with_same_id:
642 | node_to_clone = mod_node_with_same_id[0]
643 | logger.debug(f"MoveNode {move_id}: Found node to clone in MODIFIED tree based on ID matching source ID within {tgt_xpath}.")
644 | else:
645 | logger.warning(f"MoveNode {move_id}: Position is '{pos}', couldn't find node to clone in MODIFIED target parent {tgt_xpath} by index or ID.")
646 |
647 | except Exception as e:
648 | logger.warning(f"MoveNode {move_id}: Error finding node in MODIFIED target parent {tgt_xpath} for pos='{pos}': {e}")
649 |
650 |
651 | if node_to_clone is None:
652 | # Fallback: Clone the original source node. This might lose internal changes.
653 | node_to_clone = orig_src_node
654 | logger.warning(
655 | f"MoveNode {move_id}: Could not find moved node in MODIFIED tree at {tgt_xpath}:{pos}. "
656 | f"Falling back to cloning ORIGINAL source node {src_xpath}. Internal changes might be lost."
657 | )
658 |
659 | # --- Clone and prepare the node for insertion ---
660 | try:
661 | # Use deepcopy which might be more robust for lxml elements than fromstring(tostring)
662 | cloned_node_for_insert = deepcopy(node_to_clone)
663 | if cloned_node_for_insert is None: raise ValueError("Deepcopy resulted in None") # noqa: E701
664 | except Exception as e:
665 | logger.error(f"MoveNode {move_id}: Cloning node failed: {e}")
666 | self.processed_actions["errors"] += 1
667 | return
668 |
669 | # --- Clean and mark the cloned node ---
670 | # Remove any pre-existing diff attributes from the clone and its descendants
671 | for el in cloned_node_for_insert.xpath(".//* | ."): # Iterate over self and descendants
672 | if isinstance(el, _Element):
673 | for name in list(el.attrib):
674 | if name.startswith(f"{{{_DIFF_NS}}}"):
675 | del el.attrib[name]
676 | # Also remove the synthetic ID from the clone to avoid collisions if diff runs again
677 | if name == "data-diff-id":
678 | del el.attrib[name]
679 |
680 | # Mark the root of the clone as the move target
681 | self._add_diff_attribute(cloned_node_for_insert, "op", "move-target")
682 | self._add_diff_attribute(cloned_node_for_insert, "move-id", move_id)
683 |
684 | # --- Insert the cloned node into the copy tree ---
685 | target_node_in_copy = self._find_node_in_copy_by_xpath(tgt_xpath)
686 | if target_node_in_copy is None:
687 | logger.error(f"MoveNode {move_id}: Target parent node {tgt_xpath} not found in COPY tree for insertion.")
688 | self.processed_actions["errors"] += 1
689 | # Attempt to insert into the root as a last resort? Or just fail? Let's fail.
690 | return
691 |
692 | try:
693 | if isinstance(pos, int) and pos >= 0:
694 | # Insert at the specific index within the target parent found in the copy tree
695 | parent = target_node_in_copy
696 | # Clamp index to valid range for insertion
697 | idx = max(0, min(int(pos), len(parent)))
698 | parent.insert(idx, cloned_node_for_insert)
699 | logger.debug(f"MoveNode {move_id}: Inserted move-target clone into copy tree at {tgt_xpath}[{idx}].")
700 | self.processed_actions["moves"] += 1
701 | elif pos == "into": # Handle 'into' - append to the target node
702 | parent = target_node_in_copy
703 | parent.append(cloned_node_for_insert)
704 | logger.debug(f"MoveNode {move_id}: Appended move-target clone into copy tree node {tgt_xpath}.")
705 | self.processed_actions["moves"] += 1
706 | else:
707 | # This case (e.g., pos='before'/'after') shouldn't happen with MoveNode from xmldiff typically,
708 | # as it uses parent path + index. Log an error if it does.
709 | logger.error(f"MoveNode {move_id}: Unsupported position '{pos}' for insertion. Expected integer or 'into'.")
710 | self.processed_actions["errors"] += 1
711 | except Exception as e:
712 | logger.exception(f"MoveNode {move_id}: Insertion of cloned node into copy tree failed: {e}")
713 | self.processed_actions["errors"] += 1
714 |
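    # Net effect of a handled move, sketched on a toy document (illustrative,
    # not captured from a real run): the source stays in place marked
    #   <p diff:op="move-source" diff:move-id="m1">text</p>
    # while a cleaned clone lands at the destination marked
    #   <p diff:op="move-target" diff:move-id="m1">text</p>
    # so renderers can pair both ends of the move via the shared move-id.
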
715 | def _handle_update_text(self, action: Union[UpdateTextIn, UpdateTextBefore, UpdateTextAfter]):
716 | xpath = _safe_get_attr(action, "node", "node_xpath")
717 | new_text = _safe_get_attr(action, "text", "new", "new_text", default="")
718 | if not xpath:
719 | logger.error(f"{type(action).__name__} missing XPath: {action}")
720 | self.processed_actions["errors"] += 1
721 | return
722 | if _fine_grained_text_actions and type(action) in (_INSERT_TEXT_CLS + _DELETE_TEXT_CLS):
723 | return
724 |
725 | copy_node = self._find_node_in_copy_by_xpath(xpath)
726 | if copy_node is None:
727 | logger.warning(f"{type(action).__name__}: Node {xpath} not found in COPY.")
728 | return
729 |
730 | orig_node = self._get_node_from_xpath(xpath, "original")
731 | actual_old_text = "[Unknown Old Text]"
732 | update_type = "text"
733 | if orig_node is not None:
734 | if isinstance(action, UpdateTextIn):
735 | actual_old_text = orig_node.text or ""
736 | update_type = "text"
737 | elif UpdateTextAfter is not None and isinstance(action, UpdateTextAfter):
738 | actual_old_text = orig_node.tail or ""
739 | update_type = "tail"
740 | elif UpdateTextBefore is not None and isinstance(action, UpdateTextBefore):
741 | actual_old_text = _safe_get_attr(action, "old", "old_text", default="[?]")
742 | update_type = "before"
743 | else:
744 | actual_old_text = orig_node.text or ""
745 | else:
746 | actual_old_text = _safe_get_attr(
747 | action, "old", "old_text", default="[Missing Orig Node]"
748 | )
749 |
750 | norm_old = " ".join(str(actual_old_text).split())
751 | norm_new = " ".join(str(new_text).split())
752 | if norm_old == norm_new:
753 | return
754 |
755 | if update_type == "text":
756 | copy_node.text = new_text
757 | if len(copy_node) > 0:
758 | for child in list(copy_node):
759 | copy_node.remove(child)
760 | self._add_diff_attribute(copy_node, "op", "update-text")
761 | self._add_diff_attribute(copy_node, "old-value", actual_old_text)
762 | self.processed_actions["text_updates"] += 1
763 | elif update_type == "tail":
764 | copy_node.tail = new_text
765 | self._add_diff_attribute(copy_node, "op", "update-tail")
766 | self._add_diff_attribute(copy_node, "old-value", actual_old_text)
767 | self.processed_actions["text_updates"] += 1
768 | elif update_type == "before":
769 | self._add_diff_attribute(copy_node, "op", "update-text-context")
770 | self._add_diff_attribute(copy_node, "detail", "before")
771 | self._add_diff_attribute(copy_node, "new-value", new_text)
772 | self._add_diff_attribute(copy_node, "old-value", actual_old_text)
773 | self.processed_actions["text_updates"] += 1
774 | logger.warning(f"UpdateTextBefore marked on node {xpath}.")
775 |
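    # Background for the text/tail split above (a hand-written lxml example,
    # not produced by this tool): in <p>A<b>B</b>C</p>, "A" is p.text, "B" is
    # b.text, and "C" is b.tail -- hence UpdateTextIn rewrites .text while
    # UpdateTextAfter rewrites the .tail of the same element.
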
776 | def _handle_attr_change(
777 | self, action: Union[UpdateAttrib, InsertAttrib, DeleteAttrib, RenameAttrib]
778 | ):
779 | xpath = _safe_get_attr(action, "node", "node_xpath")
780 | if not xpath:
781 | logger.error(f"{type(action).__name__} missing XPath: {action}")
782 | self.processed_actions["errors"] += 1
783 | return
784 | copy_node = self._find_node_in_copy_by_xpath(xpath)
785 | if copy_node is None:
786 | logger.warning(f"{type(action).__name__}: Node {xpath} not found in COPY.")
787 | return
788 | orig_node = self._get_node_from_xpath(xpath, "original")
789 |
790 | change_info = {}
791 | processed = False
792 | try:
793 | if isinstance(action, UpdateAttrib):
794 | name = _safe_get_attr(action, "name")
795 | new_val = _safe_get_attr(action, "value", "new", default="")
796 | if name is None:
797 | logger.error(f"UpdateAttrib missing name: {action}")
798 | return
799 | old_val = orig_node.get(name) if orig_node is not None else "[?]"
800 | if old_val != new_val:
801 | copy_node.set(name, new_val)
802 | change_info = {"op": "update", "name": name, "old": old_val, "new": new_val}
803 | self._add_attribute_change_detail(copy_node, change_info)
804 | self.processed_actions["attr_updates"] += 1
805 | processed = True
806 | elif isinstance(action, InsertAttrib):
807 | name = _safe_get_attr(action, "name")
808 | value = _safe_get_attr(action, "value", default="")
809 | if name is None:
810 | logger.error(f"InsertAttrib missing name: {action}")
811 | return
812 | copy_node.set(name, value)
813 | change_info = {"op": "insert", "name": name, "new": value}
814 | self._add_attribute_change_detail(copy_node, change_info)
815 | self.processed_actions["attr_updates"] += 1
816 | processed = True
817 | elif isinstance(action, DeleteAttrib):
818 | name = _safe_get_attr(action, "name")
819 | if name is None:
820 | logger.error(f"DeleteAttrib missing name: {action}")
821 | return
822 | old_val = orig_node.get(name) if orig_node is not None else "[?]"
823 | if name in copy_node.attrib:
824 | del copy_node.attrib[name]
825 | change_info = {"op": "delete", "name": name, "old": old_val}
826 | self._add_attribute_change_detail(copy_node, change_info)
827 | self.processed_actions["attr_updates"] += 1
828 | processed = True
829 | elif isinstance(action, RenameAttrib):
830 | old_n = _safe_get_attr(action, "old_name")
831 | new_n = _safe_get_attr(action, "new_name")
832 | if not old_n or not new_n:
833 | logger.error(f"RenameAttrib missing names: {action}")
834 | return
835 | value = orig_node.get(old_n) if orig_node is not None else "[?]"
836 | if old_n in copy_node.attrib:
837 | del copy_node.attrib[old_n]
838 | copy_node.set(new_n, value)
839 | change_info = {"op": "rename", "old_name": old_n, "new_name": new_n, "value": value}
840 | self._add_attribute_change_detail(copy_node, change_info)
841 | self.processed_actions["attr_updates"] += 1
842 | self.processed_actions["renames"] += 1
843 | processed = True
844 | if processed and copy_node.get(f"{{{_DIFF_NS}}}op") is None:
845 | self._add_diff_attribute(copy_node, "op", "update-attrib")
846 | except Exception as e:
847 | logger.exception(f"Attr change error for {xpath}: {e}")
848 | self.processed_actions["errors"] += 1
849 |
850 | def _handle_rename_node(self, action: RenameNode):
851 | xpath = _safe_get_attr(action, "node", "node_xpath")
852 | new_tag = _safe_get_attr(action, "new_tag", "new_name")
853 | if not xpath or not new_tag:
854 | logger.error(f"RenameNode missing xpath/new_tag: {action}")
855 | self.processed_actions["errors"] += 1
856 | return
857 | copy_node = self._find_node_in_copy_by_xpath(xpath)
858 | if copy_node is None:
859 | logger.warning(f"RenameNode: Node {xpath} not found in COPY.")
860 | return
861 | orig_node = self._get_node_from_xpath(xpath, "original")
862 | old_tag = orig_node.tag if orig_node is not None else copy_node.tag
863 | if old_tag != new_tag:
864 | copy_node.tag = new_tag
865 | self._add_diff_attribute(copy_node, "op", "rename-node")
866 | self._add_diff_attribute(copy_node, "old-value", old_tag)
867 | self._add_diff_attribute(copy_node, "new-value", new_tag)
868 | self.processed_actions["renames"] += 1
869 |
870 | def _handle_insert_text_node(
871 | self, action: Union[InsertTextIn, InsertTextBefore, InsertTextAfter]
872 | ):
873 | xpath = _safe_get_attr(action, "node", "node_xpath")
874 | text = _safe_get_attr(action, "text", "value", default="")
875 | if not xpath:
876 | logger.error(f"{type(action).__name__} missing xpath: {action}")
877 | self.processed_actions["errors"] += 1
878 | return
879 | copy_node = self._find_node_in_copy_by_xpath(xpath)
880 | if copy_node is None:
881 | logger.warning(f"{type(action).__name__}: Ref node {xpath} missing in copy.")
882 | return
883 | ins_el = etree.Element("ins", attrib={"class": "diff-insert-text"})
884 | ins_el.text = text
885 | try:
886 | if InsertTextBefore is not None and isinstance(action, InsertTextBefore):
887 | parent = copy_node.getparent()
888 | parent.insert(parent.index(copy_node), ins_el)
889 | self.processed_actions["inline_insertions"] += 1
890 | elif InsertTextAfter is not None and isinstance(action, InsertTextAfter):
891 | ins_el.tail = copy_node.tail
892 | copy_node.tail = None
893 | copy_node.addnext(ins_el)
894 | self.processed_actions["inline_insertions"] += 1
895 | elif InsertTextIn is not None and isinstance(action, InsertTextIn):
896 | pos = _safe_get_attr(action, "pos", default=len(copy_node))
897 | idx = int(pos) if isinstance(pos, int) or str(pos).isdigit() else len(copy_node)
898 | idx = max(0, min(idx, len(copy_node)))
899 | copy_node.insert(idx, ins_el)
900 | self.processed_actions["inline_insertions"] += 1
901 | else:
902 | logger.warning(f"Unhandled InsertText: {type(action).__name__}.")
903 | copy_node.append(ins_el)
904 | self.processed_actions["inline_insertions"] += 1
905 | except Exception as e:
906 | logger.exception(f"InsertText error: {e}")
907 | self.processed_actions["errors"] += 1
908 |
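    # The addnext()/tail shuffle above matters because lxml hangs trailing text
    # off the preceding element. Roughly (hand-written illustration):
    #   before: <p>kept</p>TAIL            p.tail == "TAIL"
    #   after:  <p>kept</p><ins>new</ins>TAIL
    # i.e. the tail migrates onto the freshly inserted <ins> so no text is lost.
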
909 | def _handle_delete_text_node(
910 | self, action: Union[DeleteTextIn, DeleteTextBefore, DeleteTextAfter]
911 | ):
912 | xpath = _safe_get_attr(action, "node", "node_xpath")
913 | text = _safe_get_attr(action, "text", "value")
914 | if not xpath or text is None:
915 | logger.error(f"{type(action).__name__} missing xpath/text: {action}")
916 | self.processed_actions["errors"] += 1
917 | return
918 | copy_node = self._find_node_in_copy_by_xpath(xpath)
919 | if copy_node is None:
920 | logger.warning(f"{type(action).__name__}: Ref node {xpath} missing in copy.")
921 | return
922 | del_el = etree.Element("del", attrib={"class": "diff-delete-text"})
923 | del_el.text = text
924 | try:
925 | if DeleteTextBefore is not None and isinstance(action, DeleteTextBefore):
926 | parent = copy_node.getparent()
927 | parent.insert(parent.index(copy_node), del_el)
928 | self.processed_actions["inline_deletions"] += 1
929 | elif DeleteTextAfter is not None and isinstance(action, DeleteTextAfter):
930 | orig_tail = copy_node.tail
931 | copy_node.tail = None
932 | del_el.tail = orig_tail
933 | copy_node.addnext(del_el)
934 | self.processed_actions["inline_deletions"] += 1
935 | elif DeleteTextIn is not None and isinstance(action, DeleteTextIn):
936 | pos = _safe_get_attr(action, "pos", default=0)
937 | idx = int(pos) if isinstance(pos, int) or str(pos).isdigit() else 0
938 | idx = max(0, min(idx, len(copy_node)))
939 | if copy_node.text and text in copy_node.text:
940 | copy_node.text = copy_node.text.replace(text, "", 1)
941 | copy_node.insert(idx, del_el)
942 | self.processed_actions["inline_deletions"] += 1
943 | else:
944 | logger.warning(f"Unhandled DeleteText: {type(action).__name__}.")
945 | copy_node.insert(0, del_el)
946 | self.processed_actions["inline_deletions"] += 1
947 | except Exception as e:
948 | logger.exception(f"DeleteText error: {e}")
949 | self.processed_actions["errors"] += 1
950 |
951 | # --- Main Formatting Method ---
952 |
953 | def format(
954 | self, actions: List[Any], orig_doc: _ElementTree, mod_doc: _ElementTree
955 | ) -> _ElementTree:
956 | """Applies diff actions to a copy of orig_doc."""
957 | self._reset_state()
958 | self._actions = actions
959 | logger.debug(f"Formatter init with {len(actions)} actions.")
960 | self._orig_root = orig_doc.getroot()
961 | self._mod_root = mod_doc.getroot()
962 | if self._orig_root is None or self._mod_root is None:
963 | raise ValueError("Docs missing root.")
964 | try:
965 | self._annotated_copy_tree = deepcopy(orig_doc)
966 | self._annotated_copy_root = self._annotated_copy_tree.getroot()
967 | assert self._annotated_copy_root is not None
968 | except Exception as e:
969 | logger.exception("Deepcopy failed.")
970 | raise RuntimeError("Copy failed.") from e
971 | self._build_initial_node_map()
972 |
973 | etree.register_namespace(_DIFF_PREFIX, _DIFF_NS)
974 | if (
975 | self._annotated_copy_root is not None
976 | and _DIFF_PREFIX not in self._annotated_copy_root.nsmap
977 | ):
978 | new_nsmap = self._annotated_copy_root.nsmap.copy()
979 | new_nsmap[_DIFF_PREFIX] = _DIFF_NS
980 | new_root = etree.Element(
981 | self._annotated_copy_root.tag,
982 | nsmap=new_nsmap,
983 | attrib=self._annotated_copy_root.attrib,
984 | )
985 | new_root.text = self._annotated_copy_root.text
986 | new_root.tail = self._annotated_copy_root.tail
987 | for child in self._annotated_copy_root:
988 | new_root.append(child)
989 | self._annotated_copy_tree._setroot(new_root)
990 | self._annotated_copy_root = new_root
991 | logger.debug(f"Registered '{_DIFF_PREFIX}' ns.")
992 |
993 | action_handlers = {
994 | DeleteNode: self._handle_delete_node,
995 | InsertNode: self._handle_insert_node,
996 | MoveNode: self._handle_move_node,
997 | UpdateTextIn: self._handle_update_text,
998 | UpdateAttrib: self._handle_attr_change,
999 | InsertAttrib: self._handle_attr_change,
1000 | DeleteAttrib: self._handle_attr_change,
1001 | RenameAttrib: self._handle_attr_change,
1002 | }
1003 | if RenameNode:
1004 | action_handlers[RenameNode] = self._handle_rename_node
1005 | if UpdateTextBefore:
1006 | action_handlers[UpdateTextBefore] = self._handle_update_text
1007 | if UpdateTextAfter:
1008 | action_handlers[UpdateTextAfter] = self._handle_update_text
1009 | if _fine_grained_text_actions:
1010 | for cls in _INSERT_TEXT_CLS:
1011 | action_handlers[cls] = self._handle_insert_text_node
1012 | for cls in _DELETE_TEXT_CLS:
1013 | action_handlers[cls] = self._handle_delete_text_node
1014 |
1015 | logger.info(f"Applying {len(actions)} actions to the document copy...")
1016 | for i, action in enumerate(actions):
1017 | atype = type(action)
1018 | handler = action_handlers.get(atype)
1019 | if handler:
1020 | try:
1021 | handler(action)
1022 | except Exception:
1023 | logger.exception(f"Handler error #{i + 1} ({atype.__name__}): {action}")
1024 | self.processed_actions["errors"] += 1
1025 | elif atype == InsertComment:
1026 | pass
1027 | else:
1028 | logger.warning(f"Unhandled action: {atype.__name__}")
1029 | self.processed_actions["other_changes"] += 1
1030 |
1031 | self._aggregate_attribute_changes()
1032 |
1033 | total = sum(
1034 | v for k, v in self.processed_actions.items() if k not in ["total_changes", "errors"]
1035 | )
1036 | self.processed_actions["total_changes"] = total
1037 | logger.info(f"Action processing complete. Stats: {self.processed_actions}")
1038 | if self._annotated_copy_tree is None:
1039 | raise RuntimeError("Formatting failed, tree is None.")
1040 | return self._annotated_copy_tree
1041 |
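# Why format() rebuilds the root above: lxml fixes an element's nsmap at
# creation time, so registering the diff prefix on an existing tree means
# minting a new root and re-homing the children. A condensed sketch
# (hypothetical helper, not used by the tool):
def _demo_reroot_with_ns() -> None:
    old_root = etree.fromstring("<html><body/></html>")
    nsmap = dict(old_root.nsmap)
    nsmap[_DIFF_PREFIX] = _DIFF_NS
    new_root = etree.Element(old_root.tag, nsmap=nsmap, attrib=old_root.attrib)
    for child in old_root:
        new_root.append(child)
    assert _DIFF_PREFIX in new_root.nsmap
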
1042 |
1043 | # ─────────────────────────────────────────────────────────────────────────────
1044 | # Markdown summary-generation helpers
1045 | # ─────────────────────────────────────────────────────────────────────────────
1046 | def _node_plain_text(node: Optional[_Element], *, max_len: int = 120) -> str:
1047 | if node is None:
1048 | return "[Node is None]"
1049 | try:
1050 |         texts = node.xpath(".//text()[not(ancestor::script) and not(ancestor::style)]")
1051 | txt = " ".join(t.strip() for t in texts if t.strip())
1052 | txt = re.sub(r"\s+", " ", txt).strip()
1053 | except Exception as e:
1054 | logger.warning(f"Text extract error: {e}")
1055 | txt = " ".join(node.itertext()).strip()
1056 | txt = re.sub(r"\s+", " ", txt).strip()
1057 | return textwrap.shorten(txt, max_len, placeholder="…") if max_len else txt
1058 |
1059 |
1060 | def _get_element_by_xpath_from_tree(xpath: str, tree: _ElementTree) -> Optional[_Element]:
1061 | if not xpath or tree is None:
1062 | return None
1063 | root = tree.getroot()
1064 | if root is None:
1065 | return None
1066 | try:
1067 | nodes = root.xpath(xpath[2:] if xpath.startswith("/0/") else xpath)
1068 | return next((n for n in nodes if isinstance(n, _Element)), None)
1069 | except Exception:
1070 | return None
1071 |
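# Quick illustration of the truncation behaviour above (hypothetical demo,
# not called by the tool): long extracted text is shortened with an ellipsis.
def _demo_plain_text_truncation() -> None:
    node = lxml_html.fromstring("<p>" + "word " * 50 + "</p>")
    short = _node_plain_text(node, max_len=20)
    assert len(short) <= 20 and short.endswith("…")
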
1072 |
1073 | def _generate_markdown_summary(
1074 | *,
1075 | orig_doc: _ElementTree,
1076 | mod_doc: _ElementTree,
1077 | actions: List[Any],
1078 | context_chars: int = 120,
1079 | ) -> str:
1080 |     ts = _dt.datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
1081 | lines: List[str] = [f"# Detected Redline Differences ({ts})\n"]
1082 |
1083 | if orig_doc is None or mod_doc is None:
1084 | return "# Error: Missing docs."
1085 |
1086 | processed_move_ids: set[str] = set()
1087 | move_actions: Dict[str, Dict[str, Any]] = {}
1088 |
1089 | # --------------------------------------------------------------------- #
1090 | # Pass 1 – collect information about moves
1091 | # --------------------------------------------------------------------- #
1092 | for a in actions:
1093 | if isinstance(a, MoveNode):
1094 | mid = _safe_get_attr(a, "move_id")
1095 | src = _safe_get_attr(a, "node", "source")
1096 | tgt = _safe_get_attr(a, "target")
1097 | pos = _safe_get_attr(a, "pos", "position")
1098 | if mid and src and tgt:
1099 | move_actions.setdefault(mid, {})["src"] = src
1100 | move_actions[mid].update({"tgt": tgt, "pos": pos, "found": True})
1101 | processed_move_ids.add(mid)
1102 |
1103 | elif isinstance(a, DeleteNode):
1104 | mid = _safe_get_attr(a, "move_id")
1105 | src = _safe_get_attr(a, "node", "node_xpath", "target")
1106 | if mid and src and mid not in processed_move_ids:
1107 | move_actions.setdefault(mid, {})["src"] = src
1108 | move_actions[mid].update({"tgt": "?", "pos": "?", "found": False})
1109 | processed_move_ids.add(mid)
1110 |
1111 | elif isinstance(a, InsertNode):
1112 | mid = _safe_get_attr(a, "move_id")
1113 | tgt = _safe_get_attr(a, "parent_xpath", "target")
1114 | pos = _safe_get_attr(a, "pos", "position")
1115 | if mid and tgt and mid in move_actions and not move_actions[mid]["found"]:
1116 | move_actions[mid].update({"tgt": tgt, "pos": pos})
1117 |
1118 | # --------------------------------------------------------------------- #
1119 | # Moves section
1120 | # --------------------------------------------------------------------- #
1121 | if move_actions:
1122 | lines.append("## Moves\n")
1123 | for mid, info in move_actions.items():
1124 | src_elem = _get_element_by_xpath_from_tree(info["src"], orig_doc)
1125 | tgt_elem = _get_element_by_xpath_from_tree(info["tgt"], mod_doc)
1126 | loc = f"into <{tgt_elem.tag}>" if tgt_elem is not None else f"near {info['tgt']}"
1127 |         content_txt = _node_plain_text(src_elem, max_len=context_chars) if src_elem is not None else "[?]"
1128 |
1129 | lines.extend(
1130 | [
1131 | f"### Move ID: `{mid}`",
1132 | f"- **From:** `{info['src']}`",
1133 | f"- **To:** `{loc}` (Pos: {info['pos']})",
1134 | "- **Content:**",
1135 | " ```text",
1136 | f" {content_txt}",
1137 | " ```\n",
1138 | ]
1139 | )
1140 | lines.append("---\n")
1141 |
1142 | # --------------------------------------------------------------------- #
1143 | # Headings map
1144 | # --------------------------------------------------------------------- #
1145 | hdrs = {
1146 | InsertNode: "## Insertions\n",
1147 | DeleteNode: "## Deletions\n",
1148 | UpdateTextIn: "## Text Updates\n",
1149 | UpdateAttrib: "## Attr Updates\n",
1150 | InsertAttrib: "## Attr Updates\n",
1151 | DeleteAttrib: "## Attr Updates\n",
1152 | RenameAttrib: "## Attr Updates\n",
1153 | RenameNode: "## Node Renames\n",
1154 | }
1155 |
1156 | cur_sec = None
1157 |
1158 | # --------------------------------------------------------------------- #
1159 | # Main pass – every non-move action
1160 | # --------------------------------------------------------------------- #
1161 | for a in actions:
1162 | atype = type(a)
1163 | mid = _safe_get_attr(a, "move_id")
1164 | if isinstance(a, MoveNode) or (mid and mid in processed_move_ids):
1165 | continue
1166 |
1167 | # heading management
1168 | if atype in hdrs:
1169 | if hdrs[atype] != cur_sec:
1170 | if cur_sec:
1171 | lines.append("---\n")
1172 | lines.append(hdrs[atype])
1173 | cur_sec = hdrs[atype]
1174 | else:
1175 | if cur_sec != "## Other Changes\n":
1176 | if cur_sec:
1177 | lines.append("---\n")
1178 | lines.append("## Other Changes\n")
1179 | cur_sec = "## Other Changes\n"
1180 |
1181 | try:
1182 | summary: List[str] = []
1183 |
1184 | # ------------------------------------------------------------- #
1185 | # INSERT NODE
1186 | # ------------------------------------------------------------- #
1187 | if isinstance(a, InsertNode):
1188 | pxp = _safe_get_attr(a, "parent_xpath", "target")
1189 | pos = _safe_get_attr(a, "pos", "position", default="N/A")
1190 | node_s = _safe_get_attr(a, "node")
1191 | tag = node_s.tag if node_s is not None else _safe_get_attr(a, "tag") or "[?]"
1192 | if node_s is not None:
1193 | raw = etree.tostring(
1194 | node_s, pretty_print=False, encoding="unicode", method="html"
1195 | ).strip()
1196 | content = textwrap.shorten(raw, context_chars * 2, placeholder="…")
1197 | else:
1198 | content = "[No structure]"
1199 | summary = [
1200 | f"### Inserted `<{tag}>`",
1201 | f"- **Location:** Into `{pxp}` (Pos: {pos})",
1202 | "- **Content:**",
1203 | " ```html",
1204 | f" {content}",
1205 | " ```\n",
1206 | ]
1207 |
1208 | # ------------------------------------------------------------- #
1209 | # DELETE NODE
1210 | # ------------------------------------------------------------- #
1211 | elif isinstance(a, DeleteNode):
1212 | xp = _safe_get_attr(a, "node", "node_xpath", "target")
1213 | onode = _get_element_by_xpath_from_tree(xp, orig_doc)
1214 |                 tag = onode.tag if onode is not None else "[?]"
1215 | content = _node_plain_text(onode, max_len=context_chars) if onode is not None else "[?]"
1216 | summary = [
1217 | f"### Deleted `<{tag}>`",
1218 | f"- **Location:** `{xp}`",
1219 | "- **Content:**",
1220 | " ```text",
1221 | f" {content}",
1222 | " ```\n",
1223 | ]
1224 |
1225 | # ------------------------------------------------------------- #
1226 | # TEXT UPDATE
1227 | # ------------------------------------------------------------- #
1228 | elif isinstance(a, UpdateTextIn):
1229 | xp = _safe_get_attr(a, "node", "node_xpath")
1230 | onode = _get_element_by_xpath_from_tree(xp, orig_doc)
1231 | mnode = _get_element_by_xpath_from_tree(xp, mod_doc)
1232 | old = _node_plain_text(onode, max_len=context_chars) if onode is not None else "[?]"
1233 | new = _node_plain_text(mnode, max_len=context_chars) if mnode is not None else "[?]"
1234 | tag = onode.tag if onode is not None else (mnode.tag if mnode is not None else "[?]")
1235 | if old != new:
1236 | summary = [
1237 | f"### Text Change in `<{tag}>`",
1238 | f"- **Location:** `{xp}`",
1239 | f"- **Old:** `{old}`",
1240 | f"- **New:** `{new}`\n",
1241 | ]
1242 |
1243 | # ------------------------------------------------------------- #
1244 | # ATTRIBUTE-LEVEL CHANGES
1245 | # ------------------------------------------------------------- #
1246 | elif isinstance(a, (UpdateAttrib, InsertAttrib, DeleteAttrib, RenameAttrib)):
1247 | xp = _safe_get_attr(a, "node", "node_xpath")
1248 | onode = _get_element_by_xpath_from_tree(xp, orig_doc)
1249 | mnode = _get_element_by_xpath_from_tree(xp, mod_doc)
1250 | tag = onode.tag if onode is not None else (mnode.tag if mnode is not None else "[?]")
1251 |
1252 | details = ""
1253 | if isinstance(a, UpdateAttrib):
1254 | name = _safe_get_attr(a, "name")
1255 | old_v = onode.get(name) if onode is not None and name else "[?]"
1256 | new_v = _safe_get_attr(a, "value", "new")
1257 | details = f"- **Update:** `{name}`\n- **Old:** `{old_v}`\n- **New:** `{new_v}`"
1258 | elif isinstance(a, InsertAttrib):
1259 | name = _safe_get_attr(a, "name")
1260 | val = _safe_get_attr(a, "value")
1261 | details = f"- **Insert:** `{name}` = `{val}`"
1262 | elif isinstance(a, DeleteAttrib):
1263 | name = _safe_get_attr(a, "name")
1264 | old_v = onode.get(name) if onode is not None and name else "[?]"
1265 | details = f"- **Delete:** `{name}` (was `{old_v}`)"
1266 | elif isinstance(a, RenameAttrib):
1267 | old_n = _safe_get_attr(a, "old_name")
1268 | new_n = _safe_get_attr(a, "new_name")
1269 | val = onode.get(old_n) if onode is not None and old_n else "[?]"
1270 | details = f"- **Rename:** `{old_n}` → `{new_n}` (value: `{val}`)"
1271 |
1272 | if details:
1273 | summary = [f"### Attribute Change in `<{tag}>` (`{xp}`)", details + "\n"]
1274 |
1275 | # ------------------------------------------------------------- #
1276 | # RENAME NODE
1277 | # ------------------------------------------------------------- #
1278 | elif "RenameNode" in globals() and isinstance(a, RenameNode):
1279 | xp = _safe_get_attr(a, "node", "node_xpath")
1280 | new_tag = _safe_get_attr(a, "new_tag", "new_name")
1281 | onode = _get_element_by_xpath_from_tree(xp, orig_doc)
1282 | old_tag = onode.tag if onode is not None else "[?]"
1283 | summary = [
1284 | "### Node Rename",
1285 | f"- **Location:** `{xp}`",
1286 | f"- **Old Tag:** `{old_tag}`",
1287 | f"- **New Tag:** `{new_tag}`\n",
1288 | ]
1289 |
1290 | # append to global list
1291 | lines.extend(summary)
1292 |
1293 | except Exception as exc: # pragma: no cover
1294 | logger.error("Markdown summary error: %s | %s", a, exc, exc_info=True)
1295 | lines.extend(["\n---\nError: " + type(a).__name__ + "\n---\n"])
1296 |
1297 | return "\n".join(lines)
1298 |
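# A minimal driving sketch for the summary generator above (not invoked
# anywhere in this module). The two HTML snippets are hypothetical; xmldiff's
# main.diff_trees supplies the action list, exactly as the public tool does.
def _demo_markdown_summary() -> None:
    """Sketch: summarize a one-word text change as Markdown."""
    orig = etree.ElementTree(
        lxml_html.fromstring("<html><body><p>alpha beta</p></body></html>")
    )
    mod = etree.ElementTree(
        lxml_html.fromstring("<html><body><p>alpha gamma</p></body></html>")
    )
    acts = main.diff_trees(orig, mod)
    print(_generate_markdown_summary(orig_doc=orig, mod_doc=mod, actions=acts))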
1299 |
1300 | # ─────────────────────────────────────────────────────────────────────────────
1301 | # Fuzzy move pairing helpers
1302 | # ─────────────────────────────────────────────────────────────────────────────
1303 | @dataclass
1304 | class _NodeFP:
1305 | xpath: str
1306 | tag: str
1307 | text_norm: str
1308 | tokens: List[str]
1309 | shingles: Set[str]
1310 | simhash: int
1311 | anchor_sig: str
1312 | attrs_sig: str
1313 | length: int
1314 | node: _Element
1315 | parent_xpath: Optional[str] = None
1316 | pos: Optional[int] = None
1317 | action_index: int = -1 # index in actions list
1318 |
1319 |
1320 | def _tokenize_for_fp(s: str) -> List[str]:
1321 | return re.findall(r"[A-Za-z0-9]+", s.lower())
1322 |
1323 |
1324 | def _make_shingles(tokens: List[str], k: int = 4) -> Set[str]:
1325 | if len(tokens) < k:
1326 | return set(tokens) if tokens else set()
1327 | return {" ".join(tokens[i : i + k]) for i in range(0, len(tokens) - k + 1)}
1328 |
1329 |
1330 | def _simhash64(items: Set[str]) -> int:
1331 | # Fast 64-bit simhash over shingles/tokens
1332 | if not items:
1333 | return 0
1334 | acc = [0] * 64
1335 | for it in items:
1336 | h = int(hashlib.blake2b(it.encode("utf-8", "replace"), digest_size=8).hexdigest(), 16)
1337 | for b in range(64):
1338 | acc[b] += 1 if (h >> b) & 1 else -1
1339 | out = 0
1340 | for b, v in enumerate(acc):
1341 | if v >= 0:
1342 | out |= 1 << b
1343 | return out
1344 |
1345 |
1346 | def _simhash_sim(a: int, b: int) -> float:
1347 | if a == b:
1348 | return 1.0
1349 | x = a ^ b
1350 | try:
1351 | d = x.bit_count()
1352 | except AttributeError: # pragma: no cover - int.bit_count() requires Python >= 3.10
1353 | d = bin(x).count("1")
1354 | return 1.0 - (d / 64.0)
1355 |
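# A minimal sketch of the fingerprinting pipeline above (tokens -> shingles ->
# simhash -> similarity). The sample strings are hypothetical; the expectation
# is that near-duplicates land close in Hamming space while unrelated text does not.
def _demo_simhash_similarity() -> None:
    """Sketch: compare simhash similarity of near-duplicate vs unrelated text."""
    a = _simhash64(_make_shingles(_tokenize_for_fp("the quick brown fox jumps over the lazy dog")))
    b = _simhash64(_make_shingles(_tokenize_for_fp("the quick brown fox jumped over a lazy dog")))
    c = _simhash64(_make_shingles(_tokenize_for_fp("completely unrelated boilerplate about invoices")))
    # Both values are in [0, 1]; _simhash_sim(a, b) should exceed _simhash_sim(a, c).
    print(_simhash_sim(a, b), _simhash_sim(a, c))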
1356 |
1357 | def _ancestor_anchor_signature(el: _Element, depth: int = 4) -> str:
1358 | parts: List[str] = []
1359 | p = el.getparent()
1360 | steps = 0
1361 | while p is not None and steps < depth:
1362 | tid = p.get("id") or ""
1363 | cls = p.get("class") or ""
1364 | name = p.tag.lower() if hasattr(p, "tag") else ""
1365 | parts.append(f"{name}#{tid}.{cls}".strip("."))
1366 | p = p.getparent()
1367 | steps += 1
1368 | parts.reverse()
1369 | return "/".join(parts)
1370 |
1371 |
1372 | def _attrs_signature(el: _Element) -> str:
1373 | if el is None or not hasattr(el, "attrib"):
1374 | return ""
1375 | kv = []
1376 | for k, v in el.attrib.items():
1377 | k_l = k.lower()
1378 | if k_l in _ANCHOR_ATTRS or k_l.startswith("data-"):
1379 | kv.append(f"{k_l}={_normalize_text(v)}")
1380 | kv.sort()
1381 | return "|".join(kv)
1382 |
1383 |
1384 | def _build_fp_for_element(el: _Element, xpath: str) -> Optional[_NodeFP]:
1385 | if el is None:
1386 | return None
1387 | txt = _get_normalized_full_text(el)
1388 | if len(txt) < _MOVE_MIN_CHARS:
1389 | return None
1390 | toks = _tokenize_for_fp(txt)
1391 | if len(toks) < _MOVE_MIN_TOKENS:
1392 | return None
1393 | sh = _make_shingles(toks, 4) or set(toks)
1394 | tag = el.tag.lower() if hasattr(el, "tag") else ""
1395 | return _NodeFP(
1396 | xpath=xpath,
1397 | tag=tag,
1398 | text_norm=txt,
1399 | tokens=toks,
1400 | shingles=sh,
1401 | simhash=_simhash64(sh),
1402 | anchor_sig=_ancestor_anchor_signature(el),
1403 | attrs_sig=_attrs_signature(el),
1404 | length=len(toks),
1405 | node=el,
1406 | )
1407 |
1408 |
1409 | def _resolve_insert_element(
1410 | ins_action: Any, mod_doc: _ElementTree
1411 | ) -> Tuple[Optional[_Element], Optional[str], Optional[int]]:
1412 | parent_xpath = _safe_get_attr(ins_action, "parent_xpath", "target")
1413 | pos = _safe_get_attr(ins_action, "pos", "position")
1414 | sib_xpath = _safe_get_attr(ins_action, "sibling_xpath")
1415 | el = _safe_get_attr(ins_action, "node")
1416 | if isinstance(el, _Element):
1417 | try:
1418 | if parent_xpath:
1419 | parent = _get_element_by_xpath_from_tree(parent_xpath, mod_doc)
1420 | if parent is not None:
1421 | target_idx = None
1422 | target_text = _get_normalized_full_text(el)[:40]
1423 | for i, ch in enumerate(list(parent)):
1424 | if getattr(ch, "tag", None) == getattr(el, "tag", None):
1425 | if _get_normalized_full_text(ch)[:40] == target_text:
1426 | target_idx = i
1427 | break
1428 | if isinstance(pos, int) or (isinstance(pos, str) and pos.isdigit()):
1429 | idx = int(pos)
1430 | elif target_idx is not None:
1431 | idx = target_idx
1432 | else:
1433 | idx = len(parent)
1434 | return el, parent_xpath, idx
1435 | except Exception:
1436 | pass
1437 |
1438 | if parent_xpath:
1439 | parent = _get_element_by_xpath_from_tree(parent_xpath, mod_doc)
1440 | if parent is not None:
1441 | try:
1442 | if isinstance(pos, int) or (isinstance(pos, str) and str(pos).isdigit()):
1443 | return parent[int(pos)], parent_xpath, int(pos)
1444 | if pos in ("before", "after") and sib_xpath:
1445 | sib = _get_element_by_xpath_from_tree(sib_xpath, mod_doc)
1446 | if sib is not None:
1447 | par = sib.getparent()
1448 | idx = par.index(sib) + (1 if pos == "after" else 0)
1449 | try:
1450 | return par[idx], par.getroottree().getpath(par), idx
1451 | except Exception:
1452 | return None, par.getroottree().getpath(par), idx
1453 | idx = len(parent)
1454 | try:
1455 | return parent[idx], parent_xpath, idx
1456 | except Exception:
1457 | return None, parent_xpath, idx
1458 | except Exception:
1459 | return None, parent_xpath, None
1460 |
1461 | if sib_xpath:
1462 | sib = _get_element_by_xpath_from_tree(sib_xpath, mod_doc)
1463 | if sib is not None:
1464 | par = sib.getparent()
1465 | idx = par.index(sib) + (1 if pos == "after" else 0) if pos in ("before", "after") else len(par)
1466 | try:
1467 | return par[idx], par.getroottree().getpath(par), idx
1468 | except Exception:
1469 | return None, par.getroottree().getpath(par), idx
1470 |
1471 | return None, None, None
1472 |
1473 |
1474 | def _score_pair(a: _NodeFP, b: _NodeFP) -> float:
1475 | seq = difflib.SequenceMatcher(None, a.text_norm, b.text_norm, autojunk=False).ratio()
1476 | jac = len(a.shingles & b.shingles) / max(1, len(a.shingles | b.shingles))
1477 | content = max(seq, jac)
1478 | shs = _simhash_sim(a.simhash, b.simhash)
1479 | anc = difflib.SequenceMatcher(None, a.anchor_sig, b.anchor_sig, autojunk=False).ratio()
1480 | attr = 1.0 if (a.attrs_sig and a.attrs_sig == b.attrs_sig) else (0.5 if a.attrs_sig and b.attrs_sig else 0.0)
1481 | tag_bonus = 0.05 if (a.tag == b.tag) else (0.03 if (a.tag in _BLOCK_TAGS and b.tag in _BLOCK_TAGS) else -0.10)
1482 | score = 0.45 * content + 0.25 * shs + 0.20 * anc + 0.10 * attr + tag_bonus
1483 | lr = min(a.length, b.length) / max(a.length, b.length)
1484 | if lr < 0.5:
1485 | score *= 0.85
1486 | return max(0.0, min(1.0, score))
1487 |
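# A minimal sketch of the pairing score above: the same (hypothetical)
# paragraph parsed under two different ancestors. Content terms dominate the
# 0.45/0.25/0.20/0.10 weighting, so identical text scores high even though the
# anchor signatures differ. Assumes the text is long enough to clear
# _MOVE_MIN_CHARS / _MOVE_MIN_TOKENS, so _build_fp_for_element returns fingerprints.
def _demo_score_pair() -> None:
    """Sketch: score two fingerprints of the same paragraph in new contexts."""
    para = "<p>" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 8 + "</p>"
    d1 = lxml_html.fromstring(f"<div id='a'>{para}</div>")
    d2 = lxml_html.fromstring(f"<section id='b'>{para}</section>")
    fp1 = _build_fp_for_element(d1[0], "/div/p")
    fp2 = _build_fp_for_element(d2[0], "/section/p")
    if fp1 is not None and fp2 is not None:
        print(round(_score_pair(fp1, fp2), 3))  # expect a value near the top of [0, 1]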
1488 |
1489 | def _augment_actions_with_fuzzy_moves(
1490 | actions: List[Any],
1491 | orig_doc: _ElementTree,
1492 | mod_doc: _ElementTree,
1493 | *,
1494 | threshold: float = _MOVE_SIM_THRESHOLD,
1495 | min_tokens: int = _MOVE_MIN_TOKENS,
1496 | min_chars: int = _MOVE_MIN_CHARS,
1497 | max_pairs: int = _MOVE_MAX_CANDIDATES,
1498 | ) -> List[Any]:
1499 | """Pair DeleteNode + InsertNode actions that represent moves and emit MoveNode."""
1500 | if not actions:
1501 | return actions
1502 |
1503 | del_fps: List[_NodeFP] = []
1504 | ins_fps: List[_NodeFP] = []
1505 |
1506 | for idx, a in enumerate(actions):
1507 | if isinstance(a, DeleteNode):
1508 | xp = _safe_get_attr(a, "node", "node_xpath", "target")
1509 | if not xp:
1510 | continue
1511 | el = _get_element_by_xpath_from_tree(xp, orig_doc)
1512 | fp = _build_fp_for_element(el, xp)
1513 | if fp:
1514 | fp.action_index = idx
1515 | del_fps.append(fp)
1516 |
1517 | elif isinstance(a, InsertNode):
1518 | el, parent_xpath, pos = _resolve_insert_element(a, mod_doc)
1519 | target_el = el if el is not None else _get_element_by_xpath_from_tree(parent_xpath or "", mod_doc)
1520 | if target_el is None:
1521 | continue
1522 | if el is None:
1523 | node_struct = _safe_get_attr(a, "node")
1524 | if isinstance(node_struct, _Element):
1525 | target_el = node_struct
1526 | xp = None
1527 | try:
1528 | if el is not None:
1529 | xp = el.getroottree().getpath(el)
1530 | elif parent_xpath is not None:
1531 | xp = parent_xpath
1532 | except Exception:
1533 | xp = parent_xpath or ""
1534 | fp = _build_fp_for_element(target_el, xp or "")
1535 | if fp:
1536 | fp.parent_xpath = parent_xpath
1537 | fp.pos = pos if (isinstance(pos, int) or (isinstance(pos, str) and str(pos).isdigit())) else None
1538 | fp.action_index = idx
1539 | ins_fps.append(fp)
1540 |
1541 | if not del_fps or not ins_fps:
1542 | return actions
1543 |
1544 | cand: List[Tuple[float, int, int]] = []
1545 | for d in del_fps:
1546 | for i in ins_fps:
1547 | if len(cand) >= max_pairs:
1548 | break
1549 | if (d.tag != i.tag) and not (d.tag in _BLOCK_TAGS and i.tag in _BLOCK_TAGS):
1550 | continue
1551 | len_ratio = min(d.length, i.length) / max(d.length, i.length)
1552 | if len_ratio < 0.40:
1553 | continue
1554 | if _simhash_sim(d.simhash, i.simhash) < 0.60:
1555 | continue
1556 | s = _score_pair(d, i)
1557 | if s >= threshold:
1558 | cand.append((s, d.action_index, i.action_index))
1559 | if len(cand) >= max_pairs:
1560 | break
1561 |
1562 | if not cand:
1563 | return actions
1564 |
1565 | cand.sort(reverse=True)
1566 | matched_d: Set[int] = set()
1567 | matched_i: Set[int] = set()
1568 | inject_moves: Dict[int, List[Any]] = {}
1569 |
1570 | def _mk_move_id(src_xp: str, tgt_px: str, pos_val: Optional[int]) -> str:
1571 | h = hashlib.blake2b(f"{src_xp}|{tgt_px}|{pos_val}".encode("utf-8", "replace"), digest_size=6).hexdigest()
1572 | return f"mv_{h}"
1573 |
1574 | for score, d_idx, i_idx in cand:
1575 | if d_idx in matched_d or i_idx in matched_i:
1576 | continue
1577 | d_act = actions[d_idx]
1578 | i_act = actions[i_idx]
1579 | src_xpath = _safe_get_attr(d_act, "node", "node_xpath", "target")
1580 | tgt_parent = _safe_get_attr(i_act, "parent_xpath", "target")
1581 | pos = _safe_get_attr(i_act, "pos", "position")
1582 | sib_xpath = _safe_get_attr(i_act, "sibling_xpath")
1583 |
1584 | if isinstance(pos, str) and pos in ("before", "after") and sib_xpath:
1585 | try:
1586 | sib = _get_element_by_xpath_from_tree(sib_xpath, mod_doc)
1587 | if sib is not None:
1588 | par = sib.getparent()
1589 | tgt_parent = par.getroottree().getpath(par)
1590 | pos = int(par.index(sib) + (1 if pos == "after" else 0))
1591 | except Exception:
1592 | pass
1593 | elif not (isinstance(pos, int) or (isinstance(pos, str) and str(pos).isdigit())):
1594 | try:
1595 | parent_el = _get_element_by_xpath_from_tree(tgt_parent, mod_doc) if tgt_parent else None
1596 | if parent_el is not None:
1597 | pos = len(parent_el)
1598 | except Exception:
1599 | pos = "into"
1600 |
1601 | move_id = _mk_move_id(str(src_xpath), str(tgt_parent), int(pos) if str(pos).isdigit() else -1)
1602 | try:
1603 | if isinstance(pos, int) or (isinstance(pos, str) and str(pos).isdigit()):
1604 | pos_int = int(pos)
1605 | mv = MoveNode(src_xpath, tgt_parent, pos_int, move_id) # type: ignore[call-arg]
1606 | else:
1607 | mv = MoveNode(src_xpath, tgt_parent, "into", move_id) # type: ignore[call-arg]
1608 | except TypeError:
1609 | mv = MoveNode(node=src_xpath, target=tgt_parent, pos=pos, move_id=move_id) # type: ignore[call-arg]
1610 |
1611 | inject_moves.setdefault(i_idx, []).append(mv)
1612 | matched_d.add(d_idx)
1613 | matched_i.add(i_idx)
1614 | logger.debug(
1615 | f"FuzzyMove paired del#{d_idx} -> ins#{i_idx} (score={score:.3f}, id={move_id})"
1616 | )
1617 |
1618 | if not inject_moves:
1619 | return actions
1620 |
1621 | new_actions: List[Any] = []
1622 | for idx, act in enumerate(actions):
1623 | if idx in matched_d:
1624 | continue
1625 | if idx in matched_i:
1626 | for mv in inject_moves.get(idx, []):
1627 | new_actions.append(mv)
1628 | continue
1629 | new_actions.append(act)
1630 |
1631 | logger.info(
1632 | f"FuzzyMove: converted {len(matched_i)} insert/delete pairs into {sum(len(v) for v in inject_moves.values())} moves."
1633 | )
1634 | return new_actions
1635 |
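# A minimal end-to-end sketch of the augmentation above (not called by the
# tool itself): a long paragraph is moved between two hypothetical sections,
# and the resulting delete/insert pair should collapse into a MoveNode,
# assuming the paragraph clears the module's move-detection thresholds.
def _demo_fuzzy_move_augmentation() -> None:
    """Sketch: upgrade a Delete+Insert pair produced by xmldiff into a move."""
    body = "Portable paragraph used as move bait for the fuzzy matcher. " * 10
    orig = etree.ElementTree(lxml_html.fromstring(
        f"<div><section id='s1'><p>{body}</p></section><section id='s2'></section></div>"
    ))
    mod = etree.ElementTree(lxml_html.fromstring(
        f"<div><section id='s1'></section><section id='s2'><p>{body}</p></section></div>"
    ))
    acts = main.diff_trees(orig, mod)
    merged = _augment_actions_with_fuzzy_moves(acts, orig, mod)
    print([type(a).__name__ for a in merged])  # ideally includes 'MoveNode'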
1636 |
1637 | # ─────────────────────────────────────────────────────────────────────────────
1638 | # XSLT template
1639 | # ─────────────────────────────────────────────────────────────────────────────
1640 | _XMLDIFF_XSLT_REVISED = """<?xml version="1.0" encoding="UTF-8"?>
1641 | <xsl:stylesheet version="1.0"
1642 | xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
1643 | xmlns:diff="http://namespaces.shoobx.com/diff"
1644 | exclude-result-prefixes="diff">
1645 |
1646 | <xsl:output method="html" omit-xml-declaration="yes" indent="no"/>
1647 | <xsl:param name="diff-ns-uri" select="'http://namespaces.shoobx.com/diff'"/>
1648 |
1649 | <!-- Match all nodes and attributes, copy them -->
1650 | <xsl:template match="@*|node()">
1651 | <xsl:copy>
1652 | <xsl:apply-templates select="@*|node()"/>
1653 | </xsl:copy>
1654 | </xsl:template>
1655 |
1656 | <!-- Template for inserted elements -->
1657 | <xsl:template match="*[@diff:op='insert']">
1658 | <ins class="diff-insert">
1659 | <xsl:call-template name="copy-element-content"/>
1660 | </ins>
1661 | </xsl:template>
1662 |
1663 | <!-- Template for move target elements -->
1664 | <xsl:template match="*[@diff:op='move-target']">
1665 | <ins class="diff-move-target" data-move-id="{@diff:move-id}">
1666 | <xsl:call-template name="copy-element-content"/>
1667 | </ins>
1668 | </xsl:template>
1669 |
1670 | <!-- Template for deleted elements -->
1671 | <xsl:template match="*[@diff:op='delete']">
1672 | <del class="diff-delete">
1673 | <xsl:call-template name="copy-element-content"/>
1674 | </del>
1675 | </xsl:template>
1676 |
1677 | <!-- Template for move source elements -->
1678 | <xsl:template match="*[@diff:op='move-source']">
1679 | <del class="diff-move-source" data-move-id="{@diff:move-id}">
1680 | <xsl:call-template name="copy-element-content"/>
1681 | </del>
1682 | </xsl:template>
1683 |
1684 | <!-- Template for elements with updated text/tail -->
1685 | <xsl:template match="*[@diff:op='update-text' or
1686 | @diff:op='update-tail' or
1687 | @diff:op='update-text-context']">
1688 | <span class="diff-update-container">
1689 | <xsl:attribute name="title">
1690 | <xsl:text>Original: </xsl:text>
1691 | <xsl:value-of select="@diff:old-value"/>
1692 | <xsl:if test="@diff:op='update-text-context'">
1693 | <xsl:text> (</xsl:text>
1694 | <xsl:value-of select="@diff:detail"/>
1695 | <xsl:text>)</xsl:text>
1696 | </xsl:if>
1697 | </xsl:attribute>
1698 | <xsl:call-template name="copy-element-content"/>
1699 | </span>
1700 | </xsl:template>
1701 |
1702 | <!-- Template for elements with attribute changes (if not already handled by insert/delete/move/text) -->
1703 | <!-- Increased priority to override base copy if only attribs changed -->
1704 | <xsl:template match="*[@diff:op='update-attrib']" priority="2">
1705 | <!-- Check if the node ALSO has a major op; if so, let that template handle the wrapper -->
1706 | <xsl:choose>
1707 | <xsl:when test="@diff:op='insert' or @diff:op='delete' or @diff:op='move-target' or @diff:op='move-source' or starts-with(@diff:op, 'update-text') or @diff:op='rename-node'">
1708 | <!-- Already handled by a more specific template, just copy content -->
1709 | <xsl:call-template name="copy-element-content"/>
1710 | </xsl:when>
1711 | <xsl:otherwise>
1712 | <!-- Only attribute changes, wrap in span -->
1713 | <span class="diff-attrib-change">
1714 | <xsl:attribute name="title">
1715 | <xsl:call-template name="format-attribute-changes">
1716 | <xsl:with-param name="changes" select="@diff:attributes"/>
1717 | </xsl:call-template>
1718 | </xsl:attribute>
1719 | <xsl:call-template name="copy-element-content"/>
1720 | </span>
1721 | </xsl:otherwise>
1722 | </xsl:choose>
1723 | </xsl:template>
1724 |
1725 | <xsl:template match="*[@diff:op='rename-node']">
1726 | <span class="diff-rename-node">
1727 | <xsl:attribute name="title">Renamed from &lt;<xsl:value-of select="@diff:old-value"/>&gt; to &lt;<xsl:value-of select="@diff:new-value"/>&gt;</xsl:attribute>
1728 | <xsl:call-template name="copy-element-content"/>
1729 | </span>
1730 | </xsl:template>
1731 |
1732 | <!-- Helper template to copy element content excluding diff attributes -->
1733 | <xsl:template name="copy-element-content">
1734 | <xsl:element name="{name()}" namespace="{namespace-uri()}">
1735 | <!-- Copy non-diff attributes -->
1736 | <xsl:apply-templates select="@*[not(namespace-uri()=$diff-ns-uri)]"/>
1737 | <!-- Recursively apply templates to child nodes -->
1738 | <xsl:apply-templates select="node()"/>
1739 | </xsl:element>
1740 | </xsl:template>
1741 |
1742 | <!-- Helper template for attribute changes (basic display for XSLT 1.0) -->
1743 | <xsl:template name="format-attribute-changes">
1744 | <xsl:param name="changes"/>
1745 | <xsl:text>Attrs changed: </xsl:text>
1746 | <!-- XSLT 1.0 cannot parse JSON. Display raw string. -->
1747 | <xsl:value-of select="$changes"/>
1748 | </xsl:template>
1749 |
1750 | <!-- Handle inline text changes explicitly -->
1751 | <xsl:template match="ins[@class='diff-insert-text'] | del[@class='diff-delete-text']">
1752 | <xsl:copy-of select="."/>
1753 | </xsl:template>
1754 |
1755 | <!-- Prevent diff:* attributes from being copied to the output -->
1756 | <xsl:template match="@diff:*" priority="10"/>
1757 |
1758 | </xsl:stylesheet>"""
1759 |
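# A minimal sketch of compiling and applying the stylesheet above. The
# diff:op attribute here is hand-written for illustration; in the real
# pipeline RedlineXMLFormatter produces these annotations.
def _demo_apply_redline_xslt() -> None:
    """Sketch: transform a tiny annotated tree into redline HTML."""
    transform = etree.XSLT(etree.fromstring(_XMLDIFF_XSLT_REVISED.encode()))
    annotated = etree.fromstring(
        '<p xmlns:diff="http://namespaces.shoobx.com/diff">'
        'kept <span diff:op="insert">added</span> text</p>'
    )
    # The span should come back wrapped as <ins class="diff-insert">...</ins>.
    print(str(transform(annotated)))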
1760 |
1761 | # ─────────────────────────────────────────────────────────────────────────────
1762 | # Public tool
1763 | # ─────────────────────────────────────────────────────────────────────────────
1764 |
1765 |
1766 | @with_tool_metrics
1767 | @with_error_handling
1768 | async def create_html_redline(
1769 | original_html: str,
1770 | modified_html: str,
1771 | *,
1772 | detect_moves: bool = True,
1773 | formatting_tags: Optional[List[str]] = None,
1774 | ignore_whitespace: bool = True,
1775 | include_css: bool = True,
1776 | add_navigation: bool = True,
1777 | output_format: str = "html",
1778 | use_tempfiles: bool = False,
1779 | run_tidy: bool = False,
1780 | generate_markdown: bool = False,
1781 | markdown_path: str = "detected_redline_differences.md",
1782 | ) -> Dict[str, Any]:
1783 | """Generate a redline HTML comparing two HTML documents."""
1784 | global _id_counter # <-- Declare global at the top of the function scope
1785 | t0 = time.time()
1786 | logger.info("Starting HTML redline generation...")
1787 |
1788 | # --- Input Validation ---
1789 | if not original_html or not isinstance(original_html, str):
1790 | raise ToolInputError("original_html required")
1791 | if not modified_html or not isinstance(modified_html, str):
1792 | raise ToolInputError("modified_html required")
1793 | if output_format not in {"html", "fragment"}:
1794 | raise ToolInputError("output_format must be 'html' | 'fragment'")
1795 |
1796 | # --- Initialization for variables used outside try ---
1797 | orig_tree: Optional[_ElementTree] = None
1798 | mod_tree: Optional[_ElementTree] = None
1799 | original_tree_pristine: Optional[_ElementTree] = None
1800 | modified_tree_pristine: Optional[_ElementTree] = None
1801 |
1802 | logger.debug("Preprocessing HTML documents...")
1803 | try:
1804 | # --- Preprocessing ---
1805 | orig_root, mod_root = _preprocess_html_docs(
1806 | original_html,
1807 | modified_html,
1808 | ignore_whitespace=ignore_whitespace,
1809 | use_tempfiles=use_tempfiles,
1810 | run_tidy=run_tidy,
1811 | )
1812 | if orig_root is None or mod_root is None:
1813 | raise ToolInputError("Preprocessing failed to return root elements.")
1814 |
1815 | # --- Reset Counter & Inject IDs ---
1816 | logger.debug("Resetting ID counter and injecting synthetic IDs...")
1817 | _id_counter = itertools.count(1) # Reset counter here
1818 | _inject_synthetic_ids(orig_root) # Inject into original root
1819 | _inject_synthetic_ids(mod_root) # Inject into modified root
1820 | logger.debug("Synthetic ID injection complete.")
1821 |
1822 | # --- Create Trees & Pristine Copies ---
1823 | orig_tree = etree.ElementTree(orig_root)
1824 | mod_tree = etree.ElementTree(mod_root)
1825 |
1826 | logger.debug("Creating pristine copies with IDs for formatter...")
1827 | original_tree_pristine = deepcopy(orig_tree)
1828 | modified_tree_pristine = deepcopy(mod_tree)
1829 | logger.debug("Pristine copies created.")
1830 |
1831 | except Exception as e:
1832 | logger.exception("Preprocessing, ID injection, or copying failed.")
1833 | # Ensure pristine trees are None if we failed before creating them
1834 | original_tree_pristine = None
1835 | modified_tree_pristine = None
1836 | raise ToolInputError("Failed HTML preparation") from e
1837 |
1838 | # --- Check if pristine copies were successfully created before proceeding ---
1839 | if original_tree_pristine is None or modified_tree_pristine is None:
1840 | # This case should ideally be caught by the exception above,
1841 | # but it's good practice to check.
1842 | logger.error("Pristine trees for diffing are missing after preparation step.")
1843 | return {
1844 | "redline_html": "<!-- Error: Failed to prepare documents for diffing -->",
1845 | "stats": {"error": "Document preparation failed"},
1846 | "processing_time": time.time() - t0,
1847 | "success": False,
1848 | }
1849 |
1850 |
1851 | # --- Diff Actions (Using Synthetic IDs) ---
1852 | logger.debug("Calculating differences using xmldiff with synthetic IDs...")
1853 | # Options for the xmldiff Differ class constructor
1854 | differ_opts: Dict[str, Any] = {
1855 | "ratio_mode": "accurate",
1856 | "fast_match": False,
1857 | "F": 0.6,
1858 | "uniqueattrs": ["data-diff-id"],
1859 | }
1860 |
1861 | actions: List[Any] = []
1862 | stats: Dict[str, Any] = {}
1863 | markdown_summary = ""
1864 | annotated_tree: Optional[_ElementTree] = None
1865 |
1866 | # --- Debug block: dump ID-injected trees for inspection ---
1867 | try:
1868 | debug_orig_path = "debug_orig_tree_with_ids.xml"
1869 | debug_mod_path = "debug_mod_tree_with_ids.xml"
1870 | with open(debug_orig_path, "wb") as f:
1871 | orig_tree.write(f, pretty_print=True, encoding='utf-8', xml_declaration=True)
1872 | with open(debug_mod_path, "wb") as f:
1873 | mod_tree.write(f, pretty_print=True, encoding='utf-8', xml_declaration=True)
1874 | logger.info(f"Debug trees with IDs written to {debug_orig_path} and {debug_mod_path}")
1875 | except Exception as dbg_e:
1876 | logger.warning(f"Failed to write debug trees: {dbg_e}")
1877 | # --- End of debug block ---
1878 |
1879 | try:
1880 | # Pass the trees WITH IDs to the diff engine
1881 | # Use the main trees (orig_tree, mod_tree) for diffing
1882 | # as they have the structure and IDs needed for diff calculation.
1883 | actions = main.diff_trees(
1884 | orig_tree, # Use the tree derived directly from preprocessing + ID injection
1885 | mod_tree, # Use the tree derived directly from preprocessing + ID injection
1886 | diff_options=differ_opts,
1887 | )
1888 | logger.info(f"xmldiff generated {len(actions)} actions using synthetic IDs.")
1889 |
1890 | # --- Fuzzy move augmentation (Delete+Insert -> MoveNode) ---
1891 | try:
1892 | actions = _augment_actions_with_fuzzy_moves(
1893 | actions,
1894 | original_tree_pristine,
1895 | modified_tree_pristine,
1896 | threshold=_MOVE_SIM_THRESHOLD,
1897 | min_tokens=_MOVE_MIN_TOKENS,
1898 | min_chars=_MOVE_MIN_CHARS,
1899 | max_pairs=_MOVE_MAX_CANDIDATES,
1900 | )
1901 | logger.info(
1902 | f"After FuzzyMove: {len(actions)} actions (moves inflated, deletes/inserts deflated)."
1903 | )
1904 | except Exception as e:
1905 | logger.exception(f"Fuzzy move augmentation failed: {e}")
1906 |
1907 | # Debug: Log first few actions
1908 | if actions:
1909 | logger.debug(f"First 5 actions generated: {actions[:5]}")
1910 | else:
1911 | logger.warning("xmldiff generated NO actions.")
1912 |
1913 | # Check insert/delete ratio
1914 | insert_delete_ratio = (sum(1 for a in actions if isinstance(a, (InsertNode, DeleteNode))) / len(actions)) if actions else 0
1915 | if insert_delete_ratio > 0.9:
1916 | logger.warning(f"High ratio ({insert_delete_ratio:.2f}) of Insert/Delete actions. Node matching via data-diff-id might have failed.")
1917 |
1918 | # --- Generate Markdown Summary (if requested) ---
1919 | if generate_markdown and actions:
1920 | logger.debug("Generating Markdown summary...")
1921 | # Pass the pristine copies (which include IDs for XPath lookup within the summary generation)
1922 | markdown_summary = _generate_markdown_summary(
1923 | orig_doc=original_tree_pristine, mod_doc=modified_tree_pristine, actions=actions
1924 | )
1925 | # (Consider where to save/return markdown_path content if needed)
1926 |
1927 | # --- Apply Actions using Formatter ---
1928 | logger.debug("Applying actions using RedlineXMLFormatter...")
1929 | formatter = RedlineXMLFormatter(
1930 | detect_moves=detect_moves,
1931 | normalize=formatting.WS_BOTH if ignore_whitespace else formatting.WS_NONE,
1932 | )
1933 | # Pass the pristine copies WITH IDs to the formatter.
1934 | # The formatter needs the original pristine tree (with IDs) to find nodes
1935 | # referenced by actions, and it works on a *copy* of this pristine tree.
1936 | # It also needs the modified pristine tree for lookups (e.g., finding move targets).
1937 | annotated_tree = formatter.format(actions, original_tree_pristine, modified_tree_pristine)
1938 | stats = formatter.processed_actions
1939 | logger.debug(f"Formatting complete. Stats: {stats}")
1940 |
1941 | except ToolError as te:
1942 | logger.error(f"Diff engine error: {te}", exc_info=True)
1943 | return {
1944 | "redline_html": f"<!-- Diff Engine Error: {html_stdlib.escape(str(te))} -->",
1945 | "stats": {"error": str(te)},
1946 | "processing_time": time.time() - t0,
1947 | "success": False,
1948 | }
1949 | except Exception as e:
1950 | logger.exception("Error during diff/formatting.")
1951 | return {
1952 | "redline_html": "<!-- Error during diff/formatting -->",
1953 | "stats": {"error": str(e)},
1954 | "processing_time": time.time() - t0,
1955 | "success": False,
1956 | }
1957 | finally:
1958 | # Clear potentially large list to free memory
1959 | actions.clear()
1960 | # Explicitly None out large trees if possible (though garbage collection should handle this)
1961 | orig_tree = mod_tree = original_tree_pristine = modified_tree_pristine = None
1962 |
1963 |
1964 | # --- Remove Synthetic IDs from Final Output ---
1965 | if annotated_tree is not None:
1966 | logger.debug("Removing synthetic IDs from the final annotated tree...")
1967 | count_removed = 0
1968 | for el in annotated_tree.iter():
1969 | if isinstance(el, _Element) and el.attrib.pop("data-diff-id", None):
1970 | count_removed += 1
1971 | logger.debug(f"Removed {count_removed} synthetic IDs from final output.")
1972 | else:
1973 | logger.error("Annotated tree is None after formatting.")
1974 | # Handle this case - perhaps return an error
1975 | return {
1976 | "redline_html": "<!-- Error: Formatting produced no result -->",
1977 | "stats": stats if stats else {"error": "Formatting failed"},
1978 | "processing_time": time.time() - t0,
1979 | "success": False,
1980 | }
1981 |
1982 |
1983 | # --- Apply XSLT ---
1984 | logger.debug("Applying revised XSLT transformation...")
1985 | redline_html = "<!-- XSLT Transformation Failed -->"
1986 | # (Keep existing XSLT logic, ensuring annotated_tree is checked)
1987 | try:
1988 | xslt_root = etree.fromstring(_XMLDIFF_XSLT_REVISED.encode())
1989 | transform = etree.XSLT(xslt_root)
1990 | redline_doc = transform(annotated_tree)
1991 | if redline_doc.getroot() is not None:
1992 | redline_html = etree.tostring(
1993 | redline_doc, encoding="unicode", method="html", pretty_print=False
1994 | )
1995 | logger.debug("XSLT transformation successful.")
1996 | else:
1997 | logger.error("XSLT transformation resulted in an empty document.")
1998 | redline_html = "<!-- XSLT empty result -->"
1999 | except Exception as e:
2000 | logger.exception("XSLT transformation failed.")
2001 | redline_html = f"<!-- XSLT Error: {html_stdlib.escape(str(e))} -->"
2002 |
2003 | # --- Post-processing ---
2004 | logger.debug("Post-processing HTML output...")
2005 | final_redline_html = await _postprocess_redline(
2006 | redline_html,
2007 | include_css=include_css,
2008 | add_navigation=add_navigation,
2009 | output_format=output_format,
2010 | )
2011 | logger.debug("Post-processing complete.")
2012 |
2013 | # --- Final Result ---
2014 | dt = time.time() - t0
2015 | success_flag = (
2016 | stats.get("errors", 0) == 0
2017 | and "<!-- XSLT" not in redline_html # Check for XSLT error comments
2018 | )
2019 | result: Dict[str, Any] = {"stats": stats, "processing_time": dt, "success": success_flag}
2020 |
2021 | # Handle large output
2022 | size_bytes = len(final_redline_html.encode("utf-8", errors="ignore"))
2023 | logger.info(f"Generated redline HTML size: {size_bytes / 1024:.2f} KB")
2024 | if size_bytes > 10_000_000: # Example limit: 10MB
2025 | logger.warning(f"Redline HTML size ({size_bytes} bytes) exceeds limit, encoding Base64.")
2026 | try:
2027 | result["redline_html_base64"] = base64.b64encode(
2028 | final_redline_html.encode("utf-8")
2029 | ).decode("ascii")
2030 | result["output_is_base64"] = True # Add flag
2031 | except Exception as e:
2032 | logger.error(f"Base64 encoding failed: {e}")
2033 | result["redline_html"] = "<!-- Error: Output too large & Base64 failed -->"
2034 | result["success"] = False
2035 | # Avoid keeping large string in memory if encoded
2036 | del final_redline_html
2037 | else:
2038 | result["redline_html"] = final_redline_html
2039 | result["output_is_base64"] = False
2040 |
2041 | if generate_markdown:
2042 | result["markdown_summary"] = markdown_summary
2043 | if markdown_path:
2044 | result["markdown_path"] = str(Path(markdown_path).resolve()) # Example of returning path
2045 |
2046 | logger.info(
2047 | f"HTML redline generation finished in {dt:.3f} seconds. Success: {result['success']}"
2048 | )
2049 | return result
2050 |
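# A minimal async driving sketch for the public tool above, with hypothetical
# inputs. Assumes the metrics/error-handling decorators pass the call through
# unchanged outside the server context; run via asyncio.run(...).
async def _demo_create_html_redline() -> None:
    """Sketch: diff two tiny documents and inspect the result envelope."""
    res = await create_html_redline(
        "<html><body><p>old text</p></body></html>",
        "<html><body><p>new text</p></body></html>",
        generate_markdown=True,
    )
    print(res.get("success"), res.get("stats"))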
2051 |
2052 | # ─────────────────────────────────────────────────────────────────────────────
2053 | # Pre‑processing helpers
2054 | # ─────────────────────────────────────────────────────────────────────────────
2055 | def _check_tidy_available():
2056 | try:
2057 | res = subprocess.run(
2058 | ["tidy", "--version"], capture_output=True, timeout=1, check=False, text=True
2059 | )
2060 | return res.returncode == 0 and "HTML Tidy" in res.stdout
2061 | except Exception:
2062 | return False
2063 |
2064 |
2065 | def _run_html_tidy(html: str) -> str:
2066 | tidied_html = html
2067 | with tempfile.TemporaryDirectory() as td:
2068 | infile = Path(td, "input.html")
2069 | infile.write_text(html, encoding="utf-8")
2070 | cmd = [
2071 | "tidy",
2072 | "-q",
2073 | "-m",
2074 | "--tidy-mark",
2075 | "no",
2076 | "--drop-empty-elements",
2077 | "no",
2078 | "--wrap",
2079 | "0",
2080 | "--show-warnings",
2081 | "no",
2082 | "--show-errors",
2083 | "0",
2084 | "--force-output",
2085 | "yes",
2086 | "-utf8",
2087 | str(infile),
2088 | ]
2089 | try:
2090 | res = subprocess.run(cmd, capture_output=True, text=True, timeout=15, check=False) # noqa: F841
2091 | tidied_html = infile.read_text(encoding="utf-8")
2092 | except Exception as e:
2093 | logger.error(f"Tidy failed: {e}")
2094 | return tidied_html
2095 |
2096 |
2097 | def _normalize_tree_whitespace(root: _Element) -> None:
2098 | """Normalizes whitespace in .text and .tail for all elements in the tree in-place."""
2099 | if root is None:
2100 | return
2101 | # Iterate through all elements AND comments/PIs (which can have tails)
2102 | for node in root.xpath('. | .//node()'):
2103 | # Check if it's an element or something else that can have text/tail
2104 | if hasattr(node, 'text'):
2105 | node.text = _normalize_text(node.text) or None # Use None if empty after normalize
2106 | if hasattr(node, 'tail'):
2107 | node.tail = _normalize_text(node.tail) or None # Use None if empty after normalize
2108 |
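# A minimal sketch of the in-place normalization above: runs of whitespace in
# .text/.tail collapse via the module's _normalize_text helper, so trees built
# from differently-spaced markup compare equal. Input is hypothetical.
def _demo_normalize_tree_whitespace() -> None:
    """Sketch: collapse whitespace in a small parsed fragment."""
    root = lxml_html.fromstring("<div><p>  a \n  b  </p>  tail   text </div>")
    _normalize_tree_whitespace(root)
    print(repr(root[0].text), repr(root[0].tail))  # e.g. 'a b' and 'tail text'
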
2109 | def _deduplicate_body(root: _Element) -> None:
2110 | """If an <html> element has more than one <body>, merge children
2111 | into the first and delete the rest. Modifies the tree in-place."""
2112 | if root is None or root.tag.lower() != 'html':
2113 | # Only operate on the root <html> element
2114 | return
2115 |
2116 | bodies = root.xpath('./body | ./BODY') # covers both casings; lxml's HTML parser lowercases tags anyway
2117 | if len(bodies) <= 1:
2118 | return # Nothing to do
2119 |
2120 | logger.warning(f"Found {len(bodies)} <body> elements; merging into the first.")
2121 | main_body = bodies[0]
2122 | parent = main_body.getparent() # Should be the <html> tag
2123 | if parent is None:
2124 | return # Should not happen
2125 |
2126 | for i, extra_body in enumerate(bodies[1:], start=1):
2127 | # Move children
2128 | for child in list(extra_body): # Iterate over a list copy
2129 | main_body.append(child) # Append moves the child
2130 |
2131 | # Append tail text if any
2132 | if extra_body.tail:
2133 | # Find the last element in main_body to append the tail to,
2134 | # or append to main_body's text if it's empty
2135 | last_element = main_body[-1] if len(main_body) > 0 else None
2136 | if last_element is not None:
2137 | if last_element.tail:
2138 | last_element.tail = (last_element.tail or "") + (extra_body.tail or "")
2139 | else:
2140 | last_element.tail = extra_body.tail
2141 | else: # If main_body has no children, append to its text
2142 | main_body.text = (main_body.text or "") + (extra_body.tail or "")
2143 |
2144 |
2145 | # Remove the now-empty extra body
2146 | try:
2147 | parent.remove(extra_body)
2148 | except ValueError:
2149 | logger.error(f"Could not remove extra body #{i+1}, already removed?")
2150 |
2151 | logger.debug("Finished merging duplicate <body> elements.")
2152 |
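# A minimal sketch of the merge above, using etree (not the HTML parser, which
# would itself collapse duplicate bodies during parsing). Input is hypothetical.
def _demo_deduplicate_body() -> None:
    """Sketch: merge two <body> elements into one, preserving children."""
    root = etree.fromstring(
        "<html><body><p>one</p></body><body><p>two</p></body></html>"
    )
    _deduplicate_body(root)
    print(len(root.xpath("./body")), [p.text for p in root.xpath(".//p")])
    # expected: 1 ['one', 'two']
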
2153 | def _preprocess_html_docs(
2154 | original_html: str,
2155 | modified_html: str,
2156 | *,
2157 | ignore_whitespace: bool = True, # normalization itself is applied after parsing (step 3 below)
2158 | use_tempfiles: bool = False,
2159 | run_tidy: bool = False,
2160 | ) -> Tuple[_Element, _Element]:
2161 | """Preprocesses HTML, including optional Tidy and robust whitespace normalization."""
2162 |
2163 | if not original_html.strip():
2164 | original_html = "<html><body><p>Empty Document</p></body></html>" # Provide some structure
2165 | if not modified_html.strip():
2166 | modified_html = "<html><body><p>Empty Document</p></body></html>" # Provide some structure
2167 |
2168 | tidied_orig, tidied_mod = original_html, modified_html
2169 |
2170 | # 1. Optional Tidy (Run *before* parsing)
2171 | if run_tidy:
2172 | logger.debug("Running HTML Tidy...")
2173 | if _check_tidy_available():
2174 | try:
2175 | tidied_orig = _run_html_tidy(original_html)
2176 | tidied_mod = _run_html_tidy(modified_html)
2177 | logger.debug("HTML Tidy completed.")
2178 | except Exception as e:
2179 | logger.warning(f"HTML Tidy failed: {e}. Proceeding without Tidy.")
2180 | else:
2181 | logger.warning("HTML Tidy requested but not available. Skipping.")
2182 |
2183 | # 2. Parse HTML (Crucially, DO NOT remove blank text here initially)
2184 | logger.debug("Parsing HTML documents with lxml...")
2185 | parser = lxml_html.HTMLParser(
2186 | recover=True,
2187 | encoding="utf-8",
2188 | remove_comments=False, # Keep comments, they can affect structure/diff
2189 | remove_pis=False, # Keep processing instructions
2190 | remove_blank_text=False, # IMPORTANT: Keep blank text for now
2191 | )
2192 | o_root: Optional[_Element] = None
2193 | m_root: Optional[_Element] = None
2194 | try:
2195 | # Use memory parsing unless very large docs require temp files
2196 | if use_tempfiles and (len(tidied_orig) > 5e6 or len(tidied_mod) > 5e6): # 5MB limit example
2197 | logger.debug("Using temporary files for parsing large documents.")
2198 | with tempfile.TemporaryDirectory() as td:
2199 | orig_p = Path(td, "orig.html")
2200 | mod_p = Path(td, "mod.html")
2201 | orig_p.write_text(tidied_orig, encoding="utf-8")
2202 | mod_p.write_text(tidied_mod, encoding="utf-8")
2203 | o_root = lxml_html.parse(str(orig_p), parser=parser).getroot()
2204 | m_root = lxml_html.parse(str(mod_p), parser=parser).getroot()
2205 | else:
2206 | # Ensure bytes for fromstring
2207 | o_root = lxml_html.fromstring(tidied_orig.encode("utf-8"), parser=parser)
2208 | m_root = lxml_html.fromstring(tidied_mod.encode("utf-8"), parser=parser)
2209 |
2210 | if o_root is None or m_root is None:
2211 | raise ToolInputError("HTML parsing yielded None root element(s).")
2212 | logger.debug("HTML parsing successful.")
2213 |
2214 | except Exception as e:
2215 | logger.exception(f"HTML parsing failed: {e}")
2216 | raise ToolInputError("Failed HTML parsing.") from e
2217 |
2218 | # 3. Normalize Whitespace (Apply *after* parsing)
2219 | if ignore_whitespace:
2220 | logger.debug("Normalizing whitespace in parsed trees...")
2221 | try:
2222 | _normalize_tree_whitespace(o_root)
2223 | _normalize_tree_whitespace(m_root)
2224 | logger.debug("Whitespace normalization complete.")
2225 | except Exception as e:
2226 | logger.exception("Whitespace normalization failed.")
2227 | raise ToolInputError("Failed whitespace normalization during preprocessing.") from e
2228 |
2229 | # 4. Deduplicate Body Tags (apply *after* normalization)
2230 | logger.debug("Checking for and merging duplicate <body> tags...")
2231 | try:
2232 | _deduplicate_body(o_root)
2233 | _deduplicate_body(m_root)
2234 | logger.debug("Duplicate <body> tag check complete.")
2235 | except Exception as e:
2236 | logger.exception("Failed during <body> deduplication.")
2237 | # Decide whether to raise or just warn
2238 | raise ToolInputError("Failed <body> tag deduplication during preprocessing.") from e
2239 |
2240 | # The roots returned now have a consistent whitespace representation
2241 | # AND guaranteed single <body> element (if originally within <html>)
2242 | return o_root, m_root
2243 |
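# A minimal sketch of the preprocessing contract above: both roots come back
# parsed, whitespace-normalized, and with a single <body> each. Inputs are
# hypothetical; Tidy and temp-file parsing are left at their defaults.
def _demo_preprocess_html_docs() -> None:
    """Sketch: preprocess two small documents and inspect the roots."""
    o_root, m_root = _preprocess_html_docs(
        "<html><body><p>alpha   beta</p></body></html>",
        "<html><body><p>alpha gamma</p></body></html>",
        ignore_whitespace=True,
    )
    print(o_root.tag, len(o_root.xpath("./body")))  # expected: html 1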
2244 |
2245 | # ─────────────────────────────────────────────────────────────────────────────
2246 | # Post‑processing (CSS / nav UI)
2247 | # ─────────────────────────────────────────────────────────────────────────────
2248 | async def _postprocess_redline(
2249 | redline_html: str,
2250 | *,
2251 | include_css: bool = True,
2252 | add_navigation: bool = True,
2253 | output_format: str = "html",
2254 | ) -> str:
2255 | if not redline_html or not redline_html.strip():
2256 | return "<!-- Empty output -->"
2257 | soup = BeautifulSoup(redline_html, "html.parser")
2258 | if not soup.find("html", recursive=False):
2259 | new_soup = BeautifulSoup(
2260 | "<!DOCTYPE html><html><head><title>Comparison</title></head><body></body></html>",
2261 | "html.parser",
2262 | )
2263 | if new_soup.body:
2264 | [
2265 | new_soup.body.append(deepcopy(el))
2266 | for el in soup.contents
2267 | if isinstance(el, Tag) or (isinstance(el, NavigableString) and el.strip())
2268 | ]
2269 | soup = new_soup
2270 | html_tag = soup.html
2271 | head = soup.head
2272 | body = soup.body
2273 | if not head:
2274 | head = soup.new_tag("head")
2275 | head.append(soup.new_tag("title", string="Comparison"))
2276 | html_tag.insert(0, head)
2277 | if not body:
2278 | body = soup.new_tag("body")
2279 | # head exists by this point; placing <body> right after it covers both cases
2280 | head.insert_after(body)
2281 | if not head.find("meta", attrs={"name": "viewport"}):
2282 | head.insert(
2283 | 0,
2284 | soup.new_tag(
2285 | "meta",
2286 | attrs={"name": "viewport", "content": "width=device-width, initial-scale=1.0"},
2287 | ),
2288 | )
2289 | if not head.find("style", attrs={"data-base-diff": "1"}):
2290 | head.append(BeautifulSoup(_get_base_diff_css(), "html.parser"))
2291 | if include_css and not head.find("script", src=lambda s: s and "cdn.tailwindcss.com" in s):
2292 | head.append(
2293 | BeautifulSoup('<script src="https://cdn.tailwindcss.com"></script>', "html.parser")
2294 | )
2295 | if not head.find("link", href=lambda x: x and "fonts.googleapis.com" in x):
2296 | [
2297 | head.append(
2298 | BeautifulSoup(f'<link rel="preconnect" href="https://{u}"{a}>', "html.parser")
2299 | )
2300 | for u, a in [("fonts.googleapis.com", ""), ("fonts.gstatic.com", " crossorigin")]
2301 | ]
2302 | head.append(
2303 | BeautifulSoup(
2304 | '<link href="https://fonts.googleapis.com/css2?family=Newsreader:ital,opsz,wght@0,6..72,200..800;1,6..72,200..800&display=swap" rel="stylesheet">',
2305 | "html.parser",
2306 | )
2307 | )
2308 | style_tag = soup.new_tag("style", type="text/tailwindcss")
2309 | style_tag.string = _get_tailwind_css()
2310 | head.append(style_tag)
2311 | if add_navigation and output_format == "html":
2312 | if not body.find("div", class_="redline-minimap"):
2313 | body.append(
2314 | BeautifulSoup(
2315 | """<div class="redline-minimap fixed right-1 top-10 bottom-10 w-1 bg-gray-100 dark:bg-gray-800 rounded z-40 hidden md:flex flex-col"></div>""",
2316 | "html.parser",
2317 | )
2318 | )
2319 | if not body.find("div", class_="redline-navigation"):
2320 | body.insert(
2321 | 0,
2322 | BeautifulSoup(
2323 | """<div class="redline-navigation fixed top-2 right-2 bg-white/90 dark:bg-gray-800/90 p-2 rounded-lg shadow-lg z-50 text-xs backdrop-blur-sm"><div class="flex items-center"><button class="btn" onclick="goPrevChange()">Prev</button><button class="btn" onclick="goNextChange()">Next</button><span class="ml-2 px-2 py-1 bg-gray-100 dark:bg-gray-700 rounded" id="change-counter">-/-</span></div></div>""".replace(
2324 | 'class="btn"',
2325 | 'class="bg-gray-100 hover:bg-gray-200 dark:bg-gray-700 dark:hover:bg-gray-600 px-2 py-1 rounded mr-1 transition"',
2326 | )
2327 | .replace(
2328 | "Prev",
2329 | '<span class="hidden sm:inline">Previous</span><span class="sm:hidden">←</span>',
2330 | )
2331 | .replace(
2332 | "Next",
2333 | '<span class="hidden sm:inline">Next</span><span class="sm:hidden">→</span>',
2334 | ),
2335 | "html.parser",
2336 | ),
2337 | )
2338 | if not body.find("div", class_="redline-legend"):
2339 | body.append(
2340 | BeautifulSoup(
2341 | """<div class="redline-legend fixed bottom-2 left-2 bg-white/90 dark:bg-gray-800/90 p-2 rounded-lg shadow-lg z-50 text-xs flex flex-wrap gap-2 backdrop-blur-sm"><span class="legend-item"><span class="legend-color bg-blue-100 ring-blue-300 dark:bg-blue-900/60 dark:ring-blue-700"></span>Insert</span><span class="legend-item"><span class="legend-color bg-rose-100 ring-rose-300 dark:bg-rose-900/60 dark:ring-rose-700"></span>Delete</span><span class="legend-item"><span class="legend-color bg-emerald-100 ring-emerald-300 dark:bg-emerald-900/60 dark:ring-emerald-700"></span>Move</span><span class="legend-item"><span class="legend-color bg-orange-100 ring-orange-300 dark:bg-orange-900/60 dark:ring-orange-700"></span>Attr</span></div>""".replace(
2342 | 'class="legend-item"', 'class="flex items-center"'
2343 | ).replace(
2344 | 'class="legend-color', 'class="inline-block w-3 h-3 rounded ring-1 mr-1'
2345 | ),
2346 | "html.parser",
2347 | )
2348 | )
2349 | if not body.find("button", id="theme-toggle"):
2350 | body.insert(
2351 | 1,
2352 | BeautifulSoup(
2353 | """<button id="theme-toggle" title="Toggle theme" class="fixed top-2 left-2 z-50 p-2 bg-white dark:bg-gray-800 rounded-lg shadow-lg text-xs"><svg class="h-4 w-4 hidden dark:inline" fill="none" viewBox="0 0 24 24" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 3v1m0 16v1m9-9h-1M4 12H3m15.364 6.364l-.707-.707M6.343 6.343l-.707-.707m12.728 0l-.707.707M6.343 17.657l-.707.707M16 12a4 4 0 11-8 0 4 4 0 018 0z"/></svg><svg class="h-4 w-4 dark:hidden" fill="none" viewBox="0 0 24 24" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M20.354 15.354A9 9 0 018.646 3.646 9.003 9.003 0 0012 21a9.003 9.003 0 008.354-5.646z"/></svg></button>""",
2354 | "html.parser",
2355 | ),
2356 | )
2357 | if not body.find("script", string=lambda s: s and "goNextChange" in s):
2358 | script_tag = soup.new_tag("script")
2359 | script_tag.string = _get_navigation_js()
2360 | body.append(script_tag)
2361 | body_classes = body.get("class", [])
2362 | req_cls = [
2363 | "font-['Newsreader']",
2364 | "dark:text-gray-200",
2365 | "dark:bg-gray-900",
2366 | "transition-colors",
2367 | "duration-200",
2368 | ]
2369 | [body_classes.append(c) for c in req_cls if c not in body_classes]
2370 | body["class"] = body_classes
2371 | if output_format == "html" and not any(
2372 | isinstance(c, Tag) and "prose" in c.get("class", []) for c in body.contents
2373 | ):
2374 | wrapper = soup.new_tag(
2375 | "div", **{"class": "prose lg:prose-xl dark:prose-invert mx-auto max-w-4xl px-4 py-8"}
2376 | )
2377 | ui_els = body.select(
2378 | # Tag.select() accepts no 'recursive' kwarg; these UI nodes sit directly under <body>
2379 | ".redline-navigation, .redline-legend, .redline-minimap, #theme-toggle, script",
2380 | )
2381 | content = [el for el in body.contents if el not in ui_els]
2382 | [
2383 | wrapper.append(el.extract())
2384 | for el in content
2385 | if isinstance(el, Tag) or (isinstance(el, NavigableString) and el.strip())
2386 | ]
2387 | body.append(wrapper)
2388 | final_html = body.decode_contents() if output_format == "fragment" else str(soup)
2389 | logger.debug("HTML postprocessing finished.")
2390 | return final_html
2391 |
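# A minimal async sketch of the post-processor above with a hypothetical
# fragment input. In 'fragment' mode only the body contents are returned; the
# navigation/minimap UI is injected only for full 'html' output.
async def _demo_postprocess_fragment() -> None:
    """Sketch: post-process a tiny redline snippet in fragment mode."""
    frag = await _postprocess_redline(
        '<p><ins class="diff-insert">new</ins> text</p>',
        include_css=False,
        output_format="fragment",
    )
    print(frag[:120])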
2392 |
2393 | def _get_base_diff_css() -> str:
2394 | # (Copied from previous correct version)
2395 | return """
2396 | <style data-base-diff="1">
2397 | ins.diff-insert, .diff-move-target {color:#1d4ed8; background-color:#eff6ff; border:1px solid #93c5fd; padding:0 1px; margin:0 1px; border-radius:2px; text-decoration:none;}
2398 | del.diff-delete, .diff-move-source {color:#b91c1c; background-color:#fef2f2; border:1px solid #fca5a5; padding:0 1px; margin:0 1px; border-radius:2px; text-decoration:line-through;}
2399 | ins.diff-move-target {color:#047857; background-color:#ecfdf5; border:1px solid #6ee7b7;}
2400 | del.diff-move-source {color:#065f46; background-color:#ecfdf599; border:1px dashed #6ee7b7; }
2401 | span.diff-update-container > * {border-bottom: 1px dotted #f97316;}
2402 | span.diff-attrib-change > * {box-shadow: 0px 0px 0px 1px #fb923c inset; }
2403 | span.diff-rename-node > * {box-shadow: 0px 0px 0px 1px #a855f7 inset; }
2404 | ins.diff-insert-text {color:#1e40af; text-decoration:underline; background:transparent; border:none; padding:0; margin:0;}
2405 | del.diff-delete-text {color:#b91c1c; text-decoration:line-through; background:transparent; border:none; padding:0; margin:0;}
2406 | /* Basic dark mode */
2407 | @media (prefers-color-scheme: dark) {
2408 | body { background-color: #1f2937; color: #d1d5db; }
2409 | ins.diff-insert, .diff-move-target { color: #93c5fd; background-color: #1e3a8a; border-color: #3b82f6; }
2410 | del.diff-delete, .diff-move-source { color: #fca5a5; background-color: #7f1d1d; border-color: #ef4444; }
2411 | ins.diff-move-target { color: #6ee7b7; background-color: #065f46; border-color: #10b981; }
2412 | del.diff-move-source { color: #a7f3d0; background-color: #064e3b; border-color: #34d399; }
2413 | span.diff-update-container > * { border-color: #fb923c; }
2414 | span.diff-attrib-change > * { box-shadow: 0px 0px 0px 1px #f97316 inset; }
2415 | span.diff-rename-node > * { box-shadow: 0px 0px 0px 1px #c084fc inset; }
2416 | ins.diff-insert-text {color:#60a5fa;}
2417 | del.diff-delete-text {color:#f87171;}
2418 | }
2419 | </style>
2420 | """
2421 |
2422 | def _get_tailwind_css() -> str:
2423 | return """ @tailwind base;@tailwind components;@tailwind utilities; @layer components { .diff-insert, .diff-delete, .diff-move-target, .diff-move-source { @apply px-0.5 rounded-sm mx-[1px] transition duration-150; } ins.diff-insert, .diff-insert > ins { @apply text-blue-800 bg-blue-50 ring-1 ring-inset ring-blue-300/60 no-underline; } .dark ins.diff-insert, .dark .diff-insert > ins { @apply text-blue-200 bg-blue-900/40 ring-blue-500/30; } ins.diff-insert:hover, .diff-insert > ins:hover { @apply ring-2 ring-offset-1 ring-black/10 shadow-sm bg-blue-100 dark:bg-blue-800/60; } del.diff-delete, .diff-delete > del { @apply text-rose-800 bg-rose-50 ring-1 ring-inset ring-rose-300/60 line-through; } .dark del.diff-delete, .dark .diff-delete > del { @apply text-rose-200 bg-rose-900/40 ring-rose-500/30; } del.diff-delete:hover, .diff-delete > del:hover { @apply ring-2 ring-offset-1 ring-black/10 shadow-sm bg-rose-100 dark:bg-rose-800/60; } ins.diff-move-target, .diff-move-target > ins { @apply text-emerald-900 bg-emerald-50 ring-1 ring-emerald-400/60 no-underline border border-emerald-300; } .dark ins.diff-move-target, .dark .diff-move-target > ins { @apply text-emerald-200 bg-emerald-900/40 ring-emerald-500/30 border-emerald-700; } ins.diff-move-target:hover, .diff-move-target > ins:hover { @apply ring-2 ring-offset-1 ring-black/10 shadow-sm bg-emerald-100 dark:bg-emerald-800/60; } del.diff-move-source, .diff-move-source > del { @apply text-emerald-800/60 bg-emerald-50/50 line-through border border-dashed border-emerald-400/40; } .dark del.diff-move-source, .dark .diff-move-source > del { @apply text-emerald-300/60 bg-emerald-900/30 border-emerald-700/40; } del.diff-move-source:hover, .diff-move-source > del:hover { @apply bg-emerald-100/70 border-emerald-400 shadow-sm dark:bg-emerald-800/50; } span.diff-update-container { @apply border-b border-dotted border-orange-400 bg-orange-50/30; } .dark span.diff-update-container { @apply border-orange-500 bg-orange-900/30; } span.diff-update-container:hover { @apply bg-orange-100/50 dark:bg-orange-800/40; } span.diff-attrib-change { @apply ring-1 ring-orange-400/50 ring-inset bg-orange-50/30 backdrop-blur-sm rounded-sm; } .dark span.diff-attrib-change { @apply ring-orange-500/50 bg-orange-900/30; } span.diff-attrib-change:hover { @apply bg-orange-100/50 dark:bg-orange-800/40; } span.diff-rename-node { @apply ring-1 ring-purple-400/50 ring-inset bg-violet-50/30 backdrop-blur-sm rounded-sm; } .dark span.diff-rename-node { @apply ring-purple-500/50 bg-violet-900/30; } span.diff-rename-node:hover { @apply bg-violet-100/50 dark:bg-violet-800/40; } ins.diff-insert-text { @apply text-blue-700 dark:text-blue-300 underline decoration-dotted decoration-1 underline-offset-2 bg-transparent border-none ring-0 p-0 m-0; } del.diff-delete-text { @apply text-rose-700 dark:text-rose-300 line-through decoration-dotted decoration-1 bg-transparent border-none ring-0 p-0 m-0; } @media print { .redline-navigation, .redline-legend, .redline-minimap, #theme-toggle { @apply hidden; } ins, del, span[class*="diff-"] { @apply text-black !important; background-color: transparent !important; border: none !important; ring: none !important; box-shadow: none !important; } ins { @apply font-bold no-underline; } del { @apply italic line-through; } } } """
2424 |
2425 |
2426 | def _get_navigation_js() -> str:
2427 | return """ /* Combined JS */ document.addEventListener('DOMContentLoaded', () => { let _redlineChanges = null; let _changeIdx = -1; let _currentHi = null; const changeCounter = document.getElementById('change-counter'); const minimap = document.querySelector('.redline-minimap'); const container = document.querySelector('.prose') || document.body; function findAllChanges() { if (!_redlineChanges) { _redlineChanges = Array.from(container.querySelectorAll( 'ins.diff-insert, ins.diff-move-target, del.diff-delete, del.diff-move-source, span.diff-update-container, span.diff-attrib-change, span.diff-rename-node, ins.diff-insert-text, del.diff-delete-text' )); _redlineChanges = _redlineChanges.filter(el => { let p = el.parentElement; while (p && p !== container && p !== document.body) { if (_redlineChanges.includes(p)) return false; p = p.parentElement; } return true; }); _redlineChanges.sort((a, b) => { const c = a.compareDocumentPosition(b); if (c & Node.DOCUMENT_POSITION_FOLLOWING) return 1; if (c & Node.DOCUMENT_POSITION_PRECEDING) return -1; return 0; }); updateCounter(); if (_redlineChanges && _redlineChanges.length > 0) { _changeIdx = -1; } else { console.log("Redline Nav: No changes found."); } } return _redlineChanges; } function highlightCurrentChange(scrollTo = true) { if (!_redlineChanges || _changeIdx < 0 || _changeIdx >= _redlineChanges.length) return; const el = _redlineChanges[_changeIdx]; if (!el) return; if (_currentHi && _currentHi !== el && _redlineChanges.includes(_currentHi)) { _currentHi.style.outline = ''; _currentHi.style.boxShadow = ''; _currentHi.style.outlineOffset = ''; _currentHi.classList.remove('current-redline-change'); } el.style.outline = '2px solid orange'; el.style.outlineOffset = '2px'; el.style.boxShadow = '0 0 8px 1px rgba(255, 165, 0, 0.6)'; el.classList.add('current-redline-change'); if (scrollTo) { const rect = el.getBoundingClientRect(); const isVisible = rect.top >= 0 && rect.left >= 0 && rect.bottom <= (window.innerHeight || document.documentElement.clientHeight) && rect.right <= (window.innerWidth || document.documentElement.clientWidth); if (!isVisible) { el.scrollIntoView({ behavior: 'smooth', block: 'center', inline: 'nearest' }); } } _currentHi = el; updateCounter(); updateMinimapHighlight(); } window.goPrevChange = () => { findAllChanges(); if (!_redlineChanges || _redlineChanges.length === 0) return; _changeIdx = (_changeIdx <= 0) ? _redlineChanges.length - 1 : _changeIdx - 1; highlightCurrentChange(); }; window.goNextChange = () => { findAllChanges(); if (!_redlineChanges || _redlineChanges.length === 0) return; _changeIdx = (_changeIdx >= _redlineChanges.length - 1) ? 0 : _changeIdx + 1; highlightCurrentChange(); }; function updateCounter() { if (changeCounter && _redlineChanges) { changeCounter.textContent = `${_redlineChanges.length > 0 ? 
_changeIdx + 1 : 0}/${_redlineChanges.length}`; } else if (changeCounter) { changeCounter.textContent = '0/0'; } } document.addEventListener("keydown", e => { if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA' || e.target.isContentEditable) return; if (e.key === "ArrowRight" && !e.altKey && !e.ctrlKey && !e.metaKey && !e.shiftKey) { goNextChange(); e.preventDefault(); } else if (e.key === "ArrowLeft" && !e.altKey && !e.ctrlKey && !e.metaKey && !e.shiftKey) { goPrevChange(); e.preventDefault(); } }); let minimapMarkers = []; function setupMinimap() { if (!minimap || !container) { return; } const changes = findAllChanges(); if (!changes || !changes.length) { minimap.style.display = 'none'; return; } minimap.innerHTML = ''; minimapMarkers = []; requestAnimationFrame(() => { const containerHeight = container.scrollHeight; if (containerHeight <= 0) { return; } changes.forEach((change, index) => { let type = ''; const cl = change.classList; const tagName = change.tagName.toUpperCase(); if (cl.contains('diff-insert') || (tagName === 'INS' && !cl.contains('diff-move-target'))) type = 'insert'; else if (cl.contains('diff-delete') || (tagName === 'DEL' && !cl.contains('diff-move-source'))) type = 'delete'; else if (cl.contains('diff-move-target') || cl.contains('diff-move-source')) type = 'move'; else if (cl.contains('diff-attrib-change')) type = 'attrib'; else if (cl.contains('diff-update-container')) type = 'text'; else if (cl.contains('diff-rename-node')) type = 'rename'; else if (cl.contains('diff-insert-text')) type = 'insert'; else if (cl.contains('diff-delete-text')) type = 'delete'; else return; const relativePos = change.offsetTop / containerHeight; const marker = document.createElement('div'); marker.className = 'minimap-marker absolute w-full h-[3px] cursor-pointer opacity-75 hover:opacity-100 transition-opacity duration-150'; marker.style.top = `${Math.max(0, Math.min(100, relativePos * 100))}%`; if (type === 'insert') marker.classList.add('bg-blue-500'); else if (type === 'delete') marker.classList.add('bg-rose-500'); else if (type === 'move') marker.classList.add('bg-emerald-500'); else marker.classList.add('bg-orange-500'); marker.title = `${type.charAt(0).toUpperCase() + type.slice(1)} change (${index + 1}/${changes.length})`; marker.dataset.changeIndex = index; marker.addEventListener('click', () => { _changeIdx = index; highlightCurrentChange(); }); minimap.appendChild(marker); minimapMarkers.push(marker); }); minimap.style.display = 'flex'; updateMinimapHighlight(); }); } function updateMinimapHighlight() { minimapMarkers.forEach((marker, index) => { if (index === _changeIdx) { marker.style.transform = 'scaleX(1.5)'; marker.style.opacity = '1'; marker.style.zIndex = '10'; marker.classList.add('bg-yellow-400'); marker.classList.remove('bg-blue-500', 'bg-rose-500', 'bg-emerald-500', 'bg-orange-500'); } else { marker.style.transform = ''; marker.style.opacity = '0.75'; marker.style.zIndex = '1'; marker.classList.remove('bg-yellow-400'); const oClass = marker.title.includes('Insert') ? 'bg-blue-500' : marker.title.includes('Delete') ? 'bg-rose-500' : marker.title.includes('Move') ? 
'bg-emerald-500' : 'bg-orange-500'; if (!marker.classList.contains(oClass)) { marker.classList.remove('bg-blue-500', 'bg-rose-500', 'bg-emerald-500', 'bg-orange-500'); marker.classList.add(oClass); } } }); } function debounce(func, wait) { let t; return function(...a) { const l = () => { clearTimeout(t); func(...a); }; clearTimeout(t); t = setTimeout(l, wait); }; } const debouncedSetupMinimap = debounce(setupMinimap, 250); window.addEventListener('resize', debouncedSetupMinimap); const themeToggle = document.getElementById('theme-toggle'); function applyTheme(isDark) { document.documentElement.classList.toggle('dark', isDark); localStorage.theme = isDark ? 'dark' : 'light'; setupMinimap(); } if (themeToggle) { const pDark = window.matchMedia('(prefers-color-scheme: dark)').matches; const cTheme = localStorage.theme === 'dark' || (!('theme' in localStorage) && pDark) ? 'dark' : 'light'; applyTheme(cTheme === 'dark'); themeToggle.addEventListener('click', () => { applyTheme(!document.documentElement.classList.contains('dark')); }); } function handleMoveHighlight(event) { const moveEl = event.target.closest("ins[data-move-id], del[data-move-id]"); if (!moveEl) return; const moveId = moveEl.dataset.moveId; if (!moveId) return; const isEnter = event.type === "mouseover"; document.querySelectorAll(`[data-move-id='${moveId}']`).forEach(el => { el.style.outline = isEnter ? "3px dashed #059669" : ""; el.style.outlineOffset = isEnter ? "2px" : ""; el.style.transition = 'outline 0.15s ease-in-out, outline-offset 0.15s ease-in-out'; }); } container.addEventListener("mouseover", handleMoveHighlight); container.addEventListener("mouseout", handleMoveHighlight); findAllChanges(); setupMinimap(); }); """
2428 |
2429 |
2430 | # ─────────────────────────────────────────────────────────────────────────────
2431 | # Plain‑text comparison (escaped)
2432 | # ─────────────────────────────────────────────────────────────────────────────
2433 | def _generate_text_redline(
2434 | original_text: str, modified_text: str, *, diff_level: str = "word",
2435 | ) -> Tuple[str, Dict[str, int]]:
2436 | """Return plain‑text diff with {- +} markers and [~ ~] for moves."""
2437 | if diff_level == "char":
2438 | orig_units, mod_units, joiner = list(original_text), list(modified_text), ""
2439 | elif diff_level == "word":
2440 | rx = r"(\w+[\S\w]*|\s+|[^\w\s])" # Keep whitespace as separate unit
2441 | orig_units, mod_units, joiner = re.findall(rx, original_text), re.findall(rx, modified_text), ""
2442 | else: # line level
2443 | orig_units, mod_units, joiner = original_text.splitlines(True), modified_text.splitlines(True), ""
2444 |
2445 | sm = difflib.SequenceMatcher(None, orig_units, mod_units, autojunk=False)
2446 | ops: List[Tuple[str, str]] = [] # Store ('tag', 'text') pairs
2447 | for tag, i1, i2, j1, j2 in sm.get_opcodes():
2448 | if tag == "equal":
2449 | ops.append(("eq", joiner.join(orig_units[i1:i2])))
2450 | elif tag == "delete":
2451 | ops.append(("del", joiner.join(orig_units[i1:i2])))
2452 | elif tag == "insert":
2453 | ops.append(("ins", joiner.join(mod_units[j1:j2])))
2454 | else: # replace
2455 | # Treat replace as delete followed by insert for move detection
2456 | ops.append(("del", joiner.join(orig_units[i1:i2])))
2457 | ops.append(("ins", joiner.join(mod_units[j1:j2])))
2458 |
2459 | # --- Attempt Move Detection based on Content ---
2460 | # Normalize whitespace and ignore case for matching identical blocks
2461 | def _normalize_for_match(s: str) -> str:
2462 | return re.sub(r'\s+', ' ', s.strip()).lower()
2463 |
2464 | dels: Dict[str, List[int]] = {} # Map normalized text -> list of deletion indices
2465 | ins: Dict[str, List[int]] = {} # Map normalized text -> list of insertion indices
2466 | paired: Dict[int, int] = {} # Map deletion index -> insertion index for identified moves
2467 |
2468 | for idx, (tag, txt) in enumerate(ops):
2469 | if tag == "del":
2470 | key = _normalize_for_match(txt)
2471 | if key: # Only track non-empty deletions
2472 | dels.setdefault(key, []).append(idx)
2473 | elif tag == "ins":
2474 | key = _normalize_for_match(txt)
2475 | if key: # Only track non-empty insertions
2476 | ins.setdefault(key, []).append(idx)
2477 |
2478 | # Find potential moves: identical normalized content deleted once and inserted once
2479 | for key in set(dels) & set(ins):
2480 | if len(dels[key]) == 1 and len(ins[key]) == 1:
2481 | deletion_idx = dels[key][0]
2482 | insertion_idx = ins[key][0]
2483 | # Ensure they are not adjacent (which would be a replace)
2484 | # This simple check might be too strict, but helps avoid marking simple replacements as moves
2485 | if abs(deletion_idx - insertion_idx) > 1:
2486 | paired[deletion_idx] = insertion_idx # Mark as a move pair
2487 |
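# Editorial sketch (not in the source): with ops like
#   [("eq", "A "), ("del", "B"), ("eq", " C "), ("ins", "B")]
# the normalized key "b" occurs in exactly one deletion (idx 1) and one
# insertion (idx 3); abs(1 - 3) > 1, so paired == {1: 3} and "B" is later
# emitted once as a [~move~] instead of a delete/insert pair.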
2488 | # --- Build Output String ---
2489 | buf: List[str] = []
2490 | ic = dc = mc = 0 # Insert, Delete, Move counts
2491 | for idx, (tag, txt) in enumerate(ops):
2492 | if idx in paired: # This is the deletion part of a move, skip it
2493 | continue
2494 | if idx in paired.values(): # This is the insertion part of a move
2495 | # Escape markers within the moved text
2496 | escaped_move = txt.replace("[~", "[ ~").replace("~]", "~ ]")
2497 | buf.append(f"[~{escaped_move}~]")
2498 | mc += 1
2499 | continue
2500 |
2501 | # Handle regular operations
2502 | if tag == "eq":
2503 | buf.append(txt)
2504 | elif tag == "del":
2505 | # Escape markers within the deleted text
2506 | escaped_del = txt.replace("[-", "[ -").replace("-]", "- ]")
2507 | buf.append(f"[-{escaped_del}-]")
2508 | dc += 1
2509 | elif tag == "ins":
2510 | # Escape markers within the inserted text
2511 | escaped_ins = txt.replace("{+", "{ +").replace("+}", "+ }")
2512 | buf.append(f"{{+{escaped_ins}+}}")
2513 | ic += 1
2514 |
2515 | # --- Calculate Stats ---
2516 | stats = {
2517 | "total_changes": ic + dc + mc, # Total distinct changes
2518 | "insertions": ic,
2519 | "deletions": dc,
2520 | "moves": mc,
2521 | "text_updates": 0, # Not explicitly tracked with this method
2522 | "attr_updates": 0,
2523 | "other_changes": 0,
2524 | "inline_insertions": ic + mc, # Count move insertions here?
2525 | "inline_deletions": dc + mc, # Count move deletions here?
2526 | }
2527 | return "".join(buf), stats
2528 |
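# Editorial usage sketch (illustrative, not part of the source). A word-level
# diff produces markers like:
#   _generate_text_redline("the quick fox", "the slow fox")
#   -> ("the [-quick-]{+slow+} fox",
#       {"total_changes": 2, "insertions": 1, "deletions": 1, "moves": 0, ...})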
2529 |
2530 | # ─────────────────────────────────────────────────────────────────────────────
2531 | # Public wrapper for text docs
2532 | # ─────────────────────────────────────────────────────────────────────────────
2533 | @with_tool_metrics
2534 | @with_error_handling
2535 | async def compare_documents_redline(
2536 | original_text: str,
2537 | modified_text: str,
2538 | *,
2539 | file_format: str = "auto",
2540 | detect_moves: bool = True,
2541 | ignore_whitespace: bool = True,
2542 | output_format: str = "html",
2543 | diff_level: str = "word",
2544 | include_css: bool = True,
2545 | generate_markdown: bool = False,
2546 | markdown_path: str = "detected_redline_differences.md",
2547 | run_tidy: bool = False,
2548 | ) -> Dict[str, Any]:
2549 | t0 = time.time()
2550 | logger.info(f"Starting doc comparison. Input: {file_format}, Output: {output_format}")
2551 | if not isinstance(original_text, str):
2552 | raise ToolInputError("original_text must be str")
2553 | if not isinstance(modified_text, str):
2554 | raise ToolInputError("modified_text must be str")
2555 | valid_formats = {"auto", "html", "text", "markdown", "latex"}
2556 | if file_format not in valid_formats:
2557 | raise ToolInputError(f"Invalid file_format: {file_format}")
2558 | if output_format not in {"html", "text"}:
2559 | raise ToolInputError(f"Invalid output_format: {output_format}")
2560 | if diff_level not in {"char", "word", "line"}:
2561 | raise ToolInputError(f"Invalid diff_level: {diff_level}")
2562 |
2563 | if original_text == modified_text:
2564 | logger.info("Documents are identical.")
2565 | stats = {k: 0 for k in RedlineXMLFormatter().processed_actions}
2566 | stats["total_changes"] = 0
2567 | if output_format == "html":
2568 | fmt = file_format if file_format != "auto" else _detect_file_format(original_text)
2569 | html = ""
2570 | try:
2571 | if fmt == "html":
2572 | html = original_text
2573 | elif fmt == "markdown":
2574 | md_ext = [
2575 | "fenced_code",
2576 | "tables",
2577 | "sane_lists",
2578 | "nl2br",
2579 | "footnotes",
2580 | "attr_list",
2581 | ]
2582 | html = markdown.markdown(original_text, extensions=md_ext)
2583 | elif _DOC_CONVERSION_AVAILABLE:
2584 | res = await convert_document(
2585 | document_data=original_text.encode("utf-8"),
2586 | input_format_hint=fmt,
2587 | output_format="markdown",
2588 | )
2589 | if res.get("success") and res.get("content"):
2590 | md_ext = [
2591 | "fenced_code",
2592 | "tables",
2593 | "sane_lists",
2594 | "nl2br",
2595 | "footnotes",
2596 | "attr_list",
2597 | ]
2598 | html = markdown.markdown(res["content"], extensions=md_ext)
2599 | else:
2600 | logger.warning(f"Conv failed: {res.get('error')}")
2601 | html = f"<pre>{html_stdlib.escape(original_text)}</pre>"
2602 | else:
2603 | html = f"<pre>{html_stdlib.escape(original_text)}</pre>"
2604 | final_html = await _postprocess_redline(
2605 | html, include_css=True, add_navigation=False, output_format="html"
2606 | )
2607 | except Exception as e:
2608 | logger.error(f"Error prep identical: {e}")
2609 | final_html = f"<!DOCTYPE html><html><body><pre>{html_stdlib.escape(original_text)}</pre></body></html>"
2610 | return {
2611 | "redline_html": final_html,
2612 | "stats": stats,
2613 | "processing_time": time.time() - t0,
2614 | "success": True,
2615 | }
2616 | else:
2617 | return {
2618 | "redline": original_text,
2619 | "stats": stats,
2620 | "processing_time": time.time() - t0,
2621 | "success": True,
2622 | }
2623 |
2624 | actual_format = file_format
2625 | if actual_format == "auto":
2626 | actual_format = _detect_file_format(original_text)
2627 | logger.info(f"Auto-detected format: {actual_format}")
2628 |
2629 | if output_format == "html":
2630 | logger.info(f"Generating HTML redline for '{actual_format}' input...")
2631 | orig_html = original_text
2632 | mod_html = modified_text
2633 | if actual_format != "html":
2634 | if not _DOC_CONVERSION_AVAILABLE:
2635 | raise ToolError(
2636 | f"Input '{actual_format}', but conversion tool unavailable.",
2637 | code="DEPENDENCY_MISSING",
2638 | )
2639 | logger.info(f"Converting '{actual_format}' input to Markdown then HTML...")
2640 | try:
2641 | params = {
2642 | "output_format": "markdown",
2643 | "extraction_strategy": "hybrid_direct_ocr",
2644 | "enhance_with_llm": False,
2645 | }
2646 | res_o = await convert_document(
2647 | document_data=original_text.encode("utf-8"),
2648 | input_format_hint=actual_format,
2649 | **params,
2650 | )
2651 | if not res_o.get("success"):
2652 | raise ToolError(
2653 | f"Orig conv failed: {res_o.get('error')}", code="CONVERSION_FAILED"
2654 | )
2655 | res_m = await convert_document(
2656 | document_data=modified_text.encode("utf-8"),
2657 | input_format_hint=actual_format,
2658 | **params,
2659 | )
2660 | if not res_m.get("success"):
2661 | raise ToolError(
2662 | f"Mod conv failed: {res_m.get('error')}", code="CONVERSION_FAILED"
2663 | )
2664 | md_ext = ["fenced_code", "tables", "sane_lists", "nl2br", "footnotes", "attr_list"]
2665 | orig_html = markdown.markdown(res_o["content"], extensions=md_ext)
2666 | mod_html = markdown.markdown(res_m["content"], extensions=md_ext)
2667 | except Exception as e:
2668 | logger.error(f"Doc conversion failed: {e}", exc_info=True)
2669 | raise ToolInputError("Failed doc conversion.") from e
2670 | html_result = await create_html_redline(
2671 | original_html=orig_html,
2672 | modified_html=mod_html,
2673 | detect_moves=detect_moves,
2674 | ignore_whitespace=ignore_whitespace,
2675 | output_format="html",
2676 | include_css=include_css,
2677 | add_navigation=True,
2678 | generate_markdown=generate_markdown,
2679 | markdown_path=markdown_path,
2680 | run_tidy=run_tidy,
2681 | )
2682 | html_result["processing_time"] = time.time() - t0
2683 | return html_result
2684 |
2685 | elif output_format == "text":
2686 | logger.info(f"Generating plain text redline (level: {diff_level})...")
2687 | o_plain, m_plain = original_text, modified_text
2688 | if actual_format == "html":
2689 | logger.warning("Generating text diff from HTML; tags included.")
2690 | elif actual_format == "markdown":
2691 | logger.warning("Generating text diff from Markdown; syntax included.")
2692 | txt, stats = _generate_text_redline(o_plain, m_plain, diff_level=diff_level)
2693 | return {
2694 | "redline": txt,
2695 | "stats": stats,
2696 | "processing_time": time.time() - t0,
2697 | "success": True,
2698 | }
2699 |
2700 | raise ToolInputError("Invalid output format.")
2701 |
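# Editorial usage sketch (hypothetical call, not part of the source):
#   result = await compare_documents_redline(
#       "Old draft text.", "New draft text.",
#       output_format="text", diff_level="word",
#   )
#   result["redline"]  # -> "[-Old-]{+New+} draft text."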
2702 |
2703 | # ─────────────────────────────────────────────────────────────────────────────
2704 | # Aux helpers
2705 | # ─────────────────────────────────────────────────────────────────────────────
2706 | def _detect_file_format(text: str) -> str:
2707 | if not text or not text.strip():
2708 | return "text"
2709 | t = text.lower().strip()
2710 | if t.startswith("<!doctype html") or t.startswith("<html"):
2711 | return "html"
2712 | hs = sum(f"<{tag}" in t for tag in ("body", "div", "p", "table", "h1", "br")) + sum(  # HTML tag score
2713 | f"</{tag}>" in t for tag in ("body", "div", "p", "table", "h1")
2714 | )
2715 | lrx = [  # LaTeX cues
2716 | r"\\documentclass",
2717 | r"\\begin\{document\}",
2718 | r"\\section\{",
2719 | r"\\usepackage\{",
2720 | r"\$.+\$",
2721 | r"\\begin\{",
2722 | ]
2723 | ls = sum(bool(re.search(p, text, re.M | re.I)) for p in lrx)  # LaTeX score
2724 | mrx = [  # Markdown cues
2725 | r"^[#]+\s+",
2726 | r"^>\s+",
2727 | r"^\s*[-*+]\s+",
2728 | r"^\s*[0-9]+\.\s+",
2729 | r"```|~~~",
2730 | r"\|.*\|.*\|",
2731 | r"\*{1,2}[^*\s]",
2732 | r"`[^`]+`",
2733 | r"\[.*?\]\(.*?\)",
2734 | ]
2735 | lines = text.splitlines()
2736 | ms = (  # Markdown score: line-anchored cues (first 30 lines) + whole-text cues
2737 | sum(bool(re.search(p, l)) for p in mrx[:5] for l in lines[:30]) # noqa: E741
2738 | + sum(
2739 | bool(re.search(p, text, re.M)) for p in mrx[5:]
2740 | )
2741 | )
2742 | if ls >= 2 and hs < 2:
2743 | return "latex"
2744 | if hs >= 4 or (hs >= 2 and "<body" in t):
2745 | return "html"
2746 | if ms >= 3 and hs <= 1 and ls <= 0:
2747 | return "markdown"
2748 | if hs >= 2 and ms >= 2:
2749 | return "html"
2750 | return "text"
2751 |
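# Editorial sketch of the heuristic above (assumed inputs, not in the source):
#   _detect_file_format("<html><body><p>Hi</p></body></html>")        # -> "html"
#   _detect_file_format("# Title\n\n- item one\n- item two\n`code`")  # -> "markdown"
#   _detect_file_format(r"\documentclass{article}\begin{document}x\end{document}")  # -> "latex"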
2752 |
2753 | # ─────────────────────────────────────────────────────────────────────────────
2754 | # Metadata
2755 | # ─────────────────────────────────────────────────────────────────────────────
2756 | __all__ = ["create_html_redline", "compare_documents_redline", "RedlineXMLFormatter"]
2757 | __version__ = "1.5.0"
2758 | __updated__ = _dt.datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")  # avoid "+00:00Z"
2759 |
```