This is page 35 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/entity_relation_graph.py:
--------------------------------------------------------------------------------

```python
   1 | # -*- coding: utf-8 -*-
   2 | """Entity relationship graph tools for Ultimate MCP Server."""
   3 | 
   4 | import json
   5 | import os
   6 | import re
   7 | import tempfile
   8 | import time
   9 | import uuid
  10 | from datetime import datetime, timezone
  11 | from enum import Enum
  12 | from typing import Any, Dict, List, Optional, Set, Tuple, Union
  13 | 
  14 | # --- Optional Imports ---
  15 | try:
  16 |     import networkx as nx
  17 |     HAS_NETWORKX = True
  18 | except ImportError:
  19 |     HAS_NETWORKX = False
  20 | 
  21 | try:
  22 |     from pyvis.network import Network
  23 |     HAS_PYVIS = True
  24 | except ImportError:
  25 |     HAS_PYVIS = False
  26 | 
  27 | try:
  28 |     import matplotlib.pyplot as plt
  29 |     HAS_MATPLOTLIB = True
  30 | except ImportError:
  31 |     HAS_MATPLOTLIB = False
  32 | 
  33 | HAS_VISUALIZATION_LIBS = HAS_NETWORKX and HAS_PYVIS and HAS_MATPLOTLIB
  34 | # --- End Optional Imports ---
  35 | 
  36 | from ultimate_mcp_server.constants import Provider  # noqa: E402
  37 | from ultimate_mcp_server.core.providers.base import BaseProvider, get_provider  # noqa: E402
  38 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError  # noqa: E402
  39 | from ultimate_mcp_server.tools.base import (  # noqa: E402
  40 |     with_cache,
  41 |     with_error_handling,
  42 |     with_retry,
  43 |     with_tool_metrics,
  44 | )
  45 | from ultimate_mcp_server.tools.completion import generate_completion  # noqa: E402
  46 | from ultimate_mcp_server.tools.document_conversion_and_processing import (  # noqa: E402
  47 |     chunk_document_standalone,
  48 |     )
  49 | from ultimate_mcp_server.utils import get_logger  # noqa: E402
  50 | 
  51 | logger = get_logger("ultimate_mcp_server.tools.entity_graph")
  52 | 
  53 | # --- Enums ---
  54 | class GraphStrategy(Enum):
  55 |     """Strategies for entity graph extraction."""
  56 |     STANDARD = "standard"        # Basic prompt-based extraction
  57 |     MULTISTAGE = "multistage"    # Process in stages: entities first, then relationships
  58 |     CHUNKED = "chunked"          # Process large texts in chunks and merge results
  59 |     INCREMENTAL = "incremental"  # Build graph incrementally from existing graph
  60 |     STRUCTURED = "structured"    # Use structured examples for consistent extraction
  61 |     STRICT_SCHEMA = "strict_schema"  # Use a predefined schema of entities and relationships
  62 | 
  63 | class OutputFormat(Enum):
  64 |     """Output formats for entity graphs."""
  65 |     JSON = "json"                # Standard JSON (default)
  66 |     NETWORKX = "networkx"        # NetworkX graph object
  67 |     RDF = "rdf"                  # Resource Description Framework (triples)
  68 |     CYTOSCAPE = "cytoscape"      # Cytoscape.js format
  69 |     D3 = "d3"                    # D3.js force graph format (nodes/links)
  70 |     NEO4J = "neo4j"              # Neo4j Cypher queries
  71 | 
  72 | class VisualizationFormat(Enum):
  73 |     """Visualization formats for entity graphs."""
  74 |     NONE = "none"                # No visualization
  75 |     HTML = "html"                # Interactive HTML (Pyvis)
  76 |     SVG = "svg"                  # Static SVG (NetworkX/Matplotlib)
  77 |     PNG = "png"                  # Static PNG (NetworkX/Matplotlib) - Requires Matplotlib
  78 |     DOT = "dot"                  # GraphViz DOT format
  79 | 
  80 | # --- Global Schemas & Examples ---
  81 | # (COMMON_SCHEMAS, SYSTEM_PROMPTS, FEW_SHOT_EXAMPLES remain the same as in the original prompt)
  82 | # --- Global schemas for common domains ---
  83 | COMMON_SCHEMAS = {
  84 |     "business": {
  85 |         "entities": [
  86 |             {"type": "Person", "attributes": ["name", "title", "role"]},
  87 |             {"type": "Organization", "attributes": ["name", "industry", "location"]},
  88 |             {"type": "Product", "attributes": ["name", "category", "price"]},
  89 |             {"type": "Location", "attributes": ["name", "address", "type"]},
  90 |             {"type": "Event", "attributes": ["name", "date", "location"]},
  91 |         ],
  92 |         "relationships": [
  93 |             {"type": "WORKS_FOR", "source_types": ["Person"], "target_types": ["Organization"]},
  94 |             {"type": "PRODUCES", "source_types": ["Organization"], "target_types": ["Product"]},
  95 |             {"type": "COMPETES_WITH", "source_types": ["Organization", "Product"], "target_types": ["Organization", "Product"]},
  96 |             {"type": "LOCATED_IN", "source_types": ["Organization", "Person"], "target_types": ["Location"]},
  97 |             {"type": "FOUNDED", "source_types": ["Person"], "target_types": ["Organization"]},
  98 |             {"type": "ACQUIRED", "source_types": ["Organization"], "target_types": ["Organization"]},
  99 |             {"type": "SUPPLIES", "source_types": ["Organization"], "target_types": ["Organization"]},
 100 |             {"type": "PARTNERS_WITH", "source_types": ["Organization"], "target_types": ["Organization"]},
 101 |             {"type": "INVESTS_IN", "source_types": ["Organization", "Person"], "target_types": ["Organization"]},
 102 |             {"type": "ATTENDS", "source_types": ["Person"], "target_types": ["Event"]},
 103 |             {"type": "HOSTS", "source_types": ["Organization"], "target_types": ["Event"]},
 104 |         ],
 105 |     },
 106 |     "academic": {
 107 |         "entities": [
 108 |             {"type": "Researcher", "attributes": ["name", "affiliation", "field"]},
 109 |             {"type": "Institution", "attributes": ["name", "type", "location"]},
 110 |             {"type": "Publication", "attributes": ["title", "date", "journal", "impact_factor"]},
 111 |             {"type": "Concept", "attributes": ["name", "field", "definition"]},
 112 |             {"type": "Dataset", "attributes": ["name", "size", "source"]},
 113 |             {"type": "Research_Project", "attributes": ["name", "duration", "funding"]},
 114 |         ],
 115 |         "relationships": [
 116 |             {"type": "AFFILIATED_WITH", "source_types": ["Researcher"], "target_types": ["Institution"]},
 117 |             {"type": "AUTHORED", "source_types": ["Researcher"], "target_types": ["Publication"]},
 118 |             {"type": "CITES", "source_types": ["Publication"], "target_types": ["Publication"]},
 119 |             {"type": "INTRODUCES", "source_types": ["Publication"], "target_types": ["Concept"]},
 120 |             {"type": "COLLABORATES_WITH", "source_types": ["Researcher"], "target_types": ["Researcher"]},
 121 |             {"type": "USES", "source_types": ["Publication", "Researcher"], "target_types": ["Dataset", "Concept"]},
 122 |             {"type": "BUILDS_ON", "source_types": ["Concept", "Publication"], "target_types": ["Concept"]},
 123 |             {"type": "FUNDS", "source_types": ["Institution"], "target_types": ["Research_Project"]},
 124 |             {"type": "WORKS_ON", "source_types": ["Researcher"], "target_types": ["Research_Project"]},
 125 |         ],
 126 |     },
 127 |     "medical": {
 128 |         "entities": [
 129 |             {"type": "Patient", "attributes": ["id", "age", "gender"]},
 130 |             {"type": "Physician", "attributes": ["name", "specialty", "affiliation"]},
 131 |             {"type": "Condition", "attributes": ["name", "icd_code", "severity"]},
 132 |             {"type": "Medication", "attributes": ["name", "dosage", "manufacturer"]},
 133 |             {"type": "Procedure", "attributes": ["name", "code", "duration"]},
 134 |             {"type": "Healthcare_Facility", "attributes": ["name", "type", "location"]},
 135 |         ],
 136 |         "relationships": [
 137 |             {"type": "DIAGNOSED_WITH", "source_types": ["Patient"], "target_types": ["Condition"]},
 138 |             {"type": "TREATED_BY", "source_types": ["Patient"], "target_types": ["Physician"]},
 139 |             {"type": "PRESCRIBED", "source_types": ["Physician"], "target_types": ["Medication"]},
 140 |             {"type": "TAKES", "source_types": ["Patient"], "target_types": ["Medication"]},
 141 |             {"type": "TREATS", "source_types": ["Medication", "Procedure"], "target_types": ["Condition"]},
 142 |             {"type": "PERFORMED", "source_types": ["Physician"], "target_types": ["Procedure"]},
 143 |             {"type": "UNDERWENT", "source_types": ["Patient"], "target_types": ["Procedure"]},
 144 |             {"type": "WORKS_AT", "source_types": ["Physician"], "target_types": ["Healthcare_Facility"]},
 145 |             {"type": "ADMITTED_TO", "source_types": ["Patient"], "target_types": ["Healthcare_Facility"]},
 146 |             {"type": "INTERACTS_WITH", "source_types": ["Medication"], "target_types": ["Medication"]},
 147 |             {"type": "CONTRAINDICATES", "source_types": ["Condition"], "target_types": ["Medication"]},
 148 |         ],
 149 |     },
 150 |     "legal": {
 151 |         "entities": [
 152 |             {"type": "Person", "attributes": ["name", "role", "jurisdiction"]},
 153 |             {"type": "Legal_Entity", "attributes": ["name", "type", "jurisdiction"]},
 154 |             {"type": "Document", "attributes": ["name", "type", "date", "status"]},
 155 |             {"type": "Obligation", "attributes": ["description", "deadline", "status"]},
 156 |             {"type": "Claim", "attributes": ["description", "value", "status"]},
 157 |             {"type": "Asset", "attributes": ["description", "value", "type"]},
 158 |             {"type": "Court", "attributes": ["name", "jurisdiction", "type"]},
 159 |             {"type": "Law", "attributes": ["name", "jurisdiction", "date"]},
 160 |         ],
 161 |         "relationships": [
 162 |             {"type": "PARTY_TO", "source_types": ["Person", "Legal_Entity"], "target_types": ["Document"]},
 163 |             {"type": "HAS_OBLIGATION", "source_types": ["Person", "Legal_Entity"], "target_types": ["Obligation"]},
 164 |             {"type": "OWNS", "source_types": ["Person", "Legal_Entity"], "target_types": ["Asset"]},
 165 |             {"type": "CLAIMS", "source_types": ["Person", "Legal_Entity"], "target_types": ["Claim"]},
 166 |             {"type": "REPRESENTED_BY", "source_types": ["Person", "Legal_Entity"], "target_types": ["Person"]},
 167 |             {"type": "REFERENCED_IN", "source_types": ["Law", "Document"], "target_types": ["Document"]},
 168 |             {"type": "ADJUDICATED_BY", "source_types": ["Claim", "Document"], "target_types": ["Court"]},
 169 |             {"type": "REGULATES", "source_types": ["Law"], "target_types": ["Legal_Entity", "Obligation"]},
 170 |             {"type": "TRANSFERS", "source_types": ["Document"], "target_types": ["Asset"]},
 171 |             {"type": "AUTHORIZES", "source_types": ["Document"], "target_types": ["Person", "Legal_Entity"]},
 172 |         ],
 173 |     },
 174 | }
 175 | 
 176 | # --- Entity relationship detection prompts ---
 177 | SYSTEM_PROMPTS = {
 178 |     "entity_detection": """You are an expert entity extraction system. Your task is to identify and extract named entities from the input text with high precision. Follow these guidelines:
 179 | 
 180 | 1. Focus on identifying complete entity mentions.
 181 | 2. Classify entities into the specified types accurately.
 182 | 3. Generate unique IDs for each distinct entity.
 183 | 4. Include attributes and position information as requested.
 184 | 5. Only extract entities actually mentioned in the text.
 185 | 6. Do not hallucinate entities that aren't clearly present.
 186 | 
 187 | Output should be in valid JSON format containing a list of entities, adhering strictly to the requested structure.
 188 | """,
 189 | 
 190 |     "relationship_detection": """You are an expert relationship extraction system. Your task is to identify meaningful connections between the provided entities based on the input text. Follow these guidelines:
 191 | 
 192 | 1. Only identify relationships between the provided entities that are explicitly stated or strongly implied in the text.
 193 | 2. Capture the semantic relationship type accurately based on the specified types.
 194 | 3. Identify the direction of the relationship (source -> target) using the provided entity IDs.
 195 | 4. Include supporting evidence from the text and temporal information if requested.
 196 | 5. Assign a confidence score (0.0-1.0) based on how explicitly the relationship is stated.
 197 | 6. Do not invent relationships not supported by the text.
 198 | 
 199 | Output should be in valid JSON format containing a list of relationships, adhering strictly to the requested structure and referencing the provided entity IDs.
 200 | """,
 201 | 
 202 |     "standard_extraction": """You are an expert entity and relationship extraction system. Your task is to identify named entities and the relationships between them from the input text. Follow these guidelines:
 203 | 
 204 | 1. Identify named entities and classify them into the specified types.
 205 | 2. Generate unique IDs for each distinct entity.
 206 | 3. Identify relationships between the extracted entities based on the specified types.
 207 | 4. Ensure relationship source and target IDs correspond to the extracted entities.
 208 | 5. Include attributes, position information, evidence, and temporal context as requested.
 209 | 6. Assign a confidence score (0.0-1.0) for relationships.
 210 | 7. Only extract entities and relationships explicitly mentioned or strongly implied.
 211 | 8. Do not hallucinate information.
 212 | 
 213 | Output should be in valid JSON format containing lists of entities and relationships, adhering strictly to the requested structure.
 214 | """,
 215 | 
 216 |     "multilingual": """You are an expert entity and relationship extraction system with multilingual capabilities. Extract entities and relationships from text in the specified language (or identify the language if not specified). Apply language-specific extraction patterns to identify:
 217 | 
 218 | 1. Named entities (people, organizations, locations, etc.) matching the specified types.
 219 | 2. The relationships between these entities matching the specified types.
 220 | 3. Evidence for each relationship from the text.
 221 | 4. Generate unique IDs and include requested details (attributes, positions, temporal info).
 222 | 
 223 | Be attentive to language-specific naming patterns, grammatical structures, and relationship indicators. Output valid JSON strictly following the requested format.
 224 | """,
 225 | 
 226 |     "temporal": """You are a specialized entity and relationship extraction system focusing on temporal information. Extract entities and relationships, paying close attention to time references. Your goal is to capture:
 227 | 
 228 | 1. Entities with their temporal attributes (founding dates, birth dates, etc.).
 229 | 2. Relationships between entities with temporal context (start/end dates, duration).
 230 | 3. Changes in relationships over time.
 231 | 4. Sequence of events involving entities.
 232 | 
 233 | For each entity and relationship, capture explicit or implicit time information as precisely as possible in the 'temporal' field. Output valid JSON strictly following the requested format.
 234 | """,
 235 | }
 236 | 
 237 | # --- Few-Shot Examples ---
 238 | FEW_SHOT_EXAMPLES = {
 239 |     "business": {
 240 |         "text": """Apple Inc., founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976, announced its latest iPhone model yesterday at its headquarters in Cupertino, California. CEO Tim Cook showcased the device, which competes with Samsung's Galaxy series.
 241 | 
 242 | The company has partnered with TSMC to manufacture the A15 Bionic chip that powers the new iPhone. Meanwhile, Google, led by Sundar Pichai, continues to dominate the search engine market with products that compete with Apple's offerings.""",
 243 |         "entities": [
 244 |             {"id": "ent1", "name": "Apple Inc.", "type": "Organization", "mentions": [{"text": "Apple Inc.", "pos": [0, 10]}, {"text": "Apple", "pos": [226, 231]}], "attributes": {"location": "Cupertino, California"}},
 245 |             {"id": "ent2", "name": "Steve Jobs", "type": "Person", "mentions": [{"text": "Steve Jobs", "pos": [22, 32]}]},
 246 |             {"id": "ent3", "name": "Steve Wozniak", "type": "Person", "mentions": [{"text": "Steve Wozniak", "pos": [34, 47]}]},
 247 |             {"id": "ent4", "name": "Ronald Wayne", "type": "Person", "mentions": [{"text": "Ronald Wayne", "pos": [53, 65]}]},
 248 |             {"id": "ent5", "name": "iPhone", "type": "Product", "mentions": [{"text": "iPhone", "pos": [95, 101]}, {"text": "iPhone", "pos": [324, 330]}]},
 249 |             {"id": "ent6", "name": "Cupertino, California", "type": "Location", "mentions": [{"text": "Cupertino, California", "pos": [129, 149]}]},
 250 |             {"id": "ent7", "name": "Tim Cook", "type": "Person", "mentions": [{"text": "Tim Cook", "pos": [156, 164]}], "attributes": {"role": "CEO"}},
 251 |             {"id": "ent8", "name": "Samsung", "type": "Organization", "mentions": [{"text": "Samsung", "pos": [201, 208]}]},
 252 |             {"id": "ent9", "name": "Galaxy series", "type": "Product", "mentions": [{"text": "Galaxy series", "pos": [210, 223]}]},
 253 |             {"id": "ent10", "name": "TSMC", "type": "Organization", "mentions": [{"text": "TSMC", "pos": [261, 265]}]},
 254 |             {"id": "ent11", "name": "A15 Bionic chip", "type": "Product", "mentions": [{"text": "A15 Bionic chip", "pos": [281, 295]}]},
 255 |             {"id": "ent12", "name": "Google", "type": "Organization", "mentions": [{"text": "Google", "pos": [348, 354]}]},
 256 |             {"id": "ent13", "name": "Sundar Pichai", "type": "Person", "mentions": [{"text": "Sundar Pichai", "pos": [365, 378]}]}
 257 |         ],
 258 |         "relationships": [
 259 |             {"id": "rel1", "source": "ent2", "target": "ent1", "type": "FOUNDED", "confidence": 0.95, "evidence": "Apple Inc., founded by Steve Jobs... in 1976", "temporal": {"year": 1976}},
 260 |             {"id": "rel2", "source": "ent3", "target": "ent1", "type": "FOUNDED", "confidence": 0.95, "evidence": "Apple Inc., founded by... Steve Wozniak... in 1976", "temporal": {"year": 1976}},
 261 |             {"id": "rel3", "source": "ent4", "target": "ent1", "type": "FOUNDED", "confidence": 0.95, "evidence": "Apple Inc., founded by... Ronald Wayne in 1976", "temporal": {"year": 1976}},
 262 |             {"id": "rel4", "source": "ent7", "target": "ent1", "type": "WORKS_FOR", "confidence": 0.9, "evidence": "CEO Tim Cook"},
 263 |             {"id": "rel5", "source": "ent1", "target": "ent5", "type": "PRODUCES", "confidence": 0.9, "evidence": "Apple Inc.... announced its latest iPhone model"},
 264 |             {"id": "rel6", "source": "ent1", "target": "ent6", "type": "LOCATED_IN", "confidence": 0.8, "evidence": "its headquarters in Cupertino, California"},
 265 |             {"id": "rel7", "source": "ent5", "target": "ent9", "type": "COMPETES_WITH", "confidence": 0.8, "evidence": "which competes with Samsung's Galaxy series"},
 266 |             {"id": "rel8", "source": "ent1", "target": "ent10", "type": "PARTNERS_WITH", "confidence": 0.9, "evidence": "The company has partnered with TSMC"},
 267 |             {"id": "rel9", "source": "ent10", "target": "ent11", "type": "PRODUCES", "confidence": 0.9, "evidence": "TSMC to manufacture the A15 Bionic chip"}, # Changed from MANUFACTURES to align with schema
 268 |             {"id": "rel10", "source": "ent11", "target": "ent5", "type": "COMPONENT_OF", "confidence": 0.9, "evidence": "A15 Bionic chip that powers the new iPhone"},
 269 |             {"id": "rel11", "source": "ent13", "target": "ent12", "type": "WORKS_FOR", "confidence": 0.85, "evidence": "Google, led by Sundar Pichai"}, # Changed from LEADS to align with schema
 270 |             {"id": "rel12", "source": "ent12", "target": "ent1", "type": "COMPETES_WITH", "confidence": 0.7, "evidence": "with products that compete with Apple's offerings"}
 271 |         ]
 272 |     },
 273 |     "academic": {
 274 |         "text": """Dr. Jennifer Chen from Stanford University published a groundbreaking paper in Nature on quantum computing applications in drug discovery. Her research, funded by the National Science Foundation, built upon earlier work by Dr. Richard Feynman.
 275 | 
 276 | Chen collaborated with Dr. Michael Layton at MIT, who provided the dataset used in their experiments. Their publication has been cited by researchers at IBM's Quantum Computing division led by Dr. Sarah Johnson.""",
 277 |         "entities": [
 278 |             {"id": "ent1", "name": "Jennifer Chen", "type": "Researcher", "attributes": {"affiliation": "Stanford University"}, "mentions": [{"text": "Dr. Jennifer Chen", "pos": [0, 16]}]},
 279 |             {"id": "ent2", "name": "Stanford University", "type": "Institution", "mentions": [{"text": "Stanford University", "pos": [22, 41]}]},
 280 |             {"id": "ent3", "name": "Nature", "type": "Publication", "attributes": {"type": "Journal"}, "mentions": [{"text": "Nature", "pos": [78, 84]}]}, # Added type attribute
 281 |             {"id": "ent4", "name": "Groundbreaking paper on quantum computing", "type": "Publication", "attributes": {"title": "Groundbreaking paper on quantum computing applications in drug discovery", "date": None}, "mentions": [{"text": "paper", "pos": [71, 76]}]}, # Modified name/title
 282 |             {"id": "ent5", "name": "National Science Foundation", "type": "Institution", "attributes": {"type": "Funding Organization"}, "mentions": [{"text": "National Science Foundation", "pos": [124, 152]}]}, # Added type attribute
 283 |             {"id": "ent6", "name": "Richard Feynman", "type": "Researcher", "mentions": [{"text": "Dr. Richard Feynman", "pos": [178, 196]}]},
 284 |             {"id": "ent7", "name": "Michael Layton", "type": "Researcher", "attributes": {"affiliation": "MIT"}, "mentions": [{"text": "Dr. Michael Layton", "pos": [223, 241]}]},
 285 |             {"id": "ent8", "name": "MIT", "type": "Institution", "mentions": [{"text": "MIT", "pos": [245, 248]}]},
 286 |             {"id": "ent9", "name": "Drug discovery dataset", "type": "Dataset", "mentions": [{"text": "dataset", "pos": [264, 271]}]},
 287 |             {"id": "ent10", "name": "IBM", "type": "Institution", "attributes": {"type": "Company"}, "mentions": [{"text": "IBM", "pos": [334, 337]}]}, # Added type attribute
 288 |             {"id": "ent11", "name": "IBM Quantum Computing division", "type": "Institution", "attributes": {"type": "Research Division", "parent_org": "IBM"}, "mentions": [{"text": "IBM's Quantum Computing division", "pos": [334, 365]}]}, # Added type/parent attributes
 289 |             {"id": "ent12", "name": "Sarah Johnson", "type": "Researcher", "mentions": [{"text": "Dr. Sarah Johnson", "pos": [375, 392]}]}
 290 |         ],
 291 |         "relationships": [
 292 |             {"id": "rel1", "source": "ent1", "target": "ent2", "type": "AFFILIATED_WITH", "confidence": 0.95, "evidence": "Dr. Jennifer Chen from Stanford University"},
 293 |             {"id": "rel2", "source": "ent1", "target": "ent4", "type": "AUTHORED", "confidence": 0.95, "evidence": "Dr. Jennifer Chen... published a groundbreaking paper"},
 294 |             {"id": "rel3", "source": "ent4", "target": "ent3", "type": "PUBLISHED_IN", "confidence": 0.9, "evidence": "published a groundbreaking paper in Nature"},
 295 |             {"id": "rel4", "source": "ent5", "target": "ent4", "type": "FUNDS", "confidence": 0.85, "evidence": "Her research, funded by the National Science Foundation"}, # Changed target to Publication ent4
 296 |             {"id": "rel5", "source": "ent4", "target": "ent6", "type": "BUILDS_ON", "confidence": 0.8, "evidence": "built upon earlier work by Dr. Richard Feynman"},
 297 |             {"id": "rel6", "source": "ent1", "target": "ent7", "type": "COLLABORATES_WITH", "confidence": 0.9, "evidence": "Chen collaborated with Dr. Michael Layton at MIT"},
 298 |             {"id": "rel7", "source": "ent7", "target": "ent8", "type": "AFFILIATED_WITH", "confidence": 0.9, "evidence": "Dr. Michael Layton at MIT"},
 299 |             {"id": "rel8", "source": "ent7", "target": "ent9", "type": "PROVIDED", "confidence": 0.85, "evidence": "who provided the dataset used in their experiments"},
 300 |             {"id": "rel9", "source": "ent4", "target": "ent9", "type": "USES", "confidence": 0.8, "evidence": "dataset used in their experiments"},
 301 |             {"id": "rel10", "source": "ent11", "target": "ent4", "type": "CITES", "confidence": 0.85, "evidence": "Their publication has been cited by researchers at IBM's Quantum Computing division"},
 302 |             {"id": "rel11", "source": "ent12", "target": "ent11", "type": "WORKS_ON", "confidence": 0.9, "evidence": "IBM's Quantum Computing division led by Dr. Sarah Johnson"}, # Changed from LEADS to align with schema
 303 |             {"id": "rel12", "source": "ent11", "target": "ent10", "type": "PART_OF", "confidence": 0.9, "evidence": "IBM's Quantum Computing division"}, # Added relationship
 304 |         ]
 305 |     }
 306 | }
 307 | 
 308 | # --- Main Tool Function ---
 309 | @with_cache(ttl=24 * 60 * 60)  # Cache results for 24 hours
 310 | @with_tool_metrics
 311 | @with_retry(max_retries=2, retry_delay=1.5)
 312 | @with_error_handling
 313 | async def extract_entity_graph(
 314 |     text: str,
 315 |     entity_types: Optional[List[str]] = None,
 316 |     relation_types: Optional[List[str]] = None,
 317 |     provider: str = Provider.OPENAI.value,
 318 |     model: Optional[str] = None,
 319 |     include_evidence: bool = True,
 320 |     include_attributes: bool = True,
 321 |     include_positions: bool = False, # Default False as it can increase token usage significantly
 322 |     include_temporal_info: bool = True,
 323 |     max_entities: int = 100,
 324 |     max_relations: int = 200,
 325 |     min_confidence: float = 0.6,
 326 |     domain: Optional[str] = None,  # e.g., "business", "academic", "medical", "legal"
 327 |     output_format: Union[str, OutputFormat] = OutputFormat.JSON,
 328 |     visualization_format: Union[str, VisualizationFormat] = VisualizationFormat.NONE, # Default NONE
 329 |     strategy: Union[str, GraphStrategy] = GraphStrategy.STANDARD,
 330 |     example_entities: Optional[List[Dict[str, Any]]] = None,
 331 |     example_relationships: Optional[List[Dict[str, Any]]] = None,
 332 |     custom_entity_schema: Optional[List[Dict[str, Any]]] = None, # Schema is a list of dicts now
 333 |     custom_relationship_schema: Optional[List[Dict[str, Any]]] = None, # Schema is a list of dicts now
 334 |     existing_graph: Optional[Dict[str, Any]] = None,
 335 |     context_window: Optional[int] = None,
 336 |     language: Optional[str] = None,
 337 |     automatic_coreference: bool = True, # Applied in Multistage/Chunked merging
 338 |     chunk_size: Optional[int] = None, # In tokens
 339 |     custom_prompt: Optional[str] = None,
 340 |     system_prompt: Optional[str] = None,
 341 |     normalize_entities: bool = True, # Default True
 342 |     sort_by: str = "confidence",  # Options: "confidence", "centrality", "mentions"
 343 |     max_tokens_per_request: Optional[int] = 4000, # Default max tokens for LLM response
 344 |     enable_reasoning: bool = False, # Add reasoning steps in prompt
 345 |     additional_params: Optional[Dict[str, Any]] = None # e.g., temperature
 346 | ) -> Dict[str, Any]:
 347 |     """
 348 |     Extracts entities and their relationships from text, building a knowledge graph.
 349 | 
 350 |     Analyzes unstructured text to identify entities and semantic relationships, creating a
 351 |     structured knowledge graph. Supports multiple strategies, formats, schemas, and visualization.
 352 | 
 353 |     Args:
 354 |         text: The input text to analyze.
 355 |         entity_types: Optional list of entity types to focus on (e.g., ["Person", "Organization"]).
 356 |                       If None, extracts all inferred types or types from the schema.
 357 |         relation_types: Optional list of relationship types to extract (e.g., ["WORKS_FOR", "LOCATED_IN"]).
 358 |                        If None, extracts all inferred types or types from the schema.
 359 |         provider: The LLM provider (e.g., "openai", "anthropic", "gemini"). Defaults to "openai".
 360 |         model: The specific model ID. If None, the provider's default model is used.
 361 |         include_evidence: Whether to include text snippets supporting each relationship. Default True.
 362 |         include_attributes: Whether to extract and include entity attributes. Default True.
 363 |         include_positions: Whether to include position information for entity mentions. Default False.
 364 |         include_temporal_info: Whether to extract temporal context for relationships. Default True.
 365 |         max_entities: Maximum number of entities to return after merging/filtering. Default 100.
 366 |         max_relations: Maximum number of relations to return after merging/filtering. Default 200.
 367 |         min_confidence: Minimum confidence score (0.0-1.0) for relationships. Default 0.6.
 368 |         domain: Optional domain for predefined schemas ("business", "academic", "medical", "legal").
 369 |         output_format: Desired output format ("json", "networkx", etc.). Default "json".
 370 |         visualization_format: Format for visualization ("none", "html", "svg", etc.). Default "none".
 371 |         strategy: Extraction strategy ("standard", "multistage", "chunked", etc.). Default "standard".
 372 |         example_entities: Optional list of example entities to guide extraction format (few-shot).
 373 |         example_relationships: Optional list of example relationships to guide extraction format (few-shot).
 374 |         custom_entity_schema: Optional custom schema for entity types and attributes (list of dicts).
 375 |         custom_relationship_schema: Optional custom schema for relationship types (list of dicts).
 376 |         existing_graph: Optional existing graph data {'entities': [], 'relationships': []} for incremental strategy.
 377 |         context_window: Optional max context window size (tokens) for chunking. Estimated if None.
 378 |         language: Optional language hint for multilingual extraction (e.g., "Spanish").
 379 |         automatic_coreference: Whether to attempt coreference resolution during merging (Multistage/Chunked). Default True.
 380 |         chunk_size: Optional custom chunk size (tokens) for chunked strategy. Overrides estimation.
 381 |         custom_prompt: Optional custom prompt template. Use placeholders {text}, {instructions}, {schema_info}, {examples}.
 382 |         system_prompt: Optional custom system prompt override.
 383 |         normalize_entities: Whether to normalize entity names and merge duplicates. Default True.
 384 |         sort_by: How to sort final entities/relationships ("confidence", "centrality", "mentions"). Default "confidence".
 385 |         max_tokens_per_request: Max tokens for LLM response generation. Default 4000.
 386 |         enable_reasoning: Include reasoning steps in the prompt for the LLM. Default False.
 387 |         additional_params: Additional provider-specific parameters (e.g., temperature, top_p).
 388 | 
 389 |     Returns:
 390 |         A dictionary containing the entity graph data and metadata, including:
 391 |         - 'entities': List of extracted entity dictionaries.
 392 |         - 'relationships': List of extracted relationship dictionaries.
 393 |         - 'metadata': Information about the extraction process, counts, types, strategy, etc.
 394 |         - 'visualization': Visualization data (e.g., HTML content, file URL) if requested.
 395 |         - 'query_interface': Helper functions for graph querying (if networkx available).
 396 |         - 'provider', 'model', 'tokens', 'cost', 'processing_time', 'success'.
 397 |         - Depending on output_format, may include 'graph' (NetworkX object), 'rdf_triples', etc.
 398 | 
 399 |     Raises:
 400 |         ToolInputError: If input text is empty or parameters are invalid.
 401 |         ProviderError: If the LLM provider fails during extraction.
 402 |         ToolError: For parsing errors, chunking failures, or other processing issues.
 403 |     """
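     |     # Hypothetical usage sketch (illustrative; `document_text` below is an
     |     # assumed variable holding the text to analyze, all other names are
     |     # defined in this module):
     |     #
     |     #   result = await extract_entity_graph(
     |     #       text=document_text,
     |     #       provider=Provider.OPENAI.value,
     |     #       domain="business",
     |     #       strategy=GraphStrategy.MULTISTAGE,
     |     #       output_format=OutputFormat.JSON,
     |     #       min_confidence=0.7,
     |     #   )
     |     #   print(result["metadata"]["entity_count"], result["cost"])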
 404 |     start_time = time.time()
 405 |     logger.info(f"Starting entity graph extraction with strategy: {strategy}")
 406 | 
 407 |     # --- Input Validation ---
 408 |     if not text or not isinstance(text, str):
 409 |         raise ToolInputError("Input 'text' must be a non-empty string.")
 410 |     if entity_types is not None and not isinstance(entity_types, list):
 411 |         raise ToolInputError("'entity_types' must be a list of strings or None.")
 412 |     if relation_types is not None and not isinstance(relation_types, list):
 413 |         raise ToolInputError("'relation_types' must be a list of strings or None.")
 414 |     if not (0.0 <= min_confidence <= 1.0):
 415 |         raise ToolInputError("'min_confidence' must be between 0.0 and 1.0.")
 416 |     if max_entities <= 0 or max_relations <= 0:
 417 |         raise ToolInputError("'max_entities' and 'max_relations' must be positive integers.")
 418 |     if custom_entity_schema is not None and not isinstance(custom_entity_schema, list):
 419 |         raise ToolInputError("'custom_entity_schema' must be a list of dictionaries or None.")
 420 |     if custom_relationship_schema is not None and not isinstance(custom_relationship_schema, list):
 421 |          raise ToolInputError("'custom_relationship_schema' must be a list of dictionaries or None.")
 422 |     if existing_graph is not None and not (isinstance(existing_graph, dict) and "entities" in existing_graph and "relationships" in existing_graph):
 423 |         raise ToolInputError("'existing_graph' must be a dictionary with 'entities' and 'relationships' lists.")
 424 | 
 425 |     # Validate and convert enums
 426 |     try:
 427 |         output_format = OutputFormat(str(output_format).lower()) if isinstance(output_format, str) else output_format
 428 |         visualization_format = VisualizationFormat(str(visualization_format).lower()) if isinstance(visualization_format, str) else visualization_format
 429 |         strategy = GraphStrategy(str(strategy).lower()) if isinstance(strategy, str) else strategy
 430 |         assert isinstance(output_format, OutputFormat)
 431 |         assert isinstance(visualization_format, VisualizationFormat)
 432 |         assert isinstance(strategy, GraphStrategy)
 433 |     except (ValueError, AssertionError) as e:
 434 |         raise ToolInputError(f"Invalid enum value provided: {e}") from e
 435 | 
 436 |     # Validate domain
 437 |     if domain and domain not in COMMON_SCHEMAS:
 438 |         valid_domains = list(COMMON_SCHEMAS.keys())
 439 |         raise ToolInputError(f"Invalid domain: '{domain}'. Valid options: {valid_domains}")
 440 | 
 441 |     # Check dependencies for strategy/formats
 442 |     if strategy == GraphStrategy.INCREMENTAL and existing_graph is None:
 443 |         raise ToolInputError("The 'incremental' strategy requires the 'existing_graph' parameter.")
 444 |     if strategy == GraphStrategy.STRICT_SCHEMA and not (domain or (custom_entity_schema and custom_relationship_schema)):
 445 |         raise ToolInputError("The 'strict_schema' strategy requires a 'domain' or custom schemas.")
 446 |     if visualization_format != VisualizationFormat.NONE and not HAS_VISUALIZATION_LIBS:
 447 |         logger.warning(
 448 |             f"Visualization format '{visualization_format.value}' requested, but required libraries "
 449 |             f"(networkx, pyvis, matplotlib) are not installed. Falling back to 'none'."
 450 |         )
 451 |         visualization_format = VisualizationFormat.NONE
 452 |     if output_format == OutputFormat.NETWORKX and not HAS_NETWORKX:
 453 |          logger.warning("Output format 'networkx' requested, but networkx library is not installed. Falling back to 'json'.")
 454 |          output_format = OutputFormat.JSON
 455 | 
 456 |     # --- Initialize Configuration ---
 457 |     try:
 458 |         provider_instance: BaseProvider = await get_provider(provider)
 459 |     except Exception as e:
 460 |         raise ProviderError(f"Failed to initialize provider '{provider}': {e}", provider=provider, cause=e) from e
 461 | 
 462 |     additional_params = additional_params or {}
 463 |     model_name = model or provider_instance.default_model
 464 | 
 465 |     # Estimate context window if needed for chunking
 466 |     if strategy == GraphStrategy.CHUNKED and not context_window:
 467 |         # Basic estimation logic (replace with more sophisticated provider-specific checks if possible)
 468 |         model_context_estimates = {
 469 |             "gpt-4": 8000, "gpt-4-32k": 32000, "gpt-3.5-turbo": 4000, "gpt-3.5-turbo-16k": 16000,
 470 |             "gpt-4-turbo": 128000, "gpt-4o": 128000,
 471 |             "claude-2": 100000, "claude-3-opus": 200000, "claude-3-sonnet": 200000, "claude-3-haiku": 200000,
 472 |             "claude-3-5-sonnet": 200000,
 473 |             "gemini-pro": 32000, "gemini-1.5-pro": 1000000, "gemini-1.5-flash": 1000000,
 474 |         }
 475 |         context_window = 16000 # Default fallback
 476 |         if model_name:
 477 |             for key, window in model_context_estimates.items():
 478 |                 if key in model_name.lower():
 479 |                     context_window = window
 480 |                     break
 481 |         logger.info(f"Estimated context window for model '{model_name}': {context_window} tokens.")
 482 | 
 483 |     # Determine effective schema
 484 |     schema = None
 485 |     if domain and domain in COMMON_SCHEMAS:
 486 |         schema = COMMON_SCHEMAS[domain]
 487 |         logger.info(f"Using predefined schema for domain: {domain}")
 488 |     elif custom_entity_schema and custom_relationship_schema:
 489 |         schema = {
 490 |             "entities": custom_entity_schema,
 491 |             "relationships": custom_relationship_schema
 492 |         }
 493 |         logger.info("Using custom entity and relationship schemas.")
 494 |     elif custom_entity_schema:
 495 |         schema = {"entities": custom_entity_schema, "relationships": []} # Allow only entity schema
 496 |         logger.info("Using custom entity schema only.")
 497 |     elif custom_relationship_schema:
 498 |          schema = {"entities": [], "relationships": custom_relationship_schema} # Allow only rel schema
 499 |          logger.info("Using custom relationship schema only.")
 500 | 
 501 |     # --- Prepare Common Arguments for Strategy Functions ---
 502 |     common_args = {
 503 |         "text": text,
 504 |         "provider_instance": provider_instance,
 505 |         "model": model_name,
 506 |         "entity_types": entity_types,
 507 |         "relation_types": relation_types,
 508 |         "include_evidence": include_evidence,
 509 |         "include_attributes": include_attributes,
 510 |         "include_positions": include_positions,
 511 |         "include_temporal_info": include_temporal_info,
 512 |         "min_confidence": min_confidence,
 513 |         "max_entities": max_entities,
 514 |         "max_relations": max_relations,
 515 |         "schema": schema,
 516 |         "custom_prompt": custom_prompt,
 517 |         "system_prompt": system_prompt,
 518 |         "language": language,
 519 |         "example_entities": example_entities,
 520 |         "example_relationships": example_relationships,
 521 |         "enable_reasoning": enable_reasoning,
 522 |         "max_tokens_per_request": max_tokens_per_request,
 523 |         "additional_params": additional_params
 524 |     }
 525 | 
 526 |     # --- Execute Strategy ---
 527 |     extraction_result: Dict[str, Any] = {}
 528 |     used_model = model_name # Default model
 529 |     total_tokens = {"input": 0, "output": 0, "total": 0}
 530 |     total_cost = 0.0
 531 | 
 532 |     try:
 533 |         if strategy == GraphStrategy.STANDARD:
 534 |             extraction_result = await _perform_standard_extraction(**common_args)
 535 |         elif strategy == GraphStrategy.MULTISTAGE:
 536 |             extraction_result = await _perform_multistage_extraction(
 537 |                 automatic_coreference=automatic_coreference, **common_args
 538 |             )
 539 |         elif strategy == GraphStrategy.CHUNKED:
 540 |             extraction_result = await _perform_chunked_extraction(
 541 |                 context_window=context_window, # type: ignore # context_window guaranteed if strategy is CHUNKED
 542 |                 chunk_size=chunk_size,
 543 |                 automatic_coreference=automatic_coreference,
 544 |                 **common_args
 545 |             )
 546 |         elif strategy == GraphStrategy.INCREMENTAL:
 547 |             extraction_result = await _perform_incremental_extraction(
 548 |                 existing_graph=existing_graph, # type: ignore # existing_graph guaranteed if strategy is INCREMENTAL
 549 |                 # Incremental doesn't use examples directly in the same way
 550 |                 example_entities=None, example_relationships=None,
 551 |                 **common_args # Pass other args, but they might be less relevant
 552 |             )
 553 |         elif strategy == GraphStrategy.STRUCTURED:
 554 |             # Ensure examples are provided or generated if possible
 555 |             if not example_entities and domain and domain in FEW_SHOT_EXAMPLES:
 556 |                 common_args["example_entities"] = FEW_SHOT_EXAMPLES[domain]["entities"]
 557 |                 common_args["example_relationships"] = FEW_SHOT_EXAMPLES[domain]["relationships"]
 558 |                 logger.info(f"Using few-shot examples for domain: {domain}")
 559 |             elif not example_entities:
 560 |                  logger.warning("Structured strategy selected but no examples provided or found for domain. Extraction quality may suffer.")
 561 |             extraction_result = await _perform_structured_extraction(**common_args)
 562 | 
 563 |         elif strategy == GraphStrategy.STRICT_SCHEMA:
 564 |             if not schema: # Should have been caught earlier, but double-check
 565 |                  raise ToolInputError("Strict schema strategy requires a schema (domain or custom).")
 566 |             # Remove schema from common_args to avoid passing it twice
 567 |             schema_copy = schema.copy()
 568 |             schema_args = common_args.copy()
 569 |             if 'schema' in schema_args:
 570 |                 del schema_args['schema']
 571 |             extraction_result = await _perform_schema_guided_extraction(
 572 |                 schema=schema_copy,
 573 |                 # Strict schema doesn't use generic examples
 574 |                 example_entities=None, example_relationships=None,
 575 |                 **schema_args
 576 |             )
 577 | 
 578 |         # Get usage stats from the result
 579 |         used_model = extraction_result.get("model", used_model)
 580 |         total_tokens = extraction_result.get("tokens", total_tokens)
 581 |         total_cost = extraction_result.get("cost", total_cost)
 582 | 
 583 |     except (ProviderError, ToolError) as e:
 584 |         logger.error(f"Entity graph extraction failed: {e}", exc_info=True)
 585 |         raise # Re-raise ProviderError or ToolError
 586 |     except Exception as e:
 587 |         logger.error(f"An unexpected error occurred during extraction: {e}", exc_info=True)
 588 |         raise ToolError(f"Extraction failed due to an unexpected error: {e}") from e
 589 | 
 590 | 
 591 |     # --- Post-processing ---
 592 |     logger.info("Performing post-processing steps...")
 593 |     # Validate and clean final data (important after merging/incremental)
 594 |     extraction_result = _validate_graph_data(
 595 |         extraction_result,
 596 |         min_confidence=min_confidence,
 597 |         max_entities=max_entities,
 598 |         max_relations=max_relations
 599 |     )
 600 | 
 601 |     # Normalize entity names if requested
 602 |     if normalize_entities:
 603 |         logger.info("Normalizing entities...")
 604 |         extraction_result = _normalize_entities(extraction_result)
 605 |         # Re-validate after normalization as IDs might change
 606 |         extraction_result = _validate_graph_data(
 607 |             extraction_result,
 608 |             min_confidence=min_confidence,
 609 |             max_entities=max_entities,
 610 |             max_relations=max_relations
 611 |         )
 612 | 
 613 | 
 614 |     # Add computed graph metrics and sort
 615 |     logger.info(f"Calculating graph metrics and sorting by '{sort_by}'...")
 616 |     extraction_result = _add_graph_metrics(extraction_result, sort_by)
 617 | 
 618 |     # Generate visualization if requested
 619 |     visualization_data = None
 620 |     if visualization_format != VisualizationFormat.NONE:
 621 |         logger.info(f"Generating visualization in format: {visualization_format.value}")
 622 |         visualization_data = _generate_visualization(
 623 |             extraction_result,
 624 |             visualization_format
 625 |         )
 626 |         if visualization_data and "error" in visualization_data:
 627 |             logger.warning(f"Visualization generation failed: {visualization_data['error']}")
 628 | 
 629 | 
 630 |     # Create query interface
 631 |     query_interface = None
 632 |     if HAS_NETWORKX:
 633 |         logger.info("Creating query interface...")
 634 |         query_interface = _create_query_interface(extraction_result)
 635 | 
 636 |     processing_time = time.time() - start_time
 637 | 
 638 |     # Format output according to requested format
 639 |     logger.info(f"Formatting output as: {output_format.value}")
 640 |     formatted_result = _format_output(extraction_result, output_format)
 641 | 
 642 |     # --- Prepare Final Result ---
 643 |     final_result = {
 644 |         "entities": formatted_result.get("entities", []),
 645 |         "relationships": formatted_result.get("relationships", []),
 646 |         "metadata": {
 647 |             "entity_count": len(formatted_result.get("entities", [])),
 648 |             "relationship_count": len(formatted_result.get("relationships", [])),
 649 |             "entity_types": sorted(list(set(e.get("type", "Unknown") for e in formatted_result.get("entities", [])))),
 650 |             "relation_types": sorted(list(set(r.get("type", "Unknown") for r in formatted_result.get("relationships", [])))),
 651 |             "processing_strategy": strategy.value,
 652 |             "extraction_date": datetime.now(timezone.utc).isoformat(),
 653 |             "schema_used": "domain: " + domain if domain else ("custom" if schema else "none"),
 654 |             "metrics": formatted_result.get("metrics", extraction_result.get("metrics")), # Include metrics if calculated
 655 |             "incremental_stats": extraction_result.get("incremental_stats"), # Include if incremental
 656 |         },
 657 |         "provider": provider,
 658 |         "model": used_model,
 659 |         "tokens": total_tokens,
 660 |         "cost": total_cost,
 661 |         "processing_time": round(processing_time, 2),
 662 |         "success": True
 663 |     }
 664 | 
 665 |     # Add format-specific data
 666 |     if output_format == OutputFormat.NETWORKX and "graph" in formatted_result:
 667 |         final_result["graph"] = formatted_result["graph"]
 668 |     elif output_format == OutputFormat.RDF and "rdf_triples" in formatted_result:
 669 |         final_result["rdf_triples"] = formatted_result["rdf_triples"]
 670 |     elif output_format == OutputFormat.CYTOSCAPE and "cytoscape" in formatted_result:
 671 |         final_result["cytoscape"] = formatted_result["cytoscape"]
 672 |     elif output_format == OutputFormat.D3 and "d3" in formatted_result:
 673 |         final_result["d3"] = formatted_result["d3"]
 674 |     elif output_format == OutputFormat.NEO4J and "neo4j_queries" in formatted_result:
 675 |         final_result["neo4j_queries"] = formatted_result["neo4j_queries"]
 676 | 
 677 |     # Add visualization if generated
 678 |     if visualization_data:
 679 |         final_result["visualization"] = visualization_data
 680 | 
 681 |     # Add query interface if generated
 682 |     if query_interface:
 683 |         final_result["query_interface"] = query_interface
 684 | 
 685 |     # Log success
 686 |     logger.success(
 687 |         f"Entity graph extraction completed successfully ({final_result['metadata']['entity_count']} entities, "
 688 |         f"{final_result['metadata']['relationship_count']} relationships)",
 689 |         strategy=strategy.value, model=used_model, cost=f"{total_cost:.6f}", time=f"{processing_time:.2f}s"
 690 |     )
 691 | 
 692 |     return final_result
 693 | 
 694 | 
 695 | # --- Helper: Robust JSON Parsing ---
 696 | def _parse_json_from_response(response_text: str) -> Dict[str, Any]:
 697 |     """Attempts to parse JSON from the LLM response text, handling various formats."""
 698 |     try:
 699 |         # 1. Try direct parsing (if response is pure JSON)
 700 |         return json.loads(response_text)
 701 |     except json.JSONDecodeError as e:
 702 |         logger.debug(f"Direct JSON parsing failed ({e}); attempting fallback extraction from the response text.")
 703 |         # 2. Try finding JSON within markdown code blocks (```json ... ```)
 704 |         match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL | re.IGNORECASE)
 705 |         if match:
 706 |             try:
 707 |                 return json.loads(match.group(1))
 708 |             except json.JSONDecodeError:
 709 |                 pass # Fall through to next method
 710 | 
 711 |         # 3. Try finding the largest JSON object/array within the text
 712 |         # Be careful with greedy matching, find first '{' and last '}'
 713 |         start_index = response_text.find('{')
 714 |         end_index = response_text.rfind('}')
 715 |         if start_index != -1 and end_index != -1 and end_index > start_index:
 716 |             potential_json = response_text[start_index : end_index + 1]
 717 |             try:
 718 |                 # Validate structure minimally before parsing
 719 |                 if potential_json.count('{') == potential_json.count('}'):
 720 |                     return json.loads(potential_json)
 721 |             except json.JSONDecodeError:
 722 |                 pass # Fall through
 723 | 
 724 |         # 4. If nothing worked, raise an error
 725 |         raise ValueError("No valid JSON object found in the response text.") from e
 726 | 
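# Illustrative behaviour of _parse_json_from_response (not an exhaustive spec):
# given a fenced response such as
#   '```json\n{"entities": [], "relationships": []}\n```'
# the direct json.loads() attempt fails, the code-block regex fallback matches,
# and the function returns {"entities": [], "relationships": []}. Text with no
# recoverable JSON object raises ValueError instead.
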
 727 | # --- Helper: Build Common Prompt Sections ---
 728 | def _build_common_prompt_instructions(
 729 |     entity_types: Optional[List[str]],
 730 |     relation_types: Optional[List[str]],
 731 |     schema: Optional[Dict[str, Any]],
 732 |     language: Optional[str],
 733 |     enable_reasoning: bool,
 734 |     format_structure: str,
 735 |     include_positions: bool,
 736 |     include_attributes: bool,
 737 |     include_evidence: bool,
 738 |     include_temporal_info: bool,
 739 |     min_confidence: float,
 740 |     max_entities: int,
 741 |     max_relations: int,
 742 |     examples: Optional[str] = None,
 743 |     task_description: str = "Extract entities and their relationships from the text below."
 744 | ) -> str:
 745 |     """Builds the common instruction part of the LLM prompt."""
 746 | 
 747 |     # Type constraints
 748 |     entity_types_str = "ENTITY TYPES to extract (if specified):\n- " + "\n- ".join(entity_types) if entity_types else "Extract relevant entity types."
 749 |     relation_types_str = "RELATIONSHIP TYPES to extract (if specified):\n- " + "\n- ".join(relation_types) if relation_types else "Extract relevant relationship types."
 750 | 
 751 |     # Schema guidance
 752 |     schema_guidance = ""
 753 |     if schema:
 754 |         schema_guidance = "SCHEMA:\n"
 755 |         ent_schema = schema.get("entities")
 756 |         rel_schema = schema.get("relationships")
 757 |         if ent_schema:
 758 |             schema_guidance += "Entity Types:\n"
 759 |             for et in ent_schema:
 760 |                 attrs = f" (Attributes: {', '.join(et.get('attributes', []))})" if et.get('attributes') else ""
 761 |                 schema_guidance += f"- {et.get('type', 'Unknown')}{attrs}\n"
 762 |         if rel_schema:
 763 |             schema_guidance += "Relationship Types:\n"
 764 |             for rt in rel_schema:
 765 |                 src = ', '.join(rt.get('source_types', ['Any']))
 766 |                 tgt = ', '.join(rt.get('target_types', ['Any']))
 767 |                 schema_guidance += f"- {rt.get('type', 'Unknown')} (From: {src}, To: {tgt})\n"
 768 |         schema_guidance += "\n"
 769 | 
 770 |     # Language
 771 |     language_instruction = f"The text is in {language}. Adapt extraction accordingly.\n" if language else ""
 772 | 
 773 |     # Reasoning steps
 774 |     reasoning_instruction = ""
 775 |     if enable_reasoning:
 776 |         reasoning_instruction = """
 777 | REASONING STEPS (Think step-by-step before generating the final JSON):
 778 | 1. Identify potential entity mentions.
 779 | 2. Group mentions referring to the same real-world entity.
 780 | 3. Assign a unique ID and the most appropriate type (from schema/list if provided) to each entity.
 781 | 4. Extract specified attributes for each entity.
 782 | 5. Identify explicit and strongly implied relationships between extracted entities.
 783 | 6. Assign the most appropriate type (from schema/list if provided) to each relationship.
 784 | 7. Determine directionality (source -> target).
 785 | 8. Extract evidence and temporal information if requested.
 786 | 9. Estimate confidence (0.0-1.0) for each relationship.
 787 | 10. Format the final output strictly as requested JSON.\n
 788 | """
 789 |     # Output format definition
 790 |     format_instructions = f"""
 791 | OUTPUT FORMAT:
 792 | Respond with a valid JSON object containing two keys: "entities" and "relationships".
 793 | Adhere STRICTLY to this structure:
 794 | ```json
 795 | {{
 796 |   "entities": [
 797 |     {{
 798 |       "id": "ent<unique_number>",
 799 |       "name": "entity_name",
 800 |       "type": "entity_type"
 801 |       {', "mentions": [{"text": "mention_text", "pos": [start_pos, end_pos]}]' if include_positions else ''}
 802 |       {', "attributes": { "attr_name": "value", ... }' if include_attributes else ''}
 803 |     }}
 804 |     // ... more entities
 805 |   ],
 806 |   "relationships": [
 807 |     {{
 808 |       "id": "rel<unique_number>",
 809 |       "source": "source_entity_id", // Must be an ID from the 'entities' list
 810 |       "target": "target_entity_id", // Must be an ID from the 'entities' list
 811 |       "type": "relationship_type"
 812 |       {', "confidence": 0.xx' } // Confidence score 0.0-1.0
 813 |       {', "evidence": "text snippet supporting the relationship"' if include_evidence else ''}
 814 |       {', "temporal": { "start": "...", "end": "...", "point": "..." }' if include_temporal_info else ''}
 815 |     }}
 816 |     // ... more relationships
 817 |   ]
 818 | }}
 819 | ```
 820 | 
 821 | CONSTRAINTS:
 822 | - Generate unique IDs for all entities and relationships (e.g., "ent1", "rel1").
 823 | - Only include relationships with confidence >= {min_confidence:.2f}.
 824 | - Limit results to approximately {max_entities} most important entities and {max_relations} most significant relationships.
 825 | - Ensure all relationship `source` and `target` IDs exist in the `entities` list.
 826 | - If using a schema, strictly adhere to the defined types and attributes.
 827 | {examples if examples else ""}
 828 | """
 829 | 
 830 |     # Combine parts
 831 |     full_instructions = f"""
 832 | {task_description}
 833 | 
 834 | {entity_types_str}
 835 | {relation_types_str}
 836 | 
 837 | {schema_guidance}
 838 | {language_instruction}
 839 | {reasoning_instruction}
 840 | {format_instructions}
 841 | """
 842 |     return full_instructions.strip()
 843 | 
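# Illustrative rendering (assuming no schema, language, or reasoning flags):
# entity_types=["Person", "Organization"] produces
#   "ENTITY TYPES to extract (if specified):\n- Person\n- Organization"
# while entity_types=None falls back to "Extract relevant entity types."
# The same pattern applies to relation_types.
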
 844 | # --- Helper: Call LLM via generate_completion ---
 845 | async def _call_llm_for_extraction(
 846 |     prompt: str,
 847 |     system_prompt: Optional[str],
 848 |     provider_instance: BaseProvider,
 849 |     model: str,
 850 |     max_tokens_per_request: Optional[int],
 851 |     additional_params: Optional[Dict[str, Any]],
 852 |     task_name: str # For logging/error reporting
 853 | ) -> Dict[str, Any]:
 854 |     """Helper to call the LLM and handle basic response validation."""
 855 |     logger.debug(f"Calling LLM for {task_name}. Prompt length: {len(prompt)} chars.")
 856 |     # Set low temperature for deterministic extraction by default
 857 |     temp = additional_params.pop("temperature", 0.1) if additional_params else 0.1
 858 | 
 859 |     # If system_prompt is provided, prepend it to the prompt
 860 |     if system_prompt:
 861 |         # Combine system prompt and user prompt
 862 |         full_prompt = f"System: {system_prompt}\n\nUser: {prompt}"
 863 |     else:
 864 |         full_prompt = prompt
 865 | 
 866 |     try:
 867 |         # Call generate_completion without system_prompt parameter
 868 |         completion_result = await generate_completion(
 869 |             prompt=full_prompt,
 870 |             model=model,
 871 |             provider=provider_instance.provider_name, # Use provider name string
 872 |             temperature=temp,
 873 |             max_tokens=max_tokens_per_request,
 874 |             additional_params=additional_params
 875 |         )
 876 | 
 877 |         if not completion_result.get("success", False):
 878 |             error_message = completion_result.get("error", f"Unknown error during {task_name}")
 879 |             logger.error(f"{task_name} failed via generate_completion: {error_message}")
 880 |             raise ProviderError(
 881 |                 f"{task_name} failed: {error_message}",
 882 |                 provider=provider_instance.provider_name,
 883 |                 model=model,
 884 |                 details=completion_result.get("details")
 885 |             )
 886 | 
 887 |         # Parse JSON robustly
 888 |         try:
 889 |             graph_data = _parse_json_from_response(completion_result["text"])
 890 |         except (json.JSONDecodeError, ValueError) as e:
 891 |             logger.error(f"Failed to parse JSON from {task_name} response: {e}", exc_info=True)
 892 |             raise ToolError(
 893 |                 f"Failed to parse JSON from {task_name}: {e}",
 894 |                 error_code="PARSING_ERROR",
 895 |                 details={"response_text": completion_result["text"][:1000]} # Log truncated response
 896 |             ) from e
 897 | 
 898 |         # Add metadata to the result
 899 |         graph_data["model"] = completion_result.get("model", model)
 900 |         graph_data["tokens"] = completion_result.get("tokens", {"input": 0, "output": 0, "total": 0})
 901 |         graph_data["cost"] = completion_result.get("cost", 0.0)
 902 | 
 903 |         return graph_data
 904 | 
 905 |     except ProviderError as e:
 906 |         # Catch provider errors from generate_completion or raised above
 907 |         raise e # Reraise ProviderError
 908 |     except ToolError as e:
 909 |         # Catch parsing errors raised above
 910 |         raise e # Reraise ToolError
 911 |     except Exception as e:
 912 |         # Catch any other unexpected errors during the call or parsing
 913 |         logger.error(f"Unexpected error during {task_name} LLM call: {e}", exc_info=True)
 914 |         raise ProviderError(
 915 |             f"Unexpected error during {task_name} for model '{model}': {e}",
 916 |             provider=provider_instance.provider_name,
 917 |             model=model
 918 |         ) from e
 919 | 
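# Usage sketch (illustrative; the provider instance, model name, and token limit
# shown here are placeholders resolved by the calling strategy functions):
#
#   graph_data = await _call_llm_for_extraction(
#       prompt=prompt,
#       system_prompt=None,
#       provider_instance=provider_instance,
#       model=model,
#       max_tokens_per_request=4000,
#       additional_params=None,
#       task_name="Standard Extraction",
#   )
#   # On success, graph_data holds the parsed "entities"/"relationships" JSON plus
#   # the "model", "tokens", and "cost" metadata attached above.
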
 920 | # --- Strategy Implementation Functions ---
 921 | 
 922 | def _validate_graph_data(
 923 |     graph_data: Dict[str, Any],
 924 |     min_confidence: float,
 925 |     max_entities: int,
 926 |     max_relations: int
 927 | ) -> Dict[str, Any]:
 928 |     """Validates, cleans, and standardizes extracted graph data."""
 929 |     logger.debug("Validating and cleaning extracted graph data...")
 930 |     entities = graph_data.get("entities", [])
 931 |     relationships = graph_data.get("relationships", [])
 932 | 
 933 |     if not isinstance(entities, list):
 934 |         logger.warning(f"Entities data is not a list, received {type(entities)}. Resetting to empty list.")
 935 |         entities = []
 936 |     if not isinstance(relationships, list):
 937 |         logger.warning(f"Relationships data is not a list, received {type(relationships)}. Resetting to empty list.")
 938 |         relationships = []
 939 | 
 940 |     valid_entities_map: Dict[str, Dict[str, Any]] = {}
 941 |     entity_id_counter = 1
 942 |     processed_ids = set()
 943 | 
 944 |     # Validate and process entities
 945 |     for entity in entities:
 946 |         if not isinstance(entity, dict) or "name" not in entity or "type" not in entity:
 947 |             logger.warning(f"Skipping invalid entity format: {entity}")
 948 |             continue
 949 | 
 950 |         entity_id = entity.get("id")
 951 |         # Ensure ID exists and is unique, generate if needed
 952 |         if not entity_id or not isinstance(entity_id, str) or entity_id in processed_ids:
 953 |             entity_id = f"ent{entity_id_counter}"
 954 |             while entity_id in processed_ids:
 955 |                 entity_id_counter += 1
 956 |                 entity_id = f"ent{entity_id_counter}"
 957 |         entity_id_counter = max(entity_id_counter, int(re.sub(r'\D', '', entity_id) or '0') + 1) # Keep counter high
 958 | 
 959 |         processed_ids.add(entity_id)
 960 |         entity["id"] = entity_id
 961 | 
 962 |         # Validate mentions
 963 |         if "mentions" in entity:
 964 |             if not isinstance(entity["mentions"], list):
 965 |                 entity["mentions"] = []
 966 |             else:
 967 |                 valid_mentions = []
 968 |                 for mention in entity["mentions"]:
 969 |                     if isinstance(mention, dict) and "text" in mention:
 970 |                         if "pos" in mention:
 971 |                             pos = mention["pos"]
 972 |                             if not (isinstance(pos, list) and len(pos) == 2 and all(isinstance(p, (int, float)) for p in pos)):
 973 |                                 logger.debug(f"Invalid position format for mention '{mention.get('text')}' in entity {entity_id}. Removing position.")
 974 |                                 mention.pop("pos", None) # Remove invalid pos
 975 |                         valid_mentions.append(mention)
 976 |                 entity["mentions"] = valid_mentions
 977 | 
 978 |         # Validate attributes
 979 |         if "attributes" in entity:
 980 |             if not isinstance(entity["attributes"], dict):
 981 |                 entity["attributes"] = {}
 982 |             else:
 983 |                 # Clean attribute values
 984 |                 cleaned_attrs = {}
 985 |                 for key, value in entity["attributes"].items():
 986 |                     if isinstance(value, (str, int, float, bool)):
 987 |                         cleaned_attrs[key] = value
 988 |                     elif value is not None:
 989 |                         try:
 990 |                             cleaned_attrs[key] = str(value) # Convert other types to string
 991 |                         except Exception:
 992 |                              logger.debug(f"Could not convert attribute value for key '{key}' in entity {entity_id}. Skipping attribute.")
 993 |                 entity["attributes"] = cleaned_attrs
 994 | 
 995 |         valid_entities_map[entity_id] = entity
 996 | 
 997 |     # Limit entities if needed (do this before relationship validation)
 998 |     final_entities = list(valid_entities_map.values())
 999 |     if len(final_entities) > max_entities:
1000 |         logger.info(f"Reducing entities from {len(final_entities)} to {max_entities} (limit).")
1001 |         # Basic limiting, could be smarter (e.g., based on centrality if available later)
1002 |         final_entities = final_entities[:max_entities]
1003 |         valid_entity_ids = {e["id"] for e in final_entities}
1004 |     else:
1005 |         valid_entity_ids = set(valid_entities_map.keys())
1006 | 
1007 | 
1008 |     # Validate and process relationships
1009 |     valid_relationships = []
1010 |     rel_id_counter = 1
1011 |     processed_rel_ids = set()
1012 |     relationship_signatures = set() # (source_id, target_id, type)
1013 | 
1014 |     for rel in relationships:
1015 |         if not isinstance(rel, dict) or "source" not in rel or "target" not in rel or "type" not in rel:
1016 |             logger.warning(f"Skipping invalid relationship format: {rel}")
1017 |             continue
1018 | 
1019 |         source_id = rel["source"]
1020 |         target_id = rel["target"]
1021 |         rel_type = rel["type"]
1022 | 
1023 |         # Check if source and target entities are valid *after* entity limiting
1024 |         if source_id not in valid_entity_ids or target_id not in valid_entity_ids:
1025 |             logger.debug(f"Skipping relationship referencing invalid/removed entity: {rel}")
1026 |             continue
1027 | 
1028 |         # Check confidence
1029 |         confidence = rel.get("confidence")
1030 |         try:
1031 |             confidence = float(confidence) if confidence is not None else 1.0 # Default to 1.0 if missing
1032 |             if not (0.0 <= confidence <= 1.0):
1033 |                 confidence = max(0.0, min(1.0, confidence)) # Clamp to range
1034 |         except (ValueError, TypeError):
1035 |             logger.warning(f"Invalid confidence '{confidence}' for relationship {rel.get('id')}. Setting to 0.5.")
1036 |             confidence = 0.5
1037 |         rel["confidence"] = confidence
1038 | 
1039 |         if confidence < min_confidence:
1040 |             logger.debug(f"Skipping relationship below confidence threshold ({confidence:.2f} < {min_confidence:.2f}): {rel}")
1041 |             continue
1042 | 
1043 |         # Check for duplicates based on (source, target, type)
1044 |         signature = (source_id, target_id, rel_type)
1045 |         if signature in relationship_signatures:
1046 |             logger.debug(f"Skipping duplicate relationship: {signature}")
1047 |             continue
1048 |         relationship_signatures.add(signature)
1049 | 
1050 | 
1051 |         # Ensure ID exists and is unique, generate if needed
1052 |         rel_id = rel.get("id")
1053 |         if not rel_id or not isinstance(rel_id, str) or rel_id in processed_rel_ids:
1054 |              rel_id = f"rel{rel_id_counter}"
1055 |              while rel_id in processed_rel_ids:
1056 |                  rel_id_counter += 1
1057 |                  rel_id = f"rel{rel_id_counter}"
1058 |         rel_id_counter = max(rel_id_counter, int(re.sub(r'\D', '', rel_id) or '0') + 1)
1059 | 
1060 |         processed_rel_ids.add(rel_id)
1061 |         rel["id"] = rel_id
1062 | 
1063 | 
1064 |         # Validate evidence
1065 |         if "evidence" in rel and not isinstance(rel["evidence"], str):
1066 |             try:
1067 |                 rel["evidence"] = str(rel["evidence"])
1068 |             except Exception:
1069 |                  rel.pop("evidence") # Remove invalid evidence
1070 | 
1071 |         # Validate temporal info
1072 |         if "temporal" in rel:
1073 |             if not isinstance(rel["temporal"], dict):
1074 |                 rel.pop("temporal")
1075 |             else:
1076 |                 # Ensure values are simple types
1077 |                 cleaned_temporal = {}
1078 |                 for key, value in rel["temporal"].items():
1079 |                     if isinstance(value, (str, int, float, bool)):
1080 |                         cleaned_temporal[key] = value
1081 |                 rel["temporal"] = cleaned_temporal if cleaned_temporal else None
1082 |                 if rel["temporal"] is None: 
1083 |                     rel.pop("temporal")
1084 | 
1085 | 
1086 |         valid_relationships.append(rel)
1087 | 
1088 |     # Limit relationships if needed
1089 |     if len(valid_relationships) > max_relations:
1090 |         logger.info(f"Reducing relationships from {len(valid_relationships)} to {max_relations} (limit).")
1091 |         # Sort by confidence before limiting
1092 |         valid_relationships.sort(key=lambda r: r.get("confidence", 0.0), reverse=True)
1093 |         valid_relationships = valid_relationships[:max_relations]
1094 | 
1095 |     # Return validated data, ensuring the keys exist even if empty
1096 |     validated_data = {
1097 |         "entities": final_entities,
1098 |         "relationships": valid_relationships
1099 |     }
1100 |     # Carry over other keys like model, tokens, cost if they exist
1101 |     for key in graph_data:
1102 |         if key not in validated_data:
1103 |             validated_data[key] = graph_data[key]
1104 | 
1105 |     logger.debug(f"Validation complete. Entities: {len(validated_data['entities'])}, Relationships: {len(validated_data['relationships'])}")
1106 |     return validated_data
1107 | 
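# Illustrative effects of _validate_graph_data:
# - an entity arriving without an "id" (or with a duplicate id) is assigned a fresh
#   "ent<N>" id;
# - a relationship with a non-numeric "confidence" is logged and reset to 0.5, while
#   out-of-range values are clamped to [0.0, 1.0];
# - relationships that reference entities dropped by the max_entities cap, fall below
#   min_confidence, or duplicate an existing (source, target, type) triple are discarded.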
1108 | 
1109 | async def _perform_standard_extraction(**kwargs) -> Dict[str, Any]:
1110 |     """Performs extraction in a single step."""
1111 |     logger.info("Performing standard extraction...")
1112 |     # Build prompt using common helper
1113 |     instructions = _build_common_prompt_instructions(
1114 |         task_description="Extract entities and relationships from the text below.",
1115 |         format_structure="STANDARD", # Placeholder, actual structure is defined inside helper
1116 |         **{k: kwargs[k] for k in [
1117 |             'entity_types', 'relation_types', 'schema', 'language', 'enable_reasoning',
1118 |             'include_positions', 'include_attributes', 'include_evidence', 'include_temporal_info',
1119 |             'min_confidence', 'max_entities', 'max_relations'
1120 |         ]}
1121 |     )
1122 | 
1123 |     # Handle custom prompt template
1124 |     if kwargs.get("custom_prompt"):
1125 |         prompt = kwargs["custom_prompt"].format(
1126 |             text=kwargs["text"],
1127 |             instructions=instructions,
1128 |             schema_info=kwargs.get("schema", ""), # Provide schema if needed by template
1129 |             examples=kwargs.get("example_entities", "") # Provide examples if needed
1130 |         )
1131 |     else:
1132 |         prompt = f"{instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1133 | 
1134 |     # Determine system prompt
1135 |     sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("standard_extraction")
1136 | 
1137 |     # Call LLM
1138 |     graph_data = await _call_llm_for_extraction(
1139 |         prompt=prompt,
1140 |         system_prompt=sys_prompt,
1141 |         provider_instance=kwargs["provider_instance"],
1142 |         model=kwargs["model"],
1143 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1144 |         additional_params=kwargs["additional_params"],
1145 |         task_name="Standard Extraction"
1146 |     )
1147 | 
1148 |     return graph_data # Validation happens in the main function
1149 | 
1150 | 
1151 | async def _perform_multistage_extraction(automatic_coreference: bool, **kwargs) -> Dict[str, Any]:
1152 |     """Performs extraction in two stages: entities first, then relationships."""
1153 |     logger.info("Performing multi-stage extraction...")
1154 |     total_tokens = {"input": 0, "output": 0, "total": 0}
1155 |     total_cost = 0.0
1156 |     model_used = kwargs["model"] # Assume same model for both stages
1157 | 
1158 |     # --- Stage 1: Entity Extraction ---
1159 |     logger.info("Multi-stage: Starting entity extraction...")
1160 |     entity_instructions = _build_common_prompt_instructions(
1161 |         task_description="Extract entities from the text below.",
1162 |         relation_types=None, # No relationships in this stage
1163 |         include_evidence=False, include_temporal_info=False, # Not relevant for entities only
1164 |         max_relations=0, # No relationships
1165 |         format_structure="ENTITIES_ONLY", # Needs adaptation in helper or specific format here
1166 |          **{k: kwargs[k] for k in [
1167 |             'entity_types', 'schema', 'language', 'enable_reasoning',
1168 |             'include_positions', 'include_attributes', 'min_confidence', 'max_entities'
1169 |             # Pass min_confidence and max_entities to filter/limit early if desired
1170 |         ]}
1171 |     )
1172 |     # Adjust format instruction specifically for entities
1173 |     entity_instructions = entity_instructions.replace(
1174 |          'Respond with a valid JSON object containing two keys: "entities" and "relationships".',
1175 |          'Respond with a valid JSON object containing ONLY the key "entities".'
1176 |     )
1177 |     entity_instructions = re.sub(r'"relationships":\s*\[.*?\]\s*\}', '}', entity_instructions, flags=re.DOTALL)
1178 | 
1179 | 
1180 |     # Handle custom prompt template (assuming it can handle stages)
1181 |     if kwargs.get("custom_prompt"):
1182 |          entity_prompt = kwargs["custom_prompt"].format(
1183 |              text=kwargs["text"], instructions=entity_instructions, stage="entities"
1184 |          )
1185 |     else:
1186 |         entity_prompt = f"{entity_instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1187 | 
1188 |     entity_sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("entity_detection")
1189 | 
1190 |     entity_result = await _call_llm_for_extraction(
1191 |         prompt=entity_prompt,
1192 |         system_prompt=entity_sys_prompt,
1193 |         provider_instance=kwargs["provider_instance"],
1194 |         model=kwargs["model"],
1195 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1196 |         additional_params=kwargs["additional_params"],
1197 |         task_name="Entity Extraction (Multi-stage)"
1198 |     )
1199 | 
1200 |     entities = entity_result.get("entities", [])
1201 |     if not isinstance(entities, list): 
1202 |         entities = []
1203 | 
1204 |     total_tokens["input"] += entity_result.get("tokens", {}).get("input", 0)
1205 |     total_tokens["output"] += entity_result.get("tokens", {}).get("output", 0)
1206 |     total_cost += entity_result.get("cost", 0.0)
1207 |     model_used = entity_result.get("model", model_used)
1208 | 
1209 |     if not entities:
1210 |         logger.warning("Multi-stage: No entities found in the first stage.")
1211 |         return {"entities": [], "relationships": [], "model": model_used, "tokens": total_tokens, "cost": total_cost}
1212 | 
1213 |     # Preliminary validation/cleaning of entities
1214 |     temp_validated = _validate_graph_data({"entities": entities}, 0.0, kwargs["max_entities"] * 2, 0) # Higher limit before rel stage
1215 |     entities = temp_validated["entities"]
1216 |     if not entities:
1217 |         logger.warning("Multi-stage: No valid entities after preliminary validation.")
1218 |         return {"entities": [], "relationships": [], "model": model_used, "tokens": total_tokens, "cost": total_cost}
1219 |     logger.info(f"Multi-stage: Found {len(entities)} potential entities.")
1220 | 
1221 | 
1222 |     # --- Stage 2: Relationship Extraction ---
1223 |     logger.info("Multi-stage: Starting relationship extraction...")
1224 |     # Format entity list for the relationship prompt
1225 |     entity_list_str = "EXTRACTED ENTITIES:\n"
1226 |     entity_id_map = {}
1227 |     for i, entity in enumerate(entities):
1228 |         entity_id = entity.get("id", f"temp_ent{i+1}")
1229 |         entity['id'] = entity_id # Ensure ID exists
1230 |         entity_id_map[entity_id] = entity
1231 |         entity_list_str += f"- ID: {entity_id}, Name: {entity.get('name', 'N/A')}, Type: {entity.get('type', 'N/A')}\n"
1232 | 
1233 | 
1234 |     relationship_instructions = _build_common_prompt_instructions(
1235 |         task_description="Identify relationships BETWEEN the provided entities based on the text.",
1236 |         entity_types=None, # Entities are provided, not extracted here
1237 |         include_positions=False, include_attributes=False, # Not relevant for relationships only
1238 |         max_entities=0, # No new entities
1239 |         format_structure="RELATIONSHIPS_ONLY", # Needs adaptation
1240 |          **{k: kwargs[k] for k in [
1241 |             'relation_types', 'schema', 'language', 'enable_reasoning',
1242 |             'include_evidence', 'include_temporal_info',
1243 |             'min_confidence', 'max_relations'
1244 |         ]}
1245 |     )
1246 |     # Adjust format instruction specifically for relationships
1247 |     relationship_instructions = relationship_instructions.replace(
1248 |          'Respond with a valid JSON object containing two keys: "entities" and "relationships".',
1249 |          'Respond with a valid JSON object containing ONLY the key "relationships". Reference the provided entity IDs.'
1250 |     )
1251 |     relationship_instructions = re.sub(r'"entities":\s*\[.*?\]\s*,?\s*', '', relationship_instructions, flags=re.DOTALL)
1252 | 
1253 | 
1254 |     # Handle custom prompt template
1255 |     if kwargs.get("custom_prompt"):
1256 |         relationship_prompt = kwargs["custom_prompt"].format(
1257 |             text=kwargs["text"],
1258 |             instructions=relationship_instructions,
1259 |             entities=entity_list_str, # Provide entities if template uses it
1260 |             stage="relationships"
1261 |         )
1262 |     else:
1263 |         relationship_prompt = f"{entity_list_str}\n{relationship_instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1264 | 
1265 |     relationship_sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("relationship_detection")
1266 | 
1267 |     relationship_result = await _call_llm_for_extraction(
1268 |         prompt=relationship_prompt,
1269 |         system_prompt=relationship_sys_prompt,
1270 |         provider_instance=kwargs["provider_instance"],
1271 |         model=kwargs["model"],
1272 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1273 |         additional_params=kwargs["additional_params"],
1274 |         task_name="Relationship Extraction (Multi-stage)"
1275 |     )
1276 | 
1277 |     relationships = relationship_result.get("relationships", [])
1278 |     if not isinstance(relationships, list): 
1279 |         relationships = []
1280 | 
1281 |     total_tokens["input"] += relationship_result.get("tokens", {}).get("input", 0)
1282 |     total_tokens["output"] += relationship_result.get("tokens", {}).get("output", 0)
1283 |     total_tokens["total"] = total_tokens["input"] + total_tokens["output"]
1284 |     total_cost += relationship_result.get("cost", 0.0)
1285 |     model_used = relationship_result.get("model", model_used) # Update model if different
1286 | 
1287 |     logger.info(f"Multi-stage: Found {len(relationships)} potential relationships.")
1288 | 
1289 |     # Combine results (Validation happens in the main function)
1290 |     # Note: Coreference resolution would happen here or during merging if chunked
1291 |     combined_result = {
1292 |         "entities": entities,
1293 |         "relationships": relationships,
1294 |         "model": model_used,
1295 |         "tokens": total_tokens,
1296 |         "cost": total_cost
1297 |     }
1298 | 
1299 |     return combined_result
1300 | 
1301 | 
1302 | async def _perform_chunked_extraction(
1303 |     context_window: int,
1304 |     chunk_size: Optional[int],
1305 |     automatic_coreference: bool,
1306 |     **kwargs
1307 | ) -> Dict[str, Any]:
1308 |     """Chunks large text, processes chunks, and merges results."""
1309 |     logger.info("Performing chunked extraction...")
1310 |     text = kwargs["text"]
1311 |     provider_instance = kwargs["provider_instance"]  # noqa: F841
1312 |     model = kwargs["model"]
1313 | 
1314 |     # --- Chunking ---
1315 |     # Estimate chunk size if not provided
1316 |     effective_chunk_size = chunk_size
1317 |     if not effective_chunk_size:
1318 |         # Estimate required tokens for prompt boilerplate + response
1319 |         # This is very approximate
1320 |         prompt_overhead = 1000 # Tokens for instructions, schema, etc.
1321 |         response_allowance = 1500 # Tokens expected in response
1322 |         available_for_text = context_window - prompt_overhead - response_allowance
1323 |         if available_for_text <= 100: # Need at least some text
1324 |             raise ToolInputError(f"Estimated context window ({context_window}) too small for chunking with overhead.")
1325 |         # Use 80% of available space for text chunk to be safe
1326 |         effective_chunk_size = int(available_for_text * 0.8)
1327 |         logger.info(f"Calculated chunk size: {effective_chunk_size} tokens (based on context window {context_window})")
1328 |     effective_chunk_size = max(500, min(effective_chunk_size, 16000)) # Apply reasonable bounds
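    # Worked example (illustrative): with context_window=8192 and no explicit
    # chunk_size, available_for_text = 8192 - 1000 - 1500 = 5692 tokens, so
    # effective_chunk_size = int(5692 * 0.8) = 4553, which already sits inside
    # the [500, 16000] bounds applied above.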
1329 | 
1330 |     try:
1331 |         # Use semantic chunking if possible, fall back to simple
1332 |         chunks = await chunk_document_standalone(
1333 |             document=text,
1334 |             chunk_size=effective_chunk_size,
1335 |             chunk_overlap=int(effective_chunk_size * 0.1), # 10% overlap
1336 |             chunk_method="semantic" # Or "token" / "recursive" as fallback
1337 |         )
1338 |         if not chunks:
1339 |             raise ToolError("Document chunking resulted in zero chunks.", error_code="CHUNKING_ERROR")
1340 |         logger.info(f"Chunked document into {len(chunks)} chunks (size ~{effective_chunk_size} tokens, overlap ~{int(effective_chunk_size * 0.1)} tokens).")
1341 |     except Exception as e:
1342 |         logger.error(f"Chunking failed: {e}. Attempting standard extraction on the whole text.", exc_info=True)
1343 |         # Fallback to standard extraction on the entire text
1344 |         return await _perform_standard_extraction(**kwargs)
1345 | 
1346 | 
1347 |     # --- Process Chunks ---
1348 |     chunk_results = []
1349 |     total_tokens = {"input": 0, "output": 0, "total": 0}
1350 |     total_cost = 0.0
1351 |     model_used = model # Assume same model
1352 | 
1353 |     for i, chunk_text in enumerate(chunks):
1354 |         logger.info(f"Processing chunk {i+1}/{len(chunks)}...")
1355 |         chunk_kwargs = {
1356 |             **kwargs,
1357 |             "text": chunk_text,
1358 |             # Use slightly higher limits per chunk, merge will apply final limits
1359 |             "max_entities": kwargs["max_entities"] * 2,
1360 |             "max_relations": kwargs["max_relations"] * 2,
1361 |         }
1362 |         try:
1363 |             # Use standard extraction for each chunk
1364 |             # Could potentially use multistage here if desired for better accuracy per chunk
1365 |             result = await _perform_standard_extraction(**chunk_kwargs)
1366 | 
1367 |             # Add chunk index for potential merging logic later
1368 |             result["_chunk_index"] = i
1369 |             chunk_results.append(result)
1370 | 
1371 |             # Accumulate usage stats
1372 |             total_tokens["input"] += result.get("tokens", {}).get("input", 0)
1373 |             total_tokens["output"] += result.get("tokens", {}).get("output", 0)
1374 |             total_cost += result.get("cost", 0.0)
1375 |             model_used = result.get("model", model_used) # Update model if it changed
1376 | 
1377 |         except (ProviderError, ToolError) as e:
1378 |             logger.warning(f"Failed to process chunk {i+1}: {e}. Skipping chunk.")
1379 |         except Exception as e:
1380 |              logger.error(f"Unexpected error processing chunk {i+1}: {e}. Skipping chunk.", exc_info=True)
1381 | 
1382 |     total_tokens["total"] = total_tokens["input"] + total_tokens["output"]
1383 | 
1384 |     if not chunk_results:
1385 |         raise ToolError("All document chunks failed to process.", error_code="CHUNK_PROCESSING_FAILURE")
1386 | 
1387 |     # --- Merge Results ---
1388 |     logger.info(f"Merging results from {len(chunk_results)} processed chunks...")
1389 |     merged_result = _merge_chunk_results(
1390 |         chunk_results,
1391 |         max_entities=kwargs["max_entities"],
1392 |         max_relations=kwargs["max_relations"],
1393 |         min_confidence=kwargs["min_confidence"],
1394 |         perform_coreference=automatic_coreference # Pass flag
1395 |     )
1396 | 
1397 |     # Add aggregated usage stats back
1398 |     merged_result["model"] = model_used
1399 |     merged_result["tokens"] = total_tokens
1400 |     merged_result["cost"] = total_cost
1401 | 
1402 |     return merged_result # Validation happens in the main function
1403 | 
1404 | 
1405 | def _merge_chunk_results(
1406 |     chunk_results: List[Dict[str, Any]],
1407 |     max_entities: int,
1408 |     max_relations: int,
1409 |     min_confidence: float,
1410 |     perform_coreference: bool # Flag for simple coref
1411 | ) -> Dict[str, Any]:
1412 |     """Merges graph data from multiple chunks, handling duplicates and overlaps."""
1413 |     merged_entities: Dict[str, Dict[str, Any]] = {} # Map normalized name -> entity dict
1414 |     merged_relationships: List[Dict[str, Any]] = []
1415 |     entity_id_map: Dict[Tuple[int, str], str] = {} # Map (chunk_idx, old_id) -> new_merged_id
1416 |     rel_signatures: Set[Tuple[str, str, str]] = set() # (source_id, target_id, type)
1417 |     entity_counter = 1
1418 |     rel_counter = 1
1419 | 
1420 |     # Pass 1: Merge Entities and build ID map
1421 |     logger.debug(f"Merging entities from {len(chunk_results)} chunks...")
1422 |     for i, result in enumerate(chunk_results):
1423 |         entities = result.get("entities", [])
1424 |         if not isinstance(entities, list): 
1425 |             continue
1426 | 
1427 |         for entity in entities:
1428 |             if not isinstance(entity, dict) or "name" not in entity or "type" not in entity or "id" not in entity:
1429 |                 continue
1430 | 
1431 |             original_id = entity["id"]
1432 |             original_chunk = i
1433 |             entity_name = entity.get("name", "").strip()
1434 |             entity_type = entity.get("type")
1435 |             normalized_name = entity_name.lower()
1436 | 
1437 |             # Key for merging: normalized name + type (can be refined)
1438 |             merge_key = (normalized_name, entity_type)
1439 | 
1440 |             # Basic Coreference: If perform_coreference is True, try merging based on name only
1441 |             # This is very simplistic and might over-merge. More advanced coref needed for better results.
1442 |             if perform_coreference:
1443 |                 merge_key = normalized_name # Merge based on name only if coref is enabled
1444 | 
1445 |             existing_entity = merged_entities.get(merge_key)
1446 | 
1447 |             if existing_entity:
1448 |                 # Merge into existing entity
1449 |                 merged_id = existing_entity["id"]
1450 |                 entity_id_map[(original_chunk, original_id)] = merged_id
1451 | 
1452 |                 # Merge mentions (simple union by text)
1453 |                 existing_mentions = {m['text'] for m in existing_entity.get("mentions", []) if 'text' in m}
1454 |                 for mention in entity.get("mentions", []):
1455 |                      if isinstance(mention, dict) and 'text' in mention and mention['text'] not in existing_mentions:
1456 |                          existing_entity.setdefault("mentions", []).append(mention)
1457 |                          existing_mentions.add(mention['text'])
1458 | 
1459 |                 # Merge attributes (simple update, new values overwrite old)
1460 |                 if "attributes" in entity and isinstance(entity["attributes"], dict):
1461 |                     existing_entity.setdefault("attributes", {}).update(entity["attributes"])
1462 |             else:
1463 |                 # Add as new entity
1464 |                 new_id = f"ent{entity_counter}"
1465 |                 entity_counter += 1
1466 |                 entity["id"] = new_id # Assign new final ID
1467 |                 merged_entities[merge_key] = entity
1468 |                 entity_id_map[(original_chunk, original_id)] = new_id
1469 | 
1470 |     final_merged_entities = list(merged_entities.values())
1471 |     logger.debug(f"Merged into {len(final_merged_entities)} unique entities.")
1472 | 
1473 | 
1474 |     # Pass 2: Merge Relationships using the new entity IDs
1475 |     logger.debug(f"Merging relationships from {len(chunk_results)} chunks...")
1476 |     for i, result in enumerate(chunk_results):
1477 |          relationships = result.get("relationships", [])
1478 |          if not isinstance(relationships, list): 
1479 |              continue
1480 | 
1481 |          for rel in relationships:
1482 |              if not isinstance(rel, dict) or "source" not in rel or "target" not in rel or "type" not in rel:
1483 |                  continue
1484 | 
1485 |              original_source = rel["source"]
1486 |              original_target = rel["target"]
1487 |              rel_type = rel["type"]
1488 | 
1489 |              # Map old chunk-local IDs to new merged IDs
1490 |              new_source_id = entity_id_map.get((i, original_source))
1491 |              new_target_id = entity_id_map.get((i, original_target))
1492 | 
1493 |              if not new_source_id or not new_target_id:
1494 |                  logger.debug(f"Skipping relationship due to unmapped entity ID(s): {rel}")
1495 |                  continue # Skip if source/target entity wasn't merged
1496 | 
1497 |              # Check signature for duplicates
1498 |              signature = (new_source_id, new_target_id, rel_type)
1499 |              if signature in rel_signatures:
1500 |                  logger.debug(f"Skipping duplicate relationship signature: {signature}")
1501 |                  continue
1502 | 
1503 |              # Check confidence
1504 |              confidence = rel.get("confidence", 1.0) # Assume 1.0 if missing
1505 |              try:
1506 |                  confidence = float(confidence)
1507 |              except (ValueError, TypeError):
1508 |                   confidence = 0.5 # Default on error
1509 |              if confidence < min_confidence:
1510 |                   logger.debug(f"Skipping relationship below confidence threshold: {rel}")
1511 |                   continue
1512 | 
1513 |              # Add relationship with new IDs and new unique ID
1514 |              rel["source"] = new_source_id
1515 |              rel["target"] = new_target_id
1516 |              rel["id"] = f"rel{rel_counter}"
1517 |              rel_counter += 1
1518 |              rel["confidence"] = confidence # Ensure confidence is stored
1519 | 
1520 |              merged_relationships.append(rel)
1521 |              rel_signatures.add(signature)
1522 | 
1523 |     logger.debug(f"Merged into {len(merged_relationships)} unique relationships.")
1524 | 
1525 |     # Final limiting is done by _validate_graph_data called after merge
1526 | 
1527 |     return {
1528 |         "entities": final_merged_entities,
1529 |         "relationships": merged_relationships
1530 |     }
1531 | 
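# Illustrative merge (hypothetical chunk output): if chunk 0 yields
# {"id": "ent3", "name": "Acme Corp", "type": "Organization"} and chunk 1 yields
# {"id": "ent1", "name": "acme corp", "type": "Organization"}, both share the
# normalized-name/type merge key, so a single merged entity is kept and
# entity_id_map[(0, "ent3")] and entity_id_map[(1, "ent1")] point at its new id.
# Relationships from either chunk are then rewritten against that id before
# duplicate and confidence filtering.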
1532 | 
1533 | async def _perform_incremental_extraction(existing_graph: Dict[str, Any], **kwargs) -> Dict[str, Any]:
1534 |     """Extracts new info and merges it with an existing graph."""
1535 |     logger.info("Performing incremental extraction...")
1536 |     # Extract context from existing graph
1537 |     existing_entities = existing_graph.get("entities", [])
1538 |     existing_relationships = existing_graph.get("relationships", [])  # Preserved so the merged result keeps prior relationships
1539 |     existing_entity_ids = {e["id"] for e in existing_entities}
1540 |     existing_entity_names = {e["name"].lower().strip(): e["id"] for e in existing_entities}
1541 | 
1542 |     # Format existing entities for the prompt (limit context size)
1543 |     context_limit = 50
1544 |     entity_context = "EXISTING ENTITIES (Sample):\n"
1545 |     for entity in existing_entities[:context_limit]:
1546 |         entity_context += f"- ID: {entity.get('id', 'N/A')}, Name: {entity.get('name', 'N/A')}, Type: {entity.get('type', 'N/A')}\n"
1547 |     if len(existing_entities) > context_limit:
1548 |         entity_context += f"... and {len(existing_entities) - context_limit} more entities.\n"
1549 | 
1550 |     # Build prompt
1551 |     incremental_instructions = _build_common_prompt_instructions(
1552 |         task_description="Extract NEW entities and relationships from the text below, linking them to the EXISTING entities provided.",
1553 |         format_structure="INCREMENTAL", # Needs adaptation
1554 |          **{k: kwargs[k] for k in [
1555 |             'entity_types', 'relation_types', 'schema', 'language', 'enable_reasoning',
1556 |             'include_positions', 'include_attributes', 'include_evidence', 'include_temporal_info',
1557 |             'min_confidence', 'max_entities', 'max_relations'
1558 |         ]}
1559 |     )
1560 |     # Adjust format instructions for incremental output
1561 |     incremental_instructions = incremental_instructions.replace(
1562 |         'Respond with a valid JSON object containing two keys: "entities" and "relationships".',
1563 |         'Respond with a valid JSON object containing two keys: "new_entities" and "new_relationships".'
1564 |     )
1565 |     incremental_instructions = incremental_instructions.replace('"entities": [', '"new_entities": [')
1566 |     incremental_instructions = incremental_instructions.replace('"relationships": [', '"new_relationships": [')
1567 |     incremental_instructions += "\nINSTRUCTIONS:\n- Identify entities NOT ALREADY in the 'EXISTING ENTITIES' list.\n- Assign NEW unique IDs to these new entities.\n- Identify relationships involving at least one NEW entity.\n- When referencing an EXISTING entity in a relationship, use its provided ID.\n- When referencing a NEW entity, use the NEW ID you assigned."
1568 | 
1569 |     # Handle custom prompt
1570 |     if kwargs.get("custom_prompt"):
1571 |         prompt = kwargs["custom_prompt"].format(
1572 |             text=kwargs["text"],
1573 |             instructions=incremental_instructions,
1574 |             existing_entities=entity_context,
1575 |             schema_info=kwargs.get("schema", ""),
1576 |             examples="", # Examples less useful here
1577 |             stage="incremental"
1578 |         )
1579 |     else:
1580 |          prompt = f"{entity_context}\n{incremental_instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1581 | 
1582 |     sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("standard_extraction") # Use standard as base
1583 | 
1584 |     # Call LLM
1585 |     incremental_data = await _call_llm_for_extraction(
1586 |         prompt=prompt,
1587 |         system_prompt=sys_prompt,
1588 |         provider_instance=kwargs["provider_instance"],
1589 |         model=kwargs["model"],
1590 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1591 |         additional_params=kwargs["additional_params"],
1592 |         task_name="Incremental Extraction"
1593 |     )
1594 | 
1595 |     # --- Merge with Existing Graph ---
1596 |     new_entities = incremental_data.get("new_entities", [])
1597 |     new_relationships = incremental_data.get("new_relationships", [])
1598 |     if not isinstance(new_entities, list): 
1599 |         new_entities = []
1600 |     if not isinstance(new_relationships, list): 
1601 |         new_relationships = []
1602 | 
1603 |     logger.info(f"Incremental: Found {len(new_entities)} potential new entities and {len(new_relationships)} new relationships.")
1604 | 
1605 |     validated_new_entities, validated_new_relationships = _validate_incremental_data(
1606 |         new_entities, new_relationships, existing_entity_ids, existing_entity_names
1607 |     )
1608 | 
1609 |     logger.info(f"Incremental: Validated {len(validated_new_entities)} new entities and {len(validated_new_relationships)} new relationships after checks.")
1610 | 
1611 |     # Combine
1612 |     combined_entities = existing_entities + validated_new_entities
1613 |     combined_relationships = existing_relationships + validated_new_relationships
1614 | 
1615 |     # Return combined data (validation/limiting happens in main func)
1616 |     result = {
1617 |         "entities": combined_entities,
1618 |         "relationships": combined_relationships,
1619 |         "incremental_stats": {
1620 |             "existing_entities": len(existing_entities),
1621 |             "existing_relationships": len(existing_relationships),
1622 |             "new_entities_found": len(validated_new_entities),
1623 |             "new_relationships_found": len(validated_new_relationships),
1624 |         },
1625 |         "model": incremental_data.get("model"),
1626 |         "tokens": incremental_data.get("tokens"),
1627 |         "cost": incremental_data.get("cost")
1628 |     }
1629 |     return result
1630 | 
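# Illustrative outcome (hypothetical numbers): extending a graph that already has 40
# entities and 25 relationships with text mentioning 3 genuinely new entities yields
# "incremental_stats" of {"existing_entities": 40, "existing_relationships": 25,
# "new_entities_found": 3, "new_relationships_found": <validated count>} alongside
# the combined entity and relationship lists.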
1631 | 
1632 | def _validate_incremental_data(
1633 |     new_entities: List[Dict[str, Any]],
1634 |     new_relationships: List[Dict[str, Any]],
1635 |     existing_entity_ids: Set[str],
1636 |     existing_entity_names: Dict[str, str] # lowercase name -> ID
1637 | ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
1638 |     """Validates entities/relationships from incremental extraction."""
1639 |     validated_entities: List[Dict[str, Any]] = []
1640 |     validated_relationships: List[Dict[str, Any]] = []
1641 |     id_map: Dict[str, str] = {} # Map temp ID from LLM response -> final ID (existing or new)
1642 |     assigned_new_ids: Set[str] = set()
1643 |     new_entity_counter = 1
1644 | 
1645 |     # Process new entities
1646 |     for entity in new_entities:
1647 |         if not isinstance(entity, dict) or "name" not in entity or "type" not in entity:
1648 |             continue
1649 | 
1650 |         temp_id = entity.get("id", f"temp_{uuid.uuid4()}") # Assign temp if missing
1651 |         name = entity["name"].strip()
1652 |         normalized_name = name.lower()
1653 | 
1654 |         # Check if it's actually an existing entity
1655 |         if normalized_name in existing_entity_names:
1656 |             existing_id = existing_entity_names[normalized_name]
1657 |             id_map[temp_id] = existing_id
1658 |             logger.debug(f"Incremental validation: Entity '{name}' ({temp_id}) matched existing entity {existing_id}.")
1659 |             continue # Don't add, just map the ID
1660 | 
1661 |         # If it's genuinely new, assign a proper new ID
1662 |         new_id = f"ent_new_{new_entity_counter}"
1663 |         while new_id in existing_entity_ids or new_id in assigned_new_ids:
1664 |             new_entity_counter += 1
1665 |             new_id = f"ent_new_{new_entity_counter}"
1666 | 
1667 |         entity["id"] = new_id
1668 |         validated_entities.append(entity)
1669 |         assigned_new_ids.add(new_id)
1670 |         id_map[temp_id] = new_id
1671 |         logger.debug(f"Incremental validation: Added new entity '{name}' with ID {new_id} (original temp ID: {temp_id}).")
1672 | 
1673 | 
1674 |     all_valid_ids = existing_entity_ids.union(assigned_new_ids)
1675 | 
1676 |     # Process new relationships
1677 |     rel_counter = 1
1678 |     for rel in new_relationships:
1679 |          if not isinstance(rel, dict) or "source" not in rel or "target" not in rel or "type" not in rel:
1680 |              continue
1681 | 
1682 |          temp_source = rel["source"]
1683 |          temp_target = rel["target"]
1684 | 
1685 |          # Map source and target IDs
1686 |          final_source_id = id_map.get(temp_source, temp_source) # Use temp_source if not in map (means it should be existing)
1687 |          final_target_id = id_map.get(temp_target, temp_target) # Use temp_target if not in map
1688 | 
1689 |          # Check if final IDs are valid (either existing or newly assigned)
1690 |          if final_source_id not in all_valid_ids or final_target_id not in all_valid_ids:
1691 |              logger.warning(f"Incremental validation: Skipping relationship '{rel.get('id')}' - invalid source ('{final_source_id}') or target ('{final_target_id}') ID after mapping.")
1692 |              continue
1693 | 
1694 |          # Ensure at least one entity involved is new (optional rule, depends on desired behavior)
1695 |          # if final_source_id in existing_entity_ids and final_target_id in existing_entity_ids:
1696 |          #    logger.debug(f"Incremental validation: Skipping relationship between two existing entities: {rel}")
1697 |          #    continue
1698 | 
1699 |          rel["source"] = final_source_id
1700 |          rel["target"] = final_target_id
1701 | 
1702 |          # Assign new relationship ID
1703 |          rel["id"] = f"rel_new_{rel_counter}"
1704 |          rel_counter +=1
1705 | 
1706 |          validated_relationships.append(rel)
1707 | 
1708 |     return validated_entities, validated_relationships
1709 | 
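# Illustrative mapping (hypothetical names): if the LLM labels "Acme Corp" as a new
# entity but an entity with that (case-insensitive) name already exists as "ent7",
# id_map routes the temporary id to "ent7" and no duplicate entity is added; a truly
# new entity receives an "ent_new_<N>" id, and relationship source/target ids are
# rewritten through id_map before the final validity check.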
1710 | 
1711 | async def _perform_structured_extraction(**kwargs) -> Dict[str, Any]:
1712 |     """Uses structured few-shot examples for consistent extraction."""
1713 |     logger.info("Performing structured (few-shot) extraction...")
1714 |     examples_str = ""
1715 |     example_entities = kwargs.get("example_entities")
1716 |     example_relationships = kwargs.get("example_relationships")
1717 | 
1718 |     if example_entities and example_relationships:
1719 |         # Format provided examples
1720 |         examples_str = "\nEXAMPLES:\n\nExample Input Text:\n<Example text corresponding to the output below>\n\nExample Output JSON:\n"
1721 |         examples_str += "```json\n"
1722 |         examples_str += json.dumps({"entities": example_entities, "relationships": example_relationships}, indent=2)
1723 |         examples_str += "\n```\n"
1724 |         logger.info(f"Using {len(example_entities)} example entities and {len(example_relationships)} relationships.")
1725 |     else:
1726 |         logger.warning("Structured strategy chosen, but complete examples (entities and relationships) were not provided. Quality may be impacted.")
1727 | 
1728 | 
1729 |     # Build prompt using common helper, adding examples
1730 |     instructions = _build_common_prompt_instructions(
1731 |         task_description="Extract entities and relationships from the text below, following the provided examples EXACTLY.",
1732 |         examples=examples_str,
1733 |         format_structure="STRUCTURED", # Placeholder
1734 |         **{k: kwargs[k] for k in [
1735 |             'entity_types', 'relation_types', 'schema', 'language', 'enable_reasoning',
1736 |             'include_positions', 'include_attributes', 'include_evidence', 'include_temporal_info',
1737 |             'min_confidence', 'max_entities', 'max_relations'
1738 |         ]}
1739 |     )
1740 |     instructions += "\nIMPORTANT: Your output JSON structure MUST strictly match the structure in the 'Example Output JSON' above."
1741 | 
1742 |     # Handle custom prompt template
1743 |     if kwargs.get("custom_prompt"):
1744 |          prompt = kwargs["custom_prompt"].format(
1745 |              text=kwargs["text"], instructions=instructions, examples=examples_str
1746 |          )
1747 |     else:
1748 |         prompt = f"{instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1749 | 
1750 |     sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("standard_extraction") # Base prompt
1751 | 
1752 |     # Call LLM
1753 |     graph_data = await _call_llm_for_extraction(
1754 |         prompt=prompt,
1755 |         system_prompt=sys_prompt,
1756 |         provider_instance=kwargs["provider_instance"],
1757 |         model=kwargs["model"],
1758 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1759 |         additional_params=kwargs["additional_params"],
1760 |         task_name="Structured Extraction"
1761 |     )
1762 | 
1763 |     return graph_data # Validation happens in the main function
1764 | 
1765 | 
1766 | async def _perform_schema_guided_extraction(schema: Dict[str, Any], **kwargs) -> Dict[str, Any]:
1767 |     """Performs extraction strictly adhering to the provided schema."""
1768 |     logger.info("Performing strict schema-guided extraction...")
1769 | 
1770 |     # Build prompt emphasizing schema adherence
1771 |     instructions = _build_common_prompt_instructions(
1772 |         task_description="Extract entities and relationships from the text below STRICTLY according to the provided schema.",
1773 |         schema=schema, # Ensure schema is passed to helper for inclusion
1774 |         format_structure="SCHEMA_GUIDED", # Placeholder
1775 |          **{k: kwargs[k] for k in [
1776 |             'entity_types', 'relation_types', 'language', 'enable_reasoning', # Pass entity/relation types as potential *subset* of schema
1777 |             'include_positions', 'include_attributes', 'include_evidence', 'include_temporal_info',
1778 |             'min_confidence', 'max_entities', 'max_relations'
1779 |         ]}
1780 |     )
1781 |     instructions += "\nRULES:\n- ONLY extract entities and relationships matching the types defined in the SCHEMA.\n- ONLY include attributes specified in the SCHEMA for each entity type.\n- Ensure relationships connect entities with types allowed by the SCHEMA definition (Source/Target types).\n- DO NOT extract anything that does not conform to the SCHEMA."
1782 | 
1783 |     # Handle custom prompt
1784 |     if kwargs.get("custom_prompt"):
1785 |         prompt = kwargs["custom_prompt"].format(
1786 |             text=kwargs["text"],
1787 |             instructions=instructions,
1788 |             schema_info=json.dumps(schema, indent=2), # Provide full schema if template needs it
1789 |             examples="", # No generic examples
1790 |             stage="schema_guided"
1791 |         )
1792 |     else:
1793 |         prompt = f"{instructions}\n\nTEXT TO ANALYZE:\n{kwargs['text']}"
1794 | 
1795 |     sys_prompt = kwargs.get("system_prompt") or SYSTEM_PROMPTS.get("standard_extraction") # Base prompt
1796 | 
1797 |     # Call LLM - potentially lower temperature for schema adherence
1798 |     strict_params = {**kwargs.get("additional_params", {})}
1799 |     strict_params["temperature"] = strict_params.get("temperature", 0.05) # Override default temp to be stricter
1800 | 
1801 |     graph_data = await _call_llm_for_extraction(
1802 |         prompt=prompt,
1803 |         system_prompt=sys_prompt,
1804 |         provider_instance=kwargs["provider_instance"],
1805 |         model=kwargs["model"],
1806 |         max_tokens_per_request=kwargs["max_tokens_per_request"],
1807 |         additional_params=strict_params,
1808 |         task_name="Schema-Guided Extraction"
1809 |     )
1810 | 
1811 |     # Optional: Add a validation step here specifically against the schema?
1812 |     # Currently relies on the LLM following instructions + standard validation.
1813 | 
1814 |     return graph_data # Validation happens in the main function
1815 | 
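# A minimal, hypothetical sketch of a `schema` argument for the schema-guided strategy
# above: entity types with their allowed attributes, plus relationship types constrained
# by source/target entity types. The real schema format is defined and validated
# elsewhere in the module; this dict is illustrative only.
_EXAMPLE_SCHEMA_SKETCH: Dict[str, Any] = {
    "entity_types": [
        {"name": "Person", "attributes": ["role", "affiliation"]},
        {"name": "Organization", "attributes": ["industry"]},
    ],
    "relationship_types": [
        {"name": "works_for", "source": "Person", "target": "Organization"},
    ],
}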
1816 | 
1817 | # --- Post-processing Helper Functions ---
1818 | 
1819 | def _normalize_entities(extraction_result: Dict[str, Any]) -> Dict[str, Any]:
1820 |     """Normalizes entity names (e.g., capitalization, whitespace) and merges duplicates."""
1821 |     logger.debug("Normalizing entity names and merging duplicates...")
1822 |     entities = extraction_result.get("entities", [])
1823 |     relationships = extraction_result.get("relationships", [])
1824 |     if not isinstance(entities, list) or not entities:
1825 |         return extraction_result
1826 | 
1827 |     normalized_entities: Dict[Tuple[str, str], Dict[str, Any]] = {} # (normalized_name, type) -> entity
1828 |     id_map: Dict[str, str] = {} # Map old ID -> new ID (of the kept entity)
1829 |     entity_counter = 1
1830 | 
1831 |     for entity in entities:
1832 |         if not isinstance(entity, dict) or "name" not in entity or "type" not in entity or "id" not in entity:
1833 |             continue
1834 | 
1835 |         original_id = entity["id"]
1836 |         name = entity["name"].strip()
1837 |         type = entity.get("type", "Unknown")
1838 | 
1839 |         # Basic normalization (can be customized)
1840 |         normalized_name = ' '.join(name.split()) # Remove extra whitespace
1841 |         # Simple title case for common proper noun types
1842 |         if type in ["Person", "Organization", "Location", "Product", "Event", "Brand"]:
1843 |              # Preserve mixed-case names like 'iPhone'; only names that are entirely upper- or lower-case get title-cased
1844 |              if not any(c.islower() for c in normalized_name) or not any(c.isupper() for c in normalized_name):
1845 |                  normalized_name = normalized_name.title() # Note: this also converts all-caps acronyms (e.g. 'NASA' -> 'Nasa')
1846 |              # More sophisticated logic could go here
1847 | 
1848 |         merge_key = (normalized_name.lower(), type) # Merge key based on lower name and type
1849 | 
1850 |         existing_entity = normalized_entities.get(merge_key)
1851 | 
1852 |         if existing_entity:
1853 |             # Map old ID to the ID of the entity we are merging into
1854 |             id_map[original_id] = existing_entity["id"]
1855 |             logger.debug(f"Normalizing: Merging '{name}' ({original_id}) into '{existing_entity['name']}' ({existing_entity['id']})")
1856 | 
1857 |             # Merge mentions
1858 |             existing_mentions = {m['text'] for m in existing_entity.get("mentions", []) if 'text' in m}
1859 |             for mention in entity.get("mentions", []):
1860 |                  if isinstance(mention, dict) and 'text' in mention and mention['text'] not in existing_mentions:
1861 |                      existing_entity.setdefault("mentions", []).append(mention)
1862 |                      existing_mentions.add(mention['text'])
1863 | 
1864 |             # Merge attributes (simple update)
1865 |             if "attributes" in entity and isinstance(entity["attributes"], dict):
1866 |                 existing_entity.setdefault("attributes", {}).update(entity["attributes"])
1867 | 
1868 |         else:
1869 |             # This is the first time we see this normalized entity, keep it
1870 |             # Assign a stable new ID if needed (might be redundant if IDs are already good)
1871 |             new_id = f"ent{entity_counter}"
1872 |             entity_counter += 1
1873 |             entity["id"] = new_id
1874 |             entity["name"] = normalized_name # Store normalized name
1875 | 
1876 |             normalized_entities[merge_key] = entity
1877 |             id_map[original_id] = new_id # Map original ID to its new/kept ID
1878 | 
1879 | 
1880 |     final_entities = list(normalized_entities.values())
1881 | 
1882 |     # Update relationships with new IDs
1883 |     final_relationships = []
1884 |     if isinstance(relationships, list):
1885 |         for rel in relationships:
1886 |             if not isinstance(rel, dict) or "source" not in rel or "target" not in rel: 
1887 |                 continue
1888 | 
1889 |             original_source = rel["source"]
1890 |             original_target = rel["target"]
1891 | 
1892 |             new_source_id = id_map.get(original_source)
1893 |             new_target_id = id_map.get(original_target)
1894 | 
1895 |             if new_source_id and new_target_id:
1896 |                 rel["source"] = new_source_id
1897 |                 rel["target"] = new_target_id
1898 |                 final_relationships.append(rel)
1899 |             else:
1900 |                 logger.debug(f"Normalizing: Dropping relationship '{rel.get('id')}' due to missing entity mapping for source '{original_source}' or target '{original_target}'.")
1901 | 
1902 |     extraction_result["entities"] = final_entities
1903 |     extraction_result["relationships"] = final_relationships
1904 |     logger.debug(f"Normalization complete. Entities: {len(final_entities)}, Relationships: {len(final_relationships)}")
1905 |     return extraction_result
1906 | 
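# A minimal usage sketch (hypothetical data, not invoked anywhere in the module)
# showing how _normalize_entities collapses duplicate entities and remaps the IDs
# referenced by relationships.
def _example_normalize_entities_sketch() -> None:
    sample = {
        "entities": [
            {"id": "e1", "name": "Acme  Corp", "type": "Organization"},
            {"id": "e2", "name": "acme corp", "type": "Organization"},
        ],
        "relationships": [
            {"id": "r1", "source": "e1", "target": "e2", "type": "duplicate_of"},
        ],
    }
    merged = _normalize_entities(sample)
    # The two spellings merge under the key (lowercased name, type); the surviving
    # entity receives a fresh sequential ID and the relationship endpoints are remapped.
    assert len(merged["entities"]) == 1
    assert merged["entities"][0]["name"] == "Acme Corp"
    assert merged["relationships"][0]["source"] == merged["relationships"][0]["target"] == "ent1"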
1907 | 
1908 | def _add_graph_metrics(extraction_result: Dict[str, Any], sort_by: str = "confidence") -> Dict[str, Any]:
1909 |     """Adds computed graph metrics (if networkx available) and sorts results."""
1910 |     entities = extraction_result.get("entities", [])
1911 |     relationships = extraction_result.get("relationships", [])
1912 |     if not isinstance(entities, list) or not isinstance(relationships, list):
1913 |          return extraction_result # Cannot process
1914 | 
1915 |     # Calculate metrics using NetworkX if available
1916 |     metrics = {}
1917 |     node_centrality = {}
1918 |     node_degree = {}
1919 |     if HAS_NETWORKX:
1920 |         logger.debug("Calculating graph metrics using NetworkX...")
1921 |         G = nx.DiGraph()
1922 |         entity_ids = set()
1923 |         for entity in entities:
1924 |             if isinstance(entity, dict) and "id" in entity:
1925 |                 G.add_node(entity["id"], **{k:v for k,v in entity.items() if k != 'id'})
1926 |                 entity_ids.add(entity["id"])
1927 | 
1928 |         for rel in relationships:
1929 |             if isinstance(rel, dict) and "source" in rel and "target" in rel:
1930 |                 # Only add edges between nodes that actually exist
1931 |                 if rel["source"] in entity_ids and rel["target"] in entity_ids:
1932 |                     G.add_edge(rel["source"], rel["target"], **{k:v for k,v in rel.items() if k not in ['id', 'source', 'target']})
1933 | 
1934 |         if G.number_of_nodes() > 0:
1935 |             try:
1936 |                 metrics["node_count"] = G.number_of_nodes()
1937 |                 metrics["edge_count"] = G.number_of_edges()
1938 |                 metrics["density"] = nx.density(G) if G.number_of_nodes() > 1 else 0.0
1939 |                 metrics["avg_degree"] = sum(d for n, d in G.degree()) / G.number_of_nodes()
1940 | 
1941 |                 # Components (use weakly connected for directed graph)
1942 |                 metrics["components"] = nx.number_weakly_connected_components(G)
1943 | 
1944 |                 # Centrality (can be slow on large graphs)
1945 |                 if G.number_of_nodes() < 1000: # Limit calculation for performance
1946 |                     try:
1947 |                         node_centrality = nx.betweenness_centrality(G, normalized=True)
1948 |                         node_degree = nx.degree_centrality(G)
1949 |                         metrics["avg_betweenness_centrality"] = sum(node_centrality.values()) / G.number_of_nodes()
1950 |                         logger.debug("Calculated betweenness centrality.")
1951 |                     except Exception as cent_err:
1952 |                          logger.warning(f"Could not calculate centrality metrics: {cent_err}")
1953 |                 else:
1954 |                     logger.warning("Skipping centrality calculation for large graph (>1000 nodes).")
1955 | 
1956 |             except Exception as metric_err:
1957 |                  logger.warning(f"Failed to calculate some graph metrics: {metric_err}")
1958 |             extraction_result["metrics"] = metrics
1959 | 
1960 |     # Add centrality/degree to entities
1961 |     for entity in entities:
1962 |         if isinstance(entity, dict) and "id" in entity:
1963 |              entity_id = entity["id"]
1964 |              entity["centrality"] = node_centrality.get(entity_id, 0.0)
1965 |              entity["degree"] = node_degree.get(entity_id, 0.0) # Use degree centrality as 'degree'
1966 | 
1967 |     # Sort entities and relationships
1968 |     logger.debug(f"Sorting results by '{sort_by}'...")
1969 |     # Sort entities
1970 |     if sort_by == "centrality" and HAS_NETWORKX and node_centrality:
1971 |         entities.sort(key=lambda x: x.get("centrality", 0.0), reverse=True)
1972 |     elif sort_by == "mentions":
1973 |         entities.sort(key=lambda x: len(x.get("mentions", [])), reverse=True)
1974 |     elif sort_by == "degree" and HAS_NETWORKX and node_degree:
1975 |         entities.sort(key=lambda x: x.get("degree", 0.0), reverse=True)
1976 |     else: # Default or fallback: sort by name
1977 |         entities.sort(key=lambda x: x.get("name", "").lower())
1978 | 
1979 |     # Sort relationships (typically by confidence)
1980 |     relationships.sort(key=lambda x: x.get("confidence", 0.0), reverse=True)
1981 | 
1982 |     extraction_result["entities"] = entities
1983 |     extraction_result["relationships"] = relationships
1984 | 
1985 |     return extraction_result
1986 | 
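# A minimal usage sketch (hypothetical data, not invoked anywhere in the module)
# showing the sorting behaviour of _add_graph_metrics and, when networkx is
# installed, the metrics it attaches.
def _example_add_graph_metrics_sketch() -> None:
    sample = {
        "entities": [
            {"id": "ent1", "name": "Zeta Corp", "type": "Organization"},
            {"id": "ent2", "name": "Alice", "type": "Person"},
        ],
        "relationships": [
            {"id": "rel1", "source": "ent2", "target": "ent1", "type": "works_for", "confidence": 0.6},
            {"id": "rel2", "source": "ent1", "target": "ent2", "type": "employs", "confidence": 0.9},
        ],
    }
    result = _add_graph_metrics(sample, sort_by="confidence")
    # Relationships are always ordered by descending confidence; entities fall back to
    # name order because "confidence" is not an entity-level sort key.
    assert result["relationships"][0]["id"] == "rel2"
    assert result["entities"][0]["name"] == "Alice"
    if HAS_NETWORKX:
        # With networkx available, node/edge counts, density, etc. are attached.
        assert result["metrics"]["node_count"] == 2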
1987 | 
1988 | def _generate_visualization(
1989 |     extraction_result: Dict[str, Any],
1990 |     format: VisualizationFormat
1991 | ) -> Optional[Dict[str, Any]]:
1992 |     """Generates a visualization of the entity graph if libraries are available."""
1993 |     if format == VisualizationFormat.NONE: 
1994 |         return None
1995 |     if not HAS_VISUALIZATION_LIBS:
1996 |         logger.warning("Cannot generate visualization - required libraries not installed.")
1997 |         return {"error": "Visualization libraries (networkx, pyvis, matplotlib) not installed."}
1998 | 
1999 |     entities = extraction_result.get("entities", [])
2000 |     relationships = extraction_result.get("relationships", [])
2001 |     if not isinstance(entities, list) or not isinstance(relationships, list) or not entities:
2002 |          logger.warning("Cannot generate visualization - no valid entities or relationships found.")
2003 |          return {"error": "No entities/relationships to visualize."}
2004 | 
2005 |     viz_data = {}
2006 |     output_dir = tempfile.gettempdir()
2007 |     file_uuid = uuid.uuid4()
2008 |     base_filename = f"graph_{file_uuid}"
2009 | 
2010 |     try:
2011 |         if format == VisualizationFormat.HTML:
2012 |             if not HAS_PYVIS: 
2013 |                 return {"error": "Pyvis library not installed for HTML visualization."}
2014 |             logger.debug("Generating HTML visualization using Pyvis...")
2015 |             net = Network(notebook=False, height="800px", width="100%", directed=True)
2016 |             # Basic options - can be customized further
2017 |             net.set_options("""
2018 |             {
2019 |               "nodes": { "font": { "size": 10 }, "size": 15 },
2020 |               "edges": { "arrows": { "to": { "enabled": true, "scaleFactor": 0.5 } }, "smooth": { "type": "continuous" }, "font": { "size": 9, "align": "middle" } },
2021 |               "physics": { "solver": "forceAtlas2Based", "forceAtlas2Based": { "springLength": 100, "avoidOverlap": 0.5 } }
2022 |             }
2023 |             """)
2024 | 
2025 |             entity_ids = set()
2026 |             type_colors = {}
2027 |             unique_types = sorted(list(set(e.get("type", "Unknown") for e in entities)))
2028 |             # Simple color mapping
2029 |             palette = ["#e6194B", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4", "#42d4f4", "#f032e6", "#bfef45", "#fabed4", "#469990", "#dcbeff", "#9A6324", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1", "#000075", "#a9a9a9"]
2030 |             for i, etype in enumerate(unique_types):
2031 |                 type_colors[etype] = palette[i % len(palette)]
2032 | 
2033 |             for entity in entities:
2034 |                 eid = entity["id"]
2035 |                 entity_ids.add(eid)
2036 |                 label = entity.get("name", eid)
2037 |                 etype = entity.get("type", "Unknown")
2038 |                 title = f"ID: {eid}\nType: {etype}"
2039 |                 if entity.get("attributes"):
2040 |                     title += "\n" + "\n".join(f"{k}: {v}" for k, v in entity["attributes"].items())
2041 |                 size = 15 + int(entity.get("degree", 0.0) * 30) # Scale size by degree
2042 |                 net.add_node(eid, label=label, title=title, color=type_colors.get(etype, "#d3d3d3"), size=size)
2043 | 
2044 |             for rel in relationships:
2045 |                 src = rel["source"]
2046 |                 tgt = rel["target"]
2047 |                 if src in entity_ids and tgt in entity_ids:
2048 |                     label = rel.get("type", "")
2049 |                     conf = rel.get('confidence', 1.0)
2050 |                     title = f"{label} (Conf: {conf:.2f})"
2051 |                     if rel.get("evidence"): 
2052 |                         title += f"\nEvidence: {rel['evidence'][:100]}..."
2053 |                     width = 0.5 + (conf * 2.0)
2054 |                     net.add_edge(src, tgt, title=title, label=label, width=width)
2055 | 
2056 |             html_path = os.path.join(output_dir, f"{base_filename}.html")
2057 |             net.save_graph(html_path)
2058 |             with open(html_path, "r", encoding="utf-8") as f:
2059 |                 viz_data["html"] = f.read()
2060 |             viz_data["url"] = f"file://{html_path}"
2061 |             logger.info(f"HTML visualization saved to {html_path}")
2062 | 
2063 |         elif format in [VisualizationFormat.SVG, VisualizationFormat.PNG]:
2064 |             if not HAS_NETWORKX or not HAS_MATPLOTLIB:
2065 |                  return {"error": "NetworkX and Matplotlib required for SVG/PNG visualization."}
2066 |             logger.debug(f"Generating {format.value.upper()} visualization using NetworkX/Matplotlib...")
2067 | 
2068 |             G = nx.DiGraph()
2069 |             entity_ids = set()
2070 |             type_colors = {}
2071 |             unique_types = sorted(list(set(e.get("type", "Unknown") for e in entities)))
2072 |             palette = plt.cm.get_cmap('tab20', len(unique_types)) # Use a colormap (note: cm.get_cmap was removed in matplotlib>=3.9; matplotlib.colormaps["tab20"].resampled(N) is the modern equivalent)
2073 |             node_colors_map = {}
2074 |             node_labels = {}
2075 |             node_sizes = []
2076 | 
2077 |             for i, etype in enumerate(unique_types):
2078 |                 type_colors[etype] = palette(i)
2079 | 
2080 |             for entity in entities:
2081 |                 eid = entity["id"]
2082 |                 entity_ids.add(eid)
2083 |                 node_labels[eid] = entity.get("name", eid)[:20] # Truncate long labels
2084 |                 etype = entity.get("type", "Unknown")
2085 |                 node_colors_map[eid] = type_colors.get(etype, "#d3d3d3")
2086 |                 # Size based on degree, with min size
2087 |                 node_sizes.append(300 + int(entity.get("degree", 0.0) * 1000))
2088 |                 G.add_node(eid) # Add node without attributes for plotting
2089 | 
2090 |             for rel in relationships:
2091 |                 src = rel["source"]
2092 |                 tgt = rel["target"]
2093 |                 if src in entity_ids and tgt in entity_ids:
2094 |                      G.add_edge(src, tgt, label=rel.get("type", ""))
2095 | 
2096 | 
2097 |             if G.number_of_nodes() == 0: 
2098 |                 raise ValueError("Graph has no nodes after filtering.")
2099 | 
2100 |             plt.figure(figsize=(16, 12))
2101 |             # Use a layout algorithm
2102 |             pos = nx.spring_layout(G, k=0.5 / (G.number_of_nodes()**0.5) if G.number_of_nodes() > 1 else 1, iterations=50)
2103 | 
2104 |             nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=[node_colors_map[n] for n in G.nodes()], alpha=0.8)
2105 |             nx.draw_networkx_edges(G, pos, alpha=0.5, arrows=True, arrowsize=10, node_size=node_sizes)
2106 |             nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=8)
2107 | 
2108 |             # Add edge labels (can get cluttered)
2109 |             if G.number_of_edges() < 100: # Only draw edge labels for smaller graphs
2110 |                 edge_labels = nx.get_edge_attributes(G, 'label')
2111 |                 nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7, alpha=0.7)
2112 | 
2113 |             # Create legend
2114 |             legend_handles = [plt.Line2D([0], [0], marker='o', color='w', label=etype, markersize=10, markerfacecolor=color)
2115 |                               for etype, color in type_colors.items()]
2116 |             plt.legend(handles=legend_handles, title="Entity Types", loc='upper right', fontsize=8)
2117 | 
2118 |             plt.title("Entity Relationship Graph", fontsize=14)
2119 |             plt.axis('off')
2120 | 
2121 |             file_extension = format.value
2122 |             output_path = os.path.join(output_dir, f"{base_filename}.{file_extension}")
2123 |             plt.savefig(output_path, format=file_extension, bbox_inches='tight', dpi=150)
2124 |             plt.close() # Close the plot figure
2125 | 
2126 |             # Read SVG content if requested
2127 |             if format == VisualizationFormat.SVG:
2128 |                  with open(output_path, "r", encoding="utf-8") as f:
2129 |                      viz_data["svg"] = f.read()
2130 | 
2131 |             viz_data["url"] = f"file://{output_path}"
2132 |             logger.info(f"{format.value.upper()} visualization saved to {output_path}")
2133 | 
2134 |         elif format == VisualizationFormat.DOT:
2135 |              if not HAS_NETWORKX: 
2136 |                  return {"error": "NetworkX required for DOT format generation."}
2137 |              logger.debug("Generating DOT visualization format...")
2138 |              G = nx.DiGraph() # Rebuild graph specifically for DOT output
2139 |              entity_ids = set()
             for entity in entities:
                 if isinstance(entity, dict) and "id" in entity:
                     entity_ids.add(entity["id"])
                     G.add_node(entity["id"], label=entity.get("name", entity["id"]))
             for rel in relationships:
                 if isinstance(rel, dict) and rel.get("source") in entity_ids and rel.get("target") in entity_ids:
                     G.add_edge(rel["source"], rel["target"], label=rel.get("type", ""))
2140 |              dot_string = nx.nx_pydot.to_pydot(G).to_string() # Requires pydot
2141 |              dot_path = os.path.join(output_dir, f"{base_filename}.dot")
2142 |              with open(dot_path, "w", encoding="utf-8") as f:
2143 |                  f.write(dot_string)
2144 |              viz_data["dot"] = dot_string
2145 |              viz_data["url"] = f"file://{dot_path}"
2146 |              logger.info(f"DOT visualization saved to {dot_path}")
2147 | 
2148 |     except ImportError as e:
2149 |         logger.error(f"Visualization failed due to missing library: {e}", exc_info=True)
2150 |         viz_data["error"] = f"Missing library for {format.value} format: {e}"
2151 |     except Exception as e:
2152 |          logger.error(f"Failed to generate {format.value} visualization: {e}", exc_info=True)
2153 |          viz_data["error"] = f"Failed to generate {format.value} visualization: {e}"
2154 | 
2155 |     return viz_data if viz_data else None
2156 | 
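# A minimal usage sketch (hypothetical, not invoked anywhere in the module) showing
# how _generate_visualization behaves with and without the optional libraries.
def _example_generate_visualization_sketch(extraction_result: Dict[str, Any]) -> None:
    # Requesting NONE always short-circuits to None.
    assert _generate_visualization(extraction_result, VisualizationFormat.NONE) is None
    # With pyvis/networkx installed, the HTML format returns the rendered page plus a
    # file:// URL pointing at a temp file; otherwise an error dict is returned.
    viz = _generate_visualization(extraction_result, VisualizationFormat.HTML)
    if viz and "html" in viz:
        print(f"Interactive graph written to {viz['url']}")
    elif viz:
        print(f"Visualization unavailable: {viz.get('error')}")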
2157 | 
2158 | def _format_output(
2159 |     extraction_result: Dict[str, Any],
2160 |     format: OutputFormat
2161 | ) -> Dict[str, Any]:
2162 |     """Formats the extraction result into the requested output structure."""
2163 |     entities = extraction_result.get("entities", [])
2164 |     relationships = extraction_result.get("relationships", [])
2165 | 
2166 |     if format == OutputFormat.JSON:
2167 |         return extraction_result # Already in desired format
2168 | 
2169 |     if not isinstance(entities, list) or not isinstance(relationships, list):
2170 |         return {"error": "Invalid input data for formatting.", **extraction_result}
2171 | 
2172 |     logger.debug(f"Formatting output as {format.value}...")
2173 |     output = {"entities": entities, "relationships": relationships, **extraction_result} # Start with base data
2174 | 
2175 |     try:
2176 |         if format == OutputFormat.NETWORKX:
2177 |             if not HAS_NETWORKX: 
2178 |                 raise ImportError("networkx library not installed.")
2179 |             G = nx.DiGraph()
2180 |             entity_ids = set()
2181 |             for entity in entities:
2182 |                 eid = entity["id"]
2183 |                 entity_ids.add(eid)
2184 |                 G.add_node(eid, **{k:v for k,v in entity.items() if k != 'id'})
2185 |             for rel in relationships:
2186 |                 if rel["source"] in entity_ids and rel["target"] in entity_ids:
2187 |                     G.add_edge(rel["source"], rel["target"], **{k:v for k,v in rel.items() if k not in ['id', 'source', 'target']})
2188 |             output["graph"] = G
2189 | 
2190 |         elif format == OutputFormat.RDF:
2191 |             triples = []
2192 |             ns = "urn:graph:" # Simple namespace prefix
2193 |             for entity in entities:
2194 |                 subj = f"{ns}entity#{entity['id']}"
2195 |                 triples.append((subj, f"{ns}type", f"{ns}type#{entity.get('type', 'Unknown')}"))
2196 |                 triples.append((subj, f"{ns}name", entity.get('name', '')))
2197 |                 for attr, val in entity.get("attributes", {}).items():
2198 |                     triples.append((subj, f"{ns}attr#{attr}", str(val))) # Simple string literal
2199 |             for rel in relationships:
2200 |                  subj = f"{ns}entity#{rel['source']}"
2201 |                  pred = f"{ns}rel#{rel.get('type', 'relatedTo')}"
2202 |                  obj = f"{ns}entity#{rel['target']}"
2203 |                  triples.append((subj, pred, obj))
2204 |                  # Could add relationship attributes as reified triples if needed
2205 |             output["rdf_triples"] = triples
2206 | 
2207 |         elif format == OutputFormat.CYTOSCAPE:
2208 |             nodes = [{"data": {"id": e["id"], **e}} for e in entities] # Put all entity data under 'data'
2209 |             edges = [{"data": {"id": r.get("id", f"rel_{r['source']}_{r['target']}"), "source": r["source"], "target": r["target"], **r}} for r in relationships]
2210 |             output["cytoscape"] = {"nodes": nodes, "edges": edges}
2211 | 
2212 |         elif format == OutputFormat.D3:
2213 |              nodes = [{"id": e["id"], "name": e.get("name"), "group": e.get("type"), **e} for e in entities]
2214 |              links = [{"source": r["source"], "target": r["target"], "type": r.get("type"), "value": r.get("confidence", 0.5)*10, **r} for r in relationships]
2215 |              output["d3"] = {"nodes": nodes, "links": links}
2216 | 
2217 |         elif format == OutputFormat.NEO4J:
2218 |             queries = []
2219 |             # Create constraints for uniqueness if desired (optional)
2220 |             # queries.append("CREATE CONSTRAINT ON (e:Entity) ASSERT e.id IS UNIQUE;")
2221 |             # queries.append("CREATE CONSTRAINT ON (p:Person) ASSERT p.id IS UNIQUE;") # etc. for specific types
2222 | 
2223 |             entity_types = set(e.get("type", "Entity") for e in entities)
2224 |             for etype in entity_types:
2225 |                  queries.append(f"CREATE INDEX IF NOT EXISTS FOR (n:{etype}) ON (n.id);")
2226 | 
2227 | 
2228 |             # Create nodes
2229 |             for entity in entities:
2230 |                 labels = ":".join(l for l in ["Entity", entity.get("type", "Unknown")] if l) # Base :Entity label + specific type  # noqa: E741
2231 |                 props = {k: v for k, v in entity.items() if k not in ['mentions', 'id'] and isinstance(v, (str, int, float, bool))} # Neo4j properties must be primitive (non-nested) values; nested fields like 'attributes' are skipped
2232 |                 props_str = ", ".join(f"{k}: {json.dumps(v)}" for k, v in props.items()) # Unquoted keys -- JSON-style quoted keys are not valid Cypher
2233 |                 queries.append(f"MERGE (n:{labels} {{id: {json.dumps(entity['id'])}}}) SET n += {{{props_str}}}")
2234 | 
2235 |             # Create relationships
2236 |             for rel in relationships:
2237 |                  rel_type = re.sub(r'\W+', '_', rel.get("type", "RELATED_TO")).upper() # Sanitize type for Neo4j
2238 |                  props = {k: v for k, v in rel.items() if k not in ['id', 'source', 'target', 'type'] and isinstance(v, (str, int, float, bool))} # Primitive properties only
2239 |                  props_str = ", ".join(f"{k}: {json.dumps(v)}" for k, v in props.items()) # Unquoted keys for valid Cypher
2240 |                  queries.append(
2241 |                      f"MATCH (a {{id: {json.dumps(rel['source'])}}}), (b {{id: {json.dumps(rel['target'])}}}) "
2242 |                      f"MERGE (a)-[r:{rel_type}]->(b) "
2243 |                      f"SET r += {{{props_str}}}"
2244 |                  )
2245 |             output["neo4j_queries"] = queries
2246 | 
2247 |     except ImportError as e:
2248 |          output["error"] = f"Missing library for {format.value} formatting: {e}"
2249 |          logger.warning(output["error"])
2250 |          if format == OutputFormat.NETWORKX: 
2251 |              output = _format_output(extraction_result, OutputFormat.JSON) # Fallback
2252 |     except Exception as e:
2253 |         output["error"] = f"Failed to format output as {format.value}: {e}"
2254 |         logger.error(output["error"], exc_info=True)
2255 | 
2256 |     return output
2257 | 
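# A minimal usage sketch (hypothetical data, not invoked anywhere in the module)
# showing the Cytoscape and Neo4j output shapes produced by _format_output.
def _example_format_output_sketch() -> None:
    sample = {
        "entities": [{"id": "ent1", "name": "Alice", "type": "Person"}],
        "relationships": [],
    }
    cyto = _format_output(sample, OutputFormat.CYTOSCAPE)
    # Cytoscape.js expects node/edge payloads nested under a "data" key.
    assert cyto["cytoscape"]["nodes"][0]["data"]["id"] == "ent1"
    neo4j = _format_output(sample, OutputFormat.NEO4J)
    # Index statements per entity label plus one MERGE per entity.
    assert any(q.startswith("MERGE") for q in neo4j["neo4j_queries"])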
2258 | 
2259 | def _create_query_interface(extraction_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
2260 |     """Creates helper functions for querying the graph (requires networkx)."""
2261 |     if not HAS_NETWORKX:
2262 |         return None
2263 | 
2264 |     entities = extraction_result.get("entities", [])
2265 |     relationships = extraction_result.get("relationships", [])
2266 |     if not isinstance(entities, list) or not isinstance(relationships, list): 
2267 |         return None
2268 | 
2269 |     # Build graph once for querying
2270 |     G = nx.DiGraph()
2271 |     entity_map = {}
2272 |     entity_ids = set()
2273 |     for entity in entities:
2274 |         if isinstance(entity, dict) and "id" in entity:
2275 |             eid = entity["id"]
2276 |             entity_ids.add(eid)
2277 |             entity_map[eid] = entity
2278 |             G.add_node(eid, **{k:v for k,v in entity.items() if k != 'id'})
2279 | 
2280 |     for rel in relationships:
2281 |         if isinstance(rel, dict) and "source" in rel and "target" in rel:
2282 |             if rel["source"] in entity_ids and rel["target"] in entity_ids:
2283 |                  G.add_edge(rel["source"], rel["target"], **{k:v for k,v in rel.items() if k not in ['id', 'source', 'target']})
2284 | 
2285 |     # --- Query Functions ---
2286 |     def find_entity(name: Optional[str] = None, entity_type: Optional[str] = None, attribute_key: Optional[str] = None, attribute_value: Optional[Any] = None) -> List[Dict]:
2287 |         """Finds entities matching specified criteria."""
2288 |         results = []
2289 |         name_lower = name.lower() if name else None
2290 |         for _eid, entity_data in entity_map.items():
2291 |             match = True
2292 |             if name_lower and name_lower not in entity_data.get("name", "").lower():
2293 |                 match = False
2294 |             if entity_type and entity_data.get("type") != entity_type:
2295 |                 match = False
2296 |             if attribute_key:
2297 |                 attrs = entity_data.get("attributes", {})
2298 |                 if attribute_key not in attrs or (attribute_value is not None and attrs[attribute_key] != attribute_value):
2299 |                     match = False
2300 |             if match:
2301 |                 results.append(entity_data)
2302 |         return results
2303 | 
2304 |     def find_path(source_id: str, target_id: str, cutoff: int = 5) -> List[List[Dict]]:
2305 |         """Finds simple paths (lists of entity dicts) between two entity IDs."""
2306 |         if source_id not in G or target_id not in G: 
2307 |             return []
2308 |         try:
2309 |             paths_nodes = list(nx.all_simple_paths(G, source=source_id, target=target_id, cutoff=cutoff))
2310 |             return [[entity_map[nid] for nid in node_path] for node_path in paths_nodes]
2311 |         except nx.NetworkXNoPath:
2312 |             return []
2313 |         except Exception as e:
2314 |             logger.warning(f"Path finding failed: {e}")
2315 |             return []
2316 | 
2317 |     def get_neighbors(entity_id: str, direction: str = "both") -> Dict[str, List[Dict]]:
2318 |         """Gets direct neighbors (entities) of a given entity ID."""
2319 |         if entity_id not in G: 
2320 |             return {"incoming": [], "outgoing": []}
2321 |         neighbors = {"incoming": [], "outgoing": []}
2322 |         if direction in ["incoming", "both"]:
2323 |             neighbors["incoming"] = [entity_map[pred] for pred in G.predecessors(entity_id) if pred in entity_map]
2324 |         if direction in ["outgoing", "both"]:
2325 |             neighbors["outgoing"] = [entity_map[succ] for succ in G.successors(entity_id) if succ in entity_map]
2326 |         return neighbors
2327 | 
2328 |     # --- Interface Definition ---
2329 |     query_interface = {
2330 |         "find_entity": find_entity,
2331 |         "find_path": find_path,
2332 |         "get_neighbors": get_neighbors,
2333 |         "get_entity_by_id": lambda eid: entity_map.get(eid),
2334 |         "get_graph_object": lambda: G, # Return the NetworkX graph
2335 |         "descriptions": {
2336 |             "find_entity": find_entity.__doc__,
2337 |             "find_path": find_path.__doc__,
2338 |             "get_neighbors": get_neighbors.__doc__,
2339 |             "get_entity_by_id": "Retrieves a single entity dictionary by its unique ID.",
2340 |             "get_graph_object": "Returns the underlying NetworkX DiGraph object."
2341 |         }
2342 |     }
2343 |     return query_interface
2344 | 
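# A minimal usage sketch (hypothetical, not invoked anywhere in the module) showing how
# the query interface returned above can be used; it degrades gracefully without networkx.
def _example_query_interface_sketch(extraction_result: Dict[str, Any]) -> None:
    qi = _create_query_interface(extraction_result)
    if qi is None:
        # networkx is optional; without it no query helpers are built.
        return
    for person in qi["find_entity"](entity_type="Person"):
        neighbours = qi["get_neighbors"](person["id"], direction="outgoing")
        print(person["name"], "->", [n["name"] for n in neighbours["outgoing"]])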
```