#
tokens: 46688/50000 15/235 files (page 4/9)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 4 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── breaking-bug-report.md
│   │   ├── feature_request.md
│   │   └── output-bug-report.md
│   └── workflows
│       ├── benchmarks.yml
│       ├── ci.yml
│       ├── cla.yml
│       ├── publish.yml
│       └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│   ├── __init__.py
│   ├── overall
│   │   ├── __init__.py
│   │   ├── display
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   └── table.py
│   │   ├── download
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── llamaparse.py
│   │   │   ├── main.py
│   │   │   ├── mathpix.py
│   │   │   └── mistral.py
│   │   ├── elo.py
│   │   ├── methods
│   │   │   ├── __init__.py
│   │   │   ├── docling.py
│   │   │   ├── gt.py
│   │   │   ├── llamaparse.py
│   │   │   ├── marker.py
│   │   │   ├── mathpix.py
│   │   │   ├── mistral.py
│   │   │   ├── olmocr.py
│   │   │   └── schema.py
│   │   ├── overall.py
│   │   ├── registry.py
│   │   ├── schema.py
│   │   └── scorers
│   │       ├── __init__.py
│   │       ├── clean.py
│   │       ├── heuristic.py
│   │       ├── llm.py
│   │       └── schema.py
│   ├── table
│   │   ├── __init__.py
│   │   ├── gemini.py
│   │   ├── inference.py
│   │   ├── scoring.py
│   │   └── table.py
│   ├── throughput
│   │   ├── __init__.py
│   │   └── main.py
│   └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│   ├── .gitignore
│   ├── examples
│   │   ├── json
│   │   │   ├── multicolcnn.json
│   │   │   ├── switch_trans.json
│   │   │   └── thinkpython.json
│   │   └── markdown
│   │       ├── multicolcnn
│   │       │   ├── _page_1_Figure_0.jpeg
│   │       │   ├── _page_2_Picture_0.jpeg
│   │       │   ├── _page_6_Figure_0.jpeg
│   │       │   ├── _page_7_Figure_0.jpeg
│   │       │   ├── multicolcnn_meta.json
│   │       │   └── multicolcnn.md
│   │       ├── switch_transformers
│   │       │   ├── _page_11_Figure_4.jpeg
│   │       │   ├── _page_12_Figure_4.jpeg
│   │       │   ├── _page_13_Figure_2.jpeg
│   │       │   ├── _page_18_Figure_1.jpeg
│   │       │   ├── _page_18_Figure_3.jpeg
│   │       │   ├── _page_2_Figure_3.jpeg
│   │       │   ├── _page_20_Figure_1.jpeg
│   │       │   ├── _page_20_Figure_4.jpeg
│   │       │   ├── _page_27_Figure_1.jpeg
│   │       │   ├── _page_29_Figure_1.jpeg
│   │       │   ├── _page_30_Figure_1.jpeg
│   │       │   ├── _page_31_Figure_3.jpeg
│   │       │   ├── _page_4_Figure_1.jpeg
│   │       │   ├── _page_5_Figure_3.jpeg
│   │       │   ├── switch_trans_meta.json
│   │       │   └── switch_trans.md
│   │       └── thinkpython
│   │           ├── _page_109_Figure_1.jpeg
│   │           ├── _page_115_Figure_1.jpeg
│   │           ├── _page_116_Figure_3.jpeg
│   │           ├── _page_127_Figure_1.jpeg
│   │           ├── _page_128_Figure_1.jpeg
│   │           ├── _page_167_Figure_1.jpeg
│   │           ├── _page_169_Figure_1.jpeg
│   │           ├── _page_173_Figure_1.jpeg
│   │           ├── _page_190_Figure_1.jpeg
│   │           ├── _page_195_Figure_1.jpeg
│   │           ├── _page_205_Figure_1.jpeg
│   │           ├── _page_23_Figure_1.jpeg
│   │           ├── _page_23_Figure_3.jpeg
│   │           ├── _page_230_Figure_1.jpeg
│   │           ├── _page_233_Figure_1.jpeg
│   │           ├── _page_233_Figure_3.jpeg
│   │           ├── _page_234_Figure_1.jpeg
│   │           ├── _page_235_Figure_1.jpeg
│   │           ├── _page_236_Figure_1.jpeg
│   │           ├── _page_236_Figure_3.jpeg
│   │           ├── _page_237_Figure_1.jpeg
│   │           ├── _page_238_Figure_1.jpeg
│   │           ├── _page_46_Figure_1.jpeg
│   │           ├── _page_60_Figure_1.jpeg
│   │           ├── _page_60_Figure_3.jpeg
│   │           ├── _page_67_Figure_1.jpeg
│   │           ├── _page_71_Figure_1.jpeg
│   │           ├── _page_78_Figure_1.jpeg
│   │           ├── _page_85_Figure_1.jpeg
│   │           ├── _page_94_Figure_1.jpeg
│   │           ├── _page_99_Figure_17.jpeg
│   │           ├── _page_99_Figure_178.jpeg
│   │           ├── thinkpython_meta.json
│   │           └── thinkpython.md
│   ├── images
│   │   ├── overall.png
│   │   ├── per_doc.png
│   │   └── table.png
│   └── latex_to_md.sh
├── examples
│   ├── marker_modal_deployment.py
│   └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│   ├── builders
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── layout.py
│   │   ├── line.py
│   │   ├── ocr.py
│   │   └── structure.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── parser.py
│   │   └── printer.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── extraction.py
│   │   ├── ocr.py
│   │   ├── pdf.py
│   │   └── table.py
│   ├── extractors
│   │   ├── __init__.py
│   │   ├── document.py
│   │   └── page.py
│   ├── logger.py
│   ├── models.py
│   ├── output.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── blank_page.py
│   │   ├── block_relabel.py
│   │   ├── blockquote.py
│   │   ├── code.py
│   │   ├── debug.py
│   │   ├── document_toc.py
│   │   ├── equation.py
│   │   ├── footnote.py
│   │   ├── ignoretext.py
│   │   ├── line_merge.py
│   │   ├── line_numbers.py
│   │   ├── list.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── llm_complex.py
│   │   │   ├── llm_equation.py
│   │   │   ├── llm_form.py
│   │   │   ├── llm_handwriting.py
│   │   │   ├── llm_image_description.py
│   │   │   ├── llm_mathblock.py
│   │   │   ├── llm_meta.py
│   │   │   ├── llm_page_correction.py
│   │   │   ├── llm_sectionheader.py
│   │   │   ├── llm_table_merge.py
│   │   │   └── llm_table.py
│   │   ├── order.py
│   │   ├── page_header.py
│   │   ├── reference.py
│   │   ├── sectionheader.py
│   │   ├── table.py
│   │   ├── text.py
│   │   └── util.py
│   ├── providers
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── epub.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── pdf.py
│   │   ├── powerpoint.py
│   │   ├── registry.py
│   │   ├── spreadsheet.py
│   │   └── utils.py
│   ├── renderers
│   │   ├── __init__.py
│   │   ├── chunk.py
│   │   ├── extraction.py
│   │   ├── html.py
│   │   ├── json.py
│   │   ├── markdown.py
│   │   └── ocr_json.py
│   ├── schema
│   │   ├── __init__.py
│   │   ├── blocks
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── basetable.py
│   │   │   ├── caption.py
│   │   │   ├── code.py
│   │   │   ├── complexregion.py
│   │   │   ├── equation.py
│   │   │   ├── figure.py
│   │   │   ├── footnote.py
│   │   │   ├── form.py
│   │   │   ├── handwriting.py
│   │   │   ├── inlinemath.py
│   │   │   ├── listitem.py
│   │   │   ├── pagefooter.py
│   │   │   ├── pageheader.py
│   │   │   ├── picture.py
│   │   │   ├── reference.py
│   │   │   ├── sectionheader.py
│   │   │   ├── table.py
│   │   │   ├── tablecell.py
│   │   │   ├── text.py
│   │   │   └── toc.py
│   │   ├── document.py
│   │   ├── groups
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── figure.py
│   │   │   ├── list.py
│   │   │   ├── page.py
│   │   │   ├── picture.py
│   │   │   └── table.py
│   │   ├── polygon.py
│   │   ├── registry.py
│   │   └── text
│   │       ├── __init__.py
│   │       ├── char.py
│   │       ├── line.py
│   │       └── span.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── chunk_convert.py
│   │   ├── chunk_convert.sh
│   │   ├── common.py
│   │   ├── convert_single.py
│   │   ├── convert.py
│   │   ├── extraction_app.py
│   │   ├── file_to_s3.py
│   │   ├── run_streamlit_app.py
│   │   ├── server.py
│   │   └── streamlit_app.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── azure_openai.py
│   │   ├── claude.py
│   │   ├── gemini.py
│   │   ├── ollama.py
│   │   ├── openai.py
│   │   └── vertex.py
│   ├── settings.py
│   ├── util.py
│   └── utils
│       ├── __init__.py
│       ├── batch.py
│       ├── gpu.py
│       └── image.py
├── marker_app.py
├── marker_server.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│   └── version1
│       └── cla.json
├── static
│   └── fonts
│       └── .gitignore
└── tests
    ├── builders
    │   ├── test_blank_page.py
    │   ├── test_document_builder.py
    │   ├── test_garbled_pdf.py
    │   ├── test_layout_replace.py
    │   ├── test_ocr_builder.py
    │   ├── test_ocr_pipeline.py
    │   ├── test_overriding.py
    │   ├── test_pdf_links.py
    │   ├── test_rotated_bboxes.py
    │   ├── test_strip_existing_ocr.py
    │   └── test_structure.py
    ├── config
    │   └── test_config.py
    ├── conftest.py
    ├── converters
    │   ├── test_extraction_converter.py
    │   ├── test_ocr_converter.py
    │   ├── test_pdf_converter.py
    │   └── test_table_converter.py
    ├── processors
    │   ├── test_document_toc_processor.py
    │   ├── test_equation_processor.py
    │   ├── test_footnote_processor.py
    │   ├── test_ignoretext.py
    │   ├── test_llm_processors.py
    │   ├── test_table_merge.py
    │   └── test_table_processor.py
    ├── providers
    │   ├── test_document_providers.py
    │   ├── test_image_provider.py
    │   └── test_pdf_provider.py
    ├── renderers
    │   ├── test_chunk_renderer.py
    │   ├── test_extract_images.py
    │   ├── test_html_renderer.py
    │   ├── test_json_renderer.py
    │   └── test_markdown_renderer.py
    ├── schema
    │   └── groups
    │       └── test_list_grouping.py
    ├── services
    │   └── test_service_init.py
    └── utils.py
```

# Files

--------------------------------------------------------------------------------
/marker/scripts/extraction_app.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import os
  3 | 
  4 | from streamlit_ace import st_ace
  5 | from pydantic import BaseModel
  6 | 
  7 | from marker.converters.extraction import ExtractionConverter
  8 | from marker.scripts.common import (
  9 |     parse_args,
 10 |     load_models,
 11 |     get_page_image,
 12 |     page_count,
 13 |     get_root_class,
 14 | )
 15 | 
 16 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 17 | os.environ["IN_STREAMLIT"] = "true"
 18 | 
 19 | from streamlit.runtime.uploaded_file_manager import UploadedFile
 20 | 
 21 | import tempfile
 22 | from typing import Any, Dict
 23 | 
 24 | import streamlit as st
 25 | 
 26 | from marker.config.parser import ConfigParser
 27 | 
 28 | 
 29 | def extract_data(
 30 |     fname: str, config: dict, schema: str, markdown: str | None = None
 31 | ) -> (str, Dict[str, Any], dict):
 32 |     config["pdftext_workers"] = 1
 33 |     config["page_schema"] = schema
 34 |     config["existing_markdown"] = markdown
 35 |     config_parser = ConfigParser(config)
 36 |     config_dict = config_parser.generate_config_dict()
 37 | 
 38 |     converter_cls = ExtractionConverter
 39 |     converter = converter_cls(
 40 |         config=config_dict,
 41 |         artifact_dict=model_dict,
 42 |         processor_list=config_parser.get_processors(),
 43 |         renderer=config_parser.get_renderer(),
 44 |         llm_service=config_parser.get_llm_service(),
 45 |     )
 46 |     return converter(fname)
 47 | 
 48 | 
# --- App setup: page layout, shared models, CLI overrides, file uploader ---
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

# Load model artifacts and parse CLI options once per script run.
model_dict = load_models()
cli_options = parse_args()

st.markdown("""
# Marker Extraction Demo

This app will let you use marker to do structured extraction.

Warning: This can execute untrusted code entered into the schema panel.
""")

# Accepted upload formats; the converter picks a provider based on the file.
in_file: UploadedFile = st.sidebar.file_uploader(
    "PDF, document, or image file:",
    type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)
 67 | 
# Initialize session state variables
if "rendered_pydantic_schema" not in st.session_state:
    st.session_state.rendered_pydantic_schema = ""

# Cached markdown from the last extraction run, reused to skip re-conversion.
if "markdown" not in st.session_state:
    st.session_state.markdown = ""

if "current_file_id" not in st.session_state:
    st.session_state.current_file_id = None

# Detect file changes and clear markdown when new file is uploaded
if in_file is not None:
    # Create a unique identifier for the current file
    # (name + size + content hash, so re-uploading the same bytes is a no-op).
    current_file_id = f"{in_file.name}_{in_file.size}_{hash(in_file.getvalue())}"

    # Check if this is a new file
    if st.session_state.current_file_id != current_file_id:
        st.session_state.current_file_id = current_file_id
        st.session_state.markdown = ""  # Clear markdown for new file
else:
    # No file uploaded, clear the current file ID
    if st.session_state.current_file_id is not None:
        st.session_state.current_file_id = None
        st.session_state.markdown = ""  # Clear markdown when no file
        st.session_state.rendered_pydantic_schema = ""

# Nothing below makes sense without an upload; halt the script here.
if in_file is None:
    st.stop()

filetype = in_file.type
 98 | 
 99 | with col1:
100 |     page_count = page_count(in_file)
101 |     page_number = st.number_input(
102 |         f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count
103 |     )
104 |     pil_image = get_page_image(in_file, page_number)
105 |     st.image(pil_image, use_container_width=True)
with col2:
    tab1, tab2 = st.tabs(["JSON Schema", "Pydantic Schema"])

    # Initialize schema variable
    schema = None

    with tab1:
        st.write("Enter an existing JSON schema here:")
        # Pre-fill with the last rendered pydantic schema, if one exists.
        default_json_value = (
            st.session_state.rendered_pydantic_schema
            if st.session_state.rendered_pydantic_schema
            else ""
        )
        json_schema_input = st.text_area(
            "JSON Schema",
            value=default_json_value,
            height=300,
            placeholder='{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}',
            key="json_schema_input",
            label_visibility="collapsed",
        )

        # Set schema if JSON input is provided
        if json_schema_input and json_schema_input.strip():
            try:
                # Validate JSON
                json.loads(json_schema_input)
                schema = json_schema_input.strip()
                st.success("✅ Valid JSON schema detected")
            except json.JSONDecodeError as e:
                st.error(f"❌ Invalid JSON: {e}")
                schema = None

    with tab2:
        st.write("Enter pydantic schema here:")
        pydantic_schema_input = st_ace(
            value="""from pydantic import BaseModel

class Schema(BaseModel):
    # Add your fields here
    # Example:
    name: str
    age: int
    # email: str
    pass""",
            language="python",
            height=300,
            key="pydantic_editor",
        )

        render_schema = st.button("🔄 Render Pydantic schema to JSON")

        if render_schema and pydantic_schema_input:
            try:
                # NOTE(review): get_root_class presumably executes the
                # editor contents to build the model class — this matches the
                # untrusted-code warning in the app header; confirm in common.py.
                pydantic_root: BaseModel = get_root_class(pydantic_schema_input)
                json_schema = pydantic_root.model_json_schema()
                schema = json.dumps(json_schema, indent=2)
                st.success("✅ Schema rendered successfully!")
                st.json(json_schema)
                # Persist so the JSON tab is pre-filled on the next rerun.
                st.session_state.rendered_pydantic_schema = schema
            except Exception as e:
                st.error(f"❌ Could not parse your schema: {e}")
                schema = None
        elif (
            pydantic_schema_input
            and pydantic_schema_input.strip()
            and not render_schema
        ):
            # If there's Pydantic code but not rendered yet, show a message
            if (
                "class Schema(BaseModel):" in pydantic_schema_input
                and "pass" not in pydantic_schema_input
            ):
                st.info(
                    "💡 Click 'Render Pydantic schema to JSON' to convert your Pydantic model to JSON schema"
                )
182 | 
# Move the run logic outside of col2
run_marker = st.sidebar.button("Run Extraction")

use_llm = st.sidebar.checkbox(
    "Use LLM", help="Use LLM for higher quality text", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
    "Strip existing OCR",
    help="Strip existing OCR text from the PDF and re-OCR.",
    value=False,
)

# Check if schema is provided before running
if run_marker:
    if not schema:
        st.error(
            "❌ Please provide a schema in either the JSON Schema or Pydantic Schema tab before running extraction."
        )
        st.stop()

    # Run Marker
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Uploaded bytes are written verbatim; the .pdf name is cosmetic —
        # presumably the converter sniffs the real format. TODO confirm.
        temp_pdf = os.path.join(tmp_dir, "temp.pdf")
        with open(temp_pdf, "wb") as f:
            f.write(in_file.getvalue())

        # Fold sidebar toggles into the CLI options for this run.
        cli_options.update(
            {
                "force_ocr": force_ocr,
                "use_llm": use_llm,
                "strip_existing_ocr": strip_existing_ocr,
            }
        )

        try:
            # Pass cached markdown so repeat runs on the same file can
            # skip re-conversion (see the file-change handling above).
            rendered = extract_data(
                temp_pdf, cli_options, schema, st.session_state.markdown
            )

            with col2:
                st.write("## Output JSON")
                st.json(rendered.model_dump(exclude=["original_markdown"]))
                st.session_state.markdown = rendered.original_markdown

        except Exception as e:
            st.error(f"❌ Extraction failed: {e}")

else:
    # Show instruction when not running
    if not schema:
        st.info("📝 Please provide a schema and click 'Run Extraction' to begin.")
235 | 
```

--------------------------------------------------------------------------------
/benchmarks/overall/overall.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import os
  3 | import traceback
  4 | from collections import defaultdict
  5 | from pathlib import Path
  6 | from typing import List
  7 | 
  8 | import click
  9 | import datasets
 10 | import torch
 11 | from tqdm import tqdm
 12 | 
 13 | from benchmarks.overall.display.dataset import build_dataset
 14 | from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 15 | from benchmarks.overall.schema import FullResult
 16 | from marker.logger import configure_logging
 17 | from marker.models import create_model_dict
 18 | from marker.settings import settings
 19 | from benchmarks.overall.display.table import print_scores
 20 | 
 21 | configure_logging()
 22 | 
 23 | 
def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
    """Run every requested method and scorer across the benchmark dataset.

    Args:
        benchmark_dataset: Samples providing "classification" (doc type) and
            "gt_blocks" (JSON list of ground-truth blocks with "block_type").
        methods: Method names; each must be a key of METHOD_REGISTRY.
        score_types: Scorer names; each must be a key of SCORE_REGISTRY.
        artifacts: Shared kwargs (models, cached datasets, flags) forwarded
            to each method's constructor.
        max_rows: Optional cap on the number of samples processed.

    Returns:
        FullResult dict: per-sample scores, generated markdown, and score
        lists grouped by document type and by block type, plus timings.
    """
    bench_scores = {}
    # method -> score_type -> doc_type -> [scores]
    averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # method -> score_type -> block_type -> [scores]
    averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    average_times = defaultdict(list)
    markdown_by_method = defaultdict(dict)
    total_rows = len(benchmark_dataset)
    if max_rows:
        total_rows = min(max_rows, total_rows)
    for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark", total=total_rows):
        if max_rows is not None and idx >= max_rows:
            break

        doc_type = sample["classification"]
        gt_cls = METHOD_REGISTRY["gt"]
        gt_blocks = json.loads(sample["gt_blocks"])
        # Ground-truth markdown is regenerated per sample; every method below
        # is scored against it.
        gt_md = gt_cls(**artifacts)(sample)["markdown"]
        markdown_by_method[idx]["gt"] = gt_md

        out_data = defaultdict(dict)

        try:
            for method in methods:
                method_cls = METHOD_REGISTRY[method](**artifacts)
                method_info = method_cls(sample)
                method_md = method_info["markdown"]
                if method_md is None:
                    method_md = "" # Avoid None values

                average_times[method].append(method_info["time"])
                markdown_by_method[idx][method] = method_md

                for score_type in score_types:
                    score_cls = SCORE_REGISTRY[score_type]()
                    try:
                        scores = score_cls(sample, gt_md, method_md)
                    except Exception as e:
                        # Some scorers can fail, like the LLM one
                        print(f"Failed to score {method} with {score_type}: {e}")
                        continue

                    out_data[method][score_type] = scores

                    averages_by_type[method][score_type][doc_type].append(scores["score"])

                    # by_block scores are zipped against gt_blocks in order,
                    # so per-block scores line up with ground-truth block types.
                    if "by_block" in scores["specific_scores"]: # Not all scorers support this
                        for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
                            averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
        except Exception as e:
            # A failed method invalidates the whole sample: drop its markdown
            # so partial results don't leak into the output dataset.
            print(f"Failed to process {idx}: {e}")
            traceback.print_exc()
            if idx in markdown_by_method:
                del markdown_by_method[idx]
            continue

        bench_scores[idx] = out_data

    return {
        "scores": bench_scores,
        "markdown": markdown_by_method,
        "averages_by_type": averages_by_type,
        "averages_by_block_type": averages_by_block_type,
        "average_times": average_times,
    }
 88 | 
@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
@click.option("--methods", type=str, help="Comma separated list of other methods to compare against.  Possible values: marker,mathpix,llamaparse,docling,mistral", default="marker")
@click.option("--scores", type=str, help="Comma separated list of scoring functions to use.  Possible values: heuristic,llm", default="heuristic")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
@click.option("--languages", type=str, help="Comma separated list of languages to use for LLM", default=None)
def main(
        dataset: str,
        out_dataset: str,
        methods: str,
        scores: str,
        result_path: str,
        max_rows: int,
        use_llm: bool,
        languages: str
):
    """CLI entry point: validate options, load datasets/models, run the
    benchmark, print score tables, and write results to result.json
    (optionally pushing a display dataset to the Hub)."""
    out_path = Path(result_path)
    out_path.mkdir(parents=True, exist_ok=True)

    methods = methods.split(",")
    for method in methods:
        if method not in METHOD_REGISTRY:
            raise ValueError(f"Method {method} not allowed.  Allowed methods are {METHOD_REGISTRY.keys()}")

    # Ensure marker is always first
    all_methods = list(set(methods))
    methods = ["marker"] if "marker" in all_methods else []
    methods += [m for m in all_methods if m != "marker"]

    score_types = scores.split(",")
    for score_type in score_types:
        if score_type not in SCORE_REGISTRY:
            raise ValueError(f"Score type {score_type} not allowed.  Allowed types are {SCORE_REGISTRY.keys()}")

    if languages:
        languages = languages.split(",")
    else:
        languages = None

    benchmark_dataset = datasets.load_dataset(dataset, split="train")
    if languages:
        benchmark_dataset = benchmark_dataset.filter(lambda x: x["language"] in languages)

    # Shared artifacts handed to every method constructor. The *_ds entries
    # are presumably pre-generated outputs for the hosted third-party
    # methods — loaded only when the method is requested.
    artifacts = {
        "model_dict": create_model_dict(),
        "use_llm": use_llm,
        "mathpix_ds": None,
        "llamaparse_ds": None,
    }

    if "mathpix" in methods:
        artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")

    if "llamaparse" in methods:
        artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")

    if "mistral" in methods:
        artifacts["mistral_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mistral", split="train")

    if "olmocr" in methods:
        # olmocr runs locally, so its model/processor are loaded lazily here.
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
        model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview",
                                                                torch_dtype=torch.bfloat16).eval()
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        artifacts["olmocr_model"] = {"model": model, "processor": processor}

    print(f"Running benchmark with methods: {methods} and scores: {score_types}")
    result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)

    # Display benchmark scoring tables
    print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0])

    # Write to json
    with open(out_path / "result.json", "w") as f:
        json.dump(result, f)

    if out_dataset:
        if use_llm:
            out_dataset += "_llm"
        dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
        dataset.push_to_hub(out_dataset, private=True)


if __name__ == "__main__":
    main()
178 | 
179 | 
```

--------------------------------------------------------------------------------
/marker/util.py:
--------------------------------------------------------------------------------

```python
  1 | import inspect
  2 | import os
  3 | from importlib import import_module
  4 | from typing import List, Annotated
  5 | import re
  6 | 
  7 | import numpy as np
  8 | import requests
  9 | from pydantic import BaseModel
 10 | 
 11 | from marker.schema.polygon import PolygonBox
 12 | from marker.settings import settings
 13 | 
# Opening/closing <math>, <i>, <b> tags; the opening pattern also tolerates
# attributes (e.g. <math display="inline">) via the optional \s+[^>]* group.
OPENING_TAG_REGEX = re.compile(r"<((?:math|i|b))(?:\s+[^>]*)?>")
CLOSING_TAG_REGEX = re.compile(r"</((?:math|i|b))>")
# Short HTML-style tag name -> long-form style name.
TAG_MAPPING = {
    'i': 'italic',
    'b': 'bold',
    'math': 'math',
    'mark': 'highlight',
    'sub': 'subscript',
    'sup': 'superscript',
    'small': 'small',
    'u': 'underline',
    'code': 'code'
}
 27 | 
 28 | def strings_to_classes(items: List[str]) -> List[type]:
 29 |     classes = []
 30 |     for item in items:
 31 |         module_name, class_name = item.rsplit('.', 1)
 32 |         module = import_module(module_name)
 33 |         classes.append(getattr(module, class_name))
 34 |     return classes
 35 | 
 36 | 
 37 | def classes_to_strings(items: List[type]) -> List[str]:
 38 |     for item in items:
 39 |         if not inspect.isclass(item):
 40 |             raise ValueError(f"Item {item} is not a class")
 41 | 
 42 |     return [f"{item.__module__}.{item.__name__}" for item in items]
 43 | 
 44 | 
 45 | def verify_config_keys(obj):
 46 |     annotations = inspect.get_annotations(obj.__class__)
 47 | 
 48 |     none_vals = ""
 49 |     for attr_name, annotation in annotations.items():
 50 |         if isinstance(annotation, type(Annotated[str, ""])):
 51 |             value = getattr(obj, attr_name)
 52 |             if value is None:
 53 |                 none_vals += f"{attr_name}, "
 54 | 
 55 |     assert len(none_vals) == 0, f"In order to use {obj.__class__.__name__}, you must set the configuration values `{none_vals}`."
 56 | 
 57 | 
 58 | def assign_config(cls, config: BaseModel | dict | None):
 59 |     cls_name = cls.__class__.__name__
 60 |     if config is None:
 61 |         return
 62 |     elif isinstance(config, BaseModel):
 63 |         dict_config = config.dict()
 64 |     elif isinstance(config, dict):
 65 |         dict_config = config
 66 |     else:
 67 |         raise ValueError("config must be a dict or a pydantic BaseModel")
 68 | 
 69 |     for k in dict_config:
 70 |         if hasattr(cls, k):
 71 |             setattr(cls, k, dict_config[k])
 72 |     for k in dict_config:
 73 |         if cls_name not in k:
 74 |             continue
 75 |         # Enables using class-specific keys, like "MarkdownRenderer_remove_blocks"
 76 |         split_k = k.removeprefix(cls_name + "_")
 77 | 
 78 |         if hasattr(cls, split_k):
 79 |             setattr(cls, split_k, dict_config[k])
 80 | 
 81 | 
 82 | def parse_range_str(range_str: str) -> List[int]:
 83 |     range_lst = range_str.split(",")
 84 |     page_lst = []
 85 |     for i in range_lst:
 86 |         if "-" in i:
 87 |             start, end = i.split("-")
 88 |             page_lst += list(range(int(start), int(end) + 1))
 89 |         else:
 90 |             page_lst.append(int(i))
 91 |     page_lst = sorted(list(set(page_lst)))  # Deduplicate page numbers and sort in order
 92 |     return page_lst
 93 | 
 94 | 
 95 | def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
 96 |     if len(boxes1) == 0 or len(boxes2) == 0:
 97 |         return np.zeros((len(boxes1), len(boxes2)))
 98 | 
 99 |     boxes1 = np.array(boxes1)
100 |     boxes2 = np.array(boxes2)
101 | 
102 |     boxes1 = boxes1[:, np.newaxis, :]  # Shape: (N, 1, 4)
103 |     boxes2 = boxes2[np.newaxis, :, :]  # Shape: (1, M, 4)
104 | 
105 |     min_x = np.maximum(boxes1[..., 0], boxes2[..., 0])  # Shape: (N, M)
106 |     min_y = np.maximum(boxes1[..., 1], boxes2[..., 1])
107 |     max_x = np.minimum(boxes1[..., 2], boxes2[..., 2])
108 |     max_y = np.minimum(boxes1[..., 3], boxes2[..., 3])
109 | 
110 |     width = np.maximum(0, max_x - min_x)
111 |     height = np.maximum(0, max_y - min_y)
112 | 
113 |     return width * height  # Shape: (N, M)
114 | 
115 | 
def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    """Pairwise Euclidean distances between box centers.

    Returns an (N, M) array of distances between the centers of boxes1[i]
    and boxes2[j]; boxes are [x1, y1, x2, y2].
    """
    if len(boxes2) == 0:
        return np.zeros((len(boxes1), 0))
    if len(boxes1) == 0:
        return np.zeros((0, len(boxes2)))

    arr1 = np.array(boxes1)  # (N, 4)
    arr2 = np.array(boxes2)  # (M, 4)

    # Center of a box = midpoint of its (x1, y1) and (x2, y2) corners.
    centers1 = (arr1[:, :2] + arr1[:, 2:]) / 2  # (N, 2)
    centers2 = (arr2[:, :2] + arr2[:, 2:]) / 2  # (M, 2)

    # Broadcast to (N, M, 2) and reduce over the coordinate axis.
    deltas = centers1[:, np.newaxis, :] - centers2[np.newaxis, :, :]
    return np.linalg.norm(deltas, axis=2)  # (N, M)
133 | 
134 | 
135 | def sort_text_lines(lines: List[PolygonBox], tolerance=1.25):
136 |     # Sorts in reading order.  Not 100% accurate, this should only
137 |     # be used as a starting point for more advanced sorting.
138 |     vertical_groups = {}
139 |     for line in lines:
140 |         group_key = round(line.bbox[1] / tolerance) * tolerance
141 |         if group_key not in vertical_groups:
142 |             vertical_groups[group_key] = []
143 |         vertical_groups[group_key].append(line)
144 | 
145 |     # Sort each group horizontally and flatten the groups into a single list
146 |     sorted_lines = []
147 |     for _, group in sorted(vertical_groups.items()):
148 |         sorted_group = sorted(group, key=lambda x: x.bbox[0])
149 |         sorted_lines.extend(sorted_group)
150 | 
151 |     return sorted_lines
152 | 
def download_font():
    """Download the debug font to settings.FONT_PATH if it is not already present.

    Streams the font from the configured artifact URL in 8 KB chunks; raises
    for non-2xx HTTP responses.
    """
    if os.path.exists(settings.FONT_PATH):
        return

    os.makedirs(os.path.dirname(settings.FONT_PATH), exist_ok=True)
    url = f"{settings.ARTIFACT_URL}/{settings.FONT_NAME}"
    with requests.get(url, stream=True) as response, open(settings.FONT_PATH, 'wb') as out_file:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=8192):
            out_file.write(chunk)
161 | 
def get_opening_tag_type(tag):
    """
    Determines if a tag is an opening tag and extracts the tag type.

    Args:
        tag (str): The tag string to analyze.

    Returns:
        tuple: (is_opening_tag (bool), tag_type (str or None))
    """
    match = OPENING_TAG_REGEX.match(tag)
    if match is None:
        return False, None

    # Only tags with a known mapping count as openers
    tag_name = match.group(1)
    if tag_name in TAG_MAPPING:
        return True, TAG_MAPPING[tag_name]

    return False, None
180 | 
def get_closing_tag_type(tag):
    """
    Determines if a tag is a closing tag and extracts the tag type.

    Args:
        tag (str): The tag string to analyze.

    Returns:
        tuple: (is_closing_tag (bool), tag_type (str or None))
    """
    match = CLOSING_TAG_REGEX.match(tag)

    if match:
        tag_type = match.group(1)
        # Only tags with a known mapping are recognized
        if tag_type in TAG_MAPPING:
            return True, TAG_MAPPING[tag_type]

    return False, None
199 | 
# Modification of unwrap_math from surya.recognition
MATH_SYMBOLS = ["^", "_", "\\", "{", "}"]
MATH_TAG_PATTERN = re.compile(r'<math\b[^>]*>.*?</math>', re.DOTALL)
LATEX_ESCAPES = {
    r'\%': '%',
    r'\$': '$',
    r'\_': '_',
    r'\&': '&',
    r'\#': '#',
    r'\‰': '‰',
}
def normalize_latex_escapes(s: str) -> str:
    """Replace escaped LaTeX characters (e.g. ``\\%``) with their literal forms."""
    for escaped, literal in LATEX_ESCAPES.items():
        s = s.replace(escaped, literal)
    return s

def unwrap_math(text: str, math_symbols: List[str] = MATH_SYMBOLS) -> str:
    """Unwrap a single <math>...</math> block if it's not really math."""
    if not MATH_TAG_PATTERN.match(text):
        # Not a math-wrapped string at all
        return text

    # Strip the surrounding <math ...> / </math> tags
    inner = re.sub(r'^\s*<math\b[^>]*>|</math>\s*$', '', text, flags=re.DOTALL)

    # Drop a single leading/trailing \\ plus surrounding whitespace
    inner = re.sub(r'^\s*\\\\\s*|\s*\\\\\s*$', '', inner)

    # Replace \text{...} (and variants like \textbf{...}) with their contents
    inner = re.sub(r'\\text[a-zA-Z]*\s*\{(.*?)\}', r'\1', inner)

    # Convert escaped characters to their literal forms
    normalized = normalize_latex_escapes(inner)

    # Still contains math markup -> keep the original wrapped form
    if any(symbol in normalized for symbol in math_symbols):
        return text

    return normalized.strip()
```

--------------------------------------------------------------------------------
/marker/processors/debug.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import os
  3 | from typing import Annotated
  4 | 
  5 | from PIL import Image, ImageDraw, ImageFont
  6 | from marker.logger import get_logger
  7 | 
  8 | from marker.processors import BaseProcessor
  9 | from marker.schema import BlockTypes
 10 | from marker.schema.document import Document
 11 | from marker.settings import settings
 12 | 
 13 | logger = get_logger()
 14 | 
 15 | 
class DebugProcessor(BaseProcessor):
    """
    A processor for debugging the document.

    Depending on configuration flags, dumps one or more of: per-page layout
    debug images, per-page PDF overlay images, and the block tree as JSON.
    All outputs go into a subfolder of `debug_data_folder` named after the
    source document.
    """

    # Block types handled by this processor (unused here; kept for the
    # BaseProcessor interface).
    block_types: Annotated[
        tuple, "The block types to process.", "Default is an empty tuple."
    ] = tuple()
    debug_data_folder: Annotated[
        str,
        "The folder to dump debug data to.",
    ] = "debug_data"
    debug_layout_images: Annotated[
        bool,
        "Whether to dump layout debug images.",
    ] = False
    debug_pdf_images: Annotated[
        bool,
        "Whether to dump PDF debug images.",
    ] = False
    debug_json: Annotated[
        bool,
        "Whether to dump block debug data.",
    ] = False

    def __call__(self, document: Document):
        """Run all enabled debug dumps for `document` and record the output path on it."""
        # Remove extension from doc name
        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)
        # Only create the folder if at least one dump is enabled
        if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
            os.makedirs(self.debug_folder, exist_ok=True)

        # Expose the debug path so downstream consumers can find the dumps
        document.debug_data_path = self.debug_folder

        if self.debug_layout_images:
            self.draw_layout_debug_images(document)
            logger.info(f"Dumped layout debug images to {self.debug_data_folder}")

        if self.debug_pdf_images:
            self.draw_pdf_debug_images(document)
            logger.info(f"Dumped PDF debug images to {self.debug_data_folder}")

        if self.debug_json:
            self.dump_block_debug_data(document)
            logger.info(f"Dumped block debug data to {self.debug_data_folder}")

    def draw_pdf_debug_images(self, document: Document):
        """Render line bboxes (with ids) and layout boxes over each page's highres image."""
        for page in document.pages:
            # Copy so we never draw on the cached page image
            png_image = page.get_image(highres=True).copy()

            line_bboxes = []
            span_bboxes = []
            line_ids = []
            for child in page.children:
                # Skip any blocks that have been removed
                if child.removed:
                    continue

                if child.block_type == BlockTypes.Line:
                    # Rescale from page coordinates to image pixel coordinates
                    bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                    line_bboxes.append(bbox)
                    line_ids.append(child.block_id)
                elif child.block_type == BlockTypes.Span:
                    bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                    span_bboxes.append(bbox)

            self.render_on_image(
                line_bboxes,
                png_image,
                color="blue",
                draw_bbox=True,
                label_font_size=24,
                labels=[str(i) for i in line_ids],
            )

            png_image = self.render_layout_boxes(page, png_image)

            debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
            png_image.save(debug_file)

    def draw_layout_debug_images(self, document: Document, pdf_mode=False):
        """Render each page's line text and layout boxes onto a blank white canvas.

        NOTE(review): `pdf_mode` is currently unused - confirm whether it can be
        dropped or is reserved for callers.
        """
        for page in document.pages:
            img_size = page.get_image(highres=True).size
            # Draw on a fresh white canvas rather than the page image
            png_image = Image.new("RGB", img_size, color="white")

            line_bboxes = []
            line_text = []
            for child in page.children:
                if child.removed:
                    continue

                if child.block_type != BlockTypes.Line:
                    continue

                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
                line_bboxes.append(bbox)
                line_text.append(child.raw_text(document))

            # Render the raw text at each line position (no boxes)
            self.render_on_image(
                line_bboxes,
                png_image,
                labels=line_text,
                color="black",
                draw_bbox=False,
                label_font_size=24,
            )

            png_image = self.render_layout_boxes(page, png_image)

            debug_file = os.path.join(
                self.debug_folder, f"layout_page_{page.page_id}.png"
            )
            png_image.save(debug_file)

    def render_layout_boxes(self, page, png_image):
        """Draw layout-level blocks (red, labeled by type) and their reading order (green)."""
        layout_bboxes = []
        layout_labels = []
        for block_id in page.structure:
            child = page.get_block(block_id)
            # Lines and spans are drawn separately by the callers
            if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
                continue

            bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
            layout_bboxes.append(bbox)
            layout_labels.append(str(child.block_type))

        self.render_on_image(
            layout_bboxes,
            png_image,
            labels=layout_labels,
            color="red",
            label_font_size=24,
        )

        # Second pass: overlay the reading-order index of each layout block
        order_labels = [str(i) for i in range(len(layout_bboxes))]
        self.render_on_image(
            layout_bboxes,
            png_image,
            labels=order_labels,
            color="green",
            draw_bbox=False,
            label_offset=5,
            label_font_size=24,
        )
        return png_image

    def dump_block_debug_data(self, document: Document):
        """Serialize every page's block tree to blocks.json, excluding image payloads."""
        debug_file = os.path.join(self.debug_folder, "blocks.json")
        debug_data = []
        for page in document.pages:
            # Exclude the (non-serializable, large) PIL images at page and child level
            page_data = page.model_dump(
                exclude={
                    "lowres_image": True,
                    "highres_image": True,
                    "children": {
                        "__all__": {"lowres_image": True, "highres_image": True}
                    },
                }
            )
            debug_data.append(page_data)

        with open(debug_file, "w+") as f:
            json.dump(debug_data, f)

    def get_text_size(self, text, font):
        """Return (width, height) of `text` when rendered with `font`."""
        # Throwaway image: textbbox needs a Draw object but never draws
        im = Image.new(mode="P", size=(0, 0))
        draw = ImageDraw.Draw(im)
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return width, height

    def render_on_image(
        self,
        bboxes,
        image,
        labels=None,
        label_offset=1,
        label_font_size=10,
        color: str | list = "red",
        draw_bbox=True,
    ):
        """Draw `bboxes` (and optional per-box `labels`) onto `image` in place.

        `color` may be a single color for all boxes or a list with one color
        per box. Labels are drawn on a white background rectangle for legibility.
        Returns the (mutated) image.
        """
        draw = ImageDraw.Draw(image)
        font_path = settings.FONT_PATH
        label_font = ImageFont.truetype(font_path, label_font_size)

        for i, bbox in enumerate(bboxes):
            bbox = [int(p) for p in bbox]
            if draw_bbox:
                draw.rectangle(
                    bbox,
                    outline=color[i] if isinstance(color, list) else color,
                    width=1,
                )

            if labels is not None:
                label = labels[i]
                text_position = (bbox[0] + label_offset, bbox[1] + label_offset)
                text_size = self.get_text_size(label, label_font)
                # Skip degenerate (empty) labels
                if text_size[0] <= 0 or text_size[1] <= 0:
                    continue
                box_position = (
                    text_position[0],
                    text_position[1],
                    text_position[0] + text_size[0],
                    text_position[1] + text_size[1],
                )
                # White backing rectangle so the label stays readable over content
                draw.rectangle(box_position, fill="white")
                draw.text(
                    text_position,
                    label,
                    fill=color[i] if isinstance(color, list) else color,
                    font=label_font,
                )

        return image
230 | 
```

--------------------------------------------------------------------------------
/marker/converters/pdf.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | 
  3 | from marker.schema.document import Document
  4 | 
  5 | os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning
  6 | 
  7 | from collections import defaultdict
  8 | from typing import Annotated, Any, Dict, List, Optional, Type, Tuple, Union
  9 | import io
 10 | from contextlib import contextmanager
 11 | import tempfile
 12 | 
 13 | from marker.processors import BaseProcessor
 14 | from marker.services import BaseService
 15 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
 16 | from marker.providers.registry import provider_from_filepath
 17 | from marker.builders.document import DocumentBuilder
 18 | from marker.builders.layout import LayoutBuilder
 19 | from marker.builders.line import LineBuilder
 20 | from marker.builders.ocr import OcrBuilder
 21 | from marker.builders.structure import StructureBuilder
 22 | from marker.converters import BaseConverter
 23 | from marker.processors.blockquote import BlockquoteProcessor
 24 | from marker.processors.code import CodeProcessor
 25 | from marker.processors.debug import DebugProcessor
 26 | from marker.processors.document_toc import DocumentTOCProcessor
 27 | from marker.processors.equation import EquationProcessor
 28 | from marker.processors.footnote import FootnoteProcessor
 29 | from marker.processors.ignoretext import IgnoreTextProcessor
 30 | from marker.processors.line_numbers import LineNumbersProcessor
 31 | from marker.processors.list import ListProcessor
 32 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
 33 | from marker.processors.llm.llm_form import LLMFormProcessor
 34 | from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
 35 | from marker.processors.llm.llm_table import LLMTableProcessor
 36 | from marker.processors.page_header import PageHeaderProcessor
 37 | from marker.processors.reference import ReferenceProcessor
 38 | from marker.processors.sectionheader import SectionHeaderProcessor
 39 | from marker.processors.table import TableProcessor
 40 | from marker.processors.text import TextProcessor
 41 | from marker.processors.block_relabel import BlockRelabelProcessor
 42 | from marker.processors.blank_page import BlankPageProcessor
 43 | from marker.processors.llm.llm_equation import LLMEquationProcessor
 44 | from marker.renderers.markdown import MarkdownRenderer
 45 | from marker.schema import BlockTypes
 46 | from marker.schema.blocks import Block
 47 | from marker.schema.registry import register_block_class
 48 | from marker.util import strings_to_classes
 49 | from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
 50 | from marker.processors.order import OrderProcessor
 51 | from marker.services.gemini import GoogleGeminiService
 52 | from marker.processors.line_merge import LineMergeProcessor
 53 | from marker.processors.llm.llm_mathblock import LLMMathBlockProcessor
 54 | from marker.processors.llm.llm_page_correction import LLMPageCorrectionProcessor
 55 | from marker.processors.llm.llm_sectionheader import LLMSectionHeaderProcessor
 56 | 
 57 | 
class PdfConverter(BaseConverter):
    """
    A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
    """

    # NOTE(review): class-level mutable default shared by all instances that
    # don't override it - confirm mutations of `override_map` are intended to
    # be global.
    override_map: Annotated[
        Dict[BlockTypes, Type[Block]],
        "A mapping to override the default block classes for specific block types.",
        "The keys are `BlockTypes` enum values, representing the types of blocks,",
        "and the values are corresponding `Block` class implementations to use",
        "instead of the defaults.",
    ] = defaultdict()
    use_llm: Annotated[
        bool,
        "Enable higher quality processing with LLMs.",
    ] = False
    # Processor classes (not instances) run in order over the built document.
    # LLM processors are no-ops unless an llm_service is configured.
    default_processors: Tuple[BaseProcessor, ...] = (
        OrderProcessor,
        BlockRelabelProcessor,
        LineMergeProcessor,
        BlockquoteProcessor,
        CodeProcessor,
        DocumentTOCProcessor,
        EquationProcessor,
        FootnoteProcessor,
        IgnoreTextProcessor,
        LineNumbersProcessor,
        ListProcessor,
        PageHeaderProcessor,
        SectionHeaderProcessor,
        TableProcessor,
        LLMTableProcessor,
        LLMTableMergeProcessor,
        LLMFormProcessor,
        TextProcessor,
        LLMComplexRegionProcessor,
        LLMImageDescriptionProcessor,
        LLMEquationProcessor,
        LLMHandwritingProcessor,
        LLMMathBlockProcessor,
        LLMSectionHeaderProcessor,
        LLMPageCorrectionProcessor,
        ReferenceProcessor,
        BlankPageProcessor,
        DebugProcessor,
    )
    # Service class used when use_llm is set but no explicit service is given
    default_llm_service: BaseService = GoogleGeminiService

    def __init__(
        self,
        artifact_dict: Dict[str, Any],
        processor_list: Optional[List[str]] = None,
        renderer: str | None = None,
        llm_service: str | None = None,
        config=None,
    ):
        """Set up processors, renderer, and (optionally) an LLM service.

        Args:
            artifact_dict: Shared artifacts (models etc.) made available to
                builders/processors via dependency resolution.
            processor_list: Optional dotted-path strings naming processor
                classes; defaults to `default_processors`.
            renderer: Optional dotted-path string naming the renderer class;
                defaults to MarkdownRenderer.
            llm_service: Optional dotted-path string naming the LLM service
                class; if absent, one is created only when config["use_llm"]
                is truthy.
            config: Optional configuration dict.
        """
        super().__init__(config)

        if config is None:
            config = {}

        # Apply block-class overrides globally in the block registry
        for block_type, override_block_type in self.override_map.items():
            register_block_class(block_type, override_block_type)

        if processor_list is not None:
            processor_list = strings_to_classes(processor_list)
        else:
            processor_list = self.default_processors

        if renderer:
            renderer = strings_to_classes([renderer])[0]
        else:
            renderer = MarkdownRenderer

        # Put here so that resolve_dependencies can access it
        self.artifact_dict = artifact_dict

        if llm_service:
            llm_service_cls = strings_to_classes([llm_service])[0]
            llm_service = self.resolve_dependencies(llm_service_cls)
        elif config.get("use_llm", False):
            llm_service = self.resolve_dependencies(self.default_llm_service)

        # Inject llm service into artifact_dict so it can be picked up by processors, etc.
        self.artifact_dict["llm_service"] = llm_service
        self.llm_service = llm_service

        self.renderer = renderer

        processor_list = self.initialize_processors(processor_list)
        self.processor_list = processor_list

        self.layout_builder_class = LayoutBuilder
        self.page_count = None  # Track how many pages were converted

    @contextmanager
    def filepath_to_str(self, file_input: Union[str, io.BytesIO]):
        """Yield a filesystem path for `file_input`.

        A str is yielded as-is; a BytesIO is written to a temporary .pdf file
        which is removed on exit. Raises TypeError for any other input type.
        """
        temp_file = None
        try:
            if isinstance(file_input, str):
                yield file_input
            else:
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".pdf"
                ) as temp_file:
                    if isinstance(file_input, io.BytesIO):
                        file_input.seek(0)
                        temp_file.write(file_input.getvalue())
                    else:
                        raise TypeError(
                            f"Expected str or BytesIO, got {type(file_input)}"
                        )

                yield temp_file.name
        finally:
            # Clean up the temp file even if the caller raised
            if temp_file is not None and os.path.exists(temp_file.name):
                os.unlink(temp_file.name)

    def build_document(self, filepath: str) -> Document:
        """Build a Document from `filepath`: provider -> builders -> processors."""
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )
        structure_builder_cls = self.resolve_dependencies(StructureBuilder)
        structure_builder_cls(document)

        # Processors run in list order; order matters (e.g. OrderProcessor first)
        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str | io.BytesIO):
        """Convert a PDF (path or BytesIO) and return the rendered output."""
        with self.filepath_to_str(filepath) as temp_path:
            document = self.build_document(temp_path)
            self.page_count = len(document.pages)
            renderer = self.resolve_dependencies(self.renderer)
            rendered = renderer(document)
        return rendered
200 | 
```

--------------------------------------------------------------------------------
/marker/schema/polygon.py:
--------------------------------------------------------------------------------

```python
  1 | from __future__ import annotations
  2 | import copy
  3 | from typing import List
  4 | 
  5 | import numpy as np
  6 | from pydantic import BaseModel, field_validator, computed_field
  7 | 
  8 | 
  9 | class PolygonBox(BaseModel):
 10 |     polygon: List[List[float]]
 11 | 
 12 |     @field_validator('polygon')
 13 |     @classmethod
 14 |     def check_elements(cls, v: List[List[float]]) -> List[List[float]]:
 15 |         if len(v) != 4:
 16 |             raise ValueError('corner must have 4 elements')
 17 | 
 18 |         for corner in v:
 19 |             if len(corner) != 2:
 20 |                 raise ValueError('corner must have 2 elements')
 21 | 
 22 |         min_x = min([corner[0] for corner in v])
 23 |         min_y = min([corner[1] for corner in v])
 24 | 
 25 |         # Ensure corners are clockwise from top left
 26 |         corner_error = f" .Corners are {v}"
 27 |         assert v[2][1] >= min_y, f'bottom right corner should have a greater y value than top right corner' + corner_error
 28 |         assert v[3][1] >= min_y, 'bottom left corner should have a greater y value than top left corner' + corner_error
 29 |         assert v[1][0] >= min_x, 'top right corner should have a greater x value than top left corner' + corner_error
 30 |         assert v[2][0] >= min_x, 'bottom right corner should have a greater x value than bottom left corner' + corner_error
 31 |         return v
 32 | 
 33 |     @property
 34 |     def height(self):
 35 |         return self.bbox[3] - self.bbox[1]
 36 | 
 37 |     @property
 38 |     def width(self):
 39 |         return self.bbox[2] - self.bbox[0]
 40 | 
 41 |     @property
 42 |     def area(self):
 43 |         return self.width * self.height
 44 | 
 45 |     @property
 46 |     def center(self):
 47 |         return [(self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2]
 48 | 
 49 |     @property
 50 |     def size(self):
 51 |         return [self.width, self.height]
 52 | 
 53 |     @property
 54 |     def x_start(self):
 55 |         return self.bbox[0]
 56 | 
 57 |     @property
 58 |     def y_start(self):
 59 |         return self.bbox[1]
 60 | 
 61 |     @property
 62 |     def x_end(self):
 63 |         return self.bbox[2]
 64 | 
 65 |     @property
 66 |     def y_end(self):
 67 |         return self.bbox[3]
 68 | 
 69 |     @computed_field
 70 |     @property
 71 |     def bbox(self) -> List[float]:
 72 |         min_x = min([corner[0] for corner in self.polygon])
 73 |         min_y = min([corner[1] for corner in self.polygon])
 74 |         max_x = max([corner[0] for corner in self.polygon])
 75 |         max_y = max([corner[1] for corner in self.polygon])
 76 |         return [min_x, min_y, max_x, max_y]
 77 | 
 78 |     def expand(self, x_margin: float, y_margin: float) -> PolygonBox:
 79 |         new_polygon = []
 80 |         x_margin = x_margin * self.width
 81 |         y_margin = y_margin * self.height
 82 |         for idx, poly in enumerate(self.polygon):
 83 |             if idx == 0:
 84 |                 new_polygon.append([poly[0] - x_margin, poly[1] - y_margin])
 85 |             elif idx == 1:
 86 |                 new_polygon.append([poly[0] + x_margin, poly[1] - y_margin])
 87 |             elif idx == 2:
 88 |                 new_polygon.append([poly[0] + x_margin, poly[1] + y_margin])
 89 |             elif idx == 3:
 90 |                 new_polygon.append([poly[0] - x_margin, poly[1] + y_margin])
 91 |         return PolygonBox(polygon=new_polygon)
 92 | 
 93 |     def expand_y2(self, y_margin: float) -> PolygonBox:
 94 |         new_polygon = []
 95 |         y_margin = y_margin * self.height
 96 |         for idx, poly in enumerate(self.polygon):
 97 |             if idx == 2:
 98 |                 new_polygon.append([poly[0], poly[1] + y_margin])
 99 |             elif idx == 3:
100 |                 new_polygon.append([poly[0], poly[1] + y_margin])
101 |             else:
102 |                 new_polygon.append(poly)
103 |         return PolygonBox(polygon=new_polygon)
104 | 
105 |     def expand_y1(self, y_margin: float) -> PolygonBox:
106 |         new_polygon = []
107 |         y_margin = y_margin * self.height
108 |         for idx, poly in enumerate(self.polygon):
109 |             if idx == 0:
110 |                 new_polygon.append([poly[0], poly[1] - y_margin])
111 |             elif idx == 1:
112 |                 new_polygon.append([poly[0], poly[1] - y_margin])
113 |             else:
114 |                 new_polygon.append(poly)
115 |         return PolygonBox(polygon=new_polygon)
116 | 
117 |     def minimum_gap(self, other: PolygonBox):
118 |         if self.intersection_pct(other) > 0:
119 |             return 0
120 | 
121 |         def dist(p1, p2):
122 |             return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5
123 | 
124 |         left = other.bbox[2] < self.bbox[0]
125 |         right = self.bbox[2] < other.bbox[0]
126 |         bottom = other.bbox[3] < self.bbox[1]
127 |         top = self.bbox[3] < other.bbox[1]
128 |         if top and left:
129 |             return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1]))
130 |         elif left and bottom:
131 |             return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3]))
132 |         elif bottom and right:
133 |             return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3]))
134 |         elif right and top:
135 |             return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1]))
136 |         elif left:
137 |             return self.bbox[0] - other.bbox[2]
138 |         elif right:
139 |             return other.bbox[0] - self.bbox[2]
140 |         elif bottom:
141 |             return self.bbox[1] - other.bbox[3]
142 |         elif top:
143 |             return other.bbox[1] - self.bbox[3]
144 |         else:
145 |             return 0
146 | 
147 |     def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: float = 1, absolute=False):
148 |         if not absolute:
149 |             return ((self.center[0] - other.center[0]) ** 2 * x_weight + (self.center[1] - other.center[1]) ** 2 * y_weight) ** 0.5
150 |         else:
151 |             return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight
152 | 
153 |     def tl_distance(self, other: PolygonBox):
154 |         return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5
155 | 
156 |     def rescale(self, old_size, new_size):
157 |         # Point is in x, y format
158 |         page_width, page_height = old_size
159 |         img_width, img_height = new_size
160 | 
161 |         width_scaler = img_width / page_width
162 |         height_scaler = img_height / page_height
163 | 
164 |         new_corners = copy.deepcopy(self.polygon)
165 |         for corner in new_corners:
166 |             corner[0] = corner[0] * width_scaler
167 |             corner[1] = corner[1] * height_scaler
168 |         return PolygonBox(polygon=new_corners)
169 | 
170 |     def fit_to_bounds(self, bounds):
171 |         new_corners = copy.deepcopy(self.polygon)
172 |         for corner in new_corners:
173 |             corner[0] = max(min(corner[0], bounds[2]), bounds[0])
174 |             corner[1] = max(min(corner[1], bounds[3]), bounds[1])
175 |         return PolygonBox(polygon=new_corners)
176 | 
177 |     def overlap_x(self, other: PolygonBox):
178 |         return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))
179 | 
180 |     def overlap_y(self, other: PolygonBox):
181 |         return max(0, min(self.bbox[3], other.bbox[3]) - max(self.bbox[1], other.bbox[1]))
182 | 
183 |     def intersection_area(self, other: PolygonBox):
184 |         return self.overlap_x(other) * self.overlap_y(other)
185 | 
186 |     def intersection_pct(self, other: PolygonBox):
187 |         if self.area == 0:
188 |             return 0
189 | 
190 |         intersection = self.intersection_area(other)
191 |         return intersection / self.area
192 | 
193 |     def merge(self, others: List[PolygonBox]) -> PolygonBox:
194 |         corners = []
195 |         for i in range(len(self.polygon)):
196 |             x_coords = [self.polygon[i][0]] + [other.polygon[i][0] for other in others]
197 |             y_coords = [self.polygon[i][1]] + [other.polygon[i][1] for other in others]
198 |             min_x = min(x_coords)
199 |             min_y = min(y_coords)
200 |             max_x = max(x_coords)
201 |             max_y = max(y_coords)
202 | 
203 |             if i == 0:
204 |                 corners.append([min_x, min_y])
205 |             elif i == 1:
206 |                 corners.append([max_x, min_y])
207 |             elif i == 2:
208 |                 corners.append([max_x, max_y])
209 |             elif i == 3:
210 |                 corners.append([min_x, max_y])
211 |         return PolygonBox(polygon=corners)
212 | 
213 |     @classmethod
214 |     def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False):
215 |         if ensure_nonzero_area:
216 |             bbox = list(bbox)
217 |             bbox[2] = max(bbox[2], bbox[0] + 1)
218 |             bbox[3] = max(bbox[3], bbox[1] + 1)
219 |         return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]])
220 | 
```

--------------------------------------------------------------------------------
/marker/providers/powerpoint.py:
--------------------------------------------------------------------------------

```python
  1 | import base64
  2 | import os
  3 | import tempfile
  4 | import traceback
  5 | 
  6 | from marker.logger import get_logger
  7 | from marker.providers.pdf import PdfProvider
  8 | 
  9 | logger = get_logger()
 10 | 
# Stylesheet applied when the intermediate PPTX-derived HTML is rendered to
# PDF by WeasyPrint: A4 landscape pages, bordered full-width tables whose rows
# avoid breaking across pages, and images scaled down to fit their container.
css = """
@page {
    size: A4 landscape;
    margin: 1.5cm;
}

table {
    width: 100%;
    border-collapse: collapse;
    break-inside: auto;
    font-size: 10pt;
}

tr {
    break-inside: avoid;
    page-break-inside: avoid;
}

td {
    border: 0.75pt solid #000;
    padding: 6pt;
}

img {
    max-width: 100%;
    height: auto;
    object-fit: contain;
}
"""
 40 | 
 41 | 
class PowerPointProvider(PdfProvider):
    """Provider that ingests a PowerPoint (.pptx) file.

    The deck is rendered to intermediate HTML with python-pptx, converted to a
    temporary PDF with WeasyPrint, and then handed to the normal PdfProvider
    pipeline via ``super().__init__``.
    """

    # When True, a "Slide N" heading is emitted before each slide's content.
    include_slide_number: bool = False

    def __init__(self, filepath: str, config=None):
        """Convert *filepath* (a .pptx) to a temporary PDF and initialize the base provider.

        Raises:
            ValueError: if the PPTX -> PDF conversion fails for any reason.
        """
        # Create the temp file closed so the PDF can be written to it by path;
        # delete=False keeps the name valid after close (removed in __del__).
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert PPTX to PDF
        try:
            self.convert_pptx_to_pdf(filepath)
        except Exception as e:
            print(traceback.format_exc())
            raise ValueError(f"Error converting PPTX to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # Best-effort cleanup of the temporary PDF created in __init__.
        if os.path.exists(self.temp_pdf_path):
            os.remove(self.temp_pdf_path)

    def convert_pptx_to_pdf(self, filepath):
        """Render every slide of *filepath* as HTML, then write the result to
        ``self.temp_pdf_path`` as a PDF with WeasyPrint."""
        from weasyprint import CSS, HTML
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        pptx = Presentation(filepath)

        html_parts = []

        for slide_index, slide in enumerate(pptx.slides):
            # One <section> per slide.
            html_parts.append("<section>")
            if self.include_slide_number:
                html_parts.append(f"<h2>Slide {slide_index + 1}</h2>")

            # Process shapes in the slide.  Order of the checks matters:
            # groups first (recursive), then tables, pictures, and text.
            for shape in slide.shapes:
                # If shape is a group shape, we recursively handle all grouped shapes
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    html_parts.append(self._handle_group(shape))
                    continue

                # If shape is a table
                if shape.has_table:
                    html_parts.append(self._handle_table(shape))
                    continue

                # If shape is a picture
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    html_parts.append(self._handle_image(shape))
                    continue

                # If shape has text
                if hasattr(shape, "text") and shape.text is not None:
                    if shape.has_text_frame:
                        # Distinguish placeholders (title, subtitle, etc.)
                        html_parts.append(self._handle_text(shape))
                    else:
                        html_parts.append(f"<p>{self._escape_html(shape.text)}</p>")

            html_parts.append("</section>")

        html = "\n".join(html_parts)

        # We convert the HTML into a PDF
        HTML(string=html).write_pdf(
            self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
        )

    def _handle_group(self, group_shape) -> str:
        """
        Recursively handle shapes in a group. Returns HTML string for the entire group.

        Mirrors the per-shape dispatch in convert_pptx_to_pdf (group, table,
        picture, text) so nested groups flatten into the same HTML stream.
        """
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        group_parts = []
        for shape in group_shape.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                group_parts.append(self._handle_group(shape))
                continue

            if shape.has_table:
                group_parts.append(self._handle_table(shape))
                continue

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                group_parts.append(self._handle_image(shape))
                continue

            if hasattr(shape, "text"):
                if shape.has_text_frame:
                    group_parts.append(self._handle_text(shape))
                else:
                    group_parts.append(f"<p>{self._escape_html(shape.text)}</p>")

        return "".join(group_parts)

    def _handle_text(self, shape) -> str:
        """
        Processes shape text, including bullet/numbered list detection and placeholders
        (title, subtitle, etc.). Returns HTML for the text block(s).
        """
        from pptx.enum.shapes import PP_PLACEHOLDER

        # Distinguish placeholders to see if it's a title or subtitle
        label_html_tag = "p"
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in [PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE]:
                label_html_tag = "h3"
            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                label_html_tag = "h4"

        # Keep track of whether we are currently in a <ul> or <ol>
        html_parts = []
        list_open = False
        list_type = None  # "ul" or "ol"

        for paragraph in shape.text_frame.paragraphs:
            # Inspect the raw DrawingML to detect bullet/auto-number markers,
            # which python-pptx does not expose through its public API.
            p_el = paragraph._element
            # Check bullet
            bullet_char = p_el.find(".//a:buChar", namespaces=p_el.nsmap)
            bullet_num = p_el.find(".//a:buAutoNum", namespaces=p_el.nsmap)

            # NOTE(review): any indented paragraph (level > 0) is treated as a
            # bullet even without an explicit bullet character.
            is_bullet = (bullet_char is not None) or (paragraph.level > 0)
            is_numbered = bullet_num is not None

            # If the paragraph is bullet or numbered
            if is_bullet or is_numbered:
                # Decide if we need to start a new list or continue an existing one
                current_list_type = "ol" if is_numbered else "ul"
                if not list_open:
                    # Start new
                    list_open = True
                    list_type = current_list_type
                    html_parts.append(f"<{list_type}>")

                elif list_open and list_type != current_list_type:
                    # Close old list, start new
                    html_parts.append(f"</{list_type}>")
                    list_type = current_list_type
                    html_parts.append(f"<{list_type}>")

                # Build the bullet (li) text from all runs in the paragraph
                p_text = "".join(run.text for run in paragraph.runs)
                if p_text:
                    html_parts.append(f"<li>{self._escape_html(p_text)}</li>")

            else:
                # If we were in a list, we need to close it
                if list_open:
                    html_parts.append(f"</{list_type}>")
                    list_open = False
                    list_type = None

                # Now it's just a normal paragraph
                # Gather the paragraph text from runs
                p_text = "".join(run.text for run in paragraph.runs)
                if p_text:
                    # If we know it's a slide title, we can use <h3> or so
                    html_parts.append(
                        f"<{label_html_tag}>{self._escape_html(p_text)}</{label_html_tag}>"
                    )

        # If the text frame ended and we still have an open list, close it
        if list_open:
            html_parts.append(f"</{list_type}>")

        return "".join(html_parts)

    def _handle_image(self, shape) -> str:
        """
        Embeds the image as a base64 <img> in HTML.

        Returns an empty string when the image blob cannot be encoded.
        NOTE(review): the warning message mentions Pillow, but only base64
        encoding happens here — the message looks inherited from older code.
        """
        image = shape.image
        image_bytes = image.blob

        try:
            img_str = base64.b64encode(image_bytes).decode("utf-8")
            return f"<img src='data:{image.content_type};base64,{img_str}' />"
        except Exception as e:
            logger.warning(f"Warning: image cannot be loaded by Pillow: {e}")
            return ""

    def _handle_table(self, shape) -> str:
        """
        Renders a shape's table as an HTML <table>.
        """
        table_html = []
        table_html.append("<table border='1'>")

        for row in shape.table.rows:
            row_html = ["<tr>"]
            for cell in row.cells:
                row_html.append(f"<td>{self._escape_html(cell.text)}</td>")
            row_html.append("</tr>")
            table_html.append("".join(row_html))

        table_html.append("</table>")
        return "".join(table_html)

    def _escape_html(self, text: str) -> str:
        """
        Minimal escaping for HTML special characters.
        """
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;")
        )
255 | 
```

--------------------------------------------------------------------------------
/benchmarks/overall/elo.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import random
  3 | import time
  4 | import os
  5 | from dataclasses import dataclass
  6 | from typing import List, Dict, Tuple, Literal
  7 | from PIL import Image
  8 | from collections import defaultdict
  9 | import tabulate
 10 | 
 11 | import click
 12 | import datasets
 13 | from google import genai
 14 | from google.genai.errors import APIError
 15 | from pydantic import BaseModel
 16 | from tqdm import tqdm
 17 | 
 18 | from marker.settings import settings
 19 | 
 20 | rating_prompt = """
 21 | You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B.
 22 | 
 23 | Here are some notes on the image and markdown:
 24 | - Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
 25 | - Tables will be formatted as Github flavored markdown.
 26 | - Block equations will be in LaTeX.
 27 | - The image and markdown may be in any language.
 28 | - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
 29 | 
 30 | The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
 31 | 
 32 | **Instructions**
 33 | Follow this process to evaluate the markdown:
 34 | 1. Carefully examine the image.
 35 | 2. Carefully examine the first markdown input provided.
 36 | 3. Describe how well version a represents the image.
 37 | 4. Carefully examine the second markdown input provided.
 38 | 5. Describe how well version B represents the image.
 39 | 6. Compare version A and version B.
 40 | 7. Decide which markdown representation is better, based on the criteria below.  Output version_a if version a is better, and version_b if version b is better.
 41 | 
 42 | Use these criteria when judging the markdown:
 43 | - Overall - the overall quality of the markdown as compared to the image.
 44 | - Text quality - the quality of the text extraction from the image.
 45 | - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
 46 | - Tables - how effectively the tables have been extracted and formatted.
 47 | - Forms - how effectively the forms have extracted and formatted.
 48 | - Equations - how effectively block equations have been converted to LaTeX.
 49 | - Lists - if the lists have been properly extracted and formatted.
 50 | - Images - if images are identified and placed correctly.
 51 | 
 52 | Notes on scoring:
 53 | - Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay).  It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings.  If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
 54 | - Bad markdown will have major missing text segments from the markdown or completely unreadable formatting.  It may also have key values that are different from the values in the image.
 55 | 
 56 | Output json, like in the example below.
 57 | 
 58 | **Example**
 59 | Version A
 60 | ```markdown
 61 | # *Section 1*
 62 | This is some *markdown* extracted from a document.  Here is a block equation:
 63 | $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
 64 | ```
 65 | Version B
 66 | ```markdown
 67 | # Section 1
 68 | This is some markdown extracted from a document.  Here is a block equation:
 69 | $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 124}{t}$$
 70 | ```
 71 | Output
 72 | ```json
 73 | {
 74 |     "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
 75 |     "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
 76 |     "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.  The formatting in version b is slightly different from the image.  The value 124 is also different from the image.",
 77 |     "comparison": "Version A is better than version B.  The text and formatting in version A matches the image better than version B.  Version B also has an incorrect value.",
 78 |     "winner": "version_a",
 79 | }
 80 | ```
 81 | **Input**
 82 | Version A
 83 | ```markdown
 84 | {{version_a}}
 85 | ```
 86 | Version B
 87 | ```markdown
 88 | {{version_b}}
 89 | ```
 90 | **Output**
 91 | """
 92 | 
class ComparerSchema(BaseModel):
    """Structured judgment requested from the LLM comparer.

    Field order mirrors the chain-of-thought requested in the prompt:
    describe the image and both versions, compare them, then pick a winner.
    """

    image_description: str
    version_a_description: str
    version_b_description: str
    comparison: str
    winner: Literal["version_a", "version_b"]
 99 | 
100 | 
class Comparer:
    """Asks a Gemini model which of two markdown conversions better matches a page image."""

    def __init__(self):
        pass

    def __call__(
        self,
        img: Image.Image,
        version_a: str,
        version_b: str
    ) -> str | None:
        """Return "version_a" or "version_b", or None when no judgment could be made.

        A missing (None) markdown sample automatically loses; if both samples
        are missing there is nothing to judge.
        """
        # Previously a both-None input fell through to str.replace(..., None)
        # and crashed with an uncaught TypeError.
        if version_a is None and version_b is None:
            return None
        if version_a is None:
            return "version_b"
        if version_b is None:
            return "version_a"

        hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b)
        try:
            rating = self.llm_rater(img, hydrated_prompt)
        except Exception as e:
            print(f"Error: {e}")
            return None
        return rating

    def llm_rater(self, img: Image.Image, prompt: str) -> str:
        """Run the comparison prompt and return the "winner" field of the response.

        Raises:
            ValueError: if the LLM call failed or the response lacks "winner".
        """
        response = self.llm_response_wrapper(
            [img, prompt],
            ComparerSchema
        )
        # llm_response_wrapper returns None on API/parsing errors; raise a
        # clean error here so __call__ can catch it and skip the sample
        # (the old assert raised TypeError on a None response).
        if not response or "winner" not in response:
            raise ValueError(f"Response missing 'winner' key: {response}")
        return response["winner"]

    def llm_response_wrapper(
        self,
        prompt,
        response_schema,
    ):
        """Call Gemini on Vertex AI requesting a JSON response matching *response_schema*.

        Returns the parsed JSON dict, or None on any API or parsing error.
        """
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError:
            print("Hit Gemini rate limit")
            return None
        except Exception as e:
            print(f"Error: {e}")
            return None
162 | 
163 | 
def display_win_rates_table(win_rates: dict):
    """Print a pairwise win/loss table.

    *win_rates* maps method_a -> method_b -> {"win": int, "loss": int}.
    One table row is emitted per (method_a, method_b) pair.
    """
    table = []
    headers = ["Method A", "Method B", "Wins", "Losses", "Win %"]
    for method_a, method_b_dict in win_rates.items():
        for method_b, results in method_b_dict.items():
            wins = results["win"]
            losses = results["loss"]
            total = wins + losses
            # Guard against ZeroDivisionError for pairs with no decided
            # matchups yet (e.g. every comparison errored out).
            win_pct = (wins / total) * 100 if total else 0.0
            table.append([method_a, method_b, wins, losses, win_pct])
    print(tabulate.tabulate(table, headers=headers, tablefmt="pretty"))
173 | 
174 | 
@click.command("Calculate win rates for document conversion methods")
@click.argument("dataset", type=str)
@click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
@click.option("--row_samples", type=int, default=2, help="Number of samples per row")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process")
def main(
    dataset: str,
    methods: str,
    row_samples: int,
    max_rows: int
):
    """For every dataset row, pit each unordered pair of methods against each
    other via the LLM comparer and tally wins/losses per pair."""
    ds = datasets.load_dataset(dataset, split="train")
    method_lst = methods.split(",")
    win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
    comparer = Comparer()
    max_rows = max_rows or len(ds)

    for i in tqdm(range(max_rows), desc="Calculating win rates..."):
        row = ds[i]
        # Shuffle so no method is systematically presented as "version A"
        # (avoids positional bias in the LLM judgment).
        random.shuffle(method_lst)

        # Iterate each unordered pair exactly once (the old code sliced from
        # j and relied on the equality guard to skip the self-pair).
        for j, method_a in enumerate(method_lst[:-1]):
            for method_b in method_lst[j + 1:]:
                if method_a == method_b:
                    continue

                method_a_md = row[f"{method_a}_md"]
                method_b_md = row[f"{method_b}_md"]
                winner = comparer(row["img"], method_a_md, method_b_md)
                # Skip pairs where the comparer could not produce a judgment.
                if not winner:
                    continue

                if winner == "version_a":
                    win_rates[method_a][method_b]["win"] += 1
                    win_rates[method_b][method_a]["loss"] += 1
                else:
                    win_rates[method_b][method_a]["win"] += 1
                    win_rates[method_a][method_b]["loss"] += 1
        # Show intermediate standings every 10 rows.
        if i % 10 == 0:
            display_win_rates_table(win_rates)

    display_win_rates_table(win_rates)


if __name__ == "__main__":
    main()
```

--------------------------------------------------------------------------------
/marker/processors/llm/llm_mathblock.py:
--------------------------------------------------------------------------------

```python
  1 | from concurrent.futures import ThreadPoolExecutor, as_completed
  2 | from typing import List, Tuple, Annotated
  3 | 
  4 | from pydantic import BaseModel
  5 | from tqdm import tqdm
  6 | 
  7 | from marker.output import json_to_html, unwrap_outer_tag
  8 | from marker.processors.llm import BaseLLMComplexBlockProcessor
  9 | 
 10 | from marker.schema import BlockTypes
 11 | from marker.schema.blocks import Block, InlineMath
 12 | from marker.schema.document import Document
 13 | from marker.schema.groups import PageGroup
 14 | 
 15 | 
 16 | class LLMMathBlockProcessor(BaseLLMComplexBlockProcessor):
 17 |     redo_inline_math: Annotated[
 18 |         bool,
 19 |         "If True, the inline math will be re-done, otherwise it will be left as is.",
 20 |     ] = False
 21 |     inlinemath_min_ratio: Annotated[
 22 |         float,
 23 |         "If more than this ratio of blocks are inlinemath blocks, assume everything has math.",
 24 |     ] = 0.4
 25 | 
 26 |     block_types = (BlockTypes.TextInlineMath,)  # Primary block type
 27 |     additional_block_types = (
 28 |         BlockTypes.Text,
 29 |         BlockTypes.Caption,
 30 |         BlockTypes.SectionHeader,
 31 |         BlockTypes.Footnote,
 32 |     )  # Seconday, can also contain math
 33 | 
 34 |     text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
 35 | You will receive an image of a text block and extracted text corresponding to the text in the image.
 36 | Your task is to correct any errors in the extracted text, including math, formatting, and other inaccuracies, and output the corrected block in html format.  Stay as faithful to the text in the image as possible.
 37 | 
 38 | **Instructions:**
 39 | 
 40 | 1. Carefully examine the provided text block image .
 41 | 2. Analyze the text that has been extracted from the block.
 42 | 3. Compare the extracted text to the corresponding text in the image.
 43 | 4. Write a short analysis of the text block, including any errors you see in the extracted text.
 44 | 5. If there are no errors in any of the extracted text, output "No corrections needed".
 45 | 6. Correct any errors in the extracted text, including:
 46 |     * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
 47 |     * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
 48 |     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
 49 |     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 50 |     * Ensure lines wrap properly, and that newlines are not in the middle of sentences.
 51 | 7. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted text unless it is necessary to correct an error.
 52 | 8. Output the corrected text in html format, as shown in the example below.  Only use the p, math, br, a, i, b, sup, sub, and span tags.
 53 | 9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.
 54 | 
 55 | **Example:**
 56 | 
 57 | Input:
 58 | ```html
 59 | Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, 
 60 | is currently the most effective approach for improving the robustness of deep neural networks. For a given neural network f(x, w) 
 61 | with parameters w, the optimization objective of AT can be formulated as follows:
 62 | ```
 63 | 
 64 | Output:
 65 | analysis: The inline math is not in LaTeX format and is not surrounded by <math>...</math> tags.
 66 | ```html
 67 | Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, is currently the most effective approach for improving the robustness of deep neural networks. For a given neural network <math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of AT can be formulated as follows:
 68 | ```
 69 | 
 70 | **Input:**
 71 | ```html
 72 | {extracted_html}
 73 | ```
 74 | """
 75 | 
 76 |     def rewrite_blocks(self, document: Document):
 77 |         if not self.redo_inline_math:
 78 |             return
 79 | 
 80 |         # Get inline math blocks
 81 |         inline_blocks: List[InlineMath] = [
 82 |             (page, block)
 83 |             for page in document.pages
 84 |             for block in page.contained_blocks(document, self.block_types)
 85 |         ]
 86 | 
 87 |         # Get other blocks with detected math in them
 88 |         detected_blocks = [
 89 |             (page, block)
 90 |             for page in document.pages
 91 |             for block in page.contained_blocks(
 92 |                 document,
 93 |                 (
 94 |                     BlockTypes.Text,
 95 |                     BlockTypes.Caption,
 96 |                     BlockTypes.SectionHeader,
 97 |                     BlockTypes.Footnote,
 98 |                     BlockTypes.ListItem,
 99 |                 ),
100 |             )
101 |             if any(
102 |                 [
103 |                     b.formats and "math" in b.formats
104 |                     for b in block.contained_blocks(document, (BlockTypes.Line,))
105 |                 ]
106 |             )
107 |         ]
108 | 
109 |         # If a page has enough math blocks, assume all blocks can contain math
110 |         additional_text_blocks = []
111 |         for page in document.pages:
112 |             # Check for inline math blocks
113 |             page_inlinemath_blocks = [
114 |                 im for im in inline_blocks if im[0].page_id == page.page_id
115 |             ]
116 |             page_detected_blocks = [
117 |                 db for db in detected_blocks if db[0].page_id == page.page_id
118 |             ]
119 |             math_block_count = len(page_inlinemath_blocks) + len(page_detected_blocks)
120 | 
121 |             # Find all potential blocks
122 |             additional_blocks = page.contained_blocks(
123 |                 document, self.additional_block_types + self.block_types
124 |             )
125 | 
126 |             # Check if the ratio of math blocks to additional blocks is high enough
127 |             if (
128 |                 math_block_count / max(1, len(additional_blocks))
129 |                 < self.inlinemath_min_ratio
130 |             ):
131 |                 continue
132 | 
133 |             for b in additional_blocks:
134 |                 if b not in detected_blocks and b not in inline_blocks:
135 |                     additional_text_blocks.append((page, b))
136 | 
137 |         inference_blocks = inline_blocks + detected_blocks + additional_text_blocks
138 | 
139 |         # Don't show progress if there are no blocks to process
140 |         total_blocks = len(inference_blocks)
141 |         if total_blocks == 0:
142 |             return
143 | 
144 |         pbar = tqdm(
145 |             total=total_blocks,
146 |             desc=f"{self.__class__.__name__} running",
147 |             disable=self.disable_tqdm
148 |         )
149 |         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
150 |             for future in as_completed(
151 |                 [
152 |                     executor.submit(self.process_rewriting, document, b[0], b[1])
153 |                     for b in inference_blocks
154 |                 ]
155 |             ):
156 |                 future.result()  # Raise exceptions if any occurred
157 |                 pbar.update(1)
158 | 
159 |         pbar.close()
160 | 
161 |     def get_block_text(self, block: Block, document: Document) -> str:
162 |         html = json_to_html(block.render(document))
163 |         html = unwrap_outer_tag(html)  # Remove an outer p tag if it exists
164 |         return html
165 | 
166 |     def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]:
167 |         text_lines = block.contained_blocks(document, (BlockTypes.Line,))
168 |         extracted_lines = [line.formatted_text(document) for line in text_lines]
169 |         return text_lines, extracted_lines
170 | 
171 |     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
172 |         block_text = self.get_block_text(block, document)
173 |         prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)
174 | 
175 |         image = self.extract_image(document, block)
176 |         response = self.llm_service(prompt, image, block, LLMTextSchema)
177 | 
178 |         if not response or "corrected_html" not in response:
179 |             block.update_metadata(llm_error_count=1)
180 |             return
181 | 
182 |         corrected_html = response["corrected_html"]
183 |         if not corrected_html:
184 |             block.update_metadata(llm_error_count=1)
185 |             return
186 | 
187 |         # Block is fine
188 |         if "no corrections needed" in corrected_html.lower():
189 |             return
190 | 
191 |         if len(corrected_html) < len(block_text) * 0.6:
192 |             block.update_metadata(llm_error_count=1)
193 |             return
194 | 
195 |         block.html = corrected_html
196 | 
197 | 
class LLMTextSchema(BaseModel):
    """Expected JSON response shape for the math-rewriting prompt."""

    # Free-text description of any errors found in the extracted text.
    analysis: str
    # Full corrected block HTML (or "No corrections needed").
    corrected_html: str
201 | 
```

--------------------------------------------------------------------------------
/signatures/version1/cla.json:
--------------------------------------------------------------------------------

```json
  1 | {
  2 |   "signedContributors": [
  3 |     {
  4 |       "name": "korakot",
  5 |       "id": 3155646,
  6 |       "comment_id": 2143359366,
  7 |       "created_at": "2024-06-01T08:25:52Z",
  8 |       "repoId": 712111618,
  9 |       "pullRequestNo": 161
 10 |     },
 11 |     {
 12 |       "name": "tosaddler",
 13 |       "id": 13705399,
 14 |       "comment_id": 2144014410,
 15 |       "created_at": "2024-06-02T20:40:52Z",
 16 |       "repoId": 712111618,
 17 |       "pullRequestNo": 165
 18 |     },
 19 |     {
 20 |       "name": "q2333gh",
 21 |       "id": 32679742,
 22 |       "comment_id": 2156122900,
 23 |       "created_at": "2024-06-08T18:01:39Z",
 24 |       "repoId": 712111618,
 25 |       "pullRequestNo": 176
 26 |     },
 27 |     {
 28 |       "name": "q2333gh",
 29 |       "id": 32679742,
 30 |       "comment_id": 2156614334,
 31 |       "created_at": "2024-06-09T13:48:49Z",
 32 |       "repoId": 712111618,
 33 |       "pullRequestNo": 176
 34 |     },
 35 |     {
 36 |       "name": "aniketinamdar",
 37 |       "id": 79044809,
 38 |       "comment_id": 2157453610,
 39 |       "created_at": "2024-06-10T06:43:39Z",
 40 |       "repoId": 712111618,
 41 |       "pullRequestNo": 179
 42 |     },
 43 |     {
 44 |       "name": "Zxilly",
 45 |       "id": 31370133,
 46 |       "comment_id": 2295163597,
 47 |       "created_at": "2024-08-18T07:44:04Z",
 48 |       "repoId": 712111618,
 49 |       "pullRequestNo": 257
 50 |     },
 51 |     {
 52 |       "name": "conscienceli",
 53 |       "id": 4034943,
 54 |       "comment_id": 2333374932,
 55 |       "created_at": "2024-09-06T06:56:25Z",
 56 |       "repoId": 712111618,
 57 |       "pullRequestNo": 276
 58 |     },
 59 |     {
 60 |       "name": "jcytong",
 61 |       "id": 156466,
 62 |       "comment_id": 2334987739,
 63 |       "created_at": "2024-09-07T01:27:51Z",
 64 |       "repoId": 712111618,
 65 |       "pullRequestNo": 278
 66 |     },
 67 |     {
 68 |       "name": "syldor",
 69 |       "id": 4158062,
 70 |       "comment_id": 2428114896,
 71 |       "created_at": "2024-10-22T03:05:34Z",
 72 |       "repoId": 712111618,
 73 |       "pullRequestNo": 309
 74 |     },
 75 |     {
 76 |       "name": "iammosespaulr",
 77 |       "id": 28682735,
 78 |       "comment_id": 2448054789,
 79 |       "created_at": "2024-10-30T18:36:24Z",
 80 |       "repoId": 712111618,
 81 |       "pullRequestNo": 327
 82 |     },
 83 |     {
 84 |       "name": "tjbck",
 85 |       "id": 25473318,
 86 |       "comment_id": 2454051305,
 87 |       "created_at": "2024-11-04T08:09:46Z",
 88 |       "repoId": 712111618,
 89 |       "pullRequestNo": 332
 90 |     },
 91 |     {
 92 |       "name": "jonaskahn",
 93 |       "id": 4338500,
 94 |       "comment_id": 2516932772,
 95 |       "created_at": "2024-12-04T10:47:00Z",
 96 |       "repoId": 712111618,
 97 |       "pullRequestNo": 413
 98 |     },
 99 |     {
100 |       "name": "Leon-Sander",
101 |       "id": 72946124,
102 |       "comment_id": 2519736628,
103 |       "created_at": "2024-12-05T09:26:56Z",
104 |       "repoId": 712111618,
105 |       "pullRequestNo": 415
106 |     },
107 |     {
108 |       "name": "aguadoenzo",
109 |       "id": 11411723,
110 |       "comment_id": 2520293935,
111 |       "created_at": "2024-12-05T13:13:34Z",
112 |       "repoId": 712111618,
113 |       "pullRequestNo": 416
114 |     },
115 |     {
116 |       "name": "tarun-menta",
117 |       "id": 66506307,
118 |       "comment_id": 2543907406,
119 |       "created_at": "2024-12-15T15:06:32Z",
120 |       "repoId": 712111618,
121 |       "pullRequestNo": 427
122 |     },
123 |     {
124 |       "name": "ZeyuTeng96",
125 |       "id": 96521059,
126 |       "comment_id": 2567236036,
127 |       "created_at": "2025-01-02T02:36:02Z",
128 |       "repoId": 712111618,
129 |       "pullRequestNo": 452
130 |     },
131 |     {
132 |       "name": "xiaoyao9184",
133 |       "id": 6614349,
134 |       "comment_id": 2571623521,
135 |       "created_at": "2025-01-05T13:15:34Z",
136 |       "repoId": 712111618,
137 |       "pullRequestNo": 463
138 |     },
139 |     {
140 |       "name": "yasyf",
141 |       "id": 709645,
142 |       "comment_id": 2571679069,
143 |       "created_at": "2025-01-05T16:23:12Z",
144 |       "repoId": 712111618,
145 |       "pullRequestNo": 464
146 |     },
147 |     {
148 |       "name": "jazzido",
149 |       "id": 27584,
150 |       "comment_id": 2610428000,
151 |       "created_at": "2025-01-23T17:01:02Z",
152 |       "repoId": 712111618,
153 |       "pullRequestNo": 502
154 |     },
155 |     {
156 |       "name": "tagliala",
157 |       "id": 556268,
158 |       "comment_id": 2614522545,
159 |       "created_at": "2025-01-26T17:44:13Z",
160 |       "repoId": 712111618,
161 |       "pullRequestNo": 507
162 |     },
163 |     {
164 |       "name": "conjuncts",
165 |       "id": 67614673,
166 |       "comment_id": 2646947362,
167 |       "created_at": "2025-02-10T05:22:00Z",
168 |       "repoId": 712111618,
169 |       "pullRequestNo": 533
170 |     },
171 |     {
172 |       "name": "sleiss",
173 |       "id": 5084100,
174 |       "comment_id": 2661051259,
175 |       "created_at": "2025-02-15T19:11:32Z",
176 |       "repoId": 712111618,
177 |       "pullRequestNo": 552
178 |     },
179 |     {
180 |       "name": "dantetemplar",
181 |       "id": 69670642,
182 |       "comment_id": 2661665606,
183 |       "created_at": "2025-02-16T23:02:34Z",
184 |       "repoId": 712111618,
185 |       "pullRequestNo": 555
186 |     },
187 |     {
188 |       "name": "vicenciomf2",
189 |       "id": 127889973,
190 |       "comment_id": 2676007412,
191 |       "created_at": "2025-02-22T04:34:27Z",
192 |       "repoId": 712111618,
193 |       "pullRequestNo": 574
194 |     },
195 |     {
196 |       "name": "denisshepelin",
197 |       "id": 143056,
198 |       "comment_id": 2694298068,
199 |       "created_at": "2025-03-03T12:50:11Z",
200 |       "repoId": 712111618,
201 |       "pullRequestNo": 592
202 |     },
203 |     {
204 |       "name": "lecifire",
205 |       "id": 43193080,
206 |       "comment_id": 2735460895,
207 |       "created_at": "2025-03-19T06:25:13Z",
208 |       "repoId": 712111618,
209 |       "pullRequestNo": 609
210 |     },
211 |     {
212 |       "name": "eddiedozier",
213 |       "id": 9558671,
214 |       "comment_id": 2774087436,
215 |       "created_at": "2025-04-03T01:14:41Z",
216 |       "repoId": 712111618,
217 |       "pullRequestNo": 646
218 |     },
219 |     {
220 |       "name": "MauritsBrinkman",
221 |       "id": 153435669,
222 |       "comment_id": 2838830143,
223 |       "created_at": "2025-04-29T13:10:38Z",
224 |       "repoId": 712111618,
225 |       "pullRequestNo": 675
226 |     },
227 |     {
228 |       "name": "jacksontromero",
229 |       "id": 42822986,
230 |       "comment_id": 2927811267,
231 |       "created_at": "2025-06-01T20:10:35Z",
232 |       "repoId": 712111618,
233 |       "pullRequestNo": 721
234 |     },
235 |     {
236 |       "name": "rgeorgi",
237 |       "id": 805862,
238 |       "comment_id": 2968063099,
239 |       "created_at": "2025-06-12T20:23:50Z",
240 |       "repoId": 712111618,
241 |       "pullRequestNo": 751
242 |     },
243 |     {
244 |       "name": "zanussbaum",
245 |       "id": 33707069,
246 |       "comment_id": 3006162493,
247 |       "created_at": "2025-06-25T21:12:25Z",
248 |       "repoId": 712111618,
249 |       "pullRequestNo": 769
250 |     },
251 |     {
252 |       "name": "runarmod",
253 |       "id": 40912933,
254 |       "comment_id": 3007582739,
255 |       "created_at": "2025-06-26T08:06:45Z",
256 |       "repoId": 712111618,
257 |       "pullRequestNo": 772
258 |     },
259 |     {
260 |       "name": "ArnoKlein",
261 |       "id": 64474825,
262 |       "comment_id": 3044904172,
263 |       "created_at": "2025-07-07T12:41:57Z",
264 |       "repoId": 712111618,
265 |       "pullRequestNo": 793
266 |     },
267 |     {
268 |       "name": "voberoi",
269 |       "id": 26339,
270 |       "comment_id": 3054261379,
271 |       "created_at": "2025-07-09T22:17:58Z",
272 |       "repoId": 712111618,
273 |       "pullRequestNo": 796
274 |     },
275 |     {
276 |       "name": "MrWaradana",
277 |       "id": 77272854,
278 |       "comment_id": 3060610542,
279 |       "created_at": "2025-07-11T05:30:30Z",
280 |       "repoId": 712111618,
281 |       "pullRequestNo": 799
282 |     },
283 |     {
284 |       "name": "schen1102",
285 |       "id": 221293366,
286 |       "comment_id": 3083001869,
287 |       "created_at": "2025-07-17T07:50:39Z",
288 |       "repoId": 712111618,
289 |       "pullRequestNo": 810
290 |     },
291 |     {
292 |       "name": "igormf",
293 |       "id": 8070635,
294 |       "comment_id": 3156380190,
295 |       "created_at": "2025-08-05T19:40:17Z",
296 |       "repoId": 712111618,
297 |       "pullRequestNo": 823
298 |     },
299 |     {
300 |       "name": "muratcanlaloglu",
301 |       "id": 71566447,
302 |       "comment_id": 3189653464,
303 |       "created_at": "2025-08-14T19:34:21Z",
304 |       "repoId": 712111618,
305 |       "pullRequestNo": 833
306 |     },
307 |     {
308 |       "name": "SageStack",
309 |       "id": 124574940,
310 |       "comment_id": 3192827940,
311 |       "created_at": "2025-08-15T21:40:27Z",
312 |       "repoId": 712111618,
313 |       "pullRequestNo": 837
314 |     },
315 |     {
316 |       "name": "dowithless",
317 |       "id": 165774507,
318 |       "comment_id": 3192986184,
319 |       "created_at": "2025-08-15T23:20:17Z",
320 |       "repoId": 712111618,
321 |       "pullRequestNo": 839
322 |     },
323 |     {
324 |       "name": "blap",
325 |       "id": 913963,
326 |       "comment_id": 3036951999,
327 |       "created_at": "2025-07-04T17:42:05Z",
328 |       "repoId": 712111618,
329 |       "pullRequestNo": 789
330 |     },
331 |     {
332 |       "name": "sandy0kwon",
333 |       "id": 78377296,
334 |       "comment_id": 3207667429,
335 |       "created_at": "2025-08-20T18:59:07Z",
336 |       "repoId": 712111618,
337 |       "pullRequestNo": 843
338 |     },
339 |     {
340 |       "name": "TwinkleXD",
341 |       "id": 68148987,
342 |       "comment_id": 3218116522,
343 |       "created_at": "2025-08-24T13:49:14Z",
344 |       "repoId": 712111618,
345 |       "pullRequestNo": 849
346 |     },
347 |     {
348 |       "name": "u-ashish",
349 |       "id": 14264791,
350 |       "comment_id": 3221349115,
351 |       "created_at": "2025-08-25T18:41:28Z",
352 |       "repoId": 712111618,
353 |       "pullRequestNo": 850
354 |     },
355 |     {
356 |       "name": "EdmondChuiHW",
357 |       "id": 1967998,
358 |       "comment_id": 3254531992,
359 |       "created_at": "2025-09-04T16:30:48Z",
360 |       "repoId": 712111618,
361 |       "pullRequestNo": 869
362 |     },
363 |     {
364 |       "name": "himanshu-josh",
365 |       "id": 216880840,
366 |       "comment_id": 3311497351,
367 |       "created_at": "2025-09-19T09:48:31Z",
368 |       "repoId": 712111618,
369 |       "pullRequestNo": 891
370 |     },
371 |     {
372 |       "name": "aud",
373 |       "id": 13060190,
374 |       "comment_id": 3331451358,
375 |       "created_at": "2025-09-25T02:21:46Z",
376 |       "repoId": 712111618,
377 |       "pullRequestNo": 895
378 |     },
379 |     {
380 |       "name": "Tenkeboks",
381 |       "id": 198454519,
382 |       "comment_id": 3338897497,
383 |       "created_at": "2025-09-26T14:05:43Z",
384 |       "repoId": 712111618,
385 |       "pullRequestNo": 897
386 |     },
387 |     {
388 |       "name": "ozzzzz",
389 |       "id": 5720160,
390 |       "comment_id": 3363582442,
391 |       "created_at": "2025-10-02T23:18:12Z",
392 |       "repoId": 712111618,
393 |       "pullRequestNo": 905
394 |     },
395 |     {
396 |       "name": "Vidarshana26204",
397 |       "id": 36869394,
398 |       "comment_id": 3394829315,
399 |       "created_at": "2025-10-12T16:26:13Z",
400 |       "repoId": 712111618,
401 |       "pullRequestNo": 916
402 |     }
403 |   ]
404 | }
```

--------------------------------------------------------------------------------
/marker/renderers/markdown.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | from collections import defaultdict
  3 | from typing import Annotated, Tuple
  4 | 
  5 | import regex
  6 | import six
  7 | from bs4 import NavigableString
  8 | from markdownify import MarkdownConverter, re_whitespace
  9 | from marker.logger import get_logger
 10 | from pydantic import BaseModel
 11 | 
 12 | from marker.renderers.html import HTMLRenderer
 13 | from marker.schema import BlockTypes
 14 | from marker.schema.document import Document
 15 | 
 16 | logger = get_logger()
 17 | 
 18 | 
 19 | def escape_dollars(text):
 20 |     return text.replace("$", r"\$")
 21 | 
 22 | 
 23 | def cleanup_text(full_text):
 24 |     full_text = re.sub(r"\n{3,}", "\n\n", full_text)
 25 |     full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text)
 26 |     return full_text.strip()
 27 | 
 28 | 
 29 | def get_formatted_table_text(element):
 30 |     text = []
 31 |     for content in element.contents:
 32 |         if content is None:
 33 |             continue
 34 | 
 35 |         if isinstance(content, NavigableString):
 36 |             stripped = content.strip()
 37 |             if stripped:
 38 |                 text.append(escape_dollars(stripped))
 39 |         elif content.name == "br":
 40 |             text.append("<br>")
 41 |         elif content.name == "math":
 42 |             text.append("$" + content.text + "$")
 43 |         else:
 44 |             content_str = escape_dollars(str(content))
 45 |             text.append(content_str)
 46 | 
 47 |     full_text = ""
 48 |     for i, t in enumerate(text):
 49 |         if t == "<br>":
 50 |             full_text += t
 51 |         elif i > 0 and text[i - 1] != "<br>":
 52 |             full_text += " " + t
 53 |         else:
 54 |             full_text += t
 55 |     return full_text
 56 | 
 57 | 
 58 | class Markdownify(MarkdownConverter):
 59 |     def __init__(
 60 |         self,
 61 |         paginate_output,
 62 |         page_separator,
 63 |         inline_math_delimiters,
 64 |         block_math_delimiters,
 65 |         html_tables_in_markdown,
 66 |         **kwargs,
 67 |     ):
 68 |         super().__init__(**kwargs)
 69 |         self.paginate_output = paginate_output
 70 |         self.page_separator = page_separator
 71 |         self.inline_math_delimiters = inline_math_delimiters
 72 |         self.block_math_delimiters = block_math_delimiters
 73 |         self.html_tables_in_markdown = html_tables_in_markdown
 74 | 
 75 |     def convert_div(self, el, text, parent_tags):
 76 |         is_page = el.has_attr("class") and el["class"][0] == "page"
 77 |         if self.paginate_output and is_page:
 78 |             page_id = el["data-page-id"]
 79 |             pagination_item = (
 80 |                 "\n\n" + "{" + str(page_id) + "}" + self.page_separator + "\n\n"
 81 |             )
 82 |             return pagination_item + text
 83 |         else:
 84 |             return text
 85 | 
 86 |     def convert_p(self, el, text, parent_tags):
 87 |         hyphens = r"-—¬"
 88 |         has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
 89 |         if has_continuation:
 90 |             block_type = BlockTypes[el["block-type"]]
 91 |             if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
 92 |                 if regex.compile(
 93 |                     rf".*[\p{{Ll}}|\d][{hyphens}]\s?$", regex.DOTALL
 94 |                 ).match(text):  # handle hypenation across pages
 95 |                     return regex.split(rf"[{hyphens}]\s?$", text)[0]
 96 |                 return f"{text} "
 97 |             if block_type == BlockTypes.ListGroup:
 98 |                 return f"{text}"
 99 |         return f"{text}\n\n" if text else ""  # default convert_p behavior
100 | 
101 |     def convert_math(self, el, text, parent_tags):
102 |         block = el.has_attr("display") and el["display"] == "block"
103 |         if block:
104 |             return (
105 |                 "\n"
106 |                 + self.block_math_delimiters[0]
107 |                 + text.strip()
108 |                 + self.block_math_delimiters[1]
109 |                 + "\n"
110 |             )
111 |         else:
112 |             return (
113 |                 " "
114 |                 + self.inline_math_delimiters[0]
115 |                 + text.strip()
116 |                 + self.inline_math_delimiters[1]
117 |                 + " "
118 |             )
119 | 
120 |     def convert_table(self, el, text, parent_tags):
121 |         if self.html_tables_in_markdown:
122 |             return "\n\n" + str(el) + "\n\n"
123 | 
124 |         total_rows = len(el.find_all("tr"))
125 |         colspans = []
126 |         rowspan_cols = defaultdict(int)
127 |         for i, row in enumerate(el.find_all("tr")):
128 |             row_cols = rowspan_cols[i]
129 |             for cell in row.find_all(["td", "th"]):
130 |                 colspan = int(cell.get("colspan", 1))
131 |                 row_cols += colspan
132 |                 for r in range(int(cell.get("rowspan", 1)) - 1):
133 |                     rowspan_cols[i + r] += (
134 |                         colspan  # Add the colspan to the next rows, so they get the correct number of columns
135 |                     )
136 |             colspans.append(row_cols)
137 |         total_cols = max(colspans) if colspans else 0
138 | 
139 |         grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
140 | 
141 |         for row_idx, tr in enumerate(el.find_all("tr")):
142 |             col_idx = 0
143 |             for cell in tr.find_all(["td", "th"]):
144 |                 # Skip filled positions
145 |                 while col_idx < total_cols and grid[row_idx][col_idx] is not None:
146 |                     col_idx += 1
147 | 
148 |                 # Fill in grid
149 |                 value = (
150 |                     get_formatted_table_text(cell)
151 |                     .replace("\n", " ")
152 |                     .replace("|", " ")
153 |                     .strip()
154 |                 )
155 |                 rowspan = int(cell.get("rowspan", 1))
156 |                 colspan = int(cell.get("colspan", 1))
157 | 
158 |                 if col_idx >= total_cols:
159 |                     # Skip this cell if we're out of bounds
160 |                     continue
161 | 
162 |                 for r in range(rowspan):
163 |                     for c in range(colspan):
164 |                         try:
165 |                             if r == 0 and c == 0:
166 |                                 grid[row_idx][col_idx] = value
167 |                             else:
168 |                                 grid[row_idx + r][col_idx + c] = (
169 |                                     ""  # Empty cell due to rowspan/colspan
170 |                                 )
171 |                         except IndexError:
172 |                             # Sometimes the colspan/rowspan predictions can overflow
173 |                             logger.info(
174 |                                 f"Overflow in columns: {col_idx + c} >= {total_cols} or rows: {row_idx + r} >= {total_rows}"
175 |                             )
176 |                             continue
177 | 
178 |                 col_idx += colspan
179 | 
180 |         markdown_lines = []
181 |         col_widths = [0] * total_cols
182 |         for row in grid:
183 |             for col_idx, cell in enumerate(row):
184 |                 if cell is not None:
185 |                     col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
186 | 
187 |         def add_header_line():
188 |             markdown_lines.append(
189 |                 "|" + "|".join("-" * (width + 2) for width in col_widths) + "|"
190 |             )
191 | 
192 |         # Generate markdown rows
193 |         added_header = False
194 |         for i, row in enumerate(grid):
195 |             is_empty_line = all(not cell for cell in row)
196 |             if is_empty_line and not added_header:
197 |                 # Skip leading blank lines
198 |                 continue
199 | 
200 |             line = []
201 |             for col_idx, cell in enumerate(row):
202 |                 if cell is None:
203 |                     cell = ""
204 |                 padding = col_widths[col_idx] - len(str(cell))
205 |                 line.append(f" {cell}{' ' * padding} ")
206 |             markdown_lines.append("|" + "|".join(line) + "|")
207 | 
208 |             if not added_header:
209 |                 # Skip empty lines when adding the header row
210 |                 add_header_line()
211 |                 added_header = True
212 | 
213 |         # Handle one row tables
214 |         if total_rows == 1:
215 |             add_header_line()
216 | 
217 |         table_md = "\n".join(markdown_lines)
218 |         return "\n\n" + table_md + "\n\n"
219 | 
220 |     def convert_a(self, el, text, parent_tags):
221 |         text = self.escape(text)
222 |         # Escape brackets and parentheses in text
223 |         text = re.sub(r"([\[\]()])", r"\\\1", text)
224 |         return super().convert_a(el, text, parent_tags)
225 | 
226 |     def convert_span(self, el, text, parent_tags):
227 |         if el.get("id"):
228 |             return f'<span id="{el["id"]}">{text}</span>'
229 |         else:
230 |             return text
231 | 
232 |     def escape(self, text, parent_tags=None):
233 |         text = super().escape(text, parent_tags)
234 |         if self.options["escape_dollars"]:
235 |             text = text.replace("$", r"\$")
236 |         return text
237 | 
238 |     def process_text(self, el, parent_tags=None):
239 |         text = six.text_type(el) or ""
240 | 
241 |         # normalize whitespace if we're not inside a preformatted element
242 |         if not el.find_parent("pre"):
243 |             text = re_whitespace.sub(" ", text)
244 | 
245 |         # escape special characters if we're not inside a preformatted or code element
246 |         if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
247 |             text = self.escape(text)
248 | 
249 |         # remove trailing whitespaces if any of the following condition is true:
250 |         # - current text node is the last node in li
251 |         # - current text node is followed by an embedded list
252 |         if el.parent.name == "li" and (
253 |             not el.next_sibling or el.next_sibling.name in ["ul", "ol"]
254 |         ):
255 |             text = text.rstrip()
256 | 
257 |         return text
258 | 
259 | 
class MarkdownOutput(BaseModel):
    """Result bundle produced by MarkdownRenderer.__call__."""

    # Full markdown text for the document.
    markdown: str
    # Images collected by HTMLRenderer.extract_html during rendering.
    images: dict
    # Document-level metadata from generate_document_metadata.
    metadata: dict
264 | 
265 | 
class MarkdownRenderer(HTMLRenderer):
    """Renderer that converts a document's HTML output into markdown."""

    # Separator inserted after each page marker when pagination is enabled.
    page_separator: Annotated[
        str, "The separator to use between pages.", "Default is '-' * 48."
    ] = "-" * 48
    # (open, close) delimiter pair wrapped around inline math.
    inline_math_delimiters: Annotated[
        Tuple[str, str], "The delimiters to use for inline math."
    ] = ("$", "$")
    # (open, close) delimiter pair wrapped around display (block) math.
    block_math_delimiters: Annotated[
        Tuple[str, str], "The delimiters to use for block math."
    ] = ("$$", "$$")
    # When True, tables are emitted as raw HTML instead of pipe tables.
    html_tables_in_markdown: Annotated[
        bool, "Return tables formatted as HTML, instead of in markdown"
    ] = False

    @property
    def md_cls(self):
        # Builds a fresh Markdownify converter on every access, configured
        # from the renderer's fields plus fixed markdownify options.
        return Markdownify(
            self.paginate_output,
            self.page_separator,
            heading_style="ATX",
            bullets="-",
            escape_misc=False,
            escape_underscores=True,
            escape_asterisks=True,
            escape_dollars=True,
            sub_symbol="<sub>",
            sup_symbol="<sup>",
            inline_math_delimiters=self.inline_math_delimiters,
            block_math_delimiters=self.block_math_delimiters,
            html_tables_in_markdown=self.html_tables_in_markdown
        )

    def __call__(self, document: Document) -> MarkdownOutput:
        """Render *document* to markdown plus extracted images and metadata."""
        document_output = document.render(self.block_config)
        full_html, images = self.extract_html(document, document_output)
        markdown = self.md_cls.convert(full_html)
        markdown = cleanup_text(markdown)

        # Ensure we set the correct blanks for pagination markers
        if self.paginate_output:
            if not markdown.startswith("\n\n"):
                markdown = "\n\n" + markdown
            if markdown.endswith(self.page_separator):
                markdown += "\n\n"

        return MarkdownOutput(
            markdown=markdown,
            images=images,
            metadata=self.generate_document_metadata(document, document_output),
        )
316 | 
```

--------------------------------------------------------------------------------
/marker/processors/llm/llm_page_correction.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | from concurrent.futures import ThreadPoolExecutor, as_completed
  3 | from typing import List, Annotated
  4 | 
  5 | from marker.logger import get_logger
  6 | from marker.processors.llm import BaseLLMComplexBlockProcessor
  7 | from marker.schema import BlockTypes
  8 | from marker.schema.blocks import BlockId
  9 | from marker.schema.document import Document
 10 | from marker.schema.groups import PageGroup
 11 | from pydantic import BaseModel
 12 | from tqdm import tqdm
 13 | 
 14 | logger = get_logger()
 15 | 
# Inline formatting tags the LLM may emit inside any block's HTML.
FORMAT_TAGS = ["b", "i", "u", "del", "math", "sub", "sup", "a", "code", "p", "img"]
# Extra structural tags allowed per block type (empty list = formatting tags only).
BLOCK_MAP = {
    "Text": [],
    "TextInlineMath": [],
    "Table": ["table", "tbody", "tr", "td", "th"],
    "ListGroup": ["ul", "li"],
    "SectionHeader": [],
    "Form": ["form", "input", "select", "textarea", "table", "tbody", "tr", "td", "th"],
    "Figure": [],
    "Picture": [],
    "Code": ["pre"],
    "TableOfContents": ["table", "tbody", "tr", "td", "th"],
}
# Flattened union of all allowed tags; substituted into the page prompt's {{format_tags}}.
ALL_TAGS = FORMAT_TAGS + [tag for tags in BLOCK_MAP.values() for tag in tags]
 30 | 
 31 | 
class LLMPageCorrectionProcessor(BaseLLMComplexBlockProcessor):
    """LLM processor that corrects a whole page at once.

    Sends the page image plus a JSON list of its blocks to the LLM, which may
    reorder blocks, relabel block types, or rewrite block HTML, guided by a
    user-supplied correction prompt.
    """

    # User-supplied instructions guiding the correction pass; None unless configured.
    block_correction_prompt: Annotated[
        str, "The user prompt to guide the block correction process."
    ] = None
    # Conservative default instructions: reformat only, preserve meaning.
    default_user_prompt = """Your goal is to reformat the blocks to be as correct as possible, without changing the underlying meaning of the text within the blocks.  Mostly focus on reformatting the content.  Ignore minor formatting issues like extra <i> tags."""
 37 |     page_prompt = """You're a text correction expert specializing in accurately reproducing text from PDF pages. You will be given a JSON list of blocks on a PDF page, along with the image for that page.  The blocks will be formatted like the example below.  The blocks will be presented in reading order.
 38 | 
 39 | ```json
 40 | [
 41 |     {
 42 |         "bbox": [x1, y1, x2, y2],
 43 |         "id": "/page/0/Text/1",
 44 |         "block_type": "Text",
 45 |         "html": "<p>Some text here</p>",
 46 |     }, ...
 47 | ]
 48 | ```
 49 | 
 50 | You will also be given a prompt from the user that tells you how to correct the blocks.  Your task is to analyze the blocks and the image, then follow the prompt to correct the blocks.
 51 | 
 52 | Here are the types of changes you can make in response to the prompt:
 53 | 
 54 | - Reorder the blocks to reflect the correct reading order.
 55 | - Change the block type to the correct type - the potential types are "SectionHeader", "Form", "Text", "Table", "Figure", "Picture", "ListGroup", "PageFooter", "PageHeader", "Footnote", or "Equation".  In this case, update the html as well to match the new block type.
 56 | - Make edits to block content by changing the HTML.
 57 | 
 58 | Guidelines:
 59 | - Only use the following tags: {{format_tags}}.  Do not use any other tags.  
 60 | - The math tag can have the attribute `display="block"` to indicate display math, the a tag can have the attribute `href="..."` to indicate a link, and td and th tags can have the attribute `colspan="..."` and `rowspan="..."` to indicate table cells that span multiple columns or rows.  There can be a "block-type" attribute on p tags.  Do not use any other attributes.
 61 | - Keep LaTeX formulas inside <math> tags - these are important for downstream processing.
 62 | - Bboxes are normalized 0-1000
 63 | - The order of the JSON list is the reading order for the blocks
 64 | - Follow the user prompt faithfully, and only make additional changes if there is a significant issue with correctness.
 65 | - Stay faithful to the original image, and do not insert any content that is not present in the image or the blocks, unless specifically requested by the user prompt.
 66 | 
 67 | **Instructions:**
 68 | 1. Carefully examine the provided JSON representation of the page, along with the image.
 69 | 2. Analyze the user prompt.
 70 | 3. Identify any issues you'll need to fix, and write a short analysis.
 71 | 4. If everything is fine, output "no_corrections"  Otherwise, output the type of correction needed: ["reorder", "rewrite", "reorder_first"].  Rewrite includes rewriting html and changing the block type.  If you need to do both, then perform only the reordering, and output "reorder_first", so we can do the rewriting later.
 72 | 5. If corrections are needed, output any blocks that need updates:
 73 |     a. If reading order needs to be changed, output the IDs of the blocks in the correct order, and keep block_type and html blank, like this:
 74 |     ```json
 75 |     [
 76 |         {
 77 |             "id": "/page/0/Text/1",
 78 |             "block_type": "",
 79 |             "html": ""
 80 |         },
 81 |         ...
 82 |     ]
 83 | 
 84 |     b. If blocks need to be rewritten, output the block ids and new HTML for the blocks, like this:
 85 |         ```json
 86 |         [
 87 |             {
 88 |                 "id": "/page/0/Text/1",
 89 |                 "block_type": "Text",
 90 |                 "html": "<p>New HTML content here</p>"
 91 |             },
 92 |             ...
 93 |         ]
 94 |         ```
 95 | 
 96 | **Example:**
 97 | Input:
 98 | Blocks
 99 | ```json
100 | [
101 |     {
102 |         "bbox": [x1, y1, x2, y2],
103 |         "id": "/page/0/Text/1",
104 |         "block_type": "Text",
105 |         "html": "1.14 Vector Operations",
106 |     },
107 |     {
108 |         "bbox": [x1, y1, x2, y2],
109 |         "id": "/page/0/Text/2",
110 |         "block_type": "Text",
111 |         "html": "<p>You can perform many operations on a vector, including...</p>",
112 |     },
113 | ]
114 | ```
115 | User Prompt
116 | Ensure that all blocks have the correct labels, and that reading order is correct.
117 | Output:
118 | Analysis: The blocks are in the correct reading order, but the first block should actually be a SectionHeader.
119 | ```json
120 | [
121 |     {
122 |         "id": "/page/0/Text/1",
123 |         "block_type": "SectionHeader",
124 |         "html": "<h1>1.14 Vector Operations</h1>"
125 |     }
126 | ]
127 | ```
128 | 
129 | **Input:**
130 | Blocks
131 | ```json
132 | {{page_json}}
133 | ```
134 | User Prompt
135 | {{user_prompt}}
136 | """
137 | 
138 |     def get_selected_blocks(
139 |         self,
140 |         document: Document,
141 |         page: PageGroup,
142 |     ) -> List[dict]:
143 |         selected_blocks = page.structure_blocks(document)
144 |         json_blocks = [
145 |             self.normalize_block_json(block, document, page)
146 |             for i, block in enumerate(selected_blocks)
147 |         ]
148 |         return json_blocks
149 | 
150 |     def process_rewriting(self, document: Document, page1: PageGroup):
151 |         page_blocks = self.get_selected_blocks(document, page1)
152 |         image = page1.get_image(document, highres=False)
153 | 
154 |         prompt = (
155 |             self.page_prompt.replace("{{page_json}}", json.dumps(page_blocks))
156 |             .replace("{{format_tags}}", json.dumps(ALL_TAGS))
157 |             .replace("{{user_prompt}}", self.block_correction_prompt)
158 |         )
159 |         response = self.llm_service(prompt, image, page1, PageSchema)
160 |         logger.debug(f"Got reponse from LLM: {response}")
161 | 
162 |         if not response or "correction_type" not in response:
163 |             logger.warning("LLM did not return a valid response")
164 |             return
165 | 
166 |         correction_type = response["correction_type"]
167 |         if correction_type == "no_corrections":
168 |             return
169 |         elif correction_type in ["reorder", "reorder_first"]:
170 |             self.load_blocks(response)
171 |             self.handle_reorder(response["blocks"], page1)
172 | 
173 |             # If we needed to reorder first, we will handle the rewriting next
174 |             if correction_type == "reorder_first":
175 |                 self.process_rewriting(document, page1)
176 |         elif correction_type == "rewrite":
177 |             self.load_blocks(response)
178 |             self.handle_rewrites(response["blocks"], document)
179 |         else:
180 |             logger.warning(f"Unknown correction type: {correction_type}")
181 |             return
182 | 
183 |     def load_blocks(self, response):
184 |         if isinstance(response["blocks"], str):
185 |             response["blocks"] = json.loads(response["blocks"])
186 | 
187 |     def handle_reorder(self, blocks: list, page1: PageGroup):
188 |         unique_page_ids = set()
189 |         document_page_ids = [str(page1.page_id)]
190 |         document_pages = [page1]
191 | 
192 |         for block_data in blocks:
193 |             try:
194 |                 page_id, _, _ = block_data["id"].split("/")
195 |                 unique_page_ids.add(page_id)
196 |             except Exception as e:
197 |                 logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
198 |                 continue
199 | 
200 |         if set(document_page_ids) != unique_page_ids:
201 |             logger.debug(
202 |                 "Some page IDs in the response do not match the document's pages"
203 |             )
204 |             return
205 | 
206 |         for page_id, document_page in zip(unique_page_ids, document_pages):
207 |             block_ids_for_page = []
208 |             for block_data in blocks:
209 |                 try:
210 |                     page_id, block_type, block_id = block_data["id"].split("/")
211 |                     block_id = BlockId(
212 |                         page_id=page_id,
213 |                         block_id=block_id,
214 |                         block_type=getattr(BlockTypes, block_type),
215 |                     )
216 |                     block_ids_for_page.append(block_id)
217 |                 except Exception as e:
218 |                     logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
219 |                     continue
220 | 
221 |                 # Both sides should have the same values, just be reordered
222 |                 if not all(
223 |                     [
224 |                         block_id in document_page.structure
225 |                         for block_id in block_ids_for_page
226 |                     ]
227 |                 ):
228 |                     logger.debug(
229 |                         f"Some blocks for page {page_id} not found in document"
230 |                     )
231 |                     continue
232 | 
233 |                 if not all(
234 |                     [
235 |                         block_id in block_ids_for_page
236 |                         for block_id in document_page.structure
237 |                     ]
238 |                 ):
239 |                     logger.debug(
240 |                         f"Some blocks in document page {page_id} not found in response"
241 |                     )
242 |                     continue
243 | 
244 |                 # Swap the order of blocks in the document page
245 |                 document_page.structure = block_ids_for_page
246 | 
247 |     def handle_rewrites(self, blocks: list, document: Document):
248 |         for block_data in blocks:
249 |             try:
250 |                 block_id = block_data["id"].strip().lstrip("/")
251 |                 _, page_id, block_type, block_id = block_id.split("/")
252 |                 block_id = BlockId(
253 |                     page_id=page_id,
254 |                     block_id=block_id,
255 |                     block_type=getattr(BlockTypes, block_type),
256 |                 )
257 |                 block = document.get_block(block_id)
258 |                 if not block:
259 |                     logger.debug(f"Block {block_id} not found in document")
260 |                     continue
261 | 
262 |                 if hasattr(block, "html"):
263 |                     block.html = block_data["html"]
264 |             except Exception as e:
265 |                 logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
266 |                 continue
267 | 
268 |     def rewrite_blocks(self, document: Document):
269 |         if not self.block_correction_prompt:
270 |             return
271 | 
272 |         # Don't show progress if there are no blocks to process
273 |         total_blocks = len(document.pages)
274 |         if total_blocks == 0:
275 |             return
276 | 
277 |         pbar = tqdm(
278 |             total=max(1, total_blocks - 1),
279 |             desc=f"{self.__class__.__name__} running",
280 |             disable=self.disable_tqdm,
281 |         )
282 | 
283 |         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
284 |             for future in as_completed(
285 |                 [
286 |                     executor.submit(self.process_rewriting, document, page)
287 |                     for page in document.pages
288 |                 ]
289 |             ):
290 |                 future.result()  # Raise exceptions if any occurred
291 |                 pbar.update(1)
292 | 
293 |         pbar.close()
294 | 
295 | 
class BlockSchema(BaseModel):
    """Schema for a single corrected block in the LLM page-correction response."""

    id: str
    html: str
    block_type: str
300 | 
301 | 
class PageSchema(BaseModel):
    """Schema for the full LLM page-correction response (analysis + correction payload)."""

    analysis: str
    correction_type: str
    blocks: List[BlockSchema]
306 | 
```

--------------------------------------------------------------------------------
/marker/schema/blocks/base.py:
--------------------------------------------------------------------------------

```python
  1 | from __future__ import annotations
  2 | 
  3 | from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, Tuple
  4 | 
  5 | from pydantic import BaseModel, ConfigDict, field_validator
  6 | from PIL import Image
  7 | 
  8 | from marker.schema import BlockTypes
  9 | from marker.schema.polygon import PolygonBox
 10 | 
 11 | if TYPE_CHECKING:
 12 |     from marker.schema.document import Document
 13 |     from marker.schema.groups.page import PageGroup
 14 | 
 15 | 
 16 | class BlockMetadata(BaseModel):
 17 |     llm_request_count: int = 0
 18 |     llm_error_count: int = 0
 19 |     llm_tokens_used: int = 0
 20 |     previous_text: str = ""
 21 |     previous_type: str = ""
 22 |     previous_order: int = 0
 23 | 
 24 |     def merge(self, model2):
 25 |         return self.__class__(
 26 |             **{
 27 |                 field: getattr(self, field) + getattr(model2, field)
 28 |                 for field in self.model_fields
 29 |             }
 30 |         )
 31 | 
 32 | 
class BlockOutput(BaseModel):
    """Rendered result of a block: html plus geometry, id, children, and section context."""

    html: str
    polygon: PolygonBox
    id: BlockId
    children: List[BlockOutput] | None = None
    section_hierarchy: Dict[int, BlockId] | None = None
 39 | 
 40 | 
 41 | class BlockId(BaseModel):
 42 |     page_id: int
 43 |     block_id: Optional[int] = None
 44 |     block_type: BlockTypes | None = None
 45 | 
 46 |     def __str__(self):
 47 |         if self.block_type is None or self.block_id is None:
 48 |             return f"/page/{self.page_id}"
 49 |         return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"
 50 | 
 51 |     def __hash__(self):
 52 |         return hash(str(self))
 53 | 
 54 |     def __repr__(self):
 55 |         return str(self)
 56 | 
 57 |     def __eq__(self, other):
 58 |         if not isinstance(other, (BlockId, str)):
 59 |             return NotImplemented
 60 | 
 61 |         if isinstance(other, str):
 62 |             return str(self) == other
 63 |         else:
 64 |             return (
 65 |                 self.page_id == other.page_id
 66 |                 and self.block_id == other.block_id
 67 |                 and self.block_type == other.block_type
 68 |             )
 69 | 
 70 |     @field_validator("block_type")
 71 |     @classmethod
 72 |     def validate_block_type(cls, v):
 73 |         from marker.schema import BlockTypes
 74 | 
 75 |         if v not in BlockTypes:
 76 |             raise ValueError(f"Invalid block type: {v}")
 77 |         return v
 78 | 
 79 |     def to_path(self):
 80 |         return str(self).replace("/", "_")
 81 | 
 82 | 
class Block(BaseModel):
    """Base class for every layout element in a document.

    A block owns a polygon on its page and, optionally, a ``structure`` listing
    its child block ids in reading order.  Subclasses customize output via
    ``assemble_html``; the remaining helpers (structure navigation, image
    cropping, rendering, metadata) are shared by all block types.
    """

    polygon: PolygonBox
    block_description: str
    block_type: Optional[BlockTypes] = None
    block_id: Optional[int] = None
    page_id: Optional[int] = None
    text_extraction_method: Optional[Literal["pdftext", "surya", "gemini"]] = None
    structure: List[BlockId] | None = (
        None  # The top-level page structure, which is the block ids in order
    )
    ignore_for_output: bool = False  # Whether this block should be ignored in output
    replace_output_newlines: bool = (
        False  # Whether to replace newlines with spaces in output
    )
    source: Literal["layout", "heuristics", "processor"] = "layout"
    top_k: Optional[Dict[BlockTypes, float]] = None
    metadata: BlockMetadata | None = None
    lowres_image: Image.Image | None = None
    highres_image: Image.Image | None = None
    removed: bool = False  # Has block been replaced by new block?
    _metadata: Optional[dict] = None  # Private scratch dict, see set_internal_metadata

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def id(self) -> BlockId:
        """The BlockId uniquely identifying this block within the document."""
        return BlockId(
            page_id=self.page_id, block_id=self.block_id, block_type=self.block_type
        )

    @classmethod
    def from_block(cls, block: Block) -> Block:
        """Create a block of this class from another block, excluding identity fields."""
        block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
        return cls(**block_attrs)

    def set_internal_metadata(self, key, data):
        """Store a value in the private (non-model) metadata dict."""
        if self._metadata is None:
            self._metadata = {}
        self._metadata[key] = data

    def get_internal_metadata(self, key):
        """Read a value from the private metadata dict; None when unset."""
        if self._metadata is None:
            return None
        return self._metadata.get(key)

    def get_image(
        self,
        document: Document,
        highres: bool = False,
        expansion: Tuple[float, float] | None = None,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image | None:
        """Return this block's image, cropping from the page image when not cached.

        ``expansion`` optionally grows the crop polygon before cropping;
        ``remove_blocks`` is forwarded to the page image renderer.
        """
        image = self.highres_image if highres else self.lowres_image
        if image is None:
            page = document.get_page(self.page_id)
            page_image = page.get_image(highres=highres, remove_blocks=remove_blocks)

            # Scale to the image size
            bbox = self.polygon.rescale(
                (page.polygon.width, page.polygon.height), page_image.size
            )
            if expansion:
                bbox = bbox.expand(*expansion)
            bbox = bbox.bbox
            image = page_image.crop(bbox)
        return image

    def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
        """Resolve this block's structure ids to Block objects (empty list if none)."""
        if self.structure is None:
            return []
        return [document_page.get_block(block_id) for block_id in self.structure]

    def get_prev_block(
        self,
        document_page: Document | PageGroup,
        block: Block,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        """Return the closest block before ``block`` in structure order, skipping ignored types.

        Returns None when ``block`` is first or no earlier block qualifies.
        """
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = self.structure.index(block.id)
        if structure_idx == 0:
            return None

        for prev_block_id in reversed(self.structure[:structure_idx]):
            if prev_block_id.block_type not in ignored_block_types:
                return document_page.get_block(prev_block_id)

    def get_next_block(
        self,
        document_page: Document | PageGroup,
        block: Optional[Block] = None,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        """Return the next block after ``block`` (or the first block when ``block`` is None)."""
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = 0
        if block is not None:
            structure_idx = self.structure.index(block.id) + 1

        for next_block_id in self.structure[structure_idx:]:
            if next_block_id.block_type not in ignored_block_types:
                return document_page.get_block(next_block_id)

        return None  # No valid next block found

    def add_structure(self, block: Block):
        """Append ``block``'s id to this block's structure, creating the list if needed."""
        if self.structure is None:
            self.structure = [block.id]
        else:
            self.structure.append(block.id)

    def update_structure_item(self, old_id: BlockId, new_id: BlockId):
        """Replace the first occurrence of ``old_id`` in the structure with ``new_id``."""
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == old_id:
                    self.structure[i] = new_id
                    break

    def remove_structure_items(self, block_ids: List[BlockId]):
        """Drop every id in ``block_ids`` from this block's structure."""
        if self.structure is not None:
            self.structure = [item for item in self.structure if item not in block_ids]

    def raw_text(self, document: Document) -> str:
        """Recursively concatenate descendant text, adding a newline after each Line.

        Leaf Span/TableCell blocks return their own text directly.
        """
        # Local imports avoid a circular dependency with the text/table modules
        from marker.schema.text.line import Line
        from marker.schema.text.span import Span
        from marker.schema.blocks.tablecell import TableCell

        if self.structure is None:
            if isinstance(self, (Span, TableCell)):
                return self.text
            else:
                return ""

        text = ""
        for block_id in self.structure:
            block = document.get_block(block_id)
            text += block.raw_text(document)
            if isinstance(block, Line) and not text.endswith("\n"):
                text += "\n"
        return text

    def assemble_html(
        self,
        document: Document,
        child_blocks: List[BlockOutput],
        parent_structure: Optional[List[str]] = None,
        block_config: Optional[dict] = None,
    ) -> str:
        """Build this block's html as content-refs to its children; subclasses override."""
        if self.ignore_for_output:
            return ""

        template = ""
        for c in child_blocks:
            template += f"<content-ref src='{c.id}'></content-ref>"

        if self.replace_output_newlines:
            template = template.replace("\n", " ")
            template = "<p>" + template + "</p>"

        return template

    def assign_section_hierarchy(self, section_hierarchy):
        """Update the running section hierarchy when this block is a section header.

        A new heading closes every open section at the same or a deeper level.
        (``heading_level`` is defined on the SectionHeader subclass.)
        """
        if self.block_type == BlockTypes.SectionHeader and self.heading_level:
            levels = list(section_hierarchy.keys())
            for level in levels:
                if level >= self.heading_level:
                    del section_hierarchy[level]
            section_hierarchy[self.heading_level] = self.id

        return section_hierarchy

    def contained_blocks(
        self, document: Document, block_types: Sequence[BlockTypes] = None
    ) -> List[Block]:
        """Recursively collect non-removed descendant blocks, optionally filtered by type."""
        if self.structure is None:
            return []

        blocks = []
        for block_id in self.structure:
            block = document.get_block(block_id)
            if block.removed:
                continue
            if (
                block_types is None or block.block_type in block_types
            ) and not block.removed:
                blocks.append(block)
            blocks += block.contained_blocks(document, block_types)
        return blocks

    def replace_block(self, block: Block, new_block: Block):
        """Swap ``block`` for ``new_block`` in this block's structure (first match only)."""
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == block.id:
                    self.structure[i] = new_block.id
                    break

    def render(
        self,
        document: Document,
        parent_structure: Optional[List[str]] = None,
        section_hierarchy: dict | None = None,
        block_config: Optional[dict] = None,
    ) -> BlockOutput:
        """Recursively render this block and its children into a BlockOutput tree."""
        if block_config is None:
            block_config = {}

        child_content = []
        if section_hierarchy is None:
            section_hierarchy = {}
        section_hierarchy = self.assign_section_hierarchy(section_hierarchy)

        if self.structure is not None and len(self.structure) > 0:
            for block_id in self.structure:
                block = document.get_block(block_id)
                rendered = block.render(
                    document, self.structure, section_hierarchy, block_config
                )
                section_hierarchy = (
                    rendered.section_hierarchy.copy()
                )  # Update the section hierarchy from the peer blocks
                child_content.append(rendered)

        return BlockOutput(
            html=self.assemble_html(
                document, child_content, parent_structure, block_config
            ),
            polygon=self.polygon,
            id=self.id,
            children=child_content,
            section_hierarchy=section_hierarchy,
        )

    def line_height(self, document: Document) -> float:
        """Average height per contained Line block (0 when there are no lines)."""
        lines = self.contained_blocks(document, (BlockTypes.Line,))
        if len(lines) == 0:
            return 0
        return self.polygon.height / len(lines)

    def update_metadata(self, **kwargs):
        """Merge values into this block's metadata: int fields are summed, str fields replaced.

        Raises:
            ValueError: when a value's type does not match the existing field's
                int/str type (the message only mentions integers, but the check
                covers both cases).
        """
        if self.metadata is None:
            self.metadata = BlockMetadata()

        for key, value in kwargs.items():
            metadata_attr = getattr(self.metadata, key)
            if isinstance(metadata_attr, int) and isinstance(value, int):
                setattr(self.metadata, key, metadata_attr + value)
            elif isinstance(metadata_attr, str) and isinstance(value, str):
                setattr(self.metadata, key, value)
            else:
                raise ValueError(f"Metadata attribute {key} is not an integer")

    def handle_html_output(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render Reference children via the base assembler, then append this block's own html."""
        if block_config is None:
            block_config = {}

        child_ref_blocks = [
            block
            for block in child_blocks
            if block.id.block_type == BlockTypes.Reference
        ]
        html = Block.assemble_html(
            self, document, child_ref_blocks, parent_structure, block_config
        )
        return html + self.html
352 | 
```

--------------------------------------------------------------------------------
/marker/processors/llm/llm_table.py:
--------------------------------------------------------------------------------

```python
  1 | from typing import Annotated, List, Tuple
  2 | 
  3 | from bs4 import BeautifulSoup
  4 | from PIL import Image
  5 | from marker.logger import get_logger
  6 | from pydantic import BaseModel
  7 | 
  8 | from marker.processors.llm import BaseLLMComplexBlockProcessor
  9 | from marker.schema import BlockTypes
 10 | from marker.schema.blocks import Block, TableCell, Table
 11 | from marker.schema.document import Document
 12 | from marker.schema.groups.page import PageGroup
 13 | from marker.schema.polygon import PolygonBox
 14 | 
 15 | logger = get_logger()
 16 | 
 17 | 
 18 | class LLMTableProcessor(BaseLLMComplexBlockProcessor):
 19 |     block_types: Annotated[
 20 |         Tuple[BlockTypes],
 21 |         "The block types to process.",
 22 |     ] = (BlockTypes.Table, BlockTypes.TableOfContents)
 23 |     max_rows_per_batch: Annotated[
 24 |         int,
 25 |         "If the table has more rows than this, chunk the table. (LLMs can be inaccurate with a lot of rows)",
 26 |     ] = 60
 27 |     max_table_rows: Annotated[
 28 |         int,
 29 |         "The maximum number of rows in a table to process with the LLM processor.  Beyond this will be skipped.",
 30 |     ] = 175
 31 |     table_image_expansion_ratio: Annotated[
 32 |         float,
 33 |         "The ratio to expand the image by when cropping.",
 34 |     ] = 0
 35 |     rotation_max_wh_ratio: Annotated[
 36 |         float,
 37 |         "The maximum width/height ratio for table cells for a table to be considered rotated.",
 38 |     ] = 0.6
 39 |     max_table_iterations: Annotated[
 40 |         int,
 41 |         "The maximum number of iterations to attempt rewriting a table.",
 42 |     ] = 2
 43 |     table_rewriting_prompt: Annotated[
 44 |         str,
 45 |         "The prompt to use for rewriting text.",
 46 |         "Default is a string containing the Gemini rewriting prompt.",
 47 |     ] = """You are a text correction expert specializing in accurately reproducing text from images.
 48 | You will receive an image and an html representation of the table in the image.
 49 | Your task is to correct any errors in the html representation.  The html representation should be as faithful to the original table image as possible.  The table image may be rotated, but ensure the html representation is not rotated.  Make sure to include HTML for the full table, including the opening and closing table tags.
 50 | 
 51 | Some guidelines:
 52 | - Reproduce the original values from the image as faithfully as possible.  
 53 | - There may be stray characters in the html representation that don't match the image - fix these.
 54 | - Ensure column headers match the correct column values.
 55 | - If you see any inline math in a table cell, fence it with the <math> tag.  Block math should be fenced with <math display="block">.
 56 | - Replace any images in table cells with a description, like "Image: [description]".
 57 | - Only use the tags th, td, tr, br, span, sup, sub, i, b, math, and table.  Only use the attributes display, style, colspan, and rowspan if necessary.  You can use br to break up text lines in cells.
 58 | - Make sure the columns and rows match the image faithfully, and are easily readable and interpretable by a human.
 59 | 
 60 | **Instructions:**
 61 | 1. Carefully examine the provided text block image.
 62 | 2. Analyze the html representation of the table.
 63 | 3. Write a comparison of the image and the html representation, paying special attention to the column headers matching the correct column values.
 64 | 4. If the html representation is completely correct, or you cannot read the image properly, then write "No corrections needed."  If the html representation has errors, generate the corrected html representation.  Output only either the corrected html representation or "No corrections needed."
 65 | 5. If you made corrections, analyze your corrections against the original image, and provide a score from 1-5, indicating how well the corrected html matches the image, with 5 being perfect.
 66 | **Example:**
 67 | Input:
 68 | ```html
 69 | <table>
 70 |     <tr>
 71 |         <th>First Name</th>
 72 |         <th>Last Name</th>
 73 |         <th>Age</th>
 74 |     </tr>
 75 |     <tr>
 76 |         <td>John</td>
 77 |         <td>Doe</td>
 78 |     </tr>
 79 | </table>
 80 | ```
 81 | Output:
 82 | comparison: The image shows a table with 2 rows and 3 columns.  The text and formatting of the html table matches the image.  The column headers match the correct column values.
 83 | ```html
 84 | No corrections needed.
 85 | ```
 86 | analysis: I did not make any corrections, as the html representation was already accurate.
 87 | score: 5
 88 | **Input:**
 89 | ```html
 90 | {block_html}
 91 | ```
 92 | """
 93 | 
 94 |     def handle_image_rotation(self, children: List[TableCell], image: Image.Image):
 95 |         ratios = [c.polygon.width / c.polygon.height for c in children]
 96 |         if len(ratios) < 2:
 97 |             return image
 98 | 
 99 |         is_rotated = all([r < self.rotation_max_wh_ratio for r in ratios])
100 |         if not is_rotated:
101 |             return image
102 | 
103 |         first_col_id = min([c.col_id for c in children])
104 |         first_col = [c for c in children if c.col_id == first_col_id]
105 |         first_col_cell = first_col[0]
106 | 
107 |         last_col_id = max([c.col_id for c in children])
108 |         if last_col_id == first_col_id:
109 |             return image
110 | 
111 |         last_col_cell = [c for c in children if c.col_id == last_col_id][0]
112 |         cell_diff = first_col_cell.polygon.y_start - last_col_cell.polygon.y_start
113 |         if cell_diff == 0:
114 |             return image
115 | 
116 |         if cell_diff > 0:
117 |             return image.rotate(270, expand=True)
118 |         else:
119 |             return image.rotate(90, expand=True)
120 | 
    def process_rewriting(self, document: Document, page: PageGroup, block: Table):
        """Rewrite a table block with the LLM, chunking long tables by row batches.

        On success the block's structure is replaced with the LLM-corrected
        TableCell blocks; on any error (or "no corrections needed") the block
        is left untouched.
        """
        children: List[TableCell] = block.contained_blocks(
            document, (BlockTypes.TableCell,)
        )
        if not children:
            # Happens if table/form processors didn't run
            return

        # LLMs don't handle tables with a lot of rows very well
        unique_rows = set([cell.row_id for cell in children])
        row_count = len(unique_rows)
        row_idxs = sorted(list(unique_rows))

        if row_count > self.max_table_rows:
            return

        # Inference by chunk to handle long tables better
        parsed_cells = []
        row_shift = 0  # offset applied to later batches so row ids stay unique overall
        block_image = self.extract_image(document, block)
        # Table bbox in highres page-image coordinates; batch bboxes are made relative to it
        block_rescaled_bbox = block.polygon.rescale(
            page.polygon.size, page.get_image(highres=True).size
        ).bbox
        for i in range(0, row_count, self.max_rows_per_batch):
            batch_row_idxs = row_idxs[i : i + self.max_rows_per_batch]
            batch_cells = [cell for cell in children if cell.row_id in batch_row_idxs]
            batch_cell_bboxes = [
                cell.polygon.rescale(
                    page.polygon.size, page.get_image(highres=True).size
                ).bbox
                for cell in batch_cells
            ]
            # bbox relative to the block
            batch_bbox = [
                min([bbox[0] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                min([bbox[1] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
                max([bbox[2] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                max([bbox[3] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
            ]
            if i == 0:
                # Ensure first image starts from the beginning
                batch_bbox[0] = 0
                batch_bbox[1] = 0
            elif i > row_count - self.max_rows_per_batch + 1:
                # Ensure final image grabs the entire height and width
                batch_bbox[2] = block_image.size[0]
                batch_bbox[3] = block_image.size[1]

            batch_image = block_image.crop(batch_bbox)
            block_html = block.format_cells(document, [], None, batch_cells)
            batch_image = self.handle_image_rotation(batch_cells, batch_image)
            batch_parsed_cells = self.rewrite_single_chunk(
                page, block, block_html, batch_cells, batch_image
            )
            if batch_parsed_cells is None:
                return  # Error occurred or no corrections needed

            for cell in batch_parsed_cells:
                cell.row_id += row_shift
                parsed_cells.append(cell)
            row_shift += max([cell.row_id for cell in batch_parsed_cells])

        # Swap in the corrected cells as the block's new children
        block.structure = []
        for cell in parsed_cells:
            page.add_full_block(cell)
            block.add_structure(cell)
187 | 
188 |     def rewrite_single_chunk(
189 |         self,
190 |         page: PageGroup,
191 |         block: Block,
192 |         block_html: str,
193 |         children: List[TableCell],
194 |         image: Image.Image,
195 |         total_iterations: int = 0,
196 |     ):
197 |         prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)
198 | 
199 |         response = self.llm_service(prompt, image, block, TableSchema)
200 | 
201 |         if not response or "corrected_html" not in response:
202 |             block.update_metadata(llm_error_count=1)
203 |             return
204 | 
205 |         corrected_html = response["corrected_html"]
206 | 
207 |         # The original table is okay
208 |         if "no corrections needed" in corrected_html.lower():
209 |             return
210 | 
211 |         corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()
212 | 
213 |         # Re-iterate if low score
214 |         total_iterations += 1
215 |         score = response.get("score", 5)
216 |         analysis = response.get("analysis", "")
217 |         logger.debug(f"Got table rewriting score {score} with analysis: {analysis}")
218 |         if total_iterations < self.max_table_iterations and score < 4:
219 |             logger.info(
220 |                 f"Table rewriting low score {score}, on iteration {total_iterations}"
221 |             )
222 |             block_html = corrected_html
223 |             return self.rewrite_single_chunk(
224 |                 page, block, block_html, children, image, total_iterations
225 |             )
226 | 
227 |         parsed_cells = self.parse_html_table(corrected_html, block, page)
228 |         if len(parsed_cells) <= 1:
229 |             block.update_metadata(llm_error_count=1)
230 |             logger.debug(f"Table parsing issue, only {len(parsed_cells)} cells found")
231 |             return
232 | 
233 |         if not corrected_html.endswith("</table>"):
234 |             logger.debug(
235 |                 "Table parsing issue, corrected html does not end with </table>"
236 |             )
237 |             block.update_metadata(llm_error_count=1)
238 |             return
239 | 
240 |         return parsed_cells
241 | 
242 |     @staticmethod
243 |     def get_cell_text(element, keep_tags=("br", "i", "b", "span", "math")) -> str:
244 |         for tag in element.find_all(True):
245 |             if tag.name not in keep_tags:
246 |                 tag.unwrap()
247 |         return element.decode_contents()
248 | 
249 |     def parse_html_table(
250 |         self, html_text: str, block: Block, page: PageGroup
251 |     ) -> List[TableCell]:
252 |         soup = BeautifulSoup(html_text, "html.parser")
253 |         table = soup.find("table")
254 |         if not table:
255 |             return []
256 | 
257 |         # Initialize grid
258 |         rows = table.find_all("tr")
259 |         cells = []
260 | 
261 |         # Find maximum number of columns in colspan-aware way
262 |         max_cols = 0
263 |         for row in rows:
264 |             row_tds = row.find_all(["td", "th"])
265 |             curr_cols = 0
266 |             for cell in row_tds:
267 |                 colspan = int(cell.get("colspan", 1))
268 |                 curr_cols += colspan
269 |             if curr_cols > max_cols:
270 |                 max_cols = curr_cols
271 | 
272 |         grid = [[True] * max_cols for _ in range(len(rows))]
273 | 
274 |         for i, row in enumerate(rows):
275 |             cur_col = 0
276 |             row_cells = row.find_all(["td", "th"])
277 |             for j, cell in enumerate(row_cells):
278 |                 while cur_col < max_cols and not grid[i][cur_col]:
279 |                     cur_col += 1
280 | 
281 |                 if cur_col >= max_cols:
282 |                     logger.info("Table parsing warning: too many columns found")
283 |                     break
284 | 
285 |                 cell_text = self.get_cell_text(cell).strip()
286 |                 rowspan = min(int(cell.get("rowspan", 1)), len(rows) - i)
287 |                 colspan = min(int(cell.get("colspan", 1)), max_cols - cur_col)
288 |                 cell_rows = list(range(i, i + rowspan))
289 |                 cell_cols = list(range(cur_col, cur_col + colspan))
290 | 
291 |                 if colspan == 0 or rowspan == 0:
292 |                     logger.info("Table parsing issue: invalid colspan or rowspan")
293 |                     continue
294 | 
295 |                 for r in cell_rows:
296 |                     for c in cell_cols:
297 |                         grid[r][c] = False
298 | 
299 |                 cell_bbox = [
300 |                     block.polygon.bbox[0] + cur_col,
301 |                     block.polygon.bbox[1] + i,
302 |                     block.polygon.bbox[0] + cur_col + colspan,
303 |                     block.polygon.bbox[1] + i + rowspan,
304 |                 ]
305 |                 cell_polygon = PolygonBox.from_bbox(cell_bbox)
306 | 
307 |                 cell_obj = TableCell(
308 |                     text_lines=[cell_text],
309 |                     row_id=i,
310 |                     col_id=cur_col,
311 |                     rowspan=rowspan,
312 |                     colspan=colspan,
313 |                     is_header=cell.name == "th",
314 |                     polygon=cell_polygon,
315 |                     page_id=page.page_id,
316 |                 )
317 |                 cells.append(cell_obj)
318 |                 cur_col += colspan
319 | 
320 |         return cells
321 | 
322 | 
class TableSchema(BaseModel):
    """Structured LLM response for a table-correction request."""

    # NOTE(review): presumably the model's comparison of the table image vs. the
    # input HTML — not read by the rewrite logic above; confirm against the prompt.
    comparison: str
    # Corrected table HTML, or text containing "no corrections needed" when the
    # original table is already acceptable.
    corrected_html: str
    # Free-text reasoning; logged for debugging alongside the score.
    analysis: str
    # Quality rating; scores below 4 trigger another rewrite iteration.
    score: int
328 | 
```

--------------------------------------------------------------------------------
/marker/schema/groups/page.py:
--------------------------------------------------------------------------------

```python
  1 | from collections import defaultdict
  2 | from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
  3 | import numpy as np
  4 | 
  5 | from PIL import Image, ImageDraw
  6 | 
  7 | from pdftext.schema import Reference
  8 | from pydantic import computed_field
  9 | 
 10 | from marker.providers import ProviderOutput
 11 | from marker.schema import BlockTypes
 12 | from marker.schema.blocks import Block, BlockId, Text
 13 | from marker.schema.blocks.base import BlockMetadata
 14 | from marker.schema.groups.base import Group
 15 | from marker.schema.polygon import PolygonBox
 16 | from marker.util import matrix_intersection_area, sort_text_lines
 17 | 
 18 | LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]]
 19 | 
 20 | 
 21 | class PageGroup(Group):
 22 |     block_type: BlockTypes = BlockTypes.Page
 23 |     # This is bytes if it is serialized
 24 |     lowres_image: Image.Image | None | bytes = None
 25 |     highres_image: Image.Image | None | bytes = None
 26 |     children: List[Union[Any, Block]] | None = None
 27 |     layout_sliced: bool = (
 28 |         False  # Whether the layout model had to slice the image (order may be wrong)
 29 |     )
 30 |     excluded_block_types: Sequence[BlockTypes] = (
 31 |         BlockTypes.Line,
 32 |         BlockTypes.Span,
 33 |     )
 34 |     maximum_assignment_distance: float = 20  # pixels
 35 |     block_description: str = "A single page in the document."
 36 |     refs: List[Reference] | None = None
 37 |     ocr_errors_detected: bool = False
 38 | 
 39 |     def incr_block_id(self):
 40 |         if self.block_id is None:
 41 |             self.block_id = 0
 42 |         else:
 43 |             self.block_id += 1
 44 | 
 45 |     def add_child(self, block: Block):
 46 |         if self.children is None:
 47 |             self.children = [block]
 48 |         else:
 49 |             self.children.append(block)
 50 | 
 51 |     def get_image(
 52 |         self,
 53 |         *args,
 54 |         highres: bool = False,
 55 |         remove_blocks: Sequence[BlockTypes] | None = None,
 56 |         **kwargs,
 57 |     ):
 58 |         image = self.highres_image if highres else self.lowres_image
 59 | 
 60 |         # Check if RGB, convert if needed
 61 |         if isinstance(image, Image.Image) and image.mode != "RGB":
 62 |             image = image.convert("RGB")
 63 | 
 64 |         # Avoid double OCR for certain elements
 65 |         if remove_blocks:
 66 |             image = image.copy()
 67 |             draw = ImageDraw.Draw(image)
 68 |             bad_blocks = [
 69 |                 block
 70 |                 for block in self.current_children
 71 |                 if block.block_type in remove_blocks
 72 |             ]
 73 |             for bad_block in bad_blocks:
 74 |                 poly = bad_block.polygon.rescale(self.polygon.size, image.size).polygon
 75 |                 poly = [(int(p[0]), int(p[1])) for p in poly]
 76 |                 draw.polygon(poly, fill="white")
 77 | 
 78 |         return image
 79 | 
 80 |     @computed_field
 81 |     @property
 82 |     def current_children(self) -> List[Block]:
 83 |         return [child for child in self.children if not child.removed]
 84 | 
 85 |     def get_next_block(
 86 |         self,
 87 |         block: Optional[Block] = None,
 88 |         ignored_block_types: Optional[List[BlockTypes]] = None,
 89 |     ):
 90 |         if ignored_block_types is None:
 91 |             ignored_block_types = []
 92 | 
 93 |         structure_idx = 0
 94 |         if block is not None:
 95 |             structure_idx = self.structure.index(block.id) + 1
 96 | 
 97 |         # Iterate over blocks following the given block
 98 |         for next_block_id in self.structure[structure_idx:]:
 99 |             if next_block_id.block_type not in ignored_block_types:
100 |                 return self.get_block(next_block_id)
101 | 
102 |         return None  # No valid next block found
103 | 
104 |     def get_prev_block(self, block: Block):
105 |         block_idx = self.structure.index(block.id)
106 |         if block_idx > 0:
107 |             return self.get_block(self.structure[block_idx - 1])
108 |         return None
109 | 
110 |     def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block:
111 |         self.incr_block_id()
112 |         block = block_cls(
113 |             polygon=polygon,
114 |             block_id=self.block_id,
115 |             page_id=self.page_id,
116 |         )
117 |         self.add_child(block)
118 |         return block
119 | 
120 |     def add_full_block(self, block: Block) -> Block:
121 |         self.incr_block_id()
122 |         block.block_id = self.block_id
123 |         self.add_child(block)
124 |         return block
125 | 
126 |     def get_block(self, block_id: BlockId) -> Block | None:
127 |         block: Block = self.children[block_id.block_id]
128 |         assert block.block_id == block_id.block_id
129 |         return block
130 | 
131 |     def assemble_html(
132 |         self, document, child_blocks, parent_structure=None, block_config=None
133 |     ):
134 |         template = ""
135 |         for c in child_blocks:
136 |             template += f"<content-ref src='{c.id}'></content-ref>"
137 |         return template
138 | 
139 |     def compute_line_block_intersections(
140 |         self, blocks: List[Block], provider_outputs: List[ProviderOutput]
141 |     ):
142 |         max_intersections = {}
143 | 
144 |         block_bboxes = [block.polygon.bbox for block in blocks]
145 |         line_bboxes = [
146 |             provider_output.line.polygon.bbox for provider_output in provider_outputs
147 |         ]
148 | 
149 |         intersection_matrix = matrix_intersection_area(line_bboxes, block_bboxes)
150 | 
151 |         for line_idx, line in enumerate(provider_outputs):
152 |             intersection_line = intersection_matrix[line_idx]
153 |             if intersection_line.sum() == 0:
154 |                 continue
155 | 
156 |             max_intersection = intersection_line.argmax()
157 |             max_intersections[line_idx] = (
158 |                 intersection_matrix[line_idx, max_intersection],
159 |                 blocks[max_intersection].id,
160 |             )
161 |         return max_intersections
162 | 
163 |     def compute_max_structure_block_intersection_pct(self):
164 |         structure_blocks = [self.get_block(block_id) for block_id in self.structure]
165 |         strucure_block_bboxes = [b.polygon.bbox for b in structure_blocks]
166 | 
167 |         intersection_matrix = matrix_intersection_area(strucure_block_bboxes, strucure_block_bboxes)
168 |         np.fill_diagonal(intersection_matrix, 0)    # Ignore self-intersections
169 | 
170 |         max_intersection_pct = 0
171 |         for block_idx, block in enumerate(structure_blocks):
172 |             if block.polygon.area == 0:
173 |                 continue
174 |             max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
175 | 
176 |         return max_intersection_pct
177 | 
178 |     def replace_block(self, block: Block, new_block: Block):
179 |         # Handles incrementing the id
180 |         self.add_full_block(new_block)
181 | 
182 |         # Replace block id in structure
183 |         super().replace_block(block, new_block)
184 | 
185 |         # Replace block in structure of children
186 |         for child in self.children:
187 |             child.replace_block(block, new_block)
188 | 
189 |         # Mark block as removed
190 |         block.removed = True
191 | 
192 |     def identify_missing_blocks(
193 |         self,
194 |         provider_line_idxs: List[int],
195 |         provider_outputs: List[ProviderOutput],
196 |         assigned_line_idxs: set[int],
197 |     ):
198 |         new_blocks = []
199 |         new_block = None
200 |         for line_idx in provider_line_idxs:
201 |             if line_idx in assigned_line_idxs:
202 |                 continue
203 | 
204 |             # if the unassociated line is a new line with minimal area, we can skip it
205 |             if (
206 |                 provider_outputs[line_idx].line.polygon.area <= 1
207 |                 and provider_outputs[line_idx].raw_text == "\n"
208 |             ):
209 |                 continue
210 | 
211 |             if new_block is None:
212 |                 new_block = [(line_idx, provider_outputs[line_idx])]
213 |             elif all(
214 |                 [
215 |                     new_block[-1][0] + 1 == line_idx,
216 |                     provider_outputs[line_idx].line.polygon.center_distance(
217 |                         new_block[-1][1].line.polygon
218 |                     )
219 |                     < self.maximum_assignment_distance,
220 |                 ]
221 |             ):
222 |                 new_block.append((line_idx, provider_outputs[line_idx]))
223 |             else:
224 |                 new_blocks.append(new_block)
225 |                 new_block = [(line_idx, provider_outputs[line_idx])]
226 |             assigned_line_idxs.add(line_idx)
227 |         if new_block:
228 |             new_blocks.append(new_block)
229 | 
230 |         return new_blocks
231 | 
232 |     def create_missing_blocks(
233 |         self,
234 |         new_blocks: List[LINE_MAPPING_TYPE],
235 |         block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
236 |     ):
237 |         for new_block in new_blocks:
238 |             block = self.add_block(Text, new_block[0][1].line.polygon)
239 |             block.source = "heuristics"
240 |             block_lines[block.id] = new_block
241 | 
242 |             min_dist_idx = None
243 |             min_dist = None
244 |             for existing_block_id in self.structure:
245 |                 existing_block = self.get_block(existing_block_id)
246 |                 if existing_block.block_type in self.excluded_block_types:
247 |                     continue
248 |                 # We want to assign to blocks closer in y than x
249 |                 dist = block.polygon.center_distance(
250 |                     existing_block.polygon, x_weight=5, absolute=True
251 |                 )
252 |                 if dist > 0 and min_dist_idx is None or dist < min_dist:
253 |                     min_dist = dist
254 |                     min_dist_idx = existing_block.id
255 | 
256 |             if min_dist_idx is not None:
257 |                 existing_idx = self.structure.index(min_dist_idx)
258 |                 self.structure.insert(existing_idx + 1, block.id)
259 |             else:
260 |                 self.structure.append(block.id)
261 | 
262 |     def add_initial_blocks(
263 |         self,
264 |         block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
265 |         text_extraction_method: str,
266 |         keep_chars: bool = False,
267 |     ):
268 |         # Add lines to the proper blocks, sorted in order
269 |         for block_id, lines in block_lines.items():
270 |             line_extraction_methods = set(
271 |                 [line[1].line.text_extraction_method for line in lines]
272 |             )
273 |             if len(line_extraction_methods) == 1:
274 |                 lines = sorted(lines, key=lambda x: x[0])
275 |                 lines = [line for _, line in lines]
276 |             else:
277 |                 lines = [line for _, line in lines]
278 |                 line_polygons = [line.line.polygon for line in lines]
279 |                 sorted_line_polygons = sort_text_lines(line_polygons)
280 |                 argsort = [line_polygons.index(p) for p in sorted_line_polygons]
281 |                 lines = [lines[i] for i in argsort]
282 | 
283 |             block = self.get_block(block_id)
284 |             for provider_output in lines:
285 |                 line = provider_output.line
286 |                 spans = provider_output.spans
287 |                 self.add_full_block(line)
288 |                 block.add_structure(line)
289 |                 block.polygon = block.polygon.merge([line.polygon])
290 |                 block.text_extraction_method = text_extraction_method
291 |                 for span_idx, span in enumerate(spans):
292 |                     self.add_full_block(span)
293 |                     line.add_structure(span)
294 | 
295 |                     if not keep_chars:
296 |                         continue
297 | 
298 |                     # Provider doesn't have chars
299 |                     if len(provider_output.chars) == 0:
300 |                         continue
301 | 
302 |                     # Loop through characters associated with the span
303 |                     for char in provider_output.chars[span_idx]:
304 |                         char.page_id = self.page_id
305 |                         self.add_full_block(char)
306 |                         span.add_structure(char)
307 | 
308 |     def merge_blocks(
309 |         self,
310 |         provider_outputs: List[ProviderOutput],
311 |         text_extraction_method: str,
312 |         keep_chars: bool = False,
313 |     ):
314 |         provider_line_idxs = list(range(len(provider_outputs)))
315 |         valid_blocks = [
316 |             block
317 |             for block in self.current_children  # ensure we only look at children that haven't been replaced
318 |             if block.block_type not in self.excluded_block_types
319 |         ]
320 | 
321 |         max_intersections = self.compute_line_block_intersections(
322 |             valid_blocks, provider_outputs
323 |         )
324 | 
325 |         # Try to assign lines by intersection
326 |         assigned_line_idxs = set()
327 |         block_lines = defaultdict(list)
328 |         for line_idx, provider_output in enumerate(provider_outputs):
329 |             if line_idx in max_intersections:
330 |                 block_id = max_intersections[line_idx][1]
331 |                 block_lines[block_id].append((line_idx, provider_output))
332 |                 assigned_line_idxs.add(line_idx)
333 | 
334 |         # If no intersection, assign by distance
335 |         for line_idx in set(provider_line_idxs).difference(assigned_line_idxs):
336 |             min_dist = None
337 |             min_dist_idx = None
338 |             provider_output: ProviderOutput = provider_outputs[line_idx]
339 |             line = provider_output.line
340 |             for block in valid_blocks:
341 |                 # We want to assign to blocks closer in y than x
342 |                 dist = line.polygon.center_distance(block.polygon, x_weight=5)
343 |                 if min_dist_idx is None or dist < min_dist:
344 |                     min_dist = dist
345 |                     min_dist_idx = block.id
346 | 
347 |             if min_dist_idx is not None and min_dist < self.maximum_assignment_distance:
348 |                 block_lines[min_dist_idx].append((line_idx, provider_output))
349 |                 assigned_line_idxs.add(line_idx)
350 | 
351 |         # This creates new blocks to hold anything too far away
352 |         new_blocks = self.identify_missing_blocks(
353 |             provider_line_idxs, provider_outputs, assigned_line_idxs
354 |         )
355 |         self.create_missing_blocks(new_blocks, block_lines)
356 | 
357 |         # Add blocks to the page
358 |         self.add_initial_blocks(block_lines, text_extraction_method, keep_chars)
359 | 
360 |     def aggregate_block_metadata(self) -> BlockMetadata:
361 |         if self.metadata is None:
362 |             self.metadata = BlockMetadata()
363 | 
364 |         for block in self.current_children:
365 |             if block.metadata is not None:
366 |                 self.metadata = self.metadata.merge(block.metadata)
367 |         return self.metadata
368 | 
```
Page 4/9FirstPrevNextLast