This is page 3 of 7. Use http://codebase.md/datalab-to/marker?lines=false&page={x} to view the full context.

# Directory Structure

```
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── breaking-bug-report.md
│   │   ├── feature_request.md
│   │   └── output-bug-report.md
│   └── workflows
│       ├── benchmarks.yml
│       ├── ci.yml
│       ├── cla.yml
│       ├── publish.yml
│       └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│   ├── __init__.py
│   ├── overall
│   │   ├── __init__.py
│   │   ├── display
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   └── table.py
│   │   ├── download
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── llamaparse.py
│   │   │   ├── main.py
│   │   │   ├── mathpix.py
│   │   │   └── mistral.py
│   │   ├── elo.py
│   │   ├── methods
│   │   │   ├── __init__.py
│   │   │   ├── docling.py
│   │   │   ├── gt.py
│   │   │   ├── llamaparse.py
│   │   │   ├── marker.py
│   │   │   ├── mathpix.py
│   │   │   ├── mistral.py
│   │   │   ├── olmocr.py
│   │   │   └── schema.py
│   │   ├── overall.py
│   │   ├── registry.py
│   │   ├── schema.py
│   │   └── scorers
│   │       ├── __init__.py
│   │       ├── clean.py
│   │       ├── heuristic.py
│   │       ├── llm.py
│   │       └── schema.py
│   ├── table
│   │   ├── __init__.py
│   │   ├── gemini.py
│   │   ├── inference.py
│   │   ├── scoring.py
│   │   └── table.py
│   ├── throughput
│   │   ├── __init__.py
│   │   └── main.py
│   └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│   ├── .gitignore
│   ├── examples
│   │   ├── json
│   │   │   ├── multicolcnn.json
│   │   │   ├── switch_trans.json
│   │   │   └── thinkpython.json
│   │   └── markdown
│   │       ├── multicolcnn
│   │       │   ├── _page_1_Figure_0.jpeg
│   │       │   ├── _page_2_Picture_0.jpeg
│   │       │   ├── _page_6_Figure_0.jpeg
│   │       │   ├── _page_7_Figure_0.jpeg
│   │       │   ├── multicolcnn_meta.json
│   │       │   └── multicolcnn.md
│   │       ├── switch_transformers
│   │       │   ├── _page_11_Figure_4.jpeg
│   │       │   ├── _page_12_Figure_4.jpeg
│   │       │   ├── _page_13_Figure_2.jpeg
│   │       │   ├── _page_18_Figure_1.jpeg
│   │       │   ├── _page_18_Figure_3.jpeg
│   │       │   ├── _page_2_Figure_3.jpeg
│   │       │   ├── _page_20_Figure_1.jpeg
│   │       │   ├── _page_20_Figure_4.jpeg
│   │       │   ├── _page_27_Figure_1.jpeg
│   │       │   ├── _page_29_Figure_1.jpeg
│   │       │   ├── _page_30_Figure_1.jpeg
│   │       │   ├── _page_31_Figure_3.jpeg
│   │       │   ├── _page_4_Figure_1.jpeg
│   │       │   ├── _page_5_Figure_3.jpeg
│   │       │   ├── switch_trans_meta.json
│   │       │   └── switch_trans.md
│   │       └── thinkpython
│   │           ├── _page_109_Figure_1.jpeg
│   │           ├── _page_115_Figure_1.jpeg
│   │           ├── _page_116_Figure_3.jpeg
│   │           ├── _page_127_Figure_1.jpeg
│   │           ├── _page_128_Figure_1.jpeg
│   │           ├── _page_167_Figure_1.jpeg
│   │           ├── _page_169_Figure_1.jpeg
│   │           ├── _page_173_Figure_1.jpeg
│   │           ├── _page_190_Figure_1.jpeg
│   │           ├── _page_195_Figure_1.jpeg
│   │           ├── _page_205_Figure_1.jpeg
│   │           ├── _page_23_Figure_1.jpeg
│   │           ├── _page_23_Figure_3.jpeg
│   │           ├── _page_230_Figure_1.jpeg
│   │           ├── _page_233_Figure_1.jpeg
│   │           ├── _page_233_Figure_3.jpeg
│   │           ├── _page_234_Figure_1.jpeg
│   │           ├── _page_235_Figure_1.jpeg
│   │           ├── _page_236_Figure_1.jpeg
│   │           ├── _page_236_Figure_3.jpeg
│   │           ├── _page_237_Figure_1.jpeg
│   │           ├── _page_238_Figure_1.jpeg
│   │           ├── _page_46_Figure_1.jpeg
│   │           ├── _page_60_Figure_1.jpeg
│   │           ├── _page_60_Figure_3.jpeg
│   │           ├── _page_67_Figure_1.jpeg
│   │           ├── _page_71_Figure_1.jpeg
│   │           ├── _page_78_Figure_1.jpeg
│   │           ├── _page_85_Figure_1.jpeg
│   │           ├── _page_94_Figure_1.jpeg
│   │           ├── _page_99_Figure_17.jpeg
│   │           ├── _page_99_Figure_178.jpeg
│   │           ├── thinkpython_meta.json
│   │           └── thinkpython.md
│   ├── images
│   │   ├── overall.png
│   │   ├── per_doc.png
│   │   └── table.png
│   └── latex_to_md.sh
├── examples
│   ├── marker_modal_deployment.py
│   └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│   ├── builders
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── layout.py
│   │   ├── line.py
│   │   ├── ocr.py
│   │   └── structure.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── parser.py
│   │   └── printer.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── extraction.py
│   │   ├── ocr.py
│   │   ├── pdf.py
│   │   └── table.py
│   ├── extractors
│   │   ├── __init__.py
│   │   ├── document.py
│   │   └── page.py
│   ├── logger.py
│   ├── models.py
│   ├── output.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── blank_page.py
│   │   ├── block_relabel.py
│   │   ├── blockquote.py
│   │   ├── code.py
│   │   ├── debug.py
│   │   ├── document_toc.py
│   │   ├── equation.py
│   │   ├── footnote.py
│   │   ├── ignoretext.py
│   │   ├── line_merge.py
│   │   ├── line_numbers.py
│   │   ├── list.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── llm_complex.py
│   │   │   ├── llm_equation.py
│   │   │   ├── llm_form.py
│   │   │   ├── llm_handwriting.py
│   │   │   ├── llm_image_description.py
│   │   │   ├── llm_mathblock.py
│   │   │   ├── llm_meta.py
│   │   │   ├── llm_page_correction.py
│   │   │   ├── llm_sectionheader.py
│   │   │   ├── llm_table_merge.py
│   │   │   └── llm_table.py
│   │   ├── order.py
│   │   ├── page_header.py
│   │   ├── reference.py
│   │   ├── sectionheader.py
│   │   ├── table.py
│   │   ├── text.py
│   │   └── util.py
│   ├── providers
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── epub.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── pdf.py
│   │   ├── powerpoint.py
│   │   ├── registry.py
│   │   ├── spreadsheet.py
│   │   └── utils.py
│   ├── renderers
│   │   ├── __init__.py
│   │   ├── chunk.py
│   │   ├── extraction.py
│   │   ├── html.py
│   │   ├── json.py
│   │   ├── markdown.py
│   │   └── ocr_json.py
│   ├── schema
│   │   ├── __init__.py
│   │   ├── blocks
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── basetable.py
│   │   │   ├── caption.py
│   │   │   ├── code.py
│   │   │   ├── complexregion.py
│   │   │   ├── equation.py
│   │   │   ├── figure.py
│   │   │   ├── footnote.py
│   │   │   ├── form.py
│   │   │   ├── handwriting.py
│   │   │   ├── inlinemath.py
│   │   │   ├── listitem.py
│   │   │   ├── pagefooter.py
│   │   │   ├── pageheader.py
│   │   │   ├── picture.py
│   │   │   ├── reference.py
│   │   │   ├── sectionheader.py
│   │   │   ├── table.py
│   │   │   ├── tablecell.py
│   │   │   ├── text.py
│   │   │   └── toc.py
│   │   ├── document.py
│   │   ├── groups
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── figure.py
│   │   │   ├── list.py
│   │   │   ├── page.py
│   │   │   ├── picture.py
│   │   │   └── table.py
│   │   ├── polygon.py
│   │   ├── registry.py
│   │   └── text
│   │       ├── __init__.py
│   │       ├── char.py
│   │       ├── line.py
│   │       └── span.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── chunk_convert.py
│   │   ├── chunk_convert.sh
│   │   ├── common.py
│   │   ├── convert_single.py
│   │   ├── convert.py
│   │   ├── extraction_app.py
│   │   ├── file_to_s3.py
│   │   ├── run_streamlit_app.py
│   │   ├── server.py
│   │   └── streamlit_app.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── azure_openai.py
│   │   ├── claude.py
│   │   ├── gemini.py
│   │   ├── ollama.py
│   │   ├── openai.py
│   │   └── vertex.py
│   ├── settings.py
│   ├── util.py
│   └── utils
│       ├── __init__.py
│       ├── batch.py
│       ├── gpu.py
│       └── image.py
├── marker_app.py
├── marker_server.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│   └── version1
│       └── cla.json
├── static
│   └── fonts
│       └── .gitignore
└── tests
    ├── builders
    │   ├── test_blank_page.py
    │   ├── test_document_builder.py
    │   ├── test_garbled_pdf.py
    │   ├── test_layout_replace.py
    │   ├── test_ocr_builder.py
    │   ├── test_ocr_pipeline.py
    │   ├── test_overriding.py
    │   ├── test_pdf_links.py
    │   ├── test_rotated_bboxes.py
    │   ├── test_strip_existing_ocr.py
    │   └── test_structure.py
    ├── config
    │   └── test_config.py
    ├── conftest.py
    ├── converters
    │   ├── test_extraction_converter.py
    │   ├── test_ocr_converter.py
    │   ├── test_pdf_converter.py
    │   └── test_table_converter.py
    ├── processors
    │   ├── test_document_toc_processor.py
    │   ├── test_equation_processor.py
    │   ├── test_footnote_processor.py
    │   ├── test_ignoretext.py
    │   ├── test_llm_processors.py
    │   ├── test_table_merge.py
    │   └── test_table_processor.py
    ├── providers
    │   ├── test_document_providers.py
    │   ├── test_image_provider.py
    │   └── test_pdf_provider.py
    ├── renderers
    │   ├── test_chunk_renderer.py
    │   ├── test_extract_images.py
    │   ├── test_html_renderer.py
    │   ├── test_json_renderer.py
    │   └── test_markdown_renderer.py
    ├── schema
    │   └── groups
    │       └── test_list_grouping.py
    ├── services
    │   └── test_service_init.py
    └── utils.py
```

# Files

--------------------------------------------------------------------------------
/marker/config/parser.py:
--------------------------------------------------------------------------------

```python
import json
import os
from typing import Any, Dict

import click

from marker.converters.pdf import PdfConverter
from marker.logger import get_logger
from marker.renderers.chunk import ChunkRenderer
from marker.renderers.html import HTMLRenderer
from marker.renderers.json import JSONRenderer
from marker.renderers.markdown import MarkdownRenderer
from marker.settings import settings
from marker.util import classes_to_strings, parse_range_str, strings_to_classes

logger = get_logger()


class ConfigParser:
    def __init__(self, cli_options: dict):
        self.cli_options = cli_options

    @staticmethod
    def common_options(fn):
        fn = click.option(
            "--output_dir",
            type=click.Path(exists=False),
            required=False,
            default=settings.OUTPUT_DIR,
            help="Directory to save output.",
        )(fn)
        fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
        fn = click.option(
            "--output_format",
            type=click.Choice(["markdown", "json", "html", "chunks"]),
            default="markdown",
            help="Format to output results in.",
        )(fn)
        fn = click.option(
            "--processors",
            type=str,
            default=None,
            help="Comma separated list of processors to use.  Must use full module path.",
        )(fn)
        fn = click.option(
            "--config_json",
            type=str,
            default=None,
            help="Path to JSON file with additional configuration.",
        )(fn)
        fn = click.option(
            "--disable_multiprocessing",
            is_flag=True,
            default=False,
            help="Disable multiprocessing.",
        )(fn)
        fn = click.option(
            "--disable_image_extraction",
            is_flag=True,
            default=False,
            help="Disable image extraction.",
        )(fn)
        # these are options that need a list transformation, i.e. splitting/parsing a string
        fn = click.option(
            "--page_range",
            type=str,
            default=None,
            help="Page range to convert, specify comma separated page numbers or ranges.  Example: 0,5-10,20",
        )(fn)

        # remaining shared options
        fn = click.option(
            "--converter_cls",
            type=str,
            default=None,
            help="Converter class to use.  Defaults to PDF converter.",
        )(fn)
        fn = click.option(
            "--llm_service",
            type=str,
            default=None,
            help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
        )(fn)
        return fn

    def generate_config_dict(self) -> Dict[str, Any]:
        config = {}
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        for k, v in self.cli_options.items():
            if not v:
                continue

            match k:
                case "debug":
                    config["debug_pdf_images"] = True
                    config["debug_layout_images"] = True
                    config["debug_json"] = True
                    config["debug_data_folder"] = output_dir
                case "page_range":
                    config["page_range"] = parse_range_str(v)
                case "config_json":
                    with open(v, "r", encoding="utf-8") as f:
                        config.update(json.load(f))
                case "disable_multiprocessing":
                    config["pdftext_workers"] = 1
                case "disable_image_extraction":
                    config["extract_images"] = False
                case _:
                    config[k] = v

        # Backward compatibility for google_api_key
        if settings.GOOGLE_API_KEY:
            config["gemini_api_key"] = settings.GOOGLE_API_KEY

        return config

    def get_llm_service(self):
        # Only return an LLM service when use_llm is enabled
        if not self.cli_options.get("use_llm", False):
            return None

        service_cls = self.cli_options.get("llm_service", None)
        if service_cls is None:
            service_cls = "marker.services.gemini.GoogleGeminiService"
        return service_cls

    def get_renderer(self):
        match self.cli_options["output_format"]:
            case "json":
                r = JSONRenderer
            case "markdown":
                r = MarkdownRenderer
            case "html":
                r = HTMLRenderer
            case "chunks":
                r = ChunkRenderer
            case _:
                raise ValueError("Invalid output format")
        return classes_to_strings([r])[0]

    def get_processors(self):
        processors = self.cli_options.get("processors", None)
        if processors is not None:
            processors = processors.split(",")
            for p in processors:
                try:
                    strings_to_classes([p])
                except Exception as e:
                    logger.error(f"Error loading processor: {p} with error: {e}")
                    raise

        return processors

    def get_converter_cls(self):
        converter_cls = self.cli_options.get("converter_cls", None)
        if converter_cls is not None:
            try:
                return strings_to_classes([converter_cls])[0]
            except Exception as e:
                logger.error(
                    f"Error loading converter: {converter_cls} with error: {e}"
                )
                raise

        return PdfConverter

    def get_output_folder(self, filepath: str):
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        fname_base = os.path.splitext(os.path.basename(filepath))[0]
        output_dir = os.path.join(output_dir, fname_base)
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def get_base_filename(self, filepath: str):
        basename = os.path.basename(filepath)
        return os.path.splitext(basename)[0]

```
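
For reference, a minimal sketch of how `ConfigParser` is typically wired into a conversion, mirroring the pattern used in `marker/scripts/convert.py` below; the CLI options and input path here are placeholders:

```python
# Minimal usage sketch (not the CLI entry point): wire ConfigParser into a
# converter the same way marker/scripts/convert.py does below. The options
# and input path are placeholders.
from marker.config.parser import ConfigParser
from marker.models import create_model_dict

config_parser = ConfigParser({"output_format": "markdown", "page_range": "0-2"})

converter_cls = config_parser.get_converter_cls()  # PdfConverter by default
converter = converter_cls(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service(),  # None unless use_llm is set
)
rendered = converter("input.pdf")
```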

--------------------------------------------------------------------------------
/marker/builders/layout.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, List

from surya.layout import LayoutPredictor
from surya.layout.schema import LayoutResult, LayoutBox

from marker.builders import BaseBuilder
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.polygon import PolygonBox
from marker.schema.registry import get_block_class
from marker.settings import settings


class LayoutBuilder(BaseBuilder):
    """
    A builder for performing layout detection on PDF pages and merging the results into the document.
    """

    layout_batch_size: Annotated[
        int,
        "The batch size to use for the layout model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    force_layout_block: Annotated[
        str,
        "Skip layout and force every page to be treated as a specific block type.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Disable tqdm progress bars.",
    ] = False
    expand_block_types: Annotated[
        List[BlockTypes],
        "Block types whose bounds should be expanded to accomodate missing regions",
    ] = [
        BlockTypes.Picture,
        BlockTypes.Figure,
        BlockTypes.ComplexRegion,
    ]  # Does not include groups since they are only injected later
    max_expand_frac: Annotated[
        float, "The maximum fraction to expand the layout box bounds by"
    ] = 0.05

    def __init__(self, layout_model: LayoutPredictor, config=None):
        self.layout_model = layout_model

        super().__init__(config)

    def __call__(self, document: Document, provider: PdfProvider):
        if self.force_layout_block is not None:
            # Assign the full content of every page to a single layout type
            layout_results = self.forced_layout(document.pages)
        else:
            layout_results = self.surya_layout(document.pages)
        self.add_blocks_to_pages(document.pages, layout_results)
        self.expand_layout_blocks(document)

    def get_batch_size(self):
        if self.layout_batch_size is not None:
            return self.layout_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 12
        return 6

    def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        layout_results = []
        for page in pages:
            layout_results.append(
                LayoutResult(
                    image_bbox=page.polygon.bbox,
                    bboxes=[
                        LayoutBox(
                            label=self.force_layout_block,
                            position=0,
                            top_k={self.force_layout_block: 1},
                            polygon=page.polygon.polygon,
                        ),
                    ],
                    sliced=False,
                )
            )
        return layout_results

    def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        self.layout_model.disable_tqdm = self.disable_tqdm
        layout_results = self.layout_model(
            [p.get_image(highres=False) for p in pages],
            batch_size=int(self.get_batch_size()),
        )
        return layout_results

    def expand_layout_blocks(self, document: Document):
        for page in document.pages:
            # Collect all blocks on this page as PolygonBox for easy access
            page_blocks = [document.get_block(bid) for bid in page.structure]
            page_size = page.polygon.size

            for block_id in page.structure:
                block = document.get_block(block_id)
                if block.block_type in self.expand_block_types:
                    other_blocks = [b for b in page_blocks if b != block]
                    if not other_blocks:
                        block.polygon = block.polygon.expand(
                            self.max_expand_frac, self.max_expand_frac
                        ).fit_to_bounds((0, 0, *page_size))
                        continue

                    min_gap = min(
                        block.polygon.minimum_gap(other.polygon)
                        for other in other_blocks
                    )
                    if min_gap <= 0:
                        continue

                    x_expand_frac = (
                        min_gap / block.polygon.width if block.polygon.width > 0 else 0
                    )
                    y_expand_frac = (
                        min_gap / block.polygon.height
                        if block.polygon.height > 0
                        else 0
                    )

                    block.polygon = block.polygon.expand(
                        min(self.max_expand_frac, x_expand_frac),
                        min(self.max_expand_frac, y_expand_frac),
                    ).fit_to_bounds((0, 0, *page_size))

    def add_blocks_to_pages(
        self, pages: List[PageGroup], layout_results: List[LayoutResult]
    ):
        for page, layout_result in zip(pages, layout_results):
            layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
            provider_page_size = page.polygon.size
            page.layout_sliced = (
                layout_result.sliced
            )  # This indicates if the page was sliced by the layout model
            for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                block_cls = get_block_class(BlockTypes[bbox.label])
                layout_block = page.add_block(
                    block_cls, PolygonBox(polygon=bbox.polygon)
                )
                layout_block.polygon = layout_block.polygon.rescale(
                    layout_page_size, provider_page_size
                ).fit_to_bounds((0, 0, *provider_page_size))
                layout_block.top_k = {
                    BlockTypes[label]: prob
                    for (label, prob) in bbox.top_k.items()
                    if label in BlockTypes.__members__
                }
                page.add_structure(layout_block)

            # Ensure page has non-empty structure
            if page.structure is None:
                page.structure = []

            # Ensure page has non-empty children
            if page.children is None:
                page.children = []

```
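
To make the expansion math in `expand_layout_blocks` concrete: a block grows by at most `max_expand_frac` per axis, and by less when its nearest neighbor is closer than that. A standalone sketch of the clamping, independent of the marker classes:

```python
# Standalone sketch of the clamping in expand_layout_blocks: a block expands
# by min_gap (as a fraction of its own size) per axis, capped at
# max_expand_frac, so it never grows into its nearest neighbor.
def expansion_fracs(width: float, height: float, min_gap: float, max_expand_frac: float = 0.05):
    x_frac = min_gap / width if width > 0 else 0
    y_frac = min_gap / height if height > 0 else 0
    return min(max_expand_frac, x_frac), min(max_expand_frac, y_frac)

# A 200x100 figure whose nearest neighbor is 4 units away expands by 2%
# horizontally and 4% vertically, both under the 5% cap.
assert expansion_fracs(200, 100, 4) == (0.02, 0.04)
```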

--------------------------------------------------------------------------------
/marker/scripts/convert.py:
--------------------------------------------------------------------------------

```python
import atexit
import os
import time

import psutil
import torch

from marker.utils.batch import get_batch_sizes_worker_counts

# Ensure threads don't contend
os.environ["MKL_DYNAMIC"] = "FALSE"
os.environ["OMP_DYNAMIC"] = "FALSE"
os.environ["OMP_NUM_THREADS"] = "2"  # Avoid OpenMP issues with multiprocessing
os.environ["OPENBLAS_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "2"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
    "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
)
os.environ["IN_STREAMLIT"] = "true"  # Avoid multiprocessing inside surya

import math
import traceback

import click
import torch.multiprocessing as mp
from tqdm import tqdm
import gc

from marker.config.parser import ConfigParser
from marker.config.printer import CustomClickPrinter
from marker.logger import configure_logging, get_logger
from marker.models import create_model_dict
from marker.output import output_exists, save_output
from marker.utils.gpu import GPUManager

configure_logging()
logger = get_logger()


def worker_init():
    model_dict = create_model_dict()

    global model_refs
    model_refs = model_dict

    # Ensure we clean up the model references on exit
    atexit.register(worker_exit)


def worker_exit():
    global model_refs
    try:
        del model_refs
    except Exception:
        pass


def process_single_pdf(args):
    page_count = 0
    fpath, cli_options = args
    torch.set_num_threads(cli_options["total_torch_threads"])
    del cli_options["total_torch_threads"]

    config_parser = ConfigParser(cli_options)

    out_folder = config_parser.get_output_folder(fpath)
    base_name = config_parser.get_base_filename(fpath)
    if cli_options.get("skip_existing") and output_exists(out_folder, base_name):
        return page_count

    converter_cls = config_parser.get_converter_cls()
    config_dict = config_parser.generate_config_dict()
    config_dict["disable_tqdm"] = True

    try:
        if cli_options.get("debug_print"):
            logger.debug(f"Converting {fpath}")
        converter = converter_cls(
            config=config_dict,
            artifact_dict=model_refs,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(fpath)
        out_folder = config_parser.get_output_folder(fpath)
        save_output(rendered, out_folder, base_name)
        page_count = converter.page_count

        if cli_options.get("debug_print"):
            logger.debug(f"Converted {fpath}")
        del rendered
        del converter
    except Exception as e:
        logger.error(f"Error converting {fpath}: {e}")
        traceback.print_exc()
    finally:
        gc.collect()

    return page_count


@click.command(cls=CustomClickPrinter)
@click.argument("in_folder", type=str)
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option(
    "--num_chunks",
    type=int,
    default=1,
    help="Number of chunks being processed in parallel",
)
@click.option(
    "--max_files", type=int, default=None, help="Maximum number of pdfs to convert"
)
@click.option(
    "--skip_existing",
    is_flag=True,
    default=False,
    help="Skip existing converted files.",
)
@click.option(
    "--debug_print", is_flag=True, default=False, help="Print debug information."
)
@click.option(
    "--max_tasks_per_worker",
    type=int,
    default=10,
    help="Maximum number of tasks per worker process before recycling.",
)
@click.option(
    "--workers",
    type=int,
    default=None,
    help="Number of worker processes to use.  Set automatically by default, but can be overridden.",
)
@ConfigParser.common_options
def convert_cli(in_folder: str, **kwargs):
    total_pages = 0
    in_folder = os.path.abspath(in_folder)
    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
    files = [f for f in files if os.path.isfile(f)]

    # Handle chunks if we're processing in parallel
    # Ensure we get all files into a chunk
    chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
    start_idx = kwargs["chunk_idx"] * chunk_size
    end_idx = start_idx + chunk_size
    files_to_convert = files[start_idx:end_idx]

    # Limit files converted if needed
    if kwargs["max_files"]:
        files_to_convert = files_to_convert[: kwargs["max_files"]]

    # Disable nested multiprocessing
    kwargs["disable_multiprocessing"] = True

    try:
        mp.set_start_method("spawn")  # Required for CUDA, forkserver doesn't work
    except RuntimeError:
        raise RuntimeError(
            "Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again."
        )

    chunk_idx = kwargs["chunk_idx"]

    # Use GPU context manager for automatic setup/cleanup
    with GPUManager(chunk_idx) as gpu_manager:
        batch_sizes, workers = get_batch_sizes_worker_counts(gpu_manager, 7)

        # Override workers if specified
        if kwargs["workers"] is not None:
            workers = kwargs["workers"]

        # Set proper batch sizes and thread counts
        total_processes = max(1, min(len(files_to_convert), workers))
        kwargs["total_torch_threads"] = max(
            2, psutil.cpu_count(logical=False) // total_processes
        )
        kwargs.update(batch_sizes)

        logger.info(
            f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}"
        )
        task_args = [(f, kwargs) for f in files_to_convert]

        start_time = time.time()
        with mp.Pool(
            processes=total_processes,
            initializer=worker_init,
            maxtasksperchild=kwargs["max_tasks_per_worker"],
        ) as pool:
            pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf")
            for page_count in pool.imap_unordered(process_single_pdf, task_args):
                pbar.update(1)
                total_pages += page_count
            pbar.close()

        total_time = time.time() - start_time
        print(
            f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}"
        )

```
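
The chunking arithmetic in `convert_cli` deserves a note: `math.ceil` guarantees that `num_chunks` slices cover every file, with only the final chunk running short. A standalone sketch:

```python
import math

# Standalone sketch of the file-chunking in convert_cli: ceil division
# guarantees every file lands in exactly one chunk.
def chunk_files(files: list, chunk_idx: int, num_chunks: int) -> list:
    chunk_size = math.ceil(len(files) / num_chunks)
    start_idx = chunk_idx * chunk_size
    return files[start_idx : start_idx + chunk_size]

files = [f"doc_{i}.pdf" for i in range(10)]
assert chunk_files(files, 0, 3) == files[0:4]   # ceil(10 / 3) = 4 files per chunk
assert chunk_files(files, 2, 3) == files[8:10]  # the last chunk takes the remainder
```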

--------------------------------------------------------------------------------
/marker/processors/llm/__init__.py:
--------------------------------------------------------------------------------

```python
import json
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Annotated, TypedDict, List, Sequence

from pydantic import BaseModel
from tqdm import tqdm
from PIL import Image

from marker.output import json_to_html
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockId
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from marker.services import BaseService
from marker.util import assign_config
from marker.logger import get_logger

logger = get_logger()


class PromptData(TypedDict):
    prompt: str
    image: Image.Image
    block: Block
    schema: BaseModel
    page: PageGroup
    additional_data: dict | None


class BlockData(TypedDict):
    page: PageGroup
    block: Block


class BaseLLMProcessor(BaseProcessor):
    """
    A processor for using LLMs to convert blocks.
    """

    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0.01
    use_llm: Annotated[
        bool,
        "Whether to use the LLM model.",
    ] = False
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    block_types = None

    def __init__(self, llm_service: BaseService, config=None):
        super().__init__(config)

        self.llm_service = None
        if not self.use_llm:
            return

        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        image_block: Block,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image:
        return image_block.get_image(
            document,
            highres=True,
            expansion=(self.image_expansion_ratio, self.image_expansion_ratio),
            remove_blocks=remove_blocks,
        )

    def normalize_block_json(self, block: Block, document: Document, page: PageGroup):
        """
        Get the normalized JSON representation of a block for the LLM.
        """
        page_width = page.polygon.width
        page_height = page.polygon.height
        block_bbox = block.polygon.bbox

        # Normalize bbox to 0-1000 range
        normalized_bbox = [
            (block_bbox[0] / page_width) * 1000,
            (block_bbox[1] / page_height) * 1000,
            (block_bbox[2] / page_width) * 1000,
            (block_bbox[3] / page_height) * 1000,
        ]

        block_json = {
            "id": str(block.id),
            "block_type": str(block.id.block_type),
            "bbox": normalized_bbox,
            "html": json_to_html(block.render(document)),
        }

        return block_json

    def load_blocks(self, response: dict):
        return [json.loads(block) for block in response["blocks"]]

    def handle_rewrites(self, blocks: list, document: Document):
        for block_data in blocks:
            try:
                block_id = block_data["id"].strip().lstrip("/")
                _, page_id, block_type, block_id = block_id.split("/")
                block_id = BlockId(
                    page_id=page_id,
                    block_id=block_id,
                    block_type=getattr(BlockTypes, block_type),
                )
                block = document.get_block(block_id)
                if not block:
                    logger.debug(f"Block {block_id} not found in document")
                    continue

                if hasattr(block, "html"):
                    block.html = block_data["html"]
            except Exception as e:
                logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                continue


class BaseLLMComplexBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert blocks with more complex logic.
    """

    def __call__(self, document: Document):
        if not self.use_llm or self.llm_service is None:
            return

        try:
            self.rewrite_blocks(document)
        except Exception as e:
            logger.warning(f"Error rewriting blocks in {self.__class__.__name__}: {e}")

    def process_rewriting(self, document: Document, page: PageGroup, block: Block):
        raise NotImplementedError()

    def rewrite_blocks(self, document: Document):
        # Don't show progress if there are no blocks to process
        total_blocks = sum(
            len(page.contained_blocks(document, self.block_types))
            for page in document.pages
        )
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=total_blocks,
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm
        )
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in as_completed(
                [
                    executor.submit(self.process_rewriting, document, page, block)
                    for page in document.pages
                    for block in page.contained_blocks(document, self.block_types)
                ]
            ):
                future.result()  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()


class BaseLLMSimpleBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert single blocks.
    """

    # Override init since we don't need an llm service here
    def __init__(self, config=None):
        assign_config(self, config)

    def __call__(self, result: dict, prompt_data: PromptData, document: Document):
        try:
            self.rewrite_block(result, prompt_data, document)
        except Exception as e:
            logger.warning(f"Error rewriting block in {self.__class__.__name__}: {e}")
            traceback.print_exc()

    def inference_blocks(self, document: Document) -> List[BlockData]:
        blocks = []
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                blocks.append({"page": page, "block": block})
        return blocks

    def block_prompts(self, document: Document) -> List[PromptData]:
        raise NotImplementedError()

    def rewrite_block(
        self, response: dict, prompt_data: PromptData, document: Document
    ):
        raise NotImplementedError()

```
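
A minimal sketch of what a `BaseLLMComplexBlockProcessor` subclass looks like; the block type, prompt handling, and service call here are illustrative placeholders, not an actual marker processor:

```python
# Illustrative sketch only: a BaseLLMComplexBlockProcessor subclass that
# targets Text blocks. The service call is left as a comment because its
# shape depends on the concrete BaseService implementation.
from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.schema.groups import PageGroup


class ExampleRewriteProcessor(BaseLLMComplexBlockProcessor):
    block_types = (BlockTypes.Text,)

    def process_rewriting(self, document: Document, page: PageGroup, block: Block):
        image = self.extract_image(document, block)
        block_json = self.normalize_block_json(block, document, page)
        # Build a prompt from block_json, call self.llm_service with it and
        # the cropped image, then write corrected HTML back with
        # self.handle_rewrites([...], document).
```

Instantiation follows the base class: `ExampleRewriteProcessor(llm_service, {"use_llm": True})`.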

--------------------------------------------------------------------------------
/benchmarks/overall/scorers/llm.py:
--------------------------------------------------------------------------------

```python
import json
import os
import tempfile
import time
from typing import List

from PIL import Image
from google.genai.errors import APIError
from google import genai
import pypdfium2 as pdfium

from benchmarks.overall.scorers import BaseScorer, BlockScores
from marker.settings import settings

rating_prompt = """
You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
You're given an image, along with the extracted markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.

The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.

**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the markdown input provided.
3. Compare the image to the markdown representation.  Does the markdown representation properly represent the important text and formatting in the image?
4. Assign component scores, as described below.

These are the primary scores:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.

Depending on which elements are present in the markdown, you will assign element-specific scores.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have been extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Section headers - if all of the section headers have been detected, and the right levels set.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.

Notes on scoring:
- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay).  It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings.  If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.

If text that is important to the meaning of the document is missing, do not score higher than 3/5.

Output json, like in the example below.

**Example**
Input
```markdown
# Section 1
This is some *markdown* extracted from a document.  Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Output
```json
{
    "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
    "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
    "comparison": "The text and formatting matches the image.  There are no formatting or text extraction issues.  The equations and section headers are correct.",
    "overall": 5,
    "text": 5,
    "formatting": 5,
    "section_headers": 5,
	"tables": 0,
	"forms": 0,
    "equations": 5,
	"lists": 0,
	"images": 0
}
```
**Input**
```markdown
{{markdown}}
```
**Output**
"""

comparison_keys = ["comparison"]
description_keys = ["image_description", "markdown_description"]
text_keys = comparison_keys + description_keys
score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
            "lists", "images"]


class LLMScorer(BaseScorer):
    def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
        pdf_bytes = sample["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf_bytes)
            f.flush()
            f.seek(0)
            doc = pdfium.PdfDocument(f.name)
            img = doc[0].render(scale=96/72).to_pil()
            doc.close()

        return self.llm_rater(img, markdown)


    def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
        if not markdown:
            null_scores = {k: 1 for k in score_keys}
            text_scores = {k: "" for k in text_keys}
            null_scores.update(text_scores)
            return {
                "score": 1,
                "specific_scores": null_scores
            }
        req_keys = text_keys + score_keys
        properties = {}
        for key in req_keys:
            content_type = "INTEGER" if key in score_keys else "STRING"
            properties[key] = {"type": content_type}

        response_schema = {
            "required": req_keys,
            "properties": properties,
            "type": "OBJECT"
        }
        prompt = rating_prompt.replace("{{markdown}}", markdown)
        response = self.llm_response_wrapper([img, prompt], response_schema)
        assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
        return {
            "score": response["overall"],
            "specific_scores": response,
        }

    def llm_response_wrapper(self, prompt, response_schema, depth=0):
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError as e:
            if depth > 2:
                raise e
            print("Hit Gemini rate limit, waiting 120 seconds")
            time.sleep(120)
            return self.llm_response_wrapper(prompt, response_schema, depth + 1)
```
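
A usage sketch, assuming `BaseScorer` takes no constructor arguments and that Vertex AI credentials (`VERTEX_PROJECT_ID`, `VERTEX_LOCATION`) are configured in the environment; the PDF path is a placeholder:

```python
# Usage sketch, assuming BaseScorer needs no constructor arguments and that
# Vertex AI credentials are configured via the environment. The PDF path is
# a placeholder.
from benchmarks.overall.scorers.llm import LLMScorer

scorer = LLMScorer()
with open("sample.pdf", "rb") as f:
    sample = {"pdf": f.read()}

scores = scorer(sample, gt_markdown=[], markdown="# Section 1\nSome extracted text.")
print(scores["score"], scores["specific_scores"]["formatting"])
```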

--------------------------------------------------------------------------------
/tests/processors/test_llm_processors.py:
--------------------------------------------------------------------------------

```python
from unittest.mock import MagicMock, Mock

import pytest
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
from marker.processors.llm.llm_equation import LLMEquationProcessor

from marker.processors.llm.llm_form import LLMFormProcessor
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.table import TableProcessor
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import ComplexRegion


@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_config(pdf_document, llm_service):
    processor_lst = [LLMFormProcessor()]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service)
    processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None


@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_cells(pdf_document, llm_service):
    config = {"use_llm": True, "gemini_api_key": "test"}
    processor_lst = [LLMFormProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config)
    processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None


@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    corrected_html = "<em>This is corrected markdown.</em>\n" * 100
    corrected_html = "<p>" + corrected_html.strip() + "</p>\n"

    mock_cls = Mock()
    mock_cls.return_value = {"corrected_html": corrected_html}

    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    config = {"use_llm": True, "gemini_api_key": "test"}
    processor_lst = [LLMFormProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html == corrected_html.strip()



@pytest.mark.filename("table_ex2.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    corrected_html = """
<table>
    <tr>
        <td>Column 1</td>
        <td>Column 2</td>
        <td>Column 3</td>
        <td>Column 4</td>
    </tr>
    <tr>
        <td>Value 1 <math>x</math></td>
        <td>Value 2</td>
        <td>Value 3</td>
        <td>Value 4</td>
    </tr>
    <tr>
        <td>Value 5</td>
        <td>Value 6</td>
        <td>Value 7</td>
        <td>Value 8</td>
    </tr>
</table>
    """.strip()

    mock_cls = Mock()
    mock_cls.return_value = {"corrected_html": corrected_html}

    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
    processor(pdf_document)

    tables = pdf_document.contained_blocks((BlockTypes.Table,))
    table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,))
    assert table_cells[0].text == "Column 1"

    markdown = MarkdownRenderer()(pdf_document).markdown
    assert "Value 1 $x$" in markdown


@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor_disabled(pdf_document):
    config = {"use_llm": True, "gemini_api_key": "test"}
    mock_cls = MagicMock()
    processor_lst = [LLMImageDescriptionProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description is None for picture in contained_pictures)

@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor(pdf_document):
    description = "This is an image description."
    mock_cls = Mock()
    mock_cls.return_value = {"image_description": description}

    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False}
    processor_lst = [LLMImageDescriptionProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in contained_pictures)

    # Ensure the rendering includes the description
    renderer = MarkdownRenderer({"extract_images": False})
    md = renderer(pdf_document).markdown

    assert description in md


@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_complex_region_processor(pdf_document):
    md = "This is some *markdown* for a complex region."
    mock_cls = Mock()
    mock_cls.return_value = {"corrected_markdown": md * 25}

    # Replace the block with a complex region
    old_block = pdf_document.pages[0].children[0]
    new_block = ComplexRegion(
        **old_block.dict(exclude=["id", "block_id", "block_type"]),
    )
    pdf_document.pages[0].replace_block(old_block, new_block)

    # Test processor
    config = {"use_llm": True, "gemini_api_key": "test"}
    processor_lst = [LLMComplexRegionProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    # Ensure the rendering includes the description
    renderer = MarkdownRenderer()
    rendered_md = renderer(pdf_document).markdown

    assert md in rendered_md

@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [0]})
def test_multi_llm_processors(pdf_document):
    description = "<math>This is an image description.  And here is a lot of writing about it.</math>" * 10
    mock_cls = Mock()
    mock_cls.return_value = {"image_description": description, "corrected_equation": description}

    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
    processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in contained_pictures)

    contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,))
    print([equation.html for equation in contained_equations])
    assert all(equation.html == description for equation in contained_equations)
```

--------------------------------------------------------------------------------
/benchmarks/table/inference.py:
--------------------------------------------------------------------------------

```python
from typing import List

import numpy as np
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
from tqdm import tqdm
import base64
import tempfile

from benchmarks.table.gemini import gemini_table_rec
from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.table import TableProcessor
from marker.renderers.json import JSONBlockOutput
from marker.schema.polygon import PolygonBox
from marker.util import matrix_intersection_area


def extract_tables(children: List[JSONBlockOutput]):
    tables = []
    for child in children:
        if child.block_type == 'Table':
            tables.append(child)
        elif child.children:
            tables.extend(extract_tables(child.children))
    return tables

def fix_table_html(table_html: str) -> str:
    marker_table_soup = BeautifulSoup(table_html, 'html.parser')
    tbody = marker_table_soup.find('tbody')
    if tbody:
        tbody.unwrap()
    for th_tag in marker_table_soup.find_all('th'):
        th_tag.name = 'td'
    for br_tag in marker_table_soup.find_all('br'):
        br_tag.replace_with(marker_table_soup.new_string(''))

    marker_table_html = str(marker_table_soup)
    marker_table_html = marker_table_html.replace("\n", " ")  # Fintabnet uses spaces instead of newlines
    return marker_table_html


def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
    models = create_model_dict()
    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
    total_unaligned = 0
    results = []

    iterations = len(dataset)
    if max_rows is not None:
        iterations = min(max_rows, len(dataset))

    for i in tqdm(range(iterations), desc='Converting Tables'):
        try:
            row = dataset[i]
            pdf_binary = base64.b64decode(row['pdf'])
            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns

            # Only use the basic table processors
            converter = TableConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=[
                    "marker.processors.table.TableProcessor",
                    "marker.processors.llm.llm_table.LLMTableProcessor",
                ],
                renderer=config_parser.get_renderer()
            )

            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                temp_pdf_file.write(pdf_binary)
                temp_pdf_file.seek(0)
                marker_json = converter(temp_pdf_file.name).children

                doc = pdfium.PdfDocument(temp_pdf_file.name)
                page_image = doc[0].render(scale=96/72).to_pil()
                doc.close()

            if len(marker_json) == 0 or len(gt_tables) == 0:
                print('No tables detected, skipping...')
                total_unaligned += len(gt_tables)
                continue

            marker_tables = extract_tables(marker_json)
            marker_table_boxes = [table.bbox for table in marker_tables]
            page_bbox = marker_json[0].bbox

            if len(marker_tables) != len(gt_tables):
                print('Number of tables does not match, skipping...')
                total_unaligned += len(gt_tables)
                continue

            table_images = [
                page_image.crop(
                    PolygonBox.from_bbox(bbox)
                    .rescale(
                        (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height)
                    ).bbox
                )
                for bbox
                in marker_table_boxes
            ]

            # Normalize the bboxes
            for bbox in marker_table_boxes:
                bbox[0] = bbox[0] / page_bbox[2]
                bbox[1] = bbox[1] / page_bbox[3]
                bbox[2] = bbox[2] / page_bbox[2]
                bbox[3] = bbox[3] / page_bbox[3]

            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

            aligned_tables = []
            used_tables = set()
            unaligned_tables = set()
            for table_idx, alignment in enumerate(table_alignments):
                try:
                    max_area = np.max(alignment)
                    aligned_idx = np.argmax(alignment)
                except ValueError:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if max_area <= .01:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if aligned_idx in used_tables:
                    # Marker table already aligned with another gt table
                    unaligned_tables.add(table_idx)
                    continue

                # Gt table doesn't align well with any marker table
                gt_table_pct = gt_areas[table_idx] / max_area
                if not .85 < gt_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                # Marker table doesn't align with gt table
                marker_table_pct = marker_areas[aligned_idx] / max_area
                if not .85 < marker_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                gemini_html = ""
                if use_gemini:
                    try:
                        gemini_html = gemini_table_rec(table_images[aligned_idx])
                    except Exception as e:
                        print(f'Gemini failed: {e}')

                aligned_tables.append(
                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
                )
                used_tables.add(aligned_idx)

            total_unaligned += len(unaligned_tables)

            for marker_table, gt_table, gemini_table in aligned_tables:
                gt_table_html = gt_table['html']

                # marker wraps the table in <tbody> which fintabnet data doesn't
                # Fintabnet doesn't use th tags, need to be replaced for fair comparison
                marker_table_html = fix_table_html(marker_table.html)
                gemini_table_html = fix_table_html(gemini_table)

                results.append({
                    "marker_table": marker_table_html,
                    "gt_table": gt_table_html,
                    "gemini_table": gemini_table_html
                })
        except pdfium.PdfiumError:
            print('Broken PDF, skipping...')
            continue
    return results, total_unaligned
```
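
A minimal sketch of the alignment heuristic above on toy boxes (values are hypothetical; `matrix_intersection_area` comes from `marker/util.py`, reproduced later in this dump):

```python
import numpy as np

from marker.util import matrix_intersection_area

# Toy normalized boxes: one good match, one partial match
gt_boxes = [[0.1, 0.1, 0.4, 0.3], [0.5, 0.5, 0.9, 0.8]]
marker_boxes = [[0.11, 0.09, 0.41, 0.31], [0.55, 0.55, 0.7, 0.7]]

areas = matrix_intersection_area(gt_boxes, marker_boxes)  # shape (2, 2)
for gt_idx, row in enumerate(areas):
    aligned_idx = int(np.argmax(row))
    max_area = row[aligned_idx]
    gt_area = (gt_boxes[gt_idx][2] - gt_boxes[gt_idx][0]) * (
        gt_boxes[gt_idx][3] - gt_boxes[gt_idx][1]
    )
    # Same thresholds as the benchmark code above: minimum overlap plus a +/-15% area ratio
    aligned = max_area > 0.01 and 0.85 < gt_area / max_area < 1.15
    print(gt_idx, aligned_idx, aligned)  # prints "0 0 True", then "1 1 False"
```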

--------------------------------------------------------------------------------
/marker/scripts/extraction_app.py:
--------------------------------------------------------------------------------

```python
import json
import os

from streamlit_ace import st_ace
from pydantic import BaseModel

from marker.converters.extraction import ExtractionConverter
from marker.scripts.common import (
    parse_args,
    load_models,
    get_page_image,
    page_count,
    get_root_class,
)

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"

from streamlit.runtime.uploaded_file_manager import UploadedFile

import tempfile
from typing import Any, Dict

import streamlit as st

from marker.config.parser import ConfigParser


def extract_data(
    fname: str, config: dict, schema: str, markdown: str | None = None
) -> tuple[str, Dict[str, Any], dict]:
    config["pdftext_workers"] = 1
    config["page_schema"] = schema
    config["existing_markdown"] = markdown
    config_parser = ConfigParser(config)
    config_dict = config_parser.generate_config_dict()

    converter_cls = ExtractionConverter
    converter = converter_cls(
        config=config_dict,
        artifact_dict=model_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    return converter(fname)


st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

model_dict = load_models()
cli_options = parse_args()

st.markdown("""
# Marker Extraction Demo

This app lets you use marker to do structured extraction.

Warning: This can execute untrusted code entered into the schema panel.
""")

in_file: UploadedFile = st.sidebar.file_uploader(
    "PDF, document, or image file:",
    type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)

# Initialize session state variables
if "rendered_pydantic_schema" not in st.session_state:
    st.session_state.rendered_pydantic_schema = ""

if "markdown" not in st.session_state:
    st.session_state.markdown = ""

if "current_file_id" not in st.session_state:
    st.session_state.current_file_id = None

# Detect file changes and clear markdown when new file is uploaded
if in_file is not None:
    # Create a unique identifier for the current file
    current_file_id = f"{in_file.name}_{in_file.size}_{hash(in_file.getvalue())}"

    # Check if this is a new file
    if st.session_state.current_file_id != current_file_id:
        st.session_state.current_file_id = current_file_id
        st.session_state.markdown = ""  # Clear markdown for new file
else:
    # No file uploaded, clear the current file ID
    if st.session_state.current_file_id is not None:
        st.session_state.current_file_id = None
        st.session_state.markdown = ""  # Clear markdown when no file
        st.session_state.rendered_pydantic_schema = ""

if in_file is None:
    st.stop()

filetype = in_file.type

with col1:
    total_pages = page_count(in_file)  # avoid shadowing the imported page_count helper
    page_number = st.number_input(
        f"Page number out of {total_pages}:", min_value=0, value=0, max_value=total_pages
    )
    pil_image = get_page_image(in_file, page_number)
    st.image(pil_image, use_container_width=True)
with col2:
    tab1, tab2 = st.tabs(["JSON Schema", "Pydantic Schema"])

    # Initialize schema variable
    schema = None

    with tab1:
        st.write("Enter an existing JSON schema here:")
        default_json_value = (
            st.session_state.rendered_pydantic_schema
            if st.session_state.rendered_pydantic_schema
            else ""
        )
        json_schema_input = st.text_area(
            "JSON Schema",
            value=default_json_value,
            height=300,
            placeholder='{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}',
            key="json_schema_input",
            label_visibility="collapsed",
        )

        # Set schema if JSON input is provided
        if json_schema_input and json_schema_input.strip():
            try:
                # Validate JSON
                json.loads(json_schema_input)
                schema = json_schema_input.strip()
                st.success("✅ Valid JSON schema detected")
            except json.JSONDecodeError as e:
                st.error(f"❌ Invalid JSON: {e}")
                schema = None

    with tab2:
        st.write("Enter pydantic schema here:")
        pydantic_schema_input = st_ace(
            value="""from pydantic import BaseModel

class Schema(BaseModel):
    # Add your fields here
    # Example:
    name: str
    age: int
    # email: str
    pass""",
            language="python",
            height=300,
            key="pydantic_editor",
        )

        render_schema = st.button("🔄 Render Pydantic schema to JSON")

        if render_schema and pydantic_schema_input:
            try:
                pydantic_root: type[BaseModel] = get_root_class(pydantic_schema_input)
                json_schema = pydantic_root.model_json_schema()
                schema = json.dumps(json_schema, indent=2)
                st.success("✅ Schema rendered successfully!")
                st.json(json_schema)
                st.session_state.rendered_pydantic_schema = schema
            except Exception as e:
                st.error(f"❌ Could not parse your schema: {e}")
                schema = None
        elif (
            pydantic_schema_input
            and pydantic_schema_input.strip()
            and not render_schema
        ):
            # If there's Pydantic code but not rendered yet, show a message
            if (
                "class Schema(BaseModel):" in pydantic_schema_input
                and "pass" not in pydantic_schema_input
            ):
                st.info(
                    "💡 Click 'Render Pydantic schema to JSON' to convert your Pydantic model to JSON schema"
                )

# Move the run logic outside of col2
run_marker = st.sidebar.button("Run Extraction")

use_llm = st.sidebar.checkbox(
    "Use LLM", help="Use LLM for higher quality text", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
    "Strip existing OCR",
    help="Strip existing OCR text from the PDF and re-OCR.",
    value=False,
)

# Check if schema is provided before running
if run_marker:
    if not schema:
        st.error(
            "❌ Please provide a schema in either the JSON Schema or Pydantic Schema tab before running extraction."
        )
        st.stop()

    # Run Marker
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_pdf = os.path.join(tmp_dir, "temp.pdf")
        with open(temp_pdf, "wb") as f:
            f.write(in_file.getvalue())

        cli_options.update(
            {
                "force_ocr": force_ocr,
                "use_llm": use_llm,
                "strip_existing_ocr": strip_existing_ocr,
            }
        )

        try:
            rendered = extract_data(
                temp_pdf, cli_options, schema, st.session_state.markdown
            )

            with col2:
                st.write("## Output JSON")
                st.json(rendered.model_dump(exclude={"original_markdown"}))
                st.session_state.markdown = rendered.original_markdown

        except Exception as e:
            st.error(f"❌ Extraction failed: {e}")

else:
    # Show instruction when not running
    if not schema:
        st.info("📝 Please provide a schema and click 'Run Extraction' to begin.")

```
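
Usage note: this is a Streamlit script, so (assuming Streamlit is installed) it would be launched with `streamlit run marker/scripts/extraction_app.py`. A minimal schema to paste into the Pydantic tab, matching the default template the editor ships with (field names here are illustrative):

```python
from pydantic import BaseModel

class Schema(BaseModel):
    # Illustrative fields; any valid Pydantic model should render to a JSON schema
    title: str
    author: str
    page_count: int
```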

--------------------------------------------------------------------------------
/benchmarks/overall/overall.py:
--------------------------------------------------------------------------------

```python
import json
import os
import traceback
from collections import defaultdict
from pathlib import Path
from typing import List

import click
import datasets
import torch
from tqdm import tqdm

from benchmarks.overall.display.dataset import build_dataset
from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
from marker.settings import settings
from benchmarks.overall.display.table import print_scores

configure_logging()


def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
    bench_scores = {}
    averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    average_times = defaultdict(list)
    markdown_by_method = defaultdict(dict)
    total_rows = len(benchmark_dataset)
    if max_rows:
        total_rows = min(max_rows, total_rows)
    for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark", total=total_rows):
        if max_rows is not None and idx >= max_rows:
            break

        doc_type = sample["classification"]
        gt_cls = METHOD_REGISTRY["gt"]
        gt_blocks = json.loads(sample["gt_blocks"])
        gt_md = gt_cls(**artifacts)(sample)["markdown"]
        markdown_by_method[idx]["gt"] = gt_md

        out_data = defaultdict(dict)

        try:
            for method in methods:
                method_cls = METHOD_REGISTRY[method](**artifacts)
                method_info = method_cls(sample)
                method_md = method_info["markdown"]
                if method_md is None:
                    method_md = "" # Avoid None values

                average_times[method].append(method_info["time"])
                markdown_by_method[idx][method] = method_md

                for score_type in score_types:
                    score_cls = SCORE_REGISTRY[score_type]()
                    try:
                        scores = score_cls(sample, gt_md, method_md)
                    except Exception as e:
                        # Some scorers can fail, like the LLM one
                        print(f"Failed to score {method} with {score_type}: {e}")
                        continue

                    out_data[method][score_type] = scores

                    averages_by_type[method][score_type][doc_type].append(scores["score"])

                    if "by_block" in scores["specific_scores"]: # Not all scorers support this
                        for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
                            averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
        except Exception as e:
            print(f"Failed to process {idx}: {e}")
            traceback.print_exc()
            if idx in markdown_by_method:
                del markdown_by_method[idx]
            continue

        bench_scores[idx] = out_data

    return {
        "scores": bench_scores,
        "markdown": markdown_by_method,
        "averages_by_type": averages_by_type,
        "averages_by_block_type": averages_by_block_type,
        "average_times": average_times,
    }

@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
@click.option("--methods", type=str, help="Comma separated list of other methods to compare against.  Possible values: marker,mathpix,llamaparse,docling,mistral", default="marker")
@click.option("--scores", type=str, help="Comma separated list of scoring functions to use.  Possible values: heuristic,llm", default="heuristic")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
@click.option("--languages", type=str, help="Comma separated list of languages to use for LLM", default=None)
def main(
        dataset: str,
        out_dataset: str,
        methods: str,
        scores: str,
        result_path: str,
        max_rows: int,
        use_llm: bool,
        languages: str
):
    out_path = Path(result_path)
    out_path.mkdir(parents=True, exist_ok=True)

    methods = methods.split(",")
    for method in methods:
        if method not in METHOD_REGISTRY:
            raise ValueError(f"Method {method} not allowed.  Allowed methods are {METHOD_REGISTRY.keys()}")

    # Ensure marker is always first
    all_methods = list(set(methods))
    methods = ["marker"] if "marker" in all_methods else []
    methods += [m for m in all_methods if m != "marker"]

    score_types = scores.split(",")
    for score_type in score_types:
        if score_type not in SCORE_REGISTRY:
            raise ValueError(f"Score type {score_type} not allowed.  Allowed types are {SCORE_REGISTRY.keys()}")

    if languages:
        languages = languages.split(",")
    else:
        languages = None

    benchmark_dataset = datasets.load_dataset(dataset, split="train")
    if languages:
        benchmark_dataset = benchmark_dataset.filter(lambda x: x["language"] in languages)

    artifacts = {
        "model_dict": create_model_dict(),
        "use_llm": use_llm,
        "mathpix_ds": None,
        "llamaparse_ds": None,
    }

    if "mathpix" in methods:
        artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")

    if "llamaparse" in methods:
        artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")

    if "mistral" in methods:
        artifacts["mistral_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mistral", split="train")

    if "olmocr" in methods:
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
        model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview",
                                                                torch_dtype=torch.bfloat16).eval()
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        artifacts["olmocr_model"] = {"model": model, "processor": processor}

    print(f"Running benchmark with methods: {methods} and scores: {score_types}")
    result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)

    # Display benchmark scoring tables
    print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0])

    # Write to json
    with open(out_path / "result.json", "w") as f:
        json.dump(result, f)

    if out_dataset:
        if use_llm:
            out_dataset += "_llm"
        dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
        dataset.push_to_hub(out_dataset, private=True)


if __name__ == "__main__":
    main()


```
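
A sketch of invoking the benchmark programmatically, equivalent to running the script with CLI flags (click commands accept an argument list; the flag values here are illustrative):

```python
from benchmarks.overall.overall import main

# Same as: python benchmarks/overall/overall.py --methods marker --scores heuristic --max_rows 10
main(
    ["--methods", "marker", "--scores", "heuristic", "--max_rows", "10"],
    standalone_mode=False,  # return normally instead of calling sys.exit()
)
```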

--------------------------------------------------------------------------------
/marker/util.py:
--------------------------------------------------------------------------------

```python
import inspect
import os
from importlib import import_module
from typing import List, Annotated
import re

import numpy as np
import requests
from pydantic import BaseModel

from marker.schema.polygon import PolygonBox
from marker.settings import settings

OPENING_TAG_REGEX = re.compile(r"<((?:math|i|b))(?:\s+[^>]*)?>")
CLOSING_TAG_REGEX = re.compile(r"</((?:math|i|b))>")
TAG_MAPPING = {
    'i': 'italic',
    'b': 'bold',
    'math': 'math',
    'mark': 'highlight',
    'sub': 'subscript',
    'sup': 'superscript',
    'small': 'small',
    'u': 'underline',
    'code': 'code'
}

def strings_to_classes(items: List[str]) -> List[type]:
    classes = []
    for item in items:
        module_name, class_name = item.rsplit('.', 1)
        module = import_module(module_name)
        classes.append(getattr(module, class_name))
    return classes


def classes_to_strings(items: List[type]) -> List[str]:
    for item in items:
        if not inspect.isclass(item):
            raise ValueError(f"Item {item} is not a class")

    return [f"{item.__module__}.{item.__name__}" for item in items]


def verify_config_keys(obj):
    annotations = inspect.get_annotations(obj.__class__)

    none_vals = ""
    for attr_name, annotation in annotations.items():
        if isinstance(annotation, type(Annotated[str, ""])):
            value = getattr(obj, attr_name)
            if value is None:
                none_vals += f"{attr_name}, "

    assert len(none_vals) == 0, f"In order to use {obj.__class__.__name__}, you must set the configuration values `{none_vals}`."


def assign_config(cls, config: BaseModel | dict | None):
    cls_name = cls.__class__.__name__
    if config is None:
        return
    elif isinstance(config, BaseModel):
        dict_config = config.dict()
    elif isinstance(config, dict):
        dict_config = config
    else:
        raise ValueError("config must be a dict or a pydantic BaseModel")

    for k in dict_config:
        if hasattr(cls, k):
            setattr(cls, k, dict_config[k])
    for k in dict_config:
        if cls_name not in k:
            continue
        # Enables using class-specific keys, like "MarkdownRenderer_remove_blocks"
        split_k = k.removeprefix(cls_name + "_")

        if hasattr(cls, split_k):
            setattr(cls, split_k, dict_config[k])


def parse_range_str(range_str: str) -> List[int]:
    range_lst = range_str.split(",")
    page_lst = []
    for i in range_lst:
        if "-" in i:
            start, end = i.split("-")
            page_lst += list(range(int(start), int(end) + 1))
        else:
            page_lst.append(int(i))
    page_lst = sorted(list(set(page_lst)))  # Deduplicate page numbers and sort in order
    return page_lst


def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    if len(boxes1) == 0 or len(boxes2) == 0:
        return np.zeros((len(boxes1), len(boxes2)))

    boxes1 = np.array(boxes1)
    boxes2 = np.array(boxes2)

    boxes1 = boxes1[:, np.newaxis, :]  # Shape: (N, 1, 4)
    boxes2 = boxes2[np.newaxis, :, :]  # Shape: (1, M, 4)

    min_x = np.maximum(boxes1[..., 0], boxes2[..., 0])  # Shape: (N, M)
    min_y = np.maximum(boxes1[..., 1], boxes2[..., 1])
    max_x = np.minimum(boxes1[..., 2], boxes2[..., 2])
    max_y = np.minimum(boxes1[..., 3], boxes2[..., 3])

    width = np.maximum(0, max_x - min_x)
    height = np.maximum(0, max_y - min_y)

    return width * height  # Shape: (N, M)


def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    if len(boxes2) == 0:
        return np.zeros((len(boxes1), 0))
    if len(boxes1) == 0:
        return np.zeros((0, len(boxes2)))

    boxes1 = np.array(boxes1)  # Shape: (N, 4)
    boxes2 = np.array(boxes2)  # Shape: (M, 4)

    boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2  # Shape: (N, 2)
    boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2  # Shape: (M, 2)

    boxes1_centers = boxes1_centers[:, np.newaxis, :]  # Shape: (N, 1, 2)
    boxes2_centers = boxes2_centers[np.newaxis, :, :]  # Shape: (1, M, 2)

    distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2)  # Shape: (N, M)
    return distances


def sort_text_lines(lines: List[PolygonBox], tolerance=1.25):
    # Sorts in reading order.  Not 100% accurate, this should only
    # be used as a starting point for more advanced sorting.
    vertical_groups = {}
    for line in lines:
        group_key = round(line.bbox[1] / tolerance) * tolerance
        if group_key not in vertical_groups:
            vertical_groups[group_key] = []
        vertical_groups[group_key].append(line)

    # Sort each group horizontally and flatten the groups into a single list
    sorted_lines = []
    for _, group in sorted(vertical_groups.items()):
        sorted_group = sorted(group, key=lambda x: x.bbox[0])
        sorted_lines.extend(sorted_group)

    return sorted_lines

def download_font():
    if not os.path.exists(settings.FONT_PATH):
        os.makedirs(os.path.dirname(settings.FONT_PATH), exist_ok=True)
        font_dl_path = f"{settings.ARTIFACT_URL}/{settings.FONT_NAME}"
        with requests.get(font_dl_path, stream=True) as r, open(settings.FONT_PATH, 'wb') as f:
            r.raise_for_status()
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

def get_opening_tag_type(tag):
    """
    Determines if a tag is an opening tag and extracts the tag type.
    
    Args:
        tag (str): The tag string to analyze.

    Returns:
        tuple: (is_opening_tag (bool), tag_type (str or None))
    """
    match = OPENING_TAG_REGEX.match(tag)
    
    if match:
        tag_type = match.group(1)
        if tag_type in TAG_MAPPING:
            return True, TAG_MAPPING[tag_type]
    
    return False, None

def get_closing_tag_type(tag):
    """
    Determines if a tag is a closing tag and extracts the tag type.

    Args:
        tag (str): The tag string to analyze.

    Returns:
        tuple: (is_closing_tag (bool), tag_type (str or None))
    """
    match = CLOSING_TAG_REGEX.match(tag)
    
    if match:
        tag_type = match.group(1)
        if tag_type in TAG_MAPPING:
            return True, TAG_MAPPING[tag_type]
    
    return False, None

# Modification of unwrap_math from surya.recognition
MATH_SYMBOLS = ["^", "_", "\\", "{", "}"]
MATH_TAG_PATTERN = re.compile(r'<math\b[^>]*>.*?</math>', re.DOTALL)
LATEX_ESCAPES = {
    r'\%': '%',
    r'\$': '$',
    r'\_': '_',
    r'\&': '&',
    r'\#': '#',
    r'\‰': '‰',
}
def normalize_latex_escapes(s: str) -> str:
    for k, v in LATEX_ESCAPES.items():
        s = s.replace(k, v)
    return s

def unwrap_math(text: str, math_symbols: List[str] = MATH_SYMBOLS) -> str:
    """Unwrap a single <math>...</math> block if it's not really math."""
    if MATH_TAG_PATTERN.match(text):
        # Remove tags
        inner = re.sub(r'^\s*<math\b[^>]*>|</math>\s*$', '', text, flags=re.DOTALL)

        # Strip a single leading/trailing \\ plus surrounding whitespace
        inner_stripped = re.sub(r'^\s*\\\\\s*|\s*\\\\\s*$', '', inner)

        # Unwrap \text{...}
        unwrapped = re.sub(r'\\text[a-zA-Z]*\s*\{(.*?)\}', r'\1', inner_stripped)

        # Normalize escapes
        normalized = normalize_latex_escapes(unwrapped)

        # If no math symbols remain → unwrap fully
        if not any(symb in normalized for symb in math_symbols):
            return normalized.strip()

    # Otherwise, return as-is
    return text
```
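
Toy examples of the parsing and geometry helpers above (values chosen by hand):

```python
from marker.util import matrix_intersection_area, parse_range_str

print(parse_range_str("1,3-5,3"))  # [1, 3, 4, 5] -- deduplicated and sorted

boxes_a = [[0.0, 0.0, 10.0, 10.0]]
boxes_b = [[5.0, 5.0, 15.0, 15.0], [20.0, 20.0, 30.0, 30.0]]
print(matrix_intersection_area(boxes_a, boxes_b))  # [[25.  0.]] -- shape (1, 2)
```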

--------------------------------------------------------------------------------
/marker/processors/debug.py:
--------------------------------------------------------------------------------

```python
import json
import os
from typing import Annotated

from PIL import Image, ImageDraw, ImageFont
from marker.logger import get_logger

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings

logger = get_logger()


class DebugProcessor(BaseProcessor):
    """
    A processor for debugging the document.
    """

    block_types: Annotated[
        tuple, "The block types to process.", "Default is an empty tuple."
    ] = tuple()
    debug_data_folder: Annotated[
        str,
        "The folder to dump debug data to.",
    ] = "debug_data"
    debug_layout_images: Annotated[
        bool,
        "Whether to dump layout debug images.",
    ] = False
    debug_pdf_images: Annotated[
        bool,
        "Whether to dump PDF debug images.",
    ] = False
    debug_json: Annotated[
        bool,
        "Whether to dump block debug data.",
    ] = False

    def __call__(self, document: Document):
        # Remove extension from doc name
        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)
        if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
            os.makedirs(self.debug_folder, exist_ok=True)

        document.debug_data_path = self.debug_folder

        if self.debug_layout_images:
            self.draw_layout_debug_images(document)
            logger.info(f"Dumped layout debug images to {self.debug_folder}")

        if self.debug_pdf_images:
            self.draw_pdf_debug_images(document)
            logger.info(f"Dumped PDF debug images to {self.debug_folder}")

        if self.debug_json:
            self.dump_block_debug_data(document)
            logger.info(f"Dumped block debug data to {self.debug_folder}")

    def draw_pdf_debug_images(self, document: Document):
        for page in document.pages:
            png_image = page.get_image(highres=True).copy()

            line_bboxes = []
            span_bboxes = []
            line_ids = []
            for child in page.children:
                # Skip any blocks that have been removed
                if child.removed:
                    continue

                if child.block_type == BlockTypes.Line:
                    bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                    line_bboxes.append(bbox)
                    line_ids.append(child.block_id)
                elif child.block_type == BlockTypes.Span:
                    bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
                    span_bboxes.append(bbox)

            self.render_on_image(
                line_bboxes,
                png_image,
                color="blue",
                draw_bbox=True,
                label_font_size=24,
                labels=[str(i) for i in line_ids],
            )

            png_image = self.render_layout_boxes(page, png_image)

            debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
            png_image.save(debug_file)

    def draw_layout_debug_images(self, document: Document, pdf_mode=False):
        for page in document.pages:
            img_size = page.get_image(highres=True).size
            png_image = Image.new("RGB", img_size, color="white")

            line_bboxes = []
            line_text = []
            for child in page.children:
                if child.removed:
                    continue

                if child.block_type != BlockTypes.Line:
                    continue

                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
                line_bboxes.append(bbox)
                line_text.append(child.raw_text(document))

            self.render_on_image(
                line_bboxes,
                png_image,
                labels=line_text,
                color="black",
                draw_bbox=False,
                label_font_size=24,
            )

            png_image = self.render_layout_boxes(page, png_image)

            debug_file = os.path.join(
                self.debug_folder, f"layout_page_{page.page_id}.png"
            )
            png_image.save(debug_file)

    def render_layout_boxes(self, page, png_image):
        layout_bboxes = []
        layout_labels = []
        for block_id in page.structure:
            child = page.get_block(block_id)
            if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
                continue

            bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox
            layout_bboxes.append(bbox)
            layout_labels.append(str(child.block_type))

        self.render_on_image(
            layout_bboxes,
            png_image,
            labels=layout_labels,
            color="red",
            label_font_size=24,
        )

        order_labels = [str(i) for i in range(len(layout_bboxes))]
        self.render_on_image(
            layout_bboxes,
            png_image,
            labels=order_labels,
            color="green",
            draw_bbox=False,
            label_offset=5,
            label_font_size=24,
        )
        return png_image

    def dump_block_debug_data(self, document: Document):
        debug_file = os.path.join(self.debug_folder, "blocks.json")
        debug_data = []
        for page in document.pages:
            page_data = page.model_dump(
                exclude={
                    "lowres_image": True,
                    "highres_image": True,
                    "children": {
                        "__all__": {"lowres_image": True, "highres_image": True}
                    },
                }
            )
            debug_data.append(page_data)

        with open(debug_file, "w+") as f:
            json.dump(debug_data, f)

    def get_text_size(self, text, font):
        im = Image.new(mode="P", size=(0, 0))
        draw = ImageDraw.Draw(im)
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return width, height

    def render_on_image(
        self,
        bboxes,
        image,
        labels=None,
        label_offset=1,
        label_font_size=10,
        color: str | list = "red",
        draw_bbox=True,
    ):
        draw = ImageDraw.Draw(image)
        font_path = settings.FONT_PATH
        label_font = ImageFont.truetype(font_path, label_font_size)

        for i, bbox in enumerate(bboxes):
            bbox = [int(p) for p in bbox]
            if draw_bbox:
                draw.rectangle(
                    bbox,
                    outline=color[i] if isinstance(color, list) else color,
                    width=1,
                )

            if labels is not None:
                label = labels[i]
                text_position = (bbox[0] + label_offset, bbox[1] + label_offset)
                text_size = self.get_text_size(label, label_font)
                if text_size[0] <= 0 or text_size[1] <= 0:
                    continue
                box_position = (
                    text_position[0],
                    text_position[1],
                    text_position[0] + text_size[0],
                    text_position[1] + text_size[1],
                )
                draw.rectangle(box_position, fill="white")
                draw.text(
                    text_position,
                    label,
                    fill=color[i] if isinstance(color, list) else color,
                    font=label_font,
                )

        return image

```
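
A sketch of turning the debug dumps on. Processor attributes are populated from the config dict via `assign_config` (see `marker/util.py` above), so plain keys and class-prefixed keys both work; this assumes `BaseProcessor` accepts a `config` argument:

```python
from marker.processors.debug import DebugProcessor

processor = DebugProcessor(config={
    "debug_layout_images": True,        # plain key: matches the attribute name
    "DebugProcessor_debug_json": True,  # class-prefixed key: targets this class only
    "debug_data_folder": "debug_data",
})
# processor(document) would then write layout images and blocks.json
# under debug_data/<document name>/
```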

--------------------------------------------------------------------------------
/marker/converters/pdf.py:
--------------------------------------------------------------------------------

```python
import os

from marker.schema.document import Document

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disables a tokenizers warning

from collections import defaultdict
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple, Union
import io
from contextlib import contextmanager
import tempfile

from marker.processors import BaseProcessor
from marker.services import BaseService
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.providers.registry import provider_from_filepath
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters import BaseConverter
from marker.processors.blockquote import BlockquoteProcessor
from marker.processors.code import CodeProcessor
from marker.processors.debug import DebugProcessor
from marker.processors.document_toc import DocumentTOCProcessor
from marker.processors.equation import EquationProcessor
from marker.processors.footnote import FootnoteProcessor
from marker.processors.ignoretext import IgnoreTextProcessor
from marker.processors.line_numbers import LineNumbersProcessor
from marker.processors.list import ListProcessor
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
from marker.processors.llm.llm_form import LLMFormProcessor
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.page_header import PageHeaderProcessor
from marker.processors.reference import ReferenceProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
from marker.processors.text import TextProcessor
from marker.processors.block_relabel import BlockRelabelProcessor
from marker.processors.blank_page import BlankPageProcessor
from marker.processors.llm.llm_equation import LLMEquationProcessor
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.registry import register_block_class
from marker.util import strings_to_classes
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
from marker.processors.order import OrderProcessor
from marker.services.gemini import GoogleGeminiService
from marker.processors.line_merge import LineMergeProcessor
from marker.processors.llm.llm_mathblock import LLMMathBlockProcessor
from marker.processors.llm.llm_page_correction import LLMPageCorrectionProcessor
from marker.processors.llm.llm_sectionheader import LLMSectionHeaderProcessor


class PdfConverter(BaseConverter):
    """
    A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
    """

    override_map: Annotated[
        Dict[BlockTypes, Type[Block]],
        "A mapping to override the default block classes for specific block types.",
        "The keys are `BlockTypes` enum values, representing the types of blocks,",
        "and the values are corresponding `Block` class implementations to use",
        "instead of the defaults.",
    ] = defaultdict()
    use_llm: Annotated[
        bool,
        "Enable higher quality processing with LLMs.",
    ] = False
    default_processors: Tuple[BaseProcessor, ...] = (
        OrderProcessor,
        BlockRelabelProcessor,
        LineMergeProcessor,
        BlockquoteProcessor,
        CodeProcessor,
        DocumentTOCProcessor,
        EquationProcessor,
        FootnoteProcessor,
        IgnoreTextProcessor,
        LineNumbersProcessor,
        ListProcessor,
        PageHeaderProcessor,
        SectionHeaderProcessor,
        TableProcessor,
        LLMTableProcessor,
        LLMTableMergeProcessor,
        LLMFormProcessor,
        TextProcessor,
        LLMComplexRegionProcessor,
        LLMImageDescriptionProcessor,
        LLMEquationProcessor,
        LLMHandwritingProcessor,
        LLMMathBlockProcessor,
        LLMSectionHeaderProcessor,
        LLMPageCorrectionProcessor,
        ReferenceProcessor,
        BlankPageProcessor,
        DebugProcessor,
    )
    default_llm_service: BaseService = GoogleGeminiService

    def __init__(
        self,
        artifact_dict: Dict[str, Any],
        processor_list: Optional[List[str]] = None,
        renderer: str | None = None,
        llm_service: str | None = None,
        config=None,
    ):
        super().__init__(config)

        if config is None:
            config = {}

        for block_type, override_block_type in self.override_map.items():
            register_block_class(block_type, override_block_type)

        if processor_list is not None:
            processor_list = strings_to_classes(processor_list)
        else:
            processor_list = self.default_processors

        if renderer:
            renderer = strings_to_classes([renderer])[0]
        else:
            renderer = MarkdownRenderer

        # Put here so that resolve_dependencies can access it
        self.artifact_dict = artifact_dict

        if llm_service:
            llm_service_cls = strings_to_classes([llm_service])[0]
            llm_service = self.resolve_dependencies(llm_service_cls)
        elif config.get("use_llm", False):
            llm_service = self.resolve_dependencies(self.default_llm_service)

        # Inject llm service into artifact_dict so it can be picked up by processors, etc.
        self.artifact_dict["llm_service"] = llm_service
        self.llm_service = llm_service

        self.renderer = renderer

        processor_list = self.initialize_processors(processor_list)
        self.processor_list = processor_list

        self.layout_builder_class = LayoutBuilder
        self.page_count = None  # Track how many pages were converted

    @contextmanager
    def filepath_to_str(self, file_input: Union[str, io.BytesIO]):
        temp_file = None
        try:
            if isinstance(file_input, str):
                yield file_input
            else:
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".pdf"
                ) as temp_file:
                    if isinstance(file_input, io.BytesIO):
                        file_input.seek(0)
                        temp_file.write(file_input.getvalue())
                    else:
                        raise TypeError(
                            f"Expected str or BytesIO, got {type(file_input)}"
                        )

                yield temp_file.name
        finally:
            if temp_file is not None and os.path.exists(temp_file.name):
                os.unlink(temp_file.name)

    def build_document(self, filepath: str) -> Document:
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )
        structure_builder_cls = self.resolve_dependencies(StructureBuilder)
        structure_builder_cls(document)

        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str | io.BytesIO):
        with self.filepath_to_str(filepath) as temp_path:
            document = self.build_document(temp_path)
            self.page_count = len(document.pages)
            renderer = self.resolve_dependencies(self.renderer)
            rendered = renderer(document)
        return rendered

```
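
The converter accepts either a file path or an in-memory buffer (handled by `filepath_to_str` above, which spools a `BytesIO` to a temporary PDF). A minimal sketch, with a hypothetical filename:

```python
import io

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

converter = PdfConverter(artifact_dict=create_model_dict())

rendered = converter("example.pdf")  # from a path
with open("example.pdf", "rb") as f:
    rendered = converter(io.BytesIO(f.read()))  # from memory

print(converter.page_count)  # populated after conversion
```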

--------------------------------------------------------------------------------
/marker/schema/polygon.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations
import copy
from typing import List

import numpy as np
from pydantic import BaseModel, field_validator, computed_field


class PolygonBox(BaseModel):
    polygon: List[List[float]]

    @field_validator('polygon')
    @classmethod
    def check_elements(cls, v: List[List[float]]) -> List[List[float]]:
        if len(v) != 4:
            raise ValueError('polygon must have 4 corners')

        for corner in v:
            if len(corner) != 2:
                raise ValueError('corner must have 2 elements')

        min_x = min([corner[0] for corner in v])
        min_y = min([corner[1] for corner in v])

        # Ensure corners are clockwise from top left
        corner_error = f" .Corners are {v}"
        assert v[2][1] >= min_y, f'bottom right corner should have a greater y value than top right corner' + corner_error
        assert v[3][1] >= min_y, 'bottom left corner should have a greater y value than top left corner' + corner_error
        assert v[1][0] >= min_x, 'top right corner should have a greater x value than top left corner' + corner_error
        assert v[2][0] >= min_x, 'bottom right corner should have a greater x value than bottom left corner' + corner_error
        return v

    @property
    def height(self):
        return self.bbox[3] - self.bbox[1]

    @property
    def width(self):
        return self.bbox[2] - self.bbox[0]

    @property
    def area(self):
        return self.width * self.height

    @property
    def center(self):
        return [(self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2]

    @property
    def size(self):
        return [self.width, self.height]

    @property
    def x_start(self):
        return self.bbox[0]

    @property
    def y_start(self):
        return self.bbox[1]

    @property
    def x_end(self):
        return self.bbox[2]

    @property
    def y_end(self):
        return self.bbox[3]

    @computed_field
    @property
    def bbox(self) -> List[float]:
        min_x = min([corner[0] for corner in self.polygon])
        min_y = min([corner[1] for corner in self.polygon])
        max_x = max([corner[0] for corner in self.polygon])
        max_y = max([corner[1] for corner in self.polygon])
        return [min_x, min_y, max_x, max_y]

    def expand(self, x_margin: float, y_margin: float) -> PolygonBox:
        new_polygon = []
        x_margin = x_margin * self.width
        y_margin = y_margin * self.height
        for idx, poly in enumerate(self.polygon):
            if idx == 0:
                new_polygon.append([poly[0] - x_margin, poly[1] - y_margin])
            elif idx == 1:
                new_polygon.append([poly[0] + x_margin, poly[1] - y_margin])
            elif idx == 2:
                new_polygon.append([poly[0] + x_margin, poly[1] + y_margin])
            elif idx == 3:
                new_polygon.append([poly[0] - x_margin, poly[1] + y_margin])
        return PolygonBox(polygon=new_polygon)

    def expand_y2(self, y_margin: float) -> PolygonBox:
        new_polygon = []
        y_margin = y_margin * self.height
        for idx, poly in enumerate(self.polygon):
            if idx == 2:
                new_polygon.append([poly[0], poly[1] + y_margin])
            elif idx == 3:
                new_polygon.append([poly[0], poly[1] + y_margin])
            else:
                new_polygon.append(poly)
        return PolygonBox(polygon=new_polygon)

    def expand_y1(self, y_margin: float) -> PolygonBox:
        new_polygon = []
        y_margin = y_margin * self.height
        for idx, poly in enumerate(self.polygon):
            if idx == 0:
                new_polygon.append([poly[0], poly[1] - y_margin])
            elif idx == 1:
                new_polygon.append([poly[0], poly[1] - y_margin])
            else:
                new_polygon.append(poly)
        return PolygonBox(polygon=new_polygon)

    def minimum_gap(self, other: PolygonBox):
        if self.intersection_pct(other) > 0:
            return 0

        def dist(p1, p2):
            return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5

        left = other.bbox[2] < self.bbox[0]
        right = self.bbox[2] < other.bbox[0]
        bottom = other.bbox[3] < self.bbox[1]
        top = self.bbox[3] < other.bbox[1]
        if top and left:
            return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1]))
        elif left and bottom:
            return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3]))
        elif bottom and right:
            return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3]))
        elif right and top:
            return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1]))
        elif left:
            return self.bbox[0] - other.bbox[2]
        elif right:
            return other.bbox[0] - self.bbox[2]
        elif bottom:
            return self.bbox[1] - other.bbox[3]
        elif top:
            return other.bbox[1] - self.bbox[3]
        else:
            return 0

    def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: float = 1, absolute=False):
        if not absolute:
            return ((self.center[0] - other.center[0]) ** 2 * x_weight + (self.center[1] - other.center[1]) ** 2 * y_weight) ** 0.5
        else:
            return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight

    def tl_distance(self, other: PolygonBox):
        return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5

    def rescale(self, old_size, new_size):
        # Point is in x, y format
        page_width, page_height = old_size
        img_width, img_height = new_size

        width_scaler = img_width / page_width
        height_scaler = img_height / page_height

        new_corners = copy.deepcopy(self.polygon)
        for corner in new_corners:
            corner[0] = corner[0] * width_scaler
            corner[1] = corner[1] * height_scaler
        return PolygonBox(polygon=new_corners)

    def fit_to_bounds(self, bounds):
        new_corners = copy.deepcopy(self.polygon)
        for corner in new_corners:
            corner[0] = max(min(corner[0], bounds[2]), bounds[0])
            corner[1] = max(min(corner[1], bounds[3]), bounds[1])
        return PolygonBox(polygon=new_corners)

    def overlap_x(self, other: PolygonBox):
        return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))

    def overlap_y(self, other: PolygonBox):
        return max(0, min(self.bbox[3], other.bbox[3]) - max(self.bbox[1], other.bbox[1]))

    def intersection_area(self, other: PolygonBox):
        return self.overlap_x(other) * self.overlap_y(other)

    def intersection_pct(self, other: PolygonBox):
        if self.area == 0:
            return 0

        intersection = self.intersection_area(other)
        return intersection / self.area

    def merge(self, others: List[PolygonBox]) -> PolygonBox:
        corners = []
        for i in range(len(self.polygon)):
            x_coords = [self.polygon[i][0]] + [other.polygon[i][0] for other in others]
            y_coords = [self.polygon[i][1]] + [other.polygon[i][1] for other in others]
            min_x = min(x_coords)
            min_y = min(y_coords)
            max_x = max(x_coords)
            max_y = max(y_coords)

            if i == 0:
                corners.append([min_x, min_y])
            elif i == 1:
                corners.append([max_x, min_y])
            elif i == 2:
                corners.append([max_x, max_y])
            elif i == 3:
                corners.append([min_x, max_y])
        return PolygonBox(polygon=corners)

    @classmethod
    def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False):
        if ensure_nonzero_area:
            bbox = list(bbox)
            bbox[2] = max(bbox[2], bbox[0] + 1)
            bbox[3] = max(bbox[3], bbox[1] + 1)
        return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]])

```
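
A few worked examples of the box math above (coordinates are arbitrary):

```python
from marker.schema.polygon import PolygonBox

box = PolygonBox.from_bbox([10, 20, 110, 70])
print(box.bbox, box.size)  # [10.0, 20.0, 110.0, 70.0] [100.0, 50.0]

# Rescale from page coordinates (612x792) to a 2x image (1224x1584)
print(box.rescale((612, 792), (1224, 1584)).bbox)  # [20.0, 40.0, 220.0, 140.0]

other = PolygonBox.from_bbox([60, 20, 160, 70])
print(box.intersection_pct(other))  # 0.5 -- half of `box` falls inside `other`
```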

--------------------------------------------------------------------------------
/marker/providers/powerpoint.py:
--------------------------------------------------------------------------------

```python
import base64
import os
import tempfile
import traceback

from marker.logger import get_logger
from marker.providers.pdf import PdfProvider

logger = get_logger()

css = """
@page {
    size: A4 landscape;
    margin: 1.5cm;
}

table {
    width: 100%;
    border-collapse: collapse;
    break-inside: auto;
    font-size: 10pt;
}

tr {
    break-inside: avoid;
    page-break-inside: avoid;
}

td {
    border: 0.75pt solid #000;
    padding: 6pt;
}

img {
    max-width: 100%;
    height: auto;
    object-fit: contain;
}
"""


class PowerPointProvider(PdfProvider):
    include_slide_number: bool = False

    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert PPTX to PDF
        try:
            self.convert_pptx_to_pdf(filepath)
        except Exception as e:
            print(traceback.format_exc())
            raise ValueError(f"Error converting PPTX to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        if os.path.exists(self.temp_pdf_path):
            os.remove(self.temp_pdf_path)

    def convert_pptx_to_pdf(self, filepath):
        from weasyprint import CSS, HTML
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        pptx = Presentation(filepath)

        html_parts = []

        for slide_index, slide in enumerate(pptx.slides):
            html_parts.append("<section>")
            if self.include_slide_number:
                html_parts.append(f"<h2>Slide {slide_index + 1}</h2>")

            # Process shapes in the slide
            for shape in slide.shapes:
                # If shape is a group shape, we recursively handle all grouped shapes
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    html_parts.append(self._handle_group(shape))
                    continue

                # If shape is a table
                if shape.has_table:
                    html_parts.append(self._handle_table(shape))
                    continue

                # If shape is a picture
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    html_parts.append(self._handle_image(shape))
                    continue

                # If shape has text
                if hasattr(shape, "text") and shape.text is not None:
                    if shape.has_text_frame:
                        # Distinguish placeholders (title, subtitle, etc.)
                        html_parts.append(self._handle_text(shape))
                    else:
                        html_parts.append(f"<p>{self._escape_html(shape.text)}</p>")

            html_parts.append("</section>")

        html = "\n".join(html_parts)

        # We convert the HTML into a PDF
        HTML(string=html).write_pdf(
            self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
        )

    def _handle_group(self, group_shape) -> str:
        """
        Recursively handle shapes in a group. Returns HTML string for the entire group.
        """
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        group_parts = []
        for shape in group_shape.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                group_parts.append(self._handle_group(shape))
                continue

            if shape.has_table:
                group_parts.append(self._handle_table(shape))
                continue

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                group_parts.append(self._handle_image(shape))
                continue

            if hasattr(shape, "text"):
                if shape.has_text_frame:
                    group_parts.append(self._handle_text(shape))
                else:
                    group_parts.append(f"<p>{self._escape_html(shape.text)}</p>")

        return "".join(group_parts)

    def _handle_text(self, shape) -> str:
        """
        Processes shape text, including bullet/numbered list detection and placeholders
        (title, subtitle, etc.). Returns HTML for the text block(s).
        """
        from pptx.enum.shapes import PP_PLACEHOLDER

        # Distinguish placeholders to see if it's a title or subtitle
        label_html_tag = "p"
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in [PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE]:
                label_html_tag = "h3"
            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                label_html_tag = "h4"

        # Keep track of whether we are currently in a <ul> or <ol>
        html_parts = []
        list_open = False
        list_type = None  # "ul" or "ol"

        for paragraph in shape.text_frame.paragraphs:
            p_el = paragraph._element
            # Check bullet
            bullet_char = p_el.find(".//a:buChar", namespaces=p_el.nsmap)
            bullet_num = p_el.find(".//a:buAutoNum", namespaces=p_el.nsmap)

            is_bullet = (bullet_char is not None) or (paragraph.level > 0)
            is_numbered = bullet_num is not None

            # If the paragraph is bullet or numbered
            if is_bullet or is_numbered:
                # Decide if we need to start a new list or continue an existing one
                current_list_type = "ol" if is_numbered else "ul"
                if not list_open:
                    # Start new
                    list_open = True
                    list_type = current_list_type
                    html_parts.append(f"<{list_type}>")

                elif list_open and list_type != current_list_type:
                    # Close old list, start new
                    html_parts.append(f"</{list_type}>")
                    list_type = current_list_type
                    html_parts.append(f"<{list_type}>")

                # Build the bullet (li) text from all runs in the paragraph
                p_text = "".join(run.text for run in paragraph.runs)
                if p_text:
                    html_parts.append(f"<li>{self._escape_html(p_text)}</li>")

            else:
                # If we were in a list, we need to close it
                if list_open:
                    html_parts.append(f"</{list_type}>")
                    list_open = False
                    list_type = None

                # Now it's just a normal paragraph
                # Gather the paragraph text from runs
                p_text = "".join(run.text for run in paragraph.runs)
                if p_text:
                    # If we know it's a slide title, we can use <h3> or so
                    html_parts.append(
                        f"<{label_html_tag}>{self._escape_html(p_text)}</{label_html_tag}>"
                    )

        # If the text frame ended and we still have an open list, close it
        if list_open:
            html_parts.append(f"</{list_type}>")

        return "".join(html_parts)

    def _handle_image(self, shape) -> str:
        """
        Embeds the image as a base64 <img> in HTML.
        """
        image = shape.image
        image_bytes = image.blob

        try:
            img_str = base64.b64encode(image_bytes).decode("utf-8")
            return f"<img src='data:{image.content_type};base64,{img_str}' />"
        except Exception as e:
            logger.warning(f"Warning: image cannot be loaded by Pillow: {e}")
            return ""

    def _handle_table(self, shape) -> str:
        """
        Renders a shape's table as an HTML <table>.
        """
        table_html = []
        table_html.append("<table border='1'>")

        for row in shape.table.rows:
            row_html = ["<tr>"]
            for cell in row.cells:
                row_html.append(f"<td>{self._escape_html(cell.text)}</td>")
            row_html.append("</tr>")
            table_html.append("".join(row_html))

        table_html.append("</table>")
        return "".join(table_html)

    def _escape_html(self, text: str) -> str:
        """
        Minimal escaping for HTML special characters.
        """
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;")
        )

```
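
Since the provider converts the deck to a temporary PDF in its constructor, it can be used anywhere a `PdfProvider` is expected. A minimal sketch (assumes `python-pptx` and `weasyprint` are installed; the filename is hypothetical):

```python
from marker.providers.powerpoint import PowerPointProvider

provider = PowerPointProvider("slides.pptx", config={"include_slide_number": True})
# From here on it behaves like a PdfProvider over the rendered PDF
```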

--------------------------------------------------------------------------------
/benchmarks/overall/elo.py:
--------------------------------------------------------------------------------

```python
import json
import random
import time
import os
from dataclasses import dataclass
from typing import List, Dict, Tuple, Literal
from PIL import Image
from collections import defaultdict
import tabulate

import click
import datasets
from google import genai
from google.genai.errors import APIError
from pydantic import BaseModel
from tqdm import tqdm

from marker.settings import settings

rating_prompt = """
You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B.

Here are some notes on the image and markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.

The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.

**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the first markdown input provided.
3. Describe how well version A represents the image.
4. Carefully examine the second markdown input provided.
5. Describe how well version B represents the image.
6. Compare version A and version B.
7. Decide which markdown representation is better, based on the criteria below.  Output version_a if version A is better, and version_b if version B is better.

Use these criteria when judging the markdown:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have been extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.

Notes on scoring:
- Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay).  It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings.  If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting.  It may also have key values that are different from the values in the image.

Output json, like in the example below.

**Example**
Version A
```markdown
# *Section 1*
This is some *markdown* extracted from a document.  Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Version B
```markdown
# Section 1
This is some markdown extracted from a document.  Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 124}{t}$$
```
Output
```json
{
    "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
    "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
    "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.  The formatting in version b is slightly different from the image.  The value 124 is also different from the image.",
    "comparison": "Version A is better than version B.  The text and formatting in version A matches the image better than version B.  Version B also has an incorrect value.",
    "winner": "version_a",
}
```
**Input**
Version A
```markdown
{{version_a}}
```
Version B
```markdown
{{version_b}}
```
**Output**
"""

class ComparerSchema(BaseModel):
    image_description: str
    version_a_description: str
    version_b_description: str
    comparison: str
    winner: Literal["version_a", "version_b"]


class Comparer:
    def __init__(self):
        pass

    def __call__(
        self,
        img: Image.Image,
        version_a: str,
        version_b: str
    ) -> str | None:
        if version_a is None and version_b is None:
            return None
        elif version_a is None:
            return "version_b"
        elif version_b is None:
            return "version_a"

        hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b)
        try:
            rating = self.llm_rater(img, hydrated_prompt)
        except Exception as e:
            print(f"Error: {e}")
            return
        return rating


    def llm_rater(self, img: Image.Image, prompt: str):
        response = self.llm_response_wrapper(
            [img, prompt],
            ComparerSchema
        )
        assert "winner" in response, f"Response missing 'winner' key: {response}"
        return response["winner"]

    def llm_response_wrapper(
        self,
        prompt,
        response_schema,
    ):
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError as e:
            print(f"Hit Gemini rate limit: {e}")
            return
        except Exception as e:
            print(f"Error: {e}")
            return


def display_win_rates_table(win_rates: dict):
    table = []
    headers = ["Method A", "Method B", "Wins", "Losses", "Win %"]
    for method_a, method_b_dict in win_rates.items():
        row = [method_a]
        for method_b, results in method_b_dict.items():
            row = [method_a, method_b, results["win"], results["loss"], (results["win"] / (results["win"] + results["loss"])) * 100]
            table.append(row)
    print(tabulate.tabulate(table, headers=headers, tablefmt="pretty"))


@click.command("Calculate win rates for document conversion methods")
@click.argument("dataset", type=str)
@click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
@click.option("--row_samples", type=int, default=2, help="Number of samples per row")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process")
def main(
    dataset: str,
    methods: str,
    row_samples: int,
    max_rows: int
):
    ds = datasets.load_dataset(dataset, split="train")
    method_lst = methods.split(",")
    win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
    comparer = Comparer()
    max_rows = max_rows or len(ds)

    for i in tqdm(range(max_rows), desc="Calculating win rates..."):
        row = ds[i]
        # Avoid any bias in ordering
        random.shuffle(method_lst)

        for j, method_a in enumerate(method_lst[:-1]):
            for method_b in method_lst[j + 1:]:

                method_a_md = row[f"{method_a}_md"]
                method_b_md = row[f"{method_b}_md"]
                winner = comparer(row["img"], method_a_md, method_b_md)
                if not winner:
                    continue

                if winner == "version_a":
                    win_rates[method_a][method_b]["win"] += 1
                    win_rates[method_b][method_a]["loss"] += 1
                else:
                    win_rates[method_b][method_a]["win"] += 1
                    win_rates[method_a][method_b]["loss"] += 1
        if i % 10 == 0:
            display_win_rates_table(win_rates)

    display_win_rates_table(win_rates)


if __name__ == "__main__":
    main()
```
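
For reference, a small sketch (with made-up counts) of how the nested `win_rates` structure that `main` builds can be collapsed into a per-method overall win percentage, complementing the pairwise rows that `display_win_rates_table` prints:

```python
# Hypothetical win_rates in the same shape main() builds:
# win_rates[method_a][method_b] = {"win": ..., "loss": ...}
win_rates = {
    "marker": {"mathpix": {"win": 7, "loss": 3}},
    "mathpix": {"marker": {"win": 3, "loss": 7}},
}

for method, opponents in win_rates.items():
    wins = sum(r["win"] for r in opponents.values())
    losses = sum(r["loss"] for r in opponents.values())
    pct = 100 * wins / (wins + losses)
    print(f"{method}: {wins}W/{losses}L ({pct:.0f}%)")
# marker: 7W/3L (70%)
# mathpix: 3W/7L (30%)
```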

--------------------------------------------------------------------------------
/marker/processors/llm/llm_mathblock.py:
--------------------------------------------------------------------------------

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple, Annotated

from pydantic import BaseModel
from tqdm import tqdm

from marker.output import json_to_html, unwrap_outer_tag
from marker.processors.llm import BaseLLMComplexBlockProcessor

from marker.schema import BlockTypes
from marker.schema.blocks import Block, InlineMath
from marker.schema.document import Document
from marker.schema.groups import PageGroup


class LLMMathBlockProcessor(BaseLLMComplexBlockProcessor):
    redo_inline_math: Annotated[
        bool,
        "If True, the inline math will be re-done, otherwise it will be left as is.",
    ] = False
    inlinemath_min_ratio: Annotated[
        float,
        "If more than this ratio of blocks are inlinemath blocks, assume everything has math.",
    ] = 0.4

    block_types = (BlockTypes.TextInlineMath,)  # Primary block type
    additional_block_types = (
        BlockTypes.Text,
        BlockTypes.Caption,
        BlockTypes.SectionHeader,
        BlockTypes.Footnote,
    )  # Secondary, can also contain math

    text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and extracted text corresponding to the text in the image.
Your task is to correct any errors in the extracted text, including math, formatting, and other inaccuracies, and output the corrected block in html format.  Stay as faithful to the text in the image as possible.

**Instructions:**

1. Carefully examine the provided text block image.
2. Analyze the text that has been extracted from the block.
3. Compare the extracted text to the corresponding text in the image.
4. Write a short analysis of the text block, including any errors you see in the extracted text.
5. If there are no errors in any of the extracted text, output "No corrections needed".
6. Correct any errors in the extracted text, including:
    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
    * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
    * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
    * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
    * Ensure lines wrap properly, and that newlines are not in the middle of sentences.
7. Do not remove any formatting, i.e. bold, italics, math, superscripts, subscripts, etc., from the extracted text unless it is necessary to correct an error.
8. Output the corrected text in html format, as shown in the example below.  Only use the p, math, br, a, i, b, sup, sub, and span tags.
9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them.

**Example:**

Input:
```html
Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, 
is currently the most effective approach for improving the robustness of deep neural networks. For a given neural network f(x, w) 
with parameters w, the optimization objective of AT can be formulated as follows:
```

Output:
analysis: The inline math is not in LaTeX format and is not surrounded by <math>...</math> tags.
```html
Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, is currently the most effective approach for improving the robustness of deep neural networks. For a given neural network <math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of AT can be formulated as follows:
```

**Input:**
```html
{extracted_html}
```
"""

    def rewrite_blocks(self, document: Document):
        if not self.redo_inline_math:
            return

        # Get inline math blocks
        inline_blocks: List[Tuple[PageGroup, InlineMath]] = [
            (page, block)
            for page in document.pages
            for block in page.contained_blocks(document, self.block_types)
        ]

        # Get other blocks with detected math in them
        detected_blocks = [
            (page, block)
            for page in document.pages
            for block in page.contained_blocks(
                document,
                (
                    BlockTypes.Text,
                    BlockTypes.Caption,
                    BlockTypes.SectionHeader,
                    BlockTypes.Footnote,
                    BlockTypes.ListItem,
                ),
            )
            if any(
                [
                    b.formats and "math" in b.formats
                    for b in block.contained_blocks(document, (BlockTypes.Line,))
                ]
            )
        ]

        # If a page has enough math blocks, assume all blocks can contain math
        additional_text_blocks = []
        for page in document.pages:
            # Check for inline math blocks
            page_inlinemath_blocks = [
                im for im in inline_blocks if im[0].page_id == page.page_id
            ]
            page_detected_blocks = [
                db for db in detected_blocks if db[0].page_id == page.page_id
            ]
            math_block_count = len(page_inlinemath_blocks) + len(page_detected_blocks)

            # Find all potential blocks
            additional_blocks = page.contained_blocks(
                document, self.additional_block_types + self.block_types
            )

            # Check if the ratio of math blocks to additional blocks is high enough
            if (
                math_block_count / max(1, len(additional_blocks))
                < self.inlinemath_min_ratio
            ):
                continue

            # inline_blocks and detected_blocks hold (page, block) tuples, so
            # compare against the block element of each tuple
            seen_blocks = [blk for _, blk in inline_blocks + detected_blocks]
            for b in additional_blocks:
                if b not in seen_blocks:
                    additional_text_blocks.append((page, b))

        inference_blocks = inline_blocks + detected_blocks + additional_text_blocks

        # Don't show progress if there are no blocks to process
        total_blocks = len(inference_blocks)
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=total_blocks,
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm
        )
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in as_completed(
                [
                    executor.submit(self.process_rewriting, document, b[0], b[1])
                    for b in inference_blocks
                ]
            ):
                future.result()  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()

    def get_block_text(self, block: Block, document: Document) -> str:
        html = json_to_html(block.render(document))
        html = unwrap_outer_tag(html)  # Remove an outer p tag if it exists
        return html

    def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]:
        text_lines = block.contained_blocks(document, (BlockTypes.Line,))
        extracted_lines = [line.formatted_text(document) for line in text_lines]
        return text_lines, extracted_lines

    def process_rewriting(self, document: Document, page: PageGroup, block: Block):
        block_text = self.get_block_text(block, document)
        prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)

        image = self.extract_image(document, block)
        response = self.llm_service(prompt, image, block, LLMTextSchema)

        if not response or "corrected_html" not in response:
            block.update_metadata(llm_error_count=1)
            return

        corrected_html = response["corrected_html"]
        if not corrected_html:
            block.update_metadata(llm_error_count=1)
            return

        # Block is fine
        if "no corrections needed" in corrected_html.lower():
            return

        if len(corrected_html) < len(block_text) * 0.6:
            block.update_metadata(llm_error_count=1)
            return

        block.html = corrected_html


class LLMTextSchema(BaseModel):
    analysis: str
    corrected_html: str

```
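
The page-level gate in `rewrite_blocks` reduces to a single ratio test; a toy restatement with invented counts:

```python
# Toy version of the inlinemath_min_ratio gate from LLMMathBlockProcessor: if
# enough of a page's candidate blocks already contain math, the remaining
# blocks on that page are also queued for math rewriting.
INLINEMATH_MIN_RATIO = 0.4

def page_gets_full_rewrite(math_blocks: int, candidate_blocks: int) -> bool:
    return math_blocks / max(1, candidate_blocks) >= INLINEMATH_MIN_RATIO

print(page_gets_full_rewrite(2, 10))  # False: 20% < 40%
print(page_gets_full_rewrite(5, 10))  # True: 50% >= 40%
```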

--------------------------------------------------------------------------------
/signatures/version1/cla.json:
--------------------------------------------------------------------------------

```json
{
  "signedContributors": [
    {
      "name": "korakot",
      "id": 3155646,
      "comment_id": 2143359366,
      "created_at": "2024-06-01T08:25:52Z",
      "repoId": 712111618,
      "pullRequestNo": 161
    },
    {
      "name": "tosaddler",
      "id": 13705399,
      "comment_id": 2144014410,
      "created_at": "2024-06-02T20:40:52Z",
      "repoId": 712111618,
      "pullRequestNo": 165
    },
    {
      "name": "q2333gh",
      "id": 32679742,
      "comment_id": 2156122900,
      "created_at": "2024-06-08T18:01:39Z",
      "repoId": 712111618,
      "pullRequestNo": 176
    },
    {
      "name": "q2333gh",
      "id": 32679742,
      "comment_id": 2156614334,
      "created_at": "2024-06-09T13:48:49Z",
      "repoId": 712111618,
      "pullRequestNo": 176
    },
    {
      "name": "aniketinamdar",
      "id": 79044809,
      "comment_id": 2157453610,
      "created_at": "2024-06-10T06:43:39Z",
      "repoId": 712111618,
      "pullRequestNo": 179
    },
    {
      "name": "Zxilly",
      "id": 31370133,
      "comment_id": 2295163597,
      "created_at": "2024-08-18T07:44:04Z",
      "repoId": 712111618,
      "pullRequestNo": 257
    },
    {
      "name": "conscienceli",
      "id": 4034943,
      "comment_id": 2333374932,
      "created_at": "2024-09-06T06:56:25Z",
      "repoId": 712111618,
      "pullRequestNo": 276
    },
    {
      "name": "jcytong",
      "id": 156466,
      "comment_id": 2334987739,
      "created_at": "2024-09-07T01:27:51Z",
      "repoId": 712111618,
      "pullRequestNo": 278
    },
    {
      "name": "syldor",
      "id": 4158062,
      "comment_id": 2428114896,
      "created_at": "2024-10-22T03:05:34Z",
      "repoId": 712111618,
      "pullRequestNo": 309
    },
    {
      "name": "iammosespaulr",
      "id": 28682735,
      "comment_id": 2448054789,
      "created_at": "2024-10-30T18:36:24Z",
      "repoId": 712111618,
      "pullRequestNo": 327
    },
    {
      "name": "tjbck",
      "id": 25473318,
      "comment_id": 2454051305,
      "created_at": "2024-11-04T08:09:46Z",
      "repoId": 712111618,
      "pullRequestNo": 332
    },
    {
      "name": "jonaskahn",
      "id": 4338500,
      "comment_id": 2516932772,
      "created_at": "2024-12-04T10:47:00Z",
      "repoId": 712111618,
      "pullRequestNo": 413
    },
    {
      "name": "Leon-Sander",
      "id": 72946124,
      "comment_id": 2519736628,
      "created_at": "2024-12-05T09:26:56Z",
      "repoId": 712111618,
      "pullRequestNo": 415
    },
    {
      "name": "aguadoenzo",
      "id": 11411723,
      "comment_id": 2520293935,
      "created_at": "2024-12-05T13:13:34Z",
      "repoId": 712111618,
      "pullRequestNo": 416
    },
    {
      "name": "tarun-menta",
      "id": 66506307,
      "comment_id": 2543907406,
      "created_at": "2024-12-15T15:06:32Z",
      "repoId": 712111618,
      "pullRequestNo": 427
    },
    {
      "name": "ZeyuTeng96",
      "id": 96521059,
      "comment_id": 2567236036,
      "created_at": "2025-01-02T02:36:02Z",
      "repoId": 712111618,
      "pullRequestNo": 452
    },
    {
      "name": "xiaoyao9184",
      "id": 6614349,
      "comment_id": 2571623521,
      "created_at": "2025-01-05T13:15:34Z",
      "repoId": 712111618,
      "pullRequestNo": 463
    },
    {
      "name": "yasyf",
      "id": 709645,
      "comment_id": 2571679069,
      "created_at": "2025-01-05T16:23:12Z",
      "repoId": 712111618,
      "pullRequestNo": 464
    },
    {
      "name": "jazzido",
      "id": 27584,
      "comment_id": 2610428000,
      "created_at": "2025-01-23T17:01:02Z",
      "repoId": 712111618,
      "pullRequestNo": 502
    },
    {
      "name": "tagliala",
      "id": 556268,
      "comment_id": 2614522545,
      "created_at": "2025-01-26T17:44:13Z",
      "repoId": 712111618,
      "pullRequestNo": 507
    },
    {
      "name": "conjuncts",
      "id": 67614673,
      "comment_id": 2646947362,
      "created_at": "2025-02-10T05:22:00Z",
      "repoId": 712111618,
      "pullRequestNo": 533
    },
    {
      "name": "sleiss",
      "id": 5084100,
      "comment_id": 2661051259,
      "created_at": "2025-02-15T19:11:32Z",
      "repoId": 712111618,
      "pullRequestNo": 552
    },
    {
      "name": "dantetemplar",
      "id": 69670642,
      "comment_id": 2661665606,
      "created_at": "2025-02-16T23:02:34Z",
      "repoId": 712111618,
      "pullRequestNo": 555
    },
    {
      "name": "vicenciomf2",
      "id": 127889973,
      "comment_id": 2676007412,
      "created_at": "2025-02-22T04:34:27Z",
      "repoId": 712111618,
      "pullRequestNo": 574
    },
    {
      "name": "denisshepelin",
      "id": 143056,
      "comment_id": 2694298068,
      "created_at": "2025-03-03T12:50:11Z",
      "repoId": 712111618,
      "pullRequestNo": 592
    },
    {
      "name": "lecifire",
      "id": 43193080,
      "comment_id": 2735460895,
      "created_at": "2025-03-19T06:25:13Z",
      "repoId": 712111618,
      "pullRequestNo": 609
    },
    {
      "name": "eddiedozier",
      "id": 9558671,
      "comment_id": 2774087436,
      "created_at": "2025-04-03T01:14:41Z",
      "repoId": 712111618,
      "pullRequestNo": 646
    },
    {
      "name": "MauritsBrinkman",
      "id": 153435669,
      "comment_id": 2838830143,
      "created_at": "2025-04-29T13:10:38Z",
      "repoId": 712111618,
      "pullRequestNo": 675
    },
    {
      "name": "jacksontromero",
      "id": 42822986,
      "comment_id": 2927811267,
      "created_at": "2025-06-01T20:10:35Z",
      "repoId": 712111618,
      "pullRequestNo": 721
    },
    {
      "name": "rgeorgi",
      "id": 805862,
      "comment_id": 2968063099,
      "created_at": "2025-06-12T20:23:50Z",
      "repoId": 712111618,
      "pullRequestNo": 751
    },
    {
      "name": "zanussbaum",
      "id": 33707069,
      "comment_id": 3006162493,
      "created_at": "2025-06-25T21:12:25Z",
      "repoId": 712111618,
      "pullRequestNo": 769
    },
    {
      "name": "runarmod",
      "id": 40912933,
      "comment_id": 3007582739,
      "created_at": "2025-06-26T08:06:45Z",
      "repoId": 712111618,
      "pullRequestNo": 772
    },
    {
      "name": "ArnoKlein",
      "id": 64474825,
      "comment_id": 3044904172,
      "created_at": "2025-07-07T12:41:57Z",
      "repoId": 712111618,
      "pullRequestNo": 793
    },
    {
      "name": "voberoi",
      "id": 26339,
      "comment_id": 3054261379,
      "created_at": "2025-07-09T22:17:58Z",
      "repoId": 712111618,
      "pullRequestNo": 796
    },
    {
      "name": "MrWaradana",
      "id": 77272854,
      "comment_id": 3060610542,
      "created_at": "2025-07-11T05:30:30Z",
      "repoId": 712111618,
      "pullRequestNo": 799
    },
    {
      "name": "schen1102",
      "id": 221293366,
      "comment_id": 3083001869,
      "created_at": "2025-07-17T07:50:39Z",
      "repoId": 712111618,
      "pullRequestNo": 810
    },
    {
      "name": "igormf",
      "id": 8070635,
      "comment_id": 3156380190,
      "created_at": "2025-08-05T19:40:17Z",
      "repoId": 712111618,
      "pullRequestNo": 823
    },
    {
      "name": "muratcanlaloglu",
      "id": 71566447,
      "comment_id": 3189653464,
      "created_at": "2025-08-14T19:34:21Z",
      "repoId": 712111618,
      "pullRequestNo": 833
    },
    {
      "name": "SageStack",
      "id": 124574940,
      "comment_id": 3192827940,
      "created_at": "2025-08-15T21:40:27Z",
      "repoId": 712111618,
      "pullRequestNo": 837
    },
    {
      "name": "dowithless",
      "id": 165774507,
      "comment_id": 3192986184,
      "created_at": "2025-08-15T23:20:17Z",
      "repoId": 712111618,
      "pullRequestNo": 839
    },
    {
      "name": "blap",
      "id": 913963,
      "comment_id": 3036951999,
      "created_at": "2025-07-04T17:42:05Z",
      "repoId": 712111618,
      "pullRequestNo": 789
    },
    {
      "name": "sandy0kwon",
      "id": 78377296,
      "comment_id": 3207667429,
      "created_at": "2025-08-20T18:59:07Z",
      "repoId": 712111618,
      "pullRequestNo": 843
    },
    {
      "name": "TwinkleXD",
      "id": 68148987,
      "comment_id": 3218116522,
      "created_at": "2025-08-24T13:49:14Z",
      "repoId": 712111618,
      "pullRequestNo": 849
    },
    {
      "name": "u-ashish",
      "id": 14264791,
      "comment_id": 3221349115,
      "created_at": "2025-08-25T18:41:28Z",
      "repoId": 712111618,
      "pullRequestNo": 850
    },
    {
      "name": "EdmondChuiHW",
      "id": 1967998,
      "comment_id": 3254531992,
      "created_at": "2025-09-04T16:30:48Z",
      "repoId": 712111618,
      "pullRequestNo": 869
    },
    {
      "name": "himanshu-josh",
      "id": 216880840,
      "comment_id": 3311497351,
      "created_at": "2025-09-19T09:48:31Z",
      "repoId": 712111618,
      "pullRequestNo": 891
    },
    {
      "name": "aud",
      "id": 13060190,
      "comment_id": 3331451358,
      "created_at": "2025-09-25T02:21:46Z",
      "repoId": 712111618,
      "pullRequestNo": 895
    },
    {
      "name": "Tenkeboks",
      "id": 198454519,
      "comment_id": 3338897497,
      "created_at": "2025-09-26T14:05:43Z",
      "repoId": 712111618,
      "pullRequestNo": 897
    },
    {
      "name": "ozzzzz",
      "id": 5720160,
      "comment_id": 3363582442,
      "created_at": "2025-10-02T23:18:12Z",
      "repoId": 712111618,
      "pullRequestNo": 905
    },
    {
      "name": "Vidarshana26204",
      "id": 36869394,
      "comment_id": 3394829315,
      "created_at": "2025-10-12T16:26:13Z",
      "repoId": 712111618,
      "pullRequestNo": 916
    }
  ]
}
```

--------------------------------------------------------------------------------
/marker/renderers/markdown.py:
--------------------------------------------------------------------------------

```python
import re
from collections import defaultdict
from typing import Annotated, Tuple

import regex
import six
from bs4 import NavigableString
from markdownify import MarkdownConverter, re_whitespace
from marker.logger import get_logger
from pydantic import BaseModel

from marker.renderers.html import HTMLRenderer
from marker.schema import BlockTypes
from marker.schema.document import Document

logger = get_logger()


def escape_dollars(text):
    return text.replace("$", r"\$")


def cleanup_text(full_text):
    full_text = re.sub(r"\n{3,}", "\n\n", full_text)
    full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text)
    return full_text.strip()


def get_formatted_table_text(element):
    text = []
    for content in element.contents:
        if content is None:
            continue

        if isinstance(content, NavigableString):
            stripped = content.strip()
            if stripped:
                text.append(escape_dollars(stripped))
        elif content.name == "br":
            text.append("<br>")
        elif content.name == "math":
            text.append("$" + content.text + "$")
        else:
            content_str = escape_dollars(str(content))
            text.append(content_str)

    full_text = ""
    for i, t in enumerate(text):
        if t == "<br>":
            full_text += t
        elif i > 0 and text[i - 1] != "<br>":
            full_text += " " + t
        else:
            full_text += t
    return full_text


class Markdownify(MarkdownConverter):
    def __init__(
        self,
        paginate_output,
        page_separator,
        inline_math_delimiters,
        block_math_delimiters,
        html_tables_in_markdown,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.paginate_output = paginate_output
        self.page_separator = page_separator
        self.inline_math_delimiters = inline_math_delimiters
        self.block_math_delimiters = block_math_delimiters
        self.html_tables_in_markdown = html_tables_in_markdown

    def convert_div(self, el, text, parent_tags):
        is_page = el.has_attr("class") and el["class"][0] == "page"
        if self.paginate_output and is_page:
            page_id = el["data-page-id"]
            pagination_item = (
                "\n\n" + "{" + str(page_id) + "}" + self.page_separator + "\n\n"
            )
            return pagination_item + text
        else:
            return text

    def convert_p(self, el, text, parent_tags):
        hyphens = r"-—¬"
        has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
        if has_continuation:
            block_type = BlockTypes[el["block-type"]]
            if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
                if regex.compile(
                    rf".*[\p{{Ll}}|\d][{hyphens}]\s?$", regex.DOTALL
                ).match(text):  # handle hyphenation across pages
                    return regex.split(rf"[{hyphens}]\s?$", text)[0]
                return f"{text} "
            if block_type == BlockTypes.ListGroup:
                return f"{text}"
        return f"{text}\n\n" if text else ""  # default convert_p behavior

    def convert_math(self, el, text, parent_tags):
        block = el.has_attr("display") and el["display"] == "block"
        if block:
            return (
                "\n"
                + self.block_math_delimiters[0]
                + text.strip()
                + self.block_math_delimiters[1]
                + "\n"
            )
        else:
            return (
                " "
                + self.inline_math_delimiters[0]
                + text.strip()
                + self.inline_math_delimiters[1]
                + " "
            )
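    # Illustrative note: with the default delimiters, an inline <math> node
    # renders as " $x^2$ " (space-padded), while <math display="block">
    # renders as "\n$$x^2$$\n" on its own line.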

    def convert_table(self, el, text, parent_tags):
        if self.html_tables_in_markdown:
            return "\n\n" + str(el) + "\n\n"

        total_rows = len(el.find_all("tr"))
        colspans = []
        rowspan_cols = defaultdict(int)
        for i, row in enumerate(el.find_all("tr")):
            row_cols = rowspan_cols[i]
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                row_cols += colspan
                for r in range(int(cell.get("rowspan", 1)) - 1):
                    rowspan_cols[i + r] += (
                        colspan  # Add the colspan to the next rows, so they get the correct number of columns
                    )
            colspans.append(row_cols)
        total_cols = max(colspans) if colspans else 0

        grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]

        for row_idx, tr in enumerate(el.find_all("tr")):
            col_idx = 0
            for cell in tr.find_all(["td", "th"]):
                # Skip filled positions
                while col_idx < total_cols and grid[row_idx][col_idx] is not None:
                    col_idx += 1

                # Fill in grid
                value = (
                    get_formatted_table_text(cell)
                    .replace("\n", " ")
                    .replace("|", " ")
                    .strip()
                )
                rowspan = int(cell.get("rowspan", 1))
                colspan = int(cell.get("colspan", 1))

                if col_idx >= total_cols:
                    # Skip this cell if we're out of bounds
                    continue

                for r in range(rowspan):
                    for c in range(colspan):
                        try:
                            if r == 0 and c == 0:
                                grid[row_idx][col_idx] = value
                            else:
                                grid[row_idx + r][col_idx + c] = (
                                    ""  # Empty cell due to rowspan/colspan
                                )
                        except IndexError:
                            # Sometimes the colspan/rowspan predictions can overflow
                            logger.info(
                                f"Overflow in columns: {col_idx + c} >= {total_cols} or rows: {row_idx + r} >= {total_rows}"
                            )
                            continue

                col_idx += colspan

        markdown_lines = []
        col_widths = [0] * total_cols
        for row in grid:
            for col_idx, cell in enumerate(row):
                if cell is not None:
                    col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))

        def add_header_line():
            markdown_lines.append(
                "|" + "|".join("-" * (width + 2) for width in col_widths) + "|"
            )

        # Generate markdown rows
        added_header = False
        for i, row in enumerate(grid):
            is_empty_line = all(not cell for cell in row)
            if is_empty_line and not added_header:
                # Skip leading blank lines
                continue

            line = []
            for col_idx, cell in enumerate(row):
                if cell is None:
                    cell = ""
                padding = col_widths[col_idx] - len(str(cell))
                line.append(f" {cell}{' ' * padding} ")
            markdown_lines.append("|" + "|".join(line) + "|")

            if not added_header:
                # Skip empty lines when adding the header row
                add_header_line()
                added_header = True

        # Handle one row tables
        if total_rows == 1:
            add_header_line()

        table_md = "\n".join(markdown_lines)
        return "\n\n" + table_md + "\n\n"

    def convert_a(self, el, text, parent_tags):
        text = self.escape(text)
        # Escape brackets and parentheses in text
        text = re.sub(r"([\[\]()])", r"\\\1", text)
        return super().convert_a(el, text, parent_tags)

    def convert_span(self, el, text, parent_tags):
        if el.get("id"):
            return f'<span id="{el["id"]}">{text}</span>'
        else:
            return text

    def escape(self, text, parent_tags=None):
        text = super().escape(text, parent_tags)
        if self.options["escape_dollars"]:
            text = text.replace("$", r"\$")
        return text

    def process_text(self, el, parent_tags=None):
        text = six.text_type(el) or ""

        # normalize whitespace if we're not inside a preformatted element
        if not el.find_parent("pre"):
            text = re_whitespace.sub(" ", text)

        # escape special characters if we're not inside a preformatted or code element
        if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if el.parent.name == "li" and (
            not el.next_sibling or el.next_sibling.name in ["ul", "ol"]
        ):
            text = text.rstrip()

        return text


class MarkdownOutput(BaseModel):
    markdown: str
    images: dict
    metadata: dict


class MarkdownRenderer(HTMLRenderer):
    page_separator: Annotated[
        str, "The separator to use between pages.", "Default is '-' * 48."
    ] = "-" * 48
    inline_math_delimiters: Annotated[
        Tuple[str, str], "The delimiters to use for inline math."
    ] = ("$", "$")
    block_math_delimiters: Annotated[
        Tuple[str, str], "The delimiters to use for block math."
    ] = ("$$", "$$")
    html_tables_in_markdown: Annotated[
        bool, "Return tables formatted as HTML, instead of in markdown"
    ] = False

    @property
    def md_cls(self):
        return Markdownify(
            self.paginate_output,
            self.page_separator,
            heading_style="ATX",
            bullets="-",
            escape_misc=False,
            escape_underscores=True,
            escape_asterisks=True,
            escape_dollars=True,
            sub_symbol="<sub>",
            sup_symbol="<sup>",
            inline_math_delimiters=self.inline_math_delimiters,
            block_math_delimiters=self.block_math_delimiters,
            html_tables_in_markdown=self.html_tables_in_markdown
        )

    def __call__(self, document: Document) -> MarkdownOutput:
        document_output = document.render(self.block_config)
        full_html, images = self.extract_html(document, document_output)
        markdown = self.md_cls.convert(full_html)
        markdown = cleanup_text(markdown)

        # Ensure we set the correct blanks for pagination markers
        if self.paginate_output:
            if not markdown.startswith("\n\n"):
                markdown = "\n\n" + markdown
            if markdown.endswith(self.page_separator):
                markdown += "\n\n"

        return MarkdownOutput(
            markdown=markdown,
            images=images,
            metadata=self.generate_document_metadata(document, document_output),
        )

```
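
As a sanity check on the grid-filling logic in `convert_table`, here is a stripped-down sketch of the same expansion on a toy two-by-two table whose first cell spans both rows (cell values invented):

```python
# Dense-grid expansion in the spirit of convert_table: spanned positions are
# filled with "" placeholders so later cells skip past them.
# Each cell is (value, rowspan, colspan).
rows = [
    [("a", 2, 1), ("b", 1, 1)],
    [("c", 1, 1)],
]
total_rows, total_cols = 2, 2
grid = [[None] * total_cols for _ in range(total_rows)]

for row_idx, row in enumerate(rows):
    col_idx = 0
    for value, rowspan, colspan in row:
        # Skip positions already claimed by an earlier rowspan
        while col_idx < total_cols and grid[row_idx][col_idx] is not None:
            col_idx += 1
        for r in range(rowspan):
            for c in range(colspan):
                grid[row_idx + r][col_idx + c] = value if r == c == 0 else ""
        col_idx += colspan

print(grid)  # [['a', 'b'], ['', 'c']]
```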

--------------------------------------------------------------------------------
/marker/processors/llm/llm_page_correction.py:
--------------------------------------------------------------------------------

```python
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Annotated

from marker.logger import get_logger
from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from pydantic import BaseModel
from tqdm import tqdm

logger = get_logger()

FORMAT_TAGS = ["b", "i", "u", "del", "math", "sub", "sup", "a", "code", "p", "img"]
BLOCK_MAP = {
    "Text": [],
    "TextInlineMath": [],
    "Table": ["table", "tbody", "tr", "td", "th"],
    "ListGroup": ["ul", "li"],
    "SectionHeader": [],
    "Form": ["form", "input", "select", "textarea", "table", "tbody", "tr", "td", "th"],
    "Figure": [],
    "Picture": [],
    "Code": ["pre"],
    "TableOfContents": ["table", "tbody", "tr", "td", "th"],
}
ALL_TAGS = FORMAT_TAGS + [tag for tags in BLOCK_MAP.values() for tag in tags]


class LLMPageCorrectionProcessor(BaseLLMComplexBlockProcessor):
    block_correction_prompt: Annotated[
        str, "The user prompt to guide the block correction process."
    ] = None
    default_user_prompt = """Your goal is to reformat the blocks to be as correct as possible, without changing the underlying meaning of the text within the blocks.  Mostly focus on reformatting the content.  Ignore minor formatting issues like extra <i> tags."""
    page_prompt = """You're a text correction expert specializing in accurately reproducing text from PDF pages. You will be given a JSON list of blocks on a PDF page, along with the image for that page.  The blocks will be formatted like the example below.  The blocks will be presented in reading order.

```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/1",
        "block_type": "Text",
        "html": "<p>Some text here</p>",
    }, ...
]
```

You will also be given a prompt from the user that tells you how to correct the blocks.  Your task is to analyze the blocks and the image, then follow the prompt to correct the blocks.

Here are the types of changes you can make in response to the prompt:

- Reorder the blocks to reflect the correct reading order.
- Change the block type to the correct type - the potential types are "SectionHeader", "Form", "Text", "Table", "Figure", "Picture", "ListGroup", "PageFooter", "PageHeader", "Footnote", or "Equation".  In this case, update the html as well to match the new block type.
- Make edits to block content by changing the HTML.

Guidelines:
- Only use the following tags: {{format_tags}}.  Do not use any other tags.  
- The math tag can have the attribute `display="block"` to indicate display math, the a tag can have the attribute `href="..."` to indicate a link, and td and th tags can have the attribute `colspan="..."` and `rowspan="..."` to indicate table cells that span multiple columns or rows.  There can be a "block-type" attribute on p tags.  Do not use any other attributes.
- Keep LaTeX formulas inside <math> tags - these are important for downstream processing.
- Bboxes are normalized 0-1000
- The order of the JSON list is the reading order for the blocks
- Follow the user prompt faithfully, and only make additional changes if there is a significant issue with correctness.
- Stay faithful to the original image, and do not insert any content that is not present in the image or the blocks, unless specifically requested by the user prompt.

**Instructions:**
1. Carefully examine the provided JSON representation of the page, along with the image.
2. Analyze the user prompt.
3. Identify any issues you'll need to fix, and write a short analysis.
4. If everything is fine, output "no_corrections".  Otherwise, output the type of correction needed: ["reorder", "rewrite", "reorder_first"].  Rewrite includes rewriting html and changing the block type.  If you need to do both, then perform only the reordering, and output "reorder_first", so we can do the rewriting later.
5. If corrections are needed, output any blocks that need updates:
    a. If reading order needs to be changed, output the IDs of the blocks in the correct order, and keep block_type and html blank, like this:
    ```json
    [
        {
            "id": "/page/0/Text/1",
            "block_type": "",
            "html": ""
        },
        ...
    ]
    ```

    b. If blocks need to be rewritten, output the block ids and new HTML for the blocks, like this:
        ```json
        [
            {
                "id": "/page/0/Text/1",
                "block_type": "Text",
                "html": "<p>New HTML content here</p>"
            },
            ...
        ]
        ```

**Example:**
Input:
Blocks
```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/1",
        "block_type": "Text",
        "html": "1.14 Vector Operations",
    },
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/2",
        "block_type": "Text",
        "html": "<p>You can perform many operations on a vector, including...</p>",
    },
]
```
User Prompt
Ensure that all blocks have the correct labels, and that reading order is correct.
Output:
Analysis: The blocks are in the correct reading order, but the first block should actually be a SectionHeader.
```json
[
    {
        "id": "/page/0/Text/1",
        "block_type": "SectionHeader",
        "html": "<h1>1.14 Vector Operations</h1>"
    }
]
```

**Input:**
Blocks
```json
{{page_json}}
```
User Prompt
{{user_prompt}}
"""

    def get_selected_blocks(
        self,
        document: Document,
        page: PageGroup,
    ) -> List[dict]:
        selected_blocks = page.structure_blocks(document)
        json_blocks = [
            self.normalize_block_json(block, document, page)
            for i, block in enumerate(selected_blocks)
        ]
        return json_blocks

    def process_rewriting(self, document: Document, page1: PageGroup):
        page_blocks = self.get_selected_blocks(document, page1)
        image = page1.get_image(document, highres=False)

        prompt = (
            self.page_prompt.replace("{{page_json}}", json.dumps(page_blocks))
            .replace("{{format_tags}}", json.dumps(ALL_TAGS))
            .replace("{{user_prompt}}", self.block_correction_prompt)
        )
        response = self.llm_service(prompt, image, page1, PageSchema)
        logger.debug(f"Got response from LLM: {response}")

        if not response or "correction_type" not in response:
            logger.warning("LLM did not return a valid response")
            return

        correction_type = response["correction_type"]
        if correction_type == "no_corrections":
            return
        elif correction_type in ["reorder", "reorder_first"]:
            self.load_blocks(response)
            self.handle_reorder(response["blocks"], page1)

            # If we needed to reorder first, we will handle the rewriting next
            if correction_type == "reorder_first":
                self.process_rewriting(document, page1)
        elif correction_type == "rewrite":
            self.load_blocks(response)
            self.handle_rewrites(response["blocks"], document)
        else:
            logger.warning(f"Unknown correction type: {correction_type}")
            return

    def load_blocks(self, response):
        if isinstance(response["blocks"], str):
            response["blocks"] = json.loads(response["blocks"])

    def handle_reorder(self, blocks: list, page1: PageGroup):
        unique_page_ids = set()
        document_page_ids = [str(page1.page_id)]
        document_pages = [page1]

        for block_data in blocks:
            try:
                _, page_id, _, _ = block_data["id"].strip().lstrip("/").split("/")
                unique_page_ids.add(page_id)
            except Exception as e:
                logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                continue

        if set(document_page_ids) != unique_page_ids:
            logger.debug(
                "Some page IDs in the response do not match the document's pages"
            )
            return

        for page_id, document_page in zip(unique_page_ids, document_pages):
            block_ids_for_page = []
            for block_data in blocks:
                try:
                    _, page_id, block_type, block_id = (
                        block_data["id"].strip().lstrip("/").split("/")
                    )
                    block_id = BlockId(
                        page_id=page_id,
                        block_id=block_id,
                        block_type=getattr(BlockTypes, block_type),
                    )
                    block_ids_for_page.append(block_id)
                except Exception as e:
                    logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                    continue

            # Both sides should have the same values, just be reordered; run the
            # checks once all block IDs for the page have been collected
            if not all(
                [
                    block_id in document_page.structure
                    for block_id in block_ids_for_page
                ]
            ):
                logger.debug(
                    f"Some blocks for page {page_id} not found in document"
                )
                continue

            if not all(
                [
                    block_id in block_ids_for_page
                    for block_id in document_page.structure
                ]
            ):
                logger.debug(
                    f"Some blocks in document page {page_id} not found in response"
                )
                continue

            # Swap the order of blocks in the document page
            document_page.structure = block_ids_for_page

    def handle_rewrites(self, blocks: list, document: Document):
        for block_data in blocks:
            try:
                block_id = block_data["id"].strip().lstrip("/")
                _, page_id, block_type, block_id = block_id.split("/")
                block_id = BlockId(
                    page_id=page_id,
                    block_id=block_id,
                    block_type=getattr(BlockTypes, block_type),
                )
                block = document.get_block(block_id)
                if not block:
                    logger.debug(f"Block {block_id} not found in document")
                    continue

                if hasattr(block, "html"):
                    block.html = block_data["html"]
            except Exception as e:
                logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                continue

    def rewrite_blocks(self, document: Document):
        if not self.block_correction_prompt:
            return

        # Don't show progress if there are no blocks to process
        total_blocks = len(document.pages)
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=total_blocks,
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm,
        )

        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in as_completed(
                [
                    executor.submit(self.process_rewriting, document, page)
                    for page in document.pages
                ]
            ):
                future.result()  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()


class BlockSchema(BaseModel):
    id: str
    html: str
    block_type: str


class PageSchema(BaseModel):
    analysis: str
    correction_type: str
    blocks: List[BlockSchema]

```
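
Block IDs arrive as paths like `/page/0/Text/1`, where a bare `split("/")` keeps a leading empty component; the parsing in `handle_reorder` and `handle_rewrites` strips the leading slash first, as this quick sketch shows:

```python
# Parsing the "/page/<page_id>/<block_type>/<block_id>" form used by BlockId.
raw = "/page/0/Text/1"
_, page_id, block_type, block_id = raw.strip().lstrip("/").split("/")
print(page_id, block_type, block_id)  # 0 Text 1

# A naive split keeps the leading empty component and breaks the unpacking:
print(raw.split("/"))  # ['', 'page', '0', 'Text', '1']
```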

--------------------------------------------------------------------------------
/marker/schema/blocks/base.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, Tuple

from pydantic import BaseModel, ConfigDict, field_validator
from PIL import Image

from marker.schema import BlockTypes
from marker.schema.polygon import PolygonBox

if TYPE_CHECKING:
    from marker.schema.document import Document
    from marker.schema.groups.page import PageGroup


class BlockMetadata(BaseModel):
    llm_request_count: int = 0
    llm_error_count: int = 0
    llm_tokens_used: int = 0
    previous_text: str = ""
    previous_type: str = ""
    previous_order: int = 0

    def merge(self, model2):
        return self.__class__(
            **{
                field: getattr(self, field) + getattr(model2, field)
                for field in self.model_fields
            }
        )


class BlockOutput(BaseModel):
    html: str
    polygon: PolygonBox
    id: BlockId
    children: List[BlockOutput] | None = None
    section_hierarchy: Dict[int, BlockId] | None = None


class BlockId(BaseModel):
    page_id: int
    block_id: Optional[int] = None
    block_type: BlockTypes | None = None

    def __str__(self):
        if self.block_type is None or self.block_id is None:
            return f"/page/{self.page_id}"
        return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"

    def __hash__(self):
        return hash(str(self))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        if not isinstance(other, (BlockId, str)):
            return NotImplemented

        if isinstance(other, str):
            return str(self) == other
        else:
            return (
                self.page_id == other.page_id
                and self.block_id == other.block_id
                and self.block_type == other.block_type
            )

    @field_validator("block_type")
    @classmethod
    def validate_block_type(cls, v):
        from marker.schema import BlockTypes

        if v not in BlockTypes:
            raise ValueError(f"Invalid block type: {v}")
        return v

    def to_path(self):
        return str(self).replace("/", "_")
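
# Illustrative examples (not from the original file): with the classes above,
# str(BlockId(page_id=0, block_type=BlockTypes.Text, block_id=1)) yields
# "/page/0/Text/1", str(BlockId(page_id=0)) yields "/page/0", and
# to_path() maps "/page/0/Text/1" to "_page_0_Text_1".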


class Block(BaseModel):
    polygon: PolygonBox
    block_description: str
    block_type: Optional[BlockTypes] = None
    block_id: Optional[int] = None
    page_id: Optional[int] = None
    text_extraction_method: Optional[Literal["pdftext", "surya", "gemini"]] = None
    structure: List[BlockId] | None = (
        None  # The top-level page structure, which is the block ids in order
    )
    ignore_for_output: bool = False  # Whether this block should be ignored in output
    replace_output_newlines: bool = (
        False  # Whether to replace newlines with spaces in output
    )
    source: Literal["layout", "heuristics", "processor"] = "layout"
    top_k: Optional[Dict[BlockTypes, float]] = None
    metadata: BlockMetadata | None = None
    lowres_image: Image.Image | None = None
    highres_image: Image.Image | None = None
    removed: bool = False  # Has block been replaced by new block?
    _metadata: Optional[dict] = None

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def id(self) -> BlockId:
        return BlockId(
            page_id=self.page_id, block_id=self.block_id, block_type=self.block_type
        )

    @classmethod
    def from_block(cls, block: Block) -> Block:
        block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
        return cls(**block_attrs)

    def set_internal_metadata(self, key, data):
        if self._metadata is None:
            self._metadata = {}
        self._metadata[key] = data

    def get_internal_metadata(self, key):
        if self._metadata is None:
            return None
        return self._metadata.get(key)

    def get_image(
        self,
        document: Document,
        highres: bool = False,
        expansion: Tuple[float, float] | None = None,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image | None:
        image = self.highres_image if highres else self.lowres_image
        if image is None:
            page = document.get_page(self.page_id)
            page_image = page.get_image(highres=highres, remove_blocks=remove_blocks)

            # Scale to the image size
            bbox = self.polygon.rescale(
                (page.polygon.width, page.polygon.height), page_image.size
            )
            if expansion:
                bbox = bbox.expand(*expansion)
            bbox = bbox.bbox
            image = page_image.crop(bbox)
        return image

    def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
        if self.structure is None:
            return []
        return [document_page.get_block(block_id) for block_id in self.structure]

    def get_prev_block(
        self,
        document_page: Document | PageGroup,
        block: Block,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = self.structure.index(block.id)
        if structure_idx == 0:
            return None

        for prev_block_id in reversed(self.structure[:structure_idx]):
            if prev_block_id.block_type not in ignored_block_types:
                return document_page.get_block(prev_block_id)

    def get_next_block(
        self,
        document_page: Document | PageGroup,
        block: Optional[Block] = None,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = 0
        if block is not None:
            structure_idx = self.structure.index(block.id) + 1

        for next_block_id in self.structure[structure_idx:]:
            if next_block_id.block_type not in ignored_block_types:
                return document_page.get_block(next_block_id)

        return None  # No valid next block found

    def add_structure(self, block: Block):
        if self.structure is None:
            self.structure = [block.id]
        else:
            self.structure.append(block.id)

    def update_structure_item(self, old_id: BlockId, new_id: BlockId):
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == old_id:
                    self.structure[i] = new_id
                    break

    def remove_structure_items(self, block_ids: List[BlockId]):
        if self.structure is not None:
            self.structure = [item for item in self.structure if item not in block_ids]

    def raw_text(self, document: Document) -> str:
        from marker.schema.text.line import Line
        from marker.schema.text.span import Span
        from marker.schema.blocks.tablecell import TableCell

        if self.structure is None:
            if isinstance(self, (Span, TableCell)):
                return self.text
            else:
                return ""

        text = ""
        for block_id in self.structure:
            block = document.get_block(block_id)
            text += block.raw_text(document)
            if isinstance(block, Line) and not text.endswith("\n"):
                text += "\n"
        return text

    def assemble_html(
        self,
        document: Document,
        child_blocks: List[BlockOutput],
        parent_structure: Optional[List[str]] = None,
        block_config: Optional[dict] = None,
    ) -> str:
        if self.ignore_for_output:
            return ""

        template = ""
        for c in child_blocks:
            template += f"<content-ref src='{c.id}'></content-ref>"

        if self.replace_output_newlines:
            template = template.replace("\n", " ")
            template = "<p>" + template + "</p>"

        return template

    def assign_section_hierarchy(self, section_hierarchy):
        if self.block_type == BlockTypes.SectionHeader and self.heading_level:
            levels = list(section_hierarchy.keys())
            for level in levels:
                if level >= self.heading_level:
                    del section_hierarchy[level]
            section_hierarchy[self.heading_level] = self.id

        return section_hierarchy

    def contained_blocks(
        self, document: Document, block_types: Sequence[BlockTypes] = None
    ) -> List[Block]:
        if self.structure is None:
            return []

        blocks = []
        for block_id in self.structure:
            block = document.get_block(block_id)
            if block.removed:
                continue
            if block_types is None or block.block_type in block_types:
                blocks.append(block)
            blocks += block.contained_blocks(document, block_types)
        return blocks

    def replace_block(self, block: Block, new_block: Block):
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == block.id:
                    self.structure[i] = new_block.id
                    break

    def render(
        self,
        document: Document,
        parent_structure: Optional[List[str]] = None,
        section_hierarchy: dict | None = None,
        block_config: Optional[dict] = None,
    ) -> BlockOutput:
        if block_config is None:
            block_config = {}

        child_content = []
        if section_hierarchy is None:
            section_hierarchy = {}
        section_hierarchy = self.assign_section_hierarchy(section_hierarchy)

        if self.structure is not None and len(self.structure) > 0:
            for block_id in self.structure:
                block = document.get_block(block_id)
                rendered = block.render(
                    document, self.structure, section_hierarchy, block_config
                )
                section_hierarchy = (
                    rendered.section_hierarchy.copy()
                )  # Update the section hierarchy from the peer blocks
                child_content.append(rendered)

        return BlockOutput(
            html=self.assemble_html(
                document, child_content, parent_structure, block_config
            ),
            polygon=self.polygon,
            id=self.id,
            children=child_content,
            section_hierarchy=section_hierarchy,
        )

    def line_height(self, document: Document) -> float:
        lines = self.contained_blocks(document, (BlockTypes.Line,))
        if len(lines) == 0:
            return 0
        return self.polygon.height / len(lines)

    def update_metadata(self, **kwargs):
        if self.metadata is None:
            self.metadata = BlockMetadata()

        for key, value in kwargs.items():
            metadata_attr = getattr(self.metadata, key)
            if isinstance(metadata_attr, int) and isinstance(value, int):
                setattr(self.metadata, key, metadata_attr + value)
            elif isinstance(metadata_attr, str) and isinstance(value, str):
                setattr(self.metadata, key, value)
            else:
                raise ValueError(f"Metadata attribute {key} is not an integer")

    def handle_html_output(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if block_config is None:
            block_config = {}

        child_ref_blocks = [
            block
            for block in child_blocks
            if block.id.block_type == BlockTypes.Reference
        ]
        html = Block.assemble_html(
            self, document, child_ref_blocks, parent_structure, block_config
        )
        return html + self.html

```
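
A quick way to see the traversal APIs above working together: `contained_blocks` walks the `structure` tree while skipping removed blocks, and `raw_text` concatenates the text of the leaves. A minimal sketch, assuming `document` is an already-assembled `Document` (the function name is illustrative):

```python
from marker.schema import BlockTypes


def print_section_headers(document):
    # Walk every page and print each section header alongside its text,
    # using Block.contained_blocks and Block.raw_text from above.
    for page in document.pages:
        for header in page.contained_blocks(document, (BlockTypes.SectionHeader,)):
            print(header.id, header.raw_text(document).strip())
```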

--------------------------------------------------------------------------------
/marker/processors/llm/llm_table.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, List, Tuple

from bs4 import BeautifulSoup
from PIL import Image
from marker.logger import get_logger
from pydantic import BaseModel

from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block, TableCell, Table
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.polygon import PolygonBox

logger = get_logger()


class LLMTableProcessor(BaseLLMComplexBlockProcessor):
    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Table, BlockTypes.TableOfContents)
    max_rows_per_batch: Annotated[
        int,
        "If the table has more rows than this, chunk the table. (LLMs can be inaccurate with a lot of rows)",
    ] = 60
    max_table_rows: Annotated[
        int,
        "The maximum number of rows in a table to process with the LLM processor.  Beyond this will be skipped.",
    ] = 175
    table_image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0
    rotation_max_wh_ratio: Annotated[
        float,
        "The maximum width/height ratio for table cells for a table to be considered rotated.",
    ] = 0.6
    max_table_iterations: Annotated[
        int,
        "The maximum number of iterations to attempt rewriting a table.",
    ] = 2
    table_rewriting_prompt: Annotated[
        str,
        "The prompt to use for rewriting text.",
        "Default is a string containing the Gemini rewriting prompt.",
    ] = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image and an html representation of the table in the image.
Your task is to correct any errors in the html representation.  The html representation should be as faithful to the original table image as possible.  The table image may be rotated, but ensure the html representation is not rotated.  Make sure to include HTML for the full table, including the opening and closing table tags.

Some guidelines:
- Reproduce the original values from the image as faithfully as possible.  
- There may be stray characters in the html representation that don't match the image - fix these.
- Ensure column headers match the correct column values.
- If you see any inline math in a table cell, fence it with the <math> tag.  Block math should be fenced with <math display="block">.
- Replace any images in table cells with a description, like "Image: [description]".
- Only use the tags th, td, tr, br, span, sup, sub, i, b, math, and table.  Only use the attributes display, style, colspan, and rowspan if necessary.  You can use br to break up text lines in cells.
- Make sure the columns and rows match the image faithfully, and are easily readable and interpretable by a human.

**Instructions:**
1. Carefully examine the provided text block image.
2. Analyze the html representation of the table.
3. Write a comparison of the image and the html representation, paying special attention to the column headers matching the correct column values.
4. If the html representation is completely correct, or you cannot read the image properly, then write "No corrections needed."  If the html representation has errors, generate the corrected html representation.  Output only either the corrected html representation or "No corrections needed."
5. If you made corrections, analyze your corrections against the original image, and provide a score from 1-5, indicating how well the corrected html matches the image, with 5 being perfect.
**Example:**
Input:
```html
<table>
    <tr>
        <th>First Name</th>
        <th>Last Name</th>
        <th>Age</th>
    </tr>
    <tr>
        <td>John</td>
        <td>Doe</td>
    </tr>
</table>
```
Output:
comparison: The image shows a table with 2 rows and 3 columns.  The text and formatting of the html table matches the image.  The column headers match the correct column values.
```html
No corrections needed.
```
analysis: I did not make any corrections, as the html representation was already accurate.
score: 5
**Input:**
```html
{block_html}
```
"""

    def handle_image_rotation(self, children: List[TableCell], image: Image.Image):
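        # Heuristic: if every cell is much taller than it is wide, the table
        # image is probably rotated; the vertical offset between the first and
        # last columns decides whether to rotate 90 or 270 degrees back.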
        ratios = [c.polygon.width / c.polygon.height for c in children]
        if len(ratios) < 2:
            return image

        is_rotated = all([r < self.rotation_max_wh_ratio for r in ratios])
        if not is_rotated:
            return image

        first_col_id = min([c.col_id for c in children])
        first_col = [c for c in children if c.col_id == first_col_id]
        first_col_cell = first_col[0]

        last_col_id = max([c.col_id for c in children])
        if last_col_id == first_col_id:
            return image

        last_col_cell = [c for c in children if c.col_id == last_col_id][0]
        cell_diff = first_col_cell.polygon.y_start - last_col_cell.polygon.y_start
        if cell_diff == 0:
            return image

        if cell_diff > 0:
            return image.rotate(270, expand=True)
        else:
            return image.rotate(90, expand=True)

    def process_rewriting(self, document: Document, page: PageGroup, block: Table):
        children: List[TableCell] = block.contained_blocks(
            document, (BlockTypes.TableCell,)
        )
        if not children:
            # Happens if table/form processors didn't run
            return

        # LLMs don't handle tables with a lot of rows very well
        unique_rows = {cell.row_id for cell in children}
        row_count = len(unique_rows)
        row_idxs = sorted(unique_rows)

        if row_count > self.max_table_rows:
            return

        # Inference by chunk to handle long tables better
        parsed_cells = []
        row_shift = 0
        block_image = self.extract_image(document, block)
        block_rescaled_bbox = block.polygon.rescale(
            page.polygon.size, page.get_image(highres=True).size
        ).bbox
        for i in range(0, row_count, self.max_rows_per_batch):
            batch_row_idxs = row_idxs[i : i + self.max_rows_per_batch]
            batch_cells = [cell for cell in children if cell.row_id in batch_row_idxs]
            batch_cell_bboxes = [
                cell.polygon.rescale(
                    page.polygon.size, page.get_image(highres=True).size
                ).bbox
                for cell in batch_cells
            ]
            # bbox relative to the block
            batch_bbox = [
                min([bbox[0] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                min([bbox[1] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
                max([bbox[2] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                max([bbox[3] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
            ]
            if i == 0:
                # Ensure first image starts from the beginning
                batch_bbox[0] = 0
                batch_bbox[1] = 0
            elif i + self.max_rows_per_batch >= row_count:
                # Ensure final image grabs the entire height and width
                batch_bbox[2] = block_image.size[0]
                batch_bbox[3] = block_image.size[1]

            batch_image = block_image.crop(batch_bbox)
            block_html = block.format_cells(document, [], None, batch_cells)
            batch_image = self.handle_image_rotation(batch_cells, batch_image)
            batch_parsed_cells = self.rewrite_single_chunk(
                page, block, block_html, batch_cells, batch_image
            )
            if batch_parsed_cells is None:
                return  # Error occurred or no corrections needed

            for cell in batch_parsed_cells:
                cell.row_id += row_shift
                parsed_cells.append(cell)
            # Cells were shifted in place above, so this max is already absolute;
            # the next batch starts on the following row.
            row_shift = max([cell.row_id for cell in batch_parsed_cells]) + 1

        block.structure = []
        for cell in parsed_cells:
            page.add_full_block(cell)
            block.add_structure(cell)

    def rewrite_single_chunk(
        self,
        page: PageGroup,
        block: Block,
        block_html: str,
        children: List[TableCell],
        image: Image.Image,
        total_iterations: int = 0,
    ):
        prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)

        response = self.llm_service(prompt, image, block, TableSchema)

        if not response or "corrected_html" not in response:
            block.update_metadata(llm_error_count=1)
            return

        corrected_html = response["corrected_html"]

        # The original table is okay
        if "no corrections needed" in corrected_html.lower():
            return

        corrected_html = (
            corrected_html.strip()
            .removeprefix("```html")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )

        # Re-iterate if low score
        total_iterations += 1
        score = response.get("score", 5)
        analysis = response.get("analysis", "")
        logger.debug(f"Got table rewriting score {score} with analysis: {analysis}")
        if total_iterations < self.max_table_iterations and score < 4:
            logger.info(
                f"Table rewriting low score {score}, on iteration {total_iterations}"
            )
            block_html = corrected_html
            return self.rewrite_single_chunk(
                page, block, block_html, children, image, total_iterations
            )

        parsed_cells = self.parse_html_table(corrected_html, block, page)
        if len(parsed_cells) <= 1:
            block.update_metadata(llm_error_count=1)
            logger.debug(f"Table parsing issue, only {len(parsed_cells)} cells found")
            return

        if not corrected_html.endswith("</table>"):
            logger.debug(
                "Table parsing issue, corrected html does not end with </table>"
            )
            block.update_metadata(llm_error_count=1)
            return

        return parsed_cells

    @staticmethod
    def get_cell_text(element, keep_tags=("br", "i", "b", "span", "math")) -> str:
        for tag in element.find_all(True):
            if tag.name not in keep_tags:
                tag.unwrap()
        return element.decode_contents()

    def parse_html_table(
        self, html_text: str, block: Block, page: PageGroup
    ) -> List[TableCell]:
        soup = BeautifulSoup(html_text, "html.parser")
        table = soup.find("table")
        if not table:
            return []

        # Initialize grid
        rows = table.find_all("tr")
        cells = []

        # Find maximum number of columns in colspan-aware way
        max_cols = 0
        for row in rows:
            row_tds = row.find_all(["td", "th"])
            curr_cols = 0
            for cell in row_tds:
                colspan = int(cell.get("colspan", 1))
                curr_cols += colspan
            if curr_cols > max_cols:
                max_cols = curr_cols

        grid = [[True] * max_cols for _ in range(len(rows))]
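        # grid[r][c] is True while the slot is free; row/colspans below mark
        # the slots they occupy as False so later cells skip over them.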

        for i, row in enumerate(rows):
            cur_col = 0
            row_cells = row.find_all(["td", "th"])
            for j, cell in enumerate(row_cells):
                while cur_col < max_cols and not grid[i][cur_col]:
                    cur_col += 1

                if cur_col >= max_cols:
                    logger.info("Table parsing warning: too many columns found")
                    break

                cell_text = self.get_cell_text(cell).strip()
                rowspan = min(int(cell.get("rowspan", 1)), len(rows) - i)
                colspan = min(int(cell.get("colspan", 1)), max_cols - cur_col)
                cell_rows = list(range(i, i + rowspan))
                cell_cols = list(range(cur_col, cur_col + colspan))

                if colspan == 0 or rowspan == 0:
                    logger.info("Table parsing issue: invalid colspan or rowspan")
                    continue

                for r in cell_rows:
                    for c in cell_cols:
                        grid[r][c] = False

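                # The LLM's HTML carries no pixel geometry, so approximate the
                # cell bbox by offsetting the block origin by its grid indices.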
                cell_bbox = [
                    block.polygon.bbox[0] + cur_col,
                    block.polygon.bbox[1] + i,
                    block.polygon.bbox[0] + cur_col + colspan,
                    block.polygon.bbox[1] + i + rowspan,
                ]
                cell_polygon = PolygonBox.from_bbox(cell_bbox)

                cell_obj = TableCell(
                    text_lines=[cell_text],
                    row_id=i,
                    col_id=cur_col,
                    rowspan=rowspan,
                    colspan=colspan,
                    is_header=cell.name == "th",
                    polygon=cell_polygon,
                    page_id=page.page_id,
                )
                cells.append(cell_obj)
                cur_col += colspan

        return cells


class TableSchema(BaseModel):
    comparison: str
    corrected_html: str
    analysis: str
    score: int

```
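
As a side note, `get_cell_text` keeps only a small whitelist of tags and unwraps everything else, which is what lets the corrected HTML flow into `TableCell.text_lines` without arbitrary markup. A minimal standalone sketch of that behavior (only bs4 is exercised):

```python
from bs4 import BeautifulSoup

from marker.processors.llm.llm_table import LLMTableProcessor

html = "<td>Mass <i>m</i><sup>2</sup><br>in kg</td>"
cell = BeautifulSoup(html, "html.parser").find("td")
# <i> and <br> are in keep_tags and survive; <sup> is unwrapped, leaving
# only its text behind.
print(LLMTableProcessor.get_cell_text(cell))
# -> Mass <i>m</i>2<br/>in kg
```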

--------------------------------------------------------------------------------
/marker/schema/groups/page.py:
--------------------------------------------------------------------------------

```python
from collections import defaultdict
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np

from PIL import Image, ImageDraw

from pdftext.schema import Reference
from pydantic import computed_field

from marker.providers import ProviderOutput
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockId, Text
from marker.schema.blocks.base import BlockMetadata
from marker.schema.groups.base import Group
from marker.schema.polygon import PolygonBox
from marker.util import matrix_intersection_area, sort_text_lines

LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]]


class PageGroup(Group):
    block_type: BlockTypes = BlockTypes.Page
    # This is bytes if it is serialized
    lowres_image: Image.Image | None | bytes = None
    highres_image: Image.Image | None | bytes = None
    children: List[Union[Any, Block]] | None = None
    layout_sliced: bool = (
        False  # Whether the layout model had to slice the image (order may be wrong)
    )
    excluded_block_types: Sequence[BlockTypes] = (
        BlockTypes.Line,
        BlockTypes.Span,
    )
    maximum_assignment_distance: float = 20  # pixels
    block_description: str = "A single page in the document."
    refs: List[Reference] | None = None
    ocr_errors_detected: bool = False

    def incr_block_id(self):
        if self.block_id is None:
            self.block_id = 0
        else:
            self.block_id += 1

    def add_child(self, block: Block):
        if self.children is None:
            self.children = [block]
        else:
            self.children.append(block)

    def get_image(
        self,
        *args,
        highres: bool = False,
        remove_blocks: Sequence[BlockTypes] | None = None,
        **kwargs,
    ):
        image = self.highres_image if highres else self.lowres_image

        # Check if RGB, convert if needed
        if isinstance(image, Image.Image) and image.mode != "RGB":
            image = image.convert("RGB")

        # Avoid double OCR for certain elements
        if remove_blocks:
            image = image.copy()
            draw = ImageDraw.Draw(image)
            bad_blocks = [
                block
                for block in self.current_children
                if block.block_type in remove_blocks
            ]
            for bad_block in bad_blocks:
                poly = bad_block.polygon.rescale(self.polygon.size, image.size).polygon
                poly = [(int(p[0]), int(p[1])) for p in poly]
                draw.polygon(poly, fill="white")

        return image

    @computed_field
    @property
    def current_children(self) -> List[Block]:
        return [child for child in self.children if not child.removed]

    def get_next_block(
        self,
        block: Optional[Block] = None,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = 0
        if block is not None:
            structure_idx = self.structure.index(block.id) + 1

        # Iterate over blocks following the given block
        for next_block_id in self.structure[structure_idx:]:
            if next_block_id.block_type not in ignored_block_types:
                return self.get_block(next_block_id)

        return None  # No valid next block found

    def get_prev_block(self, block: Block):
        block_idx = self.structure.index(block.id)
        if block_idx > 0:
            return self.get_block(self.structure[block_idx - 1])
        return None

    def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block:
        self.incr_block_id()
        block = block_cls(
            polygon=polygon,
            block_id=self.block_id,
            page_id=self.page_id,
        )
        self.add_child(block)
        return block

    def add_full_block(self, block: Block) -> Block:
        self.incr_block_id()
        block.block_id = self.block_id
        self.add_child(block)
        return block

    def get_block(self, block_id: BlockId) -> Block | None:
        block: Block = self.children[block_id.block_id]
        assert block.block_id == block_id.block_id
        return block

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        template = ""
        for c in child_blocks:
            template += f"<content-ref src='{c.id}'></content-ref>"
        return template

    def compute_line_block_intersections(
        self, blocks: List[Block], provider_outputs: List[ProviderOutput]
    ):
        max_intersections = {}

        block_bboxes = [block.polygon.bbox for block in blocks]
        line_bboxes = [
            provider_output.line.polygon.bbox for provider_output in provider_outputs
        ]

        intersection_matrix = matrix_intersection_area(line_bboxes, block_bboxes)

        for line_idx, line in enumerate(provider_outputs):
            intersection_line = intersection_matrix[line_idx]
            if intersection_line.sum() == 0:
                continue

            max_intersection = intersection_line.argmax()
            max_intersections[line_idx] = (
                intersection_matrix[line_idx, max_intersection],
                blocks[max_intersection].id,
            )
        return max_intersections

    def compute_max_structure_block_intersection_pct(self):
        structure_blocks = [self.get_block(block_id) for block_id in self.structure]
        structure_block_bboxes = [b.polygon.bbox for b in structure_blocks]

        intersection_matrix = matrix_intersection_area(
            structure_block_bboxes, structure_block_bboxes
        )
        np.fill_diagonal(intersection_matrix, 0)  # Ignore self-intersections

        max_intersection_pct = 0
        for block_idx, block in enumerate(structure_blocks):
            if block.polygon.area == 0:
                continue
            max_intersection_pct = max(
                max_intersection_pct,
                np.max(intersection_matrix[block_idx]) / block.polygon.area,
            )

        return max_intersection_pct

    def replace_block(self, block: Block, new_block: Block):
        # Handles incrementing the id
        self.add_full_block(new_block)

        # Replace block id in structure
        super().replace_block(block, new_block)

        # Replace block in structure of children
        for child in self.children:
            child.replace_block(block, new_block)

        # Mark block as removed
        block.removed = True

    def identify_missing_blocks(
        self,
        provider_line_idxs: List[int],
        provider_outputs: List[ProviderOutput],
        assigned_line_idxs: set[int],
    ):
        new_blocks = []
        new_block = None
        for line_idx in provider_line_idxs:
            if line_idx in assigned_line_idxs:
                continue

            # Skip unassigned lines that are just a newline with negligible area
            if (
                provider_outputs[line_idx].line.polygon.area <= 1
                and provider_outputs[line_idx].raw_text == "\n"
            ):
                continue

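            # Group runs of consecutive, spatially close unassigned lines into
            # a single new block; any break in the run starts a fresh block.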
            if new_block is None:
                new_block = [(line_idx, provider_outputs[line_idx])]
            elif all(
                [
                    new_block[-1][0] + 1 == line_idx,
                    provider_outputs[line_idx].line.polygon.center_distance(
                        new_block[-1][1].line.polygon
                    )
                    < self.maximum_assignment_distance,
                ]
            ):
                new_block.append((line_idx, provider_outputs[line_idx]))
            else:
                new_blocks.append(new_block)
                new_block = [(line_idx, provider_outputs[line_idx])]
            assigned_line_idxs.add(line_idx)
        if new_block:
            new_blocks.append(new_block)

        return new_blocks

    def create_missing_blocks(
        self,
        new_blocks: List[LINE_MAPPING_TYPE],
        block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
    ):
        for new_block in new_blocks:
            block = self.add_block(Text, new_block[0][1].line.polygon)
            block.source = "heuristics"
            block_lines[block.id] = new_block

            min_dist_idx = None
            min_dist = None
            for existing_block_id in self.structure:
                existing_block = self.get_block(existing_block_id)
                if existing_block.block_type in self.excluded_block_types:
                    continue
                # We want to assign to blocks closer in y than x
                dist = block.polygon.center_distance(
                    existing_block.polygon, x_weight=5, absolute=True
                )
                if dist > 0 and (min_dist_idx is None or dist < min_dist):
                    min_dist = dist
                    min_dist_idx = existing_block.id

            if min_dist_idx is not None:
                existing_idx = self.structure.index(min_dist_idx)
                self.structure.insert(existing_idx + 1, block.id)
            else:
                self.structure.append(block.id)

    def add_initial_blocks(
        self,
        block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
        text_extraction_method: str,
        keep_chars: bool = False,
    ):
        # Add lines to the proper blocks, sorted in order
        for block_id, lines in block_lines.items():
            line_extraction_methods = set(
                [line[1].line.text_extraction_method for line in lines]
            )
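            # A single extraction method means the provider ordering is
            # trustworthy; mixed pdftext/OCR lines need a spatial re-sort.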
            if len(line_extraction_methods) == 1:
                lines = sorted(lines, key=lambda x: x[0])
                lines = [line for _, line in lines]
            else:
                lines = [line for _, line in lines]
                line_polygons = [line.line.polygon for line in lines]
                sorted_line_polygons = sort_text_lines(line_polygons)
                argsort = [line_polygons.index(p) for p in sorted_line_polygons]
                lines = [lines[i] for i in argsort]

            block = self.get_block(block_id)
            for provider_output in lines:
                line = provider_output.line
                spans = provider_output.spans
                self.add_full_block(line)
                block.add_structure(line)
                block.polygon = block.polygon.merge([line.polygon])
                block.text_extraction_method = text_extraction_method
                for span_idx, span in enumerate(spans):
                    self.add_full_block(span)
                    line.add_structure(span)

                    if not keep_chars:
                        continue

                    # Provider doesn't have chars
                    if len(provider_output.chars) == 0:
                        continue

                    # Loop through characters associated with the span
                    for char in provider_output.chars[span_idx]:
                        char.page_id = self.page_id
                        self.add_full_block(char)
                        span.add_structure(char)

    def merge_blocks(
        self,
        provider_outputs: List[ProviderOutput],
        text_extraction_method: str,
        keep_chars: bool = False,
    ):
        provider_line_idxs = list(range(len(provider_outputs)))
        valid_blocks = [
            block
            for block in self.current_children  # ensure we only look at children that haven't been replaced
            if block.block_type not in self.excluded_block_types
        ]

        max_intersections = self.compute_line_block_intersections(
            valid_blocks, provider_outputs
        )

        # Try to assign lines by intersection
        assigned_line_idxs = set()
        block_lines = defaultdict(list)
        for line_idx, provider_output in enumerate(provider_outputs):
            if line_idx in max_intersections:
                block_id = max_intersections[line_idx][1]
                block_lines[block_id].append((line_idx, provider_output))
                assigned_line_idxs.add(line_idx)

        # If no intersection, assign by distance
        for line_idx in set(provider_line_idxs).difference(assigned_line_idxs):
            min_dist = None
            min_dist_idx = None
            provider_output: ProviderOutput = provider_outputs[line_idx]
            line = provider_output.line
            for block in valid_blocks:
                # We want to assign to blocks closer in y than x
                dist = line.polygon.center_distance(block.polygon, x_weight=5)
                if min_dist_idx is None or dist < min_dist:
                    min_dist = dist
                    min_dist_idx = block.id

            if min_dist_idx is not None and min_dist < self.maximum_assignment_distance:
                block_lines[min_dist_idx].append((line_idx, provider_output))
                assigned_line_idxs.add(line_idx)

        # This creates new blocks to hold anything too far away
        new_blocks = self.identify_missing_blocks(
            provider_line_idxs, provider_outputs, assigned_line_idxs
        )
        self.create_missing_blocks(new_blocks, block_lines)

        # Add blocks to the page
        self.add_initial_blocks(block_lines, text_extraction_method, keep_chars)

    def aggregate_block_metadata(self) -> BlockMetadata:
        if self.metadata is None:
            self.metadata = BlockMetadata()

        for block in self.current_children:
            if block.metadata is not None:
                self.metadata = self.metadata.merge(block.metadata)
        return self.metadata

```
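
The id bookkeeping above is easy to misuse from outside: `add_block` and `add_full_block` stamp a fresh sequential `block_id` and register the child, while `structure` (reading order) is left to the caller. A minimal sketch, assuming `page` is an existing `PageGroup` whose children and structure are already initialized:

```python
from marker.schema.blocks import Text
from marker.schema.polygon import PolygonBox

# add_block assigns the next block_id and appends to children, but the
# caller decides where (or whether) the block appears in reading order.
block = page.add_block(Text, PolygonBox.from_bbox([10, 20, 200, 40]))
page.structure.append(block.id)
assert page.get_block(block.id) is block
```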

--------------------------------------------------------------------------------
/marker/builders/line.py:
--------------------------------------------------------------------------------

```python
from copy import deepcopy
from typing import Annotated, List, Tuple

import numpy as np
from PIL import Image
import cv2

from surya.detection import DetectionPredictor
from surya.ocr_error import OCRErrorPredictor

from marker.builders import BaseBuilder
from marker.providers import ProviderOutput, ProviderPageLines
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.polygon import PolygonBox
from marker.schema.registry import get_block_class
from marker.schema.text.line import Line
from marker.settings import settings
from marker.util import matrix_intersection_area, sort_text_lines
from marker.utils.image import is_blank_image


class LineBuilder(BaseBuilder):
    """
    A builder for detecting text lines. Merges the detected lines with the lines from the provider
    """

    detection_batch_size: Annotated[
        int,
        "The batch size to use for the detection model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    ocr_error_batch_size: Annotated[
        int,
        "The batch size to use for the ocr error detection model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    layout_coverage_min_lines: Annotated[
        int,
        "The minimum number of PdfProvider lines that must be covered by the layout model",
        "to consider the lines from the PdfProvider valid.",
    ] = 1
    layout_coverage_threshold: Annotated[
        float,
        "The minimum coverage ratio required for the layout model to consider",
        "the lines from the PdfProvider valid.",
    ] = 0.25
    min_document_ocr_threshold: Annotated[
        float,
        "If less pages than this threshold are good, OCR will happen in the document.  Otherwise it will not.",
    ] = 0.85
    provider_line_provider_line_min_overlap_pct: Annotated[
        float,
        "The percentage of a provider line that has to be covered by a detected line",
    ] = 0.1
    excluded_for_coverage: Annotated[
        Tuple[BlockTypes],
        "A list of block types to exclude from the layout coverage check.",
    ] = (
        BlockTypes.Figure,
        BlockTypes.Picture,
        BlockTypes.Table,
        BlockTypes.FigureGroup,
        BlockTypes.TableGroup,
        BlockTypes.PictureGroup,
    )
    ocr_remove_blocks: Tuple[BlockTypes, ...] = (
        BlockTypes.Table,
        BlockTypes.Form,
        BlockTypes.TableOfContents,
    )
    disable_tqdm: Annotated[
        bool,
        "Disable tqdm progress bars.",
    ] = False
    disable_ocr: Annotated[
        bool,
        "Disable OCR for the document. This will only use the lines from the provider.",
    ] = False
    keep_chars: Annotated[bool, "Keep individual characters."] = False
    detection_line_min_confidence: Annotated[
        float,
        "Minimum confidence for a detected line to be included.",
    ] = 0.8

    def __init__(
        self,
        detection_model: DetectionPredictor,
        ocr_error_model: OCRErrorPredictor,
        config=None,
    ):
        super().__init__(config)

        self.detection_model = detection_model
        self.ocr_error_model = ocr_error_model

    def __call__(self, document: Document, provider: PdfProvider):
        # Get text lines from the provider, plus OCR-detected lines for pages
        # where the provider text isn't trustworthy, then merge them in
        provider_lines, ocr_lines = self.get_all_lines(document, provider)
        self.merge_blocks(document, provider_lines, ocr_lines)

    def get_detection_batch_size(self):
        if self.detection_batch_size is not None:
            return self.detection_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 10
        return 4

    def get_ocr_error_batch_size(self):
        if self.ocr_error_batch_size is not None:
            return self.ocr_error_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 14
        return 4

    def get_detection_results(
        self, page_images: List[Image.Image], run_detection: List[bool]
    ):
        self.detection_model.disable_tqdm = self.disable_tqdm
        page_detection_results = self.detection_model(
            images=page_images, batch_size=self.get_detection_batch_size()
        )

        assert len(page_detection_results) == sum(run_detection)
        detection_results = []
        idx = 0
        for good in run_detection:
            if good:
                detection_results.append(page_detection_results[idx])
                idx += 1
            else:
                detection_results.append(None)
        assert idx == len(page_images)

        assert len(run_detection) == len(detection_results)
        return detection_results

    def get_all_lines(self, document: Document, provider: PdfProvider):
        ocr_error_detection_results = self.ocr_error_detection(
            document.pages, provider.page_lines
        )

        boxes_to_ocr = {page.page_id: [] for page in document.pages}
        page_lines = {page.page_id: [] for page in document.pages}

        LineClass: Line = get_block_class(BlockTypes.Line)

        layout_good = []
        for document_page, ocr_error_detection_label in zip(
            document.pages, ocr_error_detection_results.labels
        ):
            document_page.ocr_errors_detected = ocr_error_detection_label == "bad"
            provider_lines: List[ProviderOutput] = provider.page_lines.get(
                document_page.page_id, []
            )
            provider_lines_good = all(
                [
                    bool(provider_lines),
                    not document_page.ocr_errors_detected,
                    self.check_layout_coverage(document_page, provider_lines),
                    self.check_line_overlaps(
                        document_page, provider_lines
                    ),  # Ensure provider lines don't overflow the page or intersect
                ]
            )
            if self.disable_ocr:
                provider_lines_good = True

            layout_good.append(provider_lines_good)

        run_detection = [not good for good in layout_good]
        page_images = [
            page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
            for page, bad in zip(document.pages, run_detection)
            if bad
        ]

        # Note: run_detection has an entry for every page, while page_images only
        # contains the pages flagged for detection; get_detection_results pads the
        # model outputs back out to one entry per page (None where skipped)
        detection_results = self.get_detection_results(page_images, run_detection)

        assert len(detection_results) == len(layout_good) == len(document.pages)
        for document_page, detection_result, provider_lines_good in zip(
            document.pages, detection_results, layout_good
        ):
            provider_lines: List[ProviderOutput] = provider.page_lines.get(
                document_page.page_id, []
            )

            # Setup detection results
            detection_boxes = []
            if detection_result:
                detection_boxes = [
                    PolygonBox(polygon=box.polygon)
                    for box in detection_result.bboxes
                    if box.confidence > self.detection_line_min_confidence
                ]

            detection_boxes = sort_text_lines(detection_boxes)

            if provider_lines_good:
                document_page.text_extraction_method = "pdftext"

                # Mark extraction method as pdftext, since all lines are good
                for provider_line in provider_lines:
                    provider_line.line.text_extraction_method = "pdftext"

                page_lines[document_page.page_id] = provider_lines
            else:
                document_page.text_extraction_method = "surya"
                boxes_to_ocr[document_page.page_id].extend(detection_boxes)

        # Dummy lines to merge into the document - Contains no spans, will be filled in later by OCRBuilder
        ocr_lines = {document_page.page_id: [] for document_page in document.pages}
        for page_id, page_ocr_boxes in boxes_to_ocr.items():
            page_size = provider.get_page_bbox(page_id).size
            image_size = document.get_page(page_id).get_image(highres=False).size
            for box_to_ocr in page_ocr_boxes:
                line_polygon = PolygonBox(polygon=box_to_ocr.polygon).rescale(
                    image_size, page_size
                )
                ocr_lines[page_id].append(
                    ProviderOutput(
                        line=LineClass(
                            polygon=line_polygon,
                            page_id=page_id,
                            text_extraction_method="surya",
                        ),
                        spans=[],
                        chars=[],
                    )
                )

        return page_lines, ocr_lines

    def ocr_error_detection(
        self, pages: List[PageGroup], provider_page_lines: ProviderPageLines
    ):
        page_texts = []
        for document_page in pages:
            provider_lines = provider_page_lines.get(document_page.page_id, [])
            page_text = "\n".join(
                " ".join(s.text for s in line.spans) for line in provider_lines
            )
            page_texts.append(page_text)

        self.ocr_error_model.disable_tqdm = self.disable_tqdm
        ocr_error_detection_results = self.ocr_error_model(
            page_texts, batch_size=int(self.get_ocr_error_batch_size())
        )
        return ocr_error_detection_results

    def check_line_overlaps(
        self, document_page: PageGroup, provider_lines: List[ProviderOutput]
    ) -> bool:
        provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
        # Add a small margin to account for minor overflows
        page_bbox = document_page.polygon.expand(5, 5).bbox

        for bbox in provider_bboxes:
            if bbox[0] < page_bbox[0]:
                return False
            if bbox[1] < page_bbox[1]:
                return False
            if bbox[2] > page_bbox[2]:
                return False
            if bbox[3] > page_bbox[3]:
                return False

        intersection_matrix = matrix_intersection_area(provider_bboxes, provider_bboxes)
        for i, line in enumerate(provider_lines):
            intersect_counts = np.sum(
                intersection_matrix[i]
                > line.line.polygon.area
                * self.provider_line_provider_line_min_overlap_pct
            )

            # There should be one intersection with itself
            if intersect_counts > 2:
                return False

        return True

    def check_layout_coverage(
        self,
        document_page: PageGroup,
        provider_lines: List[ProviderOutput],
    ):
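        # Provider lines pass when enough layout blocks intersect at least one
        # line; pages that fail are routed to detection + OCR by the caller.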
        covered_blocks = 0
        total_blocks = 0
        large_text_blocks = 0

        layout_blocks = [
            document_page.get_block(block) for block in document_page.structure
        ]
        layout_blocks = [
            b for b in layout_blocks if b.block_type not in self.excluded_for_coverage
        ]

        layout_bboxes = [block.polygon.bbox for block in layout_blocks]
        provider_bboxes = [line.line.polygon.bbox for line in provider_lines]

        if len(layout_bboxes) == 0:
            return True

        if len(provider_bboxes) == 0:
            return False

        intersection_matrix = matrix_intersection_area(layout_bboxes, provider_bboxes)

        for idx, layout_block in enumerate(layout_blocks):
            total_blocks += 1
            intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0)

            if intersecting_lines >= self.layout_coverage_min_lines:
                covered_blocks += 1

            if (
                layout_block.polygon.intersection_pct(document_page.polygon) > 0.8
                and layout_block.block_type == BlockTypes.Text
            ):
                large_text_blocks += 1

        coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
        text_okay = coverage_ratio >= self.layout_coverage_threshold

        # Model will sometimes say there is a single block of text on the page when it is blank
        if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
            text_okay = True
        return text_okay

    def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]):
        page_size = (page.polygon.width, page.polygon.height)
        page_image = page.get_image()
        image_size = page_image.size

        good_lines = []
        for line in lines:
            line_polygon_rescaled = deepcopy(line.line.polygon).rescale(
                page_size, image_size
            )
            line_bbox = line_polygon_rescaled.fit_to_bounds((0, 0, *image_size)).bbox

            if not is_blank_image(page_image.crop(line_bbox)):
                good_lines.append(line)

        return good_lines

    def merge_blocks(
        self,
        document: Document,
        page_provider_lines: ProviderPageLines,
        page_ocr_lines: ProviderPageLines,
    ):
        for document_page in document.pages:
            provider_lines: List[ProviderOutput] = page_provider_lines[
                document_page.page_id
            ]
            ocr_lines: List[ProviderOutput] = page_ocr_lines[document_page.page_id]

            # Only one or the other will have lines
            # Filter out blank lines which come from bad provider boxes, or invisible text
            merged_lines = self.filter_blank_lines(
                document_page, provider_lines + ocr_lines
            )

            # Text extraction method is overridden later for OCRed documents
            document_page.merge_blocks(
                merged_lines,
                text_extraction_method="pdftext" if provider_lines else "surya",
                keep_chars=self.keep_chars,
            )

```
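
The list bookkeeping in `get_detection_results` (padding model outputs back to one entry per page) is easier to see in isolation. A standalone analogue with hypothetical names:

```python
def pad_results(results, run_flags):
    # The detector only saw pages flagged True; pad its outputs back to
    # one entry per page, inserting None for the skipped pages.
    it = iter(results)
    return [next(it) if flag else None for flag in run_flags]


assert pad_results(["a", "b"], [False, True, True]) == [None, "a", "b"]
```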