This is page 2 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ │ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── 
document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ 
├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml 1 | [tool.poetry] 2 | name = "marker-pdf" 3 | version = "1.10.1" 4 | description = "Convert documents to markdown with high speed and accuracy." 5 | authors = ["Vik Paruchuri <[email protected]>"] 6 | readme = "README.md" 7 | license = "GPL-3.0-or-later" 8 | repository = "https://github.com/VikParuchuri/marker" 9 | keywords = ["pdf", "markdown", "ocr", "nlp"] 10 | packages = [ 11 | {include = "marker"} 12 | ] 13 | include = [ 14 | "marker/scripts/*.sh", 15 | "marker/scripts/*.html", 16 | ] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.10" 20 | Pillow = "^10.1.0" 21 | pydantic = "^2.4.2" 22 | pydantic-settings = "^2.0.3" 23 | transformers = "^4.45.2" 24 | python-dotenv = "^1.0.0" 25 | torch = "^2.7.0" 26 | tqdm = "^4.66.1" 27 | ftfy = "^6.1.1" 28 | rapidfuzz = "^3.8.1" 29 | surya-ocr = "^0.17.0" 30 | regex = "^2024.4.28" 31 | pdftext = "~0.6.3" 32 | markdownify = "^1.1.0" 33 | click = "^8.2.0" 34 | markdown2 = "^2.5.2" 35 | filetype = "^1.2.0" 36 | google-genai = "^1.0.0" 37 | anthropic = "^0.46.0" 38 | pre-commit = "^4.2.0" 39 | scikit-learn = "^1.6.1" 40 | 41 | # Optional dependencies for documents 42 | mammoth = {version = "^1.9.0", optional = true} 43 | openpyxl = {version = "^3.1.5", optional = true} 44 | python-pptx = {version = "^1.0.2", optional = true} 45 | ebooklib = {version = "^0.18", optional = true} 46 | weasyprint = {version = "^63.1", optional = true} 47 | openai = "^1.65.2" 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | jupyter = "^1.0.0" 51 | datasets = "^2.21.0" 52 | streamlit = "^1.37.1" 53 | fastapi = "^0.115.4" 54 | uvicorn = "^0.32.0" 55 | python-multipart = "^0.0.16" 56 | pytest = "^8.3.3" 57 | pytest-mock = "^3.14.0" 58 | apted = "1.0.3" 59 | distance = "0.1.3" 60 | lxml = "5.3.0" 61 | tabulate = "^0.9.0" 62 | latex2mathml = "^3.77.0" 63 | playwright = "^1.49.1" 64 | 65 | [tool.poetry.extras] 66 | full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"] 67 | 68 | [tool.poetry.scripts] 69 | marker = "marker.scripts.convert:convert_cli" 70 | marker_single = "marker.scripts.convert_single:convert_single_cli" 71 | marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli" 72 | marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli" 73 | marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli" 74 | marker_server = "marker.scripts.server:server_cli" 75 | 76 | [build-system] 77 | requires = ["poetry-core"] 78 | build-backend = "poetry.core.masonry.api" 79 | ``` -------------------------------------------------------------------------------- /marker/converters/table.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Tuple, List 2 | 3 | from marker.builders.document import 
DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.builders.ocr import OcrBuilder 6 | from marker.converters.pdf import PdfConverter 7 | from marker.processors import BaseProcessor 8 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor 9 | from marker.processors.llm.llm_form import LLMFormProcessor 10 | from marker.processors.llm.llm_table import LLMTableProcessor 11 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor 12 | from marker.processors.table import TableProcessor 13 | from marker.providers.registry import provider_from_filepath 14 | from marker.schema import BlockTypes 15 | 16 | 17 | class TableConverter(PdfConverter): 18 | default_processors: Tuple[BaseProcessor, ...] = ( 19 | TableProcessor, 20 | LLMTableProcessor, 21 | LLMTableMergeProcessor, 22 | LLMFormProcessor, 23 | LLMComplexRegionProcessor, 24 | ) 25 | converter_block_types: List[BlockTypes] = ( 26 | BlockTypes.Table, 27 | BlockTypes.Form, 28 | BlockTypes.TableOfContents, 29 | ) 30 | 31 | def build_document(self, filepath: str): 32 | provider_cls = provider_from_filepath(filepath) 33 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 34 | line_builder = self.resolve_dependencies(LineBuilder) 35 | ocr_builder = self.resolve_dependencies(OcrBuilder) 36 | document_builder = DocumentBuilder(self.config) 37 | document_builder.disable_ocr = True 38 | 39 | provider = provider_cls(filepath, self.config) 40 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 41 | 42 | for page in document.pages: 43 | page.structure = [ 44 | p for p in page.structure if p.block_type in self.converter_block_types 45 | ] 46 | 47 | for processor in self.processor_list: 48 | processor(document) 49 | 50 | return document 51 | 52 | def __call__(self, filepath: str): 53 | document = self.build_document(filepath) 54 | self.page_count = len(document.pages) 55 | 56 | renderer = self.resolve_dependencies(self.renderer) 57 | return renderer(document) 58 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/basetable.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block, BlockOutput 5 | from marker.schema.blocks.tablecell import TableCell 6 | 7 | 8 | class BaseTable(Block): 9 | block_type: BlockTypes | None = None 10 | html: str | None = None 11 | 12 | @staticmethod 13 | def format_cells( 14 | document, child_blocks, block_config, child_cells: List[TableCell] | None = None 15 | ): 16 | if child_cells is None: 17 | child_cells: List[TableCell] = [ 18 | document.get_block(c.id) 19 | for c in child_blocks 20 | if c.id.block_type == BlockTypes.TableCell 21 | ] 22 | 23 | unique_rows = sorted(list(set([c.row_id for c in child_cells]))) 24 | html_repr = "<table><tbody>" 25 | for row_id in unique_rows: 26 | row_cells = sorted( 27 | [c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id 28 | ) 29 | html_repr += "<tr>" 30 | for cell in row_cells: 31 | html_repr += cell.assemble_html( 32 | document, child_blocks, None, block_config 33 | ) 34 | html_repr += "</tr>" 35 | html_repr += "</tbody></table>" 36 | return html_repr 37 | 38 | def assemble_html( 39 | self, 40 | document, 41 | child_blocks: List[BlockOutput], 42 | parent_structure=None, 43 | block_config: dict | None = None, 44 | ): 45 | # Filter out the table cells, so 
they don't render twice 46 | child_ref_blocks = [ 47 | block 48 | for block in child_blocks 49 | if block.id.block_type == BlockTypes.Reference 50 | ] 51 | template = super().assemble_html( 52 | document, child_ref_blocks, parent_structure, block_config 53 | ) 54 | 55 | child_block_types = set([c.id.block_type for c in child_blocks]) 56 | if self.html: 57 | # LLM processor 58 | return template + self.html 59 | elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types: 60 | # Table processor 61 | return template + self.format_cells(document, child_blocks, block_config) 62 | else: 63 | # Default text lines and spans 64 | return f"<p>{template}</p>" 65 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mathpix.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class MathpixDownloader(Downloader): 10 | service = "mathpix" 11 | 12 | def get_html(self, pdf_bytes): 13 | headers = { 14 | "app_id": self.app_id, 15 | "app_key": self.api_key, 16 | } 17 | start = time.time() 18 | pdf_id = mathpix_request(pdf_bytes, headers) 19 | status = mathpix_status(pdf_id, headers) 20 | if status in ["processing", "error"]: 21 | md = "" 22 | else: 23 | md = mathpix_results(pdf_id, headers) 24 | end = time.time() 25 | if isinstance(md, bytes): 26 | md = md.decode("utf-8") 27 | 28 | return { 29 | "md": md, 30 | "time": end - start 31 | } 32 | 33 | def mathpix_request(buffer, headers): 34 | response = requests.post("https://api.mathpix.com/v3/pdf", 35 | headers=headers, 36 | data={ 37 | "options_json": json.dumps( 38 | { 39 | "conversion_formats": { 40 | "md": True, 41 | "html": True 42 | } 43 | } 44 | ) 45 | }, 46 | files={ 47 | "file": buffer 48 | } 49 | ) 50 | data = response.json() 51 | pdf_id = data["pdf_id"] 52 | return pdf_id 53 | 54 | def mathpix_status(pdf_id, headers): 55 | max_iters = 120 56 | i = 0 57 | status = "processing" 58 | status2 = "processing" 59 | while i < max_iters: 60 | time.sleep(1) 61 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", 62 | headers=headers 63 | ) 64 | status_resp = response.json() 65 | if "conversion_status" not in status_resp: 66 | continue 67 | status = status_resp["conversion_status"]["md"]["status"] 68 | status2 = status_resp["conversion_status"]["html"]["status"] 69 | if status == "completed" and status2 == "completed": 70 | break 71 | elif status == "error" or status2 == "error": 72 | break 73 | out_status = "completed" if status == "completed" and status2 == "completed" else "error" 74 | return out_status 75 | 76 | def mathpix_results(pdf_id, headers, ext="md"): 77 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}", 78 | headers=headers 79 | ) 80 | return response.content 81 | ``` -------------------------------------------------------------------------------- /tests/builders/test_garbled_pdf.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.processors.table import TableProcessor 6 | from marker.schema import BlockTypes 7 | 8 | 9 | @pytest.mark.filename("water_damage.pdf") 10 | def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model): 11 | assert 
pdf_document.pages[0].structure[0] == "/page/0/Table/0" 12 | 13 | table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0]) 14 | assert table_block.block_type == BlockTypes.Table 15 | assert table_block.structure[0] == "/page/0/Line/10" 16 | 17 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 18 | assert table_cell.block_type == BlockTypes.Line 19 | 20 | # We don't OCR in the initial pass, only with the TableProcessor 21 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 22 | processor(pdf_document) 23 | 24 | table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0] 25 | assert "варіант" in table.raw_text(pdf_document) 26 | 27 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 28 | assert table_cell.block_type == BlockTypes.TableCell 29 | 30 | 31 | @pytest.mark.filename("hindi_judgement.pdf") 32 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 33 | def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model): 34 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 35 | builder = DocumentBuilder(config) 36 | document = builder.build_document(doc_provider) 37 | 38 | bad_ocr_results = line_builder.ocr_error_detection( 39 | document.pages, doc_provider.page_lines 40 | ) 41 | assert len(bad_ocr_results.labels) == 2 42 | assert any([label == "bad" for label in bad_ocr_results.labels]) 43 | 44 | 45 | @pytest.mark.filename("adversarial.pdf") 46 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 47 | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model): 48 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 49 | builder = DocumentBuilder(config) 50 | document = builder.build_document(doc_provider) 51 | 52 | bad_ocr_results = line_builder.ocr_error_detection( 53 | document.pages, doc_provider.page_lines 54 | ) 55 | assert len(bad_ocr_results.labels) == 2 56 | assert all([label == "good" for label in bad_ocr_results.labels]) 57 | ``` -------------------------------------------------------------------------------- /marker/processors/order.py: -------------------------------------------------------------------------------- ```python 1 | from statistics import mean 2 | from collections import defaultdict 3 | 4 | from marker.processors import BaseProcessor 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | 9 | class OrderProcessor(BaseProcessor): 10 | """ 11 | A processor for sorting the blocks in order if needed. This can help when the layout image was sliced. 
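Blocks whose spans kept their original PDF positions are ordered by the midpoint of those positions; blocks without spans are slotted in relative to the nearest already-positioned neighbor.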
12 | """ 13 | block_types = tuple() 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | # Skip OCRed pages 18 | if page.text_extraction_method != "pdftext": 19 | continue 20 | 21 | # Skip pages without layout slicing 22 | if not page.layout_sliced: 23 | continue 24 | 25 | block_idxs = defaultdict(int) 26 | for block_id in page.structure: 27 | block = document.get_block(block_id) 28 | spans = block.contained_blocks(document, (BlockTypes.Span, )) 29 | if len(spans) == 0: 30 | continue 31 | 32 | # Avg span position in original PDF 33 | block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 34 | 35 | for block_id in page.structure: 36 | # Already assigned block id via span position 37 | if block_idxs[block_id] > 0: 38 | continue 39 | 40 | block = document.get_block(block_id) 41 | prev_block = document.get_prev_block(block) 42 | next_block = document.get_next_block(block) 43 | 44 | block_idx_add = 0 45 | if prev_block: 46 | block_idx_add = 1 47 | 48 | while prev_block and prev_block.id not in block_idxs: 49 | prev_block = document.get_prev_block(prev_block) 50 | block_idx_add += 1 51 | 52 | if not prev_block: 53 | block_idx_add = -1 54 | while next_block and next_block.id not in block_idxs: 55 | next_block = document.get_next_block(next_block) 56 | block_idx_add -= 1 57 | 58 | if not next_block and not prev_block: 59 | pass 60 | elif prev_block: 61 | block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add 62 | else: 63 | block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add 64 | 65 | page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) 66 | 67 | ``` -------------------------------------------------------------------------------- /tests/converters/test_extraction_converter.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import pytest 3 | 4 | from marker.converters.extraction import ExtractionConverter 5 | from marker.extractors.page import PageExtractionSchema 6 | from marker.extractors.document import DocumentExtractionSchema 7 | from marker.services import BaseService 8 | 9 | 10 | class MockLLMService(BaseService): 11 | def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs): 12 | if response_schema == PageExtractionSchema: 13 | return { 14 | "description": "Mock extraction description", 15 | "detailed_notes": "Mock detailed notes for page extraction", 16 | } 17 | elif response_schema == DocumentExtractionSchema: 18 | return { 19 | "analysis": "Mock document analysis", 20 | "document_json": json.dumps({"test_key": "test_value"}), 21 | } 22 | return {} 23 | 24 | 25 | @pytest.fixture 26 | def mock_llm_service(): 27 | return MockLLMService 28 | 29 | 30 | @pytest.fixture 31 | def extraction_converter(config, model_dict, mock_llm_service): 32 | test_schema = { 33 | "title": "TestSchema", 34 | "type": "object", 35 | "properties": {"test_key": {"title": "Test Key", "type": "string"}}, 36 | "required": ["test_key"], 37 | } 38 | 39 | config["page_schema"] = json.dumps(test_schema) 40 | config["output_format"] = "markdown" 41 | model_dict["llm_service"] = mock_llm_service 42 | 43 | converter = ExtractionConverter( 44 | artifact_dict=model_dict, processor_list=None, config=config 45 | ) 46 | converter.llm_service = mock_llm_service 47 | converter.default_llm_service = MockLLMService 48 | return converter 49 | 50 | 51 | @pytest.mark.config({"page_range": [0]}) 52 | def test_extraction_converter(config, model_dict, 
mock_llm_service, temp_doc): 53 | config["page_schema"] = "invalid json" 54 | 55 | model_dict["llm_service"] = mock_llm_service 56 | converter = ExtractionConverter( 57 | artifact_dict=model_dict, processor_list=None, config=config 58 | ) 59 | converter.artifact_dict["llm_service"] = mock_llm_service() 60 | 61 | results = converter(temp_doc.name) 62 | assert results.document_json == '{"test_key": "test_value"}' 63 | 64 | 65 | @pytest.mark.config({"page_range": [0, 1]}) 66 | def test_extraction_converter_multiple_pages(extraction_converter, temp_doc): 67 | result = extraction_converter(temp_doc.name) 68 | 69 | assert result is not None 70 | assert result.document_json is not None 71 | assert json.loads(result.document_json) == {"test_key": "test_value"} 72 | assert result.analysis == "Mock document analysis" 73 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_meta.py: -------------------------------------------------------------------------------- ```python 1 | from concurrent.futures import ThreadPoolExecutor 2 | from typing import List, Dict, Any 3 | 4 | from marker.logger import get_logger 5 | from tqdm import tqdm 6 | 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor 8 | from marker.schema.document import Document 9 | from marker.services import BaseService 10 | 11 | logger = get_logger() 12 | 13 | 14 | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor): 15 | """ 16 | A wrapper for simple LLM processors, so they can all run in parallel. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | processor_lst: List[BaseLLMSimpleBlockProcessor], 22 | llm_service: BaseService, 23 | config=None, 24 | ): 25 | super().__init__(llm_service, config) 26 | self.processors = processor_lst 27 | 28 | def __call__(self, document: Document): 29 | if not self.use_llm or self.llm_service is None: 30 | return 31 | 32 | total = sum( 33 | [len(processor.inference_blocks(document)) for processor in self.processors] 34 | ) 35 | pbar = tqdm( 36 | desc="LLM processors running", disable=self.disable_tqdm, total=total 37 | ) 38 | 39 | all_prompts = [ 40 | processor.block_prompts(document) for processor in self.processors 41 | ] 42 | pending = [] 43 | futures_map = {} 44 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: 45 | for i, prompt_lst in enumerate(all_prompts): 46 | for prompt in prompt_lst: 47 | future = executor.submit(self.get_response, prompt) 48 | pending.append(future) 49 | futures_map[future] = {"processor_idx": i, "prompt_data": prompt} 50 | 51 | for future in pending: 52 | try: 53 | result = future.result() 54 | future_data = futures_map.pop(future) 55 | processor: BaseLLMSimpleBlockProcessor = self.processors[ 56 | future_data["processor_idx"] 57 | ] 58 | # finalize the result 59 | processor(result, future_data["prompt_data"], document) 60 | except Exception as e: 61 | logger.warning(f"Error processing LLM response: {e}") 62 | 63 | pbar.update(1) 64 | 65 | pbar.close() 66 | 67 | def get_response(self, prompt_data: Dict[str, Any]): 68 | return self.llm_service( 69 | prompt_data["prompt"], 70 | prompt_data["image"], 71 | prompt_data["block"], 72 | prompt_data["schema"], 73 | ) 74 | ``` -------------------------------------------------------------------------------- /marker/converters/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import inspect 2 | from typing import Optional, List, Type 3 | 4 | from pydantic import BaseModel 
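# BaseConverter below wires providers, builders, processors, and a renderer together.
# Rough usage sketch for a concrete subclass (create_model_dict lives in marker/models.py
# in this repo; the input filename is illustrative):
#
#     from marker.converters.pdf import PdfConverter
#     from marker.models import create_model_dict
#
#     converter = PdfConverter(artifact_dict=create_model_dict())
#     rendered = converter("paper.pdf")  # returns the renderer's output object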
5 | 6 | from marker.processors import BaseProcessor 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor 8 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor 9 | from marker.util import assign_config, download_font 10 | 11 | 12 | class BaseConverter: 13 | def __init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | self.config = config 16 | self.llm_service = None 17 | 18 | # Download render font, needed for some providers 19 | download_font() 20 | 21 | def __call__(self, *args, **kwargs): 22 | raise NotImplementedError 23 | 24 | def resolve_dependencies(self, cls): 25 | init_signature = inspect.signature(cls.__init__) 26 | parameters = init_signature.parameters 27 | 28 | resolved_kwargs = {} 29 | for param_name, param in parameters.items(): 30 | if param_name == 'self': 31 | continue 32 | elif param_name == 'config': 33 | resolved_kwargs[param_name] = self.config 34 | elif param.name in self.artifact_dict: 35 | resolved_kwargs[param_name] = self.artifact_dict[param_name] 36 | elif param.default != inspect.Parameter.empty: 37 | resolved_kwargs[param_name] = param.default 38 | else: 39 | raise ValueError(f"Cannot resolve dependency for parameter: {param_name}") 40 | 41 | return cls(**resolved_kwargs) 42 | 43 | def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]: 44 | processors = [] 45 | for processor_cls in processor_cls_lst: 46 | processors.append(self.resolve_dependencies(processor_cls)) 47 | 48 | simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 49 | other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] 50 | 51 | if not simple_llm_processors: 52 | return processors 53 | 54 | llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 55 | insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1) 56 | 57 | meta_processor = LLMSimpleBlockMetaProcessor( 58 | processor_lst=simple_llm_processors, 59 | llm_service=self.llm_service, 60 | config=self.config, 61 | ) 62 | other_processors.insert(insert_position, meta_processor) 63 | return other_processors ``` -------------------------------------------------------------------------------- /marker/providers/registry.py: -------------------------------------------------------------------------------- ```python 1 | import filetype 2 | import filetype.match as file_match 3 | from bs4 import BeautifulSoup 4 | from filetype.types import archive, document, IMAGE 5 | 6 | from marker.providers.document import DocumentProvider 7 | from marker.providers.epub import EpubProvider 8 | from marker.providers.html import HTMLProvider 9 | from marker.providers.image import ImageProvider 10 | from marker.providers.pdf import PdfProvider 11 | from marker.providers.powerpoint import PowerPointProvider 12 | from marker.providers.spreadsheet import SpreadSheetProvider 13 | 14 | DOCTYPE_MATCHERS = { 15 | "image": IMAGE, 16 | "pdf": [ 17 | archive.Pdf, 18 | ], 19 | "epub": [ 20 | archive.Epub, 21 | ], 22 | "doc": [document.Docx], 23 | "xls": [document.Xlsx], 24 | "ppt": [document.Pptx], 25 | } 26 | 27 | 28 | def load_matchers(doctype: str): 29 | return [cls() for cls in DOCTYPE_MATCHERS[doctype]] 30 | 31 | 32 | def load_extensions(doctype: str): 33 | return [cls.EXTENSION for cls in DOCTYPE_MATCHERS[doctype]] 34 | 35 | 36 | def provider_from_ext(filepath: str): 37 | ext = 
filepath.rsplit(".", 1)[-1].strip() 38 | if not ext: 39 | return PdfProvider 40 | 41 | if ext in load_extensions("image"): 42 | return ImageProvider 43 | if ext in load_extensions("pdf"): 44 | return PdfProvider 45 | if ext in load_extensions("doc"): 46 | return DocumentProvider 47 | if ext in load_extensions("xls"): 48 | return SpreadSheetProvider 49 | if ext in load_extensions("ppt"): 50 | return PowerPointProvider 51 | if ext in load_extensions("epub"): 52 | return EpubProvider 53 | if ext in ["html"]: 54 | return HTMLProvider 55 | 56 | return PdfProvider 57 | 58 | 59 | def provider_from_filepath(filepath: str): 60 | if filetype.image_match(filepath) is not None: 61 | return ImageProvider 62 | if file_match(filepath, load_matchers("pdf")) is not None: 63 | return PdfProvider 64 | if file_match(filepath, load_matchers("epub")) is not None: 65 | return EpubProvider 66 | if file_match(filepath, load_matchers("doc")) is not None: 67 | return DocumentProvider 68 | if file_match(filepath, load_matchers("xls")) is not None: 69 | return SpreadSheetProvider 70 | if file_match(filepath, load_matchers("ppt")) is not None: 71 | return PowerPointProvider 72 | 73 | try: 74 | with open(filepath, "r", encoding="utf-8") as f: 75 | soup = BeautifulSoup(f.read(), "html.parser") 76 | # Check if there are any HTML tags 77 | if bool(soup.find()): 78 | return HTMLProvider 79 | except Exception: 80 | pass 81 | 82 | # Fallback if we incorrectly detect the file type 83 | return provider_from_ext(filepath) 84 | ``` -------------------------------------------------------------------------------- /marker/processors/blank_page.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated 2 | 3 | from PIL import Image 4 | import numpy as np 5 | import cv2 6 | 7 | from marker.processors import BaseProcessor 8 | from marker.schema import BlockTypes 9 | from marker.schema.blocks import Block 10 | from marker.schema.document import Document 11 | 12 | from marker.logger import get_logger 13 | 14 | logger = get_logger() 15 | 16 | 17 | class BlankPageProcessor(BaseProcessor): 18 | """ 19 | A processor to filter out blank pages detected as a single layout block 20 | """ 21 | 22 | full_page_block_intersection_threshold: Annotated[ 23 | float, "Threshold to detect blank pages at" 24 | ] = 0.8 25 | filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = ( 26 | False 27 | ) 28 | 29 | def is_blank(self, image: Image.Image): 30 | image = np.asarray(image) 31 | if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0: 32 | # Handle empty image case 33 | return True 34 | 35 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 36 | gray = cv2.GaussianBlur(gray, (7, 7), 0) 37 | 38 | # Adaptive threshold (inverse for text as white) 39 | binarized = cv2.adaptiveThreshold( 40 | gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 41 | ) 42 | 43 | num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( 44 | binarized, connectivity=8 45 | ) 46 | cleaned = np.zeros_like(binarized) 47 | for i in range(1, num_labels): # skip background 48 | cleaned[labels == i] = 255 49 | 50 | kernel = np.ones((1, 5), np.uint8) 51 | dilated = cv2.dilate(cleaned, kernel, iterations=3) 52 | b = dilated / 255 53 | return b.sum() == 0 54 | 55 | def __call__(self, document: Document): 56 | if not self.filter_blank_pages: 57 | return 58 | 59 | for page in document.pages: 60 | structure_blocks = page.structure_blocks(document) 61 | if not 
structure_blocks or len(structure_blocks) > 1: 62 | continue 63 | 64 | full_page_block: Block = structure_blocks[0] 65 | 66 | conditions = [ 67 | full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure], 68 | self.is_blank(full_page_block.get_image(document)), 69 | page.polygon.intersection_area(full_page_block.polygon) 70 | > self.full_page_block_intersection_threshold, 71 | ] 72 | 73 | if all(conditions): 74 | logger.debug(f"Removing blank block {full_page_block.id}") 75 | page.remove_structure_items([full_page_block.id]) 76 | full_page_block.removed = True 77 | ``` -------------------------------------------------------------------------------- /marker/processors/util.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.groups import PageGroup 7 | from marker.schema.registry import get_block_class 8 | from marker.schema.text import Line 9 | 10 | 11 | def escape_latex_commands(text: str): 12 | text = (text 13 | .replace('\n', '\\n') 14 | .replace('\t', '\\t') 15 | .replace('\r', '\\r')) 16 | return text 17 | 18 | 19 | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup): 20 | SpanClass = get_block_class(BlockTypes.Span) 21 | corrected_spans = text_to_spans(corrected_text) 22 | 23 | for span_idx, span in enumerate(corrected_spans): 24 | if span_idx == len(corrected_spans) - 1: 25 | span['content'] += "\n" 26 | 27 | span_block = page.add_full_block( 28 | SpanClass( 29 | polygon=text_line.polygon, 30 | text=span['content'], 31 | font='Unknown', 32 | font_weight=0, 33 | font_size=0, 34 | minimum_position=0, 35 | maximum_position=0, 36 | formats=[span['type']], 37 | url=span.get('url'), 38 | page_id=text_line.page_id, 39 | text_extraction_method="gemini", 40 | has_superscript=span["has_superscript"], 41 | has_subscript=span["has_subscript"] 42 | ) 43 | ) 44 | text_line.structure.append(span_block.id) 45 | 46 | 47 | def text_to_spans(text): 48 | soup = BeautifulSoup(text, 'html.parser') 49 | 50 | tag_types = { 51 | 'b': 'bold', 52 | 'i': 'italic', 53 | 'math': 'math', 54 | 'sub': 'plain', 55 | 'sup': 'plain', 56 | 'span': 'plain' 57 | } 58 | spans = [] 59 | 60 | for element in soup.descendants: 61 | if not len(list(element.parents)) == 1: 62 | continue 63 | 64 | url = element.attrs.get('href') if hasattr(element, 'attrs') else None 65 | 66 | if element.name in tag_types: 67 | text = element.get_text() 68 | if element.name == "math": 69 | text = escape_latex_commands(text) 70 | spans.append({ 71 | 'type': tag_types[element.name], 72 | 'content': text, 73 | 'url': url, 74 | "has_superscript": element.name == "sup", 75 | "has_subscript": element.name == "sub" 76 | }) 77 | elif element.string: 78 | spans.append({ 79 | 'type': 'plain', 80 | 'content': element.string, 81 | 'url': url, 82 | "has_superscript": False, 83 | "has_subscript": False 84 | }) 85 | 86 | return spans ``` -------------------------------------------------------------------------------- /marker/providers/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from copy import deepcopy 2 | from typing import List, Optional, Dict 3 | 4 | from PIL import Image 5 | from pydantic import BaseModel 6 | 7 | from pdftext.schema import Reference 8 | 9 | from marker.logger import configure_logging 10 | from marker.schema.polygon import PolygonBox 11 | from marker.schema.text import Span 12 
| from marker.schema.text.char import Char 13 | from marker.schema.text.line import Line 14 | from marker.settings import settings 15 | from marker.util import assign_config 16 | 17 | configure_logging() 18 | 19 | 20 | class ProviderOutput(BaseModel): 21 | line: Line 22 | spans: List[Span] 23 | chars: Optional[List[List[Char]]] = None 24 | 25 | @property 26 | def raw_text(self): 27 | return "".join(span.text for span in self.spans) 28 | 29 | def __hash__(self): 30 | return hash(tuple(self.line.polygon.bbox)) 31 | 32 | def merge(self, other: "ProviderOutput"): 33 | new_output = deepcopy(self) 34 | other_copy = deepcopy(other) 35 | 36 | new_output.spans.extend(other_copy.spans) 37 | if new_output.chars is not None and other_copy.chars is not None: 38 | new_output.chars.extend(other_copy.chars) 39 | elif other_copy.chars is not None: 40 | new_output.chars = other_copy.chars 41 | 42 | new_output.line.polygon = new_output.line.polygon.merge( 43 | [other_copy.line.polygon] 44 | ) 45 | return new_output 46 | 47 | 48 | ProviderPageLines = Dict[int, List[ProviderOutput]] 49 | 50 | 51 | class BaseProvider: 52 | def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None): 53 | assign_config(self, config) 54 | self.filepath = filepath 55 | 56 | def __len__(self): 57 | pass 58 | 59 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: 60 | pass 61 | 62 | def get_page_bbox(self, idx: int) -> PolygonBox | None: 63 | pass 64 | 65 | def get_page_lines(self, idx: int) -> List[Line]: 66 | pass 67 | 68 | def get_page_refs(self, idx: int) -> List[Reference]: 69 | pass 70 | 71 | def __enter__(self): 72 | return self 73 | 74 | @staticmethod 75 | def get_font_css(): 76 | from weasyprint import CSS 77 | from weasyprint.text.fonts import FontConfiguration 78 | 79 | font_config = FontConfiguration() 80 | css = CSS( 81 | string=f""" 82 | @font-face {{ 83 | font-family: GoNotoCurrent-Regular; 84 | src: url({settings.FONT_PATH}); 85 | font-display: swap; 86 | }} 87 | body {{ 88 | font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif; 89 | font-variant-ligatures: none; 90 | font-feature-settings: "liga" 0; 91 | text-rendering: optimizeLegibility; 92 | }} 93 | """, 94 | font_config=font_config, 95 | ) 96 | return css 97 | ``` -------------------------------------------------------------------------------- /marker/providers/document.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import os 3 | import re 4 | import tempfile 5 | from io import BytesIO 6 | 7 | from PIL import Image 8 | from marker.logger import get_logger 9 | 10 | from marker.providers.pdf import PdfProvider 11 | 12 | logger = get_logger() 13 | 14 | css = """ 15 | @page { 16 | size: A4; 17 | margin: 2cm; 18 | } 19 | 20 | img { 21 | max-width: 100%; 22 | max-height: 25cm; 23 | object-fit: contain; 24 | margin: 12pt auto; 25 | } 26 | 27 | div, p { 28 | max-width: 100%; 29 | word-break: break-word; 30 | font-size: 10pt; 31 | } 32 | 33 | table { 34 | width: 100%; 35 | border-collapse: collapse; 36 | break-inside: auto; 37 | font-size: 10pt; 38 | } 39 | 40 | tr { 41 | break-inside: avoid; 42 | page-break-inside: avoid; 43 | } 44 | 45 | td { 46 | border: 0.75pt solid #000; 47 | padding: 6pt; 48 | } 49 | """ 50 | 51 | 52 | class DocumentProvider(PdfProvider): 53 | def __init__(self, filepath: str, config=None): 54 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") 55 | self.temp_pdf_path = temp_pdf.name 56 | temp_pdf.close() 57 | 58 | # 
Convert DOCX to PDF 59 | try: 60 | self.convert_docx_to_pdf(filepath) 61 | except Exception as e: 62 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 63 | 64 | # Initialize the PDF provider with the temp pdf path 65 | super().__init__(self.temp_pdf_path, config) 66 | 67 | def __del__(self): 68 | if os.path.exists(self.temp_pdf_path): 69 | os.remove(self.temp_pdf_path) 70 | 71 | def convert_docx_to_pdf(self, filepath: str): 72 | from weasyprint import CSS, HTML 73 | import mammoth 74 | 75 | with open(filepath, "rb") as docx_file: 76 | # we convert the docx to HTML 77 | result = mammoth.convert_to_html(docx_file) 78 | html = result.value 79 | 80 | # We convert the HTML into a PDF 81 | HTML(string=self._preprocess_base64_images(html)).write_pdf( 82 | self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()] 83 | ) 84 | 85 | @staticmethod 86 | def _preprocess_base64_images(html_content): 87 | pattern = r'data:([^;]+);base64,([^"\'>\s]+)' 88 | 89 | def convert_image(match): 90 | try: 91 | img_data = base64.b64decode(match.group(2)) 92 | 93 | with BytesIO(img_data) as bio: 94 | with Image.open(bio) as img: 95 | output = BytesIO() 96 | img.save(output, format=img.format) 97 | new_base64 = base64.b64encode(output.getvalue()).decode() 98 | return f"data:{match.group(1)};base64,{new_base64}" 99 | 100 | except Exception as e: 101 | logger.error(f"Failed to process image: {e}") 102 | return "" # we ditch broken images as that breaks the PDF creation down the line 103 | 104 | return re.sub(pattern, convert_image, html_content) 105 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/olmocr.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import json 3 | import tempfile 4 | import time 5 | from io import BytesIO 6 | 7 | import torch 8 | from PIL import Image 9 | 10 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 11 | 12 | 13 | def convert_single_page(filename: str, model, processor, device): 14 | from olmocr.data.renderpdf import render_pdf_to_base64png 15 | from olmocr.prompts import build_finetuning_prompt 16 | from olmocr.prompts.anchor import get_anchor_text 17 | 18 | image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024) 19 | 20 | # Build the prompt, using document metadata 21 | anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000) 22 | prompt = build_finetuning_prompt(anchor_text) 23 | 24 | # Build the full prompt 25 | messages = [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | {"type": "text", "text": prompt}, 30 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 31 | ], 32 | } 33 | ] 34 | 35 | # Apply the chat template and processor 36 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 37 | main_image = Image.open(BytesIO(base64.b64decode(image_base64))) 38 | 39 | inputs = processor( 40 | text=[text], 41 | images=[main_image], 42 | padding=True, 43 | return_tensors="pt", 44 | ) 45 | inputs = {key: value.to(device) for (key, value) in inputs.items()} 46 | 47 | # Generate the output 48 | output = model.generate( 49 | **inputs, 50 | temperature=0.8, 51 | max_new_tokens=8192, 52 | num_return_sequences=1, 53 | do_sample=True, 54 | ) 55 | 56 | # Decode the output 57 | prompt_length = inputs["input_ids"].shape[1] 58 | new_tokens = output[:, prompt_length:] 59 | text_output = 
processor.tokenizer.batch_decode( 60 | new_tokens, skip_special_tokens=True 61 | )[0] 62 | 63 | try: 64 | text_output = json.loads(text_output) 65 | text = text_output["natural_text"] 66 | except Exception: 67 | try: 68 | text = text_output.split("natural_text")[1].strip() 69 | except Exception: 70 | text = "" 71 | 72 | return text 73 | 74 | 75 | class OlmOCRMethod(BaseMethod): 76 | olmocr_model: dict = None 77 | use_llm: bool = False 78 | 79 | def __call__(self, sample) -> BenchmarkResult: 80 | pdf_bytes = sample["pdf"] # This is a single page PDF 81 | 82 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 83 | f.write(pdf_bytes) 84 | start = time.time() 85 | result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device) 86 | total = time.time() - start 87 | 88 | return { 89 | "markdown": result, 90 | "time": total 91 | } 92 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_markdown_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import TableCell 6 | 7 | 8 | @pytest.mark.config({"page_range": [0], "disable_ocr": True}) 9 | def test_markdown_renderer(pdf_document): 10 | renderer = MarkdownRenderer() 11 | md = renderer(pdf_document).markdown 12 | 13 | # Verify markdown 14 | assert "# Subspace Adversarial Training" in md 15 | 16 | 17 | @pytest.mark.config({"page_range": [0]}) 18 | def test_markdown_renderer_auto_ocr(pdf_document): 19 | renderer = MarkdownRenderer() 20 | md = renderer(pdf_document).markdown 21 | 22 | # Verify markdown 23 | assert "Subspace Adversarial Training" in md 24 | 25 | 26 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True}) 27 | def test_markdown_renderer_pagination(pdf_document): 28 | renderer = MarkdownRenderer({"paginate_output": True}) 29 | md = renderer(pdf_document).markdown 30 | 31 | assert "\n\n{0}-" in md 32 | assert "\n\n{1}-" in md 33 | 34 | 35 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True}) 36 | def test_markdown_renderer_pagination_blank_last_page(pdf_document): 37 | # Clear all children and structure from the last page to simulate a blank page 38 | last_page = pdf_document.pages[-1] 39 | last_page.children = [] 40 | last_page.structure = [] 41 | 42 | renderer = MarkdownRenderer({"paginate_output": True}) 43 | md = renderer(pdf_document).markdown 44 | 45 | # Should end with pagination marker and preserve trailing newlines 46 | assert md.endswith("}\n\n") or md.endswith( 47 | "}------------------------------------------------\n\n" 48 | ) 49 | 50 | 51 | @pytest.mark.config({"page_range": [0, 1]}) 52 | def test_markdown_renderer_metadata(pdf_document): 53 | renderer = MarkdownRenderer({"paginate_output": True}) 54 | metadata = renderer(pdf_document).metadata 55 | assert "table_of_contents" in metadata 56 | 57 | 58 | @pytest.mark.config({"page_range": [0, 1]}) 59 | def test_markdown_renderer_images(pdf_document): 60 | renderer = MarkdownRenderer({"extract_images": False}) 61 | markdown_output = renderer(pdf_document) 62 | 63 | assert len(markdown_output.images) == 0 64 | assert "![](" not in markdown_output.markdown 65 | 66 | 67 | @pytest.mark.config({"page_range": [5]}) 68 | def test_markdown_renderer_tables(pdf_document): 69 | table = pdf_document.contained_blocks((BlockTypes.Table,))[0] 70 | page = pdf_document.pages[0] 71 | 72 | cell = TableCell( 
polygon=table.polygon, 74 | text_lines=["54<i>.45</i>67<br>89<math>x</math>"], 75 | rowspan=1, 76 | colspan=1, 77 | row_id=0, 78 | col_id=0, 79 | is_header=False, 80 | page_id=page.page_id, 81 | ) 82 | page.add_full_block(cell) 83 | table.structure = [] 84 | table.add_structure(cell) 85 | 86 | renderer = MarkdownRenderer() 87 | md = renderer(pdf_document).markdown 88 | assert "54 <i>.45</i> 67<br>89 $x$" in md 89 | ``` -------------------------------------------------------------------------------- /marker/schema/registry.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Dict, Type 2 | from importlib import import_module 3 | 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import ( 6 | Block, 7 | Caption, 8 | Code, 9 | Equation, 10 | Figure, 11 | Footnote, 12 | Form, 13 | Handwriting, 14 | InlineMath, 15 | ListItem, 16 | PageFooter, 17 | PageHeader, 18 | Picture, 19 | SectionHeader, 20 | Table, 21 | TableOfContents, 22 | Text, 23 | ComplexRegion, 24 | TableCell, 25 | Reference, 26 | ) 27 | from marker.schema.document import Document 28 | from marker.schema.groups import ( 29 | FigureGroup, 30 | ListGroup, 31 | PageGroup, 32 | PictureGroup, 33 | TableGroup, 34 | ) 35 | from marker.schema.text import Line, Span 36 | from marker.schema.text.char import Char 37 | 38 | BLOCK_REGISTRY: Dict[BlockTypes, str] = {} 39 | 40 | 41 | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]): 42 | BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}" 43 | 44 | 45 | def get_block_class(block_type: BlockTypes) -> Type[Block]: 46 | class_path = BLOCK_REGISTRY[block_type] 47 | module_name, class_name = class_path.rsplit(".", 1) 48 | module = import_module(module_name) 49 | return getattr(module, class_name) 50 | 51 | 52 | register_block_class(BlockTypes.Line, Line) 53 | register_block_class(BlockTypes.Span, Span) 54 | register_block_class(BlockTypes.Char, Char) 55 | register_block_class(BlockTypes.FigureGroup, FigureGroup) 56 | register_block_class(BlockTypes.TableGroup, TableGroup) 57 | register_block_class(BlockTypes.ListGroup, ListGroup) 58 | register_block_class(BlockTypes.PictureGroup, PictureGroup) 59 | register_block_class(BlockTypes.Page, PageGroup) 60 | register_block_class(BlockTypes.Caption, Caption) 61 | register_block_class(BlockTypes.Code, Code) 62 | register_block_class(BlockTypes.Figure, Figure) 63 | register_block_class(BlockTypes.Footnote, Footnote) 64 | register_block_class(BlockTypes.Form, Form) 65 | register_block_class(BlockTypes.Equation, Equation) 66 | register_block_class(BlockTypes.Handwriting, Handwriting) 67 | register_block_class(BlockTypes.TextInlineMath, InlineMath) 68 | register_block_class(BlockTypes.ListItem, ListItem) 69 | register_block_class(BlockTypes.PageFooter, PageFooter) 70 | register_block_class(BlockTypes.PageHeader, PageHeader) 71 | register_block_class(BlockTypes.Picture, Picture) 72 | register_block_class(BlockTypes.SectionHeader, SectionHeader) 73 | register_block_class(BlockTypes.Table, Table) 74 | register_block_class(BlockTypes.Text, Text) 75 | register_block_class(BlockTypes.TableOfContents, TableOfContents) 76 | register_block_class(BlockTypes.ComplexRegion, ComplexRegion) 77 | register_block_class(BlockTypes.TableCell, TableCell) 78 | register_block_class(BlockTypes.Reference, Reference) 79 | register_block_class(BlockTypes.Document, Document) 80 | 81 | assert len(BLOCK_REGISTRY) == len(BlockTypes) 82 | assert all( 83 | [ 84 | 
get_block_class(k).model_fields["block_type"].default == k 85 | for k, _ in BLOCK_REGISTRY.items() 86 | ] 87 | ) 88 | ``` -------------------------------------------------------------------------------- /marker/processors/blockquote.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, Tuple 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | 7 | 8 | class BlockquoteProcessor(BaseProcessor): 9 | """ 10 | A processor for tagging blockquotes. 11 | """ 12 | block_types: Annotated[ 13 | Tuple[BlockTypes], 14 | "The block types to process.", 15 | ] = (BlockTypes.Text, BlockTypes.TextInlineMath) 16 | min_x_indent: Annotated[ 17 | float, 18 | "The minimum horizontal indentation required to consider a block as part of a blockquote.", 19 | "Expressed as a percentage of the block width.", 20 | ] = 0.1 21 | x_start_tolerance: Annotated[ 22 | float, 23 | "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.", 24 | "Expressed as a percentage of the block width.", 25 | ] = 0.01 26 | x_end_tolerance: Annotated[ 27 | float, 28 | "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.", 29 | "Expressed as a percentage of the block width.", 30 | ] = 0.01 31 | 32 | def __init__(self, config): 33 | super().__init__(config) 34 | 35 | def __call__(self, document: Document): 36 | for page in document.pages: 37 | for block in page.contained_blocks(document, self.block_types): 38 | if block.structure is None: 39 | continue 40 | 41 | if not len(block.structure) >= 2: 42 | continue 43 | 44 | next_block = page.get_next_block(block) 45 | if next_block is None: 46 | continue 47 | if next_block.block_type not in self.block_types: 48 | continue 49 | if next_block.structure is None: 50 | continue 51 | if next_block.ignore_for_output: 52 | continue 53 | 54 | matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width 55 | matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width 56 | x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width) 57 | y_indent = next_block.polygon.y_start > block.polygon.y_end 58 | 59 | if block.blockquote: 60 | next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent) 61 | next_block.blockquote_level = block.blockquote_level 62 | if (x_indent and y_indent): 63 | next_block.blockquote_level += 1 64 | elif len(next_block.structure) >= 2 and (x_indent and y_indent): 65 | next_block.blockquote = True 66 | next_block.blockquote_level = 1 ``` -------------------------------------------------------------------------------- /tests/services/test_service_init.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.converters.pdf import PdfConverter 4 | from marker.services.gemini import GoogleGeminiService 5 | from marker.services.ollama import OllamaService 6 | from marker.services.vertex import GoogleVertexService 7 | from marker.services.openai import OpenAIService 8 | from marker.services.azure_openai import AzureOpenAIService 9 | 10 | 11 | @pytest.mark.output_format("markdown") 12 | @pytest.mark.config({"page_range": [0]}) 13 | def 
test_empty_llm(pdf_converter: PdfConverter, temp_doc): 14 | assert pdf_converter.artifact_dict["llm_service"] is None 15 | assert pdf_converter.llm_service is None 16 | 17 | 18 | def test_llm_no_keys(model_dict, config): 19 | with pytest.raises(AssertionError): 20 | PdfConverter(artifact_dict=model_dict, config={"use_llm": True}) 21 | 22 | 23 | @pytest.mark.output_format("markdown") 24 | @pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"}) 25 | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc): 26 | assert pdf_converter.artifact_dict["llm_service"] is not None 27 | assert isinstance(pdf_converter.llm_service, GoogleGeminiService) 28 | 29 | 30 | @pytest.mark.output_format("markdown") 31 | @pytest.mark.config( 32 | { 33 | "page_range": [0], 34 | "use_llm": True, 35 | "vertex_project_id": "test", 36 | "llm_service": "marker.services.vertex.GoogleVertexService", 37 | } 38 | ) 39 | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc): 40 | assert pdf_converter.artifact_dict["llm_service"] is not None 41 | assert isinstance(pdf_converter.llm_service, GoogleVertexService) 42 | 43 | 44 | @pytest.mark.output_format("markdown") 45 | @pytest.mark.config( 46 | { 47 | "page_range": [0], 48 | "use_llm": True, 49 | "llm_service": "marker.services.ollama.OllamaService", 50 | } 51 | ) 52 | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc): 53 | assert pdf_converter.artifact_dict["llm_service"] is not None 54 | assert isinstance(pdf_converter.llm_service, OllamaService) 55 | 56 | 57 | @pytest.mark.output_format("markdown") 58 | @pytest.mark.config( 59 | { 60 | "page_range": [0], 61 | "use_llm": True, 62 | "llm_service": "marker.services.openai.OpenAIService", 63 | "openai_api_key": "test", 64 | } 65 | ) 66 | def test_llm_openai(pdf_converter: PdfConverter, temp_doc): 67 | assert pdf_converter.artifact_dict["llm_service"] is not None 68 | assert isinstance(pdf_converter.llm_service, OpenAIService) 69 | 70 | 71 | @pytest.mark.output_format("markdown") 72 | @pytest.mark.config( 73 | { 74 | "page_range": [0], 75 | "use_llm": True, 76 | "llm_service": "marker.services.azure_openai.AzureOpenAIService", 77 | "azure_endpoint": "https://example.openai.azure.com", 78 | "azure_api_key": "test", 79 | "deployment_name": "test-model", 80 | "azure_api_version": "1", 81 | } 82 | ) 83 | def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc): 84 | assert pdf_converter.artifact_dict["llm_service"] is not None 85 | assert isinstance(pdf_converter.llm_service, AzureOpenAIService) 86 | ``` -------------------------------------------------------------------------------- /marker/converters/extraction.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | from typing import Annotated 3 | 4 | from marker.builders.document import DocumentBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from marker.builders.structure import StructureBuilder 8 | from marker.converters.pdf import PdfConverter 9 | from marker.extractors.document import DocumentExtractor 10 | from marker.extractors.page import PageExtractor 11 | from marker.providers.registry import provider_from_filepath 12 | 13 | from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput 14 | from marker.renderers.markdown import MarkdownRenderer 15 | 16 | from marker.logger import get_logger 17 | 18 | logger = get_logger() 19 | 20 | 21 | class ExtractionConverter(PdfConverter): 22 | 
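    """
    Runs structured extraction on top of PdfConverter: the document is rendered
    to paginated markdown, PageExtractor gathers notes for each page, and
    DocumentExtractor merges those notes into JSON matching the configured schema.
    """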
pattern: str = r"{\d+\}-{48}\n\n" 23 | existing_markdown: Annotated[ 24 | str, "Markdown that was already converted for extraction." 25 | ] = None 26 | 27 | def build_document(self, filepath: str): 28 | provider_cls = provider_from_filepath(filepath) 29 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 30 | line_builder = self.resolve_dependencies(LineBuilder) 31 | ocr_builder = self.resolve_dependencies(OcrBuilder) 32 | provider = provider_cls(filepath, self.config) 33 | document = DocumentBuilder(self.config)( 34 | provider, layout_builder, line_builder, ocr_builder 35 | ) 36 | structure_builder_cls = self.resolve_dependencies(StructureBuilder) 37 | structure_builder_cls(document) 38 | 39 | for processor in self.processor_list: 40 | processor(document) 41 | 42 | return document, provider 43 | 44 | def __call__(self, filepath: str) -> ExtractionOutput: 45 | self.config["paginate_output"] = True # Ensure we can split the output properly 46 | self.config["output_format"] = ( 47 | "markdown" # Output must be markdown for extraction 48 | ) 49 | markdown = self.existing_markdown 50 | 51 | if not markdown: 52 | document, provider = self.build_document(filepath) 53 | self.page_count = len(document.pages) 54 | renderer = self.resolve_dependencies(MarkdownRenderer) 55 | output = renderer(document) 56 | markdown = output.markdown 57 | 58 | output_pages = re.split(self.pattern, markdown)[1:] # Split output into pages 59 | 60 | # This needs an LLM service for extraction, this sets it in the extractor 61 | if self.artifact_dict.get("llm_service") is None: 62 | self.artifact_dict["llm_service"] = self.resolve_dependencies( 63 | self.default_llm_service 64 | ) 65 | 66 | page_extractor = self.resolve_dependencies(PageExtractor) 67 | document_extractor = self.resolve_dependencies(DocumentExtractor) 68 | renderer = self.resolve_dependencies(ExtractionRenderer) 69 | 70 | # Inference in parallel 71 | notes = page_extractor(output_pages) 72 | document_output = document_extractor(notes) 73 | 74 | merged = renderer(document_output, markdown) 75 | return merged 76 | ``` -------------------------------------------------------------------------------- /marker/providers/epub.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import os 3 | import tempfile 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from marker.providers.pdf import PdfProvider 8 | 9 | css = ''' 10 | @page { 11 | size: A4; 12 | margin: 2cm; 13 | } 14 | 15 | img { 16 | max-width: 100%; 17 | max-height: 25cm; 18 | object-fit: contain; 19 | margin: 12pt auto; 20 | } 21 | 22 | div, p { 23 | max-width: 100%; 24 | word-break: break-word; 25 | font-size: 10pt; 26 | } 27 | 28 | table { 29 | width: 100%; 30 | border-collapse: collapse; 31 | break-inside: auto; 32 | font-size: 10pt; 33 | } 34 | 35 | tr { 36 | break-inside: avoid; 37 | page-break-inside: avoid; 38 | } 39 | 40 | td { 41 | border: 0.75pt solid #000; 42 | padding: 6pt; 43 | } 44 | ''' 45 | 46 | 47 | class EpubProvider(PdfProvider): 48 | def __init__(self, filepath: str, config=None): 49 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=f".pdf") 50 | self.temp_pdf_path = temp_pdf.name 51 | temp_pdf.close() 52 | 53 | # Convert Epub to PDF 54 | try: 55 | self.convert_epub_to_pdf(filepath) 56 | except Exception as e: 57 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 58 | 59 | # Initialize the PDF provider with the temp pdf path 60 | super().__init__(self.temp_pdf_path, config) 61 | 
62 |     def __del__(self):
63 |         if os.path.exists(self.temp_pdf_path):
64 |             os.remove(self.temp_pdf_path)
65 | 
66 |     def convert_epub_to_pdf(self, filepath):
67 |         from weasyprint import CSS, HTML
68 |         from ebooklib import epub
69 |         import ebooklib
70 | 
71 |         ebook = epub.read_epub(filepath)
72 | 
73 |         styles = []
74 |         html_content = ""
75 |         img_tags = {}
76 | 
77 |         for item in ebook.get_items():
78 |             if item.get_type() == ebooklib.ITEM_IMAGE:
79 |                 img_data = base64.b64encode(item.get_content()).decode("utf-8")
80 |                 img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}'
81 |             elif item.get_type() == ebooklib.ITEM_STYLE:
82 |                 styles.append(item.get_content().decode('utf-8'))
83 | 
84 |         for item in ebook.get_items():
85 |             if item.get_type() == ebooklib.ITEM_DOCUMENT:
86 |                 html_content += item.get_content().decode("utf-8")
87 | 
88 |         soup = BeautifulSoup(html_content, 'html.parser')
89 |         for img in soup.find_all('img'):
90 |             src = img.get('src')
91 |             if src:
92 |                 normalized_src = src.replace('../', '')
93 |                 if normalized_src in img_tags:
94 |                     img['src'] = img_tags[normalized_src]
95 | 
96 |         for image in soup.find_all('image'):
97 |             src = image.get('xlink:href')
98 |             if src:
99 |                 normalized_src = src.replace('../', '')
100 |                 if normalized_src in img_tags:
101 |                     image['xlink:href'] = img_tags[normalized_src]
102 | 
103 |         html_content = str(soup)
104 |         full_style = ''.join([css])  # the EPUB's own stylesheets (collected in `styles`) are intentionally not applied
105 | 
106 |         # Convert the assembled HTML to a PDF with WeasyPrint
107 |         HTML(string=html_content, base_url=filepath).write_pdf(
108 |             self.temp_pdf_path,
109 |             stylesheets=[CSS(string=full_style), self.get_font_css()]
110 |         )
111 | 
```
--------------------------------------------------------------------------------
/tests/processors/test_table_processor.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 | 
3 | import pytest
4 | 
5 | from marker.renderers.markdown import MarkdownRenderer
6 | from marker.schema import BlockTypes
7 | from marker.processors.table import TableProcessor
8 | from marker.schema.blocks import TableCell
9 | 
10 | 
11 | @pytest.mark.config({"page_range": [5]})
12 | def test_table_processor(
13 |     pdf_document, recognition_model, table_rec_model, detection_model
14 | ):
15 |     processor = TableProcessor(recognition_model, table_rec_model, detection_model)
16 |     processor(pdf_document)
17 | 
18 |     for block in pdf_document.pages[0].children:
19 |         if block.block_type == BlockTypes.Table:
20 |             children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,))
21 |             assert children
22 |             assert len(children) > 0
23 |             assert isinstance(children[0], TableCell)
24 | 
25 |     assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2
26 | 
27 |     renderer = MarkdownRenderer()
28 |     table_output = renderer(pdf_document)
29 |     assert "Schedule" in table_output.markdown
30 | 
31 | 
32 | @pytest.mark.filename("table_ex.pdf")
33 | @pytest.mark.config({"page_range": [0], "force_ocr": True})
34 | def test_avoid_double_ocr(
35 |     pdf_document, recognition_model, table_rec_model, detection_model
36 | ):
37 |     tables = pdf_document.contained_blocks((BlockTypes.Table,))
38 |     lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,))
39 |     assert len(lines) == 0
40 | 
41 |     processor = TableProcessor(
42 |         recognition_model, table_rec_model, detection_model, config={"force_ocr": True}
43 |     )
44 |     processor(pdf_document)
45 | 
46 |     renderer = MarkdownRenderer()
47 |     table_output = renderer(pdf_document)
48 |     assert "Participants" in table_output.markdown
49 | 
50 | 51 | @pytest.mark.filename("multicol-blocks.pdf") 52 | @pytest.mark.config({"page_range": [3]}) 53 | def test_overlap_blocks( 54 | pdf_document, detection_model, recognition_model, table_rec_model 55 | ): 56 | page = pdf_document.pages[0] 57 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text( 58 | pdf_document 59 | ) 60 | 61 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 62 | processor(pdf_document) 63 | 64 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text( 65 | pdf_document 66 | ) 67 | 68 | 69 | @pytest.mark.filename("pres.pdf") 70 | @pytest.mark.config({"page_range": [4]}) 71 | def test_ocr_table(pdf_document, recognition_model, table_rec_model, detection_model): 72 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 73 | processor(pdf_document) 74 | 75 | renderer = MarkdownRenderer() 76 | table_output = renderer(pdf_document) 77 | assert "1.2E-38" in table_output.markdown 78 | 79 | 80 | @pytest.mark.config({"page_range": [11]}) 81 | def test_split_rows(pdf_document, recognition_model, table_rec_model, detection_model): 82 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 83 | processor(pdf_document) 84 | 85 | table = pdf_document.contained_blocks((BlockTypes.Table,))[-1] 86 | cells: List[TableCell] = table.contained_blocks( 87 | pdf_document, (BlockTypes.TableCell,) 88 | ) 89 | unique_rows = len(set([cell.row_id for cell in cells])) 90 | assert unique_rows == 6 91 | ``` -------------------------------------------------------------------------------- /marker/renderers/json.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, Dict, List, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.renderers import BaseRenderer 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import Block, BlockOutput 8 | from marker.schema.document import Document 9 | from marker.schema.registry import get_block_class 10 | 11 | 12 | class JSONBlockOutput(BaseModel): 13 | id: str 14 | block_type: str 15 | html: str 16 | polygon: List[List[float]] 17 | bbox: List[float] 18 | children: List["JSONBlockOutput"] | None = None 19 | section_hierarchy: Dict[int, str] | None = None 20 | images: dict | None = None 21 | 22 | 23 | class JSONOutput(BaseModel): 24 | children: List[JSONBlockOutput] 25 | block_type: str = str(BlockTypes.Document) 26 | metadata: dict 27 | 28 | 29 | def reformat_section_hierarchy(section_hierarchy): 30 | new_section_hierarchy = {} 31 | for key, value in section_hierarchy.items(): 32 | new_section_hierarchy[key] = str(value) 33 | return new_section_hierarchy 34 | 35 | 36 | class JSONRenderer(BaseRenderer): 37 | """ 38 | A renderer for JSON output. 
39 | """ 40 | 41 | image_blocks: Annotated[ 42 | Tuple[BlockTypes], 43 | "The list of block types to consider as images.", 44 | ] = (BlockTypes.Picture, BlockTypes.Figure) 45 | page_blocks: Annotated[ 46 | Tuple[BlockTypes], 47 | "The list of block types to consider as pages.", 48 | ] = (BlockTypes.Page,) 49 | 50 | def extract_json(self, document: Document, block_output: BlockOutput): 51 | cls = get_block_class(block_output.id.block_type) 52 | if cls.__base__ == Block: 53 | html, images = self.extract_block_html(document, block_output) 54 | return JSONBlockOutput( 55 | html=html, 56 | polygon=block_output.polygon.polygon, 57 | bbox=block_output.polygon.bbox, 58 | id=str(block_output.id), 59 | block_type=str(block_output.id.block_type), 60 | images=images, 61 | section_hierarchy=reformat_section_hierarchy( 62 | block_output.section_hierarchy 63 | ), 64 | ) 65 | else: 66 | children = [] 67 | for child in block_output.children: 68 | child_output = self.extract_json(document, child) 69 | children.append(child_output) 70 | 71 | return JSONBlockOutput( 72 | html=block_output.html, 73 | polygon=block_output.polygon.polygon, 74 | bbox=block_output.polygon.bbox, 75 | id=str(block_output.id), 76 | block_type=str(block_output.id.block_type), 77 | children=children, 78 | section_hierarchy=reformat_section_hierarchy( 79 | block_output.section_hierarchy 80 | ), 81 | ) 82 | 83 | def __call__(self, document: Document) -> JSONOutput: 84 | document_output = document.render(self.block_config) 85 | json_output = [] 86 | for page_output in document_output.children: 87 | json_output.append(self.extract_json(document, page_output)) 88 | return JSONOutput( 89 | children=json_output, 90 | metadata=self.generate_document_metadata(document, document_output), 91 | ) 92 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_image_description.py: -------------------------------------------------------------------------------- ```python 1 | from pydantic import BaseModel 2 | 3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | from typing import Annotated, List 9 | 10 | 11 | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor): 12 | block_types = ( 13 | BlockTypes.Picture, 14 | BlockTypes.Figure, 15 | ) 16 | extract_images: Annotated[bool, "Extract images from the document."] = True 17 | image_description_prompt: Annotated[ 18 | str, 19 | "The prompt to use for generating image descriptions.", 20 | "Default is a string containing the Gemini prompt.", 21 | ] = """You are a document analysis expert who specializes in creating text descriptions for images. 22 | You will receive an image of a picture or figure. Your job will be to create a short description of the image. 23 | **Instructions:** 24 | 1. Carefully examine the provided image. 25 | 2. Analyze any text that was extracted from within the image. 26 | 3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output. 27 | **Example:** 28 | Input: 29 | ```text 30 | "Fruit Preference Survey" 31 | 20, 15, 10 32 | Apples, Bananas, Oranges 33 | ``` 34 | Output: 35 | In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. 
The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges. 36 | **Input:** 37 | ```text 38 | {raw_text} 39 | ``` 40 | """ 41 | 42 | def inference_blocks(self, document: Document) -> List[BlockData]: 43 | blocks = super().inference_blocks(document) 44 | if self.extract_images: 45 | return [] 46 | return blocks 47 | 48 | def block_prompts(self, document: Document) -> List[PromptData]: 49 | prompt_data = [] 50 | for block_data in self.inference_blocks(document): 51 | block = block_data["block"] 52 | prompt = self.image_description_prompt.replace( 53 | "{raw_text}", block.raw_text(document) 54 | ) 55 | image = self.extract_image(document, block) 56 | 57 | prompt_data.append( 58 | { 59 | "prompt": prompt, 60 | "image": image, 61 | "block": block, 62 | "schema": ImageSchema, 63 | "page": block_data["page"], 64 | } 65 | ) 66 | 67 | return prompt_data 68 | 69 | def rewrite_block( 70 | self, response: dict, prompt_data: PromptData, document: Document 71 | ): 72 | block = prompt_data["block"] 73 | 74 | if not response or "image_description" not in response: 75 | block.update_metadata(llm_error_count=1) 76 | return 77 | 78 | image_description = response["image_description"] 79 | if len(image_description) < 10: 80 | block.update_metadata(llm_error_count=1) 81 | return 82 | 83 | block.description = image_description 84 | 85 | 86 | class ImageSchema(BaseModel): 87 | image_description: str 88 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/table.py: -------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | from typing import Dict, List 3 | 4 | import tabulate 5 | 6 | from benchmarks.overall.schema import FullResult 7 | 8 | def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str): 9 | table = tabulate.tabulate(rows, headers=headers, tablefmt="github") 10 | with open(out_path / filename, "w", encoding="utf-8") as f: 11 | f.write(f"# {title}\n") 12 | f.write(table) 13 | print(title) 14 | print(table) 15 | 16 | 17 | def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"): 18 | document_types = list(result["averages_by_type"][default_method][default_score_type].keys()) 19 | headers = ["Document Type"] 20 | for method in methods: 21 | for score_type in score_types: 22 | headers.append(f"{method} {score_type}") 23 | 24 | document_rows = [[k] for k in document_types] 25 | for i, doc_type in enumerate(document_types): 26 | for method in methods: 27 | for score_type in score_types: 28 | avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type])) 29 | document_rows[i].append(avg_score) 30 | 31 | write_table("Document Types", document_rows, headers, out_path, "document_types.md") 32 | 33 | headers = ["Block Type"] 34 | block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks 35 | block_score_types = list(result["averages_by_block_type"][default_method].keys()) 36 | for method in methods: 37 | for score_type in block_score_types: 38 | headers.append(f"{method} {score_type}") 39 | 40 | block_rows = [[k] for k in block_types] 41 | for 
i, block_type in enumerate(block_types): 42 | for method in methods: 43 | for score_type in block_score_types: 44 | avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type])) 45 | block_rows[i].append(avg_score) 46 | 47 | write_table("Block types", block_rows, headers, out_path, "block_types.md") 48 | 49 | headers = ["Method", "Avg Time"] + score_types 50 | inference_rows = [[k] for k in methods] 51 | all_raw_scores = [result["scores"][i] for i in result["scores"]] 52 | for i, method in enumerate(methods): 53 | avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method])) 54 | inference_rows[i].append(avg_time) 55 | for score_type in score_types: 56 | scores_lst = [] 57 | for ar in all_raw_scores: 58 | try: 59 | # Sometimes a few llm scores are missing 60 | scores_lst.append(ar[method][score_type]["score"]) 61 | except KeyError: 62 | continue 63 | avg_score = sum(scores_lst) / max(1, len(scores_lst)) 64 | inference_rows[i].append(avg_score) 65 | 66 | write_table("Overall Results", inference_rows, headers, out_path, "overall.md") 67 | 68 | print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.") ``` -------------------------------------------------------------------------------- /marker/renderers/chunk.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | from typing import List, Dict 3 | 4 | from bs4 import BeautifulSoup 5 | from pydantic import BaseModel 6 | 7 | from marker.renderers.json import JSONRenderer, JSONBlockOutput 8 | from marker.schema.document import Document 9 | 10 | 11 | class FlatBlockOutput(BaseModel): 12 | id: str 13 | block_type: str 14 | html: str 15 | page: int 16 | polygon: List[List[float]] 17 | bbox: List[float] 18 | section_hierarchy: Dict[int, str] | None = None 19 | images: dict | None = None 20 | 21 | 22 | class ChunkOutput(BaseModel): 23 | blocks: List[FlatBlockOutput] 24 | page_info: Dict[int, dict] 25 | metadata: dict 26 | 27 | def collect_images(block: JSONBlockOutput) -> dict[str, str]: 28 | if not getattr(block, "children", None): 29 | return block.images or {} 30 | else: 31 | images = block.images or {} 32 | for child_block in block.children: 33 | images.update(collect_images(child_block)) 34 | return images 35 | 36 | def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[str]) -> str: 37 | if not getattr(block, "children", None): 38 | if block.block_type in image_blocks: 39 | return f"<p>{block.html}<img src='{block.id}'></p>" 40 | else: 41 | return block.html 42 | 43 | child_html = [assemble_html_with_images(child, image_blocks) for child in block.children] 44 | child_ids = [child.id for child in block.children] 45 | 46 | soup = BeautifulSoup(block.html, "html.parser") 47 | content_refs = soup.find_all("content-ref") 48 | for ref in content_refs: 49 | src_id = ref.attrs["src"] 50 | if src_id in child_ids: 51 | ref.replace_with(child_html[child_ids.index(src_id)]) 52 | 53 | return html.unescape(str(soup)) 54 | 55 | def json_to_chunks( 56 | block: JSONBlockOutput, image_blocks: set[str], page_id: int=0) -> FlatBlockOutput | List[FlatBlockOutput]: 57 | if block.block_type == "Page": 58 | children = block.children 59 | page_id = int(block.id.split("/")[-1]) 60 | return [json_to_chunks(child, image_blocks, page_id=page_id) for child in children] 61 | 
else: 62 | return FlatBlockOutput( 63 | id=block.id, 64 | block_type=block.block_type, 65 | html=assemble_html_with_images(block, image_blocks), 66 | page=page_id, 67 | polygon=block.polygon, 68 | bbox=block.bbox, 69 | section_hierarchy=block.section_hierarchy, 70 | images=collect_images(block), 71 | ) 72 | 73 | 74 | class ChunkRenderer(JSONRenderer): 75 | 76 | def __call__(self, document: Document) -> ChunkOutput: 77 | document_output = document.render(self.block_config) 78 | json_output = [] 79 | for page_output in document_output.children: 80 | json_output.append(self.extract_json(document, page_output)) 81 | 82 | # This will get the top-level blocks from every page 83 | chunk_output = [] 84 | for item in json_output: 85 | chunks = json_to_chunks(item, set([str(block) for block in self.image_blocks])) 86 | chunk_output.extend(chunks) 87 | 88 | page_info = { 89 | page.page_id: {"bbox": page.polygon.bbox, "polygon": page.polygon.polygon} 90 | for page in document.pages 91 | } 92 | 93 | return ChunkOutput( 94 | blocks=chunk_output, 95 | page_info=page_info, 96 | metadata=self.generate_document_metadata(document, document_output), 97 | ) 98 | ``` -------------------------------------------------------------------------------- /marker/schema/document.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | 3 | from typing import List, Sequence, Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | from marker.schema import BlockTypes 8 | from marker.schema.blocks import Block, BlockId, BlockOutput 9 | from marker.schema.groups.page import PageGroup 10 | 11 | 12 | class DocumentOutput(BaseModel): 13 | children: List[BlockOutput] 14 | html: str 15 | block_type: BlockTypes = BlockTypes.Document 16 | 17 | 18 | class TocItem(BaseModel): 19 | title: str 20 | heading_level: int 21 | page_id: int 22 | polygon: List[List[float]] 23 | 24 | 25 | class Document(BaseModel): 26 | filepath: str 27 | pages: List[PageGroup] 28 | block_type: BlockTypes = BlockTypes.Document 29 | table_of_contents: List[TocItem] | None = None 30 | debug_data_path: str | None = None # Path that debug data was saved to 31 | 32 | def get_block(self, block_id: BlockId): 33 | page = self.get_page(block_id.page_id) 34 | block = page.get_block(block_id) 35 | if block: 36 | return block 37 | return None 38 | 39 | def get_page(self, page_id): 40 | for page in self.pages: 41 | if page.page_id == page_id: 42 | return page 43 | return None 44 | 45 | def get_next_block( 46 | self, block: Block, ignored_block_types: List[BlockTypes] = None 47 | ): 48 | if ignored_block_types is None: 49 | ignored_block_types = [] 50 | next_block = None 51 | 52 | # Try to find the next block in the current page 53 | page = self.get_page(block.page_id) 54 | next_block = page.get_next_block(block, ignored_block_types) 55 | if next_block: 56 | return next_block 57 | 58 | # If no block found, search subsequent pages 59 | for page in self.pages[self.pages.index(page) + 1 :]: 60 | next_block = page.get_next_block(None, ignored_block_types) 61 | if next_block: 62 | return next_block 63 | return None 64 | 65 | def get_next_page(self, page: PageGroup): 66 | page_idx = self.pages.index(page) 67 | if page_idx + 1 < len(self.pages): 68 | return self.pages[page_idx + 1] 69 | return None 70 | 71 | def get_prev_block(self, block: Block): 72 | page = self.get_page(block.page_id) 73 | prev_block = page.get_prev_block(block) 74 | if prev_block: 75 | return prev_block 76 | prev_page = 
self.get_prev_page(page) 77 | if not prev_page: 78 | return None 79 | return prev_page.get_block(prev_page.structure[-1]) 80 | 81 | def get_prev_page(self, page: PageGroup): 82 | page_idx = self.pages.index(page) 83 | if page_idx > 0: 84 | return self.pages[page_idx - 1] 85 | return None 86 | 87 | def assemble_html( 88 | self, child_blocks: List[Block], block_config: Optional[dict] = None 89 | ): 90 | template = "" 91 | for c in child_blocks: 92 | template += f"<content-ref src='{c.id}'></content-ref>" 93 | return template 94 | 95 | def render(self, block_config: Optional[dict] = None): 96 | child_content = [] 97 | section_hierarchy = None 98 | for page in self.pages: 99 | rendered = page.render(self, None, section_hierarchy, block_config) 100 | section_hierarchy = rendered.section_hierarchy.copy() 101 | child_content.append(rendered) 102 | 103 | return DocumentOutput( 104 | children=child_content, 105 | html=self.assemble_html(child_content, block_config), 106 | ) 107 | 108 | def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]: 109 | blocks = [] 110 | for page in self.pages: 111 | blocks += page.contained_blocks(self, block_types) 112 | return blocks 113 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/heuristic.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from rapidfuzz import fuzz 4 | 5 | from benchmarks.overall.scorers.clean import MarkdownCleaner 6 | from benchmarks.overall.scorers.schema import BlockScores 7 | from benchmarks.overall.scorers import BaseScorer 8 | 9 | 10 | class HeuristicScorer(BaseScorer): 11 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: 12 | if not method_markdown: 13 | return { 14 | "score": 0, 15 | "specific_scores": { 16 | "order": 0, 17 | "by_block": [0] * len(gt_markdown) 18 | } 19 | } 20 | 21 | # Standardize inputs 22 | gt_markdown = [self.clean_input(block) for block in gt_markdown] 23 | method_markdown = self.clean_input(method_markdown) 24 | 25 | alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown) 26 | scores = [alignment["score"] for alignment in alignments] 27 | 28 | # Find order score 29 | orders = [alignment["start"] for alignment in alignments] 30 | correct_order = list(range(len(gt_markdown))) 31 | actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x]) 32 | order_score = self.kendall_tau(correct_order, actual_order) 33 | 34 | # Weight score by sequence length 35 | gt_weights = [len(g) for g in gt_markdown] 36 | weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)] 37 | 38 | # Weight the score by sequence length 39 | overall_score = sum(weighted_scores) / max(1, sum(gt_weights)) 40 | overall_score = overall_score * 0.8 + order_score * 0.2 41 | return { 42 | "score": overall_score, 43 | "specific_scores": { 44 | "order": order_score, 45 | "by_block": scores 46 | }, 47 | } 48 | 49 | @staticmethod 50 | def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: 51 | n = len(correct_order) 52 | concordant = 0 53 | discordant = 0 54 | 55 | if n <= 1: 56 | return 100 57 | 58 | for i in range(n): 59 | for j in range(i + 1, n): 60 | correct_sign = correct_order[i] - correct_order[j] 61 | actual_sign = actual_order[i] - actual_order[j] 62 | 63 | if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0): 64 | concordant += 1 65 | elif 
(correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0): 66 | discordant += 1 67 | 68 | total_pairs = (n * (n - 1)) // 2 69 | tau = (concordant - discordant) / total_pairs 70 | tau = (tau + 1) / 2 # 0-1 scale 71 | return tau * 100 # 0-100 scale 72 | 73 | @staticmethod 74 | def find_fuzzy_alignments( 75 | main_string: str, 76 | substrings: List[str], 77 | threshold: int = 70 78 | ) -> List[dict]: 79 | alignments = [] 80 | 81 | for idx, substr in enumerate(substrings): 82 | result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold) 83 | 84 | score = 0 85 | dest_start = 0 86 | dest_end = 0 87 | if result: 88 | score = result.score 89 | dest_start = result.dest_start 90 | dest_end = result.dest_end 91 | 92 | alignments.append({ 93 | "string": substr, 94 | "start": dest_start, 95 | "end": dest_end, 96 | "score": score, 97 | "idx": idx 98 | }) 99 | return alignments 100 | 101 | 102 | @staticmethod 103 | def clean_input(md: str): 104 | cleaner = MarkdownCleaner() 105 | return cleaner(md) ``` -------------------------------------------------------------------------------- /marker/output.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | from pydantic import BaseModel 6 | from PIL import Image 7 | 8 | from marker.renderers.extraction import ExtractionOutput 9 | from marker.renderers.html import HTMLOutput 10 | from marker.renderers.json import JSONOutput, JSONBlockOutput 11 | from marker.renderers.markdown import MarkdownOutput 12 | from marker.renderers.ocr_json import OCRJSONOutput 13 | from marker.schema.blocks import BlockOutput 14 | from marker.settings import settings 15 | 16 | 17 | def unwrap_outer_tag(html: str): 18 | soup = BeautifulSoup(html, "html.parser") 19 | contents = list(soup.contents) 20 | if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p": 21 | # Unwrap the p tag 22 | soup.p.unwrap() 23 | 24 | return str(soup) 25 | 26 | 27 | def json_to_html(block: JSONBlockOutput | BlockOutput): 28 | # Utility function to take in json block output and give html for the block. 
29 |     if not getattr(block, "children", None):
30 |         return block.html
31 |     else:
32 |         child_html = [json_to_html(child) for child in block.children]
33 |         child_ids = [child.id for child in block.children]
34 | 
35 |         soup = BeautifulSoup(block.html, "html.parser")
36 |         content_refs = soup.find_all("content-ref")
37 |         for ref in content_refs:
38 |             src_id = ref.attrs["src"]
39 |             if src_id in child_ids:
40 |                 child_soup = BeautifulSoup(
41 |                     child_html[child_ids.index(src_id)], "html.parser"
42 |                 )
43 |                 ref.replace_with(child_soup)
44 |         return str(soup)
45 | 
46 | 
47 | def output_exists(output_dir: str, fname_base: str):
48 |     exts = ["md", "html", "json"]
49 |     for ext in exts:
50 |         if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")):
51 |             return True
52 |     return False
53 | 
54 | 
55 | def text_from_rendered(rendered: BaseModel):
56 |     from marker.renderers.chunk import ChunkOutput  # Has an import from this file
57 | 
58 |     if isinstance(rendered, MarkdownOutput):
59 |         return rendered.markdown, "md", rendered.images
60 |     elif isinstance(rendered, HTMLOutput):
61 |         return rendered.html, "html", rendered.images
62 |     elif isinstance(rendered, JSONOutput):
63 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64 |     elif isinstance(rendered, ChunkOutput):
65 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
66 |     elif isinstance(rendered, OCRJSONOutput):
67 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
68 |     elif isinstance(rendered, ExtractionOutput):
69 |         return rendered.document_json, "json", {}
70 |     else:
71 |         raise ValueError("Invalid output type")
72 | 
73 | 
74 | def convert_if_not_rgb(image: Image.Image) -> Image.Image:
75 |     if image.mode != "RGB":
76 |         image = image.convert("RGB")
77 |     return image
78 | 
79 | 
80 | def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
81 |     text, ext, images = text_from_rendered(rendered)
82 |     text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode(
83 |         settings.OUTPUT_ENCODING
84 |     )
85 | 
86 |     with open(
87 |         os.path.join(output_dir, f"{fname_base}.{ext}"),
88 |         "w+",
89 |         encoding=settings.OUTPUT_ENCODING,
90 |     ) as f:
91 |         f.write(text)
92 |     with open(
93 |         os.path.join(output_dir, f"{fname_base}_meta.json"),
94 |         "w+",
95 |         encoding=settings.OUTPUT_ENCODING,
96 |     ) as f:
97 |         f.write(json.dumps(rendered.metadata, indent=2))
98 | 
99 |     for img_name, img in images.items():
100 |         img = convert_if_not_rgb(img)  # RGBA images can't save as JPG
101 |         img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
102 | 
```
--------------------------------------------------------------------------------
/marker/processors/block_relabel.py:
--------------------------------------------------------------------------------
```python
1 | from copy import deepcopy
2 | from typing import Annotated
3 | 
4 | from marker.processors import BaseProcessor
5 | from marker.schema import BlockTypes
6 | from marker.schema.blocks import BlockId
7 | from marker.schema.document import Document
8 | from marker.schema.registry import get_block_class
9 | 
10 | from marker.logger import get_logger
11 | logger = get_logger()
12 | 
13 | class BlockRelabelProcessor(BaseProcessor):
14 |     """
15 |     A processor to heuristically relabel blocks based on a confidence threshold.
16 | 
17 |     Each rule in the relabel string maps an original block label to a new one;
18 |     a block is relabeled only when the confidence in its original label does not exceed the rule's threshold.
19 |     """
20 | 
21 |     block_relabel_str: Annotated[
22 |         str,
23 |         "Comma-separated relabeling rules in the format '<original_label>:<new_label>:<confidence_threshold>'.",
24 |         "Each rule defines how blocks of a certain type should be relabeled when the confidence in the original label does not exceed the threshold.",
25 |         "Example: 'Table:Picture:0.85,Form:Picture:0.9'"
26 |     ] = ""
27 | 
28 |     def __init__(self, config=None):
29 |         super().__init__(config)
30 |         self.block_relabel_map = {}
31 | 
32 |         if not self.block_relabel_str:
33 |             return
34 | 
35 |         for i, block_config_str in enumerate(self.block_relabel_str.split(',')):
36 |             block_config_str = block_config_str.strip()
37 |             if not block_config_str:
38 |                 continue  # Skip empty segments
39 | 
40 |             try:
41 |                 parts = block_config_str.split(':')
42 |                 if len(parts) != 3:
43 |                     raise ValueError(f"Expected 3 parts, got {len(parts)}")
44 | 
45 |                 block_label, block_relabel, confidence_str = parts
46 |                 confidence_thresh = float(confidence_str)
47 | 
48 |                 block_type = BlockTypes[block_label]
49 |                 relabel_block_type = BlockTypes[block_relabel]
50 | 
51 |                 self.block_relabel_map[block_type] = (
52 |                     confidence_thresh,
53 |                     relabel_block_type
54 |                 )
55 |             except Exception as e:
56 |                 logger.warning(f"Failed to parse relabel rule '{block_config_str}' at index {i}: {e}. Expected format is <original_label>:<new_label>:<confidence_threshold>")
57 | 
58 |     def __call__(self, document: Document):
59 |         if len(self.block_relabel_map) == 0:
60 |             return
61 | 
62 |         for page in document.pages:
63 |             for block in page.structure_blocks(document):
64 |                 if block.block_type not in self.block_relabel_map:
65 |                     continue
66 | 
67 |                 block_id = BlockId(page_id=page.page_id, block_id=block.block_id, block_type=block.block_type)
68 |                 confidence_thresh, relabel_block_type = self.block_relabel_map[block.block_type]
69 |                 confidence = block.top_k.get(block.block_type, 0)  # default to 0 so a missing score is never compared against the threshold as None
70 |                 if confidence > confidence_thresh:
71 |                     logger.debug(f"Skipping relabel for {block_id}; Confidence: {confidence} > Confidence Threshold {confidence_thresh} for re-labelling")
72 |                     continue
73 | 
74 |                 new_block_cls = get_block_class(relabel_block_type)
75 |                 new_block = new_block_cls(
76 |                     polygon=deepcopy(block.polygon),
77 |                     page_id=block.page_id,
78 |                     structure=deepcopy(block.structure),
79 |                     text_extraction_method=block.text_extraction_method,
80 |                     source="heuristics",
81 |                     top_k=block.top_k,
82 |                     metadata=block.metadata
83 |                 )
84 |                 page.replace_block(block, new_block)
85 |                 logger.debug(f"Relabelled {block_id} to {relabel_block_type}")
```
--------------------------------------------------------------------------------
/marker/providers/spreadsheet.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import tempfile
3 | 
4 | from marker.providers.pdf import PdfProvider
5 | 
6 | css = '''
7 | @page {
8 |     size: A4 landscape;
9 |     margin: 1.5cm;
10 | }
11 | 
12 | table {
13 |     width: 100%;
14 |     border-collapse: collapse;
15 |     break-inside: auto;
16 |     font-size: 10pt;
17 | }
18 | 
19 | tr {
20 |     break-inside: avoid;
21 |     page-break-inside: avoid;
22 | }
23 | 
24 | td {
25 |     border: 0.75pt solid #000;
26 |     padding: 6pt;
27 | }
28 | '''
29 | 
30 | 
31 | class SpreadSheetProvider(PdfProvider):
32 |     def __init__(self, filepath: str, config=None):
33 |         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
34 |         self.temp_pdf_path = temp_pdf.name
35 |         temp_pdf.close()
36 | 
37 |         # Convert XLSX to PDF
38 |         try:
39 |             self.convert_xlsx_to_pdf(filepath)
40 |         except Exception as e:
41 |             raise 
RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 42 | 43 | # Initialize the PDF provider with the temp pdf path 44 | super().__init__(self.temp_pdf_path, config) 45 | 46 | def __del__(self): 47 | if os.path.exists(self.temp_pdf_path): 48 | os.remove(self.temp_pdf_path) 49 | 50 | def convert_xlsx_to_pdf(self, filepath: str): 51 | from weasyprint import CSS, HTML 52 | from openpyxl import load_workbook 53 | 54 | html = "" 55 | workbook = load_workbook(filepath) 56 | if workbook is not None: 57 | for sheet_name in workbook.sheetnames: 58 | sheet = workbook[sheet_name] 59 | html += f'<div><h1>{sheet_name}</h1>' + self._excel_to_html_table(sheet) + '</div>' 60 | else: 61 | raise ValueError("Invalid XLSX file") 62 | 63 | # We convert the HTML into a PDF 64 | HTML(string=html).write_pdf( 65 | self.temp_pdf_path, 66 | stylesheets=[CSS(string=css), self.get_font_css()] 67 | ) 68 | 69 | @staticmethod 70 | def _get_merged_cell_ranges(sheet): 71 | merged_info = {} 72 | for merged_range in sheet.merged_cells.ranges: 73 | min_col, min_row, max_col, max_row = merged_range.bounds 74 | merged_info[(min_row, min_col)] = { 75 | 'rowspan': max_row - min_row + 1, 76 | 'colspan': max_col - min_col + 1, 77 | 'range': merged_range 78 | } 79 | return merged_info 80 | 81 | def _excel_to_html_table(self, sheet): 82 | merged_cells = self._get_merged_cell_ranges(sheet) 83 | 84 | html = f'<table>' 85 | 86 | # Track cells we should skip due to being part of a merge range 87 | skip_cells = set() 88 | 89 | for row_idx, row in enumerate(sheet.rows, 1): 90 | html += '<tr>' 91 | for col_idx, cell in enumerate(row, 1): 92 | if (row_idx, col_idx) in skip_cells: 93 | continue 94 | 95 | # Check if this cell is the start of a merged range 96 | merge_info = merged_cells.get((row_idx, col_idx)) 97 | if merge_info: 98 | # Add cells to skip 99 | for r in range(row_idx, row_idx + merge_info['rowspan']): 100 | for c in range(col_idx, col_idx + merge_info['colspan']): 101 | if (r, c) != (row_idx, col_idx): 102 | skip_cells.add((r, c)) 103 | 104 | # Add merged cell with rowspan/colspan 105 | value = cell.value if cell.value is not None else '' 106 | html += f'<td rowspan="{merge_info["rowspan"]}" colspan="{merge_info["colspan"]}">{value}' 107 | else: 108 | # Regular cell 109 | value = cell.value if cell.value is not None else '' 110 | html += f'<td>{value}' 111 | 112 | html += '</td>' 113 | html += '</tr>' 114 | html += '</table>' 115 | return html 116 | ``` -------------------------------------------------------------------------------- /marker/processors/ignoretext.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | from collections import Counter 3 | from itertools import groupby 4 | from typing import Annotated, List 5 | 6 | from rapidfuzz import fuzz 7 | 8 | from marker.processors import BaseProcessor 9 | from marker.schema import BlockTypes 10 | from marker.schema.blocks import Block 11 | from marker.schema.document import Document 12 | 13 | 14 | class IgnoreTextProcessor(BaseProcessor): 15 | """ 16 | A processor for identifying and ignoring common text blocks in a document. 17 | These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers. 
18 | """ 19 | block_types = ( 20 | BlockTypes.Text, BlockTypes.SectionHeader, 21 | BlockTypes.TextInlineMath 22 | ) 23 | common_element_threshold: Annotated[ 24 | float, 25 | "The minimum ratio of pages a text block must appear on to be considered a common element.", 26 | "Blocks that meet or exceed this threshold are marked as common elements.", 27 | ] = 0.2 28 | common_element_min_blocks: Annotated[ 29 | int, 30 | "The minimum number of occurrences of a text block within a document to consider it a common element.", 31 | "This ensures that rare blocks are not mistakenly flagged.", 32 | ] = 3 33 | max_streak: Annotated[ 34 | int, 35 | "The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.", 36 | "Helps to identify patterns like repeated headers or footers.", 37 | ] = 3 38 | text_match_threshold: Annotated[ 39 | int, 40 | "The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.", 41 | "Higher values enforce stricter matching.", 42 | ] = 90 43 | 44 | def __call__(self, document: Document): 45 | first_blocks = [] 46 | last_blocks = [] 47 | for page in document.pages: 48 | initial_block = None 49 | last_block = None 50 | for block in page.contained_blocks(document, self.block_types): 51 | if block.structure is not None: 52 | if initial_block is None: 53 | initial_block = block 54 | 55 | last_block = block 56 | 57 | if initial_block is not None: 58 | first_blocks.append(initial_block) 59 | if last_block is not None: 60 | last_blocks.append(last_block) 61 | 62 | self.filter_common_elements(document, first_blocks) 63 | self.filter_common_elements(document, last_blocks) 64 | 65 | @staticmethod 66 | def clean_text(text): 67 | text = text.replace("\n", "").strip() 68 | text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line 69 | text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line 70 | return text 71 | 72 | def filter_common_elements(self, document, blocks: List[Block]): 73 | # We can't filter if we don't have enough pages to find common elements 74 | if len(blocks) < self.common_element_min_blocks: 75 | return 76 | 77 | text = [self.clean_text(b.raw_text(document)) for b in blocks] 78 | 79 | streaks = {} 80 | for key, group in groupby(text): 81 | streaks[key] = max(streaks.get(key, 0), len(list(group))) 82 | 83 | counter = Counter(text) 84 | common = [ 85 | k for k, v in counter.items() 86 | if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak) 87 | and v > self.common_element_min_blocks 88 | ] 89 | if len(common) == 0: 90 | return 91 | 92 | for t, b in zip(text, blocks): 93 | # Check against all common elements 94 | if any(fuzz.ratio(t, common_element) > self.text_match_threshold for common_element in common): 95 | b.ignore_for_output = True 96 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_complex.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | import markdown2 4 | from pydantic import BaseModel 5 | 6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor 7 | 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | 11 | 12 | class LLMComplexRegionProcessor(BaseLLMSimpleBlockProcessor): 13 | block_types = (BlockTypes.ComplexRegion,) 14 | complex_region_prompt = """You are a text correction expert specializing 
in accurately reproducing text from images. 15 | You will receive an image of a text block and the text that can be extracted from the image. 16 | Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible. 17 | 18 | Formatting should be in markdown, with the following rules: 19 | - * for italics, ** for bold, and ` for inline code. 20 | - Use <sup>...</sup> for superscripts. 21 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest. 22 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively. 23 | - Links should be formatted with [text](url). 24 | - Use ``` for code blocks. 25 | - Inline math should be formatted with <math>math expression</math>. 26 | - Display math should be formatted with <math display="block">math expression</math>. 27 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. 28 | - Tables should be formatted with markdown tables, with the headers bolded. 29 | 30 | **Instructions:** 31 | 1. Carefully examine the provided block image. 32 | 2. Analyze the existing text representation. 33 | 3. Generate the markdown representation of the content in the image. 34 | **Example:** 35 | Input: 36 | ```text 37 | Table 1: Car Sales 38 | ``` 39 | Output: 40 | ```markdown 41 | ## Table 1: Car Sales 42 | 43 | | Car | Sales | 44 | | --- | --- | 45 | | Honda | 100 | 46 | | Toyota | 200 | 47 | ``` 48 | **Input:** 49 | ```text 50 | {extracted_text} 51 | ``` 52 | """ 53 | 54 | def block_prompts(self, document: Document) -> List[PromptData]: 55 | prompt_data = [] 56 | for block in self.inference_blocks(document): 57 | text = block["block"].raw_text(document) 58 | prompt = self.complex_region_prompt.replace("{extracted_text}", text) 59 | image = self.extract_image(document, block["block"]) 60 | prompt_data.append({ 61 | "prompt": prompt, 62 | "image": image, 63 | "block": block["block"], 64 | "schema": ComplexSchema, 65 | "page": block["page"] 66 | }) 67 | return prompt_data 68 | 69 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document): 70 | block = prompt_data["block"] 71 | text = block.raw_text(document) 72 | 73 | if not response or "corrected_markdown" not in response: 74 | block.update_metadata(llm_error_count=1) 75 | return 76 | 77 | corrected_markdown = response["corrected_markdown"] 78 | 79 | # The original table is okay 80 | if "no corrections" in corrected_markdown.lower(): 81 | return 82 | 83 | # Potentially a partial response 84 | if len(corrected_markdown) < len(text) * .5: 85 | block.update_metadata(llm_error_count=1) 86 | return 87 | 88 | # Convert LLM markdown to html 89 | corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip() 90 | block.html = markdown2.markdown(corrected_markdown, extras=["tables"]) 91 | 92 | class ComplexSchema(BaseModel): 93 | corrected_markdown: str ``` -------------------------------------------------------------------------------- /tests/converters/test_pdf_converter.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | 3 | import pytest 4 | from 
marker.converters.pdf import PdfConverter 5 | from marker.renderers.markdown import MarkdownOutput 6 | 7 | 8 | @pytest.mark.output_format("markdown") 9 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True}) 10 | def test_pdf_converter(pdf_converter: PdfConverter, temp_doc): 11 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 12 | markdown = markdown_output.markdown 13 | 14 | # Basic assertions 15 | assert len(markdown) > 0 16 | assert "# Subspace Adversarial Training" in markdown 17 | 18 | # Some assertions for line joining across pages 19 | assert ( 20 | "AT solutions. However, these methods highly rely on specifically" in markdown 21 | ) # pgs: 1-2 22 | assert ( 23 | "(with adversarial perturbations), which harms natural accuracy, " in markdown 24 | ) # pgs: 3-4 25 | 26 | # Some assertions for line joining across columns 27 | assert "remain similar across a wide range of choices." in markdown # pg: 2 28 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8 29 | 30 | 31 | @pytest.mark.filename("manual.epub") 32 | @pytest.mark.config({"page_range": [0]}) 33 | def test_epub_converter(pdf_converter: PdfConverter, temp_doc): 34 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 35 | markdown = markdown_output.markdown 36 | 37 | # Basic assertions 38 | assert "Simple Sabotage Field Manual" in markdown 39 | 40 | 41 | @pytest.mark.filename("single_sheet.xlsx") 42 | @pytest.mark.config({"page_range": [0]}) 43 | def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc): 44 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 45 | markdown = markdown_output.markdown 46 | 47 | # Basic assertions 48 | assert "four" in markdown 49 | 50 | 51 | @pytest.mark.filename("china.html") 52 | @pytest.mark.config({"page_range": [10]}) 53 | def test_html_converter(pdf_converter: PdfConverter, temp_doc): 54 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 55 | markdown = markdown_output.markdown 56 | 57 | # Basic assertions 58 | assert "Republic of China" in markdown 59 | 60 | 61 | @pytest.mark.filename("gatsby.docx") 62 | @pytest.mark.config({"page_range": [0]}) 63 | def test_docx_converter(pdf_converter: PdfConverter, temp_doc): 64 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 65 | markdown = markdown_output.markdown 66 | 67 | # Basic assertions 68 | assert "The Decline of the American Dream in the 1920s" in markdown 69 | 70 | 71 | @pytest.mark.filename("lambda.pptx") 72 | @pytest.mark.config({"page_range": [0]}) 73 | def test_pptx_converter(pdf_converter: PdfConverter, temp_doc): 74 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 75 | markdown = markdown_output.markdown 76 | 77 | # Basic assertions 78 | assert "Adam Doupé" in markdown 79 | 80 | 81 | @pytest.mark.output_format("markdown") 82 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True}) 83 | def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc): 84 | with open(temp_doc.name, "rb") as f: 85 | data = f.read() 86 | 87 | input_bytes = io.BytesIO(data) 88 | markdown_output: MarkdownOutput = pdf_converter(input_bytes) 89 | markdown = markdown_output.markdown 90 | 91 | # Basic assertions 92 | assert len(markdown) > 0 93 | assert "# Subspace Adversarial Training" in markdown 94 | 95 | # Some assertions for line joining across pages 96 | assert ( 97 | "AT solutions. 
However, these methods highly rely on specifically" in markdown 98 | ) # pgs: 1-2 99 | assert ( 100 | "(with adversarial perturbations), which harms natural accuracy, " in markdown 101 | ) # pgs: 3-4 102 | 103 | # Some assertions for line joining across columns 104 | assert "remain similar across a wide range of choices." in markdown # pg: 2 105 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8 106 | ``` -------------------------------------------------------------------------------- /benchmarks/table/table.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS 3 | 4 | from pathlib import Path 5 | from itertools import repeat 6 | from typing import List 7 | 8 | import time 9 | import datasets 10 | from tqdm import tqdm 11 | import click 12 | from tabulate import tabulate 13 | import json 14 | from concurrent.futures import ProcessPoolExecutor 15 | 16 | from marker.settings import settings 17 | from benchmarks.table.inference import inference_tables 18 | 19 | from scoring import wrap_table_html, similarity_eval_html 20 | 21 | def update_teds_score(result, prefix: str = "marker"): 22 | prediction, ground_truth = result[f'{prefix}_table'], result['gt_table'] 23 | prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth) 24 | score = similarity_eval_html(prediction, ground_truth) 25 | result.update({f'{prefix}_score':score}) 26 | return result 27 | 28 | 29 | @click.command(help="Benchmark Table to HTML Conversion") 30 | @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.") 31 | @click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use") 32 | @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") 33 | @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use") 34 | @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.") 35 | @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.") 36 | @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.") 37 | def main( 38 | result_path: str, 39 | dataset: str, 40 | max_rows: int, 41 | max_workers: int, 42 | use_llm: bool, 43 | table_rec_batch_size: int | None, 44 | use_gemini: bool = False 45 | ): 46 | start = time.time() 47 | 48 | 49 | dataset = datasets.load_dataset(dataset, split='train') 50 | dataset = dataset.shuffle(seed=0) 51 | 52 | results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini) 53 | 54 | print(f"Total time: {time.time() - start}.") 55 | print(f"Could not align {total_unaligned} tables from fintabnet.") 56 | 57 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 58 | marker_results = list( 59 | tqdm( 60 | executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results) 61 | ) 62 | ) 63 | 64 | avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results) 65 | headers = ["Avg score", "Total tables"] 66 | data = [f"{avg_score:.3f}", len(marker_results)] 67 | gemini_results = None 68 | if use_gemini: 69 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 70 | gemini_results = 
list(
71 |                 tqdm(
72 |                     executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
73 |                     total=len(results)
74 |                 )
75 |             )
76 |         avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
77 |         headers.append("Avg Gemini score")
78 |         data.append(f"{avg_gemini_score:.3f}")
79 | 
80 |     table = tabulate([data], headers=headers, tablefmt="github")
81 |     print(table)
82 |     print("Avg score computed by comparing marker predicted HTML with original HTML")
83 | 
84 |     results = {
85 |         "marker": marker_results,
86 |         "gemini": gemini_results
87 |     }
88 | 
89 |     out_path = Path(result_path)
90 |     out_path.mkdir(parents=True, exist_ok=True)
91 |     with open(out_path / "table.json", "w+") as f:
92 |         json.dump(results, f, indent=2)
93 | 
94 |     print(f"Results saved to {out_path}.")
95 | 
96 | if __name__ == '__main__':
97 |     main()
```
--------------------------------------------------------------------------------
/benchmarks/table/scoring.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3 | """
4 | 
5 | import distance
6 | from apted import APTED, Config
7 | from apted.helpers import Tree
8 | from lxml import html
9 | from collections import deque
10 | 
11 | def wrap_table_html(table_html:str)->str:
12 |     return f'<html><body>{table_html}</body></html>'
13 | 
14 | class TableTree(Tree):
15 |     def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
16 |         self.tag = tag
17 |         self.colspan = colspan
18 |         self.rowspan = rowspan
19 |         self.content = content
20 | 
21 |         # Sets self.name and self.children
22 |         super().__init__(tag, *children)
23 | 
24 |     def bracket(self):
25 |         """Show tree using brackets notation"""
26 |         if self.tag == 'td':
27 |             result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
28 |                      (self.tag, self.colspan, self.rowspan, self.content)
29 |         else:
30 |             result = '"tag": %s' % self.tag
31 |         for child in self.children:
32 |             result += child.bracket()
33 |         return "{{{}}}".format(result)
34 | 
35 | class CustomConfig(Config):
36 |     @staticmethod
37 |     def maximum(*sequences):
38 |         return max(map(len, sequences))
39 | 
40 |     def normalized_distance(self, *sequences):
41 |         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
42 | 
43 |     def rename(self, node1, node2):
44 |         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
45 |             return 1.
46 |         if node1.tag == 'td':
47 |             if node1.content or node2.content:
48 |                 return self.normalized_distance(node1.content, node2.content)
49 |         return 0.
50 | 51 | def tokenize(node): 52 | """ 53 | Tokenizes table cells 54 | """ 55 | global __tokens__ 56 | __tokens__.append('<%s>' % node.tag) 57 | if node.text is not None: 58 | __tokens__ += list(node.text) 59 | for n in node.getchildren(): 60 | tokenize(n) 61 | if node.tag != 'unk': 62 | __tokens__.append('</%s>' % node.tag) 63 | if node.tag != 'td' and node.tail is not None: 64 | __tokens__ += list(node.tail) 65 | 66 | def tree_convert_html(node, convert_cell=False, parent=None): 67 | """ 68 | Converts HTML tree to the format required by apted 69 | """ 70 | global __tokens__ 71 | if node.tag == 'td': 72 | if convert_cell: 73 | __tokens__ = [] 74 | tokenize(node) 75 | cell = __tokens__[1:-1].copy() 76 | else: 77 | cell = [] 78 | new_node = TableTree(node.tag, 79 | int(node.attrib.get('colspan', '1')), 80 | int(node.attrib.get('rowspan', '1')), 81 | cell, *deque()) 82 | else: 83 | new_node = TableTree(node.tag, None, None, None, *deque()) 84 | if parent is not None: 85 | parent.children.append(new_node) 86 | if node.tag != 'td': 87 | for n in node.getchildren(): 88 | tree_convert_html(n, convert_cell, new_node) 89 | if parent is None: 90 | return new_node 91 | 92 | def similarity_eval_html(pred, true, structure_only=False): 93 | """ 94 | Computes TEDS score between the prediction and the ground truth of a given samples 95 | """ 96 | pred, true = html.fromstring(pred), html.fromstring(true) 97 | if pred.xpath('body/table') and true.xpath('body/table'): 98 | pred = pred.xpath('body/table')[0] 99 | true = true.xpath('body/table')[0] 100 | n_nodes_pred = len(pred.xpath(".//*")) 101 | n_nodes_true = len(true.xpath(".//*")) 102 | tree_pred = tree_convert_html(pred, convert_cell=not structure_only) 103 | tree_true = tree_convert_html(true, convert_cell=not structure_only) 104 | n_nodes = max(n_nodes_pred, n_nodes_true) 105 | distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance() 106 | return 1.0 - (float(distance) / n_nodes) 107 | else: 108 | return 0.0 109 | 110 | ``` -------------------------------------------------------------------------------- /marker/schema/text/span.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | import re 3 | from typing import List, Literal, Optional 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.blocks import Block 7 | from marker.util import unwrap_math 8 | 9 | 10 | def cleanup_text(full_text): 11 | full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text) 12 | full_text = full_text.replace("\xa0", " ") # Replace non-breaking spaces 13 | return full_text 14 | 15 | 16 | class Span(Block): 17 | block_type: BlockTypes = BlockTypes.Span 18 | block_description: str = "A span of text inside a line." 
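# Raw text plus font metadata for the span; `formats` records every style
# detected on it, and the properties below expose each format as a boolean
# flag for the HTML assembly step.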
19 | 20 | text: str 21 | font: str 22 | font_weight: float 23 | font_size: float 24 | minimum_position: int 25 | maximum_position: int 26 | formats: List[ 27 | Literal[ 28 | "plain", 29 | "math", 30 | "chemical", 31 | "bold", 32 | "italic", 33 | "highlight", 34 | "subscript", 35 | "superscript", 36 | "small", 37 | "code", 38 | "underline", 39 | ] 40 | ] 41 | has_superscript: bool = False 42 | has_subscript: bool = False 43 | url: Optional[str] = None 44 | html: Optional[str] = None 45 | 46 | @property 47 | def bold(self): 48 | return "bold" in self.formats 49 | 50 | @property 51 | def italic(self): 52 | return "italic" in self.formats 53 | 54 | @property 55 | def math(self): 56 | return "math" in self.formats 57 | 58 | @property 59 | def highlight(self): 60 | return "highlight" in self.formats 61 | 62 | @property 63 | def superscript(self): 64 | return "superscript" in self.formats 65 | 66 | @property 67 | def subscript(self): 68 | return "subscript" in self.formats 69 | 70 | @property 71 | def small(self): 72 | return "small" in self.formats 73 | 74 | @property 75 | def code(self): 76 | return "code" in self.formats 77 | 78 | @property 79 | def underline(self): 80 | return "underline" in self.formats 81 | 82 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 83 | if self.ignore_for_output: 84 | return "" 85 | 86 | if self.html: 87 | return self.html 88 | 89 | text = self.text 90 | 91 | # Remove trailing newlines 92 | replaced_newline = False 93 | while len(text) > 0 and text[-1] in ["\n", "\r"]: 94 | text = text[:-1] 95 | replaced_newline = True 96 | 97 | # Remove leading newlines 98 | while len(text) > 0 and text[0] in ["\n", "\r"]: 99 | text = text[1:] 100 | 101 | if replaced_newline and not text.endswith("-"): 102 | text += " " 103 | 104 | text = text.replace( 105 | "-\n", "" 106 | ) # Remove hyphenated line breaks from the middle of the span 107 | text = html.escape(text) 108 | text = cleanup_text(text) 109 | 110 | if self.has_superscript: 111 | text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text) 112 | 113 | # Handle full block superscript 114 | if "<sup>" not in text: 115 | text = f"<sup>{text}</sup>" 116 | 117 | if self.url: 118 | text = f"<a href='{self.url}'>{text}</a>" 119 | 120 | # TODO Support multiple formats 121 | if self.italic: 122 | text = f"<i>{text}</i>" 123 | elif self.bold: 124 | text = f"<b>{text}</b>" 125 | elif self.math: 126 | block_envs = ["split", "align", "gather", "multline"] 127 | if any(f"\\begin{{{env}}}" in text for env in block_envs): 128 | display_mode = "block" 129 | else: 130 | display_mode = "inline" 131 | text = f"<math display='{display_mode}'>{text}</math>" 132 | elif self.highlight: 133 | text = f"<mark>{text}</mark>" 134 | elif self.subscript: 135 | text = f"<sub>{text}</sub>" 136 | elif self.superscript: 137 | text = f"<sup>{text}</sup>" 138 | elif self.underline: 139 | text = f"<u>{text}</u>" 140 | elif self.small: 141 | text = f"<small>{text}</small>" 142 | elif self.code: 143 | text = f"<code>{text}</code>" 144 | 145 | text = unwrap_math(text) 146 | return text 147 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_handwriting.py: -------------------------------------------------------------------------------- ```python 1 | import markdown2 2 | from pydantic import BaseModel 3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document 
import Document 7 | 8 | from typing import Annotated, List 9 | 10 | 11 | class LLMHandwritingProcessor(BaseLLMSimpleBlockProcessor): 12 | block_types = (BlockTypes.Handwriting, BlockTypes.Text) 13 | handwriting_generation_prompt: Annotated[ 14 | str, 15 | "The prompt to use for OCRing handwriting.", 16 | "Default is a string containing the Gemini prompt." 17 | ] = """You are an expert editor specializing in accurately reproducing text from images. 18 | You will receive an image of a text block. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible. 19 | 20 | Formatting should be in markdown, with the following rules: 21 | - * for italics, ** for bold, and ` for inline code. 22 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest. 23 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively. 24 | - Links should be formatted with [text](url). 25 | - Use ``` for code blocks. 26 | - Inline math should be formatted with <math>math expression</math>. 27 | - Display math should be formatted with <math display="block">math expression</math>. 28 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. 29 | - Tables should be formatted with markdown tables, with the headers bolded. 30 | 31 | **Instructions:** 32 | 1. Carefully examine the provided block image. 33 | 2. Output the markdown representing the content of the image. 
34 | """ 35 | 36 | def inference_blocks(self, document: Document) -> List[BlockData]: 37 | blocks = super().inference_blocks(document) 38 | out_blocks = [] 39 | for block_data in blocks: 40 | raw_text = block_data["block"].raw_text(document) 41 | block = block_data["block"] 42 | 43 | # Don't process text blocks that contain lines already 44 | if block.block_type == BlockTypes.Text: 45 | lines = block.contained_blocks(document, (BlockTypes.Line,)) 46 | if len(lines) > 0 or len(raw_text.strip()) > 0: 47 | continue 48 | out_blocks.append(block_data) 49 | return out_blocks 50 | 51 | 52 | def block_prompts(self, document: Document) -> List[PromptData]: 53 | prompt_data = [] 54 | for block_data in self.inference_blocks(document): 55 | block = block_data["block"] 56 | prompt = self.handwriting_generation_prompt 57 | image = self.extract_image(document, block) 58 | 59 | prompt_data.append({ 60 | "prompt": prompt, 61 | "image": image, 62 | "block": block, 63 | "schema": HandwritingSchema, 64 | "page": block_data["page"] 65 | }) 66 | return prompt_data 67 | 68 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document): 69 | block = prompt_data["block"] 70 | raw_text = block.raw_text(document) 71 | 72 | if not response or "markdown" not in response: 73 | block.update_metadata(llm_error_count=1) 74 | return 75 | 76 | markdown = response["markdown"] 77 | if len(markdown) < len(raw_text) * .5: 78 | block.update_metadata(llm_error_count=1) 79 | return 80 | 81 | markdown = markdown.strip().lstrip("```markdown").rstrip("```").strip() 82 | block.html = markdown2.markdown(markdown, extras=["tables"]) 83 | 84 | class HandwritingSchema(BaseModel): 85 | markdown: str 86 | ``` -------------------------------------------------------------------------------- /marker/services/azure_openai.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | from typing import Annotated, List 4 | 5 | import PIL 6 | from marker.logger import get_logger 7 | from openai import AzureOpenAI, APITimeoutError, RateLimitError 8 | from PIL import Image 9 | from pydantic import BaseModel 10 | 11 | from marker.schema.blocks import Block 12 | from marker.services import BaseService 13 | 14 | logger = get_logger() 15 | 16 | 17 | class AzureOpenAIService(BaseService): 18 | azure_endpoint: Annotated[ 19 | str, "The Azure OpenAI endpoint URL. No trailing slash." 20 | ] = None 21 | azure_api_key: Annotated[ 22 | str, "The API key to use for the Azure OpenAI service." 23 | ] = None 24 | azure_api_version: Annotated[str, "The Azure OpenAI API version to use."] = None 25 | deployment_name: Annotated[ 26 | str, "The deployment name for the Azure OpenAI model." 
27 | ] = None 28 | 29 | def process_images(self, images: List[PIL.Image.Image]) -> list: 30 | if isinstance(images, Image.Image): 31 | images = [images] 32 | 33 | return [ 34 | { 35 | "type": "image_url", 36 | "image_url": { 37 | "url": "data:image/webp;base64,{}".format(self.img_to_base64(img)), 38 | }, 39 | } 40 | for img in images 41 | ] 42 | 43 | def __call__( 44 | self, 45 | prompt: str, 46 | image: PIL.Image.Image | List[PIL.Image.Image] | None, 47 | block: Block | None, 48 | response_schema: type[BaseModel], 49 | max_retries: int | None = None, 50 | timeout: int | None = None, 51 | ): 52 | if max_retries is None: 53 | max_retries = self.max_retries 54 | 55 | if timeout is None: 56 | timeout = self.timeout 57 | 58 | client = self.get_client() 59 | image_data = self.format_image_for_llm(image) 60 | 61 | messages = [ 62 | { 63 | "role": "user", 64 | "content": [ 65 | *image_data, 66 | {"type": "text", "text": prompt}, 67 | ], 68 | } 69 | ] 70 | 71 | total_tries = max_retries + 1 72 | for tries in range(1, total_tries + 1): 73 | try: 74 | response = client.beta.chat.completions.parse( 75 | extra_headers={ 76 | "X-Title": "Marker", 77 | "HTTP-Referer": "https://github.com/datalab-to/marker", 78 | }, 79 | model=self.deployment_name, 80 | messages=messages, 81 | timeout=timeout, 82 | response_format=response_schema, 83 | ) 84 | response_text = response.choices[0].message.content 85 | total_tokens = response.usage.total_tokens 86 | if block: 87 | block.update_metadata( 88 | llm_tokens_used=total_tokens, llm_request_count=1 89 | ) 90 | return json.loads(response_text) 91 | except (APITimeoutError, RateLimitError) as e: 92 | # Rate limit exceeded 93 | if tries == total_tries: 94 | # Last attempt failed. Give up 95 | logger.error( 96 | f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})" 97 | ) 98 | break 99 | else: 100 | wait_time = tries * self.retry_wait_time 101 | logger.warning( 102 | f"Rate limit error: {e}. Retrying in {wait_time} seconds... 
(Attempt {tries}/{total_tries})" 103 | ) 104 | time.sleep(wait_time) 105 | except Exception as e: 106 | logger.error(f"Azure OpenAI inference failed: {e}") 107 | break 108 | 109 | return {} 110 | 111 | def get_client(self) -> AzureOpenAI: 112 | return AzureOpenAI( 113 | api_version=self.azure_api_version, 114 | azure_endpoint=self.azure_endpoint, 115 | api_key=self.azure_api_key, 116 | ) 117 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/clean.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | import subprocess 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import latex2mathml.converter 7 | 8 | class MarkdownCleaner: 9 | def __init__(self): 10 | pass 11 | 12 | def __call__(self, markdown): 13 | markdown = self.normalize_markdown(markdown) # Use pandoc to normalize 14 | 15 | # Replace math expressions with latexml 16 | pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)' 17 | markdown = re.sub(pattern, self.standardize_math, markdown) 18 | 19 | # Replace image urls with a generic tag 20 | pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)' 21 | markdown = re.sub(pattern, r'![link]', markdown) 22 | 23 | # Clean up stray html tags 24 | markdown = markdown.replace("<br>", "\n") 25 | markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown) 26 | markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown) 27 | markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content 28 | 29 | # Clean up markdown formatting 30 | markdown = re.sub(r"\s+", " ", markdown) 31 | markdown = re.sub(r"\n+", "\n", markdown) 32 | markdown = re.sub("\\.+", ".", 33 | markdown) # Replace repeated periods with a single period, like in table of contents 34 | markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header 35 | markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly 36 | return markdown.strip().lower() 37 | 38 | @staticmethod 39 | def normalize_markdown(md_text: str) -> str: 40 | with tempfile.TemporaryDirectory() as tmp_dir: 41 | dirpath = Path(tmp_dir) 42 | input_file = dirpath / 'input.md' 43 | input_file.write_text(md_text, encoding='utf-8') 44 | 45 | # Markdown to HTML 46 | html_file = dirpath / 'temp.html' 47 | subprocess.run( 48 | [ 49 | 'pandoc', 50 | str(input_file), 51 | '-f', 'markdown+tex_math_dollars', 52 | '-t', 'html', 53 | '-o', str(html_file), 54 | '--quiet' 55 | ], 56 | check=True 57 | ) 58 | 59 | # HTML to Markdown 60 | output_file = dirpath / 'output.md' 61 | subprocess.run( 62 | [ 63 | 'pandoc', 64 | str(html_file), 65 | '-f', 'html', 66 | '-t', 'markdown+tex_math_dollars', 67 | '-o', str(output_file), 68 | '--quiet' 69 | ], 70 | check=True 71 | ) 72 | 73 | # Read back the normalized Markdown 74 | normalized_md = output_file.read_text(encoding='utf-8') 75 | 76 | return normalized_md 77 | 78 | def standardize_math(self, match): 79 | try: 80 | delim = "$$" if match.group(0).startswith('$$') else "$" 81 | math_content = match.group(1) or match.group(2) 82 | if delim == "$$": 83 | math_content = latex2mathml.converter.convert(math_content) 84 | else: 85 | math_content = self.clean_latex(math_content) 86 | return f'{delim}{math_content}{delim}' 87 | except Exception as e: 88 | print(f"Failed to standardize math expression: {match.group(0)} with error: {e}") 89 | return match.group(0) 90 | 91 | @staticmethod 92 | def clean_latex(latex_str): 93 | 
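# Normalize LaTeX before scoring: collapse whitespace, unwrap common text
# and style macros, and map equivalent commands to a single spelling so that
# purely notational differences do not count against a method.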
latex_str = re.sub(r'\s+', ' ', latex_str.strip()) 94 | for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']: 95 | latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str) 96 | 97 | replacements = { 98 | '\\times': '*', 99 | '\\cdot': '*', 100 | '\\div': '/', 101 | '\\le': '<=', 102 | '\\ge': '>=', 103 | '\\neq': '!=', 104 | '\\to': '\\rightarrow', 105 | } 106 | 107 | for old, new in replacements.items(): 108 | latex_str = latex_str.replace(old, new) 109 | 110 | return latex_str 111 | 112 | 113 | 114 | ``` -------------------------------------------------------------------------------- /marker/utils/gpu.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import subprocess 3 | import torch 4 | 5 | from marker.logger import get_logger 6 | from marker.settings import settings 7 | 8 | logger = get_logger() 9 | 10 | 11 | class GPUManager: 12 | default_gpu_vram: int = 8 13 | 14 | def __init__(self, device_idx: int): 15 | self.device_idx = device_idx 16 | self.original_compute_mode = None 17 | self.mps_server_process = None 18 | 19 | def __enter__(self): 20 | if self.using_cuda(): 21 | self.start_mps_server() 22 | return self 23 | 24 | def __exit__(self, exc_type, exc_val, exc_tb): 25 | if self.using_cuda(): 26 | self.cleanup() 27 | 28 | @staticmethod 29 | def using_cuda(): 30 | return "cuda" in settings.TORCH_DEVICE_MODEL 31 | 32 | def check_cuda_available(self) -> bool: 33 | if not torch.cuda.is_available(): 34 | return False 35 | try: 36 | subprocess.run(["nvidia-smi", "--version"], capture_output=True, check=True) 37 | return True 38 | except (subprocess.CalledProcessError, FileNotFoundError): 39 | return False 40 | 41 | def get_gpu_vram(self): 42 | if not self.using_cuda(): 43 | return self.default_gpu_vram 44 | 45 | try: 46 | result = subprocess.run( 47 | [ 48 | "nvidia-smi", 49 | "--query-gpu=memory.total", 50 | "--format=csv,noheader,nounits", 51 | "-i", 52 | str(self.device_idx), 53 | ], 54 | capture_output=True, 55 | text=True, 56 | check=True, 57 | ) 58 | 59 | vram_mb = int(result.stdout.strip()) 60 | vram_gb = int(vram_mb / 1024) 61 | return vram_gb 62 | 63 | except (subprocess.CalledProcessError, ValueError, FileNotFoundError): 64 | return self.default_gpu_vram 65 | 66 | def start_mps_server(self) -> bool: 67 | if not self.check_cuda_available(): 68 | return False 69 | 70 | try: 71 | # Set MPS environment with chunk-specific directories 72 | env = os.environ.copy() 73 | pipe_dir = f"/tmp/nvidia-mps-{self.device_idx}" 74 | log_dir = f"/tmp/nvidia-log-{self.device_idx}" 75 | env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir 76 | env["CUDA_MPS_LOG_DIRECTORY"] = log_dir 77 | 78 | # Create directories 79 | os.makedirs(pipe_dir, exist_ok=True) 80 | os.makedirs(log_dir, exist_ok=True) 81 | 82 | # Start MPS control daemon 83 | self.mps_server_process = subprocess.Popen( 84 | ["nvidia-cuda-mps-control", "-d"], 85 | env=env, 86 | stdout=subprocess.PIPE, 87 | stderr=subprocess.PIPE, 88 | ) 89 | 90 | logger.info(f"Started NVIDIA MPS server for chunk {self.device_idx}") 91 | return True 92 | except (subprocess.CalledProcessError, FileNotFoundError) as e: 93 | logger.warning( 94 | f"Failed to start MPS server for chunk {self.device_idx}: {e}" 95 | ) 96 | return False 97 | 98 | def stop_mps_server(self) -> None: 99 | try: 100 | # Stop MPS server 101 | env = os.environ.copy() 102 | env["CUDA_MPS_PIPE_DIRECTORY"] = f"/tmp/nvidia-mps-{self.device_idx}" 103 | env["CUDA_MPS_LOG_DIRECTORY"] = f"/tmp/nvidia-log-{self.device_idx}" 104 | 105 | 
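# Send "quit" on stdin to the MPS control daemon; this is the documented
# nvidia-cuda-mps-control command for shutting the server down.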
subprocess.run( 106 | ["nvidia-cuda-mps-control"], 107 | input="quit\n", 108 | text=True, 109 | env=env, 110 | timeout=10, 111 | ) 112 | 113 | if self.mps_server_process: 114 | self.mps_server_process.terminate() 115 | try: 116 | self.mps_server_process.wait(timeout=5) 117 | except subprocess.TimeoutExpired: 118 | self.mps_server_process.kill() 119 | self.mps_server_process = None 120 | 121 | logger.info(f"Stopped NVIDIA MPS server for chunk {self.device_idx}") 122 | except Exception as e: 123 | logger.warning( 124 | f"Failed to stop MPS server for chunk {self.device_idx}: {e}" 125 | ) 126 | 127 | def cleanup(self) -> None: 128 | self.stop_mps_server() 129 | ``` -------------------------------------------------------------------------------- /marker/processors/sectionheader.py: -------------------------------------------------------------------------------- ```python 1 | import warnings 2 | from typing import Annotated, Dict, List 3 | 4 | import numpy as np 5 | from sklearn.cluster import KMeans 6 | from sklearn.exceptions import ConvergenceWarning 7 | 8 | from marker.processors import BaseProcessor 9 | from marker.schema import BlockTypes 10 | from marker.schema.document import Document 11 | 12 | # Ignore sklearn warning about not converging 13 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 14 | 15 | 16 | class SectionHeaderProcessor(BaseProcessor): 17 | """ 18 | A processor for recognizing section headers in the document. 19 | """ 20 | block_types = (BlockTypes.SectionHeader, ) 21 | level_count: Annotated[ 22 | int, 23 | "The number of levels to use for headings.", 24 | ] = 4 25 | merge_threshold: Annotated[ 26 | float, 27 | "The minimum gap between headings to consider them part of the same group.", 28 | ] = 0.25 29 | default_level: Annotated[ 30 | int, 31 | "The default heading level to use if no heading level is detected.", 32 | ] = 2 33 | height_tolerance: Annotated[ 34 | float, 35 | "The minimum height of a heading to consider it a heading.", 36 | ] = 0.99 37 | 38 | def __call__(self, document: Document): 39 | line_heights: Dict[int, float] = {} 40 | for page in document.pages: 41 | # Iterate children to grab all section headers 42 | for block in page.children: 43 | if block.block_type not in self.block_types: 44 | continue 45 | if block.structure is not None: 46 | line_heights[block.id] = block.line_height(document) 47 | else: 48 | line_heights[block.id] = 0 49 | block.ignore_for_output = True # Don't output an empty section header 50 | 51 | flat_line_heights = list(line_heights.values()) 52 | heading_ranges = self.bucket_headings(flat_line_heights) 53 | 54 | for page in document.pages: 55 | # Iterate children to grab all section headers 56 | for block in page.children: 57 | if block.block_type not in self.block_types: 58 | continue 59 | block_height = line_heights.get(block.id, 0) 60 | if block_height > 0: 61 | for idx, (min_height, max_height) in enumerate(heading_ranges): 62 | if block_height >= min_height * self.height_tolerance: 63 | block.heading_level = idx + 1 64 | break 65 | 66 | if block.heading_level is None: 67 | block.heading_level = self.default_level 68 | 69 | def bucket_headings(self, line_heights: List[float], num_levels=4): 70 | if len(line_heights) <= self.level_count: 71 | return [] 72 | 73 | data = np.asarray(line_heights).reshape(-1, 1) 74 | labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data) 75 | data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1) 76 | data_labels = np.sort(data_labels, 
axis=0) 77 | 78 | cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)} 79 | label_max = None 80 | label_min = None 81 | heading_ranges = [] 82 | prev_cluster = None 83 | for row in data_labels: 84 | value, label = row 85 | value = float(value) 86 | label = int(label) 87 | if prev_cluster is not None and label != prev_cluster: 88 | prev_cluster_mean = cluster_means[prev_cluster] 89 | cluster_mean = cluster_means[label] 90 | if cluster_mean * self.merge_threshold < prev_cluster_mean: 91 | heading_ranges.append((label_min, label_max)) 92 | label_min = None 93 | label_max = None 94 | 95 | label_min = value if label_min is None else min(label_min, value) 96 | label_max = value if label_max is None else max(label_max, value) 97 | prev_cluster = label 98 | 99 | if label_min is not None: 100 | heading_ranges.append((label_min, label_max)) 101 | 102 | heading_ranges = sorted(heading_ranges, reverse=True) 103 | 104 | return heading_ranges 105 | ``` -------------------------------------------------------------------------------- /marker/config/printer.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | import click 4 | 5 | from marker.config.crawler import crawler 6 | 7 | 8 | class CustomClickPrinter(click.Command): 9 | def parse_args(self, ctx, args): 10 | display_help = "config" in args and "--help" in args 11 | if display_help: 12 | click.echo( 13 | "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:" 14 | ) 15 | 16 | # Keep track of shared attributes and their types 17 | shared_attrs = {} 18 | 19 | # First pass: identify shared attributes and verify compatibility 20 | for base_type, base_type_dict in crawler.class_config_map.items(): 21 | for class_name, class_map in base_type_dict.items(): 22 | for attr, (attr_type, formatted_type, default, metadata) in class_map[ 23 | "config" 24 | ].items(): 25 | if attr not in shared_attrs: 26 | shared_attrs[attr] = { 27 | "classes": [], 28 | "type": attr_type, 29 | "is_flag": attr_type in [bool, Optional[bool]] 30 | and not default, 31 | "metadata": metadata, 32 | "default": default, 33 | } 34 | shared_attrs[attr]["classes"].append(class_name) 35 | 36 | # These are the types of attrs that can be set from the command line 37 | attr_types = [ 38 | str, 39 | int, 40 | float, 41 | bool, 42 | Optional[int], 43 | Optional[float], 44 | Optional[str], 45 | ] 46 | 47 | # Add shared attribute options first 48 | for attr, info in shared_attrs.items(): 49 | if info["type"] in attr_types: 50 | ctx.command.params.append( 51 | click.Option( 52 | ["--" + attr], 53 | type=info["type"], 54 | help=" ".join(info["metadata"]) 55 | + f" (Applies to: {', '.join(info['classes'])})", 56 | default=None, # This is important, or it sets all the default keys again in config 57 | is_flag=info["is_flag"], 58 | flag_value=True if info["is_flag"] else None, 59 | ) 60 | ) 61 | 62 | # Second pass: create class-specific options 63 | for base_type, base_type_dict in crawler.class_config_map.items(): 64 | if display_help: 65 | click.echo(f"{base_type}s:") 66 | for class_name, class_map in base_type_dict.items(): 67 | if display_help and class_map["config"]: 68 | click.echo( 69 | f"\n {class_name}: {class_map['class_type'].__doc__ or ''}" 70 | ) 71 | click.echo(" " * 4 + "Attributes:") 72 | for attr, (attr_type, formatted_type, default, metadata) in class_map[ 73 | "config" 74 | 
].items(): 75 | class_name_attr = class_name + "_" + attr 76 | 77 | if display_help: 78 | click.echo(" " * 8 + f"{attr} ({formatted_type}):") 79 | click.echo( 80 | "\n".join([f"{' ' * 12}" + desc for desc in metadata]) 81 | ) 82 | 83 | if attr_type in attr_types: 84 | is_flag = attr_type in [bool, Optional[bool]] and not default 85 | 86 | # Only add class-specific options 87 | ctx.command.params.append( 88 | click.Option( 89 | ["--" + class_name_attr, class_name_attr], 90 | type=attr_type, 91 | help=" ".join(metadata), 92 | is_flag=is_flag, 93 | default=None, # This is important, or it sets all the default keys again in config 94 | ) 95 | ) 96 | 97 | if display_help: 98 | ctx.exit() 99 | 100 | super().parse_args(ctx, args) 101 | ``` -------------------------------------------------------------------------------- /marker/processors/text.py: -------------------------------------------------------------------------------- ```python 1 | import math 2 | from typing import Annotated, List 3 | 4 | import regex 5 | 6 | from marker.processors import BaseProcessor 7 | from marker.schema import BlockTypes 8 | from marker.schema.document import Document 9 | from marker.schema.text.line import Line 10 | 11 | 12 | class TextProcessor(BaseProcessor): 13 | """ 14 | A processor for merging text across pages and columns. 15 | """ 16 | 17 | block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) 18 | ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) 19 | column_gap_ratio: Annotated[ 20 | float, 21 | "The minimum ratio of the page width to the column gap to consider a column break.", 22 | ] = 0.02 23 | 24 | def __init__(self, config): 25 | super().__init__(config) 26 | 27 | def __call__(self, document: Document): 28 | for page in document.pages: 29 | for block in page.contained_blocks(document, self.block_types): 30 | if block.structure is None: 31 | continue 32 | 33 | if not len(block.structure) >= 2: # Skip single lines 34 | continue 35 | 36 | next_block = document.get_next_block(block, self.ignored_block_types) 37 | if next_block is None: # we've reached the end of the document 38 | continue 39 | if next_block.block_type not in self.block_types: 40 | continue # we found a non-text block 41 | if next_block.structure is None: 42 | continue # This is odd though, why do we have text blocks with no structure? 
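# A block is marked as continuing into the next one only when a column or
# page break is detected, the last line looks unfinished (full width or
# hyphenated), and the next block does not open an indented paragraph.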
43 | if next_block.ignore_for_output: 44 | continue # skip ignored blocks 45 | 46 | column_gap = block.polygon.width * self.column_gap_ratio 47 | 48 | column_break, page_break = False, False 49 | next_block_starts_indented = True 50 | next_block_in_first_quadrant = False 51 | last_line_is_full_width = False 52 | last_line_is_hyphentated = False 53 | 54 | if next_block.page_id == block.page_id: # block on the same page 55 | # we check for a column break 56 | column_break = math.floor(next_block.polygon.y_start) <= math.ceil( 57 | block.polygon.y_start 58 | ) and next_block.polygon.x_start > ( 59 | block.polygon.x_end + column_gap 60 | ) 61 | else: 62 | page_break = True 63 | next_page = document.get_page(next_block.page_id) 64 | next_block_in_first_quadrant = ( 65 | next_block.polygon.x_start < next_page.polygon.width // 2 66 | ) and (next_block.polygon.y_start < next_page.polygon.height // 2) 67 | 68 | if not (column_break or page_break): 69 | continue 70 | 71 | new_block_lines = next_block.structure_blocks(document) 72 | 73 | # we check for next_block indentation 74 | if len(new_block_lines): 75 | min_x = math.ceil( 76 | min([line.polygon.x_start for line in new_block_lines]) 77 | ) 78 | next_block_starts_indented = ( 79 | new_block_lines[0].polygon.x_start > min_x 80 | ) 81 | 82 | lines: List[Line] = [ 83 | line 84 | for line in block.structure_blocks(document) 85 | if line.polygon.width > 1 86 | ] 87 | if len(lines): 88 | max_x = math.floor(max([line.polygon.x_end for line in lines])) 89 | last_line_is_full_width = lines[-1].polygon.x_end >= max_x 90 | 91 | last_line_is_hyphentated = regex.compile( 92 | r".*[\p{Ll}|\d][-—¬]\s?$", regex.DOTALL 93 | ).match(lines[-1].raw_text(document).strip()) 94 | 95 | if ( 96 | (last_line_is_full_width or last_line_is_hyphentated) 97 | and not next_block_starts_indented 98 | and ((next_block_in_first_quadrant and page_break) or column_break) 99 | ): 100 | block.has_continuation = True 101 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import random 3 | import re 4 | from typing import Tuple 5 | 6 | import markdown2 7 | from PIL import Image 8 | from playwright.sync_api import sync_playwright 9 | 10 | from benchmarks.overall.methods.schema import BenchmarkResult 11 | from marker.renderers.markdown import MarkdownRenderer 12 | 13 | 14 | class BaseMethod: 15 | def __init__(self, **kwargs): 16 | for kwarg in kwargs: 17 | if hasattr(self, kwarg): 18 | setattr(self, kwarg, kwargs[kwarg]) 19 | 20 | @staticmethod 21 | def convert_to_md(html: str): 22 | md = MarkdownRenderer() 23 | markdown = md.md_cls.convert(html) 24 | return markdown 25 | 26 | def __call__(self, sample) -> BenchmarkResult: 27 | raise NotImplementedError() 28 | 29 | def render(self, markdown: str): 30 | return self.html_to_image(self.convert_to_html(markdown)) 31 | 32 | @staticmethod 33 | def convert_to_html(md: str): 34 | block_placeholders = [] 35 | inline_placeholders = [] 36 | 37 | # Add placeholders for the math 38 | def block_sub(match): 39 | content = match.group(1) 40 | placeholder = f"1BLOCKMATH{len(block_placeholders)}1" 41 | block_placeholders.append((placeholder, f"$${content}$$")) 42 | return placeholder 43 | 44 | def inline_sub(match): 45 | content = match.group(1) 46 | placeholder = f"1INLINEMATH{len(inline_placeholders)}1" 47 | inline_placeholders.append((placeholder, f"${content}$")) 48 | 
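# Placeholders shield $...$ and $$...$$ spans from markdown2; they are
# swapped back in afterwards so KaTeX can render the math in the browser.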
return placeholder 49 | 50 | md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL) 51 | md = re.sub(r'\$(.*?)\$', inline_sub, md) 52 | 53 | html = markdown2.markdown(md, extras=['tables']) 54 | 55 | # Replace placeholders 56 | for placeholder, math_str in block_placeholders: 57 | html = html.replace(placeholder, math_str) 58 | for placeholder, math_str in inline_placeholders: 59 | html = html.replace(placeholder, math_str) 60 | 61 | return html 62 | 63 | def html_to_image(self, html: str) -> Image.Image: 64 | with sync_playwright() as p: 65 | browser = p.chromium.launch() 66 | page = browser.new_page() 67 | html_str = f""" 68 | <!DOCTYPE html> 69 | <html> 70 | <head> 71 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous"> 72 | <!-- The loading of KaTeX is deferred to speed up page rendering --> 73 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script> 74 | <!-- To automatically render math in text elements, include the auto-render extension: --> 75 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script> 76 | </head> 77 | <body> 78 | {html} 79 | <script> 80 | document.addEventListener("DOMContentLoaded", function() {{ 81 | renderMathInElement(document.body, {{ 82 | delimiters: [ 83 | {{left: '$$', right: '$$', display: true}}, 84 | {{left: '$', right: '$', display: false}} 85 | ], 86 | throwOnError : false 87 | }}); 88 | }}); 89 | </script> 90 | </body> 91 | </html> 92 | """.strip() 93 | page.set_viewport_size({"width": 1200, "height": 800}) 94 | page.set_content(html_str) 95 | page.wait_for_load_state("domcontentloaded") 96 | page.wait_for_timeout(500) # Wait for KaTeX to render 97 | screenshot_bytes = page.screenshot(full_page=True) 98 | browser.close() 99 | 100 | return Image.open(io.BytesIO(screenshot_bytes)) ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_form.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.output import json_to_html 6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 7 | 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | 11 | 12 | class LLMFormProcessor(BaseLLMSimpleBlockProcessor): 13 | block_types = (BlockTypes.Form,) 14 | form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images. 15 | You will receive an image of a text block and an html representation of the form in the image. 16 | Your task is to correct any errors in the html representation, and format it properly. 17 | Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible. 
18 | **Instructions:**
19 | 1. Carefully examine the provided form block image.
20 | 2. Analyze the html representation of the form.
21 | 3. Compare the html representation to the image.
22 | 4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed."
23 | 5. If the html representation contains errors, generate the corrected html representation.
24 | 6. Output only either the corrected html representation or "No corrections needed."
25 | **Example:**
26 | Input:
27 | ```html
28 | <table>
29 | <tr>
30 | <td>Label 1</td>
31 | <td>Label 2</td>
32 | <td>Label 3</td>
33 | </tr>
34 | <tr>
35 | <td>Value 1</td>
36 | <td>Value 2</td>
37 | <td>Value 3</td>
38 | </tr>
39 | </table>
40 | ```
41 | Output:
42 | Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side.
43 | ```html
44 | <table>
45 | <tr>
46 | <td>Label 1</td>
47 | <td>Value 1</td>
48 | </tr>
49 | <tr>
50 | <td>Label 2</td>
51 | <td>Value 2</td>
52 | </tr>
53 | <tr>
54 | <td>Label 3</td>
55 | <td>Value 3</td>
56 | </tr>
57 | </table>
58 | ```
59 | **Input:**
60 | ```html
61 | {block_html}
62 | ```
63 | """
64 |
65 |     def inference_blocks(self, document: Document) -> List[BlockData]:
66 |         blocks = super().inference_blocks(document)
67 |         out_blocks = []
68 |         for block_data in blocks:
69 |             block = block_data["block"]
70 |             children = block.contained_blocks(document, (BlockTypes.TableCell,))
71 |             if not children:
72 |                 continue
73 |             out_blocks.append(block_data)
74 |         return out_blocks
75 |
76 |
77 |     def block_prompts(self, document: Document) -> List[PromptData]:
78 |         prompt_data = []
79 |         for block_data in self.inference_blocks(document):
80 |             block = block_data["block"]
81 |             block_html = json_to_html(block.render(document))
82 |             prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
83 |             image = self.extract_image(document, block)
84 |             prompt_data.append({
85 |                 "prompt": prompt,
86 |                 "image": image,
87 |                 "block": block,
88 |                 "schema": FormSchema,
89 |                 "page": block_data["page"]
90 |             })
91 |         return prompt_data
92 |
93 |
94 |     def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
95 |         block = prompt_data["block"]
96 |         block_html = json_to_html(block.render(document))
97 |
98 |         if not response or "corrected_html" not in response:
99 |             block.update_metadata(llm_error_count=1)
100 |             return
101 |
102 |         corrected_html = response["corrected_html"]
103 |
104 |         # The original table is okay
105 |         if "no corrections needed" in corrected_html.lower():
106 |             return
107 |
108 |         # Potentially a partial response
109 |         if len(corrected_html) < len(block_html) * .33:
110 |             block.update_metadata(llm_error_count=1)
111 |             return
112 |
113 |         corrected_html = corrected_html.strip().removeprefix("```html").removesuffix("```").strip() # removeprefix/removesuffix drop the fence markers; lstrip/rstrip would strip character sets
114 |         block.html = corrected_html
115 |
116 | class FormSchema(BaseModel):
117 |     comparison: str
118 |     corrected_html: str
```
--------------------------------------------------------------------------------
/marker/renderers/ocr_json.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.renderers import BaseRenderer
6 | from marker.schema import BlockTypes
7 | from marker.schema.document import Document
8 |
9 |
10 | class OCRJSONCharOutput(BaseModel):
11 |     id: str
12 
| block_type: str 13 | text: str 14 | polygon: List[List[float]] 15 | bbox: List[float] 16 | 17 | 18 | class OCRJSONLineOutput(BaseModel): 19 | id: str 20 | block_type: str 21 | html: str 22 | polygon: List[List[float]] 23 | bbox: List[float] 24 | children: List["OCRJSONCharOutput"] | None = None 25 | 26 | 27 | class OCRJSONPageOutput(BaseModel): 28 | id: str 29 | block_type: str 30 | polygon: List[List[float]] 31 | bbox: List[float] 32 | children: List[OCRJSONLineOutput] | None = None 33 | 34 | 35 | class OCRJSONOutput(BaseModel): 36 | children: List[OCRJSONPageOutput] 37 | block_type: str = str(BlockTypes.Document) 38 | metadata: dict | None = None 39 | 40 | 41 | class OCRJSONRenderer(BaseRenderer): 42 | """ 43 | A renderer for OCR JSON output. 44 | """ 45 | 46 | image_blocks: Annotated[ 47 | Tuple[BlockTypes], 48 | "The list of block types to consider as images.", 49 | ] = (BlockTypes.Picture, BlockTypes.Figure) 50 | page_blocks: Annotated[ 51 | Tuple[BlockTypes], 52 | "The list of block types to consider as pages.", 53 | ] = (BlockTypes.Page,) 54 | 55 | def extract_json(self, document: Document) -> List[OCRJSONPageOutput]: 56 | pages = [] 57 | for page in document.pages: 58 | page_equations = [ 59 | b for b in page.children if b.block_type == BlockTypes.Equation 60 | and not b.removed 61 | ] 62 | equation_lines = [] 63 | for equation in page_equations: 64 | if not equation.structure: 65 | continue 66 | 67 | equation_lines += [ 68 | line 69 | for line in equation.structure 70 | if line.block_type == BlockTypes.Line 71 | ] 72 | 73 | page_lines = [ 74 | block 75 | for block in page.children 76 | if block.block_type == BlockTypes.Line 77 | and block.id not in equation_lines 78 | and not block.removed 79 | ] 80 | 81 | lines = [] 82 | for line in page_lines + page_equations: 83 | line_obj = OCRJSONLineOutput( 84 | id=str(line.id), 85 | block_type=str(line.block_type), 86 | html="", 87 | polygon=line.polygon.polygon, 88 | bbox=line.polygon.bbox, 89 | ) 90 | if line in page_equations: 91 | line_obj.html = line.html 92 | else: 93 | line_obj.html = line.formatted_text(document) 94 | spans = ( 95 | [document.get_block(span_id) for span_id in line.structure] 96 | if line.structure 97 | else [] 98 | ) 99 | children = [] 100 | for span in spans: 101 | if not span.structure: 102 | continue 103 | 104 | span_chars = [ 105 | document.get_block(char_id) for char_id in span.structure 106 | ] 107 | children.extend( 108 | [ 109 | OCRJSONCharOutput( 110 | id=str(char.id), 111 | block_type=str(char.block_type), 112 | text=char.text, 113 | polygon=char.polygon.polygon, 114 | bbox=char.polygon.bbox, 115 | ) 116 | for char in span_chars 117 | ] 118 | ) 119 | line_obj.children = children 120 | lines.append(line_obj) 121 | 122 | page = OCRJSONPageOutput( 123 | id=str(page.id), 124 | block_type=str(page.block_type), 125 | polygon=page.polygon.polygon, 126 | bbox=page.polygon.bbox, 127 | children=lines, 128 | ) 129 | pages.append(page) 130 | 131 | return pages 132 | 133 | def __call__(self, document: Document) -> OCRJSONOutput: 134 | return OCRJSONOutput(children=self.extract_json(document), metadata=None) 135 | ``` -------------------------------------------------------------------------------- /marker/services/openai.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | from typing import Annotated, List 4 | 5 | import openai 6 | import PIL 7 | from marker.logger import get_logger 8 | from openai import APITimeoutError, RateLimitError 9 | 
from PIL import Image
10 | from pydantic import BaseModel
11 |
12 | from marker.schema.blocks import Block
13 | from marker.services import BaseService
14 |
15 | logger = get_logger()
16 |
17 |
18 | class OpenAIService(BaseService):
19 |     openai_base_url: Annotated[
20 |         str, "The base url to use for OpenAI-like models. No trailing slash."
21 |     ] = "https://api.openai.com/v1"
22 |     openai_model: Annotated[str, "The model name to use for OpenAI-like model."] = (
23 |         "gpt-4o-mini"
24 |     )
25 |     openai_api_key: Annotated[
26 |         str, "The API key to use for the OpenAI-like service."
27 |     ] = None
28 |     openai_image_format: Annotated[
29 |         str,
30 |         "The image format to use for the OpenAI-like service. Use 'png' for better compatibility",
31 |     ] = "webp"
32 |
33 |     def process_images(self, images: List[Image.Image]) -> List[dict]:
34 |         """
35 |         Generate the base-64 encoded message to send to an
36 |         OpenAI-compatible multimodal model.
37 |
38 |         Args:
39 |             images: Image or list of PIL images to include.
40 |                 The image format is taken from self.openai_image_format; "png" offers better compatibility.
41 |
42 |         Returns:
43 |             A list of OpenAI-compatible multimodal messages containing the base64-encoded images.
44 |         """
45 |         if isinstance(images, Image.Image):
46 |             images = [images]
47 |
48 |         img_fmt = self.openai_image_format
49 |         return [
50 |             {
51 |                 "type": "image_url",
52 |                 "image_url": {
53 |                     "url": "data:image/{};base64,{}".format(
54 |                         img_fmt, self.img_to_base64(img, format=img_fmt)
55 |                     ),
56 |                 },
57 |             }
58 |             for img in images
59 |         ]
60 |
61 |     def __call__(
62 |         self,
63 |         prompt: str,
64 |         image: PIL.Image.Image | List[PIL.Image.Image] | None,
65 |         block: Block | None,
66 |         response_schema: type[BaseModel],
67 |         max_retries: int | None = None,
68 |         timeout: int | None = None,
69 |     ):
70 |         if max_retries is None:
71 |             max_retries = self.max_retries
72 |
73 |         if timeout is None:
74 |             timeout = self.timeout
75 |
76 |         client = self.get_client()
77 |         image_data = self.format_image_for_llm(image)
78 |
79 |         messages = [
80 |             {
81 |                 "role": "user",
82 |                 "content": [
83 |                     *image_data,
84 |                     {"type": "text", "text": prompt},
85 |                 ],
86 |             }
87 |         ]
88 |
89 |         total_tries = max_retries + 1
90 |         for tries in range(1, total_tries + 1):
91 |             try:
92 |                 response = client.beta.chat.completions.parse(
93 |                     extra_headers={
94 |                         "X-Title": "Marker",
95 |                         "HTTP-Referer": "https://github.com/datalab-to/marker",
96 |                     },
97 |                     model=self.openai_model,
98 |                     messages=messages,
99 |                     timeout=timeout,
100 |                     response_format=response_schema,
101 |                 )
102 |                 response_text = response.choices[0].message.content
103 |                 total_tokens = response.usage.total_tokens
104 |                 if block:
105 |                     block.update_metadata(
106 |                         llm_tokens_used=total_tokens, llm_request_count=1
107 |                     )
108 |                 return json.loads(response_text)
109 |             except (APITimeoutError, RateLimitError) as e:
110 |                 # Rate limit exceeded
111 |                 if tries == total_tries:
112 |                     # Last attempt failed. Give up
113 |                     logger.error(
114 |                         f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
115 |                     )
116 |                     break
117 |                 else:
118 |                     wait_time = tries * self.retry_wait_time
119 |                     logger.warning(
120 |                         f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
121 |                     )
122 |                     time.sleep(wait_time)
123 |             except Exception as e:
124 |                 logger.error(f"OpenAI inference failed: {e}")
125 |                 break
126 |
127 |         return {}
128 |
129 |     def get_client(self) -> openai.OpenAI:
130 |         return openai.OpenAI(api_key=self.openai_api_key, base_url=self.openai_base_url)
131 |
```
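A minimal usage sketch of `OpenAIService`, assuming it accepts a config dict the way marker's other configurable classes do; the model name, API key, and image path are placeholders, and `Caption` is a hypothetical response schema:

```python
from PIL import Image
from pydantic import BaseModel

from marker.services.openai import OpenAIService


class Caption(BaseModel):
    text: str


# Placeholder config values; supply a real API key in practice.
service = OpenAIService(
    {
        "openai_model": "gpt-4o-mini",
        "openai_api_key": "sk-placeholder",
        "openai_image_format": "png",  # png for maximum compatibility
    }
)

# __call__ returns the parsed JSON dict on success, or {} once retries are exhausted.
result = service(
    prompt="Describe this image block in one sentence.",
    image=Image.open("block.png"),
    block=None,  # no block metadata to update in this standalone sketch
    response_schema=Caption,
)
print(result.get("text", ""))
```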